# Source code for archledger.config.parse

from __future__ import annotations

import sys
from pathlib import Path
from typing import cast

from archledger.config.model import (
    DEFAULT_ID_SEGMENT,
    DEFAULT_ID_SEGMENT_MAP,
    DEFAULT_TRACKING_EXCLUDE,
    DEFAULT_TRACKING_INCLUDE,
    VALID_BUILD_CONVERTERS,
    VALID_DIAGRAM_IMAGE_FORMATS,
    VALID_DIAGRAM_RENDERERS,
    VALID_DIAGRAM_TYPES,
    VALID_TRACKING_HASH_ALGORITHMS,
    VALID_TRACKING_SCANNERS,
    ProjectConfig,
    normalize_project_name,
    validate_uuid,
)
from archledger.config.schema import FieldSpec, TableSpec, parse_table_from_spec
from archledger.errors import ConfigError
from archledger.ids import (
    DEFAULT_ID_PREFIX,
    DEFAULT_ID_SEGMENT_MODE,
    DEFAULT_ID_WIDTH,
    validate_id_prefix,
    validate_id_segment,
    validate_id_segment_mode,
    validate_id_width,
)
from archledger.model import (
    CURRENT_SOURCE_SCHEMA_VERSION,
    VALID_OUTPUT_FORMATS,
    VALID_RECORD_TYPES,
    VALID_SOURCE_FORMATS,
    default_document_filename_for_output_format,
    default_extension_for_source_format,
    infer_output_format_from_path,
)
from archledger.storage.common import read_text

if sys.version_info >= (3, 11):
    import tomllib
else:  # pragma: no cover
    import tomli as tomllib

# Key whitelists used to reject unknown entries in the config file.
# Sets are unordered, so members are listed alphabetically for easy scanning.

# Keys permitted at the root of the config document.
_ALLOWED_TOP_LEVEL_KEYS = {
    "arc42",
    "archledger_dir",
    "build",
    "config_version",
    "diagrams",
    "ids",
    "project_name",
    "project_uuid",
    "skill",
    "source",
    "tracking",
}

# Keys permitted inside the [build] table.
_ALLOWED_BUILD_KEYS = {
    "converter",
    "default_format",
    "default_output",
    "default_output_dir",
    "diagrams",
    "include_draft",
    "include_superseded",
    "keep_intermediate",
    "outputs",
    "pdf_engine",
    "reference_docx",
    "strict",
}

# Keys permitted inside each [build.outputs.<format>] table.
_ALLOWED_BUILD_OUTPUT_KEYS = {
    "enabled",
    "pdf_engine",
    "reference_docx",
    "tool",
}

# Keys permitted inside the [ids] table.
_ALLOWED_IDS_KEYS = {
    "default_segment",
    "prefix",
    "segment_map",
    "segment_mode",
    "width",
}

# Keys permitted inside the [arc42] table.
_ALLOWED_ARC42_KEYS = {
    "include_help",
    "language",
    "template_version",
    "title",
}

# Keys permitted inside the [skill] table.
_ALLOWED_SKILL_KEYS = {
    "installed",
    "path",
}

# Keys permitted inside the [tracking] table.
_ALLOWED_TRACKING_KEYS = {
    "enabled",
    "exclude",
    "hash_algorithm",
    "include",
    "max_file_bytes",
    "scanner",
    "state_file",
}

# Keys permitted inside the [source] table.
_ALLOWED_SOURCE_KEYS = {
    "format",
    "front_matter",
    "record_extension",
    "schema_version",
    "section_extension",
}

# Keys permitted inside the [diagrams] table (also used for [build.diagrams]).
_ALLOWED_DIAGRAM_KEYS = {
    "default_type",
    "enabled",
    "image_format",
    "kroki_url",
    "output_dir",
    "renderer",
}
# Module-private aliases for the choice sets imported from
# archledger.config.model; each alias is the very same object as the
# corresponding VALID_* constant and is what the _require_choice call sites
# in this module pass as the set of allowed values.
_ALLOWED_BUILD_CONVERTERS = VALID_BUILD_CONVERTERS
_ALLOWED_TRACKING_SCANNERS = VALID_TRACKING_SCANNERS
_ALLOWED_TRACKING_HASH_ALGORITHMS = VALID_TRACKING_HASH_ALGORITHMS
_ALLOWED_DIAGRAM_RENDERERS = VALID_DIAGRAM_RENDERERS
_ALLOWED_DIAGRAM_TYPES = VALID_DIAGRAM_TYPES
_ALLOWED_DIAGRAM_IMAGE_FORMATS = VALID_DIAGRAM_IMAGE_FORMATS


# --- Schema-driven table specs ---


def _parse_tracking_enabled(raw: object, field_name: str) -> bool:
    """Field parser for the tracking table's ``enabled`` entry.

    Delegates to ``_require_bool`` and returns its result unchanged.
    """
    enabled_flag = _require_bool(raw, field_name)
    return enabled_flag


def _parse_tracking_state_file(raw: object, field_name: str) -> str:
    """Field parser for the tracking table's ``state_file`` entry.

    Delegates to ``_require_non_empty_string`` and returns its result unchanged.
    """
    state_file_name = _require_non_empty_string(raw, field_name)
    return state_file_name


def _parse_tracking_scanner(raw: object, field_name: str) -> str:
    """Field parser for the tracking table's ``scanner`` entry.

    Delegates to ``_require_choice`` with ``_ALLOWED_TRACKING_SCANNERS`` as
    the permitted values.
    """
    permitted = _ALLOWED_TRACKING_SCANNERS
    return _require_choice(raw, field_name, permitted)


def _parse_tracking_include(raw: object, field_name: str) -> tuple[str, ...]:
    """Field parser for the tracking table's ``include`` entry.

    Delegates to ``_require_string_tuple`` and returns its result unchanged.
    """
    include_patterns = _require_string_tuple(raw, field_name)
    return include_patterns


def _parse_tracking_exclude(raw: object, field_name: str) -> tuple[str, ...]:
    """Field parser for the tracking table's ``exclude`` entry.

    Delegates to ``_require_string_tuple`` and returns its result unchanged.
    """
    exclude_patterns = _require_string_tuple(raw, field_name)
    return exclude_patterns


def _parse_tracking_max_file_bytes(raw: object, field_name: str) -> int:
    """Field parser for the tracking table's ``max_file_bytes`` entry.

    Delegates to ``_require_positive_int`` and returns its result unchanged.
    """
    byte_limit = _require_positive_int(raw, field_name)
    return byte_limit


def _parse_tracking_hash_algorithm(raw: object, field_name: str) -> str:
    """Field parser for the tracking table's ``hash_algorithm`` entry.

    Delegates to ``_require_choice`` with ``_ALLOWED_TRACKING_HASH_ALGORITHMS``
    as the permitted values.
    """
    permitted = _ALLOWED_TRACKING_HASH_ALGORITHMS
    return _require_choice(raw, field_name, permitted)


def _make_tracking(
    enabled: bool,
    state_file: str,
    scanner: str,
    include: tuple[str, ...],
    exclude: tuple[str, ...],
    max_file_bytes: int,
    hash_algorithm: str,
) -> tuple[bool, str, str, tuple[str, ...], tuple[str, ...], int, str]:
    """Bundle the parsed tracking settings into one positional tuple.

    The values are returned untouched, in the same order as the parameters:
    ``(enabled, state_file, scanner, include, exclude, max_file_bytes,
    hash_algorithm)``.
    """
    switches_and_paths = (enabled, state_file, scanner)
    patterns = (include, exclude)
    limits = (max_file_bytes, hash_algorithm)
    return (*switches_and_paths, *patterns, *limits)


# Declarative description of the [tracking] table. Each FieldSpec is given a
# key name, the value used when the key is absent, and the parser for a
# supplied value; the field names line up with the keyword parameters of
# ``_make_tracking``, which is registered as the table's factory.
_TRACKING_TABLE = TableSpec(
    name="tracking",
    factory=_make_tracking,
    fields=(
        # Switch and state-file location.
        FieldSpec("enabled", True, _parse_tracking_enabled),
        FieldSpec("state_file", "source-state.json", _parse_tracking_state_file),
        # Scanner selection and path filters.
        FieldSpec("scanner", "auto", _parse_tracking_scanner),
        FieldSpec("include", DEFAULT_TRACKING_INCLUDE, _parse_tracking_include),
        FieldSpec("exclude", DEFAULT_TRACKING_EXCLUDE, _parse_tracking_exclude),
        # Size limit and hashing algorithm.
        FieldSpec("max_file_bytes", 1_000_000, _parse_tracking_max_file_bytes),
        FieldSpec("hash_algorithm", "sha256", _parse_tracking_hash_algorithm),
    ),
)


def load_project_config(path: Path) -> ProjectConfig:
    """Read the TOML project config at *path* and build a ``ProjectConfig``.

    The document is checked in this order: TOML syntax, unknown top-level
    keys, unknown keys in each known sub-table, the four scalar top-level
    fields, and finally the contents of every sub-table.

    Args:
        path: Location of the config file. Only ``path.name`` appears in
            error messages.

    Returns:
        The validated configuration.

    Raises:
        ConfigError: If the file is not valid TOML, contains unknown keys, or
            a value has the wrong type or an unsupported value.

    Errors raised by ``read_text`` are not caught here and propagate
    unchanged, as does anything raised by ``validate_uuid`` or
    ``normalize_project_name``.
    """
    try:
        raw_data = tomllib.loads(read_text(path))
    except tomllib.TOMLDecodeError as exc:
        raise ConfigError(f"Failed to parse {path.name}: {exc}") from exc
    if not isinstance(raw_data, dict):
        raise ConfigError(f"{path.name} did not parse to a TOML table.")

    unknown_top_level = sorted(set(raw_data) - _ALLOWED_TOP_LEVEL_KEYS)
    if unknown_top_level:
        joined = ", ".join(unknown_top_level)
        raise ConfigError(f"Unknown config keys in {path.name}: {joined}")

    # Reject unknown keys in every sub-table before looking at any value.
    build_data = _validate_subtable(
        path,
        raw_data.get("build"),
        _ALLOWED_BUILD_KEYS,
        "build",
    )
    ids_data = _validate_subtable(
        path,
        raw_data.get("ids"),
        _ALLOWED_IDS_KEYS,
        "ids",
    )
    source_data = _validate_subtable(
        path,
        raw_data.get("source"),
        _ALLOWED_SOURCE_KEYS,
        "source",
    )
    arc42_data = _validate_subtable(
        path,
        raw_data.get("arc42"),
        _ALLOWED_ARC42_KEYS,
        "arc42",
    )
    skill_data = _validate_subtable(
        path,
        raw_data.get("skill"),
        _ALLOWED_SKILL_KEYS,
        "skill",
    )
    tracking_data = _validate_subtable(
        path,
        raw_data.get("tracking"),
        _ALLOWED_TRACKING_KEYS,
        "tracking",
    )
    diagrams_data = _validate_subtable(
        path,
        raw_data.get("diagrams"),
        _ALLOWED_DIAGRAM_KEYS,
        "diagrams",
    )

    config_version = raw_data.get("config_version")
    # The type check must come before the membership test: a TOML array or
    # table is unhashable and would make ``in`` raise TypeError, and a float
    # such as 1.0 hashes equal to 1 and would otherwise be accepted as a
    # version number. bool is excluded because it is a subclass of int.
    if (
        isinstance(config_version, bool)
        or not isinstance(config_version, int)
        or config_version not in {1, 2, 3, 4, 5, 6, 7}
    ):
        raise ConfigError("config_version must be 1, 2, 3, 4, 5, 6, or 7.")

    archledger_dir = raw_data.get("archledger_dir")
    if not isinstance(archledger_dir, str) or not archledger_dir.strip():
        raise ConfigError("archledger_dir must be a non-empty string.")

    project_uuid = raw_data.get("project_uuid")
    if not isinstance(project_uuid, str):
        raise ConfigError("project_uuid must be a string.")

    project_name = raw_data.get("project_name")
    if not isinstance(project_name, str):
        raise ConfigError("project_name must be a string.")

    (
        source_format,
        source_schema_version,
        front_matter,
        section_extension,
        record_extension,
    ) = _parse_source_config(source_data, config_version)
    (
        default_output,
        build_default_format,
        output_dir,
        include_draft,
        include_superseded,
        strict,
        keep_intermediate,
        build_converter,
        build_pdf_engine,
        reference_docx,
        build_outputs,
    ) = _parse_build_config(build_data, config_version, source_format)
    template_version, language, title, include_help = _parse_arc42_config(arc42_data)
    (
        id_prefix,
        id_width,
        id_segment_mode,
        id_default_segment,
        id_segment_map,
    ) = _parse_ids_config(ids_data)
    skill_installed, skill_path = _parse_skill_config(skill_data)
    (
        tracking_enabled,
        tracking_state_file,
        tracking_scanner,
        tracking_include,
        tracking_exclude,
        tracking_max_file_bytes,
        tracking_hash_algorithm,
    ) = _parse_tracking_config(tracking_data)
    # The diagram parser also receives the build table so it can read the
    # nested build.diagrams entry.
    (
        diagram_enabled,
        diagram_renderer,
        diagram_default_type,
        diagram_output_dir,
        diagram_image_format,
        diagram_kroki_url,
    ) = _parse_diagram_config(diagrams_data, build_data)

    return ProjectConfig(
        config_version=config_version,
        archledger_dir=archledger_dir,
        project_uuid=validate_uuid(project_uuid),
        project_name=normalize_project_name(project_name),
        id_prefix=id_prefix,
        id_width=id_width,
        id_segment_mode=id_segment_mode,
        id_default_segment=id_default_segment,
        id_segment_map=id_segment_map,
        source_format=source_format,
        source_schema_version=source_schema_version,
        front_matter=front_matter.strip().lower(),
        section_extension=section_extension,
        record_extension=record_extension,
        build_default_output=default_output,
        build_default_format=build_default_format,
        build_output_dir=output_dir.strip(),
        build_include_draft=include_draft,
        build_include_superseded=include_superseded,
        build_strict=strict,
        build_keep_intermediate=keep_intermediate,
        build_converter=build_converter,
        build_pdf_engine=build_pdf_engine,
        build_reference_docx=reference_docx,
        build_outputs=build_outputs,
        arc42_template_version=template_version,
        arc42_language=language,
        arc42_title=title,
        arc42_include_help=include_help,
        skill_installed=skill_installed,
        skill_path=skill_path,
        tracking_enabled=tracking_enabled,
        tracking_state_file=tracking_state_file,
        tracking_scanner=tracking_scanner,
        tracking_include=tracking_include,
        tracking_exclude=tracking_exclude,
        tracking_max_file_bytes=tracking_max_file_bytes,
        tracking_hash_algorithm=tracking_hash_algorithm,
        diagram_enabled=diagram_enabled,
        diagram_renderer=diagram_renderer,
        diagram_default_type=diagram_default_type,
        diagram_output_dir=diagram_output_dir,
        diagram_image_format=diagram_image_format,
        diagram_kroki_url=diagram_kroki_url,
    )


def _parse_ids_config(
    ids_data: dict[str, object],
) -> tuple[str, int, str, str, dict[str, str]]:
    """Validate the ``ids`` table.

    Missing keys fall back to the ``DEFAULT_ID_*`` constants. A supplied
    ``segment_map`` is layered on top of a copy of ``DEFAULT_ID_SEGMENT_MAP``.

    Returns:
        ``(prefix, width, segment_mode, default_segment, segment_map)``.

    Raises:
        ConfigError: If a value has the wrong type, is rejected by the
            corresponding ``validate_id_*`` helper, or ``segment_map`` names
            an unknown record type.
    """
    prefix_value = ids_data.get("prefix", DEFAULT_ID_PREFIX)
    width_value = ids_data.get("width", DEFAULT_ID_WIDTH)
    segment_mode_value = ids_data.get("segment_mode", DEFAULT_ID_SEGMENT_MODE)
    default_segment_value = ids_data.get("default_segment", DEFAULT_ID_SEGMENT)
    segment_map_value = ids_data.get("segment_map")

    if not isinstance(prefix_value, str):
        raise ConfigError("ids.prefix must be a string.")
    try:
        prefix = validate_id_prefix(prefix_value)
    except ValueError as exc:
        raise ConfigError(str(exc)) from exc

    try:
        width = validate_id_width(width_value)  # type: ignore[arg-type]
    except TypeError as exc:
        # width_value is unchecked user input; if the validator trips over a
        # non-integer value with a TypeError, report it as a config problem
        # instead of letting the raw exception escape.
        raise ConfigError("ids.width must be an integer.") from exc
    except ValueError as exc:
        raise ConfigError(str(exc)) from exc

    if not isinstance(segment_mode_value, str):
        raise ConfigError("ids.segment_mode must be a string.")
    try:
        segment_mode = validate_id_segment_mode(segment_mode_value)
    except ValueError as exc:
        raise ConfigError(str(exc)) from exc

    if not isinstance(default_segment_value, str):
        raise ConfigError("ids.default_segment must be a string.")
    try:
        default_segment = validate_id_segment(default_segment_value)
    except ValueError as exc:
        raise ConfigError(str(exc)) from exc

    segment_map = dict(DEFAULT_ID_SEGMENT_MAP)
    if segment_map_value is not None:
        if not isinstance(segment_map_value, dict):
            raise ConfigError("ids.segment_map must be a TOML table.")
        allowed_segment_keys = set(VALID_RECORD_TYPES) | {
            "section",
            "archive_tombstone",
        }
        unknown_keys = sorted(set(segment_map_value) - allowed_segment_keys)
        if unknown_keys:
            joined = ", ".join(unknown_keys)
            raise ConfigError(
                "ids.segment_map contains unknown record types: " + joined
            )
        for key, value in segment_map_value.items():
            if not isinstance(value, str):
                raise ConfigError(f"ids.segment_map.{key} must be a string.")
            try:
                segment_map[key] = validate_id_segment(value)
            except ValueError as exc:
                raise ConfigError(str(exc)) from exc

    return prefix, width, segment_mode, default_segment, segment_map


def _validate_subtable(
    path: Path,
    value: object,
    allowed_keys: set[str],
    table_name: str,
) -> dict[str, object]:
    """Return a shallow copy of a config sub-table after checking its keys.

    Args:
        path: Config file path; only ``path.name`` is used, in one message.
        value: Raw value stored under ``table_name`` (``None`` when absent).
        allowed_keys: Keys the table may contain.
        table_name: Name used in error messages.

    Returns:
        ``{}`` when *value* is ``None``, otherwise ``dict(value)``.

    Raises:
        ConfigError: If *value* is not a dict or contains unknown keys.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ConfigError(f"{table_name} in {path.name} must be a TOML table.")
    unknown_keys = sorted(set(value) - allowed_keys)
    if unknown_keys:
        joined = ", ".join(unknown_keys)
        raise ConfigError(f"Unknown keys in {table_name}: {joined}")
    return dict(value)


def _parse_source_config(
    source_data: dict[str, object],
    config_version: int,
) -> tuple[str, int, str, str, str]:
    """Validate the ``source`` table.

    ``source.format`` is mandatory from config version 4 on. For older
    versions it defaults to ``"asciidoc"`` for version 3 and ``"markdown"``
    otherwise. Both extensions default to the format's default extension.

    Returns:
        ``(source_format, schema_version, front_matter, section_extension,
        record_extension)`` with the string values stripped and lower-cased.

    Raises:
        ConfigError: On a missing required format or any invalid value.
    """
    if config_version >= 4 and "format" not in source_data:
        raise ConfigError(
            f"source.format is required for config_version {config_version}."
        )
    source_format_default = "asciidoc" if config_version == 3 else "markdown"
    source_format_value = source_data.get("format", source_format_default)
    if not isinstance(source_format_value, str):
        raise ConfigError("source.format must be a string.")
    source_format = source_format_value.strip().lower()
    if source_format not in VALID_SOURCE_FORMATS:
        raise ConfigError(
            "source.format must be one of: "
            + ", ".join(sorted(VALID_SOURCE_FORMATS))
            + "."
        )

    front_matter_value = source_data.get("front_matter", "yaml")
    if (
        not isinstance(front_matter_value, str)
        or front_matter_value.strip().lower() != "yaml"
    ):
        raise ConfigError('source.front_matter must be the string "yaml".')

    schema_version_value = source_data.get(
        "schema_version",
        CURRENT_SOURCE_SCHEMA_VERSION,
    )
    # bool is a subclass of int, so it has to be rejected explicitly.
    if isinstance(schema_version_value, bool) or not isinstance(
        schema_version_value, int
    ):
        raise ConfigError("source.schema_version must be an integer.")

    default_extension = default_extension_for_source_format(source_format)
    section_extension = _normalize_extension(
        source_data.get("section_extension", default_extension),
        "source.section_extension",
    )
    record_extension = _normalize_extension(
        source_data.get("record_extension", default_extension),
        "source.record_extension",
    )
    return (
        source_format,
        schema_version_value,
        front_matter_value.strip().lower(),
        section_extension,
        record_extension,
    )


def _parse_build_config(
    build_data: dict[str, object],
    config_version: int,
    source_format: str,
) -> tuple[
    str,
    str,
    str,
    bool,
    bool,
    bool,
    bool,
    str,
    str,
    str,
    dict[str, dict[str, object]],
]:
    """Validate the ``build`` table.

    Resolution of the output name and format:

    * An explicit ``default_format`` wins and must be a valid output format.
    * Otherwise the format is inferred from ``default_output`` when that is
      set and inference succeeds; failing that it is ``source_format`` for
      config version 3 and ``"markdown"`` for every other version.
    * A missing ``default_output`` is derived from the resolved format.
    * If a format can be inferred from the final output name, it must equal
      the resolved format.

    Returns:
        ``(default_output, default_format, output_dir, include_draft,
        include_superseded, strict, keep_intermediate, converter, pdf_engine,
        reference_docx, outputs)``.

    Raises:
        ConfigError: On any invalid value or a format/extension mismatch.
    """
    default_output_value = build_data.get("default_output")
    if default_output_value is None:
        default_output = ""
    elif isinstance(default_output_value, str) and default_output_value.strip():
        default_output = default_output_value.strip()
    else:
        raise ConfigError("build.default_output must be a non-empty string.")

    default_format_value = build_data.get("default_format")
    if default_format_value is None:
        inferred_default_format = (
            infer_output_format_from_path(default_output) if default_output else None
        )
        build_default_format = source_format if config_version == 3 else "markdown"
        if inferred_default_format is not None:
            build_default_format = inferred_default_format
    elif isinstance(default_format_value, str):
        build_default_format = default_format_value.strip().lower()
        if build_default_format not in VALID_OUTPUT_FORMATS:
            raise ConfigError(
                "build.default_format must be one of: "
                + ", ".join(sorted(VALID_OUTPUT_FORMATS))
                + "."
            )
    else:
        raise ConfigError("build.default_format must be a string.")

    if not default_output:
        default_output = default_document_filename_for_output_format(
            build_default_format
        )
    inferred_output_format = infer_output_format_from_path(default_output)
    if (
        inferred_output_format is not None
        and inferred_output_format != build_default_format
    ):
        raise ConfigError(
            "build.default_output extension must match build.default_format."
        )

    output_dir = _require_non_empty_string(
        build_data.get("default_output_dir", "build"),
        "build.default_output_dir",
    )
    include_draft = _require_bool(
        build_data.get("include_draft", False),
        "build.include_draft",
    )
    include_superseded = _require_bool(
        build_data.get("include_superseded", False),
        "build.include_superseded",
    )
    strict = _require_bool(build_data.get("strict", False), "build.strict")
    keep_intermediate = _require_bool(
        build_data.get("keep_intermediate", False),
        "build.keep_intermediate",
    )
    converter = _require_choice(
        build_data.get("converter", "auto"),
        "build.converter",
        _ALLOWED_BUILD_CONVERTERS,
    )
    pdf_engine = _require_optional_string(
        build_data.get("pdf_engine", ""),
        "build.pdf_engine",
    )
    # Uses the same helper as build.pdf_engine and
    # build.outputs.<format>.reference_docx, so surrounding whitespace is
    # stripped here as well; the error message is unchanged.
    reference_docx = _require_optional_string(
        build_data.get("reference_docx", ""),
        "build.reference_docx",
    )

    outputs_value = build_data.get("outputs", {})
    if not isinstance(outputs_value, dict):
        raise ConfigError("build.outputs must be a TOML table.")
    build_outputs = _normalize_build_outputs(outputs_value)

    return (
        default_output,
        build_default_format,
        output_dir,
        include_draft,
        include_superseded,
        strict,
        keep_intermediate,
        converter,
        pdf_engine,
        reference_docx,
        build_outputs,
    )


def _parse_arc42_config(
    arc42_data: dict[str, object],
) -> tuple[str, str, str, bool]:
    """Validate the ``arc42`` table.

    Defaults: template version ``"9.0-EN"``, language ``"en"``, title
    ``"Architecture Documentation"``, ``include_help`` ``False``.

    Returns:
        ``(template_version, language, title, include_help)``.

    Raises:
        ConfigError: If a string is empty or a value has the wrong type.
    """
    template_version = _require_non_empty_string(
        arc42_data.get("template_version", "9.0-EN"),
        "arc42.template_version",
    )
    language = _require_non_empty_string(
        arc42_data.get("language", "en"),
        "arc42.language",
    )
    title = _require_non_empty_string(
        arc42_data.get("title", "Architecture Documentation"),
        "arc42.title",
    )
    include_help = _require_bool(
        arc42_data.get("include_help", False),
        "arc42.include_help",
    )
    return template_version, language, title, include_help


def _parse_skill_config(skill_data: dict[str, object]) -> tuple[bool, str]:
    """Validate the ``skill`` table.

    Defaults: ``installed`` ``False``, ``path``
    ``"skills/archledger/SKILL.md"``.

    Returns:
        ``(installed, path)``.

    Raises:
        ConfigError: If ``installed`` is not a bool or ``path`` is not a
            non-empty string.
    """
    skill_installed = _require_bool(
        skill_data.get("installed", False),
        "skill.installed",
    )
    skill_path = _require_non_empty_string(
        skill_data.get("path", "skills/archledger/SKILL.md"),
        "skill.path",
    )
    return skill_installed, skill_path


def _parse_tracking_config(
    tracking_data: dict[str, object],
) -> tuple[bool, str, str, tuple[str, ...], tuple[str, ...], int, str]:
    """Validate the ``tracking`` table through the ``_TRACKING_TABLE`` spec.

    The mapping produced by ``parse_table_from_spec`` is expanded as keyword
    arguments into the spec's factory.

    Returns:
        ``(enabled, state_file, scanner, include, exclude, max_file_bytes,
        hash_algorithm)``.
    """
    parsed = parse_table_from_spec(tracking_data, _TRACKING_TABLE)
    return cast(
        tuple[bool, str, str, tuple[str, ...], tuple[str, ...], int, str],
        _TRACKING_TABLE.factory(**parsed),
    )


def _parse_diagram_config(
    diagrams_data: dict[str, object],
    build_data: dict[str, object],
) -> tuple[bool, str, str, str, str, str]:
    """Validate the diagram settings.

    Settings come from the top-level ``diagrams`` table when it is non-empty,
    otherwise from ``build.diagrams``. ``build.diagrams`` is key-checked even
    when it ends up unused. Error messages always use the ``diagrams.``
    prefix, whichever table supplied the value.

    Returns:
        ``(enabled, renderer, default_type, output_dir, image_format,
        kroki_url)``.

    Raises:
        ConfigError: If ``build.diagrams`` is not a table, contains unknown
            keys, or any effective value is invalid.
    """
    build_diagrams_raw = build_data.get("diagrams")
    if build_diagrams_raw is None:
        build_diagrams_data: dict[str, object] = {}
    elif isinstance(build_diagrams_raw, dict):
        unknown_keys = sorted(set(build_diagrams_raw) - _ALLOWED_DIAGRAM_KEYS)
        if unknown_keys:
            joined = ", ".join(unknown_keys)
            raise ConfigError(f"Unknown keys in build.diagrams: {joined}")
        build_diagrams_data = dict(build_diagrams_raw)
    else:
        raise ConfigError("build.diagrams must be a TOML table.")

    effective_data = diagrams_data if diagrams_data else build_diagrams_data
    enabled = _require_bool(effective_data.get("enabled", False), "diagrams.enabled")
    renderer = _require_choice(
        effective_data.get("renderer", "pass-through"),
        "diagrams.renderer",
        _ALLOWED_DIAGRAM_RENDERERS,
    )
    default_type = _require_choice(
        effective_data.get("default_type", "text"),
        "diagrams.default_type",
        _ALLOWED_DIAGRAM_TYPES,
    )
    output_dir = _require_non_empty_string(
        effective_data.get("output_dir", "diagrams"),
        "diagrams.output_dir",
    )
    image_format = _require_choice(
        effective_data.get("image_format", "svg"),
        "diagrams.image_format",
        _ALLOWED_DIAGRAM_IMAGE_FORMATS,
    )
    kroki_url = _require_optional_string(
        effective_data.get("kroki_url", ""),
        "diagrams.kroki_url",
    )
    return enabled, renderer, default_type, output_dir, image_format, kroki_url


def _normalize_extension(value: object, field_name: str) -> str:
    """Return *value* stripped and lower-cased as a file extension.

    Raises:
        ConfigError: If *value* is not a non-empty string, does not start
            with ``"."``, or consists of the dot alone.
    """
    if not isinstance(value, str) or not value.strip():
        raise ConfigError(f"{field_name} must be a non-empty string.")
    normalized = value.strip().lower()
    if not normalized.startswith(".") or len(normalized) == 1:
        raise ConfigError(f"{field_name} must start with a file extension dot.")
    return normalized


def _require_bool(value: object, field_name: str) -> bool:
    """Return *value* if it is a ``bool``; raise ``ConfigError`` otherwise."""
    if not isinstance(value, bool):
        raise ConfigError(f"{field_name} must be a boolean.")
    return value


def _require_non_empty_string(value: object, field_name: str) -> str:
    """Return *value* stripped; raise ``ConfigError`` unless it is a string
    with non-whitespace content."""
    if not isinstance(value, str) or not value.strip():
        raise ConfigError(f"{field_name} must be a non-empty string.")
    return value.strip()


def _require_optional_string(value: object, field_name: str) -> str:
    """Return *value* stripped (possibly empty); raise ``ConfigError`` if it
    is not a string."""
    if not isinstance(value, str):
        raise ConfigError(f"{field_name} must be a string.")
    return value.strip()


def _require_choice(value: object, field_name: str, allowed: frozenset[str]) -> str:
    """Return *value* stripped and lower-cased if it is one of *allowed*.

    Raises:
        ConfigError: If *value* is not a string or its normalized form is not
            in *allowed*; the message lists the allowed values sorted.
    """
    if not isinstance(value, str):
        raise ConfigError(f"{field_name} must be a string.")
    normalized = value.strip().lower()
    if normalized not in allowed:
        raise ConfigError(
            f"{field_name} must be one of: " + ", ".join(sorted(allowed)) + "."
        )
    return normalized


def _require_string_tuple(value: object, field_name: str) -> tuple[str, ...]:
    """Return a list or tuple of strings as a tuple of stripped strings.

    Raises:
        ConfigError: If *value* is not a list/tuple or any item is not a
            string with non-whitespace content.
    """
    if not isinstance(value, (list, tuple)):
        raise ConfigError(f"{field_name} must be a list of strings.")
    items: list[str] = []
    for item in value:
        if not isinstance(item, str) or not item.strip():
            raise ConfigError(f"{field_name} must contain only non-empty strings.")
        items.append(item.strip())
    return tuple(items)


def _require_positive_int(value: object, field_name: str) -> int:
    """Return *value* if it is an ``int`` greater than zero.

    ``bool`` is rejected even though it is a subclass of ``int``.

    Raises:
        ConfigError: If *value* is not a positive, non-bool integer.
    """
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ConfigError(f"{field_name} must be a positive integer.")
    return value


def _normalize_build_outputs(value: dict[str, object]) -> dict[str, dict[str, object]]:
    """Validate the per-format tables under ``build.outputs``.

    Table names are stripped and lower-cased and must be valid output
    formats. Only keys that are present with a non-``None`` value are copied
    into the result, after validation.

    Returns:
        Mapping of normalized format name to its validated options.

    Raises:
        ConfigError: On an unsupported format name, a non-table entry,
            unknown keys, or an invalid option value.
    """
    normalized: dict[str, dict[str, object]] = {}
    for output_name, raw_config in value.items():
        normalized_name = str(output_name).strip().lower()
        if normalized_name not in VALID_OUTPUT_FORMATS:
            raise ConfigError(
                f"build.outputs.{output_name} is not a supported output format."
            )
        if not isinstance(raw_config, dict):
            raise ConfigError(f"build.outputs.{output_name} must be a TOML table.")
        unknown_keys = sorted(set(raw_config) - _ALLOWED_BUILD_OUTPUT_KEYS)
        if unknown_keys:
            raise ConfigError(
                f"Unknown keys in build.outputs.{output_name}: "
                + ", ".join(unknown_keys)
            )

        output_config: dict[str, object] = {}
        enabled = raw_config.get("enabled")
        if enabled is not None:
            output_config["enabled"] = _require_bool(
                enabled,
                f"build.outputs.{normalized_name}.enabled",
            )
        tool = raw_config.get("tool")
        if tool is not None:
            output_config["tool"] = _require_choice(
                tool,
                f"build.outputs.{normalized_name}.tool",
                _ALLOWED_BUILD_CONVERTERS,
            )
        pdf_engine = raw_config.get("pdf_engine")
        if pdf_engine is not None:
            output_config["pdf_engine"] = _require_optional_string(
                pdf_engine,
                f"build.outputs.{normalized_name}.pdf_engine",
            )
        reference_docx = raw_config.get("reference_docx")
        if reference_docx is not None:
            output_config["reference_docx"] = _require_optional_string(
                reference_docx,
                f"build.outputs.{normalized_name}.reference_docx",
            )
        normalized[normalized_name] = output_config
    return normalized