diff --git a/src/afs_scawful/cli.py b/src/afs_scawful/cli.py
index d14dcff..8584c6b 100644
--- a/src/afs_scawful/cli.py
+++ b/src/afs_scawful/cli.py
@@ -8,6 +8,7 @@ import json
 from pathlib import Path
 from typing import Iterable
 
+from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
 from .registry import build_dataset_registry, index_datasets, write_dataset_registry
 from .resource_index import ResourceIndexer
 from .paths import resolve_datasets_root, resolve_index_root
@@ -86,6 +87,42 @@ def _validators_run_command(args: argparse.Namespace) -> int:
     return 0 if overall_ok else 1
 
 
+def _generators_doc_sections_command(args: argparse.Namespace) -> int:
+    """Generate doc-section samples and write them to a JSONL file.
+
+    Returns 0 on success, 1 if any source file failed to process, and 2 if
+    the --min-chars/--max-chars limits are invalid.
+    """
+    index_path = Path(args.index).expanduser().resolve() if args.index else None
+    roots = [Path(path).expanduser().resolve() for path in args.root] if args.root else None
+    try:
+        config = DocSectionConfig(min_chars=args.min_chars, max_chars=args.max_chars)
+    except ValueError as exc:
+        # Report bad limits as a usage error instead of a traceback.
+        print(f"error: {exc}")
+        return 2
+    generator = DocSectionGenerator(
+        resource_index=index_path,
+        resource_roots=roots,
+        config=config,
+    )
+    result = generator.generate()
+    output_path = (
+        Path(args.output).expanduser().resolve()
+        if args.output
+        else resolve_index_root() / "doc_sections.jsonl"
+    )
+    write_jsonl(result.samples, output_path)
+    print(f"doc_sections: {output_path}")
+    print(
+        f"samples={len(result.samples)} skipped={result.skipped} errors={len(result.errors)}"
+    )
+    if result.errors:
+        for err in result.errors[:5]:
+            print(f"error: {err}")
+    return 0 if not result.errors else 1
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="afs_scawful")
     subparsers = parser.add_subparsers(dest="command")
@@ -135,6 +172,39 @@ def build_parser() -> argparse.ArgumentParser:
     )
     validators_run.set_defaults(func=_validators_run_command)
 
+    generators_parser = subparsers.add_parser("generators", help="Generator tools.")
+    generators_sub = generators_parser.add_subparsers(dest="generators_command")
+
+    doc_sections = generators_sub.add_parser(
+        "doc-sections", help="Generate samples from documentation."
+    )
+    doc_sections.add_argument(
+        "--index",
+        help="Resource index path override (optional).",
+    )
+    doc_sections.add_argument(
+        "--root",
+        action="append",
+        help="Resource root override (repeatable).",
+    )
+    doc_sections.add_argument(
+        "--output",
+        help="Output JSONL path (default: training index/doc_sections.jsonl).",
+    )
+    doc_sections.add_argument(
+        "--min-chars",
+        type=int,
+        default=120,
+        help="Minimum section length to keep.",
+    )
+    doc_sections.add_argument(
+        "--max-chars",
+        type=int,
+        default=2000,
+        help="Maximum section length; longer sections are truncated.",
+    )
+    doc_sections.set_defaults(func=_generators_doc_sections_command)
+
     return parser
 
 
@@ -153,6 +223,9 @@ def main(argv: Iterable[str] | None = None) -> int:
     if args.command == "validators" and not getattr(args, "validators_command", None):
         parser.print_help()
         return 1
+    if args.command == "generators" and not getattr(args, "generators_command", None):
+        parser.print_help()
+        return 1
     return args.func(args)
 
 
diff --git a/src/afs_scawful/generators/__init__.py b/src/afs_scawful/generators/__init__.py
new file mode 100644
index 0000000..5df2ba5
--- /dev/null
+++ b/src/afs_scawful/generators/__init__.py
@@ -0,0 +1,12 @@
+"""Generator registry for AFS Scawful."""
+
+from .base import BaseGenerator, GenerationResult, write_jsonl
+from .doc_sections import DocSectionConfig, DocSectionGenerator
+
+__all__ = [
+    "BaseGenerator",
+    "DocSectionConfig",
+    "DocSectionGenerator",
+    "GenerationResult",
+    "write_jsonl",
+]
diff --git a/src/afs_scawful/generators/base.py b/src/afs_scawful/generators/base.py
new file mode 100644
index 0000000..eb2fcfa
--- /dev/null
+++ b/src/afs_scawful/generators/base.py
@@ -0,0 +1,58 @@
+"""Generator base classes for AFS Scawful."""
+
+from __future__ import annotations
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from pathlib import Path
+from typing import Iterable
+
+from ..training import TrainingSample
+
+
+@dataclass
+class GenerationResult:
+    """Samples, error messages and skip count produced by one generator run."""
+
+    samples: list[TrainingSample] = field(default_factory=list)
+    errors: list[str] = field(default_factory=list)
+    skipped: int = 0
+    # Creation time as a naive ISO 8601 string from ``datetime.now()``.
+    generated_at: str = field(default_factory=lambda: datetime.now().isoformat())
+
+    def to_dict(self) -> dict[str, object]:
+        """Return the result as a plain dict, with samples converted via ``to_dict``."""
+        return {
+            "samples": [sample.to_dict() for sample in self.samples],
+            "errors": list(self.errors),
+            "skipped": self.skipped,
+            "generated_at": self.generated_at,
+        }
+
+
+class BaseGenerator(ABC):
+    """Abstract base for generators, identified by a name and a domain."""
+
+    def __init__(self, name: str, domain: str) -> None:
+        self.name = name
+        self.domain = domain
+
+    @abstractmethod
+    def generate(self) -> GenerationResult:
+        """Run the generator and return its result."""
+        raise NotImplementedError
+
+
+def write_jsonl(samples: Iterable[TrainingSample], output_path: Path) -> Path:
+    """Write one JSONL entry per sample to ``output_path`` and return the path.
+
+    Parent directories are created as needed. Every entry ends with a newline,
+    so an empty ``samples`` produces an empty file instead of a lone blank line.
+    """
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    # Serialize everything before opening the file so a failing sample cannot
+    # leave a truncated output behind.
+    lines = [sample.to_jsonl_entry() for sample in samples]
+    output_path.write_text("".join(f"{line}\n" for line in lines), encoding="utf-8")
+    return output_path
diff --git a/src/afs_scawful/generators/doc_sections.py b/src/afs_scawful/generators/doc_sections.py
new file mode 100644
index 0000000..294c62f
--- /dev/null
+++ b/src/afs_scawful/generators/doc_sections.py
@@ -0,0 +1,159 @@
+"""Generate training samples from documentation sections."""
+
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+from ..resource_index import ResourceIndexer
+from ..training import TrainingSample
+from .base import BaseGenerator, GenerationResult
+
+# ATX heading: up to 3 spaces of indent, 1-6 "#", then whitespace or end of line.
+_HEADING_RE = re.compile(r"^ {0,3}#{1,6}(?:[ \t]+(?P<title>.*))?$")
+# Code fence delimiter: up to 3 spaces of indent, then 3+ backticks or tildes.
+_FENCE_RE = re.compile(r"^ {0,3}(?P<fence>`{3,}|~{3,})")
+
+
+@dataclass
+class DocSectionConfig:
+    """Length limits and file patterns used when extracting doc sections."""
+
+    min_chars: int = 120
+    max_chars: int = 2000
+    file_globs: tuple[str, ...] = ("**/*.md", "**/*.txt")
+
+    def __post_init__(self) -> None:
+        """Reject limits that could only yield empty or undersized samples."""
+        if self.min_chars < 0:
+            raise ValueError(f"min_chars must be >= 0, got {self.min_chars}")
+        if self.max_chars < 1:
+            raise ValueError(f"max_chars must be >= 1, got {self.max_chars}")
+        if self.min_chars > self.max_chars:
+            raise ValueError(
+                f"min_chars ({self.min_chars}) must not exceed max_chars ({self.max_chars})"
+            )
+
+
+class DocSectionGenerator(BaseGenerator):
+    """Build training samples by extracting sections from docs."""
+
+    def __init__(
+        self,
+        *,
+        resource_index: Path | None = None,
+        resource_roots: list[Path] | None = None,
+        config: DocSectionConfig | None = None,
+    ) -> None:
+        super().__init__(name="DocSectionGenerator", domain="docs")
+        self.resource_index = resource_index
+        self.resource_roots = resource_roots
+        self.config = config or DocSectionConfig()
+
+    def generate(self) -> GenerationResult:
+        """Extract samples from every collected file.
+
+        A failure in one file is recorded in ``errors`` and does not stop the
+        run; a file that yields no samples is counted in ``skipped``.
+        """
+        result = GenerationResult()
+        for path in self._collect_files():
+            try:
+                samples = self._samples_from_file(path)
+            except Exception as exc:  # per-file boundary: record and continue
+                result.errors.append(f"{path}: {exc}")
+                continue
+            if samples:
+                result.samples.extend(samples)
+            else:
+                result.skipped += 1
+        return result
+
+    def _collect_files(self) -> list[Path]:
+        """Return files from the saved index if it loads, else from a fresh scan."""
+        if self.resource_index:
+            indexer = ResourceIndexer(index_path=self.resource_index)
+            loaded = indexer.load_index()
+            if loaded:
+                # NOTE(review): indexed files are not filtered by
+                # ``config.file_globs`` -- confirm the index only lists docs.
+                return [item.path for item in loaded.files]
+
+        indexer = ResourceIndexer(
+            resource_roots=self.resource_roots,
+            search_patterns=list(self.config.file_globs),
+        )
+        result = indexer.build_index()
+        return [item.path for item in result.files]
+
+    def _samples_from_file(self, path: Path) -> list[TrainingSample]:
+        """Build one sample per section of ``path`` with at least ``min_chars``.
+
+        Sections longer than ``max_chars`` are truncated rather than dropped.
+        """
+        if not path.is_file():
+            return []
+        text = path.read_text(encoding="utf-8", errors="ignore")
+        samples: list[TrainingSample] = []
+        for heading, content in _split_sections(path, text):
+            content = content.strip()
+            if len(content) < self.config.min_chars:
+                continue
+            if len(content) > self.config.max_chars:
+                content = content[: self.config.max_chars].rstrip()
+            samples.append(
+                TrainingSample(
+                    instruction=f"Extract the documentation section '{heading}'.",
+                    input=f"source: {path.name}",
+                    output=content,
+                    domain=self.domain,
+                    source=str(path),
+                    metadata={"heading": heading, "path": str(path)},
+                )
+            )
+        return samples
+
+
+def _split_sections(path: Path, text: str) -> list[tuple[str, str]]:
+    """Split ``text`` into ``(heading, body)`` pairs.
+
+    Non-Markdown files yield a single section named after the file stem.
+    Markdown is split on ATX headings; text before the first heading is filed
+    under the file stem. Lines inside fenced code blocks are never treated as
+    headings, and sections whose body is empty are dropped.
+    """
+    if path.suffix.lower() not in {".md", ".markdown"}:
+        content = text.strip()
+        return [(path.stem, content)] if content else []
+
+    sections: list[tuple[str, str]] = []
+    current_heading = path.stem
+    buffer: list[str] = []
+    open_fence = ""  # delimiter that opened the current code fence, if any
+
+    for line in text.splitlines():
+        fence_match = _FENCE_RE.match(line)
+        if fence_match:
+            marker = fence_match.group("fence")
+            if not open_fence:
+                open_fence = marker
+            elif marker[0] == open_fence[0] and len(marker) >= len(open_fence):
+                open_fence = ""
+            buffer.append(line)
+            continue
+        heading_match = None if open_fence else _HEADING_RE.match(line)
+        if heading_match is None:
+            buffer.append(line)
+            continue
+        body = "\n".join(buffer).strip()
+        if body:
+            sections.append((current_heading, body))
+        current_heading = (heading_match.group("title") or "").strip() or current_heading
+        buffer = []
+
+    body = "\n".join(buffer).strip()
+    if body:
+        sections.append((current_heading, body))
+    return sections
diff --git a/tests/test_generators.py b/tests/test_generators.py
new file mode 100644
index 0000000..30509b0
--- /dev/null
+++ b/tests/test_generators.py
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+from afs_scawful.generators import DocSectionConfig, DocSectionGenerator, write_jsonl
+from afs_scawful.generators.doc_sections import _split_sections
+
+
+def test_doc_section_generator_basic(tmp_path: Path) -> None:
+    doc_path = tmp_path / "guide.md"
+    doc_path.write_text(
+        "# Intro\n\nThis is a short intro section.\n\n# Details\n\nMore details here.\n",
+        encoding="utf-8",
+    )
+
+    config = DocSectionConfig(min_chars=10, max_chars=200)
+    generator = DocSectionGenerator(resource_roots=[tmp_path], config=config)
+    result = generator.generate()
+
+    assert result.samples
+    assert result.samples[0].domain == "docs"
+
+
+def test_split_sections_ignores_hashes_in_code_fences() -> None:
+    text = (
+        "# Setup\n\n```sh\n# install deps\npip install x\n```\n\n"
+        "#notaheading\n\n## Usage\n\nRun it.\n"
+    )
+    sections = _split_sections(Path("guide.md"), text)
+    assert [heading for heading, _ in sections] == ["Setup", "Usage"]
+    assert "# install deps" in sections[0][1]
+    assert "#notaheading" in sections[0][1]
+
+
+def test_split_sections_drops_empty_sections() -> None:
+    sections = _split_sections(Path("guide.md"), "# Empty\n\n\n# Full\n\nBody.\n")
+    assert sections == [("Full", "Body.")]
+
+
+def test_config_rejects_inverted_limits() -> None:
+    with pytest.raises(ValueError):
+        DocSectionConfig(min_chars=50, max_chars=10)
+
+
+def test_write_jsonl_empty_input_writes_empty_file(tmp_path: Path) -> None:
+    output = write_jsonl([], tmp_path / "nested" / "out.jsonl")
+    assert output.read_text(encoding="utf-8") == ""