core: add doc-section generator

This commit is contained in:
scawful
2025-12-30 16:15:14 -05:00
parent 18dcfe51d2
commit 162ec68583
5 changed files with 250 additions and 0 deletions

View File

@@ -8,6 +8,7 @@ import json
from pathlib import Path
from typing import Iterable
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
from .resource_index import ResourceIndexer
from .paths import resolve_datasets_root, resolve_index_root
@@ -86,6 +87,32 @@ def _validators_run_command(args: argparse.Namespace) -> int:
    return 0 if overall_ok else 1
def _generators_doc_sections_command(args: argparse.Namespace) -> int:
    """Handle the ``generators doc-sections`` CLI subcommand.

    Builds a DocSectionGenerator from the parsed CLI arguments, writes the
    generated samples as JSONL, and reports a short summary on stdout.

    Returns 0 on a clean run, 1 if any file failed to process.
    """
    idx_path = Path(args.index).expanduser().resolve() if args.index else None
    root_paths = (
        [Path(entry).expanduser().resolve() for entry in args.root]
        if args.root
        else None
    )
    generator = DocSectionGenerator(
        resource_index=idx_path,
        resource_roots=root_paths,
        config=DocSectionConfig(min_chars=args.min_chars, max_chars=args.max_chars),
    )
    result = generator.generate()
    # Default output lives under the training index root.
    if args.output:
        destination = Path(args.output).expanduser().resolve()
    else:
        destination = resolve_index_root() / "doc_sections.jsonl"
    write_jsonl(result.samples, destination)
    print(f"doc_sections: {destination}")
    print(
        f"samples={len(result.samples)} skipped={result.skipped} errors={len(result.errors)}"
    )
    # Show at most the first five errors to keep output readable.
    for err in result.errors[:5]:
        print(f"error: {err}")
    return 1 if result.errors else 0
def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog="afs_scawful")
    subparsers = parser.add_subparsers(dest="command")
@@ -135,6 +162,39 @@ def build_parser() -> argparse.ArgumentParser:
    )
    validators_run.set_defaults(func=_validators_run_command)
generators_parser = subparsers.add_parser("generators", help="Generator tools.")
generators_sub = generators_parser.add_subparsers(dest="generators_command")
doc_sections = generators_sub.add_parser(
"doc-sections", help="Generate samples from documentation."
)
doc_sections.add_argument(
"--index",
help="Resource index path override (optional).",
)
doc_sections.add_argument(
"--root",
action="append",
help="Resource root override (repeatable).",
)
doc_sections.add_argument(
"--output",
help="Output JSONL path (default: training index/doc_sections.jsonl).",
)
doc_sections.add_argument(
"--min-chars",
type=int,
default=120,
help="Minimum section length to keep.",
)
doc_sections.add_argument(
"--max-chars",
type=int,
default=2000,
help="Maximum section length to keep.",
)
doc_sections.set_defaults(func=_generators_doc_sections_command)
    return parser
@@ -153,6 +213,9 @@ def main(argv: Iterable[str] | None = None) -> int:
    if args.command == "validators" and not getattr(args, "validators_command", None):
        parser.print_help()
        return 1
if args.command == "generators" and not getattr(args, "generators_command", None):
parser.print_help()
return 1
    return args.func(args)

View File

@@ -0,0 +1,12 @@
"""Generator registry for AFS Scawful."""
from .base import BaseGenerator, GenerationResult, write_jsonl
from .doc_sections import DocSectionConfig, DocSectionGenerator
__all__ = [
"BaseGenerator",
"DocSectionConfig",
"DocSectionGenerator",
"GenerationResult",
"write_jsonl",
]

View File

@@ -0,0 +1,44 @@
"""Generator base classes for AFS Scawful."""
from __future__ import annotations
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Iterable
from ..training import TrainingSample
@dataclass
class GenerationResult:
    """Outcome of a single generator run.

    Carries the produced training samples, human-readable error strings
    (one per failed file), the count of files that yielded no samples,
    and an ISO-8601 timestamp for when the result was created.
    """

    samples: list[TrainingSample] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
    skipped: int = 0
    # Timezone-aware timestamp (local UTC offset attached). The previous
    # naive datetime.now() emitted an ambiguous, zone-less string.
    generated_at: str = field(
        default_factory=lambda: datetime.now().astimezone().isoformat()
    )

    def to_dict(self) -> dict[str, object]:
        """Return a JSON-serializable snapshot of this result."""
        return {
            "samples": [sample.to_dict() for sample in self.samples],
            "errors": list(self.errors),
            "skipped": self.skipped,
            "generated_at": self.generated_at,
        }
class BaseGenerator(ABC):
    """Common base class for training-sample generators.

    Subclasses implement :meth:`generate` and inherit the ``name`` /
    ``domain`` identity attributes assigned here.
    """

    def __init__(self, name: str, domain: str) -> None:
        # Identity metadata for the generator instance.
        self.name, self.domain = name, domain

    @abstractmethod
    def generate(self) -> GenerationResult:
        """Produce a GenerationResult; must be overridden."""
        raise NotImplementedError
def write_jsonl(samples: Iterable[TrainingSample], output_path: Path) -> Path:
    """Serialize *samples* as JSON Lines to *output_path*.

    Creates parent directories as needed. Each sample contributes one
    line via its ``to_jsonl_entry()``; a trailing newline is appended
    only when there is content.

    Returns the path written, for call-chaining convenience.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    lines = [sample.to_jsonl_entry() for sample in samples]
    # Fix: an empty sample list used to produce a file holding a single
    # blank line ("\n".join([]) + "\n"); emit a truly empty file instead.
    payload = "\n".join(lines) + "\n" if lines else ""
    output_path.write_text(payload, encoding="utf-8")
    return output_path

View File

@@ -0,0 +1,111 @@
"""Generate training samples from documentation sections."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
from ..resource_index import ResourceIndexer
from ..training import TrainingSample
from .base import BaseGenerator, GenerationResult
@dataclass
class DocSectionConfig:
    """Tunable limits for extracting documentation sections.

    Sections shorter than ``min_chars`` are dropped and longer ones are
    truncated to ``max_chars`` by the generator. ``file_globs`` selects
    candidate files when scanning resource roots (it is not consulted
    when files come from a prebuilt resource index).
    """

    # Minimum section length, in characters, required to keep a section.
    min_chars: int = 120
    # Sections longer than this are truncated to this many characters.
    max_chars: int = 2000
    # Glob patterns used when scanning resource roots for documents.
    file_globs: tuple[str, ...] = ("**/*.md", "**/*.txt")
class DocSectionGenerator(BaseGenerator):
    """Build training samples by extracting sections from docs."""

    def __init__(
        self,
        *,
        resource_index: Path | None = None,
        resource_roots: list[Path] | None = None,
        config: DocSectionConfig | None = None,
    ) -> None:
        super().__init__(name="DocSectionGenerator", domain="docs")
        self.resource_index = resource_index
        self.resource_roots = resource_roots
        self.config = config if config is not None else DocSectionConfig()

    def generate(self) -> GenerationResult:
        """Walk every candidate file, accumulating samples and errors."""
        outcome = GenerationResult()
        for doc_path in self._collect_files():
            try:
                extracted = self._samples_from_file(doc_path)
            except Exception as exc:  # record the failure, keep processing
                outcome.errors.append(f"{doc_path}: {exc}")
                continue
            if extracted:
                outcome.samples.extend(extracted)
            else:
                outcome.skipped += 1
        return outcome

    def _collect_files(self) -> list[Path]:
        """Resolve candidate files from an index, else by scanning roots."""
        if self.resource_index:
            loaded = ResourceIndexer(index_path=self.resource_index).load_index()
            if loaded:
                return [entry.path for entry in loaded.files]
        # No (usable) index: scan the configured roots with the glob set.
        scanner = ResourceIndexer(
            resource_roots=self.resource_roots,
            search_patterns=list(self.config.file_globs),
        )
        return [entry.path for entry in scanner.build_index().files]

    def _samples_from_file(self, path: Path) -> list[TrainingSample]:
        """Turn one document into zero or more training samples."""
        if not (path.exists() and path.is_file()):
            return []
        raw = path.read_text(encoding="utf-8", errors="ignore")
        collected: list[TrainingSample] = []
        for heading, body in _split_sections(path, raw):
            body = body.strip()
            if len(body) < self.config.min_chars:
                continue  # too short to be a useful sample
            if len(body) > self.config.max_chars:
                body = body[: self.config.max_chars].rstrip()
            collected.append(
                TrainingSample(
                    instruction=f"Extract the documentation section '{heading}'.",
                    input=f"source: {path.name}",
                    output=body,
                    domain=self.domain,
                    source=str(path),
                    metadata={"heading": heading, "path": str(path)},
                )
            )
        return collected
def _split_sections(path: Path, text: str) -> list[tuple[str, str]]:
if path.suffix.lower() not in {".md", ".markdown"}:
content = text.strip()
if not content:
return []
return [(path.stem, content)]
sections: list[tuple[str, str]] = []
current_heading = path.stem
buffer: list[str] = []
for line in text.splitlines():
stripped = line.strip()
if stripped.startswith("#"):
if buffer:
sections.append((current_heading, "\n".join(buffer).strip()))
current_heading = stripped.lstrip("#").strip() or current_heading
buffer = []
else:
buffer.append(line)
if buffer:
sections.append((current_heading, "\n".join(buffer).strip()))
return sections

20
tests/test_generators.py Normal file
View File

@@ -0,0 +1,20 @@
from __future__ import annotations
from pathlib import Path
from afs_scawful.generators import DocSectionConfig, DocSectionGenerator
def test_doc_section_generator_basic(tmp_path: Path) -> None:
    """Generator should emit docs-domain samples from a markdown file."""
    sample_doc = tmp_path / "guide.md"
    sample_doc.write_text(
        "# Intro\n\nThis is a short intro section.\n\n# Details\n\nMore details here.\n",
        encoding="utf-8",
    )
    gen = DocSectionGenerator(
        resource_roots=[tmp_path],
        config=DocSectionConfig(min_chars=10, max_chars=200),
    )
    outcome = gen.generate()
    assert outcome.samples
    assert outcome.samples[0].domain == "docs"