core: add doc-section generator
This commit is contained in:
@@ -8,6 +8,7 @@ import json
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
|
||||
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
|
||||
from .resource_index import ResourceIndexer
|
||||
from .paths import resolve_datasets_root, resolve_index_root
|
||||
@@ -86,6 +87,32 @@ def _validators_run_command(args: argparse.Namespace) -> int:
|
||||
return 0 if overall_ok else 1
|
||||
|
||||
|
||||
def _generators_doc_sections_command(args: argparse.Namespace) -> int:
    """Run the doc-sections generator from parsed CLI arguments.

    Resolves optional index/root/output overrides, generates samples,
    writes them to a JSONL file, and prints a short summary.  Returns 0
    on a clean run and 1 when any file produced an error.
    """
    def _resolved(raw: str) -> Path:
        # Normalize a user-supplied path: ~ expansion plus absolutization.
        return Path(raw).expanduser().resolve()

    index_path = None
    if args.index:
        index_path = _resolved(args.index)
    roots = None
    if args.root:
        roots = [_resolved(raw) for raw in args.root]

    generator = DocSectionGenerator(
        resource_index=index_path,
        resource_roots=roots,
        config=DocSectionConfig(min_chars=args.min_chars, max_chars=args.max_chars),
    )
    result = generator.generate()

    if args.output:
        output_path = _resolved(args.output)
    else:
        output_path = resolve_index_root() / "doc_sections.jsonl"
    write_jsonl(result.samples, output_path)

    print(f"doc_sections: {output_path}")
    print(
        f"samples={len(result.samples)} skipped={result.skipped} errors={len(result.errors)}"
    )
    # Surface at most five errors so a broken corpus does not flood the console.
    for err in result.errors[:5]:
        print(f"error: {err}")
    return 1 if result.errors else 0
|
||||
|
||||
|
||||
def build_parser() -> argparse.ArgumentParser:
|
||||
parser = argparse.ArgumentParser(prog="afs_scawful")
|
||||
subparsers = parser.add_subparsers(dest="command")
|
||||
@@ -135,6 +162,39 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
)
|
||||
validators_run.set_defaults(func=_validators_run_command)
|
||||
|
||||
generators_parser = subparsers.add_parser("generators", help="Generator tools.")
|
||||
generators_sub = generators_parser.add_subparsers(dest="generators_command")
|
||||
|
||||
doc_sections = generators_sub.add_parser(
|
||||
"doc-sections", help="Generate samples from documentation."
|
||||
)
|
||||
doc_sections.add_argument(
|
||||
"--index",
|
||||
help="Resource index path override (optional).",
|
||||
)
|
||||
doc_sections.add_argument(
|
||||
"--root",
|
||||
action="append",
|
||||
help="Resource root override (repeatable).",
|
||||
)
|
||||
doc_sections.add_argument(
|
||||
"--output",
|
||||
help="Output JSONL path (default: training index/doc_sections.jsonl).",
|
||||
)
|
||||
doc_sections.add_argument(
|
||||
"--min-chars",
|
||||
type=int,
|
||||
default=120,
|
||||
help="Minimum section length to keep.",
|
||||
)
|
||||
doc_sections.add_argument(
|
||||
"--max-chars",
|
||||
type=int,
|
||||
default=2000,
|
||||
help="Maximum section length to keep.",
|
||||
)
|
||||
doc_sections.set_defaults(func=_generators_doc_sections_command)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
@@ -153,6 +213,9 @@ def main(argv: Iterable[str] | None = None) -> int:
|
||||
if args.command == "validators" and not getattr(args, "validators_command", None):
|
||||
parser.print_help()
|
||||
return 1
|
||||
if args.command == "generators" and not getattr(args, "generators_command", None):
|
||||
parser.print_help()
|
||||
return 1
|
||||
return args.func(args)
|
||||
|
||||
|
||||
|
||||
12
src/afs_scawful/generators/__init__.py
Normal file
12
src/afs_scawful/generators/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""Generator registry for AFS Scawful."""
|
||||
|
||||
from .base import BaseGenerator, GenerationResult, write_jsonl
|
||||
from .doc_sections import DocSectionConfig, DocSectionGenerator
|
||||
|
||||
__all__ = [
|
||||
"BaseGenerator",
|
||||
"DocSectionConfig",
|
||||
"DocSectionGenerator",
|
||||
"GenerationResult",
|
||||
"write_jsonl",
|
||||
]
|
||||
44
src/afs_scawful/generators/base.py
Normal file
44
src/afs_scawful/generators/base.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""Generator base classes for AFS Scawful."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from ..training import TrainingSample
|
||||
|
||||
|
||||
@dataclass
class GenerationResult:
    """Outcome of a single generator run.

    Attributes:
        samples: Training samples produced by the run.
        errors: Human-readable error strings, one per failed input.
        skipped: Number of inputs that yielded no samples.
        generated_at: Timezone-aware ISO-8601 timestamp of creation.
    """

    samples: list[TrainingSample] = field(default_factory=list)
    errors: list[str] = field(default_factory=list)
    skipped: int = 0
    # .astimezone() attaches the local UTC offset, so the timestamp is
    # unambiguous; a bare datetime.now() would record a naive value with
    # no timezone information.
    generated_at: str = field(
        default_factory=lambda: datetime.now().astimezone().isoformat()
    )

    def to_dict(self) -> dict[str, object]:
        """Return a JSON-serializable view of this result."""
        return {
            "samples": [sample.to_dict() for sample in self.samples],
            "errors": list(self.errors),
            "skipped": self.skipped,
            "generated_at": self.generated_at,
        }
|
||||
|
||||
|
||||
class BaseGenerator(ABC):
    """Common base class for training-sample generators.

    Subclasses implement :meth:`generate` and identify themselves by a
    human-readable ``name`` plus the ``domain`` their samples belong to.
    """

    def __init__(self, name: str, domain: str) -> None:
        """Store the generator's identifying name and sample domain."""
        self.domain = domain
        self.name = name

    @abstractmethod
    def generate(self) -> GenerationResult:
        """Produce a :class:`GenerationResult`; must be overridden."""
        raise NotImplementedError
|
||||
|
||||
|
||||
def write_jsonl(samples: Iterable[TrainingSample], output_path: Path) -> Path:
    """Serialize *samples* to *output_path* as JSON Lines.

    Creates parent directories as needed.  Each sample contributes one
    newline-terminated line via its ``to_jsonl_entry``; an empty
    iterable yields an empty file.

    Returns:
        The path written to, for caller convenience.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # One trailing newline per record; "".join handles zero samples
    # without producing a spurious blank line.
    text = "".join(sample.to_jsonl_entry() + "\n" for sample in samples)
    output_path.write_text(text, encoding="utf-8")
    return output_path
|
||||
111
src/afs_scawful/generators/doc_sections.py
Normal file
111
src/afs_scawful/generators/doc_sections.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""Generate training samples from documentation sections."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from ..resource_index import ResourceIndexer
|
||||
from ..training import TrainingSample
|
||||
from .base import BaseGenerator, GenerationResult
|
||||
|
||||
|
||||
@dataclass
class DocSectionConfig:
    """Tunable limits for documentation-section extraction.

    Attributes:
        min_chars: Sections shorter than this are dropped.
        max_chars: Longer sections are truncated to this many characters.
        file_globs: Glob patterns used to discover documentation files.
    """

    min_chars: int = 120
    max_chars: int = 2000
    file_globs: tuple[str, ...] = ("**/*.md", "**/*.txt")
|
||||
|
||||
|
||||
class DocSectionGenerator(BaseGenerator):
    """Turn documentation files into instruction-style training samples.

    Files come either from a prebuilt resource index or from scanning
    the configured resource roots; each section that fits the configured
    length bounds becomes one sample.
    """

    def __init__(
        self,
        *,
        resource_index: Path | None = None,
        resource_roots: list[Path] | None = None,
        config: DocSectionConfig | None = None,
    ) -> None:
        """Remember the index/root overrides and extraction limits."""
        super().__init__(name="DocSectionGenerator", domain="docs")
        self.resource_index = resource_index
        self.resource_roots = resource_roots
        self.config = config if config is not None else DocSectionConfig()

    def generate(self) -> GenerationResult:
        """Extract samples from every discovered documentation file.

        Per-file failures are recorded as error strings rather than
        aborting the run; files yielding no samples count as skipped.
        """
        outcome = GenerationResult()
        for doc_path in self._collect_files():
            try:
                extracted = self._samples_from_file(doc_path)
            except Exception as exc:  # best-effort: keep going on bad files
                outcome.errors.append(f"{doc_path}: {exc}")
                continue
            outcome.samples.extend(extracted)
            if not extracted:
                outcome.skipped += 1
        return outcome

    def _collect_files(self) -> list[Path]:
        """Return candidate files, preferring a saved resource index."""
        if self.resource_index:
            loaded = ResourceIndexer(index_path=self.resource_index).load_index()
            if loaded:
                return [entry.path for entry in loaded.files]
        # No usable index: scan the roots with the configured globs.
        built = ResourceIndexer(
            resource_roots=self.resource_roots,
            search_patterns=list(self.config.file_globs),
        ).build_index()
        return [entry.path for entry in built.files]

    def _samples_from_file(self, path: Path) -> list[TrainingSample]:
        """Split one file into sections and wrap the keepers as samples."""
        if not (path.exists() and path.is_file()):
            return []
        raw = path.read_text(encoding="utf-8", errors="ignore")
        collected: list[TrainingSample] = []
        for heading, body in _split_sections(path, raw):
            body = body.strip()
            if len(body) < self.config.min_chars:
                continue  # too short to be a useful sample
            if len(body) > self.config.max_chars:
                body = body[: self.config.max_chars].rstrip()
            collected.append(
                TrainingSample(
                    instruction=f"Extract the documentation section '{heading}'.",
                    input=f"source: {path.name}",
                    output=body,
                    domain=self.domain,
                    source=str(path),
                    metadata={"heading": heading, "path": str(path)},
                )
            )
        return collected
|
||||
|
||||
|
||||
def _split_sections(path: Path, text: str) -> list[tuple[str, str]]:
|
||||
if path.suffix.lower() not in {".md", ".markdown"}:
|
||||
content = text.strip()
|
||||
if not content:
|
||||
return []
|
||||
return [(path.stem, content)]
|
||||
|
||||
sections: list[tuple[str, str]] = []
|
||||
current_heading = path.stem
|
||||
buffer: list[str] = []
|
||||
|
||||
for line in text.splitlines():
|
||||
stripped = line.strip()
|
||||
if stripped.startswith("#"):
|
||||
if buffer:
|
||||
sections.append((current_heading, "\n".join(buffer).strip()))
|
||||
current_heading = stripped.lstrip("#").strip() or current_heading
|
||||
buffer = []
|
||||
else:
|
||||
buffer.append(line)
|
||||
|
||||
if buffer:
|
||||
sections.append((current_heading, "\n".join(buffer).strip()))
|
||||
return sections
|
||||
20
tests/test_generators.py
Normal file
20
tests/test_generators.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from afs_scawful.generators import DocSectionConfig, DocSectionGenerator
|
||||
|
||||
|
||||
def test_doc_section_generator_basic(tmp_path: Path) -> None:
    """A two-section markdown file yields docs-domain samples."""
    (tmp_path / "guide.md").write_text(
        "# Intro\n\nThis is a short intro section.\n\n# Details\n\nMore details here.\n",
        encoding="utf-8",
    )

    generator = DocSectionGenerator(
        resource_roots=[tmp_path],
        config=DocSectionConfig(min_chars=10, max_chars=200),
    )
    result = generator.generate()

    assert result.samples
    assert result.samples[0].domain == "docs"
|
||||
Reference in New Issue
Block a user