diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..10b940d
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__/
+*.pyc
+.pytest_cache/
+.DS_Store
diff --git a/README.md b/README.md
index 5e9e3b4..7cd6590 100644
--- a/README.md
+++ b/README.md
@@ -11,12 +11,16 @@ Docs:
 - `docs/ROADMAP.md`
 - `docs/REPO_FACTS.json`
 - `docs/TRAINING_ROADMAP.md`
+- `docs/TRAINING_PLAN.md`
+- `docs/PDF_WORKFLOW.md`
 
 Quickstart:
 - `python -m afs_scawful datasets index`
 - `python -m afs_scawful resources index`
 - `python -m afs_scawful validators list`
 - `python -m afs_scawful generators doc-sections --output ~/src/training/index/doc_sections.jsonl`
+- `python -m afs_scawful research catalog`
+- `python -m afs_scawful research list`
 
 Mounts (AFS Studio):
 - Create `mounts.json` in `~/.config/afs/afs_scawful/` or `~/.config/afs/plugins/afs_scawful/config/`
diff --git a/docs/PDF_WORKFLOW.md b/docs/PDF_WORKFLOW.md
new file mode 100644
index 0000000..3fb5b90
--- /dev/null
+++ b/docs/PDF_WORKFLOW.md
@@ -0,0 +1,34 @@
+# PDF Workflow
+
+Goal: keep research PDFs in a known place, catalog them, and open them fast.
+
+## Defaults
+- Research root: `~/Documents/Research`
+- Catalog output: `~/src/context/index/research_catalog.json`
+
+## Commands
+```sh
+python -m afs_scawful research catalog
+python -m afs_scawful research list
+python -m afs_scawful research show 2512-20957v2-XXXXXXXX
+python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
+```
+
+## Overrides
+- `AFS_RESEARCH_ROOT=/path/to/Research`
+- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
+- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
+  `~/.config/afs/plugins/afs_scawful/config/`
+
+Example `research_paths.toml`:
+```toml
+[paths]
+research_root = "~/Documents/Research"
+research_catalog = "~/src/context/index/research_catalog.json"
+```
+
+## Notes
+- Abstract excerpts are auto-extracted from the first pages; verify before quoting.
+- `--open` uses the OS default PDF viewer (Preview on macOS).
+- For richer metadata extraction, install the optional dependency:
+  `pip install -e '.[research]'`
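Each entry in the generated catalog carries the fields assembled by `build_paper_entry` in `src/afs_scawful/research.py` later in this patch. As a point of reference, a minimal sketch of one entry (values and the id suffix are illustrative, not taken from a real catalog):

```python
# Illustrative catalog entry; field names match build_paper_entry, values are made up.
entry = {
    "id": "2512-20957v2-1a2b3c4d",   # slugified relative path + 8-char SHA-1 digest
    "path": "/Users/me/Documents/Research/2512.20957v2.pdf",
    "relative_path": "2512.20957v2.pdf",
    "filename": "2512.20957v2.pdf",
    "title": "Example Paper Title",
    "author": "Jane Doe",
    "subject": None,
    "keywords": None,
    "page_count": 12,
    "file_size": 123456,
    "modified_time": "2025-01-01T00:00:00+00:00",
    "abstract_excerpt": "We study ...",
    "abstract_source": "pypdf",       # "none" when no excerpt was extracted
    "metadata_source": "pypdf",       # falls back to "regex", then "none"
}
```

The catalog file wraps a list of such entries together with `schema_version`, `generated_at`, `root`, `count`, and an optional `errors` list.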
diff --git a/docs/STATUS.md b/docs/STATUS.md
index 935f277..7a80e98 100644
--- a/docs/STATUS.md
+++ b/docs/STATUS.md
@@ -1,7 +1,7 @@
 # STATUS
 
 Stage: Prototype
-Now: config helpers; dataset registry builder; resource indexer; training sample model; validator base + initial validators; doc-section generator; pytest coverage.
+Now: config helpers; dataset registry builder; resource indexer; training sample model; validator base + initial validators; doc-section generator; research catalog CLI + PDF workflow docs; pytest coverage.
 Not yet: more generators; training runner; dataset QA reports.
 Next: add generator QA summary + manifest; wire generator outputs into AFS Studio.
 Issues: no training runtime yet.
diff --git a/docs/TRAINING_PLAN.md b/docs/TRAINING_PLAN.md
new file mode 100644
index 0000000..8c5f146
--- /dev/null
+++ b/docs/TRAINING_PLAN.md
@@ -0,0 +1,48 @@
+# Training Plan (AFS Scawful)
+
+Scope: local-only training data pipelines and evaluation for AFS workflows.
+Research-only. See `../afs/docs/RESEARCH_SOURCES.md` for citations.
+
+## Goals
+- Keep datasets reproducible, small, and auditable.
+- Prioritize agentic filesystem primitives before model training complexity.
+- Use evaluation loops to avoid training on noise.
+
+## Phase 0 — Inventory + Research Catalog (now)
+- Use `afs_scawful research catalog` to index `~/Documents/Research`.
+- Keep the catalog JSON in `~/src/context/index/research_catalog.json`.
+- Verify metadata/abstract excerpts before quoting. [R1]
+
+## Phase 1 — Dataset QA (near-term)
+- Expand dataset registry with QA summaries (counts, schema drift, invalid rows).
+- Define a minimal JSON schema for training samples.
+- Track provenance per dataset and per generator. [R1]
+
+## Phase 2 — Task Design (near-term)
+- Start with repo-level navigation tasks that assume a small tool surface. [R3]
+- Keep tasks focused on file discovery, symbol lookup, and context assembly.
+- Use small, deterministic datasets to validate task framing before scaling.
+
+## Phase 3 — Context Packaging (mid-term)
+- Treat training samples as explicit context pipelines with clear state and error
+  propagation. [R4]
+- Build a minimal "context transcript" format (inputs, tool calls, outputs).
+
+## Phase 4 — Evaluation (mid-term)
+- Add human+agent evaluation metrics to avoid overfitting to synthetic tasks. [R7]
+- Include tone-variant prompts as a controlled ablation (optional). [R6]
+
+## Phase 5 — Efficiency References (later)
+- Use MoE efficiency papers only when scaling becomes a bottleneck. [R5]
+
+## Unknown / needs verification
+- Which tasks best reflect AFS workflows (agentic filesystem vs orchestration).
+- Whether RL is needed or if supervised data is sufficient for early stages.
+
+## Citations
+- [R1] `../afs/docs/RESEARCH_SOURCES.md`
+- [R3] `../afs/docs/RESEARCH_SOURCES.md`
+- [R4] `../afs/docs/RESEARCH_SOURCES.md`
+- [R5] `../afs/docs/RESEARCH_SOURCES.md`
+- [R6] `../afs/docs/RESEARCH_SOURCES.md`
+- [R7] `../afs/docs/RESEARCH_SOURCES.md`
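Phase 3's "context transcript" format is not specified anywhere yet; as a rough sketch only (all field names below are assumptions, not a committed schema), one record covering inputs, tool calls, and outputs could look like:

```python
# Hypothetical Phase 3 "context transcript" record (assumed field names);
# intended as one JSON object per line in a JSONL file.
transcript = {
    "task_id": "navigate-0001",
    "inputs": {"repo": "afs_scawful", "query": "where is the dataset registry built?"},
    "tool_calls": [
        {"tool": "list_dir", "args": {"path": "src/afs_scawful"}, "result": ["registry.py", "cli.py"]},
        {"tool": "read_file", "args": {"path": "src/afs_scawful/registry.py"}, "result": "<file text>"},
    ],
    "outputs": {"answer": "build_dataset_registry in src/afs_scawful/registry.py"},
    "provenance": {"generator": "doc-sections", "source_commit": "<sha>"},
}
```

Whatever shape the format ends up taking, keeping explicit inputs, tool calls, outputs, and provenance per record is what Phases 1 and 3 ask for.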
diff --git a/docs/TRAINING_ROADMAP.md b/docs/TRAINING_ROADMAP.md
index 5f7d72f..8ae0eb7 100644
--- a/docs/TRAINING_ROADMAP.md
+++ b/docs/TRAINING_ROADMAP.md
@@ -5,6 +5,7 @@ Scope: AFS Scawful training data pipelines and monitoring. Research-only.
 ## Committed (exists now)
 - Dataset registry indexing (local)
 - Resource indexing (local)
+- Research PDF catalog (local)
 - Plugin config loader for training paths/resources
 - Validator base + initial validators (ASM/C++/KG/ASAR)
 - Generator base + doc-section generator
diff --git a/pyproject.toml b/pyproject.toml
index 1edf202..40bbb93 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,6 +12,9 @@ authors = [
 test = [
     "pytest>=7.4"
 ]
+research = [
+    "pypdf>=4.0"
+]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
diff --git a/src/afs_scawful/__init__.py b/src/afs_scawful/__init__.py
index 6f4a5e6..1efd7d6 100644
--- a/src/afs_scawful/__init__.py
+++ b/src/afs_scawful/__init__.py
@@ -2,17 +2,30 @@
 
 __version__ = "0.0.0"
 
-from .config import load_training_paths, load_training_resources
+from .config import load_research_paths, load_training_paths, load_training_resources
 from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
+from .research import (
+    build_research_catalog,
+    load_research_catalog,
+    resolve_research_catalog_path,
+    resolve_research_root,
+    write_research_catalog,
+)
 from .registry import build_dataset_registry, index_datasets, write_dataset_registry
 from .resource_index import ResourceIndexer
 
 __all__ = [
+    "load_research_paths",
     "load_training_paths",
     "load_training_resources",
     "resolve_training_root",
     "resolve_datasets_root",
     "resolve_index_root",
+    "resolve_research_root",
+    "resolve_research_catalog_path",
+    "build_research_catalog",
+    "write_research_catalog",
+    "load_research_catalog",
     "build_dataset_registry",
     "write_dataset_registry",
     "index_datasets",
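Because the catalog helpers are re-exported from the package root, the CLI is optional; a minimal programmatic sketch of the same flow (paths resolve via the usual env var / `research_paths.toml` / default order):

```python
# Build and persist a research catalog without the CLI, using only the re-exported API.
from afs_scawful import (
    build_research_catalog,
    resolve_research_catalog_path,
    resolve_research_root,
    write_research_catalog,
)

# AFS_RESEARCH_ROOT, then research_paths.toml, then ~/Documents/Research
root = resolve_research_root()
catalog = build_research_catalog(root, include_abstract=True, max_pages=2)
write_research_catalog(catalog, resolve_research_catalog_path())
print(f"indexed {catalog['count']} papers under {catalog['root']}")
```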
diff --git a/src/afs_scawful/cli.py b/src/afs_scawful/cli.py
index 8584c6b..d8a9900 100644
--- a/src/afs_scawful/cli.py
+++ b/src/afs_scawful/cli.py
@@ -12,6 +12,15 @@
 from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
 from .registry import build_dataset_registry, index_datasets, write_dataset_registry
 from .resource_index import ResourceIndexer
 from .paths import resolve_datasets_root, resolve_index_root
+from .research import (
+    build_research_catalog,
+    load_research_catalog,
+    open_pdf,
+    resolve_paper_path,
+    resolve_research_catalog_path,
+    resolve_research_root,
+    write_research_catalog,
+)
 from .training import TrainingSample
 from .validators import default_validators
@@ -113,6 +122,81 @@ def _generators_doc_sections_command(args: argparse.Namespace) -> int:
     return 0 if not result.errors else 1
 
 
+def _research_catalog_command(args: argparse.Namespace) -> int:
+    root = Path(args.root).expanduser().resolve() if args.root else resolve_research_root()
+    output_path = (
+        Path(args.output).expanduser().resolve()
+        if args.output
+        else resolve_research_catalog_path()
+    )
+    catalog = build_research_catalog(
+        root,
+        include_abstract=not args.no_abstract,
+        max_pages=args.max_pages,
+        max_abstract_chars=args.max_abstract_chars,
+    )
+    write_research_catalog(catalog, output_path)
+    print(f"research_catalog: {output_path}")
+    errors = catalog.get("errors", [])
+    print(f"papers={catalog.get('count', 0)} errors={len(errors)}")
+    for err in errors[:5]:
+        print(f"error: {err.get('path')}: {err.get('error')}")
+    return 0 if not errors else 1
+
+
+def _research_list_command(args: argparse.Namespace) -> int:
+    catalog_path = (
+        Path(args.catalog).expanduser().resolve()
+        if args.catalog
+        else resolve_research_catalog_path()
+    )
+    catalog = load_research_catalog(catalog_path)
+    for entry in catalog.get("papers", []):
+        if not isinstance(entry, dict):
+            continue
+        title = entry.get("title") or "(untitled)"
+        print(f"{entry.get('id')}\t{title}\t{entry.get('relative_path')}")
+    return 0
+
+
+def _research_show_command(args: argparse.Namespace) -> int:
+    catalog_path = (
+        Path(args.catalog).expanduser().resolve()
+        if args.catalog
+        else resolve_research_catalog_path()
+    )
+    catalog = load_research_catalog(catalog_path)
+    entry_path = resolve_paper_path(catalog, args.paper_id)
+    if entry_path is None:
+        print(f"Paper not found: {args.paper_id}")
+        return 1
+    for entry in catalog.get("papers", []):
+        if entry.get("path") == str(entry_path):
+            print(json.dumps(entry, indent=2, sort_keys=True))
+            return 0
+    print(f"Paper not found: {args.paper_id}")
+    return 1
+
+
+def _research_open_command(args: argparse.Namespace) -> int:
+    catalog_path = (
+        Path(args.catalog).expanduser().resolve()
+        if args.catalog
+        else resolve_research_catalog_path()
+    )
+    catalog = load_research_catalog(catalog_path)
+    entry_path = resolve_paper_path(catalog, args.paper_id)
+    if entry_path is None:
+        print(f"Paper not found: {args.paper_id}")
+        return 1
+    print(str(entry_path))
+    if args.open:
+        if not open_pdf(entry_path):
+            print("Unable to open PDF with the default viewer.")
+            return 1
+    return 0
+
+
 def build_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(prog="afs_scawful")
     subparsers = parser.add_subparsers(dest="command")
@@ -195,6 +279,52 @@ def build_parser() -> argparse.ArgumentParser:
     )
     doc_sections.set_defaults(func=_generators_doc_sections_command)
 
+    research_parser = subparsers.add_parser("research", help="Research PDF tools.")
+    research_sub = research_parser.add_subparsers(dest="research_command")
+
+    research_catalog = research_sub.add_parser(
+        "catalog", help="Build a research PDF catalog."
+    )
+    research_catalog.add_argument("--root", help="Research root override.")
+    research_catalog.add_argument("--output", help="Output catalog path.")
+    research_catalog.add_argument(
+        "--no-abstract",
+        action="store_true",
+        help="Skip abstract extraction.",
+    )
+    research_catalog.add_argument(
+        "--max-pages",
+        type=int,
+        default=2,
+        help="Max pages to scan for abstract extraction.",
+    )
+    research_catalog.add_argument(
+        "--max-abstract-chars",
+        type=int,
+        default=1200,
+        help="Max abstract characters to store.",
+    )
+    research_catalog.set_defaults(func=_research_catalog_command)
+
+    research_list = research_sub.add_parser("list", help="List catalog entries.")
+    research_list.add_argument("--catalog", help="Catalog path override.")
+    research_list.set_defaults(func=_research_list_command)
+
+    research_show = research_sub.add_parser("show", help="Show catalog entry JSON.")
+    research_show.add_argument("paper_id", help="Catalog id, path, or filename.")
+    research_show.add_argument("--catalog", help="Catalog path override.")
+    research_show.set_defaults(func=_research_show_command)
+
+    research_open = research_sub.add_parser("open", help="Print or open a PDF.")
+    research_open.add_argument("paper_id", help="Catalog id, path, or filename.")
+    research_open.add_argument("--catalog", help="Catalog path override.")
+    research_open.add_argument(
+        "--open",
+        action="store_true",
+        help="Open using the OS default viewer.",
+    )
+    research_open.set_defaults(func=_research_open_command)
+
     return parser
@@ -216,6 +346,9 @@ def main(argv: Iterable[str] | None = None) -> int:
     if args.command == "generators" and not getattr(args, "generators_command", None):
         parser.print_help()
         return 1
+    if args.command == "research" and not getattr(args, "research_command", None):
+        parser.print_help()
+        return 1
     return args.func(args)
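The new subcommands go through the same argparse entry point as the existing ones, so they can also be exercised in-process; exit codes match the CLI (for example, `research catalog` returns 1 if any PDF failed to index):

```python
# Drive the research subcommands through main() instead of the shell.
from afs_scawful.cli import main

main(["research", "catalog"])   # prints the catalog path plus paper/error counts
main(["research", "list"])      # prints "<id>\t<title>\t<relative_path>" per entry
```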
diff --git a/src/afs_scawful/config.py b/src/afs_scawful/config.py
index 50ee729..8ded93a 100644
--- a/src/afs_scawful/config.py
+++ b/src/afs_scawful/config.py
@@ -59,3 +59,16 @@ def load_training_resources(config_path: Path | None = None) -> dict[str, Any]:
             _expand_path(p) for p in resource["resource_roots"] if isinstance(p, str)
         ]
     return data
+
+
+def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str, Path]]:
+    path = config_path or _find_config("research_paths.toml")
+    data = _load_toml(path)
+    expanded: dict[str, dict[str, Path]] = {}
+    if "paths" in data and isinstance(data["paths"], dict):
+        expanded["paths"] = {
+            key: _expand_path(value)
+            for key, value in data["paths"].items()
+            if isinstance(value, str)
+        }
+    return expanded
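With the example `research_paths.toml` from `docs/PDF_WORKFLOW.md` in place, `load_research_paths` returns the `[paths]` table with its string values expanded to `Path` objects; roughly (home directory shown as `/Users/me` for illustration):

```python
# Sketch of the expanded mapping returned by load_research_paths for the documented example config.
from afs_scawful.config import load_research_paths

paths = load_research_paths()
# {"paths": {"research_root": Path("/Users/me/Documents/Research"),
#            "research_catalog": Path("/Users/me/src/context/index/research_catalog.json")}}
```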
diff --git a/src/afs_scawful/research.py b/src/afs_scawful/research.py
new file mode 100644
index 0000000..9880e83
--- /dev/null
+++ b/src/afs_scawful/research.py
@@ -0,0 +1,305 @@
+"""Research catalog utilities for AFS Scawful."""
+
+from __future__ import annotations
+
+import hashlib
+import json
+import os
+import re
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Iterable
+
+from .config import load_research_paths
+
+try:  # Optional dependency for richer metadata extraction.
+    from pypdf import PdfReader
+except Exception:  # pragma: no cover - optional import
+    PdfReader = None
+
+
+_PDF_FIELDS = ("Title", "Author", "Subject", "Keywords")
+
+
+def default_research_root() -> Path:
+    candidates = [
+        Path.home() / "Documents" / "Research",
+        Path.home() / "Documents" / "research",
+        Path.home() / "Research",
+    ]
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    return candidates[0]
+
+
+def resolve_research_root(config_path: Path | None = None) -> Path:
+    env_value = os.getenv("AFS_RESEARCH_ROOT")
+    if env_value:
+        return Path(env_value).expanduser().resolve()
+    data = load_research_paths(config_path=config_path)
+    paths = data.get("paths", {}) if isinstance(data, dict) else {}
+    root = paths.get("research_root") or paths.get("research")
+    if isinstance(root, Path):
+        return root
+    if isinstance(root, str) and root:
+        return Path(root).expanduser().resolve()
+    return default_research_root()
+
+
+def resolve_research_catalog_path(config_path: Path | None = None) -> Path:
+    env_value = os.getenv("AFS_RESEARCH_CATALOG")
+    if env_value:
+        return Path(env_value).expanduser().resolve()
+    data = load_research_paths(config_path=config_path)
+    paths = data.get("paths", {}) if isinstance(data, dict) else {}
+    catalog = paths.get("research_catalog") or paths.get("catalog")
+    if isinstance(catalog, Path):
+        return catalog
+    if isinstance(catalog, str) and catalog:
+        return Path(catalog).expanduser().resolve()
+    return Path.home() / "src" / "context" / "index" / "research_catalog.json"
+
+
+def iter_pdf_paths(root: Path) -> Iterable[Path]:
+    if not root.exists():
+        return []
+    paths: list[Path] = []
+    for path in root.rglob("*.pdf"):
+        if any(part.startswith(".") for part in path.parts):
+            continue
+        paths.append(path)
+    return sorted(paths)
+
+
+def _slugify(value: str) -> str:
+    slug = re.sub(r"[^a-z0-9]+", "-", value.lower())
+    return slug.strip("-")
+
+
+def make_paper_id(relative_path: Path) -> str:
+    stem = relative_path.with_suffix("").as_posix()
+    slug = _slugify(stem)
+    digest = hashlib.sha1(relative_path.as_posix().encode("utf-8")).hexdigest()[:8]
+    return f"{slug}-{digest}" if slug else digest
+
+
+def _normalize_meta_value(value: object) -> str | None:
+    if value is None:
+        return None
+    text = str(value).strip()
+    if not text or text.lower() == "none":
+        return None
+    return " ".join(text.split())
+
+
+def _parse_pdf_literal(value: bytes) -> str:
+    text = value.decode("latin-1", errors="ignore")
+    text = text.replace("\\(", "(").replace("\\)", ")")
+    text = text.replace("\\n", " ").replace("\\r", " ")
+    return " ".join(text.split()).strip()
+
+
+def _read_pdf_snippet(path: Path, limit_bytes: int = 2_000_000) -> bytes:
+    with path.open("rb") as handle:
+        return handle.read(limit_bytes)
+
+
+def _has_pdf_header(path: Path) -> bool:
+    try:
+        with path.open("rb") as handle:
+            return handle.read(5) == b"%PDF-"
+    except OSError:
+        return False
+
+
+def _extract_metadata_regex(path: Path) -> dict[str, str]:
+    data = _read_pdf_snippet(path)
+    meta: dict[str, str] = {}
+    for field in _PDF_FIELDS:
+        pattern = re.compile(rb"/" + field.encode("ascii") + rb"\s*\(([^)]{0,512})\)", re.IGNORECASE | re.DOTALL)
+        match = pattern.search(data)
+        if match:
+            meta[field.lower()] = _parse_pdf_literal(match.group(1))
+    return meta
+
+
+def extract_abstract_excerpt(text: str, max_chars: int = 1200) -> str | None:
+    if not text:
+        return None
+    match = re.search(r"\bAbstract\b", text, re.IGNORECASE)
+    if not match:
+        return None
+    snippet = text[match.end():].lstrip(":\n\r\t ")
+    end = re.search(r"\n\s*(?:\d+\s+Introduction|Introduction|Keywords|Index Terms|1\.|I\.)", snippet)
+    if end:
+        snippet = snippet[: end.start()]
+    snippet = " ".join(snippet.split()).strip()
+    if not snippet:
+        return None
+    if len(snippet) > max_chars:
+        snippet = snippet[:max_chars].rstrip() + "..."
+    return snippet
+
+
+def _extract_metadata_pypdf(
+    path: Path,
+    include_abstract: bool,
+    max_pages: int,
+    max_abstract_chars: int,
+) -> tuple[dict[str, str | None], int | None, str | None]:
+    if PdfReader is None:
+        raise RuntimeError("pypdf not available")
+    reader = PdfReader(str(path))
+    metadata = reader.metadata or {}
+    values = {
+        "title": _normalize_meta_value(metadata.get("/Title") or metadata.get("Title")),
+        "author": _normalize_meta_value(metadata.get("/Author") or metadata.get("Author")),
+        "subject": _normalize_meta_value(metadata.get("/Subject") or metadata.get("Subject")),
+        "keywords": _normalize_meta_value(metadata.get("/Keywords") or metadata.get("Keywords")),
+    }
+    abstract_excerpt = None
+    if include_abstract:
+        page_text: list[str] = []
+        for page in reader.pages[:max_pages]:
+            try:
+                page_text.append(page.extract_text() or "")
+            except Exception:
+                page_text.append("")
+        abstract_excerpt = extract_abstract_excerpt("\n".join(page_text), max_chars=max_abstract_chars)
+    return values, len(reader.pages), abstract_excerpt
+
+
+def build_paper_entry(
+    path: Path,
+    root: Path,
+    include_abstract: bool = True,
+    max_pages: int = 2,
+    max_abstract_chars: int = 1200,
+) -> dict[str, object]:
+    relative_path = path.relative_to(root)
+    entry = {
+        "id": make_paper_id(relative_path),
+        "path": str(path),
+        "relative_path": relative_path.as_posix(),
+        "filename": path.name,
+        "title": None,
+        "author": None,
+        "subject": None,
+        "keywords": None,
+        "page_count": None,
+        "file_size": path.stat().st_size,
+        "modified_time": datetime.fromtimestamp(path.stat().st_mtime, tz=timezone.utc).isoformat(),
+        "abstract_excerpt": None,
+        "abstract_source": "none",
+        "metadata_source": "none",
+    }
+
+    metadata: dict[str, str | None] = {}
+    page_count = None
+    abstract_excerpt = None
+    if PdfReader is not None and _has_pdf_header(path):
+        try:
+            metadata, page_count, abstract_excerpt = _extract_metadata_pypdf(
+                path,
+                include_abstract=include_abstract,
+                max_pages=max_pages,
+                max_abstract_chars=max_abstract_chars,
+            )
+            entry["metadata_source"] = "pypdf"
+        except Exception:
+            metadata = {}
+    if not metadata:
+        metadata = _extract_metadata_regex(path)
+        if metadata:
+            entry["metadata_source"] = "regex"
+
+    for key, value in metadata.items():
+        if value:
+            entry[key] = value
+    if page_count:
+        entry["page_count"] = page_count
+    if abstract_excerpt:
+        entry["abstract_excerpt"] = abstract_excerpt
+        entry["abstract_source"] = "pypdf"
+    return entry
+
+
+def build_research_catalog(
+    root: Path,
+    include_abstract: bool = True,
+    max_pages: int = 2,
+    max_abstract_chars: int = 1200,
+) -> dict[str, object]:
+    papers: list[dict[str, object]] = []
+    errors: list[dict[str, str]] = []
+    for path in iter_pdf_paths(root):
+        try:
+            entry = build_paper_entry(
+                path,
+                root=root,
+                include_abstract=include_abstract,
+                max_pages=max_pages,
+                max_abstract_chars=max_abstract_chars,
+            )
+        except Exception as exc:  # pragma: no cover - defensive
+            errors.append({"path": str(path), "error": str(exc)})
+            continue
+        papers.append(entry)
+
+    catalog: dict[str, object] = {
+        "schema_version": "1",
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "root": str(root),
+        "count": len(papers),
+        "papers": papers,
+    }
+    if errors:
+        catalog["errors"] = errors
+    return catalog
+
+
+def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(
+        json.dumps(catalog, indent=2, sort_keys=True),
+        encoding="utf-8",
+    )
+
+
+def load_research_catalog(path: Path) -> dict[str, object]:
+    payload = path.read_text(encoding="utf-8")
+    return json.loads(payload)
+
+
+def resolve_paper_path(catalog: dict[str, object], paper_id: str) -> Path | None:
+    candidate_path = Path(paper_id).expanduser()
+    if candidate_path.exists():
+        return candidate_path.resolve()
+
+    root_value = catalog.get("root")
+    root = Path(root_value).expanduser().resolve() if root_value else None
+    for entry in catalog.get("papers", []):
+        if not isinstance(entry, dict):
+            continue
+        if paper_id in (entry.get("id"), entry.get("filename"), entry.get("relative_path")):
+            if root is None:
+                return Path(entry.get("path", "")).expanduser().resolve()
+            return (root / entry.get("relative_path", "")).expanduser().resolve()
+    return None
+
+
+def open_pdf(path: Path) -> bool:
+    if sys.platform == "darwin":
+        command = ["open", str(path)]
+    elif os.name == "nt":
+        command = ["cmd", "/c", "start", "", str(path)]
+    else:
+        command = ["xdg-open", str(path)]
+    try:
+        subprocess.run(command, check=False)
+    except FileNotFoundError:
+        return False
+    return True
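Taken together, a consumer can hand `resolve_paper_path` any of the identifier forms it accepts (catalog id, filename, relative path, or an existing filesystem path) and then open the result; a small sketch with made-up identifiers:

```python
# Look up a paper by id or filename and open it with the OS default viewer.
from afs_scawful.research import (
    load_research_catalog,
    open_pdf,
    resolve_paper_path,
    resolve_research_catalog_path,
)

catalog = load_research_catalog(resolve_research_catalog_path())
pdf_path = resolve_paper_path(catalog, "2512-20957v2-1a2b3c4d")  # or "2512.20957v2.pdf"
if pdf_path is None:
    raise SystemExit("paper not found in catalog")
if not open_pdf(pdf_path):
    raise SystemExit("no opener available (open / xdg-open / start not found)")
```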
diff --git a/tests/test_research.py b/tests/test_research.py
new file mode 100644
index 0000000..e9a353d
--- /dev/null
+++ b/tests/test_research.py
@@ -0,0 +1,26 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+from afs_scawful.research import build_research_catalog, extract_abstract_excerpt
+
+
+def test_extract_abstract_excerpt() -> None:
+    text = "Title\nAbstract\nThis is the abstract.\n1 Introduction\nBody"
+    assert extract_abstract_excerpt(text, max_chars=200) == "This is the abstract."
+
+
+def test_build_research_catalog_regex(tmp_path: Path) -> None:
+    research_root = tmp_path / "Research"
+    research_root.mkdir()
+    pdf_path = research_root / "paper.pdf"
+    pdf_path.write_bytes(
+        b"not a real pdf /Title (Test Paper) /Author (Jane Doe)",
+    )
+
+    catalog = build_research_catalog(research_root, include_abstract=False)
+    assert catalog["count"] == 1
+    entry = catalog["papers"][0]
+    assert entry["title"] == "Test Paper"
+    assert entry["author"] == "Jane Doe"
+    assert entry["metadata_source"] == "regex"
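Note that the second test exercises only the regex fallback: the bytes written to `paper.pdf` have no `%PDF-` header, so `_has_pdf_header` returns False and the `pypdf` branch is skipped even when it is installed. Both tests should therefore pass with just `pip install -e '.[test]'` followed by `pytest tests/test_research.py`; the optional `research` extra is only needed for richer metadata extraction on real PDFs.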