diff --git a/docs/PDF_WORKFLOW.md b/docs/PDF_WORKFLOW.md index 3fb5b90..41abcea 100644 --- a/docs/PDF_WORKFLOW.md +++ b/docs/PDF_WORKFLOW.md @@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open ## Overrides - `AFS_RESEARCH_ROOT=/path/to/Research` - `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json` +- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json` - Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or `~/.config/afs/plugins/afs_scawful/config/` +- Optional overrides: `research_overrides.json` in the same config directories. Example `research_paths.toml`: ```toml @@ -27,6 +29,22 @@ research_root = "~/Documents/Research" research_catalog = "~/src/context/index/research_catalog.json" ``` +Example `research_overrides.json`: +```json +{ + "papers": { + "2510.04950v1.pdf": { + "title": "Unknown / needs verification", + "author": "Unknown / needs verification" + }, + "7799_Quantifying_Human_AI_Syne.pdf": { + "title": "Unknown / needs verification", + "author": "Unknown / needs verification" + } + } +} +``` + ## Notes - Abstract excerpts are auto-extracted from the first pages; verify before quoting. - `--open` uses the OS default PDF viewer (Preview on macOS). 
diff --git a/src/afs_scawful/__init__.py b/src/afs_scawful/__init__.py index 1efd7d6..48be341 100644 --- a/src/afs_scawful/__init__.py +++ b/src/afs_scawful/__init__.py @@ -2,7 +2,12 @@ __version__ = "0.0.0" -from .config import load_research_paths, load_training_paths, load_training_resources +from .config import ( + load_research_overrides, + load_research_paths, + load_training_paths, + load_training_resources, +) from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root from .research import ( build_research_catalog, @@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer __all__ = [ "load_research_paths", + "load_research_overrides", "load_training_paths", "load_training_resources", "resolve_training_root", diff --git a/src/afs_scawful/cli.py b/src/afs_scawful/cli.py index d8a9900..16b0be9 100644 --- a/src/afs_scawful/cli.py +++ b/src/afs_scawful/cli.py @@ -5,9 +5,11 @@ from __future__ import annotations import argparse import asyncio import json +import os from pathlib import Path from typing import Iterable +from .config import load_research_overrides from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl from .registry import build_dataset_registry, index_datasets, write_dataset_registry from .resource_index import ResourceIndexer @@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int: if args.output else resolve_research_catalog_path() ) + overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES") + overrides = load_research_overrides( + Path(overrides_path).expanduser().resolve() if overrides_path else None + ) catalog = build_research_catalog( root, + overrides=overrides, include_abstract=not args.no_abstract, max_pages=args.max_pages, max_abstract_chars=args.max_abstract_chars, @@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser: ) research_catalog.add_argument("--root", help="Research root override.") research_catalog.add_argument("--output", 
help="Output catalog path.") + research_catalog.add_argument("--overrides", help="Overrides JSON path.") research_catalog.add_argument( "--no-abstract", action="store_true", diff --git a/src/afs_scawful/config.py b/src/afs_scawful/config.py index 8ded93a..02fa599 100644 --- a/src/afs_scawful/config.py +++ b/src/afs_scawful/config.py @@ -2,6 +2,7 @@ from __future__ import annotations +import json import tomllib from pathlib import Path from typing import Any @@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str, if isinstance(value, str) } return expanded + + +def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]: + path = config_path or _find_config("research_overrides.json") + if not path or not path.exists(): + return {} + payload = path.read_text(encoding="utf-8") + data = json.loads(payload) + return data if isinstance(data, dict) else {} diff --git a/src/afs_scawful/research.py b/src/afs_scawful/research.py index 9880e83..c65f0f5 100644 --- a/src/afs_scawful/research.py +++ b/src/afs_scawful/research.py @@ -12,7 +12,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import Iterable -from .config import load_research_paths +from .config import load_research_overrides, load_research_paths try: # Optional dependency for richer metadata extraction. 
from pypdf import PdfReader @@ -229,10 +229,12 @@ def build_paper_entry( def build_research_catalog( root: Path, + overrides: dict[str, object] | None = None, include_abstract: bool = True, max_pages: int = 2, max_abstract_chars: int = 1200, ) -> dict[str, object]: + override_map = normalize_overrides(overrides) papers: list[dict[str, object]] = [] errors: list[dict[str, str]] = [] for path in iter_pdf_paths(root): @@ -244,6 +246,7 @@ def build_research_catalog( max_pages=max_pages, max_abstract_chars=max_abstract_chars, ) + apply_overrides(entry, override_map) except Exception as exc: # pragma: no cover - defensive errors.append({"path": str(path), "error": str(exc)}) continue @@ -261,6 +264,53 @@ def build_research_catalog( return catalog +def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]: + if not overrides: + return {} + if "papers" in overrides and isinstance(overrides["papers"], dict): + data = overrides["papers"] + else: + data = overrides + cleaned: dict[str, dict[str, object]] = {} + for key, value in data.items(): + if isinstance(value, dict): + cleaned[str(key)] = value + return cleaned + + +def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None: + if not overrides: + return + keys = [ + entry.get("id"), + entry.get("filename"), + entry.get("relative_path"), + entry.get("path"), + ] + override: dict[str, object] | None = None + for key in keys: + if isinstance(key, str) and key in overrides: + override = overrides[key] + break + if not override: + return + touched = False + for field, value in override.items(): + if isinstance(value, str): + cleaned = value.strip() + if not cleaned: + continue + entry[field] = cleaned + touched = True + elif value is not None: + entry[field] = value + touched = True + if touched: + entry["metadata_source"] = "override" + if "abstract_excerpt" in override: + entry["abstract_source"] = "override" + + def write_research_catalog(catalog: 
def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """Overrides keyed by filename replace the extracted title and author."""
    root = tmp_path / "Research"
    root.mkdir()
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual_fields = {"title": "Manual Title", "author": "Manual Author"}
    catalog = build_research_catalog(
        root,
        overrides={"papers": {"paper.pdf": manual_fields}},
        include_abstract=False,
    )

    first = catalog["papers"][0]
    assert first["title"] == "Manual Title"
    assert first["author"] == "Manual Author"
    assert first["metadata_source"] == "override"