Add research overrides support

2025-12-30 17:26:03 -05:00
parent f37ad164bc
commit 452ce64b11
6 changed files with 119 additions and 2 deletions
--- a/docs/PDF_WORKFLOW.md
+++ b/docs/PDF_WORKFLOW.md
@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
 ## Overrides
 - `AFS_RESEARCH_ROOT=/path/to/Research`
 - `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
 - `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
 - Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
  `~/.config/afs/plugins/afs_scawful/config/`
 - Optional overrides: `research_overrides.json` in the same config directories; used when neither `--overrides` nor `AFS_RESEARCH_OVERRIDES` is given.
 Example `research_paths.toml`:
 ```toml
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
 research_catalog = "~/src/context/index/research_catalog.json"
 ```
 Example `research_overrides.json`:
 ```json
 {
  "papers": {
    "2510.04950v1.pdf": {
      "title": "Unknown / needs verification",
      "author": "Unknown / needs verification"
    },
    "7799_Quantifying_Human_AI_Syne.pdf": {
      "title": "Unknown / needs verification",
      "author": "Unknown / needs verification"
    }
  }
 }
 ```
 ## Notes
 - Abstract excerpts are auto-extracted from the first pages; verify before quoting.
 - `--open` uses the OS default PDF viewer (Preview on macOS).
--- a/src/afs_scawful/init.py
+++ b/src/afs_scawful/init.py
@@ -2,7 +2,12 @@
 __version__ = "0.0.0"
-from .config import load_research_paths, load_training_paths, load_training_resources
+from .config import (
    load_research_overrides,
    load_research_paths,
    load_training_paths,
    load_training_resources,
 )
 from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
 from .research import (
    build_research_catalog,
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer
 __all__ = [
    "load_research_paths",
    "load_research_overrides",
    "load_training_paths",
    "load_training_resources",
    "resolve_training_root",
--- a/src/afs_scawful/cli.py
+++ b/src/afs_scawful/cli.py
@@ -5,9 +5,11 @@ from __future__ import annotations
 import argparse
 import asyncio
 import json
 import os
 from pathlib import Path
 from typing import Iterable
 from .config import load_research_overrides
 from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
 from .registry import build_dataset_registry, index_datasets, write_dataset_registry
 from .resource_index import ResourceIndexer
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
        if args.output
        else resolve_research_catalog_path()
    )
    overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
    overrides = load_research_overrides(
        Path(overrides_path).expanduser().resolve() if overrides_path else None
    )
    catalog = build_research_catalog(
        root,
        overrides=overrides,
        include_abstract=not args.no_abstract,
        max_pages=args.max_pages,
        max_abstract_chars=args.max_abstract_chars,
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
    )
    research_catalog.add_argument("--root", help="Research root override.")
    research_catalog.add_argument("--output", help="Output catalog path.")
    research_catalog.add_argument("--overrides", help="Overrides JSON path.")
    research_catalog.add_argument(
        "--no-abstract",
        action="store_true",
--- a/src/afs_scawful/config.py
+++ b/src/afs_scawful/config.py
@@ -2,6 +2,7 @@
 from __future__ import annotations
 import json
 import tomllib
 from pathlib import Path
 from typing import Any
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
            if isinstance(value, str)
        }
    return expanded
 def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
    path = config_path or _find_config("research_overrides.json")
    if not path or not path.exists():
        return {}
    payload = path.read_text(encoding="utf-8")
    data = json.loads(payload)
    return data if isinstance(data, dict) else {}
--- a/src/afs_scawful/research.py
+++ b/src/afs_scawful/research.py
@@ -12,7 +12,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterable
-from .config import load_research_paths
+from .config import load_research_overrides, load_research_paths
 try:  # Optional dependency for richer metadata extraction.
    from pypdf import PdfReader
@@ -229,10 +229,12 @@ def build_paper_entry(
 def build_research_catalog(
    root: Path,
    overrides: dict[str, object] | None = None,
    include_abstract: bool = True,
    max_pages: int = 2,
    max_abstract_chars: int = 1200,
 ) -> dict[str, object]:
    override_map = normalize_overrides(overrides)
    papers: list[dict[str, object]] = []
    errors: list[dict[str, str]] = []
    for path in iter_pdf_paths(root):
@@ -244,6 +246,7 @@ def build_research_catalog(
                max_pages=max_pages,
                max_abstract_chars=max_abstract_chars,
            )
            apply_overrides(entry, override_map)
        except Exception as exc:  # pragma: no cover - defensive
            errors.append({"path": str(path), "error": str(exc)})
            continue
@@ -261,6 +264,53 @@ def build_research_catalog(
    return catalog
 def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
    if not overrides:
        return {}
    if "papers" in overrides and isinstance(overrides["papers"], dict):
        data = overrides["papers"]
    else:
        data = overrides
    cleaned: dict[str, dict[str, object]] = {}
    for key, value in data.items():
        if isinstance(value, dict):
            cleaned[str(key)] = value
    return cleaned
 def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
    """Apply the first matching manual override to ``entry`` in place.

    The entry's ``id``, ``filename``, ``relative_path`` and ``path`` values
    are tried, in that order, as keys into ``overrides``; the first key that
    is present selects the override (an empty override stops the search and
    changes nothing).

    For each field in the selected override:
      * string values are stripped, and blank strings are skipped;
      * ``None`` values are skipped;
      * any other value is copied as-is.

    When at least one field was applied, ``metadata_source`` is set to
    ``"override"``. ``abstract_source`` is set to ``"override"`` only when
    ``abstract_excerpt`` was actually written, so a blank or ``None``
    abstract in the override does not mislabel the extracted abstract.
    """
    if not overrides:
        return

    candidates = (
        entry.get("id"),
        entry.get("filename"),
        entry.get("relative_path"),
        entry.get("path"),
    )
    override = next(
        (overrides[key] for key in candidates if isinstance(key, str) and key in overrides),
        None,
    )
    if not override:
        return

    # Track what was really written; presence in the override is not enough
    # because blank and None values are ignored.
    applied: set[str] = set()
    for field, value in override.items():
        if isinstance(value, str):
            value = value.strip()
            if not value:
                continue
        elif value is None:
            continue
        entry[field] = value
        applied.add(field)

    if not applied:
        return
    entry["metadata_source"] = "override"
    if "abstract_excerpt" in applied:
        entry["abstract_source"] = "override"
 def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
--- a/tests/test_research.py
+++ b/tests/test_research.py
@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
    assert entry["title"] == "Test Paper"
    assert entry["author"] == "Jane Doe"
    assert entry["metadata_source"] == "regex"
 def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """Overrides keyed by the PDF's file name replace title and author."""
    root = tmp_path / "Research"
    root.mkdir()
    # Minimal PDF-like payload; its embedded title must lose to the override.
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual_fields = {"title": "Manual Title", "author": "Manual Author"}
    result = build_research_catalog(
        root,
        overrides={"papers": {"paper.pdf": manual_fields}},
        include_abstract=False,
    )

    first_paper = result["papers"][0]
    assert first_paper["title"] == "Manual Title"
    assert first_paper["author"] == "Manual Author"
    assert first_paper["metadata_source"] == "override"