Add research overrides support

2025-12-30 17:26:03 -05:00
parent f37ad164bc
commit 452ce64b11
6 changed files with 119 additions and 2 deletions
--- a/docs/PDF_WORKFLOW.md
+++ b/docs/PDF_WORKFLOW.md
@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
 ## Overrides
 - `AFS_RESEARCH_ROOT=/path/to/Research`
 - `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
+- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
 - Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
  `~/.config/afs/plugins/afs_scawful/config/`
+- Optional overrides: `research_overrides.json` in the same config directories.

 Example `research_paths.toml`:
 ```toml
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
 research_catalog = "~/src/context/index/research_catalog.json"
 ```

+Example `research_overrides.json`:
+```json
+{
+  "papers": {
+    "2510.04950v1.pdf": {
+      "title": "Unknown / needs verification",
+      "author": "Unknown / needs verification"
+    },
+    "7799_Quantifying_Human_AI_Syne.pdf": {
+      "title": "Unknown / needs verification",
+      "author": "Unknown / needs verification"
+    }
+  }
+}
+```
+
 ## Notes
 - Abstract excerpts are auto-extracted from the first pages; verify before quoting.
 - `--open` uses the OS default PDF viewer (Preview on macOS).
--- a/src/afs_scawful/init.py
+++ b/src/afs_scawful/init.py
@@ -2,7 +2,12 @@

 __version__ = "0.0.0"

-from .config import load_research_paths, load_training_paths, load_training_resources
+from .config import (
+    load_research_overrides,
+    load_research_paths,
+    load_training_paths,
+    load_training_resources,
+)
 from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
 from .research import (
    build_research_catalog,
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer

 __all__ = [
    "load_research_paths",
+    "load_research_overrides",
    "load_training_paths",
    "load_training_resources",
    "resolve_training_root",
--- a/src/afs_scawful/cli.py
+++ b/src/afs_scawful/cli.py
@@ -5,9 +5,11 @@ from __future__ import annotations
 import argparse
 import asyncio
 import json
+import os
 from pathlib import Path
 from typing import Iterable

+from .config import load_research_overrides
 from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
 from .registry import build_dataset_registry, index_datasets, write_dataset_registry
 from .resource_index import ResourceIndexer
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
        if args.output
        else resolve_research_catalog_path()
    )
+    overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
+    overrides = load_research_overrides(
+        Path(overrides_path).expanduser().resolve() if overrides_path else None
+    )
    catalog = build_research_catalog(
        root,
+        overrides=overrides,
        include_abstract=not args.no_abstract,
        max_pages=args.max_pages,
        max_abstract_chars=args.max_abstract_chars,
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
    )
    research_catalog.add_argument("--root", help="Research root override.")
    research_catalog.add_argument("--output", help="Output catalog path.")
+    research_catalog.add_argument("--overrides", help="Overrides JSON path.")
    research_catalog.add_argument(
        "--no-abstract",
        action="store_true",
--- a/src/afs_scawful/config.py
+++ b/src/afs_scawful/config.py
@@ -2,6 +2,7 @@

 from __future__ import annotations

+import json
 import tomllib
 from pathlib import Path
 from typing import Any
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
            if isinstance(value, str)
        }
    return expanded
+
+
+def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
+    path = config_path or _find_config("research_overrides.json")
+    if not path or not path.exists():
+        return {}
+    payload = path.read_text(encoding="utf-8")
+    data = json.loads(payload)
+    return data if isinstance(data, dict) else {}
--- a/src/afs_scawful/research.py
+++ b/src/afs_scawful/research.py
@@ -12,7 +12,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import Iterable

-from .config import load_research_paths
+from .config import load_research_overrides, load_research_paths

 try:  # Optional dependency for richer metadata extraction.
    from pypdf import PdfReader
@@ -229,10 +229,12 @@ def build_paper_entry(

 def build_research_catalog(
    root: Path,
+    overrides: dict[str, object] | None = None,
    include_abstract: bool = True,
    max_pages: int = 2,
    max_abstract_chars: int = 1200,
 ) -> dict[str, object]:
+    override_map = normalize_overrides(overrides)
    papers: list[dict[str, object]] = []
    errors: list[dict[str, str]] = []
    for path in iter_pdf_paths(root):
@@ -244,6 +246,7 @@ def build_research_catalog(
                max_pages=max_pages,
                max_abstract_chars=max_abstract_chars,
            )
+            apply_overrides(entry, override_map)
        except Exception as exc:  # pragma: no cover - defensive
            errors.append({"path": str(path), "error": str(exc)})
            continue
@@ -261,6 +264,53 @@ def build_research_catalog(
    return catalog


+def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
+    if not overrides:
+        return {}
+    if "papers" in overrides and isinstance(overrides["papers"], dict):
+        data = overrides["papers"]
+    else:
+        data = overrides
+    cleaned: dict[str, dict[str, object]] = {}
+    for key, value in data.items():
+        if isinstance(value, dict):
+            cleaned[str(key)] = value
+    return cleaned
+
+
+def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
+    if not overrides:
+        return
+    keys = [
+        entry.get("id"),
+        entry.get("filename"),
+        entry.get("relative_path"),
+        entry.get("path"),
+    ]
+    override: dict[str, object] | None = None
+    for key in keys:
+        if isinstance(key, str) and key in overrides:
+            override = overrides[key]
+            break
+    if not override:
+        return
+    touched = False
+    for field, value in override.items():
+        if isinstance(value, str):
+            cleaned = value.strip()
+            if not cleaned:
+                continue
+            entry[field] = cleaned
+            touched = True
+        elif value is not None:
+            entry[field] = value
+            touched = True
+    if touched:
+        entry["metadata_source"] = "override"
+        if "abstract_excerpt" in override:
+            entry["abstract_source"] = "override"
+
+
 def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(
--- a/tests/test_research.py
+++ b/tests/test_research.py
@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
    assert entry["title"] == "Test Paper"
    assert entry["author"] == "Jane Doe"
    assert entry["metadata_source"] == "regex"
+
+
+def test_build_research_catalog_overrides(tmp_path: Path) -> None:
+    research_root = tmp_path / "Research"
+    research_root.mkdir()
+    pdf_path = research_root / "paper.pdf"
+    pdf_path.write_bytes(b"%PDF-1.0 /Title (Ignored)")
+
+    overrides = {
+        "papers": {
+            "paper.pdf": {
+                "title": "Manual Title",
+                "author": "Manual Author",
+            }
+        }
+    }
+    catalog = build_research_catalog(
+        research_root,
+        overrides=overrides,
+        include_abstract=False,
+    )
+    entry = catalog["papers"][0]
+    assert entry["title"] == "Manual Title"
+    assert entry["author"] == "Manual Author"
+    assert entry["metadata_source"] == "override"