Add research overrides support

This commit is contained in:
scawful
2025-12-30 17:26:03 -05:00
parent f37ad164bc
commit 452ce64b11
6 changed files with 119 additions and 2 deletions

View File

@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
## Overrides ## Overrides
- `AFS_RESEARCH_ROOT=/path/to/Research` - `AFS_RESEARCH_ROOT=/path/to/Research`
- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json` - `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or - Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
`~/.config/afs/plugins/afs_scawful/config/` `~/.config/afs/plugins/afs_scawful/config/`
- Optional overrides: `research_overrides.json` in the same config directories.
Example `research_paths.toml`: Example `research_paths.toml`:
```toml ```toml
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
research_catalog = "~/src/context/index/research_catalog.json" research_catalog = "~/src/context/index/research_catalog.json"
``` ```
Example `research_overrides.json`:
```json
{
"papers": {
"2510.04950v1.pdf": {
"title": "Unknown / needs verification",
"author": "Unknown / needs verification"
},
"7799_Quantifying_Human_AI_Syne.pdf": {
"title": "Unknown / needs verification",
"author": "Unknown / needs verification"
}
}
}
```
## Notes ## Notes
- Abstract excerpts are auto-extracted from the first pages; verify before quoting. - Abstract excerpts are auto-extracted from the first pages; verify before quoting.
- `--open` uses the OS default PDF viewer (Preview on macOS). - `--open` uses the OS default PDF viewer (Preview on macOS).

View File

@@ -2,7 +2,12 @@
__version__ = "0.0.0" __version__ = "0.0.0"
from .config import load_research_paths, load_training_paths, load_training_resources from .config import (
load_research_overrides,
load_research_paths,
load_training_paths,
load_training_resources,
)
from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
from .research import ( from .research import (
build_research_catalog, build_research_catalog,
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer
__all__ = [ __all__ = [
"load_research_paths", "load_research_paths",
"load_research_overrides",
"load_training_paths", "load_training_paths",
"load_training_resources", "load_training_resources",
"resolve_training_root", "resolve_training_root",

View File

@@ -5,9 +5,11 @@ from __future__ import annotations
import argparse import argparse
import asyncio import asyncio
import json import json
import os
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from .config import load_research_overrides
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
from .registry import build_dataset_registry, index_datasets, write_dataset_registry from .registry import build_dataset_registry, index_datasets, write_dataset_registry
from .resource_index import ResourceIndexer from .resource_index import ResourceIndexer
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
if args.output if args.output
else resolve_research_catalog_path() else resolve_research_catalog_path()
) )
overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
overrides = load_research_overrides(
Path(overrides_path).expanduser().resolve() if overrides_path else None
)
catalog = build_research_catalog( catalog = build_research_catalog(
root, root,
overrides=overrides,
include_abstract=not args.no_abstract, include_abstract=not args.no_abstract,
max_pages=args.max_pages, max_pages=args.max_pages,
max_abstract_chars=args.max_abstract_chars, max_abstract_chars=args.max_abstract_chars,
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
) )
research_catalog.add_argument("--root", help="Research root override.") research_catalog.add_argument("--root", help="Research root override.")
research_catalog.add_argument("--output", help="Output catalog path.") research_catalog.add_argument("--output", help="Output catalog path.")
research_catalog.add_argument("--overrides", help="Overrides JSON path.")
research_catalog.add_argument( research_catalog.add_argument(
"--no-abstract", "--no-abstract",
action="store_true", action="store_true",

View File

@@ -2,6 +2,7 @@
from __future__ import annotations from __future__ import annotations
import json
import tomllib import tomllib
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
if isinstance(value, str) if isinstance(value, str)
} }
return expanded return expanded
def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
path = config_path or _find_config("research_overrides.json")
if not path or not path.exists():
return {}
payload = path.read_text(encoding="utf-8")
data = json.loads(payload)
return data if isinstance(data, dict) else {}

View File

@@ -12,7 +12,7 @@ from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from .config import load_research_paths from .config import load_research_overrides, load_research_paths
try: # Optional dependency for richer metadata extraction. try: # Optional dependency for richer metadata extraction.
from pypdf import PdfReader from pypdf import PdfReader
@@ -229,10 +229,12 @@ def build_paper_entry(
def build_research_catalog( def build_research_catalog(
root: Path, root: Path,
overrides: dict[str, object] | None = None,
include_abstract: bool = True, include_abstract: bool = True,
max_pages: int = 2, max_pages: int = 2,
max_abstract_chars: int = 1200, max_abstract_chars: int = 1200,
) -> dict[str, object]: ) -> dict[str, object]:
override_map = normalize_overrides(overrides)
papers: list[dict[str, object]] = [] papers: list[dict[str, object]] = []
errors: list[dict[str, str]] = [] errors: list[dict[str, str]] = []
for path in iter_pdf_paths(root): for path in iter_pdf_paths(root):
@@ -244,6 +246,7 @@ def build_research_catalog(
max_pages=max_pages, max_pages=max_pages,
max_abstract_chars=max_abstract_chars, max_abstract_chars=max_abstract_chars,
) )
apply_overrides(entry, override_map)
except Exception as exc: # pragma: no cover - defensive except Exception as exc: # pragma: no cover - defensive
errors.append({"path": str(path), "error": str(exc)}) errors.append({"path": str(path), "error": str(exc)})
continue continue
@@ -261,6 +264,53 @@ def build_research_catalog(
return catalog return catalog
def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
if not overrides:
return {}
if "papers" in overrides and isinstance(overrides["papers"], dict):
data = overrides["papers"]
else:
data = overrides
cleaned: dict[str, dict[str, object]] = {}
for key, value in data.items():
if isinstance(value, dict):
cleaned[str(key)] = value
return cleaned
def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
    """Apply a manual metadata override to a catalog entry, in place.

    The override is looked up by the entry's ``id``, ``filename``,
    ``relative_path`` and ``path`` values, in that order; the first string
    value that is a key of ``overrides`` selects the override to apply.

    String override values are stripped of surrounding whitespace, and blank
    strings are ignored. ``None`` values are ignored. Every other value is
    copied onto the entry as-is.

    When at least one field is written, ``entry["metadata_source"]`` is set
    to ``"override"``. ``entry["abstract_source"]`` is set to ``"override"``
    only when ``abstract_excerpt`` itself was written, so a blank or ``None``
    abstract override does not mislabel an auto-extracted abstract.

    Args:
        entry: Catalog entry to update; mutated in place.
        overrides: Map of paper key to per-field override values.
    """
    if not overrides:
        return

    # Snapshot the lookup keys before mutating the entry: an override may
    # itself rewrite "id" or "path".
    lookup_keys = [
        entry.get(name) for name in ("id", "filename", "relative_path", "path")
    ]
    override = next(
        (
            overrides[key]
            for key in lookup_keys
            if isinstance(key, str) and key in overrides
        ),
        None,
    )
    if not override:
        return

    applied: set[str] = set()
    for field, value in override.items():
        if isinstance(value, str):
            value = value.strip()
            if not value:
                continue  # blank strings never clobber extracted metadata
        elif value is None:
            continue
        entry[field] = value
        applied.add(field)

    if applied:
        entry["metadata_source"] = "override"
    # Only claim an overridden abstract when one was actually written.
    if "abstract_excerpt" in applied:
        entry["abstract_source"] = "override"
def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None: def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
output_path.parent.mkdir(parents=True, exist_ok=True) output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text( output_path.write_text(

View File

@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
assert entry["title"] == "Test Paper" assert entry["title"] == "Test Paper"
assert entry["author"] == "Jane Doe" assert entry["author"] == "Jane Doe"
assert entry["metadata_source"] == "regex" assert entry["metadata_source"] == "regex"
def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """Overrides keyed by filename replace the extracted title and author."""
    root = tmp_path / "Research"
    root.mkdir()
    # Minimal PDF-like payload; its embedded title must lose to the override.
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual_fields = {"title": "Manual Title", "author": "Manual Author"}
    catalog = build_research_catalog(
        root,
        overrides={"papers": {"paper.pdf": manual_fields}},
        include_abstract=False,
    )

    paper = catalog["papers"][0]
    assert paper["title"] == "Manual Title"
    assert paper["author"] == "Manual Author"
    assert paper["metadata_source"] == "override"