Add research overrides support

This commit is contained in:
scawful
2025-12-30 17:26:03 -05:00
parent f37ad164bc
commit 452ce64b11
6 changed files with 119 additions and 2 deletions

View File

@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
## Overrides
- `AFS_RESEARCH_ROOT=/path/to/Research`
- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
`~/.config/afs/plugins/afs_scawful/config/`
- Optional overrides: `research_overrides.json` in the same config directories.
Example `research_paths.toml`:
```toml
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
research_catalog = "~/src/context/index/research_catalog.json"
```
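Example `research_overrides.json` (each key under `papers` is matched against a paper's `id`, `filename`, `relative_path`, or `path`):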
```json
{
"papers": {
"2510.04950v1.pdf": {
"title": "Unknown / needs verification",
"author": "Unknown / needs verification"
},
"7799_Quantifying_Human_AI_Syne.pdf": {
"title": "Unknown / needs verification",
"author": "Unknown / needs verification"
}
}
}
```
## Notes
- Abstract excerpts are auto-extracted from the first pages; verify before quoting.
- `--open` uses the OS default PDF viewer (Preview on macOS).

View File

@@ -2,7 +2,12 @@
__version__ = "0.0.0"
from .config import load_research_paths, load_training_paths, load_training_resources
from .config import (
load_research_overrides,
load_research_paths,
load_training_paths,
load_training_resources,
)
from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
from .research import (
build_research_catalog,
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer
__all__ = [
"load_research_paths",
"load_research_overrides",
"load_training_paths",
"load_training_resources",
"resolve_training_root",

View File

@@ -5,9 +5,11 @@ from __future__ import annotations
import argparse
import asyncio
import json
import os
from pathlib import Path
from typing import Iterable
from .config import load_research_overrides
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
from .resource_index import ResourceIndexer
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
if args.output
else resolve_research_catalog_path()
)
overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
overrides = load_research_overrides(
Path(overrides_path).expanduser().resolve() if overrides_path else None
)
catalog = build_research_catalog(
root,
overrides=overrides,
include_abstract=not args.no_abstract,
max_pages=args.max_pages,
max_abstract_chars=args.max_abstract_chars,
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
)
research_catalog.add_argument("--root", help="Research root override.")
research_catalog.add_argument("--output", help="Output catalog path.")
research_catalog.add_argument("--overrides", help="Overrides JSON path.")
research_catalog.add_argument(
"--no-abstract",
action="store_true",

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
import json
import tomllib
from pathlib import Path
from typing import Any
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
if isinstance(value, str)
}
return expanded
def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
path = config_path or _find_config("research_overrides.json")
if not path or not path.exists():
return {}
payload = path.read_text(encoding="utf-8")
data = json.loads(payload)
return data if isinstance(data, dict) else {}

View File

@@ -12,7 +12,7 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable
from .config import load_research_paths
from .config import load_research_overrides, load_research_paths
try: # Optional dependency for richer metadata extraction.
from pypdf import PdfReader
@@ -229,10 +229,12 @@ def build_paper_entry(
def build_research_catalog(
root: Path,
overrides: dict[str, object] | None = None,
include_abstract: bool = True,
max_pages: int = 2,
max_abstract_chars: int = 1200,
) -> dict[str, object]:
override_map = normalize_overrides(overrides)
papers: list[dict[str, object]] = []
errors: list[dict[str, str]] = []
for path in iter_pdf_paths(root):
@@ -244,6 +246,7 @@ def build_research_catalog(
max_pages=max_pages,
max_abstract_chars=max_abstract_chars,
)
apply_overrides(entry, override_map)
except Exception as exc: # pragma: no cover - defensive
errors.append({"path": str(path), "error": str(exc)})
continue
@@ -261,6 +264,53 @@ def build_research_catalog(
return catalog
def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
if not overrides:
return {}
if "papers" in overrides and isinstance(overrides["papers"], dict):
data = overrides["papers"]
else:
data = overrides
cleaned: dict[str, dict[str, object]] = {}
for key, value in data.items():
if isinstance(value, dict):
cleaned[str(key)] = value
return cleaned
def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
    """Apply a matching manual override to a catalog entry, in place.

    The override is looked up by the entry's ``id``, ``filename``,
    ``relative_path`` and ``path`` values, in that order; the first string
    value present in ``overrides`` wins.

    Args:
        entry: Catalog entry to update. Mutated in place.
        overrides: Normalized mapping of lookup key to override fields.

    Field handling:
        * string values are stripped, and blank strings are skipped;
        * ``None`` values are skipped;
        * any other value is copied as-is.

    When at least one field is applied, ``metadata_source`` is set to
    ``"override"``. ``abstract_source`` is set to ``"override"`` only when
    ``abstract_excerpt`` was actually applied, so a blank or ``None``
    excerpt in the override does not mislabel an extracted abstract.
    """
    if not overrides:
        return

    # Snapshot the lookup keys before touching the entry.
    candidates = tuple(
        entry.get(name) for name in ("id", "filename", "relative_path", "path")
    )
    override = next(
        (
            overrides[key]
            for key in candidates
            if isinstance(key, str) and key in overrides
        ),
        None,
    )
    if not override:
        return

    applied: set[str] = set()
    for field, value in override.items():
        if value is None:
            continue
        if isinstance(value, str):
            value = value.strip()
            if not value:
                continue
        entry[field] = value
        applied.add(field)

    # Provenance markers are written last so they take precedence over any
    # same-named fields supplied by the override itself.
    if applied:
        entry["metadata_source"] = "override"
    if "abstract_excerpt" in applied:
        entry["abstract_source"] = "override"
def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
output_path.parent.mkdir(parents=True, exist_ok=True)
output_path.write_text(

View File

@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
assert entry["title"] == "Test Paper"
assert entry["author"] == "Jane Doe"
assert entry["metadata_source"] == "regex"
def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """An override entry keyed ``paper.pdf`` replaces the catalog's title and author."""
    root = tmp_path / "Research"
    root.mkdir()
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual_fields = {"title": "Manual Title", "author": "Manual Author"}
    catalog = build_research_catalog(
        root,
        overrides={"papers": {"paper.pdf": manual_fields}},
        include_abstract=False,
    )

    paper = catalog["papers"][0]
    assert paper["title"] == "Manual Title"
    assert paper["author"] == "Manual Author"
    # Provenance must record that the metadata came from the override.
    assert paper["metadata_source"] == "override"