Add research overrides support
This commit is contained in:
@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
|
||||
## Overrides
|
||||
- `AFS_RESEARCH_ROOT=/path/to/Research`
|
||||
- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
|
||||
- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
|
||||
- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
|
||||
`~/.config/afs/plugins/afs_scawful/config/`
|
||||
- Optional overrides: `research_overrides.json` in the same config directories.
|
||||
|
||||
Example `research_paths.toml`:
|
||||
```toml
|
||||
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
|
||||
research_catalog = "~/src/context/index/research_catalog.json"
|
||||
```
|
||||
|
||||
Example `research_overrides.json`:
|
||||
```json
|
||||
{
|
||||
"papers": {
|
||||
"2510.04950v1.pdf": {
|
||||
"title": "Unknown / needs verification",
|
||||
"author": "Unknown / needs verification"
|
||||
},
|
||||
"7799_Quantifying_Human_AI_Syne.pdf": {
|
||||
"title": "Unknown / needs verification",
|
||||
"author": "Unknown / needs verification"
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
## Notes
|
||||
- Abstract excerpts are auto-extracted from the first pages; verify before quoting.
|
||||
- `--open` uses the OS default PDF viewer (Preview on macOS).
|
||||
|
||||
@@ -2,7 +2,12 @@
|
||||
|
||||
__version__ = "0.0.0"
|
||||
|
||||
from .config import load_research_paths, load_training_paths, load_training_resources
|
||||
from .config import (
|
||||
load_research_overrides,
|
||||
load_research_paths,
|
||||
load_training_paths,
|
||||
load_training_resources,
|
||||
)
|
||||
from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
|
||||
from .research import (
|
||||
build_research_catalog,
|
||||
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer
|
||||
|
||||
__all__ = [
|
||||
"load_research_paths",
|
||||
"load_research_overrides",
|
||||
"load_training_paths",
|
||||
"load_training_resources",
|
||||
"resolve_training_root",
|
||||
|
||||
@@ -5,9 +5,11 @@ from __future__ import annotations
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from .config import load_research_overrides
|
||||
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
|
||||
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
|
||||
from .resource_index import ResourceIndexer
|
||||
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
|
||||
if args.output
|
||||
else resolve_research_catalog_path()
|
||||
)
|
||||
overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
|
||||
overrides = load_research_overrides(
|
||||
Path(overrides_path).expanduser().resolve() if overrides_path else None
|
||||
)
|
||||
catalog = build_research_catalog(
|
||||
root,
|
||||
overrides=overrides,
|
||||
include_abstract=not args.no_abstract,
|
||||
max_pages=args.max_pages,
|
||||
max_abstract_chars=args.max_abstract_chars,
|
||||
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
)
|
||||
research_catalog.add_argument("--root", help="Research root override.")
|
||||
research_catalog.add_argument("--output", help="Output catalog path.")
|
||||
research_catalog.add_argument("--overrides", help="Overrides JSON path.")
|
||||
research_catalog.add_argument(
|
||||
"--no-abstract",
|
||||
action="store_true",
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import tomllib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
|
||||
if isinstance(value, str)
|
||||
}
|
||||
return expanded
|
||||
|
||||
|
||||
def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
|
||||
path = config_path or _find_config("research_overrides.json")
|
||||
if not path or not path.exists():
|
||||
return {}
|
||||
payload = path.read_text(encoding="utf-8")
|
||||
data = json.loads(payload)
|
||||
return data if isinstance(data, dict) else {}
|
||||
|
||||
@@ -12,7 +12,7 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from .config import load_research_paths
|
||||
from .config import load_research_overrides, load_research_paths
|
||||
|
||||
try: # Optional dependency for richer metadata extraction.
|
||||
from pypdf import PdfReader
|
||||
@@ -229,10 +229,12 @@ def build_paper_entry(
|
||||
|
||||
def build_research_catalog(
|
||||
root: Path,
|
||||
overrides: dict[str, object] | None = None,
|
||||
include_abstract: bool = True,
|
||||
max_pages: int = 2,
|
||||
max_abstract_chars: int = 1200,
|
||||
) -> dict[str, object]:
|
||||
override_map = normalize_overrides(overrides)
|
||||
papers: list[dict[str, object]] = []
|
||||
errors: list[dict[str, str]] = []
|
||||
for path in iter_pdf_paths(root):
|
||||
@@ -244,6 +246,7 @@ def build_research_catalog(
|
||||
max_pages=max_pages,
|
||||
max_abstract_chars=max_abstract_chars,
|
||||
)
|
||||
apply_overrides(entry, override_map)
|
||||
except Exception as exc: # pragma: no cover - defensive
|
||||
errors.append({"path": str(path), "error": str(exc)})
|
||||
continue
|
||||
@@ -261,6 +264,53 @@ def build_research_catalog(
|
||||
return catalog
|
||||
|
||||
|
||||
def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
|
||||
if not overrides:
|
||||
return {}
|
||||
if "papers" in overrides and isinstance(overrides["papers"], dict):
|
||||
data = overrides["papers"]
|
||||
else:
|
||||
data = overrides
|
||||
cleaned: dict[str, dict[str, object]] = {}
|
||||
for key, value in data.items():
|
||||
if isinstance(value, dict):
|
||||
cleaned[str(key)] = value
|
||||
return cleaned
|
||||
|
||||
|
||||
def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
    """Merge manual metadata overrides into a catalog entry, in place.

    The entry is matched against the overrides table by the first of its
    ``id``, ``filename``, ``relative_path``, or ``path`` values that appears
    as a key. String override values are stripped; blank strings and ``None``
    are ignored (they do not clear existing fields).

    Side effects:
        Sets ``entry["metadata_source"] = "override"`` when any field was
        actually written, and ``entry["abstract_source"] = "override"`` when
        ``abstract_excerpt`` specifically was written.

    Args:
        entry: Catalog entry dict to mutate.
        overrides: Normalized ``{key: fields}`` mapping (see
            ``normalize_overrides``).
    """
    if not overrides:
        return
    candidate_keys = [
        entry.get("id"),
        entry.get("filename"),
        entry.get("relative_path"),
        entry.get("path"),
    ]
    override: dict[str, object] | None = None
    for key in candidate_keys:
        if isinstance(key, str) and key in overrides:
            override = overrides[key]
            break
    if not override:
        return
    applied: set[str] = set()
    for field, value in override.items():
        if isinstance(value, str):
            cleaned = value.strip()
            if not cleaned:
                # Blank strings mean "no override", never "clear the field".
                continue
            entry[field] = cleaned
            applied.add(field)
        elif value is not None:
            entry[field] = value
            applied.add(field)
    if applied:
        entry["metadata_source"] = "override"
    # Bug fix: only claim the abstract came from an override when it was
    # actually written. The previous version stamped abstract_source even
    # when the override value was blank/None and therefore skipped.
    if "abstract_excerpt" in applied:
        entry["abstract_source"] = "override"
|
||||
|
||||
|
||||
def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
|
||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
output_path.write_text(
|
||||
|
||||
@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
|
||||
assert entry["title"] == "Test Paper"
|
||||
assert entry["author"] == "Jane Doe"
|
||||
assert entry["metadata_source"] == "regex"
|
||||
|
||||
|
||||
def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """Manual overrides should replace metadata parsed from the PDF itself."""
    root = tmp_path / "Research"
    root.mkdir()
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual = {
        "papers": {
            "paper.pdf": {
                "title": "Manual Title",
                "author": "Manual Author",
            }
        }
    }
    result = build_research_catalog(
        root,
        overrides=manual,
        include_abstract=False,
    )
    paper = result["papers"][0]
    assert paper["title"] == "Manual Title"
    assert paper["author"] == "Manual Author"
    assert paper["metadata_source"] == "override"
|
||||
|
||||
Reference in New Issue
Block a user