Add research overrides support
This commit is contained in:
@@ -17,8 +17,10 @@ python -m afs_scawful research open 2512-20957v2-XXXXXXXX --open
|
|||||||
## Overrides
|
## Overrides
|
||||||
- `AFS_RESEARCH_ROOT=/path/to/Research`
|
- `AFS_RESEARCH_ROOT=/path/to/Research`
|
||||||
- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
|
- `AFS_RESEARCH_CATALOG=/path/to/research_catalog.json`
|
||||||
|
- `AFS_RESEARCH_OVERRIDES=/path/to/research_overrides.json`
|
||||||
- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
|
- Optional config: `research_paths.toml` in `~/.config/afs/afs_scawful/` or
|
||||||
`~/.config/afs/plugins/afs_scawful/config/`
|
`~/.config/afs/plugins/afs_scawful/config/`
|
||||||
|
- Optional overrides: `research_overrides.json` in the same config directories.
|
||||||
|
|
||||||
Example `research_paths.toml`:
|
Example `research_paths.toml`:
|
||||||
```toml
|
```toml
|
||||||
@@ -27,6 +29,22 @@ research_root = "~/Documents/Research"
|
|||||||
research_catalog = "~/src/context/index/research_catalog.json"
|
research_catalog = "~/src/context/index/research_catalog.json"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Example `research_overrides.json`:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"papers": {
|
||||||
|
"2510.04950v1.pdf": {
|
||||||
|
"title": "Unknown / needs verification",
|
||||||
|
"author": "Unknown / needs verification"
|
||||||
|
},
|
||||||
|
"7799_Quantifying_Human_AI_Syne.pdf": {
|
||||||
|
"title": "Unknown / needs verification",
|
||||||
|
"author": "Unknown / needs verification"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
- Abstract excerpts are auto-extracted from the first pages; verify before quoting.
|
- Abstract excerpts are auto-extracted from the first pages; verify before quoting.
|
||||||
- `--open` uses the OS default PDF viewer (Preview on macOS).
|
- `--open` uses the OS default PDF viewer (Preview on macOS).
|
||||||
|
|||||||
@@ -2,7 +2,12 @@
|
|||||||
|
|
||||||
__version__ = "0.0.0"
|
__version__ = "0.0.0"
|
||||||
|
|
||||||
from .config import load_research_paths, load_training_paths, load_training_resources
|
from .config import (
|
||||||
|
load_research_overrides,
|
||||||
|
load_research_paths,
|
||||||
|
load_training_paths,
|
||||||
|
load_training_resources,
|
||||||
|
)
|
||||||
from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
|
from .paths import resolve_datasets_root, resolve_index_root, resolve_training_root
|
||||||
from .research import (
|
from .research import (
|
||||||
build_research_catalog,
|
build_research_catalog,
|
||||||
@@ -16,6 +21,7 @@ from .resource_index import ResourceIndexer
|
|||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"load_research_paths",
|
"load_research_paths",
|
||||||
|
"load_research_overrides",
|
||||||
"load_training_paths",
|
"load_training_paths",
|
||||||
"load_training_resources",
|
"load_training_resources",
|
||||||
"resolve_training_root",
|
"resolve_training_root",
|
||||||
|
|||||||
@@ -5,9 +5,11 @@ from __future__ import annotations
|
|||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
|
from .config import load_research_overrides
|
||||||
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
|
from .generators import DocSectionConfig, DocSectionGenerator, write_jsonl
|
||||||
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
|
from .registry import build_dataset_registry, index_datasets, write_dataset_registry
|
||||||
from .resource_index import ResourceIndexer
|
from .resource_index import ResourceIndexer
|
||||||
@@ -129,8 +131,13 @@ def _research_catalog_command(args: argparse.Namespace) -> int:
|
|||||||
if args.output
|
if args.output
|
||||||
else resolve_research_catalog_path()
|
else resolve_research_catalog_path()
|
||||||
)
|
)
|
||||||
|
overrides_path = args.overrides or os.getenv("AFS_RESEARCH_OVERRIDES")
|
||||||
|
overrides = load_research_overrides(
|
||||||
|
Path(overrides_path).expanduser().resolve() if overrides_path else None
|
||||||
|
)
|
||||||
catalog = build_research_catalog(
|
catalog = build_research_catalog(
|
||||||
root,
|
root,
|
||||||
|
overrides=overrides,
|
||||||
include_abstract=not args.no_abstract,
|
include_abstract=not args.no_abstract,
|
||||||
max_pages=args.max_pages,
|
max_pages=args.max_pages,
|
||||||
max_abstract_chars=args.max_abstract_chars,
|
max_abstract_chars=args.max_abstract_chars,
|
||||||
@@ -287,6 +294,7 @@ def build_parser() -> argparse.ArgumentParser:
|
|||||||
)
|
)
|
||||||
research_catalog.add_argument("--root", help="Research root override.")
|
research_catalog.add_argument("--root", help="Research root override.")
|
||||||
research_catalog.add_argument("--output", help="Output catalog path.")
|
research_catalog.add_argument("--output", help="Output catalog path.")
|
||||||
|
research_catalog.add_argument("--overrides", help="Overrides JSON path.")
|
||||||
research_catalog.add_argument(
|
research_catalog.add_argument(
|
||||||
"--no-abstract",
|
"--no-abstract",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
import tomllib
|
import tomllib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -72,3 +73,12 @@ def load_research_paths(config_path: Path | None = None) -> dict[str, dict[str,
|
|||||||
if isinstance(value, str)
|
if isinstance(value, str)
|
||||||
}
|
}
|
||||||
return expanded
|
return expanded
|
||||||
|
|
||||||
|
|
||||||
|
def load_research_overrides(config_path: Path | None = None) -> dict[str, Any]:
|
||||||
|
path = config_path or _find_config("research_overrides.json")
|
||||||
|
if not path or not path.exists():
|
||||||
|
return {}
|
||||||
|
payload = path.read_text(encoding="utf-8")
|
||||||
|
data = json.loads(payload)
|
||||||
|
return data if isinstance(data, dict) else {}
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from datetime import datetime, timezone
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable
|
from typing import Iterable
|
||||||
|
|
||||||
from .config import load_research_paths
|
from .config import load_research_overrides, load_research_paths
|
||||||
|
|
||||||
try: # Optional dependency for richer metadata extraction.
|
try: # Optional dependency for richer metadata extraction.
|
||||||
from pypdf import PdfReader
|
from pypdf import PdfReader
|
||||||
@@ -229,10 +229,12 @@ def build_paper_entry(
|
|||||||
|
|
||||||
def build_research_catalog(
|
def build_research_catalog(
|
||||||
root: Path,
|
root: Path,
|
||||||
|
overrides: dict[str, object] | None = None,
|
||||||
include_abstract: bool = True,
|
include_abstract: bool = True,
|
||||||
max_pages: int = 2,
|
max_pages: int = 2,
|
||||||
max_abstract_chars: int = 1200,
|
max_abstract_chars: int = 1200,
|
||||||
) -> dict[str, object]:
|
) -> dict[str, object]:
|
||||||
|
override_map = normalize_overrides(overrides)
|
||||||
papers: list[dict[str, object]] = []
|
papers: list[dict[str, object]] = []
|
||||||
errors: list[dict[str, str]] = []
|
errors: list[dict[str, str]] = []
|
||||||
for path in iter_pdf_paths(root):
|
for path in iter_pdf_paths(root):
|
||||||
@@ -244,6 +246,7 @@ def build_research_catalog(
|
|||||||
max_pages=max_pages,
|
max_pages=max_pages,
|
||||||
max_abstract_chars=max_abstract_chars,
|
max_abstract_chars=max_abstract_chars,
|
||||||
)
|
)
|
||||||
|
apply_overrides(entry, override_map)
|
||||||
except Exception as exc: # pragma: no cover - defensive
|
except Exception as exc: # pragma: no cover - defensive
|
||||||
errors.append({"path": str(path), "error": str(exc)})
|
errors.append({"path": str(path), "error": str(exc)})
|
||||||
continue
|
continue
|
||||||
@@ -261,6 +264,53 @@ def build_research_catalog(
|
|||||||
return catalog
|
return catalog
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_overrides(overrides: dict[str, object] | None) -> dict[str, dict[str, object]]:
|
||||||
|
if not overrides:
|
||||||
|
return {}
|
||||||
|
if "papers" in overrides and isinstance(overrides["papers"], dict):
|
||||||
|
data = overrides["papers"]
|
||||||
|
else:
|
||||||
|
data = overrides
|
||||||
|
cleaned: dict[str, dict[str, object]] = {}
|
||||||
|
for key, value in data.items():
|
||||||
|
if isinstance(value, dict):
|
||||||
|
cleaned[str(key)] = value
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def apply_overrides(entry: dict[str, object], overrides: dict[str, dict[str, object]]) -> None:
    """Apply the matching manual override, if any, to a catalog entry in place.

    The override is looked up by the entry's ``id``, ``filename``,
    ``relative_path`` and ``path`` values, in that order; the first string
    value that is a key of ``overrides`` wins.

    Args:
        entry: Catalog entry to mutate.
        overrides: Map of paper key to override fields.

    Override values that are ``None`` or blank strings are ignored. String
    values are stripped before being stored. When at least one field is
    written, ``metadata_source`` is set to ``"override"``. ``abstract_source``
    is set to ``"override"`` only when ``abstract_excerpt`` was actually
    written.
    """
    if not overrides:
        return
    candidates = (
        entry.get("id"),
        entry.get("filename"),
        entry.get("relative_path"),
        entry.get("path"),
    )
    override = next(
        (overrides[key] for key in candidates if isinstance(key, str) and key in overrides),
        None,
    )
    if not override:
        return

    # Track what was really written so provenance markers are never set for
    # a field whose override value was skipped.
    applied: set[str] = set()
    for field, value in override.items():
        if value is None:
            continue
        if isinstance(value, str):
            value = value.strip()
            if not value:
                continue
        entry[field] = value
        applied.add(field)

    if applied:
        entry["metadata_source"] = "override"
    if "abstract_excerpt" in applied:
        entry["abstract_source"] = "override"
|
||||||
|
|
||||||
|
|
||||||
def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
|
def write_research_catalog(catalog: dict[str, object], output_path: Path) -> None:
|
||||||
output_path.parent.mkdir(parents=True, exist_ok=True)
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
output_path.write_text(
|
output_path.write_text(
|
||||||
|
|||||||
@@ -24,3 +24,28 @@ def test_build_research_catalog_regex(tmp_path: Path) -> None:
|
|||||||
assert entry["title"] == "Test Paper"
|
assert entry["title"] == "Test Paper"
|
||||||
assert entry["author"] == "Jane Doe"
|
assert entry["author"] == "Jane Doe"
|
||||||
assert entry["metadata_source"] == "regex"
|
assert entry["metadata_source"] == "regex"
|
||||||
|
|
||||||
|
|
||||||
|
def test_build_research_catalog_overrides(tmp_path: Path) -> None:
    """Manual overrides keyed by filename replace the extracted metadata."""
    root = tmp_path / "Research"
    root.mkdir()
    (root / "paper.pdf").write_bytes(b"%PDF-1.0 /Title (Ignored)")

    manual_fields = {"title": "Manual Title", "author": "Manual Author"}
    catalog = build_research_catalog(
        root,
        overrides={"papers": {"paper.pdf": manual_fields}},
        include_abstract=False,
    )

    first = catalog["papers"][0]
    assert first["title"] == "Manual Title"
    assert first["author"] == "Manual Author"
    assert first["metadata_source"] == "override"
|
||||||
|
|||||||
Reference in New Issue
Block a user