"""Fetchers for PubMed E-utilities, ClinicalTrials.gov v2, and medRxiv.

Default mode is OFFLINE — load synthetic JSON from data/. Live network calls
are intentionally not implemented in the MVP (they raise NotImplementedError)
to comply with the project's no-network constraint. Stubs document the
intended endpoints.
"""

import json
from pathlib import Path


# Documented endpoints (NOT called at runtime in MVP):
# They are kept only as a reference for a future live mode; no function in
# this module reads them, and no network request is issued here.
PUBMED_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"  # E-utilities ESearch
PUBMED_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"  # E-utilities EFetch
CTG_V2 = "https://clinicaltrials.gov/api/v2/studies"  # ClinicalTrials.gov v2 studies
MEDRXIV_API = "https://api.medrxiv.org/details/medrxiv"  # medRxiv details


def _load_json(path):
    p = Path(path)
    if not p.exists():
        return []
    with p.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict) and "records" in data:
        return data["records"]
    return data if isinstance(data, list) else []


def fetch_pubmed(data_dir, run_date, offline=True):
    """Return PubMed records for ``run_date`` from the offline fixture.

    Args:
        data_dir: Directory holding ``mock_pubmed_<run_date>.json``.
        run_date: Value interpolated into the fixture file name.
        offline: Must be truthy; live fetching is not implemented.

    Returns:
        Whatever ``_load_json`` returns for the fixture path.

    Raises:
        NotImplementedError: If ``offline`` is falsy.
    """
    if offline:
        fixture = Path(data_dir) / f"mock_pubmed_{run_date}.json"
        return _load_json(fixture)

    # Live mode is deliberately unavailable; no path is built on this branch.
    message = (
        "Live PubMed fetch is disabled in this build. Run with --offline. "
        "Live mode would call ESearch/EFetch with the configured DKD query."
    )
    raise NotImplementedError(message)


def fetch_ctg(data_dir, run_date, offline=True):
    """Return ClinicalTrials.gov records for ``run_date`` from the offline fixture.

    Args:
        data_dir: Directory holding ``mock_ctg_<run_date>.json``.
        run_date: Value interpolated into the fixture file name.
        offline: Must be truthy; live fetching is not implemented.

    Returns:
        Whatever ``_load_json`` returns for the fixture path.

    Raises:
        NotImplementedError: If ``offline`` is falsy.
    """
    if offline:
        fixture = Path(data_dir) / f"mock_ctg_{run_date}.json"
        return _load_json(fixture)

    # Live mode is deliberately unavailable; no path is built on this branch.
    message = "Live ClinicalTrials.gov v2 fetch is disabled. Run with --offline."
    raise NotImplementedError(message)


def fetch_medrxiv(data_dir, run_date, offline=True):
    """Return medRxiv records for ``run_date`` from the offline fixture.

    Args:
        data_dir: Directory holding ``mock_medrxiv_<run_date>.json``.
        run_date: Value interpolated into the fixture file name.
        offline: Must be truthy; live fetching is not implemented.

    Returns:
        Whatever ``_load_json`` returns for the fixture path.

    Raises:
        NotImplementedError: If ``offline`` is falsy.
    """
    if offline:
        fixture = Path(data_dir) / f"mock_medrxiv_{run_date}.json"
        return _load_json(fixture)

    # Live mode is deliberately unavailable; no path is built on this branch.
    message = "Live medRxiv fetch is disabled. Run with --offline."
    raise NotImplementedError(message)


def load_seen(data_dir):
    """Load the set of already-seen IDs from ``seen_pmids.json``.

    Loading is best-effort: a missing or unreadable cache is treated as
    "nothing seen yet" rather than as an error.

    Args:
        data_dir: Directory expected to contain ``seen_pmids.json``.

    Returns:
        A set with every stored entry converted to ``str``. The set is empty
        when the file is missing, is not valid UTF-8, is not valid JSON, or
        does not hold a top-level JSON array.
    """
    p = Path(data_dir) / "seen_pmids.json"
    if not p.exists():
        return set()
    with p.open("r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # Byte-level corruption raises UnicodeDecodeError while the file
            # is being read; treat it the same as malformed JSON.
            return set()
    if isinstance(data, list):
        # Entries are stringified so int and str forms of an ID compare equal.
        return {str(x) for x in data}
    return set()


def save_seen(data_dir, seen):
    """Write the seen IDs to ``seen_pmids.json`` as a sorted JSON array.

    IDs are stored as strings, which is the form ``load_seen`` hands back, so
    a collection mixing ``int`` and ``str`` IDs can be saved without a
    ``TypeError`` from sorting.

    Args:
        data_dir: Existing directory in which ``seen_pmids.json`` is written.
        seen: Iterable of IDs; each one is converted with ``str``.

    Raises:
        OSError: If the file cannot be opened for writing (for example when
            ``data_dir`` does not exist).
    """
    p = Path(data_dir) / "seen_pmids.json"
    # Build the payload BEFORE opening the file: mode "w" truncates at once,
    # so a failure while preparing the data must not wipe the existing cache.
    ids = sorted({str(x) for x in seen})
    with p.open("w", encoding="utf-8") as f:
        json.dump(ids, f, indent=2)


def fetch_all(data_dir, run_date, offline=True):
    """Collect records from all three fetchers into one flat list.

    The fetchers run in the order pubmed, ctg, medrxiv. A record that already
    has a ``"source"`` key is passed through untouched; any other record is
    shallow-copied and the copy is tagged with the fetcher's label, so the
    fetched object itself is never mutated.

    Args:
        data_dir: Forwarded to each fetcher.
        run_date: Forwarded to each fetcher.
        offline: Forwarded to each fetcher as a keyword argument.

    Returns:
        A list of all records, in fetcher order.
    """
    fetchers = {
        "pubmed": fetch_pubmed,
        "ctg": fetch_ctg,
        "medrxiv": fetch_medrxiv,
    }
    combined = []
    for label, fetch in fetchers.items():
        records = fetch(data_dir, run_date, offline=offline)
        if not records:
            # A fetcher returning None or an empty result adds nothing.
            continue
        combined.extend(
            rec if "source" in rec else dict(rec, source=label)
            for rec in records
        )
    return combined
