#!/usr/bin/env python3
"""
GlpHypoMine (글피하이포마인) — GLP-1 off-target hypothesis mining engine.

Pipeline (offline / synthetic data):
  1. PubMed-like incremental fetch (mocked) for GLP-1RA x organ systems
  2. FAERS quarterly dump parser -> PRR/ROR signal extraction
  3. NER + local LLM (mocked) -> (drug, organ, mechanism, population) tuples
  4. ClinicalTrials.gov cross-reference -> exclude already-investigated combos
  5. Rank unexplored combos by mechanism plausibility -> hypothesis cards

DISCLAIMER:
  This tool is for research and reference use only. Do not use it for clinical
  decision-making.
  Generated hypotheses require expert validation before grant submission
  or experimental design.
"""

from __future__ import annotations

import argparse
import csv
import json
import math
import os
import sys
from collections import defaultdict
from typing import Any

# Notice printed at the top of every CLI command's output (see cmd_rank and
# cmd_stats). The Korean sentence reads: "This tool is for research and
# reference use only. Do not use it for clinical decision-making."
DISCLAIMER = (
    "WARNING: 본 도구는 연구·참고용입니다. 임상 의사결정에 사용 금지.\n"
    "Generated hypotheses require expert validation before grant submission "
    "or experimental design."
)

# Absolute path of the "data" directory located next to this source file;
# _load_json and _load_csv resolve dataset file names against it.
DATA_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")


# ---------------------------------------------------------------------------
# Data loaders
# ---------------------------------------------------------------------------

def _load_json(name: str) -> Any:
    """Parse the UTF-8 JSON file called *name* inside ``DATA_DIR``.

    Returns whatever object ``json.load`` builds from the file's contents.
    """
    with open(os.path.join(DATA_DIR, name), mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload


def _load_csv(name: str) -> list[dict[str, str]]:
    """Read the UTF-8 CSV file called *name* inside ``DATA_DIR``.

    The first row is used as the header; every following row is returned as
    a dict keyed by those header names (``csv.DictReader`` semantics).

    Returns:
        All data rows of the file, in file order.
    """
    path = os.path.join(DATA_DIR, name)
    # newline="" leaves line-ending handling to the csv module, as its
    # documentation requires. Without it, universal-newline translation
    # rewrites "\r\n" sequences that sit inside quoted fields before the
    # parser ever sees them.
    with open(path, "r", encoding="utf-8", newline="") as f:
        return list(csv.DictReader(f))


def load_corpus() -> dict[str, Any]:
    """Load every bundled dataset and return them under short keys.

    Keys of the returned dict: ``drugs``, ``organs``, ``pubmed``, ``faers``
    and ``trials``. The JSON files are unwrapped from their single top-level
    key; the FAERS CSV is returned as a list of row dicts.
    """
    drugs = _load_json("glp1_drugs.json")["drugs"]
    organs = _load_json("organ_systems.json")["organ_systems"]
    pubmed = _load_json("pubmed_sample.json")["records"]
    faers = _load_csv("faers_sample.csv")
    trials = _load_json("clinicaltrials_sample.json")["trials"]
    return dict(
        drugs=drugs,
        organs=organs,
        pubmed=pubmed,
        faers=faers,
        trials=trials,
    )


# ---------------------------------------------------------------------------
# FAERS disproportionality (PRR / ROR)
# ---------------------------------------------------------------------------

def faers_signals(faers: list[dict[str, str]]) -> list[dict[str, Any]]:
    """
    Derive disproportionality statistics for every (drug, organ) pair that
    has at least one report in *faers*.

    Each row of *faers* counts as one report and must carry the keys
    ``"drug"`` and ``"organ"``. For a given pair the 2x2 table is:

        a = reports with this drug and this organ
        b = reports with this drug, other organ
        c = reports with other drug, this organ
        d = reports with other drug, other organ

    0.5 is added to all four cells before the ratios are taken so that no
    denominator can be zero:

        PRR = (a/(a+b)) / (c/(c+d))
        ROR = (a*d) / (b*c)

    Returns a list of dicts with keys ``drug``, ``organ``, ``n`` (the raw
    count a), ``PRR`` and ``ROR`` (both rounded to 3 decimals), ordered by
    descending ROR.
    """
    pair_counts: dict[tuple[str, str], int] = defaultdict(int)
    drug_totals: dict[str, int] = defaultdict(int)
    organ_totals: dict[str, int] = defaultdict(int)
    for row in faers:
        drug_name, organ_name = row["drug"], row["organ"]
        pair_counts[(drug_name, organ_name)] += 1
        drug_totals[drug_name] += 1
        organ_totals[organ_name] += 1
    n_reports = len(faers)

    organ_names = sorted(organ_totals)
    signals = []
    for drug_name in sorted(drug_totals):
        for organ_name in organ_names:
            # .get() so that probing an unseen pair does not add it to the
            # defaultdict.
            a = pair_counts.get((drug_name, organ_name), 0)
            if not a:
                continue
            b = drug_totals[drug_name] - a
            c = organ_totals[organ_name] - a
            d = n_reports - a - b - c
            # Continuity correction: shift every cell by 0.5.
            a_adj, b_adj, c_adj, d_adj = (cell + 0.5 for cell in (a, b, c, d))
            prr = (a_adj / (a_adj + b_adj)) / (c_adj / (c_adj + d_adj))
            ror = (a_adj * d_adj) / (b_adj * c_adj)
            signals.append({
                "drug": drug_name,
                "organ": organ_name,
                "n": a,
                "PRR": round(prr, 3),
                "ROR": round(ror, 3),
            })
    # Stable sort: pairs with equal rounded ROR stay in (drug, organ) order.
    return sorted(signals, key=lambda sig: sig["ROR"], reverse=True)


# ---------------------------------------------------------------------------
# Mocked NER + local LLM tuple extraction
# ---------------------------------------------------------------------------

def extract_tuples(pubmed: list[dict[str, Any]]) -> list[dict[str, str]]:
    """
    Stand-in for the NER + local-LLM extraction step.

    The sample records already carry structured fields, so each record is
    simply projected onto the tuple shape the extractor is meant to emit:
    ``drug``, ``organ``, ``mechanism``, ``population``, plus ``pmid`` and
    ``year``. ``mechanism`` and ``population`` fall back to
    ``"unspecified"`` when a record lacks them; the other four keys are
    required. Any additional keys on a record are dropped.
    """
    return [
        {
            "drug": record["drug"],
            "organ": record["organ"],
            "mechanism": record.get("mechanism", "unspecified"),
            "population": record.get("population", "unspecified"),
            "pmid": record["pmid"],
            "year": record["year"],
        }
        for record in pubmed
    ]


# ---------------------------------------------------------------------------
# ClinicalTrials.gov cross-reference (exclude already-investigated combos)
# ---------------------------------------------------------------------------

def already_in_trials(trials: list[dict[str, Any]]) -> set[tuple[str, str]]:
    """Collect the distinct (drug, organ) pairs that appear in *trials*.

    Every trial record must provide the keys ``"drug"`` and ``"organ"``.
    """
    investigated: set[tuple[str, str]] = set()
    for trial in trials:
        investigated.add((trial["drug"], trial["organ"]))
    return investigated


# ---------------------------------------------------------------------------
# Plausibility ranking
# ---------------------------------------------------------------------------

# Curated heuristic prior per organ-system id: how plausible GLP-1R (or
# downstream pathway) involvement is for that organ, hand-graded on a 0-1
# scale from current mechanistic literature (rough prior, not authoritative).
# plausibility_score() reads this table and falls back to 0.25 for any organ
# id that is not listed here.
ORGAN_PRIOR = {
    "cardiovascular": 0.85,
    "renal": 0.75,
    "hepatic": 0.80,
    "pancreatic": 0.95,
    "gastrointestinal": 0.95,
    "neurologic_central": 0.70,
    "neurologic_peripheral": 0.45,
    "psychiatric": 0.60,
    "endocrine_thyroid": 0.70,
    "endocrine_adrenal": 0.40,
    "reproductive_male": 0.45,
    "reproductive_female": 0.55,
    "musculoskeletal": 0.55,
    "skin_dermatologic": 0.30,
    "ocular": 0.45,
    "auditory": 0.20,
    "respiratory": 0.35,
    "hematologic": 0.30,
    "immunologic": 0.65,
    "oncologic": 0.50,
    "metabolic_lipid": 0.75,
    "metabolic_glucose": 0.95,
    "adipose": 0.85,
    "biliary": 0.80,
    "urinary_lower": 0.30,
    "vascular_peripheral": 0.55,
    "lymphatic": 0.25,
    "oral_dental": 0.20,
    "olfactory_taste": 0.40,
    "sleep": 0.55,
    "behavioral_addiction": 0.70,
    "thermoregulation": 0.40,
    "circadian": 0.45,
    "microbiome": 0.65,
    "connective_tissue": 0.30,
    "vestibular": 0.20,
}


def plausibility_score(drug: str, organ: str,
                       faers_index: dict[tuple[str, str], dict[str, Any]],
                       pubmed_index: dict[tuple[str, str], list[dict[str, Any]]],
                       in_trials: set[tuple[str, str]]) -> dict[str, float]:
    """
    Score one (drug, organ) pair and report the individual components.

    The score is a weighted sum:
       0.45 * organ prior (ORGAN_PRIOR entry, 0.25 when the organ is unlisted)
     + 0.25 * FAERS term: log10(1 + ROR), capped at 1; 0 without a signal
     + 0.20 * PubMed term: number of extracted records / 3, capped at 1
     - 0.30 * 1 if the pair is already in a trial, else 0
              (so unexplored combos rise)

    Returns a dict with ``score``, ``ror_term`` and ``pub_term`` rounded to
    3 decimals, the unrounded ``prior``, and the boolean ``in_trial``.
    """
    pair = (drug, organ)

    organ_prior = ORGAN_PRIOR.get(organ, 0.25)

    # FAERS contribution defaults to zero and is only raised when a signal
    # with a positive ROR exists for this pair.
    ror_term = 0.0
    signal = faers_index.get(pair)
    if signal and signal["ROR"] > 0:
        ror_term = min(1.0, math.log(1 + signal["ROR"]) / math.log(10))

    n_pubs = len(pubmed_index.get(pair, []))
    pub_term = min(1.0, n_pubs / 3.0)

    investigated = pair in in_trials
    penalty = float(investigated)

    total = (
        0.45 * organ_prior
        + 0.25 * ror_term
        + 0.20 * pub_term
        - 0.30 * penalty
    )
    return {
        "score": round(total, 3),
        "prior": organ_prior,
        "ror_term": round(ror_term, 3),
        "pub_term": round(pub_term, 3),
        "in_trial": investigated,
    }


# ---------------------------------------------------------------------------
# Hypothesis card builder
# ---------------------------------------------------------------------------

def build_cards(corpus: dict[str, Any], domain: str | None = None,
                top_n: int = 10) -> list[dict[str, Any]]:
    """
    Build ranked hypothesis cards for every (drug, organ) pair with evidence.

    Args:
        corpus: Mapping with the keys ``drugs`` (items with ``"name"``),
            ``organs`` (items with ``"id"``), ``faers``, ``pubmed`` and
            ``trials``.
        domain: When given (and non-empty), only the organ system with this
            id is considered; an id that matches no organ yields ``[]``.
        top_n: Maximum number of cards to return. A negative value is
            treated as 0, so it returns no cards.

    Returns:
        Up to ``top_n`` cards sorted by descending score. A pair gets a card
        only if it has a FAERS signal or at least one PubMed tuple. Each
        card holds the drug, organ, candidate mechanisms and populations,
        the FAERS signal dict (or ``None``), the PubMed record count, the
        score with its components, and an ``explored``/``unexplored``
        novelty label.
    """
    drugs = [d["name"] for d in corpus["drugs"]]
    organs = [o["id"] for o in corpus["organs"]]
    if domain:
        organs = [o for o in organs if o == domain]
        if not organs:
            return []

    # Left negative, the final slice would read top_n as "drop the last
    # |top_n| cards" and silently return a truncated ranking. (None is left
    # alone so that an unbounded slice keeps working.)
    if top_n is not None and top_n < 0:
        top_n = 0

    faers_index = {
        (s["drug"], s["organ"]): s for s in faers_signals(corpus["faers"])
    }

    pubmed_index: dict[tuple[str, str], list[dict[str, Any]]] = defaultdict(list)
    for t in extract_tuples(corpus["pubmed"]):
        pubmed_index[(t["drug"], t["organ"])].append(t)

    in_trials = already_in_trials(corpus["trials"])

    cards: list[dict[str, Any]] = []
    for drug in drugs:
        for organ in organs:
            key = (drug, organ)
            faers_sig = faers_index.get(key)
            # .get() rather than indexing so the defaultdict is not grown
            # with empty lists for pairs that have no literature.
            pubs = pubmed_index.get(key, [])
            # Skip combos with no signal at all (no FAERS, no PubMed) before
            # spending any work on scoring them.
            if faers_sig is None and not pubs:
                continue
            comps = plausibility_score(
                drug, organ, faers_index, pubmed_index, in_trials,
            )
            mechs = [t["mechanism"] for t in pubs]
            pops = [t["population"] for t in pubs]
            cards.append({
                "drug": drug,
                "organ": organ,
                "mechanism_candidates": mechs or ["unspecified (FAERS-only signal)"],
                "population_candidates": pops or ["unspecified"],
                "faers": faers_sig,
                "n_pubmed": len(pubs),
                "in_trial": comps["in_trial"],
                "components": comps,
                "score": comps["score"],
                "novelty": "explored" if comps["in_trial"] else "unexplored",
            })

    cards.sort(key=lambda c: c["score"], reverse=True)
    return cards[:top_n]


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def cmd_rank(args: argparse.Namespace) -> int:
    """Handle the ``rank`` sub-command: print the ranked hypothesis cards.

    Reads ``args.domain``, ``args.top`` and ``args.json``. The disclaimer is
    always printed first; with ``args.json`` set, the cards are additionally
    dumped as JSON after the human-readable listing. Always returns 0.
    """
    cards = build_cards(load_corpus(), domain=args.domain, top_n=args.top)
    print(DISCLAIMER)
    print()
    if not cards:
        print(f"No hypothesis cards generated (domain={args.domain!r}).")
        return 0

    domain_tag = f" [domain={args.domain}]" if args.domain else ""
    print(f"Top {len(cards)} hypothesis cards" + domain_tag + ":")
    heavy_rule = "=" * 72
    light_rule = "-" * 72
    print(heavy_rule)
    for rank, card in enumerate(cards, start=1):
        signal = card["faers"]
        if signal:
            faers_str = f"PRR={signal['PRR']} ROR={signal['ROR']} n={signal['n']}"
        else:
            faers_str = "no FAERS signal"
        print(f"[{rank:02d}] score={card['score']:.3f}  novelty={card['novelty']}")
        print(f"     drug      : {card['drug']}")
        print(f"     organ     : {card['organ']}")
        # Only the first candidate of each list is shown in the text view.
        print(f"     mechanism : {card['mechanism_candidates'][0]}")
        print(f"     population: {card['population_candidates'][0]}")
        print(f"     FAERS     : {faers_str}")
        print(f"     PubMed n  : {card['n_pubmed']}    in-trial: {card['in_trial']}")
        print(light_rule)

    if args.json:
        print()
        print(json.dumps(cards, ensure_ascii=False, indent=2))
    return 0


def cmd_stats(_args: argparse.Namespace) -> int:
    """Handle the ``stats`` sub-command: print corpus sizes and top signals.

    Prints the disclaimer, the number of entries in each dataset, the number
    of FAERS (drug, organ) signals and, when there are any, the five signals
    with the highest ROR. The parsed arguments are not used. Always
    returns 0.
    """
    corpus = load_corpus()
    print(DISCLAIMER)
    print()
    print("Corpus statistics")
    print("=" * 40)
    # (pre-padded label, corpus key) in display order.
    size_rows = (
        ("GLP-1RA drugs       ", "drugs"),
        ("Organ systems       ", "organs"),
        ("PubMed records      ", "pubmed"),
        ("FAERS reports       ", "faers"),
        ("ClinicalTrials.gov  ", "trials"),
    )
    for label, key in size_rows:
        print(f"{label}: {len(corpus[key])}")

    signals = faers_signals(corpus["faers"])
    print(f"FAERS (drug,organ) signals computed: {len(signals)}")
    if not signals:
        return 0

    print()
    print("Top FAERS ROR signals:")
    for sig in signals[:5]:
        print(f"  {sig['drug']:15s} x {sig['organ']:25s} "
              f"ROR={sig['ROR']:.2f} PRR={sig['PRR']:.2f} n={sig['n']}")
    return 0


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser with its ``rank`` and ``stats`` commands.

    A sub-command is mandatory. Each sub-parser stores its handler function
    in the ``func`` default so the caller can dispatch on ``args.func``.
    """
    parser = argparse.ArgumentParser(
        prog="glp-hypo-mine",
        description=(
            "GlpHypoMine — GLP-1 off-target hypothesis mining engine "
            "(synthetic offline data). 본 도구는 연구·참고용입니다."
        ),
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    # rank: ranked hypothesis cards, optionally restricted to one organ system.
    rank_cmd = commands.add_parser("rank", help="rank hypothesis cards")
    rank_cmd.set_defaults(func=cmd_rank)
    rank_cmd.add_argument(
        "--domain",
        default=None,
        help="restrict to one organ system id (e.g. cardiovascular)",
    )
    rank_cmd.add_argument(
        "--top",
        type=int,
        default=10,
        help="number of cards to return (default 10)",
    )
    rank_cmd.add_argument(
        "--json",
        action="store_true",
        help="also dump cards as JSON after the human view",
    )

    # stats: takes no options of its own.
    stats_cmd = commands.add_parser("stats", help="corpus statistics")
    stats_cmd.set_defaults(func=cmd_stats)

    return parser


def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected sub-command.

    With ``argv=None`` argparse reads the process command line. The return
    value is whatever the sub-command handler returns.
    """
    args = build_parser().parse_args(argv)
    handler = args.func
    return handler(args)


# Script entry point: run the CLI and hand main()'s return value to
# sys.exit() as the process exit status.
if __name__ == "__main__":
    sys.exit(main())