#!/usr/bin/env python3
"""
ObesityCVHypoMap (오베시티시브이하이포맵)
=========================================

항비만 약물 x CV outcome subtype x phenotype x mediation pathway 6D 온톨로지를
이용해 미탐색 hypothesis cell을 발굴하고, mediation analysis framework + 한국
RWE/IIS 가능성 score + grant-ready hypothesis card를 생성하는 CLI.

본 도구는 연구용/참고용입니다. 임상의사결정에 직접 사용하지 마십시오.
생성된 가설은 IRB 승인과 전문가 검토를 거친 후에만 활용하십시오.
항비만 약물의 CV outcome 권고는 학회 가이드라인을 우선시하십시오.

표준 라이브러리만 사용 (argparse, json, dataclasses, math, pathlib, sys).
"""

from __future__ import annotations

import argparse
import json
import math
import sys
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any

# Filesystem anchors, all resolved relative to the directory that contains
# this file (symlinks are resolved first).
ROOT = Path(__file__).resolve().parent
# Directory the JSON input files are read from.
DATA_DIR = ROOT / "data"
# Directory the generated markdown files are written to.
OUT_DIR = ROOT / "outputs"

# Research-use-only notice (Korean). It is used as the argparse epilog,
# printed on every run and embedded in each generated markdown document.
DISCLAIMER = (
    "본 도구는 연구용/참고용입니다. 임상의사결정에 직접 사용하지 마십시오. "
    "생성된 가설은 IRB 승인과 전문가 검토를 거친 후에만 활용하십시오. "
    "항비만 약물의 CV outcome 권고는 학회 가이드라인을 우선시하십시오."
)


# ---------- data loading ----------

def _load(name: str) -> dict:
    """Read the UTF-8 JSON file *name* from ``DATA_DIR`` and return the parsed value."""
    source = DATA_DIR / name
    with source.open("r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload


def load_all() -> dict[str, Any]:
    """Load every input file and return them as one state mapping.

    The four ontology files are unwrapped to the list stored under their
    top-level key; ``evidence.json`` is kept whole under ``"evidence"``.
    """
    # (state key, file name, top-level key inside that file)
    ontology_sources = (
        ("drugs", "drugs.json", "drugs"),
        ("cv_subtypes", "cv_subtypes.json", "subtypes"),
        ("phenotypes", "phenotypes.json", "phenotypes"),
        ("mediations", "mediations.json", "mediations"),
    )
    state: dict[str, Any] = {}
    for state_key, filename, section in ontology_sources:
        state[state_key] = _load(filename)[section]
    state["evidence"] = _load("evidence.json")
    return state


# ---------- model ----------

@dataclass
class HypothesisCell:
    """One scored (drug, CV subtype, phenotype) hypothesis with its evidence.

    The first eleven fields are required and positional; everything after
    them has a default so a cell can be created before it is scored.
    """

    # --- identity of the cell ---
    drug_id: str
    drug_name: str
    cv_id: str
    cv_name: str
    cv_group: str
    pheno_id: str
    pheno_name: str
    # --- mediation pathway attached to the cell ---
    mediation_id: str
    mediation_name: str
    surrogate: str
    analysis_model: str
    # --- evidence inputs ---
    pubmed_n: int = 0
    has_active_trial: bool = False
    korea_sites: int = 0
    faers_prr: float = 0.0
    preprint_signal: bool = False
    iis_potential_korea: str = "low"
    # --- scoring results ---
    score_components: dict = field(default_factory=dict)
    total_score: float = 0.0
    rationale: str = ""
    suggested_n: int = 0
    power_pct: float = 0.0
    # --- mock identifiers shown on the card ---
    pmid_mock: str = ""
    nct_mock: str = ""

    def to_md_card(self, rank: int) -> str:
        """Return this cell as a markdown card headed by ``#<rank>``.

        The card is a level-3 heading followed by eight bullet lines and
        always ends with a newline.
        """
        preprint_flag = "yes" if self.preprint_signal else "no"
        trial_flag = "yes" if self.has_active_trial else "no"
        breakdown = _fmt_components(self.score_components)
        card_lines = [
            f"### #{rank}. {self.drug_name} -> {self.cv_name} ({self.cv_group}) in {self.pheno_name}",
            f"- **Dominant mediation**: {self.mediation_name} (surrogate: {self.surrogate})",
            f"- **Analysis model**: {self.analysis_model}",
            f"- **Score**: {self.total_score:.2f} ({breakdown})",
            (
                f"- **Evidence**: PubMed n={self.pubmed_n}, "
                f"FAERS PRR={self.faers_prr:.2f}, "
                f"preprint={preprint_flag}, "
                f"active trial={trial_flag} "
                f"(Korea sites={self.korea_sites})"
            ),
            f"- **Suggested N**: {self.suggested_n} (power ~{self.power_pct:.0f}% for HR 0.80)",
            f"- **Korea IIS/RWE potential**: {self.iis_potential_korea}",
            f"- **Mock IDs**: {self.pmid_mock or '-'} / {self.nct_mock or '-'}",
            f"- **Rationale**: {self.rationale}",
        ]
        return "\n".join(card_lines) + "\n"


def _fmt_components(c: dict) -> str:
    return ", ".join(f"{k}={v:.2f}" for k, v in c.items())


# ---------- scoring ----------

def _ev_lookup(evidence: dict, key: str) -> dict[tuple, dict]:
    out: dict[tuple, dict] = {}
    for row in evidence.get(key, []):
        k = (row.get("drug"), row.get("cv_subtype"), row.get("phenotype"))
        out[k] = row
    return out


def build_cells(state: dict, korea_only: bool = False, drug_filter: str | None = None) -> list[HypothesisCell]:
    """Score every (drug, CV subtype, phenotype) combination.

    Args:
        state: Mapping with the keys ``drugs``, ``cv_subtypes``,
            ``phenotypes``, ``mediations`` and ``evidence``.
        korea_only: Skip drugs whose ``korea_approved`` entry is missing or
            falsy.
        drug_filter: Case-insensitive substring matched against
            ``"<id> <name>"`` of each drug; ``None`` or ``""`` keeps all.

    Returns:
        One ``HypothesisCell`` per retained drug x CV subtype x phenotype,
        in iteration order (not sorted by score).

    Raises:
        KeyError: If a required key is missing from ``state`` or its rows.
        IndexError: If a drug needs the fallback mediation and
            ``state["mediations"]`` is empty.
    """
    drugs = state["drugs"]
    cvs = state["cv_subtypes"]
    phenos = state["phenotypes"]
    medi = state["mediations"]
    ev = state["evidence"]

    # PubMed and preprint evidence are keyed per (drug, cv, phenotype);
    # trial and FAERS evidence only per (drug, cv).
    pubmed_idx = _ev_lookup(ev, "pubmed_evidence")
    preprint_idx = _ev_lookup(ev, "preprint_signals")
    ctg_idx: dict[tuple, dict] = {
        (row["drug"], row["cv_subtype"]): row for row in ev.get("ctg_active_trials", [])
    }
    faers_idx: dict[tuple, dict] = {
        (row["drug"], row["cv_subtype"]): row for row in ev.get("faers_signals", [])
    }

    medi_by_id = {m["id"]: m for m in medi}
    needle = drug_filter.lower() if drug_filter else None

    cells: list[HypothesisCell] = []
    for d in drugs:
        if needle and needle not in (d["id"] + " " + d["name"]).lower():
            continue
        if korea_only and not d.get("korea_approved", False):
            continue

        # Dominant mediation depends on the drug only, so choose it once per
        # drug: the first declared primary mediation that exists in the
        # mediation ontology. Without such a match, fall back to the first
        # global mediation and give it the lower plausibility score.
        # (Previously the score was always 1.0 because the chosen id was by
        # construction the first element of the list it was tested against.)
        declared = d.get("primary_mediations") or []
        m_id = next((cand for cand in declared if cand in medi_by_id), None)
        if m_id is None:
            m, med_score = medi[0], 0.4
        else:
            m, med_score = medi_by_id[m_id], 1.0

        for cv in cvs:
            ctg_row = ctg_idx.get((d["id"], cv["id"]), {})
            has_trial = bool(ctg_row)
            k_sites = int(ctg_row.get("korea_sites", 0))
            nct_mock = ctg_row.get("nct_mock", "")

            faers_row = faers_idx.get((d["id"], cv["id"]), {})
            prr = float(faers_row.get("prr", 0.0))

            for ph in phenos:
                cell_key = (d["id"], cv["id"], ph["id"])

                pubmed_row = pubmed_idx.get(cell_key, {})
                pubmed_n = int(pubmed_row.get("n_papers", 0))
                pmid_mock = pubmed_row.get("top_pmid_mock", "")

                has_preprint = bool(preprint_idx.get(cell_key, {}))

                # Score components (higher = more attractive; an
                # under-explored cell is boosted by the gap term).
                gap_score = 1.0 / (1.0 + pubmed_n)            # fewer papers = bigger gap
                signal_score = min(prr / 3.0, 1.0)            # FAERS signal (cap 1)
                trial_score = (0.5 if has_trial else 0.0) + min(k_sites, 5) / 10.0
                preprint_score = 0.7 if has_preprint else 0.0
                korea_rwe_score = _korea_rwe_score(d, cv, ph)

                components = {
                    "gap": round(gap_score, 3),
                    "faers": round(signal_score, 3),
                    "trial_korea": round(trial_score, 3),
                    "preprint": round(preprint_score, 3),
                    "mediation": round(med_score, 3),
                    "korea_rwe": round(korea_rwe_score, 3),
                }
                total = (
                    1.5 * gap_score
                    + 1.2 * signal_score
                    + 1.0 * trial_score
                    + 0.8 * preprint_score
                    + 0.7 * med_score
                    + 1.3 * korea_rwe_score
                )

                n_suggest, power = _power_sim(cv, ph)
                rationale = _rationale(d, cv, ph, m, components, has_preprint, prr)

                cells.append(
                    HypothesisCell(
                        drug_id=d["id"], drug_name=d["name"],
                        cv_id=cv["id"], cv_name=cv["name"], cv_group=cv["group"],
                        pheno_id=ph["id"], pheno_name=ph["name"],
                        mediation_id=m["id"], mediation_name=m["name"],
                        surrogate=m["surrogate"], analysis_model=m["analysis_model"],
                        pubmed_n=pubmed_n,
                        has_active_trial=has_trial,
                        korea_sites=k_sites,
                        faers_prr=prr,
                        preprint_signal=has_preprint,
                        iis_potential_korea=d.get("iis_potential_korea", "low"),
                        score_components=components,
                        total_score=round(total, 3),
                        rationale=rationale,
                        suggested_n=n_suggest,
                        power_pct=power,
                        pmid_mock=pmid_mock,
                        nct_mock=nct_mock,
                    )
                )
    return cells


def _korea_rwe_score(drug: dict, cv: dict, ph: dict) -> float:
    base = {"high": 0.9, "moderate": 0.6, "low": 0.2}.get(drug.get("iis_potential_korea", "low"), 0.2)
    incidence = float(cv.get("korea_incidence_per_100k", 0))
    inc_factor = min(incidence / 100.0, 1.0)
    pheno_factor = min(float(ph.get("korea_prevalence_pct", 0)) / 30.0, 1.0)
    return round(base * 0.6 + inc_factor * 0.25 + pheno_factor * 0.15, 3)


def _power_sim(cv: dict, ph: dict) -> tuple[int, float]:
    # toy power simulation for HR 0.80 vs control, alpha 0.05, two-sided
    incidence = max(float(cv.get("korea_incidence_per_100k", 1)), 1.0) / 100000.0
    pheno = max(float(ph.get("korea_prevalence_pct", 1)), 1.0) / 100.0
    # assume 3-year follow-up
    event_rate = min(incidence * 3 * 50, 0.25)  # toy multiplier
    if event_rate <= 0:
        return 0, 0.0
    # required events for HR 0.80 at 80% power, alpha 0.05 ~= 631
    required_events = 631
    n = int(required_events / event_rate / max(pheno, 0.01))
    n = max(min(n, 25000), 200)
    # rough power if N=fixed-budget 5000:
    fixed_n = 5000
    expected_events = fixed_n * event_rate * pheno
    if expected_events <= 0:
        power = 0.0
    else:
        z_alpha = 1.96
        z_beta = math.sqrt(expected_events) * abs(math.log(0.80)) / 2.0 - z_alpha
        power = 100.0 * 0.5 * (1.0 + math.erf(z_beta / math.sqrt(2)))
        power = max(0.0, min(power, 99.0))
    return n, power


def _rationale(drug: dict, cv: dict, ph: dict, m: dict, comps: dict, preprint: bool, prr: float) -> str:
    bits = []
    if comps["gap"] > 0.5:
        bits.append("PubMed 미탐색 cell")
    if prr >= 1.5:
        bits.append(f"FAERS signal PRR={prr:.2f}")
    if preprint:
        bits.append("medRxiv pre-publication signal")
    if comps["trial_korea"] > 0.6:
        bits.append("active trial + 한국 사이트")
    bits.append(f"dominant mediation: {m['name']}")
    bits.append(f"phenotype 한국유병률 {ph.get('korea_prevalence_pct', 0)}%")
    return "; ".join(bits)


# ---------- output rendering ----------

def render_top_md(cells: list[HypothesisCell], top: int) -> str:
    """Render the ``top`` highest-scoring cells as a markdown document.

    The document is a title, the disclaimer, one card per cell in
    descending score order, and a footer line stating the counts and the
    scoring formula.
    """
    ranked = sorted(cells, key=lambda cell: cell.total_score, reverse=True)
    shown = ranked[:top]

    header = ["# ObesityCVHypoMap - Top Hypothesis Cards", "", DISCLAIMER, ""]
    cards = [cell.to_md_card(position) for position, cell in enumerate(shown, start=1)]
    footer = (
        f"_총 {len(cells)} cell 중 top {len(shown)} 출력. "
        "score = 1.5*gap + 1.2*FAERS + 1.0*trial+한국 + 0.8*preprint + 0.7*mediation + 1.3*korea_rwe._"
    )
    return "\n".join(header + cards + ["", footer])


def render_grant_md(cells: list[HypothesisCell]) -> str:
    """Return a one-page grant concept (markdown) for the top-scoring cell.

    Raises ``IndexError`` when ``cells`` is empty.
    """
    ranked = list(cells)
    ranked.sort(key=lambda cell: cell.total_score, reverse=True)
    best = ranked[0]
    return f"""# Grant Application Draft (1-page concept)

{DISCLAIMER}

## Title
Mediation analysis of {best.drug_name} on {best.cv_name} in {best.pheno_name}: a Korean RWE/IIS proposal.

## Specific aims
1. Quantify the {best.mediation_name} pathway as a mediator of CV outcome
   reduction with {best.drug_name} in {best.pheno_name}.
2. Estimate natural direct vs natural indirect effects via
   {best.analysis_model}.
3. Test interaction with baseline phenotype and report Korea-specific RWE
   estimates linkable to NHIS/HIRA claims.

## Background and rationale
{best.rationale}.

## Design
Prospective IIS, target N = {best.suggested_n} (estimated power ~{best.power_pct:.0f}%
for HR 0.80, 3-year follow-up). Surrogate mediator: {best.surrogate}.
Measurements at baseline, W12, W24, W52.

## Analysis
{best.analysis_model}. Sensitivity analysis for unmeasured confounding
(E-value). Pre-specify decomposition of total effect into direct vs
indirect through {best.mediation_name} and through 4 alternative
mediation pathways.

## Korean cohort feasibility
IIS potential: {best.iis_potential_korea}. Active trial reference:
{best.nct_mock or 'n/a'}. PubMed prior art: n={best.pubmed_n}
({best.pmid_mock or 'no high-relevance hit'}).

## Deliverables
- Pre-registered analysis plan (OSF)
- Mediation primary paper
- KSSO / 대한심장학회 abstract
- Hand-off dataset for AI sub-task in 닥터앤서 3.0 비만 모듈
"""


def render_aha_abstract(cells: list[HypothesisCell]) -> str:
    """Return an English AHA-style abstract draft (markdown) for the top cell.

    Raises ``IndexError`` when ``cells`` is empty.
    """
    ranked = list(cells)
    ranked.sort(key=lambda cell: cell.total_score, reverse=True)
    best = ranked[0]
    return f"""# AHA Scientific Sessions - Abstract Draft (~250 words)

{DISCLAIMER}

**Title.** Mediation of cardiovascular outcomes by the {best.mediation_name}
pathway in patients with {best.pheno_name} treated with {best.drug_name}:
a Korean prospective investigator-initiated study design.

**Background.** The relative contribution of weight loss vs non-weight
mediation pathways to CV outcomes with novel anti-obesity therapy
remains unresolved, particularly for {best.cv_name}
({best.cv_group}) in {best.pheno_name}, an under-studied phenotype in
pivotal CVOTs.

**Hypothesis.** {best.drug_name} reduces {best.cv_name} risk and a
quantifiable share of the total effect is mediated by
{best.mediation_name} (surrogate: {best.surrogate}).

**Methods.** Prospective IIS in Korean academic centers,
target N={best.suggested_n}, 3-year follow-up. Mediator measured at
baseline, week 12, 24, and 52. Causal mediation analysis using
{best.analysis_model}, with E-value sensitivity for unmeasured
confounding. Five candidate mediators (weight loss, glucose,
anti-inflammatory, natriuresis, autonomic) decomposed jointly.

**Expected outcomes.** Estimated study power for HR 0.80 vs
matched control: ~{best.power_pct:.0f}%. Primary effect estimate:
proportion of total effect mediated by {best.mediation_name},
with 95% CI; secondary: comparison vs alternative mediators.

**Implications.** Quantifying mediation in a non-Western cohort
(BMI 23-30 lean diabetic Asian, sarcopenic obesity, post-bariatric)
addresses a major gap in the SELECT/STEP-HFpEF/SUMMIT/SURPASS-CVOT/
SURMOUNT-MMO evidence base and can inform Korean obesity-CV
guidelines and the 닥터앤서 3.0 obesity sub-task AI agent.
"""


def render_ksso_abstract(cells: list[HypothesisCell]) -> str:
    """Return a Korean KSSO abstract draft (markdown) for the top cell.

    Raises ``IndexError`` when ``cells`` is empty.
    """
    ranked = list(cells)
    ranked.sort(key=lambda cell: cell.total_score, reverse=True)
    best = ranked[0]
    return f"""# KSSO 2026 - 초록 초안 (한국어, ~250 단어)

{DISCLAIMER}

**제목.** {best.pheno_name} 환자에서 {best.drug_name} 투여 시 {best.cv_name}
({best.cv_group})에 대한 {best.mediation_name} 매개경로 효과: 한국형 IIS 설계.

**배경.** 항비만 약물의 CV outcome 효과가 체중감소 자체에 의한 것인지,
또는 비체중 매개경로(글루코오스, 항염, 나트륨이뇨, 자율신경)에 의한
것인지에 대한 분리 분석은 SELECT/STEP-HFpEF/SUMMIT/SURPASS-CVOT/
SURMOUNT-MMO에서 명확하지 않으며, 특히 {best.pheno_name} 표현형에서는
근거가 부족하다.

**목적.** {best.drug_name} 투여가 {best.cv_name} 위험을 감소시키며, 이
효과의 일정 부분이 {best.mediation_name} 경로로 매개되는지 확인한다.
표지 대리지표(surrogate)는 {best.surrogate}.

**방법.** 한국 대학병원 다기관 전향 IIS, 목표 N={best.suggested_n},
3년 추적. 기저, 12주, 24주, 52주 시점에서 매개변수 측정. 분석 모델은
{best.analysis_model}이며, 5개 후보 경로를 동시 분해. 미측정
교란변수에 대한 E-value 민감도 분석 포함.

**예상 결과.** HR 0.80 검출 검정력 약 {best.power_pct:.0f}%
(N={best.suggested_n}). 주 결과는 {best.mediation_name} 경로의
총효과 대비 매개분율(95% CI)이다.

**의의.** 한국인 표현형(린-당뇨, 근감소, 수면무호흡, 수술후재발)에
특화된 매개분석 결과는 KSSO/대한심장학회 진료 권고와 닥터앤서 3.0
비만 sub-task AI agent의 의사결정 모듈에 바로 연결될 수 있다.
"""


# ---------- CLI ----------

def cli(argv: list[str] | None = None) -> int:
    """Command-line entry point: load data, score cells, print and write output.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the filters leave no cells, ``2`` when
        the input data cannot be loaded. Invalid arguments (including a
        negative ``--top``) exit through argparse with ``SystemExit``.
    """
    # Imported locally so the file's top-level import block stays untouched.
    from statistics import median

    parser = argparse.ArgumentParser(
        prog="obesity-cv-hypo-map",
        description=(
            "ObesityCVHypoMap: 항비만약물 x CV outcome subtype x phenotype x "
            "mediation 6D 온톨로지 기반 hypothesis cell ranking + grant card "
            "generator. 모든 데이터는 합성/mock입니다. 연구용/참고용."
        ),
        epilog=DISCLAIMER,
    )
    parser.add_argument("--top", type=int, default=50, help="output top-N hypothesis cards (default: 50)")
    parser.add_argument("--drug", type=str, default=None, help="filter to drug substring (e.g. semaglutide)")
    parser.add_argument("--korea-only", action="store_true", help="only Korea-approved drugs")
    parser.add_argument("--write", action="store_true", help="also write outputs/ markdown files")
    parser.add_argument("--summary", action="store_true", help="print scoring summary only")
    args = parser.parse_args(argv)

    # A negative value would be used as a slice bound and silently drop
    # cells from the *end* of the ranking instead of limiting the output.
    if args.top < 0:
        parser.error("--top must be a non-negative integer")

    try:
        state = load_all()
    except (OSError, ValueError, KeyError) as exc:
        # Missing/unreadable file, malformed JSON, or a missing top-level key.
        print(f"[!] failed to load input data from {DATA_DIR}: {exc!r}", file=sys.stderr)
        return 2

    cells = build_cells(state, korea_only=args.korea_only, drug_filter=args.drug)

    if not cells:
        print("[!] no cells matched filters", file=sys.stderr)
        return 1

    cells_sorted = sorted(cells, key=lambda c: c.total_score, reverse=True)
    # True median: the mean of the two middle scores when the count is even.
    median_score = median(c.total_score for c in cells_sorted)
    print(f"[i] generated {len(cells)} hypothesis cells")
    print(f"[i] top-1 score = {cells_sorted[0].total_score:.3f}")
    print(f"[i] median score = {median_score:.3f}")
    print(f"[i] DISCLAIMER: {DISCLAIMER}")

    if args.summary:
        for i, c in enumerate(cells_sorted[: args.top], 1):
            print(f"  #{i:>3}  score={c.total_score:.3f}  {c.drug_name} -> {c.cv_name} / {c.pheno_name}")
        return 0

    md_top = render_top_md(cells, args.top)
    print(md_top)

    if args.write:
        OUT_DIR.mkdir(parents=True, exist_ok=True)
        (OUT_DIR / "top_hypothesis_cards.md").write_text(md_top, encoding="utf-8")
        (OUT_DIR / "grant_draft.md").write_text(render_grant_md(cells), encoding="utf-8")
        (OUT_DIR / "aha_abstract.md").write_text(render_aha_abstract(cells), encoding="utf-8")
        (OUT_DIR / "ksso_abstract.md").write_text(render_ksso_abstract(cells), encoding="utf-8")
        print(f"[i] wrote outputs to {OUT_DIR}", file=sys.stderr)
    return 0


# Script entry point: run the CLI and use its integer return value as the
# process exit status.
if __name__ == "__main__":
    raise SystemExit(cli())
