#!/usr/bin/env python3
"""
MASLDPharmacoGeno (매슬드파마코지노) -- MASLD pharmacogenomics hypothesis miner.

Builds a (SNP x Drug x Outcome x Diet x Ethnic) 5-D ontology over MASLD,
cross-references SYNTHETIC PubMed/GWAS Catalog/gnomAD/CTG evidence,
and ranks under-explored cells. Includes a Korean lean MASLD focus mode
and renders a hypothesis card / 1-pager / AASLD abstract draft.

DISCLAIMER: Research/reference tool only. NOT for clinical decision making.
All evidence values shipped here are SYNTHETIC mock data.

Usage:
  python3 main.py --top 50
  python3 main.py --top 25 --korean-lean
  python3 main.py --help
"""

from __future__ import annotations

import argparse
import json
import os
import sys
from dataclasses import dataclass, field
from itertools import product
from typing import Any

# Directory containing this script; the default data/ and outputs/ folders are
# resolved relative to it rather than to the current working directory.
_BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(_BASE_DIR, "data")
OUT_DIR = os.path.join(_BASE_DIR, "outputs")

# Korean-language notice quoted near the top of every rendered document:
# research/reference use only, not for direct clinical decision making;
# hypotheses require IRB approval and expert review; genotype results require
# interpretation by a clinical genetics specialist.
DISCLAIMER = (
    "본 도구는 연구용/참고용입니다. 임상의사결정에 직접 사용하지 마십시오. "
    "생성된 가설은 IRB 승인 및 전문가 검토 후에만 활용하십시오. "
    "유전형 분석 결과는 임상유전학 전문가 해석이 필요합니다."
)


# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------

def _load(name: str) -> Any:
    """Parse and return the JSON document called *name* inside ``DATA_DIR``.

    ``DATA_DIR`` is read at call time, so the override that ``main`` installs
    from ``--data-dir`` takes effect for every subsequent load.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    source = os.path.join(DATA_DIR, name)
    with open(source, mode="r", encoding="utf-8") as handle:
        payload = json.load(handle)
    return payload


@dataclass
class Ontology:
    """Container for the five ontology axes plus the evidence tables.

    Each list attribute holds the entries of one axis of the
    (SNP x drug x outcome x diet x ethnic) grid as plain dicts parsed from
    JSON; ``evidence`` is the parsed content of ``evidence.json``.
    """

    snps: list[dict]      # "snps" array of snps.json
    drugs: list[dict]     # "drugs" array of drugs.json
    outcomes: list[dict]  # "outcomes" array of outcomes.json
    diets: list[dict]     # "diets" array of diets.json
    ethnic: list[dict]    # "ethnic" array of ethnic.json
    evidence: dict        # whole evidence.json document (lookup tables keyed by name)


def load_ontology() -> Ontology:
    """Read every ontology JSON file from the data directory into an ``Ontology``.

    Each axis lives in ``<axis>.json`` under a top-level key of the same name;
    ``evidence.json`` is kept whole. Files are read in the order snps, drugs,
    outcomes, diets, ethnic, evidence.
    """
    axis_names = ("snps", "drugs", "outcomes", "diets", "ethnic")
    axes = {axis: _load(f"{axis}.json")[axis] for axis in axis_names}
    return Ontology(evidence=_load("evidence.json"), **axes)


# ---------------------------------------------------------------------------
# Cell ranking
# ---------------------------------------------------------------------------

@dataclass
class Cell:
    """One (SNP, drug, outcome, diet, ethnic) combination with its evidence and score."""

    # The five ontology entries that define the cell.
    snp: dict
    drug: dict
    outcome: dict
    diet: dict
    ethnic: dict
    # Evidence values and the derived ranking score, filled in by score_cells.
    pubmed_count: int = 0
    gwas_supporting: int = 0
    gnomad_freq: float = 0.0
    trial_stratified: bool = False
    mechanism: float = 0.0
    korean_lean_n: int = 0
    score: float = 0.0
    # Human-readable reasons the cell was selected; a fresh list per instance.
    why: list[str] = field(default_factory=list)

    def label(self) -> str:
        """Return the cell name: "<gene> <rsid>" followed by the four axis ids, joined by " x "."""
        parts = [f"{self.snp['gene']} {self.snp['rsid']}"]
        for axis in (self.drug, self.outcome, self.diet, self.ethnic):
            parts.append(f"{axis['id']}")
        return " x ".join(parts)


def _evidence_for(ev: dict, snp: dict, drug: dict) -> tuple[int, int, bool, float]:
    key = f"{snp['id']}__{drug['id']}"
    pubmed = int(ev.get("pubmed_counts", {}).get(key, 0))
    gwas = len(ev.get("gwas_supporting", {}).get(snp["id"], []))
    strat = False
    for trial_key, gene_map in ev.get("trial_stratification", {}).items():
        if trial_key.startswith(drug["id"] + "__"):
            strat = bool(gene_map.get(snp["gene"], False))
            break
    plaus = float(ev.get("mechanism_plausibility", {}).get(key, 0.5))
    return pubmed, gwas, strat, plaus


# SNP ids kept by the Korean lean focus mode (canonical lean MASLD genotypes).
_KOREAN_LEAN_SNP_IDS = (
    "PNPLA3_rs738409",
    "TM6SF2_rs58542926",
    "HSD17B13_rs72613567",
)

# Key suffixes probed, in priority order, in the Korean lean cohort-size table.
_KOREAN_LEAN_N_SUFFIXES = ("GG_or_CG", "minor_carrier", "minor", "protective_carrier")


def _korean_lean_n(n_map: dict, snp_id: str) -> int:
    """Return the first truthy cohort-size estimate stored for *snp_id*, else 0."""
    for suffix in _KOREAN_LEAN_N_SUFFIXES:
        estimate = n_map.get(f"{snp_id}__{suffix}")
        if estimate:
            return int(estimate)
    return 0


def score_cells(ont: Ontology, korean_lean: bool = False) -> list[Cell]:
    """Score every ontology cell and return the cells sorted by descending score.

    The composite "under-exploration" score is

        0.30 * novelty + 0.20 * plausibility + 0.20 * gwas_norm
        + 0.20 * allele frequency + 0.10 * n_norm + stratification bonus

    where novelty = 1 / (1 + PubMed count), gwas_norm = min(gwas / 4, 1),
    n_norm = min(cohort N / 2000, 1) and the bonus is +0.10 when the pair is
    not yet stratified in trials, -0.15 when it is.

    Args:
        ont: Loaded ontology axes and evidence tables.
        korean_lean: When true, keep only East_Asian x Korean_LF_HC cells for
            the canonical lean MASLD SNPs and add a 0.15 focus boost to each.

    Returns:
        Cells in descending score order (rounded to 4 decimals); ties keep
        the SNP > drug > outcome > diet > ethnic enumeration order.
    """
    evidence = ont.evidence
    n_map = evidence.get("korean_lean_cohort_n_estimate", {})
    cells: list[Cell] = []

    # SNP-major / drug-minor iteration followed by the remaining three axes
    # reproduces the order of a single five-way product().
    for snp, drug in product(ont.snps, ont.drugs):
        # In focus mode non-canonical SNPs can never produce a cell, so skip
        # them before doing any evidence lookups.
        if korean_lean and snp["id"] not in _KOREAN_LEAN_SNP_IDS:
            continue

        # These values depend only on the (SNP, drug) pair; compute them once
        # instead of once per outcome x diet x ethnic combination.
        pubmed, gwas, strat, plaus = _evidence_for(evidence, snp, drug)
        novelty = 1.0 / (1.0 + pubmed)          # less PubMed -> higher novelty
        gwas_norm = min(gwas / 4.0, 1.0)        # GWAS support, capped at 1
        strat_bonus = -0.15 if strat else 0.10  # NOT yet stratified -> bonus

        for outc, diet, eth in product(ont.outcomes, ont.diets, ont.ethnic):
            is_east_asian = eth["id"] == "East_Asian"
            if korean_lean and not (is_east_asian and diet["id"] == "Korean_LF_HC"):
                continue

            # Allele frequency in this ethnic group; higher -> more impact.
            eth_freq = float(snp["ethnic_freq"].get(eth["id"], 0.0))

            # Cohort-size estimate is only looked up for East Asian cells.
            n_est = _korean_lean_n(n_map, snp["id"]) if is_east_asian else 0
            n_norm = min(n_est / 2000.0, 1.0)

            why: list[str] = []
            if pubmed == 0:
                why.append("PubMed 0건 (미보고 조합)")
            elif pubmed < 3:
                why.append(f"PubMed {pubmed}건 (희소)")
            if gwas >= 3:
                why.append("GWAS 다수 phenotype 지원")
            if eth_freq >= 0.3:
                why.append(f"{eth['id']} 빈도 {eth_freq:.2f} (고빈도)")
            if not strat:
                why.append("주요 RCT에서 미층화")
            if plaus >= 0.7:
                why.append(f"기전적 타당성 {plaus:.2f}")

            score = (
                0.30 * novelty
                + 0.20 * plaus
                + 0.20 * gwas_norm
                + 0.20 * eth_freq
                + 0.10 * n_norm
                + strat_bonus
            )

            if korean_lean:
                score += 0.15  # focus boost
                why.append("한국 lean MASLD focus")

            cells.append(
                Cell(
                    snp=snp, drug=drug, outcome=outc, diet=diet, ethnic=eth,
                    pubmed_count=pubmed, gwas_supporting=gwas, gnomad_freq=eth_freq,
                    trial_stratified=strat, mechanism=plaus, korean_lean_n=n_est,
                    score=round(score, 4), why=why,
                )
            )

    cells.sort(key=lambda c: c.score, reverse=True)
    return cells


# ---------------------------------------------------------------------------
# Renderers
# ---------------------------------------------------------------------------

def render_hypothesis_card(cells: list[Cell], top: int) -> str:
    """Render the highest-ranked cells as a Markdown list of hypothesis cards.

    Args:
        cells: Scored cells, already sorted best-first.
        top: Maximum number of cards to emit. Values below zero are treated
            as zero; a negative slice bound would otherwise drop cards from
            the END of the ranking instead of limiting the output.

    Returns:
        Markdown text. The summary line reports the number of cards actually
        rendered, which is smaller than ``top`` when fewer cells exist.
    """
    shown = cells[:max(top, 0)]

    out = [
        "# MASLDPharmacoGeno -- Top Hypothesis Cards (SYNTHETIC DEMO)",
        "",
        f"> {DISCLAIMER}",
        "",
        f"Top {len(shown)} under-explored cells out of {len(cells)} total.",
        "",
    ]
    for i, c in enumerate(shown, 1):
        out.append(f"## {i}. {c.label()}")
        out.append(f"- score: **{c.score:.3f}**")
        out.append(f"- SNP: {c.snp['gene']} {c.snp['rsid']} ({c.snp['function']})")
        out.append(f"- Drug: {c.drug['id']} ({c.drug['class']}, {c.drug['phase']})")
        out.append(f"- Outcome: {c.outcome['description']}")
        out.append(f"- Diet: {c.diet['description']}")
        out.append(
            f"- Ethnic: {c.ethnic['id']} (allele freq {c.gnomad_freq:.3f})"
        )
        out.append(
            f"- Evidence: PubMed={c.pubmed_count}, GWAS_supporting={c.gwas_supporting}, "
            f"trial_stratified={c.trial_stratified}, mechanism_plaus={c.mechanism:.2f}, "
            f"korean_lean_N~{c.korean_lean_n}"
        )
        # The rationale line is omitted when no selection reason was recorded.
        if c.why:
            out.append(f"- 선정 사유: {'; '.join(c.why)}")
        out.append("")
    return "\n".join(out)


def render_korean_lean_one_pager(cells: list[Cell]) -> str:
    """Render the Korean lean MASLD one-page summary as Markdown.

    Keeps the cells whose ethnic id is ``East_Asian`` and whose diet id is
    ``Korean_LF_HC``, preserves their incoming order, and lists at most the
    first ten between fixed background and study-design sections.
    """

    def _is_korean_lean(cell: Cell) -> bool:
        return cell.ethnic["id"] == "East_Asian" and cell.diet["id"] == "Korean_LF_HC"

    shortlist = list(filter(_is_korean_lean, cells))[:10]

    lines = [
        "# 한국 Lean MASLD Focus -- 1-Pager (SYNTHETIC DEMO)",
        "",
        f"> {DISCLAIMER}",
        "",
        "## 배경",
        (
            "한국·동아시아 lean MASLD(BMI<25)는 PNPLA3 rs738409 G allele 빈도 (~0.42)와 "
            "전통식이(쌀 중심 고탄수)의 상호작용 가능성이 보고되어 왔으나, MAESTRO-NASH·"
            "SYNERGY-NASH·ENLIVEN 등 주요 RCT에서 PNPLA3·TM6SF2·HSD17B13 층화가 부재하다."
        ),
        "",
        "## Top 10 가설 (한국 lean cohort 적용 가능)",
    ]

    # One numbered entry per shortlisted hypothesis.
    lines.extend(
        f"{rank}. **{cell.snp['gene']} {cell.snp['rsid']} x {cell.drug['id']} -> {cell.outcome['id']}** "
        f"(score {cell.score:.2f}, est lean cohort N~{cell.korean_lean_n})"
        for rank, cell in enumerate(shortlist, 1)
    )

    lines += [
        "",
        "## IIS 설계 가능성",
        "- KASL multicenter cohort 메타데이터 기반 N>1500 (PNPLA3 carrier).",
        "- 검정력: alpha=0.05, beta=0.2, MASH resolution effect size 0.15 가정 시 N>=480 충분.",
        "- Sub-task 후보: 닥터앤서 3.0 간 sub-task / KASL-KDA pharmacogenomics IIS.",
        "",
    ]
    return "\n".join(lines)


def render_aasld_abstract(cells: list[Cell]) -> str:
    """Draft a conference-style abstract in Markdown around the first cell.

    Args:
        cells: Scored cells, best-first; only ``cells[0]`` is used.

    Returns:
        The Markdown draft, or an empty string when ``cells`` is empty.
    """
    if not cells:
        return ""
    best = cells[0]
    gene_rsid = f"{best.snp['gene']} {best.snp['rsid']}"

    title = (
        f"Genotype-stratified response to {best.drug['id']} on "
        f"{best.outcome['id'].replace('_', ' ')} in {best.ethnic['id']} MASLD: "
        f"a hypothesis-generating analysis of {gene_rsid}"
    )
    background = (
        "MASLD therapy response heterogeneity is incompletely explained by metabolic phenotype. "
        f"{gene_rsid} (risk allele {best.snp['risk_allele']}; "
        f"{best.ethnic['id']} freq {best.gnomad_freq:.2f}) is a major MASLD modifier "
        "but is rarely pre-specified in trial stratification."
    )
    methods = (
        "Using a synthetic 5-dimensional ontology (SNP x drug x outcome x diet x ethnic) "
        "with cross-references to PubMed query bank, GWAS Catalog, gnomAD, and "
        "ClinicalTrials.gov, we ranked under-explored cells and generated grant-ready "
        "hypotheses. A Korean lean MASLD focus mode applies BMI<25, "
        "PNPLA3 GG/CG / TM6SF2 minor allele filtering on KASL multicenter cohort metadata."
    )
    results = (
        f"Top-ranked hypothesis: {best.label()} "
        f"(score {best.score:.2f}, PubMed n={best.pubmed_count}, "
        f"mechanism plausibility {best.mechanism:.2f}). "
        "Korean lean focus mode flagged feasible IIS designs (estimated N>=480 with adequate power)."
    )
    conclusion = (
        "A reproducible 5-D ontology surfaces actionable, under-investigated MASLD "
        "pharmacogenomic hypotheses. Validation requires IRB-approved prospective stratified trials. "
        "All numerical values in this draft are synthetic for tool demonstration."
    )

    # Document header, then one "## Heading / body / blank line" group per section.
    lines = [
        "# AASLD/EASL Abstract Draft (~250 words, SYNTHETIC DEMO)",
        "",
        f"> {DISCLAIMER}",
        "",
    ]
    sections = (
        ("## Title", title),
        ("## Background", background),
        ("## Methods", methods),
        ("## Results", results),
        ("## Conclusion", conclusion),
    )
    for heading, body in sections:
        lines.extend((heading, body, ""))
    return "\n".join(lines)


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------

def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the tool.

    Options: ``--top`` (int, default 50), ``--korean-lean`` (flag),
    ``--write-outputs`` (flag) and ``--data-dir`` (str, defaulting to the
    current value of ``DATA_DIR``).
    """
    summary = (
        "MASLDPharmacoGeno: rank under-explored MASLD (SNP x drug x outcome x "
        "diet x ethnic) cells from a synthetic 5-D ontology. "
        "Research/reference only -- not for clinical use."
    )
    parser = argparse.ArgumentParser(prog="masld-pharmaco-geno", description=summary)

    # (flag, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        ("--top", {
            "type": int,
            "default": 50,
            "help": "Number of top hypothesis cards to emit (default: 50).",
        }),
        ("--korean-lean", {
            "action": "store_true",
            "help": "Apply Korean lean MASLD focus mode (East Asian + Korean LF/HC + canonical SNPs).",
        }),
        ("--write-outputs", {
            "action": "store_true",
            "help": "Write hypothesis card / 1-pager / abstract markdown to outputs/.",
        }),
        ("--data-dir", {
            "type": str,
            "default": DATA_DIR,
            "help": "Override data directory (default: ./data).",
        }),
    )
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser


def main(argv: list[str] | None = None) -> int:
    """Run the CLI: load the ontology, rank cells, print the three documents.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always 0.
    """
    global DATA_DIR

    options = build_parser().parse_args(argv)
    # _load reads the module-level DATA_DIR, so the override goes there.
    DATA_DIR = options.data_dir

    ont = load_ontology()
    ranked = score_cells(ont, korean_lean=options.korean_lean)

    card_md = render_hypothesis_card(ranked, top=options.top)
    # In focus mode the one-pager is built from a separate, unfiltered scoring pass.
    if options.korean_lean:
        one_pager_cells = score_cells(ont, korean_lean=False)
    else:
        one_pager_cells = ranked
    one_pager_md = render_korean_lean_one_pager(one_pager_cells)
    abstract_md = render_aasld_abstract(ranked)

    # Print the documents separated by a blank line, a "---" rule and a blank line.
    for position, document in enumerate((card_md, one_pager_md, abstract_md)):
        if position:
            print()
            print("---")
            print()
        print(document)

    if options.write_outputs:
        os.makedirs(OUT_DIR, exist_ok=True)
        targets = (
            ("top_hypothesis_cards.md", card_md),
            ("korean_lean_one_pager.md", one_pager_md),
            ("aasld_abstract_draft.md", abstract_md),
        )
        for filename, content in targets:
            with open(os.path.join(OUT_DIR, filename), "w", encoding="utf-8") as fh:
                fh.write(content)
        print(f"\n[wrote outputs to {OUT_DIR}]", file=sys.stderr)

    return 0


# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())