#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
DiaRescueMiner-Kor (디아레스큐마이너코어) — CLI 진입점
====================================================

도메인: 당뇨(DM) 및 합병증 신약 개발
카테고리: 연구 아이디어 생성 (가설 생성 / 문헌 갭 분석 / 온톨로지 기반 새 연구 도출)

효능부족·안전성 문제로 종료(terminated)/중단(withdrawn)된 당뇨·합병증 신약 프로그램
(glitazar/dual PPAR, DKD 표적, 11b-HSD1, glucokinase activator, GPR40 agonist 등)을
ClinicalTrials.gov terminated/withdrawn + why_stopped + post-mortem 문헌 형태로 구조화하고,
"실패원인 -> 부활레버" 온톨로지로 검증가능한 rescue 연구 가설을 ranked 로 생성한다.

중요:
  - 이 파일(main.py)은 표준 라이브러리만으로 완전히 동작한다.
    (streamlit / networkx 설치 불필요. app.py 만 streamlit 에 의존.)
  - 외부 네트워크/API 호출 없음. 전부 오프라인 curated/synthetic 데이터.

디스클레이머:
  본 도구는 참고용·연구용 가설 생성 도구이며 임상 의사결정 도구가 아니다.
  약물명과 역사적 중단 사유는 공개된 사실에 기반하나, 모든 점수/세부수치는 합성 데모값이다.
"""

import argparse
import json
import os
import sys
import textwrap

# Default dataset location: data/terminated_programs.json next to this file.
DATA_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         "data", "terminated_programs.json")

# User-facing disclaimer text; shown in the CLI banner, the default report
# footer and the argparse epilog.
DISCLAIMER = (
    "참고용·연구용 가설 생성 도구입니다. 임상 의사결정 도구가 아닙니다. "
    "약물명/중단사유는 공개 사실 기반이며, 모든 점수는 합성 데모값입니다."
)

# ---------------------------------------------------------------------------
# Failure-cause axis -> rescue lever ontology (rule-based)
#   Domain rules stating which rescue levers each failure-cause axis "prefers".
#   This mapping is the core of the ontology; it works without a networkx graph.
#   Keys are axis identifiers, values are lists of lever identifiers.
# ---------------------------------------------------------------------------
AXIS_TO_LEVER = {
    "efficacy_insufficient":   ["alternative_endpoint", "biomarker_stratified_responder",
                                "combination_de_risking"],
    "safety_cardiovascular":   ["combination_de_risking", "tissue_selective_reformulation",
                                "biomarker_stratified_responder", "alternative_dosing_chronotherapy"],
    "safety_hepatic":          ["tissue_selective_reformulation", "alternative_dosing_chronotherapy",
                                "biomarker_stratified_responder"],
    "safety_renal":            ["alternative_dosing_chronotherapy", "combination_de_risking",
                                "biomarker_stratified_responder"],
    "safety_oncologic":        ["tissue_selective_reformulation", "biomarker_stratified_responder"],
    "safety_skeletal":         ["tissue_selective_reformulation", "alternative_dosing_chronotherapy"],
    "endpoint_design":         ["alternative_endpoint", "biomarker_stratified_responder"],
    "commercial":              ["combination_de_risking", "biomarker_stratified_responder"],
    "futility_interim":        ["alternative_endpoint", "biomarker_stratified_responder"],
}

# Target-validity verdict keyword -> rescue bonus weight.
#   The better the target is supported (genetics / MR / same-target evidence),
#   the more credible a rescue hypothesis for it becomes.
def target_validity_bonus(verdict):
    """Return the additive score bonus for a target-validity verdict string.

    Matching is case-insensitive and substring based. ``None`` or an empty
    verdict yields 0.0.

    Returns:
        0.20 for "strongly_valid", 0.05 for "partially_valid", 0.12 for any
        other "valid" verdict that is neither "invalid" nor "questionable",
        -0.05 for "questionable", -0.15 for "invalid", otherwise 0.0.
    """
    v = (verdict or "").lower()
    if "strongly_valid" in v:
        return 0.20
    # Must be tested before the generic "valid" rule: "partially_valid"
    # contains "valid", so the generic rule would otherwise shadow it and the
    # partial verdict would receive the full 0.12 bonus.
    if "partially_valid" in v:
        return 0.05
    if "valid" in v and "invalid" not in v and "questionable" not in v:
        return 0.12
    if "questionable" in v:
        return -0.05
    if "invalid" in v:
        return -0.15
    return 0.0

# Human-readable (Korean) display label for each rescue-lever identifier.
LEVER_LABELS = {
    "biomarker_stratified_responder": "바이오마커 층화 반응자(responder enrichment)",
    "alternative_endpoint":           "대안 endpoint / 적응증 reframing",
    "combination_de_risking":         "조합 de-risking (백본 병용)",
    "tissue_selective_reformulation": "조직/말초 선택 reformulation·backup chemotype",
    "alternative_dosing_chronotherapy": "대안 dosing / chronotherapy / 저용량 window",
}

# Human-readable (Korean) display label for each failure-cause axis identifier;
# the keys mirror those of AXIS_TO_LEVER.
AXIS_LABELS = {
    "efficacy_insufficient": "효능부족",
    "safety_cardiovascular": "안전성-심혈관(CV)",
    "safety_hepatic":        "안전성-간(hepatic)",
    "safety_renal":          "안전성-신장(renal)",
    "safety_oncologic":      "안전성-종양(oncologic)",
    "safety_skeletal":       "안전성-골격(skeletal)",
    "endpoint_design":       "endpoint/시험설계",
    "commercial":            "상업적 종료",
    "futility_interim":      "중간분석 futility",
}


# ---------------------------------------------------------------------------
# 데이터 로드
# ---------------------------------------------------------------------------
def load_data(path=DATA_PATH):
    """Parse the UTF-8 JSON file at *path* and return the decoded object.

    Raises:
        FileNotFoundError: nothing exists at *path* (the message carries the
            offending path).
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    raise FileNotFoundError("데이터 파일을 찾을 수 없습니다: %s" % path)


# ---------------------------------------------------------------------------
# why_stopped 규칙기반 NLP 분류 (표준 라이브러리만 사용)
#   curated 데이터에 failure_attribution 이 이미 있지만, 텍스트로부터도
#   독립적으로 축을 재분류하여 cross-check 한다 (근거 추적용).
# ---------------------------------------------------------------------------
# Keyword rules per failure-cause axis. Keywords are lower-case; an axis is
# hit as soon as any one of its keywords matches the text.
NLP_RULES = {
    "safety_cardiovascular": ["cardiovas", "cv ", "heart failure", "myocardial", "stroke",
                              "chf", "mace", "fluid retention", "fluid overload", "congestive",
                              "bp ", "aldosterone", "mortality"],
    "safety_hepatic":        ["liver", "hepato", "transaminase", "alt", "ast", "bile",
                              "hepatic", "dili", "steatosis"],
    "safety_renal":          ["egfr", "creatinine", "renal", "kidney", "ketoacidosis", "dka",
                              "nephro"],
    "safety_skeletal":       ["fracture", "bone", "skeletal"],
    "safety_oncologic":      ["cancer", "tumor", "tumour", "malignan", "oncolog", "carcino"],
    "efficacy_insufficient": ["modest", "insufficient", "small effect", "loss of", "durability",
                              "underpower", "weak effect", "only modest"],
    "endpoint_design":       ["endpoint", "design", "comparator", "dose", "indication", "framing",
                              "population"],
    "commercial":            ["portfolio", "commercial", "business", "strategic", "out-licens"],
    "futility_interim":      ["futility", "futile", "interim", "stopped early for futility"],
}

# Acronym keywords that must match as standalone tokens. As raw substrings
# they fire on unrelated words ("alt" in "alternative", "ast" in "last",
# "mace" in "pharmaceutical"), and the space-suffixed forms "cv " / "bp "
# miss occurrences followed by punctuation or end of text.
_WHOLE_WORD_KEYWORDS = frozenset({
    "cv", "bp", "chf", "mace", "alt", "ast", "dili", "egfr", "dka",
})


def _ascii_tokens(text):
    """Return the set of maximal [a-z0-9] runs found in lower-cased *text*."""
    # Every other character (punctuation, whitespace, Hangul, ...) acts as a
    # separator, so an acronym glued to Korean text is still isolated.
    normalized = "".join(
        ch if ("a" <= ch <= "z" or "0" <= ch <= "9") else " " for ch in text
    )
    return set(normalized.split())


def classify_why_stopped(text):
    """Map free-text ``why_stopped`` onto failure-cause axes (rule-based).

    Args:
        text: free-text stop reason; ``None`` is treated as empty.

    Returns:
        List of axis identifiers, each at most once, in ``NLP_RULES`` order.
        Acronym keywords listed in ``_WHOLE_WORD_KEYWORDS`` match only as
        whole ASCII tokens; all other keywords match as substrings.
    """
    t = (text or "").lower()
    tokens = _ascii_tokens(t)
    hits = []
    for axis, kws in NLP_RULES.items():
        for kw in kws:
            word = kw.strip()
            if word in _WHOLE_WORD_KEYWORDS:
                matched = word in tokens
            else:
                matched = kw in t
            if matched:
                hits.append(axis)
                break  # one hit per axis is enough
    return hits


# ---------------------------------------------------------------------------
# 가설 생성 + ranking 엔진
# ---------------------------------------------------------------------------
def derive_axes(program):
    """Combine curated attribution with the keyword classifier's axes.

    Returns:
        ``(merged, curated, nlp)`` where *curated* is the program's primary
        axes followed by its secondary axes, *nlp* is the classifier result
        for ``why_stopped_text``, and *merged* is curated-then-nlp with
        duplicates dropped (first occurrence wins).
    """
    attribution = program.get("failure_attribution", {})
    curated = [*attribution.get("primary", []), *attribution.get("secondary", [])]
    nlp = classify_why_stopped(program.get("why_stopped_text", ""))

    merged = []
    for axis in (*curated, *nlp):
        if axis in merged:
            continue
        merged.append(axis)
    return merged, curated, nlp


def build_hypotheses(program):
    """Generate ranked rescue hypotheses for a single program.

    Every lever reachable from the program's failure-cause axes through
    ``AXIS_TO_LEVER`` becomes one hypothesis dict. The result is sorted by
    ``rescue_score`` in descending order.
    """
    axes, curated, nlp = derive_axes(program)

    # Levers explicitly listed on the program, keyed by lever identifier.
    curated_levers = {}
    for entry in program.get("rescue_levers", []):
        curated_levers[entry["lever"]] = entry

    score_fields = program.get("scores", {})
    plaus = float(score_fields.get("plausibility", 0.5))
    feas = float(score_fields.get("feasibility", 0.5))
    impact = float(score_fields.get("impact", 0.5))
    verdict = program.get("target_validity", {}).get("verdict", "")
    vbonus = target_validity_bonus(verdict)

    # lever -> set of failure-cause axes that point at it
    lever_support = {}
    for axis in axes:
        for candidate in AXIS_TO_LEVER.get(axis, []):
            if candidate not in lever_support:
                lever_support[candidate] = set()
            lever_support[candidate].add(axis)

    # Lever-independent part of the score; hoisted out of the loop while
    # keeping the original left-to-right summation order.
    program_component = 0.40 * plaus + 0.20 * feas + 0.25 * impact

    ranked = []
    for lever in lever_support:
        supporting_axes = lever_support[lever]
        entry = curated_levers.get(lever)
        if not entry:
            # Lever inferred from the ontology only: fixed lower weight.
            weight = 0.4
            detail = "(온톨로지 추론) 실패원인 축으로부터 매핑된 후보 레버."
            source = "ontology_inferred"
        else:
            weight = float(entry.get("weight", 0.5))
            detail = entry.get("detail", "")
            source = "curated_lever"

        # Each additional axis backing the same lever adds 0.05.
        agreement_bonus = 0.05 * (len(supporting_axes) - 1)
        raw_score = program_component + 0.15 * weight + vbonus + agreement_bonus
        rescue_score = max(0.0, min(1.0, raw_score))

        ranked.append({
            "program_id": program["id"],
            "drug": program["drug"],
            "target": program["target"],
            "class": program["class"],
            "lever": lever,
            "lever_label": LEVER_LABELS.get(lever, lever),
            "detail": detail,
            "supporting_axes": sorted(supporting_axes),
            "source": source,
            "rescue_score": round(rescue_score, 3),
            "plausibility": plaus,
            "feasibility": feas,
            "impact": impact,
            "target_validity_bonus": round(vbonus, 3),
            "min_study": draft_min_study(program, lever, supporting_axes),
            "audit": build_audit(program, lever, supporting_axes, curated, nlp),
        })

    return sorted(ranked, key=lambda hyp: hyp["rescue_score"], reverse=True)


def draft_min_study(program, lever, axes):
    """Draft the minimal confirmatory study (design / N / endpoint) for a lever.

    Args:
        program: program record; only ``program["target"]`` is read.
        lever: rescue-lever identifier selecting the study template; unknown
            levers fall back to a generic exploratory phase-2 template.
        axes: accepted for call-site symmetry; not used here.

    Returns:
        Dict with keys design, suggested_N, primary_endpoint, target_context,
        falsifier and confirmer.
    """
    target = program["target"]

    # lever -> (design, suggested N, primary endpoint)
    templates = {
        "biomarker_stratified_responder": (
            "Biomarker-enrichment 적응형 2상 (run-in 반응자 선별 후 randomize)",
            "약 N=180 (enriched 1:1, 80% power, delta-HbA1c 0.5%)",
            "1차: 층화집단 HbA1c 변화 / 2차: 반응자 vs 비반응자 effect modification",
        ),
        "alternative_endpoint": (
            "Endpoint-reframing 개념증명 2상 (원 적응증 -> 신 적응증)",
            "약 N=120 (1:1, 신 endpoint 효과크기 가정)",
            "1차: 재배치 적응증의 surrogate (예: MASH 간지방/UACR/eGFR slope)",
        ),
        "combination_de_risking": (
            "Factorial / add-on 무작위 2상 (단독 vs 백본병용)",
            "약 N=200 (4-arm 또는 2x2)",
            "1차: 효능 endpoint / 공동1차: 표적 안전성 신호(예: HF/eGFR/ALT) 비열등",
        ),
        "tissue_selective_reformulation": (
            "Backup chemotype 1상 SAD/MAD + 표적 안전성 바이오마커",
            "약 N=48-72 (용량단계별)",
            "1차: 안전성/내약성 + off-target 마커(예: 담즙산/BSEP, 골표지자) 정상범위",
        ),
        "alternative_dosing_chronotherapy": (
            "용량범위/chronotherapy 무작위 2상 (저용량 window 탐색)",
            "약 N=150 (다용량군)",
            "1차: 효능-안전성 trade-off 곡선 (therapeutic window 확인)",
        ),
    }
    generic = ("탐색적 2상", "약 N=150", "1차: 효능 + 표적 안전성")
    design, sample_size, endpoint = templates.get(lever, generic)

    return {
        "design": design,
        "suggested_N": sample_size,
        "primary_endpoint": endpoint,
        "target_context": target,
        "falsifier": "이 가설이 거짓이면: 층화/재설계 후에도 동일 표적장기 안전성 신호 또는 효과부재가 재현된다.",
        "confirmer": "이 가설이 참이면: 부활레버 적용군에서 benefit-risk 가 원 시험 대비 통계적으로 개선된다.",
    }


def build_audit(program, lever, axes, curated, nlp):
    """Build the evidence-trail record attached to one hypothesis.

    Args:
        program: program record; optional fields default to "".
        lever: rescue-lever identifier of the hypothesis.
        axes: failure-cause axes supporting the lever; used to describe the
            provenance of levers that are not listed on the program.
        curated: curated failure-attribution axes (stored as given).
        nlp: axes found by the keyword classifier (stored as given).

    Returns:
        Dict of provenance fields. ``lever_source`` is
        ``field:rescue_levers[<lever>]`` when the program lists the lever in
        ``rescue_levers``; otherwise it names the ontology mapping and the
        supporting axes, so the audit never cites a field entry that does not
        exist.
    """
    tv = program.get("target_validity", {})

    listed_on_program = any(
        isinstance(entry, dict) and entry.get("lever") == lever
        for entry in program.get("rescue_levers", [])
    )
    if listed_on_program:
        lever_source = "field:rescue_levers[%s]" % lever
    else:
        axis_list = ",".join(sorted(str(a) for a in axes))
        lever_source = "ontology:AXIS_TO_LEVER[%s] -> %s" % (axis_list, lever)

    return {
        "why_stopped_source": "field:why_stopped_text (CT.gov v2 status + post-mortem lit)",
        "why_stopped_excerpt": program.get("why_stopped_text", ""),
        "failure_attribution_curated": curated,
        "failure_attribution_nlp_crosscheck": nlp,
        "lever_source": lever_source,
        "target_validity_verdict": tv.get("verdict", ""),
        "target_validity_mr": tv.get("mr_support", ""),
        "target_validity_same_target": tv.get("same_target_other_drugs", ""),
        "ctgov_status": program.get("ctgov_status", ""),
        "public_sources": "CT.gov v2 / Inxight / Open Targets / PubMed (무료 공개)",
    }


def all_hypotheses(data):
    """Return the hypotheses of every program in *data*, best score first."""
    pooled = [
        hyp
        for program in data["programs"]
        for hyp in build_hypotheses(program)
    ]
    return sorted(pooled, key=lambda hyp: hyp["rescue_score"], reverse=True)


# ---------------------------------------------------------------------------
# 출력 헬퍼
# ---------------------------------------------------------------------------
def hr(ch="-", n=78):
    """Return a horizontal rule made of *ch* repeated *n* times."""
    return "".join([ch] * n)


def print_banner():
    """Print the tool title, tagline and disclaimer framed by '=' rules."""
    rule = hr("=")
    banner_lines = (
        rule,
        "DiaRescueMiner-Kor (디아레스큐마이너코어)",
        "당뇨·합병증 중단 신약 -> rescue 연구 가설 생성기 (오프라인 데모)",
        rule,
        "[디스클레이머] " + DISCLAIMER,
        rule,
    )
    for line in banner_lines:
        print(line)


def print_list(data):
    """Print the banner followed by a numbered table of all programs.

    Columns are row number, drug (cut to 30 chars), class (cut to 34 chars)
    and year stopped ("?" when absent).
    """
    print_banner()
    programs = data["programs"]
    print("\n중단(terminated/withdrawn) 프로그램 레지스트리 (%d종):\n"
          % len(programs))
    print("%-3s %-30s %-34s %-10s" % ("#", "약물(drug)", "표적/class", "year"))
    print(hr())

    row_no = 0
    for program in programs:
        row_no += 1
        class_text = "%s" % program["class"]
        drug_cell = program["drug"][:30]
        year_cell = program.get("year_stopped", "?")
        print("%-3d %-30s %-34s %-10s"
              % (row_no, drug_cell, class_text[:34], year_cell))
    print()


def _verdict_bucket(verdict):
    """Reduce a free-form target-validity verdict to one summary bucket.

    Returns one of "partially_valid", "questionable", "invalid", "valid" or
    "other". Matching is case-insensitive; ``None`` falls into "other".
    """
    v = (verdict or "").lower()
    # "partially_valid" and "invalid" both contain "valid", so the generic
    # "valid" bucket has to be tested last.
    if "partially_valid" in v:
        return "partially_valid"
    if "questionable" in v:
        return "questionable"
    if "invalid" in v:
        return "invalid"
    if "valid" in v:
        return "valid"
    return "other"


def print_summary(data):
    """Print summary statistics for the registry.

    Output covers the program count, the class distribution, the primary
    failure-axis distribution, the target-validity verdict distribution, the
    total number of generated hypotheses and, when at least one hypothesis
    exists, the top-scoring one.
    """
    from collections import Counter

    print_banner()
    progs = data["programs"]
    print("\n[요약 통계]\n")
    print("총 중단 프로그램: %d종" % len(progs))

    # Class distribution
    classes = Counter(p["class"] for p in progs)
    print("\n클래스 분포:")
    for c, n in classes.most_common():
        print("  - %-42s %d" % (c, n))

    # Failure-axis distribution (primary attribution only)
    axis_counter = Counter(
        a
        for p in progs
        for a in p.get("failure_attribution", {}).get("primary", [])
    )
    print("\n실패원인 축 분포 (primary 기준):")
    for a, n in axis_counter.most_common():
        print("  - %-30s %d" % (AXIS_LABELS.get(a, a), n))

    # Target-validity verdict distribution
    print("\n표적 유효성(verdict) 분포:")
    vc = Counter(
        _verdict_bucket(p.get("target_validity", {}).get("verdict", "unknown"))
        for p in progs
    )
    for k, n in vc.most_common():
        print("  - %-20s %d" % (k, n))

    hyps = all_hypotheses(data)
    print("\n생성된 rescue 가설 총합: %d개" % len(hyps))
    if hyps:
        # Skipped for an empty registry, where there is no best hypothesis.
        best = hyps[0]
        print("최고 점수 가설: %s -> %s (score=%.3f)"
              % (best["drug"], best["lever_label"], best["rescue_score"]))
    print()


def print_hypothesis(h, idx=None, verbose=False):
    """Print one hypothesis dict as an indented text card.

    Args:
        h: hypothesis dict as produced by ``build_hypotheses``.
        idx: optional rank shown as a ``[#idx]`` prefix; omitted when None.
        verbose: when true, also print the audit (evidence-trail) section.
    """
    if idx is None:
        head = ""
    else:
        head = "[#%s] " % idx

    print(hr())
    print("%s%s  |  rescue_score = %.3f" % (head, h["drug"], h["rescue_score"]))
    print("   표적/class : %s / %s" % (h["target"], h["class"]))
    print("   부활레버   : %s" % h["lever_label"])

    axis_names = [AXIS_LABELS.get(axis, axis) for axis in h["supporting_axes"]]
    print("   지지 실패축: %s" % ", ".join(axis_names))

    wrapped_detail = textwrap.fill(h["detail"], width=72,
                                   subsequent_indent=" " * 16)
    print("   레버 상세  : %s" % wrapped_detail)

    print("   점수내역   : plaus=%.2f feas=%.2f impact=%.2f tgtbonus=%+.2f (source=%s)"
          % (h["plausibility"], h["feasibility"], h["impact"],
             h["target_validity_bonus"], h["source"]))

    study = h["min_study"]
    print("   최소검증설계:")
    for template, field in (
        ("       design   : %s", "design"),
        ("       N        : %s", "suggested_N"),
        ("       endpoint : %s", "primary_endpoint"),
        ("       반증조건 : %s", "falsifier"),
    ):
        print(template % study[field])

    if not verbose:
        return

    audit = h["audit"]
    print("   [근거 추적 / audit]")
    for template, field in (
        ("       why_stopped: %s", "why_stopped_excerpt"),
        ("       귀속(curated): %s", "failure_attribution_curated"),
        ("       귀속(NLP교차): %s", "failure_attribution_nlp_crosscheck"),
        ("       표적 verdict : %s", "target_validity_verdict"),
        ("       MR/유전 근거 : %s", "target_validity_mr"),
        ("       동일표적 타약: %s", "target_validity_same_target"),
        ("       출처         : %s", "public_sources"),
    ):
        print(template % audit[field])


def print_drug_detail(data, name):
    """Print details and all rescue hypotheses for programs matching *name*.

    A program matches when *name* is a case-insensitive substring of its
    drug, id, target or class field. When nothing matches, a short hint is
    printed and the function returns without printing the banner.

    The descriptive fields ``indication``, ``max_phase`` and
    ``why_stopped_text`` are read with defaults, so a record that lacks them
    is still printed instead of raising ``KeyError`` (the hypothesis engine
    already treats ``why_stopped_text`` as optional).
    """
    name_l = name.lower()
    matches = [
        p for p in data["programs"]
        if any(name_l in p[field].lower()
               for field in ("drug", "id", "target", "class"))
    ]
    if not matches:
        print("일치하는 약물/표적/클래스 없음: %r" % name)
        print("--list 로 전체 목록을 확인하세요.")
        return

    print_banner()
    for p in matches:
        print("\n" + hr("#"))
        print("약물: %s   (id=%s)" % (p["drug"], p["id"]))
        print("표적: %s | class: %s | 적응증: %s"
              % (p["target"], p["class"], p.get("indication", "?")))
        print("phase: %s | year_stopped: %s | ct.gov: %s"
              % (p.get("max_phase", "?"), p.get("year_stopped", "?"),
                 p.get("ctgov_status", "?")))
        print("why_stopped: %s" % p.get("why_stopped_text", ""))

        # Only the merged axis list is displayed here.
        axes, _curated, _nlp = derive_axes(p)
        print("실패원인 축(병합): %s"
              % ", ".join(AXIS_LABELS.get(a, a) for a in axes))
        tv = p.get("target_validity", {})
        print("표적 유효성 verdict: %s" % tv.get("verdict", ""))
        print(hr("#"))

        hyps = build_hypotheses(p)
        print("rescue 가설 (%d개, 점수 내림차순):" % len(hyps))
        for i, h in enumerate(hyps, 1):
            print_hypothesis(h, idx=i, verbose=True)
    print()


# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------
def build_parser():
    """Create the argparse parser for the CLI.

    Options: --top N, --drug NAME, --list, --summary, --verbose, --json and
    --data PATH (defaulting to ``DATA_PATH``). The disclaimer is the epilog.
    """
    description = textwrap.dedent("""\
        DiaRescueMiner-Kor — 당뇨·합병증 중단 신약 rescue 가설 생성기 (오프라인 CLI)

        효능부족/안전성으로 종료·중단된 당뇨·합병증 프로그램을 구조화하고,
        (실패원인 -> 부활레버) 온톨로지로 검증가능한 rescue 연구 가설을 ranked 생성합니다.

        예시:
          python3 main.py --list
          python3 main.py --summary
          python3 main.py --top 8
          python3 main.py --top 5 --verbose
          python3 main.py --drug fasiglifam
          python3 main.py --drug PPAR --json
        """)
    parser = argparse.ArgumentParser(
        prog="main.py",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=description,
        epilog="[디스클레이머] " + DISCLAIMER,
    )

    # (flag, add_argument keyword arguments), registered in display order.
    option_specs = (
        ("--top", {"type": int, "metavar": "N",
                   "help": "rescue_score 상위 N개 가설을 출력 (기본 동작)"}),
        ("--drug", {"type": str, "metavar": "NAME",
                    "help": "약물/표적/클래스 이름으로 상세 + 가설 출력"}),
        ("--list", {"action": "store_true",
                    "help": "중단 프로그램 레지스트리 목록 출력"}),
        ("--summary", {"action": "store_true",
                       "help": "요약 통계 (클래스/실패축/표적유효성 분포) 출력"}),
        ("--verbose", {"action": "store_true",
                       "help": "가설 출력 시 근거추적(audit) 상세 포함"}),
        ("--json", {"action": "store_true",
                    "help": "결과를 JSON 으로 출력 (파이프/후처리용)"}),
        ("--data", {"type": str, "default": DATA_PATH,
                    "help": "데이터 JSON 경로 (기본: data/terminated_programs.json)"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser


def main(argv=None):
    """Run the CLI and return the process exit status.

    Args:
        argv: argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 on success, 2 when the dataset cannot be loaded or has no
        "programs" list. Invalid command-line values (including a negative
        ``--top``) end via ``parser.error`` and therefore ``SystemExit``.

    Mode precedence: --drug, then --list, then --summary, otherwise the
    top-N ranking (10 when --top is not given).
    """
    parser = build_parser()
    args = parser.parse_args(argv)

    # A negative N would slice from the end of the ranking (hyps[:-3] drops
    # the last three), which is never what the user asked for.
    if args.top is not None and args.top < 0:
        parser.error("--top 은 0 이상의 정수여야 합니다 (입력값: %d)" % args.top)

    # Top-level boundary: report any load failure as a CLI error, not a traceback.
    try:
        data = load_data(args.data)
    except Exception as e:
        print("ERROR: 데이터 로드 실패: %s" % e, file=sys.stderr)
        return 2
    if not isinstance(data, dict) or not isinstance(data.get("programs"), list):
        print("ERROR: 데이터 로드 실패: 'programs' 목록이 없습니다: %s" % args.data,
              file=sys.stderr)
        return 2

    # --drug
    if args.drug:
        if args.json:
            # Match per field, as the text view does. Concatenating the
            # fields would let a query match across a field boundary.
            needle = args.drug.lower()
            matches = [
                p for p in data["programs"]
                if any(needle in p[field].lower()
                       for field in ("drug", "id", "target", "class"))
            ]
            out = {"query": args.drug,
                   "hypotheses": [h for p in matches for h in build_hypotheses(p)]}
            print(json.dumps(out, ensure_ascii=False, indent=2))
        else:
            print_drug_detail(data, args.drug)
        return 0

    # --list
    if args.list:
        if args.json:
            print(json.dumps(data["programs"], ensure_ascii=False, indent=2))
        else:
            print_list(data)
        return 0

    # --summary
    if args.summary:
        if args.json:
            print(json.dumps({"n_programs": len(data["programs"]),
                              "n_hypotheses": len(all_hypotheses(data))},
                             ensure_ascii=False, indent=2))
        else:
            print_summary(data)
        return 0

    # Default: top-N rescue hypotheses. Compare against None so that an
    # explicit "--top 0" is honoured instead of silently becoming 10.
    top_n = 10 if args.top is None else args.top
    hyps = all_hypotheses(data)
    sel = hyps[:top_n]

    if args.json:
        print(json.dumps(sel, ensure_ascii=False, indent=2))
        return 0

    print_banner()
    print("\nTop %d rescue 연구 가설 (rescue_score 내림차순, 전체 %d개 중):\n"
          % (len(sel), len(hyps)))
    for i, h in enumerate(sel, 1):
        print_hypothesis(h, idx=i, verbose=args.verbose)
    print(hr())
    print("총 %d개 가설 생성됨. 더 보려면 --top N, 상세는 --verbose, "
          "약물별은 --drug NAME." % len(hyps))
    print("[디스클레이머] " + DISCLAIMER)
    return 0


# Script entry point: run the CLI and use main()'s return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())