"""
MASHBiospecimenCoC-Kor — 합성 데모 데이터 생성 스크립트
==========================================================
외부 네트워크/API 호출 없음. numpy 시드 고정으로 재현 가능한 합성 데이터 생성.

생성 CSV (모두 data/ 폴더):
  1. specimen_manifest.csv   — 환자별 visit별 검체(biopsy/ELF/PRO-C3/VCTE) 채취 due/완료/window
  2. custody_log.csv         — chain-of-custody 단계별 timestamp/담당자/온도
  3. shipment_log.csv        — cold-chain 운송 QC(온도일탈/용혈/integrity)
  4. pathology_turnaround.csv— central pathology 접수→판독 turnaround/재판독
  5. visit_schedule.csv      — MASH 시험 typical visit schedule window 참조표

주의: 본 데이터는 전부 합성(가짜)이며 실제 임상시험 데이터가 아니다.
"""

import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# Fixed seed and a single module-level Generator shared by every builder below.
# The generated values depend on the order in which the builders draw from it.
SEED = 20260531
rng = np.random.default_rng(SEED)

# Directory containing this script; main() writes the CSV files here.
HERE = os.path.dirname(os.path.abspath(__file__))

# -------------------------------------------------------------------
# Reference: typical MASH (phase 2/3) visit schedule and specimen types
#   - Conceptual sources: ICH GCP specimen chain of custody, IATA PI650
#     specimen transport, typical MASH trial schedule
#   - liver biopsy: baseline / wk52 (histology primary endpoint)
#   - blood NITs (ELF, PRO-C3): baseline / wk24 / wk52
#   - VCTE (FibroScan): baseline / wk24 / wk52
# -------------------------------------------------------------------
VISITS = ["Baseline", "Week24", "Week52"]
# Per-visit study day (relative to baseline = day 0) and allowed window (+/- days)
VISIT_DEF = {
    "Baseline": {"target_day": 0,   "window_days": 14},
    "Week24":   {"target_day": 168, "window_days": 14},
    "Week52":   {"target_day": 364, "window_days": 21},
}
# Which visits each specimen type is collected at.
# Iteration order of this dict determines row order in the manifest.
SPECIMEN_VISITS = {
    "LiverBiopsy": ["Baseline", "Week52"],
    "ELF":         ["Baseline", "Week24", "Week52"],
    "PRO-C3":      ["Baseline", "Week24", "Week52"],
    "VCTE":        ["Baseline", "Week24", "Week52"],
}
SPECIMEN_TEMP = {
    # Recommended shipping temperature band (label); used to pick QC thresholds
    "LiverBiopsy": "FFPE_ambient",   # FFPE block at ambient temperature, or frozen -80
    "ELF":         "frozen_-20",
    "PRO-C3":      "frozen_-20",
    "VCTE":        "device_result",  # imaging device result (no physical shipment)
}

SITES = ["SITE-SCH-BC", "SITE-SNU", "SITE-AMC", "SITE-SMC"]
# Coordinators per site; handlers for site-side custody steps are drawn from here.
SITE_STAFF = {
    "SITE-SCH-BC": ["CRC_Kim", "CRC_Lee"],
    "SITE-SNU":    ["CRC_Park", "CRC_Choi"],
    "SITE-AMC":    ["CRC_Jung", "CRC_Kang"],
    "SITE-SMC":    ["CRC_Yoon", "CRC_Lim"],
}
COURIERS = ["WorldCourier", "MarkenLogix", "CryoExpress"]
# NOTE(review): PATHOLOGISTS is not referenced by any builder visible in this file.
PATHOLOGISTS = ["Path_Dr_A", "Path_Dr_B", "Path_Dr_C"]

STUDY_START = datetime(2025, 6, 2)   # nominal study day 0; per-patient day 0 is offset from this date
TODAY = datetime(2026, 5, 31)        # "today" taken from the CLAUDE context (snapshot reference date)

N_PATIENTS = 40


def _fmt(dt):
    return dt.strftime("%Y-%m-%d %H:%M") if dt is not None else ""


def _date(dt):
    return dt.strftime("%Y-%m-%d") if dt is not None else ""


def build_visit_schedule():
    """Return the visit-schedule reference table as a DataFrame.

    One row is produced per entry of VISITS, carrying the target study day
    and +/- window from VISIT_DEF plus a ``|``-joined list of the specimen
    types that SPECIMEN_VISITS assigns to that visit.
    """
    def _specimens_at(visit_name):
        # Specimen types scheduled at this visit, in SPECIMEN_VISITS order.
        return "|".join(
            specimen
            for specimen, visit_names in SPECIMEN_VISITS.items()
            if visit_name in visit_names
        )

    records = [
        {
            "visit": visit_name,
            "target_study_day": VISIT_DEF[visit_name]["target_day"],
            "window_days_plus_minus": VISIT_DEF[visit_name]["window_days"],
            "specimens": _specimens_at(visit_name),
        }
        for visit_name in VISITS
    ]
    return pd.DataFrame(records)


def build_manifest():
    """Build the specimen manifest: one row per patient x visit x specimen.

    For every patient a personal day 0 is derived from STUDY_START, then each
    visit's due date and +/- window are computed and every specimen scheduled
    at that visit receives one of these statuses:

    * ``discontinued`` - the patient dropped out at or before this visit.
    * ``scheduled``    - the window has not opened yet, or the specimen is
      still uncollected while the window is open.
    * ``collected``    - a collection date was simulated; it is never later
      than TODAY and, when capped at TODAY, never earlier than the window
      start.
    * ``missed``       - nothing was collected and the window has closed.

    ``window_violation`` is ``"no"``/``"yes"`` for collected specimens
    (recomputed from the final date), ``"yes"`` for missed ones and ``"NA"``
    otherwise.

    The sequence of draws from the module-level ``rng`` is part of the
    contract: changing it changes every value generated afterwards.

    Returns:
        pandas.DataFrame with the columns listed in ``columns`` below. The
        columns are present even when no rows are generated.
    """
    columns = [
        "patient_id", "site_id", "visit", "specimen_type", "target_due_date",
        "window_start", "window_end", "collected_date", "status",
        "window_violation",
    ]
    rows = []
    for i in range(N_PATIENTS):
        pid = f"P{i+1:03d}"
        site = SITES[i % len(SITES)]
        # Per-patient enrollment offset: the calendar date of study day 0
        # differs from patient to patient.
        enroll_offset = int(rng.integers(-120, 60))
        patient_day0 = STUDY_START + timedelta(days=enroll_offset)

        # Visit from which the patient is discontinued ("none" = no dropout).
        dropout_after = rng.choice(
            ["none", "Week24", "Week52"], p=[0.75, 0.10, 0.15]
        )

        for v in VISITS:
            d = VISIT_DEF[v]
            due_date = patient_day0 + timedelta(days=d["target_day"])
            window = d["window_days"]
            win_start = due_date - timedelta(days=window)
            win_end = due_date + timedelta(days=window)

            # Window has not opened as of the snapshot date.
            not_yet_due = win_start > TODAY
            # Window has fully elapsed; only then can a specimen be "missed".
            window_closed = win_end < TODAY

            dropped = (
                (dropout_after == "Week24" and v in ["Week24", "Week52"]) or
                (dropout_after == "Week52" and v == "Week52")
            )

            for spec in SPECIMEN_VISITS_for_visit(v):
                status = "scheduled"
                collected_date = None
                window_violation = "NA"

                if dropped:
                    status = "discontinued"
                elif not not_yet_due:
                    # Visit window has opened: mostly collected, some late,
                    # some never collected.
                    r = rng.random()
                    if r < 0.93:
                        if r < 0.82:
                            # Collected inside the window.
                            offset = int(rng.integers(-window, window + 1))
                            max_back_days = 5
                        else:
                            # Collected late, after the window closed.
                            offset = int(rng.integers(window + 1, window + 20))
                            max_back_days = 3
                        collected_date = due_date + timedelta(days=offset)
                        if collected_date > TODAY:
                            # A collection cannot lie in the future: pull it
                            # back to just before the snapshot date, but never
                            # before the window opened (win_start <= TODAY is
                            # guaranteed in this branch).
                            recent = TODAY - timedelta(
                                days=int(rng.integers(0, max_back_days))
                            )
                            collected_date = max(win_start, recent)
                        status = "collected"
                        in_window = win_start <= collected_date <= win_end
                        window_violation = "no" if in_window else "yes"
                    elif window_closed:
                        # Not collected and the window is over - alert target.
                        status = "missed"
                        window_violation = "yes"
                    # Otherwise the specimen is uncollected but the window is
                    # still open, so it stays "scheduled" / "NA".

                rows.append({
                    "patient_id": pid,
                    "site_id": site,
                    "visit": v,
                    "specimen_type": spec,
                    "target_due_date": _date(due_date),
                    "window_start": _date(win_start),
                    "window_end": _date(win_end),
                    "collected_date": _date(collected_date),
                    "status": status,
                    "window_violation": window_violation,
                })
    # Explicit columns keep the schema intact even when no rows exist.
    return pd.DataFrame(rows, columns=columns)


def SPECIMEN_VISITS_for_visit(visit):
    """Return the specimen types collected at *visit*, in SPECIMEN_VISITS order."""
    matching = []
    for specimen, visit_names in SPECIMEN_VISITS.items():
        if visit in visit_names:
            matching.append(specimen)
    return matching


def build_custody_and_shipment(manifest):
    """Create custody and shipment rows for every collected specimen.

    Only manifest rows with ``status == "collected"`` are processed.

    * VCTE rows produce a single ``collection`` custody row and no shipment
      row (device result, nothing is physically shipped).
    * Every other specimen produces four custody rows (collection,
      site_storage, shipment_dispatch, central_receipt) and one shipment row
      holding the cold-chain QC outcome.

    Values are drawn from the module-level ``rng``; the order of the draws
    below determines the generated data, so statements must not be reordered.

    Args:
        manifest: DataFrame as produced by build_manifest(); the columns
            status, specimen_type, collected_date, site_id and patient_id
            are read.

    Returns:
        Tuple ``(custody_df, shipment_df)`` of pandas DataFrames.
    """
    custody_rows = []
    shipment_rows = []
    # Running counter shared by VCTE and physical specimens; it feeds both
    # the SPC (specimen) and SHP (shipment) identifiers.
    specimen_uid = 0

    collected = manifest[manifest["status"] == "collected"].copy()
    for _, row in collected.iterrows():
        spec = row["specimen_type"]
        if spec == "VCTE":
            # Imaging result - no physical shipment; log a single custody row only
            specimen_uid += 1
            uid = f"SPC{specimen_uid:05d}"
            # Fixed 09:00 timestamp on the collection date (no random draw here)
            coll_dt = datetime.strptime(row["collected_date"], "%Y-%m-%d") + timedelta(hours=9)
            staff = rng.choice(SITE_STAFF[row["site_id"]])
            custody_rows.append({
                "specimen_uid": uid,
                "patient_id": row["patient_id"],
                "site_id": row["site_id"],
                "specimen_type": spec,
                "step": "collection",
                "timestamp": _fmt(coll_dt),
                "handler": staff,
                "temp_c": "NA",
                "location": row["site_id"],
                "notes": "VCTE device result (no physical shipment)",
            })
            continue

        specimen_uid += 1
        uid = f"SPC{specimen_uid:05d}"
        # Random time of day on the collection date
        coll_dt = datetime.strptime(row["collected_date"], "%Y-%m-%d") + timedelta(
            hours=int(rng.integers(8, 16)), minutes=int(rng.integers(0, 60))
        )
        staff = rng.choice(SITE_STAFF[row["site_id"]])

        # Nominal temperature and acceptable range for this specimen's label
        temp_label = SPECIMEN_TEMP[spec]
        if temp_label == "frozen_-20":
            base_temp, target_lo, target_hi = -20.0, -25.0, -15.0
        elif temp_label == "FFPE_ambient":
            base_temp, target_lo, target_hi = 21.0, 15.0, 30.0
        else:
            # Fallback for any other label; currently the same values as FFPE_ambient
            base_temp, target_lo, target_hi = 21.0, 15.0, 30.0

        # Four custody steps: collection -> site_storage -> shipment_dispatch -> central_receipt
        t0 = coll_dt
        custody_rows.append({
            "specimen_uid": uid, "patient_id": row["patient_id"], "site_id": row["site_id"],
            "specimen_type": spec, "step": "collection", "timestamp": _fmt(t0),
            "handler": staff, "temp_c": round(base_temp + rng.normal(0, 0.5), 1),
            "location": row["site_id"], "notes": "",
        })
        t1 = t0 + timedelta(hours=int(rng.integers(1, 6)))
        # NOTE(review): the location suffix is "_freezer" for every specimen,
        # including FFPE_ambient biopsies - confirm this is intended.
        custody_rows.append({
            "specimen_uid": uid, "patient_id": row["patient_id"], "site_id": row["site_id"],
            "specimen_type": spec, "step": "site_storage", "timestamp": _fmt(t1),
            "handler": staff, "temp_c": round(base_temp + rng.normal(0, 0.8), 1),
            "location": f"{row['site_id']}_freezer", "notes": "",
        })
        t2 = t1 + timedelta(days=int(rng.integers(0, 4)), hours=int(rng.integers(0, 12)))
        courier = rng.choice(COURIERS)
        custody_rows.append({
            "specimen_uid": uid, "patient_id": row["patient_id"], "site_id": row["site_id"],
            "specimen_type": spec, "step": "shipment_dispatch", "timestamp": _fmt(t2),
            "handler": courier, "temp_c": round(base_temp + rng.normal(0, 1.0), 1),
            "location": "in_transit", "notes": f"courier={courier}",
        })

        # Shipment QC: classify temperature excursion / hemolysis / integrity reject
        # NOTE(review): dispatch (t2) and receipt (t3) timestamps are not
        # capped at TODAY in this function.
        transit_hours = int(rng.integers(18, 96))
        t3 = t2 + timedelta(hours=transit_hours)

        # Whether a temperature excursion occurred (r < 0.12)
        r = rng.random()
        temp_excursion = "no"
        max_temp = round(base_temp + rng.normal(0, 1.0), 1)
        if r < 0.12:
            temp_excursion = "yes"
            # On excursion the observed maximum is pushed above target_hi
            if temp_label == "frozen_-20":
                max_temp = round(target_hi + rng.uniform(1, 8), 1)  # warmed up too much
            else:
                max_temp = round(target_hi + rng.uniform(1, 10), 1)

        # Integrity / hemolysis reject. Branches are checked in priority
        # order; the extra rng.random() draw in the last branch happens only
        # when an excursion occurred and no earlier branch matched.
        reject_reason = "none"
        accepted = "accepted"
        rr = rng.random()
        if spec in ("ELF", "PRO-C3") and rr < 0.08:
            reject_reason = "hemolysis"
            accepted = "rejected"
        elif rr < 0.11:
            reject_reason = "integrity_breach"
            accepted = "rejected"
        elif temp_excursion == "yes" and rng.random() < 0.4:
            reject_reason = "temp_excursion_out_of_range"
            accepted = "rejected"

        # Every rejected shipment triggers a recollection
        recollection_triggered = "yes" if accepted == "rejected" else "no"

        shipment_rows.append({
            "shipment_id": f"SHP{specimen_uid:05d}",
            "specimen_uid": uid,
            "patient_id": row["patient_id"],
            "site_id": row["site_id"],
            "specimen_type": spec,
            "courier": courier,
            "dispatch_timestamp": _fmt(t2),
            "central_receipt_timestamp": _fmt(t3),
            "transit_hours": transit_hours,
            "target_temp_range_c": f"{target_lo}~{target_hi}",
            "max_temp_observed_c": max_temp,
            "temp_excursion": temp_excursion,
            "iata_pi650_compliant": "yes" if temp_excursion == "no" else "review",
            "qc_result": accepted,
            "reject_reason": reject_reason,
            "recollection_triggered": recollection_triggered,
        })

        # central_receipt custody row: appended for every shipment, accepted
        # or rejected; the QC outcome is recorded in the notes field.
        custody_rows.append({
            "specimen_uid": uid, "patient_id": row["patient_id"], "site_id": row["site_id"],
            "specimen_type": spec, "step": "central_receipt", "timestamp": _fmt(t3),
            "handler": "CentralLab_Intake", "temp_c": max_temp,
            "location": "CentralLab", "notes": f"qc={accepted};reason={reject_reason}",
        })

    return pd.DataFrame(custody_rows), pd.DataFrame(shipment_rows)


def build_pathology(shipment):
    """Simulate central-pathology turnaround for accepted liver-biopsy blocks.

    For every shipment row with ``specimen_type == "LiverBiopsy"`` and
    ``qc_result == "accepted"`` one pathology row is generated: receipt date,
    read-complete date (blank while pending), turnaround in days, whether the
    turnaround target was breached, whether a re-read is required and how
    many residual sections remain on the block.

    A read whose simulated completion falls after TODAY is reported as
    ``pending`` with blank turnaround and ``tat_breach == "NA"``.

    The sequence of draws from the module-level ``rng`` is preserved exactly,
    so seeded output for non-empty input is unchanged.

    Args:
        shipment: shipment DataFrame from build_custody_and_shipment(). May
            be completely empty (no rows and no columns).

    Returns:
        pandas.DataFrame with the columns listed in ``columns`` below. The
        columns are always present, even when there are no rows, so the CSV
        written from it keeps its header.
    """
    columns = [
        "block_id", "specimen_uid", "patient_id", "site_id",
        "central_receipt_date", "read_complete_date", "turnaround_days",
        "read_status", "target_tat_days", "tat_breach", "reread_required",
        "residual_sections",
    ]
    # A shipment log built from zero rows has no columns at all; indexing it
    # by column name would raise KeyError, so return the empty schema instead.
    if shipment.empty:
        return pd.DataFrame(columns=columns)

    target_tat_days = 14  # turnaround target: read within 14 days of receipt

    is_accepted_biopsy = (
        (shipment["specimen_type"] == "LiverBiopsy") &
        (shipment["qc_result"] == "accepted")
    )

    rows = []
    for _, row in shipment[is_accepted_biopsy].iterrows():
        recv = datetime.strptime(row["central_receipt_timestamp"], "%Y-%m-%d %H:%M")

        # Turnaround: most reads finish within target, some are delayed by
        # backlog, the rest are still unread.
        r = rng.random()
        if r < 0.65:
            tat_days = int(rng.integers(3, 14))
            read_status = "read_complete"
        elif r < 0.85:
            tat_days = int(rng.integers(15, 35))  # over target due to backlog
            read_status = "read_complete"
        else:
            tat_days = None
            read_status = "pending"  # not read yet (backlog)

        read_dt = None
        if read_status == "read_complete":
            read_dt = recv + timedelta(days=tat_days, hours=int(rng.integers(0, 12)))
            if read_dt > TODAY:
                # The read would finish after the snapshot date: still pending.
                read_dt = None
                read_status = "pending"
                tat_days = None

        # Re-read (adjudication / failed QC read) applies only to completed
        # reads; the random draw is skipped for pending ones.
        reread = "yes" if (read_status == "read_complete" and rng.random() < 0.12) else "no"

        # Residual sections left on the block (slides available for re-cutting)
        residual_sections = int(rng.integers(0, 8))

        if tat_days is None:
            tat_breach = "NA"
        elif tat_days > target_tat_days:
            tat_breach = "yes"
        else:
            tat_breach = "no"

        rows.append({
            "block_id": f"BLK{row['specimen_uid'][3:]}",
            "specimen_uid": row["specimen_uid"],
            "patient_id": row["patient_id"],
            "site_id": row["site_id"],
            "central_receipt_date": _date(recv),
            "read_complete_date": _date(read_dt),
            "turnaround_days": tat_days if tat_days is not None else "",
            "read_status": read_status,
            "target_tat_days": target_tat_days,
            "tat_breach": tat_breach,
            "reread_required": reread,
            "residual_sections": residual_sections,
        })
    return pd.DataFrame(rows, columns=columns)


def main():
    """Generate all five synthetic tables and write them as CSV next to this script.

    The builders run in a fixed order because they share one seeded random
    generator; each file is written without the DataFrame index and a
    one-line summary is printed per file.
    """
    schedule_df = build_visit_schedule()
    manifest_df = build_manifest()
    custody_df, shipment_df = build_custody_and_shipment(manifest_df)
    pathology_df = build_pathology(shipment_df)

    outputs = (
        ("visit_schedule.csv", schedule_df),
        ("specimen_manifest.csv", manifest_df),
        ("custody_log.csv", custody_df),
        ("shipment_log.csv", shipment_df),
        ("pathology_turnaround.csv", pathology_df),
    )
    for filename, frame in outputs:
        frame.to_csv(os.path.join(HERE, filename), index=False)
        print(f"wrote {filename}: shape={frame.shape}")


if __name__ == "__main__":
    main()
