"""seahorse_parser.py

Seahorse XF raw data ingest.

- `.asyr` binary parsing is only a stub in the MVP because it is an
  undocumented Agilent format.
  (In practice: export from Wave to Excel/CSV and ingest that instead.)
- Excel `.xlsx` (the default Wave export format) + CSV multi-plate ingest.
- Plate map recognition (well / group / cell type / drug / dose / substrate).
- Auto-detection of 4 protocols: injection sequence pattern matching.
    * Mito Stress: Oligomycin -> FCCP -> Rotenone/Antimycin A
    * Glycolysis Stress: Glucose -> Oligomycin -> 2-DG
    * FAO Assay: BSA-palmitate -> (Etomoxir, optional) -> Oligomycin -> FCCP -> Rot/AA
    * ATP Rate Assay: Oligomycin -> Rotenone/Antimycin A
"""

from __future__ import annotations

import csv
import os
import re
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Tuple

import pandas as pd


# ---------------------------------------------------------------------------
# 1. Data classes
# ---------------------------------------------------------------------------


@dataclass
class PlateMeta:
    """Summary metadata describing one parsed plate file.

    Instances are built by `parse_plate` from the normalized measurement
    table; every field is derived from that table or from the file path.
    """

    # Plate identifier: explicit argument, else first non-null `plate_id`
    # cell, else the source file name without its extension.
    plate_id: str
    # Result of `detect_protocol` (possibly refined by the substrate column).
    protocol: str          # "Mito Stress" | "Glycolysis Stress" | "FAO Assay" | "ATP Rate Assay" | "Unknown"
    # Unique labels of the `injection` column in order of first appearance.
    injections: List[str]  # ordered injection names
    # Sorted unique non-null values of the corresponding table columns.
    cell_types: List[str]
    drugs: List[str]
    substrates: List[str]
    # Absolute path of the file the plate was read from.
    source_file: str
    # Number of distinct values in the `well` column.
    n_wells: int
    # Number of distinct values in the `measurement` column.
    n_timepoints: int

@dataclass
class PlateData:
    """One parsed plate: summary metadata plus its measurement table."""

    # Summary of the plate (protocol, injections, counts, source file).
    meta: PlateMeta
    # long-format DataFrame: well, group, cell_type, drug, dose, substrate,
    # timepoint, time_min, measurement, ocr, ecar, ppr (optional)
    # Defaults to a fresh empty DataFrame when not supplied.
    df: pd.DataFrame = field(default_factory=pd.DataFrame)


# ---------------------------------------------------------------------------
# 2. Wave .asyr stub
# ---------------------------------------------------------------------------


def parse_asyr_stub(path: str) -> PlateData:
    """Placeholder for Wave `.asyr` binary ingest; always raises.

    The Agilent Wave `.asyr` format (reportedly an .xml plus binary payload
    inside a zip) is undocumented. In the MVP this function only checks the
    file extension and then raises a descriptive error that points the user
    to the Excel/CSV export route. Reverse engineering the real format is
    separate work.

    Raises:
        NotImplementedError: `path` ends with ".asyr" (case-insensitive).
        ValueError: `path` has any other extension.
    """
    looks_like_asyr = path.lower().endswith(".asyr")
    if looks_like_asyr:
        guidance = (
            "Wave .asyr binary parsing is not implemented in MVP. "
            "Please export the plate from Wave as Excel (.xlsx) or CSV and "
            "re-ingest. (Wave > File > Export > Excel / CSV)"
        )
        raise NotImplementedError(guidance)
    raise ValueError(f"not an .asyr file: {path}")


# ---------------------------------------------------------------------------
# 3. Injection / protocol auto-detect
# ---------------------------------------------------------------------------


# Canonical compound-name normalization.
#
# Keys are labels after lower-casing and removing all whitespace (see
# `_norm_compound`); values are the canonical tokens `detect_protocol` tests.
_COMPOUND_ALIASES: Dict[str, str] = {
    "oligo": "oligomycin",
    "oligomycin": "oligomycin",
    "oligomycina": "oligomycin",   # "Oligomycin A"
    "fccp": "fccp",
    "bam15": "fccp",       # alt uncoupler (treat as FCCP-like)
    "rot": "rotenone",
    "aa": "antimycin",
    "rot/aa": "rotenone_antimycin",
    "rotenone": "rotenone",
    "antimycin": "antimycin",
    "antimycina": "antimycin",     # "Antimycin A" once whitespace is removed
    "rotenone/antimycin": "rotenone_antimycin",
    "rotenone/antimycina": "rotenone_antimycin",   # "Rotenone/Antimycin A"
    "rotenone_antimycin": "rotenone_antimycin",
    "glucose": "glucose",
    "2-dg": "2dg",
    "2dg": "2dg",
    "deoxyglucose": "2dg",
    "2-deoxyglucose": "2dg",
    "2-deoxy-d-glucose": "2dg",
    "etomoxir": "etomoxir",
    "palmitate": "palmitate_bsa",
    "palmitate-bsa": "palmitate_bsa",
    "bsa-palmitate": "palmitate_bsa",   # word order used in the module docstring
    "bsa": "bsa",
    "bptes": "bptes",
    "uk5099": "uk5099",
}

# Canonical parts that together denote the combined Rot/AA injection.
_ROT_AA_PARTS = frozenset({"rotenone", "antimycin"})

# Stringified missing-value markers that must never count as a compound.
_MISSING_LABELS = frozenset({"", "nan", "none", "<na>"})


def _norm_compound(s: str) -> str:
    """Map a raw injection label to its canonical compound token.

    The label is stringified, lower-cased and stripped of all whitespace,
    then looked up in `_COMPOUND_ALIASES`. If that fails, "+" and "&" are
    treated as "/" so that e.g. "Rot + AA" resolves like "Rot/AA", and any
    "/"-joined pair that resolves to rotenone plus antimycin is reported as
    "rotenone_antimycin". Labels that match nothing are returned in their
    lower-cased, whitespace-free form.
    """
    key = "".join(str(s).split()).lower()
    canonical = _COMPOUND_ALIASES.get(key)
    if canonical is not None:
        return canonical

    # Fallback only: unknown labels keep their original separators.
    unified = key.replace("+", "/").replace("&", "/")
    canonical = _COMPOUND_ALIASES.get(unified)
    if canonical is not None:
        return canonical

    parts = {_COMPOUND_ALIASES.get(p, p) for p in unified.split("/") if p}
    if parts == _ROT_AA_PARTS:
        return "rotenone_antimycin"
    return key


def detect_protocol(injections: List[str]) -> str:
    """Classify the assay protocol from the injected compound names.

    Classification is based on which compounds are present; the order of
    `injections` is not checked. Missing values (None, NaN, pd.NA and their
    string forms), blank labels and "baseline" / "media" labels are ignored.

    Args:
        injections: Injection labels as they appear in the plate file.

    Returns:
        One of "Glycolysis Stress", "ATP Rate Assay", "FAO Assay",
        "Mito Stress" or "Unknown".
    """
    found = set()
    for raw in injections:
        if raw is None:
            continue
        # str() first: numeric / NaN / pd.NA cells have no .strip() and
        # pd.NA cannot be truth-tested.
        label = str(raw).strip()
        if label.lower() in _MISSING_LABELS:
            continue
        token = _norm_compound(label)
        # ignore explicit "baseline" / "media" labels
        if token in ("baseline", "media", ""):
            continue
        found.add(token)

    if not found:
        return "Unknown"

    has_oligo = "oligomycin" in found
    has_fccp = "fccp" in found
    has_rotaa = bool(found & {"rotenone_antimycin", "rotenone", "antimycin"})
    has_glucose = "glucose" in found
    has_2dg = "2dg" in found
    has_etomoxir = "etomoxir" in found
    has_palmitate = "palmitate_bsa" in found

    # Glycolysis Stress Test
    if has_glucose and has_oligo and has_2dg:
        return "Glycolysis Stress"
    # ATP Rate Assay = Oligo + Rot/AA only (no FCCP).
    # NOTE: checked before FAO, so a palmitate/etomoxir plate without FCCP
    # is reported as "ATP Rate Assay".
    if has_oligo and has_rotaa and not has_fccp:
        return "ATP Rate Assay"
    # FAO Assay = palmitate-BSA + (optional etomoxir) + Mito Stress core
    if has_palmitate or has_etomoxir:
        return "FAO Assay"
    # Mito Stress = Oligo -> FCCP -> Rot/AA
    if has_oligo and has_fccp and has_rotaa:
        return "Mito Stress"
    return "Unknown"


# ---------------------------------------------------------------------------
# 4. CSV / Excel parser
# ---------------------------------------------------------------------------


# Column schema of the long-form table (Wave "Rate" sheet style, flattened).
# Names are compared after `parse_plate` lower-cases and strips the headers.

# Columns that must be present; `parse_plate` raises ValueError otherwise.
REQUIRED_COLS = {
    "well",
    "measurement",
    "ocr",
    "ecar",
}

# Columns that `parse_plate` adds, filled with pd.NA, when they are absent.
OPTIONAL_COLS = {
    "group",
    "cell_type",
    "drug",
    "dose",
    "substrate",
    "time_min",
    "timepoint",
    "injection",
    "ppr",
    "plate_id",
}


def _read_table(path: str) -> pd.DataFrame:
    """Load a plate file into a DataFrame based on its extension.

    - `.xlsx` / `.xls`: the sheet named "rate" (case-insensitive, surrounding
      whitespace ignored) if present, otherwise the first sheet.
    - `.csv`: comma-separated; `.tsv` / `.txt`: tab-separated.

    Raises:
        RuntimeError: the Excel workbook could not be opened or parsed; the
            underlying exception is attached as `__cause__`.
        ValueError: the extension is not one of the supported ones.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext in (".xlsx", ".xls"):
        try:
            # The context manager closes the workbook handle even when
            # parsing fails, so the file is not left open.
            with pd.ExcelFile(path) as xl:
                # Try the Wave "Rate" sheet first if present
                target = xl.sheet_names[0]
                for sheet in xl.sheet_names:
                    if sheet.strip().lower() == "rate":
                        target = sheet
                        break
                return xl.parse(target)
        except Exception as e:
            raise RuntimeError(f"failed to read excel {path}: {e}") from e
    if ext in (".csv", ".tsv", ".txt"):
        sep = "," if ext == ".csv" else "\t"
        return pd.read_csv(path, sep=sep)
    raise ValueError(f"unsupported extension: {ext}")


def _infer_injections(df: pd.DataFrame) -> List[str]:
    """Infer ordered injection names from the 'injection' column.

    Repeated labels are collapsed to unique names (compared case-sensitively
    after stripping surrounding whitespace) while preserving the order of
    first appearance in the table. Missing cells and their string forms
    ("nan", "none", "<NA>", blank) are skipped.

    Returns:
        The unique labels, or an empty list when the column is absent or
        holds no usable value.
    """
    if "injection" not in df.columns:
        return []

    placeholders = {"", "nan", "none", "<na>"}
    # dict keys give ordered de-duplication.
    ordered = {}
    # Drop real missing values before stringifying: a backfilled pd.NA column
    # would otherwise surface as the bogus label "<NA>".
    for raw in df["injection"].dropna().tolist():
        label = str(raw).strip()
        if label.lower() in placeholders:
            continue
        ordered.setdefault(label, None)
    return list(ordered)


def parse_plate(path: str, plate_id: Optional[str] = None) -> PlateData:
    """Parse a single Excel / CSV plate file.

    Column headers are stripped and lower-cased, required columns are
    validated, absent optional columns are added filled with pd.NA, and the
    protocol is detected from the injection labels.

    Args:
        path: Plate file to read (`.xlsx`, `.xls`, `.csv`, `.tsv`, `.txt`).
        plate_id: Identifier to use. When None, the first non-null value of
            the `plate_id` column is used, else the file name without its
            extension.

    Returns:
        PlateData holding the normalized table and its summary metadata.

    Raises:
        NotImplementedError: `path` is a `.asyr` file.
        ValueError: a required column is missing, or the extension is
            unsupported.
        RuntimeError: an Excel workbook could not be read.
    """
    if path.lower().endswith(".asyr"):
        return parse_asyr_stub(path)

    df = _read_table(path)
    # normalize column names; str() because header cells may be non-string
    # (e.g. numeric), which have no .strip()
    df = df.rename(columns={c: str(c).strip().lower() for c in df.columns})

    missing = REQUIRED_COLS - set(df.columns)
    if missing:
        raise ValueError(f"missing required columns {missing} in {path}")

    # backfill optional cols; sorted so the order of added columns does not
    # vary with set iteration order between runs
    for c in sorted(OPTIONAL_COLS):
        if c not in df.columns:
            df[c] = pd.NA

    if plate_id is None:
        plate_id = (
            str(df["plate_id"].dropna().iloc[0])
            if df["plate_id"].notna().any()
            else os.path.splitext(os.path.basename(path))[0]
        )

    injections = _infer_injections(df)
    protocol = detect_protocol(injections)
    # Refine: if substrate column hints palmitate/etomoxir, prefer FAO Assay.
    # (`substrate` always exists here because of the backfill above.)
    if protocol == "Mito Stress":
        subs = " ".join(df["substrate"].dropna().astype(str).str.lower().unique())
        if "palmitate" in subs or "etomoxir" in subs or "bsa" in subs:
            protocol = "FAO Assay"

    meta = PlateMeta(
        plate_id=plate_id,
        protocol=protocol,
        injections=injections,
        cell_types=sorted(set(df["cell_type"].dropna().astype(str).tolist())),
        drugs=sorted(set(df["drug"].dropna().astype(str).tolist())),
        substrates=sorted(set(df["substrate"].dropna().astype(str).tolist())),
        source_file=os.path.abspath(path),
        n_wells=df["well"].nunique(),
        n_timepoints=df["measurement"].nunique(),
    )
    return PlateData(meta=meta, df=df)


def parse_multi(paths: List[str]) -> List[PlateData]:
    """Parse several plate files, keeping only the ones that succeed.

    A file whose parsing raises is reported on stdout and left out of the
    result: "[skip]" for NotImplementedError (e.g. `.asyr` input), "[error]"
    for any other exception. Nothing is re-raised.
    """
    plates: List[PlateData] = []
    for path in paths:
        try:
            plate = parse_plate(path)
        except NotImplementedError as exc:
            print(f"[skip] {path}: {exc}")
        except Exception as exc:
            print(f"[error] {path}: {exc}")
        else:
            plates.append(plate)
    return plates


# ---------------------------------------------------------------------------
# 5. helpers
# ---------------------------------------------------------------------------


def list_plate_dir(dirpath: str) -> List[str]:
    """List candidate plate files in a directory.

    Returns the entries of `dirpath` (non-recursive, sorted by name) whose
    name ends in `.csv`, `.tsv`, `.xlsx`, `.xls` or `.asyr`
    (case-insensitive), each joined with `dirpath`. Sub-directories and
    Excel lock files (names starting with "~$") are skipped because they can
    never be parsed as a plate. Returns an empty list when `dirpath` is not
    a directory.
    """
    if not os.path.isdir(dirpath):
        return []

    suffixes = (".csv", ".tsv", ".xlsx", ".xls", ".asyr")
    found: List[str] = []
    for name in sorted(os.listdir(dirpath)):
        # "~$book.xlsx" is the owner-lock file Excel writes next to an open
        # workbook; it is not a readable workbook.
        if name.startswith("~$"):
            continue
        if not name.lower().endswith(suffixes):
            continue
        full = os.path.join(dirpath, name)
        # A directory can carry a matching suffix too; keep files only.
        if os.path.isfile(full):
            found.append(full)
    return found