"""
dataio.py — CSV loading + schema validation for WtLossSurrogate-Kor.

Pure pandas. Comment lines beginning with '#' are skipped so the demo CSV can
carry a documented schema header inline.
"""

from __future__ import annotations

import os
import pandas as pd

# Columns that must be present in a trials CSV; load_trials raises DataError
# when any of them is absent.
REQUIRED_COLS = [
    "trial",
    "drug_class",
    "pct_weight_loss",
    "pct_weight_loss_se",
    "hard_outcome",
    "loghr",
    "loghr_se",
]

# Required columns that load_trials coerces with pd.to_numeric.
NUMERIC_COLS = [
    "pct_weight_loss",
    "pct_weight_loss_se",
    "loghr",
    "loghr_se",
]

# Fallback CSV, resolved relative to the directory containing this module.
DEFAULT_DATA = os.path.join(
    os.path.dirname(__file__),
    "data",
    "demo_trials.csv",
)


class DataError(Exception):
    """Signal that a trials CSV could not be found, parsed, or validated."""


def load_trials(path: str | None = None) -> pd.DataFrame:
    """Load and validate a trials CSV. Returns a clean DataFrame.

    Lines starting with '#' are treated as comments and skipped.

    Args:
        path: CSV file to read. ``None`` (or any falsy value) selects
            ``DEFAULT_DATA``.

    Returns:
        A DataFrame with stripped column names, ``NUMERIC_COLS`` coerced to
        numeric, text columns whitespace-stripped, and every row dropped that
        lacks a numeric essential, a ``drug_class`` or a ``hard_outcome``.
        Non-positive standard errors are replaced by a small positive floor.
        ``df.attrs`` carries ``dropped_rows`` (number of rows removed),
        ``floored_se`` (per-SE-column count of floored values) and
        ``source_path`` (the file that was read).

    Raises:
        DataError: with an actionable message when the file is missing,
            cannot be parsed, lacks a required column, or has no usable rows.
    """
    path = path or DEFAULT_DATA
    if not os.path.exists(path):
        raise DataError(f"data file not found: {path}")

    try:
        df = pd.read_csv(path, comment="#", skipinitialspace=True)
    except Exception as e:  # pragma: no cover - defensive
        # Chain the cause so the underlying pandas error stays in the traceback.
        raise DataError(f"could not parse CSV '{path}': {e}") from e

    # str() guards against non-string column labels before stripping.
    df.columns = [str(c).strip() for c in df.columns]

    missing = [c for c in REQUIRED_COLS if c not in df.columns]
    if missing:
        raise DataError(
            f"missing required column(s): {missing}. "
            f"Required schema: {REQUIRED_COLS}"
        )

    # coerce numerics; unparseable cells become NaN and are dropped below
    for c in NUMERIC_COLS:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # "trial" is a label that is not required to be non-missing, so it is
    # stringified and stripped exactly as before.
    df["trial"] = df["trial"].astype(str).str.strip()

    # Essential text columns: strip whitespace but keep missing values missing.
    # A plain astype(str) can turn NaN into the literal string "nan", which
    # the dropna() below would then never remove.
    for c in ["drug_class", "hard_outcome"]:
        absent = df[c].isna()
        text = df[c].astype(str).str.strip()
        df[c] = text.mask(absent | (text == ""))

    # drop rows with missing essentials
    before = len(df)
    essentials = NUMERIC_COLS + ["drug_class", "hard_outcome"]
    df = df.dropna(subset=essentials).reset_index(drop=True)
    dropped = before - len(df)

    if len(df) == 0:
        raise DataError("no usable rows after validation (all rows had missing values).")

    # guard against zero / negative SEs that would explode weights
    se_floor = 1e-3
    floored = {}
    for c in ["pct_weight_loss_se", "loghr_se"]:
        non_positive = df[c] <= 0
        floored[c] = int(non_positive.sum())
        if floored[c]:
            df.loc[non_positive, c] = se_floor

    df.attrs["dropped_rows"] = dropped
    df.attrs["floored_se"] = floored
    df.attrs["source_path"] = path
    return df


def summary_counts(df: pd.DataFrame) -> dict:
    """Summarise a trials frame as plain-Python counts and sorted labels.

    Args:
        df: frame with ``drug_class`` and ``hard_outcome`` columns.

    Returns:
        A dict with the row count (``n_trials``), the number of distinct
        drug classes and outcomes (``n_classes``, ``n_outcomes``), and the
        sorted distinct values themselves (``classes``, ``outcomes``).
    """
    class_col = df["drug_class"]
    outcome_col = df["hard_outcome"]

    summary = {"n_trials": int(len(df))}
    summary["n_classes"] = int(class_col.nunique())
    summary["n_outcomes"] = int(outcome_col.nunique())
    summary["classes"] = sorted(class_col.unique().tolist())
    summary["outcomes"] = sorted(outcome_col.unique().tolist())
    return summary