"""Abstract classifier — drug / target / sponsor / phase tagging."""

import re

from . import vocab


# Ordered (compiled regex, label) pairs, all case-insensitive.  Consumers that
# take the first matching entry rely on this order: the combined phases
# ("1/2", "2/3") come first because the plain "Phase 1" / "Phase 2" patterns
# would otherwise also match the leading part of e.g. "phase 1/2".
PHASE_PATTERNS = [
    (re.compile(r"\bphase\s*(?:" + alternatives + r")\b", re.I), label)
    for alternatives, label in (
        (r"1\s*/\s*2|i\s*/\s*ii", "Phase 1/2"),
        (r"2\s*/\s*3|ii\s*/\s*iii", "Phase 2/3"),
        (r"3|iii", "Phase 3"),
        (r"2b|iib", "Phase 2b"),
        (r"2a|iia", "Phase 2a"),
        (r"2|ii", "Phase 2"),
        (r"1b|ib", "Phase 1b"),
        (r"1|i", "Phase 1"),
        (r"4|iv", "Phase 4"),
    )
]


def detect_drugs(text):
    """Return list of canonical drug names matched in text.

    The text is lowercased and each (alias, canonical) pair yielded by
    ``vocab.all_aliases()`` is searched for as a whole token: the alias must
    not be directly preceded or followed by an ASCII letter or digit.
    Aliases are presumably already lowercase (per the name ``alias_lower``);
    verify against ``vocab``.

    Args:
        text: free text to scan; may be empty or None.
    Returns:
        Canonical names in the order they are first matched while iterating
        the aliases, without duplicates. Empty list for empty/None text.
    """
    if not text:
        return []
    text_lower = text.lower()
    matched = []
    seen = set()
    for alias_lower, canonical in vocab.all_aliases():
        # Skip the search when this drug is already recorded (another alias
        # hit first), and skip empty aliases: with nothing between the two
        # lookarounds the pattern would match at almost any punctuation or
        # whitespace position and tag the drug spuriously.
        if canonical in seen or not alias_lower:
            continue
        # Every alias, short or long, is matched with boundary guards so
        # that it cannot hit inside a longer alphanumeric token.
        pattern = r"(?<![a-z0-9])" + re.escape(alias_lower) + r"(?![a-z0-9])"
        if re.search(pattern, text_lower):
            matched.append(canonical)
            seen.add(canonical)
    return matched


def detect_phase(text):
    """Return the label of the first PHASE_PATTERNS entry found in text.

    Patterns are tried in list order, so the earliest matching entry wins.
    Returns None when text is empty/None or when nothing matches.
    """
    if text:
        hits = (
            phase_label
            for regex, phase_label in PHASE_PATTERNS
            if regex.search(text)
        )
        return next(hits, None)
    return None


def classify(abstract):
    """Tag an abstract record with drugs, targets, sponsors, phase.

    The title and abstract text are joined and scanned with detect_drugs().
    Each detected drug is looked up with vocab.lookup(); its targets and
    sponsor are collected without duplicates, in first-seen order. Drugs for
    which the lookup returns None stay in ``drugs`` but contribute no targets
    or sponsor.

    Args:
        abstract: dict with at least 'title' and 'abstract_text'. Either may
            be missing or None; both cases are treated as empty text. An
            optional truthy 'phase' value takes precedence over text
            detection.
    Returns:
        A shallow copy of ``abstract`` with added keys: drugs, targets,
        sponsors, phase. The input dict is not modified.
    """
    # `.get(key, "")` only covers a missing key; a key that is present with a
    # None value would make str.join raise TypeError, so fall back with `or`.
    text = " ".join([
        abstract.get("title") or "",
        abstract.get("abstract_text") or "",
    ])
    drugs = detect_drugs(text)

    targets = []
    sponsors = []
    seen_t = set()
    seen_s = set()
    for d in drugs:
        entry = vocab.lookup(d)
        if entry is None:
            continue
        for t in entry["targets"]:
            if t not in seen_t:
                targets.append(t)
                seen_t.add(t)
        sponsor = entry["sponsor"]
        if sponsor not in seen_s:
            sponsors.append(sponsor)
            seen_s.add(sponsor)

    # Prefer explicit phase declared in record metadata.
    phase = abstract.get("phase") or detect_phase(text)

    enriched = dict(abstract)
    enriched["drugs"] = drugs
    enriched["targets"] = targets
    enriched["sponsors"] = sponsors
    enriched["phase"] = phase
    return enriched


def classify_all(abstracts):
    """Run classify() on every record and return the results as a list.

    Results are in the same order as the input iterable.
    """
    enriched_records = []
    for record in abstracts:
        enriched_records.append(classify(record))
    return enriched_records
