"""DMPatentLandscape-Kor — Streamlit 메인 앱.

5개 기능을 탭으로 구분:
1. multi-office API 통합 + family clustering
2. 당뇨 분류 + claim scope 요약
3. 만료·PTE/SPC + Orange Book 연계
4. 한국 출원인 동향 + 경쟁 landscape
5. weekly digest + FTO 보조 + 리포트(docx export)

실행: `streamlit run app.py`
의존성: streamlit, pandas, plotly, python-docx (requirements.txt 참고)
모든 데이터는 mock/synthetic. 외부 네트워크 호출 0.
"""

from __future__ import annotations

import io
import json
import os
from datetime import date, datetime, timedelta
from typing import Optional

import pandas as pd
import plotly.express as px
import streamlit as st

# Directory containing this file; the data file is resolved relative to it
# rather than to the current working directory.
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "patents.json")

# Substrings that mark an assignee as a Korean applicant. They are compared
# case-insensitively by _is_korean_applicant.
KOREAN_APPLICANTS_HINTS = [
    "LG",
    "엘지",
    "HK inno.N",
    "에이치케이이노엔",
    "종근당",
    "동아ST",
    "동아에스티",
    "대웅",
    "한미",
    "일동",
    "녹십자",
    "삼성바이오에피스",
    "셀트리온",
    "SK바이오팜",
    "에스케이바이오팜",
    "카카오헬스케어",
    "메디센서",
    "Hanmi",
    "Samsung",
    "Celltrion",
]

# Substrings that mark an assignee as one of the global top assignees. They
# are compared case-insensitively by _is_global_top_assignee.
GLOBAL_TOP_ASSIGNEE_HINTS = [
    "Novo Nordisk",
    "Eli Lilly",
    "Sanofi",
    "Merck",
    "Boehringer",
    "AstraZeneca",
    "Bristol-Myers Squibb",
    "Dexcom",
    "Abbott",
    "Medtronic",
    "Takeda",
    "Janssen",
    "Pfizer",
]

# Disclaimer shown at the top of the page and embedded in the exported
# digest and report (Markdown emphasis markers are part of the text).
DISCLAIMER = (
    "본 도구는 **연구·참고용**이며, "
    "실제 IP/FTO(Freedom-to-Operate) 결정은 "
    "**변리사 자문**을 받아야 한다. "
    "본 도구의 특허 데이터는 **mock/synthetic**이며, "
    "실제 USPTO·EPO·KIPO·JPO·CNIPA·WIPO 등록 정보가 아니다."
)


# ---------- 유틸 ----------

@st.cache_data
def load_patents() -> pd.DataFrame:
    """Load the patent records from DATA_PATH and add the derived columns.

    The JSON file is read into a DataFrame, the four date columns are parsed
    (unparseable values become NaT), and three columns are derived:
    ``is_korean_applicant``, ``is_global_top`` and ``effective_expiry``.

    Returns:
        The DataFrame with parsed dates and derived columns.

    Raises:
        OSError: If the data file cannot be opened.
        KeyError: If an expected column is missing from the data.
    """
    with open(DATA_PATH, "r", encoding="utf-8") as f:
        records = json.load(f)
    df = pd.DataFrame(records)

    # Parse the date columns; invalid or missing values are coerced to NaT.
    for col in ("priority_date", "app_date", "grant_date", "expiry_date"):
        df[col] = pd.to_datetime(df[col], errors="coerce")

    df["is_korean_applicant"] = df["assignee"].apply(_is_korean_applicant)
    df["is_global_top"] = df["assignee"].apply(_is_global_top_assignee)

    # The row-wise helper returns None for rows without a base expiry. Force
    # a datetime dtype so later `.dt` access and comparisons against
    # datetimes work even when every row is missing.
    df["effective_expiry"] = pd.to_datetime(
        df.apply(_effective_expiry_row, axis=1), errors="coerce"
    )
    return df


def _is_korean_applicant(assignee: str) -> bool:
    """Return True if *assignee* contains any Korean-applicant hint.

    Matching is a case-insensitive substring test against
    KOREAN_APPLICANTS_HINTS.

    Args:
        assignee: Assignee name. Missing values (None, float NaN) and any
            other non-string value are treated as "no match".

    Returns:
        True when at least one hint occurs in the name, otherwise False.
    """
    # A missing cell in a DataFrame column shows up as float NaN, which is
    # truthy and has no .lower(); reject everything that is not a real,
    # non-empty string up front.
    if not isinstance(assignee, str) or not assignee:
        return False
    lowered = assignee.lower()
    return any(hint.lower() in lowered for hint in KOREAN_APPLICANTS_HINTS)


def _is_global_top_assignee(assignee: str) -> bool:
    """Return True if *assignee* contains any global-top-assignee hint.

    Matching is a case-insensitive substring test against
    GLOBAL_TOP_ASSIGNEE_HINTS.

    Args:
        assignee: Assignee name. Missing values (None, float NaN) and any
            other non-string value are treated as "no match".

    Returns:
        True when at least one hint occurs in the name, otherwise False.
    """
    # A missing cell in a DataFrame column shows up as float NaN, which is
    # truthy and has no .lower(); reject everything that is not a real,
    # non-empty string up front.
    if not isinstance(assignee, str) or not assignee:
        return False
    lowered = assignee.lower()
    return any(hint.lower() in lowered for hint in GLOBAL_TOP_ASSIGNEE_HINTS)


def _effective_expiry_row(row: pd.Series) -> Optional[datetime]:
    """Compute the effective expiry of one record by adding its extensions.

    The result is ``expiry_date`` plus ``pte_days`` days, plus
    ``spc_extension_months`` counted as 30 days each, plus 180 days when
    ``pediatric_extension`` is set. This is a plain day-count sum, not a
    calendar-accurate month calculation.

    Args:
        row: One record. Extension fields that are absent, None or NaN
            contribute nothing.

    Returns:
        The extended expiry, or None when ``expiry_date`` is missing.
    """
    base = row.get("expiry_date")
    if base is None or pd.isna(base):
        return None

    def _int_or_zero(value) -> int:
        # `value or 0` is not enough: NaN is truthy and int(NaN) raises.
        if value is None or pd.isna(value):
            return 0
        return int(value)

    pte = _int_or_zero(row.get("pte_days"))
    spc_months = _int_or_zero(row.get("spc_extension_months"))

    # bool(NaN) is True, so a missing flag must be checked explicitly or it
    # would silently add the 180-day extension.
    flag = row.get("pediatric_extension")
    pediatric = False if flag is None or pd.isna(flag) else bool(flag)

    extra = pte + spc_months * 30 + (180 if pediatric else 0)
    return base + timedelta(days=extra)


def _today() -> datetime:
    """Return today's local date as a naive datetime at 00:00:00."""
    current = date.today()
    return datetime(current.year, current.month, current.day)


def _days_until(d: Optional[datetime]) -> Optional[int]:
    """Return the whole days from today (midnight) until *d*.

    Returns None when *d* is None or a pandas missing value; the result is
    negative when *d* lies in the past.
    """
    if d is not None and not pd.isna(d):
        remaining = d - _today()
        return remaining.days
    return None


def _build_digest_md(digest_df: pd.DataFrame, focus, today_str: str) -> str:
    """Build the Markdown text for the weekly-digest download in tab 5.

    Defined at module level, above the tab code that calls it, so the name
    already exists when the script reaches the download button.

    The digest has a header (title, disclaimer, focus areas, record count)
    and three sections: records expiring within 365 days, recent grants
    with a priority date of 2020 or later, and Korean applicants/filings.
    Every section prints a "none" placeholder line when it has no rows.

    Args:
        digest_df: Records to summarise. The columns read here are
            ``days_to_expiry``, ``priority_date``, ``grant_date``,
            ``app_date``, ``is_korean_applicant``, ``korean_filing``,
            ``office``, ``pub_no``, ``drug_ref``, ``category`` and
            ``assignee``; ``claim1_summary_ko`` is read if present.
        focus: Iterable of selected category names; when falsy the header
            shows "전체" instead.
        today_str: Reference date, inserted verbatim into the title.

    Returns:
        The digest as one newline-joined Markdown string.
    """
    empty_marker = "- (해당 없음)"

    lines = []
    lines.append(f"# DMPatentLandscape-Kor — Weekly Digest ({today_str})")
    lines.append("")
    lines.append(DISCLAIMER)
    lines.append("")
    lines.append(f"**관심영역**: {', '.join(focus) if focus else '전체'} "
                 f"(총 {len(digest_df)}건)")
    lines.append("")

    # Section 1: records whose effective expiry is 0-365 days away.
    lines.append("## 만료 임박 (D-365 이내)")
    soon = digest_df[
        (digest_df["days_to_expiry"].notna())
        & (digest_df["days_to_expiry"] >= 0)
        & (digest_df["days_to_expiry"] <= 365)
    ].sort_values("days_to_expiry")
    if soon.empty:
        lines.append(empty_marker)
    else:
        for _, r in soon.head(20).iterrows():
            lines.append(
                f"- **D-{int(r['days_to_expiry'])}** | {r['office']} "
                f"{r['pub_no']} | {r['drug_ref']} ({r['category']}) | "
                f"{r['assignee']}"
            )
    lines.append("")

    # Section 2: the ten most recently granted records with priority >= 2020.
    lines.append("## 최근 grant (priority 2020+)")
    recent = digest_df[
        (digest_df["priority_date"] >= pd.Timestamp("2020-01-01"))
        & (digest_df["grant_date"].notna())
    ].sort_values("grant_date", ascending=False).head(10)
    if recent.empty:
        # Match the other sections instead of leaving a bare heading.
        lines.append(empty_marker)
    else:
        for _, r in recent.iterrows():
            lines.append(
                f"- {r['grant_date'].date()} | {r['office']} {r['pub_no']} | "
                f"{r['drug_ref']} ({r['category']}) | {r['assignee']}"
            )
    lines.append("")

    # Section 3: Korean applicants or Korean filings, newest application first.
    lines.append("## 한국 출원인 / 한국 출원 동향")
    kr = digest_df[digest_df["is_korean_applicant"] | digest_df["korean_filing"]]
    kr = kr.sort_values("app_date", ascending=False).head(15)
    if kr.empty:
        lines.append(empty_marker)
    else:
        for _, r in kr.iterrows():
            ap = r["app_date"].date() if pd.notna(r["app_date"]) else "?"
            lines.append(
                f"- {ap} | {r['office']} {r['pub_no']} | {r['assignee']} | "
                f"{r['drug_ref']} ({r['category']})"
            )
            lines.append(f"    - claim 요약: {r.get('claim1_summary_ko','')}")
    lines.append("")
    return "\n".join(lines)


# ---------- Page configuration ----------

# Kept ahead of every other Streamlit rendering call in this script.
st.set_page_config(page_title="DMPatentLandscape-Kor", page_icon=None, layout="wide")

# Page header: title, one-line scope caption, then the disclaimer banner.
st.title("DMPatentLandscape-Kor (디엠패턴트랜드스케이프코어)")
header_caption = (
    "USPTO·EPO·KIPO·JPO·CNIPA·WIPO 당뇨 특허 통합 landscape — "
    "family clustering · 만료/PTE/SPC · 한국 출원인 동향 · weekly digest"
)
st.caption(header_caption)
st.warning(DISCLAIMER)

# Full, unfiltered record set; the sidebar derives `filtered` from it.
df = load_patents()

# ---------- Sidebar ----------

sidebar = st.sidebar
sidebar.header("필터")

# Every office and category present in the data is offered and pre-selected.
office_options = sorted(df["office"].unique())
category_options = sorted(df["category"].unique())

office_sel = sidebar.multiselect(
    "Office", options=office_options, default=office_options
)
cat_sel = sidebar.multiselect(
    "Category", options=category_options, default=category_options
)
korean_only = sidebar.checkbox("한국 출원인/한국 출원만", value=False)

# `filtered` is the record set tabs 1-3 (and parts of tab 4) work on.
selection_mask = df["office"].isin(office_sel) & df["category"].isin(cat_sel)
filtered = df[selection_mask].copy()
if korean_only:
    korean_mask = filtered["is_korean_applicant"] | filtered["korean_filing"]
    filtered = filtered[korean_mask]

sidebar.metric("필터 후 records", len(filtered))
sidebar.metric("필터 후 families", filtered["family_id"].nunique())

sidebar.markdown("---")
sidebar.caption(
    "데이터 출처(시뮬레이션): USPTO PEDS / EPO OPS / KIPRIS / J-PlatPat / "
    "CNIPA / WIPO PATENTSCOPE / FDA Orange Book. 모두 mock."
)

# ---------- Tabs ----------

# One label per feature tab, in display order.
tab_labels = [
    "1. Multi-office + Family",
    "2. 분류 + Claim 요약",
    "3. 만료·PTE/SPC + Orange Book",
    "4. 한국 출원인 동향",
    "5. Weekly Digest + 리포트",
]
tab1, tab2, tab3, tab4, tab5 = st.tabs(tab_labels)


# === Tab 1: multi-office + family clustering ===
with tab1:
    st.subheader("Multi-office 통합 + INPADOC family clustering")
    st.markdown(
        "USPTO PEDS / EPO OPS / KIPRIS / J-PlatPat / CNIPA / WIPO PATENTSCOPE에서 "
        "수집했다고 가정한 record를 priority date 기준으로 INPADOC family로 묶었다."
    )

    # Headline metrics for the filtered records. A family counts as
    # multi-office when its records come from more than one office.
    by_family = filtered.groupby("family_id")
    offices_per_family = by_family["office"].nunique()
    multi_office_families = int((offices_per_family > 1).sum())

    col1, col2, col3 = st.columns(3)
    col1.metric("총 records", len(filtered))
    col2.metric("Family 수", filtered["family_id"].nunique())
    col3.metric("Multi-office family", multi_office_families)

    # Record count per office.
    office_counts = filtered["office"].value_counts().reset_index()
    office_counts.columns = ["office", "count"]
    office_fig = px.bar(
        office_counts,
        x="office",
        y="count",
        title="Office별 patent 분포",
        text="count",
    )
    st.plotly_chart(office_fig, use_container_width=True)

    st.markdown("### Family 통합 view")

    def _joined_offices(offices):
        """Return one family's distinct offices as a sorted, comma-separated string."""
        return ", ".join(sorted(set(offices)))

    # One row per family, newest priority date first.
    fam_summary = by_family.agg(
        offices=("office", _joined_offices),
        n_records=("pub_no", "count"),
        assignee=("assignee", "first"),
        category=("category", "first"),
        drug_ref=("drug_ref", "first"),
        priority_date=("priority_date", "min"),
    )
    fam_summary = fam_summary.reset_index()
    fam_summary = fam_summary.sort_values("priority_date", ascending=False)
    st.dataframe(fam_summary, use_container_width=True, height=400)

    st.markdown("### 선택한 family의 office별 detail")
    pick = st.selectbox("Family ID", fam_summary["family_id"].tolist())
    detail_columns = [
        "office", "pub_no", "app_no", "app_date", "grant_date",
        "expiry_date", "ipc", "assignee", "applicant_country",
    ]
    detail = filtered.loc[filtered["family_id"] == pick, detail_columns]
    st.dataframe(detail, use_container_width=True)


# === Tab 2: classification + claim summary ===
with tab2:
    st.subheader("IPC/CPC 분류 + 카테고리 자동 태깅 + Claim 1 한국어 요약")
    st.markdown(
        "IPC A61K 31/4985(DPP-4i)·A61K 31/7042(SGLT2i)·A61K 38/26(GLP-1RA/incretin)·"
        "A61K 38/28(insulin)·A61B 5/145(CGM)·A61M 5/172(insulin pump)·G16H 20/17(DTx) "
        "등을 기준으로 자동 태깅. 독립 claim 1은 rule-based 한국어 요약."
    )

    # Record count per category.
    cat_counts = filtered["category"].value_counts().reset_index()
    cat_counts.columns = ["category", "count"]
    fig = px.bar(cat_counts, x="category", y="count",
                 title="Category별 patent 분포", text="count")
    fig.update_xaxes(tickangle=-45)
    st.plotly_chart(fig, use_container_width=True)

    # IPC distribution (first-listed classification only).
    def primary_ipc(lst):
        """Return the first entry of a non-empty list, otherwise None."""
        if isinstance(lst, list) and lst:
            return lst[0]
        return None

    # Kept as a standalone Series instead of a new column on `filtered`:
    # `filtered` is shared by the other tabs and is a slice of another frame
    # when the Korean-only filter is on, so assigning into it would trigger
    # pandas' SettingWithCopyWarning.
    primary_ipc_codes = filtered["ipc"].apply(primary_ipc)
    ipc_counts = primary_ipc_codes.value_counts().head(15).reset_index()
    ipc_counts.columns = ["primary_ipc", "count"]
    fig2 = px.bar(ipc_counts, x="primary_ipc", y="count",
                  title="Primary IPC 상위 15", text="count")
    fig2.update_xaxes(tickangle=-45)
    st.plotly_chart(fig2, use_container_width=True)

    st.markdown("### Claim 1 한국어 요약 검색")
    q = st.text_input("키워드 (drug/category/요약 검색)", "")
    view = filtered.copy()
    if q:
        # The keyword is free text from the user, so it is matched literally
        # (regex=False): input such as "(" or "+" would otherwise be compiled
        # as a regular expression and raise. All three columns are matched
        # case-insensitively.
        keyword_mask = (
            view["drug_ref"].str.contains(q, case=False, regex=False, na=False)
            | view["category"].str.contains(q, case=False, regex=False, na=False)
            | view["claim1_summary_ko"].str.contains(
                q, case=False, regex=False, na=False
            )
        )
        view = view[keyword_mask]
    st.dataframe(
        view[["office", "pub_no", "drug_ref", "category", "assignee",
              "claim1_summary_ko"]],
        use_container_width=True,
        height=400,
    )


# === Tab 3: expiry, PTE/SPC + Orange Book ===
with tab3:
    st.subheader("만료일·PTE(US)/SPC(EU) + Orange Book Express 연계")
    st.markdown(
        "Standard 20년 + 미국 PTE(최대 5년) + EU SPC(최대 5년) + 소아 전용 6개월을 "
        "단순 가산해 effective expiry를 계산한다. Generic/biosimilar 진입 가능 시점의 reference."
    )
    st.info(
        "주의: 실제 PTE/SPC/소아 전용은 office별 복잡한 규정에 따른다. 본 도구는 "
        "단순 합산이므로 실제 만료일 확정은 USPTO/FDA/EPO 공식 자료와 변리사 자문 필수."
    )

    view = filtered.copy()
    view["effective_expiry_date"] = view["effective_expiry"]

    # Days until effective expiry, computed on the whole column at once.
    # A per-row apply returns an empty datetime-typed Series when the filter
    # leaves no rows, and comparing that with 0 raises TypeError; the
    # vectorized form always yields a numeric Series (NaN where the expiry
    # is missing), so the comparisons below are safe.
    expiry = pd.to_datetime(view["effective_expiry"], errors="coerce")
    view["days_to_expiry"] = (expiry - _today()).dt.days

    def _expiring_within(max_days):
        """Return the rows whose effective expiry is 0..max_days days away."""
        remaining = view["days_to_expiry"]
        return view[(remaining >= 0) & (remaining <= max_days)]

    # Countdown alerts.
    d365 = _expiring_within(365)
    d180 = _expiring_within(180)
    d90 = _expiring_within(90)
    expired = view[view["days_to_expiry"] < 0]

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("D-365 이내", len(d365))
    c2.metric("D-180 이내", len(d180))
    c3.metric("D-90 이내", len(d90))
    c4.metric("이미 만료", len(expired))

    # Orange Book listed records.
    ob = view[view["in_orange_book"]]
    st.metric("Orange Book 등재 (US use code 포함)", len(ob))

    st.markdown("### 만료 카운트다운 (D-365 이내, effective expiry 기준)")
    if len(d365) == 0:
        st.info("D-365 이내 만료 예정 record 없음 (필터 조정).")
    else:
        show = d365.sort_values("days_to_expiry")[
            ["office", "pub_no", "drug_ref", "category", "assignee",
             "expiry_date", "effective_expiry_date", "days_to_expiry",
             "pte_days", "spc_extension_months", "pediatric_extension",
             "in_orange_book", "ob_use_code"]
        ]
        st.dataframe(show, use_container_width=True, height=300)

    st.markdown("### Effective expiry 분포")
    # Number of records per effective-expiry year.
    view["exp_year"] = expiry.dt.year
    year_counts = view["exp_year"].value_counts().sort_index().reset_index()
    year_counts.columns = ["year", "count"]
    fig = px.bar(year_counts, x="year", y="count",
                 title="Effective expiry 연도별 분포")
    st.plotly_chart(fig, use_container_width=True)


# === Tab 4: Korean applicant trends + landscape ===
with tab4:
    st.subheader("한국 출원인 누적 출원/grant + 글로벌 top assignee 시계열 + IP whitespace")

    # Korea-related records are taken from the full data set `df`, not from
    # the sidebar-filtered one.
    korea_related_mask = df["is_korean_applicant"] | df["korean_filing"]
    kr_df = df[korea_related_mask].copy()
    st.metric("한국 관련 records (filter와 별개로 전체)", len(kr_df))

    # Record count per Korean applicant. `kr_only` is also read by the
    # report section in tab 5, so the name must stay as it is.
    kr_only = df[df["is_korean_applicant"]].copy()
    kr_counts = kr_only["assignee"].value_counts().reset_index()
    kr_counts.columns = ["assignee", "count"]
    if len(kr_counts) > 0:
        kr_fig = px.bar(
            kr_counts,
            x="assignee",
            y="count",
            title="한국 출원인 record 수",
            text="count",
        )
        kr_fig.update_xaxes(tickangle=-30)
        st.plotly_chart(kr_fig, use_container_width=True)
    else:
        st.info("필터에 매칭되는 한국 출원인 record 없음.")

    # Yearly filing trend of the global top assignees (by priority year).
    st.markdown("### 글로벌 top assignee 연도별 출원 추세 (priority year)")
    top = filtered[filtered["is_global_top"]].copy()
    top["priority_year"] = top["priority_date"].dt.year
    if len(top) > 0:
        yearly = top.groupby(["priority_year", "assignee"]).size()
        ts = yearly.reset_index(name="count")
        trend_fig = px.line(
            ts,
            x="priority_year",
            y="count",
            color="assignee",
            title="글로벌 top assignee priority year 출원 추세",
            markers=True,
        )
        st.plotly_chart(trend_fig, use_container_width=True)
    else:
        st.info("필터에 매칭되는 글로벌 top assignee 없음.")

    # IP whitespace: record counts per (category, office); combinations
    # without records are shown as 0.
    st.markdown("### IP whitespace 자동 highlight (category x office matrix)")
    pivot = filtered.pivot_table(
        index="category", columns="office", values="pub_no", aggfunc="count"
    )
    pivot = pivot.fillna(0).astype(int)
    st.dataframe(pivot, use_container_width=True)
    st.caption(
        "값이 0인 (category, office) 셀은 해당 office에서 신규 출원 검토 가치가 있는 영역."
    )

    # Number of distinct offices each Korean-applicant family covers.
    st.markdown("### 한국 출원인 글로벌 cover 비율")
    if len(kr_only) > 0:
        kr_by_family = kr_only.groupby("family_id")
        offices_covered = kr_by_family["office"].nunique().reset_index(
            name="offices_covered"
        )
        family_assignee = kr_by_family["assignee"].first().reset_index()
        kr_family_offices = offices_covered.merge(family_assignee, on="family_id")
        st.dataframe(
            kr_family_offices.sort_values("offices_covered", ascending=False),
            use_container_width=True,
        )


# === Tab 5: weekly digest + FTO + report ===
with tab5:
    st.subheader("Weekly Digest + FTO 보조 + Quarterly 리포트(docx)")

    # Focus-area filter. Only preferred categories that actually occur in
    # the data are pre-selected, because Streamlit rejects default values
    # that are not among the options.
    focus_options = sorted(df["category"].unique())
    preferred_focus = ["GLP-1RA", "SGLT2i", "CGM", "smart pen", "DTx"]
    focus = st.multiselect(
        "관심영역 (digest 필터)",
        options=focus_options,
        default=[c for c in preferred_focus if c in focus_options],
    )
    today_input = st.date_input("기준일", value=date.today())
    today_dt = datetime.combine(today_input, datetime.min.time())

    # The digest uses the full data set restricted to the focus areas (all
    # records when nothing is selected), measured against the chosen date.
    digest_df = df[df["category"].isin(focus)].copy() if focus else df.copy()
    digest_df["effective_expiry_date"] = digest_df["effective_expiry"]
    digest_df["days_to_expiry"] = digest_df["effective_expiry"].apply(
        lambda d: None if d is None or pd.isna(d) else (d - today_dt).days
    )

    st.markdown("### 만료 임박 (effective expiry 기준 D-365 이내)")
    soon = digest_df[
        (digest_df["days_to_expiry"].notna())
        & (digest_df["days_to_expiry"] >= 0)
        & (digest_df["days_to_expiry"] <= 365)
    ].sort_values("days_to_expiry")
    st.dataframe(
        soon[["office", "pub_no", "drug_ref", "category", "assignee",
              "effective_expiry_date", "days_to_expiry"]],
        use_container_width=True,
    )

    st.markdown("### 최근 grant")
    recent = digest_df[
        (digest_df["priority_date"] >= pd.Timestamp("2020-01-01"))
        & (digest_df["grant_date"].notna())
    ].sort_values("grant_date", ascending=False).head(15)
    st.dataframe(
        recent[["grant_date", "office", "pub_no", "drug_ref",
                "category", "assignee"]],
        use_container_width=True,
    )

    st.markdown("### 한국 출원인 / 한국 출원 동향")
    kr_section = digest_df[
        digest_df["is_korean_applicant"] | digest_df["korean_filing"]
    ].sort_values("app_date", ascending=False)
    st.dataframe(
        kr_section[["app_date", "office", "pub_no", "assignee",
                    "drug_ref", "category", "claim1_summary_ko"]],
        use_container_width=True,
    )

    # FTO helper view.
    st.markdown("### FTO 보조 view")
    st.caption(
        "특정 약물/카테고리에 대해 현재 유효(미만료) + Orange Book 등재 또는 "
        "주요 office grant 상태의 record를 표시."
    )
    fto_target = st.text_input("FTO 대상 (약물명/카테고리 일부)", "semaglutide")
    if fto_target:
        # The target is free text from the user, so it is matched literally
        # (regex=False) and case-insensitively; as a regular expression,
        # input such as "(" would raise.
        matches_target = (
            df["drug_ref"].str.contains(
                fto_target, case=False, regex=False, na=False
            )
            | df["category"].str.contains(
                fto_target, case=False, regex=False, na=False
            )
        )
        # Keep only records that have not expired as of the reference date.
        fto = df[matches_target & (df["effective_expiry"] >= today_dt)]
        st.dataframe(
            fto[["office", "pub_no", "drug_ref", "category", "assignee",
                 "grant_date", "effective_expiry", "in_orange_book", "ob_use_code"]],
            use_container_width=True,
        )
        st.warning(
            "FTO 분석은 단순 만료/등재 여부 이상의 claim scope·국가별 enforcement·"
            "license 상태를 종합 판단해야 한다. 반드시 변리사 자문 필요."
        )

    # Markdown digest download.
    st.markdown("### Markdown weekly digest 다운로드")
    md_text = _build_digest_md(digest_df, focus, today_input.isoformat())
    st.download_button(
        "weekly_digest.md 다운로드",
        data=md_text.encode("utf-8"),
        file_name=f"weekly_digest_{today_input.isoformat()}.md",
        mime="text/markdown",
    )

    # Quarterly docx report.
    st.markdown("### 한국 출원인 Quarterly 리포트 (docx)")
    # Computed here from `df` so this section does not rely on a variable
    # created inside another tab's block.
    kr_applicants = df[df["is_korean_applicant"]]
    kr_pick = sorted(kr_applicants["assignee"].unique())
    if kr_pick:
        applicant_pick = st.selectbox(
            "한국 출원인 선택",
            options=["(전체 한국 출원인)"] + list(kr_pick),
        )
    else:
        applicant_pick = "(전체 한국 출원인)"

    def _report_date(value):
        """Format a timestamp as an ISO date; "?" when the value is missing."""
        if value is None or pd.isna(value):
            return "?"
        return value.date().isoformat()

    # NOTE(review): the download button below is only created in the run in
    # which the generate button was clicked — confirm this is the intended UX.
    if st.button("Quarterly docx 리포트 생성"):
        try:
            # Imported lazily so the rest of the app works without python-docx.
            from docx import Document
        except ImportError as e:
            st.error(
                f"python-docx 가 설치되어 있지 않다: {e}. "
                f"`pip install -r requirements.txt`로 설치한다."
            )
        else:
            doc = Document()
            doc.add_heading("DMPatentLandscape-Kor — Quarterly Report", level=1)
            doc.add_paragraph(f"기준일: {today_input.isoformat()}")
            doc.add_paragraph(DISCLAIMER)
            doc.add_heading("대상 한국 출원인", level=2)
            doc.add_paragraph(applicant_pick)

            target = kr_applicants.copy()
            if applicant_pick != "(전체 한국 출원인)":
                target = target[target["assignee"] == applicant_pick]

            # The 20 most recent applications, dates printed without the
            # time part and missing dates shown as "?".
            doc.add_heading("최근 출원/등록", level=2)
            latest = target.sort_values("app_date", ascending=False).head(20)
            for _, r in latest.iterrows():
                doc.add_paragraph(
                    f"- {r['office']} {r['pub_no']} | "
                    f"출원 {_report_date(r.get('app_date'))} | "
                    f"등록 {_report_date(r.get('grant_date'))} | "
                    f"{r['drug_ref']} ({r['category']})"
                )
                doc.add_paragraph(
                    f"    claim 요약: {r.get('claim1_summary_ko','')}"
                )

            doc.add_heading("Category 분포", level=2)
            for cat, n in target["category"].value_counts().items():
                doc.add_paragraph(f"- {cat}: {n}건")

            # Serialise the document in memory for the download button.
            buf = io.BytesIO()
            doc.save(buf)
            buf.seek(0)
            st.download_button(
                "리포트 docx 다운로드",
                data=buf.getvalue(),
                file_name=f"quarterly_report_{today_input.isoformat()}.docx",
                mime=(
                    "application/vnd.openxmlformats-officedocument."
                    "wordprocessingml.document"
                ),
            )
            st.success("리포트 생성 완료.")