#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
将 logeion_dictionary.jsonl 转为 GoldenDict/MDX 的 txt 数据。
- 生成规范 HTML 结构，方便 CSS 美化
- 自动添加 IPA（CLTK），失败则留空
- 生成丰富的检索别名（大小写/去附标/罗马转写多风格）为 @@@LINK
"""

import argparse
import html
import json
import os
import sys
import unicodedata
from typing import Dict, List, Tuple, Iterable, Set, Optional

# ========== Combining diacritics (used after NFD decomposition) ==========
SMOOTH = "\u0313"       # ̓  combining comma above (smooth breathing)
ROUGH = "\u0314"        # ̔  combining reversed comma above (rough breathing)
ACUTE = "\u0301"        # ́  combining acute accent
GRAVE = "\u0300"        # ̀  combining grave accent
CIRCUMFLEX = "\u0342"   # ͂  combining Greek perispomeni
DIAERESIS = "\u0308"    # ̈  combining diaeresis
MACRON = "\u0304"       # ̄  combining macron
BREVE = "\u0306"        # ̆  combining breve
IOTA_SUB = "\u0345"     # ͅ  combining Greek ypogegrammeni (iota subscript)
TONOS = "\u0301"        # ́  monotonic tonos (same codepoint as ACUTE)
VARIA = "\u0300"        # ̀  monotonic varia (same codepoint as GRAVE)
PERISP = "\u0342"       # alias of CIRCUMFLEX
PSILI = SMOOTH          # alias: psili == smooth breathing
DASIA = ROUGH           # alias: dasia == rough breathing

# Fold common Greek letter variants to a canonical base form
VARIANT_TO_BASE = {
    "ϐ": "β", "ϑ": "θ", "ϕ": "φ", "ϱ": "ρ", "ϖ": "π",
    "ϲ": "σ", "ς": "σ", "Ϲ": "Σ",
    "Ϝ": "ϝ", "Ϛ": "ϛ", "Ϟ": "ϟ", "Ϡ": "ϡ",
}

# Vowel / consonant sets
VOWELS = set("αεηιουωΑΕΗΙΟΥΩ")
VOWELS_LC = set("αεηιουω")
CONSONANTS = set([
    "β","γ","δ","ζ","θ","κ","λ","μ","ν","ξ","π","ρ","σ","τ","φ","χ","ψ",
    "Β","Γ","Δ","Ζ","Θ","Κ","Λ","Μ","Ν","Ξ","Π","Ρ","Σ","Τ","Φ","Χ","Ψ"
])

# Basic Latin transliteration (single letters, no contextual rules)
BASE_MAP = {
    "α": "a", "β": "b", "γ": "g", "δ": "d", "ε": "e", "ζ": "z",
    "η": "e", "θ": "th", "ι": "i", "κ": "k", "λ": "l", "μ": "m",
    "ν": "n", "ξ": "x", "ο": "o", "π": "p", "ρ": "r", "σ": "s",
    "τ": "t", "υ": "y", "φ": "ph", "χ": "kh", "ψ": "ps", "ω": "o",
    # Rare / numeral letters (common scholarly conventions)
    "ϝ": "w",       # digamma
    "ϛ": "st",      # stigma
    "ϟ": "q",       # koppa
    "ϡ": "s",       # sampi (approximated as s)
}

# Vowel pairs that can form diphthongs (lowercase keys)
DIPHTH_MAP_DEFAULT = {
    ("α", "ι"): "ai",
    ("ε", "ι"): "ei",
    ("ο", "ι"): "oi",
    ("α", "υ"): "au",
    ("ε", "υ"): "eu",
    ("η", "υ"): "eu",
    ("ο", "υ"): "ou",
    # ("υ", "ι") -> 'yi' or 'ui'; selectable via parameter
}

def _consume_combining(s: str, i: int) -> Tuple[str, List[str], int]:
    """从位置 i 开始，读取一个 base 字符及其所有组合附标（NFD 下）。"""
    base = s[i]
    i += 1
    marks: List[str] = []
    while i < len(s) and unicodedata.category(s[i]) == "Mn":
        marks.append(s[i])
        i += 1
    return base, marks, i

def _latin_case(s: str, keep_case: bool, is_upper: bool) -> str:
    """按需保留大写：只把转写片段首字母大写（Ph, Kh, Rh 等）。"""
    if not keep_case or not is_upper or not s:
        return s
    return s[0].upper() + s[1:]

def gr_to_lat(
    text: str,
    *,
    upsilon_style: str = "y",   # 'y' (default) or 'u'
    yi_diphthong: str = "yi",   # 'yi' (default) or 'ui'
    nasal_gamma: bool = False,  # γγ/γκ/γξ/γχ -> ng/nk/nx/nkh (off by default)
    mark_diaeresis: bool = False,  # True: hyphen at diaeresis (a-i); False: plain ai
    keep_case: bool = False,       # True: Φ->Ph, Ῥ->Rh etc.; False: all lowercase
    drop_punct: bool = True,       # True: drop punctuation/whitespace; False: keep
    chi_style: str = "kh",         # 'kh' (default) or 'ch'
) -> str:
    """Transliterate (polytonic) Ancient Greek into plain ASCII Latin.

    - No superscripts or diacritics in the output.
    - Nasalized-gamma rules are off by default (see ``nasal_gamma``).
    - Rough breathing becomes a leading ``h``. FIX: in polytonic
      orthography the breathing of a diphthong is written on its SECOND
      vowel (e.g. οὗτος, εὑρίσκω), so both members are now checked;
      previously only the first vowel's marks were inspected and
      οὗτος came out as "outos" instead of "houtos".
    """
    # 1) Normalize to NFD so diacritics become separate combining marks.
    s = unicodedata.normalize("NFD", text)

    # 2) Fold letter variants (final/lunate sigma, symbol forms, ...).
    s = "".join(VARIANT_TO_BASE.get(ch, ch) for ch in s)

    out: List[str] = []
    i = 0

    # Per-call chi style ('kh' default, or 'ch').
    base_map = dict(BASE_MAP)
    if chi_style == "ch":
        base_map["χ"] = "ch"

    while i < len(s):
        ch = s[i]

        # Skip orphaned combining marks (no preceding base character).
        if unicodedata.category(ch) == "Mn":
            i += 1
            continue

        # Non-Greek character: keep or drop per drop_punct.
        if ch not in VOWELS and ch not in CONSONANTS and ch.lower() not in base_map and ch not in ("Σ",):
            if not drop_punct:
                out.append(ch)
            i += 1
            continue

        # Read one base character plus its combining marks.
        base, marks, i = _consume_combining(s, i)
        is_upper = base.isupper()
        base_lc = base.lower()

        has_rough = ROUGH in marks
        has_diaer = DIAERESIS in marks
        has_iota_sub = IOTA_SUB in marks

        # Rho with rough breathing: ῥ/Ῥ -> rh
        if base_lc == "ρ":
            out.append(_latin_case("rh" if has_rough else "r", keep_case, is_upper))
            continue

        # Vowels (including diphthongs)
        if base_lc in VOWELS_LC:
            iota_sub_tail = "i" if has_iota_sub else ""

            # Look ahead at the next cluster to detect a diphthong.
            next_base = None
            next_marks: List[str] = []
            if i < len(s) and unicodedata.category(s[i]) != "Mn":
                next_base, next_marks, _ = _consume_combining(s, i)

            diph_text = None
            if next_base and DIAERESIS not in next_marks:
                diph_map = dict(DIPHTH_MAP_DEFAULT)
                diph_map[("υ", "ι")] = yi_diphthong
                pair = (base_lc, next_base.lower())
                if pair in diph_map:
                    diph_text = diph_map[pair]
                    # FIX: the breathing mark of a diphthong sits on its
                    # SECOND vowel, so check both clusters for rough breathing.
                    if has_rough or ROUGH in next_marks:
                        diph_text = "h" + diph_text

            if diph_text is not None:
                # Consume the second vowel of the diphthong.
                _nb, _nm, i = _consume_combining(s, i)
                out.append(_latin_case(diph_text, keep_case, is_upper) + iota_sub_tail)
                continue

            # Single vowel
            if base_lc == "υ":
                core = "y" if upsilon_style == "y" else "u"
            else:
                core = base_map.get(base_lc, base_lc)
            t = ("h" + core) if has_rough else core

            # Optional hyphen before a diaeresis vowel (marks a broken diphthong).
            if has_diaer and mark_diaeresis and out and out[-1][-1:].isalpha():
                t = "-" + t

            out.append(_latin_case(t, keep_case, is_upper) + iota_sub_tail)
            continue

        # Consonants: optional nasalized gamma (γγ/γκ/γξ/γχ -> n + ...).
        if nasal_gamma and base_lc == "γ":
            follower = None
            if i < len(s) and unicodedata.category(s[i]) != "Mn":
                nb_base, _nb_marks, _j = _consume_combining(s, i)
                follower = nb_base.lower()
            if follower in ("γ", "κ", "ξ", "χ"):
                out.append(_latin_case("n", keep_case, is_upper))
                continue

        if base_lc in base_map:
            t_raw = base_map[base_lc]
        elif base == "Σ":  # uppercase sigma fallback (variants pre-folded)
            t_raw = "s"
        else:
            t_raw = ""

        out.append(_latin_case(t_raw, keep_case, is_upper))

    result = "".join(out)
    # Defensive: strip any combining marks that slipped into the output.
    return "".join(c for c in result if unicodedata.category(c) != "Mn")

# ========== 去除希腊附标 / 变体统一 ==========

GREEK_COMBINING_TO_DROP = {
    SMOOTH, ROUGH, ACUTE, GRAVE, CIRCUMFLEX, DIAERESIS, MACRON, BREVE, IOTA_SUB,
    TONOS, VARIA, PERISP, PSILI, DASIA
}

def greek_strip_marks(s: str, *, keep_iota_sub: bool = False, keep_diaeresis: bool = False) -> str:
    """Remove polytonic diacritics; optionally keep iota subscript / diaeresis.

    Letter variants (final/lunate sigma etc.) are also folded to their base
    form via VARIANT_TO_BASE. The result is NFC-normalized.
    """
    kept: List[str] = []
    for ch in unicodedata.normalize("NFD", s):
        if unicodedata.category(ch) != "Mn":
            kept.append(VARIANT_TO_BASE.get(ch, ch))
            continue
        # Combining mark: keep only the explicitly requested ones.
        if (ch == IOTA_SUB and keep_iota_sub) or (ch == DIAERESIS and keep_diaeresis):
            kept.append(ch)
    return unicodedata.normalize("NFC", "".join(kept))

def greek_iota_sub_to_i(s: str) -> str:
    """Rewrite iota subscript (ypogegrammeni) as an explicit ι after its base.

    Processing happens in NFD; the result is re-composed to NFC.
    """
    decomposed = unicodedata.normalize("NFD", s)
    pieces: List[str] = []
    pos = 0
    while pos < len(decomposed):
        if unicodedata.category(decomposed[pos]) == "Mn":
            # Stray combining mark with no base character: skip it.
            pos += 1
            continue
        base, marks, pos = _consume_combining(decomposed, pos)
        pieces.append(base)
        # Keep every mark except the subscript, then add the explicit iota.
        pieces.extend(m for m in marks if m != IOTA_SUB)
        if IOTA_SUB in marks:
            pieces.append("ι")
    return unicodedata.normalize("NFC", "".join(pieces))

def to_titlecase_grc(s: str) -> str:
    """Title-case a Greek headword (simplified): NFC-normalize, then str.title()."""
    normalized = unicodedata.normalize("NFC", s)
    return normalized.title()

# ========== CLTK IPA ==========
# ========== CLTK IPA ==========
def get_cltk_transcriber():
    """Return a word -> IPA string callable, or None if CLTK is unavailable.

    Tries the modern ``grc.transcription.Transcriber`` API first, then the
    legacy ``grc.phonology.GreekTranscription`` fallback. The returned
    callable never raises: transcription failures yield "".
    """
    def _safe(fn):
        # Wrap a transcription function so any failure returns "".
        def _tx(word: str) -> str:
            try:
                return fn(word)
            except Exception:
                return ""
        return _tx

    # Preferred: modern Transcriber (output often looks like "[dé.kɑ...]").
    try:
        from cltk.phonology.grc.transcription import Transcriber
        trans = Transcriber(dialect="Attic", reconstruction="Probert")
        return _safe(lambda w: trans.transcribe(w, accentuate=True, syllabify=True))
    except Exception:
        pass

    # Fallback: legacy GreekTranscription.
    try:
        from cltk.phonology.grc.phonology import GreekTranscription
        gt = GreekTranscription()
        return _safe(gt.transcribe)
    except Exception:
        pass

    return None

# ========== 生成别名 ==========
def make_romanization_variants(word: str) -> Set[str]:
    """生成罗马转写多风格（用于检索别名）。"""
    variants: Set[str] = set()

    # 组合：upsilon y/u × yi/ui × χ kh/ch
    for ups in ("y", "u"):
        for yi in ("yi", "ui"):
            for chi in ("kh", "ch"):
                t = gr_to_lat(
                    word,
                    upsilon_style=ups,
                    yi_diphthong=yi,
                    nasal_gamma=False,     # 不要鼻化规则
                    mark_diaeresis=False,
                    keep_case=False,
                    drop_punct=True,
                    chi_style=chi,
                )
                if t:
                    variants.add(t)

    # 去重后的基本清洗（再降一遍大小写）
    variants = {v.strip().lower() for v in variants if v.strip()}
    return variants

def make_greek_variants(word: str) -> Set[str]:
    """Greek-script alias forms: title case, de-accented, explicit iota."""
    normalized = unicodedata.normalize("NFC", word.strip())
    if not normalized:
        return set()

    # Title-cased original form.
    forms: Set[str] = {to_titlecase_grc(normalized)}

    # Fully de-accented form (iota subscript and diaeresis dropped too).
    bare = greek_strip_marks(normalized, keep_iota_sub=False, keep_diaeresis=False)
    forms.update((bare, to_titlecase_grc(bare)))

    # Iota subscript spelled out as explicit ι, then de-accented.
    explicit = greek_strip_marks(
        greek_iota_sub_to_i(normalized), keep_iota_sub=False, keep_diaeresis=False
    )
    forms.update((explicit, to_titlecase_grc(explicit)))

    # Safety net: fold final sigma so every variant uses medial σ.
    forms = {unicodedata.normalize("NFC", f.replace("ς", "σ")) for f in forms}

    # Drop empties and the headword itself.
    return {f for f in forms if f and f != normalized}

# ========== 渲染 HTML ==========
def render_entry_html(word: str, ipa: str, latin_main: str, latin_all: List[str], defs: Dict[str, str]) -> str:
    """
    统一的 HTML 结构（尽量 CSS 友好）。
    """
    parts: List[str] = []
    parts.append("<link rel='stylesheet' href='logeion.css' type='text/css'>")
    parts.append("<div class='logeion-entry'>")

    parts.append("  <div class='hw'>")
    parts.append(f"    <span class='hw-grc'>{word}</span>")
    if latin_main:
        parts.append(f"    <span class='hw-latin'>{latin_main}</span>")
    if ipa:
        parts.append(f"    <span class='hw-ipa'>{ipa}</span>")
    parts.append("  </div>")

    # 所有拉丁变体（可用 CSS 控制显示/隐藏）
    if latin_all:
        parts.append("  <div class='latin-variants'>")
        parts.append("    <span class='label'>Romanization variants:</span>")
        for v in sorted(set(latin_all)):
            parts.append(f"    <span class='lat'>{v}</span>")
        parts.append("  </div>")

    parts.append("  <div class='sources'>")
    for src, html in defs.items():
        parts.append(f"    <div class='source' data-src='{src}'>")
        parts.append(f"      <div class='src-title'>{src}</div>")
        parts.append("      <div class='src-body'>")
        # 定义内容可能自带 HTML/XML，直接嵌入
        parts.append(f"{html}")
        parts.append("      </div>")
        parts.append("    </div>")
    parts.append("  </div>")

    parts.append("</div>")
    return "\n".join(parts)

# ========== 主流程 ==========
def main():
    """CLI entry point: convert a logeion JSONL dump into GoldenDict/MDX txt.

    For each entry, writes alias records (@@@LINK redirects) for Greek and
    Latin search variants first, followed by the main record with the HTML.
    """
    ap = argparse.ArgumentParser(description="Convert logeion JSONL to GoldenDict/MDX txt")
    ap.add_argument("-i", "--input", default="logeion_dictionary.jsonl", help="Input JSONL path")
    ap.add_argument("-o", "--output", default="logeion_gd.txt", help="Output MDX-txt path")
    ap.add_argument("--no-ipa", action="store_true", help="Do not call CLTK for IPA")
    args = ap.parse_args()

    if not os.path.isfile(args.input):
        print(f"Input not found: {args.input}", file=sys.stderr)
        sys.exit(1)

    transcribe_word = None if args.no_ipa else get_cltk_transcriber()
    if args.no_ipa:
        print("CLTK IPA disabled by --no-ipa", file=sys.stderr)
    elif transcribe_word is None:
        print("CLTK not available; IPA will be empty.", file=sys.stderr)

    n_total = 0
    with open(args.input, "r", encoding="utf-8") as fin, open(args.output, "w", encoding="utf-8") as fout:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"[WARN] JSON decode failed: {e}", file=sys.stderr)
                continue

            word = (obj.get("word") or "").strip()
            if not word:
                continue
            defs = obj.get("definitions", {})
            # Main headword key (original form, NFC-normalized)
            key_main = unicodedata.normalize("NFC", word)

            # IPA (best-effort via CLTK; empty on any failure)
            ipa = ""
            if transcribe_word is not None:
                try:
                    ipa = transcribe_word(key_main)
                except Exception:
                    ipa = ""

            # Latin transliteration for display (standard style: y/yi/kh)
            latin_main = gr_to_lat(
                key_main,
                upsilon_style="y",
                yi_diphthong="yi",
                nasal_gamma=False,     # no nasalization rules
                mark_diaeresis=False,
                keep_case=False,
                drop_punct=True,
                chi_style="kh",
            )
            # All Latin variants (shown in the entry and used for search)
            latin_variants = sorted(make_romanization_variants(key_main))

            # Render the entry HTML
            html = render_entry_html(
                key_main,
                ipa=ipa,
                latin_main=latin_main,
                latin_all=latin_variants,
                defs=defs
            )

            # Emit aliases first (@@@LINK records)
            alias_keys: Set[str] = set()
            alias_keys |= make_greek_variants(key_main)      # Greek-script variants
            alias_keys |= set(latin_variants)                 # Latin variants

            # Belt and braces: ensure the de-accented form (both cases) is present
            alias_keys.add(greek_strip_marks(key_main))
            alias_keys.add(to_titlecase_grc(greek_strip_marks(key_main)))
            # Also add the lowercase main romanization
            if latin_main:
                alias_keys.add(latin_main.lower())

            # Cleanup: never list the main key as its own alias
            alias_keys.discard(key_main)

            # Write alias records
            for alias in sorted(alias_keys):
                if not alias or alias == key_main:
                    continue
                fout.write(f"{alias}\n")
                fout.write(f"@@@LINK={key_main}\n")
                fout.write("</>\n")

            # Write the main entry
            fout.write(f"{key_main}\n")
            fout.write(html)
            fout.write("\n</>\n")

            n_total += 1

    print(f"Done. Wrote {n_total} entries to {args.output}")

if __name__ == "__main__":
    main()