import re

# ===== 1. Pre-compiled regular expression (compiled once to speed up matching) =====
# Matches a run of text that directly follows a "[" (look-behind, so the
# bracket itself is not part of the match), contains no "]", and ends in a
# digit 1-5. Group 1 is that whole run. `[^]]+` is greedy, so the match
# extends to the last 1-5 digit found before the next "]" (or end of string).
# The main loop passes each match to pinyinify().
PY_PATTERN = re.compile(r"(?<=\[)([^]]+[1-5])")


# Tone-marked forms indexed by (tone number - 1); index 4 is the neutral tone,
# i.e. the bare letter. Built once at import time rather than on every call.
# Letters in the primary table take the mark whenever they are present.
_PRIMARY_TONE_MARKS = {
    'A': ['Ā', 'Á', 'Ǎ', 'À', 'A'], 'O': ['Ō', 'Ó', 'Ǒ', 'Ò', 'O'],
    'E': ['Ē', 'É', 'Ě', 'È', 'E'], 'a': ['ā', 'á', 'ǎ', 'à', 'a'],
    'o': ['ō', 'ó', 'ǒ', 'ò', 'o'], 'e': ['ē', 'é', 'ě', 'è', 'e'],
}
# Letters that only take the mark when no primary letter is present.
_SECONDARY_TONE_MARKS = {
    'i': ['ī', 'í', 'ǐ', 'ì', 'i'], 'u': ['ū', 'ú', 'ǔ', 'ù', 'u'],
    'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ', 'ü'],
}
# Whole tokens with a fixed replacement (they contain no vowel to mark).
_FIXED_TOKENS = {'r5': 'r', 'm2': 'ḿ', 'm4': 'm̀'}
_TONE_DIGITS = frozenset('12345')


def _mark_syllable(token: str) -> str:
    """Return *token* with its trailing tone digit turned into a tone mark."""
    if token in _FIXED_TOKENS:
        return _FIXED_TOKENS[token]
    if token[-1:] not in _TONE_DIGITS:
        # No tone digit: not a numbered syllable, leave it untouched.
        return token

    tone = int(token[-1]) - 1
    body = token[:-1].replace('u:', 'ü')

    # First a/o/e (either case) in reading order takes the mark.
    for pos, ch in enumerate(body):
        if ch in _PRIMARY_TONE_MARKS:
            return body[:pos] + _PRIMARY_TONE_MARKS[ch][tone] + body[pos + 1:]

    # Otherwise the LAST i/u/ü takes it. Only that single position is
    # rewritten; a blanket str.replace() would mark every repeated letter.
    for pos in range(len(body) - 1, -1, -1):
        ch = body[pos]
        if ch in _SECONDARY_TONE_MARKS:
            return body[:pos] + _SECONDARY_TONE_MARKS[ch][tone] + body[pos + 1:]

    # Nothing to put a mark on: return the token exactly as it came in
    # (digit and any "u:" included).
    return token


def pinyinify(pystr: str) -> str:
    """Convert numbered pinyin to tone-marked pinyin.

    The input is split on whitespace and each token is converted
    independently; the results are joined with single spaces.

    Per-token rules:
      * ``r5``, ``m2`` and ``m4`` map to fixed replacements.
      * A token not ending in a digit 1-5 is returned unchanged.
      * ``u:`` is rewritten to ``ü`` before placing the mark.
      * The mark goes on the first a/o/e (either case) if there is one,
        otherwise on the last i/u/ü. Tone 5 removes the digit and leaves the
        letter unmarked.
      * A token ending in a tone digit but containing none of those letters
        is returned unchanged, digit included.

    Args:
        pystr: Whitespace-separated numbered-pinyin tokens.

    Returns:
        The converted tokens joined by single spaces ("" for blank input).

    >>> pinyinify("Zhong1 guo2")
    'Zhōng guó'
    >>> pinyinify("lu:4 ma5")
    'lǜ ma'
    """
    return ' '.join(_mark_syllable(token) for token in pystr.split())


# ===== Main program =====
# Reads the CC-CEDICT file "cedict_ts.u8" and writes "dict.txt", one record
# per headword, each record terminated by a "</>" line. Each input line has
# the layout:
#     <traditional> <simplified> [<numbered pinyin>] /<def>/<def>/.../
simplified_indexes = set()
css = '<link href="hycd.css" rel="stylesheet">'
count = 0
skipped = 0

print("开始处理，请稍候...")

# The input is opened first so that a missing source file raises before
# "dict.txt" is opened for writing (and thereby truncated).
with open("cedict_ts.u8", encoding="utf-8-sig", mode="r") as f_in, \
        open("dict.txt", encoding="utf-8", mode="w") as f_out:
    for lineno, line in enumerate(f_in, 1):
        line = line.strip()
        if not line or line.startswith("#"):
            continue

        # Parse the line. Each separator is consumed only at its FIRST
        # occurrence, so a later " [" or " /" inside the definitions can no
        # longer cut the definition list short.
        head, bracket, rest = line.partition(" [")
        pinyin_part, closing, tail = rest.partition("]")
        _, slash, defs_raw = tail.partition("/")
        # dict.fromkeys de-duplicates while keeping order, so an entry whose
        # traditional and simplified forms are identical yields one headword.
        cnhws = list(dict.fromkeys(head.split()))

        if not (bracket and closing and slash and cnhws):
            # Malformed line: report it with its line number and move on
            # instead of aborting the whole run with a bare IndexError.
            print(f"警告：第 {lineno} 行格式无法解析，已跳过。")
            skipped += 1
            continue

        endefs = defs_raw.strip("/").split("/")

        # Build the header.
        if len(cnhws) == 2:
            hw_html = f'<hwt>{cnhws[0]}</hwt><hw>{cnhws[1]}</hw>'
            simplified_indexes.add((cnhws[1], cnhws[0]))  # (simplified, traditional)
        else:
            hw_html = f'<hw>{cnhws[0]}</hw>'

        header = f'<entry>{hw_html}<def><py>{pinyinify(pinyin_part)}</py></def>'

        # Build the definition block once, then convert any bracketed
        # numbered pinyin inside it with the pre-compiled pattern.
        dc_raw = ''.join(f'<def>❍ {endef}</def>' for endef in endefs) + '</entry>'
        dc = PY_PATTERN.sub(lambda m: pinyinify(m.group(1)), dc_raw)

        # Write the same content under every headword form.
        full_entry = "\n" + css + header + dc + "\n</>\n"
        for cnhw in cnhws:
            f_out.write(cnhw + full_entry)
            count += 1

    # Write the simplified -> traditional link records.
    # NOTE(review): every simplified headword already received a full entry
    # in the loop above, so these links give it the same content a second
    # time. Kept as in the original; confirm whether both are wanted.
    print("正在生成简体索引...")
    for hw_simplified, hw_traditional in sorted(simplified_indexes):
        f_out.write(hw_simplified + "\n@@@LINK=" + hw_traditional + "\n</>\n")
        count += 1

print(f"完成！共处理词条 {count} 条。")
if skipped:
    print(f"另有 {skipped} 行因格式异常被跳过。")