import sqlite3, unicodedata, json
import xml.etree.ElementTree as ET

# Path of the source SQLite database; its `core` table holds (id, HWD, CORE)
# where CORE is an XML blob per entry.
DBPATH = "ldoce.db"
# Value stamped into every generated entry's "dict_id" field.
DICTID = "simple_LDOCE_en-en_draft_v0"

def extractMainText(elem):
    """
    Return the text content of *elem* with GLOSS and GEO child text removed.

    Collects the element's own leading text, the text of each direct child
    whose tag is not GLOSS/GEO, and every child's tail text. Only one level
    deep — grandchildren's text is not visited. Result is whitespace-stripped.
    """
    skipped_tags = ("GLOSS", "GEO")
    pieces = [elem.text] if elem.text else []

    for node in elem:
        # Child text is kept unless the tag is filtered out.
        if node.tag not in skipped_tags and node.text:
            pieces.append(node.text)
        # Tail text (after the child's closing tag) is always kept,
        # even for filtered children.
        if node.tail:
            pieces.append(node.tail)

    return "".join(pieces).strip()

def removeEmpty(data, whiteList=None):
    """
    Recursively strip "empty" values — None, "", [], {} — from nested
    dicts and lists.

    Dict keys listed in *whiteList* are always retained, even when their
    cleaned value is empty. Non-container values are returned unchanged.
    """
    keep_keys = set() if whiteList is None else set(whiteList)
    # Emptiness is tested by equality against these sentinels (so 0 and
    # False are NOT treated as empty and are kept).
    empties = (None, "", [], {})

    if isinstance(data, dict):
        result = {}
        for key, value in data.items():
            value = removeEmpty(value, keep_keys)
            if key in keep_keys or value not in empties:
                result[key] = value
        return result

    if isinstance(data, list):
        return [
            cleaned
            for cleaned in (removeEmpty(item, keep_keys) for item in data)
            if cleaned not in empties
        ]

    return data

def getElementText(elem, path:str, default=None) -> str|None:
    """获取文本"""
    elem = elem.find(path)
    if elem is not None:
        return elem.text
    else:
        return default

def removeAccents(text: str) -> str:
    """
    Strip diacritical marks from *text*, e.g. ``café`` -> ``cafe``.

    NFKD-decomposes the string so accents become separate combining code
    points, then drops every combining code point.
    """
    return "".join(
        ch
        for ch in unicodedata.normalize("NFKD", text)
        if not unicodedata.combining(ch)
    )


with sqlite3.connect(DBPATH) as dictDb:
    # NOTE: sqlite3's context manager commits/rolls back the transaction on
    # exit but does NOT close the connection.
    dictCur = dictDb.cursor()

    # dictCur.execute("SELECT id, HWD, CORE from core WHERE HOMNUM='1' OR HOMNUM='2'")
    dictCur.execute("SELECT id, HWD, CORE from core")

    i, entryID = 0, 1
    limit = 2000  # safety cap: stop after this many entries
    while i < limit:  # fixed: was `while True and i < limit` (redundant `True and`)
        # ---------------------------------
        # Fetch and process the next headword row (id, HWD, CORE-xml).
        entryRow = dictCur.fetchone()
        if entryRow is None:  # fixed: was `== None`; identity check is correct
            break
        headWord = entryRow[1]
        entry = ET.fromstring(entryRow[2])
        # Frequency tags such as "S1", collected for a high-frequency-only
        # filter that is currently disabled (the original guard was
        # `if True or "S1" in freq:` — always true), so every entry is kept.
        freq = [f.text for f in entry.findall("./Head/FREQ")]
        # Pronunciation: a single notation, or a US/UK pair when an
        # American-variant pronunciation element is present.
        if entry.find("./Head/PronCodes/AMEVARPRON") is None:
            pronunciation = [
                {"notation": getElementText(entry, "./Head/PronCodes/PRON")}
            ]
        else:
            pronunciation = [
                {"region": "US", "notation": getElementText(entry, "./Head/PronCodes/AMEVARPRON")},
                {"region": "UK", "notation": getElementText(entry, "./Head/PronCodes/PRON")}
            ]
        entryJson = {
            "dict_id": DICTID,
            "entry_id": entryID,
            "headword": headWord,
            # "headword_normalized": removeAccents(headWord.lower()),
            "entry_type": "word",
            "page": "",
            "section": getElementText(entry, "./Head/POS", default=""),
            "pronunciation": pronunciation,
            "sense": []
        }
        entryID += 1
        senseIndex = 1
        for sense in entry.findall("./Sense"):
            # Some senses lack the structures below and are dropped.
            # Drop senses under fixed collocations (LEXUNIT).
            # NOTE(review): `break` abandons ALL remaining senses of this
            # entry, not just the current one; the comment intent ("skip")
            # suggests `continue` may have been meant — confirm before changing.
            if sense.find("./LEXUNIT") is not None:
                break
            # Drop senses with no direct definition (sub-sense containers).
            if sense.find("./DEF") is None:
                break
            definition = {
                "en": getElementText(sense, "./DEF")
            }
            example = [
                {"en": extractMainText(exampleSentenceElement)} for exampleSentenceElement in sense.findall(".//EXAMPLE")
            ]
            # NOTE(review): "regester" is a typo for "register", but it is an
            # emitted JSON key — renaming it would change the output schema.
            label = {
                "pattern": [patternElement.text for patternElement in sense.findall("./COLLO")],
                "grammar": [getElementText(sense, "./GRAM")],
                "region": getElementText(sense, "./GEO"),
                "regester": getElementText(sense, "./REGISTERLAB"),
            }
            entryJson["sense"].append({
                "index": senseIndex,
                "label": label,
                "definition": definition,
                "example": example
            })
            senseIndex += 1
        # Prune empty values but always keep the "page"/"section" keys.
        # (renamed from `entryJson` — the original reused the dict's name
        # for its serialized string form)
        serialized = json.dumps(removeEmpty(entryJson, whiteList=["page", "section"]), ensure_ascii=False)
        if headWord == "answer":  # spot-check a known entry
            print(serialized)
        i += 1