import json
import os
import glob
from inspect import signature

# Translation table turning ASCII digits and the '+' / '-' signs into their
# Unicode superscript forms; applied to the 'hm' part of a headword.
_PLAIN_CHARS = '0123456789+-'
_SUPERSCRIPT_CHARS = '⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻'
superscript_map = str.maketrans(dict(zip(_PLAIN_CHARS, _SUPERSCRIPT_CHARS)))

def convert_list(d,tag):
    """Render the element list ``d`` relative to the ancestor named ``tag``.

    The root path is taken from the first element's ``'path'``: it is cut
    back to (and including) the last segment equal to ``tag``. When no
    segment matches, the root path is the empty string. The actual rendering
    is delegated to ``convert_list_path``.
    """
    segments = d[0]['path'].split('/')
    # Scan from the right for the last segment equal to `tag`.
    keep = len(segments)
    while keep > 0 and segments[keep - 1] != tag:
        keep -= 1
    root_path = '/'.join(segments[:keep])
    return convert_list_path(d, root_path)
def convert_list_path(d,path):
    """Render a flat list of path-addressed elements as nested markup.

    Each element of ``d`` is a dict with a slash-separated ``'path'``, a
    ``'tag'`` and optionally ``'value'`` and ``'bold'``. ``path`` is the
    slash-separated path of the container the caller has already opened;
    tags are opened and closed relative to it as the element paths change.

    Path segments named ``audio`` never get an ``<audio>`` wrapper (their
    value is rendered as an ``<a class="audio">`` link instead), so no
    ``</audio>`` is emitted for them either.

    Returns the generated markup as a string.
    """
    html = ''
    orig_path = path.split('/')
    cur_path = path.split('/')  # stack of currently open path segments
    prev_path = ''
    for e in d:
        tag_path = e.get('path').split('/')
        # A repeated path whose last segment is not this element's tag: cut
        # the path back to the segment that matches the tag.
        if prev_path == e.get('path','') and tag_path[-1] != e['tag'] and e['tag'] in tag_path:
            while tag_path[-1] != e['tag']:
                tag_path.pop()
        # Close open segments that are not a prefix of this element's path,
        # or that are deeper than the element's path.
        while cur_path and cur_path[-1] != e['tag'] and not e['path'].startswith('/'.join(cur_path)) or len(tag_path) < len(cur_path):
            closed = cur_path.pop()
            # 'audio' was never opened as a tag, so it must not be closed.
            if closed != 'audio':
                html += f'</{closed}>'
        # Open whatever segments are still missing for this element.
        if len(tag_path) > len(cur_path):
            for part in tag_path[len(cur_path):]:
                cur_path.append(part)
                if part == 'audio':
                    continue
                html += f'<{part}>'
        value = e.get('value')
        if isinstance(value,dict):
            # Structured value: render every child and concatenate.
            value = ''.join(convert_tree(child, e['tag']) for child in value.values())
        # Tag-specific presentation of the value.
        if e['tag'] == 'ill':
            value = f'<img src="{value}.jpg" />'
        elif e['tag'] == 'phon':
            value = '/' + value + '/'
        elif e['tag'] == 'audio':
            value = f'<a class="audio" href="sound://{value}"><img src="audio_bre_initial.svg"/></a>'
        elif e['tag'] == 'pron-gs' and value == '/':
            value = ''
        elif e['tag'] == 'h-cefr':
            value= value.replace('[Ox3000 key_L][CEFR_A1_L]','<img src="ox3000_a1.svg"/>').replace('[Ox3000 key_L][CEFR_A2_L]','<img src="ox3000_a2.svg"/>')\
            .replace('[Ox3000 key_L][CEFR_B1_L]','<img src="ox3000_b1.svg"/>').replace('[Ox3000 key_L][CEFR_B2_L]','<img src="ox3000_b2.svg"/>')
        if value and e.get('bold') and e['tag'] not in ['eb','dr','bf','if']:
            if e['tag'] == 'sn-g' and value.endswith('.'):
                value = f'<num>{value}</num>'
            else:
                value = f'<b>{value}</b>'

        if cur_path[-1] == e['tag'] or e['tag'] in tag_path:
            # The element's tag is already part of the open path: emit the
            # bare value inside it.
            if value:
                html += value
        elif value:
            html += f"<{e['tag']}>{value}</{e['tag']}>"
        else:
            html += f"<{e['tag']}/>"
        prev_path = e.get('path','')
    # Close everything still open beyond the caller's container. The check
    # looks at the segment being closed, not at a stale loop variable.
    while cur_path and cur_path != orig_path:
        closed = cur_path.pop()
        if closed != 'audio':
            html += f'</{closed}>'
    return html
def convert_head(d):
    """Build the plain-text headword from a list of head elements.

    An element tagged ``'h'`` replaces the headword with its value; an
    element tagged ``'hm'`` is appended after translating its digits and
    signs to superscript characters. All other tags are ignored.
    """
    head = ''
    for e in d:
        # Exact comparison: the former `in 'h'` was a substring test that
        # also matched an empty tag.
        if e['tag'] == 'h':
            head = e['value']
        elif e['tag'] == 'hm':
            head += e['value'].translate(superscript_map)
    return head


def process_xrg(data):
    """Render one cross-reference group as an ``<xrg>`` element.

    Reads ``data['xrgs_text']``, the entry lists ``data['xrg']`` and
    ``data['xrg_trans']``, and the optional ``'xrgs_text_trans'`` and
    ``'xrgs_text_end'`` strings. English entries are joined with ``', '``
    and a final ``' and '``; translated entries with ``'，'`` and a final
    ``' 和 '``.
    """

    def process_xrg_entry(xrg):
        # 'xh' takes precedence over 'xw'; 'xhm' and 'xs' are appended when present.
        html = ''
        if 'xh' in xrg:
            html += f"<xh>{xrg['xh']}</xh>"
        elif 'xw' in xrg:
            html += f"<xw>{xrg['xw']}</xw>"
        if 'xhm' in xrg:
            html += f"<xhm>{xrg['xhm']}</xhm>"
        if 'xs' in xrg:
            html += f"<xs>{xrg['xs'].replace(' $2','')}</xs>"
        return html

    def join_entries(entries, separator, last_separator):
        # The last separator is chosen from the length of the list being
        # joined; the translated list used to be compared against the
        # length of the English list.
        joined = ''
        total = len(entries)
        for i, xrg in enumerate(entries, 1):
            if i > 1:
                joined += last_separator if i == total else separator
            joined += process_xrg_entry(xrg)
        return joined

    html = '<xrg>'
    html += data["xrgs_text"]
    html += join_entries(data['xrg'], ', ', ' and ')

    html += '<xrg_trans>'
    if data.get("xrgs_text_trans"):
        html += f'{data["xrgs_text_trans"]}'
    html += join_entries(data['xrg_trans'], '，', ' 和 ')
    if data.get("xrgs_text_end"):
        html += f'{data["xrgs_text_end"]}'
    html += '</xrg_trans>'
    html += "</xrg>"
    return html
def process_phong(d):
    """Render one pronunciation entry as a ``<pron-g>`` element.

    ``d`` may contain ``'form'``, ``'phon'`` and ``'audio'``. The phonetic
    text and the form are wrapped in slashes; the audio name gets a ``.mp3``
    suffix and has ``#`` percent-encoded for use in the ``sound://`` link.
    Missing ``'phon'`` / ``'audio'`` are rendered as empty strings.
    """
    phonetic = '<pron-g>'
    if 'form' in d:
        # The element is now closed properly (it used to end with a second
        # opening <form>).
        phonetic += f"<form>/{d['form']}/</form>"

    phon = d.get('phon', '')
    audio = d.get('audio', '').replace('#', '%23')
    phonetic += f'<phon>/{phon}/</phon>'
    phonetic += f'<a class="audio" href="sound://{audio}.mp3"><img src="audio_bre_initial.svg"/></a>'
    phonetic += '</pron-g>'
    return phonetic

def process_top_g(data):
    """Render the head section of an entry as a ``<top-g>`` element.

    Sections are emitted in a fixed key order. ``'xrgs'`` and ``'prongs'``
    go through their dedicated renderers; every other present key is
    rendered with ``convert_list`` relative to ``top-g``. The keys
    ``'ill'``, ``'wfg'`` and ``'topic'`` are not rendered, only printed.
    """
    pieces = ["<top-g>"]
    # Extract the headword section.
    for key in ('h', 'prongs', 'v_gs', 'pos', 'xrgs', 'top_un', 'top_text'):
        if key not in data:
            continue
        section = data[key]
        if key == 'xrgs':
            rendered = process_list(section, 'xrgs', process_xrg)
        elif key == 'prongs':
            rendered = process_list(section, 'pron-gs', process_phong)
        else:
            rendered = convert_list(section, 'top-g')
        pieces.append(rendered)

    # Keys that have no renderer yet are reported on stdout.
    for key in ('ill', 'wfg', 'topic'):
        if key in data:
            print(key, data[key])

    pieces.append("</top-g>")
    return ''.join(pieces)

def process_list(d,tag,child_func):
    """Render every item of ``d`` with ``child_func`` and concatenate.

    When ``tag`` is truthy the result is wrapped in ``<tag>…</tag>``;
    otherwise the bare concatenation is returned.
    """
    body = ''.join(child_func(item) for item in d)
    if not tag:
        return body
    return f'<{tag}>{body}</{tag}>'
def process_bf_g(d):
    """Render one ``bf_g`` dict as a ``<bf-g>`` element.

    The ``<top-g>`` part holds ``d['bf_name']`` followed by the optional
    ``d['bf_text']``; the senses in ``d['sn_g']`` follow inside ``<sn-gs>``.
    """
    header = convert_list(d['bf_name'], "top-g")
    if "bf_text" in d:
        header += convert_list(d['bf_text'], "top-g")
    # NOTE: the number of senses in d['sn_g'] is not checked.
    senses = process_list(d['sn_g'], 'sn-gs', process_sn_g)
    return '<bf-g>' + '<top-g>' + header + '</top-g>' + senses + '</bf-g>'
def process_x_g(d):
    """Render one example dict as ``<x-g><x>…</x></x-g>``.

    ``d['x_eng']`` is rendered first, then ``d['x_simp']``. The keys
    ``'un'`` and ``'ill'`` are not rendered; their names are printed when
    present.
    """
    first_part = convert_list(d['x_eng'], "x")
    second_part = convert_list(d['x_simp'], "x")
    for unhandled in ("un", "ill"):
        if unhandled in d:
            print(unhandled)
    return f'<x-g><x>{first_part}{second_part}</x></x-g>'
def process_dr_g(d):
    """Render one ``dr_g`` dict as a ``<dr-g>`` element.

    The element contains the rendered ``d['top_g']`` followed by the
    rendered ``d['sn_gs']``.
    """
    top_part = process_top_g(d['top_g'])
    sense_part = process_sn_gs(d['sn_gs'])
    return ''.join(('<dr-g>', top_part, sense_part, '</dr-g>'))

def convert_tree(d,tag):
    """Recursively render a nested dict/list structure as markup.

    * dict: every key becomes an element wrapping its rendered value.
    * non-empty list whose first item is a dict containing ``'tag'``: the
      whole list is rendered with ``convert_list`` relative to ``tag``.
    * any other non-empty list: the items are rendered one by one.
    * everything else (scalars, empty lists): the empty string.
    """
    if isinstance(d, dict):
        return ''.join(
            f'<{key}>' + convert_tree(value, key) + f'</{key}>'
            for key, value in d.items()
        )
    if isinstance(d, list) and d:
        first = d[0]
        # Only a dict can be a tagged element; without the type check a
        # leading string would be substring-searched for 'tag'.
        if isinstance(first, dict) and 'tag' in first:
            return convert_list(d, tag)
        return ''.join(convert_tree(item, tag) for item in d)
    return ''

def process_unbox(d):
    """Render one usage-box dict as an ``<unbox>`` element.

    The heading is ``d['tile']['type']`` and the body is ``d['body']``
    rendered with ``convert_tree``.
    """
    # NOTE(review): the key is spelled 'tile' — confirm against the data schema.
    heading = d['tile']['type']
    body = convert_tree(d['body'], 'body')
    return f"<unbox><heading>{heading}</heading><body>{body}</body></unbox>"
def process_sn_g(d):
    """Render one sense dict as a ``<sn-g>`` element.

    ``d['sng_text']`` is required. The optional parts are emitted in this
    order: ``'ill'``, the definition (``'def_eng'`` then ``'def_simp'``),
    ``'xrgs'``, ``'x_gs'``, ``'un'`` and ``'unbox'``.
    """
    t = '<sn-g>'
    t += convert_list(d['sng_text'],'sn-g')

    if 'ill' in d:
        t += convert_list(d['ill'],'sn-g')
    # Open and close <def> together so the markup stays balanced when only
    # one of the two definition variants is present.
    def_keys = [key for key in ('def_eng', 'def_simp') if key in d]
    if def_keys:
        t += '<def>'
        for key in def_keys:
            t += convert_list(d[key],'def')
        t += '</def>'
    if 'xrgs' in d:
        t += process_list(d['xrgs'],'xrgs',process_xrg)
    if 'x_gs' in d:
        t += process_list(d['x_gs'],"x_gs",process_x_g)
    if 'un' in d:
        t += convert_list(d['un'],'sn-g')
    if 'unbox' in d:
        t += process_list(d['unbox'],"",process_unbox)
    t += '</sn-g>'

    return t
def process_sn_gs(data,path='h-g',unboxs = None):
    """Render the sense section of an entry.

    Emits, in order and only when the key is present: the senses in
    ``data['sn_g']`` wrapped in ``<sn_gs>``, ``'xrgs'``, the first group of
    ``'bf_gs'`` and ``'dr_gs'``. Boxes from ``unboxs`` that do not also
    appear in one of the senses are rendered at the start of ``<sn_gs>``.
    ``path`` is accepted but not used by the body. Keys without a renderer
    are printed.
    """
    pieces = []

    if 'sn_g' in data:
        pieces.append('<sn_gs>')
        if unboxs:
            # Boxes already attached to a sense are rendered there; only
            # the remaining ones are rendered here.
            nested_boxes = [
                box
                for sense in data.get('sn_g', [])
                if sense.get('unbox')
                for box in sense['unbox']
            ]
            pieces.extend(
                process_unbox(box) for box in unboxs if box not in nested_boxes
            )
        pieces.append(process_list(data['sn_g'], '', process_sn_g))
        pieces.append('</sn_gs>')
    if 'xrgs' in data:
        pieces.append(process_list(data['xrgs'], 'xrgs', process_xrg))
    if 'bf_gs' in data:
        groups = data['bf_gs']
        if len(groups) > 1:
            print('multiple bf gs')
        # Only the first group is rendered.
        pieces.append(process_list(groups[0]['bf_g'], 'bf-gs', process_bf_g))
    if 'dr_gs' in data:
        pieces.append(process_list(data['dr_gs'], 'dr-gs', process_dr_g))
    # "subentry_cefr" is not in this list.
    for unhandled in ('topic', 'idm_gs', 'shcut_g', 'xrgs_subtren', 'pv_gs', "un", "ill"):
        if unhandled in data:
            print(unhandled)
    return ''.join(pieces)

def convert_x_g(d):
    """Render one example dict as ``<x_g><x>…</x></x_g>``.

    The optional ``d['un']`` comes first, followed by ``d['x_eng']`` and
    ``d['x_simp']``, all rendered relative to ``x``.
    """
    t = '<x_g><x>'
    if 'un' in d:
        t += convert_list(d['un'],'x')
    # Read the example lists from the input dict; the output string `t` was
    # being indexed here, which always raised TypeError.
    t += convert_list(d['x_eng'],'x')
    t += convert_list(d['x_simp'],'x')

    t += '</x></x_g>'
    return t
def to_html(word):
    """Convert one merged entry into a list of renderable sub-entries.

    ``word`` is a dict with ``'head'`` and, optionally, ``'data'``. One
    result dict is produced per item of ``data['sngs_data']``; each has
    ``'head'`` (the headword built from the head elements), ``'xml'`` (the
    ``<h-g>`` markup) and, when non-empty, ``'expanded_words'`` (the
    original head if it differs, plus every ``bf_name`` value).

    An entry without ``'data'`` yields an empty list instead of raising.
    """
    entry_data = word.get('data') or {}
    top_data = entry_data.get('top_data', {})
    sngs_data = entry_data.get('sngs_data', [])
    unbox = entry_data.get('unbox', {})
    word_head = word['head']
    # With several sub-entries each one carries its own top data; a single
    # sub-entry uses the entry-level top data.
    top_in_sngs = len(sngs_data) > 1

    words = []
    for subword in sngs_data:
        real_top_data = subword['top_data'] if top_in_sngs else top_data
        real_sngs_data = subword['sngs_data']
        if real_top_data.get('pos') != real_sngs_data.get('pos'):
            print("pos data mismatch")
        head = convert_head(real_top_data['h'])

        h_g = '<h-g>'
        h_g += process_top_g(real_top_data)
        if unbox:
            h_g += process_list(unbox,'',process_unbox)
        h_g += process_sn_gs(real_sngs_data,unboxs=subword.get('unbox'))
        h_g += '</h-g>'

        # Extra headwords that should link to this sub-entry.
        expanded_words = []
        if head != word_head:
            expanded_words.append(word_head)
        for bf_gs in real_sngs_data.get('bf_gs') or []:
            for bf_g in bf_gs['bf_g']:
                for bf_name in bf_g['bf_name']:
                    expanded_words.append(bf_name['value'].strip(' ,'))

        entry = {'head': head, 'xml': h_g}
        if expanded_words:
            entry['expanded_words'] = expanded_words
        words.append(entry)
    return words

    
def remove_falsy(data):
    """
    Recursively drop every falsy field from a dict or list.

    Falsy values include 0, "", [], {}, None and False. A container that
    becomes empty after cleaning is dropped from its parent as well. Any
    value that is neither a dict nor a list is returned unchanged.
    """
    if isinstance(data, dict):
        cleaned_items = ((key, remove_falsy(value)) for key, value in data.items())
        # Key point: the truthiness test filters out every falsy value.
        return {key: value for key, value in cleaned_items if value}

    if isinstance(data, list):
        return [item for item in map(remove_falsy, data) if item]

    return data

def merge_data(input_dir,output_json):
    """Merge every ``*.json`` file in ``input_dir`` into one JSON list.

    For each file a ``{'head': ...}`` record is built, where the head is the
    file's ``'text'`` value or, if that is missing or empty, the file name
    without extension. When a word group flagged ``'is_oxford_junior'``
    exists, its cleaned ``'oxford_junior_data'`` is stored under ``'data'``.
    Files that fail to load or process are reported and skipped. Nothing is
    written when the directory contains no ``.json`` file.
    """
    json_files = glob.glob(os.path.join(input_dir, '*.json'))
    if not json_files:
        print(f"在 {input_dir} 目录下没有找到 .json 文件。")
        return

    print(f"找到 {len(json_files)} 个文件，开始处理...")
    words = []
    for file_path in json_files:
        try:
            # Load the JSON document.
            with open(file_path, encoding='utf-8') as source:
                data = json.load(source)

            # The headword comes from 'text'; fall back to the file name
            # when the key is missing or empty.
            head = data.get('text')
            if not head:
                head, _extension = os.path.splitext(os.path.basename(file_path))
                print(f"警告: 文件 {file_path} 中未找到 key 'text'，将使用原文件名。")

            word = {'head': head}
            for word_grp in data['word_groups']:
                if word_grp.get('is_oxford_junior'):
                    word['data'] = remove_falsy(word_grp['oxford_junior_data'])
            words.append(word)
        except json.JSONDecodeError:
            print(f"错误: 文件 {file_path} 不是有效的 JSON 格式。")
        except Exception as e:
            print(f"处理文件 {file_path} 时发生未知错误: {e}")

    with open(output_json, 'w', encoding='utf-8') as sink:
        json.dump(words, sink, ensure_ascii=False, indent=2)

def json_to_html(json_file, output_file):
    """
    Convert a single merged JSON file into a JSON file of rendered entries.

    Every entry of ``json_file`` is expanded with ``to_html``; the combined
    results are sorted by their ``'head'`` and written to ``output_file``.
    """
    with open(json_file, encoding='utf-8') as source:
        entries = json.load(source)

    rendered = [item for entry in entries for item in to_html(entry)]
    ordered = sorted(rendered, key=lambda item: item['head'])

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(ordered, sink, ensure_ascii=False, indent=2)

def write_mdx_source(json_file, output_txt_path):
    """Write the rendered entries of ``json_file`` as MDX source text.

    Each entry with a non-empty ``'head'`` becomes a record of three lines:
    the headword, the stylesheet link followed by the entry's ``'xml'``, and
    the ``</>`` terminator. Every item of ``'expanded_words'`` adds a
    ``@@@LINK=`` record pointing back at the headword.
    """
    stylesheet_link = '<link rel="stylesheet" href="OELECD5.css" />'
    terminator = '</>\n'

    with open(json_file, encoding='utf-8') as source:
        entries = json.load(source)

    # Open the output file for writing.
    with open(output_txt_path, 'w', encoding='utf-8') as sink:
        # Walk over every entry in the JSON data.
        for entry in entries:
            headword = entry['head']
            if not headword:
                continue
            body = stylesheet_link + entry['xml']
            aliases = entry.get('expanded_words', [])

            sink.write(''.join((headword, '\n', body, '\n', terminator)))
            for alias in aliases:
                alias = alias.strip()
                sink.write(''.join((alias, '\n', f'@@@LINK={headword}\n', terminator)))
if __name__ == '__main__':
    # Script entry point: render the merged JSON, then emit the MDX source.
    # The merge step is currently disabled:
    #   merge_data('niudetail', 'oxford.json')
    merged_json = 'oxford.json'
    rendered_json = 'oxford_html.json'
    json_to_html(merged_json, rendered_json)
    write_mdx_source(rendered_json, 'oxford.mdx_src.txt')