import json
import re
import os

# Sense-number glyphs in ascending order: ❶–⓴ stand for 1–20 and ㉑–㉗ for
# 21–27, so the string holds 27 symbols in total.
NUMBERS = "❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴㉑㉒㉓㉔㉕㉖㉗"
# Glyph -> 1-based sense number, derived from each glyph's position above.
NUMBER_MAP = dict(zip(NUMBERS, range(1, len(NUMBERS) + 1)))

def split_tags(line):
    """Separate the leading text of *line* from its trailing tag sections.

    A tag section starts at one of the markers ``SIN.``, ``ANT.``, ``OBS.``
    or ``FAM.`` and runs up to the next marker or the end of the line.

    Returns:
        ``(prefix, sections)``. When the line contains no marker, ``prefix``
        is *line* itself (untouched) and ``sections`` is empty. Otherwise
        ``prefix`` is the stripped text before the first marker and
        ``sections`` is a list of ``(marker, raw_text)`` pairs in order of
        appearance; ``raw_text`` is not stripped.
    """
    tag_pattern = r'SIN\.|ANT\.|OBS\.|FAM\.'
    markers = list(re.finditer(tag_pattern, line))
    if not markers:
        return line, []

    sections = []
    # Each section's text ends where the next marker begins (or at the end
    # of the line for the last marker).
    for current, following in zip(markers, markers[1:] + [None]):
        stop = len(line) if following is None else following.start()
        sections.append((current.group(0), line[current.end():stop]))

    leading_text = line[:markers[0].start()].strip()
    return leading_text, sections

def convert_tags_to_xml(tags_parts):
    """Render ``(marker, text)`` tag sections as ``<tag>`` XML strings.

    Every word (``\\w+`` run) in a ``SIN.``/``ANT.`` section becomes a
    ``<word_ref>`` wrapping an ``entry://`` hyperlink; every word in a
    ``FAM.`` section becomes a plain ``<word_ref>``. Any other marker keeps
    its stripped text unchanged.

    Args:
        tags_parts: iterable of ``(marker, text)`` pairs, e.g. the second
            value returned by ``split_tags``.

    Returns:
        A list with one XML string per input pair, in input order.
    """

    def _as_link(found):
        return f'<word_ref><a href="entry://{found.group(0)}">{found.group(0)}</a></word_ref>'

    def _as_plain_ref(found):
        return f'<word_ref>{found.group(0)}</word_ref>'

    # Marker -> per-word replacement; markers absent here get no word markup.
    word_wrappers = {"SIN.": _as_link, "ANT.": _as_link, "FAM.": _as_plain_ref}

    rendered = []
    for marker, body in tags_parts:
        body = body.strip()
        wrapper = word_wrappers.get(marker)
        if wrapper is not None:
            body = re.sub(r'\w+', wrapper, body)
        rendered.append(f'<tag type="{marker.replace(".", "")}"><tag_name>{marker} </tag_name><tag_content>{body}</tag_content></tag>')
    return rendered
def _split_es_cn(text):
    """Split *text* into ``(spanish, chinese)`` at the first Chinese character.

    The Chinese part starts at the first character in U+4E00–U+9FFF or the
    first full-width opening parenthesis. Without such a character the whole
    text is returned as the Spanish part and the Chinese part is empty.
    """
    match = re.search(r'([（\u4e00-\u9fff])', text)
    if not match:
        return text, ""
    cut = match.start()
    return text[:cut].strip(), text[cut:].strip()


def _render_definition(text):
    """Render ``definition[: example]`` text (plus trailing tags) as XML fragments."""
    definition_part, _, example_part = text.partition(':')
    definition_part = definition_part.strip()
    example_part = example_part.strip()

    # Tag sections are split off whichever part comes last on the line.
    if example_part:
        example_part, tags_parts = split_tags(example_part)
    else:
        definition_part, tags_parts = split_tags(definition_part)

    def_es, def_cn = _split_es_cn(definition_part)
    fragments = [f'<def><es>{def_es}</es><cn>{def_cn}</cn></def>']
    if example_part:
        ex_es, ex_cn = _split_es_cn(example_part)
        fragments.append(f'<example><es>{ex_es}</es><cn>{ex_cn}</cn></example>')
    if tags_parts:
        fragments.extend(convert_tags_to_xml(tags_parts))
    return fragments


def _expand_headword(headword_text):
    """Return ``(primary_headword, alternates)`` for a raw headword string."""
    alternates = set()
    headword = headword_text

    if ',' in headword_text:
        # "niño, ña": the first item is the primary form; a shorter item is
        # treated as a replacement ending, grafted on at the last occurrence
        # of its first letter in the primary form.
        first, *others = (piece.strip() for piece in headword_text.split(','))
        headword = first
        for word in others:
            if not word:
                # A trailing or doubled comma yields an empty piece; there is
                # nothing to index (and word[0] would raise IndexError).
                continue
            if len(word) < len(headword):
                pos = headword.rfind(word[0])
                if pos != -1:
                    word = headword[:pos] + word
            alternates.add(word)
    elif '[' in headword_text and ']' in headword_text:
        # "word[variant]": text before the first '[' is the primary form,
        # each bracketed piece is an alternate.
        first, *others = headword_text.replace(']', '').split('[')
        headword = first.strip()
        alternates.update(piece.strip() for piece in others)

    return headword, alternates


def parse_entry(entry):
    """Convert one raw dictionary entry into XML plus lookup metadata.

    ``entry['text']`` is split into non-blank, stripped lines. The first line
    is the headword line (an optional leading ``>`` is dropped); every later
    line is classified by how it starts:

    * ``|``                    - table row, collected verbatim (not put in XML)
    * a glyph from NUMBER_MAP  - numbered sense
    * ``♦`` + number glyph     - numbered sense that also changes the POS
    * ``♦`` + anything else    - sub-entry (``<subword>``)
    * ``SIN.``/``ANT.``/``OBS.``/``FAM.`` - standalone tag line
    * ``【``                    - phrase
    * otherwise                - emitted as ``<text>``

    A warning is printed when sense numbers are not consecutive.

    Args:
        entry: mapping that may hold the raw entry under the key ``'text'``.

    Returns:
        A 4-tuple ``(headword, xml, expanded_words, tables)``:
        the primary headword, the concatenated XML string, a sorted list of
        alternate lookup keys, and a list of tables (each the table's rows
        joined with newlines). An entry with missing or blank text yields
        ``("", "", [], [])`` so callers can always unpack four values.
    """
    if 'text' not in entry or not entry['text']:
        return "", "", [], []

    lines = [line.strip() for line in entry['text'].split('\n') if line.strip()]
    if not lines:
        return "", "", [], []

    # --- Headword line -----------------------------------------------------
    head_line = lines[0]
    if head_line.startswith('>'):
        head_line = head_line[1:].strip()

    # Shapes handled: "word POS ...", "word, word POS" and "【phrase】 POS".
    # The POS part is taken to start at the first run of uppercase/dot chars.
    if head_line.startswith('【'):
        match = re.search(r'^(【.*?】)\s*([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+)*.*)$', head_line)
    else:
        match = re.search(r'^(.*?) ([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+)*.*)$', head_line)

    if match:
        headword_text = match.group(1).strip()
        pos_text = match.group(2).strip()
    else:
        # No uppercase block found: fall back to splitting at the first space.
        headword_text, _, pos_text = head_line.partition(' ')

    headword, expanded_words = _expand_headword(headword_text)

    xml_parts = [f'<headword>{headword_text}</headword><pos>{pos_text}</pos>']

    current_sense_number = 0
    tables = []
    current_table = []

    # --- Body lines --------------------------------------------------------
    for line in lines[1:]:
        # Consecutive '|' lines form one table; any other line closes it.
        if line.startswith('|'):
            current_table.append(line)
            continue
        if current_table:
            tables.append("\n".join(current_table))
            current_table = []

        first_char = line[0]

        # Case A: numbered sense (❶ ...)
        if first_char in NUMBER_MAP:
            num = NUMBER_MAP[first_char]
            if num != current_sense_number + 1:
                print(f"WARNING: Discontinuity in '{headword_text}': Expected {current_sense_number + 1}, found {num} (Line: {line[:20]}...)")
            current_sense_number = num

            xml_parts.append(f'<sense n="{num}"><num>{first_char}</num>')
            xml_parts.extend(_render_definition(line[1:].strip()))
            xml_parts.append('</sense>')

        # Case B/C: diamond line
        elif line.startswith('♦'):
            text_after_diamond = line[1:].strip()
            if not text_after_diamond:
                continue

            first_char_inner = text_after_diamond[0]

            if first_char_inner in NUMBER_MAP:
                # Case B: "♦ ❷ POS definition..." - numbered sense with a new POS
                num = NUMBER_MAP[first_char_inner]
                if num != current_sense_number + 1:
                    print(f"WARNING: Discontinuity in '{headword_text}' (POS Change): Expected {current_sense_number + 1}, found {num}")
                current_sense_number = num

                text_after_num = text_after_diamond[1:].strip()
                match_pos = re.match(r'^([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+(?![a-záéíóúñ]))*)(.*)', text_after_num)

                if match_pos:
                    new_pos = match_pos.group(1).strip()
                    rest = match_pos.group(2).strip()
                    xml_parts.append(f'<sense n="{num}"><diamond>♦ </diamond><num>{first_char_inner}</num><pos>{new_pos}</pos>')
                    xml_parts.extend(_render_definition(rest))
                    xml_parts.append('</sense>')
                else:
                    xml_parts.append(f'<sense n="{num}"><note>POS Parse Error: {line}</note></sense>')
            else:
                # Case C: "♦ subword POS" - sub-entry
                match_sub = re.match(r'^(.*?) ([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+(?![a-záéíóúñ]))*.*)$', text_after_diamond)
                if match_sub:
                    sub_headword = match_sub.group(1).strip()
                    sub_pos = match_sub.group(2).strip()
                else:
                    sub_headword, _, sub_pos = text_after_diamond.partition(' ')

                # Index the sub-entry as written and, for "a, b" forms, also
                # under its first item.
                expanded_words.add(sub_headword)
                if ',' in sub_headword:
                    expanded_words.add(sub_headword.split(',')[0].strip())

                xml_parts.append(f'<subword><diamond>♦ </diamond>{sub_headword}</subword><pos>{sub_pos}</pos>')

        # Case D: standalone tag line (SIN., ANT., OBS., FAM.)
        elif line.startswith(('SIN.', 'ANT.', 'OBS.', 'FAM.')):
            _, tags_parts = split_tags(line)
            xml_parts.extend(convert_tags_to_xml(tags_parts))

        # Case E: phrase 【...】
        elif line.startswith('【'):
            match_phr = re.match(r'^(【.*?】)(.*)', line)
            if match_phr:
                rest = match_phr.group(2).strip()
                phrase_clean = match_phr.group(1).replace('【', '').replace('】', '')
                expanded_words.add(phrase_clean)

                xml_parts.append(f'<phrase><phrase_head>{phrase_clean}</phrase_head>')
                if rest:
                    xml_parts.extend(_render_definition(rest))
                xml_parts.append('</phrase>')
            else:
                # Opening bracket without a closing one.
                xml_parts.append(f'<note>Unparsed Phrase: {line}</note>')

        else:
            xml_parts.append(f'<text>{line}</text>')

    # A table that runs to the last line is still open here.
    if current_table:
        tables.append("\n".join(current_table))

    # Sorted so repeated runs produce identical output (set order varies).
    return headword, "".join(xml_parts), sorted(expanded_words), tables

def write_mdx_source(json_file, output_txt_path):
    """Write an MDX source text file from a parsed-entries JSON file.

    Each entry with a non-empty ``'head'`` produces a record of three lines:
    the headword, the stylesheet link followed by the entry's ``'xml'``, and
    the ``</>`` terminator. Each of its ``'expanded_words'`` (if any) then
    produces a redirect record pointing back to the headword via
    ``@@@LINK=``. Entries with an empty ``'head'`` are skipped.

    Args:
        json_file: path of the JSON file holding a list of entry dicts.
        output_txt_path: path of the text file to create or overwrite.
    """
    with open(json_file, 'r', encoding='utf-8') as source:
        entries = json.load(source)

    stylesheet_link = '<link rel="stylesheet" href="DLEEP.css" />'

    # Open the output file for writing.
    with open(output_txt_path, 'w', encoding='utf-8') as f_out:
        # Walk through every entry in the JSON data.
        for item in entries:
            head = item['head']
            if not head:
                continue

            body = stylesheet_link + item['xml']
            aliases = item.get('expanded_words', [])

            f_out.writelines((head + '\n', body + '\n', '</>\n'))

            for alias in aliases:
                alias = alias.strip()
                f_out.writelines((alias + '\n', f'@@@LINK={head}\n', '</>\n'))

def main():
    """Parse the source dictionary JSON and write the parsed JSON and MDX text.

    Reads the hard-coded input file from the current directory; if it does
    not exist, prints a message and returns. Otherwise every entry is run
    through ``parse_entry`` and gains the keys ``'head'``, ``'xml'``
    (wrapped in ``<entry>``), ``'expanded_words'`` and, when tables were
    found, ``'table'``. The original keys (including ``'text'``) are kept.
    The enriched list is saved as JSON and then converted to MDX source by
    ``write_mdx_source``.
    """
    source_json = r"精选双解西班牙语学习词典_left.json"
    parsed_json = r"精选双解西班牙语学习词典_parsed.json"
    mdx_source = r"精选双解西班牙语学习词典_parsed.mdx.txt"

    if not os.path.exists(source_json):
        print(f"File not found: {source_json}")
        return

    print("Loading JSON...")
    with open(source_json, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    print(f"Parsing {len(records)} entries...")

    for record in records:
        head, xml_str, expanded_list, tables = parse_entry(record)
        # The raw 'text' is deliberately left in place next to the new keys.
        record.update({
            'head': head,
            'xml': f"<entry>{xml_str}</entry>",
            'expanded_words': expanded_list,
        })
        if tables:
            record['table'] = tables

    print("Saving JSON...")
    with open(parsed_json, 'w', encoding='utf-8') as handle:
        json.dump(records, handle, indent=2, ensure_ascii=False)

    print(f"Done! Saved to {parsed_json}")

    write_mdx_source(parsed_json, mdx_source)
    print(f"MDX source saved to {mdx_source}")

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
