import json
import re
from xml.sax.saxutils import escape

# Path of the JSON file that main() loads; its top-level value is iterated
# item by item and each item is handed to parse_entry().
INPUT_FILE = r'西班牙语习语词典_left.json'
# Path of the JSON file that main() writes the parsed entries to.
OUTPUT_FILE = r'西班牙语习语词典_parsed.json'

def is_cjk(char):
    """Return True when *char* sorts within the range U+4E00..U+9FFF.

    That range is the basic CJK Unified Ideographs block; CJK punctuation
    and the extension blocks fall outside it and yield False.
    """
    lowest, highest = '\u4e00', '\u9fff'
    return not (char < lowest or char > highest)

def split_example_line(line):
    """Split an example line into its Spanish and Chinese halves.

    Heuristic: the first character in U+4E00..U+9FFF (basic CJK Unified
    Ideographs) marks the start of the Chinese translation. Any run of
    opening punctuation immediately before that character is treated as
    part of the Chinese half as well.

    Args:
        line: One example line, Spanish text followed by Chinese text.

    Returns:
        A ``(spanish, chinese)`` tuple of stripped strings. When the line
        contains no CJK ideograph, ``chinese`` is the empty string.
    """
    first_cjk = re.search(r'[\u4e00-\u9fff]', line)
    if first_cjk is None:
        return line.strip(), ""

    split_index = first_cjk.start()

    # Walk back over *every* consecutive opener, not just one, so nested
    # punctuation such as （“…”） stays attached to the Chinese text.
    openers = ('“', '‘', '（', '【', '《')
    while split_index > 0 and line[split_index - 1] in openers:
        split_index -= 1

    return line[:split_index].strip(), line[split_index:].strip()

# Leading example marker: "例", optionally followed by an ASCII or
# full-width colon, with optional whitespace around the colon.
_EXAMPLE_MARKER_RE = re.compile(r'^例\s*[:：]?\s*')


def _strip_example_marker(line):
    """Remove a leading "例" / "例:" / "例：" marker and surrounding whitespace."""
    return _EXAMPLE_MARKER_RE.sub('', line, count=1).strip()


def _build_entry_xml(headword, definition, explanation_lines, examples):
    """Render the parsed fields as one unindented ``<entry>`` XML string."""
    parts = ['<entry>', f'<hw>{escape(headword)}</hw>']

    if definition:
        parts.append(f'<def>{escape(definition)}</def>')

    if explanation_lines:
        parts.append('<expl>')
        for paragraph in explanation_lines:
            # Each paragraph starts with two full-width (U+3000) spaces.
            indented = '\u3000\u3000' + paragraph
            parts.append(f'<p>{escape(indented)}</p>')
        parts.append('</expl>')

    if examples:
        parts.append('<examples>')
        for ex in examples:
            parts.append('<ex>')
            if ex['es']:
                parts.append(f'<es>{escape(ex["es"])}</es>')
            if ex['cn']:
                parts.append(f'<cn>{escape(ex["cn"])}</cn>')
            parts.append('</ex>')
        parts.append('</examples>')

    parts.append('</entry>')
    return "".join(parts)


def parse_entry(entry):
    """Parse one raw dictionary entry in place and return it.

    The entry's ``text`` is split into non-empty stripped lines. The first
    line yields the definition (the text after a leading ``**headword**``
    if present, otherwise the whole line; if that is empty the second line
    is used instead). Remaining lines are explanation paragraphs until a
    line starting with "例" is seen; from then on every line is an example,
    which is split into Spanish/Chinese parts by ``split_example_line``.

    Args:
        entry: Mapping with optional ``text`` and ``headword`` keys. A
            missing or ``None`` value is treated as an empty string.

    Returns:
        The same ``entry`` object with these keys set:
        ``definition`` (str), ``explanation`` (list of str),
        ``examples`` (list of ``{'es': str, 'cn': str}``) and ``xml`` (str;
        empty when the entry has no text lines).
    """
    # JSON null arrives as None; treat it like a missing field instead of
    # failing on None.split / escape(None).
    text = entry.get('text') or ''
    headword_text = entry.get('headword') or ''

    lines = [stripped for stripped in (raw.strip() for raw in text.split('\n')) if stripped]
    if not lines:
        entry['xml'] = ''
        entry['definition'] = ''
        entry['explanation'] = []
        entry['examples'] = []
        return entry

    # 1. First line: "**headword** definition". Only the definition part is
    #    taken from the text; the headword comes from the entry itself.
    match = re.match(r'^\*\*(.*?)\*\*\s*(.*)', lines[0])
    definition = match.group(2).strip() if match else lines[0]

    body_start = 1
    if not definition and len(lines) > 1:
        # Headword stood alone on its line; the definition is on the next one.
        definition = lines[1]
        body_start = 2

    # 2. Remaining lines: explanation paragraphs, then examples.
    # NOTE(review): any line beginning with "例" (including words such as
    # "例如") switches to example mode, as before - confirm against the data.
    explanation_lines = []
    example_lines = []
    in_examples = False
    for line in lines[body_start:]:
        if line.startswith('例'):
            in_examples = True
            # Strip the marker together with its optional colon; previously
            # "例:" left a stray ":" at the start of the example.
            line = _strip_example_marker(line)

        if not in_examples:
            explanation_lines.append(line)
        elif line:
            # A bare marker line carries no example text; skip it.
            example_lines.append(line)

    # 3. Split each example into its Spanish and Chinese halves.
    structured_examples = []
    for ex_line in example_lines:
        es, cn = split_example_line(ex_line)
        if es or cn:
            structured_examples.append({'es': es, 'cn': cn})

    # 4. Generate XML.
    xml_str = _build_entry_xml(
        headword_text, definition, explanation_lines, structured_examples
    )

    # Update the entry (key order kept stable for the JSON output).
    entry['definition'] = definition
    entry['explanation'] = explanation_lines
    entry['examples'] = structured_examples
    entry['xml'] = xml_str

    return entry

def main():
    """Convert INPUT_FILE into OUTPUT_FILE.

    Loads the JSON in INPUT_FILE, runs ``parse_entry`` on every item, and
    writes the successfully parsed items to OUTPUT_FILE as indented,
    non-ASCII-escaped JSON. An item that fails to parse is reported on
    stdout and left out of the output; any other failure (unreadable
    file, invalid JSON, write error) is reported on stdout and ends the
    run without raising.
    """
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as f:
            data = json.load(f)

        parsed_data = []
        for item in data:
            try:
                parsed_data.append(parse_entry(item))
            except Exception as e:
                # The item may not be a dict at all (a likely reason it
                # failed), so look the headword up defensively; calling
                # item.get() unconditionally here would raise again and
                # abort the whole run.
                if isinstance(item, dict):
                    headword = item.get('headword', 'unknown')
                else:
                    headword = 'unknown'
                print(f"Error parsing item {headword}: {e}")

        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            json.dump(parsed_data, f, ensure_ascii=False, indent=2)

        print(f"Processed {len(parsed_data)} entries. Output saved to {OUTPUT_FILE}")
    except Exception as e:
        # Top-level boundary of the script: report and exit quietly.
        print(f"Error: {e}")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
