import json
import os
import re

# Sense-numbering symbols in order: the first character is sense 1, the
# last (the 27th) is sense 27.
NUMBERS = "❶❷❸❹❺❻❼❽❾❿⓫⓬⓭⓮⓯⓰⓱⓲⓳⓴㉑㉒㉓㉔㉕㉖㉗"
NUMBER_MAP = {char: i + 1 for i, char in enumerate(NUMBERS)}

# Patterns are compiled once at import time because parse_entry() applies
# them to every line of every entry.
_TAG_PREFIXES = ('SIN.', 'ANT.', 'OBS.', 'FAM.')
_TAG_RE = re.compile(r'SIN\.|ANT\.|OBS\.|FAM\.')
# First fullwidth "（" or CJK ideograph: where the Chinese part of a text starts.
_CN_START_RE = re.compile(r'([（\u4e00-\u9fff])')
_HEAD_PHRASE_RE = re.compile(r'^(【.*?】)\s*([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+)*.*)$')
_HEAD_WORD_RE = re.compile(r'^(.*?) ([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+)*.*)$')
_POS_RE = re.compile(r'^([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+(?![a-záéíóúñ]))*)(.*)')
_SUB_ENTRY_RE = re.compile(r'^(.*?) ([A-ZÁÉÍÓÚÑ\.]+(?: [A-ZÁÉÍÓÚÑ\.]+(?![a-záéíóúñ]))*.*)$')
_PHRASE_LINE_RE = re.compile(r'^(【.*?】)(.*)')


def split_tags(line):
    """Split *line* at the SIN./ANT./OBS./FAM. markers.

    Returns:
        ``(prefix, pairs)`` where ``prefix`` is the stripped text before the
        first marker and ``pairs`` is a list of ``(marker, raw_content)``
        tuples in order of appearance. When the line contains no marker the
        line is returned unchanged together with an empty list.
    """
    tag_contents = _TAG_RE.split(line)
    if len(tag_contents) == 1:
        return line, []
    tags = _TAG_RE.findall(line)
    return tag_contents[0].strip(), list(zip(tags, tag_contents[1:]))


def convert_tags_to_xml(tags_parts):
    """Render ``(marker, content)`` pairs as ``"<marker> <content>"`` strings.

    Args:
        tags_parts: pairs as produced by :func:`split_tags`.

    Returns:
        One output string per pair, content stripped of surrounding blanks.
    """
    xml_parts = []
    for tag_name, tag_content in tags_parts:
        tag_content = tag_content.strip()
        if tag_name in ("SIN.", "ANT.", "FAM."):
            # As written, the substitution reproduces each word unchanged.
            # NOTE(review): this looks like a hook for wrapping every word in
            # markup that is absent from this copy of the file -- confirm.
            tag_content = re.sub(r'\w+', lambda m: f'{m.group(0)}', tag_content)
        xml_parts.append(f'{tag_name} {tag_content}')
    return xml_parts


def _split_es_cn(text):
    """Split *text* at the first "（" or CJK character into (before, after)."""
    match = _CN_START_RE.search(text)
    if not match:
        return text, ""
    idx = match.start()
    return text[:idx].strip(), text[idx:].strip()


def _definition_fragments(rest):
    """Return output fragments for a ``definition[: examples][ TAGS]`` text."""
    parts = rest.split(':', 1)
    definition_part = parts[0].strip()
    example_part = parts[1].strip() if len(parts) > 1 else ""

    # Tags trail the examples when there are any, otherwise the definition.
    if example_part:
        example_part, tags_parts = split_tags(example_part)
    else:
        definition_part, tags_parts = split_tags(definition_part)

    def_es, def_cn = _split_es_cn(definition_part)
    fragments = [f'{def_es}{def_cn}']
    if example_part:
        ex_es, ex_cn = _split_es_cn(example_part)
        fragments.append(f'{ex_es}{ex_cn}')
    if tags_parts:
        fragments.extend(convert_tags_to_xml(tags_parts))
    # Empty closing fragment kept from the original output sequence; it adds
    # nothing to the joined string.
    fragments.append('')
    return fragments


def _split_headword_and_pos(head_line):
    """Split the first entry line into (headword text, POS-and-rest text)."""
    pattern = _HEAD_PHRASE_RE if head_line.startswith('【') else _HEAD_WORD_RE
    match = pattern.search(head_line)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    # Fallback: no uppercase POS block found, split at the first space.
    parts = head_line.split(' ', 1)
    return parts[0], (parts[1] if len(parts) > 1 else "")


def _expand_headword(headword_text):
    """Return (primary headword, set of alternative forms) for a head text."""
    headword = headword_text
    expanded = set()
    if ',' in headword:
        for i, word in enumerate(headword.split(',')):
            word = word.strip()
            if i == 0:
                headword = word
            elif not word:
                # A stray comma ("abogado,") yields an empty fragment;
                # indexing it would raise IndexError.
                continue
            elif len(word) < len(headword):
                # A shorter fragment is treated as a replacement ending: it is
                # grafted on at the last occurrence of its first letter
                # ("abogado, da" -> "abogada"). Fragments that cannot be
                # anchored are skipped.
                pos = headword.rfind(word[0])
                if pos != -1:
                    expanded.add(headword[:pos] + word)
            else:
                expanded.add(word)
    elif '[' in headword and ']' in headword:
        first, *variants = headword.replace(']', '').split('[')
        headword = first.strip()
        expanded.update(v.strip() for v in variants if v.strip())
    return headword, expanded


def parse_entry(entry):
    """Parse one raw dictionary entry.

    Args:
        entry: mapping whose ``'text'`` value holds the entry, one logical
            item per line (head line, numbered senses, ``♦`` lines, tag
            lines, ``【...】`` phrases, ``|`` table rows).

    Returns:
        ``(headword, xml, expanded_words, tables)``: the primary headword,
        the concatenated output string, a sorted list of alternative lookup
        keys, and a list of table blocks (consecutive ``|`` lines joined by
        newlines). An entry without usable text yields ``("", "", [], [])``
        so callers can always unpack four values.

    Side effects:
        Prints a warning when sense numbers are not consecutive.
    """
    if not entry.get('text'):
        return "", "", [], []

    lines = [line.strip() for line in entry['text'].split('\n') if line.strip()]
    if not lines:
        return "", "", [], []

    # 1. Head line: optional leading '>' marker, then headword and POS.
    head_line = lines[0]
    if head_line.startswith('>'):
        head_line = head_line[1:].strip()
    headword_text, pos_text = _split_headword_and_pos(head_line)
    headword, expanded_words = _expand_headword(headword_text)

    # NOTE(review): the format strings in this function contain no markup;
    # they may have lost surrounding tags -- confirm against the original.
    xml_parts = [f'{headword_text}{pos_text}']
    current_sense_number = 0
    tables = []
    current_table = []

    # 2. Body lines.
    for line in lines[1:]:
        # Table rows are collected verbatim; any other line closes the table.
        if line.startswith('|'):
            current_table.append(line)
            continue
        if current_table:
            tables.append("\n".join(current_table))
            current_table = []

        first_char = line[0]

        # Case A: numbered sense (❶ ...)
        if first_char in NUMBER_MAP:
            num = NUMBER_MAP[first_char]
            if num != current_sense_number + 1:
                print(f"WARNING: Discontinuity in '{headword_text}': Expected {current_sense_number + 1}, found {num} (Line: {line[:20]}...)")
            current_sense_number = num

            xml_parts.append(f'{first_char}')
            xml_parts.extend(_definition_fragments(line[1:].strip()))

        # Case B/C: diamond line (♦)
        elif line.startswith('♦'):
            text_after_diamond = line[1:].strip()
            if not text_after_diamond:
                continue

            first_char_inner = text_after_diamond[0]
            if first_char_inner in NUMBER_MAP:
                # Case B: "♦ ❷ POS ..." -- a numbered sense under a new POS.
                num = NUMBER_MAP[first_char_inner]
                if num != current_sense_number + 1:
                    print(f"WARNING: Discontinuity in '{headword_text}' (POS Change): Expected {current_sense_number + 1}, found {num}")
                current_sense_number = num

                match_pos = _POS_RE.match(text_after_diamond[1:].strip())
                if match_pos:
                    new_pos = match_pos.group(1).strip()
                    num_and_diamond = f"♦ {first_char_inner}"
                    xml_parts.append(f'{num_and_diamond}{new_pos}')
                    xml_parts.extend(
                        _definition_fragments(match_pos.group(2).strip())
                    )
                else:
                    xml_parts.append(f'POS Parse Error: {line}')
            else:
                # Case C: "♦ sub-entry POS ..."
                match_sub = _SUB_ENTRY_RE.match(text_after_diamond)
                if match_sub:
                    sub_headword = match_sub.group(1).strip()
                    sub_pos = match_sub.group(2).strip()
                else:
                    parts = text_after_diamond.split(' ', 1)
                    sub_headword = parts[0]
                    sub_pos = parts[1] if len(parts) > 1 else ""

                expanded_words.add(sub_headword)
                if ',' in sub_headword:
                    expanded_words.add(sub_headword.split(',')[0].strip())
                xml_parts.append(f'♦ {sub_headword}{sub_pos}')

        # Case D: line that starts with a tag (SIN., ANT., OBS., FAM.)
        elif line.startswith(_TAG_PREFIXES):
            _, tags_parts = split_tags(line)
            xml_parts.extend(convert_tags_to_xml(tags_parts))

        # Case E: phrase 【...】
        elif line.startswith('【'):
            match_phr = _PHRASE_LINE_RE.match(line)
            if match_phr:
                phrase_clean = match_phr.group(1).replace('【', '').replace('】', '')
                expanded_words.add(phrase_clean)
                xml_parts.append(f'{phrase_clean}')

                rest = match_phr.group(2).strip()
                if rest:
                    xml_parts.extend(_definition_fragments(rest))
            else:
                xml_parts.append(f'Unparsed Phrase: {line}')

        else:
            xml_parts.append(f'{line}')

    # A table that runs to the end of the entry is still open here.
    if current_table:
        tables.append("\n".join(current_table))

    # Sorted so the output is identical from run to run (set order is not).
    return headword, "".join(xml_parts), sorted(expanded_words), tables


def write_mdx_source(json_file, output_txt_path):
    """Write the parsed entries in *json_file* as an MDX source text file.

    For every entry with a non-empty ``'head'`` the headword, its ``'xml'``
    string and a blank line are written, followed by one
    ``key / @@@LINK=<headword> / blank line`` group per expanded word.

    Args:
        json_file: path of the JSON list produced by :func:`main`.
        output_txt_path: path of the text file to create or overwrite.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        words = json.load(f)

    # NOTE(review): records are terminated by an empty line only. MDX source
    # files normally separate records with a "</>" line -- confirm that this
    # separator was not lost from the string literals below.
    with open(output_txt_path, 'w', encoding='utf-8') as f_out:
        # Walk every entry of the JSON data.
        for word in words:
            headword = word.get('head')
            if not headword:
                continue

            f_out.write(headword + '\n')
            f_out.write(word.get('xml', '') + '\n')
            f_out.write('\n')

            for expanded_word in word.get('expanded_words', []):
                expanded_word = expanded_word.strip()
                # An empty key or a link from a headword to itself would only
                # add a broken or circular record.
                if not expanded_word or expanded_word == headword:
                    continue
                f_out.write(expanded_word + '\n')
                f_out.write(f'@@@LINK={headword}\n')
                f_out.write('\n')


def main():
    """Parse the input JSON, save the enriched JSON, then the MDX source."""
    input_path = r"精选双解西班牙语学习词典_left.json"
    output_json_path = r"精选双解西班牙语学习词典_parsed.json"
    output_mdx_path = r"精选双解西班牙语学习词典_parsed.mdx.txt"

    if not os.path.exists(input_path):
        print(f"File not found: {input_path}")
        return

    print("Loading JSON...")
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Parsing {len(data)} entries...")
    for entry in data:
        head, xml_str, expanded_list, tables = parse_entry(entry)
        entry['head'] = head
        entry['xml'] = xml_str
        entry['expanded_words'] = expanded_list
        if tables:
            entry['table'] = tables
        # The raw 'text' field is kept alongside the derived fields.

    print("Saving JSON...")
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=2, ensure_ascii=False)
    print(f"Done! Saved to {output_json_path}")

    write_mdx_source(output_json_path, output_mdx_path)
    print(f"MDX source saved to {output_mdx_path}")


if __name__ == '__main__':
    main()