import json
import re
import os
from bs4 import BeautifulSoup

# Parser state shared across parse_txt() calls, so that several input files
# can be parsed one after another into a single, continuously numbered list.
words = []        # every parsed entry, in reading order
current_no = 0    # 0-based running index given to the next entry
current_page = 0  # number of the most recently accepted page marker


def parse_txt(file_path):
    """Parse one dictionary text dump and append its entries to ``words``.

    Each non-blank line is classified, in this order, as:

    * a page marker -- the whole line is ``<N>``.  It is accepted only when
      ``N`` is exactly one more than the current page; otherwise a
      "page error" is printed and the page counter is left unchanged.
      Either way the per-page entry counter restarts at 1.
    * a sub-entry -- the line starts with two spaces or a tab.  Its stripped
      text is appended to the current entry's ``'subwords'`` list.
    * a headword line -- the line starts with an optional run of digits,
      lowercase Greek letters, ``-`` or ``,`` followed by ASCII letters,
      spaces or dots.  A new entry dict with the keys ``page``, ``index``,
      ``id``, ``text`` and ``headword`` is created and appended.
    * a continuation -- anything else.  The line is concatenated (without a
      separator) onto the current entry's text, or onto its last sub-entry
      if a sub-entry was the most recent thing read.

    Sub-entry and continuation lines that appear before any entry exists are
    reported on stdout and skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        The module-level ``words`` list (the same object, extended in place).
    """
    global current_no, words, current_page

    page_marker_re = re.compile(r'<(\d+)>')
    headword_re = re.compile(r'([0-9α-ω\-,]+)?[a-zA-Z \.]+')

    page_no = 0
    # Continue from the last entry of a previously parsed file, if any, so
    # that leading sub-entries/continuations attach to it.
    curword = words[-1] if words else None

    with open(file_path, 'r', encoding='utf-8') as file:
        print(f"--- Reading {file_path} ---")
        lines = file.read().splitlines()

    # True while continuation lines belong to the entry text, False while
    # they belong to the entry's most recent sub-entry.
    isword = True
    for line in lines:
        if line.strip() == '':
            continue

        if match := page_marker_re.fullmatch(line):
            pagenum = int(match.group(1))
            if pagenum != current_page + 1:
                print("page error", current_page, pagenum)
            else:
                current_page += 1
            page_no = 1
        elif line.startswith('  ') or line.startswith('\t'):
            if curword is None:
                print("indent but no current word", line)
            else:
                isword = False
                curword.setdefault('subwords', []).append(line.strip())
        elif match := headword_re.match(line):
            # NOTE(review): the matched headword may end in whitespace
            # because the character class includes a space -- confirm
            # whether consumers expect it trimmed.
            curword = {
                'page': current_page,
                'index': current_no,
                'id': f'{current_page}.{page_no}',
                'text': line,
                'headword': match.group(0),
            }
            words.append(curword)
            page_no += 1
            current_no += 1
            isword = True
        elif curword is None:
            # A continuation line with nothing to continue: report it
            # instead of crashing on ``None['text']``.
            print("continuation but no current word", line)
        else:
            print(line)
            if isword:
                curword['text'] += line
            else:
                curword['subwords'][-1] += line
    return words

def create_mdx_source(words, output_txt_path):
    """Write the parsed entries to a text file as ``</>``-terminated records.

    For every entry three things are written: the headword on its own line,
    the entry text (with the leading Latin part wrapped in ``<b>...</b>`` and
    each sub-entry appended after a ``<br>``), and a ``</>`` terminator line.
    For every sub-entry that starts with Latin text, an additional record is
    written that redirects that text to the headword via ``@@@LINK=``.

    Headwords and link keys are stripped of surrounding whitespace before
    being written, and link keys that are empty or identical to the headword
    are skipped so no record redirects to itself.

    Args:
        words: Iterable of entry dicts with ``'headword'`` and ``'text'``
            keys and an optional ``'subwords'`` list of strings.
        output_txt_path: Path of the UTF-8 text file to create or overwrite.
    """
    # NOTE(review): the negative lookahead can never reject the greedy match,
    # because every character it could match (space, f/m/n/p, dot) is already
    # in the preceding character class -- confirm whether the grammatical
    # marker was meant to stay outside the bold span.
    bold_re = re.compile(r'^(([0-9α-ω\-,]+)?[a-zA-Z \.\(\),]+)(?! *\b(f|m|n|pl)\.)')
    link_key_re = re.compile(r'^([a-zA-Z \.\(\),]+) *')

    # Open the output file for writing.
    with open(output_txt_path, 'w', encoding='utf-8') as f_out:
        # Walk through every dictionary entry.
        for entry in words:
            # Trim the padding the parser's regex may have captured; fall
            # back to the raw value rather than emit an empty key line.
            headword = entry['headword'].strip() or entry['headword']
            text = bold_re.sub(r'<b>\1</b>', entry['text']).strip()

            link_keys = []
            for subword in entry.get('subwords', ()):
                match = link_key_re.match(subword)
                if match:
                    # The character class includes a space, so the captured
                    # group carries trailing whitespace; remove it.
                    key = match.group(1).strip()
                    if key and key != headword:
                        link_keys.append(key)
                text += '\n<br>    ' + subword

            f_out.write(headword + '\n')
            f_out.write(text + '\n')
            f_out.write('</>\n')

            for key in link_keys:
                f_out.write(key + '\n')
                f_out.write(f'@@@LINK={headword}\n')
                f_out.write('</>\n')

    print(f"处理完成！MDX源文件已成功生成并保存到 '{output_txt_path}'。")



if __name__ == "__main__":
    # Script entry point: parse the dictionary text dump, then write the
    # record file built from the parsed entries.
    source_path = '拉汉科技词典_quark.txt'
    target_path = '拉汉科技词典.mdx.txt'
    words = parse_txt(source_path)
    create_mdx_source(words, target_path)