#!/usr/bin/env python3
import sqlite3
import os
import re
import sys
from pathlib import Path
from collections import defaultdict, Counter
from concurrent.futures import ProcessPoolExecutor, as_completed
from bs4 import BeautifulSoup

# --- Improved core logic: collect and merge the part-of-speech tags of each main entry ---
def load_primary_meta(conn, needed_display_ids):
    """Collect merged part-of-speech tags and a short gloss per display id.

    Rows of ``t_query_index`` with ``priority <= 2`` are read in
    ``(display_id, priority)`` order, left-joined to ``t_translation``.
    Every distinct non-blank ``posg`` value seen for a display id is kept;
    the gloss is taken from the first row (lowest priority number) whose
    translation still has text after markup is stripped.

    Args:
        conn: Open ``sqlite3`` connection providing ``t_query_index`` and
            ``t_translation``.
        needed_display_ids: Collection of display ids to report on. Rows
            whose display id is not in this collection are ignored.

    Returns:
        A dict with one key per id in ``needed_display_ids``, mapping to a
        ``(pos, gloss)`` tuple. ``pos`` is the sorted tags joined by single
        spaces (e.g. ``"n. vt."``), ``gloss`` is at most 15 characters
        followed by ``"..."`` when truncated. Both are ``""`` when nothing
        was found.
    """
    # Priorities 1 and 2 cover the main entry and its important variants.
    cursor = conn.execute("""
        SELECT q.display_id, q.posg, t.translation, q.priority
        FROM t_query_index AS q
        LEFT JOIN t_translation AS t ON t.translation_id = q.translation_id
        WHERE q.priority <= 2
        ORDER BY q.display_id, q.priority ASC
    """)

    pos_collect = defaultdict(set)
    gloss_collect = {}

    for raw_id, posg, translation, _priority in cursor:
        # A NULL or non-numeric display_id cannot match any requested entry;
        # skip the row instead of letting int() abort the whole export.
        try:
            display_id = int(raw_id)
        except (TypeError, ValueError):
            continue
        if display_id not in needed_display_ids:
            continue

        # Gather every tag seen so a main entry lacking its own tag still
        # inherits the ones recorded on its variants.
        if posg:
            clean_pos = posg.strip()
            if clean_pos:
                pos_collect[display_id].add(clean_pos)

        # Rows arrive best-priority first, so the first usable gloss wins.
        if translation and display_id not in gloss_collect:
            text = BeautifulSoup(translation, "lxml").get_text().strip()
            # Markup-only translations yield no text; leave the slot open so
            # a later row can still provide a gloss.
            if text:
                if len(text) > 15:
                    text = text[:15] + "..."
                gloss_collect[display_id] = text

    # .get() keeps the defaultdict from growing while we only read from it.
    return {
        d_id: (
            " ".join(sorted(pos_collect.get(d_id, ()))),
            gloss_collect.get(d_id, ""),
        )
        for d_id in needed_display_ids
    }

# --- Basic utility functions ---
def normalize_display_text(value):
    """Return *value* as plain headword text.

    ``<homo>``/``</homo>`` markers are rewritten as ``〔``/``〕``, any
    remaining markup is removed and surrounding whitespace is trimmed.

    Args:
        value: Headword text. Falsy values give ``""``. ``bytes`` are decoded
            as UTF-8 and other non-string values (for example a numeric id
            used as a fallback title) are converted with ``str()``.

    Returns:
        The cleaned text.
    """
    if not value:
        return ""
    # Callers may hand over raw database values; coerce them rather than
    # failing on the str-only .replace() calls below.
    if isinstance(value, bytes):
        value = value.decode("utf-8", errors="replace")
    elif not isinstance(value, str):
        value = str(value)

    value = value.replace("<homo>", "〔").replace("</homo>", "〕")

    # With no tag or entity left there is nothing for the HTML parser to do,
    # so skip building a parse tree for the common plain-word case.
    if "<" not in value and "&" not in value:
        return value.strip()
    return BeautifulSoup(value, "lxml").get_text().strip()

def clean_fragment(raw_fragment):
    """Return the HTML of *raw_fragment* with unwanted elements removed.

    The fragment is parsed with the ``lxml`` parser, every ``script`` and
    ``style`` element and every element with class ``favoriteBtn`` or
    ``adviseBtn`` is dropped, and the remaining markup is returned as a
    string: the inner HTML of ``<body>`` when the parser produced one,
    otherwise the whole parsed document.
    """
    document = BeautifulSoup(raw_fragment, "lxml")

    # Strip scripts, styles and the interactive buttons; the rest of the
    # HTML is kept as markup.
    unwanted_nodes = document.select("script, style, .favoriteBtn, .adviseBtn")
    for node in unwanted_nodes:
        node.decompose()

    body = document.body
    if not body:
        return str(document)
    return str(body.decode_contents())

# --- Main conversion flow ---
def main():
    """Convert ``dictx.db`` plus the ``html/`` fragments into a text file.

    All paths are resolved relative to the directory containing this script.
    For every row of ``t_html_offset`` the HTML slice is read from
    ``html/<file_name>.html`` and written to ``英汉大词典_output.txt`` as one
    record: a title line (headword, plus the merged part-of-speech tags in
    parentheses when any exist), a stylesheet ``<link>`` line, the cleaned
    HTML, and a ``</>`` terminator line.

    Prints progress messages; returns early with an error message when the
    database file is missing. Entries whose HTML file does not exist are
    skipped and their count is reported at the end.
    """
    # Inputs and output live next to this script.
    base_path = Path(__file__).parent
    db_path = base_path / "dictx.db"
    html_dir = base_path / "html"
    output_file = base_path / "英汉大词典_output.txt"

    if not db_path.exists():
        print(f"错误：找不到数据库 {db_path}")
        return

    print(f"正在连接数据库: {db_path}")
    conn = sqlite3.connect(db_path)
    try:
        # Location of each entry's HTML slice: file, byte offset and size.
        offsets = conn.execute(
            "SELECT word_id, data_offset, data_size, file_name FROM t_html_offset"
        ).fetchall()
        print(f"找到 {len(offsets)} 个词条。")

        # word_id -> (display_id, display_word)
        display_data = {
            word_id: (display_id, display_word)
            for word_id, display_id, display_word in conn.execute(
                "SELECT word_id, display_id, display_word FROM t_display_word"
            )
        }

        # Merged part-of-speech tags for every display id in use.
        needed_ids = {display_id for display_id, _ in display_data.values()}
        meta_info = load_primary_meta(conn, needed_ids)
    finally:
        # Everything needed is in memory now; release the database even if
        # one of the queries above failed.
        conn.close()

    print("开始生成词典正文...")
    skipped = 0
    with open(output_file, "w", encoding="utf-8") as f:
        for word_id, offset, size, file_name in offsets:
            h_path = html_dir / f"{file_name}.html"
            if not h_path.exists():
                skipped += 1
                continue

            # Read only this entry's slice of the HTML file.
            with open(h_path, "rb") as hf:
                hf.seek(offset)
                raw_html = hf.read(size).decode("utf-8", errors="replace")

            # Entries without a display row fall back to the word id as
            # title; it is passed as text because the title is cleaned with
            # string operations.
            d_id, d_word = display_data.get(word_id, (0, str(word_id)))
            clean_title = normalize_display_text(d_word)
            pos, _gloss = meta_info.get(d_id, ("", ""))

            # Append the part-of-speech tags in parentheses when present.
            final_title = f"{clean_title} ({pos})" if pos else clean_title

            # One record: title, stylesheet link, body, terminator.
            f.write(f"{final_title}\n")
            f.write('<link rel="stylesheet" href="yinghanda.css">\n')
            f.write(clean_fragment(raw_html))
            f.write("\n</>\n")

    if skipped:
        print(f"警告：有 {skipped} 个词条因找不到 HTML 文件而被跳过。")
    print(f"转换成功！生成文件: {output_file}")

# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()