import re
import tkinter as tk
from tkinter import filedialog, messagebox, ttk # 修复ttk导入问题
from bs4 import BeautifulSoup, Comment
import threading
import os
def extract_whatis_content(html_content):
    """Extract the "What is" panel from an entry's HTML, keeping its markup.

    The panel is the first ``<div>`` that BeautifulSoup reports as matching
    a class filter requiring both ``tw-bg-orange-light`` and ``tw-mx-4`` to
    appear in the class value. HTML comments inside the panel are removed
    before it is serialized.

    Args:
        html_content: HTML source of one dictionary entry.

    Returns:
        The panel's HTML as a stripped string, or ``None`` when no matching
        ``<div>`` exists or any exception is raised along the way.
    """
    def _has_whatis_markers(class_value):
        # Both marker names must be present in the class value under test.
        return (
            class_value
            and 'tw-bg-orange-light' in class_value
            and 'tw-mx-4' in class_value
        )

    def _is_comment(node):
        return isinstance(node, Comment)

    try:
        document = BeautifulSoup(html_content, 'html.parser')
        panel = document.find('div', class_=_has_whatis_markers)
        if not panel:
            return None

        # Drop every HTML comment so only real markup is returned.
        for stale_comment in panel.find_all(string=_is_comment):
            stale_comment.extract()

        # Serialize the tag back to its HTML text.
        serialized = str(panel)
        return serialized.strip()
    except Exception:
        # Best effort: any failure is reported to the caller as "nothing found".
        return None
# Matches a complete <link rel="stylesheet" ... href="....css"> tag. The
# pattern itself requires the closing ">", so a match never needs repairing.
_CSS_LINK_RE = re.compile(
    r'<link\s+rel=[\'"]stylesheet[\'"].*?href=[\'"][^\'"]+\.css[\'"]\s*/?>'
)


def _is_same_file(path_a, path_b):
    """Return True when both paths refer to the same file on disk."""
    try:
        return os.path.samefile(path_a, path_b)
    except OSError:
        # At least one path does not exist yet; compare normalized names.
        normalized_a = os.path.normcase(os.path.realpath(path_a))
        normalized_b = os.path.normcase(os.path.realpath(path_b))
        return normalized_a == normalized_b


def process_mdx_file(input_path, output_path, progress_callback):
    """Convert a tab-separated dictionary source into three-line entries.

    Each non-blank input line is expected to be ``headword<TAB>html``. For
    every line whose HTML contains a "What is" panel (as found by
    ``extract_whatis_content``), three lines are written to the output:

    1. the headword,
    2. the stylesheet ``<link>`` tag (if one was found) followed by the
       panel HTML and the literal ``<br></>``,
    3. the literal ``</>``.

    Blank lines, lines without a tab and lines without a panel are skipped.

    Args:
        input_path: Path of the UTF-8 source file to read.
        output_path: Path of the UTF-8 file to create or overwrite. It must
            not be the same file as ``input_path``: opening it for writing
            would truncate the source before it is read.
        progress_callback: Called as ``progress_callback(processed, total)``
            after every 100th written entry, where ``processed`` is the
            number of entries written so far and ``total`` is the number of
            lines in the input file.

    Returns:
        ``(processed, None)`` on success, or ``(0, message)`` when an error
        occurred; the function never raises.
    """
    try:
        if _is_same_file(input_path, output_path):
            return 0, "输入文件和输出文件不能是同一个文件"

        # First pass: count lines so progress can be reported as a ratio.
        with open(input_path, 'r', encoding='utf-8') as f:
            total_lines = sum(1 for _ in f)

        processed = 0
        with open(input_path, 'r', encoding='utf-8') as f_in, \
                open(output_path, 'w', encoding='utf-8') as f_out:
            for line in f_in:
                # Skip blank lines.
                if not line.strip():
                    continue

                # Split the headword from the entry body at the first tab.
                word, separator, content = line.partition('\t')
                if not separator:
                    continue

                css_match = _CSS_LINK_RE.search(content)
                css_tag = css_match.group(0) if css_match else ""

                whatis_content = extract_whatis_content(content)
                if not whatis_content:
                    continue

                # Emit the entry in the three-line format.
                f_out.write(f"{word}\n")
                f_out.write(f"{css_tag}{whatis_content}<br></>\n")
                f_out.write("</>\n")
                processed += 1

                # Report only right after the counter changed, so the
                # callback cannot fire repeatedly for the same count.
                if processed % 100 == 0:
                    progress_callback(processed, total_lines)

        return processed, None
    except Exception as e:
        # This runs on a worker thread; errors are handed back as text so
        # the caller can show them instead of the thread dying silently.
        return 0, str(e)
def process_file_thread(input_path, output_path, status_var, progress_bar, result_label, process_btn):
    """Run the extraction and reflect its progress and outcome in the GUI.

    Intended to run on a background thread. The button is disabled while
    the file is processed; afterwards a message box reports either the
    error text or the number of entries written, and the status text,
    progress bar, button and result label are reset.

    NOTE(review): the widgets and ``root`` are touched directly from this
    function; if it really runs off the main thread, confirm that this is
    safe with the Tk build in use.

    Args:
        input_path: Source file handed to ``process_mdx_file``.
        output_path: Destination file handed to ``process_mdx_file``.
        status_var: Variable holding the status line text.
        progress_bar: Progress bar whose ``'value'`` is set to a percentage.
        result_label: Label whose text is cleared when processing ends.
        process_btn: Button that is disabled during processing.
    """
    process_btn.config(state=tk.DISABLED)
    status_var.set("处理中...")

    def report(done, line_total):
        # Guard against an empty input file (no lines to divide by).
        if line_total > 0:
            percent = int((done / line_total) * 100)
        else:
            percent = 0
        progress_bar['value'] = percent
        status_var.set(f"已处理: {done} 个词条")
        root.update_idletasks()

    try:
        outcome = process_mdx_file(input_path, output_path, report)
        processed_count, error = outcome
        if not error:
            success_text = (
                f"成功处理 {processed_count} 个词条!\n"
                f"输出文件已保存至:\n{output_path}"
            )
            messagebox.showinfo("处理完成", success_text)
        else:
            messagebox.showerror("处理错误", f"处理过程中发生错误:\n{error}")
    finally:
        # Always return the UI to its idle state, even if reporting failed.
        status_var.set("准备就绪")
        progress_bar['value'] = 0
        process_btn.config(state=tk.NORMAL)
        result_label.config(text="")
def select_and_process(status_var, progress_bar, result_label, process_btn):
    """Ask for the input and output files, then start processing.

    Shows an "open" dialog followed by a "save as" dialog. If either one is
    cancelled nothing happens; otherwise ``process_file_thread`` is started
    on a daemon thread with the chosen paths and the given widgets.

    Args:
        status_var: Passed through to ``process_file_thread``.
        progress_bar: Passed through to ``process_file_thread``.
        result_label: Passed through to ``process_file_thread``.
        process_btn: Passed through to ``process_file_thread``.
    """
    dialog_file_types = [("Text files", "*.txt"), ("All files", "*.*")]

    input_file = filedialog.askopenfilename(
        title="选择原始词典文件",
        filetypes=dialog_file_types,
    )
    if not input_file:
        return

    output_file = filedialog.asksaveasfilename(
        title="保存处理后的词典文件",
        defaultextension=".txt",
        filetypes=dialog_file_types,
    )
    if output_file:
        # Do the work on a daemon thread so the dialog call returns at once.
        worker_args = (
            input_file,
            output_file,
            status_var,
            progress_bar,
            result_label,
            process_btn,
        )
        worker = threading.Thread(
            target=process_file_thread,
            args=worker_args,
            daemon=True,
        )
        worker.start()
# Build the GUI. Widgets are packed in the order they are created below.
_WINDOW_BG = "#f5f5f5"
_HEADER_BG = "#4a6572"
_FONT_FAMILY = "Microsoft YaHei"

root = tk.Tk()
root.title("MDX词典洗版工具 - What is栏目提取")
root.geometry("600x450")
root.configure(bg=_WINDOW_BG)

# Header banner
title_frame = tk.Frame(root, bg=_HEADER_BG, height=90)
title_frame.pack(fill="x", side="top", pady=(0, 10))
tk.Label(
    title_frame,
    text="MDX词典洗版工具",
    font=(_FONT_FAMILY, 16, "bold"),
    fg="white",
    bg=_HEADER_BG,
    pady=20,
).pack(fill="x")

# Feature description
info_frame = tk.Frame(root, bg=_WINDOW_BG, padx=20, pady=10)
info_frame.pack(fill="both", expand=True, padx=20, pady=5)
tk.Label(
    info_frame,
    text="功能说明:",
    font=(_FONT_FAMILY, 11, "bold"),
    bg=_WINDOW_BG,
    anchor="w",
).pack(fill="x", pady=(0, 5))

_INFO_LINES = [
    "1. 提取词典中的'What is'栏目内容",
    "2. 保留原始样式和格式",
    "3. 生成三行格式的词条:",
    " 第一行: 词头",
    " 第二行: CSS链接 + What is内容 + <br></>",
    " 第三行: </>",
    "",
    "4. 优化大文件处理性能",
]
info_text = tk.Label(
    info_frame,
    text="\n".join(_INFO_LINES),
    font=(_FONT_FAMILY, 9),
    bg=_WINDOW_BG,
    justify="left",
    anchor="w",
)
info_text.pack(fill="x", pady=(0, 15))

# Status line, progress bar and result line
progress_frame = tk.Frame(root, bg=_WINDOW_BG)
progress_frame.pack(fill="x", padx=20, pady=5)

status_var = tk.StringVar(value="准备就绪")
status_label = tk.Label(
    progress_frame,
    textvariable=status_var,
    font=(_FONT_FAMILY, 9),
    bg=_WINDOW_BG,
    anchor="w",
)
status_label.pack(fill="x", pady=(0, 5))

progress_bar = ttk.Progressbar(
    progress_frame,
    orient="horizontal",
    length=500,
    mode="determinate",
)
progress_bar.pack(fill="x", pady=5)

result_label = tk.Label(
    progress_frame,
    text="",
    font=(_FONT_FAMILY, 9),
    bg=_WINDOW_BG,
    anchor="w",
)
result_label.pack(fill="x")

# Action button
btn_frame = tk.Frame(root, bg=_WINDOW_BG)
btn_frame.pack(pady=15)


def _on_process_clicked():
    # Looked up at click time, so process_btn already exists by then.
    select_and_process(status_var, progress_bar, result_label, process_btn)


process_btn = tk.Button(
    btn_frame,
    text="选择文件并处理",
    command=_on_process_clicked,
    font=(_FONT_FAMILY, 10, "bold"),
    bg="#4CAF50",
    fg="white",
    padx=25,
    pady=10,
    relief="flat",
    cursor="hand2",
)
process_btn.pack()

# Footer hint
tk.Label(
    root,
    text="输出格式:三行格式,不添加额外空行 | 修复CSS链接问题 | 优化大文件处理",
    font=(_FONT_FAMILY, 8),
    fg="#666666",
    bg=_WINDOW_BG,
).pack(side="bottom", pady=10)

root.mainloop()
# Forum footer residue from the original post ("1 个赞" = "1 like"); not part of the script.