分享《牛津英语用法指南》第3版中英对照完善索引图片词典(首发)

该代码在提取 epub 中的标题编号时,会有两处识别错误,这两处已手工修正。

# -*- coding: utf-8 -*-

import re
import pathlib

# Directory that is scanned for the *.xhtml files to process.
path = pathlib.Path('./')

# Matches an <h3>/<h4> heading whose class is "hh<level>" and whose text
# starts with a number. Groups: (level, number, remaining heading HTML).
# The character class is [34]; the earlier form [3|4] also accepted a
# literal "|" as the tag digit.
pattern = r'<h[34] class="hh(\d+)".*?>(\d+)(.*?)</h[34]>'


def extract_headings(text, parent=''):
    """Return the numbered heading lines found in one XHTML document.

    Args:
        text: Content of an XHTML file.
        parent: Number of the most recent level-1 heading seen so far
            (carried over from previously processed files).

    Returns:
        A ``(lines, parent)`` tuple. ``lines`` is a list of strings of the
        form ``"<number> <title>"``, where headings whose class level is not
        ``1`` are prefixed with the current level-1 number
        (``"<parent>.<number>"``). ``parent`` is the level-1 number in
        effect after the last heading of ``text``.
    """
    lines = []
    for level, number, title_html in re.findall(pattern, text):
        # Drop inline markup (e.g. <b>, <span>) from the heading text.
        head_word = re.sub(r"<.*?>", "", title_html).strip()
        if level == '1':
            parent = number
            head_num = parent
        else:
            head_num = ".".join([parent, number])
        lines.append(head_num + ' ' + head_word)
    return lines, parent


# Number of the current level-1 heading; deliberately kept across files so
# that sub-headings in a later file attach to the last level-1 heading.
h1 = ''
with open("head_word_deep.txt", "w", encoding="utf8") as fl_out:
    # glob() yields entries in arbitrary order, and h1 carries over between
    # files, so iterate in a deterministic (lexicographic) order.
    # NOTE(review): assumes file names sort in reading order - confirm.
    for p2f in sorted(path.glob("*.xhtml")):
        with open(p2f, encoding="utf8") as fl_in:
            text = fl_in.read()
        heads, h1 = extract_headings(text, h1)
        fl_out.writelines(head_txt + '\n' for head_txt in heads)
1 个赞