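# NOTE: when extracting heading numbers from the epub, this script misidentifies two of them; those two entries were corrected by hand.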
# -*- coding: utf-8 -*-
import re, pathlib
# Directory that is scanned for the epub's *.xhtml content files.
path = pathlib.Path('./')

# Matches an <h3>/<h4> heading whose class is "hh<level>" and whose visible
# text starts with a number.  Groups: (level, leading number, rest of heading).
# The character class is [34]; the former [3|4] also accepted a literal '|'.
pattern = r'<h[34] class="hh(\d+)".*?>(\d+)(.*?)</h[34]>'

# Compiled once so the per-file / per-heading loops do not rebuild them.
HEADING_RE = re.compile(pattern)
TAG_RE = re.compile(r"<.*?>")


def extract_headings(text, h1=''):
    """Extract numbered heading lines from one XHTML document.

    Args:
        text: Full text of an XHTML file.
        h1: Number of the most recent level-1 ("hh1") heading seen so far;
            pass the value returned by the previous call so that numbering
            carries over from one file to the next.

    Returns:
        A ``(lines, h1)`` tuple.  ``lines`` is a list of strings of the form
        ``"<number> <title>"``; for headings that are not level 1 the number
        is ``"<h1>.<own number>"``.  ``h1`` is the updated level-1 number.
    """
    lines = []
    for level, number, rest in HEADING_RE.findall(text):
        # Drop inline markup (e.g. <span>, <a>) left inside the heading text.
        head_word = TAG_RE.sub("", rest).strip()
        if level == '1':
            h1 = number
            head_num = h1
        else:
            head_num = ".".join([h1, number])
        lines.append(head_num + ' ' + head_word)
    return lines, h1


def main():
    """Write the headings of every *.xhtml file in ``path`` to head_word_deep.txt."""
    h1 = ''
    # The context manager closes the output file even if a read or decode fails.
    with open("head_word_deep.txt", "w", encoding="utf8") as fl_out:
        # glob() order is not guaranteed; sort so that the h1 carry-over
        # between files (and the output order) is deterministic.
        for p2f in sorted(path.glob("*.xhtml")):
            text = p2f.read_text(encoding="utf8")
            lines, h1 = extract_headings(text, h1)
            fl_out.writelines(line + '\n' for line in lines)


if __name__ == "__main__":
    main()