# -*- coding: utf-8 -*-
"""Build ``index2_mdx.txt`` from the index entries found in ``Section31.xhtml``.

Each entry matched by ``item_tmpl`` yields a (head, link_to) pair.  Entry
numbers inside ``link_to`` are looked up in ``PEU3_headword_page.txt`` (one
``entry<TAB>page`` pair per line) and rewritten through the link format
string, then every entry is written out using ``mdx_tmpl``.

NOTE(review): several string literals below (``item_tmpl``, the arguments of
``text.replace``, the entry-number pattern and the link format string) look
as if markup was stripped from them in the copy that was reviewed.  They are
reproduced exactly as seen; confirm them against the working script.
"""
import re

# Layout of one output record: head line, body line, terminator line.
mdx_tmpl = "{head}\n{body}\n>\n"

# Pattern with two lazy groups: (head, link_to).  NOTE(review): confirm literal.
item_tmpl = '\n(.*?)(.*?)\n'

# Matches an entry number inside the link part.  Compiled once, used per item.
entry_ref_re = re.compile(r" (\d+)")

# entry number (str) -> page number (str); filled by main().
entry_page_dict = {}


def parse_entry_pages(lines):
    """Parse ``entry<TAB>page`` lines into a dict of stripped strings.

    Blank lines are skipped, so a trailing newline at the end of the mapping
    file no longer aborts the run.

    Args:
        lines: iterable of text lines (e.g. an open text file).

    Returns:
        dict mapping entry number to page number, both as stripped strings.

    Raises:
        ValueError: if a non-blank line does not contain exactly one tab
            separated pair; the message carries the 1-based line number.
    """
    pages = {}
    for line_no, line in enumerate(lines, 1):
        line = line.strip()
        if not line:
            continue
        try:
            entry_number, page_number = line.split("\t")
        except ValueError:
            raise ValueError(
                "line {}: expected 'entry<TAB>page', got {!r}".format(
                    line_no, line
                )
            ) from None
        pages[entry_number.strip()] = page_number.strip()
    return pages


def get_page_by_entry(entry):
    """Return the page number string recorded for *entry*.

    Raises:
        KeyError: if *entry* is not present in ``entry_page_dict``.
    """
    try:
        return entry_page_dict[entry]
    except KeyError:
        raise KeyError(
            "no page number recorded for entry {!r}".format(entry)
        ) from None


def link_entries(link_to):
    """Rewrite every entry number in *link_to* through the link format string.

    The page number goes through ``int`` so leading zeros are dropped.
    """

    def _replace(match):
        entry = match.group(1)
        # NOTE(review): two arguments are passed but the format string seen
        # in review has a single placeholder, so the entry number is unused.
        return ' {}'.format(int(get_page_by_entry(entry)), entry)

    return entry_ref_re.sub(_replace, link_to)


def build_records(items):
    """Return one formatted output record per (head, link_to) pair."""
    return [
        mdx_tmpl.format(head=head.strip(), body=link_entries(link_to))
        for head, link_to in items
    ]


def main():
    """Read both input files and write ``index2_mdx.txt``."""
    with open("Section31.xhtml", encoding="utf8") as fl_in:
        text = fl_in.read()
    # NOTE(review): with these arguments the call changes nothing; the
    # original arguments were probably lost -- confirm.
    text = text.replace('', '')
    items = re.findall(item_tmpl, text)
    print(len(items))

    with open("PEU3_headword_page.txt", encoding="utf-8") as fl_in:
        entry_page_dict.update(parse_entry_pages(fl_in))

    # Build everything before opening the output file so that a failed
    # lookup cannot leave a half-written index2_mdx.txt behind.
    records = build_records(items)
    with open("index2_mdx.txt", "w", encoding="utf8") as fl_out:
        fl_out.writelines(records)


if __name__ == "__main__":
    main()