# -*- coding: utf-8 -*-
import re
# Matches a newline, a run of digits (the entry id), the lazily-matched rest
# of that line (the raw head word, possibly containing markup), and the
# closing newline.
# NOTE(review): the original literal was broken across three physical lines
# and looks as if surrounding markup (e.g. XHTML tags around the two groups)
# was lost -- TODO confirm the intended pattern against Section13.xhtml.
HEAD_PATTERN = re.compile(r"\n(\d+)(.*?)\n")

# Lazily matches anything enclosed in angle brackets; used to drop inline
# markup from the captured head word.
TAG_PATTERN = re.compile(r"<.*?>")


def extract_heads(text: str) -> list:
    """Return ``(entry_id, head_word)`` pairs found in *text*.

    Every non-overlapping match of ``HEAD_PATTERN`` yields one pair, in
    document order.  ``entry_id`` is the captured digit run (kept as a
    string); ``head_word`` is the remainder of the match with every
    ``<...>`` span removed and surrounding whitespace stripped.

    Returns an empty list when nothing matches.
    """
    return [
        (entry_id, TAG_PATTERN.sub("", raw_head).strip())
        for entry_id, raw_head in HEAD_PATTERN.findall(text)
    ]


def main(path: str = "Section13.xhtml") -> None:
    """Read *path* as UTF-8 and print one ``entry_id head_word`` line per head."""
    with open(path, encoding="utf8") as fl_in:
        text = fl_in.read()
    for entry_id, head_word in extract_heads(text):
        print(entry_id, head_word)


if __name__ == "__main__":
    main()