发音
# html>body>entry>derivative>poll>sk_pron
from bs4 import BeautifulSoup
# Parse the dump; BeautifulSoup consumes the file handle while the tree is
# being built, so the file only needs to stay open for the constructor call.
with open('../output', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser')

# ">" is the CSS child combinator: only <sk_pron> elements that are direct
# children along html > body > entry > foreignpron are selected.
tags = soup.select('html>body>entry>foreignpron>sk_pron')
更多词头
# Collect the inner markup of every <deriv> element as an additional headword.
with open('../../mdx_release/entries.txt', 'r', encoding='utf-8') as f:
    # multi_valued_attributes=None makes the parser keep attributes such as
    # "class" as plain strings instead of splitting them into lists.
    soup = BeautifulSoup(f, 'html.parser', multi_valued_attributes=None)

# decode_contents() renders the element's children (nested tags included)
# as a str, so no extra str() conversion is needed.
hwds = [deriv.decode_contents() for deriv in soup.find_all("deriv")]
词头元字符统计
# Characters accepted by is_valid_string: ASCII letters, space, and a few
# punctuation marks. The apostrophe is the typographic one (U+2019), so the
# ASCII "'" is NOT accepted.
_VALID_HEADWORD_CHARS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ |’/-,.~"


def is_valid_string(s: str) -> bool:
    """Return True when every character of *s* is an allowed character.

    The empty string is valid, since it contains no disallowed character.
    """
    return all(char in _VALID_HEADWORD_CHARS for char in s)
comp 元素替换表
# Gather the headwords listed in every <comp> element of every <entry>.
for entry in soup.find_all('entry'):
    for comp in entry.find_all('comp'):
        # Strip the marker characters in a fixed order, then split the
        # remaining text on ", " into individual headwords.
        # NOTE(review): '₍₎' is removed only when the two characters are
        # adjacent; a lone '₍' or '₎' is kept — confirm this is intended.
        hwds = (
            comp.get_text()
            .replace('ˈ', '')
            .replace('ˌ', '')
            .replace('◂', '')
            .replace('₍₎', '')
            .split(', ')
        )
        hwds_comp_json.extend(hwds)
mdx mp3 格式
# Set the element's "href" attribute to the audio path prefixed with "sound://".
sk_pron["href"] = "sound://" + mp3_path_mdx
mdx entry css 格式
# One entry record: `hwd` (presumably the headword) on its own line, then the
# entry HTML wrapped in <div class="j24lpd2008"> with a link to lpd.css, then
# a final line containing "</>".
# NOTE(review): "</>" looks like the MDX source record terminator — confirm.
hwd + '\n' + '<div class="j24lpd2008">' + '<link rel="stylesheet" href="lpd.css" />' + v0['entry'] + '</div>' + '\n' + '</>'