from bs4 import BeautifulSoup
import re
def clean_ex(matchobj):
    """Substitution callback: return the whole match after a space replacement.

    The matched text (group 0) is passed through ``str.replace('', ' ')``.
    With an empty search string this puts a space before every character
    and one at the end of the text.

    NOTE(review): the search string is empty as written; presumably it
    originally named a markup tag that was lost from this copy -- confirm.
    """
    whole_match = matchobj.group(0)
    return whole_match.replace('', ' ')
def dc_repl(dc):
    """Clean up the markup string *dc* with an ordered chain of substitutions.

    Each step rebinds ``dc`` to the result of a ``str.replace`` or ``re.sub``
    call, so later steps see the output of earlier ones; the order matters.
    Returns the rewritten string.

    NOTE(review): several literals below contain an empty search string
    (``''``), an empty capture group (``()``) or the group ``(?)``.  This
    looks like markup tags were stripped from this copy of the source --
    confirm against the original.  As written, ``(?)`` is not valid syntax
    for Python's ``re`` module, so the patterns containing it would raise
    ``re.error`` when reached.
    """
    # Quote fix-ups: ":''" becomes '":' and any remaining "''" becomes '"'.
    # NOTE(review): the first replace maps a space to a space as written;
    # presumably the original searched for a different whitespace form.
    dc = dc.replace(' ', ' ').replace(":''", '":').replace("''", '"')
    # Collapse runs of two or more spaces into one.
    dc = re.sub(r' {2,}', r' ', dc)
    # Drop the space just inside opening and closing round/square brackets.
    dc = dc.replace('( ', '(').replace('[ ', '[')
    dc = dc.replace(' )', ')').replace(' ]', ']')
    # Reorders groups 2 and 3; group 2 is empty as written (see NOTE above).
    dc = re.sub(r'([^<>]+)()([;.])', r'\1\3\2', dc)
    # NOTE(review): search and replacement look identical here -- possibly
    # two different hyphen characters originally; confirm.
    dc = dc.replace('-', '-')
    # Intended to rewrite "See ..." / "Compare ..." cross references;
    # contains "(?)" (see NOTE above).
    dc = re.sub(r'(?)(See|Compare) ?([^<>]+\.|.+?)', r'\1 \2', dc)
    # Swaps a comma/semicolon (plus trailing non-word characters) with the
    # following group, which is empty as written.
    dc = re.sub(r'([,;]\W*)()', r'\2\1', dc)
    # "]" + optional ")" + "]"  ->  optional ")" + "]" (one "]" is dropped).
    dc = re.sub(r'(\])(\)?)\]', r'\2\1', dc)
    # Contains "(?)" (see NOTE above).
    dc = re.sub(r'(?)()', r'\1', dc)
    # A lone "[" or "]" (optionally followed by ":") between two spaces is
    # reduced to a single space.
    dc = re.sub(r' [][]:? ', r' ', dc)
    # Removes a "]" and an optional following "[" around a run of spaces,
    # keeping only the spaces.
    dc = re.sub(r'\]( + )(?:\[)?', r'\1', dc)
    # "[text[" -> "[text": the first "[" is dropped and the second one is
    # moved in front of the bracket-free text.
    dc = re.sub(r'\[([^][]*)(\[)', r'\2\1', dc)
    # Targets a "[...]" span containing one "<...>" tag; contains "(?)"
    # (see NOTE above).
    dc = re.sub(r'(?)(\[[^][]*<[^][<>]+>[^][]*\])', r'\1', dc)
    # Run clean_ex over every "[...]" span and every "(...)" span that has
    # no nested bracket of the same kind.
    dc = re.sub(r'\[[^][]*[^][]*\]', clean_ex, dc)
    dc = re.sub(r'\([^)(]*[^)(]*\)', clean_ex, dc)
    # The bare string below is a disabled substitution kept for reference;
    # as an expression statement it has no effect.
    '''
dc = re.sub(r'(\])()([^<>]*|\w[^<>]*(?:)?[^<>]*)', r'\1\3\2', dc)
'''
    # Drop the colon directly after a closing square bracket.
    dc = dc.replace(']:', ']')
    # NOTE(review): both replaces below search for the empty string.  As
    # written the first is a no-op and the second inserts a space around
    # every character; presumably each originally named a tag -- confirm.
    dc = dc.replace('', '')
    dc = dc.replace('', ' ')
    # Make every colon be followed by exactly one space where it had zero
    # or one.
    dc = re.sub(r': ?', r': ', dc)
    # Swaps an (empty as written) group with a following run that starts
    # at a prime mark and ends at the next comma or period.
    dc = re.sub(r'()(′[^,.]*[,.])', r'\2\1', dc)
    return dc
def dc_soup(dc):
    """Restructure the entry markup *dc* through a series of DOM edits.

    The string is parsed with html5lib, the ``div.rh_main`` element is
    edited in place step by step, and its serialised form is returned.
    The steps mutate the same tree, so their order matters.

    NOTE(review): ``select_one`` returns ``None`` when no ``div.rh_main``
    exists, in which case the first ``.select`` call below would raise
    ``AttributeError`` -- callers are presumably expected to always supply
    such a div; confirm.
    """
    soup1 = BeautifulSoup(dc, 'html5lib')
    s1b = soup1.select_one('div.rh_main')
    # Labels that close an example: only handle a label that is literally
    # the last node of its parent (no trailing text after it).
    for ell in s1b.select('.rh_ex > .rh_lab:last-child'):
        if ell.parent.contents[-1] != ell:
            continue
        # Remove spans whose whole text is a single "[" or "]" ...
        for els in ell('span', string=re.compile(r'^[][]$')):
            els.decompose()
        # ... then dissolve the label element, keeping its children.
        ell.unwrap()
    # For an <sc> directly inside another <sc>, dissolve the outer one.
    for sc2 in s1b.select('sc > sc'):
        sc2.parent.unwrap()
    # Inside labels, dissolve every descendant element except <sup>/<sub>.
    for lai in s1b.select('.rh_lab :not(sup):not(sub)'):
        lai.unwrap()
    # Delete <br> elements inside <ros> or immediately following a <def>.
    for rb in s1b.select('ros br,def + br'):
        rb.decompose()
    # Flatten nested <b> elements.
    for bb in s1b.select('b b'):
        bb.unwrap()
    # Move an example that immediately follows an <sdef>/<def> into that
    # preceding element, as its last child.
    dfs = ['sdef', 'def']
    for df in dfs:
        for de in s1b.select(f'{df} + .rh_ex'):
            dps = de.find_previous_sibling(df)
            dee = de.extract()
            dps.append(dee)
    # Mark the position before a <def> that follows an example inside a
    # <def> with the placeholder text 'hodor'.
    # NOTE(review): with the source indentation lost, this loop is assumed
    # to sit after (not inside) the loop over dfs -- confirm.
    for ded in s1b.select('def > .rh_ex + def'):
        ded.insert_before('hodor')
    # Serialise and swap the placeholder for its replacement text.
    # NOTE(review): the replacement is the empty string as written;
    # presumably it was a markup tag lost from this copy -- confirm.
    dc = str(s1b).replace('hodor', '')
    # NOTE(review): search and replacement look identical as written, so
    # this is a no-op; presumably a whitespace normalisation -- confirm.
    dc = dc.replace(': ', ': ')
    return dc
def dc_entr(dc):
    """Return the markup *dc* re-serialised inside an ``<entry>`` element.

    *dc* is parsed with html5lib; the parsed document's ``body`` tag is
    renamed to ``entry`` and its string form is returned.
    """
    body = BeautifulSoup(dc, 'html5lib').body
    body.name = 'entry'
    return str(body)
def get_hw(soup):
    """Derive a normalised, lower-cased headword from the element *soup*.

    Side effect: every ``<sup>`` found in *soup* is removed from the tree
    before the text is read, so the caller's element is modified.

    Returns ``None`` when the stripped text starts with an em dash;
    otherwise the text with the marks listed below removed, with leading
    and trailing ``,;:- `` characters stripped, in lower case.
    """
    for superscript in soup('sup'):
        superscript.decompose()
    text = soup.get_text().strip()
    if text.startswith('—'):
        return None
    # Syllable dots, stress marks, periods and primes are dropped.
    for mark in ('•', 'ˌ', 'ˈ', '.', '′'):
        text = text.replace(mark, '')
    return text.strip(',;:- ').lower()
def get_hws(dc):
    """Collect the set of headwords found in the entry markup *dc*.

    Two strategies are used, in this order:

    1. If a ``<var>`` immediately following an ``.rh_me`` element has text
       starting with ``(``, the headwords of both elements (via ``get_hw``)
       are concatenated, parenthesised parts are removed, and the result
       is split on ``'or '``.  That set is returned straight away.
    2. Otherwise the headword of every ``.rh_me`` element and of every
       ``<b>`` inside a ``<var>`` is collected; ``<b>`` elements whose text
       starts with ``-`` and empty/``None`` headwords are skipped.

    Returns a ``set`` of strings (possibly empty).
    """
    soup = BeautifulSoup(dc, 'html5lib')
    for variant in soup.select('.rh_me + var'):
        if not variant.get_text().startswith('('):
            continue
        main = variant.find_previous_sibling(class_='rh_me')
        main_hw = get_hw(main)
        variant_hw = get_hw(variant)
        if main_hw is None or variant_hw is None:
            # get_hw() returns None for text starting with an em dash;
            # concatenating that used to raise TypeError.  Skip this pair
            # and let the general collection below handle the entry.
            continue
        combined = re.sub(r'\([^)(]*\)', r'', main_hw + variant_hw)
        return set(combined.split('or '))
    hws = set()
    for main in soup.select('.rh_me'):
        hw = get_hw(main)
        if hw:
            hws.add(hw)
    for bold in soup.select('var b'):
        # Variants written as a bare suffix ("-xyz") are not headwords.
        if bold.get_text().startswith('-'):
            continue
        hw = get_hw(bold)
        if hw:
            hws.add(hw)
    return hws
# Script body: read raw entry markup from dc.txt (one fragment per line),
# normalise each entry, and append "headword / markup" records to dict.txt.

# The data contains non-ASCII marks (stress marks, primes, em dashes), so
# the encoding is stated explicitly instead of relying on the locale.
with open('dc.txt', 'r', encoding='utf-8') as frd:
    # A set: identical input lines are processed only once.
    lns = {line.strip() for line in frd}

# Class suffixes ("rh_<name>") that are turned into elements named <name>.
dfs = ['pdef', 'def', 'sdef', 'ros', 'xr', 'var', 'sc']

# (headword, markup) pairs; a set so duplicates collapse.
hds = set()
for ln in lns:
    soup = BeautifulSoup(ln, 'html5lib')
    for de in soup.select('div.entryRH'):
        # Only entries containing an element whose id starts with
        # "advanced_" are processed.
        ad = de.select_one('[id^="advanced_"]')
        if not ad:
            continue
        # Dissolve link/list wrappers, keeping their contents.
        for al in de.select('a,ol,ul,li,lab,i .rh_ex'):
            al.unwrap()
        for su in de('span', string='USA pronunciation'):
            su.decompose()
        # Remove <art>, <br> and empty elements altogether.
        for ae in de.select('art,br,:empty'):
            ae.decompose()
        for st in de.select('[style]'):
            del st['style']
        # Replace class-based markup with bare custom tags.
        for df in dfs:
            for ddf in de(class_=f'rh_{df}'):
                ddf.attrs.clear()
                ddf.name = df
        for us in de(class_='supnt'):
            us.attrs.clear()
            us.name = 'supnt'
        # dc_soup() looks the entry up again as div.rh_main.
        de.attrs.clear()
        de['class'] = 'rh_main'
        dc1 = dc_repl(str(de))
        dc1 = dc_soup(dc1)
        dc1 = dc_entr(dc1)
        # One record per headword of the full entry.
        hw1s = get_hws(dc1)
        for hw1 in hw1s:
            hd1 = (hw1, dc1)
            hds.add(hd1)
        # Additional records: a <def> holding exactly one <b> child is
        # emitted as its own entry under that <b>'s headword.
        soup_dc1 = BeautifulSoup(dc1, 'html5lib')
        for srb in soup_dc1.select('def > b:only-of-type'):
            hw2 = get_hw(srb)
            if not hw2:
                continue
            srd = srb.parent
            dc2 = dc_entr(str(srd))
            # NOTE(review): as written this pattern can match the empty
            # string, so the check never rejects anything; presumably the
            # pattern lost markup from this copy -- confirm.
            if not re.match(r'(?: |[^<>]*)*', dc2):
                continue
            hd2 = (hw2, dc2)
            hds.add(hd2)

# Text emitted before each entry's markup.
# NOTE(review): empty as written; presumably a stylesheet reference that
# was lost from this copy -- confirm.
CSS = ''
entries = sorted(hds)
# Nothing to write: leave dict.txt untouched, as before.
if entries:
    # Open the output once (append mode, as before) instead of reopening
    # it for every record.
    with open('dict.txt', 'a', encoding='utf-8') as fad:
        for hd in entries:
            hw, dc = hd
            fad.write(f'{hw}\n{CSS}{dc}\n>\n')
            print(hw)