
import string
from unidecode import unidecode
import re
from bs4 import BeautifulSoup


# ---------------------------------------------------------------------------
# Stage 1: cut the raw HTML dump into one <entry>...</entry> element per line.
# ---------------------------------------------------------------------------

with open('dc.htm', encoding='iso-8859-1', mode='r') as frd:
    dc = frd.read()

# '┳' (U+2533) is used as a temporary line-break sentinel.  Text decoded as
# ISO-8859-1 only contains code points up to U+00FF, so the sentinel cannot
# collide with anything already in the document.

# Open an <entry> where a blank line plus a two-space indent is followed by a
# bold paragraph (unless its text starts with "The ") or by a figure <div>.
dc = re.sub(r'(?<=\n\n  )(<p><b>(?!The )|<div class="fig)', r'┳<entry>\1', dc)

# Force a break before these elements so they are never swallowed by an entry.
dc = re.sub(r'(<p><br |<div style="clear: both">|<pre>)', r'┳\1', dc)

# Close each entry at the last blank line that precedes the next sentinel.
dc = re.sub(r'(<entry>[^┳]*)(?=\n\n)', r'\1</entry>┳', dc)

# Drop the original line structure; the sentinels become the only newlines.
dc = dc.replace('\n', '').replace('┳', '\n')

# Collapse the runs of spaces left behind by the source indentation.
dc = re.sub(r' {2,}', r' ', dc)


# The intermediate file is written and re-read with an explicit encoding so
# the round trip does not depend on the platform's locale default.
with open('ndc.htm', 'w', encoding='utf-8') as fwn:
    fwn.write(dc)

with open('ndc.htm', 'r', encoding='utf-8') as frn:
    # Keep only the entry lines; re-serialising through html5lib yields a
    # well-formed <entry> element for each of them.
    lines = [
        str(BeautifulSoup(line, 'html5lib').entry)
        for line in frn
        if line.strip().startswith('<entry>')
    ]

# ---------------------------------------------------------------------------
# Stage 2: repair each entry's markup, work out its headwords and append one
# record per (headword, marked-up entry) pair to dict.txt.
# ---------------------------------------------------------------------------

# Stylesheet link emitted in front of every record body.
CSS = '<link href="ctcd.css" rel="stylesheet">'

# Selector for the elements whose text is treated as a headword.
HEADWORD_SELECTOR = "entry > p:first-of-type > b:first-child,.nw"

# One-off literal repairs for irregular spots in the source text, applied in
# this order before any pattern-based rewriting.
LITERAL_FIXES = (
    ('<b>adj</b>.', '<i>adj.</i>'),
    ('<b>adjs.</b>', '<i>adjs.</i>'),
    ('—<b>Also ', '<br>—Also <b>'),
    ('<b><span class="nw">What′-d\'ye-call</span></b> (<b><span class="nw">-it</span></b>, <b><span class="nw">-\'em</span></b>),',
     '<b><span class="nw">What′-d\'ye-call-it</span></b>, <b><span class="nw">What′-d\'ye-call-\'em</span></b>,'),
    ('<b>Sterēo <span class="nw">tom′ic</span></b>',
     '<b><span class="nw">Sterēotom′ic</span></b>'),
    ('<b><span class="nw">Czar′evitch</span></b>, <b>Tsar-</b>',
     '<b><span class="nw">Czar′evitch</span></b>, <b>Tsar′evitch</b>'),
    ('<b><span class="nw">Hereunto′</span></b> (also <b><span class="nw">-un′-</span></b>)',
     '<b><span class="nw">Hereunto′</span></b> (also <b><span class="nw">Hereun′to</span></b>)'),
    ('<b>Domesday-</b>, <b>Doomsday-book</b>',
     '<b>Domesday-book</b>, <b>Doomsday-book</b>'),
)

# Regex rewrites applied once each, in this order.  They spell out hyphen-
# abbreviated bold forms and add or remove the "nw" span around bold words
# depending on what precedes and follows them.
PATTERN_FIXES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'(<p><b>Hæm([^<> ]+)</b>, <b><span class="nw">Hem)-(</span></b>)',
         r'\1\2\3'),
        (r'(<p><b>([^<> ]+)</b>, )-<b>([^<> ]+)</b>',
         r'\1<b><span class="nw">\2\3</span></b>'),
        (r'(<b><span class="nw">[^<> ]+-)(</span></b>, <b><span class="nw">[^<> ]+-([^<> ]+)</span></b>)',
         r'\1\3\2'),
        (r'(?<=</i> )<b>([^<>]+)</b>',
         r'<b><span class="nw">\1</span></b>'),
        (r'<b><span class="nw">([^<>]+)</b>(?= \((?!<i>)|,? (?:or |and )?<b>(?!<span class="nw">))',
         r'<b>\1</b>'),
        (r'(?<=; )<b>([^<>]+)</b>(?! \((?!<i>)|,? (?:or |and )?<b>(?!<span class="nw">))',
         r'<b><span class="nw">\1</span></b>'),
        (r'(?<=—)<b>([^<>]+)</b>(?! \((?!<i>)|,? (?:or |and )?<b>(?!<span class="nw">))',
         r'<b><span class="nw">\1</span></b>'),
    )
)

# Regex rewrites that are re-applied until they no longer match, because one
# expansion can expose the next abbreviated form in a chain.
REPEATED_FIXES = (
    (re.compile(r'(<b><span class="nw">([^<> ]+)-[^<> ]+</span></b>, (?:or )?<b><span class="nw">)(-[^<> ]+</span></b>)'),
     r'\1\2\3'),
    (re.compile(r'(<b><span class="nw">([^<> -]+-?)[^<> -]*</span></b>, )<span class="nw">-</span><b>([^<> -]+)</b>'),
     r'\1<b><span class="nw">\2\3</span></b>'),
)

# Rewrites applied to the re-serialised entry after the DOM clean-up.
DASH_BREAK = re.compile(r'(?<!:)—(?=(?:<i>[^<>]+</i> )?<b>)')
CAPITALISED_VARIANT = re.compile(
    r'(</span></b>, (?:[^<> ]+ )?)<b>([A-Z][^<>]+)</b>(?! \((?!<i>)|,? (?:or |and )?<b>(?!<span class="nw">))'
)

# "-suffix" abbreviations that replace an ending of the preceding bold word:
# {suffix: ending of the preceding word that the suffix replaces}.
SUFFIX_SWAPS = {
    'or': 'er',
    'ent': 'ant',
    'ence': 'ance',
    'er': 'or',
    'jingle': 'clink',
    'temporary': 'temporaneous',
    'polit′ical': 'pol′itic',
    'typ′ical': 'typ′al',
    'bowman': 'bower',
    'rhē′al': 'rhœt′ic',
    'rhē′ic': 'rhē′al',
    'rhet′ic': 'rhē′ic',
    'cy': 'ce',
    'cote': 'cot',
    'ous': 'ose',
    'cum': 'con',
    'wards': 'ward',
    'abouts': 'about',
    'maid': 'man',
    'shirt': 'dress',
    'laryn′geal': 'glos′sal',
    'nā′sal': 'laryn′geal',
    'ō′ral': 'nā′sal',
    'eable': 'able',
    'uise': 'ise',
    'tor': 'ter',
    'streaked': 'straked',
    'abouts′': 'about′',
    'te′': '′',
    'dor': 'der',
    'tos': 'ti',
}

# "-suffix" abbreviations that are simply appended to the preceding bold word.
SUFFIX_APPENDS = frozenset(
    ['al', 'al,', 'al.', 'ed', 'd', 's', 'ic', 'e', 'y', 'er', 'a', 'ous', 'n']
)


def _apply_text_fixes(markup):
    """Return *markup* after all string-level repairs.

    Applies the literal fixes, then the single-pass patterns, then the
    repeat-until-stable patterns, and finally wraps every bracketed span in
    a ``<span class="etym">``.
    """
    for old, new in LITERAL_FIXES:
        markup = markup.replace(old, new)
    for pattern, replacement in PATTERN_FIXES:
        markup = pattern.sub(replacement, markup)
    for pattern, replacement in REPEATED_FIXES:
        while pattern.search(markup):
            markup = pattern.sub(replacement, markup)
    return markup.replace('[', '<span class="etym">[').replace(']', ']</span>')


def _expand_suffix(stem, abbreviated):
    """Return the spelled-out word for a "-suffix" form, or None.

    *stem* is the text of the preceding bold word and *abbreviated* the text
    of the current one.  A swap rule (replace a known ending of *stem*) takes
    precedence over a plain append rule; None means no rule applies.
    """
    stem = str(stem)
    abbreviated = str(abbreviated)
    if not abbreviated.startswith('-'):
        return None
    suffix = abbreviated[1:]
    old_ending = SUFFIX_SWAPS.get(suffix)
    if old_ending is not None and stem.endswith(old_ending):
        return stem[:len(stem) - len(old_ending)] + suffix
    if suffix in SUFFIX_APPENDS:
        return stem + suffix
    return None


def _restructure(markup):
    """Parse *markup*, tidy its bold/nw/etym structure and re-serialise it."""
    soup = BeautifulSoup(markup, 'html5lib')
    entry = soup.entry

    # Spell out "-suffix" bold forms using the bold word right before them.
    # Elements are visited in document order, so a chain of abbreviations
    # builds on the already expanded predecessor.
    for bold in entry.select("b + b"):
        previous = bold.find_previous_sibling("b")
        if not bold.string or not previous.string or previous.string.startswith('-'):
            continue
        expanded = _expand_suffix(previous.string, bold.string)
        if expanded is not None:
            bold.string = expanded

    # A text-only second bold word in the first paragraph gets an "nw" span.
    # NOTE: the class must be passed through ``attrs``; ``class_="nw"`` as a
    # keyword to new_tag() produces a literal ``class_`` attribute that the
    # ".nw" selectors below never match.
    for second in entry.select("entry > p:first-of-type > b:first-child + b"):
        text_node = second.string
        if len(list(second.descendants)) == 1 and text_node is not None:
            text_node.wrap(soup.new_tag("span", attrs={"class": "nw"}))

    # "nw" is only kept directly inside <b>; etym spans are not nested and
    # contain no "nw" spans.
    for stray in entry.select(".nw:not(b > .nw),.etym .etym,.etym .nw"):
        stray.unwrap()

    # Bracketed text without a full stop is not kept as an etym span.
    for etym in entry.select(".etym"):
        if '.' not in etym.get_text():
            etym.unwrap()

    # Keep an "nw" span only when it is the sole content of its <b>.
    for span in entry.select("b > .nw"):
        if len(span.parent.contents) > 1:
            span.unwrap()

    markup = str(entry)
    # Start a new line before a dash that introduces a bold word, optionally
    # preceded by an italic label, unless the dash follows a colon.
    markup = DASH_BREAK.sub(r'<br>—', markup)
    while CAPITALISED_VARIANT.search(markup):
        markup = CAPITALISED_VARIANT.sub(r'\1<b><span class="nw">\2</span></b>', markup)
    return markup


def _figure_letter(entry):
    """Return the single A-Z letter in a leading figure's alt text, or None.

    Only an ``alt`` of exactly one uppercase ASCII letter counts; a plain
    ``in string.ascii_uppercase`` test would also accept the empty string and
    multi-letter runs such as "AB".
    """
    figure = entry.select_one('entry > div.figleft:first-child')
    if figure is None or figure.img is None:
        return None
    letter = (figure.img.get('alt') or '').strip()
    if len(letter) == 1 and letter in string.ascii_uppercase:
        return letter
    return None


# (headword, marked-up entry) pairs already written, to skip exact repeats.
hds = set()

# Append mode is kept, so an existing dict.txt is extended rather than
# replaced.  UTF-8 is explicit because the records contain characters
# (e.g. '′', 'ē') that many locale default encodings cannot represent.
with open('dict.txt', 'a', encoding='utf-8') as fad:
    for line in lines:
        soup2 = BeautifulSoup(_restructure(_apply_text_fixes(line)), 'html5lib')
        s2e = soup2.entry

        letter = _figure_letter(s2e)

        # (element, display form, lookup key) for every non-empty headword
        # element: the display form has the stress mark removed and is
        # transliterated; the key additionally drops all non-word characters.
        candidates = []
        for element in s2e.select(HEADWORD_SELECTOR):
            text = element.get_text().strip()
            if not text:
                continue
            display = unidecode(text.replace("′", "").strip("; "), errors="preserve")
            candidates.append((element, display, re.sub(r'\W', r'', display)))

        # Distinct headword keys in order of first appearance, with the
        # figure letter (if any) first.
        hws = [] if letter is None else [letter]
        for _, _, key in candidates:
            if key and key not in hws:
                hws.append(key)

        for hw in hws:
            title = None
            # Walk backwards so the title ends up being the display form of
            # the first matching element in document order.
            for element, display, key in reversed(candidates):
                if key == hw:
                    element.wrap(soup2.new_tag("mark"))
                    title = display

            if letter is not None and hw == letter:
                title = letter

            # A <b> and the "nw" span inside it can both match; keep only
            # the outer highlight.
            for nested in s2e.select("mark mark"):
                nested.unwrap()

            record = str(s2e)

            # Remove the highlights again so the next headword starts clean.
            for mark in s2e("mark"):
                mark.unwrap()

            hd = (title, record)
            if hd in hds:
                continue
            hds.add(hd)

            fad.write(f'{title}\n{CSS}{record}\n</>\n')
            print(title)

