from bs4 import BeautifulSoup
import os
from multiprocessing import Pool
import re

# Names of the entries directly inside "OEBPS" whose name ends with ".html";
# these are the inputs handed to the worker pool at the bottom of the file.
list_entryname = []
with os.scandir("OEBPS") as oebps_entries:
    for dir_entry in oebps_entries:
        if dir_entry.name.endswith(".html"):
            list_entryname.append(dir_entry.name)

def fs2dc(fs):
    """Return the markup of the ``<div>`` enclosing *fs* as a one-line string.

    The div is serialised with ``str()`` and then flattened: a newline plus
    one space directly after ``</u>`` is dropped, every remaining newline is
    removed, and ``&amp;`` is turned back into ``&``.

    If the node two siblings before the div is a ``<span>`` (presumably the
    anchor span preceding the entry, with a whitespace text node in between --
    TODO confirm against the input files), its unmodified markup is prepended.

    Args:
        fs: An element offering ``find_parent`` (in this file: a
            ``<font size="+2">`` tag).

    Returns:
        The flattened markup string. When *fs* has no enclosing div this is
        the literal string ``"None"`` (``str(None)``), as before.
    """
    pd = fs.find_parent("div")
    body = str(pd).replace("</u>\n ", "</u>").replace("\n", "").replace("&amp;", "&")
    try:
        ps2 = pd.previous_sibling.previous_sibling
        ps2n = ps2.name
    except AttributeError:
        # A link in the chain is None (no div, or fewer than two preceding
        # siblings) or the node has no ``name``: there is no span to prepend.
        # Only AttributeError is caught so that unrelated errors and
        # KeyboardInterrupt/SystemExit are no longer silently swallowed.
        return body
    if ps2n == "span":
        # The span is prepended verbatim; only the div markup is flattened.
        return str(ps2) + body
    return body

def cats(fs, dc, hw):
    """Extend *dc* with the following entries that *dc* links into.

    The next ``<font size="+2">`` after *fs* is looked up and its enclosing
    markup obtained with ``fs2dc``. If that markup contains a ``<span>`` whose
    id starts with ``filepos`` and *dc* contains an ``<a>`` whose href is
    ``"#" + id``, the markup is appended to *dc* and the search continues
    recursively from that next element with the grown *dc*.

    Args:
        fs: The ``<font size="+2">`` tag of the entry currently being built.
        dc: Markup string accumulated so far for that entry.
        hw: Headword of the entry currently being built.

    Returns:
        A ``(dc, hwr)`` tuple. ``dc`` is the possibly extended markup.
        ``hwr`` is *hw* when there is no following element or when the
        following entry was merged, and the sentinel ``"OCISLY"`` when a
        following element exists but was not merged.
    """
    fsn = fs.find_next("font", size="+2")
    if fsn is None:
        # No following headword element: nothing to merge, so the html5lib
        # parse of ``dc`` is skipped entirely.
        return dc, hw
    dcn = fs2dc(fsn)
    soup_dc = BeautifulSoup(dc, "html5lib")
    soup_dcn = BeautifulSoup(dcn, "html5lib")
    for si in soup_dcn.find_all("span", id=re.compile("^filepos")):
        if soup_dc.find("a", href="#"+si["id"]) is not None:
            # ``dc`` links to an anchor inside the next entry: absorb it and
            # keep following links from there. The marker returned by the
            # recursive call is irrelevant; a merge at this level always
            # reports ``hw``.
            dc, _ = cats(fsn, dc+dcn, hw)
            return dc, hw
    return dc, "OCISLY"

def _flatten_text(text):
    """Drop newlines (with one following space) and turn "&amp;" into "&"."""
    return text.replace("\n ", "").replace("\n", "").replace("&amp;", "&")


def html2txt(entry_name):
    """Convert one HTML file under ``OEBPS/`` into appended dictionary records.

    Links are rewritten first:

    * an href containing ``.html#`` is reduced to ``#fragment``; if the link
      holds a ``<b>`` element, ``entry://<bold text>`` is put in front of it;
    * any other link holding a ``<b>`` gets ``entry://<bold text>`` as href;
    * any other link is unwrapped (the tag is removed, its content kept);
    * ``<a>`` tags without an href attribute are left untouched.

    Then, for every ``<font size="+2">`` element, the bold text (without
    ``|``) is taken as headword, the entry markup is built with ``fs2dc`` and
    ``cats``, and ``headword\\nmarkup\\n</>\\n`` is appended to
    ``dict_<name>.txt`` in the current directory, where ``<name>`` is the part
    of *entry_name* between its last two dots. Each written headword is
    printed.

    Args:
        entry_name: File name (not path) of an HTML file inside ``OEBPS``.
    """
    with open("OEBPS/"+entry_name, "rb") as fre:
        # The parser consumes the whole stream here, so the file can be
        # closed before the tree is processed.
        soup = BeautifulSoup(fre, "html5lib")

    for ah in soup.find_all("a"):
        href = ah.get("href")
        if href is None:
            # Not a link (e.g. a named anchor); indexing ah["href"] would
            # raise KeyError, so leave the tag as it is.
            continue
        bold = ah.b
        if ".html#" in href:
            href = "#"+href.split(".html#")[-1].strip()
            if bold is not None:
                href = "entry://"+_flatten_text(bold.get_text()).strip()+href
            ah["href"] = href
        elif bold is not None:
            ah["href"] = "entry://"+_flatten_text(bold.get_text()).strip()
        else:
            ah.unwrap()

    dict_txt = "dict_"+entry_name.split(".")[-2]+".txt"
    # ``cats`` returns the current headword when it merged the following
    # entry into this one; a following element carrying that same headword is
    # then skipped. "OCISLY" is the "nothing to skip" sentinel.
    hwr = "OCISLY"
    for fs in soup.find_all("font", size="+2"):
        # "|" must be removed before strip() so that whitespace left next to
        # a trailing or leading bar is trimmed as well.
        hw = _flatten_text(fs.b.get_text()).replace("|", "").strip()
        if hw == hwr:
            continue
        dc = fs2dc(fs)
        ndc, hwr = cats(fs, dc, hw)
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding (which may be unable to encode some headwords).
        with open(dict_txt, "a", encoding="utf-8") as fad:
            fad.write(hw+"\n"+ndc+"\n</>\n")
        print(hw)

if __name__ == "__main__":
    # Convert the collected HTML files in parallel using six worker
    # processes; map() blocks until every file has been handled.
    with Pool(processes=6) as worker_pool:
        worker_pool.map(html2txt, list_entryname)
