"""Build an Anki package from saved "Word Root Of The Day" HTML pages.

Every file found under ``wrotds/`` is parsed with BeautifulSoup (html5lib).
Each ``<p>`` inside ``div#wrotd-quicksum`` becomes one note, and each ``<p>``
inside ``div#wrotd-full-text`` is split at sentence gaps with every resulting
paragraph becoming its own note. All notes go into a single deck, which is
written to ``Word_Root_Of_The_Day_Archive.apkg`` in the working directory.
"""

import os
import random
import re

from bs4 import BeautifulSoup
from genanki import Deck
from genanki import Model
from genanki import Note
from genanki import Package

# NOTE(review): both ids are re-rolled on every run, so every generated
# package carries a brand-new model id and deck id. Consider generating them
# once and hard-coding the values if re-imports should update the same deck.
mid = random.randrange(1 << 30, 1 << 31)
did = random.randrange(1 << 30, 1 << 31)

MY_MODEL = Model(
    mid,
    'Word Root Of The Day Archive',
    fields=[
        {'name': 'Text'},
        {'name': 'Supplement'},
    ],
    templates=[
        {
            'name': 'CARD',
            'qfmt': '{{Text}}',
            'afmt': '{{FrontSide}}{{Supplement}}',
        },
    ],
)

my_deck = Deck(did, 'Word Root Of The Day Archive')

# Patterns are compiled once instead of being re-specified inside the loops.
_MULTI_SPACE = re.compile(r' {2,}')
# A '.' or '!' (optionally followed by a closing curly quote) and then a run
# of two or more spaces.
_SENTENCE_GAP = re.compile(r'([.!][”’]?) {2,}')
# NOTE(review): both alternatives are identical, and the pattern is tested
# against the serialized tag (which includes the tag name), so as written it
# always matches. The pattern looks garbled; it is kept verbatim until the
# intended filter is confirmed.
_WORD_CHAR = re.compile(r'\w|\w')


def _squash(markup):
    """Return *markup* with space runs collapsed to one space and newlines removed."""
    return _MULTI_SPACE.sub(' ', markup).replace('\n', '')


def _add_note(text):
    """Append a note to ``my_deck`` with *text* as 'Text' and an empty 'Supplement'."""
    my_deck.add_note(Note(model=MY_MODEL, fields=[text, '']))


def _add_quicksum_notes(soup):
    """Add one note per ``<p>`` under ``div#wrotd-quicksum``.

    Images are removed and all attributes are cleared from the paragraph and
    its descendants before the markup is serialized into the note.
    """
    for para in soup.select("div#wrotd-quicksum p"):
        for img in para("img"):
            img.decompose()
        # Calling a tag with True selects every descendant tag.
        for tag in para(True):
            tag.attrs.clear()
        para.attrs.clear()
        _add_note(_squash(str(para)))


def _add_full_text_notes(soup):
    """Add notes for the paragraphs under ``div#wrotd-full-text``.

    Each paragraph is cut at sentence gaps, re-parsed so every piece becomes
    its own ``<p>``, stripped of attributes, and added as a separate note.
    """
    for para in soup.select("div#wrotd-full-text p"):
        # NOTE(review): the replacement literal was broken across a line
        # break in the file as received (a syntax error). '</p><p>' is the
        # reconstruction that makes the re-parse and the per-<p> loop below
        # meaningful - confirm against the original script.
        split_markup = _SENTENCE_GAP.sub(r'\1</p><p>', str(para))
        fragment = BeautifulSoup(_squash(split_markup), "html5lib")
        for tag in fragment(True):
            tag.attrs.clear()
        for sentence in fragment("p"):
            if not _WORD_CHAR.search(str(sentence)):
                continue
            _add_note(str(sentence))


for root, _dirs, files in os.walk("wrotds"):
    for name in files:
        # Explicit encoding so decoding does not depend on the platform
        # locale; assumes the saved pages are UTF-8 - TODO confirm.
        with open(os.path.join(root, name), "r", encoding="utf-8") as frn:
            soup = BeautifulSoup(frn, "html5lib")
        _add_quicksum_notes(soup)
        _add_full_text_notes(soup)

Package(my_deck).write_to_file('Word_Root_Of_The_Day_Archive.apkg')