# import sys
# sys.path.append('./../../../dict_tech/py/')
# import mdx_util as u
# u.hello_mdx_util()
def hello_mdx_util():
    print('hello_mdx_util')
import os
# create a directory (and any missing parents) without erroring if it already exists
os.makedirs(path, exist_ok=True)
# join a list of strings into newline-separated text
'\n'.join(ls)
import json
# serialize the headword list as pretty-printed JSON
json.dumps(hwds, indent=4)
# parse JSON text read from an open file handle
json.loads(f.read())
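# A minimal round-trip sketch of the two json calls above; 'hwds.json' and
# the sample headword list are hypothetical stand-ins, not names from this
# project.
import json
hwds = ['abandon', 'ability', 'able']
with open('hwds.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(hwds, indent=4))
with open('hwds.json', 'r', encoding='utf-8') as f:
    assert json.loads(f.read()) == hwds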
from bs4 import BeautifulSoup

# multi_valued_attributes=None keeps attributes like class as plain strings
# rather than lists
with open('entries.txt', 'r', encoding='utf-8') as f:
    soup = BeautifulSoup(f, 'html.parser', multi_valued_attributes=None)
tags = soup.select('di>di-body>sense-block>sense-head>sense-info>soundfile>media')
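# Hedged follow-up: where the sound-file path lives on the matched <media>
# tags is not shown above, so inspect the attributes first; the 'src' name
# below is an assumption, not checked against Collins' real markup.
for tag in tags:
    print(tag.attrs)        # see which attributes the tag actually carries
    # src = tag.get('src')  # then read the one holding the audio path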
# https://www.collinsdictionary.com/sitemap/english-conjugation/sitemap1.xml
from bs4 import BeautifulSoup
import os
word_urls = []
for i in range(1):
    with open(f'../xmls/sitemap{i+1}.xml', 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, "xml")
    # finding "loc" tags directly would also match image locs;
    # within each <url>, the first <loc> is the page url, so find <url>
    # elements and take each one's first <loc> instead
    urls = soup.find_all("url")
    # skip the first trash link: https://www.collinsdictionary.com/dictionary/english-thesaurus/
    urls = urls[1:]
    for url in urls:
        word_urls.append(url.find("loc").text)

path = '../urls.txt'
os.makedirs(os.path.dirname(path), exist_ok=True)
with open(path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(word_urls))
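# Hedged bridge to the scraping step below: get_html() expects a `words` list
# of {'url', 'path'} dicts, which is not built anywhere above. A minimal
# sketch, assuming downloads go to a '../htmls/' dir (that dir name is my
# assumption, not from this project):
with open('../urls.txt', 'r', encoding='utf-8') as f:
    word_urls = f.read().splitlines()
os.makedirs('../htmls/', exist_ok=True)
words = [{'url': wu, 'path': '../htmls/' + wu.split('/')[-1] + '.html'}
         for wu in word_urls]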
# handle the Cloudflare anti-bot check with cloudscraper
import cloudscraper
import concurrent.futures
import requests

html_statusNot200_dir = '../err_html_statusNot200/'
os.makedirs(html_statusNot200_dir, exist_ok=True)
html_timeout_dir = '../err_html_timeout_dir/'
os.makedirs(html_timeout_dir, exist_ok=True)
scraper = cloudscraper.create_scraper()

def get_html(word):
    url = word['url']
    path = word['path']
    try:
        r = scraper.get(url, timeout=180)
        if r.status_code == 200:
            with open(path, 'w', encoding='utf-8') as f:
                f.write(r.text)
        else:
            # don't want to have to check failed pages later, so record them here
            with open(html_statusNot200_dir + path.split('/')[-1], 'a', encoding='utf-8') as f:
                f.write(url)
    # cloudscraper sits on top of requests, so timeouts arrive as
    # requests.exceptions.Timeout; `scraper.exceptions` does not exist
    except requests.exceptions.Timeout:
        with open(html_timeout_dir + path.split('/')[-1], 'a', encoding='utf-8') as f:
            f.write(url)
with concurrent.futures.ThreadPoolExecutor() as e:
    for word in words:
        e.submit(get_html, word)
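# Note: a bare ThreadPoolExecutor() chooses its own worker count; if the site
# starts throttling, capping it is the usual knob, e.g.
# concurrent.futures.ThreadPoolExecutor(max_workers=8) (the 8 is a guess,
# not a tested value for this site).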