I downloaded a batch of web pages and am using Python to extract part of each page, then merging the extracts into a single file. The merged output has far fewer entries than the number of files: there are 100,000-odd files but only about 30,000 merged entries, and no error is raised anywhere in between. I can't figure out why.
import os
import re
w_path = "F:/1"
w_files = os.listdir(w_path)

# One pattern for the title, the heading block, and the body; compiled once outside the loop.
obj = re.compile(
    r'<title>(?P<bi>.*?) - 维基百科,自由的百科全书</title>.*?'
    r'<div class="pre-content heading-holder">(?P<h1>.*?)<nav class="page-actions-menu">.*?'
    r'<div id="mw-content-subtitle" lang="zh-Hans-CN" dir="ltr">(?P<neirong>.*?)<div class="post-content footer-content">',
    re.S)

# Open the merged output once instead of reopening it in append mode for every file.
with open("F:/ceshi/weiji1.txt", 'a', encoding='utf-8') as f:
    for w_file in w_files:
        with open(os.path.join(w_path, w_file), encoding="utf-8") as f1:
            w_txt = f1.read()
        # If the pattern does not match this page, finditer yields nothing,
        # so the file silently contributes no entry to the output.
        for it in obj.finditer(w_txt):
            biaoti = it.group("bi")
            name = it.group("h1")
            neirong = it.group("neirong")
            f.write(f'{biaoti}\n{name}\n{neirong}\n</>\n')
        print(f'processed {w_file}')
print('done')
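Since no exception is raised, the most likely cause is that the regex simply finds no match in many of the files: finditer then yields nothing, nothing is written, and the file is skipped silently. Hard-coded details in the pattern, such as lang="zh-Hans-CN" (other pages may be served as zh-Hant-TW or another variant) or the exact <title> suffix, are typical culprits. Below is a minimal diagnostic sketch, assuming the same F:/1 layout as above (the no_match.txt output name is made up for illustration), that counts the files where the pattern matches nothing:

import os
import re

w_path = "F:/1"
# Same pattern as in the script above, compiled once.
obj = re.compile(
    r'<title>(?P<bi>.*?) - 维基百科,自由的百科全书</title>.*?'
    r'<div class="pre-content heading-holder">(?P<h1>.*?)<nav class="page-actions-menu">.*?'
    r'<div id="mw-content-subtitle" lang="zh-Hans-CN" dir="ltr">(?P<neirong>.*?)<div class="post-content footer-content">',
    re.S)

no_match = []
for w_file in os.listdir(w_path):
    with open(os.path.join(w_path, w_file), encoding="utf-8") as f1:
        if obj.search(f1.read()) is None:  # this file would produce no merged entry
            no_match.append(w_file)

print(f'{len(no_match)} files produce no match')
# Dump the names so a few of these pages can be opened and compared against the pattern.
with open("F:/ceshi/no_match.txt", "w", encoding="utf-8") as out:
    out.write("\n".join(no_match))

If the unmatched count is roughly the 70,000-odd gap, the pattern is the problem; opening a few of the listed pages and comparing them against the three anchors in the regex will show which anchor is too strict.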