html文件都抓取好了,那么如何批量处理呢?
我尝试过方法
import os
from bs4 import BeautifulSoup
# 输入文件夹路径
input_folder = "output"
# 输出文件夹路径
output_folder = "output_new"
# 确保输出文件夹存在
if not os.path.exists(output_folder):
os.makedirs(output_folder)
# 遍历文件夹中的HTML文件
for filename in os.listdir(input_folder):
if filename.endswith(".html"):
# 构建输入文件的完整路径
input_file_path = os.path.join(input_folder, filename)
# 读取HTML文件内容
with open(input_file_path, "r", encoding="utf-8") as file:
html_content = file.read()
# 使用Beautiful Soup解析HTML
soup = BeautifulSoup(html_content, "html.parser")
# 找到#root元素
root_element = soup.select_one("#root")
# 如果找到了#root元素
if root_element:
# 删除#root同级和上级的元素
siblings = root_element.find_parents() + root_element.find_all_next() + root_element.find_all_previous()
for sibling in siblings:
sibling.decompose()
# 保留#root下的特定子元素
valid_tags = [
'div.v0FDaSYd',
'div:nth-child(5) > div > div.jjO2Nc7v > div.mfFXfdZK',
'div:nth-child(5) > div > div.Vx8ajdmK > div',
'div.XJmTj2oN',
'div:nth-child(8) > div.MMRp6QwT',
'div:nth-child(8) > div:nth-child(4)',
'div:nth-child(8) > table'
]
for tag in valid_tags:
# 移除除了指定标签之外的所有子元素
for child in root_element.find_all(recursive=False):
try:
if not child.is_selector(tag):
child.decompose()
except AttributeError:
pass # 避免处理NoneType
try:
# 获取#root的内容,包含HTML标签
root_content = str(root_element)
except TypeError:
root_content = "" # 避免处理NoneType
# 构建输出文件的完整路径,将后缀改为txt
output_file_path = os.path.join(output_folder, filename.replace(".html", ".txt"))
# 将内容保存到输出文件
with open(output_file_path, "w", encoding="utf-8") as output_file:
output_file.write(root_content)
这不行,为什么?
output.zip (367.7 KB)