抓取词典数据的问题

Howie · 2023 年12 月 21 日 13:22

html文件都抓取好了，那么如何批量处理呢？
我尝试过方法

import os
from bs4 import BeautifulSoup

# 输入文件夹路径
input_folder = "output"

# 输出文件夹路径
output_folder = "output_new"

# 确保输出文件夹存在
if not os.path.exists(output_folder):
    os.makedirs(output_folder)

# 遍历文件夹中的HTML文件
for filename in os.listdir(input_folder):
    if filename.endswith(".html"):
        # 构建输入文件的完整路径
        input_file_path = os.path.join(input_folder, filename)

        # 读取HTML文件内容
        with open(input_file_path, "r", encoding="utf-8") as file:
            html_content = file.read()

        # 使用Beautiful Soup解析HTML
        soup = BeautifulSoup(html_content, "html.parser")

        # 找到#root元素
        root_element = soup.select_one("#root")

        # 如果找到了#root元素
        if root_element:
            # 删除#root同级和上级的元素
            siblings = root_element.find_parents() + root_element.find_all_next() + root_element.find_all_previous()
            for sibling in siblings:
                sibling.decompose()

            # 保留#root下的特定子元素
            valid_tags = [
                'div.v0FDaSYd',
                'div:nth-child(5) > div > div.jjO2Nc7v > div.mfFXfdZK',
                'div:nth-child(5) > div > div.Vx8ajdmK > div',
                'div.XJmTj2oN',
                'div:nth-child(8) > div.MMRp6QwT',
                'div:nth-child(8) > div:nth-child(4)',
                'div:nth-child(8) > table'
            ]

            for tag in valid_tags:
                # 移除除了指定标签之外的所有子元素
                for child in root_element.find_all(recursive=False):
                    try:
                        if not child.is_selector(tag):
                            child.decompose()
                    except AttributeError:
                        pass  # 避免处理NoneType

            try:
                # 获取#root的内容，包含HTML标签
                root_content = str(root_element)
            except TypeError:
                root_content = ""  # 避免处理NoneType

            # 构建输出文件的完整路径，将后缀改为txt
            output_file_path = os.path.join(output_folder, filename.replace(".html", ".txt"))

            # 将内容保存到输出文件
            with open(output_file_path, "w", encoding="utf-8") as output_file:
                output_file.write(root_content)

这不行，为什么？
output.zip (367.7 KB)