# -*- coding: utf-8 -*-

import os.path
from os import path
import time
from playwright.sync_api import sync_playwright

# Fetch every URL listed in "address3.txt" (one per line) with headless
# Chromium and save the rendered HTML as "<line index>.html" in the current
# directory. Lines whose output file already exists are skipped, so an
# interrupted run can simply be restarted.
with sync_playwright() as p:
    browser = p.chromium.launch()
    try:
        page = browser.new_page()

        # Read the address list inside a context manager so the handle is
        # closed deterministically. "utf-8-sig" decodes plain UTF-8/ASCII and
        # also drops a leading BOM, which would otherwise end up glued to the
        # first URL (str.strip() does not remove it).
        with open("address3.txt", encoding="utf-8-sig") as address_file:
            for i, line in enumerate(address_file):
                # Output name is derived from the line index, so numbering
                # stays stable even when some lines are skipped.
                filename = str(i) + ".html"
                line = line.strip()  # drop surrounding whitespace / newline

                # A blank line is not a URL; navigating to "" would raise.
                if not line:
                    continue

                # Already downloaded on a previous run: skip.
                if path.exists(filename):
                    continue

                page.goto(line)

                # Wait 2 seconds so dynamically rendered pages have time to
                # populate before the DOM is captured.
                time.sleep(2)

                # Serialized HTML of the page as currently rendered.
                content = page.content()
                # Progress output: line index, URL, and size of the content.
                print('current: ', i, line, len(content))

                # Persist the page to disk.
                with open(filename, "w", encoding='utf-8') as f:
                    f.write(content)
    finally:
        # Close the browser even if a navigation or write fails part-way.
        browser.close()
