# -*- coding: utf-8 -*-

import os.path
from os import path
import time
from playwright.sync_api import sync_playwright

# Fetch every URL listed in "address3.txt" (one per line) with a headless
# Chromium browser and save the rendered HTML of line number i to "<i>.html"
# in the current working directory. Pages that were already saved are skipped,
# so the script can be re-run to resume an interrupted crawl.
with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()

    # Open the address list in a context manager so the handle is closed
    # once the crawl finishes or aborts.
    with open("address3.txt") as address_file:
        # Enumerate every line (blank ones included) so that output file
        # numbers always match the line positions in the address list.
        for i, line in enumerate(address_file):
            filename = str(i) + ".html"  # output file name for this line
            line = line.strip()  # drop surrounding whitespace / newline

            # A blank line holds no URL; navigating to "" would only raise
            # and trigger the 60-second back-off below, so skip it outright.
            if not line:
                continue

            # Skip pages that have already been saved.
            if path.exists(filename):
                continue

            try:
                page.goto(line)

                # Wait 2 seconds so dynamically rendered pages can be
                # captured as well.
                time.sleep(2)

                # Read the rendered page content.
                content = page.content()
                # Report progress: line index, URL and content length.
                print('current: ', i, line, len(content))

                # Save the page to its file.
                with open(filename, "w", encoding='utf-8') as f:
                    f.write(content)
            except Exception as e:
                # Best-effort crawl: report the error, back off for
                # 60 seconds, then carry on with the next address.
                print(e)
                time.sleep(60)

    browser.close()