from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError, ViewportSize
import time
import random
import os
import re
from urllib.parse import urlparse


# Resume support: load the set of detail-page URLs that were already processed
def get_processed_detail_urls():
    """Return the set of URLs recorded in processed_details.txt (empty set if the file is absent)."""
    if not os.path.exists("processed_details.txt"):
        return set()
    with open("processed_details.txt", "r", encoding="utf-8") as handle:
        return {line.strip() for line in handle}


# Record a processed detail-page URL for resume support
def save_processed_detail_url(url):
    """Append one processed detail-page URL to processed_details.txt."""
    with open("processed_details.txt", "a", encoding="utf-8") as handle:
        handle.write(f"{url}\n")


# Load already-saved model URLs (used for de-duplication)
def get_existing_model_urls():
    """Return the set of model URLs already stored in kachezhijia_xiangqing_url.txt."""
    path = "kachezhijia_xiangqing_url.txt"
    if not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf-8") as handle:
        return {line.strip() for line in handle}


# Persist model URLs to file immediately (skipping any already saved)
def save_model_urls_immediately(urls):
    """Append the not-yet-saved URLs from *urls* to kachezhijia_xiangqing_url.txt right away."""
    if not urls:
        return
    known = get_existing_model_urls()
    fresh = [u for u in urls if u not in known]
    if not fresh:
        return
    with open("kachezhijia_xiangqing_url.txt", "a", encoding="utf-8") as handle:
        handle.writelines(u + "\n" for u in fresh)
    print(f"✅ 已立即保存{len(fresh)}个新车型链接到kachezhijia_xiangqing_url.txt")


# Random desktop User-Agent rotation
def generate_random_user_agent():
    """Return one desktop browser User-Agent string chosen at random."""
    candidates = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.60",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5 Safari/605.1.15",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0",
    )
    return random.choice(candidates)


# Randomized pause between actions (anti-bot pacing)
def random_delay(min_seconds=1, max_seconds=3):
    """Sleep for a random duration drawn uniformly from [min_seconds, max_seconds]."""
    duration = random.uniform(min_seconds, max_seconds)
    time.sleep(duration)


# Simulate human-like scrolling (ensures lazy-loaded images appear)
def human_like_scroll(page, target_selector=None):
    """Scroll the page the way a person would.

    If *target_selector* is given and found, scroll that element into view and
    return early. Otherwise perform a few randomized downward scrolls, with an
    occasional partial scroll-back to mimic a real user re-checking content.
    """
    if target_selector:
        try:
            element = page.query_selector(target_selector)
            if element:
                print(f"📜 滚动到目标元素: {target_selector}")
                element.scroll_into_view_if_needed()
                random_delay(1, 2)
                return
        except Exception:
            # BUG FIX: was a bare `except:` — now only swallows real errors from
            # the lookup/scroll, falling through to generic scrolling below.
            pass

    scroll_count = random.randint(2, 6)
    for i in range(scroll_count):
        scroll_distance = random.randint(800, 1990)
        print(f"📜 模拟滚动第{i + 1}次，距离: {scroll_distance}px")
        page.evaluate(f"window.scrollBy(0, {scroll_distance})")
        random_delay(0.8, 1.5)

    # ~40% of the time, scroll part-way back up like a human would.
    if random.random() > 0.6:
        rollback_distance = random.randint(500, 1500)
        print(f"📜 滚动回退: {rollback_distance}px")
        page.evaluate(f"window.scrollBy(0, -{rollback_distance})")
        random_delay(0.5, 1)


# Detect and handle slide captchas
def handle_captcha(page):
    """Check for known captcha containers and delegate to the slide solver.

    Returns True when no captcha is present (or detection errors out),
    otherwise the result of process_slide_captcha().
    """
    captcha_selectors = [".captcha-container", "#slide-captcha", ".geetest_panel"]
    try:
        for selector in captcha_selectors:
            # BUG FIX: query_selector() accepts no `timeout` kwarg; the old
            # `timeout=9000` raised TypeError, which the bare except swallowed,
            # silently disabling captcha detection altogether.
            if page.query_selector(selector):
                print(f"⚠️  检测到验证元素: {selector}，尝试自动处理")
                return process_slide_captcha(page, selector)
        print("✅ 未检测到滑动验证")
        return True
    except Exception:
        print("✅ 验证检测超时，默认无验证")
        return True


# Slide-captcha handling
def process_slide_captcha(page, selector):
    """Try to solve a slide captcha by dragging the slider along its track
    using a decelerating, slightly jittery mouse trajectory.

    Returns True when the captcha container is gone afterwards, False when
    the slider elements are missing, the slide fails, or an error occurs.
    """
    try:
        page.wait_for_selector(selector, state="visible", timeout=9000)
        random_delay(1, 2)

        slider = page.query_selector(f"{selector} .slider")
        track = page.query_selector(f"{selector} .track")
        if not slider or not track:
            print("❌ 未找到滑块元素，需手动处理验证")
            return False

        slider_bbox = slider.bounding_box()
        track_bbox = track.bounding_box()
        if not slider_bbox or not track_bbox:
            print("❌ 无法获取验证元素位置")
            return False

        # Drag distance; fall back to a plausible random value if it looks wrong.
        distance = track_bbox["width"] - slider_bbox["width"]
        if distance < 50:
            distance = random.randint(150, 300)

        # Build a decelerating trajectory: big steps first, smaller near the end.
        trajectory = []
        current = 0
        while current < distance:
            # BUG FIX: guarantee forward progress — int() could floor the step
            # to 0 near the end of the track, leaving this loop spinning forever.
            step = max(1, int(random.uniform(5, 25) * (1 - current / distance)))
            step = min(step, distance - current)
            trajectory.append(step)
            current += step

        print(f"📜 开始模拟滑动验证，总距离: {distance}px")
        page.mouse.move(
            slider_bbox["x"] + slider_bbox["width"] / 2,
            slider_bbox["y"] + slider_bbox["height"] / 2
        )
        page.mouse.down()
        random_delay(0.1, 0.3)

        # Replay the trajectory with slight vertical jitter, like a human hand.
        for i, step in enumerate(trajectory):
            y_offset = random.randint(-3, 3)
            page.mouse.move(
                slider_bbox["x"] + slider_bbox["width"] / 2 + sum(trajectory[:i + 1]),
                slider_bbox["y"] + slider_bbox["height"] / 2 + y_offset,
                delay=random.randint(50, 120)
            )

        page.mouse.up()
        random_delay(1.5, 2.5)

        # Success means the captcha container disappeared.
        # BUG FIX: query_selector() takes no `timeout` kwarg — the old
        # `timeout=9000` raised TypeError, so success was never reported.
        if not page.query_selector(selector):
            print("✅ 滑动验证处理成功")
            return True
        else:
            print("❌ 滑动验证处理失败，需手动干预")
            return False
    except Exception:
        print("❌ 验证处理异常")
        return False


# Download an image, mirroring the URL's directory structure; skip files already on disk
def download_image(page, img_url):
    """Download *img_url* through the page's request context into
    ./images/<host>/<path>, upgrading thumbnail URLs to high-res first.

    Returns a tuple (local_path_or_None, original_url, final_url); local_path
    is None when the download failed.
    """
    original_url = img_url
    try:
        # Upgrade thumbnail URLs to the high-resolution variant.
        if "_360x240.jpg" in img_url:
            img_url = img_url.replace("_360x240.jpg", "_1200x800.jpg")
            print(f"📷 图片URL替换: {original_url[:50]}... -> {img_url[:50]}...")

        # Mirror the URL's host + path under ./images (compatible with the old layout).
        parsed_img = urlparse(img_url)
        img_full_path = os.path.join("images", parsed_img.netloc, parsed_img.path.lstrip("/"))
        save_dir = os.path.dirname(img_full_path)
        os.makedirs(save_dir, exist_ok=True)  # existing dirs are fine (keeps old images)

        # Skip files that already exist to avoid re-downloading.
        if os.path.exists(img_full_path):
            print(f"📁 图片已存在，跳过下载: {img_full_path.split(os.sep)[-1]}")
            return img_full_path, original_url, img_url

        # Not on disk yet — download it.
        print(f"📁 图片保存路径: {img_full_path}")
        response = page.request.get(img_url, timeout=95000)
        # BUG FIX: verify the HTTP status before writing — previously a 404/500
        # error body was silently saved to disk as the image file.
        if not response.ok:
            raise RuntimeError(f"HTTP {response.status}")
        with open(img_full_path, "wb") as f:
            f.write(response.body())

        print(f"✅ 图片下载完成: {img_full_path.split(os.sep)[-1]}")
        return img_full_path, original_url, img_url
    except Exception as e:
        print(f"❌ 图片下载失败 {img_url[:50]}...: {str(e)[:50]}")
        return None, original_url, img_url


# Helper: get an element's full outer HTML (compatible with older Playwright versions)
def get_element_full_html(page, element, selector):
    """Return the element's outer HTML.

    Tries element.outer_html() first; if the handle has no such method
    (AttributeError), the HTML is rebuilt manually from the tag name,
    attributes (collected via a JS evaluate call), and inner HTML.

    NOTE(review): the `selector` parameter is unused here — presumably kept
    for call-site symmetry; confirm before removing.
    """
    try:
        return element.outer_html()
    except AttributeError:
        # Fallback: reconstruct <tag attr="v">inner</tag> by hand.
        tag_name = page.evaluate("el => el.tagName.toLowerCase()", element)
        attributes = page.evaluate("""
            el => {
                const attrs = {};
                for (let i = 0; i < el.attributes.length; i++) {
                    const attr = el.attributes[i];
                    attrs[attr.name] = attr.value;
                }
                return attrs;
            }
        """, element)
        attr_str = ""
        for key, value in attributes.items():
            attr_str += f' {key}="{value}"'
        inner_html = element.inner_html()
        return f"<{tag_name}{attr_str}>{inner_html}</{tag_name}>"


# Extract the parameter-configuration page content
def extract_param_content(context, param_url, base_url):
    """Open the spec/parameter page, strip irrelevant page chrome, and return
    the HTML of .compare-wrapper plus the note <div> that follows it.

    Returns "" on captcha failure, missing content, or any error.

    BUG FIX: the tab opened here is now always closed via `finally` — the old
    code leaked the page whenever an exception escaped before the close call.
    """
    param_page = None
    try:
        print(f"\n📋 开始提取参数配置: {param_url[:50]}...")
        param_page = context.new_page()
        param_page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
            window.chrome = { app: { isInstalled: false }, runtime: {} };
        """)

        param_page.goto(param_url, timeout=90000)
        random_delay(2, 3)
        if not handle_captcha(param_page):
            print("❌ 验证失败，跳过参数配置提取")
            return ""

        # Remove unrelated page chrome (nav bars, footers, price widgets) and
        # unwrap all <a> tags so only their text remains.
        print("📜 开始清理参数页面无关标签")
        delete_scripts = """
            const toRemove = [
                '.shortcut-content', '.suspend', '.sign', '.foot-tips', '#foot-price',
                '.m-footer-bar', '#back_top', '.top', '.new-nav-list', '#truck_choose', '.consult'
            ];
            toRemove.forEach(selector => {
                document.querySelectorAll(selector).forEach(el => el.remove());
            });
            document.querySelectorAll('a').forEach(a => {
                if (a.textContent.includes('询底价')) a.remove();
            });
            document.querySelectorAll('a').forEach(a => {
                const parent = a.parentNode;
                while (a.firstChild) parent.insertBefore(a.firstChild, a);
                parent.removeChild(a);
            });
        """
        param_page.evaluate(delete_scripts)
        random_delay(1, 2)
        print("✅ 无关标签清理完成")

        # Grab the full HTML of the .compare-wrapper element.
        print("📜 提取.compare-wrapper标签内容")
        compare_wrapper = param_page.query_selector(".compare-wrapper")
        if not compare_wrapper:
            print("❌ 未找到.compare-wrapper标签")
            return ""

        param_content = get_element_full_html(param_page, compare_wrapper, ".compare-wrapper")

        # Find the following sibling <div> with background rgb(250,250,250)
        # (the notes block) and serialize it.
        print("📜 提取注释放文本标签")
        next_sibling_data = param_page.evaluate("""
            (el) => {
                let sibling = el.nextElementSibling;
                while (sibling) {
                    if (sibling.tagName === 'DIV' && sibling.style.backgroundColor === 'rgb(250, 250, 250)') {
                        const attrs = {};
                        for (let i = 0; i < sibling.attributes.length; i++) {
                            const attr = sibling.attributes[i];
                            attrs[attr.name] = attr.value;
                        }
                        return {
                            tag: sibling.tagName.toLowerCase(),
                            attrs: attrs,
                            innerHtml: sibling.innerHTML
                        };
                    }
                    sibling = sibling.nextElementSibling;
                }
                return null;
            }
        """, compare_wrapper)

        if next_sibling_data:
            attr_str = ""
            for key, value in next_sibling_data["attrs"].items():
                attr_str += f' {key}="{value}"'
            next_sibling_html = f"<{next_sibling_data['tag']}{attr_str}>{next_sibling_data['innerHtml']}</{next_sibling_data['tag']}>"
            param_content += "\n" + next_sibling_html
            print("✅ 注释放文本标签提取完成")
        else:
            print("⚠️  未找到注释放文本标签")

        print(f"✅ 参数配置提取完成，总长度: {len(param_content)}字符")
        return param_content
    except Exception as e:
        print(f"❌ 提取参数配置失败 {param_url[:50]}...: {str(e)[:50]}")
        return ""
    finally:
        # Always release the tab, including the error paths that previously leaked it.
        if param_page is not None:
            try:
                param_page.close()
            except Exception:
                pass


# Helper: read the p-number from the last gallery <li> (e.g. "#p46" -> 46)
def get_last_li_p_number(page):
    """Extract the trailing p-number from the last image <li>'s anchor href.

    Returns the integer after '#p' (e.g. 46 for '...#p46'), or 0 when the
    list, anchor, href, or number is missing, or on any error.
    """
    try:
        items = page.query_selector_all("section.photo#quote .image-type li")
        if not items:
            return 0

        # Only the final <li> matters — its anchor encodes the running total.
        anchor = items[-1].query_selector("a")
        if anchor is None:
            return 0

        href = anchor.get_attribute("href")
        if not href:
            return 0

        match = re.search(r'#p(\d+)', href)
        if not match:
            print("📜 最后一个li标签的href中未找到p数字")
            return 0

        number = int(match.group(1))
        print(f"📜 最后一个li标签的p数字: {number}")
        return number
    except Exception as e:
        print(f"❌ 获取p数字失败: {str(e)[:50]}")
        return 0


# Core optimization: supports p-number verification + very large image counts (6000+)
def scroll_with_mouse_wheel(page, expected_count):
    """Scroll the gallery with mouse-wheel events until either the number of
    fully-loaded images or the last <li>'s p-number reaches expected_count.

    Returns True when either completion condition is met, False when the
    maximum number of attempts is exhausted first.
    """
    # Distinguish normal galleries from very large ones (3000+ images).
    is_large_quantity = expected_count >= 3000
    max_attempts = 410 if is_large_quantity else 350  # more attempts for very large galleries
    attempts = 0
    no_increase_count = 0  # consecutive rounds with no progress
    max_no_increase = 4 if is_large_quantity else 3  # relaxed stall limit for large galleries
    last_loaded_count = 0
    last_p_number = 0

    print(f"📜 开始滚动加载 - 目标数量: {expected_count}张，{'超大图片量模式' if is_large_quantity else '普通模式'}")

    # Locate the root image container; fall back to <body> if missing.
    photo_section = page.query_selector("section.photo#quote")
    if not photo_section:
        print("⚠️  未找到<section class='photo' id='quote'>，使用默认页面滚动")
        photo_section = page.query_selector("body")

    # Bring the image container into the viewport before scrolling.
    image_type = page.query_selector("section.photo#quote .image-type")
    if image_type:
        print("📜 将<div class='image-type'>滚动到可视区域")
        image_type.scroll_into_view_if_needed()
        random_delay(2, 4)  # longer initialization wait

    # Move the mouse to the container center (more realistic interaction).
    bbox = photo_section.bounding_box()
    if bbox:
        center_x = bbox["x"] + bbox["width"] // 2
        center_y = bbox["y"] + bbox["height"] // 2
        print(f"📜 鼠标移动到容器中心: ({center_x}, {center_y})")
        page.mouse.move(center_x, center_y)
        random_delay(2, 3)

    while attempts < max_attempts:
        # 1. Count images that are present AND fully decoded.
        current_images = page.query_selector_all("section.photo#quote .image-type img")
        loaded_count = page.evaluate("""
            (imgs) => imgs.filter(img => img.complete && img.naturalWidth > 0).length
        """, current_images)

        # 2. Read the p-number of the last <li>.
        current_p_number = get_last_li_p_number(page)

        print(f"📜 第{attempts + 1}次尝试 - 已加载{loaded_count}/{expected_count}张，p数字: {current_p_number}")

        # Dual check: either condition counts as "fully loaded".
        condition1 = loaded_count >= expected_count  # image count reached
        condition2 = current_p_number >= expected_count  # p-number reached (core addition)
        if condition1 or condition2:
            print(f"✅ 图片加载完成 - 条件达标: {'数量达标' if condition1 else 'p数字达标'}")
            return True

        # 3. Handle stalled progress.
        if loaded_count == last_loaded_count and current_p_number == last_p_number:
            no_increase_count += 1
            print(f"⚠️  连续{no_increase_count}次加载数量和p数字均未增长")

            # Force an up-then-down sweep (stronger for very large galleries).
            if no_increase_count >= max_no_increase:
                print(f"📜 触发强制上滑下滑（连续{max_no_increase}次未增长）")
                # Container scroll height (large galleries sweep a bigger fraction).
                scroll_height = page.evaluate("""
                    () => document.querySelector('section.photo#quote .image-type').scrollHeight || document.body.scrollHeight
                """)
                up_ratio = 0.6 if is_large_quantity else 0.4  # large galleries scroll up 60%
                up_distance = int(scroll_height * up_ratio)
                print(f"📜 向上滑动{up_distance}px（{'超大图片量增强版' if is_large_quantity else '普通版'}）")
                page.mouse.wheel(0, -up_distance)
                random_delay(2, 4)  # longer wait after the up-scroll for large galleries

                # Scroll down past the original position; many small steps.
                down_distance = up_distance + (
                    random.randint(1500, 3000) if is_large_quantity else random.randint(800, 1500))
                print(f"📜 向下滑动{down_distance}px（分多步）")
                step_count = 10 if is_large_quantity else 5  # 10 steps for large galleries, 5 otherwise
                step_size = down_distance // step_count
                for i in range(step_count):
                    page.mouse.wheel(0, step_size + random.randint(50, 100))
                    random_delay(1, 2)  # pause after each step

                # Small jiggle + pause to trigger any remaining lazy loads.
                print("📜 轻微抖动页面，触发剩余图片加载")
                for _ in range(10 if is_large_quantity else 6):
                    page.mouse.wheel(0, random.randint(100, 200))
                    random_delay(0.5, 1)
                    page.mouse.wheel(0, -random.randint(50, 100))
                    random_delay(0.5, 1)

                no_increase_count = 0  # reset the stall counter
        else:
            no_increase_count = 0  # progress was made — reset the stall counter

        # 4. Regular scrolling.
        # NOTE(review): the "large" branch below uses MORE steps but a SMALLER
        # wheel delta than the normal branch — possibly inverted; confirm intent.
        if no_increase_count < max_no_increase:
            wheel_steps = random.randint(5, 8) if is_large_quantity else random.randint(2, 4)
            for _ in range(wheel_steps):
                delta_y = random.randint(8000, 55000) if is_large_quantity else random.randint(12000, 88000)
                page.mouse.wheel(0, delta_y)
                print(f"📜 常规滚动 - 滚动力度: {delta_y}（{'超大力度' if is_large_quantity else '常规力度'}）")
                random_delay(1.5, 2.5) if is_large_quantity else random_delay(0.8, 1.5)

        # 5. Wait for images to load, growing +1s every 10 attempts.
        base_wait = 8 if is_large_quantity else 3
        wait_time = base_wait + (attempts // 10) * 1  # original comment claimed +2s; code adds 1
        # NOTE(review): the cap below is 6s, so base_wait=8 never applies and the
        # per-attempt growth is dead code; the original comment claimed a 30s cap
        # — confirm the intended maximum.
        wait_time = min(wait_time, 6)
        print(f"📜 等待图片加载: {wait_time:.1f}秒（{'超大图片量延长等待' if is_large_quantity else '常规等待'}）")
        random_delay(wait_time, wait_time)

        # 6. Periodically jump to the page bottom (essential for large galleries).
        if attempts % 10 == 0:
            print("📜 滚动到页面底部，强制触发所有懒加载")
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            random_delay(5, 8) if is_large_quantity else random_delay(3, 5)

        # Update loop state for the next stall check.
        last_loaded_count = loaded_count
        last_p_number = current_p_number
        attempts += 1

    # Final check: even after max attempts, re-evaluate both conditions.
    # NOTE(review): this recount uses raw <img> count, not the complete/naturalWidth
    # filter applied inside the loop — confirm that looser criterion is intended.
    final_loaded = page.evaluate("""
        () => document.querySelectorAll("section.photo#quote .image-type img").length
    """)
    final_p_number = get_last_li_p_number(page)
    final_condition = final_loaded >= expected_count or final_p_number >= expected_count
    print(f"⚠️  达到最大尝试次数 - 实际加载{final_loaded}/{expected_count}张，p数字: {final_p_number}")
    return final_condition


# [Core fix] Extract image-list content (resolves undefined variables, wrongly skipped containers, invalid supplemental loading)
def extract_image_content(context, pic_url, base_url):
    """Walk every valid image category of a picture-list page, scroll-load and
    download each category's images, and return one dict per processed
    container: {"category", "html" (thumbnail URLs upgraded to high-res),
    "downloaded" local paths, "processed_img_count",
    "total_category_img_count"}.

    Returns [] on captcha failure, missing navigation/containers, or error.
    """
    try:
        print(f"\n" + "=" * 50)
        print(f"📋 开始提取图片列表: {pic_url[:50]}...")
        print("=" * 50)

        # 1. Open the picture-list page with anti-detection init scripts.
        pic_page = context.new_page()
        pic_page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
            Object.defineProperty(navigator, 'platform', { get: () => 'Win32' });
        """)

        # 2. Navigate to the list page (extended timeout for heavy resources).
        pic_page.goto(pic_url, timeout=120000)  # 2-minute timeout
        random_delay(3, 8)  # initial wait so the page finishes loading
        # Handle a captcha if one appears.
        if not handle_captcha(pic_page):
            pic_page.close()
            print("❌ 验证码验证失败，跳过当前图片列表提取")
            return []

        # 3. Locate the image-category navigation (e.g. exterior / interior / panorama).
        style_nav = pic_page.query_selector("nav.style-type")
        if not style_nav:
            pic_page.close()
            print("❌ 未找到图片分类导航标签（nav.style-type），提取终止")
            return []

        # 4. Filter valid category links (skip "panorama" and "video").
        nav_links = style_nav.query_selector_all("div a")
        valid_links = []
        has_panorama = False  # whether a "panorama" category exists (affects container indexing later)

        for link in nav_links:
            link_text = link.text_content().strip()
            link_href = link.get_attribute("href")

            # Skip the "panorama" category (its container may need skipping later).
            if link_text == "全景":
                has_panorama = True
                print(f"📜 检测到【全景】分类，标记后续容器处理规则")
                continue
            # Skip the "video" category (not an image resource).
            if link_text == "视频":
                print(f"📜 检测到【视频】分类，跳过非图片资源处理")
                continue
            # Resolve relative links against the base URL.
            if link_href:
                if link_href.startswith("/"):
                    parsed_base = urlparse(base_url)
                    link_href = f"{parsed_base.scheme}://{parsed_base.netloc}{link_href}"
                valid_links.append((link_text, link_href))
                print(f"📜 有效图片分类: {link_text} -> {link_href[:50]}...")

        # Stop when no valid category remains.
        if not valid_links:
            pic_page.close()
            print("❌ 未筛选出有效图片分类链接，提取终止")
            return []
        print(f"\n✅ 共筛选出 {len(valid_links)} 个有效图片分类")

        # 5. Process each image category (core logic).
        image_contents = []  # final list returned to the caller
        for idx, (category, category_url) in enumerate(valid_links):
            try:
                print(f"\n" + "-" * 50)
                print(f"📋 开始处理分类 {idx + 1}/{len(valid_links)}: 【{category}】")
                print(f"🔗 分类链接: {category_url[:50]}...")
                print("-" * 50)

                # 5.1 Open the category page.
                img_page = context.new_page()
                img_page.add_init_script("""
                    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
                """)
                img_page.goto(category_url, timeout=120000)  # 2-minute timeout
                random_delay(3, 8)  # wait for the category page to load
                # Handle a captcha on the category page.
                if not handle_captcha(img_page):
                    img_page.close()
                    print(f"❌ 【{category}】分类页验证码失败，跳过该分类")
                    continue

                # 5.2 Read the expected image count from the header's <em> tag.
                expected_count = 0
                header_em = img_page.query_selector("header h4 em")
                if header_em:
                    em_text = header_em.text_content().strip()
                    num_match = re.search(r'\d+', em_text)  # first run of digits
                    if num_match:
                        expected_count = int(num_match.group())
                        print(f"✅ 从<em>标签获取预期图片数量: {expected_count}张")
                if expected_count == 0:
                    print("⚠️  未找到有效预期图片数量（<em>标签缺失或无数字），使用默认滚动策略")

                # 5.3 Scroll-load the images (via the optimized scroll routine).
                print(f"📜 开始滚动加载【{category}】分类图片")
                scroll_success = False
                if expected_count > 0:
                    scroll_success = scroll_with_mouse_wheel(img_page, expected_count)
                    if not scroll_success:
                        print(f"⚠️  基础滚动未加载到预期数量，执行深度补充滚动")
                        # Very large galleries (>=3000): scroll to the bottom 3 times
                        # instead of once, to avoid a single-shot load failure.
                        if expected_count >= 3000:
                            for i in range(3):
                                img_page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                                print(f"📜 超大图片量补充滚动: 第{i + 1}次滚动到底部")
                                random_delay(8, 12)
                        else:
                            img_page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                            random_delay(5, 8)
                else:
                    # Unknown expected count: enhanced randomized scrolling.
                    viewport = img_page.viewport_size
                    if viewport:
                        # Move the mouse to the page center first, mimicking a human.
                        center_x = viewport["width"] // 2
                        center_y = viewport["height"] // 2
                        img_page.mouse.move(center_x, center_y)
                        random_delay(2, 3)
                        # Several random scroll rounds to load all images.
                        for _ in range(random.randint(8, 12)):
                            img_page.mouse.wheel(0, random.randint(2000, 5000))  # scroll down
                            random_delay(1.5, 2.5)
                        # Final small up/down scrolls to trigger stragglers.
                        img_page.mouse.wheel(0, -1000)  # scroll back up
                        random_delay(2, 3)
                        img_page.mouse.wheel(0, 1500)  # scroll down again
                        random_delay(3, 5)
                # Final settle time so image resources finish loading.
                random_delay(5, 10)
                print(f"✅ 【{category}】分类图片滚动加载完成")

                # 5.4 Fetch the image containers (.image-type) and compute indexing
                # (core fix: never skip the only container).
                image_containers = img_page.query_selector_all(".image-type")
                if not image_containers:
                    img_page.close()
                    print(f"❌ 【{category}】分类未找到图片容器（.image-type），跳过该分类")
                    continue
                print(f"📜 【{category}】分类找到 {len(image_containers)} 个图片容器")

                # Key fix: only skip the first container when a panorama category
                # exists AND this is the first category AND there is >1 container.
                start_idx = 0
                if has_panorama and idx == 0:
                    if len(image_containers) > 1:
                        start_idx = 1
                        print(f"📜 因【全景】分类存在且容器数>1，起始索引设为1（跳过第一个容器）")
                    else:
                        start_idx = 0
                        print(f"📜 因【全景】分类存在但容器数=1，起始索引设为0（不跳过容器）")
                # Containers that will actually be processed.
                target_containers = image_containers[start_idx:]
                print(f"📜 【{category}】分类实际处理容器数量: {len(target_containers)}（起始索引: {start_idx}）")

                # 5.5 Download every container's images (core fix: downloaded paths
                # list is defined at category scope so supplemental loading can use it).
                total_loaded = 0  # images downloaded/skipped in this category
                category_downloaded = []  # category-level paths (scope covers supplemental loading)
                container_data_list = []  # per-container HTML + paths

                for container in target_containers:
                    # Bring the container into the viewport.
                    container.scroll_into_view_if_needed(timeout=10000)
                    random_delay(2, 4)

                    # Full container HTML (used later for image-URL replacement).
                    original_html = get_element_full_html(img_page, container, ".image-type")
                    modified_html = original_html  # HTML with high-res URLs swapped in

                    # All <img> tags inside the container.
                    img_tags = container.query_selector_all("img")
                    container_downloaded = []  # this container's saved paths
                    print(f"\n📜 容器内包含 {len(img_tags)} 张图片，开始处理")

                    for img_tag in img_tags:
                        img_src = img_tag.get_attribute("src")
                        # Only process absolute http(s) URLs.
                        if not img_src or not img_src.startswith(("http://", "https://")):
                            print(f"⚠️  无效图片URL，跳过: {img_src[:30]}...")
                            continue

                        # Download (returns: local path, original URL, high-res URL).
                        saved_path, original_img_url, modified_img_url = download_image(img_page, img_src)

                        # Swap thumbnail URLs in the HTML for high-res ones (360x240 -> 1200x800).
                        if "_360x240.jpg" in original_img_url:
                            modified_html = modified_html.replace(original_img_url, modified_img_url)
                            print(f"📷 图片URL替换: {original_img_url[:50]}... -> {modified_img_url[:50]}...")

                        # Record valid paths (downloaded now or already on disk).
                        if saved_path:
                            container_downloaded.append(saved_path)
                            category_downloaded.append(saved_path)
                            print(f"📁 图片已处理: {saved_path.split(os.sep)[-1]}")

                    # Update the running totals.
                    container_img_count = len(container_downloaded)
                    total_loaded += container_img_count
                    # Store this container's result.
                    container_data_list.append({
                        "html": modified_html,
                        "downloaded": container_downloaded,
                        "img_count": container_img_count
                    })
                    print(f"✅ 容器处理完成，累计处理 {container_img_count} 张图片")

                # 5.6 Count check + supplemental loading (core fix: skip pointless supplements).
                print(f"\n📊 【{category}】分类基础处理完成 - 预期{expected_count}张，实际处理{total_loaded}张")
                if expected_count > 0 and total_loaded < expected_count:
                    # Missing count excludes images already on disk (no re-downloads).
                    existing_count = len([p for p in category_downloaded if os.path.exists(p)])
                    actual_missing = expected_count - existing_count

                    if actual_missing <= 0:
                        print(f"✅ 无需补充加载：已存在图片数({existing_count}) ≥ 预期数量({expected_count})")
                    else:
                        print(f"📜 开始补充加载：还差 {actual_missing} 张未加载图片")
                        # Scroll again to trigger the remaining lazy loads.
                        img_page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                        random_delay(5, 10) if expected_count >= 3000 else random_delay(3, 5)
                        # Re-collect every image tag to pick up anything missed.
                        all_img_tags = img_page.query_selector_all("section.photo#quote .image-type img")
                        supplement_count = 0

                        for img_tag in all_img_tags:
                            img_src = img_tag.get_attribute("src")
                            if img_src and img_src.startswith(("http://", "https://")):
                                saved_path, _, modified_img_url = download_image(img_page, img_src)
                                # Only count images not recorded before.
                                if saved_path and saved_path not in category_downloaded:
                                    supplement_count += 1
                                    category_downloaded.append(saved_path)
                                    print(f"📥 补充加载成功: {saved_path.split(os.sep)[-1]}")

                        # Fold the supplement into the totals.
                        total_loaded += supplement_count
                        print(f"📊 补充加载完成 - 新增 {supplement_count} 张，累计处理 {total_loaded} 张")

                # 5.7 Assemble this category's final results.
                for container_data in container_data_list:
                    image_contents.append({
                        "category": category,
                        "html": container_data["html"],
                        "downloaded": container_data["downloaded"],
                        "processed_img_count": container_data["img_count"],
                        "total_category_img_count": total_loaded
                    })

                # 5.8 Close the category page and move to the next category.
                img_page.close()
                print(f"\n✅ 【{category}】分类处理完成（页面已关闭）")
                random_delay(3, 8)  # longer gap between categories to dodge anti-scraping

            except Exception as e:
                # NOTE(review): if the failure happened after img_page was opened,
                # that tab is not closed here and leaks until context teardown — confirm.
                print(f"\n❌ 【{category}】分类处理失败: {str(e)[:100]}")
                # Print the full traceback for debugging.
                import traceback
                traceback.print_exc()
                continue

        # 6. All categories done — close the list page.
        pic_page.close()
        print(f"\n" + "=" * 50)
        print(f"✅ 所有图片分类处理完成")
        print(f"📊 最终结果：共提取 {len(image_contents)} 个图片容器，涉及 {len(valid_links)} 个分类")
        print("=" * 50)
        return image_contents

    except Exception as e:
        print(f"\n❌ 图片列表整体提取失败（URL: {pic_url[:50]}...）: {str(e)[:100]}")
        import traceback
        traceback.print_exc()
        # Best-effort cleanup so pages are not leaked on error.
        # NOTE(review): pic_page/img_page may be unbound here (the NameError is
        # swallowed by the bare except below), and img_page may already be
        # closed from a completed loop iteration — confirm intended cleanup.
        try:
            pic_page.close()
            img_page.close()
        except:
            pass
        return []


# Process the product filter section and extract links (fixed "load more" button
# locating so that every model gets loaded; no reliance on data-v attributes)
def process_filter_section(main_page, url, filter_tag):
    """Extract every vehicle-model detail link from the product filter section.

    First loads all models visible in the page's initial filter state
    (在售 "on sale" or 停售 "discontinued") by repeatedly clicking the
    "load more" button.  If the page started in the on-sale state, it then
    opens the right-side popup, switches to the discontinued state, and
    loads those models too.

    Args:
        main_page: Playwright Page hosting the detail page.
        url: absolute URL of the detail page (used to absolutize relative hrefs).
        filter_tag: Playwright ElementHandle of the div.product-filters container.

    Returns:
        list[str]: de-duplicated absolute model links ending in "_index.html".
    """
    all_links = []
    print("📜 进入产品筛选区处理逻辑，优化加载更多按钮定位（不依赖data-v属性）")

    # -------------------------- Helper: extract model links from ul.model-list --------------------------
    def extract_links_from_model_list():
        model_list = filter_tag.query_selector("ul.model-list")
        if not model_list:
            print("❌ 未找到车型列表标签ul.model-list（基于类名定位）")
            return []

        li_tags = model_list.query_selector_all("li")
        if not li_tags:
            print("⚠️  车型列表中无li标签，返回空链接列表")
            return []

        links = []
        for li in li_tags:
            first_a = li.query_selector("a:first-of-type")
            if first_a:
                href = first_a.get_attribute("href")
                # Only keep model-index pages; absolutize root-relative hrefs
                # against the detail page's scheme/host.
                if href and href.strip().endswith("_index.html"):
                    if href.startswith("/"):
                        parsed_base = urlparse(url)
                        href = f"{parsed_base.scheme}://{parsed_base.netloc}{href.strip()}"
                    if href not in links:
                        links.append(href)
                        # Only log the first 5 links and the last one to keep logs short.
                        if len(links) <= 5 or len(links) == len(li_tags):
                            print(f"📜 提取有效车型链接: {href[:50]}...")
        print(f"📊 本次共提取{len(links)}个车型链接")
        return links

    # -------------------------- Helper: text of the currently selected filter tab --------------------------
    def get_current_selected_filter_text():
        visible_filter = filter_tag.query_selector("p.filter.visible")
        if visible_filter:
            text = visible_filter.text_content().strip()
            print(f"📜 当前选中的筛选标签文本: {text}")
            return text
        return ""

    # -------------------------- Helper: extract the discontinued-model count --------------------------
    def get_discontinued_car_count():
        car_num_tag = filter_tag.query_selector("p.car-num")
        if not car_num_tag:
            print("⚠️  未找到车型数量标签p.car-num")
            return 0

        strong_tag = car_num_tag.query_selector("strong")
        if strong_tag:
            num_text = strong_tag.text_content().strip()
            if num_text.isdigit():
                count = int(num_text)
                print(f"✅ 提取到停售车型总数: {count}款")
                return count
        print("⚠️  未从p.car-num中提取到有效数字")
        return 0

    # -------------------------- Helper: locate the "load more" button (core fix) --------------------------
    def find_load_more_button():
        """Locate the load-more button by class name + text only, without data-v attributes."""
        # Option 1: class + exact text (matches <p class="load-more">点击加载更多</p>)
        load_more = filter_tag.query_selector('p.load-more:has-text("点击加载更多")')
        if load_more:
            print("✅ 定位到加载更多按钮（基于类名+文本）")
            return load_more

        # Option 2: fall back to text only (tolerates a changed class name)
        load_more_text = filter_tag.query_selector('p:has-text("点击加载更多")')
        if load_more_text:
            print("✅ 降级定位到加载更多按钮（基于文本）")
            return load_more_text

        # Option 3: class + partial text (tolerates whitespace inside the label)
        load_more_contains = filter_tag.query_selector('p.load-more:has-text("加载更多")')
        if load_more_contains:
            print("✅ 兼容定位到加载更多按钮（基于类名+包含文本）")
            return load_more_contains

        print("❌ 未检测到加载更多按钮")
        return None

    # -------------------------- Step 1: detect the initial filter state --------------------------
    initial_selected_text = get_current_selected_filter_text()
    initial_car_num_text = ""
    car_num_tag = filter_tag.query_selector("p.car-num")
    if car_num_tag:
        initial_car_num_text = car_num_tag.text_content().strip()

    # Classify the initial state from the selected tab's text.
    initial_state = "unknown"
    if initial_selected_text == "在售":
        initial_state = "onsale"
        print(f"📊 智能识别：初始状态为【在售】（数量标签：{initial_car_num_text}）")
    elif initial_selected_text == "停售":
        initial_state = "discontinued"
        print(f"📊 智能识别：初始状态为【停售】（数量标签：{initial_car_num_text}）")
    else:
        initial_state = "onsale"  # fall back to treating unknown states as on-sale
        print(f"⚠️  初始状态未明确（文本：{initial_selected_text}），默认按【在售】处理")

    # -------------------------- Step 2: load and extract links for the initial state --------------------------
    # Keep clicking "load more" until the button disappears or the target count is reached.
    def load_all_models(target_count=0):
        """Load all models: click "load more" in a loop until no button remains or target_count is reached."""
        loaded_links = []
        max_attempts = 110  # up to 110 click attempts, sized for very large model lists
        attempts = 0
        last_loaded_count = 0

        while attempts < max_attempts:
            # Extract the links currently rendered on the page.
            current_links = extract_links_from_model_list()
            # De-duplicate against what we have already collected.
            new_links = [link for link in current_links if link not in loaded_links]
            loaded_links.extend(new_links)
            current_count = len(loaded_links)

            # Stop early once the expected total (if known) has been reached.
            if target_count > 0 and current_count >= target_count:
                print(f"✅ 已加载{current_count}款车型，达到目标数量{target_count}款，停止加载")
                break

            # Stop when a round yields no new links (only after a few attempts,
            # so the very first rounds are never cut short).
            if current_count == last_loaded_count and attempts > 3:
                print(f"⚠️  连续3次未加载到新车型，停止加载（当前已加载{current_count}款）")
                break

            # Locate and click the "load more" button.
            load_more = find_load_more_button()
            if not load_more:
                print(f"✅ 未检测到加载更多按钮，停止加载（当前已加载{current_count}款）")
                break

            try:
                print(f"📜 点击加载更多按钮（第{attempts + 1}次），当前已加载{current_count}款")
                # Scroll the button into view so the click lands.
                load_more.scroll_into_view_if_needed(timeout=90000)
                random_delay(1, 2)  # extra wait so the button is clickable
                # Simulate a human-like click (small delay, anti-bot)
                load_more.click(click_count=1, delay=random.randint(3, 5))
                # Capture the li count BEFORE waiting so the predicate has a real baseline.
                prev_li_count = len(filter_tag.query_selector_all("ul.model-list li"))
                # Wait for new models to render.  Bug fix: the old predicate read
                # window.prevLiCount (never set, so it compared against undefined
                # and always failed) and used timeout=5 — Playwright timeouts are
                # in milliseconds, i.e. a 5 ms wait.  Pass the baseline as the
                # function argument and wait up to 15 s (15000 ms).
                main_page.wait_for_function(
                    "prev => document.querySelectorAll('ul.model-list li').length > prev",
                    arg=prev_li_count,
                    timeout=15000
                )
                random_delay(2, 3)  # extra wait so newly loaded models finish rendering
                last_loaded_count = current_count
                attempts += 1
            except Exception as e:
                attempts += 1
                # Log instead of silently swallowing (the old code bound e but never used it).
                print(f"⚠️  加载更多点击/等待异常（第{attempts}次）: {str(e)[:80]}")
                random_delay(2, 3)  # back off after a failure to avoid hammering the page

        print(f"📊 初始状态加载完成：共提取{len(loaded_links)}款车型链接")
        return loaded_links

    # Load every model of the initial state (for a discontinued start,
    # use the extracted total as the target count).
    target_initial_count = 0
    if initial_state == "discontinued":
        target_initial_count = get_discontinued_car_count()
    initial_links = load_all_models(target_initial_count)
    all_links.extend(initial_links)

    # -------------------------- Step 3: if we started on-sale, switch to discontinued --------------------------
    if initial_state == "onsale":
        # Locate the filter container and the "on sale" tab.
        filters_container = filter_tag.query_selector("div.filters")
        if not filters_container:
            print("❌ 未找到筛选标签容器div.filters，跳过在售/停售切换")
            return all_links
        print("✅ 成功定位筛选容器div.filters（基于类名）")

        # Locate the 在售 ("on sale") tab by class name + text only.
        sale_status_tag = None
        filter_p_tags = filters_container.query_selector_all("p.filter")
        for tag in filter_p_tags:
            tag_text = tag.text_content().strip()
            if "在售" in tag_text:
                sale_status_tag = tag
                print(f"✅ 定位到在售状态标签：文本={tag_text}，是否可见={tag.is_visible()}")
                break

        if not sale_status_tag:
            print("❌ 未找到包含“在售”文本的p.filter标签，无法触发弹窗")
            return all_links

        # Trigger the right-side popup.
        popup_selector = "div.van-popup.van-popup--right.pop-up"
        try:
            print(f"📜 点击在售标签（文本：{sale_status_tag.text_content().strip()}），触发右侧弹窗")
            sale_status_tag.scroll_into_view_if_needed(timeout=8000)
            random_delay(2, 3)
            sale_status_tag.click(delay=random.randint(200, 400))
            random_delay(2, 3)

            # Wait up to 15 s (1 s polls) for the popup to appear and be visible.
            popup_loaded = False
            for _ in range(15):
                popup = main_page.query_selector(popup_selector)
                if popup and popup.is_visible():
                    popup_loaded = True
                    print("✅ 右侧在售/停售弹窗已弹出并可见")
                    break
                time.sleep(1)
            if not popup_loaded:
                raise Exception("弹窗超时15秒未弹出或不可见")
        except Exception as e:
            print(f"❌ 触发右侧弹窗失败: {str(e)[:100]}，跳过在售/停售切换")
            return all_links

        # Click the 停售 ("discontinued") option and verify the switch.
        try:
            popup = main_page.query_selector(popup_selector)
            if not popup:
                raise Exception("弹窗弹出后丢失，无法定位")

            popup_ul = popup.query_selector("ul")
            if not popup_ul:
                raise Exception("弹窗内未找到ul列表（停售选项容器）")

            # Locate the "停售" option by class name + text only.
            target_li = None
            popup_li_tags = popup_ul.query_selector_all("li.half-line")
            for li in popup_li_tags:
                li_text = li.text_content().strip()
                if li_text == "停售":
                    target_li = li
                    print(f"✅ 在弹窗中定位到停售选项：文本={li_text}，是否可见={li.is_visible()}")
                    break

            if not target_li:
                print("⚠️  弹窗中未找到“停售”选项，无需切换")
                main_page.keyboard.press("Escape")
                random_delay(2, 3)
                return all_links

            # Click the discontinued option.
            print("📜 在弹窗中点击“停售”选项，切换车型状态")
            target_li.scroll_into_view_if_needed(timeout=8000)
            random_delay(3, 5)
            target_li.click(delay=random.randint(150, 300))
            random_delay(5, 8)  # long render wait so the page fully updates

            # Triple verification of the switch.
            # Check 1: is the popup closed?
            popup_closed = False
            popup_after_click = main_page.query_selector(popup_selector)
            if not popup_after_click:
                popup_closed = True
            else:
                popup_style = popup_after_click.get_attribute("style") or ""
                if "display: none" in popup_style or "visibility: hidden" in popup_style:
                    popup_closed = True

            # Check 2: has the selected filter text changed to 停售?
            current_selected_text = get_current_selected_filter_text()
            is_text_switched = current_selected_text == "停售"

            # Check 3: can a discontinued-model count be extracted?
            discontinued_count = get_discontinued_car_count()
            has_discontinued_count = discontinued_count > 0

            print(f"📋 切换三重验证结果：")
            print(f"   - 弹窗是否关闭: {popup_closed}")
            print(f"   - 筛选标签文本是否变为停售: {is_text_switched}（当前：{current_selected_text}）")
            print(f"   - 是否提取到停售车型数量: {has_discontinued_count}（数量：{discontinued_count}款）")

            if not (popup_closed and is_text_switched):
                raise Exception(
                    f"切换验证失败：弹窗关闭={popup_closed}，文本切换={is_text_switched}"
                )
            print(f"✅ 停售状态切换成功！开始加载所有停售车型（共{discontinued_count}款）")

        except Exception as e:
            print(f"❌ 点击停售选项或切换验证失败: {str(e)[:100]}，提取当前链接后跳过")
            # Best effort: still collect whatever links are currently rendered.
            switched_links = load_all_models()
            new_switched_links = [link for link in switched_links if link not in all_links]
            all_links.extend(new_switched_links)
            print(f"⚠️  切换失败后，新增提取{len(new_switched_links)}个停售车型链接")
            return all_links

        # -------------------------- Step 4: load all discontinued models --------------------------
        print("📜 开始加载所有停售车型")
        discontinued_count = get_discontinued_car_count()
        switched_links = load_all_models(discontinued_count)
        # De-duplicate before appending the discontinued links.
        new_switched_links = [link for link in switched_links if link not in all_links]
        all_links.extend(new_switched_links)
        print(f"📊 停售状态加载完成：新增提取{len(new_switched_links)}个停售车型链接")

    # -------------------------- Step 5: final sanity check of the totals --------------------------
    final_total = len(all_links)
    expected_total = 0
    if initial_state == "discontinued":
        expected_total = get_discontinued_car_count()
    elif initial_state == "onsale":
        # on-sale + discontinued total (on-sale from the initial load,
        # discontinued from the extracted count)
        onsale_count = len(initial_links)
        discontinued_count = get_discontinued_car_count()
        expected_total = onsale_count + discontinued_count

    print(f"\n📊 产品筛选区处理完成：")
    print(f"   - 预期总车型数量: {expected_total if expected_total > 0 else '未知'}款")
    print(f"   - 实际提取总链接数: {final_total}款")
    if expected_total > 0 and final_total < expected_total:
        print(f"⚠️  实际提取数量少于预期（差{expected_total - final_total}款），建议检查网络或增加加载尝试次数")
    else:
        print(f"✅ 实际提取数量符合预期")

    return all_links


# Process a single detail-page link (the filter-section locating logic below was
# optimized in step with process_filter_section)
def process_detail_page(context, main_page, url):
    """Scrape one vehicle detail page.

    Collects the title box (with the ask-price button stripped), the filter
    section's model links (saved immediately to disk), the parameter
    configuration page, and the image list pages.

    Args:
        context: Playwright BrowserContext, used by the param/image helpers to
            open auxiliary pages.
        main_page: Playwright Page reused for navigating to `url`.
        url: detail-page URL (expected to end in "index.html" so the
            param/Pic URLs can be derived from it).

    Returns:
        dict with keys "title", "title_box", "param_content", "image_content",
        "url" on success; None on timeout, failed captcha handling, or error.
    """
    result = {
        "title": "",
        "title_box": "",
        "param_content": "",
        "image_content": [],
        "url": url
    }

    try:
        print(f"\n" + "=" * 50)
        print(f"📋 开始处理详情页: {url[:50]}...")
        print("=" * 50)

        # Navigate and clear any captcha/verification interstitial.
        main_page.goto(url, timeout=60000)
        random_delay(3, 5)  # longer initial wait so the page loads completely
        if not handle_captcha(main_page):
            print(f"❌ 验证失败，跳过链接: {url[:50]}...")
            return None

        # -------------------------- Title-box extraction (per actual HTML structure; remove the ask-price tag) --------------------------
        print("\n📜 开始提取标题框（按实际HTML结构，保留.info容器，删除a.ask-price）")
        # Prefer the outermost .info container (matches the observed HTML structure).
        info_container = main_page.query_selector("div.info")
        if info_container:
            print("✅ 成功定位标题框外层容器div.info")

            # Full HTML of the .info container, including all child elements.
            info_html = get_element_full_html(main_page, info_container, "div.info")

            # Key step: strip the a.ask-price tag (the "ask for bottom price" button).
            cleaned_info_html = re.sub(
                r'<a\s+[^>]*class="[^"]*ask-price[^"]*"[^>]*>.*?</a>',  # match the ask-price anchor
                '',
                info_html,
                flags=re.DOTALL  # let . match newlines inside the tag
            )

            # Title text comes from .info > .series > p.
            series_tag = info_container.query_selector("div.series")
            if series_tag:
                title_p = series_tag.query_selector("p")
                if title_p:
                    result["title"] = title_p.text_content().strip()
                    print(f"✅ 标题提取成功: {result['title']}")
                else:
                    print("⚠️  .series容器内未找到p标签，从.info直接提取标题")
                    result["title"] = series_tag.text_content().strip()  # NOTE(review): original comment claimed the length is capped, but no slice is applied here (the fallback below uses [:50]) — confirm intent

            # Store the cleaned title-box HTML (the required structure).
            result["title_box"] = cleaned_info_html
            print(f"✅ 标题框处理完成，已删除询底价标签，长度: {len(result['title_box'])}字符")
            print(f"📜 处理后的标题框结构预览: {result['title_box'][:200]}...")
        else:
            # Fallback: if div.info is missing, use the older multi-level locating.
            print("⚠️  未找到div.info容器，尝试降级定位标题框")
            series_tag = main_page.query_selector("div.series")
            if not series_tag:
                series_tag = main_page.query_selector("div:has(p:has-text('车型')), div:has(p:has-text('系列'))")
            if series_tag:
                title_p = series_tag.query_selector("p:first-of-type, h1, h2")
                if title_p:
                    result["title"] = title_p.text_content().strip()
                else:
                    result["title"] = series_tag.text_content().strip()[:50]
                result["title_box"] = get_element_full_html(main_page, series_tag, "div.series")
                print(f"✅ 降级提取标题框成功: {result['title']}")
            else:
                print("❌ 未找到任何标题相关容器")

        # -------------------------- Locate the product filter section (no data-v dependency) --------------------------
        print("\n📜 开始处理产品筛选区（智能识别在售/停售标签）")
        # Locate the filter container by the class name "product-filters" only,
        # without relying on any data-v attribute (core change).
        filter_tag = main_page.query_selector("div.product-filters")
        if filter_tag:
            print("✅ 成功定位产品筛选区容器div.product-filters（仅基于类名）")
            all_valid_links = process_filter_section(main_page, url, filter_tag)
            save_model_urls_immediately(all_valid_links)
            print(f"📜 筛选区共提取{len(all_valid_links)}个有效车型链接")
        else:
            # Fallback: try the selector with the partial data-v attribute
            # (compatibility with the older page structure).
            filter_tag_fallback = main_page.query_selector("div.product-filters[data-v-4e394ce1]")
            if filter_tag_fallback:
                print("✅ 降级定位到产品筛选区（基于类名+部分固定data-v属性）")
                all_valid_links = process_filter_section(main_page, url, filter_tag_fallback)
                save_model_urls_immediately(all_valid_links)
                print(f"📜 筛选区共提取{len(all_valid_links)}个有效车型链接")
            else:
                print("❌ 未找到产品筛选区标签（div.product-filters），跳过筛选区处理")

        # Extract the parameter configuration (original logic kept).
        print("\n📜 开始处理参数配置")
        # Derive the param-page URL by swapping the trailing "index.html".
        param_url = re.sub(r"index\.html$", "param.html", url)
        if param_url != url:
            print(f"📜 参数配置链接: {param_url[:50]}...")
            result["param_content"] = extract_param_content(context, param_url, url)
        else:
            # No substitution happened — url did not end in "index.html".
            print("❌ 无法构造参数配置链接")

        # Extract the image lists (uses the fixed extract_image_content).
        print("\n📜 开始处理图片列表")
        pic_url = re.sub(r"index\.html$", "Pic.html", url)
        if pic_url != url:
            print(f"📜 图片列表链接: {pic_url[:50]}...")
            result["image_content"] = extract_image_content(context, pic_url, url)
        else:
            print("❌ 无法构造图片列表链接")

        print(f"\n✅ 详情页{url[:50]}...处理完成")
        return result

    except PlaywrightTimeoutError:
        print(f"❌ 页面超时: {url[:50]}...")
    except Exception as e:
        print(f"❌ 处理详情页出错 {url[:50]}...: {str(e)[:150]}")
    return None


# Format a scraped result dict into the text record appended to kache_data.txt
def format_result(result):
    """Render one detail-page result as a text record.

    Args:
        result: dict with keys "title", "title_box", "param_content",
            "image_content" (list of dicts each having "category" and "html"),
            and "url".

    Returns:
        str: the formatted record, terminated by the "</>" separator line.
    """
    # Build the image section with a single join instead of quadratic
    # string += concatenation in a loop.
    image_str = "".join(
        f"[图片分类: {img_item['category']}]\n"
        f"[图片标签内容]: {img_item['html']}\n"
        for img_item in result["image_content"]
    )

    return (
        f"标题: {result['title']}\n"
        f"标题框内容: {result['title_box']}\n\n"
        f"参数配置: {result['param_content']}\n\n"
        f"图片列表: {image_str}\n"
        f"地址: [{result['url']}]\n"
        f"</>\n\n"
    )


def main():
    """Entry point: iterate over unprocessed detail-page URLs and scrape each.

    Reads URLs from kachezhijia_xiangqing_url.txt, skips those already listed
    in processed_details.txt (resume support), scrapes each remaining page
    with a stealth-configured Chromium, appends results to kache_data.txt,
    and marks every URL as processed regardless of scrape success.
    """
    if not os.path.exists("kachezhijia_xiangqing_url.txt"):
        print("❌ 未找到kachezhijia_xiangqing_url.txt文件")
        return

    with open("kachezhijia_xiangqing_url.txt", "r", encoding="utf-8") as f:
        detail_urls = [line.strip() for line in f.readlines() if line.strip()]

    # Resume support: drop URLs that were already processed in a previous run.
    processed_urls = get_processed_detail_urls()
    remaining_urls = [url for url in detail_urls if url not in processed_urls]
    total_remaining = len(remaining_urls)

    print(f"📊 检测到{len(detail_urls)}个详情页链接，已处理{len(processed_urls)}个，剩余{total_remaining}个待处理")

    if total_remaining == 0:
        print("✅ 所有详情页链接已处理完毕")
        return

    with sync_playwright() as p:
        user_agent = generate_random_user_agent()
        print(f"📜 初始用户代理: {user_agent}")

        # Headed Chromium with automation fingerprints disabled.
        browser = p.chromium.launch(
            headless=False,
            args=[
                "--disable-blink-features=AutomationControlled",
                "--disable-dev-shm-usage",
                "--no-sandbox",
                "--disable-extensions",
                f"--user-agent={user_agent}"
            ],
            slow_mo=random.randint(50, 150)
        )

        # Randomized viewport + zh-CN locale to look like a regular visitor.
        context = browser.new_context(
            user_agent=user_agent,
            viewport=ViewportSize(
                width=random.randint(1280, 1920),
                height=random.randint(720, 1080)
            ),
            locale="zh-CN",
            timezone_id="Asia/Shanghai",
            java_script_enabled=True
        )

        main_page = context.new_page()
        # Mask common webdriver-detection signals before any page script runs.
        main_page.add_init_script("""
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
            Object.defineProperty(navigator, 'languages', { get: () => ['zh-CN', 'zh'] });
            window.chrome = { app: { isInstalled: false }, runtime: {} };
            window.navigator.permissions.query = (parameters) => 
                Promise.resolve({ state: parameters.name === 'notifications' ? 'denied' : 'granted' });
        """)

        for i, url in enumerate(remaining_urls, 1):
            print(f"\n" + "=" * 50)
            print(f"📊 正在处理第{i}/{total_remaining}个详情页")
            print("=" * 50)

            # Periodically rotate the user agent to reduce anti-bot risk.
            if i % random.randint(3, 5) == 0:
                user_agent = generate_random_user_agent()
                main_page.set_extra_http_headers({"User-Agent": user_agent})
                print(f"📜 更换用户代理: {user_agent}")

            # Scrape the detail page (uses the fixed processing logic).
            result = process_detail_page(context, main_page, url)

            # Append the formatted result, if any.
            if result:
                formatted = format_result(result)
                with open("kache_data.txt", "a", encoding="utf-8") as f:
                    f.write(formatted)
                print(f"✅ 结果已写入kache_data.txt")

            # Mark as processed even on failure, so a rerun skips this URL.
            save_processed_detail_url(url)
            print(f"✅ 标记链接为已处理: {url[:50]}...")

            # Pacing between pages.  Bug fix: the "long rest" branch used the
            # SHORTER range (2–6 s) while the ordinary rest used 5–12 s,
            # inverting the stated anti-crawling intent — the ranges are
            # now swapped so the periodic rest is actually the longer one.
            if i % random.randint(2, 4) == 0:
                rest_time = random.uniform(5, 12)
                print(f"📜 长时间休息 {rest_time:.1f} 秒...")
                time.sleep(rest_time)
            else:
                delay_time = random.uniform(2, 6)
                print(f"📜 休息 {delay_time:.1f} 秒...")
                # Sleep directly — random_delay(d, d) was a degenerate uniform
                # over equal bounds.
                time.sleep(delay_time)

        # Release browser resources.
        main_page.close()
        context.close()
        browser.close()

    print("\n" + "=" * 50)
    print("✅ 所有详情页链接处理完毕")
    print(f"📊 结果文件: {os.path.abspath('kache_data.txt')}")
    print(f"📊 图片目录: {os.path.abspath('images')}")
    print("=" * 50)


# Script entry point: run the crawler only when executed directly, not on import.
if __name__ == "__main__":
    main()