import asyncio
import aiohttp
import json
import os
import time
import random
import signal
import sys
from contextlib import asynccontextmanager
from typing import Optional, List, Dict, Set, Tuple

import requests  # only used for the one-off headword fetch; keeps the original logic
# from bs4 import BeautifulSoup  # <<<--- removed: this library is no longer needed
from tqdm import tqdm

# --- Configuration ---
API_KEY = os.getenv("LOGEION_API_KEY", "AIzaSyCT5aVzk3Yx-m8FH8rmTpEgfVyVA3pYbqg")
BASE_URL = "https://anastrophe.uchicago.edu/logeion-api"

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "application/json, text/plain, */*",
    "Referer": "https://logeion.uchicago.edu/",
    "Origin": "https://logeion.uchicago.edu",
}

# --- File paths (updated) ---
WORD_LIST_FILE = "greek_headwords.json"            # headword list (a JSON array)
OUTPUT_JSONL = "logeion_dictionary.jsonl"          # streamed output (one entry per line, all dictionaries included)
OUTPUT_JSON_SNAPSHOT = "logeion_dictionary.json"     # optional: periodically merged snapshot
ERRORS_JSONL = "logeion_errors.jsonl"              # failure / error log

# --- Concurrency and throttling ---
CONCURRENCY = 1000            # number of concurrent workers
REQUEST_TIMEOUT = 15          # request timeout (seconds)
MAX_RETRIES = 5               # maximum retry attempts
BACKOFF_BASE = 1.8            # exponential backoff base
JITTER_RANGE = (0.05, 0.25)   # jitter to avoid synchronized bursts
SNAPSHOT_EVERY = 5000         # merge a JSON snapshot every this many new words (0 disables)
WRITER_FLUSH_EVERY = 50       # flush the output file every this many records
QUEUE_MAXSIZE = 1000          # job queue buffer size

# --- Misc (the old ONLY_LSJ option has been removed) ---

def ensure_dir_for(file_path: str):
    """确保文件所在的目录存在。"""
    d = os.path.dirname(os.path.abspath(file_path))
    if d and not os.path.exists(d):
        os.makedirs(d, exist_ok=True)


def load_headwords() -> List[str]:
    """
    加载（或获取）希腊语 headwords 列表。
    如果本地已存在 WORD_LIST_FILE，则直接加载；否则通过 API 获取。
    """
    if os.path.exists(WORD_LIST_FILE):
        print(f"发现已存在的单词列表文件 '{WORD_LIST_FILE}'，直接加载...")
        with open(WORD_LIST_FILE, 'r', encoding='utf-8') as f:
            return json.load(f)

    print("未找到本地单词列表，开始从 API 获取...")
    all_words = set()
    current_word = 'α'
    last_save = time.time()

    with tqdm(desc="Fetching word list") as pbar:
        while True:
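            # Page through the /wheel endpoint: each request asks for the words after
            # `current_word`, and we keep advancing until no new words are returned.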
            params = {'dir': 'down', 'key': API_KEY, 'type': 'inverse', 'w': current_word}
            try:
                resp = requests.get(f"{BASE_URL}/wheel", params=params, headers=HEADERS, timeout=REQUEST_TIMEOUT)
                resp.raise_for_status()
                results = resp.json().get("results", [])

                if not results or results[-1] == current_word:
                    print("\n已到达词典末尾。")
                    break

                all_words.update(results)
                current_word = results[-1]
                pbar.set_description(f"Fetching word list (last word: {current_word})")
                pbar.update(1)

                if time.time() - last_save > 10:
                    sorted_words = sorted(all_words)
                    ensure_dir_for(WORD_LIST_FILE)
                    with open(WORD_LIST_FILE, 'w', encoding='utf-8') as f:
                        json.dump(sorted_words, f, ensure_ascii=False, indent=2)
                    last_save = time.time()

                time.sleep(random.uniform(0.15, 0.45))

            except requests.exceptions.RequestException as e:
                print(f"\n获取单词列表时发生网络错误: {e}, 5秒后重试...")
                time.sleep(5)
            except json.JSONDecodeError:
                print(f"\n无法解析来自 {current_word} 的响应，5秒后重试...")
                time.sleep(5)

    sorted_words = sorted(all_words)
    print(f"Fetched {len(sorted_words)} unique words in total.")
    print(f"Saving word list to '{WORD_LIST_FILE}'...")
    ensure_dir_for(WORD_LIST_FILE)
    with open(WORD_LIST_FILE, 'w', encoding='utf-8') as f:
        json.dump(sorted_words, f, ensure_ascii=False, indent=2)
    return sorted_words


def load_done_words() -> Set[str]:
    """读取已抓到的词（兼容新旧格式的断点文件）。"""
    done = set()
    if os.path.exists(OUTPUT_JSONL):
        with open(OUTPUT_JSONL, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip(): continue
                try:
                    obj = json.loads(line)
                    if "word" in obj: done.add(obj["word"])
                except Exception: continue
    elif os.path.exists(OUTPUT_JSON_SNAPSHOT):
        try:
            with open(OUTPUT_JSON_SNAPSHOT, 'r', encoding='utf-8') as f:
                m = json.load(f)
                if isinstance(m, dict): done.update(m.keys())
        except Exception: pass
    return done


def materialize_snapshot_from_jsonl(jsonl_path: str, out_json_path: str):
    """从 JSONL 增量文件合并生成一个完整的 JSON 快照。"""
    mapping = {}
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip(): continue
            try:
                obj = json.loads(line)
                w = obj.get("word")
                d = obj.get("definitions", {})
                if w and d:
                    mapping[w] = d
            except Exception: continue

    ensure_dir_for(out_json_path)
    with open(out_json_path, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=2)


# ==============================================================================
#  Core changes
# ==============================================================================
def extract_all_definitions_from_detail(detail_json: Dict) -> Dict[str, str]:
    """
    (***核心升级***)
    从 /detail 返回的 JSON 中提取所有词典的原始 HTML 释义。
    返回一个字典，键是词典名，值是释义的完整 HTML 字符串。
    """
    all_definitions = {}
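    # Expected response shape (inferred from the fields accessed below):
    #   {"detail": {"dicos": [{"dname": "<dictionary name>", "es": ["<html>", ...]}, ...]}}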
    dicos = detail_json.get("detail", {}).get("dicos", [])

    for dico in dicos:
        dname = dico.get("dname", "").strip()
        if not dname:
            continue

        html_list = dico.get("es", [])
        # Join all HTML fragments in the list into one complete HTML string
        html_content = "".join(html_list) if isinstance(html_list, list) else str(html_list)

        if html_content:
            # Instead of extracting plain text with BeautifulSoup, store the raw HTML directly
            # so that all formatting (line breaks, bold text, lists, etc.) is preserved.
            all_definitions[dname] = html_content

    return all_definitions
# ==============================================================================
#  End of changes
# ==============================================================================


async def async_sleep_jitter():
    """异步随机抖动延时。"""
    await asyncio.sleep(random.uniform(*JITTER_RANGE))


async def fetch_json_with_retries(session: aiohttp.ClientSession, url: str, params: Dict, max_retries=MAX_RETRIES) -> Dict:
    """带重试、指数退避和抖动的通用异步 GET JSON 请求。"""
    attempt = 0
    while True:
        try:
            timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
            async with session.get(url, params=params, headers=HEADERS, timeout=timeout) as resp:
                if resp.status == 429: # Too Many Requests
                    retry_after = resp.headers.get("Retry-After", (BACKOFF_BASE ** attempt))
                    await asyncio.sleep(float(retry_after) + random.random())
                elif 500 <= resp.status < 600: # Server errors
                    await asyncio.sleep((BACKOFF_BASE ** attempt) + random.random())

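                # These statuses are treated as transient and retried until max_retries is exceeded.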
                if resp.status in {429, 500, 502, 503, 504}:
                    attempt += 1
                    if attempt > max_retries:
                        resp.raise_for_status()
                    continue

                resp.raise_for_status()
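                # content_type=None skips aiohttp's MIME-type check, in case the API does not label the body as application/json.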
                return await resp.json(content_type=None)
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            attempt += 1
            if attempt > max_retries: raise
            await asyncio.sleep((BACKOFF_BASE ** attempt) + random.random())


async def get_word_detail_async(session: aiohttp.ClientSession, word: str) -> Tuple[str, Dict[str, str]]:
    """
    获取单个单词的所有词典释义，返回 (word, definitions_dict)。
    definitions_dict 为空字典时表示未找到或失败。
    """
    params = {'key': API_KEY, 'type': 'normal', 'w': word}
    url = f"{BASE_URL}/detail"
    await async_sleep_jitter()
    data = await fetch_json_with_retries(session, url, params)
    # Use the updated extraction function; errors propagate to the worker, which records them.
    definitions = extract_all_definitions_from_detail(data)
    return word, definitions


async def writer_task(result_q: "asyncio.Queue", pbar: tqdm):
    """
    单线程写入器：从队列中读取结果，写入 JSONL，并更新进度条。
    """
    ensure_dir_for(OUTPUT_JSONL)
    ensure_dir_for(ERRORS_JSONL)

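    # buffering=1 gives line-buffered text files, so every record reaches the OS promptly;
    # the explicit flush()/WRITER_FLUSH_EVERY calls below are an extra safety net.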
    f_out = open(OUTPUT_JSONL, 'a', encoding='utf-8', buffering=1)
    f_err = open(ERRORS_JSONL, 'a', encoding='utf-8', buffering=1)

    count_since_flush = 0
    new_since_snapshot = 0

    try:
        while True:
            item = await result_q.get()
            if item is None: break

            word, definitions, error_msg = item
            if error_msg:
                f_err.write(json.dumps({"word": word, "error": error_msg}, ensure_ascii=False) + "\n")
                f_err.flush()
            elif definitions:  # only write entries that actually have content
                f_out.write(json.dumps({"word": word, "definitions": definitions}, ensure_ascii=False) + "\n")
                count_since_flush += 1
                new_since_snapshot += 1
                if count_since_flush >= WRITER_FLUSH_EVERY:
                    f_out.flush()
                    count_since_flush = 0

            pbar.update(1)
            result_q.task_done()

            if SNAPSHOT_EVERY > 0 and new_since_snapshot >= SNAPSHOT_EVERY:
                f_out.flush()  # flush to disk before building the snapshot
                try:
                    materialize_snapshot_from_jsonl(OUTPUT_JSONL, OUTPUT_JSON_SNAPSHOT)
                    pbar.set_postfix_str("Snapshot saved!")
                except Exception as e:
                    f_err.write(json.dumps({"snapshot_error": str(e)}, ensure_ascii=False) + "\n")
                new_since_snapshot = 0
    finally:
        if SNAPSHOT_EVERY > 0:
            f_out.flush()
            try: materialize_snapshot_from_jsonl(OUTPUT_JSONL, OUTPUT_JSON_SNAPSHOT)
            except Exception: pass
        f_out.close()
        f_err.close()
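        # Mark the final None sentinel as done so result_q.join() in run_scrape can return.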
        result_q.task_done()


async def worker_task(name: int, session: aiohttp.ClientSession, job_q: "asyncio.Queue", result_q: "asyncio.Queue"):
    """并发 worker：从 job 队列取词，抓取 detail，放入 result 队列。"""
    while True:
        word = await job_q.get()
        if word is None: break

        error_msg = None
        definitions = {}
        try:
            _, definitions = await get_word_detail_async(session, word)
        except Exception as e:
            error_msg = f"Request/Parse failed for '{word}': {repr(e)}"

        await result_q.put((word, definitions, error_msg))
        job_q.task_done()
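    # Account for the None shutdown sentinel pulled off the queue before breaking out of the loop.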
    job_q.task_done()


@asynccontextmanager
async def aiohttp_session():
    """统一管理 aiohttp ClientSession。"""
    connector = aiohttp.TCPConnector(limit=0, force_close=False, ttl_dns_cache=300)
    timeout = aiohttp.ClientTimeout(total=REQUEST_TIMEOUT)
    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        yield session


async def run_scrape(words_to_scrape: List[str]):
    """主并发抓取流程。"""
    if not words_to_scrape:
        print("所有单词均已爬取完毕！")
        return

    print(f"需要爬取 {len(words_to_scrape)} 个新单词的释义...")
    job_q = asyncio.Queue(maxsize=QUEUE_MAXSIZE)
    result_q = asyncio.Queue()

    loop = asyncio.get_running_loop()
    stop_event = asyncio.Event()

    def handle_sigint():
        print("\n收到中断信号，准备优雅退出（已抓进度不会丢失）...")
        stop_event.set()

    for sig in (signal.SIGINT, signal.SIGTERM):
        try: loop.add_signal_handler(sig, handle_sigint)
        except NotImplementedError: pass

    pbar = tqdm(total=len(words_to_scrape), desc="Scraping definitions (concurrent)")
    writer = asyncio.create_task(writer_task(result_q, pbar))

    async with aiohttp_session() as session:
        workers = [
            asyncio.create_task(worker_task(i + 1, session, job_q, result_q))
            for i in range(CONCURRENCY)
        ]

        # Producer: enqueue every word that still needs to be scraped
        for w in words_to_scrape:
            if stop_event.is_set(): break
            await job_q.put(w)

        # Wait until every queued job has been processed
        await job_q.join()

        # Send shutdown sentinels to the workers and wait for them to exit
        for _ in workers: await job_q.put(None)
        await job_q.join()
        await asyncio.gather(*workers)

        # Shut down the writer and wait for it to finish flushing
        await result_q.put(None)
        await result_q.join()
        await writer

    pbar.close()


def main():
    headwords = load_headwords()
    done_words = load_done_words()
    if done_words:
        print(f"已加载 {len(done_words)} 条已爬取的数据（断点续传）。")

    words_to_scrape = [w for w in headwords if w not in done_words]

    try:
        asyncio.run(run_scrape(words_to_scrape))
    except (KeyboardInterrupt, asyncio.CancelledError):
        print("\n检测到手动中断。已保存当前进度，下次可继续。")
    finally:
        print(f"\n完成或中断。结果已写入：\n- 数据文件: {OUTPUT_JSONL}")
        if SNAPSHOT_EVERY:
            print(f"- 快照文件: {OUTPUT_JSON_SNAPSHOT}")
        print(f"- 错误日志: {ERRORS_JSONL}")


if __name__ == "__main__":
    main()