# -----------------------------------------------------------
# Python 爬虫代码，适配 Google Colab 环境
# 已添加解决方案来处理 CERTIFICATE_VERIFY_FAILED 错误
# -----------------------------------------------------------

# 首先，确保必要的库已安装
!pip install requests beautifulsoup4

import requests
import json
import time
from bs4 import BeautifulSoup
# 导入 urllib3 以便禁用 InsecureRequestWarning 警告
import urllib3

# ----------------- Key change -----------------
# With verify=False, requests emits an InsecureRequestWarning on every
# request; silence it so the scraper's console output stays readable.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# -----------------------------------------------

# --- Configuration ---
# Catalog page fetched once at startup to harvest the CSRF token.
CATALOG_URL = "https://koshashri-dc.ac.in/search/catalog/1/10?searchTerm=अ"
# Search API endpoint template; {page} is 1-based, {size} is results per page.
API_URL = "https://koshashri-dc.ac.in/search/catalogSearch/{page}/{size}"
PAGE_SIZE = 500  # results requested per API call; a short page ends pagination
SAVE_PATH = '/content/drive/MyDrive/vocable_list.json'  # requires Google Drive mounted

def get_csrf_token(session, url):
    """Fetch the CSRF token embedded in the page's ``<meta name="_csrf">`` tag.

    Returns the token string on success, or None when the request fails
    or the meta tag / its ``content`` attribute is missing.
    """
    print("Fetching CSRF token...")
    try:
        # ----------------- Key change -----------------
        # verify=False skips SSL certificate validation to work around
        # the CERTIFICATE_VERIFY_FAILED error on this host.
        page = session.get(url, timeout=20, verify=False)
        # -----------------------------------------------
        page.raise_for_status()
        meta_tag = BeautifulSoup(page.text, 'html.parser').find('meta', {'name': '_csrf'})
        # A missing tag raises TypeError; a missing attribute raises KeyError.
        token = meta_tag['content']
    except (requests.RequestException, KeyError, TypeError) as e:
        print(f"Error getting CSRF token: {e}")
        return None
    print(f"Successfully got CSRF token: {token[:10]}...")
    return token

def get_sanskrit_alphabet():
    """Return the list of Devanagari letters/clusters to iterate over.

    Each entry is used as a search term against the catalog API.
    """
    # NOTE: an earlier revision of this list accidentally contained Cyrillic
    # look-alikes (м, ш, с); the entries below are the corrected Devanagari
    # characters (भ, श, स). The previous code also built an identical unused
    # copy of this list — that dead duplicate has been removed.
    return [
        'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ॠ', 'ऌ', 'ए', 'ऐ', 'ओ', 'औ', 'अं', 'अः',
        'क', 'क्ष', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'ज्ञ', 'झ', 'ञ', 'ट', 'ठ',
        'ड', 'ढ', 'ण', 'त', 'त्र', 'थ', 'द', 'ध', 'न', 'प', 'फ', 'ब', 'भ', 'म',
        'य', 'र', 'ल', 'ळ', 'व', 'श', 'ष', 'स', 'ह'
    ]

def main():
    """Scrape all vocables from the catalog, letter by letter, and save as JSON.

    Flow: obtain a CSRF token from the catalog page, then page through the
    search API for each Devanagari letter, collecting ``{"id", "vocable"}``
    records, and finally dump the collected list to SAVE_PATH.
    """
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    })

    csrf_token = get_csrf_token(session, CATALOG_URL)
    if not csrf_token:
        print("Failed to get CSRF token. Aborting.")
        return

    headers = {
        "X-CSRF-Token": csrf_token,
        "Content-Type": "application/json",
        "Accept": "application/json"
    }

    sanskrit_alphabet = get_sanskrit_alphabet()
    all_vocables = []
    # Per-letter searches may return overlapping records; track seen ids so
    # the output (and the "unique vocables" summary below) is actually unique.
    seen_ids = set()

    print("\nStarting the scraping process...")

    for letter in sanskrit_alphabet:
        print(f"\n--- Searching for letter: {letter} ---")
        page = 1
        letter_count = 0
        while True:
            url = API_URL.format(page=page, size=PAGE_SIZE)
            payload = {
                "searchElement": letter,
                "searchComplexity": "BASIC",
                "searchWithIn": "vocable",
                "outputType": "devanagari_unicode"
            }

            print(f"Fetching page {page} for letter '{letter}'...")

            try:
                # ----------------- Key change -----------------
                # verify=False here too, matching get_csrf_token(), to work
                # around the CERTIFICATE_VERIFY_FAILED error on this host.
                response = session.post(url, headers=headers, json=payload, timeout=30, verify=False)
                # -----------------------------------------------
                response.raise_for_status()
                data = response.json()
            except (requests.RequestException, json.JSONDecodeError) as e:
                print(f"An error occurred: {e}. Skipping to the next letter.")
                break

            results = data.get("listOfResult", [])
            if not results:
                print("No more results found for this letter.")
                break

            for item in results:
                item_id = item.get("id")
                # Skip only records with a known, already-seen id; records
                # without an id are always kept rather than risk dropping them.
                if item_id is not None:
                    if item_id in seen_ids:
                        continue
                    seen_ids.add(item_id)
                # The vocable field may carry HTML markup; keep the text only.
                clean_vocable = BeautifulSoup(item.get("vocable", ""), 'html.parser').get_text()
                all_vocables.append({
                    "id": item_id,
                    "vocable": clean_vocable
                })

            num_results = len(results)
            letter_count += num_results
            print(f"Found {num_results} results. Total for '{letter}': {letter_count}")

            # A short page means the server has no further page for this letter.
            if num_results < PAGE_SIZE:
                print("Last page reached for this letter.")
                break

            page += 1
            time.sleep(1.5)  # be polite to the server between pages

    print(f"\nScraping finished. Total unique vocables found: {len(all_vocables)}")

    try:
        with open(SAVE_PATH, "w", encoding="utf-8") as f:
            json.dump(all_vocables, f, ensure_ascii=False, indent=2)
        print(f"\nSuccessfully saved all vocables to: {SAVE_PATH}")
    except Exception as e:
        print(f"\nAn error occurred while saving the file: {e}")

# Run the entry point when executed as a script / notebook cell.
if __name__ == "__main__":
    # Remember to mount Google Drive first — SAVE_PATH lives on Drive:
    # from google.colab import drive
    # drive.mount('/content/drive')
    main()