《拉鲁斯法汉双解词典》 (Larousse French-Chinese Bilingual Dictionary): text data (OCR)

The scraper itself was also written with AI assistance. A test run worked fine, but it is fairly slow, so I have not let it keep running for now. (A rough idea for speeding it up with a thread pool is sketched after the script.)
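
For reference, the input file holds one headword per line, and each line the script appends to the output has the shape "word: phonetic". The two sample lines below are purely illustrative:

    abaisser: [abese]
    abandon: [abɑ̃dɔ̃]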

import requests
from bs4 import BeautifulSoup
import time
import os

# --- Configuration ---
# Input file containing the word list
INPUT_FILENAME = r'C:\Users\xxx\Desktop\拉鲁斯 词头索引.txt'
# Output file where results are saved
OUTPUT_FILENAME = r'C:\Users\xxx\Desktop\resultats.txt'
# Base URL; each word is appended to the end of it
BASE_URL = 'https://www.larousse.fr/dictionnaires/francais-anglais/'

# Request headers that mimic a browser visit
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def get_phonetic(word):
    """
    为单个单词获取其所有音标,并正确拼接。
    """
    word = word.strip()
    if not word:
        return "单词为空"

    url = BASE_URL + word
    
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, 'html.parser')
            phonetique_spans = soup.find_all('span', class_='Phonetique')
            
            if phonetique_spans:
                # Extract every text fragment and join them into one string
                all_phonetics_text = [span.get_text(strip=True) for span in phonetique_spans]
                return ''.join(all_phonetics_text)
            else:
                return "phonetic not found"
        else:
            return f"page load failed (status code: {response.status_code})"
            
    except requests.exceptions.RequestException as e:
        return f"请求错误: {e}"

def load_processed_words():
    """
    读取输出文件,返回一个包含所有已处理单词的集合(set)。
    """
    processed = set()
    if not os.path.exists(OUTPUT_FILENAME):
        return processed
    
    with open(OUTPUT_FILENAME, 'r', encoding='utf-8') as f:
        for line in f:
            # Each output line is "word: phonetic"; split on the first ': ' only
            parts = line.strip().split(': ', 1)
            if parts[0]:
                processed.add(parts[0])
    return processed

def main():
    """
    主函数,负责读取文件、循环处理和追加保存结果。
    """
    if not os.path.exists(INPUT_FILENAME):
        print(f"错误:输入文件 '{INPUT_FILENAME}' 未找到。")
        return

    processed_words = load_processed_words()
    if processed_words:
        print(f"已在 '{OUTPUT_FILENAME}' 中找到 {len(processed_words)} 个已处理的单词,将跳过它们。")

    with open(INPUT_FILENAME, 'r', encoding='utf-8') as f_in:
        words_to_process = [line.strip() for line in f_in if line.strip()]

    words = [word for word in words_to_process if word not in processed_words]
    
    if not words:
        print("所有单词均已处理完毕,无需操作。")
        return

    print(f"共 {len(words_to_process)} 个单词,还需处理 {len(words)} 个。开始爬取...")
    
    with open(OUTPUT_FILENAME, 'a', encoding='utf-8') as f_out:
        for i, word in enumerate(words):
            print(f"[{i+1}/{len(words)}] 正在查询: {word} ...")
            phonetic_text = get_phonetic(word)
            result_line = f"{word}: {phonetic_text}"
            print(f"  -> 结果: {phonetic_text}")
            f_out.write(result_line + '\n')
            f_out.flush() 
            time.sleep(0.5)
            
    print(f"\n本次任务处理完毕!结果已追加到 '{OUTPUT_FILENAME}' 文件中。")

if __name__ == '__main__':
    main()
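
Since the slowness mostly comes from waiting on one HTTP request at a time, one possible speedup is a small thread pool. Below is a minimal, untested sketch that reuses get_phonetic, load_processed_words, and the config constants from the script above; main_parallel and MAX_WORKERS are names I made up here, and the safe pool size depends on how aggressively larousse.fr rate-limits, so start small.

import concurrent.futures

MAX_WORKERS = 4  # illustrative value; more workers risk rate-limiting or an IP ban

def main_parallel():
    """Like main(), but queries up to MAX_WORKERS words at the same time."""
    processed_words = load_processed_words()
    with open(INPUT_FILENAME, 'r', encoding='utf-8') as f_in:
        words = [w.strip() for w in f_in
                 if w.strip() and w.strip() not in processed_words]

    with open(OUTPUT_FILENAME, 'a', encoding='utf-8') as f_out, \
         concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        future_to_word = {pool.submit(get_phonetic, w): w for w in words}
        # Writing happens only in this main thread, so the output file needs no lock
        for future in concurrent.futures.as_completed(future_to_word):
            word = future_to_word[future]
            f_out.write(f"{word}: {future.result()}\n")
            f_out.flush()

Note that results are written in completion order rather than input order; the resume logic tolerates that, since it only looks at the word before the colon.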