分享《日汉双解学习词典（第二版）》PDF 及 OCR 结果

wynick27 · 2025 年 9 月 28 日 05:35

确实，真要用的话要做不少预处理，我准备再试试 Gemini。

这次没有用网页版，而是通过 API 自动调用，处理了前 10 页。Gemini 2.5 Pro 不知道怎么回事，一会儿超时，一会儿返回 503 错误，于是改用了 Gemini 2.5 Flash。

贴一下我的代码：

class PDFChat:
    """Ask questions about a single PDF through a Gemini client.

    On construction the PDF is looked up among the files already known to
    the client and uploaded only when no usable copy is found. Each call to
    :meth:`ask` sends the file handle together with a text prompt.

    Attributes:
        pdf_path: Local path of the PDF.
        model: Model name passed to ``generate_content``.
        client: The Gemini client used for all requests.
        file_id: Remote name of the uploaded file (set after construction).
        file: Remote file object passed to the model.
    """

    def __init__(self, pdf_path, client=None, file_id=None, model="gemini-2.5-flash"):
        """Create the chat helper and make sure the PDF is available remotely.

        Args:
            pdf_path: Local path of the PDF to query.
            client: Existing client to reuse; a default ``genai.Client()`` is
                created when omitted.
            file_id: Remote name of a previously uploaded file to reuse.
            model: Model name used by :meth:`ask`.
        """
        self.pdf_path = pdf_path
        self.model = model
        self.client = client if client is not None else genai.Client()
        self.file_id = file_id
        self.file = None
        self._ensure_uploaded()

    def _ensure_uploaded(self):
        """Reuse an already uploaded copy of the PDF, or upload it."""
        filename = os.path.basename(self.pdf_path)

        # Scan the files already uploaded and look for a usable copy.
        for remote in self.client.files.list():
            # A file is reused when it is an ACTIVE upload carrying this
            # PDF's base name, or when it is the file the caller explicitly
            # asked for via ``file_id``.
            matches_name = remote.display_name == filename and remote.state == "ACTIVE"
            matches_id = self.file_id is not None and remote.name == self.file_id
            if matches_name or matches_id:
                self.file_id = remote.name
                self.file = self.client.files.get(name=self.file_id)
                print(f"找到已上传文件: {filename}, file_id={self.file_id}")
                return

        # Nothing found: upload. The display name is set explicitly so that
        # the name-based lookup above can recognise this upload on later
        # runs; without it the same PDF would be uploaded again every time.
        with open(self.pdf_path, "rb") as f:
            uploaded = self.client.files.upload(
                file=f,
                config={
                    "mime_type": "application/pdf",
                    "display_name": filename,
                },
            )
        self.file_id = uploaded.name
        self.file = uploaded
        print(f"已上传新文件: {filename}, file_id={self.file_id}")

    def ask(self, question):
        """Send ``question`` about the PDF to the model and return its text.

        Args:
            question: Prompt text sent alongside the PDF.

        Returns:
            The ``text`` attribute of the model response, passed through
            unchanged.
        """
        response = self.client.models.generate_content(
            model=self.model,
            contents=[
                self.file,
                question,
            ],
        )
        return response.text


# Module-level client shared by write_ocr_text below.
# NOTE(review): API_KEY is not defined in this snippet; it must be set before this line runs.
client = genai.Client(api_key=API_KEY)

def write_ocr_text(file_path, output_path):
    """Ask the model to transcribe a dictionary PDF and save the result.

    Args:
        file_path: Path of the PDF to transcribe.
        output_path: Path of the UTF-8 text file to write.

    Returns:
        The text returned by the model.

    Raises:
        RuntimeError: If the model response contains no text. The check runs
            before the output file is opened, so an existing file is not
            truncated by a failed run.
    """
    # Local import keeps this function self-contained regardless of the
    # file's import block, which is not part of this snippet.
    import os

    chat = PDFChat(file_path, client=client)

    text = chat.ask("这是一本日语词典，希望识别全部文本。注意以下几点：每一页有页眉，表示起止单词，中间是表示页码的阿拉伯数字，作为分页符放入尖括号中。词典分为两栏，识别阅读顺序为先左栏再右栏，词条左栏没有结束的情况下和右栏合并输出。如果有表格需要用markdown表示法输出表格。注意特殊符号标注，比如黑色圆圈符号，三角符号△▷，菱形符号◇⬘⬙，还有词典使用的括号【】〔〕〘〙要注意配对。要尽可能多输出内容，需要识别pdf全部内容，不可在中途停止。")

    # The response may carry no text at all; fail with a clear message
    # instead of a TypeError from f.write(None) after truncating the file.
    if text is None:
        raise RuntimeError(f"model returned no text for {file_path!r}")

    # Create the destination directory on first use.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf8') as f:
        f.write(text)

    return text

# Example run: transcribe the PDF holding pages 1-10 and write the text under ./data/.
write_ocr_text('./pdf/日汉双解学习词典 1-10.pdf','./data/日汉双解学习词典 1-10.txt')