import os
from pathlib import Path

from paddleocr import PPStructureV3
from pdf2image import convert_from_path

# Script overview: render every page of a PDF to a JPEG, run the
# PPStructureV3 pipeline on each page image, merge the per-page Markdown
# into one .md file, and save the image assets the results carry.

# Input PDF and output directory.
input_file = "C:/Users/zhens/Downloads/12.pdf"
output_path = Path("C:/Users/zhens/Downloads/output")
output_path.mkdir(parents=True, exist_ok=True)

# Step 1: render the PDF pages (300 DPI) and store each one as a JPEG in the
# output directory, remembering the file paths for the analysis step.
pdf_pages = convert_from_path(input_file, dpi=300)
image_paths = []
for idx, page in enumerate(pdf_pages):
    page_file = output_path / f"page_{idx+1}.jpg"
    page.save(page_file, "JPEG")
    image_paths.append(str(page_file))

# Step 2: analyse each page image with PPStructureV3 and collect, per result,
# its Markdown info plus the "markdown_images" mapping (empty dict if absent).
pipeline = PPStructureV3()
markdown_list = []
markdown_images = []
for image_path in image_paths:
    for page_result in pipeline.predict(image_path):
        page_markdown = page_result.markdown
        markdown_list.append(page_markdown)
        markdown_images.append(page_markdown.get("markdown_images", {}))

# Step 3: merge the per-page Markdown and write it next to the page images,
# named after the input PDF's stem.
markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
mkd_file_path.write_text(markdown_texts, encoding="utf-8")

# Step 4: save the image assets (markdown_images). Each key is joined onto
# the output directory and each value is written via its save() method.
for page_assets in markdown_images:
    if not page_assets:
        continue
    for relative_path, asset in page_assets.items():
        asset_file = output_path / relative_path
        asset_file.parent.mkdir(parents=True, exist_ok=True)
        asset.save(asset_file)