# PaddleOCR_PDF.py (1.4 KB)
  1. from pathlib import Path
  2. from paddleocr import PPStructureV3
  3. from pdf2image import convert_from_path
  4. import os
  5. # 输入 PDF 和输出路径
  6. input_file = "C:/Users/zhens/Downloads/12.pdf"
  7. output_path = Path("C:/Users/zhens/Downloads/output")
  8. output_path.mkdir(parents=True, exist_ok=True)
  9. # Step 1: 把 PDF 转为图片
  10. pdf_pages = convert_from_path(input_file, dpi=300)
  11. image_paths = []
  12. for idx, page in enumerate(pdf_pages):
  13. img_path = output_path / f"page_{idx+1}.jpg"
  14. page.save(img_path, "JPEG")
  15. image_paths.append(str(img_path))
  16. # Step 2: 使用 PPStructureV3 分析每页图像
  17. pipeline = PPStructureV3()
  18. markdown_list = []
  19. markdown_images = []
  20. for img in image_paths:
  21. output = pipeline.predict(img)
  22. for res in output:
  23. md_info = res.markdown
  24. markdown_list.append(md_info)
  25. markdown_images.append(md_info.get("markdown_images", {}))
  26. # Step 3: 合并 Markdown 输出
  27. markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
  28. mkd_file_path = output_path / f"{Path(input_file).stem}.md"
  29. with open(mkd_file_path, "w", encoding="utf-8") as f:
  30. f.write(markdown_texts)
  31. # Step 4: 保存图像资源(markdown_images)
  32. for item in markdown_images:
  33. if item:
  34. for path, image in item.items():
  35. file_path = output_path / path
  36. file_path.parent.mkdir(parents=True, exist_ok=True)
  37. image.save(file_path)