PaddleOCR_png.py 891 B

12345678910111213141516171819202122232425262728293031
  1. from pathlib import Path
  2. from paddleocr import PPStructureV3
  3. input_file = "./your_pdf_file.pdf"
  4. output_path = Path("./output")
  5. pipeline = PPStructureV3()
  6. output = pipeline.predict("./pp_structure_v3_demo.png")
  7. markdown_list = []
  8. markdown_images = []
  9. for res in output:
  10. md_info = res.markdown
  11. markdown_list.append(md_info)
  12. markdown_images.append(md_info.get("markdown_images", {}))
  13. markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
  14. mkd_file_path = output_path / f"{Path(input_file).stem}.md"
  15. mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
  16. with open(mkd_file_path, "w", encoding="utf-8") as f:
  17. f.write(markdown_texts)
  18. for item in markdown_images:
  19. if item:
  20. for path, image in item.items():
  21. file_path = output_path / path
  22. file_path.parent.mkdir(parents=True, exist_ok=True)
  23. image.save(file_path)