| 123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
# Convert a PDF into a single Markdown file: turn every page into a JPEG,
# run PPStructureV3 on each page image, merge the per-page Markdown, and
# save the images that the Markdown refers to.
from pathlib import Path
from paddleocr import PPStructureV3
from pdf2image import convert_from_path
import os

# Input PDF and output directory (created if it does not exist yet).
input_file = "C:/Users/zhens/Downloads/12.pdf"
output_path = Path("C:/Users/zhens/Downloads/output")
output_path.mkdir(parents=True, exist_ok=True)

# Step 1: convert the PDF into page images (dpi=300) and store one JPEG per
# page, numbered from 1, inside the output directory.
pdf_pages = convert_from_path(input_file, dpi=300)
image_paths = []
for page_number, page_image in enumerate(pdf_pages, start=1):
    page_file = output_path / f"page_{page_number}.jpg"
    page_image.save(page_file, "JPEG")
    # The pipeline is fed plain string paths, so store them as str.
    image_paths.append(str(page_file))

# Step 2: analyse every page image with PPStructureV3 and collect, per
# result, its Markdown info and the image mapping stored under
# "markdown_images".
pipeline = PPStructureV3()
markdown_list = []
markdown_images = []
for image_path in image_paths:
    for result in pipeline.predict(image_path):
        page_markdown = result.markdown
        markdown_list.append(page_markdown)
        markdown_images.append(page_markdown.get("markdown_images", {}))

# Step 3: merge the per-page Markdown and write it next to the page images,
# named after the input PDF.
markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
with mkd_file_path.open("w", encoding="utf-8") as markdown_file:
    markdown_file.write(markdown_texts)

# Step 4: save the image resources (markdown_images) under the output
# directory, creating parent directories as needed.
for page_assets in markdown_images:
    # Skip results that carry no images.
    if not page_assets:
        continue
    for relative_path, asset in page_assets.items():
        target = output_path / relative_path
        target.parent.mkdir(parents=True, exist_ok=True)
        asset.save(target)
|