|
@@ -0,0 +1,45 @@
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from paddleocr import PPStructureV3
|
|
|
|
|
+from pdf2image import convert_from_path
|
|
|
|
|
+import os
|
|
|
|
|
+
|
|
|
|
|
# Input PDF and output location.
# NOTE(review): both paths are hard-coded to one user's Downloads folder.
output_path = Path("C:/Users/zhens/Downloads/output")
input_file = "C:/Users/zhens/Downloads/12.pdf"

# Create the output directory (and any missing parents) before anything is
# written; an already-existing directory is not an error.
output_path.mkdir(exist_ok=True, parents=True)
|
|
|
|
|
+
|
|
|
|
|
# Step 1: convert the PDF into images (rendered at 300 DPI).
pdf_pages = convert_from_path(input_file, dpi=300)
image_paths = []

# Files are numbered from 1: page_1.jpg, page_2.jpg, ...
for page_number, page_image in enumerate(pdf_pages, start=1):
    jpeg_path = output_path / f"page_{page_number}.jpg"
    page_image.save(jpeg_path, "JPEG")
    # Record the saved location as a plain string.
    image_paths.append(str(jpeg_path))
|
|
|
|
|
+
|
|
|
|
|
# Step 2: analyze each page image with PPStructureV3.
pipeline = PPStructureV3()
markdown_list = []    # one `.markdown` value per prediction result
markdown_images = []  # its "markdown_images" entry, or {} when the key is absent

for image_file in image_paths:
    # predict() is iterated because it may yield several results per image.
    for page_result in pipeline.predict(image_file):
        page_markdown = page_result.markdown
        markdown_list.append(page_markdown)
        embedded_images = page_markdown.get("markdown_images", {})
        markdown_images.append(embedded_images)
|
|
|
|
|
+
|
|
|
|
|
# Step 3: merge the Markdown output.
markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)

# The Markdown file takes the input PDF's base name and lives in output_path.
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
mkd_file_path.write_text(markdown_texts, encoding="utf-8")
|
|
|
|
|
+
|
|
|
|
|
# Step 4: save the image resources (markdown_images).
for page_images in markdown_images:
    if not page_images:
        # Empty mapping: this result has no images to write.
        continue
    for resource_path, picture in page_images.items():
        # Each key is joined onto output_path to form the destination file.
        target = output_path / resource_path
        # Make sure the destination folder exists before saving into it.
        target.parent.mkdir(exist_ok=True, parents=True)
        picture.save(target)
|