zhensolid 6 månader sedan
förälder
incheckning
68ce7268e6
2 ändrade filer med 76 tillägg och 0 borttagningar
  1. 45 0
      PaddleOCR_PDF.py
  2. 31 0
      PaddleOCR_png.py

+ 45 - 0
PaddleOCR_PDF.py

@@ -0,0 +1,45 @@
+from pathlib import Path
+from paddleocr import PPStructureV3
+from pdf2image import convert_from_path
+import os
+
+# Source PDF and the directory that receives every generated file.
+input_file = "C:/Users/zhens/Downloads/12.pdf"
+output_path = Path("C:/Users/zhens/Downloads/output")
+output_path.mkdir(parents=True, exist_ok=True)
+
+# Step 1: rasterize the PDF at 300 DPI and store each page as
+# page_<n>.jpg (numbered from 1) inside the output directory.
+rendered_pages = enumerate(convert_from_path(input_file, dpi=300), start=1)
+image_paths = []
+
+for page_number, page_image in rendered_pages:
+    jpeg_path = output_path / f"page_{page_number}.jpg"
+    page_image.save(jpeg_path, "JPEG")
+    image_paths.append(str(jpeg_path))
+
+# Step 2: run PPStructureV3 on every page image, in page order, and
+# collect the markdown payload of each result.
+pipeline = PPStructureV3()
+markdown_list = [
+    result.markdown
+    for image_path in image_paths
+    for result in pipeline.predict(image_path)
+]
+
+# Images referenced by each result's markdown ({} when the key is absent).
+markdown_images = [info.get("markdown_images", {}) for info in markdown_list]
+
+# Step 3: merge the per-page markdown into one document and write it
+# as <pdf stem>.md (UTF-8) inside the output directory.
+markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
+mkd_file_path = output_path / f"{Path(input_file).stem}.md"
+mkd_file_path.write_text(markdown_texts, encoding="utf-8")
+
+# Step 4: save the image resources (markdown_images) under the output
+# directory, creating sub-directories on demand; empty entries are skipped.
+for page_assets in filter(None, markdown_images):
+    for asset_path, image in page_assets.items():
+        target = output_path / asset_path
+        target.parent.mkdir(parents=True, exist_ok=True)
+        image.save(target)

+ 31 - 0
PaddleOCR_png.py

@@ -0,0 +1,31 @@
+from pathlib import Path
+from paddleocr import PPStructureV3
+
+# The one input path: handed to predict(), and its stem names the .md file.
+input_file = "./pp_structure_v3_demo.png"
+output_path = Path("./output")
+pipeline = PPStructureV3()
+output = pipeline.predict(input_file)
+
+# Markdown payload of every result, and the images each payload references.
+markdown_list = []
+markdown_images = []
+for res in output:
+    md_info = res.markdown
+    markdown_list.append(md_info)
+    markdown_images.append(md_info.get("markdown_images", {}))
+
+# Merge the results and write them as <input stem>.md in the output directory.
+markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)
+mkd_file_path = output_path / f"{Path(input_file).stem}.md"
+mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
+with open(mkd_file_path, "w", encoding="utf-8") as f:
+    f.write(markdown_texts)
+
+# Save the referenced images under the output directory, creating folders as needed.
+for item in markdown_images:
+    if item:
+        for path, image in item.items():
+            file_path = output_path / path
+            file_path.parent.mkdir(parents=True, exist_ok=True)
+            image.save(file_path)