| 12345678910111213141516171819202122232425262728293031 |
# Parse a document with the PP-StructureV3 pipeline, merge the per-result
# Markdown into a single file, and save the images that Markdown refers to.
from pathlib import Path

from paddleocr import PPStructureV3

# Document to parse, and the directory that receives all generated files.
input_file = "./your_pdf_file.pdf"
output_path = Path("./output")

pipeline = PPStructureV3()
# Predict on `input_file` itself: the Markdown file written below is named
# after it, so the parsed document and the output name must refer to the
# same file.
output = pipeline.predict(input_file)

# Collect the Markdown payload of every result, along with its image mapping
# (an empty dict when a result carries no "markdown_images" entry).
markdown_list = []
markdown_images = []
for res in output:
    md_info = res.markdown
    markdown_list.append(md_info)
    markdown_images.append(md_info.get("markdown_images", {}))

# Merge the collected payloads into the text that is written to disk.
markdown_texts = pipeline.concatenate_markdown_pages(markdown_list)

# Write the merged Markdown to <output_path>/<input file stem>.md.
mkd_file_path = output_path / f"{Path(input_file).stem}.md"
mkd_file_path.parent.mkdir(parents=True, exist_ok=True)
with open(mkd_file_path, "w", encoding="utf-8") as f:
    f.write(markdown_texts)

# Save each image under output_path at the key it is stored under in the
# mapping, creating sub-directories as needed.
# NOTE(review): presumably these keys are the relative paths the Markdown
# links to, and the values are PIL-style images exposing .save() — confirm
# against the PaddleOCR docs.
for item in markdown_images:
    if item:
        for path, image in item.items():
            file_path = output_path / path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            image.save(file_path)
|