pdf2word/src/core/converter.py
2025-04-26 14:32:43 +08:00

81 lines
3.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pdf2docx
import fitz
import markdown
import pandas as pd
import os
class PDFConverter:
    """Convert PDF files to Word, PNG images, Markdown text, or Excel.

    Every method is a static method; the class only serves as a namespace.
    """

    @staticmethod
    def batch_convert(pdf_paths, output_dir, convert_type):
        """
        Convert a batch of PDF files.

        A failure on one file is reported on stdout and does not stop the
        remaining files from being processed.

        :param pdf_paths: list of PDF file paths
        :param output_dir: output directory (created if it does not exist)
        :param convert_type: conversion type ('word', 'images', 'markdown', 'excel')
        :raises ValueError: if ``convert_type`` is not one of the supported types
        """
        # Single-file targets: conversion type -> (output extension, converter).
        file_targets = {
            'word': ('.docx', PDFConverter.pdf_to_word),
            'markdown': ('.md', PDFConverter.pdf_to_markdown),
            'excel': ('.xlsx', PDFConverter.pdf_to_excel),
        }
        # Reject an unsupported type up front instead of silently doing
        # nothing for every file.
        if convert_type != 'images' and convert_type not in file_targets:
            raise ValueError(
                f"Unsupported convert_type {convert_type!r}; expected one of "
                "'word', 'images', 'markdown', 'excel'"
            )

        os.makedirs(output_dir, exist_ok=True)
        for pdf_path in pdf_paths:
            filename = os.path.basename(pdf_path)
            name_without_ext = os.path.splitext(filename)[0]
            try:
                if convert_type == 'images':
                    # Each PDF gets its own sub-directory for its page images.
                    pdf_images_dir = os.path.join(output_dir, name_without_ext)
                    os.makedirs(pdf_images_dir, exist_ok=True)
                    PDFConverter.pdf_to_images(pdf_path, pdf_images_dir)
                else:
                    extension, convert = file_targets[convert_type]
                    output_path = os.path.join(
                        output_dir, f"{name_without_ext}{extension}"
                    )
                    convert(pdf_path, output_path)
            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"转换文件 {filename} 时出错: {str(e)}")

    @staticmethod
    def pdf_to_word(pdf_path, output_path):
        """
        Convert a PDF file to a Word document using pdf2docx.

        :param pdf_path: PDF file path
        :param output_path: path of the .docx file to write
        """
        converter = pdf2docx.Converter(pdf_path)
        try:
            converter.convert(output_path)
        finally:
            # Release the underlying PDF even when the conversion fails.
            converter.close()

    @staticmethod
    def pdf_to_images(pdf_path, output_dir):
        """
        Render every page of a PDF to ``page_<n>.png`` (1-based) in ``output_dir``.

        :param pdf_path: PDF file path
        :param output_dir: existing directory that receives the PNG files
        """
        # The context manager closes the document once rendering is done.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                pix.save(os.path.join(output_dir, f'page_{page_number}.png'))

    @staticmethod
    def pdf_to_markdown(pdf_path, output_path):
        """
        Extract the text of every page with PyMuPDF and write it to a file.

        The page texts are concatenated as-is; no Markdown markup is added.

        :param pdf_path: PDF file path
        :param output_path: path of the UTF-8 text file to write
        """
        with fitz.open(pdf_path) as doc:
            markdown_text = "".join(page.get_text() for page in doc)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    @staticmethod
    def pdf_to_excel(pdf_path, output_path):
        """
        Convert a PDF file to Excel.

        Every table extracted from the PDF is written to its own worksheet
        ("Sheet1", "Sheet2", ...). If no table is found, an empty workbook
        is written.

        :param pdf_path: PDF file path
        :param output_path: output file path
        """
        # tabula-py is imported lazily so that the other conversions work
        # without it being installed.
        import tabula

        tables = tabula.read_pdf(pdf_path, pages='all')
        if not tables:
            # No table found: create an empty Excel file.
            pd.DataFrame().to_excel(output_path, index=False)
            return

        with pd.ExcelWriter(output_path) as writer:
            for sheet_number, table in enumerate(tables, start=1):
                table.to_excel(
                    writer, sheet_name=f"Sheet{sheet_number}", index=False
                )