pdf2word/src/core/converter.py

81 lines
3.0 KiB
Python
Raw Normal View History

2025-04-26 14:32:43 +08:00
import pdf2docx
import fitz
import markdown
import pandas as pd
import os
class PDFConverter:
    """Static helpers that convert PDF files to Word, images, Markdown or Excel."""

    @staticmethod
    def batch_convert(pdf_paths, output_dir, convert_type):
        """Convert every PDF in ``pdf_paths`` and place the results in ``output_dir``.

        A failure on one file is reported on stdout and does not stop the
        remaining files from being processed.

        :param pdf_paths: iterable of PDF file paths
        :param output_dir: directory that receives the converted files
            (created if it does not exist)
        :param convert_type: one of ``'word'``, ``'images'``, ``'markdown'``,
            ``'excel'``; any other value is reported on stdout and nothing
            is converted
        """
        # Converters that produce exactly one output file per PDF, keyed by
        # type: (output suffix, conversion function).
        single_file_targets = {
            'word': ('.docx', PDFConverter.pdf_to_word),
            'markdown': ('.md', PDFConverter.pdf_to_markdown),
            'excel': ('.xlsx', PDFConverter.pdf_to_excel),
        }

        # Reject an unknown type once, up front, instead of silently doing
        # nothing for every file in the batch.
        if convert_type != 'images' and convert_type not in tuple(single_file_targets):
            print(f"不支持的转换类型: {convert_type}")
            return

        os.makedirs(output_dir, exist_ok=True)

        for pdf_path in pdf_paths:
            filename = os.path.basename(pdf_path)
            stem = os.path.splitext(filename)[0]
            try:
                if convert_type == 'images':
                    # Each PDF gets its own sub-directory for its page images.
                    images_dir = os.path.join(output_dir, stem)
                    os.makedirs(images_dir, exist_ok=True)
                    PDFConverter.pdf_to_images(pdf_path, images_dir)
                else:
                    suffix, convert = single_file_targets[convert_type]
                    convert(pdf_path, os.path.join(output_dir, f"{stem}{suffix}"))
            except Exception as e:
                # Best-effort batch: report the failure and move on.
                print(f"转换文件 {filename} 时出错: {str(e)}")

    @staticmethod
    def pdf_to_word(pdf_path, output_path):
        """Convert a PDF to a .docx file using pdf2docx.

        :param pdf_path: path of the source PDF
        :param output_path: path of the .docx file to write
        """
        converter = pdf2docx.Converter(pdf_path)
        try:
            converter.convert(output_path)
        finally:
            # Release the converter even when the conversion fails.
            converter.close()

    @staticmethod
    def pdf_to_images(pdf_path, output_dir):
        """Render every page of a PDF to ``page_<n>.png`` (1-based) in ``output_dir``.

        :param pdf_path: path of the source PDF
        :param output_dir: existing directory that receives the PNG files
        """
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                pix.save(os.path.join(output_dir, f'page_{page_number}.png'))

    @staticmethod
    def pdf_to_markdown(pdf_path, output_path):
        """Extract the text of every page with PyMuPDF and write it to ``output_path``.

        The extracted text is written as-is (UTF-8); no Markdown markup is added.

        :param pdf_path: path of the source PDF
        :param output_path: path of the .md file to write
        """
        with fitz.open(pdf_path) as doc:
            markdown_text = "".join(page.get_text() for page in doc)
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    @staticmethod
    def pdf_to_excel(pdf_path, output_path):
        """Extract the tables of a PDF with tabula-py and write them to an Excel file.

        Each extracted table is written to its own worksheet (``Sheet1``,
        ``Sheet2``, ...). When no table is found an empty workbook is written.

        :param pdf_path: path of the source PDF
        :param output_path: path of the .xlsx file to write
        """
        # Imported lazily so the other conversions work without tabula-py.
        import tabula

        tables = tabula.read_pdf(pdf_path, pages='all')
        if not tables:
            # No table found: still produce an (empty) Excel file.
            pd.DataFrame().to_excel(output_path, index=False)
            return

        # Write every table, not just the first one, so no data is dropped.
        with pd.ExcelWriter(output_path) as writer:
            for sheet_number, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)