81 lines
3.0 KiB
Python
81 lines
3.0 KiB
Python
|
import pdf2docx
|
|||
|
import fitz
|
|||
|
import markdown
|
|||
|
import pandas as pd
|
|||
|
import os
|
|||
|
|
|||
|
class PDFConverter:
    """Namespace of static helpers that convert PDF files to other formats.

    Supported targets are Word (.docx), per-page PNG images, plain extracted
    text saved as .md, and Excel (.xlsx) built from detected tables.
    """

    # Conversion types understood by batch_convert.
    _SUPPORTED_TYPES = ('word', 'images', 'markdown', 'excel')

    @staticmethod
    def batch_convert(pdf_paths, output_dir, convert_type):
        """Convert several PDF files, writing the results into ``output_dir``.

        Files are processed independently: when one conversion raises, the
        error is printed and processing continues with the next file.

        :param pdf_paths: iterable of PDF file paths
        :param output_dir: output directory (created if missing)
        :param convert_type: conversion type
            ('word', 'images', 'markdown', 'excel')
        """
        os.makedirs(output_dir, exist_ok=True)

        if convert_type not in PDFConverter._SUPPORTED_TYPES:
            # Report an unknown type once instead of silently producing no
            # output for every file; reported via print like other errors
            # here, so callers that never expected an exception keep working.
            print(f"不支持的转换类型: {convert_type}")
            return

        for pdf_path in pdf_paths:
            filename = os.path.basename(pdf_path)
            name_without_ext = os.path.splitext(filename)[0]

            try:
                if convert_type == 'word':
                    output_path = os.path.join(
                        output_dir, f"{name_without_ext}.docx")
                    PDFConverter.pdf_to_word(pdf_path, output_path)
                elif convert_type == 'images':
                    # Each PDF gets its own sub-directory so that page
                    # images of different PDFs do not overwrite each other.
                    pdf_images_dir = os.path.join(output_dir, name_without_ext)
                    os.makedirs(pdf_images_dir, exist_ok=True)
                    PDFConverter.pdf_to_images(pdf_path, pdf_images_dir)
                elif convert_type == 'markdown':
                    output_path = os.path.join(
                        output_dir, f"{name_without_ext}.md")
                    PDFConverter.pdf_to_markdown(pdf_path, output_path)
                else:  # 'excel'
                    output_path = os.path.join(
                        output_dir, f"{name_without_ext}.xlsx")
                    PDFConverter.pdf_to_excel(pdf_path, output_path)
            except Exception as e:
                # Batch boundary: one bad file must not abort the whole run.
                print(f"转换文件 {filename} 时出错: {e}")
                continue

    @staticmethod
    def pdf_to_word(pdf_path, output_path):
        """Convert a PDF file to a Word document using pdf2docx.

        :param pdf_path: path of the PDF file to read
        :param output_path: path of the .docx file to write
        """
        converter = pdf2docx.Converter(pdf_path)
        try:
            converter.convert(output_path)
        finally:
            # Release the underlying PDF handle even when conversion fails.
            converter.close()

    @staticmethod
    def pdf_to_images(pdf_path, output_dir):
        """Render every page of a PDF to ``page_<n>.png`` in ``output_dir``.

        Page numbers in the file names start at 1.

        :param pdf_path: path of the PDF file to read
        :param output_dir: existing directory that receives the PNG files
        """
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                pix.save(os.path.join(output_dir, f'page_{page_num}.png'))

    @staticmethod
    def pdf_to_markdown(pdf_path, output_path):
        """Extract the text of a PDF with PyMuPDF and save it as UTF-8.

        The text of all pages is concatenated in page order and written
        as-is; no Markdown markup is added by this method.

        :param pdf_path: path of the PDF file to read
        :param output_path: path of the text/Markdown file to write
        """
        with fitz.open(pdf_path) as doc:
            markdown_text = "".join(page.get_text() for page in doc)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    @staticmethod
    def pdf_to_excel(pdf_path, output_path):
        """Convert the tables found in a PDF to an Excel workbook.

        Every table detected by tabula-py is written to its own sheet
        ("Sheet1", "Sheet2", ...). When no table is found, an empty
        workbook is written.

        :param pdf_path: path of the PDF file to read
        :param output_path: path of the .xlsx file to write
        """
        # Imported here rather than at module level, so tabula-py is only
        # required when this conversion is actually used.
        import tabula

        tables = tabula.read_pdf(pdf_path, pages='all')
        if isinstance(tables, pd.DataFrame):
            # Some tabula-py versions/configurations return a single
            # DataFrame instead of a list; normalise to a list.
            tables = [tables]

        if not tables:
            # No table found: still produce an (empty) Excel file.
            pd.DataFrame().to_excel(output_path, index=False)
            return

        with pd.ExcelWriter(output_path) as writer:
            for sheet_no, table in enumerate(tables, start=1):
                table.to_excel(
                    writer, sheet_name=f"Sheet{sheet_no}", index=False)