81 lines
3.0 KiB
Python
81 lines
3.0 KiB
Python
import pdf2docx
|
||
import fitz
|
||
import markdown
|
||
import pandas as pd
|
||
import os
|
||
|
||
class PDFConverter:
    """Namespace of static helpers that convert PDF files to other formats.

    Supported targets are Word (.docx), per-page PNG images, plain text
    saved as .md, and Excel (.xlsx).
    """

    # Values accepted for ``convert_type`` in ``batch_convert``.
    _SUPPORTED_TYPES = ('word', 'images', 'markdown', 'excel')

    @staticmethod
    def batch_convert(pdf_paths, output_dir, convert_type):
        """Convert several PDF files one after another.

        Each output is named after the base name of its PDF, so two inputs
        that share a base name write to the same output path. A failure on
        one file is reported on stdout and the remaining files are still
        processed.

        :param pdf_paths: iterable of PDF file paths
        :param output_dir: directory receiving the results (created if missing)
        :param convert_type: one of 'word', 'images', 'markdown', 'excel'
        :raises ValueError: if ``convert_type`` is not one of the values above
        """
        # Fail fast on a bad type: otherwise every file would be skipped
        # silently and the caller would never learn that nothing was done.
        if convert_type not in PDFConverter._SUPPORTED_TYPES:
            raise ValueError(
                f"Unsupported convert_type: {convert_type!r} "
                f"(expected one of {PDFConverter._SUPPORTED_TYPES})"
            )

        os.makedirs(output_dir, exist_ok=True)

        for pdf_path in pdf_paths:
            filename = os.path.basename(pdf_path)
            name_without_ext = os.path.splitext(filename)[0]

            try:
                if convert_type == 'word':
                    output_path = os.path.join(output_dir, f"{name_without_ext}.docx")
                    PDFConverter.pdf_to_word(pdf_path, output_path)
                elif convert_type == 'images':
                    # Each PDF gets its own sub-directory for its page images.
                    pdf_images_dir = os.path.join(output_dir, name_without_ext)
                    os.makedirs(pdf_images_dir, exist_ok=True)
                    PDFConverter.pdf_to_images(pdf_path, pdf_images_dir)
                elif convert_type == 'markdown':
                    output_path = os.path.join(output_dir, f"{name_without_ext}.md")
                    PDFConverter.pdf_to_markdown(pdf_path, output_path)
                else:  # 'excel' -- the only remaining supported type
                    output_path = os.path.join(output_dir, f"{name_without_ext}.xlsx")
                    PDFConverter.pdf_to_excel(pdf_path, output_path)
            except Exception as e:
                # Deliberate best-effort: report the failure and carry on
                # with the next file instead of aborting the whole batch.
                print(f"转换文件 {filename} 时出错: {e}")

    @staticmethod
    def pdf_to_word(pdf_path, output_path):
        """Convert one PDF to a Word document using pdf2docx.

        :param pdf_path: path of the source PDF
        :param output_path: path of the .docx file to write
        """
        converter = pdf2docx.Converter(pdf_path)
        try:
            converter.convert(output_path)
        finally:
            # Release the underlying PDF handle even when conversion fails.
            converter.close()

    @staticmethod
    def pdf_to_images(pdf_path, output_dir):
        """Render every page of a PDF to ``page_<n>.png`` (1-based) files.

        :param pdf_path: path of the source PDF
        :param output_dir: directory receiving the PNG files (created if missing)
        """
        os.makedirs(output_dir, exist_ok=True)
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                pix.save(os.path.join(output_dir, f'page_{page_number}.png'))

    @staticmethod
    def pdf_to_markdown(pdf_path, output_path):
        """Extract the text of a PDF with PyMuPDF and save it as UTF-8.

        Only the plain text returned by ``page.get_text()`` is written, page
        after page; no Markdown markup (headings, lists, ...) is generated.

        :param pdf_path: path of the source PDF
        :param output_path: path of the text/.md file to write
        """
        with fitz.open(pdf_path) as doc:
            markdown_text = "".join(page.get_text() for page in doc)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    @staticmethod
    def pdf_to_excel(pdf_path, output_path):
        """Extract the tables of a PDF with tabula-py and save them to Excel.

        Every extracted table is written to its own sheet ("Sheet1",
        "Sheet2", ...). If no table is found, an empty workbook is written.

        :param pdf_path: path of the source PDF
        :param output_path: path of the Excel file to write
        """
        # Imported lazily so the other conversions work without tabula-py.
        import tabula

        tables = tabula.read_pdf(pdf_path, pages='all')

        # Normalise the result to a list: guard against a bare DataFrame or
        # None coming back instead of a list of DataFrames (a bare DataFrame
        # would make the truthiness check below raise).
        if tables is None:
            tables = []
        elif isinstance(tables, pd.DataFrame):
            tables = [tables]

        if not tables:
            # No table found: still produce an (empty) Excel file.
            pd.DataFrame().to_excel(output_path, index=False)
            return

        # Write all tables, not just the first one, so no data is dropped.
        # "Sheet1" is pandas' default sheet name, so the first sheet is the
        # same as when only the first table was exported.
        with pd.ExcelWriter(output_path) as writer:
            for sheet_number, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)