import os
from typing import Iterable

import fitz
import markdown
import pandas as pd
import pdf2docx


class PDFConverter:
    """Convert PDF files to Word, PNG images, plain-text ``.md`` or Excel files.

    All methods are static; the class only serves as a namespace.
    """

    # Values accepted by ``batch_convert`` for ``convert_type``.
    SUPPORTED_TYPES = ('word', 'images', 'markdown', 'excel')

    @staticmethod
    def batch_convert(pdf_paths: Iterable[str], output_dir: str, convert_type: str) -> None:
        """Convert several PDF files into ``output_dir``.

        Output names are derived from each input's base name, so inputs that
        share a base name write to the same output path.

        A failure on one file is printed and does not stop the batch.

        :param pdf_paths: paths of the PDF files to convert
        :param output_dir: directory receiving the results (created if missing)
        :param convert_type: one of 'word', 'images', 'markdown', 'excel'
        """
        os.makedirs(output_dir, exist_ok=True)

        # Report an unknown type once instead of silently doing nothing for
        # every file in the batch.
        if convert_type not in PDFConverter.SUPPORTED_TYPES:
            print(f"不支持的转换类型: {convert_type}")
            return

        for pdf_path in pdf_paths:
            filename = os.path.basename(pdf_path)
            name_without_ext = os.path.splitext(filename)[0]

            try:
                if convert_type == 'word':
                    output_path = os.path.join(output_dir, f"{name_without_ext}.docx")
                    PDFConverter.pdf_to_word(pdf_path, output_path)
                elif convert_type == 'images':
                    # Each PDF gets its own sub-directory for its page images.
                    pdf_images_dir = os.path.join(output_dir, name_without_ext)
                    os.makedirs(pdf_images_dir, exist_ok=True)
                    PDFConverter.pdf_to_images(pdf_path, pdf_images_dir)
                elif convert_type == 'markdown':
                    output_path = os.path.join(output_dir, f"{name_without_ext}.md")
                    PDFConverter.pdf_to_markdown(pdf_path, output_path)
                elif convert_type == 'excel':
                    output_path = os.path.join(output_dir, f"{name_without_ext}.xlsx")
                    PDFConverter.pdf_to_excel(pdf_path, output_path)
            except Exception as e:
                # Best-effort batch: report the failure and move on to the next file.
                print(f"转换文件 {filename} 时出错: {str(e)}")

    @staticmethod
    def pdf_to_word(pdf_path: str, output_path: str) -> None:
        """Convert one PDF to a ``.docx`` file using pdf2docx.

        :param pdf_path: path of the source PDF
        :param output_path: path of the ``.docx`` file to write
        """
        converter = pdf2docx.Converter(pdf_path)
        try:
            converter.convert(output_path)
        finally:
            # Release the converter even when the conversion fails.
            converter.close()

    @staticmethod
    def pdf_to_images(pdf_path: str, output_dir: str) -> None:
        """Render every page of a PDF to ``page_<n>.png`` (1-based) in ``output_dir``.

        ``output_dir`` is not created here; ``batch_convert`` creates it
        before calling this method.

        :param pdf_path: path of the source PDF
        :param output_dir: directory receiving the PNG files
        """
        # The context manager closes the document when done or on error.
        with fitz.open(pdf_path) as doc:
            for page_number, page in enumerate(doc, start=1):
                pix = page.get_pixmap()
                pix.save(os.path.join(output_dir, f'page_{page_number}.png'))

    @staticmethod
    def pdf_to_markdown(pdf_path: str, output_path: str) -> None:
        """Extract the text of every page and write it to ``output_path`` as UTF-8.

        The text is written exactly as PyMuPDF's ``get_text()`` returns it;
        no Markdown markup is added.

        :param pdf_path: path of the source PDF
        :param output_path: path of the text/Markdown file to write
        """
        with fitz.open(pdf_path) as doc:
            markdown_text = "".join(page.get_text() for page in doc)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(markdown_text)

    @staticmethod
    def pdf_to_excel(pdf_path: str, output_path: str) -> None:
        """Extract the tables of a PDF with tabula-py and write them to Excel.

        The first table goes to sheet ``Sheet1``; any further tables go to
        ``Sheet2``, ``Sheet3``, ... so none of them is dropped. When no table
        is found an empty workbook is written.

        :param pdf_path: path of the source PDF
        :param output_path: path of the ``.xlsx`` file to write
        """
        # Imported lazily so the other conversions work without tabula-py.
        import tabula

        tables = tabula.read_pdf(pdf_path, pages='all')

        # NOTE(review): read_pdf is expected to return a list of DataFrames;
        # normalise a bare DataFrame / None defensively, since truth-testing a
        # DataFrame raises ValueError.
        if tables is None:
            tables = []
        elif isinstance(tables, pd.DataFrame):
            tables = [tables]

        if not tables:
            # No table found: still produce an (empty) Excel file.
            pd.DataFrame().to_excel(output_path, index=False)
            return

        with pd.ExcelWriter(output_path) as writer:
            for sheet_number, table in enumerate(tables, start=1):
                table.to_excel(writer, sheet_name=f"Sheet{sheet_number}", index=False)