import dataclasses
import os.path
import shutil
import tempfile

from pdf2image import convert_from_path
from PyPDF2 import PdfReader


@dataclasses.dataclass
class PDFMetadata:
    """Basic facts about a PDF file on disk."""

    file_name: str  # the path exactly as it was passed in by the caller
    file_size: int  # size reported by os.path.getsize (bytes)
    n_pages: int  # number of pages reported by PyPDF2


def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
    """
    Return the metadata of a PDF file.

    :param file_path: path to an existing PDF file
    :return: PDFMetadata holding the path, the file size and the page count
    :raises ValueError: if ``file_path`` does not point to a regular file
    """
    if not os.path.isfile(file_path):
        # f-string so the offending path actually shows up in the message.
        raise ValueError(f"file_path must be a file. {file_path} is not a file.")

    reader = PdfReader(file_path)
    n_pages = len(reader.pages)
    file_size = os.path.getsize(file_path)
    return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)


def export_pages_as_images(file_path: str) -> list[str]:
    """
    Export the pages of a PDF file as JPEG images.

    The images are written to a freshly created temporary directory that is
    NOT removed on success: the caller owns it and should delete it (for
    example with ``shutil.rmtree(os.path.dirname(paths[0]))``) once the images
    are no longer needed. If rendering or saving fails, or no page was
    rendered, the directory is removed before returning/raising.

    :param file_path: path to the PDF file to render
    :return: List of paths to the JPEG images, one per rendered page, named
        ``<pdf file name>_<zero-padded page index>.jpg``
    """
    output_dir = tempfile.mkdtemp()  # outlives this call; holds the JPEGs
    file_name = os.path.basename(file_path)
    image_paths = []

    try:
        # pdf2image writes its intermediate files into a scratch directory
        # that disappears as soon as the pages have been re-saved as JPEG.
        with tempfile.TemporaryDirectory() as scratch_dir:
            pages = convert_from_path(
                file_path, dpi=56, size=300, output_folder=scratch_dir
            )
            # Iterate over what was actually rendered instead of trusting a
            # separately computed page count.
            for index, page in enumerate(pages):
                image_path = os.path.join(output_dir, f"{file_name}_{index:03d}.jpg")
                page.save(image_path)
                image_paths.append(image_path)
    except BaseException:
        # Do not leak a half-filled output directory; re-raise unchanged.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise

    if not image_paths:
        # Nothing was written, and the caller has no path to locate the
        # directory by, so remove it here.
        shutil.rmtree(output_dir, ignore_errors=True)

    return image_paths