import dataclasses
import os.path
import shutil
import tempfile

import pypdfium2 as pdfium


@dataclasses.dataclass
class PDFMetadata:
    """Basic facts about a PDF file on disk."""

    # Path of the PDF exactly as it was passed in (not just the base name).
    file_name: str
    # Size of the file in bytes, as reported by os.path.getsize.
    file_size: int
    # Number of pages in the document.
    n_pages: int


def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
    """
    Return the metadata of a PDF file.

    :param file_path: Path to an existing PDF file.
    :return: PDFMetadata holding the path, the size in bytes and the page count
    :raises ValueError: If file_path is None or does not point to a file.
    """
    if file_path is None:
        raise ValueError("file_path cannot be None")

    if not os.path.isfile(file_path):
        raise ValueError(f"file_path must be a file. {file_path} is not a file.")

    pdf = pdfium.PdfDocument(file_path)
    try:
        n_pages = len(pdf)
    finally:
        # Release the document even if counting the pages fails.
        pdf.close()

    file_size = os.path.getsize(file_path)

    return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)


def _render_pages(file_path: str, output_dir: str, scale: float) -> list[str]:
    """Render every page of the PDF into output_dir and return the image paths."""
    pdf = pdfium.PdfDocument(file_path)
    try:
        # The PDF's base name is used as the prefix of every image file.
        file_name = os.path.basename(file_path)

        image_paths = []
        for i in range(len(pdf)):
            page = pdf[i]
            image = page.render(scale=scale).to_pil()
            image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
            image.save(image_path)
            image_paths.append(image_path)
        return image_paths
    finally:
        # Release the document whether or not rendering succeeded.
        pdf.close()


def export_pages_as_images(file_path: str, scale: float = 0.6) -> list[str]:
    """
    Export the pages of a PDF file as JPEG images.

    The images are written to a freshly created temporary directory, one file
    per page, named ``<pdf base name>_<zero-padded page index>.jpg``. On
    success the directory is NOT removed: the caller owns it and should delete
    it when done (e.g. with ``shutil.rmtree`` on the directory of the returned
    paths). If the export fails, the directory is removed before the exception
    propagates, because the caller would have no way to locate it.

    :param file_path: Path to the PDF file.
    :param scale: Scale factor passed to the page renderer (default 0.6).
    :return: List of paths to the JPEG images, in page order
    :raises ValueError: If file_path is None.
    """
    if file_path is None:
        raise ValueError("file_path cannot be None")

    output_dir = tempfile.mkdtemp()  # Create a temporary directory

    try:
        return _render_pages(file_path, output_dir, scale)
    except BaseException:
        # Nothing is returned on failure, so clean up the orphaned directory
        # here and let the original exception continue to the caller.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise