blob: b33afcf4f2a0b47a606bdfd725e6207fc703c519 (
plain) (
tree)
|
|
import dataclasses
import os.path
import tempfile
import pypdfium2 as pdfium
@dataclasses.dataclass
class PDFMetadata:
    """Basic facts about a PDF document on disk."""

    # Name/path string identifying the PDF, exactly as supplied by the producer.
    file_name: str
    # Size of the file.
    file_size: int
    # Number of pages in the document.
    n_pages: int
def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
    """
    Return basic metadata (path, size in bytes, page count) for a PDF file.

    :param file_path: Path to an existing PDF file.
    :return: PDFMetadata for the file. ``file_name`` holds ``file_path``
        exactly as it was passed in.
    :raises ValueError: If ``file_path`` is None or is not an existing file.
    """
    if file_path is None:
        raise ValueError("file_path cannot be None")
    if not os.path.isfile(file_path):
        raise ValueError(f"file_path must be a file. {file_path} is not a file.")

    pdf = pdfium.PdfDocument(file_path)
    try:
        n_pages = len(pdf)
        file_size = os.path.getsize(file_path)
    finally:
        # Release the document handle even if one of the queries above raises.
        pdf.close()

    # NOTE(review): file_name receives the full path, not the basename, as in
    # the original implementation - confirm this is what callers expect.
    return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)
def export_pages_as_images(file_path: str) -> list[str]:
    """
    Export every page of a PDF file as a JPEG image.

    The images are written into a freshly created temporary directory and are
    named ``<pdf basename>_<page index, zero-padded to 3 digits>.jpg``. Pages
    are rendered at ``scale=0.6``.

    On success the temporary directory is left in place; the caller owns it
    and should remove it (e.g. ``shutil.rmtree(os.path.dirname(paths[0]))``)
    once the images are no longer needed. If anything fails, the directory is
    removed before the exception propagates, because the caller would
    otherwise have no way to find it.

    :param file_path: Path to the PDF file.
    :return: List of paths to the JPEG images, in page order.
    :raises ValueError: If ``file_path`` is None.
    """
    # Local import keeps this block self-contained; only needed for cleanup.
    import shutil

    if file_path is None:
        raise ValueError("file_path cannot be None")

    output_dir = tempfile.mkdtemp()
    try:
        pdf = pdfium.PdfDocument(file_path)
        try:
            # The PDF's own file name is used as the prefix of each image name.
            file_name = os.path.basename(file_path)
            image_paths = []
            for i in range(len(pdf)):
                image = pdf[i].render(scale=0.6).to_pil()
                image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
                image.save(image_path)
                image_paths.append(image_path)
        finally:
            # Always release the document handle, including on render/save errors.
            pdf.close()
    except BaseException:
        # Nothing is returned on failure, so do not leak the temp directory.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return image_paths
|