diff options
Diffstat (limited to 'pyblackbird_cc/resources/services.py')
-rw-r--r-- | pyblackbird_cc/resources/services.py | 57 |
1 files changed, 57 insertions, 0 deletions
"""Helpers for reading PDF metadata and rendering PDF pages to image files."""

import dataclasses
import os.path
import shutil
import tempfile

import pypdfium2 as pdfium


@dataclasses.dataclass
class PDFMetadata:
    """Basic facts about a PDF file on disk."""

    # The path that was passed to get_pdf_metadata_from_path (not a basename).
    file_name: str
    # Size of the file in bytes, as reported by os.path.getsize.
    file_size: int
    # Number of pages in the document.
    n_pages: int


def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
    """
    Return the metadata of the PDF file at ``file_path``.

    :param file_path: Path to an existing PDF file.
    :return: PDFMetadata with the given path, the file size in bytes and
        the page count.
    :raises ValueError: If ``file_path`` is None or is not an existing file.
    """
    if file_path is None:
        raise ValueError("file_path cannot be None")
    if not os.path.isfile(file_path):
        raise ValueError(f"file_path must be a file. {file_path} is not a file.")
    pdf = pdfium.PdfDocument(file_path)
    try:
        n_pages = len(pdf)
        file_size = os.path.getsize(file_path)
    finally:
        # Release the native document handle even if reading the metadata fails.
        pdf.close()
    return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)


def export_pages_as_images(file_path: str) -> list[str]:
    """
    Export every page of a PDF file as a JPEG image.

    The images are written to a freshly created temporary directory and are
    named ``<basename of file_path>_<zero-padded page index>.jpg``. On success
    the directory is NOT removed: the caller owns it and should delete it
    (for example with ``shutil.rmtree``) once the images are no longer needed.
    If anything fails, the directory is removed before the error propagates,
    because the caller would otherwise have no path to clean up.

    :param file_path: Path to the PDF file to render.
    :return: List of paths to the JPEG images, in page order.
    :raises ValueError: If ``file_path`` is None.
    """
    if file_path is None:
        raise ValueError("file_path cannot be None")
    output_dir = tempfile.mkdtemp()  # Caller-owned on success, removed on failure.
    try:
        pdf = pdfium.PdfDocument(file_path)
        try:
            # Image names are derived from the PDF's own file name.
            file_name = os.path.basename(file_path)
            image_paths = []
            for i in range(len(pdf)):
                page = pdf[i]
                image = page.render(scale=0.6).to_pil()
                image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
                image.save(image_path)
                image_paths.append(image_path)
        finally:
            # Always release the native document handle.
            pdf.close()
    except BaseException:
        # Nothing is returned on failure, so drop the partial output here.
        shutil.rmtree(output_dir, ignore_errors=True)
        raise
    return image_paths