diff options
author | Matthew Lemon <y@yulqen.org> | 2024-06-23 17:14:26 +0100 |
---|---|---|
committer | Matthew Lemon <y@yulqen.org> | 2024-06-23 17:32:10 +0100 |
commit | 9189ae7c790602035a5d43b5cc1f37aa3a2867d2 (patch) | |
tree | 645d4e5c605b0119e0fdbca3c91a17e378aaac3c /pyblackbird_cc | |
parent | 6a596cac664a063bd2dc9ca99f2f19664970ed01 (diff) |
Removed pypdfium2 which doesn't build on FreeBSD
Libraries used:
- PyPDF2 (to get basic PDF length) - https://pypi.org/project/PyPDF2/
- pdf2image (to extract images from each page) -
https://github.com/Belval/pdf2image
Reduction of quality of screenshotted image is in place.
Reduced pdf screenshot quality considerably
Diffstat (limited to '')
-rw-r--r-- | pyblackbird_cc/resources/services.py | 36 |
1 files changed, 19 insertions, 17 deletions
diff --git a/pyblackbird_cc/resources/services.py b/pyblackbird_cc/resources/services.py index b33afcf..441f623 100644 --- a/pyblackbird_cc/resources/services.py +++ b/pyblackbird_cc/resources/services.py @@ -2,7 +2,8 @@ import dataclasses import os.path import tempfile -import pypdfium2 as pdfium +from pdf2image import convert_from_path +from PyPDF2 import PdfReader @dataclasses.dataclass @@ -21,11 +22,11 @@ def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata: if file_path is None: raise ValueError("file_path cannot be None") if not os.path.isfile(file_path): - raise ValueError(f"file_path must be a file. {file_path} is not a file.") - pdf = pdfium.PdfDocument(file_path) - n_pages = len(pdf) + raise ValueError("file_path must be a file. {file_path} is not a file.") + reader = PdfReader(file_path) + n_pages = len(reader.pages) file_size = os.path.getsize(file_path) - pdf.close() + #pdf.close() return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages) @@ -38,19 +39,20 @@ def export_pages_as_images(file_path: str) -> list[str]: if file_path is None: raise ValueError("file_path cannot be None") output_dir = tempfile.mkdtemp() # Create a temporary directory + reader = PdfReader(file_path) + n_pages = len(reader.pages) try: - pdf = pdfium.PdfDocument(file_path) - # get the file_name of this PDF file at file_path - file_name = os.path.basename(file_path) - image_paths = [] - for i in range(len(pdf)): - page = pdf[i] - image = page.render(scale=0.6).to_pil() - image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg") - image.save(image_path) - image_paths.append(image_path) - pdf.close() - return image_paths + with tempfile.TemporaryDirectory() as path: + images_from_path = convert_from_path(file_path, 56, size=300, output_folder=path) + # get the file_name of this PDF file at file_path + file_name = os.path.basename(file_path) + image_paths = [] + for i in range(n_pages): + image = images_from_path[i] + image_path = 
os.path.join(output_dir, f"{file_name}_{i:03d}.jpg") + image.save(image_path) + image_paths.append(image_path) + return image_paths finally: # Optionally handle cleanup later or elsewhere in your code # Remove later with shutil.rmtree(output_dir) |