From 9189ae7c790602035a5d43b5cc1f37aa3a2867d2 Mon Sep 17 00:00:00 2001
From: Matthew Lemon <y@yulqen.org>
Date: Sun, 23 Jun 2024 17:14:26 +0100
Subject: Removed pypdfium2 which doesn't build on FreeBSD

Libraries used:

- PyPDF2 (to get the PDF page count) - https://pypi.org/project/PyPDF2/
- pdf2image (to extract images from each page) -
  https://github.com/Belval/pdf2image

The quality of the screenshotted page images is now reduced.

Reduced PDF screenshot quality considerably.
---
 pyblackbird_cc/resources/services.py | 36 +++++++++++++++++++-----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

(limited to 'pyblackbird_cc/resources/services.py')

diff --git a/pyblackbird_cc/resources/services.py b/pyblackbird_cc/resources/services.py
index b33afcf..441f623 100644
--- a/pyblackbird_cc/resources/services.py
+++ b/pyblackbird_cc/resources/services.py
@@ -2,7 +2,8 @@ import dataclasses
 import os.path
 import tempfile
 
-import pypdfium2 as pdfium
+from pdf2image import convert_from_path
+from PyPDF2 import PdfReader
 
 
 @dataclasses.dataclass
@@ -21,11 +22,11 @@ def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
     if file_path is None:
         raise ValueError("file_path cannot be None")
     if not os.path.isfile(file_path):
-        raise ValueError(f"file_path must be a file. {file_path} is not a file.")
-    pdf = pdfium.PdfDocument(file_path)
-    n_pages = len(pdf)
+        raise ValueError("file_path must be a file. {file_path} is not a file.")
+    reader = PdfReader(file_path)
+    n_pages = len(reader.pages)
     file_size = os.path.getsize(file_path)
-    pdf.close()
+    #pdf.close()
     return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)
 
 
@@ -38,19 +39,20 @@ def export_pages_as_images(file_path: str) -> list[str]:
     if file_path is None:
         raise ValueError("file_path cannot be None")
     output_dir = tempfile.mkdtemp()  # Create a temporary directory
+    reader = PdfReader(file_path)
+    n_pages = len(reader.pages)
     try:
-        pdf = pdfium.PdfDocument(file_path)
-        # get the file_name of this PDF file at file_path
-        file_name = os.path.basename(file_path)
-        image_paths = []
-        for i in range(len(pdf)):
-            page = pdf[i]
-            image = page.render(scale=0.6).to_pil()
-            image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
-            image.save(image_path)
-            image_paths.append(image_path)
-        pdf.close()
-        return image_paths
+        with tempfile.TemporaryDirectory() as path:
+            images_from_path = convert_from_path(file_path, 56, size=300, output_folder=path)
+            # get the file_name of this PDF file at file_path
+            file_name = os.path.basename(file_path)
+            image_paths = []
+            for i in range(n_pages):
+                image = images_from_path[i]
+                image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
+                image.save(image_path)
+                image_paths.append(image_path)
+            return image_paths
     finally:
         # Optionally handle cleanup later or elsewhere in your code
         # Remove later with shutil.rmtree(output_dir)
-- 
cgit v1.2.3