about | summary | refs | log | tree | commit | diff | stats
path: root/pyblackbird_cc
diff options
context:
space:
mode:
author: Matthew Lemon <y@yulqen.org> 2024-06-23 17:14:26 +0100
committer: Matthew Lemon <y@yulqen.org> 2024-06-23 17:32:10 +0100
commit: 9189ae7c790602035a5d43b5cc1f37aa3a2867d2 (patch)
tree: 645d4e5c605b0119e0fdbca3c91a17e378aaac3c /pyblackbird_cc
parent: 6a596cac664a063bd2dc9ca99f2f19664970ed01 (diff)
Removed pypdfium2 which doesn't build on FreeBSD
Libraries used: (1) PyPDF2, to get the basic PDF length (page count) - https://pypi.org/project/PyPDF2/ ; (2) pdf2image, to extract an image from each page - https://github.com/Belval/pdf2image . A reduction in the quality of the screenshotted page images is in place: the PDF screenshot quality has been reduced considerably.
Diffstat (limited to '')
-rw-r--r-- pyblackbird_cc/resources/services.py 36
1 files changed, 19 insertions, 17 deletions
diff --git a/pyblackbird_cc/resources/services.py b/pyblackbird_cc/resources/services.py
index b33afcf..441f623 100644
--- a/pyblackbird_cc/resources/services.py
+++ b/pyblackbird_cc/resources/services.py
@@ -2,7 +2,8 @@ import dataclasses
import os.path
import tempfile
-import pypdfium2 as pdfium
+from pdf2image import convert_from_path
+from PyPDF2 import PdfReader
@dataclasses.dataclass
@@ -21,11 +22,11 @@ def get_pdf_metadata_from_path(file_path: str) -> PDFMetadata:
if file_path is None:
raise ValueError("file_path cannot be None")
if not os.path.isfile(file_path):
- raise ValueError(f"file_path must be a file. {file_path} is not a file.")
- pdf = pdfium.PdfDocument(file_path)
- n_pages = len(pdf)
+ raise ValueError("file_path must be a file. {file_path} is not a file.")
+ reader = PdfReader(file_path)
+ n_pages = len(reader.pages)
file_size = os.path.getsize(file_path)
- pdf.close()
+ #pdf.close()
return PDFMetadata(file_name=file_path, file_size=file_size, n_pages=n_pages)
@@ -38,19 +39,20 @@ def export_pages_as_images(file_path: str) -> list[str]:
if file_path is None:
raise ValueError("file_path cannot be None")
output_dir = tempfile.mkdtemp() # Create a temporary directory
+ reader = PdfReader(file_path)
+ n_pages = len(reader.pages)
try:
- pdf = pdfium.PdfDocument(file_path)
- # get the file_name of this PDF file at file_path
- file_name = os.path.basename(file_path)
- image_paths = []
- for i in range(len(pdf)):
- page = pdf[i]
- image = page.render(scale=0.6).to_pil()
- image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
- image.save(image_path)
- image_paths.append(image_path)
- pdf.close()
- return image_paths
+ with tempfile.TemporaryDirectory() as path:
+ images_from_path = convert_from_path(file_path, 56, size=300, output_folder=path)
+ # get the file_name of this PDF file at file_path
+ file_name = os.path.basename(file_path)
+ image_paths = []
+ for i in range(n_pages):
+ image = images_from_path[i]
+ image_path = os.path.join(output_dir, f"{file_name}_{i:03d}.jpg")
+ image.save(image_path)
+ image_paths.append(image_path)
+ return image_paths
finally:
# Optionally handle cleanup later or elsewhere in your code
# Remove later with shutil.rmtree(output_dir)