# pdf_reader.py
#
# Command-line question answering over a PDF: text is extracted with PyPDF2
# (falling back to Tesseract OCR), indexed in a Chroma vector store using
# Ollama embeddings, and queried through a LangChain RetrievalQA chain.

import argparse
import hashlib
import io
import json
import os

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from PyPDF2 import PdfReader
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms import Ollama

# Tell pytesseract which Tesseract executable to use for OCR (see perform_ocr_on_pdf).
# The path below is hard-coded; Tesseract must be installed and this path must point at it.
pytesseract.pytesseract.tesseract_cmd = r'/usr/local/bin/tesseract'  # Update this path to match your Tesseract installation


def extract_text_from_pdf(pdf_path):
    """Read the embedded text of a PDF with PyPDF2.

    Returns the text of all pages concatenated in page order (a page with no
    extractable text contributes an empty string), or None when opening or
    parsing the file raises an exception.
    """
    try:
        with open(pdf_path, 'rb') as file:
            reader = PdfReader(file)
            # extract_text() may return nothing for a page; substitute "" so joining is safe.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        text = "".join(page_texts)
        print(f"Extracted {len(text)} characters from the PDF using PyPDF2.")
        return text
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
        return None


def perform_ocr_on_pdf(pdf_path):
    """Extract text from a PDF by rendering each page to an image and running OCR on it.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The OCR output of all pages concatenated in page order, or None when
        opening, rendering, or recognizing the document raises an exception.
    """
    try:
        page_texts = []
        # Open through a context manager so the document is closed even if
        # rendering or OCR fails partway through the pages.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap()
                # Hand the rendered page to PIL via its encoded image bytes.
                img = Image.open(io.BytesIO(pix.tobytes()))
                page_texts.append(pytesseract.image_to_string(img))
        text = "".join(page_texts)
        print(f"Extracted {len(text)} characters from the PDF using OCR.")
        return text
    except Exception as e:
        print(f"Error performing OCR on PDF: {e}")
        return None


def get_pdf_text(pdf_path):
    """Return the text of a PDF, falling back to OCR when direct extraction finds nothing.

    Direct extraction is tried first. If it yields None or only whitespace,
    the OCR result is returned as-is, which may itself be None or blank.
    """
    text = extract_text_from_pdf(pdf_path)

    # Nothing usable from the text layer: fall back to OCR.
    if not text or not text.strip():
        print("No text found using PyPDF2, performing OCR...")
        ocr_text = perform_ocr_on_pdf(pdf_path)
        if ocr_text and ocr_text.strip():
            print(f"Successfully extracted text from PDF using OCR. Total characters: {len(ocr_text)}")
        return ocr_text

    print(f"Successfully extracted text from PDF. Total characters: {len(text)}")
    return text


def compute_pdf_hash(pdf_path):
    """Return the SHA-256 hex digest of the file at *pdf_path*.

    The digest depends only on the file's bytes, so it can be used to
    recognize a PDF that has already been processed. The file is read in
    fixed-size chunks so large PDFs are never loaded into memory at once.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    chunk_size = 1024 * 1024  # 1 MiB per read
    hasher = hashlib.sha256()
    with open(pdf_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. end of file.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()


def load_metadata(persist_directory):
    """Read ``metadata.json`` from *persist_directory*.

    Returns the parsed JSON content, or a fresh ``{'processed_pdfs': []}``
    record when the file does not exist.
    """
    metadata_path = os.path.join(persist_directory, 'metadata.json')

    # No metadata file yet: start with an empty record.
    if not os.path.exists(metadata_path):
        return {'processed_pdfs': []}

    with open(metadata_path, 'r') as handle:
        return json.loads(handle.read())


def save_metadata(persist_directory, metadata):
    """Write *metadata* as JSON to ``metadata.json`` inside *persist_directory*.

    The directory (including missing parents) is created first, so saving
    does not depend on something else having created it beforehand.

    Args:
        persist_directory: Directory that holds ``metadata.json``.
        metadata: JSON-serializable object to store.
    """
    # An empty string means the current directory; os.makedirs('') would raise.
    if persist_directory:
        os.makedirs(persist_directory, exist_ok=True)
    metadata_path = os.path.join(persist_directory, 'metadata.json')
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f)


def process_pdf_for_qa(pdf_path, embeddings, base_persist_directory):
    """Build, or reload, the Chroma vector store for a PDF.

    Each PDF gets its own persistence directory under *base_persist_directory*,
    named after the hash returned by ``compute_pdf_hash``. Hashes of PDFs that
    were fully indexed are recorded in the base directory's metadata, so a
    later call for the same file reuses the stored embeddings.

    Args:
        pdf_path: Path to the PDF file.
        embeddings: Embedding function handed to Chroma.
        base_persist_directory: Directory holding the metadata and per-PDF stores.

    Returns:
        The Chroma vector store for the PDF, or None when no usable text
        could be extracted from it.
    """
    pdf_hash = compute_pdf_hash(pdf_path)
    persist_directory = os.path.join(base_persist_directory, pdf_hash)  # One directory per distinct PDF

    # Load or initialize metadata; tolerate a metadata file lacking the key.
    metadata = load_metadata(base_persist_directory)
    processed_pdfs = metadata.setdefault('processed_pdfs', [])

    # Already indexed: reopen the persisted store instead of embedding again.
    if pdf_hash in processed_pdfs:
        print(f"This PDF has already been processed. Loading embeddings from the database at {persist_directory}.")
        return Chroma(persist_directory=persist_directory, embedding_function=embeddings)

    # Extract the text before touching Chroma so a failed extraction does not
    # leave an empty store behind.
    text = get_pdf_text(pdf_path)

    # get_pdf_text can return None or whitespace-only OCR output; neither is indexable.
    if not text or not text.strip():
        print("Failed to extract text from the PDF. Please check the file.")
        return None

    # Split text into manageable chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
    texts = text_splitter.split_text(text)
    print(f"Split text into {len(texts)} chunks for processing.")

    # Convert texts into Document objects for embedding
    documents = [Document(page_content=chunk) for chunk in texts]

    # NOTE(review): if a previous run stopped after add_documents but before the
    # metadata was saved, this directory may already hold chunks - confirm
    # whether duplicates matter for retrieval.
    vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    print("Adding new documents to the vector store and persisting...")
    vector_store.add_documents(documents)

    # Record the hash only after indexing succeeded.
    processed_pdfs.append(pdf_hash)
    save_metadata(base_persist_directory, metadata)

    return vector_store

def create_qa_chain(vector_store, llm):
    """Build a RetrievalQA chain ("stuff" chain type) over the vector store's retriever."""
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(),
    )


def format_answer(response):
    """Turn a chain response into plain answer text.

    A dict contributes its 'result' value (empty string if absent); a list
    contributes the 'result' values of the items that have one, separated by
    blank lines; anything else is converted with str(). Literal
    backslash-escaped newline, single-quote and double-quote sequences are
    then unescaped and surrounding whitespace is stripped.
    """
    if isinstance(response, dict):
        answer = response.get('result', '')
    elif isinstance(response, list):
        parts = []
        for item in response:
            if 'result' in item:
                parts.append(item.get('result', ''))
        answer = "\n\n".join(parts)
    else:
        answer = str(response)

    # Applied in this order: escaped newline, escaped single quote, escaped double quote.
    unescape_pairs = (("\\n", "\n"), ("\\'", "'"), ('\\"', '"'))
    for escaped, plain in unescape_pairs:
        answer = answer.replace(escaped, plain)

    return answer.strip()


def main():
    """Command-line entry point: index a PDF, then answer questions about it interactively.

    Parses the PDF path and the ``--persist`` base directory, builds (or
    reloads) the vector store for the PDF, and runs a prompt loop until the
    user types 'exit' or 'quit' or ends the input (Ctrl-D / Ctrl-C).

    Raises:
        SystemExit: With status 1 when the path is missing, is not a file,
            or the PDF could not be processed.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='PDF Question Answering using LangChain and Ollama')
    parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
    parser.add_argument('--persist', type=str, default='db', help='Base directory to save or load persisted vector stores')
    args = parser.parse_args()

    # Validate the input path. SystemExit is raised directly because the
    # builtin exit() is an interactive helper that is not always available.
    if not os.path.exists(args.pdf_path):
        print(f"Error: The file {args.pdf_path} does not exist.")
        raise SystemExit(1)
    if not os.path.isfile(args.pdf_path):
        print(f"Error: {args.pdf_path} is not a file.")
        raise SystemExit(1)

    # Initialize LLM (LLaMA 3.1 model hosted on Ollama)
    llm = Ollama(model="llama3.1")

    # Initialize Ollama embeddings model using nomic-embed-text:latest
    embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")

    # Process the PDF and prepare it for QA
    print("Processing the PDF. Please wait...")
    vector_store = process_pdf_for_qa(args.pdf_path, embeddings, args.persist)

    if vector_store is None:
        print("Processing failed. Exiting.")
        raise SystemExit(1)

    # Create the QA chain
    qa_chain = create_qa_chain(vector_store, llm)

    # Interactive mode for asking questions
    print("PDF processing complete. You can now ask questions about the content.")
    print("Type 'exit' or 'quit' to end the session.")

    while True:
        try:
            question = input("Enter your question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C at the prompt: leave cleanly instead of with a traceback.
            print("\nExiting the session. Goodbye!")
            break

        # Nothing typed: prompt again rather than querying the model with an empty question.
        if not question:
            continue

        if question.lower() in ("exit", "quit"):
            print("Exiting the session. Goodbye!")
            break

        # Keep the session alive if a single query fails.
        try:
            response = qa_chain.invoke(question)
        except Exception as e:
            print(f"Error while answering the question: {e}")
            continue

        # Format the answer
        answer = format_answer(response)

        # Check if the answer is empty or only contains newlines
        if answer.strip():
            print(f"Answer:\n\n{answer}\n")
        else:
            print("No relevant information found for your question. Please try asking a different question.")


if __name__ == "__main__":
    main()