diff options
author | Matthew Lemon <y@yulqen.org> | 2024-08-27 20:37:06 +0100 |
---|---|---|
committer | Matthew Lemon <y@yulqen.org> | 2024-08-27 20:37:06 +0100 |
commit | be2da508e989952c5fde04bbe1acae29d9afdd96 (patch) | |
tree | 4ec93369b1b889a38f2a59c57c5cbc13ca0f425a | |
parent | 6cff0842a3d59a66f1cbb0b8b96881473c795549 (diff) |
Minor changes to nomenclature
-rw-r--r-- | pdf_reader.py | 16 |
1 files changed, 8 insertions, 8 deletions
```diff
diff --git a/pdf_reader.py b/pdf_reader.py
index af7b3d1..e70a88b 100644
--- a/pdf_reader.py
+++ b/pdf_reader.py
@@ -92,21 +92,22 @@ def save_metadata(persist_directory, metadata):
         json.dump(metadata, f)
 
 
-def process_pdf_for_qa(pdf_path, embeddings, persist_directory):
-    """Prepare a PDF for question answering, using Chroma's persistence."""
+def process_pdf_for_qa(pdf_path, embeddings, base_persist_directory):
+    """Prepare a PDF for question answering, using a unique Chroma persistence directory for each PDF."""
 
     pdf_hash = compute_pdf_hash(pdf_path)
+    persist_directory = os.path.join(base_persist_directory, pdf_hash)  # Use hash to create a unique directory
 
     # Load or initialize metadata
-    metadata = load_metadata(persist_directory)
+    metadata = load_metadata(base_persist_directory)
 
     # Check if this PDF has already been processed
     if pdf_hash in metadata['processed_pdfs']:
-        print("This PDF has already been processed. Loading embeddings from the database.")
+        print(f"This PDF has already been processed. Loading embeddings from the database at {persist_directory}.")
         vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
         return vector_store
 
-    # Initialize or load Chroma vector store
+    # Initialize or load Chroma vector store for this specific PDF
     vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
 
     # Get text from PDF
@@ -130,7 +131,7 @@ def process_pdf_for_qa(pdf_path, embeddings, persist_directory):
 
     # Update the metadata to include this processed PDF hash
     metadata['processed_pdfs'].append(pdf_hash)
-    save_metadata(persist_directory, metadata)
+    save_metadata(base_persist_directory, metadata)
 
     return vector_store
 
@@ -165,7 +166,7 @@ def main():
     # Parse command line arguments
     parser = argparse.ArgumentParser(description='PDF Question Answering using LangChain and Ollama')
     parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
-    parser.add_argument('--persist', type=str, default='db', help='Directory to save or load persisted vector store')
+    parser.add_argument('--persist', type=str, default='db', help='Base directory to save or load persisted vector stores')
     args = parser.parse_args()
 
     # Check if the PDF file exists
@@ -212,6 +213,5 @@ def main():
         else:
             print("No relevant information found for your question. Please try asking a different question.")
 
-
 if __name__ == "__main__":
     main()
```