summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Matthew Lemon <y@yulqen.org> 2024-08-27 20:37:06 +0100
committer: Matthew Lemon <y@yulqen.org> 2024-08-27 20:37:06 +0100
commit: be2da508e989952c5fde04bbe1acae29d9afdd96 (patch)
tree: 4ec93369b1b889a38f2a59c57c5cbc13ca0f425a
parent: 6cff0842a3d59a66f1cbb0b8b96881473c795549 (diff)
Minor changes to nomenclature
-rw-r--r-- pdf_reader.py 16
1 file changed, 8 insertions, 8 deletions
diff --git a/pdf_reader.py b/pdf_reader.py
index af7b3d1..e70a88b 100644
--- a/pdf_reader.py
+++ b/pdf_reader.py
@@ -92,21 +92,22 @@ def save_metadata(persist_directory, metadata):
json.dump(metadata, f)
-def process_pdf_for_qa(pdf_path, embeddings, persist_directory):
- """Prepare a PDF for question answering, using Chroma's persistence."""
+def process_pdf_for_qa(pdf_path, embeddings, base_persist_directory):
+ """Prepare a PDF for question answering, using a unique Chroma persistence directory for each PDF."""
pdf_hash = compute_pdf_hash(pdf_path)
+ persist_directory = os.path.join(base_persist_directory, pdf_hash) # Use hash to create a unique directory
# Load or initialize metadata
- metadata = load_metadata(persist_directory)
+ metadata = load_metadata(base_persist_directory)
# Check if this PDF has already been processed
if pdf_hash in metadata['processed_pdfs']:
- print("This PDF has already been processed. Loading embeddings from the database.")
+ print(f"This PDF has already been processed. Loading embeddings from the database at {persist_directory}.")
vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
return vector_store
- # Initialize or load Chroma vector store
+ # Initialize or load Chroma vector store for this specific PDF
vector_store = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
# Get text from PDF
@@ -130,7 +131,7 @@ def process_pdf_for_qa(pdf_path, embeddings, persist_directory):
# Update the metadata to include this processed PDF hash
metadata['processed_pdfs'].append(pdf_hash)
- save_metadata(persist_directory, metadata)
+ save_metadata(base_persist_directory, metadata)
return vector_store
@@ -165,7 +166,7 @@ def main():
# Parse command line arguments
parser = argparse.ArgumentParser(description='PDF Question Answering using LangChain and Ollama')
parser.add_argument('pdf_path', type=str, help='Path to the PDF file')
- parser.add_argument('--persist', type=str, default='db', help='Directory to save or load persisted vector store')
+ parser.add_argument('--persist', type=str, default='db', help='Base directory to save or load persisted vector stores')
args = parser.parse_args()
# Check if the PDF file exists
@@ -212,6 +213,5 @@ def main():
else:
print("No relevant information found for your question. Please try asking a different question.")
-
if __name__ == "__main__":
main()