SCR_Course_ChatBot/scripts/setup_vectorstore.py
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle
import os
CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
DB_DIR = "E:/courses/LangChain Project/main root/db"
BATCH_SIZE = 100 # You can tune this depending on average token size per chunk
if not os.path.exists(CHUNKS_PATH):
    raise FileNotFoundError("Run chunk_and_embed.py first")

with open(CHUNKS_PATH, "rb") as f:
    chunks = pickle.load(f)
embedding = OpenAIEmbeddings(model="text-embedding-3-small")
# Create or load the vectorstore
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
print(f"🧠 Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...")
# Add documents in batches to avoid hitting the embedding API's per-request token limit
total_batches = (len(chunks) + BATCH_SIZE - 1) // BATCH_SIZE
for i in range(0, len(chunks), BATCH_SIZE):
    batch = chunks[i:i + BATCH_SIZE]
    vectorstore.add_documents(batch)
    print(f"✅ Added batch {i // BATCH_SIZE + 1} of {total_batches}")
# Chroma persists automatically when persist_directory is set (chromadb >= 0.4),
# so an explicit vectorstore.persist() call is not required here.
print(f"✅ Vectorstore saved to {DB_DIR}")