from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import pickle
import os

CHUNKS_PATH = "E:/courses/LangChain Project/main root/output/chunks.pkl"
DB_DIR = "E:/courses/LangChain Project/main root/db"
BATCH_SIZE = 100  # You can tune this depending on the average token size per chunk

if not os.path.exists(CHUNKS_PATH):
    raise FileNotFoundError("Run chunk_and_embed.py first")

with open(CHUNKS_PATH, "rb") as f:
    chunks = pickle.load(f)
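
# Optional sanity check (an added sketch, not part of the original script):
# chunks should be the list of LangChain Document objects that
# chunk_and_embed.py pickled; peeking at the first one catches a stale or
# mismatched pickle before any API calls are made.
if chunks and hasattr(chunks[0], "page_content"):
    avg_chars = sum(len(c.page_content) for c in chunks) / len(chunks)
    print(f"Loaded {len(chunks)} chunks, ~{avg_chars:.0f} characters each")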

embedding = OpenAIEmbeddings(model="text-embedding-3-small")

# Create or load the vectorstore
vectorstore = Chroma(persist_directory=DB_DIR, embedding_function=embedding)
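# Note (added comment): pointing Chroma at persist_directory creates the
# directory on the first run and reopens the existing collection on later
# runs, so this script can be re-executed against the same store.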
| print(f"π§ Embedding and adding {len(chunks)} chunks in batches of {BATCH_SIZE}...") | |
| # Add documents in batches to avoid hitting token limits | |
| for i in range(0, len(chunks), BATCH_SIZE): | |
| batch = chunks[i:i + BATCH_SIZE] | |
| vectorstore.add_documents(batch) | |
| print(f"β Added batch {i // BATCH_SIZE + 1} of {len(chunks) // BATCH_SIZE + 1}") | |

# vectorstore.persist() is no longer needed: with chromadb 0.4+ a store
# created with persist_directory writes to disk automatically.
print(f"✅ Vectorstore saved to {DB_DIR}")