Vector Stores Module
The Vector Stores Module provides a standardized interface for managing document embeddings and similarity searches. It allows users to add documents to a vector store, retrieve context based on queries, and manage collections across different providers. The module supports providers such as Pinecone and OpenAI through dedicated manager implementations.
Module Components
VectorStoreInterface: An abstract base class that defines the required methods for vector store implementations.
Service Implementations: Concrete classes that implement vector store functionality. For example:
PineconeManager: Manages vector store operations using Pinecone.
OpenAIManager: (Implementation not shown) Manages vector store operations using OpenAI.
VectorStoreAdapter: An adapter that creates a vector store manager instance based on backend configuration.
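These components are typically used together: the adapter turns a backend DTO into a concrete manager, and the manager is then used to ingest documents and answer queries. The sketch below is illustrative only; it assumes the generated EmbeddingDatabase model accepts these fields as keyword arguments, and the credential, path, and filter values are placeholders.

```python
from autoppia_backend_client.models import EmbeddingDatabase as VectorStoreDTO
from autoppia_sdk.src.vectorstores.adapter import VectorStoreAdapter

# Describe the desired vector store (placeholder values, not real credentials).
dto = VectorStoreDTO(
    provider="PINECONE",
    api_key="pc-...",
    index_name="support-docs",
)

# The adapter validates credentials and returns the matching manager.
manager = VectorStoreAdapter(dto).from_backend()

# The manager exposes the VectorStoreInterface methods.
manager.add_document("docs/handbook.pdf")
print(manager.get_context("What is the refund policy?", filter={"chat_session": 1}))
```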
1. Vector Store Interface
Defines the required methods that all vector store implementations must provide.
File: autoppia_sdk/src/vectorstores/interface.py
```python
from abc import ABC, abstractmethod


class VectorStoreInterface(ABC):
    """Interface for vector store implementations.

    Defines the required methods that all vector store implementations must provide.
    """

    @abstractmethod
    def get_or_create_collection(self, collection_name):
        """Get an existing collection or create a new one if it doesn't exist.

        Args:
            collection_name (str): Name of the collection to get or create

        Returns:
            Any: The vector store collection instance
        """
        pass

    @abstractmethod
    def add_document(self, document):
        """Add a document to the vector store.

        Args:
            document: Document to be added to the vector store
        """
        pass

    @abstractmethod
    def get_context(self, query):
        """Retrieve relevant context based on a query.

        Args:
            query (str): The search query

        Returns:
            str: Retrieved context based on the query
        """
        pass
```
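Because consumers depend only on these three methods, application code can be written against the interface rather than a specific provider. A minimal sketch of such a caller (the helper name is illustrative; the manager instance is assumed to come from the adapter shown in the next section):

```python
from autoppia_sdk.src.vectorstores.interface import VectorStoreInterface


def ingest_and_query(store: VectorStoreInterface, file_path: str, question: str) -> str:
    """Illustrative helper that works with any VectorStoreInterface implementation."""
    store.add_document(file_path)       # provider-specific ingestion
    return store.get_context(question)  # provider-specific retrieval
```

Note that PineconeManager.get_context (section 3) additionally expects a filter argument, so a caller typed strictly against the interface signature would need to account for that.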
2. Vector Store Adapter
The adapter converts backend configuration into a concrete vector store manager instance. It handles provider-specific initialization and validates required credentials.
File: autoppia_sdk/src/vectorstores/adapter.py
```python
from typing import Union

from autoppia_backend_client.models import EmbeddingDatabase as VectorStoreDTO
from autoppia_sdk.src.vectorstores.implementations.pinecone_manager import PineconeManager
from autoppia_sdk.src.vectorstores.implementations.openai_manager import OpenAIManager


class VectorStoreAdapter:
    """Adapter for creating vector store manager instances based on backend configuration.

    Handles provider-specific initialization and credential validation.

    Args:
        vector_store_dto (VectorStoreDTO): Configuration data from backend
    """

    def __init__(self, vector_store_dto: VectorStoreDTO):
        self.vector_store_dto = vector_store_dto
        self._validate_credentials()

    def _validate_credentials(self) -> None:
        """Validate required credentials for the configured provider."""
        provider = self.vector_store_dto.provider.upper()
        if provider == "PINECONE" and not self.vector_store_dto.api_key:
            raise ValueError("Pinecone configuration requires: API key")
        if provider == "OPENAI" and not self.vector_store_dto.openai_vector_store_id:
            raise ValueError("OpenAI configuration requires: Vector Store ID")

    def from_backend(self) -> Union[OpenAIManager, PineconeManager, None]:
        """Create a vector store manager instance based on the configured provider.

        Returns:
            Union[OpenAIManager, PineconeManager]: Initialized vector store manager

        Raises:
            ValueError: If the provider is not supported (missing credentials are
                caught earlier, in the constructor)
        """
        match self.vector_store_dto.provider:
            case "OPENAI":
                return OpenAIManager(
                    index_name=self.vector_store_dto.index_name,
                    vector_store_id=self.vector_store_dto.openai_vector_store_id
                )
            case "PINECONE":
                return PineconeManager(
                    api_key=self.vector_store_dto.api_key,
                    index_name=self.vector_store_dto.index_name
                )
            case _:
                raise ValueError(f"Unsupported vector store provider: {self.vector_store_dto.provider}")
```
3. Pinecone Manager Implementation
The PineconeManager class is a concrete implementation of the VectorStoreInterface that uses Pinecone for managing document embeddings and performing similarity searches. It also leverages external document loaders and text splitters for processing various file types.
File: autoppia_sdk/src/vectorstores/implementations/pinecone_manager.py
```python
import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone

from autoppia_sdk.src.vectorstores.interface import VectorStoreInterface
from autoppia_sdk.src.vectorstores.implementations.s3_manager import S3Manager


class PineconeManager(VectorStoreInterface):
    """Pinecone vector store implementation.

    Manages vector store operations using Pinecone, including document loading,
    chunking, and similarity search capabilities.
    """

    def __init__(self, api_key: str, index_name: str):
        """Initialize Pinecone vector store manager.

        Args:
            api_key (str): Pinecone API key
            index_name (str): Name of the Pinecone index
        """
        self.embeddings = OpenAIEmbeddings()
        self.index_name = index_name
        self.api_key = api_key
        self.pc = Pinecone(api_key=self.api_key)
        self.pcvs = self.get_or_create_collection(self.index_name)
        self.s3_manager = S3Manager()

    def get_or_create_collection(self, index_name):
        """Get existing Pinecone index or create a new one.

        Args:
            index_name (str): Name of the index

        Returns:
            PineconeVectorStore: Vector store instance
        """
        if index_name not in self.pc.list_indexes().names():
            self.pc.create_index(index_name, dimension=1536, metric="cosine")
        return PineconeVectorStore.from_existing_index(self.index_name, self.embeddings)

    def add_document(self, file_path, filter={"chat_session": 1}):
        """Add a document to the vector store.

        Args:
            file_path (str): Path to the document file
            filter (dict, optional): Metadata filter for the document
        """
        documents = self.load(file_path)
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1500, chunk_overlap=200
        )
        docs = text_splitter.split_documents(documents)
        embeddings = OpenAIEmbeddings()
        texts = [d.page_content for d in docs]
        metadatas = [filter for d in docs]

        # Save Vector DB in Pinecone
        PineconeVectorStore.from_texts(
            texts, embeddings, metadatas=metadatas, index_name=self.index_name
        )

    def get_context(self, query, filter):
        """Get relevant context based on query.

        Args:
            query (str): Search query
            filter (dict): Metadata filter for search

        Returns:
            str: Template with context and query
        """
        context = self.pcvs.similarity_search(query, filter=filter)
        template = f"""
        Following Context is data of PDF that user want to know about information of.
        If the user sends a greeting such as 'hi','hello' and 'good morning', don't make other answers, just send the user greeting and ask what you can help.
        Context:{context}
        Question: {query}
        """
        return template

    def load(self, file_path):
        """Load document from file path.

        Supports multiple file types including txt, pdf, docx, and csv.

        Args:
            file_path (str): Path to the document file

        Returns:
            list: Loaded documents
        """
        file_type = self.get_file_type(file_path)
        if file_type == "txt":
            loader = TextLoader(file_path)
            return loader.load()
        elif file_type == "pdf":
            loader = PyPDFLoader(file_path)
            return loader.load()
        elif file_type == "docx":
            loader = Docx2txtLoader(file_path)
            return loader.load()
        elif file_type == "csv":
            loader = CSVLoader(file_path)
            return loader.load()
        else:
            print(f"{file_type} is not supported.")
            return

    @staticmethod
    def get_file_type(file_path):
        """Get file extension from path.

        Args:
            file_path (str): Path to the file

        Returns:
            str: File extension without dot
        """
        _, ext = os.path.splitext(file_path)
        return ext[1:].lower()
```
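For completeness, a direct-usage sketch of PineconeManager without the adapter. It assumes valid Pinecone credentials, an OPENAI_API_KEY in the environment for OpenAIEmbeddings, and that the S3Manager dependency can be constructed locally; the index name, file path, and filter values are placeholders.

```python
import os

from autoppia_sdk.src.vectorstores.implementations.pinecone_manager import PineconeManager

# Assumes PINECONE_API_KEY and OPENAI_API_KEY are set; the index is created if missing.
manager = PineconeManager(api_key=os.environ["PINECONE_API_KEY"], index_name="support-docs")

# Ingest a PDF, tagging every chunk with a chat-session id used later as a search filter.
manager.add_document("docs/handbook.pdf", filter={"chat_session": 42})

# Retrieve a prompt template containing the most similar chunks for that session.
prompt = manager.get_context("How do refunds work?", filter={"chat_session": 42})
print(prompt)
```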