Vector Stores Module

The Vector Stores Module provides a standardized interface for managing document embeddings and similarity searches. It allows users to add documents to a vector store, retrieve context based on queries, and manage collections across different providers. The module supports providers such as Pinecone and OpenAI through dedicated manager implementations.

Module Components

  • VectorStoreInterface: An abstract base class that defines the required methods for vector store implementations.

  • Service Implementations: Concrete classes that implement vector store functionality. For example:

    • PineconeManager: Manages vector store operations using Pinecone.

    • OpenAIManager: Manages vector store operations using OpenAI (its implementation is not shown on this page).

  • VectorStoreAdapter: An adapter that creates a vector store manager instance based on backend configuration.


1. Vector Store Interface

Defines the required methods that all vector store implementations must provide.

File: autoppia_sdk/src/vectorstores/interface.py

from abc import ABC, abstractmethod

class VectorStoreInterface(ABC):
    """Abstract contract shared by every vector store backend.

    A concrete backend must override all three methods below; until it
    does, the subclass cannot be instantiated.
    """

    @abstractmethod
    def get_or_create_collection(self, collection_name):
        """Return the named collection, creating it first when it is missing.

        Args:
            collection_name (str): Identifier of the collection to look up
                or create.

        Returns:
            Any: Backend-specific handle to the collection.
        """

    @abstractmethod
    def add_document(self, document):
        """Store a document in the vector store.

        Args:
            document: The document to store.
        """

    @abstractmethod
    def get_context(self, query):
        """Look up the context that is relevant to a query.

        Args:
            query (str): Text to search for.

        Returns:
            str: Context retrieved for the query.
        """

2. Vector Store Adapter

The adapter converts backend configuration into a concrete vector store manager instance. It handles provider-specific initialization and validates required credentials.

File: autoppia_sdk/src/vectorstores/adapter.py

from typing import Union
from autoppia_backend_client.models import EmbeddingDatabase as VectorStoreDTO
from autoppia_sdk.src.vectorstores.implementations.pinecone_manager import PineconeManager
from autoppia_sdk.src.vectorstores.implementations.openai_manager import OpenAIManager

class VectorStoreAdapter:
    """Adapter for creating vector store manager instances based on backend configuration.
    
    Handles provider-specific initialization and credential validation.
    
    Args:
        vector_store_dto (VectorStoreDTO): Configuration data from backend
    """
    
    def __init__(self, vector_store_dto: VectorStoreDTO):
        self.vector_store_dto = vector_store_dto
        self._validate_credentials()

    def _validate_credentials(self) -> None:
        """Validate required credentials for the configured provider."""
        provider = self.vector_store_dto.provider.upper()
        if provider == "PINECONE" and not self.vector_store_dto.api_key:
            raise ValueError("Pinecone configuration requires: API key")
        if provider == "OPENAI" and not self.vector_store_dto.openai_vector_store_id:
            raise ValueError("OpenAI configuration requires: Vector Store ID")

    def from_backend(self) -> Union[OpenAIManager, PineconeManager, None]:
        """Create a vector store manager instance based on configured provider.
        
        Returns:
            Union[OpenAIManager, PineconeManager]: Initialized vector store manager
            None: If provider is not supported
            
        Raises:
            ValueError: For missing required configuration parameters
        """
        match self.vector_store_dto.provider:
            case "OPENAI":
                return OpenAIManager(
                    index_name=self.vector_store_dto.index_name,
                    vector_store_id=self.vector_store_dto.openai_vector_store_id
                )
            case "PINECONE":
                return PineconeManager(
                    api_key=self.vector_store_dto.api_key,
                    index_name=self.vector_store_dto.index_name
                )
            case _:
                raise ValueError(f"Unsupported vector store provider: {self.vector_store_dto.provider}")

3. Pinecone Manager Implementation

The PineconeManager class is a concrete implementation of the VectorStoreInterface that uses Pinecone for managing document embeddings and performing similarity searches. It also leverages external document loaders and text splitters for processing various file types.

File: autoppia_sdk/src/vectorstores/implementations/pinecone_manager.py

import os

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.word_document import Docx2txtLoader
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pinecone import Pinecone

from autoppia_sdk.src.vectorstores.interface import VectorStoreInterface
from autoppia_sdk.src.vectorstores.implementations.s3_manager import S3Manager

class PineconeManager(VectorStoreInterface):
    """Pinecone vector store implementation.

    Manages vector store operations using Pinecone, including document loading,
    chunking, and similarity search capabilities.
    """

    # Settings used when a missing index has to be created.
    # NOTE(review): 1536 presumably matches the output size of the default
    # OpenAIEmbeddings model - confirm if the embedding model is changed.
    EMBEDDING_DIMENSION = 1536
    INDEX_METRIC = "cosine"

    # Chunking parameters (in characters) used when splitting loaded documents.
    CHUNK_SIZE = 1500
    CHUNK_OVERLAP = 200

    def __init__(self, api_key: str, index_name: str):
        """Initialize Pinecone vector store manager.

        Creates the embeddings client and the Pinecone client, opens (or
        creates) the index, and instantiates the S3 manager.

        Args:
            api_key (str): Pinecone API key
            index_name (str): Name of the Pinecone index
        """
        self.embeddings = OpenAIEmbeddings()
        self.index_name = index_name
        self.api_key = api_key
        self.pc = Pinecone(api_key=self.api_key)
        self.pcvs = self.get_or_create_collection(self.index_name)
        self.s3_manager = S3Manager()

    def get_or_create_collection(self, index_name):
        """Get existing Pinecone index or create new one.

        Args:
            index_name (str): Name of the index

        Returns:
            PineconeVectorStore: Vector store instance bound to ``index_name``
        """
        if index_name not in self.pc.list_indexes().names():
            # NOTE(review): recent Pinecone clients appear to require a `spec`
            # argument (serverless/pod) for create_index - confirm against the
            # pinned client version.
            self.pc.create_index(
                index_name,
                dimension=self.EMBEDDING_DIMENSION,
                metric=self.INDEX_METRIC,
            )
        # Open the index that was requested, not unconditionally self.index_name.
        # NOTE(review): self.api_key is not forwarded here; presumably the
        # store picks the key up from the environment - verify.
        return PineconeVectorStore.from_existing_index(index_name, self.embeddings)

    def add_document(self, file_path, filter=None):
        """Add a document to the vector store.

        The file is loaded, split into overlapping chunks, and every chunk is
        stored with its own copy of ``filter`` as metadata.

        Args:
            file_path (str): Path to the document file
            filter (dict, optional): Metadata attached to every chunk.
                Defaults to ``{"chat_session": 1}``.

        Raises:
            ValueError: If the file type is not supported by ``load``.
        """
        if filter is None:
            # Build the default per call; a dict literal in the signature
            # would be shared (and mutable) across all calls.
            filter = {"chat_session": 1}

        documents = self.load(file_path)
        if documents is None:
            # load() signals an unsupported type with None; fail clearly here
            # instead of letting the splitter iterate over None.
            raise ValueError(f"Cannot add {file_path!r}: unsupported file type")

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.CHUNK_SIZE, chunk_overlap=self.CHUNK_OVERLAP
        )
        chunks = splitter.split_documents(documents)
        if not chunks:
            # Nothing to embed or store.
            return

        texts = [chunk.page_content for chunk in chunks]
        # One independent dict per chunk so later mutation of one entry
        # cannot leak into the others.
        metadatas = [dict(filter) for _ in chunks]

        # Save Vector DB in Pinecone, reusing the embeddings client created
        # in __init__.
        PineconeVectorStore.from_texts(
            texts, self.embeddings, metadatas=metadatas, index_name=self.index_name
        )

    def get_context(self, query, filter=None):
        """Get relevant context based on query.

        Args:
            query (str): Search query
            filter (dict, optional): Metadata filter for search. Optional so
                the method can also be called with the query alone, as the
                interface declares.

        Returns:
            str: Template with context and query
        """
        context = self.pcvs.similarity_search(query, filter=filter)

        # `context` is interpolated as returned by similarity_search.
        template = f"""
            Following Context is data of PDF that user want to know about information of.
            If the user sends a greeting such as 'hi','hello' and 'good morning', don't make other answers, just send the user greeting and ask what you can help.
            Context:{context}
            Question: {query}
            """
        return template

    def load(self, file_path):
        """Load document from file path.

        Supports multiple file types including txt, pdf, docx, and csv.

        Args:
            file_path (str): Path to the document file

        Returns:
            list | None: Loaded documents, or ``None`` (after printing a
            message) when the file extension is not supported
        """
        # Built inside the method so the loader classes are only looked up
        # when a document is actually loaded.
        loaders = {
            "txt": TextLoader,
            "pdf": PyPDFLoader,
            "docx": Docx2txtLoader,
            "csv": CSVLoader,
        }
        file_type = self.get_file_type(file_path)
        loader_cls = loaders.get(file_type)
        if loader_cls is None:
            print(f"{file_type} is not supported.")
            return None
        return loader_cls(file_path).load()

    @staticmethod
    def get_file_type(file_path):
        """Get file extension from path.

        Args:
            file_path (str): Path to the file

        Returns:
            str: Lower-cased file extension without dot ("" if there is none)
        """
        _, ext = os.path.splitext(file_path)
        return ext[1:].lower()

Last updated