with_id_rename

def with_id_rename(docs: Iterable) -> list[dict[str, Any]]

Utility that renames the _id field of each record from a Collection to id, as expected by Document.

MongoDBAtlasVectorDB

class MongoDBAtlasVectorDB(VectorDB)

A vector database backed by a MongoDB Atlas Collection.

__init__

def __init__(connection_string: str = "",
             database_name: str = "vector_db",
             embedding_function: Callable = SentenceTransformer(
                 "all-MiniLM-L6-v2").encode,
             collection_name: str = None,
             index_name: str = "vector_index",
             overwrite: bool = False,
             wait_until_index_ready: float = None,
             wait_until_document_ready: float = None)

Initialize the vector database.

Arguments:

  • connection_string - str | The MongoDB connection string to connect to. Default is "" (an empty string).
  • database_name - str | The name of the database. Default is ‘vector_db’.
  • embedding_function - Callable | The embedding function used to generate the vector representation.
  • collection_name - str | The name of the collection to create for this vector database. Default is None.
  • index_name - str | Index name for the vector database, defaults to ‘vector_index’
  • overwrite - bool | Whether to overwrite the collection if it exists. Default is False.
  • wait_until_index_ready - float | None | Blocking call to wait until the database indexes are ready. None, the default, means no wait.
  • wait_until_document_ready - float | None | Blocking call to wait until the database documents are ready. None, the default, means no wait.

list_collections

def list_collections()

List the collections in the vector database.

Returns:

List[str] | The list of collections.

create_collection

def create_collection(collection_name: str,
                      overwrite: bool = False,
                      get_or_create: bool = True) -> Collection

Create a collection in the vector database and create a vector search index in the collection.

Arguments:

  • collection_name - str | The name of the collection.
  • overwrite - bool | Whether to overwrite the collection if it exists. Default is False.
  • get_or_create - bool | Whether to get or create the collection. Default is True

create_index_if_not_exists

def create_index_if_not_exists(index_name: str = "vector_index",
                               collection: Collection = None) -> None

Creates a vector search index on the specified collection in MongoDB.

Arguments:

  • index_name str, optional - The name of the vector search index to create. Defaults to “vector_index”.
  • collection Collection, optional - The MongoDB collection to create the index on. Defaults to None.

get_collection

def get_collection(collection_name: str = None) -> Collection

Get the collection from the vector database.

Arguments:

  • collection_name - str | The name of the collection. Default is None. If None, return the current active collection.

Returns:

Collection | The collection object.

delete_collection

def delete_collection(collection_name: str) -> None

Delete the collection from the vector database.

Arguments:

  • collection_name - str | The name of the collection.

create_vector_search_index

def create_vector_search_index(
    collection: Collection,
    index_name: Union[str, None] = "vector_index",
    similarity: Literal["euclidean", "cosine",
                        "dotProduct"] = "cosine") -> None

Create a vector search index in the collection.

Arguments:

  • collection - An existing Collection in the Atlas Database.
  • index_name - Vector Search Index name.
  • similarity - Algorithm used for measuring vector similarity.
  • kwargs - Additional keyword arguments.

Returns:

None

insert_docs

def insert_docs(docs: list[Document],
                collection_name: str = None,
                upsert: bool = False,
                batch_size=DEFAULT_INSERT_BATCH_SIZE,
                **kwargs) -> None

Insert Documents and Vector Embeddings into the collection of the vector database.

For large numbers of Documents, insertion is performed in batches.

Arguments:

  • docs - List[Document] | A list of documents. Each document is a TypedDict Document.
  • collection_name - str | The name of the collection. Default is None.
  • upsert - bool | Whether to update the document if it exists. Default is False.
  • batch_size - int | The number of documents to be inserted in each batch. Default is DEFAULT_INSERT_BATCH_SIZE.

update_docs

def update_docs(docs: list[Document],
                collection_name: str = None,
                **kwargs: Any) -> None

Update documents, including their embeddings, in the Collection.

Optionally allow upsert as kwarg.

Uses deepcopy to avoid changing docs.

Arguments:

  • docs - List[Document] | A list of documents.
  • collection_name - str | The name of the collection. Default is None.
  • kwargs - Any | Use `upsert=True` to insert documents whose ids are not present in the collection.

delete_docs

def delete_docs(ids: list[ItemID], collection_name: str = None, **kwargs)

Delete documents from the collection of the vector database.

Arguments:

  • ids - List[ItemID] | A list of document ids. Each id is a typed ItemID.
  • collection_name - str | The name of the collection. Default is None.

get_docs_by_ids

def get_docs_by_ids(ids: list[ItemID] = None,
                    collection_name: str = None,
                    include: list[str] = None,
                    **kwargs) -> list[Document]

Retrieve documents from the collection of the vector database based on the ids.

Arguments:

  • ids - List[ItemID] | A list of document ids. If None, will return all the documents. Default is None.
  • collection_name - str | The name of the collection. Default is None.
  • include - List[str] | The fields to include. If None, [“metadata”, “content”] are included. Ids are always included. Use include to choose whether the embedding and metadata are returned.
  • kwargs - dict | Additional keyword arguments.

Returns:

List[Document] | The results.

retrieve_docs

def retrieve_docs(queries: list[str],
                  collection_name: str = None,
                  n_results: int = 10,
                  distance_threshold: float = -1,
                  **kwargs) -> QueryResults

Retrieve documents from the collection of the vector database based on the queries.

Arguments:

  • queries - List[str] | A list of queries. Each query is a string.
  • collection_name - str | The name of the collection. Default is None.
  • n_results - int | The number of relevant documents to return. Default is 10.
  • distance_threshold - float | The threshold for the distance score; only documents whose distance is smaller than this value are returned. No filtering is applied if it is < 0. Default is -1.
  • kwargs - Dict | Additional keyword arguments. Ones of importance follow:
  • oversampling_factor - int | Multiplied by n_results, this gives ‘ef’ in the HNSW algorithm, which determines the number of nearest neighbor candidates to consider during the search phase. A higher value leads to more accuracy, but is slower. Default is 10.

Returns:

QueryResults | For each query string, a list of nearest documents and their scores.