VectorX LlamaIndex Integration Demo
This doc demonstrates how to use VectorX (an encrypted vector database) with LlamaIndex for secure vector search and retrieval.
1. Installation
First, let’s make sure we have all the required packages installed.
# Install necessary packages
# Install the VectorX integration plus the LlamaIndex and OpenAI embedding packages used below
!pip install vecx-llamaindex llama-index llama-index-embeddings-openai
2. Setting up VectorX and OpenAI credentials
We need to set up our API credentials for VectorX and OpenAI.
import os
from llama_index.embeddings.openai import OpenAIEmbedding
from vecx.vectorx import VectorX
# Set API keys
os.environ["OPENAI_API_KEY"] = "your-openai-api-key"
vecx_api_token = "your-vectorx-api-token"
# Initialize VectorX client
vx = VectorX(token=vecx_api_token)
# Generate encryption key if you don't have one
encryption_key = vx.generate_key()
# Make sure to save this key securely - you'll need it to access your encrypted vectors
print("Encryption key:", encryption_key)
3. Creating Sample Documents
Let’s create some sample documents to index.
from llama_index.core import Document
# Create sample documents with different categories and metadata
documents = [
    Document(
        text="Python is a high-level, interpreted programming language known for its readability and simplicity.",
        metadata={"category": "programming", "language": "python", "difficulty": "beginner"}
    ),
    Document(
        text="JavaScript is a scripting language that enables interactive web pages and is an essential part of web applications.",
        metadata={"category": "programming", "language": "javascript", "difficulty": "intermediate"}
    ),
    Document(
        text="Machine learning is a subset of artificial intelligence that provides systems the ability to automatically learn and improve from experience.",
        metadata={"category": "ai", "field": "machine_learning", "difficulty": "advanced"}
    ),
    Document(
        text="Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.",
        metadata={"category": "ai", "field": "deep_learning", "difficulty": "advanced"}
    ),
    Document(
        text="Vector databases are specialized database systems designed to store and query high-dimensional vectors for similarity search.",
        metadata={"category": "database", "type": "vector", "difficulty": "intermediate"}
    ),
    Document(
        text="VectorX is an encrypted vector database that provides secure and private vector search capabilities.",
        metadata={"category": "database", "type": "vector", "product": "vectorx", "difficulty": "intermediate"}
    )
]
print(f"Created {len(documents)} sample documents")
4. Setting up VectorX with LlamaIndex
Now we’ll set up the VectorX vector store integration with LlamaIndex.
from vecx_llamaindex import VectorXVectorStore
from llama_index.core import StorageContext
import time
# Create a unique index name with timestamp to avoid conflicts
timestamp = int(time.time())
index_name = f"llamaindex_demo_{timestamp}"
# Set up the embedding model
embed_model = OpenAIEmbedding()
# Get the embedding dimension
dimension = 1536  # OpenAI's default embedding model (text-embedding-ada-002) produces 1536-dimensional vectors
# Initialize the VectorX vector store
vector_store = VectorXVectorStore.from_params(
    api_token=vecx_api_token,
    encryption_key=encryption_key,
    index_name=index_name,
    dimension=dimension,
    space_type="cosine"  # Can be "cosine", "l2", or "ip"
)
# Create storage context with our vector store
storage_context = StorageContext.from_defaults(vector_store=vector_store)
print(f"Initialized VectorX vector store with index: {index_name}")
5. Creating a Vector Index from Documents
Let’s create a vector index from our documents.
from llama_index.core import VectorStoreIndex
# Create a vector index
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    embed_model=embed_model
)
print("Vector index created successfully")
6. Basic Retrieval with Query Engine
Now we can create a query engine and perform a basic search.
# Create a query engine
query_engine = index.as_query_engine()
# Ask a question
response = query_engine.query("What is Python?")
print("Query: What is Python?")
print("Response:")
print(response)
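The response object also carries the retrieved source nodes, which is useful for checking which documents grounded the answer:
# Inspect which nodes the answer was synthesized from
for source_node in response.source_nodes:
    print(f"Score: {source_node.score:.4f} | Metadata: {source_node.node.metadata}")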
7. Using Metadata Filters
Let’s demonstrate how to use metadata filters to narrow down our search.
from llama_index.core.vector_stores.types import MetadataFilters, MetadataFilter, FilterOperator
# Create a filtered retriever to only search within AI-related documents
ai_filter = MetadataFilter(key="category", value="ai", operator=FilterOperator.EQ)
ai_filters = MetadataFilters(filters=[ai_filter])
# Create a filtered query engine
filtered_query_engine = index.as_query_engine(filters=ai_filters)
# Ask a general question but only using AI documents
response = filtered_query_engine.query("What is learning from data?")
print("Filtered Query (AI category only): What is learning from data?")
print("Response:")
print(response)
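FilterOperator supports more than equality. For example, assuming the VectorX store honors the IN operator, you can match any of several metadata values:
# Match documents whose "language" is any of the listed values
# (assumes the VectorX store supports FilterOperator.IN)
language_filter = MetadataFilter(
    key="language",
    value=["python", "javascript"],
    operator=FilterOperator.IN
)
language_engine = index.as_query_engine(filters=MetadataFilters(filters=[language_filter]))
response = language_engine.query("Which languages are used for web development?")
print(response)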
8. Advanced Filtering with Multiple Conditions
We can also combine multiple metadata filters.
# Create a more complex filter: database category AND intermediate difficulty
category_filter = MetadataFilter(key="category", value="database", operator=FilterOperator.EQ)
difficulty_filter = MetadataFilter(key="difficulty", value="intermediate", operator=FilterOperator.EQ)
complex_filters = MetadataFilters(filters=[category_filter, difficulty_filter])
# Create a query engine with the complex filters
complex_filtered_engine = index.as_query_engine(filters=complex_filters)
# Query with the complex filters
response = complex_filtered_engine.query("Tell me about databases")
print("Complex Filtered Query (database category AND intermediate difficulty): Tell me about databases")
print("Response:")
print(response)
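By default, MetadataFilters combines its filters with AND. To match either condition instead, LlamaIndex exposes FilterCondition.OR; this sketch assumes the VectorX store supports OR conditions:
from llama_index.core.vector_stores.types import FilterCondition

# Match documents in the "database" category OR with "intermediate" difficulty
or_filters = MetadataFilters(
    filters=[category_filter, difficulty_filter],
    condition=FilterCondition.OR
)
or_engine = index.as_query_engine(filters=or_filters)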
9. Custom Retriever Setup
We can also create a custom retriever with more control over the retrieval process.
from llama_index.core.retrievers import VectorIndexRetriever
# Create a retriever with custom parameters
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3,  # Return the top 3 most similar results
    filters=ai_filters  # Reuse the AI category filter from before
)
# Retrieve nodes for a query
nodes = retriever.retrieve("What is deep learning?")
print(f"Retrieved {len(nodes)} nodes for query: 'What is deep learning?' (with AI category filter)")
print("\nRetrieved content:")
for i, node in enumerate(nodes):
    print(f"\nNode {i+1}:")
    print(f"Text: {node.node.text}")
    print(f"Metadata: {node.node.metadata}")
    print(f"Score: {node.score:.4f}")
10. Using a Custom Retriever with a Query Engine
We can use our custom retriever with a query engine for more control.
from llama_index.core.query_engine import RetrieverQueryEngine
# Create a query engine with our custom retriever
custom_query_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    verbose=True  # Enable verbose mode to see the retrieved nodes
)
# Query using the custom retriever query engine
response = custom_query_engine.query("Explain the difference between machine learning and deep learning")
print("\nFinal Response:")
print(response)
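from_args also accepts node postprocessors, which filter or rerank retrieved nodes before the LLM sees them. As a sketch, a SimilarityPostprocessor drops weak matches below a cutoff (0.75 here is an arbitrary illustrative threshold, not a recommendation):
from llama_index.core.postprocessor import SimilarityPostprocessor

# Drop retrieved nodes scoring below the cutoff before answer synthesis
strict_engine = RetrieverQueryEngine.from_args(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.75)]
)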
11. Direct VectorStore Querying
We can also query the VectorX vector store directly, bypassing the LlamaIndex query engine.
from llama_index.core.vector_stores.types import VectorStoreQuery
# Generate an embedding for our query
query_text = "What are vector databases?"
query_embedding = embed_model.get_text_embedding(query_text)
# Create a VectorStoreQuery
vector_store_query = VectorStoreQuery(
    query_embedding=query_embedding,
    similarity_top_k=2,
    filters=MetadataFilters(
        filters=[MetadataFilter(key="category", value="database", operator=FilterOperator.EQ)]
    )
)
# Execute the query directly on the vector store
query_result = vector_store.query(vector_store_query)
print(f"Direct VectorStore query: '{query_text}'")
print(f"Retrieved {len(query_result.nodes)} results with database category filter:")
for i, (node, score) in enumerate(zip(query_result.nodes, query_result.similarities)):
    print(f"\nResult {i+1}:")
    print(f"Text: {node.text}")
    print(f"Metadata: {node.metadata}")
    print(f"Similarity score: {score:.4f}")
12. Saving and Loading Indexes
With VectorX, your vectors and metadata are stored securely in the cloud. You can reconnect to your index in future sessions.
# To reconnect to an existing index in a future session, you would use:
def reconnect_to_index(api_token, encryption_key, index_name):
    # Initialize the vector store backed by the existing index
    vector_store = VectorXVectorStore.from_params(
        api_token=api_token,
        encryption_key=encryption_key,
        index_name=index_name
    )
    # Rebuild the index directly from the vector store
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        embed_model=OpenAIEmbedding()
    )
    return index
# Example usage (commented out as we already have our index)
# reconnected_index = reconnect_to_index(vecx_api_token, encryption_key, index_name)
# query_engine = reconnected_index.as_query_engine()
# response = query_engine.query("What is VectorX?")
# print(response)
print(f"To reconnect to this index in the future, use:\n")
print(f"API Token: {vecx_api_token}")
print(f"Encryption Key: {encryption_key}")
print(f"Index Name: {index_name}")
13. Cleanup
If you want to delete the index when you’re done, you can do so with the VectorX client.
# Uncomment the following lines to delete the index
# vx.delete_index(index_name)
# print(f"Index {index_name} deleted")