# Langchain with Convex
Integration patterns for using Langchain.js with the Convex database, enabling semantic search, RAG, and AI-powered features in your NNT Ecosystem.
## Why Convex + Langchain
**Convex strengths:**
- Real-time sync with React components
- Type-safe queries
- Serverless functions
- Built-in authentication
**Langchain strengths:**
- Semantic search via embeddings
- LLM orchestration
- RAG (Retrieval Augmented Generation)
- Agent workflows
**Together:** Build AI-powered music analysis tools with real-time data sync.
## Architecture Patterns
### Pattern 1: Store Embeddings in Convex
**Use case:** Search song annotations semantically
**Convex schema:**
```typescript
// convex/schema.ts
import { defineSchema, defineTable } from "convex/server";
import { v } from "convex/values";
export default defineSchema({
songEmbeddings: defineTable({
songId: v.string(),
text: v.string(),
embedding: v.array(v.float64()), // 1536 dimensions
metadata: v.object({
title: v.string(),
artist: v.string(),
section: v.optional(v.string()),
timestamp: v.optional(v.number())
})
}).index("by_song", ["songId"]),
songs: defineTable({
title: v.string(),
artist: v.string(),
appleMusicId: v.optional(v.string()),
annotations: v.array(v.object({
  text: v.string(),
  section: v.optional(v.string())
}))
})
});
```
**Generate and store embeddings:** embedding generation calls the OpenAI API, so it belongs in an action; the action then writes through an internal mutation.
```typescript
// convex/embeddings.ts
import { action, internalMutation } from "./_generated/server";
import { internal } from "./_generated/api";
import { v } from "convex/values";
import { OpenAIEmbeddings } from "@langchain/openai";
const metadataValidator = v.object({
  title: v.string(),
  artist: v.string(),
  section: v.optional(v.string()),
  timestamp: v.optional(v.number())
});
// Actions may call external APIs; mutations are deterministic and may not
export const addSongEmbedding = action({
  args: {
    songId: v.string(),
    text: v.string(),
    metadata: metadataValidator
  },
  handler: async (ctx, args) => {
    // Generate embedding (external OpenAI call)
    const embeddings = new OpenAIEmbeddings({
      modelName: "text-embedding-3-small"
    });
    const vector = await embeddings.embedQuery(args.text);
    // Store in Convex (actions have no direct db access, so go through a mutation)
    await ctx.runMutation(internal.embeddings.storeSongEmbedding, {
      songId: args.songId,
      text: args.text,
      embedding: vector,
      metadata: args.metadata
    });
  }
});
export const storeSongEmbedding = internalMutation({
  args: {
    songId: v.string(),
    text: v.string(),
    embedding: v.array(v.float64()),
    metadata: metadataValidator
  },
  handler: async (ctx, args) => {
    await ctx.db.insert("songEmbeddings", args);
  }
});
```
### Pattern 2: Similarity Search with Convex
**Custom similarity search function:**
```typescript
// convex/search.ts
import { query } from "./_generated/server";
import { v } from "convex/values";
// Cosine similarity helper
function cosineSimilarity(vec1: number[], vec2: number[]): number {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const mag1 = Math.sqrt(vec1.reduce((sum, val) => sum + val * val, 0));
const mag2 = Math.sqrt(vec2.reduce((sum, val) => sum + val * val, 0));
return dotProduct / (mag1 * mag2);
}
export const similaritySearch = query({
args: {
queryEmbedding: v.array(v.float64()),
limit: v.optional(v.number()),
minSimilarity: v.optional(v.number())
},
handler: async (ctx, args) => {
const limit = args.limit ?? 10;
const minSimilarity = args.minSimilarity ?? 0.7;
// Get all embeddings
const allEmbeddings = await ctx.db.query("songEmbeddings").collect();
// Calculate similarity for each
const withScores = allEmbeddings.map(doc => ({
...doc,
similarity: cosineSimilarity(args.queryEmbedding, doc.embedding)
}));
// Filter and sort
return withScores
.filter(doc => doc.similarity >= minSimilarity)
.sort((a, b) => b.similarity - a.similarity)
.slice(0, limit);
}
});
```
**Use from client:**
```typescript
import { useState } from "react";
import { useConvex } from "convex/react";
import { api } from "../convex/_generated/api";
import { OpenAIEmbeddings } from "@langchain/openai";
function SongSearchComponent() {
  const [query, setQuery] = useState("");
  const [results, setResults] = useState<any[]>([]);
  // useConvex() hands back the client for one-off (non-reactive) calls
  const convex = useConvex();
  const handleSearch = async () => {
    // WARNING: embedding in the browser exposes your OpenAI API key;
    // in production, generate the query embedding inside a Convex action
    const embeddings = new OpenAIEmbeddings();
    const queryVector = await embeddings.embedQuery(query);
    // Search Convex
    const similarDocs = await convex.query(api.search.similaritySearch, {
      queryEmbedding: queryVector,
      limit: 10
    });
    setResults(similarDocs);
  };
  return (
    <div>
      <input value={query} onChange={e => setQuery(e.target.value)} />
      <button onClick={handleSearch}>Search</button>
      {results.map(doc => (
        <div key={doc._id}>
          <h3>{doc.metadata.title} - {doc.metadata.artist}</h3>
          <p>Similarity: {(doc.similarity * 100).toFixed(1)}%</p>
          <p>{doc.text}</p>
        </div>
      ))}
    </div>
  );
}
```
### Pattern 3: RAG with Convex Backend
**Retrieve relevant context from Convex, send to LLM:**
```typescript
// convex/rag.ts
import { action } from "./_generated/server";
import { api } from "./_generated/api";
import { v } from "convex/values";
import { ChatOpenAI, OpenAIEmbeddings } from "@langchain/openai";
export const askQuestion = action({
args: {
question: v.string()
},
handler: async (ctx, args) => {
const embeddings = new OpenAIEmbeddings();
const llm = new ChatOpenAI({ modelName: "gpt-4" });
// 1. Embed the question
const questionVector = await embeddings.embedQuery(args.question);
// 2. Search Convex for relevant docs
const relevantDocs = await ctx.runQuery(api.search.similaritySearch, {
queryEmbedding: questionVector,
limit: 5
});
// 3. Build context from retrieved docs
const context = relevantDocs
.map(doc => `Song: ${doc.metadata.title} - ${doc.metadata.artist}\n${doc.text}`)
.join("\n\n---\n\n");
// 4. Ask LLM with context
const prompt = `Based on these song annotations:
${context}
Question: ${args.question}
Answer:`;
const response = await llm.invoke(prompt);
return {
answer: response.content,
sources: relevantDocs.map(d => ({
  songId: d.songId,
  title: d.metadata.title,
  artist: d.metadata.artist
}))
};
}
});
```
**Use from React:**
```typescript
import { useState } from "react";
import { useAction } from "convex/react";
import { api } from "../convex/_generated/api";
function ResearchAssistant() {
  // askQuestion is an action (it calls OpenAI), so use useAction, not useMutation
  const askQuestion = useAction(api.rag.askQuestion);
  const [question, setQuestion] = useState("");
  const [answer, setAnswer] = useState<any>(null);
  const handleAsk = async () => {
    setAnswer(await askQuestion({ question }));
  };
  return (
    <div>
      <input value={question} onChange={e => setQuestion(e.target.value)} />
      <button onClick={handleAsk}>Ask</button>
      {answer && (
        <div>
          <h3>Answer:</h3>
          <p>{answer.answer}</p>
          <h4>Sources:</h4>
          <ul>
            {answer.sources.map((s: any, i: number) => (
              <li key={i}>{s.title} - {s.artist}</li>
            ))}
          </ul>
        </div>
      )}
    </div>
  );
}
```
## Optimizing Vector Storage in Convex
### Challenge: Large Embeddings
**Problem:** each 1536-dimension float64 vector is ~12 KB, so storage adds up quickly
**Solutions:**
#### 1. Dimensionality Reduction (Advanced)
Reduce to 384 or 768 dimensions using PCA (loses some accuracy)
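A simpler alternative when you control embedding generation: OpenAI's text-embedding-3 models can natively return shortened vectors. A minimal sketch, assuming your `@langchain/openai` version exposes the `dimensions` option:
```typescript
import { OpenAIEmbeddings } from "@langchain/openai";
// text-embedding-3 models support native dimension shortening:
// 384 dims costs some accuracy but cuts storage 4x vs 1536
const smallEmbeddings = new OpenAIEmbeddings({
  modelName: "text-embedding-3-small",
  dimensions: 384
});
const vector = await smallEmbeddings.embedQuery("modal interchange in jazz");
console.log(vector.length); // 384
```
If you shorten dimensions, re-embed everything with the same setting - vectors of different lengths can't be compared.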
#### 2. Lazy Loading
Store embeddings externally, cache in Convex on demand
```typescript
// Store just the hash
songEmbeddings: defineTable({
songId: v.string(),
text: v.string(),
embeddingHash: v.string(), // Reference to external storage
metadata: v.object({ ... })
})
```
#### 3. Selective Embedding
Don't embed everything - prioritize (a simple gate is sketched after this list):
- User-created annotations
- Important passages
- Frequently queried content
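A minimal gate along those lines, with a hypothetical `Annotation` shape and thresholds you'd tune for your vault:
```typescript
// Hypothetical predicate deciding which annotations earn an embedding
type Annotation = { text: string; source: "user" | "auto"; queryCount?: number };
function shouldEmbed(a: Annotation): boolean {
  if (a.source === "user") return true;      // user-created annotations
  if ((a.queryCount ?? 0) > 5) return true;  // frequently queried content
  return a.text.length > 200;                // longer, substantive passages
}
```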
### Indexing Strategy
**Convex now ships native vector search** (a `.vectorIndex()` on the table, queried with `ctx.vectorSearch` from an action - a sketch follows the example below). On older versions, or when you need ranking inside a plain query, narrow the candidate set manually:
```typescript
// Bucketing approach for faster search
export const bucketedSearch = query({
args: {
queryEmbedding: v.array(v.float64()),
limit: v.number()
},
handler: async (ctx, args) => {
    // Narrow candidates by metadata before the similarity calculation.
    // Note: .filter() still scans the table; for a true index lookup,
    // define an index on the field and use .withIndex() instead
    const candidates = await ctx.db
      .query("songEmbeddings")
      .filter(q =>
        // "genre" is illustrative - the schema above doesn't define it
        q.eq(q.field("metadata.genre"), "jazz")
      )
      .collect();
// Then calculate similarity only on filtered set
const withScores = candidates.map(doc => ({
...doc,
similarity: cosineSimilarity(args.queryEmbedding, doc.embedding)
}));
return withScores
.sort((a, b) => b.similarity - a.similarity)
.slice(0, args.limit);
}
});
```
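For comparison, a minimal sketch of the native route. This assumes a recent Convex version; `ctx.vectorSearch` is only available in actions and returns `_id` plus `_score`, so fetching full documents takes a follow-up query:
```typescript
// convex/schema.ts (excerpt) - add a vector index to the table
songEmbeddings: defineTable({
  songId: v.string(),
  text: v.string(),
  embedding: v.array(v.float64()),
  // ...
}).vectorIndex("by_embedding", {
  vectorField: "embedding",
  dimensions: 1536
})
```
```typescript
// convex/search.ts
import { action } from "./_generated/server";
import { v } from "convex/values";
export const nativeSearch = action({
  args: { queryEmbedding: v.array(v.float64()) },
  handler: async (ctx, args) => {
    // Returns [{ _id, _score }] ranked by cosine similarity
    return await ctx.vectorSearch("songEmbeddings", "by_embedding", {
      vector: args.queryEmbedding,
      limit: 10
    });
  }
});
```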
## Real-Time Sync: Embeddings + Live Data
**Convex's killer feature:** Real-time updates
### Use Case: Collaborative Annotation
Multiple users annotating same song, embeddings update live:
```typescript
// convex/annotations.ts
import { mutation } from "./_generated/server";
import { v } from "convex/values";
import { api } from "./_generated/api";
export const addAnnotation = mutation({
  args: {
    songId: v.string(),
    text: v.string(),
    timestamp: v.number()
  },
  handler: async (ctx, args) => {
    // 1. Store annotation
    const annotationId = await ctx.db.insert("annotations", args);
    // 2. Schedule embedding generation (runs later as an action, so the
    // mutation stays deterministic and returns immediately)
    await ctx.scheduler.runAfter(0, api.embeddings.addSongEmbedding, {
      songId: args.songId,
      text: args.text,
      // Must match the embedding function's metadata validator -
      // extend that validator if you store annotationId here
      metadata: {
        annotationId,
        timestamp: args.timestamp
      }
    });
    return annotationId;
  }
});
```
**React component sees updates instantly:**
```typescript
function LiveAnnotations({ songId }: { songId: string }) {
// Real-time query
const annotations = useQuery(api.annotations.getBySong, { songId });
return (
<div>
{annotations?.map(a => (
<div key={a._id}>{a.text}</div>
))}
</div>
);
}
```
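The `getBySong` query used above isn't defined elsewhere in this note; a minimal sketch, assuming an `annotations` table with a `by_song` index:
```typescript
// convex/annotations.ts
import { query } from "./_generated/server";
import { v } from "convex/values";
export const getBySong = query({
  args: { songId: v.string() },
  handler: async (ctx, args) => {
    // Reactive: subscribed components re-render whenever annotations change
    return await ctx.db
      .query("annotations")
      .withIndex("by_song", q => q.eq("songId", args.songId))
      .collect();
  }
});
```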
## Convex Actions for LLM Calls
**Use Convex actions** (not mutations) for LLM calls:
**Why:**
- Actions can make external API calls
- Mutations are deterministic (can't call OpenAI)
- Actions can be long-running
```typescript
import { action } from "./_generated/server";
import { api } from "./_generated/api";
import { v } from "convex/values";
import { ChatOpenAI } from "@langchain/openai";
export const generateSummary = action({
args: { songId: v.string() },
handler: async (ctx, args) => {
// 1. Query data (can call queries/mutations from actions)
const song = await ctx.runQuery(api.songs.get, { id: args.songId });
const annotations = await ctx.runQuery(api.annotations.getBySong, {
songId: args.songId
});
// 2. Call LLM
const llm = new ChatOpenAI();
const summary = await llm.invoke(`Summarize these annotations:
${annotations.map(a => a.text).join("\n")}`);
// 3. Store result
await ctx.runMutation(api.songs.updateSummary, {
songId: args.songId,
summary: summary.content
});
return summary.content;
}
});
```
## Multi-Tenant Embeddings
**If you have multiple users/vaults**, isolate embeddings:
```typescript
songEmbeddings: defineTable({
userId: v.string(), // Ownership
songId: v.string(),
embedding: v.array(v.float64()),
// ...
}).index("by_user_song", ["userId", "songId"])
// Search only user's embeddings
export const userSearch = query({
args: {
userId: v.string(),
queryEmbedding: v.array(v.float64())
},
handler: async (ctx, args) => {
const userDocs = await ctx.db
.query("songEmbeddings")
.withIndex("by_user_song", q => q.eq("userId", args.userId))
.collect();
    // Rank only this user's docs (same pattern as similaritySearch above)
    return userDocs
      .map(doc => ({
        ...doc,
        similarity: cosineSimilarity(args.queryEmbedding, doc.embedding)
      }))
      .sort((a, b) => b.similarity - a.similarity)
      .slice(0, 10);
}
});
```
## Cost and Performance
### Embedding Generation Cost
**Typical workflow:**
- User adds annotation (500 tokens)
- Generate embedding: ~$0.00001 (text-embedding-3-small at $0.02 per 1M tokens)
- Store in Convex: free (within limits)
**For 1000 annotations:** ~$0.01
### Query Performance
**Current approach (scan all):**
- 1000 embeddings: ~200ms
- 10,000 embeddings: ~2s (too slow)
**Optimization needed at scale:**
- Pre-filter with metadata indexes
- Implement ANN (approximate nearest neighbor)
- Cache popular queries (a simple sketch follows this list)
- Consider dedicated vector database for huge datasets (Pinecone + Convex hybrid)
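One way to cache popular queries is to key stored results by a hash of the normalized query text. A minimal sketch with a hypothetical `searchCache` table and TTL:
```typescript
// Hypothetical table:
// searchCache: defineTable({ queryHash: v.string(), results: v.any(), createdAt: v.number() })
//   .index("by_hash", ["queryHash"])
import { query } from "./_generated/server";
import { v } from "convex/values";
const TTL_MS = 10 * 60 * 1000; // entries older than 10 minutes are stale
export const getCached = query({
  args: { queryHash: v.string() },
  handler: async (ctx, args) => {
    const hit = await ctx.db
      .query("searchCache")
      .withIndex("by_hash", q => q.eq("queryHash", args.queryHash))
      .first();
    // Treat missing or stale entries as cache misses
    if (!hit || Date.now() - hit.createdAt > TTL_MS) return null;
    return hit.results;
  }
});
```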
## Example: NNT Ecosystem Integration
### Scenario: Search Song Annotations
**Goal:** User types "modal interchange examples" → finds relevant songs
**Implementation:**
**1. Embed song annotations (one-time):**
```typescript
// Script to embed existing songs
import { ConvexHttpClient } from "convex/browser";
import { api } from "./convex/_generated/api";
const convex = new ConvexHttpClient(process.env.CONVEX_URL!);
async function embedAllSongs() {
  const songs = await convex.query(api.songs.list, {});
  for (const song of songs) {
    for (const annotation of song.annotations) {
      // The action embeds server-side and stores the vector
      await convex.action(api.embeddings.addSongEmbedding, {
        songId: song._id,
        text: annotation.text,
        metadata: {
          title: song.title,
          artist: song.artist,
          section: annotation.section
        }
      });
    }
  }
}
embedAllSongs();
```
**2. Search interface:**
```typescript
import { useState } from "react";
import { useAction } from "convex/react";
import { api } from "../convex/_generated/api";
function SemanticSearch() {
  const [query, setQuery] = useState("");
  // askQuestion is an action, so use useAction
  const askQuestion = useAction(api.rag.askQuestion);
  const [result, setResult] = useState<any>(null);
  const handleSearch = async () => {
    setResult(await askQuestion({ question: query }));
  };
  return (
    <div>
      <input
        placeholder="Ask about song annotations..."
        value={query}
        onChange={e => setQuery(e.target.value)}
      />
      <button onClick={handleSearch}>Search</button>
      {result && (
        <div>
          <p>{result.answer}</p>
          <h4>From these songs:</h4>
          {result.sources.map((s: any) => (
            <a key={s.songId} href={`/songs/${s.songId}`}>{s.title}</a>
          ))}
        </div>
      )}
    </div>
  );
}
```
**3. Voice annotation with auto-embedding:**
```typescript
// convex/annotations.ts
import { mutation } from "./_generated/server";
import { v } from "convex/values";
import { api } from "./_generated/api";
export const addVoiceAnnotation = mutation({
  args: {
    songId: v.string(),
    transcript: v.string(),
    timestamp: v.number()
  },
  handler: async (ctx, args) => {
    // Store annotation
    const annotationId = await ctx.db.insert("annotations", args);
    // Schedule embedding (async action, doesn't block the mutation)
    await ctx.scheduler.runAfter(0, api.embeddings.addSongEmbedding, {
      songId: args.songId,
      text: args.transcript,
      // Extend the metadata validator if you store type here
      metadata: {
        timestamp: args.timestamp,
        type: "voice"
      }
    });
    return annotationId;
  }
});
```
## Migration Path
**If starting fresh:**
1. Store song metadata in Convex
2. Add embeddings table
3. Embed on creation
4. Implement similarity search
**If existing data:**
1. Export annotations from vault
2. Bulk embed with script
3. Import to Convex
4. Set up incremental updates
## See Also
- [[Langchain Overview]] - Core concepts
- [[Vectorized Databases]] - Embedding fundamentals
- [[Data Storage Architecture]] - NNT-specific Convex schemas
- [[Database Query Patterns]] - Efficient Convex queries