📖 文章概述
向量数据库是 AI 应用的基础设施。本文以 Pinecone、Weaviate、Milvus 为例,讲解向量化存储、Embedding 管理、相似度搜索、混合检索等核心技术。
🎯 向量数据库架构
向量数据库基本概念
文本/图像/音频
↓
[Embedding 模型] → 向量表示(768/1536 维)
↓
[向量数据库] → HNSW/IVF 索引
↓
[相似度计算] → 距离度量(欧氏/余弦/内积)
↓
快速检索结果
主流向量数据库对比
| 特性 | Pinecone | Weaviate | Milvus | Qdrant |
|---|---|---|---|---|
| 部署 | 云服务 | 自建/云 | 自建/云(Zilliz Cloud) | 自建/云 |
| 扩展性 | 高 | 中 | 高 | 高 |
| 易用性 | 高 | 高 | 中 | 中 |
| 成本 | 中等 | 低 | 低 | 低 |
| 元数据 | 支持 | 支持 | 支持 | 支持 |
| 混合搜索 | 支持 | 支持 | 支持 | 支持 |
| 实时性 | 高 | 高 | 中 | 高 |
🚀 Pinecone 云服务
1. Pinecone 快速开始
import { Pinecone } from '@pinecone-database/pinecone'
import { OpenAIEmbeddings } from 'langchain/embeddings/openai'
// Initialize the Pinecone client.
// Only the API key is supplied: the serverless `spec` passed to createIndex below
// belongs to the SDK generation whose client configuration has no `environment`
// option, and the second client created later in this guide is built the same way.
const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY
})
// Create the index, or fetch it when it already exists
/**
 * Ensure the `documents` index exists and return its description.
 *
 * A failed lookup is treated as "index missing" only when the SDK reports a
 * not-found error; any other failure (bad API key, network outage, ...) is
 * rethrown instead of being answered with a create call.
 *
 * @returns {Promise<object>} Whatever `pinecone.describeIndex` resolves to for
 *   the index, both when it already existed and after it was created.
 * @throws Rethrows any lookup error that is not a not-found error, and any
 *   error raised by `createIndex`.
 */
async function ensureIndex() {
  const indexName = 'documents'
  try {
    const existingIndex = await pinecone.describeIndex(indexName)
    console.log(`索引 ${indexName} 已存在`)
    return existingIndex
  } catch (error) {
    // Matching on the error name avoids importing the SDK error class here.
    if (error?.name !== 'PineconeNotFoundError') {
      throw error
    }
  }

  await pinecone.createIndex({
    name: indexName,
    dimension: 1536, // OpenAI embedding dimension
    metric: 'cosine', // cosine similarity
    spec: {
      serverless: {
        cloud: 'aws',
        region: 'us-west-2'
      }
    }
  })
  console.log(`索引 ${indexName} 已创建`)
  // Return the description on this path too, so callers always get the same shape.
  return pinecone.describeIndex(indexName)
}
// 2. Upsert vectors
/**
 * Build two placeholder records used when `upsertVectors` is called without
 * arguments. The values are constant filler of the index dimension (1536),
 * not real embeddings.
 *
 * @returns {Array<{id: string, values: number[], metadata: object}>}
 */
function buildSampleVectors() {
  return [
    {
      id: 'doc-1',
      values: new Array(1536).fill(0.1),
      metadata: {
        title: '量子计算基础',
        category: 'physics',
        source: 'book.pdf'
      }
    },
    {
      id: 'doc-2',
      values: new Array(1536).fill(0.2),
      metadata: {
        title: '机器学习入门',
        category: 'ai',
        source: 'article.md'
      }
    }
  ]
}

/**
 * Upsert records into the `production` namespace of the `documents` index.
 *
 * @param {Array<{id: string, values: number[], metadata?: object}>} [vectors]
 *   Records to write. Defaults to two placeholder records so the function can
 *   still be called without arguments.
 * @returns {Promise<void>}
 */
async function upsertVectors(vectors = buildSampleVectors()) {
  // Nothing to send: skip the request instead of issuing an empty upsert.
  if (!Array.isArray(vectors) || vectors.length === 0) {
    console.log(`已上传 0 个向量`)
    return
  }

  const index = pinecone.Index('documents')
  // The namespace is selected on the index handle and the records are passed
  // as an array; the count is taken from the input rather than the response.
  await index.namespace('production').upsert(vectors)
  console.log(`已上传 ${vectors.length} 个向量`)
}
// 3. Query vectors
/**
 * Find the records closest to `queryEmbedding` in the `production` namespace
 * of the `documents` index.
 *
 * @param {number[]} queryEmbedding - Query vector.
 * @param {object} [options]
 * @param {number} [options.topK=5] - Maximum number of matches to return.
 * @param {object|null} [options.filter] - Metadata filter. Defaults to
 *   `category == 'ai'`; pass `null` to search without a filter.
 * @returns {Promise<Array<{id: string, score: number, metadata: object}>>}
 *   One entry per match; an empty array when the response has no matches.
 */
async function queryVectors(
  queryEmbedding,
  { topK = 5, filter = { category: { $eq: 'ai' } } } = {}
) {
  const index = pinecone.Index('documents')
  const queryRequest = {
    vector: queryEmbedding,
    topK,
    includeMetadata: true
  }
  if (filter) {
    // Metadata filter
    queryRequest.filter = filter
  }

  // The namespace is selected on the index handle, not inside the request.
  const results = await index.namespace('production').query(queryRequest)
  return (results.matches ?? []).map(match => ({
    id: match.id,
    score: match.score, // similarity score
    metadata: match.metadata
  }))
}
// 4. Delete vectors
/**
 * Delete records by id from the `production` namespace of the `documents`
 * index — the same namespace the upsert and query helpers use.
 *
 * @param {string[]} ids - Ids of the records to delete.
 * @returns {Promise<void>}
 */
async function deleteVectors(ids) {
  // An empty id list has nothing to delete; skip the request.
  if (Array.isArray(ids) && ids.length === 0) {
    console.log(`已删除 0 个向量`)
    return
  }

  const index = pinecone.Index('documents')
  await index.namespace('production').deleteMany(ids)
  console.log(`已删除 ${ids.length} 个向量`)
}
// End-to-end workflow
/**
 * Run the whole flow: make sure the index exists, embed three sample
 * sentences, upsert them, then embed a query and print its nearest matches.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await ensureIndex()

  const embedder = new OpenAIEmbeddings()

  // Embed the sample corpus and turn each vector into an upsert record
  const texts = [
    '深度学习是机器学习的子领域',
    '神经网络模仿大脑的工作原理',
    '卷积神经网络用于图像处理'
  ]
  const textVectors = await embedder.embedDocuments(texts)
  const records = textVectors.map((values, position) => {
    const id = `doc-${position}`
    const metadata = { text: texts[position] }
    return { id, values, metadata }
  })
  await upsertVectors(records)

  // Look up documents similar to the query
  const queryEmbedding = await embedder.embedQuery('机器学习算法')
  const matches = await queryVectors(queryEmbedding)
  console.log('相似结果:', matches)
}
🏗️ Weaviate 自建部署
5. Weaviate 安装和配置
# docker-compose.yml
# Single-node Weaviate with API-key authentication and the OpenAI vectorizer module.
version: '3.4'
services:
  weaviate:
    # 1.14.0 predates the API-key authentication and hybrid search features
    # used later in this guide, so a newer release is pinned.
    image: semitechnologies/weaviate:1.24.1
    ports:
      - "8080:8080"
    environment:
      QUERY_DEFAULTS_LIMIT: 100
      AUTHENTICATION_APIKEY_ENABLED: 'true'
      # Demo value only: it must match the key the client sends. Use a real secret outside local testing.
      AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'my-secret-key'
      # API-key auth maps each allowed key to a user identity.
      AUTHENTICATION_APIKEY_USERS: 'admin'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-openai'
      # NOTE(review): MODULES is not among Weaviate's documented environment
      # variables (ENABLE_MODULES is the documented switch) — confirm and drop.
      MODULES: 'text2vec-openai'
      ENABLE_MODULES: 'text2vec-openai'
    volumes:
      - weaviate_data:/var/lib/weaviate
    command:
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --scheme
      - http
volumes:
  weaviate_data: {}
6. Weaviate 数据管理
import weaviate from 'weaviate-ts-client'
// Weaviate client for the local instance started by the compose file.
// The API key can be overridden with WEAVIATE_API_KEY; the fallback is the demo
// key configured on the server, so the snippet still works unchanged.
const client = weaviate.client({
  scheme: 'http',
  host: 'localhost:8080',
  apiKey: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY ?? 'my-secret-key'),
  headers: {
    // Sent with each request so the text2vec-openai module can call OpenAI.
    'X-OpenAI-Api-Key': process.env.OPENAI_API_KEY
  }
})
// Create the class (schema)
/**
 * Create the `Document` class, vectorized by the `text2vec-openai` module.
 *
 * Module settings are placed under `moduleConfig`, the class-level key that
 * holds per-module configuration, so the chosen embedding model is applied.
 *
 * @returns {Promise<void>}
 */
async function createSchema() {
  const classDefinition = {
    class: 'Document',
    description: 'A document with embeddings',
    vectorizer: 'text2vec-openai',
    moduleConfig: {
      'text2vec-openai': {
        model: 'text-embedding-3-small',
        vectorizeClassName: false
      }
    },
    properties: [
      {
        name: 'title',
        dataType: ['string'],
        description: '文档标题'
      },
      {
        name: 'content',
        dataType: ['text'],
        description: '文档内容'
      },
      {
        name: 'source',
        dataType: ['string'],
        description: '文档来源'
      },
      {
        name: 'category',
        dataType: ['string'],
        description: '文档分类'
      },
      {
        name: 'tags',
        dataType: ['string[]'],
        description: '标签'
      },
      {
        name: 'createdAt',
        dataType: ['date'],
        description: '创建时间'
      }
    ]
  }

  await client.schema.classCreator().withClass(classDefinition).do()
  console.log('Schema 已创建')
}
// Add data objects
/**
 * Insert two sample `Document` objects, one request at a time, then log how
 * many were added.
 *
 * @returns {Promise<void>}
 */
async function addDocuments() {
  const seedDocuments = [
    {
      title: '深度学习基础',
      content: '深度学习是使用多层神经网络的机器学习方法...',
      source: 'book.pdf',
      category: 'AI',
      tags: ['neural-network', 'learning']
    },
    {
      title: '自然语言处理',
      content: 'NLP 是人工智能的重要领域...',
      source: 'article.md',
      category: 'AI',
      tags: ['nlp', 'language-model']
    }
  ]

  // Build and send a single create request for one object
  const createOne = properties => {
    const creator = client.data.creator()
    return creator.withClass('Document').withProperties(properties).do()
  }

  // Sequential on purpose: each request finishes before the next starts
  for (let position = 0; position < seedDocuments.length; position += 1) {
    await createOne(seedDocuments[position])
  }

  console.log(`已添加 ${seedDocuments.length} 个文档`)
}
// Vector search
/**
 * Run a nearText query against the `Document` class and return up to five
 * hits, each with a similarity computed as `1 - distance`.
 *
 * @param {string} query - Text passed as the single nearText concept.
 * @returns {Promise<Array<{title: string, content: string, similarity: number, category: string}>>}
 */
async function semanticSearch(query) {
  const requestedFields = ['title', 'content', 'category', '_additional { distance }']
  const nearText = { concepts: [query], certainty: 0.7 }

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(requestedFields)
    .withNearText(nearText)
    .withLimit(5)
  const response = await request.do()

  const hits = []
  for (const doc of response.data.Get.Document) {
    hits.push({
      title: doc.title,
      content: doc.content,
      similarity: 1 - doc._additional.distance,
      category: doc.category
    })
  }
  return hits
}
// Search with a metadata filter
/**
 * Run a nearText query restricted to documents whose `category` equals the
 * given value, returning up to five raw result rows.
 *
 * @param {string} query - Text passed as the single nearText concept.
 * @param {string} category - Value the `category` property must equal.
 * @returns {Promise<object[]>} The `Document` rows exactly as returned by the query.
 */
async function filteredSearch(query, category) {
  const categoryFilter = {
    path: ['category'],
    operator: 'Equal',
    valueString: category
  }
  const requestedFields = ['title', 'content', '_additional { distance }']

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(requestedFields)
    .withNearText({ concepts: [query] })
    .withWhere(categoryFilter)
    .withLimit(5)
  const response = await request.do()

  return response.data.Get.Document
}
// Hybrid search (vector + keyword)
/**
 * Run a hybrid query against the `Document` class with the vector and keyword
 * parts weighted equally, returning up to five raw result rows.
 *
 * @param {string} query - Search text used for both parts of the hybrid query.
 * @returns {Promise<object[]>} The `Document` rows exactly as returned by the query.
 */
async function hybridSearch(query) {
  // alpha 0.5 = 50% vector + 50% keyword
  const hybridOptions = { query: query, alpha: 0.5 }
  const requestedFields = ['title', 'content', '_additional { score }']

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(requestedFields)
    .withHybrid(hybridOptions)
    .withLimit(5)
  const response = await request.do()

  return response.data.Get.Document
}
📦 Milvus 本地部署
7. Milvus Docker 部署
# docker-compose.yml for Milvus
# Standalone deployment: etcd (metadata) + MinIO (object storage) + Milvus.
version: '3.8'
services:
  etcd:
    image: quay.io/coreos/etcd:v3.5.5
    environment:
      ETCD_AUTO_COMPACTION_MODE: revision
      ETCD_AUTO_COMPACTION_RETENTION: '1000'
      ETCD_QUOTA_BACKEND_BYTES: '4294967296'
      ETCD_HEARTBEAT_INTERVAL: '500'
      # etcd's election setting is a timeout (there is no "election interval" option).
      ETCD_ELECTION_TIMEOUT: '2500'
    volumes:
      - etcd_data:/etcd
    # Listen on all interfaces so the milvus container can reach etcd:2379,
    # and keep the data directory on the mounted volume.
    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
  minio:
    # Pinned release: reproducible, and the curl-based healthcheck below relies on this image.
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    environment:
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio_data:/data
    command: minio server /data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3
  milvus:
    # Pinned instead of `latest` so the deployment is reproducible.
    image: milvusdb/milvus:v2.3.3
    # The image does not start a server on its own; run the standalone role explicitly.
    command: ["milvus", "run", "standalone"]
    depends_on:
      - etcd
      - minio
    environment:
      COMMON_STORAGETYPE: minio
      ETCD_ENDPOINTS: etcd:2379
      # Milvus reads the MinIO endpoint from MINIO_ADDRESS. Credentials are left at
      # Milvus' defaults (minioadmin/minioadmin), which match the minio service above.
      MINIO_ADDRESS: minio:9000
    ports:
      - "19530:19530"
      - "9091:9091"
    volumes:
      - milvus_data:/var/lib/milvus
volumes:
  etcd_data:
  minio_data:
  milvus_data:
8. Milvus 数据操作
import { MilvusClient } from '@zilliz/milvus2-sdk-node'
// Milvus client. The address can be overridden with MILVUS_ADDRESS; the default
// is the port published by the compose file.
const milvusClient = new MilvusClient({
  address: process.env.MILVUS_ADDRESS ?? 'localhost:19530'
})
// Create the collection
/**
 * Create the `documents` collection and an IVF_FLAT index on its `embedding`
 * field.
 *
 * The schema marks `id` as the primary key and gives every VarChar field a
 * `max_length`; both are needed for the explicit-fields form of
 * `createCollection` to be accepted.
 *
 * @returns {Promise<void>}
 */
async function createCollection() {
  const collectionName = 'documents'
  const fields = [
    {
      name: 'id',
      description: '文档 ID',
      data_type: 5, // DataType.Int64
      is_primary_key: true,
      autoID: false // ids are supplied by the caller on insert
    },
    {
      name: 'embedding',
      description: '文档向量',
      data_type: 101, // DataType.FloatVector
      type_params: {
        dim: 1536
      }
    },
    {
      name: 'title',
      description: '文档标题',
      data_type: 21, // DataType.VarChar
      type_params: {
        max_length: 512
      }
    },
    {
      name: 'content',
      description: '文档内容',
      data_type: 21,
      type_params: {
        max_length: 65535
      }
    },
    {
      name: 'category',
      description: '分类',
      data_type: 21,
      type_params: {
        max_length: 128
      }
    }
  ]

  // Create the collection
  await milvusClient.createCollection({
    collection_name: collectionName,
    fields: fields,
    primary_field_name: 'id'
  })

  // Create the vector index
  await milvusClient.createIndex({
    collection_name: collectionName,
    field_name: 'embedding',
    index_type: 'IVF_FLAT',
    metric_type: 'L2', // Euclidean distance
    params: {
      nlist: 1024
    }
  })
  console.log(`集合 ${collectionName} 已创建并索引`)
}
// Insert data
/**
 * Insert documents into the `documents` collection, using each document's
 * position in the array as its id, and log the reported insert count.
 *
 * @param {Array<{embedding: number[], title: string, content: string, category: string}>} documents
 * @returns {Promise<void>}
 */
async function insertDocuments(documents) {
  const toRow = ({ embedding, title, content, category }, position) => {
    return { id: position, embedding, title, content, category }
  }
  const insertRequest = {
    collection_name: 'documents',
    fields_data: documents.map(toRow)
  }

  const insertResult = await milvusClient.insert(insertRequest)
  console.log(`插入 ${insertResult.insert_cnt} 条记录`)
}
// Search
/**
 * Return the `topK` nearest documents to `queryEmbedding` from the
 * `documents` collection.
 *
 * The hit list is normalized before mapping: a flat list of hits (returned
 * for a single query vector) and a list of per-query lists are both accepted,
 * and the score is read from `score`, falling back to `distance`.
 *
 * @param {number[]} queryEmbedding - Query vector.
 * @param {number} [topK=5] - Maximum number of hits.
 * @returns {Promise<Array<{id: *, score: number, title: string, content: string, category: string}>>}
 */
async function search(queryEmbedding, topK = 5) {
  const collectionName = 'documents'

  // The collection must be loaded before it can be searched
  await milvusClient.loadCollection({
    collection_name: collectionName
  })

  const response = await milvusClient.search({
    collection_name: collectionName,
    vectors: [queryEmbedding],
    search_params: {
      anns_field: 'embedding',
      topk: topK.toString(),
      metric_type: 'L2',
      params: JSON.stringify({
        nprobe: 10
      })
    },
    output_fields: ['title', 'content', 'category']
  })

  const results = response.results ?? []
  // Only one query vector is sent, so unwrap a nested list when present.
  const hits = Array.isArray(results[0]) ? results[0] : results
  return hits.map(hit => ({
    id: hit.id,
    score: hit.score ?? hit.distance,
    title: hit.title,
    content: hit.content,
    category: hit.category
  }))
}
// Range search
/**
 * Return documents whose L2 distance to `queryEmbedding` falls within `radius`.
 *
 * Range search is issued through the regular `search` call with a `radius`
 * search parameter; the client is not asked for a dedicated range-search
 * method.
 *
 * @param {number[]} queryEmbedding - Query vector.
 * @param {number} [radius=1.0] - Distance bound passed as the `radius` search param.
 * @param {number} [topK=100] - Upper bound on the number of hits returned.
 * @returns {Promise<Array<{distance: number, title: string, content: string}>>}
 */
async function rangeSearch(queryEmbedding, radius = 1.0, topK = 100) {
  const collectionName = 'documents'

  await milvusClient.loadCollection({
    collection_name: collectionName
  })

  const response = await milvusClient.search({
    collection_name: collectionName,
    vectors: [queryEmbedding],
    search_params: {
      anns_field: 'embedding',
      topk: topK.toString(),
      metric_type: 'L2',
      params: JSON.stringify({
        nprobe: 10,
        radius: radius
      })
    },
    output_fields: ['title', 'content']
  })

  const results = response.results ?? []
  // Only one query vector is sent, so unwrap a nested list when present.
  const hits = Array.isArray(results[0]) ? results[0] : results
  return hits.map(hit => ({
    distance: hit.score ?? hit.distance,
    title: hit.title,
    content: hit.content
  }))
}
// Usage example
/**
 * Create the collection, insert one sample document with a constant filler
 * embedding, then search with another constant filler vector and print the
 * results.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await createCollection()

  // Prepare a single sample document
  const sampleDocument = {
    title: '深度学习',
    content: '深度学习的基本概念...',
    category: 'AI',
    embedding: Array.from({ length: 1536 }, () => 0.1)
  }
  await insertDocuments([sampleDocument])

  // Search with a filler query vector
  const queryEmbedding = Array.from({ length: 1536 }, () => 0.15)
  const hits = await search(queryEmbedding, 5)
  console.log('搜索结果:', hits)
}
🔄 Embedding 管理
9. 多模型 Embedding 实现
import { OpenAIEmbeddings } from 'langchain/embeddings/openai'
import fetch from 'node-fetch'
// OpenAI embedding
/**
 * Embed a single text with OpenAI's `text-embedding-3-small` model.
 *
 * A new `OpenAIEmbeddings` instance is created on every call, configured with
 * the API key from `OPENAI_API_KEY`.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} Whatever `embedQuery` resolves to for the text.
 */
async function openaiEmbedding(text) {
  const embedderOptions = {
    openAIApiKey: process.env.OPENAI_API_KEY,
    modelName: 'text-embedding-3-small' // 1536 dimensions
  }
  const embedder = new OpenAIEmbeddings(embedderOptions)
  return embedder.embedQuery(text)
}
// Use a local model (sentence-transformers)
/**
 * Embed a single text by POSTing it to the local embedding service at
 * `http://localhost:8000/embed`.
 *
 * @param {string} text - Text to embed.
 * @returns {Promise<number[]>} The `embedding` array from the JSON response.
 * @throws {Error} When the service answers with a non-2xx status, or when the
 *   response body has no `embedding` array.
 */
async function localEmbedding(text) {
  const response = await fetch('http://localhost:8000/embed', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text })
  })

  // fetch resolves on HTTP errors too, so the status has to be checked explicitly.
  if (!response.ok) {
    throw new Error(`Local embedding service responded with HTTP ${response.status}`)
  }

  const data = await response.json()
  if (!Array.isArray(data?.embedding)) {
    throw new Error('Local embedding service response has no "embedding" array')
  }
  return data.embedding
}
// Embedding cache
/**
 * In-memory cache of embeddings keyed by `<modelName>:<text>`.
 *
 * Requests for a key that is already being computed share the same in-flight
 * promise, so concurrent callers trigger a single provider call. Failed
 * computations are not cached.
 */
class EmbeddingCache {
  constructor() {
    // Resolved embeddings, keyed by `<modelName>:<text>`
    this.cache = new Map()
    // Promises for embeddings currently being computed, same key format
    this.pending = new Map()
  }

  /**
   * Return the embedding for `text`, computing it at most once per key.
   *
   * @param {string} text - Text to embed.
   * @param {string} [modelName='openai'] - `'openai'` uses `openaiEmbedding`;
   *   any other value uses `localEmbedding`.
   * @returns {Promise<number[]>} The cached or freshly computed embedding.
   */
  async getEmbedding(text, modelName = 'openai') {
    const key = `${modelName}:${text}`

    if (this.cache.has(key)) {
      console.log('从缓存获取 embedding')
      return this.cache.get(key)
    }

    // Join a computation that is already running for this key.
    const inFlight = this.pending.get(key)
    if (inFlight) {
      return inFlight
    }

    const compute = async () => {
      const embedding = modelName === 'openai'
        ? await openaiEmbedding(text)
        : await localEmbedding(text)
      this.cache.set(key, embedding)
      return embedding
    }
    // Always drop the pending entry, so a failure can be retried later.
    const request = compute().finally(() => {
      this.pending.delete(key)
    })
    this.pending.set(key, request)
    return request
  }

  /**
   * Remove every cached embedding. Computations still in flight are not
   * cancelled and will store their result when they finish.
   */
  clearCache() {
    this.cache.clear()
  }
}
// Generate embeddings in batches
/**
 * Embed `texts` in consecutive batches; the texts inside one batch are
 * embedded concurrently, and batches run one after another.
 *
 * @param {string[]} texts - Texts to embed.
 * @param {number} [batchSize=100] - Number of texts per batch; must be a
 *   positive integer.
 * @returns {Promise<number[][]>} One embedding per input text, in input order.
 * @throws {RangeError} When `batchSize` is not a positive integer (a step of
 *   zero or less would never advance the loop).
 */
async function batchEmbeddings(texts, batchSize = 100) {
  if (!Number.isInteger(batchSize) || batchSize <= 0) {
    throw new RangeError(`batchSize must be a positive integer, got ${batchSize}`)
  }

  const embeddings = []
  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize)
    const batchEmbeds = await Promise.all(
      batch.map(text => openaiEmbedding(text))
    )
    embeddings.push(...batchEmbeds)
    console.log(`已处理 ${Math.min(i + batchSize, texts.length)}/${texts.length}`)
  }
  return embeddings
}
🎨 混合搜索实现
10. 向量 + 关键词混合搜索
import Elasticsearch from '@elastic/elasticsearch'
import { Pinecone } from '@pinecone-database/pinecone'
/**
 * Combines Pinecone vector search with Elasticsearch keyword search and fuses
 * the two ranked lists by weighted rank position.
 */
class HybridSearch {
  constructor() {
    this.es = new Elasticsearch.Client({
      node: 'http://localhost:9200'
    })
    this.pinecone = new Pinecone({
      apiKey: process.env.PINECONE_API_KEY
    })
  }

  /**
   * Vector search against the `documents` Pinecone index.
   *
   * @param {number[]} embedding - Query vector.
   * @param {number} [topK=5] - Maximum number of matches.
   * @returns {Promise<Array<{id: string, score: number, metadata: object, source: 'vector'}>>}
   */
  async vectorSearch(embedding, topK = 5) {
    const index = this.pinecone.Index('documents')
    const results = await index.query({
      vector: embedding,
      topK: topK,
      includeMetadata: true
    })

    return (results.matches ?? []).map(m => ({
      id: m.id,
      score: m.score,
      metadata: m.metadata,
      source: 'vector'
    }))
  }

  /**
   * Keyword search over `title` and `content` in the `documents`
   * Elasticsearch index.
   *
   * @param {string} query - Search text.
   * @param {number} [topK=5] - Maximum number of hits.
   * @returns {Promise<Array<{id: string, score: number, metadata: object, source: 'keyword'}>>}
   */
  async keywordSearch(query, topK = 5) {
    const response = await this.es.search({
      index: 'documents',
      body: {
        query: {
          multi_match: {
            query: query,
            fields: ['title', 'content']
          }
        },
        size: topK
      }
    })

    // Accept both response shapes: payload wrapped in `body`, or returned directly.
    const payload = response.body ?? response
    return payload.hits.hits.map(hit => ({
      id: hit._id,
      score: hit._score,
      metadata: hit._source,
      source: 'keyword'
    }))
  }

  /**
   * Hybrid search: run both searches in parallel and merge them.
   *
   * Each result gets a rank score of `(1 - rank / listLength) * weight` per
   * list; a document found by both searches gets the sum. When an id is in
   * both lists, the merged entry keeps the vector result's fields.
   *
   * @param {string} query - Text for the keyword search.
   * @param {number[]} embedding - Vector for the vector search.
   * @param {{vector?: number, keyword?: number}} [weights] - List weights; a
   *   missing entry falls back to 0.6 (vector) or 0.4 (keyword).
   * @param {number} [limit=10] - Maximum number of merged results.
   * @returns {Promise<object[]>} Merged results with `vectorScore`,
   *   `keywordScore` and `combinedScore`, sorted by `combinedScore` descending.
   */
  async hybridSearch(query, embedding, weights = { vector: 0.6, keyword: 0.4 }, limit = 10) {
    // Fill in missing weights so a partial object cannot produce NaN scores.
    const vectorWeight = weights?.vector ?? 0.6
    const keywordWeight = weights?.keyword ?? 0.4

    const [vectorResults, keywordResults] = await Promise.all([
      this.vectorSearch(embedding),
      this.keywordSearch(query)
    ])

    // Merge both lists, keyed by document id
    const merged = new Map()
    const addRanked = (results, scoreField, weight) => {
      results.forEach((result, rank) => {
        const rankScore = (1 - rank / results.length) * weight
        const existing = merged.get(result.id)
        if (existing) {
          existing[scoreField] = rankScore
          return
        }
        merged.set(result.id, {
          ...result,
          vectorScore: 0,
          keywordScore: 0,
          [scoreField]: rankScore
        })
      })
    }
    addRanked(vectorResults, 'vectorScore', vectorWeight)
    addRanked(keywordResults, 'keywordScore', keywordWeight)

    // Sort by the combined score
    return Array.from(merged.values())
      .map(result => ({
        ...result,
        combinedScore: result.vectorScore + result.keywordScore
      }))
      .sort((a, b) => b.combinedScore - a.combinedScore)
      .slice(0, limit)
  }
}
// Usage: weight the vector ranking at 0.7 and the keyword ranking at 0.3.
// NOTE(review): `embedding` is not defined in this snippet — it must hold the
// query vector for '深度学习' before this runs; confirm where it comes from.
const hybrid = new HybridSearch()
const results = await hybrid.hybridSearch('深度学习', embedding, { vector: 0.7, keyword: 0.3 })
console.log('混合搜索结果:', results)
🎓 最佳实践
DO ✅
// 1. Pick the right similarity metric
// Maps each metric name to a short note (in Chinese) on when it fits.
const metricComparison = Object.fromEntries([
  ['cosine', '适合文本、处理向量方向'],
  ['l2', '适合图像、要求精确距离'],
  ['ip', '内积,最快但需要规范化向量']
])
// 2. Batch writes to cut down on round trips
/**
 * Upsert `documents` in consecutive chunks of 100, one request per chunk,
 * waiting for each request before sending the next.
 *
 * @param {object[]} documents - Records to upsert.
 * @returns {Promise<void>}
 */
async function efficientBatchInsert(documents) {
  const batchSize = 100
  let offset = 0
  while (offset < documents.length) {
    const chunk = documents.slice(offset, offset + batchSize)
    await index.upsert(chunk) // one request per chunk
    offset += batchSize
  }
}
// 3. Filter on metadata
await index.query({
  vector: embedding,
  // topK is a required part of a query request
  topK: 10,
  filter: {
    category: { $eq: 'tech' },
    // Range operators such as $gte compare numbers, so the date is expressed as a
    // Unix timestamp in seconds (2023-01-01T00:00:00Z). The `date` metadata field
    // must be stored in the same numeric form.
    date: { $gte: Date.UTC(2023, 0, 1) / 1000 }
  }
})
// 4. Check index health regularly
/**
 * Fetch the index statistics and return them so the caller can monitor the
 * index. No rebuild is triggered here.
 *
 * @returns {Promise<object>} Whatever `index.describeIndexStats()` resolves to.
 *   NOTE(review): expected to include the total record count, per-namespace
 *   counts and index fullness — confirm against the installed SDK version.
 */
async function rebuildIndex() {
  const stats = await index.describeIndexStats()
  // Returned rather than discarded, so monitoring code can inspect the numbers.
  return stats
}
DON'T ❌
// 1. Do not use un-normalized vectors
// ❌
const vector = [1, 2, 3] // not normalized
// ✅ divide every component by the vector's Euclidean length
const magnitude = Math.sqrt(
  vector.map(component => component * component).reduce((total, square) => total + square, 0)
)
const normalized = vector.map(component => component / magnitude)
// 2. Do not ignore dimension mismatches
// ❌
// the embedding has 1536 dimensions but the index has 768
await index.upsert({ id: '1', values: embedding1536 })
// ✅ Keep dimensions consistent.
// Prefer creating the index with the model's native dimension, or asking the
// model for the target size. Cutting a vector short is only meaningful for
// models trained to support shortened embeddings, and the shortened vector has
// to be re-normalized, otherwise its length (and dot-product scores) change.
/**
 * Shorten an embedding to `dimension` components and rescale it to unit length.
 *
 * @param {number[]} values - Source embedding.
 * @param {number} dimension - Number of leading components to keep.
 * @returns {number[]} The shortened, L2-normalized vector (returned un-scaled
 *   when every kept component is zero).
 */
function truncateAndNormalize(values, dimension) {
  const truncated = values.slice(0, dimension)
  const norm = Math.sqrt(truncated.reduce((sum, v) => sum + v * v, 0))
  return norm === 0 ? truncated : truncated.map(v => v / norm)
}
const embedding768 = truncateAndNormalize(embedding1536, 768)
// 3. Do not request an oversized topK in latency-sensitive paths
// ❌ asks for 1000 matches
await index.query({ vector: vector, topK: 1000 })
// ✅ keep the result window small
await index.query({ vector: vector, topK: 10 })
📚 总结
- Pinecone: 云服务,开箱即用
- Weaviate: 灵活部署,支持多模型
- Milvus: 高性能,适合大规模
- 混合搜索: 向量+关键词综合检索
- Embedding 管理: 缓存、批处理、模型选择
掌握向量数据库,构建 AI-native 应用!