向量数据库

向量数据库完全指南:Pinecone、Weaviate、Milvus 与 AI 应用

深入学习向量数据库架构,掌握 embedding 管理、相似度搜索、混合搜索等技术,构建 AI 驱动的应用

17 分钟阅读
#向量数据库 #Pinecone #Weaviate #Embedding #AI

📖 文章概述

向量数据库是 AI 应用的基础设施。本文讲解向量化存储、Embedding 管理、相似度搜索、混合检索等核心技术。


🎯 向量数据库架构

向量数据库基本概念

文本/图像/音频
     ↓
[Embedding 模型] → 向量表示(768/1536 维)
     ↓
[向量数据库] → HNSW/IVF 索引
     ↓
[相似度计算] → 距离度量(欧氏/余弦/内积)
     ↓
快速检索结果

主流向量数据库对比

| 特性 | Pinecone | Weaviate | Milvus | Qdrant |
| --- | --- | --- | --- | --- |
| 部署 | 云服务 | 自建/云 | 自建 | 自建/云 |
扩展性
易用性
成本中等
| 元数据 | 支持 | 支持 | 支持 | 支持 |
| 混合搜索 | 支持 | 支持 | 支持 | 支持 |
实时性

🚀 Pinecone 云服务

1. Pinecone 快速开始

import { Pinecone } from '@pinecone-database/pinecone'
import { OpenAIEmbeddings } from 'langchain/embeddings/openai'

// Initialise the Pinecone client shared by the helpers in this example.
// Only the API key is supplied: the index is created with a serverless `spec`
// and the HybridSearch client in this article is also built from the API key
// alone, so the legacy `environment` option is not passed.
const pinecone = new Pinecone({
  apiKey: process.env.PINECONE_API_KEY
})

// Create the index, or fetch it when it already exists
/**
 * Ensure the `documents` index exists and return its description.
 *
 * Only a "not found" failure from `describeIndex` triggers creation. Any other
 * failure (invalid credentials, network outage, ...) is rethrown instead of
 * being mistaken for a missing index.
 *
 * @returns {Promise<object>} The value resolved by `pinecone.describeIndex`.
 * @throws Rethrows non-"not found" errors from `describeIndex` and any error
 *   raised by `createIndex`.
 */
async function ensureIndex() {
  const indexName = 'documents'

  try {
    const existingIndex = await pinecone.describeIndex(indexName)
    console.log(`索引 ${indexName} 已存在`)
    return existingIndex
  } catch (error) {
    // NOTE(review): relies on the SDK reporting a missing index as
    // `PineconeNotFoundError` / HTTP 404 — confirm against the installed version.
    const isNotFound =
      error?.name === 'PineconeNotFoundError' || error?.status === 404
    if (!isNotFound) {
      throw error
    }
  }

  await pinecone.createIndex({
    name: indexName,
    dimension: 1536,  // OpenAI embedding dimension
    metric: 'cosine',  // cosine similarity
    spec: {
      serverless: {
        cloud: 'aws',
        region: 'us-west-2'
      }
    },
    // Resolve only once the index can accept traffic, so an upsert issued
    // right after this call does not hit an index that is still initialising.
    waitUntilReady: true
  })

  console.log(`索引 ${indexName} 已创建`)
  return pinecone.describeIndex(indexName)
}

// 2. Vector upsert

// Build two placeholder records (constant 1536-dimensional vectors) that are
// used when the caller does not supply any records.
function samplePineconeVectors() {
  return [
    {
      id: 'doc-1',
      values: Array.from({ length: 1536 }, () => 0.1),
      metadata: {
        title: '量子计算基础',
        category: 'physics',
        source: 'book.pdf'
      }
    },
    {
      id: 'doc-2',
      values: Array.from({ length: 1536 }, () => 0.2),
      metadata: {
        title: '机器学习入门',
        category: 'ai',
        source: 'article.md'
      }
    }
  ]
}

/**
 * Upsert records into the `production` namespace of the `documents` index.
 *
 * @param {Array<{id: string, values: number[], metadata?: object}>} [vectors]
 *   Records to upload. Defaults to two placeholder records so the function can
 *   still be called without arguments.
 * @returns {Promise<void>}
 */
async function upsertVectors(vectors = samplePineconeVectors()) {
  // Nothing to send: skip the network call but keep the progress log.
  if (vectors.length === 0) {
    console.log(`已上传 ${vectors.length} 个向量`)
    return
  }

  const index = pinecone.Index('documents')

  // The namespace is selected on the index handle and the records are passed
  // as a plain array; the count is taken from the input because the upsert
  // call is not relied on to return one.
  await index.namespace('production').upsert(vectors)
  console.log(`已上传 ${vectors.length} 个向量`)
}

// 3. Vector query
/**
 * Run a similarity query against the `production` namespace of `documents`.
 *
 * @param {number[]} queryEmbedding Query vector.
 * @param {object} [options]
 * @param {number} [options.topK=5] Number of matches to request.
 * @param {object|null} [options.filter] Metadata filter. Defaults to
 *   `category = 'ai'`; pass `null` to query without a filter.
 * @returns {Promise<Array<{id: string, score: number, metadata: object}>>}
 *   One entry per match; empty when the response carries no matches.
 */
async function queryVectors(
  queryEmbedding,
  { topK = 5, filter = { category: { $eq: 'ai' } } } = {}
) {
  const index = pinecone.Index('documents')

  const queryRequest = {
    vector: queryEmbedding,
    topK,
    includeMetadata: true,
    // Metadata filter (left out entirely when explicitly disabled)
    ...(filter == null ? {} : { filter })
  }

  // Query the same namespace the records are upserted into.
  const results = await index.namespace('production').query(queryRequest)

  return (results.matches ?? []).map(({ id, score, metadata }) => ({
    id,
    score,  // similarity score
    metadata
  }))
}

// 4. Delete vectors
/**
 * Delete records by id from the `production` namespace of `documents`.
 *
 * The namespace matches the one used for upserts and queries; deleting from
 * the default namespace would leave those records untouched.
 *
 * @param {string[]} ids Record ids to delete.
 * @returns {Promise<void>}
 */
async function deleteVectors(ids) {
  // Skip the request when there is nothing to delete.
  if (ids.length > 0) {
    const index = pinecone.Index('documents')
    await index.namespace('production').deleteMany(ids)
  }

  console.log(`已删除 ${ids.length} 个向量`)
}

// End-to-end workflow
/**
 * Demo: ensure the index exists, embed three sample sentences, upload them and
 * run one similarity query.
 *
 * @returns {Promise<void>}
 */
async function main() {
  await ensureIndex()

  const embeddings = new OpenAIEmbeddings()

  // Generate embeddings and upload them
  const documents = [
    '深度学习是机器学习的子领域',
    '神经网络模仿大脑的工作原理',
    '卷积神经网络用于图像处理'
  ]

  const embeddingVectors = await embeddings.embedDocuments(documents)

  const vectors = embeddingVectors.map((embedding, i) => ({
    id: `doc-${i}`,
    values: embedding,
    // `category` is required here: queryVectors filters on category = 'ai',
    // so records without it could never be returned by the query below.
    metadata: { text: documents[i], category: 'ai' }
  }))

  await upsertVectors(vectors)

  // Query for similar documents
  const queryText = '机器学习算法'
  const queryEmbedding = await embeddings.embedQuery(queryText)
  const results = await queryVectors(queryEmbedding)

  console.log('相似结果:', results)
}

🏗️ Weaviate 自建部署

5. Weaviate 安装和配置

# docker-compose.yml
# Single-node Weaviate with API-key authentication and the text2vec-openai module.
version: '3.4'
services:
  weaviate:
    # API-key authentication, hybrid search and the text-embedding-3 model used
    # by the client code in this article need a newer server than 1.14.0.
    image: semitechnologies/weaviate:1.24.1
    ports:
      - "8080:8080"
    environment:
      QUERY_DEFAULTS_LIMIT: 100
      # Reject unauthenticated requests once API keys are enabled.
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'false'
      AUTHENTICATION_APIKEY_ENABLED: 'true'
      # Example key only — supply a real secret from the environment in production.
      AUTHENTICATION_APIKEY_ALLOWED_KEYS: 'my-secret-key'
      # Each allowed key must be mapped to a user identity.
      AUTHENTICATION_APIKEY_USERS: 'admin'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'text2vec-openai'
      # NOTE(review): ENABLE_MODULES is the documented switch; MODULES looks
      # redundant — confirm before removing it.
      MODULES: 'text2vec-openai'
      ENABLE_MODULES: 'text2vec-openai'
      CLUSTER_HOSTNAME: 'node1'
    volumes:
      - weaviate_data:/var/lib/weaviate
    command:
      - --host
      - '0.0.0.0'
      - --port
      - '8080'
      - --scheme
      - http

volumes:
  weaviate_data: {}

6. Weaviate 数据管理

import weaviate from 'weaviate-ts-client'

// Weaviate client shared by the helpers below.
// The API key can be supplied through WEAVIATE_API_KEY so the secret does not
// have to live in source control; the literal fallback matches the key
// configured in the docker-compose example.
const client = weaviate.client({
  scheme: 'http',
  host: 'localhost:8080',
  apiKey: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY ?? 'my-secret-key'),
  headers: {
    // Forwarded to the text2vec-openai module for vectorization.
    'X-OpenAI-Api-Key': process.env.OPENAI_API_KEY
  }
})

// Create the class (schema)
/**
 * Create the `Document` class, vectorized by the `text2vec-openai` module.
 *
 * Module settings are placed under `moduleConfig`, the class-level key for
 * per-module options; under any other key the model selection would not be
 * applied.
 *
 * @returns {Promise<void>}
 */
async function createSchema() {
  const classDefinition = {
    class: 'Document',
    description: 'A document with embeddings',
    vectorizer: 'text2vec-openai',
    moduleConfig: {
      'text2vec-openai': {
        model: 'text-embedding-3-small',
        vectorizeClassName: false
      }
    },
    properties: [
      {
        name: 'title',
        dataType: ['string'],
        description: '文档标题'
      },
      {
        name: 'content',
        dataType: ['text'],
        description: '文档内容'
      },
      {
        name: 'source',
        dataType: ['string'],
        description: '文档来源'
      },
      {
        name: 'category',
        dataType: ['string'],
        description: '文档分类'
      },
      {
        name: 'tags',
        dataType: ['string[]'],
        description: '标签'
      },
      {
        name: 'createdAt',
        dataType: ['date'],
        description: '创建时间'
      }
    ]
  }

  await client.schema.classCreator().withClass(classDefinition).do()
  console.log('Schema 已创建')
}

// Add data objects

// Two sample documents used when the caller does not pass any.
function sampleWeaviateDocuments() {
  return [
    {
      title: '深度学习基础',
      content: '深度学习是使用多层神经网络的机器学习方法...',
      source: 'book.pdf',
      category: 'AI',
      tags: ['neural-network', 'learning']
    },
    {
      title: '自然语言处理',
      content: 'NLP 是人工智能的重要领域...',
      source: 'article.md',
      category: 'AI',
      tags: ['nlp', 'language-model']
    }
  ]
}

/**
 * Create one `Document` object per entry, one request at a time.
 *
 * @param {object[]} [documents] Property bags to store. Defaults to two
 *   built-in sample documents, which keeps the zero-argument call working.
 * @returns {Promise<void>}
 */
async function addDocuments(documents = sampleWeaviateDocuments()) {
  for (const doc of documents) {
    await client.data
      .creator()
      .withClass('Document')
      .withProperties(doc)
      .do()
  }

  console.log(`已添加 ${documents.length} 个文档`)
}

// Vector search
/**
 * Run a nearText query against the `Document` class (limit 5, certainty 0.7).
 *
 * @param {string} query Free-text concept to search for.
 * @returns {Promise<Array<{title, content, similarity, category}>>} One entry
 *   per hit, where `similarity` is `1 - distance` as reported by the server.
 */
async function semanticSearch(query) {
  const fields = ['title', 'content', 'category', '_additional { distance }']
  const nearText = { concepts: [query], certainty: 0.7 }

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(fields)
    .withNearText(nearText)
    .withLimit(5)

  const { data } = await request.do()

  const hits = []
  for (const { title, content, category, _additional } of data.Get.Document) {
    hits.push({
      title,
      content,
      similarity: 1 - _additional.distance,
      category
    })
  }
  return hits
}

// Search with a metadata filter
/**
 * nearText query on `Document`, restricted to objects whose `category`
 * equals the given value (limit 5).
 *
 * @param {string} query Free-text concept to search for.
 * @param {string} category Category value the hits must match.
 * @returns {Promise<object[]>} The raw `Document` list from the response.
 */
async function filteredSearch(query, category) {
  const whereFilter = {
    path: ['category'],
    operator: 'Equal',
    valueString: category
  }

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(['title', 'content', '_additional { distance }'])
    .withNearText({ concepts: [query] })
    .withWhere(whereFilter)
    .withLimit(5)

  const { data } = await request.do()
  return data.Get.Document
}

// Hybrid search (vector + keyword)
/**
 * Hybrid query on `Document` with equal weighting of both signals (limit 5).
 *
 * @param {string} query Search text used for both the vector and keyword part.
 * @returns {Promise<object[]>} The raw `Document` list from the response.
 */
async function hybridSearch(query) {
  // alpha 0.5 = 50% vector + 50% keyword
  const hybridOptions = { query: query, alpha: 0.5 }

  const request = client.graphql
    .get()
    .withClassName('Document')
    .withFields(['title', 'content', '_additional { score }'])
    .withHybrid(hybridOptions)
    .withLimit(5)

  const { data } = await request.do()
  return data.Get.Document
}

📦 Milvus 本地部署

7. Milvus Docker 部署

# docker-compose.yml for Milvus (standalone: etcd + MinIO + Milvus)
version: '3.8'

services:
  etcd:
    image: quay.io/coreos/etcd:v3.5.5
    environment:
      ETCD_AUTO_COMPACTION_MODE: revision
      ETCD_AUTO_COMPACTION_RETENTION: '1000'
      ETCD_QUOTA_BACKEND_BYTES: '4294967296'
      ETCD_HEARTBEAT_INTERVAL: '500'
      # etcd names this setting "election timeout" (milliseconds).
      ETCD_ELECTION_TIMEOUT: '2500'
    volumes:
      - etcd_data:/etcd
    # Listen on all interfaces so the milvus container can reach etcd, and
    # store data on the mounted volume.
    command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd

  minio:
    # Pinned release: the curl-based healthcheck below depends on tooling that
    # is not guaranteed to be present in whatever `latest` points to.
    image: minio/minio:RELEASE.2023-03-20T20-16-18Z
    environment:
      # Example credentials only — change them outside local development.
      MINIO_ROOT_USER: minioadmin
      MINIO_ROOT_PASSWORD: minioadmin
    volumes:
      - minio_data:/data
    command: minio server /data
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
      interval: 30s
      timeout: 20s
      retries: 3

  milvus:
    # Pin a release so the example keeps matching the SDK calls shown below.
    image: milvusdb/milvus:v2.3.3
    # The image needs an explicit run mode.
    command: ["milvus", "run", "standalone"]
    depends_on:
      - etcd
      - minio
    environment:
      COMMON_STORAGETYPE: minio
      ETCD_ENDPOINTS: etcd:2379
      MINIO_ADDRESS: minio:9000
      # NOTE(review): Milvus' built-in MinIO credentials are assumed to match
      # the minioadmin/minioadmin pair above — configure them explicitly if the
      # MinIO credentials are changed.
    ports:
      - "19530:19530"
      - "9091:9091"
    volumes:
      - milvus_data:/var/lib/milvus

volumes:
  etcd_data:
  minio_data:
  milvus_data:

8. Milvus 数据操作

import { MilvusClient } from '@zilliz/milvus2-sdk-node'

// Milvus client shared by the helpers below. The address can be overridden
// through MILVUS_ADDRESS; the fallback matches the port published by the
// docker-compose example.
const milvusClient = new MilvusClient({
  address: process.env.MILVUS_ADDRESS ?? 'localhost:19530'
})

// Create the collection
/**
 * Create the `documents` collection and build an IVF_FLAT / L2 index on its
 * `embedding` field.
 *
 * The primary key is flagged on the `id` field itself and every VarChar field
 * declares a `max_length`; both are part of the field schema the server
 * validates when fields are given explicitly.
 *
 * @returns {Promise<void>}
 */
async function createCollection() {
  const collectionName = 'documents'

  const fields = [
    {
      name: 'id',
      description: '文档 ID',
      data_type: 5,  // DataType.Int64
      is_primary_key: true,
      autoID: false  // ids are supplied by insertDocuments
    },
    {
      name: 'embedding',
      description: '文档向量',
      data_type: 101,  // DataType.FloatVector
      type_params: {
        dim: 1536
      }
    },
    {
      name: 'title',
      description: '文档标题',
      data_type: 21,  // DataType.VarChar
      max_length: 512
    },
    {
      name: 'content',
      description: '文档内容',
      data_type: 21,
      max_length: 65535
    },
    {
      name: 'category',
      description: '分类',
      data_type: 21,
      max_length: 64
    }
  ]

  // Create the collection
  await milvusClient.createCollection({
    collection_name: collectionName,
    fields: fields,
    primary_field_name: 'id'
  })

  // Build the vector index
  await milvusClient.createIndex({
    collection_name: collectionName,
    field_name: 'embedding',
    index_type: 'IVF_FLAT',
    metric_type: 'L2',  // Euclidean distance
    params: {
      nlist: 1024
    }
  })

  console.log(`集合 ${collectionName} 已创建并索引`)
}

// Insert data
/**
 * Insert documents into the `documents` collection, using each document's
 * position in the array as its `id`.
 *
 * @param {Array<{embedding, title, content, category}>} documents Rows to insert.
 * @returns {Promise<void>}
 */
async function insertDocuments(documents) {
  const rows = []
  documents.forEach((doc, position) => {
    rows.push({
      id: position,
      embedding: doc.embedding,
      title: doc.title,
      content: doc.content,
      category: doc.category
    })
  })

  const insertResult = await milvusClient.insert({
    collection_name: 'documents',
    fields_data: rows
  })

  console.log(`插入 ${insertResult.insert_cnt} 条记录`)
}

// Search
/**
 * Top-K similarity search (L2, nprobe 10) on the `documents` collection.
 *
 * @param {number[]} queryEmbedding Query vector.
 * @param {number} [topK=5] Maximum number of hits.
 * @returns {Promise<Array<{id, score, title, content, category}>>} Hits for the
 *   single query vector; empty when the response contains none.
 */
async function search(queryEmbedding, topK = 5) {
  const collectionName = 'documents'

  // The collection must be loaded before it can be searched
  await milvusClient.loadCollection({
    collection_name: collectionName
  })

  const response = await milvusClient.search({
    collection_name: collectionName,
    vectors: [queryEmbedding],
    search_params: {
      anns_field: 'embedding',
      topk: topK.toString(),
      metric_type: 'L2',
      params: JSON.stringify({
        nprobe: 10
      })
    },
    output_fields: ['title', 'content', 'category']
  })

  // For a single query vector the hits may arrive as a flat list rather than
  // one list per query vector; accept both shapes.
  const results = response.results ?? []
  const hits = Array.isArray(results[0]) ? results[0] : results

  return hits.map(hit => ({
    id: hit.id,
    // Hits expose the distance as `score`; fall back to `distance` if present.
    score: hit.score ?? hit.distance,
    title: hit.title,
    content: hit.content,
    category: hit.category
  }))
}

// Range search
/**
 * Return documents whose L2 distance to the query vector is within `radius`.
 *
 * The request goes through the regular `search` call with a `radius` search
 * parameter rather than a dedicated client method.
 *
 * @param {number[]} queryEmbedding Query vector.
 * @param {number} [radius=1.0] Distance bound passed as the `radius` parameter.
 * @param {number} [topK=10] Maximum number of hits to return.
 * @returns {Promise<Array<{distance, title, content}>>}
 */
async function rangeSearch(queryEmbedding, radius = 1.0, topK = 10) {
  const collectionName = 'documents'

  await milvusClient.loadCollection({
    collection_name: collectionName
  })

  const response = await milvusClient.search({
    collection_name: collectionName,
    vectors: [queryEmbedding],
    search_params: {
      anns_field: 'embedding',
      topk: topK.toString(),
      metric_type: 'L2',
      params: JSON.stringify({
        nprobe: 10,
        radius: radius
      })
    },
    output_fields: ['title', 'content']
  })

  // Accept both a flat hit list and one list per query vector.
  const results = response.results ?? []
  const hits = Array.isArray(results[0]) ? results[0] : results

  return hits.map(hit => ({
    distance: hit.score ?? hit.distance,
    title: hit.title,
    content: hit.content
  }))
}

// Usage example
/**
 * Demo: create the collection, insert one constant-valued sample document and
 * search with a constant-valued query vector.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const DIMENSION = 1536
  const constantVector = value => Array.from({ length: DIMENSION }, () => value)

  await createCollection()

  // Prepare the documents
  const sampleDocument = {
    title: '深度学习',
    content: '深度学习的基本概念...',
    category: 'AI',
    embedding: constantVector(0.1)
  }
  await insertDocuments([sampleDocument])

  // Search
  const results = await search(constantVector(0.15), 5)
  console.log('搜索结果:', results)
}

🔄 Embedding 管理

9. 多模型 Embedding 实现

import { OpenAIEmbeddings } from 'langchain/embeddings/openai'
import fetch from 'node-fetch'

// OpenAI embedding
/**
 * Embed one text with OpenAI's `text-embedding-3-small` model.
 *
 * @param {string} text Text to embed.
 * @returns {Promise<number[]>} The vector resolved by `embedQuery`.
 */
async function openaiEmbedding(text) {
  const options = {
    openAIApiKey: process.env.OPENAI_API_KEY,
    modelName: 'text-embedding-3-small'  // 1536 dimensions
  }

  return new OpenAIEmbeddings(options).embedQuery(text)
}

// Use a local model (sentence-transformers)
/**
 * Request an embedding from the local embedding service.
 *
 * @param {string} text Text to embed.
 * @returns {Promise<number[]>} The `embedding` array from the JSON response.
 * @throws {Error} When the service answers with a non-2xx status or the
 *   response body has no `embedding` array.
 */
async function localEmbedding(text) {
  const response = await fetch('http://localhost:8000/embed', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ text })
  })

  // Fail loudly instead of trying to parse an error page as an embedding.
  if (!response.ok) {
    throw new Error(`Embedding service responded with HTTP ${response.status}`)
  }

  const data = await response.json()
  if (!Array.isArray(data?.embedding)) {
    throw new Error('Embedding service response has no "embedding" array')
  }
  return data.embedding
}

// Embedding cache
/**
 * In-memory cache of embeddings keyed by `${modelName}:${text}`.
 *
 * Concurrent requests for the same key share one in-flight computation, so a
 * burst of identical lookups triggers a single embedding call. Failed
 * computations are not cached and can be retried.
 */
class EmbeddingCache {
  constructor() {
    // key -> resolved embedding
    this.cache = new Map()
    // key -> promise of an embedding that is still being computed
    this.pending = new Map()
  }

  /**
   * Return the embedding for `text`, computing it at most once per key.
   *
   * @param {string} text Text to embed.
   * @param {string} [modelName='openai'] `'openai'` uses `openaiEmbedding`;
   *   any other value uses `localEmbedding`.
   * @returns {Promise<number[]>}
   * @throws Propagates the error of the underlying embedding call.
   */
  async getEmbedding(text, modelName = 'openai') {
    const key = `${modelName}:${text}`

    if (this.cache.has(key)) {
      console.log('从缓存获取 embedding')
      return this.cache.get(key)
    }

    // Join a computation that is already running for this key.
    if (this.pending.has(key)) {
      return this.pending.get(key)
    }

    // An async wrapper guarantees a promise even if the helper throws early.
    const compute = async () =>
      modelName === 'openai' ? openaiEmbedding(text) : localEmbedding(text)

    const request = compute()
    this.pending.set(key, request)

    try {
      const embedding = await request
      this.cache.set(key, embedding)
      return embedding
    } finally {
      // Always drop the in-flight marker so failures can be retried.
      this.pending.delete(key)
    }
  }

  /** Remove every cached embedding. */
  clearCache() {
    this.cache.clear()
  }
}

// Generate embeddings in batches
/**
 * Embed `texts` batch by batch; the texts inside one batch are embedded
 * concurrently and results keep the input order.
 *
 * @param {string[]} texts Texts to embed.
 * @param {number} [batchSize=100] Number of texts per batch; must be positive.
 * @returns {Promise<number[][]>} One embedding per input text.
 * @throws {RangeError} When `batchSize` is not greater than zero (the loop
 *   would otherwise never advance).
 */
async function batchEmbeddings(texts, batchSize = 100) {
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be a positive number, got ${batchSize}`)
  }

  const embeddings = []

  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize)
    const batchEmbeds = await Promise.all(
      batch.map(text => openaiEmbedding(text))
    )
    embeddings.push(...batchEmbeds)

    console.log(`已处理 ${Math.min(i + batchSize, texts.length)}/${texts.length}`)
  }

  return embeddings
}

🎨 混合搜索实现

10. 向量 + 关键词混合搜索

import Elasticsearch from '@elastic/elasticsearch'
import { Pinecone } from '@pinecone-database/pinecone'

/**
 * Combines Pinecone vector search with Elasticsearch keyword search and merges
 * both result lists with weighted, rank-based scores.
 */
class HybridSearch {
  constructor() {
    this.es = new Elasticsearch.Client({
      node: 'http://localhost:9200'
    })

    this.pinecone = new Pinecone({
      apiKey: process.env.PINECONE_API_KEY
    })
  }

  /**
   * Vector search against the `documents` Pinecone index.
   *
   * @param {number[]} embedding Query vector.
   * @param {number} [topK=5] Number of matches to request.
   * @returns {Promise<Array<{id, score, metadata, source: 'vector'}>>}
   */
  async vectorSearch(embedding, topK = 5) {
    const index = this.pinecone.Index('documents')

    const results = await index.query({
      vector: embedding,
      topK: topK,
      includeMetadata: true
    })

    return (results.matches ?? []).map(m => ({
      id: m.id,
      score: m.score,
      metadata: m.metadata,
      source: 'vector'
    }))
  }

  /**
   * Keyword search on `title` and `content` of the `documents` ES index.
   *
   * @param {string} query Search text.
   * @param {number} [topK=5] Number of hits to request.
   * @returns {Promise<Array<{id, score, metadata, source: 'keyword'}>>}
   */
  async keywordSearch(query, topK = 5) {
    const results = await this.es.search({
      index: 'documents',
      body: {
        query: {
          multi_match: {
            query: query,
            fields: ['title', 'content']
          }
        },
        size: topK
      }
    })

    // Some client versions wrap the payload in `body`, others return it
    // directly; support both.
    const payload = results.body ?? results

    return payload.hits.hits.map(hit => ({
      id: hit._id,
      score: hit._score,
      metadata: hit._source,
      source: 'keyword'
    }))
  }

  /**
   * Hybrid search: run both searches in parallel and fuse them by rank.
   *
   * Each list contributes `(1 - rank / listLength) * weight` per document, so
   * the raw (incomparable) engine scores are never mixed. A document found by
   * both engines keeps the fields of the entry seen first.
   *
   * @param {string} query Keyword query text.
   * @param {number[]} embedding Query vector.
   * @param {{vector?: number, keyword?: number}} [weights] Weight per engine;
   *   a missing entry falls back to 0.6 (vector) / 0.4 (keyword).
   * @returns {Promise<object[]>} Up to 10 merged results, best first, each with
   *   `vectorScore`, `keywordScore` and `combinedScore`.
   */
  async hybridSearch(query, embedding, weights = { vector: 0.6, keyword: 0.4 }) {
    // Fill in missing weights so a partial object cannot produce NaN scores.
    const { vector: vectorWeight = 0.6, keyword: keywordWeight = 0.4 } = weights ?? {}

    const [vectorResults, keywordResults] = await Promise.all([
      this.vectorSearch(embedding),
      this.keywordSearch(query)
    ])

    // Merge both lists by document id
    const merged = new Map()

    const addRanked = (results, scoreField, weight) => {
      results.forEach((result, index) => {
        const rankScore = (1 - index / results.length) * weight
        const existing = merged.get(result.id)
        if (existing) {
          existing[scoreField] = rankScore
        } else {
          merged.set(result.id, {
            ...result,
            vectorScore: 0,
            keywordScore: 0,
            [scoreField]: rankScore
          })
        }
      })
    }

    addRanked(vectorResults, 'vectorScore', vectorWeight)
    addRanked(keywordResults, 'keywordScore', keywordWeight)

    // Sort by combined score
    return Array.from(merged.values())
      .map(result => ({
        ...result,
        combinedScore: result.vectorScore + result.keywordScore
      }))
      .sort((a, b) => b.combinedScore - a.combinedScore)
      .slice(0, 10)
  }
}

// Usage
const hybrid = new HybridSearch()
const queryText = '深度学习'
// The query vector has to be defined before the call; it is produced with the
// OpenAI helper from the embedding section so it matches the indexed vectors.
const embedding = await openaiEmbedding(queryText)
const results = await hybrid.hybridSearch(
  queryText,
  embedding,
  { vector: 0.7, keyword: 0.3 }
)
console.log('混合搜索结果:', results)

🎓 最佳实践

DO ✅

// 1. Pick the similarity metric that fits the data
// Maps each metric name to a short note on when to use it.
const metricComparison = Object.fromEntries([
  ['cosine', '适合文本、处理向量方向'],
  ['l2', '适合图像、要求精确距离'],
  ['ip', '内积,最快但需要规范化向量']
])

// 2. Batch operations to cut down on round trips
/**
 * Upsert `documents` in consecutive batches, one request per batch.
 *
 * @param {object[]} documents Records to upsert.
 * @param {number} [batchSize=100] Records per request; must be positive.
 * @returns {Promise<void>}
 * @throws {RangeError} When `batchSize` is not greater than zero (the loop
 *   would otherwise never advance).
 */
async function efficientBatchInsert(documents, batchSize = 100) {
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be a positive number, got ${batchSize}`)
  }

  for (let i = 0; i < documents.length; i += batchSize) {
    const batch = documents.slice(i, i + batchSize)
    await index.upsert(batch)  // one request per batch
  }
}

// 3. Use metadata filters
await index.query({
  vector: embedding,
  // A query needs an explicit result count.
  topK: 10,
  filter: {
    category: { $eq: 'tech' },
    // Range operators compare numbers, so the bound is a numeric timestamp.
    // NOTE(review): assumes `date` metadata is stored as epoch milliseconds —
    // confirm against the ingestion code.
    date: { $gte: Date.parse('2023-01-01T00:00:00Z') }
  }
})

// 4. Check index statistics regularly to keep performance in view
/**
 * Fetch the current index statistics.
 *
 * Despite its name this function does not rebuild anything: it only calls
 * `describeIndexStats` and hands the result back so the caller can monitor it.
 *
 * @returns {Promise<object>} Whatever `index.describeIndexStats()` resolves to.
 */
async function rebuildIndex() {
  // NOTE(review): the stats are expected to expose record counts and index
  // fullness — confirm the field names against the installed SDK.
  const stats = await index.describeIndexStats()
  return stats
}

DON'T ❌

// 1. Do not use vectors that have not been normalized
// ❌
const vector = [1, 2, 3]  // not normalized

// ✅ Divide every component by the L2 norm
const magnitude = Math.sqrt(
  vector.map(component => component * component).reduce((total, square) => total + square, 0)
)
const normalized = Array.from(vector, component => component / magnitude)

// 2. Do not ignore dimension mismatches
// ❌
// The embedding has 1536 dimensions but the index was created with 768
await index.upsert({ id: '1', values: embedding1536 })

// ✅ Make the dimensions match.
// Preferred: create the index with the model's dimension, or ask the model for
// the target size. Truncating is only meaningful for models designed to be
// shortened, and the shortened vector must be re-normalized to unit length.
const embedding768 = (() => {
  const truncated = embedding1536.slice(0, 768)
  const norm = Math.sqrt(truncated.reduce((sum, v) => sum + v * v, 0))
  // Leave an all-zero vector untouched instead of dividing by zero.
  return norm === 0 ? truncated : truncated.map(v => v / norm)
})()

// 3. Do not request an oversized topK in real-time applications
// ❌ Asks for 1000 matches in a single query
await index.query({ vector, topK: 1000 })

// ✅ Keep the requested range reasonable
await index.query({ vector, topK: 10 })

📚 总结

  • Pinecone: 云服务,开箱即用
  • Weaviate: 灵活部署,支持多模型
  • Milvus: 高性能,适合大规模
  • 混合搜索: 向量+关键词综合检索
  • Embedding 管理: 缓存、批处理、模型选择

掌握向量数据库,构建 AI-native 应用!