AI agent Data Lineage Tracking：输入、中间产物和输出之间的数据血缘如何全程可追溯

/**
 * Kinds of nodes that can appear in the lineage graph.
 * The string value of each member is the form stored in a record's `node_type` field.
 */
enum NodeType {
  DATABASE_TABLE = "database_table",
  API_ENDPOINT = "api_endpoint",
  FILE = "file",
  MODEL = "model",
  AGENT = "agent",
  USER_INPUT = "user_input",
  EXTERNAL_SERVICE = "external_service",
  CACHE = "cache",
  QUEUE = "queue",
}

/**
 * A single node in the data lineage graph.
 * Only the identity fields and the two timestamps are required; everything else is optional.
 */
interface DataNode {
  node_id: string;              // Unique identifier (UUID)
  node_type: NodeType;
  name: string;                 // Node name (e.g. "users table", "OpenAI API")
  qualified_name: string;       // Qualified name (e.g. "prod_db.public.users")
  
  // Descriptive metadata
  owner?: string;               // Responsible owner
  description?: string;         // Description
  schema?: Record<string, any>; // Data structure (e.g. column names and types)
  classification?: "public" | "internal" | "confidential" | "pii";
  
  // Technical details
  // NOTE(review): if connection strings are stored here, make sure they carry no credentials.
  location?: string;            // Physical location (e.g. S3 bucket path, database connection string)
  format?: string;              // Data format (e.g. JSON, CSV, Parquet)
  
  // Timestamps (presumably ISO 8601 strings — TODO confirm)
  created_at: string;
  updated_at: string;
  
  // Free-form custom metadata
  metadata?: Record<string, any>;
}

Edge 类型定义

/**
 * Kinds of relationships that an edge between two lineage nodes can express.
 * The string value of each member is the form stored in a record's `edge_type` field.
 */
enum EdgeType {
  READS_FROM = "reads_from",
  WRITES_TO = "writes_to",
  TRANSFORMS_INTO = "transforms_into",
  DERIVED_FROM = "derived_from",
  DEPENDS_ON = "depends_on",
  TRIGGERS = "triggers",
}

/**
 * A directed edge between two lineage nodes, describing one recorded data operation.
 * NOTE(review): which end is "source" for each edge type is not defined here — confirm the
 * convention (data-flow direction vs. reader-to-source) and apply it consistently.
 */
interface DataEdge {
  edge_id: string;              // Unique identifier (UUID)
  source_node_id: string;       // ID of the source node
  target_node_id: string;       // ID of the target node
  edge_type: EdgeType;
  
  // Flow details
  columns?: string[];           // Columns involved (e.g. ["user_id", "email"])
  transformation?: string;      // Description of the transformation logic (e.g. "JOIN users ON user_id")
  filter_condition?: string;    // Filter condition (e.g. "WHERE status = 'active'")
  
  // Execution details
  executed_at: string;          // Execution time
  duration_ms?: number;         // Execution duration in milliseconds
  rows_affected?: number;       // Number of rows affected
  
  // Correlation identifiers
  job_id?: string;              // Associated ETL job ID
  request_id?: string;          // Associated request ID
  session_id?: string;          // Associated session ID
  
  // Free-form custom metadata
  metadata?: Record<string, any>;
}

示例：Agent 决策的数据血缘

┌──────────────┐     ┌──────────────┐     ┌──────────────┐
│  User Input  │────▶│   Agent      │────▶│  OpenAI API  │
│  (query)     │     │(orchestrator)│     │  (GPT-4)     │
└──────────────┘     └──────────────┘     └──────────────┘
                            │                      │
                            │                      ▼
                            │             ┌──────────────┐
                            │             │  Embedding   │
                            │             │  Model       │
                            │             └──────────────┘
                            │                      │
                            ▼                      ▼
                   ┌──────────────┐     ┌──────────────┐
                   │ Vector DB    │◀────│  Query       │
                   │ (retrieval)  │     │  Results     │
                   └──────────────┘     └──────────────┘
                            │
                            ▼
                   ┌──────────────┐
                   │ Final        │
                   │ Response     │
                   └──────────────┘

对应的血缘记录（简化示例，省略了图中的 Embedding Model 和 Query Results 节点）：

{
  "nodes": [
    {
      "node_id": "node_user_input_001",
      "node_type": "user_input",
      "name": "Customer Query",
      "qualified_name": "session:sess_abc123:user_input",
      "classification": "pii",
      "schema": {
        "query": "string",
        "user_id": "string",
        "timestamp": "datetime"
      }
    },
    {
      "node_id": "node_agent_001",
      "node_type": "agent",
      "name": "Customer Support Agent",
      "qualified_name": "agent:customer-support-bot:v2.3.1",
      "owner": "team:ai-platform"
    },
    {
      "node_id": "node_openai_001",
      "node_type": "api_endpoint",
      "name": "OpenAI Chat Completions",
      "qualified_name": "api:openai.com/v1/chat/completions",
      "location": "https://api.openai.com/v1/chat/completions"
    },
    {
      "node_id": "node_vector_db_001",
      "node_type": "database_table",
      "name": "Knowledge Base Embeddings",
      "qualified_name": "prod_db.public.kb_embeddings",
      "classification": "internal",
      "schema": {
        "document_id": "uuid",
        "embedding": "vector(1536)",
        "metadata": "jsonb"
      }
    },
    {
      "node_id": "node_response_001",
      "node_type": "file",
      "name": "Agent Response",
      "qualified_name": "session:sess_abc123:response",
      "format": "json"
    }
  ],
  "edges": [
    {
      "edge_id": "edge_001",
      "source_node_id": "node_user_input_001",
      "target_node_id": "node_agent_001",
      "edge_type": "reads_from",
      "executed_at": "2026-06-15T14:23:45Z",
      "request_id": "req_xyz789"
    },
    {
      "edge_id": "edge_002",
      "source_node_id": "node_agent_001",
      "target_node_id": "node_openai_001",
      "edge_type": "triggers",
      "executed_at": "2026-06-15T14:23:46Z",
      "duration_ms": 1234,
      "request_id": "req_xyz789"
    },
    {
      "edge_id": "edge_003",
      "source_node_id": "node_vector_db_001",
      "target_node_id": "node_agent_001",
      "edge_type": "reads_from",
      "columns": ["document_id", "embedding", "metadata"],
      "filter_condition": "WHERE metadata->>'category' = 'support'",
      "executed_at": "2026-06-15T14:23:45Z",
      "rows_affected": 5,
      "request_id": "req_xyz789"
    },
    {
      "edge_id": "edge_004",
      "source_node_id": "node_openai_001",
      "target_node_id": "node_response_001",
      "edge_type": "writes_to",
      "executed_at": "2026-06-15T14:23:47Z",
      "request_id": "req_xyz789"
    }
  ]
}

存储方案选型

方案一：Graph Database（图数据库）

推荐：Neo4j、Amazon Neptune、Azure Cosmos DB（Gremlin API）

优势：

原生支持图查询：轻松遍历上下游节点，查询性能高。
灵活的模式：可以轻松添加新的节点类型和边类型。
可视化友好：大多数图数据库都提供内置的可视化工具。

劣势：

学习曲线：需要学习图查询语言（如 Cypher、Gremlin）。
运维复杂：图数据库的集群管理和备份恢复相对复杂。

查询示例（Cypher）：

// Find all upstream dependencies of a node (tracing back up to 3 hops).
// A variable-length match yields one row per path, so WITH DISTINCT collapses sources
// that are reachable through more than one path before they are projected and sorted.
MATCH (target:DataNode {node_id: "node_response_001"})<-[:READS_FROM|DERIVED_FROM*1..3]-(source:DataNode)
WITH DISTINCT source
RETURN source.node_id, source.name, source.node_type
ORDER BY source.created_at DESC;

// Find which downstream systems use a given table.
// DISTINCT removes repeats when several READS_FROM edges link the same pair of nodes.
MATCH (table:DataNode {qualified_name: "prod_db.public.users"})-[:READS_FROM]->(downstream:DataNode)
RETURN DISTINCT downstream.node_id, downstream.name, downstream.node_type;

// Find complete flow paths that start from PII-classified data (any edge type, up to 5 hops).
MATCH path = (source:DataNode {classification: "pii"})-[*1..5]->(target:DataNode)
RETURN path
LIMIT 10;

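方案二：Relational Database（关系数据库）

推荐：PostgreSQL、MySQL

做法：用 nodes 和 edges 两张表存储血缘，通过递归查询（如 WITH RECURSIVE）遍历上下游。

优势：

运维成熟：可复用团队已有的数据库基础设施、备份和监控体系。
学习成本低：使用标准 SQL 即可查询，无需引入新的查询语言。

劣势：

多跳遍历性能有限：层级较深或图规模较大时，递归查询的开销明显高于图数据库。

适用场景：血缘规模较小、追溯层级较浅，且不希望引入新存储组件的团队。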

方案三：Lineage-specific Store（专用血缘存储）

推荐：Apache Atlas、DataHub、Amundsen

优势：

开箱即用：专门用于数据血缘管理，提供完整的 UI 和 API。
集成丰富：预置了与 Hive、Spark、Airflow、dbt 等工具的集成。
元数据管理：除了血缘，还提供数据目录、数据质量、数据治理等功能。

劣势：

重量级：需要部署和维护复杂的系统。
定制困难：如果需要特殊的血缘模型，定制成本高。

适用场景：大型企业，已有成熟的数据平台团队，需要统一的数据治理解决方案。

采集机制

方式一：Instrumentation（代码埋点）

在应用程序代码中显式记录数据血缘。

示例：

import { DataLineageTracker } from "@company/data-lineage-sdk";

// Module-level lineage tracker client used by the functions below.
// The API key is read from the LINEAGE_API_KEY environment variable.
const tracker = new DataLineageTracker({
  apiEndpoint: "https://lineage-api.example.com",
  apiKey: process.env.LINEAGE_API_KEY,
});

/**
 * Answers one user query and records the lineage of every step along the way:
 * user input -> agent run <- vector DB, agent run -> OpenAI API -> response.
 *
 * @param userId - ID of the requesting user; stored in the user-input node's metadata.
 * @param query - The user's query text; used for the vector search and sent to the model.
 * @returns The text of the first completion choice, or "" when the model returned no text.
 */
async function processUserQuery(userId: string, query: string): Promise<string> {
  const sessionId = generateSessionId();
  const userInputNodeId = `user_input:${sessionId}`;
  const agentRunNodeId = `agent_run:${sessionId}`;
  const responseNodeId = `response:${sessionId}`;

  // Record the user input node.
  await tracker.recordNode({
    node_id: userInputNodeId,
    node_type: "user_input",
    name: "User Query",
    qualified_name: `session:${sessionId}:user_input`,
    classification: "pii",
    metadata: { user_id: userId },
  });

  // Record the agent run itself, so the edges below never point at a node that does not exist.
  await tracker.recordNode({
    node_id: agentRunNodeId,
    node_type: "agent",
    name: "Agent Run",
    qualified_name: `session:${sessionId}:agent_run`,
    metadata: { session_id: sessionId },
  });

  // Link the user input to the agent run; without this edge the input is unreachable
  // when tracing upstream from the response.
  await tracker.recordEdge({
    source_node_id: userInputNodeId,
    target_node_id: agentRunNodeId,
    edge_type: "reads_from",
    metadata: { session_id: sessionId },
  });

  // Record the read from the vector database.
  const embeddings = await vectorDB.search(query);
  await tracker.recordEdge({
    source_node_id: "node_vector_db_001",
    target_node_id: agentRunNodeId,
    edge_type: "reads_from",
    columns: ["document_id", "embedding", "metadata"],
    rows_affected: embeddings.length,
    metadata: { session_id: sessionId },
  });

  // Call the OpenAI API and measure the real wall-clock duration of the call
  // (a token count is not a latency measurement).
  const llmStartedAt = Date.now();
  const response = await openai.chat.completions.create({
    model: "gpt-4",
    messages: [{ role: "user", content: query }],
  });
  const llmDurationMs = Date.now() - llmStartedAt;
  await tracker.recordEdge({
    source_node_id: agentRunNodeId,
    target_node_id: "node_openai_001",
    edge_type: "triggers",
    duration_ms: llmDurationMs,
    metadata: {
      session_id: sessionId,
      // usage may be absent on the response, so do not dereference it unconditionally.
      tokens_used: response.usage?.total_tokens,
    },
  });

  // Record the final response node and the edge that produced it.
  await tracker.recordNode({
    node_id: responseNodeId,
    node_type: "file",
    name: "Agent Response",
    qualified_name: `session:${sessionId}:response`,
    metadata: { session_id: sessionId },
  });

  await tracker.recordEdge({
    source_node_id: "node_openai_001",
    target_node_id: responseNodeId,
    edge_type: "writes_to",
    metadata: { session_id: sessionId },
  });

  // The first choice or its content can be missing; honour the Promise<string> contract.
  return response.choices[0]?.message?.content ?? "";
}

优点：

精确：可以记录详细的上下文信息（如查询参数、返回结果、耗时）。
灵活：可以根据业务需求自定义记录逻辑。

缺点：

侵入性强：需要修改应用程序代码。
维护成本高：每次代码变更都需要同步更新血缘记录逻辑。

方式二：Proxy（代理拦截）

在数据访问层部署 Proxy，自动拦截并记录所有数据操作。

示例架构：

┌─────────────┐     ┌──────────────┐     ┌─────────────┐
│  AI Agents  │────▶│  Data Proxy  │────▶│  Database / │
│             │     │  (Records    │     │  API        │
└─────────────┘     │   Lineage)   │     └─────────────┘
                    └──────────────┘
                           │
                           ▼
                    ┌──────────────┐
                    │  Lineage     │
                    │  Storage     │
                    └──────────────┘

实现方式：

Database Proxy：使用 ProxySQL 等支持查询日志的数据库代理，记录所有 SQL 查询（PgBouncer 这类连接池代理本身不记录 SQL 内容，需配合数据库侧的审计日志，如 pgAudit）。
API Gateway：使用 Kong、Apigee 等 API Gateway，记录所有 API 调用。
Service Mesh：使用 Istio、Linkerd 等服务网格，记录微服务间的通信。

优点：

无侵入：无需修改应用程序代码。
全面：可以捕获所有经过 Proxy 的数据操作；绕过 Proxy 的直连访问仍会被遗漏。

缺点：

性能开销：Proxy 会增加延迟，需要优化。
上下文缺失：Proxy 只能看到网络层面的信息，缺少业务上下文（如用户 ID、会话 ID）。

方式三：Sidecar（边车容器）

在 Kubernetes 环境中，为每个 Pod 部署一个 Sidecar 容器，负责记录血缘。

示例配置：

apiVersion: apps/v1
kind: Deployment
metadata:
  name: customer-support-agent
spec:
  # apps/v1 Deployments require a selector that matches the pod template labels.
  selector:
    matchLabels:
      app: customer-support-agent
  template:
    metadata:
      labels:
        app: customer-support-agent
    spec:
      containers:
        - name: agent
          image: my-agent:latest
          # The agent writes lineage events to the shared volume, so it must mount it too.
          volumeMounts:
            - name: shared-volume
              mountPath: /var/run/lineage

        # Sidecar that reads events from the shared volume and forwards them to the lineage API.
        - name: lineage-sidecar
          image: company/lineage-sidecar:latest
          env:
            - name: LINEAGE_API_ENDPOINT
              value: "https://lineage-api.example.com"
            - name: LINEAGE_API_KEY
              valueFrom:
                secretKeyRef:
                  name: lineage-secret
                  key: api-key
          volumeMounts:
            - name: shared-volume
              mountPath: /var/run/lineage

      volumes:
        # emptyDir lives as long as the Pod and is shared by both containers.
        - name: shared-volume
          emptyDir: {}

工作原理：

Agent 将血缘事件写入共享卷（如 /var/run/lineage/events.jsonl）。
Sidecar 监听共享卷，读取新事件并发送到 Lineage API。
Sidecar 负责重试、缓冲、批量发送等可靠性保障。

优点：

解耦：Agent 只需写入文件，无需关心网络通信。
可靠：Sidecar 可以处理重试、缓冲、离线缓存等。

缺点：

资源占用：每个 Pod 都需要额外的 Sidecar 容器。
复杂性：需要管理共享卷的生命周期和清理。

查询与分析

上下游追溯

向上追溯（Upstream）：找出数据的来源。

/**
 * Returns the distinct nodes reachable from `nodeId` by following incoming
 * READS_FROM / DERIVED_FROM relationships, newest first by `created_at`.
 *
 * @param nodeId - `node_id` of the node to trace from (bound as a query parameter).
 * @param maxDepth - Maximum number of hops to follow; must be a positive integer.
 * @returns The `properties` of each matching source node.
 * @throws RangeError when `maxDepth` is not a positive integer.
 */
async function getUpstreamLineage(nodeId: string, maxDepth: number = 5): Promise<DataNode[]> {
  // The hop bound is spliced into the query text rather than bound as a parameter,
  // so it must be validated: anything but a positive integer would produce a
  // malformed query or let caller-supplied text into the Cypher statement.
  if (!Number.isInteger(maxDepth) || maxDepth < 1) {
    throw new RangeError(`maxDepth must be a positive integer, got ${String(maxDepth)}`);
  }

  // NOTE(review): only READS_FROM and DERIVED_FROM are traversed; nodes linked solely
  // through WRITES_TO or TRIGGERS edges are not returned — confirm this is intended.
  const query = `
    MATCH (target:DataNode {node_id: $nodeId})<-[:READS_FROM|DERIVED_FROM*1..${maxDepth}]-(source:DataNode)
    RETURN DISTINCT source
    ORDER BY source.created_at DESC
  `;

  const result = await graphDb.run(query, { nodeId });
  return result.records.map((record) => record.get("source").properties);
}

// Usage: trace up to three hops upstream of the final response node and print the names.
const upstreamNodes = await getUpstreamLineage("node_response_001", 3);
const upstreamNames = upstreamNodes.map((node) => node.name);
console.log("Upstream nodes:", upstreamNames);
// Output: ["Customer Query", "Vector DB", "OpenAI API", ...]

向下追溯（Downstream）：找出数据的使用者。

/**
 * Returns the distinct nodes reachable from `nodeId` by following outgoing
 * READS_FROM / WRITES_TO relationships, newest first by `created_at`.
 *
 * @param nodeId - `node_id` of the node to trace from (bound as a query parameter).
 * @param maxDepth - Maximum number of hops to follow; must be a positive integer.
 * @returns The `properties` of each matching target node.
 * @throws RangeError when `maxDepth` is not a positive integer.
 */
async function getDownstreamLineage(nodeId: string, maxDepth: number = 5): Promise<DataNode[]> {
  // The hop bound is spliced into the query text rather than bound as a parameter,
  // so it must be validated: anything but a positive integer would produce a
  // malformed query or let caller-supplied text into the Cypher statement.
  if (!Number.isInteger(maxDepth) || maxDepth < 1) {
    throw new RangeError(`maxDepth must be a positive integer, got ${String(maxDepth)}`);
  }

  const query = `
    MATCH (source:DataNode {node_id: $nodeId})-[:READS_FROM|WRITES_TO*1..${maxDepth}]->(target:DataNode)
    RETURN DISTINCT target
    ORDER BY target.created_at DESC
  `;

  const result = await graphDb.run(query, { nodeId });
  return result.records.map((record) => record.get("target").properties);
}

// Usage: getDownstreamLineage matches on node_id, so pass the node's ID rather than its
// qualified_name ("prod_db.public.users" is a qualified name and would not match a node_id).
// "node_users_table_001" stands for the node_id of the prod_db.public.users table node.
const downstreamNodes = await getDownstreamLineage("node_users_table_001", 3);
console.log("Downstream nodes:", downstreamNodes.map(n => n.name));
// Output: ["Customer Support Agent", "Analytics Dashboard", "ML Model", ...]

影响范围分析

当需要修改某个数据源时，评估影响范围：

/**
 * Summarises what would be affected by a change to the given node.
 *
 * Looks up every downstream node within 10 hops, counts them per node type, picks out
 * production agents and API endpoints as critical systems, and asks
 * `estimateAffectedUsers` for a user-count estimate.
 *
 * @param nodeId - ID of the node that is about to change.
 * @returns Counts, the critical systems (id, name, owner), the user estimate and a fixed
 *          list of recommended follow-up actions.
 */
async function assessImpact(nodeId: string): Promise<ImpactAnalysis> {
  const downstreamNodes = await getDownstreamLineage(nodeId, 10);

  // Tally how many downstream nodes there are of each type.
  const impactByType: Record<string, number> = {};
  downstreamNodes.forEach((node) => {
    const countSoFar = impactByType[node.node_type] || 0;
    impactByType[node.node_type] = countSoFar + 1;
  });

  // Critical systems: production agents and production API endpoints.
  const criticalSystems = downstreamNodes.filter((node) => {
    if (node.metadata?.environment !== "production") {
      return false;
    }
    return node.node_type === "agent" || node.node_type === "api_endpoint";
  });

  const affectedUsers = await estimateAffectedUsers(downstreamNodes);

  const criticalSummaries = criticalSystems.map(({ node_id, name, owner }) => ({
    node_id,
    name,
    owner,
  }));
  const recommendedActions = [
    `Notify owners of ${criticalSystems.length} critical systems`,
    `Schedule regression testing for affected agents and models`,
    `Prepare rollback plan in case of issues`,
  ];

  return {
    total_downstream_nodes: downstreamNodes.length,
    impact_by_type: impactByType,
    critical_systems: criticalSummaries,
    estimated_affected_users: affectedUsers,
    recommended_actions: recommendedActions,
  };
}

// Usage: assessImpact forwards its argument to getDownstreamLineage, which matches on
// node_id — so pass the node's ID, not its qualified_name ("prod_db.public.users").
// "node_users_table_001" stands for the node_id of the prod_db.public.users table node.
const impact = await assessImpact("node_users_table_001");
console.log("Impact analysis:", impact);

血缘可视化

使用 D3.js、Cytoscape.js 或 Graphviz 构建交互式血缘图：

import cytoscape from "cytoscape";

/**
 * Renders an interactive lineage graph for `nodeId` into the `#lineage-graph` element.
 *
 * Fetches up to three hops of upstream and downstream nodes plus the edges between them,
 * draws them with Cytoscape (upstream blue, current red, downstream green) and opens the
 * node details when a node is tapped.
 *
 * @param nodeId - ID of the node the graph is centred on.
 * @throws Error when the `#lineage-graph` container element does not exist.
 */
async function renderLineageGraph(nodeId: string): Promise<void> {
  // Fail loudly instead of handing Cytoscape a null container and rendering nothing.
  const container = document.getElementById("lineage-graph");
  if (!container) {
    throw new Error('Lineage graph container "#lineage-graph" was not found');
  }

  // The two traversals are independent, so run them concurrently.
  const [upstream, downstream] = await Promise.all([
    getUpstreamLineage(nodeId, 3),
    getDownstreamLineage(nodeId, 3),
  ]);

  // Element IDs must be unique. A node can show up on both sides (or be the current node
  // itself) when the graph has cycles, so keep only the first occurrence of each ID.
  const seenIds = new Set<string>([nodeId]);
  const keepFirstOccurrence = (node: { node_id: string }): boolean => {
    if (seenIds.has(node.node_id)) {
      return false;
    }
    seenIds.add(node.node_id);
    return true;
  };
  const upstreamNodes = upstream.filter(keepFirstOccurrence);
  const downstreamNodes = downstream.filter(keepFirstOccurrence);

  const edges = await getEdgesBetweenNodes([...seenIds]);

  // Build the Cytoscape elements: nodes first, then edges.
  const elements = [
    ...upstreamNodes.map((node) => ({
      data: { id: node.node_id, label: node.name, type: node.node_type },
      classes: "upstream",
    })),
    {
      data: { id: nodeId, label: "Current Node", type: "current" },
      classes: "current",
    },
    ...downstreamNodes.map((node) => ({
      data: { id: node.node_id, label: node.name, type: node.node_type },
      classes: "downstream",
    })),
    ...edges.map((edge) => ({
      data: {
        source: edge.source_node_id,
        target: edge.target_node_id,
        label: edge.edge_type,
      },
    })),
  ];

  // Render the graph.
  const cy = cytoscape({
    container,
    elements,
    style: [
      {
        selector: ".upstream",
        style: {
          "background-color": "#3498db",
          "label": "data(label)",
        },
      },
      {
        selector: ".current",
        style: {
          "background-color": "#e74c3c",
          "label": "data(label)",
          "width": 40,
          "height": 40,
        },
      },
      {
        selector: ".downstream",
        style: {
          "background-color": "#2ecc71",
          "label": "data(label)",
        },
      },
      {
        selector: "edge",
        style: {
          "width": 2,
          "line-color": "#95a5a6",
          "target-arrow-color": "#95a5a6",
          "target-arrow-shape": "triangle",
          "label": "data(label)",
          "font-size": 10,
        },
      },
    ],
    // NOTE(review): "dagre" is not one of Cytoscape's built-in layouts; it comes from the
    // cytoscape-dagre extension, which has to be registered with cytoscape.use(...) before
    // this function runs — confirm the host application does so.
    layout: {
      name: "dagre",
      rankDir: "LR",
    },
  });

  // Show the details panel for whichever node the user taps.
  cy.on("tap", "node", (evt) => {
    const node = evt.target;
    showNodeDetails(node.data("id"));
  });
}