AI agent Privacy Boundary Enforcement：PII、敏感字段和租户隔离如何在运行时强制生效

/**
 * Anchored patterns for values that are already isolated as a single field
 * (one form input, one column value). Every pattern must match the whole
 * string, so none of them finds PII embedded inside a longer sentence.
 */
const piiPatterns: Record<string, RegExp> = {
  email: /^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$/,
  phone: /^\+?[1-9]\d{1,14}$/, // E.164 format
  ssn: /^\d{3}-\d{2}-\d{4}$/, // US Social Security number
  credit_card: /^\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}$/,
  // Each octet is limited to 0-255 (leading zeros tolerated), so strings such
  // as "999.999.999.999" are no longer reported as IP addresses.
  ip_address: /^(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)$/,
};

/**
 * Returns the names of all PII patterns that the whole value matches.
 *
 * @param value - A single field value to classify.
 * @returns The matching keys of `piiPatterns`, in declaration order; an empty
 *   array when nothing matches.
 */
function detectPII(value: string): string[] {
  return Object.entries(piiPatterns)
    .filter(([, pattern]) => pattern.test(value))
    .map(([type]) => type);
}

// Usage
console.log(detectPII("john.doe@example.com")); // ["email"]
console.log(detectPII("123-45-6789")); // ["ssn"]

优点：简单快速，适合已知格式的数据。
缺点：无法识别非结构化文本中的 PII（如 "My name is John Doe and I live at 123 Main St"）。

方法二：ML 分类器

适用于非结构化文本（如用户消息、文档、日志）。

实现方式：

预训练模型：使用 spaCy、Stanford NER、AWS Comprehend 等 NER（Named Entity Recognition）模型。
自定义训练：使用标注好的 PII 数据集训练自定义模型。
云服务：使用 Google Cloud DLP、Azure AI Language（原 Text Analytics）、Amazon Comprehend 等云服务的 PII 检测 API；Amazon Macie 则用于扫描 S3 中已存储的对象，而不是对传入文本做实时检测。

示例（使用 spaCy）：

import spacy

# Load the pretrained English pipeline once, at import time
nlp = spacy.load("en_core_web_lg")

def detect_pii_in_text(text: str) -> list[dict]:
    """Find likely PII spans in free-form English text.

    Names, organisations, places and dates come from the pipeline's NER
    component. The NER label set of ``en_core_web_lg`` has no EMAIL or PHONE
    label, so e-mail addresses are picked up from the tokenizer's
    ``like_email`` flag instead. Phone numbers are not detected here; use a
    pattern-based rule for those.

    Args:
        text: The text to scan.

    Returns:
        A list of dicts with ``text``, ``label``, ``start`` and ``end``
        (character offsets into ``text``), ordered by ``start``.
    """
    doc = nlp(text)

    pii_entities = []

    # Statistical NER: keep only the entity types treated as PII here
    for ent in doc.ents:
        if ent.label_ in ("PERSON", "ORG", "GPE", "DATE"):
            pii_entities.append({
                "text": ent.text,
                "label": ent.label_,
                "start": ent.start_char,
                "end": ent.end_char,
            })

    # E-mail addresses: token-level lexical flag, not an NER label
    for token in doc:
        if token.like_email:
            pii_entities.append({
                "text": token.text,
                "label": "EMAIL",
                "start": token.idx,
                "end": token.idx + len(token.text),
            })

    # Keep results in reading order regardless of which pass found them
    pii_entities.sort(key=lambda entity: entity["start"])
    return pii_entities

# Usage
text = "My name is John Doe and my email is john@example.com"
print(detect_pii_in_text(text))
# Output (NER results can vary with the model version):
# [
#   {"text": "John Doe", "label": "PERSON", "start": 11, "end": 19},
#   {"text": "john@example.com", "label": "EMAIL", "start": 36, "end": 52}
# ]

优点：可以识别非结构化文本中的 PII，准确率高。
缺点：计算成本高，延迟较大；需要定期更新模型以适应新的 PII 模式。

方法三：自定义规则引擎

结合正则表达式、关键词列表、上下文分析等多种方法。

示例：

/** Declarative description of a single PII detection rule. */
interface PIIRule {
  id: string;
  name: string;
  type: "regex" | "keyword" | "context" | "ml";
  pattern?: RegExp;
  keywords?: string[];
  context_window?: number; // size of the context window
  confidence_threshold?: number; // confidence threshold for the ML model
  action: "block" | "mask" | "alert" | "log";
}

/**
 * Runs a list of PII rules against a piece of text and collects every hit.
 *
 * Supported rule types are "regex", "keyword" and "ml". Rules of type
 * "context" are accepted but produce no detections, because no evaluator for
 * them is implemented in this class.
 */
class PIIDetector {
  private rules: PIIRule[];

  constructor(rules: PIIRule[]) {
    this.rules = rules;
  }

  /**
   * Evaluates every configured rule against `text`.
   *
   * @param text - The text to scan.
   * @returns One detection per regex match, per keyword found, and per
   *   positive ML result, in rule order.
   */
  detect(text: string): PIIDetection[] {
    const detections: PIIDetection[] = [];
    const lowerText = text.toLowerCase();

    for (const rule of this.rules) {
      switch (rule.type) {
        case "regex": {
          if (rule.pattern) {
            // Scan with a private global copy of the pattern: this reports every
            // occurrence (not just the first) and leaves the caller's RegExp
            // untouched, so a rule declared with the `g` flag does not carry
            // `lastIndex` state from one detect() call to the next.
            const flags = rule.pattern.flags.includes("g")
              ? rule.pattern.flags
              : `${rule.pattern.flags}g`;
            const scanner = new RegExp(rule.pattern.source, flags);
            for (const match of text.matchAll(scanner)) {
              detections.push({
                rule_id: rule.id,
                rule_name: rule.name,
                matched_text: match[0],
                confidence: 1.0,
                action: rule.action,
              });
            }
          }
          break;
        }

        case "keyword": {
          // Case-insensitive substring search, one detection per keyword found
          for (const keyword of rule.keywords ?? []) {
            if (lowerText.includes(keyword.toLowerCase())) {
              detections.push({
                rule_id: rule.id,
                rule_name: rule.name,
                matched_text: keyword,
                confidence: 0.8,
                action: rule.action,
              });
            }
          }
          break;
        }

        case "ml": {
          // `??` keeps an explicit threshold of 0 instead of replacing it with 0.9
          const mlResult = this.callMLModel(text, rule.confidence_threshold ?? 0.9);
          if (mlResult.detected) {
            detections.push({
              rule_id: rule.id,
              rule_name: rule.name,
              matched_text: mlResult.matched_text,
              confidence: mlResult.confidence,
              action: rule.action,
            });
          }
          break;
        }

        case "context":
          // Not evaluated: no context analysis is implemented here.
          break;
      }
    }

    return detections;
  }

  /** Placeholder for the external ML service call; currently always reports no detection. */
  private callMLModel(text: string, threshold: number): MLResult {
    // Call the external ML service
    // ...
    return { detected: false, matched_text: "", confidence: 0 };
  }
}

// Rule configuration.
// The regex rules are intentionally NOT anchored with ^...$: they are applied to
// free-form text, so they must be able to match a value in the middle of a
// sentence. Word boundaries keep the digit patterns from matching inside longer
// digit runs.
const rules: PIIRule[] = [
  {
    id: "email_detection",
    name: "Email Address Detection",
    type: "regex",
    pattern: /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/,
    action: "mask",
  },
  {
    id: "ssn_detection",
    name: "Social Security Number Detection",
    type: "regex",
    pattern: /\b\d{3}-\d{2}-\d{4}\b/,
    action: "block",
  },
  {
    id: "credit_card_detection",
    name: "Credit Card Number Detection",
    type: "regex",
    pattern: /\b\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4}\b/,
    action: "block",
  },
  {
    id: "name_detection",
    name: "Person Name Detection (ML)",
    type: "ml",
    confidence_threshold: 0.9,
    action: "alert",
  },
];

// Usage
const detector = new PIIDetector(rules);
const detections = detector.detect("My email is john@example.com and SSN is 123-45-6789");
console.log(detections);

数据分类标签

为每个数据字段添加分类标签，便于后续的访问控制和脱敏策略。

/** Sensitivity label attached to a data field or resource. */
enum DataClassification {
  PUBLIC = "public",           // public data, no restrictions
  INTERNAL = "internal",       // internal data, employees only
  CONFIDENTIAL = "confidential", // confidential data, access requires authorization
  PII = "pii",                 // personally identifiable information, strictly controlled
  PHI = "phi",                 // protected health information (HIPAA)
  PCI = "pci",                 // payment card information (PCI DSS)
}

/** Classification metadata for one field of a table or record. */
interface DataField {
  field_name: string;
  data_type: string;
  classification: DataClassification;
  pii_types?: string[]; // for PII fields, the specific kinds (e.g. "email", "phone")
  retention_days?: number; // retention period
  encryption_required: boolean; // whether the field must be encrypted
  masking_rule?: string; // masking rule name (e.g. "show_first_3_chars")
}

// Example: field classification for the users table
const userTableFields: DataField[] = [
  {
    field_name: "user_id",
    data_type: "uuid",
    classification: DataClassification.INTERNAL,
    encryption_required: false,
  },
  {
    field_name: "email",
    data_type: "string",
    classification: DataClassification.PII,
    pii_types: ["email"],
    encryption_required: true,
    masking_rule: "show_first_3_chars", // e.g. "joh***@example.com"
  },
  {
    field_name: "phone",
    data_type: "string",
    classification: DataClassification.PII,
    pii_types: ["phone"],
    encryption_required: true,
    masking_rule: "show_last_4_digits", // e.g. "***-***-1234"
  },
  {
    field_name: "date_of_birth",
    data_type: "date",
    classification: DataClassification.PII,
    pii_types: ["date_of_birth"],
    encryption_required: true,
    masking_rule: "show_only_year", // e.g. "1990"
  },
  {
    field_name: "profile_picture_url",
    data_type: "string",
    classification: DataClassification.PUBLIC,
    encryption_required: false,
  },
];

访问控制策略

RBAC（基于角色的访问控制）

根据用户角色授予不同的数据访问权限。

/** Roles that access-control rules can be written against. */
enum Role {
  /** Administrator. */
  ADMIN = "admin",
  /** Customer-support agent. */
  SUPPORT_AGENT = "support_agent",
  /** Data analyst. */
  ANALYST = "analyst",
  /** End customer. */
  CUSTOMER = "customer",
}

/** One RBAC rule: which actions a role may perform on a resource type at a given classification. */
interface AccessControlRule {
  role: Role;
  resource_type: string;
  classification: DataClassification;
  allowed_actions: Array<"read" | "write" | "delete">;
  conditions?: Record<string, any>; // extra conditions (e.g. may only access one's own data)
}

// RBAC rule table for PII-classified "user_profile" resources, one entry per role.
const accessControlRules: AccessControlRule[] = [
  {
    role: Role.ADMIN,
    resource_type: "user_profile",
    classification: DataClassification.PII,
    allowed_actions: ["read", "write", "delete"],
  },
  {
    role: Role.SUPPORT_AGENT,
    resource_type: "user_profile",
    classification: DataClassification.PII,
    allowed_actions: ["read"],
    conditions: {
      mask_fields: ["ssn", "credit_card"], // mask these fields
      tenant_scope: true, // may only access users in the same tenant
    },
  },
  {
    role: Role.ANALYST,
    resource_type: "user_profile",
    classification: DataClassification.PII,
    allowed_actions: ["read"],
    conditions: {
      anonymize: true, // data must be anonymized
      aggregate_only: true, // aggregated data only
    },
  },
  {
    role: Role.CUSTOMER,
    resource_type: "user_profile",
    classification: DataClassification.PII,
    allowed_actions: ["read", "write"],
    conditions: {
      self_only: true, // may only access their own data
    },
  },
];

/**
 * Decides whether a role may perform an action on a resource, using the first
 * rule in `accessControlRules` that matches role, resource type and
 * classification.
 *
 * The `tenant_scope` and `self_only` conditions fail closed: access is granted
 * only when both identifiers being compared are present in `context` and
 * equal. A missing context, or a missing identifier on either side, is a denial.
 *
 * @param userRole - Role of the caller.
 * @param resourceType - Resource type to match against `resource_type`.
 * @param classification - Classification of the requested data.
 * @param action - The requested action.
 * @param context - Identifiers for condition checks: `tenant_id` and
 *   `user_tenant_id` for `tenant_scope`; `user_id` and `resource_owner_id`
 *   for `self_only`.
 * @returns `{ allowed: true }`, or `{ allowed: false, reason }` explaining the denial.
 */
function checkAccess(
  userRole: Role,
  resourceType: string,
  classification: DataClassification,
  action: "read" | "write" | "delete",
  context?: Record<string, any>
): { allowed: boolean; reason?: string } {
  // Two identifiers only count as "the same" when both are actually known.
  // Comparing two missing values (undefined === undefined) must never grant access.
  const knownAndEqual = (left: unknown, right: unknown): boolean =>
    left != null && right != null && left === right;

  const rule = accessControlRules.find(
    r =>
      r.role === userRole &&
      r.resource_type === resourceType &&
      r.classification === classification
  );

  if (!rule) {
    return { allowed: false, reason: "No matching access control rule" };
  }

  if (!rule.allowed_actions.includes(action)) {
    return { allowed: false, reason: `Action '${action}' not allowed for role '${userRole}'` };
  }

  // Extra conditions attached to the rule
  const conditions = rule.conditions;
  if (conditions) {
    if (conditions.tenant_scope && !knownAndEqual(context?.tenant_id, context?.user_tenant_id)) {
      return { allowed: false, reason: "Cross-tenant access denied" };
    }

    if (conditions.self_only && !knownAndEqual(context?.user_id, context?.resource_owner_id)) {
      return { allowed: false, reason: "Self-only access required" };
    }
  }

  return { allowed: true };
}

// Usage
const result = checkAccess(
  Role.SUPPORT_AGENT,
  "user_profile",
  DataClassification.PII,
  "read",
  {
    tenant_id: "tenant_123",
    user_tenant_id: "tenant_456", // a different tenant
  }
);

console.log(result);
// Output: { allowed: false, reason: "Cross-tenant access denied" }

ABAC（基于属性的访问控制）

更细粒度的权限控制，基于多个属性动态决策。

/** An attribute-based policy: an effect plus the conditions it is evaluated against. */
interface ABACPolicy {
  effect: "allow" | "deny";
  conditions: ABACCondition[];
}

/** A single comparison between a named request attribute and a policy value. */
interface ABACCondition {
  attribute: string;
  operator: "equals" | "not_equals" | "in" | "not_in" | "greater_than" | "less_than" | "contains";
  value: any;
}

/**
 * Parses a dotted-quad IPv4 string into an unsigned 32-bit number.
 * Returns undefined when the text is not a valid IPv4 address.
 */
function abacParseIPv4(text: string): number | undefined {
  const octets = text.split(".");
  if (octets.length !== 4) {
    return undefined;
  }
  let address = 0;
  for (const octet of octets) {
    if (!/^\d{1,3}$/.test(octet)) {
      return undefined;
    }
    const value = Number(octet);
    if (value > 255) {
      return undefined;
    }
    address = address * 256 + value;
  }
  return address;
}

/** Returns true when `ip` is an IPv4 address inside the CIDR block `cidr` (e.g. "10.0.0.0/8"). */
function abacIPv4InCidr(ip: string, cidr: string): boolean {
  const slash = cidr.indexOf("/");
  if (slash < 0) {
    return false;
  }
  const prefixText = cidr.slice(slash + 1);
  if (!/^\d{1,2}$/.test(prefixText)) {
    return false;
  }
  const prefix = Number(prefixText);
  if (prefix > 32) {
    return false;
  }
  const address = abacParseIPv4(ip);
  const network = abacParseIPv4(cidr.slice(0, slash));
  if (address === undefined || network === undefined) {
    return false;
  }
  // Plain arithmetic instead of bit shifts: JS bitwise operators are signed 32-bit
  const blockSize = 2 ** (32 - prefix);
  return Math.floor(address / blockSize) === Math.floor(network / blockSize);
}

/**
 * Membership test behind the "in" / "not_in" operators: an exact match against
 * a list entry, or, when both sides are strings, an IPv4 address falling inside
 * a CIDR entry.
 *
 * @throws TypeError when the policy value is not an array.
 */
function abacListIncludes(list: unknown, actual: unknown): boolean {
  if (!Array.isArray(list)) {
    throw new TypeError('ABAC operators "in" and "not_in" require an array value');
  }
  return (
    list.includes(actual) ||
    list.some(
      (entry) =>
        typeof entry === "string" && typeof actual === "string" && abacIPv4InCidr(actual, entry)
    )
  );
}

/**
 * Evaluates a single ABAC policy against a set of request attributes.
 *
 * A policy applies when ALL of its conditions hold. An "allow" policy returns
 * true exactly when it applies; a "deny" policy returns false exactly when it
 * applies and true otherwise. A policy with no conditions always applies.
 *
 * @param policy - The policy to evaluate.
 * @param attributes - Attribute values of the current request, keyed by name.
 * @returns Whether this policy permits the request.
 * @throws TypeError when an "in" / "not_in" condition has a non-array value.
 */
function evaluateABACPolicy(
  policy: ABACPolicy,
  attributes: Record<string, any>
): boolean {
  const applies = policy.conditions.every((condition) => {
    const actualValue = attributes[condition.attribute];

    switch (condition.operator) {
      case "equals":
        return actualValue === condition.value;
      case "not_equals":
        return actualValue !== condition.value;
      case "in":
        return abacListIncludes(condition.value, actualValue);
      case "not_in":
        return !abacListIncludes(condition.value, actualValue);
      case "greater_than":
        return actualValue > condition.value;
      case "less_than":
        return actualValue < condition.value;
      case "contains":
        // Only strings and arrays can contain something; a missing attribute
        // is "not met" rather than undefined
        return (
          (typeof actualValue === "string" || Array.isArray(actualValue)) &&
          actualValue.includes(condition.value)
        );
      default:
        // Unknown operator: the condition is never met
        return false;
    }
  });

  return policy.effect === "allow" ? applies : !applies;
}

// Example policy: allow access to PII data only during working hours, from the
// office network, for the support agent role
const policy: ABACPolicy = {
  effect: "allow",
  conditions: [
    { attribute: "role", operator: "equals", value: "support_agent" },
    // Zero-padded "HH:MM" strings compare correctly as plain strings
    { attribute: "time_of_day", operator: "greater_than", value: "09:00" },
    { attribute: "time_of_day", operator: "less_than", value: "18:00" },
    { attribute: "source_ip", operator: "in", value: ["10.0.0.0/8", "172.16.0.0/12"] },
    { attribute: "data_classification", operator: "equals", value: "pii" },
  ],
};

// Evaluate
const attributes = {
  role: "support_agent",
  time_of_day: "14:30",
  source_ip: "10.0.1.42",
  data_classification: "pii",
};

const allowed = evaluateABACPolicy(policy, attributes);
console.log(allowed); // true

优点：非常灵活，可以基于任意属性组合制定策略。
缺点：策略复杂度高，难以维护和调试。

运行时拦截机制

Middleware（中间件）

在数据访问层部署中间件，自动检查和拦截违规访问。

import { NextFunction, Request, Response } from "express";

/**
 * Maps an HTTP method to the access-control action it represents.
 * Returns undefined for methods that have no mapping.
 */
function resolveRequestedAction(method: string): "read" | "write" | "delete" | undefined {
  switch (method.toUpperCase()) {
    case "GET":
    case "HEAD":
      return "read";
    case "POST":
    case "PUT":
    case "PATCH":
      return "write";
    case "DELETE":
      return "delete";
    default:
      return undefined;
  }
}

/**
 * Express middleware that enforces the privacy boundary on a data request.
 *
 * Flow: reject unauthenticated callers (401) and unmapped HTTP methods (405),
 * look up the resource's metadata, run `checkAccess`, and on denial write an
 * audit record and answer 403. On success it flags the request for masking
 * when the resource declares a masking rule and the action is a read, then
 * calls `next()`. Any error thrown by an awaited helper is forwarded to
 * `next(error)` so it reaches the error handler instead of being lost.
 */
async function privacyBoundaryMiddleware(
  req: Request,
  res: Response,
  next: NextFunction
) {
  try {
    const userRole = req.user?.role;
    if (!userRole) {
      // Without an authenticated role there is nothing to evaluate rules against
      return res.status(401).json({
        error: "Unauthorized",
        message: "Authentication required",
      });
    }

    // Previously every method other than GET/POST (PUT, PATCH, OPTIONS, ...) was treated as "delete"
    const action = resolveRequestedAction(req.method);
    if (!action) {
      return res.status(405).json({
        error: "Method Not Allowed",
        message: "HTTP method not supported for this resource",
      });
    }

    const requestedResource = req.params.resource;

    // Look up the resource's data classification
    const resourceMetadata = await getResourceMetadata(requestedResource);
    // NOTE(review): a resource without metadata is treated as PUBLIC, which is the
    // least restrictive label. Confirm this default is intended.
    const classification = resourceMetadata?.classification || DataClassification.PUBLIC;

    // Check access
    const accessCheck = checkAccess(userRole, requestedResource, classification, action, {
      user_id: req.user?.id,
      tenant_id: req.user?.tenant_id,
      // checkAccess compares tenant_id with user_tenant_id for tenant-scoped rules,
      // so the resource's tenant has to be supplied as well.
      // NOTE(review): assumes the metadata exposes the owning tenant as `tenant_id`. Confirm.
      user_tenant_id: resourceMetadata?.tenant_id,
      resource_owner_id: resourceMetadata?.owner_id,
    });

    if (!accessCheck.allowed) {
      // Write the audit record
      await logPrivacyViolation({
        timestamp: new Date().toISOString(),
        user_id: req.user?.id,
        role: userRole,
        resource: requestedResource,
        action,
        classification,
        reason: accessCheck.reason,
        source_ip: req.ip,
      });

      // Respond 403 Forbidden
      return res.status(403).json({
        error: "Forbidden",
        message: accessCheck.reason,
      });
    }

    // Flag the request so that the response is masked downstream
    if (resourceMetadata?.masking_rule && action === "read") {
      req.shouldMask = true;
      req.maskingRule = resourceMetadata.masking_rule;
    }

    next();
  } catch (error) {
    next(error);
  }
}

// Usage: mount on a path with a named `:resource` segment. The middleware reads
// `req.params.resource`, which a bare "*" wildcard does not populate.
app.use("/api/data/:resource", privacyBoundaryMiddleware);

Proxy（代理）

在数据库或 API 前面部署 Proxy，自动拦截和修改请求。

示例架构：

┌─────────────┐     ┌──────────────┐     ┌─────────────┐
│  AI Agents  │────▶│ Privacy      │────▶│  Database / │
│             │     │ Proxy        │     │  API        │
└─────────────┘     │ (Enforces    │     └─────────────┘
                    │  Boundaries) │
                    └──────────────┘
                           │
                           ▼
                    ┌──────────────┐
                    │  Audit Log   │
                    └──────────────┘

实现方式：

Database Proxy：使用 ProxySQL 等支持查询重写的代理，自动添加 WHERE tenant_id = ? 条件；PgBouncer 这类连接池只负责转发连接、不会改写 SQL，需要配合数据库的 Row-Level Security 才能强制租户过滤。
API Gateway：使用 Kong、Apigee 等，自动检查和脱敏响应数据。
Service Mesh：使用 Istio、Linkerd 等，在微服务间强制实施隐私策略。

Data Masking（数据脱敏）

在返回数据前自动脱敏敏感字段。

/**
 * Masks a sensitive value according to a named rule.
 *
 * Known rules fail closed: when a value is present but does not have the shape
 * the rule expects, the result is the full mask "***" rather than the raw
 * value. `null` / `undefined` are passed through by the partial rules because
 * there is nothing to hide. An unrecognised rule name returns the data unchanged.
 *
 * Rules:
 * - "show_first_3_chars": keeps up to three leading characters (always hiding at
 *   least one). For e-mail addresses the domain after the last "@" is kept.
 * - "show_last_4_digits": keeps the last four digits when there are more than four.
 * - "show_only_year": keeps the year of a "YYYY-MM-DD..." string or a valid Date.
 * - "full_mask": always "***".
 *
 * @param data - The value to mask.
 * @param maskingRule - Name of the rule to apply.
 * @returns The masked value.
 */
function applyDataMasking(data: any, maskingRule: string): any {
  const FULL_MASK = "***";

  // Keep at most 3 leading characters, but never the whole string
  const visiblePrefix = (text: string): string =>
    text.slice(0, Math.min(3, Math.max(text.length - 1, 0)));

  switch (maskingRule) {
    case "show_first_3_chars": {
      // e.g. "john@example.com" → "joh***@example.com"
      if (data == null) {
        return data;
      }
      if (typeof data !== "string") {
        return FULL_MASK;
      }
      // Split on the LAST "@" so the domain is never truncated
      const at = data.lastIndexOf("@");
      if (at >= 0) {
        return `${visiblePrefix(data.slice(0, at))}***${data.slice(at)}`;
      }
      return `${visiblePrefix(data)}***`;
    }

    case "show_last_4_digits": {
      // e.g. "123-456-7890" → "***-***-7890"
      if (data == null) {
        return data;
      }
      if (typeof data !== "string" && typeof data !== "number") {
        return FULL_MASK;
      }
      // Count digits only, so separators never leak into the visible part
      const digits = String(data).replace(/\D/g, "");
      return digits.length > 4 ? "***-***-" + digits.slice(-4) : FULL_MASK;
    }

    case "show_only_year": {
      // e.g. "1990-05-15" → "1990"
      if (data == null) {
        return data;
      }
      if (typeof data === "string") {
        const isoDate = /^(\d{4})-\d{2}-\d{2}/.exec(data);
        return isoDate ? isoDate[1] : FULL_MASK;
      }
      if (data instanceof Date && !Number.isNaN(data.getTime())) {
        // NOTE(review): the year is taken in UTC. Confirm this matches how dates are stored.
        return String(data.getUTCFullYear());
      }
      return FULL_MASK;
    }

    case "full_mask":
      // Hide completely
      return FULL_MASK;

    default:
      // NOTE(review): an unrecognised rule name leaves the data unmasked
      return data;
  }
}

// Usage
const email = "john.doe@example.com";
console.log(applyDataMasking(email, "show_first_3_chars"));
// Output: "joh***@example.com"

const phone = "123-456-7890";
console.log(applyDataMasking(phone, "show_last_4_digits"));
// Output: "***-***-7890"

租户隔离设计

Logical Isolation（逻辑隔离）

所有租户的数据存储在同一个数据库中，通过 tenant_id 字段区分。

优点：

成本低：只需维护一个数据库实例。
易于扩展：添加新租户无需 provisioning 新数据库。
易于备份：只需备份一个数据库。

缺点：

安全风险高：如果代码中漏掉 WHERE tenant_id = ?，会导致跨租户数据泄露。
性能问题：大租户可能影响小租户的性能。
合规困难：某些法规要求物理隔离（如金融、医疗行业）。

实现方式：

-- Every table carries a tenant_id column
CREATE TABLE users (
  id UUID PRIMARY KEY,
  tenant_id UUID NOT NULL,
  email VARCHAR(255) NOT NULL,
  -- other columns...
  FOREIGN KEY (tenant_id) REFERENCES tenants(id)
);

-- Every query must filter on tenant_id
-- NOTE(review): id and tenant_id are declared as UUID above, so 'tenant_123' and
-- 'user_456' are illustrative placeholders rather than valid UUID literals.
SELECT * FROM users WHERE tenant_id = 'tenant_123' AND id = 'user_456';

最佳实践：

Row-Level Security (RLS)：在数据库层面强制实施租户隔离，即使应用代码漏掉 WHERE 条件也不会泄露数据。

-- PostgreSQL RLS example
ALTER TABLE users ENABLE ROW LEVEL SECURITY;
-- ENABLE alone does not apply policies to the table owner; FORCE closes that gap.
-- Superusers and roles with BYPASSRLS still bypass RLS, so the application must
-- connect as an ordinary role.
ALTER TABLE users FORCE ROW LEVEL SECURITY;

CREATE POLICY tenant_isolation_policy ON users
  USING (tenant_id = current_setting('app.current_tenant_id')::uuid);

-- Set the current tenant for this transaction only. SET LOCAL is reverted at
-- COMMIT/ROLLBACK, so the value cannot leak to the next request that reuses a
-- pooled connection. The value must be a valid UUID because the policy casts it.
BEGIN;
SET LOCAL app.current_tenant_id = '11111111-1111-1111-1111-111111111111';

-- Every query in the transaction is now filtered by the policy
SELECT * FROM users WHERE id = '22222222-2222-2222-2222-222222222222';
-- Effectively: SELECT * FROM users
--   WHERE id = '22222222-2222-2222-2222-222222222222'
--     AND tenant_id = '11111111-1111-1111-1111-111111111111'
COMMIT;

Physical Isolation（物理隔离）

每个租户有独立的数据库实例或 schema。

优点：

安全性高：数据在物理上隔离，只要连接路由配置正确，跨租户访问的风险就降到最低。
性能隔离：大租户不会影响小租户。
合规友好：满足严格的合规要求（如金融、医疗）。

缺点：

成本高：需要维护多个数据库实例。
运维复杂：备份、升级、监控都需要针对每个租户单独进行。
扩展困难：添加新租户需要 provisioning 新数据库。

实现方式：

# One dedicated database per tenant
tenants:
  tenant_123:
    database: "prod_tenant_123"
    host: "db-tenant-123.example.com"
  
  tenant_456:
    database: "prod_tenant_456"
    host: "db-tenant-456.example.com"

适用场景：

大型企业客户，愿意为独立数据库支付额外费用。
合规要求严格的行业（金融、医疗、政府）。
对性能和安全性要求极高的场景。

Hybrid（混合隔离）

结合逻辑隔离和物理隔离的优点。

实现方式：

小租户：共享数据库（逻辑隔离），降低成本。
大租户/高价值租户：独立数据库（物理隔离），提高安全性和性能。
敏感数据：无论租户大小，PII/PHI/PCI 数据始终物理隔离存储。

/**
 * Returns a database connection for a tenant according to its isolation mode.
 *
 * Physically isolated tenants get a connection to their own database host.
 * All other tenants get the shared connection with `app.current_tenant_id`
 * set to the tenant id.
 *
 * @param tenantId - Identifier of the tenant.
 * @returns A connection scoped to the tenant.
 * @throws Error when the tenant is unknown, or when the tenant id contains
 *   characters that are not safe to place in the SET statement.
 */
async function getTenantDatabase(tenantId: string): Promise<DatabaseConnection> {
  const tenant = await getTenantMetadata(tenantId);
  if (!tenant) {
    throw new Error("Unknown tenant");
  }

  if (tenant.isolation_mode === "physical") {
    // Physical isolation: dedicated database connection
    return getPhysicalDatabaseConnection(tenant.database_host);
  }

  // Logical isolation: shared connection with the tenant id set on it.
  // The id is interpolated into SQL text, so accept only identifier-like
  // values (letters, digits, "_" and "-", which also covers UUIDs) to rule
  // out SQL injection.
  if (!/^[A-Za-z0-9_-]{1,64}$/.test(tenantId)) {
    throw new Error("Invalid tenant id");
  }

  const conn = getSharedDatabaseConnection();
  // NOTE(review): a plain SET lasts for the whole session. If this connection is
  // pooled and reused, confirm the setting is reset between requests.
  await conn.execute(`SET app.current_tenant_id = '${tenantId}'`);
  return conn;
}

监控与告警

违规访问检测

实时监控并检测异常的访问模式。

/** Describes one privacy-boundary violation record. */
interface PrivacyViolation {
  timestamp: string;
  user_id: string;
  role: string;
  resource: string;
  action: string;
  classification: DataClassification;
  reason: string;
  source_ip: string;
  severity: "low" | "medium" | "high" | "critical";
}

/**
 * Loads recent violations, rates each one and sends an alert for it.
 *
 * Severity: "high" for PII, PHI and PCI data; "critical" for a delete attempted
 * by the admin role; "low" otherwise. A failure to deliver one alert is logged
 * and does not stop the remaining alerts from being sent.
 */
async function detectPrivacyViolations(): Promise<void> {
  const violations = await queryRecentViolations();

  for (const violation of violations) {
    // Rate the severity
    let severity: "low" | "medium" | "high" | "critical" = "low";

    // Payment card data (PCI) is regulated data as well and is rated like PII/PHI
    if (violation.classification === DataClassification.PII ||
        violation.classification === DataClassification.PHI ||
        violation.classification === DataClassification.PCI) {
      severity = "high";
    }

    if (violation.role === "admin" && violation.action === "delete") {
      severity = "critical";
    }

    // Send the alert
    try {
      await sendAlert({
        severity,
        title: `Privacy Boundary Violation Detected`,
        message: `User ${violation.user_id} (${violation.role}) attempted to ${violation.action} ${violation.resource} (${violation.classification})`,
        channel: "#security-alerts",
        pagerduty: severity === "critical",
      });
    } catch (error) {
      console.error("Failed to send privacy violation alert", error);
    }
  }
}

// Run the check once per minute. detectPrivacyViolations returns a promise; a
// rejection is logged here so it cannot surface as an unhandled rejection.
setInterval(() => {
  detectPrivacyViolations().catch((error) => {
    console.error("Privacy violation check failed", error);
  });
}, 60 * 1000);

异常行为分析

使用机器学习检测异常的访问模式。

示例：

突然增加的 PII 访问：某个用户平时每天访问 10 次 PII 数据，今天突然访问了 1,000 次。
非工作时间访问：某个用户通常在 9:00-18:00 访问数据，今天在凌晨 3:00 访问了大量敏感数据。
跨租户访问尝试：某个 support agent 尝试访问多个不同租户的数据。
批量导出：某个分析师导出了超过正常阈值的聚合数据。

实现方式：

import { AnomalyDetector } from "@company/anomaly-detection-sdk";

// Anomaly detector instance used by analyzeAccessPattern; the options below are
// passed to the SDK constructor as written.
const detector = new AnomalyDetector({
  model: "isolation_forest",
  training_period_days: 30,
  sensitivity: 0.95,
});

/**
 * Scores a user's access behaviour over the last 24 hours and raises a
 * critical alert when the anomaly score exceeds 0.9.
 *
 * @param userId - The user whose access logs are analysed.
 * @returns The score object returned by the anomaly detector.
 */
async function analyzeAccessPattern(userId: string): Promise<AnomalyScore> {
  // Access records from the past 24 hours
  const logs = await getAccessLogs(userId, "24h");

  // Feature extraction
  const piiAccessCount = logs.filter(
    entry => entry.classification === DataClassification.PII
  ).length;
  const distinctResources = new Set(logs.map(entry => entry.resource));
  const hoursOfDay = logs.map(entry => new Date(entry.timestamp).getHours());

  // Anomaly detection
  const score = await detector.score({
    total_accesses: logs.length,
    pii_accesses: piiAccessCount,
    unique_resources: distinctResources.size,
    access_hours: hoursOfDay,
    cross_tenant_attempts: countCrossTenantAttempts(logs),
  });

  const highlyAnomalous = score.anomaly_score > 0.9;
  if (highlyAnomalous) {
    // Highly anomalous: alert immediately
    await sendAlert({
      severity: "critical",
      title: "Anomalous Access Pattern Detected",
      message: `User ${userId} shows highly anomalous access pattern (score: ${score.anomaly_score})`,
      channel: "#security-alerts",
      pagerduty: true,
    });
  }

  return score;
}