fix(server): frequent embedding (#13475)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- New Features
- Smarter embedding pipeline skips re-embedding when content hasn’t
changed; added content sanitization for embeddings and workspace content
retrieval.
- Bug Fixes
- Re-embedding now requires both a document update and the last
embedding being older than 10 minutes, reducing unnecessary work.
- Refactor
- Consolidated embedding preprocessing and moved sanitization utilities
into shared models; upserts now refresh stored content.
- Tests
- Expanded snapshot-based tests covering multiple time/age scenarios for
embedding decision logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-08-12 09:45:41 +08:00
committed by GitHub
parent 125564b7d2
commit 65f679c4f0
10 changed files with 177 additions and 43 deletions

View File

@@ -101,6 +101,28 @@ Generated by [AVA](https://avajs.dev).
0
## should check need to be embedded
> document with no embedding should need embedding
true
> document with recent embedding should not need embedding
false
> document updated after embedding and older-than-10m should need embedding
true
> should not need embedding when only 10-minute window passed without updates
false
> should need embedding when doc updated and last embedding older than 10 minutes
true
## should filter outdated doc id style in embedding status
> should include modern doc format

View File

@@ -293,7 +293,10 @@ test('should check need to be embedded', async t => {
workspace.id,
docId
);
t.true(needsEmbedding, 'document with no embedding should need embedding');
t.snapshot(
needsEmbedding,
'document with no embedding should need embedding'
);
}
{
@@ -313,7 +316,7 @@ test('should check need to be embedded', async t => {
workspace.id,
docId
);
t.false(
t.snapshot(
needsEmbedding,
'document with recent embedding should not need embedding'
);
@@ -328,15 +331,83 @@ test('should check need to be embedded', async t => {
editorId: user.id,
});
// simulate an old embedding
const oldEmbeddingTime = new Date(Date.now() - 25 * 60 * 1000);
await t.context.db.aiWorkspaceEmbedding.updateMany({
where: { workspaceId: workspace.id, docId },
data: { updatedAt: oldEmbeddingTime },
});
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId
);
t.true(
t.snapshot(
needsEmbedding,
'document updated after embedding should need embedding'
'document updated after embedding and older-than-10m should need embedding'
);
}
{
// only time passed (>10m since last embedding) but no doc updates => should NOT re-embed
const baseNow = Date.now();
const docId2 = randomUUID();
const t0 = baseNow - 30 * 60 * 1000; // snapshot updated 30 minutes ago
const t1 = baseNow - 25 * 60 * 1000; // embedding updated 25 minutes ago
await t.context.doc.upsert({
spaceId: workspace.id,
docId: docId2,
blob: Uint8Array.from([1, 2, 3]),
timestamp: t0,
editorId: user.id,
});
await t.context.copilotContext.insertWorkspaceEmbedding(
workspace.id,
docId2,
[
{
index: 0,
content: 'content2',
embedding: Array.from({ length: 1024 }, () => 1),
},
]
);
await t.context.db.aiWorkspaceEmbedding.updateMany({
where: { workspaceId: workspace.id, docId: docId2 },
data: { updatedAt: new Date(t1) },
});
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId2
);
t.snapshot(
needsEmbedding,
'should not need embedding when only 10-minute window passed without updates'
);
const t2 = baseNow - 5 * 60 * 1000; // doc updated 5 minutes ago
await t.context.doc.upsert({
spaceId: workspace.id,
docId: docId2,
blob: Uint8Array.from([7, 8, 9]),
timestamp: t2,
editorId: user.id,
});
needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId2
);
t.snapshot(
needsEmbedding,
'should need embedding when doc updated and last embedding older than 10 minutes'
);
}
// --- new cases end ---
});
test('should check embedding table', async t => {

View File

@@ -148,3 +148,36 @@ export type IgnoredDoc = {
createdByAvatar: string | undefined;
updatedBy: string | undefined;
};
export const EMBEDDING_DIMENSIONS = 1024;

// Metadata header prefixes that are stripped from stored embedding content
// before it is compared or returned to callers.
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

/**
 * Remove up to five consecutive leading metadata rows (see FILTER_PREFIX)
 * from embedding content, returning only the real document body.
 */
export function clearEmbeddingContent(content: string): string {
  const lines = content.split('\n');
  // at most 5 leading rows are considered metadata; stop at the first
  // non-metadata row so only a consecutive header run is removed
  const limit = Math.min(lines.length, 5);
  let start = 0;
  while (
    start < limit &&
    FILTER_PREFIX.some(prefix => lines[start].startsWith(prefix))
  ) {
    start += 1;
  }
  return lines.slice(start).join('\n');
}
/**
 * Return a copy of the chunk whose content has had leading metadata rows
 * stripped via clearEmbeddingContent; chunks without content pass through
 * unchanged.
 */
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (!chunk.content) {
    return chunk;
  }
  return { ...chunk, content: clearEmbeddingContent(chunk.content) };
}

View File

@@ -6,6 +6,7 @@ import { Prisma } from '@prisma/client';
import { CopilotSessionNotFound } from '../base';
import { BaseModel } from './base';
import {
clearEmbeddingContent,
ContextBlob,
ContextConfigSchema,
ContextDoc,
@@ -13,14 +14,13 @@ import {
CopilotContext,
DocChunkSimilarity,
Embedding,
EMBEDDING_DIMENSIONS,
FileChunkSimilarity,
MinimalContextConfigSchema,
} from './common/copilot';
type UpdateCopilotContextInput = Pick<CopilotContext, 'config'>;
export const EMBEDDING_DIMENSIONS = 1024;
/**
* Copilot Job Model
*/
@@ -215,8 +215,9 @@ export class CopilotContextModel extends BaseModel {
select: { content: true },
orderBy: { chunk: 'asc' },
});
return file?.map(f => f.content).join('\n');
return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
}
async insertFileEmbedding(
contextId: string,
fileId: string,
@@ -263,6 +264,19 @@ export class CopilotContextModel extends BaseModel {
return similarityChunks.filter(c => Number(c.distance) <= threshold);
}
/**
 * Fetch the stored embedding content for a workspace doc.
 *
 * @param workspaceId - workspace the doc belongs to
 * @param docId - doc whose embedded content is requested
 * @param chunk - optional single chunk index; omitted = all chunks
 * @returns chunks joined with newlines, in ascending chunk order, with
 *   metadata header rows stripped; `undefined` only if findMany yields
 *   no array (an empty result joins to an empty string)
 */
async getWorkspaceContent(
  workspaceId: string,
  docId: string,
  chunk?: number
): Promise<string | undefined> {
  const file = await this.db.aiWorkspaceEmbedding.findMany({
    where: { workspaceId, docId, chunk },
    select: { content: true },
    orderBy: { chunk: 'asc' },
  });
  // sanitize each chunk so callers compare/display only real doc content
  return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
}
async insertWorkspaceEmbedding(
workspaceId: string,
docId: string,
@@ -287,6 +301,7 @@ export class CopilotContextModel extends BaseModel {
VALUES ${values}
ON CONFLICT (workspace_id, doc_id, chunk)
DO UPDATE SET
content = EXCLUDED.content,
embedding = EXCLUDED.embedding,
updated_at = excluded.updated_at;
`;

View File

@@ -242,10 +242,9 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
@Transactional()
async checkDocNeedEmbedded(workspaceId: string, docId: string) {
// NOTE: check if the document needs re-embedding.
// 1. check if there have been any recent updates to the document snapshot and update
// 2. check if the embedding is older than the snapshot and update
// 3. check if the embedding is older than 10 minutes (avoid frequent updates)
// if all conditions are met, re-embedding is required.
// 1. first-time embedding when no embedding exists
// 2. re-embedding only when the doc has updates newer than the last embedding
// AND the last embedding is older than 10 minutes (avoid frequent updates)
const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>`
SELECT
EXISTS (
@@ -280,8 +279,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
AND e.doc_id = docs.doc_id
WHERE
e.updated_at IS NULL
OR docs.updated_at > e.updated_at
OR e.updated_at < NOW() - INTERVAL '10 minutes'
OR (docs.updated_at > e.updated_at AND e.updated_at < NOW() - INTERVAL '10 minutes')
) AS needs_embedding;
`;

View File

@@ -392,6 +392,10 @@ export class CopilotEmbeddingJob {
return controller.signal;
}
// Collapse all Unicode whitespace (\p{White_Space}) to nothing so content
// comparisons ignore formatting-only differences between doc summaries.
private normalize(s: string) {
  return s.replaceAll(/[\p{White_Space}]+/gu, '');
}
@OnJob('copilot.embedding.docs')
async embedPendingDocs({
contextId,
@@ -429,6 +433,21 @@ export class CopilotEmbeddingJob {
if (!hasNewDoc && fragment) {
// fast path for empty docs; the journal feature can easily create an empty doc
if (fragment.summary.trim()) {
const existsContent =
await this.models.copilotContext.getWorkspaceContent(
workspaceId,
docId
);
if (
existsContent &&
this.normalize(existsContent) === this.normalize(fragment.summary)
) {
this.logger.log(
`Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
);
return;
}
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File(
[fragment.summary],

View File

@@ -6,9 +6,9 @@ import z from 'zod';
import { DocReader } from '../../../core/doc';
import { AccessController } from '../../../core/permission';
import { clearEmbeddingChunk } from '../../../models';
import { IndexerService } from '../../indexer';
import { CopilotContextService } from '../context';
import { clearEmbeddingChunk } from '../utils';
@Injectable()
export class WorkspaceMcpProvider {

View File

@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
import { z } from 'zod';
import type { AccessController } from '../../../core/permission';
import type { ChunkSimilarity, Models } from '../../../models';
import {
type ChunkSimilarity,
clearEmbeddingChunk,
type Models,
} from '../../../models';
import type { CopilotContextService } from '../context';
import type { ContextSession } from '../context/session';
import type { CopilotChatOptions } from '../providers';
import { clearEmbeddingChunk } from '../utils';
import { toolError } from './error';
export const buildDocSearchGetter = (

View File

@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';
import type { Request } from 'express';
import { OneMB, readBufferWithLimit } from '../../base';
import type { ChunkSimilarity } from '../../models';
import type { PromptTools } from './providers';
import type { ToolsConfig } from './types';
@@ -83,29 +82,3 @@ export function getTools(
});
return result;
}
// Metadata header prefixes recognized at the start of chunk content.
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];
// Strip up to five consecutive leading metadata rows from a chunk's content,
// returning a copy; chunks without content are returned unchanged.
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
if (chunk.content) {
const lines = chunk.content.split('\n');
let maxLines = 5;
while (maxLines > 0 && lines.length > 0) {
if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) {
lines.shift();
maxLines--;
} else {
// only process consecutive metadata rows
break;
}
}
return { ...chunk, content: lines.join('\n') };
}
return chunk;
}