From 65f679c4f0f4c6ae876d5ae2f396e6ca43cc9f48 Mon Sep 17 00:00:00 2001 From: DarkSky <25152247+darkskygit@users.noreply.github.com> Date: Tue, 12 Aug 2025 09:45:41 +0800 Subject: [PATCH] fix(server): frequent embedding (#13475) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary by CodeRabbit - New Features - Smarter embedding pipeline skips re-embedding when content hasn’t changed; added content sanitization for embeddings and workspace content retrieval. - Bug Fixes - Re-embedding now requires both a document update and the last embedding being older than 10 minutes, reducing unnecessary work. - Refactor - Consolidated embedding preprocessing and moved sanitization utilities into shared models; upserts now refresh stored content. - Tests - Expanded snapshot-based tests covering multiple time/age scenarios for embedding decision logic. --- .../copilot-workspace.spec.ts.md | 22 +++++ .../copilot-workspace.spec.ts.snap | Bin 771 -> 932 bytes .../models/copilot-workspace.spec.ts | 79 +++++++++++++++++- .../server/src/models/common/copilot.ts | 33 ++++++++ .../server/src/models/copilot-context.ts | 21 ++++- .../server/src/models/copilot-workspace.ts | 10 +-- .../src/plugins/copilot/embedding/job.ts | 19 +++++ .../src/plugins/copilot/mcp/provider.ts | 2 +- .../copilot/tools/doc-semantic-search.ts | 7 +- .../server/src/plugins/copilot/utils.ts | 27 ------ 10 files changed, 177 insertions(+), 43 deletions(-) diff --git a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md index 452fd2b959..35683f2e33 100644 --- a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md +++ b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md @@ -101,6 +101,28 @@ Generated by [AVA](https://avajs.dev). 0 +## should check need to be embedded + +> document with no embedding should need embedding + + true + +> document with recent embedding should not need embedding + + false + +> document updated after embedding and older-than-10m should need embedding + + true + +> should not need embedding when only 10-minute window passed without updates + + false + +> should need embedding when doc updated and last embedding older than 10 minutes + + true + ## should filter outdated doc id style in embedding status > should include modern doc format diff --git a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.snap b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.snap index d3d447aed41baaf913726a630af9ed8a8f8a57db..41816bdec66815891ba8fd4d32653d5694fb587a 100644 GIT binary patch literal 932 zcmV;V16%w-RzV5)9@xs~Bi#l1fI+ZKGS0l?Dxhz*1F z`N1zX{vV4700000000BcR?ChXMHD^d$IOrfd7`iZ38{pHkWdEfEFg<4V1=T9!UBn~ zLglJ!cQM^nR@d#=v&aSkLPBEC0-@wDSg>QocfgL1K;?e8r^k*NMndt%U3T4b&pr3m zPiNEA>(lVfGh)jrt!{_J7pak%a%xUe`-M$C%cajx!$N72cJ6#mMjEe!4NfgW;yHZ( z%oxS$lRgd+KVkqOC{X6h9Bm+IuoLP8jVH} z*B}JMbs_FGMjC*vjSe18zaq_|*E}8{A49ynHXGrV=)EHP5W|~dXuQ5}>eIk@Jt_Gv zE512#k-JFy9N~}t3*j5lGqyNey>;GOTU}}0M)%VD$1Q!)L2#a>PL!LIw6ivlq?HXE zURc3bvK2w6N~D@A>vk8{>a|vNIa+S1u)kXY_s0O<1MtDVpnL0}AC!jIg~;Q#SjYbp zfM52+erw zKsih0j0~i5ok+^2WQ=w5%|QCD4E780=~bgWTmpCk;3&QN+_`pSY+&UyZAUrEY(}4*EvWwC8qfC9kJ0DcAV*Z%q&JYW5n;_-wj zbmhBD25v+U`R~%ON4YgpN4c#tuWpr@b=9U|UmEV-60f&r>@K_i%URXW_1M=5=~L;5 zOuRC!tB1^{o4@V)>S0v*8XK>Rj2z3QLF+4dL%aTT@tKj{nzS1a+FSx{s znAXU^XDP)up;j7LanXAB{JPp}++Or?zT5@yd3Q{R3uB_!ZQHv3woiZ|dsb=rU(+SB z*-bhv49UzJ%1dEpKF<_a9E)unA6=Q_#Z|kF}}$ zxse25GNaa$-YYGM)$+&Xqxt`=ZPV)^kMS+de z-jneCe@S>Lj*N<6lS_M}71`y+&UjCvKPvExk_YRE%b~CX%}u}Pi0g?7DLk=+Pj~@A zsY|#R2xAX!uF1JVcUe8I=&;`f!1e0@ZUVSF4|Ht``e9h0KUw@-(j`y z0DhF!>PMuhZT0x2a`I@59zJp(Peuo1(9%Ml%?H7ETYJ__wwf-a8peh?ATF%pp%#H0 za_#rDQpOHu3+Y_~_62)-T(s>mfY$(a0DJ@R6F{8-SJ_+!S|tWrJ9c4liR_bam9pCu++4e|hKP-@*(6x~#j4i1)tnhQSX5V-^ z8`_Kcp&&Ud+=p0aR*S9kJ_Gm);Me^AYaGu0saU*1IxN$7lnN}xPWa!I zVGeUsPbKCmcV0FsePh#0K`t>o_)45F?AU|s{#USco^R!zJ<4A9vPm0ME+}!uB;ttz57qlvdTXZ;~V7CGT>6c)pdS7b(qs zr*vSaAo?!w1*JJgRXJ`PJIDP?wkpZXX_C@KmXd0SeoT78eavk}{~yDU8bcBa001!_ BaV-D< diff --git a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts index e64ac25938..bb15a930de 100644 --- a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts +++ b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts @@ -293,7 +293,10 @@ test('should check need to be embedded', async t => { workspace.id, docId ); - t.true(needsEmbedding, 'document with no embedding should need embedding'); + t.snapshot( + needsEmbedding, + 'document with no embedding should need embedding' + ); } { @@ -313,7 +316,7 @@ test('should check need to be embedded', async t => { workspace.id, docId ); - t.false( + t.snapshot( needsEmbedding, 'document with recent embedding should not need embedding' ); @@ -328,15 +331,83 @@ test('should check need to be embedded', async t => { editorId: user.id, }); + // simulate an old embedding + const oldEmbeddingTime = new Date(Date.now() - 25 * 60 * 1000); + await t.context.db.aiWorkspaceEmbedding.updateMany({ + where: { workspaceId: workspace.id, docId }, + data: { updatedAt: oldEmbeddingTime }, + }); + let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded( workspace.id, docId ); - t.true( + t.snapshot( needsEmbedding, - 'document updated after embedding should need embedding' + 'document updated after embedding and older-than-10m should need embedding' ); } + + { + // only time passed (>10m since last embedding) but no doc updates => should NOT re-embed + const baseNow = Date.now(); + const docId2 = randomUUID(); + const t0 = baseNow - 30 * 60 * 1000; // snapshot updated 30 minutes ago + const t1 = baseNow - 25 * 60 * 1000; // embedding updated 25 minutes ago + + await t.context.doc.upsert({ + spaceId: workspace.id, + docId: docId2, + blob: Uint8Array.from([1, 2, 3]), + timestamp: t0, + editorId: user.id, + }); + + await t.context.copilotContext.insertWorkspaceEmbedding( + workspace.id, + docId2, + [ + { + index: 0, + content: 'content2', + embedding: Array.from({ length: 1024 }, () => 1), + }, + ] + ); + + await t.context.db.aiWorkspaceEmbedding.updateMany({ + where: { workspaceId: workspace.id, docId: docId2 }, + data: { updatedAt: new Date(t1) }, + }); + + let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded( + workspace.id, + docId2 + ); + t.snapshot( + needsEmbedding, + 'should not need embedding when only 10-minute window passed without updates' + ); + + const t2 = baseNow - 5 * 60 * 1000; // doc updated 5 minutes ago + await t.context.doc.upsert({ + spaceId: workspace.id, + docId: docId2, + blob: Uint8Array.from([7, 8, 9]), + timestamp: t2, + editorId: user.id, + }); + + needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded( + workspace.id, + docId2 + ); + t.snapshot( + needsEmbedding, + 'should need embedding when doc updated and last embedding older than 10 minutes' + ); + } + // --- new cases end --- }); test('should check embedding table', async t => { diff --git a/packages/backend/server/src/models/common/copilot.ts b/packages/backend/server/src/models/common/copilot.ts index 184206746e..33408f9c54 100644 --- a/packages/backend/server/src/models/common/copilot.ts +++ b/packages/backend/server/src/models/common/copilot.ts @@ -148,3 +148,36 @@ export type IgnoredDoc = { createdByAvatar: string | undefined; updatedBy: string | undefined; }; + +export const EMBEDDING_DIMENSIONS = 1024; + +const FILTER_PREFIX = [ + 'Title: ', + 'Created at: ', + 'Updated at: ', + 'Created by: ', + 'Updated by: ', +]; + +export function clearEmbeddingContent(content: string): string { + const lines = content.split('\n'); + let maxLines = 5; + while (maxLines > 0 && lines.length > 0) { + if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) { + lines.shift(); + maxLines--; + } else { + // only process consecutive metadata rows + break; + } + } + return lines.join('\n'); +} + +export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity { + if (chunk.content) { + const content = clearEmbeddingContent(chunk.content); + return { ...chunk, content }; + } + return chunk; +} diff --git a/packages/backend/server/src/models/copilot-context.ts b/packages/backend/server/src/models/copilot-context.ts index be9cfc7efd..9b0f3ce0cc 100644 --- a/packages/backend/server/src/models/copilot-context.ts +++ b/packages/backend/server/src/models/copilot-context.ts @@ -6,6 +6,7 @@ import { Prisma } from '@prisma/client'; import { CopilotSessionNotFound } from '../base'; import { BaseModel } from './base'; import { + clearEmbeddingContent, ContextBlob, ContextConfigSchema, ContextDoc, @@ -13,14 +14,13 @@ import { CopilotContext, DocChunkSimilarity, Embedding, + EMBEDDING_DIMENSIONS, FileChunkSimilarity, MinimalContextConfigSchema, } from './common/copilot'; type UpdateCopilotContextInput = Pick; -export const EMBEDDING_DIMENSIONS = 1024; - /** * Copilot Job Model */ @@ -215,8 +215,9 @@ export class CopilotContextModel extends BaseModel { select: { content: true }, orderBy: { chunk: 'asc' }, }); - return file?.map(f => f.content).join('\n'); + return file?.map(f => clearEmbeddingContent(f.content)).join('\n'); } + async insertFileEmbedding( contextId: string, fileId: string, @@ -263,6 +264,19 @@ export class CopilotContextModel extends BaseModel { return similarityChunks.filter(c => Number(c.distance) <= threshold); } + async getWorkspaceContent( + workspaceId: string, + docId: string, + chunk?: number + ): Promise { + const file = await this.db.aiWorkspaceEmbedding.findMany({ + where: { workspaceId, docId, chunk }, + select: { content: true }, + orderBy: { chunk: 'asc' }, + }); + return file?.map(f => clearEmbeddingContent(f.content)).join('\n'); + } + async insertWorkspaceEmbedding( workspaceId: string, docId: string, @@ -287,6 +301,7 @@ export class CopilotContextModel extends BaseModel { VALUES ${values} ON CONFLICT (workspace_id, doc_id, chunk) DO UPDATE SET + content = EXCLUDED.content, embedding = EXCLUDED.embedding, updated_at = excluded.updated_at; `; diff --git a/packages/backend/server/src/models/copilot-workspace.ts b/packages/backend/server/src/models/copilot-workspace.ts index eab737d525..3d18c50f66 100644 --- a/packages/backend/server/src/models/copilot-workspace.ts +++ b/packages/backend/server/src/models/copilot-workspace.ts @@ -242,10 +242,9 @@ export class CopilotWorkspaceConfigModel extends BaseModel { @Transactional() async checkDocNeedEmbedded(workspaceId: string, docId: string) { // NOTE: check if the document needs re-embedding. - // 1. check if there have been any recent updates to the document snapshot and update - // 2. check if the embedding is older than the snapshot and update - // 3. check if the embedding is older than 10 minutes (avoid frequent updates) - // if all conditions are met, re-embedding is required. + // 1. first-time embedding when no embedding exists + // 2. re-embedding only when the doc has updates newer than the last embedding + // AND the last embedding is older than 10 minutes (avoid frequent updates) const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>` SELECT EXISTS ( @@ -280,8 +279,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel { AND e.doc_id = docs.doc_id WHERE e.updated_at IS NULL - OR docs.updated_at > e.updated_at - OR e.updated_at < NOW() - INTERVAL '10 minutes' + OR (docs.updated_at > e.updated_at AND e.updated_at < NOW() - INTERVAL '10 minutes') ) AS needs_embedding; `; diff --git a/packages/backend/server/src/plugins/copilot/embedding/job.ts b/packages/backend/server/src/plugins/copilot/embedding/job.ts index 6f2db734bd..5fd6413501 100644 --- a/packages/backend/server/src/plugins/copilot/embedding/job.ts +++ b/packages/backend/server/src/plugins/copilot/embedding/job.ts @@ -392,6 +392,10 @@ export class CopilotEmbeddingJob { return controller.signal; } + private normalize(s: string) { + return s.replaceAll(/[\p{White_Space}]+/gu, ''); + } + @OnJob('copilot.embedding.docs') async embedPendingDocs({ contextId, @@ -429,6 +433,21 @@ export class CopilotEmbeddingJob { if (!hasNewDoc && fragment) { // fast fall for empty doc, journal is easily to create a empty doc if (fragment.summary.trim()) { + const existsContent = + await this.models.copilotContext.getWorkspaceContent( + workspaceId, + docId + ); + if ( + existsContent && + this.normalize(existsContent) === this.normalize(fragment.summary) + ) { + this.logger.log( + `Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.` + ); + return; + } + const embeddings = await this.embeddingClient.getFileEmbeddings( new File( [fragment.summary], diff --git a/packages/backend/server/src/plugins/copilot/mcp/provider.ts b/packages/backend/server/src/plugins/copilot/mcp/provider.ts index c926ec8146..c731684790 100644 --- a/packages/backend/server/src/plugins/copilot/mcp/provider.ts +++ b/packages/backend/server/src/plugins/copilot/mcp/provider.ts @@ -6,9 +6,9 @@ import z from 'zod'; import { DocReader } from '../../../core/doc'; import { AccessController } from '../../../core/permission'; +import { clearEmbeddingChunk } from '../../../models'; import { IndexerService } from '../../indexer'; import { CopilotContextService } from '../context'; -import { clearEmbeddingChunk } from '../utils'; @Injectable() export class WorkspaceMcpProvider { diff --git a/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts b/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts index 5c425e6cd3..4208214c70 100644 --- a/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts +++ b/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts @@ -3,11 +3,14 @@ import { omit } from 'lodash-es'; import { z } from 'zod'; import type { AccessController } from '../../../core/permission'; -import type { ChunkSimilarity, Models } from '../../../models'; +import { + type ChunkSimilarity, + clearEmbeddingChunk, + type Models, +} from '../../../models'; import type { CopilotContextService } from '../context'; import type { ContextSession } from '../context/session'; import type { CopilotChatOptions } from '../providers'; -import { clearEmbeddingChunk } from '../utils'; import { toolError } from './error'; export const buildDocSearchGetter = ( diff --git a/packages/backend/server/src/plugins/copilot/utils.ts b/packages/backend/server/src/plugins/copilot/utils.ts index 9e8ee77f43..735285a119 100644 --- a/packages/backend/server/src/plugins/copilot/utils.ts +++ b/packages/backend/server/src/plugins/copilot/utils.ts @@ -3,7 +3,6 @@ import { Readable } from 'node:stream'; import type { Request } from 'express'; import { OneMB, readBufferWithLimit } from '../../base'; -import type { ChunkSimilarity } from '../../models'; import type { PromptTools } from './providers'; import type { ToolsConfig } from './types'; @@ -83,29 +82,3 @@ export function getTools( }); return result; } - -const FILTER_PREFIX = [ - 'Title: ', - 'Created at: ', - 'Updated at: ', - 'Created by: ', - 'Updated by: ', -]; - -export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity { - if (chunk.content) { - const lines = chunk.content.split('\n'); - let maxLines = 5; - while (maxLines > 0 && lines.length > 0) { - if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) { - lines.shift(); - maxLines--; - } else { - // only process consecutive metadata rows - break; - } - } - return { ...chunk, content: lines.join('\n') }; - } - return chunk; -}