From c31504baafb44b7f1bfb142990ed9e47f87bd624 Mon Sep 17 00:00:00 2001 From: DarkSky <25152247+darkskygit@users.noreply.github.com> Date: Mon, 4 Aug 2025 16:19:59 +0800 Subject: [PATCH] fix(server): missing embedding search (#13401) ## Summary by CodeRabbit * **New Features** * Enhanced search functionality to include results from additional "blob" data sources, providing more comprehensive search results. * **Bug Fixes** * Improved messaging to ensure "No results found" is only shown when no relevant results exist across all data sources. * **Tests** * Updated test cases to reflect new keyword contexts, improving validation accuracy for search-related features. --- .../src/__tests__/copilot-provider.spec.ts | 20 ++++++++++++++----- .../src/plugins/copilot/context/service.ts | 16 +++++++++++++-- .../copilot/tools/doc-semantic-search.ts | 5 ++++- 3 files changed, 33 insertions(+), 8 deletions(-) diff --git a/packages/backend/server/src/__tests__/copilot-provider.spec.ts b/packages/backend/server/src/__tests__/copilot-provider.spec.ts index 938ca83aae..67e5069137 100644 --- a/packages/backend/server/src/__tests__/copilot-provider.spec.ts +++ b/packages/backend/server/src/__tests__/copilot-provider.spec.ts @@ -534,10 +534,16 @@ The term **“CRDT”** was first introduced by Marc Shapiro, Nuno Preguiça, Ca 'Chat With AFFiNE AI', 'Search With AFFiNE AI', ], - messages: [{ role: 'user' as const, content: TestAssets.AFFiNE }], + messages: [{ role: 'user' as const, content: TestAssets.SSOT }], verifier: (t: ExecutionContext, result: string) => { assertNotWrappedInCodeBlock(t, result); - t.assert(result.includes('AFFiNE'), 'should include original keyword'); + const cleared = result.toLowerCase(); + t.assert( + cleared.includes('single source of truth') || + /single.*source/.test(cleared) || + cleared.includes('ssot'), + 'should include original keyword' + ); }, type: 'text' as const, }, @@ -595,13 +601,17 @@ The term **“CRDT”** was first introduced by Marc Shapiro, Nuno 
Preguiça, Ca messages: [ { role: 'user' as const, - content: TestAssets.AFFiNE, + content: TestAssets.SSOT, params: { language: 'Simplified Chinese' }, }, ], verifier: (t: ExecutionContext, result: string) => { assertNotWrappedInCodeBlock(t, result); - t.assert(result.includes('AFFiNE'), 'should include keyword'); + const cleared = result.toLowerCase(); + t.assert( + cleared.includes('单一') || cleared.includes('ssot'), + 'explain code result should include keyword' + ); }, type: 'text' as const, }, @@ -623,7 +633,7 @@ The term **“CRDT”** was first introduced by Marc Shapiro, Nuno Preguiça, Ca content.includes('classroom') || content.includes('school') || content.includes('sky'), - 'should include keyword' + 'explain code result should include keyword' ); }, type: 'text' as const, diff --git a/packages/backend/server/src/plugins/copilot/context/service.ts b/packages/backend/server/src/plugins/copilot/context/service.ts index 800baa0c13..3572112e6a 100644 --- a/packages/backend/server/src/plugins/copilot/context/service.ts +++ b/packages/backend/server/src/plugins/copilot/context/service.ts @@ -232,7 +232,7 @@ export class CopilotContextService implements OnApplicationBootstrap { const embedding = await this.embeddingClient.getEmbedding(content, signal); if (!embedding) return []; - const [fileChunks, workspaceChunks, scopedWorkspaceChunks] = + const [fileChunks, blobChunks, workspaceChunks, scopedWorkspaceChunks] = await Promise.all([ this.models.copilotWorkspace.matchFileEmbedding( workspaceId, @@ -240,6 +240,12 @@ export class CopilotContextService implements OnApplicationBootstrap { topK * 2, threshold ), + this.models.copilotWorkspace.matchBlobEmbedding( + workspaceId, + embedding, + topK * 2, + threshold + ), this.models.copilotContext.matchWorkspaceEmbedding( embedding, workspaceId, @@ -259,6 +265,7 @@ export class CopilotContextService implements OnApplicationBootstrap { if ( !fileChunks.length && + !blobChunks.length && !workspaceChunks.length && 
!scopedWorkspaceChunks?.length ) { @@ -267,7 +274,12 @@ export class CopilotContextService implements OnApplicationBootstrap { return await this.embeddingClient.reRank( content, - [...fileChunks, ...workspaceChunks, ...(scopedWorkspaceChunks || [])], + [ + ...fileChunks, + ...blobChunks, + ...workspaceChunks, + ...(scopedWorkspaceChunks || []), + ], topK, signal ); diff --git a/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts b/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts index 9736ba3d8e..5c425e6cd3 100644 --- a/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts +++ b/packages/backend/server/src/plugins/copilot/tools/doc-semantic-search.ts @@ -42,12 +42,14 @@ export const buildDocSearchGetter = ( chunks.filter(c => 'docId' in c), 'Doc.Read' ); + const blobChunks = chunks.filter(c => 'blobId' in c); const fileChunks = chunks.filter(c => 'fileId' in c); if (contextChunks.length) { fileChunks.push(...contextChunks); } - if (!docChunks.length && !fileChunks.length) + if (!blobChunks.length && !docChunks.length && !fileChunks.length) { return `No results found for "${query}".`; + } const docIds = docChunks.map(c => ({ // oxlint-disable-next-line no-non-null-assertion @@ -80,6 +82,7 @@ export const buildDocSearchGetter = ( return [ ...fileChunks.map(clearEmbeddingChunk), + ...blobChunks.map(clearEmbeddingChunk), ...docChunks.map(c => ({ ...c, ...docMetas.get(c.docId),