From c4cf5799d44f13bc77db1ba19d22d54490cf14f0 Mon Sep 17 00:00:00 2001 From: DarkSky <25152247+darkskygit@users.noreply.github.com> Date: Mon, 21 Jul 2025 18:58:29 +0800 Subject: [PATCH] fix(server): exclude outdated doc id style in embedding count (#13269) fix AI-392 fix AI-393 ## Summary by CodeRabbit * **New Features** * Improved filtering of outdated document ID styles in embedding status reporting, ensuring more accurate counts of embedded documents. * Stricter rate limiting applied to workspace embedding status queries for enhanced system reliability. * **Bug Fixes** * Resolved issues with duplicate or outdated document IDs affecting embedding status totals. --- .../copilot-workspace.spec.ts.md | 16 ++++++ .../copilot-workspace.spec.ts.snap | Bin 606 -> 722 bytes .../models/copilot-workspace.spec.ts | 47 ++++++++++++++++++ .../server/src/models/copilot-workspace.ts | 28 +++++++++-- .../src/plugins/copilot/context/resolver.ts | 5 +- .../src/plugins/copilot/workspace/resolver.ts | 1 + 6 files changed, 89 insertions(+), 8 deletions(-) diff --git a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md index 3159900617..dacd531226 100644 --- a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md +++ b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.md @@ -89,3 +89,19 @@ Generated by [AVA](https://avajs.dev). > should not find docs to embed 0 + +## should filter outdated doc id style in embedding status + +> should include modern doc format + + { + embedded: 0, + total: 1, + } + +> should count docs after filtering outdated + + { + embedded: 1, + total: 1, + } diff --git a/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.snap b/packages/backend/server/src/__tests__/models/__snapshots__/copilot-workspace.spec.ts.snap index c1eb1e4f5a18cc64b9a069c106fb5a030b616c68..5f5bdbc4ee92836651be38df4b6b0d7e2c784d39 100644 GIT binary patch literal 722 zcmV;@0xkVPRzV zDbrmwbluKO7H$+l5Zt?xFG=|#DHWr)75<+=bZbfgMKe?YS(`} zBsLt-WYZ5_9B2_rD+eT$8=1hd7`tfKkE9}D2G&KSg?7ptZ{^4j9sAFRT1%GC>(GYe zhYdiaK=vru%OXmx%g(98&X>ZYgSAZn*am`#U zX7y9b)V4H!tDQQXq9>2s*YnWGFh&kD`p*SJWH!+agf;v|(jfm^J$*pnPbr z7Z(MqVd;KIY-R;HVRG|=_l?c=9lO-=_+59kv?-4B`(MejuDn-74wxe0V%CzgI2;|KSq;j$4v>LLXh@NVnXq(Ib7X(VD8`uZ{ E0MmhC8vpQ6WIg)*>2>qtIu3H}5p z+1#?U)E|oo00000000BcRXvZ>KoEVCZ{2~C)6sy0v_e8OC*UZkD4;?J5EK+pq4n-2 zUSjV?-p$2V&>$fpA<2NF6r-m&NH%zHaCdom0nE5`1_ zsq)%KDm`$aoko(y-0+cN!cKH#J)7BN>?T|&rbosmN-}AM)6VdT3$1sbPNn3YKX0Oo z?4cHjRKx=n9W8d`QdYq^;PvMc(WAxzfO`P;sk2AzM@hnuiHcRf-v{l02((M|pacm3 zx&U=v1n-pa)F_vO!2oo54(nr|8dsdO2Uu- zCgDq2WUL96-dc@TRhMbA)IE>>tfDV!I~ePk2)P+4riayym`b(t?!pS5vI>J*l`uZy z+H5Ya^jf95jGC>fu-_HJ&AR~Z19-TNbYo8XX%5^aO0qZ2`F{oQZ43V%HGTm2S$Aum zQ>Ly_v$xWU^C^0E%YC^R6;M!H9OTuq5n_9^GG>a=bYtP6jZ=wd+(_mWPejO694INZ z87&9W$DHgNdik$xyE6dq0h|E%4&WDnCIPo;s{);x0-YeT!BT>DW**j665-U0=T-}~ ze_dgXSH{=vnp;n5=eZGT{Vu&&$LiJbL=_))yh{C@NyN3OxwZ_COSkq=N2+Mro8?8} sMOe8XGN0vvoYA?(fe*D=%sX#$;O1F(y)r2_;r>&wKc^ZwB02{E0LgbE;s5{u diff --git a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts index 6adf124df1..5ddfc67e2d 100644 --- a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts +++ b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts @@ -306,3 +306,50 @@ test('should check embedding table', async t => { // t.false(ret, 'should return false when embedding table is not available'); // } }); + +test('should filter outdated doc id style in embedding status', async t => { + const docId = randomUUID(); + const outdatedDocId = `${workspace.id}:space:${docId}`; + + await t.context.doc.upsert({ + spaceId: workspace.id, + docId, + blob: Uint8Array.from([1, 2, 3]), + timestamp: Date.now(), + editorId: user.id, + }); + + await t.context.doc.upsert({ + spaceId: workspace.id, + docId: outdatedDocId, + blob: Uint8Array.from([1, 2, 3]), + timestamp: Date.now(), + editorId: user.id, + }); + + { + const status = await t.context.copilotWorkspace.getEmbeddingStatus( + workspace.id + ); + t.snapshot(status, 'should include modern doc format'); + } + + { + await t.context.copilotContext.insertWorkspaceEmbedding( + workspace.id, + docId, + [ + { + index: 0, + content: 'content', + embedding: Array.from({ length: 1024 }, () => 1), + }, + ] + ); + + const status = await t.context.copilotWorkspace.getEmbeddingStatus( + workspace.id + ); + t.snapshot(status, 'should count docs after filtering outdated'); + } +}); diff --git a/packages/backend/server/src/models/copilot-workspace.ts b/packages/backend/server/src/models/copilot-workspace.ts index 2a0fcbc4a4..651fa131e3 100644 --- a/packages/backend/server/src/models/copilot-workspace.ts +++ b/packages/backend/server/src/models/copilot-workspace.ts @@ -152,7 +152,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel { } @Transactional() - async getWorkspaceEmbeddingStatus(workspaceId: string) { + async getEmbeddingStatus(workspaceId: string) { const ignoredDocIds = (await this.listIgnoredDocIds(workspaceId)).map( d => d.docId ); @@ -168,9 +168,13 @@ export class CopilotWorkspaceConfigModel extends BaseModel { }; const [docTotal, docEmbedded, fileTotal, fileEmbedded] = await Promise.all([ - this.db.snapshot.count({ where: snapshotCondition }), - this.db.snapshot.count({ + this.db.snapshot.findMany({ + where: snapshotCondition, + select: { id: true }, + }), + this.db.snapshot.findMany({ where: { ...snapshotCondition, embedding: { some: {} } }, + select: { id: true }, }), this.db.aiWorkspaceFiles.count({ where: { workspaceId } }), this.db.aiWorkspaceFiles.count({ @@ -178,9 +182,23 @@ export class CopilotWorkspaceConfigModel extends BaseModel { }), ]); + const docTotalIds = docTotal.map(d => d.id); + const docTotalSet = new Set(docTotalIds); + const outdatedDocPrefix = `${workspaceId}:space:`; + const duplicateOutdatedDocSet = new Set( + docTotalIds + .filter(id => id.startsWith(outdatedDocPrefix)) + .filter(id => docTotalSet.has(id.slice(outdatedDocPrefix.length))) + ); + return { - total: docTotal + fileTotal, - embedded: docEmbedded + fileEmbedded, + total: + docTotalIds.filter(id => !duplicateOutdatedDocSet.has(id)).length + + fileTotal, + embedded: + docEmbedded + .map(d => d.id) + .filter(id => !duplicateOutdatedDocSet.has(id)).length + fileEmbedded, }; } diff --git a/packages/backend/server/src/plugins/copilot/context/resolver.ts b/packages/backend/server/src/plugins/copilot/context/resolver.ts index 484ecf8ba7..e235025177 100644 --- a/packages/backend/server/src/plugins/copilot/context/resolver.ts +++ b/packages/backend/server/src/plugins/copilot/context/resolver.ts @@ -356,6 +356,7 @@ export class CopilotContextRootResolver { return false; } + @Throttle('strict') @Query(() => ContextWorkspaceEmbeddingStatus, { description: 'query workspace embedding status', }) @@ -372,9 +373,7 @@ export class CopilotContextRootResolver { if (this.context.canEmbedding) { const { total, embedded } = - await this.models.copilotWorkspace.getWorkspaceEmbeddingStatus( - workspaceId - ); + await this.models.copilotWorkspace.getEmbeddingStatus(workspaceId); return { total, embedded }; } diff --git a/packages/backend/server/src/plugins/copilot/workspace/resolver.ts b/packages/backend/server/src/plugins/copilot/workspace/resolver.ts index a216ad947e..9d33c92ab9 100644 --- a/packages/backend/server/src/plugins/copilot/workspace/resolver.ts +++ b/packages/backend/server/src/plugins/copilot/workspace/resolver.ts @@ -103,6 +103,7 @@ export class CopilotWorkspaceEmbeddingConfigResolver { return ignoredDocs; } + @Mutation(() => Number, { name: 'updateWorkspaceEmbeddingIgnoredDocs', complexity: 2,