From 8e374f5517c9ad9533baca4ed9f212c20b22cc1e Mon Sep 17 00:00:00 2001 From: DarkSky <25152247+darkskygit@users.noreply.github.com> Date: Tue, 15 Jul 2025 16:50:48 +0800 Subject: [PATCH] feat(server): skip embedding for deprecated doc ids & empty docs (#13211) fix AI-367 ## Summary by CodeRabbit * **Bug Fixes** * Improved document filtering to exclude settings documents and empty blobs from embedding and status calculations. * Enhanced embedding jobs to skip processing deprecated documents if a newer version exists, ensuring only up-to-date documents are embedded. * **New Features** * Added a mutation to trigger the cron job for generating missing titles. * **Tests** * Added test to verify exclusion of documents with empty content from embedding. * Updated embedding-related tests to toggle embedding state during attachment upload under simulated network conditions. --- .../__tests__/models/copilot-workspace.spec.ts | 15 +++++++++++++++ .../server/src/models/copilot-workspace.ts | 8 ++++++-- .../server/src/plugins/copilot/embedding/job.ts | 13 ++++++++++--- .../server/src/plugins/copilot/resolver.ts | 2 +- packages/backend/server/src/schema.gql | 3 +++ packages/common/graphql/src/schema.ts | 2 ++ .../e2e/settings/embedding.spec.ts | 3 +++ 7 files changed, 40 insertions(+), 6 deletions(-) diff --git a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts index f8ba424bf5..6adf124df1 100644 --- a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts +++ b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts @@ -214,6 +214,21 @@ test('should insert and search embedding', async t => { ); t.false(results.includes(docId), 'docs containing `$` should be excluded'); } + + { + const docId = 'empty_doc'; + await t.context.doc.upsert({ + spaceId: workspace.id, + docId: docId, + blob: Uint8Array.from([0, 0]), + timestamp: Date.now(), + editorId: user.id, + }); + const results = await t.context.copilotWorkspace.findDocsToEmbed( + workspace.id + ); + t.false(results.includes(docId), 'empty documents should be excluded'); + } }); test('should check need to be embedded', async t => { diff --git a/packages/backend/server/src/models/copilot-workspace.ts b/packages/backend/server/src/models/copilot-workspace.ts index 84cb0907ca..2a0fcbc4a4 100644 --- a/packages/backend/server/src/models/copilot-workspace.ts +++ b/packages/backend/server/src/models/copilot-workspace.ts @@ -58,10 +58,12 @@ export class CopilotWorkspaceConfigModel extends BaseModel { ON id.workspace_id = s.workspace_id AND id.doc_id = s.guid WHERE s.workspace_id = ${workspaceId} - AND s.guid != s.workspace_id + AND s.guid <> s.workspace_id AND s.guid NOT LIKE '%$%' + AND s.guid NOT LIKE '%:settings:%' AND e.doc_id IS NULL - AND id.doc_id IS NULL;`; + AND id.doc_id IS NULL + AND s.blob <> E'\\\\x0000';`; return docIds.map(r => r.id); } @@ -160,6 +162,8 @@ export class CopilotWorkspaceConfigModel extends BaseModel { { id: { notIn: ignoredDocIds } }, { id: { not: workspaceId } }, { id: { not: { contains: '$' } } }, + { id: { not: { contains: ':settings:' } } }, + { blob: { not: new Uint8Array([0, 0]) } }, ], }; diff --git a/packages/backend/server/src/plugins/copilot/embedding/job.ts b/packages/backend/server/src/plugins/copilot/embedding/job.ts index 5b1adfbe69..f6c2c5951b 100644 --- a/packages/backend/server/src/plugins/copilot/embedding/job.ts +++ b/packages/backend/server/src/plugins/copilot/embedding/job.ts @@ -337,6 +337,10 @@ export class CopilotEmbeddingJob { const signal = this.getWorkspaceSignal(workspaceId); try { + const hasNewDoc = await this.models.doc.exists( + workspaceId, + docId.split(':space:')[1] || '' + ); const needEmbedding = await this.models.copilotWorkspace.checkDocNeedEmbedded( workspaceId, @@ -352,8 +356,11 @@ export class CopilotEmbeddingJob { ); return; } - const fragment = await this.getDocFragment(workspaceId, docId); - if (fragment) { + // if doc id deprecated, skip embedding and fulfill empty embedding + const fragment = !hasNewDoc + ? await this.getDocFragment(workspaceId, docId) + : undefined; + if (!hasNewDoc && fragment) { // fast fall for empty doc, journal is easily to create a empty doc if (fragment.summary.trim()) { const embeddings = await this.embeddingClient.getFileEmbeddings( @@ -382,7 +389,7 @@ export class CopilotEmbeddingJob { ); await this.fulfillEmptyEmbedding(workspaceId, docId); } - } else if (contextId) { + } else { this.logger.warn( `Doc ${docId} in workspace ${workspaceId} has no fragment, fulfilling empty embedding.` ); diff --git a/packages/backend/server/src/plugins/copilot/resolver.ts b/packages/backend/server/src/plugins/copilot/resolver.ts index 7c46738a15..aa59d7e882 100644 --- a/packages/backend/server/src/plugins/copilot/resolver.ts +++ b/packages/backend/server/src/plugins/copilot/resolver.ts @@ -844,7 +844,7 @@ export class PromptsManagementResolver { private readonly promptService: PromptService ) {} - @Query(() => Boolean, { + @Mutation(() => Boolean, { description: 'Trigger generate missing titles cron job', }) async triggerGenerateTitleCron() { diff --git a/packages/backend/server/src/schema.gql b/packages/backend/server/src/schema.gql index 69a1e89387..f5d5223f4d 100644 --- a/packages/backend/server/src/schema.gql +++ b/packages/backend/server/src/schema.gql @@ -1297,6 +1297,9 @@ type Mutation { setBlob(blob: Upload!, workspaceId: String!): String! submitAudioTranscription(blob: Upload, blobId: String!, blobs: [Upload!], workspaceId: String!): TranscriptionResultType + """Trigger generate missing titles cron job""" + triggerGenerateTitleCron: Boolean! + """update app configuration""" updateAppConfig(updates: [UpdateAppConfigInput!]!): JSONObject! diff --git a/packages/common/graphql/src/schema.ts b/packages/common/graphql/src/schema.ts index 5f7fe418c8..c0acb18c90 100644 --- a/packages/common/graphql/src/schema.ts +++ b/packages/common/graphql/src/schema.ts @@ -1440,6 +1440,8 @@ export interface Mutation { sendVerifyEmail: Scalars['Boolean']['output']; setBlob: Scalars['String']['output']; submitAudioTranscription: Maybe; + /** Trigger generate missing titles cron job */ + triggerGenerateTitleCron: Scalars['Boolean']['output']; /** update app configuration */ updateAppConfig: Scalars['JSONObject']['output']; /** Update a comment content */ diff --git a/tests/affine-cloud-copilot/e2e/settings/embedding.spec.ts b/tests/affine-cloud-copilot/e2e/settings/embedding.spec.ts index b24f1522de..5e52655f97 100644 --- a/tests/affine-cloud-copilot/e2e/settings/embedding.spec.ts +++ b/tests/affine-cloud-copilot/e2e/settings/embedding.spec.ts @@ -182,6 +182,9 @@ test.describe('AISettings/Embedding', () => { uploadThroughput: -1, }); + await utils.settings.disableWorkspaceEmbedding(page); + await utils.settings.enableWorkspaceEmbedding(page); + await utils.settings.waitForFileEmbeddingReadiness(page, 2); await utils.settings.closeSettingsPanel(page);