From 9220b973c7aeeb1363f2bc20c769f29fd04b0a6e Mon Sep 17 00:00:00 2001 From: darkskygit Date: Tue, 27 May 2025 14:28:34 +0000 Subject: [PATCH] feat(server): increase embedding jobs concurrency & handle empty content after trim (#12574) ## Summary by CodeRabbit - **Improvements** - Increased the default concurrency for background tasks, enhancing processing efficiency. - Improved handling of empty or unsupported documents to ensure consistent processing. - Optimized document filtering to exclude certain documents from processing, improving performance. - **Bug Fixes** - Enhanced detection of empty document summaries, reducing errors during processing. --- .docker/selfhost/schema.json | 4 +-- .../models/copilot-workspace.spec.ts | 15 ++++++++ .../server/src/base/job/queue/config.ts | 2 +- .../server/src/models/copilot-workspace.ts | 19 +++++----- .../server/src/plugins/copilot/context/job.ts | 35 +++++++++++++------ .../i18n/src/i18n-completenesses.json | 2 +- 6 files changed, 54 insertions(+), 23 deletions(-) diff --git a/.docker/selfhost/schema.json b/.docker/selfhost/schema.json index 82065c8295..b1e6e3d692 100644 --- a/.docker/selfhost/schema.json +++ b/.docker/selfhost/schema.json @@ -52,14 +52,14 @@ }, "queues.copilot": { "type": "object", - "description": "The config for copilot job queue\n@default {\"concurrency\":5}", + "description": "The config for copilot job queue\n@default {\"concurrency\":10}", "properties": { "concurrency": { "type": "number" } }, "default": { - "concurrency": 5 + "concurrency": 10 } }, "queues.doc": { diff --git a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts index 4e169eb375..f8ba424bf5 100644 --- a/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts +++ b/packages/backend/server/src/__tests__/models/copilot-workspace.spec.ts @@ -199,6 +199,21 @@ test('should insert and search embedding', async t => { ); t.snapshot(afterAddIgnoreDocs.length, 'should not find docs to embed'); } + + { + const docId = `foo$bar`; + await t.context.doc.upsert({ + spaceId: workspace.id, + docId: docId, + blob: Uint8Array.from([1, 2, 3]), + timestamp: Date.now(), + editorId: user.id, + }); + const results = await t.context.copilotWorkspace.findDocsToEmbed( + workspace.id + ); + t.false(results.includes(docId), 'docs containing `$` should be excluded'); + } }); test('should check need to be embedded', async t => { diff --git a/packages/backend/server/src/base/job/queue/config.ts b/packages/backend/server/src/base/job/queue/config.ts index c5bf83309b..c677bb3062 100644 --- a/packages/backend/server/src/base/job/queue/config.ts +++ b/packages/backend/server/src/base/job/queue/config.ts @@ -50,7 +50,7 @@ defineModuleConfig('job', { 'queues.copilot': { desc: 'The config for copilot job queue', default: { - concurrency: 5, + concurrency: 10, }, schema, }, diff --git a/packages/backend/server/src/models/copilot-workspace.ts b/packages/backend/server/src/models/copilot-workspace.ts index 15cb9d761f..7a1d9c9354 100644 --- a/packages/backend/server/src/models/copilot-workspace.ts +++ b/packages/backend/server/src/models/copilot-workspace.ts @@ -42,23 +42,26 @@ export class CopilotWorkspaceConfigModel extends BaseModel { */ @Transactional() async findDocsToEmbed(workspaceId: string): Promise { + const ignoredDocIds = (await this.listIgnoredDocIds(workspaceId)).map( + d => d.docId + ); + const docIds = await this.db.snapshot .findMany({ where: { workspaceId, - embedding: { - none: {}, - }, + AND: [ + { id: { notIn: ignoredDocIds } }, + { id: { not: workspaceId } }, + { id: { not: { contains: '$' } } }, + ], + embedding: { none: {} }, }, select: { id: true }, }) .then(r => r.map(doc => doc.id)); - const skipDocIds = await this.listIgnoredDocIds(workspaceId).then( - r => new Set(r.map(r => r.docId)) - ); - - return docIds.filter(id => !skipDocIds.has(id)); + return docIds; } @Transactional() diff --git a/packages/backend/server/src/plugins/copilot/context/job.ts b/packages/backend/server/src/plugins/copilot/context/job.ts index 05c2e3c883..270f2cf8a2 100644 --- a/packages/backend/server/src/plugins/copilot/context/job.ts +++ b/packages/backend/server/src/plugins/copilot/context/job.ts @@ -4,6 +4,7 @@ import { AFFiNELogger, BlobNotFound, Config, + CopilotContextFileNotSupported, DocNotFound, EventBus, JobQueue, @@ -300,6 +301,19 @@ export class CopilotContextDocJob { return controller.signal; } + private async fulfillEmptyEmbedding(workspaceId: string, docId: string) { + const emptyEmbedding = { + index: 0, + content: '', + embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0), + }; + await this.models.copilotContext.insertWorkspaceEmbedding( + workspaceId, + docId, + [emptyEmbedding] + ); + } + @OnJob('copilot.embedding.docs') async embedPendingDocs({ contextId, @@ -321,7 +335,7 @@ export class CopilotContextDocJob { const fragment = await this.getDocFragment(workspaceId, docId); if (fragment) { // fast fall for empty doc, journal is easily to create a empty doc - if (fragment.summary) { + if (fragment.summary.trim()) { const embeddings = await this.embeddingClient.getFileEmbeddings( new File( [fragment.summary], @@ -340,16 +354,7 @@ export class CopilotContextDocJob { } } else { // for empty doc, insert empty embedding - const emptyEmbedding = { - index: 0, - content: '', - embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0), - }; - await this.models.copilotContext.insertWorkspaceEmbedding( - workspaceId, - docId, - [emptyEmbedding] - ); + await this.fulfillEmptyEmbedding(workspaceId, docId); } } else if (contextId) { throw new DocNotFound({ spaceId: workspaceId, docId }); @@ -362,6 +367,14 @@ export class CopilotContextDocJob { docId, }); } + if ( + error instanceof CopilotContextFileNotSupported && + error.message.includes('no content found') + ) { + // if the doc is empty, we still need to fulfill the embedding + await this.fulfillEmptyEmbedding(workspaceId, docId); + return; + } // passthrough error to job queue throw error; diff --git a/packages/frontend/i18n/src/i18n-completenesses.json b/packages/frontend/i18n/src/i18n-completenesses.json index 82fc0df8b9..65d6343a13 100644 --- a/packages/frontend/i18n/src/i18n-completenesses.json +++ b/packages/frontend/i18n/src/i18n-completenesses.json @@ -14,7 +14,7 @@ "it-IT": 93, "it": 1, "ja": 93, - "ko": 54, + "ko": 53, "pl": 93, "pt-BR": 93, "ru": 93,