From 9677bdf50df6a16d5b35169092703beb7fdc08aa Mon Sep 17 00:00:00 2001 From: DarkSky <25152247+darkskygit@users.noreply.github.com> Date: Wed, 6 Aug 2025 16:11:50 +0800 Subject: [PATCH] feat(server): skip cleanup for stale workspace (#13418) fix AI-408 ## Summary by CodeRabbit * **New Features** * Added a new field to workspaces to track the last time embeddings were checked. * Cleanup jobs for workspace embeddings now skip workspaces that haven't changed in over 30 days or have no embeddings, improving efficiency. * Cleanup jobs are now automatically triggered when a workspace is updated. * **Improvements** * Enhanced workspace selection for cleanup and indexing tasks to use more precise filters and batching. --- .../migration.sql | 5 ++ packages/backend/server/schema.prisma | 22 +++++---- .../backend/server/src/models/workspace.ts | 27 +++++------ .../server/src/plugins/copilot/cron.ts | 7 ++- .../src/plugins/copilot/embedding/job.ts | 47 ++++++++++++++++++- .../backend/server/src/plugins/indexer/job.ts | 6 ++- 6 files changed, 83 insertions(+), 31 deletions(-) create mode 100644 packages/backend/server/migrations/20250805074035_workspace_last_check_embeddings/migration.sql diff --git a/packages/backend/server/migrations/20250805074035_workspace_last_check_embeddings/migration.sql b/packages/backend/server/migrations/20250805074035_workspace_last_check_embeddings/migration.sql new file mode 100644 index 0000000000..90c9d5d437 --- /dev/null +++ b/packages/backend/server/migrations/20250805074035_workspace_last_check_embeddings/migration.sql @@ -0,0 +1,5 @@ +-- AlterTable +ALTER TABLE "workspaces" ADD COLUMN "last_check_embeddings" TIMESTAMPTZ(3) NOT NULL DEFAULT '1970-01-01 00:00:00 +00:00'; + +-- CreateIndex +CREATE INDEX "workspaces_last_check_embeddings_idx" ON "workspaces"("last_check_embeddings"); diff --git a/packages/backend/server/schema.prisma b/packages/backend/server/schema.prisma index b61a3f6766..4561eafdd9 100644 --- a/packages/backend/server/schema.prisma +++ b/packages/backend/server/schema.prisma @@ -111,17 +111,18 @@ model VerificationToken { model Workspace { // NOTE: manually set this column type to identity in migration file - sid Int @unique @default(autoincrement()) - id String @id @default(uuid()) @db.VarChar - public Boolean - createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3) + sid Int @unique @default(autoincrement()) + id String @id @default(uuid()) @db.VarChar + public Boolean + createdAt DateTime @default(now()) @map("created_at") @db.Timestamptz(3) // workspace level feature flags - enableAi Boolean @default(true) @map("enable_ai") - enableUrlPreview Boolean @default(false) @map("enable_url_preview") - enableDocEmbedding Boolean @default(true) @map("enable_doc_embedding") - name String? @db.VarChar - avatarKey String? @map("avatar_key") @db.VarChar - indexed Boolean @default(false) + enableAi Boolean @default(true) @map("enable_ai") + enableUrlPreview Boolean @default(false) @map("enable_url_preview") + enableDocEmbedding Boolean @default(true) @map("enable_doc_embedding") + name String? @db.VarChar + avatarKey String? @map("avatar_key") @db.VarChar + indexed Boolean @default(false) + lastCheckEmbeddings DateTime @default("1970-01-01T00:00:00-00:00") @map("last_check_embeddings") @db.Timestamptz(3) features WorkspaceFeature[] docs WorkspaceDoc[] @@ -133,6 +134,7 @@ model Workspace { comments Comment[] commentAttachments CommentAttachment[] + @@index([lastCheckEmbeddings]) @@map("workspaces") } diff --git a/packages/backend/server/src/models/workspace.ts b/packages/backend/server/src/models/workspace.ts index 5d86b388ee..8d871d03c4 100644 --- a/packages/backend/server/src/models/workspace.ts +++ b/packages/backend/server/src/models/workspace.ts @@ -24,6 +24,7 @@ export type UpdateWorkspaceInput = Pick< | 'name' | 'avatarKey' | 'indexed' + | 'lastCheckEmbeddings' >; @Injectable() @@ -49,7 +50,11 @@ export class WorkspaceModel extends BaseModel { /** * Update the workspace with the given data. */ - async update(workspaceId: string, data: UpdateWorkspaceInput) { + async update( + workspaceId: string, + data: UpdateWorkspaceInput, + notifyUpdate = true + ) { const workspace = await this.db.workspace.update({ where: { id: workspaceId, @@ -60,7 +65,9 @@ export class WorkspaceModel extends BaseModel { `Updated workspace ${workspaceId} with data ${JSON.stringify(data)}` ); - this.event.emit('workspace.updated', workspace); + if (notifyUpdate) { + this.event.emit('workspace.updated', workspace); + } return workspace; } @@ -81,25 +88,15 @@ export class WorkspaceModel extends BaseModel { }); } - async listAfterSid(sid: number, limit: number) { - return await this.db.workspace.findMany({ - where: { - sid: { gt: sid }, - }, - take: limit, - orderBy: { - sid: 'asc', - }, - }); - } - async list( where: Prisma.WorkspaceWhereInput = {}, - select?: S + select?: S, + limit?: number ) { return (await this.db.workspace.findMany({ where, select, + take: limit, orderBy: { sid: 'asc', }, diff --git a/packages/backend/server/src/plugins/copilot/cron.ts b/packages/backend/server/src/plugins/copilot/cron.ts index a694f8c97b..656e126ab1 100644 --- a/packages/backend/server/src/plugins/copilot/cron.ts +++ b/packages/backend/server/src/plugins/copilot/cron.ts @@ -93,8 +93,11 @@ export class CopilotCronJobs { params: Jobs['copilot.workspace.cleanupTrashedDocEmbeddings'] ) { const nextSid = params.nextSid ?? 0; - let workspaces = await this.models.workspace.listAfterSid( - nextSid, + // only consider workspaces that cleared their embeddings more than 24 hours ago + const oneDayAgo = new Date(Date.now() - OneDay); + const workspaces = await this.models.workspace.list( + { sid: { gt: nextSid }, lastCheckEmbeddings: { lt: oneDayAgo } }, + { id: true, sid: true }, CLEANUP_EMBEDDING_JOB_BATCH_SIZE ); if (!workspaces.length) { diff --git a/packages/backend/server/src/plugins/copilot/embedding/job.ts b/packages/backend/server/src/plugins/copilot/embedding/job.ts index c40e100744..77875a9e88 100644 --- a/packages/backend/server/src/plugins/copilot/embedding/job.ts +++ b/packages/backend/server/src/plugins/copilot/embedding/job.ts @@ -8,6 +8,7 @@ import { EventBus, JobQueue, mapAnyError, + OneDay, OnEvent, OnJob, } from '../../../base'; @@ -511,6 +512,7 @@ export class CopilotEmbeddingJob { return; } + const oneMonthAgo = new Date(Date.now() - OneDay * 30); const snapshot = await this.models.doc.getSnapshot( workspaceId, workspaceId @@ -518,11 +520,37 @@ export class CopilotEmbeddingJob { if (!snapshot) { this.logger.warn(`workspace snapshot ${workspaceId} not found`); return; + } else if ( + // always check if never cleared + workspace.lastCheckEmbeddings > new Date(0) && + snapshot.updatedAt < oneMonthAgo + ) { + this.logger.verbose( + `workspace ${workspaceId} is too old, skipping embeddings cleanup` + ); + await this.models.workspace.update( + workspaceId, + { lastCheckEmbeddings: new Date() }, + false + ); + return; + } + + const docIdsInEmbedding = + await this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId); + if (!docIdsInEmbedding.length) { + this.logger.verbose( + `No doc embeddings found in workspace ${workspaceId}, skipping cleanup` + ); + await this.models.workspace.update( + workspaceId, + { lastCheckEmbeddings: new Date() }, + false + ); + return; } const docIdsInWorkspace = readAllDocIdsFromWorkspaceSnapshot(snapshot.blob); - const docIdsInEmbedding = - await this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId); const docIdsInWorkspaceSet = new Set(docIdsInWorkspace); const deletedDocIds = docIdsInEmbedding.filter( @@ -534,5 +562,20 @@ export class CopilotEmbeddingJob { docId ); } + + await this.models.workspace.update( + workspaceId, + { lastCheckEmbeddings: new Date() }, + false + ); + } + + @OnEvent('workspace.updated') + async onWorkspaceUpdated({ id }: Events['workspace.updated']) { + if (!this.supportEmbedding) return; + + await this.queue.add('copilot.embedding.cleanupTrashedDocEmbeddings', { + workspaceId: id, + }); } } diff --git a/packages/backend/server/src/plugins/indexer/job.ts b/packages/backend/server/src/plugins/indexer/job.ts index f68b2c590b..ff21ecba2c 100644 --- a/packages/backend/server/src/plugins/indexer/job.ts +++ b/packages/backend/server/src/plugins/indexer/job.ts @@ -157,10 +157,12 @@ export class IndexerJob { } const startSid = payload.lastIndexedWorkspaceSid ?? 0; - const workspaces = await this.models.workspace.listAfterSid( - startSid, + const workspaces = await this.models.workspace.list( + { sid: { gt: startSid } }, + { id: true, indexed: true, sid: true }, this.config.indexer.autoIndex.batchSize ); + if (workspaces.length === 0) { // Keep the current sid value when repeating return JOB_SIGNAL.Repeat;