fix(server): frequent embedding (#13475)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- New Features
- Smarter embedding pipeline skips re-embedding when content hasn’t
changed; added content sanitization for embeddings and workspace content
retrieval.
- Bug Fixes
- Re-embedding now requires both a document update and the last
embedding being older than 10 minutes, reducing unnecessary work.
- Refactor
- Consolidated embedding preprocessing and moved sanitization utilities
into shared models; upserts now refresh stored content.
- Tests
- Expanded snapshot-based tests covering multiple time/age scenarios for
embedding decision logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-08-12 09:45:41 +08:00
committed by GitHub
parent 125564b7d2
commit 65f679c4f0
10 changed files with 177 additions and 43 deletions

View File

@@ -101,6 +101,28 @@ Generated by [AVA](https://avajs.dev).
0
## should check need to be embedded
> document with no embedding should need embedding
true
> document with recent embedding should not need embedding
false
> document updated after embedding and older-than-10m should need embedding
true
> should not need embedding when only 10-minute window passed without updates
false
> should need embedding when doc updated and last embedding older than 10 minutes
true
## should filter outdated doc id style in embedding status
> should include modern doc format

View File

@@ -293,7 +293,10 @@ test('should check need to be embedded', async t => {
workspace.id,
docId
);
t.true(needsEmbedding, 'document with no embedding should need embedding');
t.snapshot(
needsEmbedding,
'document with no embedding should need embedding'
);
}
{
@@ -313,7 +316,7 @@ test('should check need to be embedded', async t => {
workspace.id,
docId
);
t.false(
t.snapshot(
needsEmbedding,
'document with recent embedding should not need embedding'
);
@@ -328,15 +331,83 @@ test('should check need to be embedded', async t => {
editorId: user.id,
});
// simulate an old embedding
const oldEmbeddingTime = new Date(Date.now() - 25 * 60 * 1000);
await t.context.db.aiWorkspaceEmbedding.updateMany({
where: { workspaceId: workspace.id, docId },
data: { updatedAt: oldEmbeddingTime },
});
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId
);
t.true(
t.snapshot(
needsEmbedding,
'document updated after embedding should need embedding'
'document updated after embedding and older-than-10m should need embedding'
);
}
{
// only time passed (>10m since last embedding) but no doc updates => should NOT re-embed
const baseNow = Date.now();
const docId2 = randomUUID();
const t0 = baseNow - 30 * 60 * 1000; // snapshot updated 30 minutes ago
const t1 = baseNow - 25 * 60 * 1000; // embedding updated 25 minutes ago
await t.context.doc.upsert({
spaceId: workspace.id,
docId: docId2,
blob: Uint8Array.from([1, 2, 3]),
timestamp: t0,
editorId: user.id,
});
await t.context.copilotContext.insertWorkspaceEmbedding(
workspace.id,
docId2,
[
{
index: 0,
content: 'content2',
embedding: Array.from({ length: 1024 }, () => 1),
},
]
);
await t.context.db.aiWorkspaceEmbedding.updateMany({
where: { workspaceId: workspace.id, docId: docId2 },
data: { updatedAt: new Date(t1) },
});
let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId2
);
t.snapshot(
needsEmbedding,
'should not need embedding when only 10-minute window passed without updates'
);
const t2 = baseNow - 5 * 60 * 1000; // doc updated 5 minutes ago
await t.context.doc.upsert({
spaceId: workspace.id,
docId: docId2,
blob: Uint8Array.from([7, 8, 9]),
timestamp: t2,
editorId: user.id,
});
needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
workspace.id,
docId2
);
t.snapshot(
needsEmbedding,
'should need embedding when doc updated and last embedding older than 10 minutes'
);
}
// --- new cases end ---
});
test('should check embedding table', async t => {

View File

@@ -148,3 +148,36 @@ export type IgnoredDoc = {
createdByAvatar: string | undefined;
updatedBy: string | undefined;
};
export const EMBEDDING_DIMENSIONS = 1024;

// Metadata header prefixes that are stripped from stored embedding content
// before it is compared or returned to callers.
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

/**
 * Remove up to five consecutive leading metadata rows (see FILTER_PREFIX)
 * from embedding content, returning only the real document body.
 */
export function clearEmbeddingContent(content: string): string {
  const lines = content.split('\n');
  // at most 5 leading rows are considered metadata; stop at the first
  // non-metadata row so only a consecutive header run is removed
  const limit = Math.min(lines.length, 5);
  let start = 0;
  while (
    start < limit &&
    FILTER_PREFIX.some(prefix => lines[start].startsWith(prefix))
  ) {
    start += 1;
  }
  return lines.slice(start).join('\n');
}
/**
 * Return a copy of the chunk whose content has had leading metadata rows
 * stripped via clearEmbeddingContent; chunks without content pass through
 * unchanged.
 */
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (!chunk.content) {
    return chunk;
  }
  return { ...chunk, content: clearEmbeddingContent(chunk.content) };
}

View File

@@ -6,6 +6,7 @@ import { Prisma } from '@prisma/client';
import { CopilotSessionNotFound } from '../base';
import { BaseModel } from './base';
import {
clearEmbeddingContent,
ContextBlob,
ContextConfigSchema,
ContextDoc,
@@ -13,14 +14,13 @@ import {
CopilotContext,
DocChunkSimilarity,
Embedding,
EMBEDDING_DIMENSIONS,
FileChunkSimilarity,
MinimalContextConfigSchema,
} from './common/copilot';
type UpdateCopilotContextInput = Pick<CopilotContext, 'config'>;
export const EMBEDDING_DIMENSIONS = 1024;
/**
* Copilot Job Model
*/
@@ -215,8 +215,9 @@ export class CopilotContextModel extends BaseModel {
select: { content: true },
orderBy: { chunk: 'asc' },
});
return file?.map(f => f.content).join('\n');
return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
}
async insertFileEmbedding(
contextId: string,
fileId: string,
@@ -263,6 +264,19 @@ export class CopilotContextModel extends BaseModel {
return similarityChunks.filter(c => Number(c.distance) <= threshold);
}
/**
 * Fetch the stored embedding content for a workspace doc.
 *
 * @param workspaceId - workspace the doc belongs to
 * @param docId - doc whose embedded content is requested
 * @param chunk - optional single chunk index; omitted = all chunks
 * @returns chunks joined with newlines, in ascending chunk order, with
 *   metadata header rows stripped; `undefined` only if findMany yields
 *   no array (an empty result joins to an empty string)
 */
async getWorkspaceContent(
  workspaceId: string,
  docId: string,
  chunk?: number
): Promise<string | undefined> {
  const file = await this.db.aiWorkspaceEmbedding.findMany({
    where: { workspaceId, docId, chunk },
    select: { content: true },
    orderBy: { chunk: 'asc' },
  });
  // sanitize each chunk so callers compare/display only real doc content
  return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
}
async insertWorkspaceEmbedding(
workspaceId: string,
docId: string,
@@ -287,6 +301,7 @@ export class CopilotContextModel extends BaseModel {
VALUES ${values}
ON CONFLICT (workspace_id, doc_id, chunk)
DO UPDATE SET
content = EXCLUDED.content,
embedding = EXCLUDED.embedding,
updated_at = excluded.updated_at;
`;

View File

@@ -242,10 +242,9 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
@Transactional()
async checkDocNeedEmbedded(workspaceId: string, docId: string) {
// NOTE: check if the document needs re-embedding.
// 1. check if there have been any recent updates to the document snapshot and update
// 2. check if the embedding is older than the snapshot and update
// 3. check if the embedding is older than 10 minutes (avoid frequent updates)
// if all conditions are met, re-embedding is required.
// 1. first-time embedding when no embedding exists
// 2. re-embedding only when the doc has updates newer than the last embedding
// AND the last embedding is older than 10 minutes (avoid frequent updates)
const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>`
SELECT
EXISTS (
@@ -280,8 +279,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
AND e.doc_id = docs.doc_id
WHERE
e.updated_at IS NULL
OR docs.updated_at > e.updated_at
OR e.updated_at < NOW() - INTERVAL '10 minutes'
OR (docs.updated_at > e.updated_at AND e.updated_at < NOW() - INTERVAL '10 minutes')
) AS needs_embedding;
`;

View File

@@ -392,6 +392,10 @@ export class CopilotEmbeddingJob {
return controller.signal;
}
// Collapse all Unicode whitespace (\p{White_Space}) to nothing so content
// comparisons ignore formatting-only differences between doc summaries.
private normalize(s: string) {
  return s.replaceAll(/[\p{White_Space}]+/gu, '');
}
@OnJob('copilot.embedding.docs')
async embedPendingDocs({
contextId,
@@ -429,6 +433,21 @@ export class CopilotEmbeddingJob {
if (!hasNewDoc && fragment) {
// fast path for empty docs; the journal feature can easily create an empty doc
if (fragment.summary.trim()) {
const existsContent =
await this.models.copilotContext.getWorkspaceContent(
workspaceId,
docId
);
if (
existsContent &&
this.normalize(existsContent) === this.normalize(fragment.summary)
) {
this.logger.log(
`Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
);
return;
}
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File(
[fragment.summary],

View File

@@ -6,9 +6,9 @@ import z from 'zod';
import { DocReader } from '../../../core/doc';
import { AccessController } from '../../../core/permission';
import { clearEmbeddingChunk } from '../../../models';
import { IndexerService } from '../../indexer';
import { CopilotContextService } from '../context';
import { clearEmbeddingChunk } from '../utils';
@Injectable()
export class WorkspaceMcpProvider {

View File

@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
import { z } from 'zod';
import type { AccessController } from '../../../core/permission';
import type { ChunkSimilarity, Models } from '../../../models';
import {
type ChunkSimilarity,
clearEmbeddingChunk,
type Models,
} from '../../../models';
import type { CopilotContextService } from '../context';
import type { ContextSession } from '../context/session';
import type { CopilotChatOptions } from '../providers';
import { clearEmbeddingChunk } from '../utils';
import { toolError } from './error';
export const buildDocSearchGetter = (

View File

@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';
import type { Request } from 'express';
import { OneMB, readBufferWithLimit } from '../../base';
import type { ChunkSimilarity } from '../../models';
import type { PromptTools } from './providers';
import type { ToolsConfig } from './types';
@@ -83,29 +82,3 @@ export function getTools(
});
return result;
}
// Metadata header prefixes recognized at the start of chunk content.
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];
// Strip up to five consecutive leading metadata rows from a chunk's content,
// returning a copy; chunks without content are returned unchanged.
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
if (chunk.content) {
const lines = chunk.content.split('\n');
let maxLines = 5;
while (maxLines > 0 && lines.length > 0) {
if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) {
lines.shift();
maxLines--;
} else {
// only process consecutive metadata rows
break;
}
}
return { ...chunk, content: lines.join('\n') };
}
return chunk;
}