mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-04 08:38:34 +00:00
fix(server): frequent embedding (#13475)
## Summary by CodeRabbit

(Auto-generated release notes by coderabbit.ai.)

- New Features
  - Smarter embedding pipeline skips re-embedding when content hasn't changed; added content sanitization for embeddings and workspace content retrieval.
- Bug Fixes
  - Re-embedding now requires both a document update and the last embedding being older than 10 minutes, reducing unnecessary work.
- Refactor
  - Consolidated embedding preprocessing and moved sanitization utilities into shared models; upserts now refresh stored content.
- Tests
  - Expanded snapshot-based tests covering multiple time/age scenarios for embedding decision logic.
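The decision rule above can be summarized in a small sketch. This is illustrative TypeScript, not the server's actual implementation (the real check is a SQL query further down in the diff); the function and constant names are made up.

```ts
// Hedged sketch of the re-embedding decision described in the summary.
// Names (needsReEmbedding, TEN_MINUTES_MS) are illustrative, not from AFFiNE.
const TEN_MINUTES_MS = 10 * 60 * 1000;

function needsReEmbedding(
  docUpdatedAt: Date,
  embeddingUpdatedAt: Date | null,
  now: Date = new Date()
): boolean {
  // First-time embedding: no embedding exists yet.
  if (!embeddingUpdatedAt) return true;
  // Re-embed only when the doc changed after the last embedding
  // AND the last embedding is older than the 10-minute window.
  const docChanged = docUpdatedAt > embeddingUpdatedAt;
  const windowPassed =
    now.getTime() - embeddingUpdatedAt.getTime() > TEN_MINUTES_MS;
  return docChanged && windowPassed;
}
```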
@@ -101,6 +101,28 @@ Generated by [AVA](https://avajs.dev).

    0

## should check need to be embedded

> document with no embedding should need embedding

    true

> document with recent embedding should not need embedding

    false

> document updated after embedding and older-than-10m should need embedding

    true

> should not need embedding when only 10-minute window passed without updates

    false

> should need embedding when doc updated and last embedding older than 10 minutes

    true

## should filter outdated doc id style in embedding status

> should include modern doc format

Binary file not shown.
@@ -293,7 +293,10 @@ test('should check need to be embedded', async t => {
      workspace.id,
      docId
    );
    t.true(needsEmbedding, 'document with no embedding should need embedding');
    t.snapshot(
      needsEmbedding,
      'document with no embedding should need embedding'
    );
  }

  {
@@ -313,7 +316,7 @@ test('should check need to be embedded', async t => {
      workspace.id,
      docId
    );
    t.false(
    t.snapshot(
      needsEmbedding,
      'document with recent embedding should not need embedding'
    );
@@ -328,15 +331,83 @@ test('should check need to be embedded', async t => {
      editorId: user.id,
    });

    // simulate an old embedding
    const oldEmbeddingTime = new Date(Date.now() - 25 * 60 * 1000);
    await t.context.db.aiWorkspaceEmbedding.updateMany({
      where: { workspaceId: workspace.id, docId },
      data: { updatedAt: oldEmbeddingTime },
    });

    let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
      workspace.id,
      docId
    );
    t.true(
    t.snapshot(
      needsEmbedding,
      'document updated after embedding should need embedding'
      'document updated after embedding and older-than-10m should need embedding'
    );
  }

  {
    // only time passed (>10m since last embedding) but no doc updates => should NOT re-embed
    const baseNow = Date.now();
    const docId2 = randomUUID();
    const t0 = baseNow - 30 * 60 * 1000; // snapshot updated 30 minutes ago
    const t1 = baseNow - 25 * 60 * 1000; // embedding updated 25 minutes ago

    await t.context.doc.upsert({
      spaceId: workspace.id,
      docId: docId2,
      blob: Uint8Array.from([1, 2, 3]),
      timestamp: t0,
      editorId: user.id,
    });

    await t.context.copilotContext.insertWorkspaceEmbedding(
      workspace.id,
      docId2,
      [
        {
          index: 0,
          content: 'content2',
          embedding: Array.from({ length: 1024 }, () => 1),
        },
      ]
    );

    await t.context.db.aiWorkspaceEmbedding.updateMany({
      where: { workspaceId: workspace.id, docId: docId2 },
      data: { updatedAt: new Date(t1) },
    });

    let needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
      workspace.id,
      docId2
    );
    t.snapshot(
      needsEmbedding,
      'should not need embedding when only 10-minute window passed without updates'
    );

    const t2 = baseNow - 5 * 60 * 1000; // doc updated 5 minutes ago
    await t.context.doc.upsert({
      spaceId: workspace.id,
      docId: docId2,
      blob: Uint8Array.from([7, 8, 9]),
      timestamp: t2,
      editorId: user.id,
    });

    needsEmbedding = await t.context.copilotWorkspace.checkDocNeedEmbedded(
      workspace.id,
      docId2
    );
    t.snapshot(
      needsEmbedding,
      'should need embedding when doc updated and last embedding older than 10 minutes'
    );
  }
  // --- new cases end ---
});

test('should check embedding table', async t => {
@@ -148,3 +148,36 @@ export type IgnoredDoc = {
  createdByAvatar: string | undefined;
  updatedBy: string | undefined;
};

export const EMBEDDING_DIMENSIONS = 1024;

const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

export function clearEmbeddingContent(content: string): string {
  const lines = content.split('\n');
  let maxLines = 5;
  while (maxLines > 0 && lines.length > 0) {
    if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) {
      lines.shift();
      maxLines--;
    } else {
      // only process consecutive metadata rows
      break;
    }
  }
  return lines.join('\n');
}

export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (chunk.content) {
    const content = clearEmbeddingContent(chunk.content);
    return { ...chunk, content };
  }
  return chunk;
}
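To illustrate the shared sanitizer added above, a quick usage sketch; the sample content is invented, and the function is assumed to be imported from the shared models module.

```ts
// Sample input is made up; behavior follows clearEmbeddingContent above:
// consecutive leading metadata lines (at most five) are stripped.
const raw = [
  'Title: Meeting notes',
  'Created at: 2025-01-01',
  'Updated by: alice',
  '# Agenda',
  'Created at: kept, because the metadata run ended at "# Agenda"',
].join('\n');

clearEmbeddingContent(raw);
// => '# Agenda\nCreated at: kept, because the metadata run ended at "# Agenda"'
```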
@@ -6,6 +6,7 @@ import { Prisma } from '@prisma/client';
import { CopilotSessionNotFound } from '../base';
import { BaseModel } from './base';
import {
  clearEmbeddingContent,
  ContextBlob,
  ContextConfigSchema,
  ContextDoc,
@@ -13,14 +14,13 @@ import {
  CopilotContext,
  DocChunkSimilarity,
  Embedding,
  EMBEDDING_DIMENSIONS,
  FileChunkSimilarity,
  MinimalContextConfigSchema,
} from './common/copilot';

type UpdateCopilotContextInput = Pick<CopilotContext, 'config'>;

export const EMBEDDING_DIMENSIONS = 1024;

/**
 * Copilot Job Model
 */
@@ -215,8 +215,9 @@ export class CopilotContextModel extends BaseModel {
      select: { content: true },
      orderBy: { chunk: 'asc' },
    });
    return file?.map(f => f.content).join('\n');
    return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
  }

  async insertFileEmbedding(
    contextId: string,
    fileId: string,
@@ -263,6 +264,19 @@ export class CopilotContextModel extends BaseModel {
    return similarityChunks.filter(c => Number(c.distance) <= threshold);
  }

  async getWorkspaceContent(
    workspaceId: string,
    docId: string,
    chunk?: number
  ): Promise<string | undefined> {
    const file = await this.db.aiWorkspaceEmbedding.findMany({
      where: { workspaceId, docId, chunk },
      select: { content: true },
      orderBy: { chunk: 'asc' },
    });
    return file?.map(f => clearEmbeddingContent(f.content)).join('\n');
  }

  async insertWorkspaceEmbedding(
    workspaceId: string,
    docId: string,
@@ -287,6 +301,7 @@ export class CopilotContextModel extends BaseModel {
      VALUES ${values}
      ON CONFLICT (workspace_id, doc_id, chunk)
      DO UPDATE SET
        content = EXCLUDED.content,
        embedding = EXCLUDED.embedding,
        updated_at = excluded.updated_at;
    `;
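A hedged usage sketch of the getWorkspaceContent method added above. The wrapper function, its name, and the import path are illustrative assumptions; only the model call itself comes from the diff.

```ts
import type { Models } from '../../../models'; // path assumed, mirroring other imports in this PR

// Illustrative helper (not part of the PR): read back the stored, sanitized
// embedding content for one doc.
async function readStoredDocContent(
  models: Models,
  workspaceId: string,
  docId: string
): Promise<string | undefined> {
  // Chunks are joined with '\n'; leading metadata lines (Title, Created at, ...)
  // were already stripped by clearEmbeddingContent inside the model.
  return models.copilotContext.getWorkspaceContent(workspaceId, docId);
}
```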
@@ -242,10 +242,9 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
  @Transactional()
  async checkDocNeedEmbedded(workspaceId: string, docId: string) {
    // NOTE: check if the document needs re-embedding.
    // 1. check if there have been any recent updates to the document snapshot and update
    // 2. check if the embedding is older than the snapshot and update
    // 3. check if the embedding is older than 10 minutes (avoid frequent updates)
    // if all conditions are met, re-embedding is required.
    // 1. first-time embedding when no embedding exists
    // 2. re-embedding only when the doc has updates newer than the last embedding
    //    AND the last embedding is older than 10 minutes (avoid frequent updates)
    const result = await this.db.$queryRaw<{ needs_embedding: boolean }[]>`
      SELECT
        EXISTS (
@@ -280,8 +279,7 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
          AND e.doc_id = docs.doc_id
        WHERE
          e.updated_at IS NULL
          OR docs.updated_at > e.updated_at
          OR e.updated_at < NOW() - INTERVAL '10 minutes'
          OR (docs.updated_at > e.updated_at AND e.updated_at < NOW() - INTERVAL '10 minutes')
      ) AS needs_embedding;
    `;
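Read against the illustrative needsReEmbedding sketch from the summary section (not the real SQL above), the timelines exercised by the new tests evaluate as the snapshots expect:

```ts
// Helper for readability; minutes relative to a fixed "now".
const now = new Date();
const minutesAgo = (m: number) => new Date(now.getTime() - m * 60 * 1000);

// Embedding 25 min old, doc last updated 30 min ago (no newer update): skip.
needsReEmbedding(minutesAgo(30), minutesAgo(25), now); // false

// Doc updated 5 min ago, embedding 25 min old (update + stale embedding): re-embed.
needsReEmbedding(minutesAgo(5), minutesAgo(25), now); // true

// No embedding row at all: first-time embedding.
needsReEmbedding(minutesAgo(5), null, now); // true
```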
@@ -392,6 +392,10 @@ export class CopilotEmbeddingJob {
    return controller.signal;
  }

  private normalize(s: string) {
    return s.replaceAll(/[\p{White_Space}]+/gu, '');
  }

  @OnJob('copilot.embedding.docs')
  async embedPendingDocs({
    contextId,
@@ -429,6 +433,21 @@ export class CopilotEmbeddingJob {
    if (!hasNewDoc && fragment) {
      // fast path for empty docs: journals easily create empty docs
      if (fragment.summary.trim()) {
        const existsContent =
          await this.models.copilotContext.getWorkspaceContent(
            workspaceId,
            docId
          );
        if (
          existsContent &&
          this.normalize(existsContent) === this.normalize(fragment.summary)
        ) {
          this.logger.log(
            `Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
          );
          return;
        }

        const embeddings = await this.embeddingClient.getFileEmbeddings(
          new File(
            [fragment.summary],
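The normalize helper added above makes the "no content change" comparison insensitive to whitespace-only edits. A minimal standalone illustration, using the same regex as in the diff:

```ts
// Same pattern as the private normalize() in the diff; standalone for illustration.
const normalize = (s: string) => s.replaceAll(/[\p{White_Space}]+/gu, '');

normalize('Hello  world\n') === normalize('Hello\tworld'); // true
// When normalize(existsContent) === normalize(fragment.summary),
// the job logs "has no content change" and skips re-embedding.
```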
@@ -6,9 +6,9 @@ import z from 'zod';

import { DocReader } from '../../../core/doc';
import { AccessController } from '../../../core/permission';
import { clearEmbeddingChunk } from '../../../models';
import { IndexerService } from '../../indexer';
import { CopilotContextService } from '../context';
import { clearEmbeddingChunk } from '../utils';

@Injectable()
export class WorkspaceMcpProvider {
@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
import { z } from 'zod';

import type { AccessController } from '../../../core/permission';
import type { ChunkSimilarity, Models } from '../../../models';
import {
  type ChunkSimilarity,
  clearEmbeddingChunk,
  type Models,
} from '../../../models';
import type { CopilotContextService } from '../context';
import type { ContextSession } from '../context/session';
import type { CopilotChatOptions } from '../providers';
import { clearEmbeddingChunk } from '../utils';
import { toolError } from './error';

export const buildDocSearchGetter = (
@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';

import type { Request } from 'express';

import { OneMB, readBufferWithLimit } from '../../base';
import type { ChunkSimilarity } from '../../models';
import type { PromptTools } from './providers';
import type { ToolsConfig } from './types';

@@ -83,29 +82,3 @@ export function getTools(
  });
  return result;
}

const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (chunk.content) {
    const lines = chunk.content.split('\n');
    let maxLines = 5;
    while (maxLines > 0 && lines.length > 0) {
      if (FILTER_PREFIX.some(prefix => lines[0].startsWith(prefix))) {
        lines.shift();
        maxLines--;
      } else {
        // only process consecutive metadata rows
        break;
      }
    }
    return { ...chunk, content: lines.join('\n') };
  }
  return chunk;
}