mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 21:05:19 +00:00
fix(server): frequent embedding (#13475)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit
- New Features
  - Smarter embedding pipeline skips re-embedding when content hasn’t changed; added content sanitization for embeddings and workspace content retrieval.
- Bug Fixes
  - Re-embedding now requires both a document update and the last embedding being older than 10 minutes, reducing unnecessary work.
- Refactor
  - Consolidated embedding preprocessing and moved sanitization utilities into shared models; upserts now refresh stored content.
- Tests
  - Expanded snapshot-based tests covering multiple time/age scenarios for embedding decision logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -392,6 +392,10 @@ export class CopilotEmbeddingJob {
|
||||
return controller.signal;
|
||||
}
|
||||
|
||||
/**
 * Removes every Unicode whitespace character from `s`, so two strings can be
 * compared while ignoring all spacing and line-break differences.
 *
 * @param s - text to normalize
 * @returns `s` with all `White_Space` code points deleted
 */
private normalize(s: string) {
  // The `g` flag makes `replace` strip every whitespace run, not just the first.
  const whitespaceRun = /\p{White_Space}+/gu;
  return s.replace(whitespaceRun, '');
}
|
||||
|
||||
@OnJob('copilot.embedding.docs')
|
||||
async embedPendingDocs({
|
||||
contextId,
|
||||
@@ -429,6 +433,21 @@ export class CopilotEmbeddingJob {
|
||||
if (!hasNewDoc && fragment) {
|
||||
// fast fail for an empty doc; a journal can easily create an empty doc
|
||||
if (fragment.summary.trim()) {
|
||||
const existsContent =
|
||||
await this.models.copilotContext.getWorkspaceContent(
|
||||
workspaceId,
|
||||
docId
|
||||
);
|
||||
if (
|
||||
existsContent &&
|
||||
this.normalize(existsContent) === this.normalize(fragment.summary)
|
||||
) {
|
||||
this.logger.log(
|
||||
`Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
|
||||
);
|
||||
return;
|
||||
}
|
||||
|
||||
const embeddings = await this.embeddingClient.getFileEmbeddings(
|
||||
new File(
|
||||
[fragment.summary],
|
||||
|
||||
@@ -6,9 +6,9 @@ import z from 'zod';
|
||||
|
||||
import { DocReader } from '../../../core/doc';
|
||||
import { AccessController } from '../../../core/permission';
|
||||
import { clearEmbeddingChunk } from '../../../models';
|
||||
import { IndexerService } from '../../indexer';
|
||||
import { CopilotContextService } from '../context';
|
||||
import { clearEmbeddingChunk } from '../utils';
|
||||
|
||||
@Injectable()
|
||||
export class WorkspaceMcpProvider {
|
||||
|
||||
@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
|
||||
import { z } from 'zod';
|
||||
|
||||
import type { AccessController } from '../../../core/permission';
|
||||
import type { ChunkSimilarity, Models } from '../../../models';
|
||||
import {
|
||||
type ChunkSimilarity,
|
||||
clearEmbeddingChunk,
|
||||
type Models,
|
||||
} from '../../../models';
|
||||
import type { CopilotContextService } from '../context';
|
||||
import type { ContextSession } from '../context/session';
|
||||
import type { CopilotChatOptions } from '../providers';
|
||||
import { clearEmbeddingChunk } from '../utils';
|
||||
import { toolError } from './error';
|
||||
|
||||
export const buildDocSearchGetter = (
|
||||
|
||||
@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';
|
||||
import type { Request } from 'express';
|
||||
|
||||
import { OneMB, readBufferWithLimit } from '../../base';
|
||||
import type { ChunkSimilarity } from '../../models';
|
||||
import type { PromptTools } from './providers';
|
||||
import type { ToolsConfig } from './types';
|
||||
|
||||
@@ -83,29 +82,3 @@ export function getTools(
|
||||
});
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Prefixes that identify a metadata row at the top of a chunk's content. */
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

/**
 * Strips the leading metadata rows from an embedding chunk's content.
 *
 * Only consecutive rows at the very start of the content are considered, and
 * at most five of them are removed; scanning stops at the first row that does
 * not begin with one of the `FILTER_PREFIX` entries.
 *
 * @param chunk - chunk whose `content` may start with metadata rows
 * @returns the same `chunk` when its content is empty/missing, otherwise a
 *   shallow copy with the leading metadata rows removed from `content`
 */
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (!chunk.content) {
    return chunk;
  }

  const maxHeaderRows = 5;
  const rows = chunk.content.split('\n');
  const isMetadataRow = (row: string) =>
    FILTER_PREFIX.some(prefix => row.startsWith(prefix));

  // Index of the first non-metadata row within the inspected window; -1 means
  // every inspected row was metadata, so the whole window is dropped.
  const firstBodyRow = rows
    .slice(0, maxHeaderRows)
    .findIndex(row => !isMetadataRow(row));
  const headerRows =
    firstBodyRow === -1 ? Math.min(rows.length, maxHeaderRows) : firstBodyRow;

  return { ...chunk, content: rows.slice(headerRows).join('\n') };
}
|
||||
|
||||
Reference in New Issue
Block a user