fix(server): frequent embedding (#13475)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

- New Features
  - Smarter embedding pipeline skips re-embedding when content hasn’t
    changed; added content sanitization for embeddings and workspace content
    retrieval.
- Bug Fixes
  - Re-embedding now requires both a document update and the last
    embedding being older than 10 minutes, reducing unnecessary work.
- Refactor
  - Consolidated embedding preprocessing and moved sanitization utilities
    into shared models; upserts now refresh stored content.
- Tests
  - Expanded snapshot-based tests covering multiple time/age scenarios for
    embedding decision logic.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-08-12 09:45:41 +08:00
committed by GitHub
parent 125564b7d2
commit 65f679c4f0
10 changed files with 177 additions and 43 deletions

View File

@@ -392,6 +392,10 @@ export class CopilotEmbeddingJob {
return controller.signal;
}
/**
 * Strips every run of Unicode whitespace from `s`, so two strings can be
 * compared while ignoring spacing and line-break differences.
 *
 * @param s - text to normalize
 * @returns `s` with all `White_Space` code points removed
 */
private normalize(s: string) {
  // the `g` flag makes `replace` substitute every match, not just the first
  return s.replace(/\p{White_Space}+/gu, '');
}
@OnJob('copilot.embedding.docs')
async embedPendingDocs({
contextId,
@@ -429,6 +433,21 @@ export class CopilotEmbeddingJob {
if (!hasNewDoc && fragment) {
// fast path for an empty doc: a journal can easily create an empty doc
if (fragment.summary.trim()) {
const existsContent =
await this.models.copilotContext.getWorkspaceContent(
workspaceId,
docId
);
if (
existsContent &&
this.normalize(existsContent) === this.normalize(fragment.summary)
) {
this.logger.log(
`Doc ${docId} in workspace ${workspaceId} has no content change, skipping embedding.`
);
return;
}
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File(
[fragment.summary],

View File

@@ -6,9 +6,9 @@ import z from 'zod';
import { DocReader } from '../../../core/doc';
import { AccessController } from '../../../core/permission';
import { clearEmbeddingChunk } from '../../../models';
import { IndexerService } from '../../indexer';
import { CopilotContextService } from '../context';
import { clearEmbeddingChunk } from '../utils';
@Injectable()
export class WorkspaceMcpProvider {

View File

@@ -3,11 +3,14 @@ import { omit } from 'lodash-es';
import { z } from 'zod';
import type { AccessController } from '../../../core/permission';
import type { ChunkSimilarity, Models } from '../../../models';
import {
type ChunkSimilarity,
clearEmbeddingChunk,
type Models,
} from '../../../models';
import type { CopilotContextService } from '../context';
import type { ContextSession } from '../context/session';
import type { CopilotChatOptions } from '../providers';
import { clearEmbeddingChunk } from '../utils';
import { toolError } from './error';
export const buildDocSearchGetter = (

View File

@@ -3,7 +3,6 @@ import { Readable } from 'node:stream';
import type { Request } from 'express';
import { OneMB, readBufferWithLimit } from '../../base';
import type { ChunkSimilarity } from '../../models';
import type { PromptTools } from './providers';
import type { ToolsConfig } from './types';
@@ -83,29 +82,3 @@ export function getTools(
});
return result;
}
/** Line prefixes that identify a metadata row at the top of a chunk's content. */
const FILTER_PREFIX = [
  'Title: ',
  'Created at: ',
  'Updated at: ',
  'Created by: ',
  'Updated by: ',
];

/**
 * Removes leading metadata rows from a chunk's content.
 *
 * Only consecutive rows at the very start of the content are considered, and
 * at most five of them are dropped; scanning stops at the first row that does
 * not begin with one of the `FILTER_PREFIX` entries.
 *
 * @param chunk - chunk whose `content` may start with metadata rows
 * @returns the same `chunk` object when its content is empty or missing,
 *   otherwise a shallow copy whose `content` has the leading metadata rows
 *   removed
 */
export function clearEmbeddingChunk(chunk: ChunkSimilarity): ChunkSimilarity {
  if (!chunk.content) {
    return chunk;
  }

  const rows = chunk.content.split('\n');
  // never look past the fifth row, nor past the end of the content
  const scanLimit = Math.min(5, rows.length);

  let headerRows = 0;
  while (
    headerRows < scanLimit &&
    FILTER_PREFIX.some(prefix => rows[headerRows].startsWith(prefix))
  ) {
    headerRows++;
  }

  return { ...chunk, content: rows.slice(headerRows).join('\n') };
}