fix(server): improve outdated embedding cleanup (#13476)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Bug Fixes**
  * Prevents accidental deletion of placeholder documents during embedding
    cleanup.
  * Improves accuracy when identifying documents to remove, using multiple
    data sources.
  * Skips unnecessary cleanup when no embeddings or snapshots exist,
    reducing noise and overhead.
* **Chores**
  * Streamlined and centralized document filtering logic to ensure
    consistent cleanup behavior.
  * Parallelized data checks to make cleanup more efficient without
    changing user workflows.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-08-12 09:20:21 +08:00
committed by GitHub
parent aa20e7ba66
commit 125564b7d2
2 changed files with 61 additions and 15 deletions

View File

@@ -532,11 +532,14 @@ export class CopilotEmbeddingJob {
return;
}
const docIdsInEmbedding =
await this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId);
if (!docIdsInEmbedding.length) {
const [docIdsInEmbedding, docIdsInSnapshots] = await Promise.all([
this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId),
this.models.copilotWorkspace.listEmbeddableDocIds(workspaceId),
]);
if (!docIdsInEmbedding.length && !docIdsInSnapshots.length) {
this.logger.verbose(
`No doc embeddings found in workspace ${workspaceId}, skipping cleanup`
`No doc embeddings and snapshots found in workspace ${workspaceId}, skipping cleanup`
);
await this.models.workspace.update(
workspaceId,
@@ -549,10 +552,17 @@ export class CopilotEmbeddingJob {
const docIdsInWorkspace = readAllDocIdsFromWorkspaceSnapshot(snapshot.blob);
const docIdsInWorkspaceSet = new Set(docIdsInWorkspace);
const deletedDocIds = docIdsInEmbedding.filter(
docId => !docIdsInWorkspaceSet.has(docId)
const deletedDocIds = new Set(
[...docIdsInEmbedding, ...docIdsInSnapshots].filter(
docId => !docIdsInWorkspaceSet.has(docId)
)
);
for (const docId of deletedDocIds) {
const isPlaceholder = await this.models.copilotWorkspace.hasPlaceholder(
workspaceId,
docId
);
if (isPlaceholder) continue;
await this.models.copilotContext.deleteWorkspaceEmbedding(
workspaceId,
docId