fix(server): improve outdated embedding cleanup (#13476)

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **Bug Fixes**
  * Prevents accidental deletion of placeholder documents during embedding
    cleanup.
  * Improves accuracy when identifying documents to remove, using multiple
    data sources.
  * Skips unnecessary cleanup when no embeddings or snapshots exist,
    reducing noise and overhead.
* **Chores**
  * Streamlined and centralized document filtering logic to ensure
    consistent cleanup behavior.
  * Parallelized data checks to make cleanup more efficient without
    changing user workflows.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-08-12 09:20:21 +08:00
committed by GitHub
parent aa20e7ba66
commit 125564b7d2
2 changed files with 61 additions and 15 deletions

View File

@@ -152,21 +152,57 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
return docIds.filter(id => ignored.has(id));
}
// check if a docId has only placeholder embeddings
@Transactional()
async hasPlaceholder(workspaceId: string, docId: string): Promise<boolean> {
const [total, nonPlaceholder] = await Promise.all([
this.db.aiWorkspaceEmbedding.count({ where: { workspaceId, docId } }),
this.db.aiWorkspaceEmbedding.count({
where: {
workspaceId,
docId,
NOT: { AND: [{ chunk: 0 }, { content: '' }] },
},
}),
]);
return total > 0 && nonPlaceholder === 0;
}
private getEmbeddableCondition(
workspaceId: string,
ignoredDocIds?: string[]
): Prisma.SnapshotWhereInput {
const condition: Prisma.SnapshotWhereInput['AND'] = [
{ id: { not: workspaceId } },
{ id: { not: { contains: '$' } } },
{ id: { not: { contains: ':settings:' } } },
{ blob: { not: new Uint8Array([0, 0]) } },
];
if (ignoredDocIds && ignoredDocIds.length > 0) {
condition.push({ id: { notIn: ignoredDocIds } });
}
return { workspaceId, AND: condition };
}
@Transactional()
async listEmbeddableDocIds(workspaceId: string) {
const condition = this.getEmbeddableCondition(workspaceId);
const rows = await this.db.snapshot.findMany({
where: condition,
select: { id: true },
});
return rows.map(r => r.id);
}
@Transactional()
async getEmbeddingStatus(workspaceId: string) {
const ignoredDocIds = (await this.listIgnoredDocIds(workspaceId)).map(
d => d.docId
);
const snapshotCondition = {
const snapshotCondition = this.getEmbeddableCondition(
workspaceId,
AND: [
{ id: { notIn: ignoredDocIds } },
{ id: { not: workspaceId } },
{ id: { not: { contains: '$' } } },
{ id: { not: { contains: ':settings:' } } },
{ blob: { not: new Uint8Array([0, 0]) } },
],
};
ignoredDocIds
);
const [docTotal, docEmbedded, fileTotal, fileEmbedded] = await Promise.all([
this.db.snapshot.findMany({

View File

@@ -532,11 +532,14 @@ export class CopilotEmbeddingJob {
return;
}
const docIdsInEmbedding =
await this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId);
if (!docIdsInEmbedding.length) {
const [docIdsInEmbedding, docIdsInSnapshots] = await Promise.all([
this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId),
this.models.copilotWorkspace.listEmbeddableDocIds(workspaceId),
]);
if (!docIdsInEmbedding.length && !docIdsInSnapshots.length) {
this.logger.verbose(
`No doc embeddings found in workspace ${workspaceId}, skipping cleanup`
`No doc embeddings and snapshots found in workspace ${workspaceId}, skipping cleanup`
);
await this.models.workspace.update(
workspaceId,
@@ -549,10 +552,17 @@ export class CopilotEmbeddingJob {
const docIdsInWorkspace = readAllDocIdsFromWorkspaceSnapshot(snapshot.blob);
const docIdsInWorkspaceSet = new Set(docIdsInWorkspace);
const deletedDocIds = docIdsInEmbedding.filter(
docId => !docIdsInWorkspaceSet.has(docId)
const deletedDocIds = new Set(
[...docIdsInEmbedding, ...docIdsInSnapshots].filter(
docId => !docIdsInWorkspaceSet.has(docId)
)
);
for (const docId of deletedDocIds) {
const isPlaceholder = await this.models.copilotWorkspace.hasPlaceholder(
workspaceId,
docId
);
if (isPlaceholder) continue;
await this.models.copilotContext.deleteWorkspaceEmbedding(
workspaceId,
docId