mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-04 08:38:34 +00:00
fix(server): improve outdated embedding cleanup (#13476)
<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

* **Bug Fixes**
  * Prevents accidental deletion of placeholder documents during embedding cleanup.
  * Improves accuracy when identifying documents to remove, using multiple data sources.
  * Skips unnecessary cleanup when no embeddings or snapshots exist, reducing noise and overhead.
* **Chores**
  * Streamlined and centralized document filtering logic to ensure consistent cleanup behavior.
  * Parallelized data checks to make cleanup more efficient without changing user workflows.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -152,21 +152,57 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
|
||||
return docIds.filter(id => ignored.has(id));
|
||||
}
|
||||
|
||||
/**
 * Tells whether the only embedding rows stored for a document are
 * placeholders, i.e. rows with `chunk === 0` and an empty `content`.
 *
 * @param workspaceId - workspace the embedding rows are scoped to
 * @param docId - document whose embedding rows are inspected
 * @returns `true` when at least one row exists and none of them is a
 *   non-placeholder row; `false` otherwise (including when no row exists)
 */
@Transactional()
async hasPlaceholder(workspaceId: string, docId: string): Promise<boolean> {
  const scope = { workspaceId, docId };

  // Both counts are issued together and awaited as a pair.
  const everyRow = this.db.aiWorkspaceEmbedding.count({ where: scope });
  const realRows = this.db.aiWorkspaceEmbedding.count({
    where: {
      ...scope,
      // anything that is not (chunk 0 AND empty content) is real content
      NOT: { AND: [{ chunk: 0 }, { content: '' }] },
    },
  });
  const [rowCount, realCount] = await Promise.all([everyRow, realRows]);

  if (rowCount <= 0) {
    return false;
  }
  return realCount === 0;
}
|
||||
|
||||
/**
 * Builds the snapshot filter shared by the embedding queries of this model.
 *
 * A snapshot matches when it belongs to `workspaceId` and:
 * - its id is not the workspace id itself,
 * - its id contains neither `$` nor `:settings:`,
 * - its blob is not the two-byte value `[0, 0]`
 *   (NOTE(review): presumably the encoding of an empty doc; confirm),
 * - its id is not listed in `ignoredDocIds` (only applied when that list is
 *   provided and non-empty).
 *
 * @param workspaceId - workspace whose snapshots are being filtered
 * @param ignoredDocIds - optional doc ids to exclude from the result
 * @returns a Prisma `where` input for the snapshot table
 */
private getEmbeddableCondition(
  workspaceId: string,
  ignoredDocIds?: string[]
): Prisma.SnapshotWhereInput {
  const baseFilters: Prisma.SnapshotWhereInput[] = [
    { id: { not: workspaceId } },
    { id: { not: { contains: '$' } } },
    { id: { not: { contains: ':settings:' } } },
    { blob: { not: new Uint8Array([0, 0]) } },
  ];

  // The exclusion clause is appended last, and only when there is
  // something to exclude.
  const ignoreFilters: Prisma.SnapshotWhereInput[] =
    ignoredDocIds && ignoredDocIds.length > 0
      ? [{ id: { notIn: ignoredDocIds } }]
      : [];

  return { workspaceId, AND: [...baseFilters, ...ignoreFilters] };
}
|
||||
|
||||
/**
 * Lists the ids of every snapshot in the workspace that satisfies the
 * filter built by `getEmbeddableCondition` (called without an ignore list).
 *
 * @param workspaceId - workspace whose snapshots are listed
 * @returns the matching snapshot ids
 */
@Transactional()
async listEmbeddableDocIds(workspaceId: string) {
  const snapshots = await this.db.snapshot.findMany({
    select: { id: true },
    where: this.getEmbeddableCondition(workspaceId),
  });
  return snapshots.map(({ id }) => id);
}
|
||||
|
||||
@Transactional()
|
||||
async getEmbeddingStatus(workspaceId: string) {
|
||||
const ignoredDocIds = (await this.listIgnoredDocIds(workspaceId)).map(
|
||||
d => d.docId
|
||||
);
|
||||
const snapshotCondition = {
|
||||
const snapshotCondition = this.getEmbeddableCondition(
|
||||
workspaceId,
|
||||
AND: [
|
||||
{ id: { notIn: ignoredDocIds } },
|
||||
{ id: { not: workspaceId } },
|
||||
{ id: { not: { contains: '$' } } },
|
||||
{ id: { not: { contains: ':settings:' } } },
|
||||
{ blob: { not: new Uint8Array([0, 0]) } },
|
||||
],
|
||||
};
|
||||
ignoredDocIds
|
||||
);
|
||||
|
||||
const [docTotal, docEmbedded, fileTotal, fileEmbedded] = await Promise.all([
|
||||
this.db.snapshot.findMany({
|
||||
|
||||
@@ -532,11 +532,14 @@ export class CopilotEmbeddingJob {
|
||||
return;
|
||||
}
|
||||
|
||||
const docIdsInEmbedding =
|
||||
await this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId);
|
||||
if (!docIdsInEmbedding.length) {
|
||||
const [docIdsInEmbedding, docIdsInSnapshots] = await Promise.all([
|
||||
this.models.copilotContext.listWorkspaceDocEmbedding(workspaceId),
|
||||
this.models.copilotWorkspace.listEmbeddableDocIds(workspaceId),
|
||||
]);
|
||||
|
||||
if (!docIdsInEmbedding.length && !docIdsInSnapshots.length) {
|
||||
this.logger.verbose(
|
||||
`No doc embeddings found in workspace ${workspaceId}, skipping cleanup`
|
||||
`No doc embeddings and snapshots found in workspace ${workspaceId}, skipping cleanup`
|
||||
);
|
||||
await this.models.workspace.update(
|
||||
workspaceId,
|
||||
@@ -549,10 +552,17 @@ export class CopilotEmbeddingJob {
|
||||
const docIdsInWorkspace = readAllDocIdsFromWorkspaceSnapshot(snapshot.blob);
|
||||
const docIdsInWorkspaceSet = new Set(docIdsInWorkspace);
|
||||
|
||||
const deletedDocIds = docIdsInEmbedding.filter(
|
||||
docId => !docIdsInWorkspaceSet.has(docId)
|
||||
const deletedDocIds = new Set(
|
||||
[...docIdsInEmbedding, ...docIdsInSnapshots].filter(
|
||||
docId => !docIdsInWorkspaceSet.has(docId)
|
||||
)
|
||||
);
|
||||
for (const docId of deletedDocIds) {
|
||||
const isPlaceholder = await this.models.copilotWorkspace.hasPlaceholder(
|
||||
workspaceId,
|
||||
docId
|
||||
);
|
||||
if (isPlaceholder) continue;
|
||||
await this.models.copilotContext.deleteWorkspaceEmbedding(
|
||||
workspaceId,
|
||||
docId
|
||||
|
||||
Reference in New Issue
Block a user