feat(server): skip embedding for deprecated doc ids & empty docs (#13211)

fix AI-367

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->
## Summary by CodeRabbit

* **Bug Fixes**
* Improved document filtering to exclude settings documents and empty
blobs from embedding and status calculations.
* Enhanced embedding jobs to skip processing deprecated documents if a
newer version exists, ensuring only up-to-date documents are embedded.
* **New Features**
* Added a mutation to trigger the cron job for generating missing
titles.
* **Tests**
* Added test to verify exclusion of documents with empty content from
embedding.
* Updated embedding-related tests to toggle embedding state during
attachment upload under simulated network conditions.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
DarkSky
2025-07-15 16:50:48 +08:00
committed by GitHub
parent cd91bea5c1
commit 8e374f5517
7 changed files with 40 additions and 6 deletions

View File

@@ -214,6 +214,21 @@ test('should insert and search embedding', async t => {
);
t.false(results.includes(docId), 'docs containing `$` should be excluded');
}
{
const docId = 'empty_doc';
await t.context.doc.upsert({
spaceId: workspace.id,
docId: docId,
blob: Uint8Array.from([0, 0]),
timestamp: Date.now(),
editorId: user.id,
});
const results = await t.context.copilotWorkspace.findDocsToEmbed(
workspace.id
);
t.false(results.includes(docId), 'empty documents should be excluded');
}
});
test('should check need to be embedded', async t => {

View File

@@ -58,10 +58,12 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
ON id.workspace_id = s.workspace_id
AND id.doc_id = s.guid
WHERE s.workspace_id = ${workspaceId}
AND s.guid != s.workspace_id
AND s.guid <> s.workspace_id
AND s.guid NOT LIKE '%$%'
AND s.guid NOT LIKE '%:settings:%'
AND e.doc_id IS NULL
AND id.doc_id IS NULL;`;
AND id.doc_id IS NULL
AND s.blob <> E'\\\\x0000';`;
return docIds.map(r => r.id);
}
@@ -160,6 +162,8 @@ export class CopilotWorkspaceConfigModel extends BaseModel {
{ id: { notIn: ignoredDocIds } },
{ id: { not: workspaceId } },
{ id: { not: { contains: '$' } } },
{ id: { not: { contains: ':settings:' } } },
{ blob: { not: new Uint8Array([0, 0]) } },
],
};

View File

@@ -337,6 +337,10 @@ export class CopilotEmbeddingJob {
const signal = this.getWorkspaceSignal(workspaceId);
try {
const hasNewDoc = await this.models.doc.exists(
workspaceId,
docId.split(':space:')[1] || ''
);
const needEmbedding =
await this.models.copilotWorkspace.checkDocNeedEmbedded(
workspaceId,
@@ -352,8 +356,11 @@ export class CopilotEmbeddingJob {
);
return;
}
const fragment = await this.getDocFragment(workspaceId, docId);
if (fragment) {
// if doc id deprecated, skip embedding and fulfill empty embedding
const fragment = !hasNewDoc
? await this.getDocFragment(workspaceId, docId)
: undefined;
if (!hasNewDoc && fragment) {
// fast fall for empty doc, journal is easily to create a empty doc
if (fragment.summary.trim()) {
const embeddings = await this.embeddingClient.getFileEmbeddings(
@@ -382,7 +389,7 @@ export class CopilotEmbeddingJob {
);
await this.fulfillEmptyEmbedding(workspaceId, docId);
}
} else if (contextId) {
} else {
this.logger.warn(
`Doc ${docId} in workspace ${workspaceId} has no fragment, fulfilling empty embedding.`
);

View File

@@ -844,7 +844,7 @@ export class PromptsManagementResolver {
private readonly promptService: PromptService
) {}
@Query(() => Boolean, {
@Mutation(() => Boolean, {
description: 'Trigger generate missing titles cron job',
})
async triggerGenerateTitleCron() {

View File

@@ -1297,6 +1297,9 @@ type Mutation {
setBlob(blob: Upload!, workspaceId: String!): String!
submitAudioTranscription(blob: Upload, blobId: String!, blobs: [Upload!], workspaceId: String!): TranscriptionResultType
"""Trigger generate missing titles cron job"""
triggerGenerateTitleCron: Boolean!
"""update app configuration"""
updateAppConfig(updates: [UpdateAppConfigInput!]!): JSONObject!