fix(server): process empty doc embedding (#12417)

fix CLOUD-219

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **Bug Fixes**
  - Ensured that documents without content now receive a placeholder embedding, improving consistency in document processing.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
darkskygit
2025-05-21 09:37:22 +00:00
parent abfc994180
commit c9b296c896
3 changed files with 32 additions and 10 deletions

View File

@@ -7,7 +7,12 @@ import { chunk } from 'lodash-es';
import { ChunkSimilarity, Embedding } from '../../../models';
import { OpenAIConfig } from '../providers/openai';
import { EmbeddingClient, getReRankSchema, ReRankResult } from './types';
import {
EMBEDDING_DIMENSIONS,
EmbeddingClient,
getReRankSchema,
ReRankResult,
} from './types';
const RERANK_MODEL = 'gpt-4.1-mini';
@@ -24,7 +29,7 @@ export class OpenAIEmbeddingClient extends EmbeddingClient {
async getEmbeddings(input: string[]): Promise<Embedding[]> {
const modelInstance = this.#instance.embedding('text-embedding-3-large', {
dimensions: 1024,
dimensions: EMBEDDING_DIMENSIONS,
});
const { embeddings } = await embedMany({
@@ -124,7 +129,9 @@ export class MockEmbeddingClient extends EmbeddingClient {
return input.map((_, i) => ({
index: i,
content: input[i],
embedding: Array.from({ length: 1024 }, () => Math.random()),
embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () =>
Math.random()
),
}));
}
}

View File

@@ -16,7 +16,7 @@ import { Models } from '../../../models';
import { CopilotStorage } from '../storage';
import { readStream } from '../utils';
import { OpenAIEmbeddingClient } from './embedding';
import { EmbeddingClient } from './types';
import { EMBEDDING_DIMENSIONS, EmbeddingClient } from './types';
@Injectable()
export class CopilotContextDocJob {
@@ -225,16 +225,29 @@ export class CopilotContextDocJob {
const content = await this.doc.getFullDocContent(workspaceId, docId);
if (content) {
// fast fail for empty docs; a journal can easily create an empty doc
if (!content.summary) return;
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File([content.summary], `${content.title || 'Untitled'}.md`)
);
if (content.summary) {
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File([content.summary], `${content.title || 'Untitled'}.md`)
);
for (const chunks of embeddings) {
for (const chunks of embeddings) {
await this.models.copilotContext.insertWorkspaceEmbedding(
workspaceId,
docId,
chunks
);
}
} else {
// for empty doc, insert empty embedding
const emptyEmbedding = {
index: 0,
content: '',
embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0),
};
await this.models.copilotContext.insertWorkspaceEmbedding(
workspaceId,
docId,
chunks
[emptyEmbedding]
);
}
} else if (contextId) {

View File

@@ -57,6 +57,8 @@ export type Chunk = {
content: string;
};
/**
 * Number of components in an embedding vector.
 *
 * Passed as the `dimensions` option when requesting OpenAI embeddings, and
 * used as the array length for mock embeddings and for the zero-filled
 * placeholder embedding stored for documents without content.
 */
export const EMBEDDING_DIMENSIONS = 1024;
export abstract class EmbeddingClient {
async getFileEmbeddings(
file: File,