feat(server): improve workspace embedding (#12022)

fix AI-10
fix AI-109
fix PD-2484

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->
## Summary by CodeRabbit

- **New Features**
  - Added a method to check if a document requires embedding, improving embedding efficiency.
  - Enhanced document embeddings with enriched metadata, including title, summary, creation/update dates, and author information.
  - Introduced a new type for document fragments with extended metadata fields.

- **Improvements**
  - Embedding logic now conditionally processes only documents needing updates.
  - Embedding content now includes document metadata for more informative context.
  - Expanded and improved test coverage for embedding scenarios and workspace behaviors.
  - Event emission added for workspace embedding updates on client version mismatch.
  - Job queueing enhanced with prioritization and explicit job IDs for better management.
  - Job queue calls updated to include priority and context identifiers in a structured format.

- **Bug Fixes**
  - Improved handling of ignored documents in embedding matches.
  - Fixed incorrect document ID assignment in embedding job queueing.

- **Tests**
  - Added and updated snapshot and behavioral tests for embedding and workspace document handling.
<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
darkskygit
2025-05-23 10:16:14 +00:00
parent 262f1a47a4
commit 2a80fbb993
9 changed files with 326 additions and 54 deletions

View File

@@ -16,6 +16,7 @@ import { Models } from '../../../models';
import { CopilotStorage } from '../storage';
import { readStream } from '../utils';
import { OpenAIEmbeddingClient } from './embedding';
import type { Chunk, DocFragment } from './types';
import { EMBEDDING_DIMENSIONS, EmbeddingClient } from './types';
@Injectable()
@@ -78,16 +79,23 @@ export class CopilotContextDocJob {
@OnEvent('workspace.doc.embedding')
async addDocEmbeddingQueue(
docs: Events['workspace.doc.embedding'],
contextId?: string
options?: { contextId: string; priority: number }
) {
if (!this.supportEmbedding) return;
for (const { workspaceId, docId } of docs) {
await this.queue.add('copilot.embedding.docs', {
contextId,
workspaceId,
docId,
});
await this.queue.add(
'copilot.embedding.docs',
{
contextId: options?.contextId,
workspaceId,
docId,
},
{
jobId: `workspace:embedding:${workspaceId}:${docId}`,
priority: options?.priority ?? 1,
}
);
}
}
@@ -110,14 +118,26 @@ export class CopilotContextDocJob {
}: Events['workspace.embedding']) {
if (!this.supportEmbedding || !this.embeddingClient) return;
if (enableDocEmbedding === undefined) {
enableDocEmbedding =
await this.models.workspace.allowEmbedding(workspaceId);
}
if (enableDocEmbedding) {
const toBeEmbedDocIds =
await this.models.copilotWorkspace.findDocsToEmbed(workspaceId);
for (const docId of toBeEmbedDocIds) {
await this.queue.add('copilot.embedding.docs', {
workspaceId,
docId,
});
await this.queue.add(
'copilot.embedding.docs',
{
workspaceId,
docId,
},
{
jobId: `workspace:embedding:${workspaceId}:${docId}`,
priority: 1,
}
);
}
} else {
const controller = this.workspaceJobAbortController.get(workspaceId);
@@ -132,14 +152,25 @@ export class CopilotContextDocJob {
async addDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.updated']) {
if (!this.supportEmbedding || !this.embeddingClient) return;
await this.queue.add('copilot.embedding.docs', {
workspaceId: doc.workspaceId,
docId: doc.workspaceId,
});
await this.queue.add(
'copilot.embedding.docs',
{
workspaceId: doc.workspaceId,
docId: doc.docId,
},
{
jobId: `workspace:embedding:${doc.workspaceId}:${doc.docId}`,
priority: 2,
}
);
}
@OnEvent('doc.indexer.deleted')
async deleteDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.deleted']) {
await this.queue.remove(
`workspace:embedding:${doc.workspaceId}:${doc.docId}`,
'copilot.embedding.docs'
);
await this.models.copilotContext.deleteWorkspaceEmbedding(
doc.workspaceId,
doc.docId
@@ -221,6 +252,43 @@ export class CopilotContextDocJob {
}
}
/**
 * Builds the metadata-enriched fragment used when embedding a workspace doc.
 *
 * Combines the content returned by `this.doc.getFullDocContent` with the
 * author record returned by `this.models.doc.getAuthors`.
 *
 * @param workspaceId - workspace that owns the doc
 * @param docId - doc to load
 * @returns the fragment, or `null` when either the doc content or the author
 *   record is missing. A doc whose summary is empty still yields a fragment
 *   (with an empty `summary`) so the caller can tell an empty doc apart from
 *   a missing one and store a placeholder embedding for it.
 */
private async getDocFragment(
  workspaceId: string,
  docId: string
): Promise<DocFragment | null> {
  const docContent = await this.doc.getFullDocContent(workspaceId, docId);
  const authors = await this.models.doc.getAuthors(workspaceId, docId);

  // Bail out only when the doc itself cannot be resolved. Gating on a truthy
  // summary here would turn every empty doc into "not found" and make the
  // caller's empty-doc handling unreachable.
  if (!docContent || !authors) {
    return null;
  }

  const { createdAt, updatedAt, createdByUser, updatedByUser } = authors;
  return {
    // `||` instead of a destructuring default: an empty-string title should
    // fall back to 'Untitled' as well, not only an undefined one.
    title: docContent.title || 'Untitled',
    summary: docContent.summary,
    createdAt: createdAt.toDateString(),
    updatedAt: updatedAt.toDateString(),
    createdBy: createdByUser?.name,
    updatedBy: updatedByUser?.name,
  };
}
/**
 * Prepends the doc's metadata header to the text of every chunk.
 *
 * The header always carries the title and the created/updated dates; the
 * "Created by" / "Updated by" lines are added only when the corresponding
 * name is present. Lines are joined with `\n`, and a chunk with empty content
 * contributes no trailing line.
 *
 * @param chunks - chunks to decorate; only `index` and `content` are carried over
 * @param fragment - doc metadata used to build the header
 * @returns new chunk objects with the header-prefixed content
 */
private formatDocChunks(chunks: Chunk[], fragment: DocFragment): Chunk[] {
  const { title, createdAt, updatedAt, createdBy, updatedBy } = fragment;

  // The header is identical for every chunk, so build it once.
  const header = [
    `Title: ${title}`,
    `Created at: ${createdAt}`,
    `Updated at: ${updatedAt}`,
  ];
  if (createdBy) {
    header.push(`Created by: ${createdBy}`);
  }
  if (updatedBy) {
    header.push(`Updated by: ${updatedBy}`);
  }

  const formatted: Chunk[] = [];
  for (const { index, content } of chunks) {
    // Skip the body line entirely when the chunk has no text.
    const lines = content ? [...header, content] : header;
    formatted.push({ index, content: lines.join('\n') });
  }
  return formatted;
}
private getWorkspaceSignal(workspaceId: string) {
let controller = this.workspaceJobAbortController.get(workspaceId);
if (!controller) {
@@ -241,39 +309,49 @@ export class CopilotContextDocJob {
const signal = this.getWorkspaceSignal(workspaceId);
try {
const content = await this.doc.getFullDocContent(workspaceId, docId);
if (signal.aborted) {
return;
} else if (content) {
// fast path for empty docs; a journal can easily create an empty doc
if (content.summary) {
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File([content.summary], `${content.title || 'Untitled'}.md`),
signal
);
const needEmbedding =
await this.models.copilotWorkspace.checkDocNeedEmbedded(
workspaceId,
docId
);
if (needEmbedding) {
if (signal.aborted) return;
const fragment = await this.getDocFragment(workspaceId, docId);
if (fragment) {
// fast path for empty docs; a journal can easily create an empty doc
if (fragment.summary) {
const embeddings = await this.embeddingClient.getFileEmbeddings(
new File(
[fragment.summary],
`${fragment.title || 'Untitled'}.md`
),
chunks => this.formatDocChunks(chunks, fragment),
signal
);
for (const chunks of embeddings) {
for (const chunks of embeddings) {
await this.models.copilotContext.insertWorkspaceEmbedding(
workspaceId,
docId,
chunks
);
}
} else {
// for empty doc, insert empty embedding
const emptyEmbedding = {
index: 0,
content: '',
embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0),
};
await this.models.copilotContext.insertWorkspaceEmbedding(
workspaceId,
docId,
chunks
[emptyEmbedding]
);
}
} else {
// for empty doc, insert empty embedding
const emptyEmbedding = {
index: 0,
content: '',
embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0),
};
await this.models.copilotContext.insertWorkspaceEmbedding(
workspaceId,
docId,
[emptyEmbedding]
);
} else if (contextId) {
throw new DocNotFound({ spaceId: workspaceId, docId });
}
} else if (contextId) {
throw new DocNotFound({ spaceId: workspaceId, docId });
}
} catch (error: any) {
if (contextId) {

View File

@@ -498,7 +498,7 @@ export class CopilotContextResolver {
workspaceId: session.workspaceId,
docId,
})),
session.id
{ contextId: session.id, priority: 0 }
);
}
@@ -559,7 +559,7 @@ export class CopilotContextResolver {
await this.jobs.addDocEmbeddingQueue(
[{ workspaceId: session.workspaceId, docId: options.docId }],
session.id
{ contextId: session.id, priority: 0 }
);
return { ...record, status: record.status || null };

View File

@@ -3,6 +3,7 @@ import { File } from 'node:buffer';
import { z } from 'zod';
import { CopilotContextFileNotSupported } from '../../../base';
import type { PageDocContent } from '../../../core/utils/blocksuite';
import { ChunkSimilarity, Embedding } from '../../../models';
import { parseDoc } from '../../../native';
@@ -10,7 +11,7 @@ declare global {
interface Events {
'workspace.embedding': {
workspaceId: string;
enableDocEmbedding: boolean;
enableDocEmbedding?: boolean;
};
'workspace.doc.embedding': Array<{
@@ -53,6 +54,13 @@ declare global {
}
}
/**
 * Doc content (`PageDocContent`) extended with authorship metadata, used to
 * enrich the text that gets embedded for a workspace doc.
 */
export type DocFragment = PageDocContent & {
/** Creation date as a display string (the embedding job passes `Date#toDateString()` output). */
createdAt: string;
/** Name of the user who created the doc, when known. */
createdBy?: string;
/** Last-update date as a display string (the embedding job passes `Date#toDateString()` output). */
updatedAt: string;
/** Name of the user who last updated the doc, when known. */
updatedBy?: string;
};
export type Chunk = {
index: number;
content: string;
@@ -63,11 +71,12 @@ export const EMBEDDING_DIMENSIONS = 1024;
export abstract class EmbeddingClient {
async getFileEmbeddings(
file: File,
chunkMapper: (chunk: Chunk[]) => Chunk[],
signal?: AbortSignal
): Promise<Embedding[][]> {
const chunks = await this.getFileChunks(file, signal);
const chunkedEmbeddings = await Promise.all(
chunks.map(chunk => this.generateEmbeddings(chunk))
chunks.map(chunk => this.generateEmbeddings(chunkMapper(chunk)))
);
return chunkedEmbeddings;
}