mirror of
https://github.com/toeverything/AFFiNE.git
feat(server): workspace embedding improve (#12022)
fix AI-10 fix AI-109 fix PD-2484

## Summary by CodeRabbit

- **New Features**
  - Added a method to check if a document requires embedding, improving embedding efficiency.
  - Enhanced document embeddings with enriched metadata, including title, summary, creation/update dates, and author information.
  - Introduced a new type for document fragments with extended metadata fields.
- **Improvements**
  - Embedding logic now conditionally processes only documents needing updates.
  - Embedding content now includes document metadata for more informative context.
  - Expanded and improved test coverage for embedding scenarios and workspace behaviors.
  - Event emission added for workspace embedding updates on client version mismatch.
  - Job queueing enhanced with prioritization and explicit job IDs for better management.
  - Job queue calls updated to include priority and context identifiers in a structured format.
- **Bug Fixes**
  - Improved handling of ignored documents in embedding matches.
  - Fixed incorrect document ID assignment in embedding job queueing.
- **Tests**
  - Added and updated snapshot and behavioral tests for embedding and workspace document handling.
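The queueing changes below hinge on two enqueue options: a deterministic job ID (`workspace:embedding:<workspaceId>:<docId>`) so repeated events for the same doc collapse into a single pending job, and a numeric priority (0 for context attaches, 1 for workspace backfill, 2 for indexer updates). The sketch below is illustrative only: a toy in-memory queue rather than the server's real queue, assuming lower numbers are served first (as with BullMQ-style priorities) and replacing, rather than ignoring, a duplicate job ID.

```ts
// Toy illustration only — all names here are invented, not part of this commit.
type EmbeddingJobPayload = { contextId?: string; workspaceId: string; docId: string };
type EmbeddingJob = { jobId: string; priority: number; payload: EmbeddingJobPayload };

class ToyEmbeddingQueue {
  private jobs = new Map<string, EmbeddingJob>();

  add(payload: EmbeddingJobPayload, opts: { jobId: string; priority: number }) {
    // Same jobId: the pending entry is replaced instead of being duplicated.
    this.jobs.set(opts.jobId, { ...opts, payload });
  }

  remove(jobId: string) {
    this.jobs.delete(jobId);
  }

  next(): EmbeddingJob | undefined {
    // Lower number = served earlier (0 context attach, 1 backfill, 2 indexer update).
    const job = [...this.jobs.values()].sort((a, b) => a.priority - b.priority)[0];
    if (job) this.jobs.delete(job.jobId);
    return job;
  }
}

const queue = new ToyEmbeddingQueue();
queue.add(
  { workspaceId: 'ws1', docId: 'doc1' },
  { jobId: 'workspace:embedding:ws1:doc1', priority: 2 } // from doc.indexer.updated
);
queue.add(
  { contextId: 'ctx1', workspaceId: 'ws1', docId: 'doc1' },
  { jobId: 'workspace:embedding:ws1:doc1', priority: 0 } // same doc attached to a context
);
console.log(queue.next()?.priority); // 0 — a single job for ws1/doc1, at the higher priority
```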
@@ -16,6 +16,7 @@ import { Models } from '../../../models';
 import { CopilotStorage } from '../storage';
 import { readStream } from '../utils';
 import { OpenAIEmbeddingClient } from './embedding';
+import type { Chunk, DocFragment } from './types';
 import { EMBEDDING_DIMENSIONS, EmbeddingClient } from './types';
 
 @Injectable()
@@ -78,16 +79,23 @@ export class CopilotContextDocJob {
   @OnEvent('workspace.doc.embedding')
   async addDocEmbeddingQueue(
     docs: Events['workspace.doc.embedding'],
-    contextId?: string
+    options?: { contextId: string; priority: number }
   ) {
     if (!this.supportEmbedding) return;
 
     for (const { workspaceId, docId } of docs) {
-      await this.queue.add('copilot.embedding.docs', {
-        contextId,
-        workspaceId,
-        docId,
-      });
+      await this.queue.add(
+        'copilot.embedding.docs',
+        {
+          contextId: options?.contextId,
+          workspaceId,
+          docId,
+        },
+        {
+          jobId: `workspace:embedding:${workspaceId}:${docId}`,
+          priority: options?.priority ?? 1,
+        }
+      );
     }
   }
 
@@ -110,14 +118,26 @@ export class CopilotContextDocJob {
   }: Events['workspace.embedding']) {
     if (!this.supportEmbedding || !this.embeddingClient) return;
 
+    if (enableDocEmbedding === undefined) {
+      enableDocEmbedding =
+        await this.models.workspace.allowEmbedding(workspaceId);
+    }
+
     if (enableDocEmbedding) {
       const toBeEmbedDocIds =
         await this.models.copilotWorkspace.findDocsToEmbed(workspaceId);
       for (const docId of toBeEmbedDocIds) {
-        await this.queue.add('copilot.embedding.docs', {
-          workspaceId,
-          docId,
-        });
+        await this.queue.add(
+          'copilot.embedding.docs',
+          {
+            workspaceId,
+            docId,
+          },
+          {
+            jobId: `workspace:embedding:${workspaceId}:${docId}`,
+            priority: 1,
+          }
+        );
       }
     } else {
       const controller = this.workspaceJobAbortController.get(workspaceId);
@@ -132,14 +152,25 @@ export class CopilotContextDocJob {
   async addDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.updated']) {
     if (!this.supportEmbedding || !this.embeddingClient) return;
 
-    await this.queue.add('copilot.embedding.docs', {
-      workspaceId: doc.workspaceId,
-      docId: doc.workspaceId,
-    });
+    await this.queue.add(
+      'copilot.embedding.docs',
+      {
+        workspaceId: doc.workspaceId,
+        docId: doc.docId,
+      },
+      {
+        jobId: `workspace:embedding:${doc.workspaceId}:${doc.docId}`,
+        priority: 2,
+      }
+    );
   }
 
   @OnEvent('doc.indexer.deleted')
   async deleteDocEmbeddingQueueFromEvent(doc: Events['doc.indexer.deleted']) {
+    await this.queue.remove(
+      `workspace:embedding:${doc.workspaceId}:${doc.docId}`,
+      'copilot.embedding.docs'
+    );
     await this.models.copilotContext.deleteWorkspaceEmbedding(
       doc.workspaceId,
       doc.docId
@@ -221,6 +252,43 @@ export class CopilotContextDocJob {
     }
   }
 
+  private async getDocFragment(
+    workspaceId: string,
+    docId: string
+  ): Promise<DocFragment | null> {
+    const docContent = await this.doc.getFullDocContent(workspaceId, docId);
+    const authors = await this.models.doc.getAuthors(workspaceId, docId);
+    if (docContent?.summary && authors) {
+      const { title = 'Untitled', summary } = docContent;
+      const { createdAt, updatedAt, createdByUser, updatedByUser } = authors;
+      return {
+        title,
+        summary,
+        createdAt: createdAt.toDateString(),
+        updatedAt: updatedAt.toDateString(),
+        createdBy: createdByUser?.name,
+        updatedBy: updatedByUser?.name,
+      };
+    }
+    return null;
+  }
+
+  private formatDocChunks(chunks: Chunk[], fragment: DocFragment): Chunk[] {
+    return chunks.map(chunk => ({
+      index: chunk.index,
+      content: [
+        `Title: ${fragment.title}`,
+        `Created at: ${fragment.createdAt}`,
+        `Updated at: ${fragment.updatedAt}`,
+        fragment.createdBy ? `Created by: ${fragment.createdBy}` : undefined,
+        fragment.updatedBy ? `Updated by: ${fragment.updatedBy}` : undefined,
+        chunk.content,
+      ]
+        .filter(Boolean)
+        .join('\n'),
+    }));
+  }
+
   private getWorkspaceSignal(workspaceId: string) {
     let controller = this.workspaceJobAbortController.get(workspaceId);
     if (!controller) {
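For reference, a standalone repro of the chunk formatting introduced above — the fragment and chunk values are invented — showing the exact text that ends up being embedded:

```ts
// Illustrative values only; the formatting rule mirrors formatDocChunks above.
const fragment = {
  title: 'Q3 Planning',
  createdAt: 'Mon Apr 07 2025',
  updatedAt: 'Tue Apr 08 2025',
  createdBy: 'Alice',
  updatedBy: undefined as string | undefined,
};
const chunk = { index: 0, content: 'We agreed to ship the embedding rework in Q3.' };

const embeddedText = [
  `Title: ${fragment.title}`,
  `Created at: ${fragment.createdAt}`,
  `Updated at: ${fragment.updatedAt}`,
  fragment.createdBy ? `Created by: ${fragment.createdBy}` : undefined,
  fragment.updatedBy ? `Updated by: ${fragment.updatedBy}` : undefined,
  chunk.content,
]
  .filter(Boolean)
  .join('\n');

console.log(embeddedText);
// Title: Q3 Planning
// Created at: Mon Apr 07 2025
// Updated at: Tue Apr 08 2025
// Created by: Alice
// We agreed to ship the embedding rework in Q3.
```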
@@ -241,39 +309,49 @@ export class CopilotContextDocJob {
     const signal = this.getWorkspaceSignal(workspaceId);
 
     try {
-      const content = await this.doc.getFullDocContent(workspaceId, docId);
-      if (signal.aborted) {
-        return;
-      } else if (content) {
-        // fast fall for empty doc, journal is easily to create a empty doc
-        if (content.summary) {
-          const embeddings = await this.embeddingClient.getFileEmbeddings(
-            new File([content.summary], `${content.title || 'Untitled'}.md`),
-            signal
-          );
+      const needEmbedding =
+        await this.models.copilotWorkspace.checkDocNeedEmbedded(
+          workspaceId,
+          docId
+        );
+      if (needEmbedding) {
+        if (signal.aborted) return;
+        const fragment = await this.getDocFragment(workspaceId, docId);
+        if (fragment) {
+          // fast fall for empty doc, journal is easily to create a empty doc
+          if (fragment.summary) {
+            const embeddings = await this.embeddingClient.getFileEmbeddings(
+              new File(
+                [fragment.summary],
+                `${fragment.title || 'Untitled'}.md`
+              ),
+              chunks => this.formatDocChunks(chunks, fragment),
+              signal
+            );
 
-          for (const chunks of embeddings) {
+            for (const chunks of embeddings) {
+              await this.models.copilotContext.insertWorkspaceEmbedding(
+                workspaceId,
+                docId,
+                chunks
+              );
+            }
+          } else {
+            // for empty doc, insert empty embedding
+            const emptyEmbedding = {
+              index: 0,
+              content: '',
+              embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0),
+            };
             await this.models.copilotContext.insertWorkspaceEmbedding(
               workspaceId,
               docId,
-              chunks
+              [emptyEmbedding]
             );
           }
-        } else {
-          // for empty doc, insert empty embedding
-          const emptyEmbedding = {
-            index: 0,
-            content: '',
-            embedding: Array.from({ length: EMBEDDING_DIMENSIONS }, () => 0),
-          };
-          await this.models.copilotContext.insertWorkspaceEmbedding(
-            workspaceId,
-            docId,
-            [emptyEmbedding]
-          );
+        } else if (contextId) {
+          throw new DocNotFound({ spaceId: workspaceId, docId });
         }
-      } else if (contextId) {
-        throw new DocNotFound({ spaceId: workspaceId, docId });
       }
     } catch (error: any) {
       if (contextId) {
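Taken together, the per-doc job above now skips documents whose embeddings are already current, embeds the metadata-prefixed summary when one exists, and stores a single zero vector for empty docs. Below is a compressed, self-contained outline of that control flow; every dependency is a stand-in and only the flow mirrors the diff:

```ts
// Stand-in types and dependencies — only the control flow mirrors the diff above.
const EMBEDDING_DIMENSIONS = 1024;

type Fragment = { title: string; summary: string };
type EmbeddingRow = { index: number; content: string; embedding: number[] };

interface Deps {
  checkDocNeedEmbedded(ws: string, doc: string): Promise<boolean>;
  getDocFragment(ws: string, doc: string): Promise<Fragment | null>;
  embedSummary(fragment: Fragment): Promise<EmbeddingRow[]>;
  insertEmbedding(ws: string, doc: string, rows: EmbeddingRow[]): Promise<void>;
}

async function embedDoc(
  deps: Deps,
  workspaceId: string,
  docId: string,
  contextId?: string
): Promise<void> {
  // 1. Docs whose embeddings are already up to date are skipped entirely.
  if (!(await deps.checkDocNeedEmbedded(workspaceId, docId))) return;

  const fragment = await deps.getDocFragment(workspaceId, docId);
  if (!fragment) {
    // Only context-triggered jobs surface a missing doc as an error.
    if (contextId) throw new Error(`doc not found: ${workspaceId}/${docId}`);
    return;
  }

  if (fragment.summary) {
    // 2. Embed the summary; each chunk carries the metadata header.
    await deps.insertEmbedding(workspaceId, docId, await deps.embedSummary(fragment));
  } else {
    // 3. Empty docs (journals are often created empty) get a single zero vector.
    await deps.insertEmbedding(workspaceId, docId, [
      { index: 0, content: '', embedding: new Array<number>(EMBEDDING_DIMENSIONS).fill(0) },
    ]);
  }
}
```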
@@ -498,7 +498,7 @@ export class CopilotContextResolver {
         workspaceId: session.workspaceId,
         docId,
       })),
-      session.id
+      { contextId: session.id, priority: 0 }
     );
   }
 
@@ -559,7 +559,7 @@ export class CopilotContextResolver {
 
     await this.jobs.addDocEmbeddingQueue(
       [{ workspaceId: session.workspaceId, docId: options.docId }],
-      session.id
+      { contextId: session.id, priority: 0 }
     );
 
     return { ...record, status: record.status || null };
@@ -3,6 +3,7 @@ import { File } from 'node:buffer';
 import { z } from 'zod';
 
 import { CopilotContextFileNotSupported } from '../../../base';
+import type { PageDocContent } from '../../../core/utils/blocksuite';
 import { ChunkSimilarity, Embedding } from '../../../models';
 import { parseDoc } from '../../../native';
 
@@ -10,7 +11,7 @@ declare global {
   interface Events {
     'workspace.embedding': {
       workspaceId: string;
-      enableDocEmbedding: boolean;
+      enableDocEmbedding?: boolean;
     };
 
     'workspace.doc.embedding': Array<{
@@ -53,6 +54,13 @@ declare global {
   }
 }
 
+export type DocFragment = PageDocContent & {
+  createdAt: string;
+  createdBy?: string;
+  updatedAt: string;
+  updatedBy?: string;
+};
+
 export type Chunk = {
   index: number;
   content: string;
@@ -63,11 +71,12 @@ export const EMBEDDING_DIMENSIONS = 1024;
 export abstract class EmbeddingClient {
   async getFileEmbeddings(
     file: File,
+    chunkMapper: (chunk: Chunk[]) => Chunk[],
     signal?: AbortSignal
   ): Promise<Embedding[][]> {
     const chunks = await this.getFileChunks(file, signal);
     const chunkedEmbeddings = await Promise.all(
-      chunks.map(chunk => this.generateEmbeddings(chunk))
+      chunks.map(chunk => this.generateEmbeddings(chunkMapper(chunk)))
     );
     return chunkedEmbeddings;
   }
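To see where the new `chunkMapper` hook runs, here is an invented minimal client; `FakeEmbeddingClient`, its one-group chunking and its placeholder vectors are all assumptions for illustration, and only the `getFileEmbeddings` shape comes from the diff above:

```ts
import { File } from 'node:buffer';

// Local stand-ins for the server's Chunk and Embedding types.
type Chunk = { index: number; content: string };
type Embedding = { index: number; content: string; embedding: number[] };

class FakeEmbeddingClient {
  async getFileEmbeddings(
    file: File,
    chunkMapper: (chunk: Chunk[]) => Chunk[],
    signal?: AbortSignal
  ): Promise<Embedding[][]> {
    const chunks = await this.getFileChunks(file, signal);
    // The mapper rewrites each chunk group right before vectors are generated.
    return Promise.all(
      chunks.map(chunk => this.generateEmbeddings(chunkMapper(chunk)))
    );
  }

  // One chunk group containing the whole file, for simplicity.
  protected async getFileChunks(file: File, _signal?: AbortSignal): Promise<Chunk[][]> {
    return [[{ index: 0, content: await file.text() }]];
  }

  // Placeholder vectors so the example stays self-contained.
  protected async generateEmbeddings(chunks: Chunk[]): Promise<Embedding[]> {
    return chunks.map(c => ({ ...c, embedding: [c.content.length] }));
  }
}

// The doc job passes a mapper that prepends doc metadata before embedding.
const client = new FakeEmbeddingClient();
await client.getFileEmbeddings(
  new File(['body text'], 'Untitled.md'),
  chunks => chunks.map(c => ({ ...c, content: `Title: Untitled\n${c.content}` }))
);
```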