mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 21:05:19 +00:00
feat(server): improve context metadata & matching (#12064)
fix AI-20

<!-- This is an auto-generated comment: release notes by coderabbit.ai -->

## Summary by CodeRabbit

- **New Features**
  - Enhanced file metadata with MIME type, blob ID, and file name across context and workspace, now visible in UI and API.
  - Added workspace-level matching for files and documents with configurable thresholds and workspace scoping in search queries.
  - Introduced a new error type and user-friendly messaging for global workspace context matching failures.
- **Bug Fixes**
  - Improved consistent handling of file MIME types and nullable context IDs for accurate metadata.
- **Documentation**
  - Updated GraphQL schema, queries, and mutations to include new metadata fields, optional parameters, and error types.
- **Style**
  - Added new localization strings for global context matching error messages.
- **Tests**
  - Extended test coverage with new and updated snapshot tests for metadata and matching logic.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
@@ -23,6 +23,7 @@ import {
|
||||
CallMetric,
|
||||
CopilotEmbeddingUnavailable,
|
||||
CopilotFailedToMatchContext,
|
||||
CopilotFailedToMatchGlobalContext,
|
||||
CopilotFailedToModifyContext,
|
||||
CopilotSessionNotFound,
|
||||
EventBus,
|
||||
@@ -117,8 +118,8 @@ class RemoveContextFileInput {
|
||||
|
||||
@ObjectType('CopilotContext')
|
||||
export class CopilotContextType {
|
||||
@Field(() => ID)
|
||||
id!: string;
|
||||
@Field(() => ID, { nullable: true })
|
||||
id!: string | undefined;
|
||||
|
||||
@Field(() => String)
|
||||
workspaceId!: string;
|
||||
@@ -169,6 +170,9 @@ class CopilotContextFile implements ContextFile {
|
||||
@Field(() => String)
|
||||
name!: string;
|
||||
|
||||
@Field(() => String)
|
||||
mimeType!: string;
|
||||
|
||||
@Field(() => SafeIntResolver)
|
||||
chunkSize!: number;
|
||||
|
||||
@@ -190,6 +194,15 @@ class ContextMatchedFileChunk implements FileChunkSimilarity {
|
||||
@Field(() => String)
|
||||
fileId!: string;
|
||||
|
||||
@Field(() => String)
|
||||
blobId!: string;
|
||||
|
||||
@Field(() => String)
|
||||
name!: string;
|
||||
|
||||
@Field(() => String)
|
||||
mimeType!: string;
|
||||
|
||||
@Field(() => SafeIntResolver)
|
||||
chunk!: number;
|
||||
|
||||
@@ -283,6 +296,15 @@ export class CopilotContextRootResolver {
|
||||
}
|
||||
}
|
||||
|
||||
if (copilot.workspaceId) {
|
||||
return [
|
||||
{
|
||||
id: undefined,
|
||||
workspaceId: copilot.workspaceId,
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
@@ -387,6 +409,9 @@ export class CopilotContextResolver {
|
||||
async collections(
|
||||
@Parent() context: CopilotContextType
|
||||
): Promise<CopilotContextCategory[]> {
|
||||
if (!context.id) {
|
||||
return [];
|
||||
}
|
||||
const session = await this.context.get(context.id);
|
||||
const collections = session.collections;
|
||||
await this.models.copilotContext.mergeDocStatus(
|
||||
@@ -404,6 +429,9 @@ export class CopilotContextResolver {
|
||||
async tags(
|
||||
@Parent() context: CopilotContextType
|
||||
): Promise<CopilotContextCategory[]> {
|
||||
if (!context.id) {
|
||||
return [];
|
||||
}
|
||||
const session = await this.context.get(context.id);
|
||||
const tags = session.tags;
|
||||
await this.models.copilotContext.mergeDocStatus(
|
||||
@@ -419,6 +447,9 @@ export class CopilotContextResolver {
|
||||
})
|
||||
@CallMetric('ai', 'context_file_list')
|
||||
async docs(@Parent() context: CopilotContextType): Promise<CopilotDocType[]> {
|
||||
if (!context.id) {
|
||||
return [];
|
||||
}
|
||||
const session = await this.context.get(context.id);
|
||||
const docs = session.docs;
|
||||
await this.models.copilotContext.mergeDocStatus(session.workspaceId, docs);
|
||||
@@ -433,6 +464,9 @@ export class CopilotContextResolver {
|
||||
async files(
|
||||
@Parent() context: CopilotContextType
|
||||
): Promise<CopilotContextFile[]> {
|
||||
if (!context.id) {
|
||||
return [];
|
||||
}
|
||||
const session = await this.context.get(context.id);
|
||||
return session.files;
|
||||
}
|
||||
@@ -593,7 +627,11 @@ export class CopilotContextResolver {
|
||||
const session = await this.context.get(options.contextId);
|
||||
|
||||
try {
|
||||
const file = await session.addFile(options.blobId, content.filename);
|
||||
const file = await session.addFile(
|
||||
options.blobId,
|
||||
content.filename,
|
||||
content.mimetype
|
||||
);
|
||||
|
||||
const buffer = await readStream(content.createReadStream());
|
||||
await this.storage.put(
|
||||
@@ -664,6 +702,8 @@ export class CopilotContextResolver {
|
||||
@Args('content') content: string,
|
||||
@Args('limit', { type: () => SafeIntResolver, nullable: true })
|
||||
limit?: number,
|
||||
@Args('scopedThreshold', { type: () => Float, nullable: true })
|
||||
scopedThreshold?: number,
|
||||
@Args('threshold', { type: () => Float, nullable: true })
|
||||
threshold?: number
|
||||
): Promise<ContextMatchedFileChunk[]> {
|
||||
@@ -671,22 +711,46 @@ export class CopilotContextResolver {
|
||||
return [];
|
||||
}
|
||||
|
||||
const session = await this.context.get(context.id);
|
||||
|
||||
try {
|
||||
return await session.matchFileChunks(
|
||||
if (!context.id) {
|
||||
return await this.context.matchWorkspaceFiles(
|
||||
context.workspaceId,
|
||||
content,
|
||||
limit,
|
||||
this.getSignal(ctx.req),
|
||||
threshold
|
||||
);
|
||||
}
|
||||
|
||||
const session = await this.context.get(context.id);
|
||||
return await session.matchFiles(
|
||||
content,
|
||||
limit,
|
||||
this.getSignal(ctx.req),
|
||||
scopedThreshold,
|
||||
threshold
|
||||
);
|
||||
} catch (e: any) {
|
||||
throw new CopilotFailedToMatchContext({
|
||||
contextId: context.id,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
// passthrough user friendly error
|
||||
if (e instanceof UserFriendlyError) {
|
||||
throw e;
|
||||
}
|
||||
|
||||
if (context.id) {
|
||||
throw new CopilotFailedToMatchContext({
|
||||
contextId: context.id,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
} else {
|
||||
throw new CopilotFailedToMatchGlobalContext({
|
||||
workspaceId: context.workspaceId,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -711,20 +775,38 @@ export class CopilotContextResolver {
|
||||
}
|
||||
|
||||
try {
|
||||
const session = await this.context.get(context.id);
|
||||
await this.ac
|
||||
.user(user.id)
|
||||
.workspace(session.workspaceId)
|
||||
.workspace(context.workspaceId)
|
||||
.allowLocal()
|
||||
.assert('Workspace.Copilot');
|
||||
const allowEmbedding = await this.models.workspace.allowEmbedding(
|
||||
session.workspaceId
|
||||
context.workspaceId
|
||||
);
|
||||
if (!allowEmbedding) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const chunks = await session.matchWorkspaceChunks(
|
||||
if (!context.id) {
|
||||
return await this.context.matchWorkspaceDocs(
|
||||
context.workspaceId,
|
||||
content,
|
||||
limit,
|
||||
this.getSignal(ctx.req),
|
||||
threshold
|
||||
);
|
||||
}
|
||||
|
||||
const session = await this.context.get(context.id);
|
||||
if (session.workspaceId !== context.workspaceId) {
|
||||
throw new CopilotFailedToMatchContext({
|
||||
contextId: context.id,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: 'context not in the same workspace',
|
||||
});
|
||||
}
|
||||
const chunks = await session.matchWorkspaceDocs(
|
||||
content,
|
||||
limit,
|
||||
this.getSignal(ctx.req),
|
||||
@@ -748,12 +830,22 @@ export class CopilotContextResolver {
|
||||
if (e instanceof UserFriendlyError) {
|
||||
throw e;
|
||||
}
|
||||
throw new CopilotFailedToMatchContext({
|
||||
contextId: context.id,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
|
||||
if (context.id) {
|
||||
throw new CopilotFailedToMatchContext({
|
||||
contextId: context.id,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
} else {
|
||||
throw new CopilotFailedToMatchGlobalContext({
|
||||
workspaceId: context.workspaceId,
|
||||
// don't record the large content
|
||||
content: content.slice(0, 512),
|
||||
message: e.message,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,6 +148,48 @@ export class CopilotContextService implements OnApplicationBootstrap {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
 * Match workspace-level file embeddings against `content`, without going
 * through a context session.
 *
 * @param workspaceId workspace whose file embeddings are searched
 * @param content query text; it is embedded and also handed to the re-ranker
 * @param topK number of results requested from the re-ranker, default 5
 * @param signal optional abort signal, forwarded to the embedding request and the re-rank call
 * @param threshold value forwarded to `matchFileEmbedding`, default 0.5
 * @returns the re-ranked matches, or an empty array when there is no embedding
 *          client or no embedding could be produced for `content`
 */
async matchWorkspaceFiles(
  workspaceId: string,
  content: string,
  topK: number = 5,
  signal?: AbortSignal,
  threshold: number = 0.5
) {
  if (!this.embeddingClient) {
    return [];
  }

  const queryVector = await this.embeddingClient.getEmbedding(content, signal);
  if (!queryVector) {
    return [];
  }

  // ask the store for twice as many candidates as the caller wants back;
  // the re-ranker is then given `topK`
  const candidateCount = topK * 2;
  const candidates = await this.models.copilotWorkspace.matchFileEmbedding(
    workspaceId,
    queryVector,
    candidateCount,
    threshold
  );

  return this.embeddingClient.reRank(content, candidates, topK, signal);
}
|
||||
|
||||
/**
 * Match workspace-level doc embeddings against `content`, without going
 * through a context session.
 *
 * @param workspaceId workspace whose doc embeddings are searched
 * @param content query text; it is embedded and also handed to the re-ranker
 * @param topK number of results requested from the re-ranker, default 5
 * @param signal optional abort signal, forwarded to the embedding request and the re-rank call
 * @param threshold value forwarded to `matchWorkspaceEmbedding`, default 0.5
 * @returns the re-ranked matches, or an empty array when there is no embedding
 *          client or no embedding could be produced for `content`
 */
async matchWorkspaceDocs(
  workspaceId: string,
  content: string,
  topK: number = 5,
  signal?: AbortSignal,
  threshold: number = 0.5
) {
  if (!this.embeddingClient) return [];
  const embedding = await this.embeddingClient.getEmbedding(content, signal);
  if (!embedding) return [];

  // ask the store for twice as many candidates as the caller wants back;
  // the re-ranker is then given `topK`
  const workspace = await this.models.copilotContext.matchWorkspaceEmbedding(
    embedding,
    workspaceId,
    topK * 2,
    threshold
  );

  // forward the abort signal so a cancelled request also stops the re-rank
  // step, consistent with matchWorkspaceFiles
  return this.embeddingClient.reRank(content, workspace, topK, signal);
}
|
||||
|
||||
@OnEvent('workspace.doc.embed.failed')
|
||||
async onDocEmbedFailed({
|
||||
contextId,
|
||||
|
||||
@@ -52,12 +52,16 @@ export class ContextSession implements AsyncDisposable {
|
||||
}
|
||||
|
||||
get files() {
|
||||
return this.config.files.map(f => ({ ...f }));
|
||||
return this.config.files.map(f => this.fulfillFile(f));
|
||||
}
|
||||
|
||||
get docIds() {
|
||||
return Array.from(
|
||||
new Set([this.config.docs, this.config.categories].flat().map(d => d.id))
|
||||
new Set(
|
||||
[this.config.docs, this.config.categories.flatMap(c => c.docs)]
|
||||
.flat()
|
||||
.map(d => d.id)
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -136,14 +140,25 @@ export class ContextSession implements AsyncDisposable {
|
||||
return true;
|
||||
}
|
||||
|
||||
async addFile(blobId: string, name: string): Promise<ContextFile> {
|
||||
/**
 * Return a copy of `file` in which `mimeType` is always populated.
 *
 * A missing or empty `mimeType` is replaced with the generic
 * `application/octet-stream`; every other field is copied as-is.
 *
 * @param file stored file record, possibly without a MIME type
 * @returns a new object with all `ContextFile` fields present
 */
private fulfillFile(file: ContextFile): Required<ContextFile> {
  // `||` (not `??`) so that an empty string also falls back to the default
  const mimeType = file.mimeType || 'application/octet-stream';
  return { ...file, mimeType };
}
|
||||
|
||||
async addFile(
|
||||
blobId: string,
|
||||
name: string,
|
||||
mimeType: string
|
||||
): Promise<Required<ContextFile>> {
|
||||
let fileId = nanoid();
|
||||
const existsBlob = this.config.files.find(f => f.blobId === blobId);
|
||||
if (existsBlob) {
|
||||
// use exists file id if the blob exists
|
||||
// we assume that the file content pointed to by the same blobId is consistent.
|
||||
if (existsBlob.status === ContextEmbedStatus.finished) {
|
||||
return existsBlob;
|
||||
return this.fulfillFile(existsBlob);
|
||||
}
|
||||
fileId = existsBlob.id;
|
||||
} else {
|
||||
@@ -152,11 +167,12 @@ export class ContextSession implements AsyncDisposable {
|
||||
blobId,
|
||||
chunkSize: 0,
|
||||
name,
|
||||
mimeType,
|
||||
error: null,
|
||||
createdAt: Date.now(),
|
||||
}));
|
||||
}
|
||||
return this.getFile(fileId) as ContextFile;
|
||||
return this.fulfillFile(this.getFile(fileId) as ContextFile);
|
||||
}
|
||||
|
||||
getFile(fileId: string): ContextFile | undefined {
|
||||
@@ -181,15 +197,14 @@ export class ContextSession implements AsyncDisposable {
|
||||
* @param threshold relevance threshold for the similarity score, higher threshold means more similar chunks, default 0.5
|
||||
* @returns list of similar chunks
|
||||
*/
|
||||
async matchFileChunks(
|
||||
async matchFiles(
|
||||
content: string,
|
||||
topK: number = 5,
|
||||
signal?: AbortSignal,
|
||||
threshold: number = 0.85
|
||||
scopedThreshold: number = 0.85,
|
||||
threshold: number = 0.5
|
||||
): Promise<FileChunkSimilarity[]> {
|
||||
const embedding = await this.client
|
||||
.getEmbeddings([content], signal)
|
||||
.then(r => r?.[0]?.embedding);
|
||||
const embedding = await this.client.getEmbedding(content, signal);
|
||||
if (!embedding) return [];
|
||||
|
||||
const [context, workspace] = await Promise.all([
|
||||
@@ -197,7 +212,7 @@ export class ContextSession implements AsyncDisposable {
|
||||
embedding,
|
||||
this.id,
|
||||
topK * 2,
|
||||
threshold
|
||||
scopedThreshold
|
||||
),
|
||||
this.models.copilotWorkspace.matchFileEmbedding(
|
||||
this.workspaceId,
|
||||
@@ -206,10 +221,21 @@ export class ContextSession implements AsyncDisposable {
|
||||
threshold
|
||||
),
|
||||
]);
|
||||
const files = new Map(this.files.map(f => [f.id, f]));
|
||||
|
||||
return this.client.reRank(
|
||||
content,
|
||||
[...context, ...workspace],
|
||||
[
|
||||
...context
|
||||
.filter(f => files.has(f.fileId))
|
||||
.map(c => {
|
||||
const { blobId, name, mimeType } = files.get(
|
||||
c.fileId
|
||||
) as Required<ContextFile>;
|
||||
return { ...c, blobId, name, mimeType };
|
||||
}),
|
||||
...workspace,
|
||||
],
|
||||
topK,
|
||||
signal
|
||||
);
|
||||
@@ -223,16 +249,14 @@ export class ContextSession implements AsyncDisposable {
|
||||
* @param threshold relevance threshold for the similarity score, higher threshold means more similar chunks, default 0.5
|
||||
* @returns list of similar chunks
|
||||
*/
|
||||
async matchWorkspaceChunks(
|
||||
async matchWorkspaceDocs(
|
||||
content: string,
|
||||
topK: number = 5,
|
||||
signal?: AbortSignal,
|
||||
scopedThreshold: number = 0.5,
|
||||
threshold: number = 0.85
|
||||
scopedThreshold: number = 0.85,
|
||||
threshold: number = 0.5
|
||||
) {
|
||||
const embedding = await this.client
|
||||
.getEmbeddings([content], signal)
|
||||
.then(r => r?.[0]?.embedding);
|
||||
const embedding = await this.client.getEmbedding(content, signal);
|
||||
if (!embedding) return [];
|
||||
|
||||
const docIds = this.docIds;
|
||||
|
||||
@@ -128,6 +128,11 @@ export abstract class EmbeddingClient {
|
||||
.slice(0, topK);
|
||||
}
|
||||
|
||||
/**
 * Convenience wrapper around `getEmbeddings` for a single query string.
 *
 * @param query text to embed
 * @param signal optional abort signal, forwarded to `getEmbeddings`
 * @returns the `embedding` of the first result, or `undefined` when
 *          `getEmbeddings` yields nothing for the query
 */
async getEmbedding(query: string, signal?: AbortSignal) {
  const results = await this.getEmbeddings([query], signal);
  const first = results?.[0];
  return first?.embedding;
}
|
||||
|
||||
abstract getEmbeddings(
|
||||
input: string[],
|
||||
signal?: AbortSignal
|
||||
|
||||
@@ -60,6 +60,7 @@ export class CopilotWorkspaceService implements OnApplicationBootstrap {
|
||||
await this.storage.put(userId, workspaceId, blobId, buffer);
|
||||
const file = await this.models.copilotWorkspace.addFile(workspaceId, {
|
||||
fileName,
|
||||
blobId,
|
||||
mimeType: content.mimetype,
|
||||
size: buffer.length,
|
||||
});
|
||||
|
||||
@@ -55,6 +55,9 @@ export class CopilotWorkspaceFileType implements CopilotWorkspaceFile {
|
||||
@Field(() => String)
|
||||
fileId!: string;
|
||||
|
||||
@Field(() => String)
|
||||
blobId!: string;
|
||||
|
||||
@Field(() => String)
|
||||
fileName!: string;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user