feat(server): get full content for doc embedding (#11138)

This commit is contained in:
darkskygit
2025-03-25 01:10:43 +00:00
parent c1b3e25fc7
commit bf5d8b1211
3 changed files with 18 additions and 7 deletions

View File

@@ -42,10 +42,10 @@ export abstract class DocReader {
protected readonly blobStorage: WorkspaceBlobStorage
) {}
/**
 * Apply the binary update `bin` to a freshly created `YDoc` and return
 * whatever `parsePageDoc` extracts from that document.
 *
 * This is an unmarked diff view: the signature line and the `return` line
 * each appear twice, first in their previous form and then in their revised
 * form. The revised form adds `maxSummaryLength` (default 150) and forwards
 * it to `parsePageDoc` as an option.
 *
 * @param bin - binary update that is applied to the new document
 * @param maxSummaryLength - passed through unchanged as
 *   `{ maxSummaryLength }`; how it limits the summary is decided inside
 *   `parsePageDoc`, which is only partially visible here
 * @returns the result of `parsePageDoc`
 */
parseDocContent(bin: Uint8Array) {
parseDocContent(bin: Uint8Array, maxSummaryLength = 150) {
const doc = new YDoc();
applyUpdate(doc, bin);
return parsePageDoc(doc);
return parsePageDoc(doc, { maxSummaryLength });
}
parseWorkspaceContent(bin: Uint8Array) {
@@ -85,6 +85,13 @@ export abstract class DocReader {
return content;
}
/**
 * Get the content of a doc with the `fullContent` flag switched on.
 *
 * Calls `getDocContentWithoutCache()` directly and passes `true` as its
 * third argument; nothing else is done with the result.
 * NOTE(review): going by the callee's name this path neither reads nor
 * writes any cached content — confirm against the cached `getDocContent()`
 * implementation, which is outside this hunk.
 *
 * @param workspaceId - forwarded unchanged to `getDocContentWithoutCache()`
 * @param docId - forwarded as the callee's `guid` argument
 * @returns the value resolved by `getDocContentWithoutCache()`:
 *   a `PageDocContent`, or `null`
 */
async getFullDocContent(
workspaceId: string,
docId: string
): Promise<PageDocContent | null> {
return await this.getDocContentWithoutCache(workspaceId, docId, true);
}
/**
* Get workspace content, try to read from database first.
* If not exists, read from `getWorkspaceContentWithoutCache()` and save it back to database.
@@ -131,7 +138,8 @@ export abstract class DocReader {
/**
 * Subclass hook that produces the content of a single doc.
 *
 * This is an unmarked diff view: `guid: string` appears twice, once as the
 * previous last parameter and once followed by the newly added optional
 * `fullContent` parameter.
 *
 * @param workspaceId - workspace the doc belongs to
 * @param guid - identifier of the doc
 * @param fullContent - optional flag; `getFullDocContent()` passes `true`.
 *   What "full" means is up to each implementation.
 * @returns a `PageDocContent`, or `null`
 */
protected abstract getDocContentWithoutCache(
workspaceId: string,
guid: string
guid: string,
fullContent?: boolean
): Promise<PageDocContent | null>;
protected abstract getWorkspaceContentWithoutCache(
@@ -180,13 +188,14 @@ export class DatabaseDocReader extends DocReader {
/**
 * Load a doc record via `this.workspace.getDoc()` and parse its binary with
 * `parseDocContent()`.
 *
 * This is an unmarked diff view: the `guid` parameter line and the final
 * `return` line each appear twice (previous form, then revised form).
 *
 * @param workspaceId - passed to `this.workspace.getDoc()`
 * @param guid - passed to `this.workspace.getDoc()`
 * @param fullContent - when truthy, the revised code asks
 *   `parseDocContent()` for a max summary length of -1 instead of 150
 * @returns `null` when no record is found, otherwise the parsed content
 */
protected override async getDocContentWithoutCache(
workspaceId: string,
guid: string
guid: string,
fullContent?: boolean
): Promise<PageDocContent | null> {
const docRecord = await this.workspace.getDoc(workspaceId, guid);
if (!docRecord) {
return null;
}
return this.parseDocContent(docRecord.bin);
// -1 appears to be a "no limit" sentinel: the `parsePageDoc` hunk in this
// diff has a `summaryLenNeeded === -1` branch that appends text without
// decrementing any budget. TODO confirm that `summaryLenNeeded` is
// initialised from `maxSummaryLength` (the initialisation is not visible).
return this.parseDocContent(docRecord.bin, fullContent ? -1 : 150);
}
protected override async getWorkspaceContentWithoutCache(

View File

@@ -115,7 +115,9 @@ export function parsePageDoc(
continue;
}
if (summaryLenNeeded > 0) {
if (summaryLenNeeded === -1) {
content.summary += text.toString();
} else if (summaryLenNeeded > 0) {
content.summary += text.toString();
summaryLenNeeded -= text.length;
} else {

View File

@@ -159,7 +159,7 @@ export class CopilotContextDocJob implements OnModuleInit {
if (!this.supportEmbedding) return;
try {
const content = await this.doc.getDocContent(workspaceId, docId);
const content = await this.doc.getFullDocContent(workspaceId, docId);
if (content) {
// no need to check if embeddings is empty, will throw internally
const embeddings = await this.embeddingClient.getFileEmbeddings(