mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-14 05:14:54 +00:00
feat(server): compress transcript response (#11316)
This commit is contained in:
@@ -330,14 +330,6 @@ Convert a multi-speaker audio recording into a structured JSON format by transcr
|
||||
1. Analyze the audio to detect the presence of multiple speakers using distinct microphone inputs.
|
||||
2. Transcribe the audio content for each speaker and note the time intervals of speech.
|
||||
|
||||
# Output Format
|
||||
|
||||
The output should be a JSON array, with each element containing:
|
||||
- "speaker": A label identifying the speaker, such as "A", "B", etc.
|
||||
- "start": The start time of the transcribed segment in the format "HH:MM:SS".
|
||||
- "end": The end time of the transcribed segment in the format "HH:MM:SS".
|
||||
- "transcription": The transcribed text for the speaker's segment.
|
||||
|
||||
# Examples
|
||||
|
||||
**Example Input:**
|
||||
@@ -345,20 +337,7 @@ The output should be a JSON array, with each element containing:
|
||||
|
||||
**Example Output:**
|
||||
|
||||
[
|
||||
{
|
||||
"speaker": "A",
|
||||
"start": "00:00:30",
|
||||
"end": "00:00:45",
|
||||
"transcription": "Hello, everyone."
|
||||
},
|
||||
{
|
||||
"speaker": "B",
|
||||
"start": "00:00:46",
|
||||
"end": "00:01:10",
|
||||
"transcription": "Hi, thank you for joining the meeting today."
|
||||
}
|
||||
]
|
||||
[{"a":"A","s":30,"e":45,"t":"Hello, everyone."},{"a":"B","s":46,"e":70,"t":"Hi, thank you for joining the meeting today."}]
|
||||
|
||||
# Notes
|
||||
|
||||
@@ -369,7 +348,6 @@ The output should be a JSON array, with each element containing:
|
||||
},
|
||||
],
|
||||
config: {
|
||||
audioTimestamp: true,
|
||||
jsonMode: true,
|
||||
},
|
||||
},
|
||||
|
||||
@@ -7,6 +7,7 @@ import {
|
||||
type CoreAssistantMessage,
|
||||
type CoreUserMessage,
|
||||
FilePart,
|
||||
generateObject,
|
||||
generateText,
|
||||
streamText,
|
||||
TextPart,
|
||||
@@ -96,9 +97,10 @@ export class GeminiProvider
|
||||
|
||||
protected async chatToGPTMessage(
|
||||
messages: PromptMessage[]
|
||||
): Promise<[string | undefined, ChatMessage[]]> {
|
||||
let system =
|
||||
messages[0]?.role === 'system' ? messages.shift()?.content : undefined;
|
||||
): Promise<[string | undefined, ChatMessage[], any]> {
|
||||
const system =
|
||||
messages[0]?.role === 'system' ? messages.shift() : undefined;
|
||||
const schema = system?.params?.schema;
|
||||
|
||||
// filter redundant fields
|
||||
const msgs: ChatMessage[] = [];
|
||||
@@ -140,7 +142,7 @@ export class GeminiProvider
|
||||
}
|
||||
}
|
||||
|
||||
return [system, msgs];
|
||||
return [system?.content, msgs, schema];
|
||||
}
|
||||
|
||||
protected async checkParams({
|
||||
@@ -229,17 +231,25 @@ export class GeminiProvider
|
||||
try {
|
||||
metrics.ai.counter('chat_text_calls').add(1, { model });
|
||||
|
||||
const [system, msgs] = await this.chatToGPTMessage(messages);
|
||||
const [system, msgs, schema] = await this.chatToGPTMessage(messages);
|
||||
|
||||
const { text } = await generateText({
|
||||
model: this.#instance(model, {
|
||||
audioTimestamp: Boolean(options.audioTimestamp),
|
||||
structuredOutputs: Boolean(options.jsonMode),
|
||||
}),
|
||||
system,
|
||||
messages: msgs,
|
||||
abortSignal: options.signal,
|
||||
const modelInstance = this.#instance(model, {
|
||||
structuredOutputs: Boolean(options.jsonMode),
|
||||
});
|
||||
const { text } = schema
|
||||
? await generateObject({
|
||||
model: modelInstance,
|
||||
system,
|
||||
messages: msgs,
|
||||
schema,
|
||||
abortSignal: options.signal,
|
||||
}).then(r => ({ text: JSON.stringify(r.object) }))
|
||||
: await generateText({
|
||||
model: modelInstance,
|
||||
system,
|
||||
messages: msgs,
|
||||
abortSignal: options.signal,
|
||||
});
|
||||
|
||||
if (!text) throw new Error('Failed to generate text');
|
||||
return text.trim();
|
||||
@@ -251,7 +261,7 @@ export class GeminiProvider
|
||||
|
||||
async *generateTextStream(
|
||||
messages: PromptMessage[],
|
||||
model: string = 'gpt-4o-mini',
|
||||
model: string = 'gemini-2.0-flash-001',
|
||||
options: CopilotChatOptions = {}
|
||||
): AsyncIterable<string> {
|
||||
await this.checkParams({ messages, model, options });
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import { Injectable } from '@nestjs/common';
|
||||
import { AiJobStatus, AiJobType } from '@prisma/client';
|
||||
import { ZodType } from 'zod';
|
||||
|
||||
import {
|
||||
CopilotPromptNotFound,
|
||||
@@ -22,7 +23,7 @@ import {
|
||||
import { CopilotStorage } from '../storage';
|
||||
import {
|
||||
TranscriptionPayload,
|
||||
TranscriptionSchema,
|
||||
TranscriptionResponseSchema,
|
||||
TranscriptPayloadSchema,
|
||||
} from './types';
|
||||
import { readStream } from './utils';
|
||||
@@ -137,7 +138,8 @@ export class CopilotTranscriptionService {
|
||||
|
||||
private async chatWithPrompt(
|
||||
promptName: string,
|
||||
message: Partial<PromptMessage>
|
||||
message: Partial<PromptMessage>,
|
||||
schema?: ZodType<any>
|
||||
): Promise<string> {
|
||||
const prompt = await this.prompt.get(promptName);
|
||||
if (!prompt) {
|
||||
@@ -146,16 +148,20 @@ export class CopilotTranscriptionService {
|
||||
|
||||
const provider = await this.getProvider(prompt.model);
|
||||
return provider.generateText(
|
||||
[...prompt.finish({}), { role: 'user', content: '', ...message }],
|
||||
prompt.model
|
||||
[...prompt.finish({ schema }), { role: 'user', content: '', ...message }],
|
||||
prompt.model,
|
||||
Object.assign({}, prompt.config)
|
||||
);
|
||||
}
|
||||
|
||||
private cleanupResponse(response: string): string {
|
||||
return response
|
||||
.replace(/```[\w\s]+\n/g, '')
|
||||
.replace(/\n```/g, '')
|
||||
.trim();
|
||||
/**
 * Formats an offset given in seconds as a zero-padded "HH:MM:SS" string.
 *
 * Fractional seconds are truncated. Negative or non-finite input (NaN,
 * Infinity) is treated as 0 so the result is always a well-formed timestamp
 * instead of strings such as "-1:-1:-1" or "NaN:NaN:NaN". The hour field is
 * not wrapped, so offsets of 100 hours or more yield more than two hour
 * digits.
 *
 * @param time - Offset in seconds.
 * @returns The offset formatted as "HH:MM:SS".
 */
private convertTime(time: number): string {
  // Normalise once up front: every field below is derived from this single
  // non-negative integer, which keeps the three components consistent.
  const totalSeconds =
    Number.isFinite(time) && time > 0 ? Math.floor(time) : 0;

  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;

  const pad = (value: number) => String(value).padStart(2, '0');
  return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
}
|
||||
|
||||
@OnJob('copilot.transcript.submit')
|
||||
@@ -165,14 +171,23 @@ export class CopilotTranscriptionService {
|
||||
mimeType,
|
||||
}: Jobs['copilot.transcript.submit']) {
|
||||
try {
|
||||
const result = await this.chatWithPrompt('Transcript audio', {
|
||||
attachments: [url],
|
||||
params: { mimetype: mimeType },
|
||||
});
|
||||
|
||||
const transcription = TranscriptionSchema.parse(
|
||||
JSON.parse(this.cleanupResponse(result))
|
||||
const result = await this.chatWithPrompt(
|
||||
'Transcript audio',
|
||||
{
|
||||
attachments: [url],
|
||||
params: { mimetype: mimeType },
|
||||
},
|
||||
TranscriptionResponseSchema
|
||||
);
|
||||
|
||||
const transcription = TranscriptionResponseSchema.parse(
|
||||
JSON.parse(result)
|
||||
).map(t => ({
|
||||
speaker: t.a,
|
||||
start: this.convertTime(t.s),
|
||||
end: this.convertTime(t.e),
|
||||
transcription: t.t,
|
||||
}));
|
||||
await this.models.copilotJob.update(jobId, {
|
||||
payload: { transcription },
|
||||
});
|
||||
@@ -206,11 +221,9 @@ export class CopilotTranscriptionService {
|
||||
.trim();
|
||||
|
||||
if (content.length) {
|
||||
const result = await this.chatWithPrompt('Summary', {
|
||||
payload.summary = await this.chatWithPrompt('Summary', {
|
||||
content,
|
||||
});
|
||||
|
||||
payload.summary = this.cleanupResponse(result);
|
||||
await this.models.copilotJob.update(jobId, {
|
||||
payload,
|
||||
});
|
||||
@@ -244,11 +257,9 @@ export class CopilotTranscriptionService {
|
||||
.trim();
|
||||
|
||||
if (content.length) {
|
||||
const result = await this.chatWithPrompt('Summary as title', {
|
||||
payload.title = await this.chatWithPrompt('Summary as title', {
|
||||
content,
|
||||
});
|
||||
|
||||
payload.title = this.cleanupResponse(result);
|
||||
await this.models.copilotJob.update(jobId, {
|
||||
payload,
|
||||
});
|
||||
|
||||
@@ -2,6 +2,15 @@ import { z } from 'zod';
|
||||
|
||||
import { OneMB } from '../../../base';
|
||||
|
||||
/**
 * Compact response shape requested from the model for audio transcription:
 * an array of segments that use single-letter keys.
 *
 * - `a`: speaker label
 * - `s`: segment start, in seconds
 * - `e`: segment end, in seconds
 * - `t`: transcribed text of the segment
 */
export const TranscriptionResponseSchema = z.array(
  z.object({
    a: z.string().describe("speaker's name, for example A, B, C"),
    s: z.number().describe('start time(second) of the transcription'),
    e: z.number().describe('end time(second) of the transcription'),
    t: z.string().describe('transcription text'),
  })
);
|
||||
|
||||
const TranscriptionItemSchema = z.object({
|
||||
speaker: z.string(),
|
||||
start: z.string(),
|
||||
|
||||
Reference in New Issue
Block a user