feat(server): compress transcript response (#11316)

This commit is contained in:
darkskygit
2025-03-31 14:46:10 +00:00
parent 58d6a96e45
commit fe05872ada
4 changed files with 67 additions and 59 deletions

View File

@@ -330,14 +330,6 @@ Convert a multi-speaker audio recording into a structured JSON format by transcr
1. Analyze the audio to detect the presence of multiple speakers using distinct microphone inputs.
2. Transcribe the audio content for each speaker and note the time intervals of speech.
# Output Format
The output should be a JSON array, with each element containing:
- "speaker": A label identifying the speaker, such as "A", "B", etc.
- "start": The start time of the transcribed segment in the format "HH:MM:SS".
- "end": The end time of the transcribed segment in the format "HH:MM:SS".
- "transcription": The transcribed text for the speaker's segment.
# Examples
**Example Input:**
@@ -345,20 +337,7 @@ The output should be a JSON array, with each element containing:
**Example Output:**
[
{
"speaker": "A",
"start": "00:00:30",
"end": "00:00:45",
"transcription": "Hello, everyone."
},
{
"speaker": "B",
"start": "00:00:46",
"end": "00:01:10",
"transcription": "Hi, thank you for joining the meeting today."
}
]
[{"a":"A","s":30,"e":45,"t":"Hello, everyone."},{"a":"B","s":46,"e":70,"t":"Hi, thank you for joining the meeting today."}]
# Notes
@@ -369,7 +348,6 @@ The output should be a JSON array, with each element containing:
},
],
config: {
audioTimestamp: true,
jsonMode: true,
},
},

View File

@@ -7,6 +7,7 @@ import {
type CoreAssistantMessage,
type CoreUserMessage,
FilePart,
generateObject,
generateText,
streamText,
TextPart,
@@ -96,9 +97,10 @@ export class GeminiProvider
protected async chatToGPTMessage(
messages: PromptMessage[]
): Promise<[string | undefined, ChatMessage[]]> {
let system =
messages[0]?.role === 'system' ? messages.shift()?.content : undefined;
): Promise<[string | undefined, ChatMessage[], any]> {
const system =
messages[0]?.role === 'system' ? messages.shift() : undefined;
const schema = system?.params?.schema;
// filter redundant fields
const msgs: ChatMessage[] = [];
@@ -140,7 +142,7 @@ export class GeminiProvider
}
}
return [system, msgs];
return [system?.content, msgs, schema];
}
protected async checkParams({
@@ -229,17 +231,25 @@ export class GeminiProvider
try {
metrics.ai.counter('chat_text_calls').add(1, { model });
const [system, msgs] = await this.chatToGPTMessage(messages);
const [system, msgs, schema] = await this.chatToGPTMessage(messages);
const { text } = await generateText({
model: this.#instance(model, {
audioTimestamp: Boolean(options.audioTimestamp),
structuredOutputs: Boolean(options.jsonMode),
}),
system,
messages: msgs,
abortSignal: options.signal,
const modelInstance = this.#instance(model, {
structuredOutputs: Boolean(options.jsonMode),
});
const { text } = schema
? await generateObject({
model: modelInstance,
system,
messages: msgs,
schema,
abortSignal: options.signal,
}).then(r => ({ text: JSON.stringify(r.object) }))
: await generateText({
model: modelInstance,
system,
messages: msgs,
abortSignal: options.signal,
});
if (!text) throw new Error('Failed to generate text');
return text.trim();
@@ -251,7 +261,7 @@ export class GeminiProvider
async *generateTextStream(
messages: PromptMessage[],
model: string = 'gpt-4o-mini',
model: string = 'gemini-2.0-flash-001',
options: CopilotChatOptions = {}
): AsyncIterable<string> {
await this.checkParams({ messages, model, options });

View File

@@ -1,5 +1,6 @@
import { Injectable } from '@nestjs/common';
import { AiJobStatus, AiJobType } from '@prisma/client';
import { ZodType } from 'zod';
import {
CopilotPromptNotFound,
@@ -22,7 +23,7 @@ import {
import { CopilotStorage } from '../storage';
import {
TranscriptionPayload,
TranscriptionSchema,
TranscriptionResponseSchema,
TranscriptPayloadSchema,
} from './types';
import { readStream } from './utils';
@@ -137,7 +138,8 @@ export class CopilotTranscriptionService {
private async chatWithPrompt(
promptName: string,
message: Partial<PromptMessage>
message: Partial<PromptMessage>,
schema?: ZodType<any>
): Promise<string> {
const prompt = await this.prompt.get(promptName);
if (!prompt) {
@@ -146,16 +148,20 @@ export class CopilotTranscriptionService {
const provider = await this.getProvider(prompt.model);
return provider.generateText(
[...prompt.finish({}), { role: 'user', content: '', ...message }],
prompt.model
[...prompt.finish({ schema }), { role: 'user', content: '', ...message }],
prompt.model,
Object.assign({}, prompt.config)
);
}
private cleanupResponse(response: string): string {
return response
.replace(/```[\w\s]+\n/g, '')
.replace(/\n```/g, '')
.trim();
/**
 * Formats an offset expressed in seconds as a zero-padded `HH:MM:SS` string.
 *
 * Fractional seconds are truncated. Each field is padded to at least two
 * digits; the hours field is not wrapped at 24 and simply grows wider for
 * long offsets.
 *
 * @param time - Offset in seconds.
 * @returns The offset rendered as `HH:MM:SS`.
 */
private convertTime(time: number): string {
  // The value originates from model output, so guard against negative or
  // non-finite numbers: without this the padded fields come out as strings
  // like "-1:-1:-5" or "Infinity:NaN:NaN". Such input is clamped to 0.
  const totalSeconds = Number.isFinite(time)
    ? Math.max(0, Math.floor(time))
    : 0;
  const hours = Math.floor(totalSeconds / 3600);
  const minutes = Math.floor((totalSeconds % 3600) / 60);
  const seconds = totalSeconds % 60;
  const pad = (value: number) => String(value).padStart(2, '0');
  return `${pad(hours)}:${pad(minutes)}:${pad(seconds)}`;
}
@OnJob('copilot.transcript.submit')
@@ -165,14 +171,23 @@ export class CopilotTranscriptionService {
mimeType,
}: Jobs['copilot.transcript.submit']) {
try {
const result = await this.chatWithPrompt('Transcript audio', {
attachments: [url],
params: { mimetype: mimeType },
});
const transcription = TranscriptionSchema.parse(
JSON.parse(this.cleanupResponse(result))
const result = await this.chatWithPrompt(
'Transcript audio',
{
attachments: [url],
params: { mimetype: mimeType },
},
TranscriptionResponseSchema
);
const transcription = TranscriptionResponseSchema.parse(
JSON.parse(result)
).map(t => ({
speaker: t.a,
start: this.convertTime(t.s),
end: this.convertTime(t.e),
transcription: t.t,
}));
await this.models.copilotJob.update(jobId, {
payload: { transcription },
});
@@ -206,11 +221,9 @@ export class CopilotTranscriptionService {
.trim();
if (content.length) {
const result = await this.chatWithPrompt('Summary', {
payload.summary = await this.chatWithPrompt('Summary', {
content,
});
payload.summary = this.cleanupResponse(result);
await this.models.copilotJob.update(jobId, {
payload,
});
@@ -244,11 +257,9 @@ export class CopilotTranscriptionService {
.trim();
if (content.length) {
const result = await this.chatWithPrompt('Summary as title', {
payload.title = await this.chatWithPrompt('Summary as title', {
content,
});
payload.title = this.cleanupResponse(result);
await this.models.copilotJob.update(jobId, {
payload,
});

View File

@@ -2,6 +2,15 @@ import { z } from 'zod';
import { OneMB } from '../../../base';
/**
 * Schema for the compact transcription response: an array of segments that
 * use single-letter keys.
 *
 * - `a`: speaker label
 * - `s`: segment start, in seconds
 * - `e`: segment end, in seconds
 * - `t`: transcribed text
 */
export const TranscriptionResponseSchema = z.array(
  z.object({
    a: z.string().describe("speaker's name, for example A, B, C"),
    s: z.number().describe('start time(second) of the transcription'),
    e: z.number().describe('end time(second) of the transcription'),
    t: z.string().describe('transcription text'),
  })
);
const TranscriptionItemSchema = z.object({
speaker: z.string(),
start: z.string(),