feat(server): compress transcript response (#11316)

2026-02-14 05:14:54 +00:00 · 2025-03-31 14:46:10 +00:00
parent 58d6a96e45
commit fe05872ada
4 changed files with 67 additions and 59 deletions
--- a/packages/backend/server/src/plugins/copilot/prompt/prompts.ts
+++ b/packages/backend/server/src/plugins/copilot/prompt/prompts.ts
@@ -330,14 +330,6 @@ Convert a multi-speaker audio recording into a structured JSON format by transcr
 1. Analyze the audio to detect the presence of multiple speakers using distinct microphone inputs.
 2. Transcribe the audio content for each speaker and note the time intervals of speech.

-# Output Format
-
-The output should be a JSON array, with each element containing:
- "speaker": A label identifying the speaker, such as "A", "B", etc.
- "start": The start time of the transcribed segment in the format "HH:MM:SS".
- "end": The end time of the transcribed segment in the format "HH:MM:SS".
- "transcription": The transcribed text for the speaker's segment.
-
 # Examples

 **Example Input:**
@@ -345,20 +337,7 @@ The output should be a JSON array, with each element containing:

 **Example Output:**

-[
-  {
-    "speaker": "A",
-    "start": "00:00:30",
-    "end": "00:00:45",
-    "transcription": "Hello, everyone."
-  },
-  {
-    "speaker": "B",
-    "start": "00:00:46",
-    "end": "00:01:10",
-    "transcription": "Hi, thank you for joining the meeting today."
-  }
-]
+[{"a":"A","s":30,"e":45,"t":"Hello, everyone."},{"a":"B","s":46,"e":70,"t":"Hi, thank you for joining the meeting today."}]

 # Notes

@@ -369,7 +348,6 @@ The output should be a JSON array, with each element containing:
      },
    ],
    config: {
-      audioTimestamp: true,
      jsonMode: true,
    },
  },
--- a/packages/backend/server/src/plugins/copilot/providers/gemini.ts
+++ b/packages/backend/server/src/plugins/copilot/providers/gemini.ts
@@ -7,6 +7,7 @@ import {
  type CoreAssistantMessage,
  type CoreUserMessage,
  FilePart,
+  generateObject,
  generateText,
  streamText,
  TextPart,
@@ -96,9 +97,10 @@ export class GeminiProvider

  protected async chatToGPTMessage(
    messages: PromptMessage[]
-  ): Promise<[string | undefined, ChatMessage[]]> {
-    let system =
-      messages[0]?.role === 'system' ? messages.shift()?.content : undefined;
+  ): Promise<[string | undefined, ChatMessage[], any]> {
+    const system =
+      messages[0]?.role === 'system' ? messages.shift() : undefined;
+    const schema = system?.params?.schema;

    // filter redundant fields
    const msgs: ChatMessage[] = [];
@@ -140,7 +142,7 @@ export class GeminiProvider
      }
    }

-    return [system, msgs];
+    return [system?.content, msgs, schema];
  }

  protected async checkParams({
@@ -229,17 +231,25 @@ export class GeminiProvider
    try {
      metrics.ai.counter('chat_text_calls').add(1, { model });

-      const [system, msgs] = await this.chatToGPTMessage(messages);
+      const [system, msgs, schema] = await this.chatToGPTMessage(messages);

-      const { text } = await generateText({
-        model: this.#instance(model, {
-          audioTimestamp: Boolean(options.audioTimestamp),
-          structuredOutputs: Boolean(options.jsonMode),
-        }),
-        system,
-        messages: msgs,
-        abortSignal: options.signal,
+      const modelInstance = this.#instance(model, {
+        structuredOutputs: Boolean(options.jsonMode),
      });
+      const { text } = schema
+        ? await generateObject({
+            model: modelInstance,
+            system,
+            messages: msgs,
+            schema,
+            abortSignal: options.signal,
+          }).then(r => ({ text: JSON.stringify(r.object) }))
+        : await generateText({
+            model: modelInstance,
+            system,
+            messages: msgs,
+            abortSignal: options.signal,
+          });

      if (!text) throw new Error('Failed to generate text');
      return text.trim();
@@ -251,7 +261,7 @@ export class GeminiProvider

  async *generateTextStream(
    messages: PromptMessage[],
-    model: string = 'gpt-4o-mini',
+    model: string = 'gemini-2.0-flash-001',
    options: CopilotChatOptions = {}
  ): AsyncIterable<string> {
    await this.checkParams({ messages, model, options });
--- a/packages/backend/server/src/plugins/copilot/transcript/service.ts
+++ b/packages/backend/server/src/plugins/copilot/transcript/service.ts
@@ -1,5 +1,6 @@
 import { Injectable } from '@nestjs/common';
 import { AiJobStatus, AiJobType } from '@prisma/client';
+import { ZodType } from 'zod';

 import {
  CopilotPromptNotFound,
@@ -22,7 +23,7 @@ import {
 import { CopilotStorage } from '../storage';
 import {
  TranscriptionPayload,
-  TranscriptionSchema,
+  TranscriptionResponseSchema,
  TranscriptPayloadSchema,
 } from './types';
 import { readStream } from './utils';
@@ -137,7 +138,8 @@ export class CopilotTranscriptionService {

  private async chatWithPrompt(
    promptName: string,
-    message: Partial<PromptMessage>
+    message: Partial<PromptMessage>,
+    schema?: ZodType<any>
  ): Promise<string> {
    const prompt = await this.prompt.get(promptName);
    if (!prompt) {
@@ -146,16 +148,20 @@ export class CopilotTranscriptionService {

    const provider = await this.getProvider(prompt.model);
    return provider.generateText(
-      [...prompt.finish({}), { role: 'user', content: '', ...message }],
-      prompt.model
+      [...prompt.finish({ schema }), { role: 'user', content: '', ...message }],
+      prompt.model,
+      Object.assign({}, prompt.config)
    );
  }

-  private cleanupResponse(response: string): string {
-    return response
-      .replace(/```[\w\s]+\n/g, '')
-      .replace(/\n```/g, '')
-      .trim();
+  private convertTime(time: number) { // formats a duration given in seconds as an "HH:MM:SS" string
+    const minutes = Math.floor(time / 60); // total whole minutes; may exceed 59
+    const seconds = Math.floor(time % 60); // seconds remaining within the minute, floored to an integer
+    const hours = Math.floor(minutes / 60); // whole hours; not wrapped at 24
+    const minutesStr = String(minutes % 60).padStart(2, '0'); // minutes within the hour, zero-padded to two digits
+    const secondsStr = String(seconds).padStart(2, '0');
+    const hoursStr = String(hours).padStart(2, '0'); // padStart only pads, so 100+ hours keep all their digits
+    return `${hoursStr}:${minutesStr}:${secondsStr}`;
+  }

  @OnJob('copilot.transcript.submit')
@@ -165,14 +171,23 @@ export class CopilotTranscriptionService {
    mimeType,
  }: Jobs['copilot.transcript.submit']) {
    try {
-      const result = await this.chatWithPrompt('Transcript audio', {
-        attachments: [url],
-        params: { mimetype: mimeType },
-      });
-
-      const transcription = TranscriptionSchema.parse(
-        JSON.parse(this.cleanupResponse(result))
+      const result = await this.chatWithPrompt(
+        'Transcript audio',
+        {
+          attachments: [url],
+          params: { mimetype: mimeType },
+        },
+        TranscriptionResponseSchema
      );
+
+      const transcription = TranscriptionResponseSchema.parse(
+        JSON.parse(result)
+      ).map(t => ({
+        speaker: t.a,
+        start: this.convertTime(t.s),
+        end: this.convertTime(t.e),
+        transcription: t.t,
+      }));
      await this.models.copilotJob.update(jobId, {
        payload: { transcription },
      });
@@ -206,11 +221,9 @@ export class CopilotTranscriptionService {
          .trim();

        if (content.length) {
-          const result = await this.chatWithPrompt('Summary', {
+          payload.summary = await this.chatWithPrompt('Summary', {
            content,
          });
-
-          payload.summary = this.cleanupResponse(result);
          await this.models.copilotJob.update(jobId, {
            payload,
          });
@@ -244,11 +257,9 @@ export class CopilotTranscriptionService {
          .trim();

        if (content.length) {
-          const result = await this.chatWithPrompt('Summary as title', {
+          payload.title = await this.chatWithPrompt('Summary as title', {
            content,
          });
-
-          payload.title = this.cleanupResponse(result);
          await this.models.copilotJob.update(jobId, {
            payload,
          });
--- a/packages/backend/server/src/plugins/copilot/transcript/types.ts
+++ b/packages/backend/server/src/plugins/copilot/transcript/types.ts
@@ -2,6 +2,15 @@ import { z } from 'zod';

 import { OneMB } from '../../../base';

+export const TranscriptionResponseSchema = z // raw model reply with single-letter keys; the transcript service maps it to speaker/start/end/transcription
+  .object({
+    a: z.string().describe("speaker's name, for example A, B, C"), // mapped to `speaker`
+    s: z.number().describe('start time(second) of the transcription'), // mapped to `start` after HH:MM:SS formatting
+    e: z.number().describe('end time(second) of the transcription'), // mapped to `end` after HH:MM:SS formatting
+    t: z.string().describe('transcription text'), // mapped to `transcription`
+  })
+  .array(); // one element per transcribed segment
+
 const TranscriptionItemSchema = z.object({
  speaker: z.string(),
  start: z.string(),