From fe05872ada9fb84da0e74fe53d297bb130252334 Mon Sep 17 00:00:00 2001
From: darkskygit <darksky.graphite@toeverything.info>
Date: Mon, 31 Mar 2025 14:46:10 +0000
Subject: [PATCH] feat(server): compress transcript response (#11316)

---
 .../src/plugins/copilot/prompt/prompts.ts     | 24 +-------
 .../src/plugins/copilot/providers/gemini.ts   | 38 ++++++++-----
 .../src/plugins/copilot/transcript/service.ts | 55 +++++++++++--------
 .../src/plugins/copilot/transcript/types.ts   |  9 +++
 4 files changed, 67 insertions(+), 59 deletions(-)

diff --git a/packages/backend/server/src/plugins/copilot/prompt/prompts.ts b/packages/backend/server/src/plugins/copilot/prompt/prompts.ts
index 1f62ea392c..661218cef0 100644
--- a/packages/backend/server/src/plugins/copilot/prompt/prompts.ts
+++ b/packages/backend/server/src/plugins/copilot/prompt/prompts.ts
@@ -330,14 +330,6 @@ Convert a multi-speaker audio recording into a structured JSON format by transcr
 1. Analyze the audio to detect the presence of multiple speakers using distinct microphone inputs.
 2. Transcribe the audio content for each speaker and note the time intervals of speech.
 
-# Output Format
-
-The output should be a JSON array, with each element containing:
-- "speaker": A label identifying the speaker, such as "A", "B", etc.
-- "start": The start time of the transcribed segment in the format "HH:MM:SS".
-- "end": The end time of the transcribed segment in the format "HH:MM:SS".
-- "transcription": The transcribed text for the speaker's segment.
-
 # Examples
 
 **Example Input:**
@@ -345,20 +337,7 @@ The output should be a JSON array, with each element containing:
 
 **Example Output:**
 
-[
-  {
-    "speaker": "A",
-    "start": "00:00:30",
-    "end": "00:00:45",
-    "transcription": "Hello, everyone."
-  },
-  {
-    "speaker": "B",
-    "start": "00:00:46",
-    "end": "00:01:10",
-    "transcription": "Hi, thank you for joining the meeting today."
-  }
-]
+[{"a":"A","s":30,"e":45,"t":"Hello, everyone."},{"a":"B","s":46,"e":70,"t":"Hi, thank you for joining the meeting today."}]
 
 # Notes
 
@@ -369,7 +348,6 @@ The output should be a JSON array, with each element containing:
       },
     ],
     config: {
-      audioTimestamp: true,
       jsonMode: true,
     },
   },
diff --git a/packages/backend/server/src/plugins/copilot/providers/gemini.ts b/packages/backend/server/src/plugins/copilot/providers/gemini.ts
index f9f966e69e..386a4b6b0e 100644
--- a/packages/backend/server/src/plugins/copilot/providers/gemini.ts
+++ b/packages/backend/server/src/plugins/copilot/providers/gemini.ts
@@ -7,6 +7,7 @@ import {
   type CoreAssistantMessage,
   type CoreUserMessage,
   FilePart,
+  generateObject,
   generateText,
   streamText,
   TextPart,
@@ -96,9 +97,10 @@ export class GeminiProvider
 
   protected async chatToGPTMessage(
     messages: PromptMessage[]
-  ): Promise<[string | undefined, ChatMessage[]]> {
-    let system =
-      messages[0]?.role === 'system' ? messages.shift()?.content : undefined;
+  ): Promise<[string | undefined, ChatMessage[], any]> {
+    const system =
+      messages[0]?.role === 'system' ? messages.shift() : undefined;
+    const schema = system?.params?.schema;
 
     // filter redundant fields
     const msgs: ChatMessage[] = [];
@@ -140,7 +142,7 @@ export class GeminiProvider
       }
     }
 
-    return [system, msgs];
+    return [system?.content, msgs, schema];
   }
 
   protected async checkParams({
@@ -229,17 +231,25 @@ export class GeminiProvider
     try {
       metrics.ai.counter('chat_text_calls').add(1, { model });
 
-      const [system, msgs] = await this.chatToGPTMessage(messages);
+      const [system, msgs, schema] = await this.chatToGPTMessage(messages);
 
-      const { text } = await generateText({
-        model: this.#instance(model, {
-          audioTimestamp: Boolean(options.audioTimestamp),
-          structuredOutputs: Boolean(options.jsonMode),
-        }),
-        system,
-        messages: msgs,
-        abortSignal: options.signal,
+      const modelInstance = this.#instance(model, {
+        structuredOutputs: Boolean(options.jsonMode),
       });
+      const { text } = schema
+        ? await generateObject({
+            model: modelInstance,
+            system,
+            messages: msgs,
+            schema,
+            abortSignal: options.signal,
+          }).then(r => ({ text: JSON.stringify(r.object) }))
+        : await generateText({
+            model: modelInstance,
+            system,
+            messages: msgs,
+            abortSignal: options.signal,
+          });
 
       if (!text) throw new Error('Failed to generate text');
       return text.trim();
@@ -251,7 +261,7 @@ export class GeminiProvider
 
   async *generateTextStream(
     messages: PromptMessage[],
-    model: string = 'gpt-4o-mini',
+    model: string = 'gemini-2.0-flash-001',
     options: CopilotChatOptions = {}
   ): AsyncIterable<string> {
     await this.checkParams({ messages, model, options });
diff --git a/packages/backend/server/src/plugins/copilot/transcript/service.ts b/packages/backend/server/src/plugins/copilot/transcript/service.ts
index c210cba40d..827e100acb 100644
--- a/packages/backend/server/src/plugins/copilot/transcript/service.ts
+++ b/packages/backend/server/src/plugins/copilot/transcript/service.ts
@@ -1,5 +1,6 @@
 import { Injectable } from '@nestjs/common';
 import { AiJobStatus, AiJobType } from '@prisma/client';
+import { ZodType } from 'zod';
 
 import {
   CopilotPromptNotFound,
@@ -22,7 +23,7 @@ import {
 import { CopilotStorage } from '../storage';
 import {
   TranscriptionPayload,
-  TranscriptionSchema,
+  TranscriptionResponseSchema,
   TranscriptPayloadSchema,
 } from './types';
 import { readStream } from './utils';
@@ -137,7 +138,8 @@ export class CopilotTranscriptionService {
 
   private async chatWithPrompt(
     promptName: string,
-    message: Partial<PromptMessage>
+    message: Partial<PromptMessage>,
+    schema?: ZodType<any>
   ): Promise<string> {
     const prompt = await this.prompt.get(promptName);
     if (!prompt) {
@@ -146,16 +148,20 @@ export class CopilotTranscriptionService {
 
     const provider = await this.getProvider(prompt.model);
     return provider.generateText(
-      [...prompt.finish({}), { role: 'user', content: '', ...message }],
-      prompt.model
+      [...prompt.finish({ schema }), { role: 'user', content: '', ...message }],
+      prompt.model,
+      Object.assign({}, prompt.config)
     );
   }
 
-  private cleanupResponse(response: string): string {
-    return response
-      .replace(/```[\w\s]+\n/g, '')
-      .replace(/\n```/g, '')
-      .trim();
+  /** Seconds -> "HH:MM:SS"; negative or non-finite input is clamped to 0. */
+  private convertTime(time: number) {
+    const total = Number.isFinite(time) ? Math.max(0, Math.floor(time)) : 0;
+    const pad = (value: number) => String(value).padStart(2, '0');
+    const hours = pad(Math.floor(total / 3600));
+    const minutes = pad(Math.floor((total % 3600) / 60));
+    const seconds = pad(total % 60);
+    return `${hours}:${minutes}:${seconds}`;
   }
 
   @OnJob('copilot.transcript.submit')
@@ -165,14 +171,23 @@ export class CopilotTranscriptionService {
     mimeType,
   }: Jobs['copilot.transcript.submit']) {
     try {
-      const result = await this.chatWithPrompt('Transcript audio', {
-        attachments: [url],
-        params: { mimetype: mimeType },
-      });
-
-      const transcription = TranscriptionSchema.parse(
-        JSON.parse(this.cleanupResponse(result))
+      const result = await this.chatWithPrompt(
+        'Transcript audio',
+        {
+          attachments: [url],
+          params: { mimetype: mimeType },
+        },
+        TranscriptionResponseSchema
       );
+
+      const transcription = TranscriptionResponseSchema.parse(
+        JSON.parse(result)
+      ).map(t => ({
+        speaker: t.a,
+        start: this.convertTime(t.s),
+        end: this.convertTime(t.e),
+        transcription: t.t,
+      }));
       await this.models.copilotJob.update(jobId, {
         payload: { transcription },
       });
@@ -206,11 +221,9 @@ export class CopilotTranscriptionService {
           .trim();
 
         if (content.length) {
-          const result = await this.chatWithPrompt('Summary', {
+          payload.summary = await this.chatWithPrompt('Summary', {
             content,
           });
-
-          payload.summary = this.cleanupResponse(result);
           await this.models.copilotJob.update(jobId, {
             payload,
           });
@@ -244,11 +257,9 @@ export class CopilotTranscriptionService {
           .trim();
 
         if (content.length) {
-          const result = await this.chatWithPrompt('Summary as title', {
+          payload.title = await this.chatWithPrompt('Summary as title', {
             content,
           });
-
-          payload.title = this.cleanupResponse(result);
           await this.models.copilotJob.update(jobId, {
             payload,
           });
diff --git a/packages/backend/server/src/plugins/copilot/transcript/types.ts b/packages/backend/server/src/plugins/copilot/transcript/types.ts
index 178470bafe..c9c3d13236 100644
--- a/packages/backend/server/src/plugins/copilot/transcript/types.ts
+++ b/packages/backend/server/src/plugins/copilot/transcript/types.ts
@@ -2,6 +2,15 @@ import { z } from 'zod';
 
 import { OneMB } from '../../../base';
 
+export const TranscriptionResponseSchema = z.array(
+  z.object({
+    a: z.string().describe("speaker's name, for example A, B, C"),
+    s: z.number().describe('start time(second) of the transcription'),
+    e: z.number().describe('end time(second) of the transcription'),
+    t: z.string().describe('transcription text'),
+  })
+);
+
 const TranscriptionItemSchema = z.object({
   speaker: z.string(),
   start: z.string(),