From 34b6e7ef88f3258a4501b8782aabb731cd6196fa Mon Sep 17 00:00:00 2001
From: pengx17
Date: Wed, 9 Apr 2025 12:44:37 +0000
Subject: [PATCH] feat(core): support splitting audio blobs before submitting to backend (#11572)

fix AF-2484
---
 packages/common/graphql/export-gql-plugin.cjs |   5 +-
 packages/common/graphql/src/graphql/index.ts  |   1 +
 .../media/entities/audio-attachment-block.ts  |  16 +-
 .../entities/audio-transcription-job-store.ts |   6 +-
 .../media/entities/audio-transcription-job.ts |   4 +-
 .../frontend/core/src/utils/webm-encoding.ts  | 245 +++++++++++++++---
 6 files changed, 226 insertions(+), 51 deletions(-)

diff --git a/packages/common/graphql/export-gql-plugin.cjs b/packages/common/graphql/export-gql-plugin.cjs
index 665b845ce4..4822e43490 100644
--- a/packages/common/graphql/export-gql-plugin.cjs
+++ b/packages/common/graphql/export-gql-plugin.cjs
@@ -163,7 +163,10 @@ module.exports = {
 
     // parse 'file' fields
     const containsFile = node.variableDefinitions.some(def => {
-      const varType = def?.type?.type?.name?.value;
+      const varType =
+        def.type.kind === 'NamedType'
+          ? def.type.name.value
+          : def?.type?.type?.name?.value;
       const checkContainFile = type => {
         if (schema.getType(type)?.name === 'Upload') return true;
         const typeDef = schema.getType(type);
diff --git a/packages/common/graphql/src/graphql/index.ts b/packages/common/graphql/src/graphql/index.ts
index e31b68fe04..7276f57af5 100644
--- a/packages/common/graphql/src/graphql/index.ts
+++ b/packages/common/graphql/src/graphql/index.ts
@@ -612,6 +612,7 @@ export const submitAudioTranscriptionMutation = {
       status
     }
   }`,
+  file: true,
 };
 
 export const claimAudioTranscriptionMutation = {
diff --git a/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts b/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
index 0aa6732cc9..531432947c 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
@@ -1,4 +1,4 @@
-import { encodeAudioBlobToOpus } from '@affine/core/utils/webm-encoding';
+import { encodeAudioBlobToOpusSlices } from '@affine/core/utils/webm-encoding';
 import { DebugLogger } from '@affine/debug';
 import { AiJobStatus } from '@affine/graphql';
 import track from '@affine/track';
@@ -115,17 +115,19 @@ export class AudioAttachmentBlock extends Entity {
     const job = this.framework.createEntity(AudioTranscriptionJob, {
       blobId: this.props.props.sourceId,
       blockProps: transcriptionBlockProps,
-      getAudioFile: async () => {
+      getAudioFiles: async () => {
         const buffer = await this.audioMedia.getBuffer();
         if (!buffer) {
           throw new Error('No audio buffer available');
         }
-        const encodedBuffer = await encodeAudioBlobToOpus(buffer, 64000);
-        const blob = new Blob([encodedBuffer], { type: this.props.props.type });
-        const file = new File([blob], this.props.props.name, {
-          type: this.props.props.type,
+        const slices = await encodeAudioBlobToOpusSlices(buffer, 64000);
+        const files = slices.map((slice, index) => {
+          const blob = new Blob([slice], { type: 'audio/opus' });
+          return new File([blob], this.props.props.name + `-${index}.opus`, {
+            type: 'audio/opus',
+          });
         });
-        return file;
+        return files;
       },
     });
diff --git a/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts b/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
index 3ef75aa6c7..8b510f3c6c 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
@@ -12,7 +12,7 @@ import type { WorkspaceService } from '../../workspace';
 
 export class AudioTranscriptionJobStore extends Entity<{
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceService: WorkspaceService,
@@ -41,13 +41,13 @@
     if (!graphqlService) {
       throw new Error('No graphql service available');
     }
-    const file = await this.props.getAudioFile();
+    const files = await this.props.getAudioFiles();
     const response = await graphqlService.gql({
       query: submitAudioTranscriptionMutation,
       variables: {
         workspaceId: this.currentWorkspaceId,
         blobId: this.props.blobId,
-        blob: file,
+        blobs: files,
       },
     });
     if (!response.submitAudioTranscription?.id) {
diff --git a/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts b/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
index 22fef885c4..c73f3b5bc8 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
@@ -46,7 +46,7 @@ const logger = new DebugLogger('audio-transcription-job');
 export class AudioTranscriptionJob extends Entity<{
   readonly blockProps: TranscriptionBlockProps;
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceServerService: WorkspaceServerService,
@@ -68,7 +68,7 @@ export class AudioTranscriptionJob extends Entity<{
       AudioTranscriptionJobStore,
       {
         blobId: this.props.blobId,
-        getAudioFile: this.props.getAudioFile,
+        getAudioFiles: this.props.getAudioFiles,
       }
     );
diff --git a/packages/frontend/core/src/utils/webm-encoding.ts b/packages/frontend/core/src/utils/webm-encoding.ts
index 63163fd32f..5863fe1727 100644
--- a/packages/frontend/core/src/utils/webm-encoding.ts
+++ b/packages/frontend/core/src/utils/webm-encoding.ts
@@ -8,8 +8,62 @@ interface AudioEncodingConfig {
   bitrate?: number;
 }
 
+interface AudioEncodingResult {
+  encodedChunks: EncodedAudioChunk[];
+  config: AudioEncodingConfig;
+}
+
 const logger = new DebugLogger('webm-encoding');
 
+// Constants
+const DEFAULT_BITRATE = 64000;
+const MAX_SLICE_DURATION_SECONDS = 10 * 60; // 10 minutes
+const MIN_SLICE_DURATION_SECONDS = 5 * 60; // 5 minutes
+const AUDIO_LEVEL_THRESHOLD = 0.02; // Threshold for "silence" detection
+
+/**
+ * Converts various blob formats to ArrayBuffer
+ */
+async function blobToArrayBuffer(
+  blob: Blob | ArrayBuffer | Uint8Array
+): Promise<ArrayBuffer> {
+  if (blob instanceof Blob) {
+    return await blob.arrayBuffer();
+  } else if (blob instanceof Uint8Array) {
+    return blob.buffer instanceof ArrayBuffer
+      ? blob.buffer
+      : blob.slice().buffer;
+  } else {
+    return blob;
+  }
+}
+
+/**
+ * Extracts a combined Float32Array from an AudioBuffer
+ */
+function extractAudioData(
+  audioBuffer: AudioBuffer,
+  startSample: number = 0,
+  endSample?: number
+): Float32Array {
+  const numberOfChannels = audioBuffer.numberOfChannels;
+  const sampleCount =
+    endSample !== undefined
+      ? endSample - startSample
+      : audioBuffer.length - startSample;
+
+  const audioData = new Float32Array(sampleCount * numberOfChannels);
+
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const channelData = audioBuffer.getChannelData(channel);
+    for (let i = 0; i < sampleCount; i++) {
+      audioData[i * numberOfChannels + channel] = channelData[startSample + i];
+    }
+  }
+
+  return audioData;
+}
+
 /**
  * Creates and configures an Opus encoder with the given settings
  */
@@ -31,7 +85,7 @@ export function createOpusEncoder(config: AudioEncodingConfig): {
     codec: 'opus',
     sampleRate: config.sampleRate,
     numberOfChannels: config.numberOfChannels,
-    bitrate: config.bitrate ?? 64000,
+    bitrate: config.bitrate ?? DEFAULT_BITRATE,
   });
 
   return { encoder, encodedChunks };
@@ -104,6 +158,32 @@ export function muxToWebM(
   return new Uint8Array(target.buffer);
 }
 
+/**
+ * Process and encode audio data to Opus chunks
+ */
+async function encodeAudioBufferToOpus(
+  audioBuffer: AudioBuffer,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<AudioEncodingResult> {
+  const config: AudioEncodingConfig = {
+    sampleRate: audioBuffer.sampleRate,
+    numberOfChannels: audioBuffer.numberOfChannels,
+    bitrate: targetBitrate,
+  };
+
+  const { encoder, encodedChunks } = createOpusEncoder(config);
+  const audioData = extractAudioData(audioBuffer);
+
+  await encodeAudioFrames({
+    audioData,
+    numberOfChannels: config.numberOfChannels,
+    sampleRate: config.sampleRate,
+    encoder,
+  });
+
+  return { encodedChunks, config };
+}
+
 /**
  * Encodes raw audio data to Opus in WebM container.
  */
@@ -170,49 +250,18 @@ export async function encodeRawBufferToOpus({
  */
 export async function encodeAudioBlobToOpus(
   blob: Blob | ArrayBuffer | Uint8Array,
-  targetBitrate: number = 64000
+  targetBitrate: number = DEFAULT_BITRATE
): Promise<Uint8Array> {
   const audioContext = new AudioContext();
   logger.debug('Encoding audio blob to Opus');
   try {
-    let buffer: ArrayBuffer;
-    if (blob instanceof Blob) {
-      buffer = await blob.arrayBuffer();
-    } else if (blob instanceof Uint8Array) {
-      buffer =
-        blob.buffer instanceof ArrayBuffer ? blob.buffer : blob.slice().buffer;
-    } else {
-      buffer = blob;
-    }
-
-    const audioBuffer = await audioContext.decodeAudioData(buffer);
-
-    const config: AudioEncodingConfig = {
-      sampleRate: audioBuffer.sampleRate,
-      numberOfChannels: audioBuffer.numberOfChannels,
-      bitrate: targetBitrate,
-    };
-
-    const { encoder, encodedChunks } = createOpusEncoder(config);
-
-    // Combine all channels into a single Float32Array
-    const audioData = new Float32Array(
-      audioBuffer.length * config.numberOfChannels
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const { encodedChunks, config } = await encodeAudioBufferToOpus(
+      audioBuffer,
+      targetBitrate
     );
-    for (let channel = 0; channel < config.numberOfChannels; channel++) {
-      const channelData = audioBuffer.getChannelData(channel);
-      for (let i = 0; i < channelData.length; i++) {
-        audioData[i * config.numberOfChannels + channel] = channelData[i];
-      }
-    }
-
-    await encodeAudioFrames({
-      audioData,
-      numberOfChannels: config.numberOfChannels,
-      sampleRate: config.sampleRate,
-      encoder,
-    });
 
     const webm = muxToWebM(encodedChunks, config);
     logger.debug('Encoded audio blob to Opus');
@@ -222,6 +271,126 @@
   }
 }
 
+/**
+ * Finds the best slice point based on audio level
+ */
+function findSlicePoint(
+  audioBuffer: AudioBuffer,
+  startSample: number,
+  endSample: number,
+  minSliceSamples: number
+): number {
+  // If we have more than min slice duration and not at the end,
+  // look for a good splitting point (low audio level)
+  if (
+    endSample < audioBuffer.length &&
+    endSample - startSample > minSliceSamples
+  ) {
+    // Start checking from min slice duration point
+    const checkStartSample = startSample + minSliceSamples;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Scan forward for a good split point (low audio level)
+    for (let i = checkStartSample; i < endSample; i++) {
+      // Calculate average level across all channels at this sample
+      let level = 0;
+      for (let channel = 0; channel < numberOfChannels; channel++) {
+        const data = audioBuffer.getChannelData(channel);
+        level += Math.abs(data[i]);
+      }
+      level /= numberOfChannels;
+
+      // If we found a quiet spot, use it as the split point
+      if (level < AUDIO_LEVEL_THRESHOLD) {
+        return i;
+      }
+    }
+  }
+
+  // If no good splitting point is found, use the original end sample
+  return endSample;
+}
+
+// Since the audio blob could be long and make the transcribe service busy,
+// we need to encode the audio blob to opus slices
+// Slice logic:
+// 1. Max slice duration is 10 minutes
+// 2. Min slice duration is 5 minutes
+// 3. If a new slice begins and the duration reached 5 minutes
+//    we start a new slice when the audio level value is below the threshold
+// 4. If the audio level value is above the threshold, we continue the current slice
+export async function encodeAudioBlobToOpusSlices(
+  blob: Blob | ArrayBuffer | Uint8Array,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<Uint8Array[]> {
+  const audioContext = new AudioContext();
+
+  try {
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const slices: Uint8Array[] = [];
+
+    // Define slicing parameters
+    const sampleRate = audioBuffer.sampleRate;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Calculate sizes in samples
+    const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate;
+    const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate;
+    const totalSamples = audioBuffer.length;
+
+    // Start slicing
+    let startSample = 0;
+
+    while (startSample < totalSamples) {
+      // Determine end sample for this slice
+      let endSample = Math.min(startSample + maxSliceSamples, totalSamples);
+
+      // Find the best slice point based on audio levels
+      endSample = findSlicePoint(
+        audioBuffer,
+        startSample,
+        endSample,
+        minSliceSamples
+      );
+
+      // Create a slice from startSample to endSample
+      const audioData = extractAudioData(audioBuffer, startSample, endSample);
+
+      // Encode this slice to Opus
+      const { encoder, encodedChunks } = createOpusEncoder({
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+
+      await encodeAudioFrames({
+        audioData,
+        numberOfChannels,
+        sampleRate,
+        encoder,
+      });
+
+      // Mux to WebM and add to slices
+      const webm = muxToWebM(encodedChunks, {
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+
+      slices.push(webm);
+
+      // Move to next slice
+      startSample = endSample;
+    }
+
+    logger.debug(`Encoded audio blob to ${slices.length} Opus slices`);
+    return slices;
+  } finally {
+    await audioContext.close();
+  }
+}
+
 export const createStreamEncoder = (
   recordingId: number,
   codecs: {