feat(core): support splitting audio blobs before submitting to backend (#11572)

fix AF-2484
pengx17
2025-04-09 12:44:37 +00:00
parent ba875a120f
commit 34b6e7ef88
6 changed files with 226 additions and 51 deletions


@@ -1,4 +1,4 @@
-import { encodeAudioBlobToOpus } from '@affine/core/utils/webm-encoding';
+import { encodeAudioBlobToOpusSlices } from '@affine/core/utils/webm-encoding';
 import { DebugLogger } from '@affine/debug';
 import { AiJobStatus } from '@affine/graphql';
 import track from '@affine/track';
@@ -115,17 +115,19 @@ export class AudioAttachmentBlock extends Entity<AttachmentBlockModel> {
     const job = this.framework.createEntity(AudioTranscriptionJob, {
       blobId: this.props.props.sourceId,
       blockProps: transcriptionBlockProps,
-      getAudioFile: async () => {
+      getAudioFiles: async () => {
         const buffer = await this.audioMedia.getBuffer();
         if (!buffer) {
           throw new Error('No audio buffer available');
         }
-        const encodedBuffer = await encodeAudioBlobToOpus(buffer, 64000);
-        const blob = new Blob([encodedBuffer], { type: this.props.props.type });
-        const file = new File([blob], this.props.props.name, {
-          type: this.props.props.type,
+        const slices = await encodeAudioBlobToOpusSlices(buffer, 64000);
+        const files = slices.map((slice, index) => {
+          const blob = new Blob([slice], { type: 'audio/opus' });
+          return new File([blob], this.props.props.name + `-${index}.opus`, {
+            type: 'audio/opus',
+          });
         });
-        return file;
+        return files;
       },
     });


@@ -12,7 +12,7 @@ import type { WorkspaceService } from '../../workspace';
 export class AudioTranscriptionJobStore extends Entity<{
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceService: WorkspaceService,
@@ -41,13 +41,13 @@ export class AudioTranscriptionJobStore extends Entity<{
       if (!graphqlService) {
         throw new Error('No graphql service available');
       }
-      const file = await this.props.getAudioFile();
+      const files = await this.props.getAudioFiles();
       const response = await graphqlService.gql({
         query: submitAudioTranscriptionMutation,
         variables: {
           workspaceId: this.currentWorkspaceId,
           blobId: this.props.blobId,
-          blob: file,
+          blobs: files,
         },
       });
       if (!response.submitAudioTranscription?.id) {


@@ -46,7 +46,7 @@ const logger = new DebugLogger('audio-transcription-job');
 export class AudioTranscriptionJob extends Entity<{
   readonly blockProps: TranscriptionBlockProps;
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceServerService: WorkspaceServerService,
@@ -68,7 +68,7 @@ export class AudioTranscriptionJob extends Entity<{
       AudioTranscriptionJobStore,
       {
         blobId: this.props.blobId,
-        getAudioFile: this.props.getAudioFile,
+        getAudioFiles: this.props.getAudioFiles,
       }
     );


@@ -8,8 +8,62 @@ interface AudioEncodingConfig {
   bitrate?: number;
 }
 
+interface AudioEncodingResult {
+  encodedChunks: EncodedAudioChunk[];
+  config: AudioEncodingConfig;
+}
+
 const logger = new DebugLogger('webm-encoding');
 
+// Constants
+const DEFAULT_BITRATE = 64000;
+const MAX_SLICE_DURATION_SECONDS = 10 * 60; // 10 minutes
+const MIN_SLICE_DURATION_SECONDS = 5 * 60; // 5 minutes
+const AUDIO_LEVEL_THRESHOLD = 0.02; // Threshold for "silence" detection
+
+/**
+ * Converts various blob formats to ArrayBuffer
+ */
+async function blobToArrayBuffer(
+  blob: Blob | ArrayBuffer | Uint8Array
+): Promise<ArrayBuffer> {
+  if (blob instanceof Blob) {
+    return await blob.arrayBuffer();
+  } else if (blob instanceof Uint8Array) {
+    return blob.buffer instanceof ArrayBuffer
+      ? blob.buffer
+      : blob.slice().buffer;
+  } else {
+    return blob;
+  }
+}
+
+/**
+ * Extracts a combined Float32Array from an AudioBuffer
+ */
+function extractAudioData(
+  audioBuffer: AudioBuffer,
+  startSample: number = 0,
+  endSample?: number
+): Float32Array {
+  const numberOfChannels = audioBuffer.numberOfChannels;
+  const sampleCount =
+    endSample !== undefined
+      ? endSample - startSample
+      : audioBuffer.length - startSample;
+
+  const audioData = new Float32Array(sampleCount * numberOfChannels);
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const channelData = audioBuffer.getChannelData(channel);
+    for (let i = 0; i < sampleCount; i++) {
+      audioData[i * numberOfChannels + channel] = channelData[startSample + i];
+    }
+  }
+
+  return audioData;
+}
+
 /**
  * Creates and configures an Opus encoder with the given settings
  */
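
For reference, a minimal sketch of the channel-interleaved layout that extractAudioData produces — illustrative only, using two made-up channel arrays instead of a real AudioBuffer:

// Interleaving two hypothetical channel arrays the same way extractAudioData
// does, frame by frame: [L0, R0, L1, R1, ...]
const left = new Float32Array([0.1, 0.2, 0.3]);
const right = new Float32Array([-0.1, -0.2, -0.3]);
const numberOfChannels = 2;

const interleaved = new Float32Array(left.length * numberOfChannels);
for (let i = 0; i < left.length; i++) {
  interleaved[i * numberOfChannels + 0] = left[i];
  interleaved[i * numberOfChannels + 1] = right[i];
}
// interleaved ≈ [0.1, -0.1, 0.2, -0.2, 0.3, -0.3]
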
@@ -31,7 +85,7 @@ export function createOpusEncoder(config: AudioEncodingConfig): {
     codec: 'opus',
     sampleRate: config.sampleRate,
     numberOfChannels: config.numberOfChannels,
-    bitrate: config.bitrate ?? 64000,
+    bitrate: config.bitrate ?? DEFAULT_BITRATE,
   });
 
   return { encoder, encodedChunks };
@@ -104,6 +158,32 @@ export function muxToWebM(
   return new Uint8Array(target.buffer);
 }
 
+/**
+ * Process and encode audio data to Opus chunks
+ */
+async function encodeAudioBufferToOpus(
+  audioBuffer: AudioBuffer,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<AudioEncodingResult> {
+  const config: AudioEncodingConfig = {
+    sampleRate: audioBuffer.sampleRate,
+    numberOfChannels: audioBuffer.numberOfChannels,
+    bitrate: targetBitrate,
+  };
+
+  const { encoder, encodedChunks } = createOpusEncoder(config);
+  const audioData = extractAudioData(audioBuffer);
+
+  await encodeAudioFrames({
+    audioData,
+    numberOfChannels: config.numberOfChannels,
+    sampleRate: config.sampleRate,
+    encoder,
+  });
+
+  return { encodedChunks, config };
+}
+
 /**
  * Encodes raw audio data to Opus in WebM container.
  */
@@ -170,49 +250,18 @@ export async function encodeRawBufferToOpus({
  */
 export async function encodeAudioBlobToOpus(
   blob: Blob | ArrayBuffer | Uint8Array,
-  targetBitrate: number = 64000
+  targetBitrate: number = DEFAULT_BITRATE
 ): Promise<Uint8Array> {
   const audioContext = new AudioContext();
   logger.debug('Encoding audio blob to Opus');
   try {
-    let buffer: ArrayBuffer;
-    if (blob instanceof Blob) {
-      buffer = await blob.arrayBuffer();
-    } else if (blob instanceof Uint8Array) {
-      buffer =
-        blob.buffer instanceof ArrayBuffer ? blob.buffer : blob.slice().buffer;
-    } else {
-      buffer = blob;
-    }
-    const audioBuffer = await audioContext.decodeAudioData(buffer);
-    const config: AudioEncodingConfig = {
-      sampleRate: audioBuffer.sampleRate,
-      numberOfChannels: audioBuffer.numberOfChannels,
-      bitrate: targetBitrate,
-    };
-    const { encoder, encodedChunks } = createOpusEncoder(config);
-    // Combine all channels into a single Float32Array
-    const audioData = new Float32Array(
-      audioBuffer.length * config.numberOfChannels
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const { encodedChunks, config } = await encodeAudioBufferToOpus(
+      audioBuffer,
+      targetBitrate
     );
-    for (let channel = 0; channel < config.numberOfChannels; channel++) {
-      const channelData = audioBuffer.getChannelData(channel);
-      for (let i = 0; i < channelData.length; i++) {
-        audioData[i * config.numberOfChannels + channel] = channelData[i];
-      }
-    }
-    await encodeAudioFrames({
-      audioData,
-      numberOfChannels: config.numberOfChannels,
-      sampleRate: config.sampleRate,
-      encoder,
-    });
     const webm = muxToWebM(encodedChunks, config);
     logger.debug('Encoded audio blob to Opus');
@@ -222,6 +271,126 @@ export async function encodeAudioBlobToOpus(
   }
 }
 
+/**
+ * Finds the best slice point based on audio level
+ */
+function findSlicePoint(
+  audioBuffer: AudioBuffer,
+  startSample: number,
+  endSample: number,
+  minSliceSamples: number
+): number {
+  // If we have more than min slice duration and not at the end,
+  // look for a good splitting point (low audio level)
+  if (
+    endSample < audioBuffer.length &&
+    endSample - startSample > minSliceSamples
+  ) {
+    // Start checking from the min slice duration point
+    const checkStartSample = startSample + minSliceSamples;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Scan forward for a good split point (low audio level)
+    for (let i = checkStartSample; i < endSample; i++) {
+      // Calculate the average level across all channels at this sample
+      let level = 0;
+      for (let channel = 0; channel < numberOfChannels; channel++) {
+        const data = audioBuffer.getChannelData(channel);
+        level += Math.abs(data[i]);
+      }
+      level /= numberOfChannels;
+
+      // If we found a quiet spot, use it as the split point
+      if (level < AUDIO_LEVEL_THRESHOLD) {
+        return i;
+      }
+    }
+  }
+
+  // If no good splitting point is found, use the original end sample
+  return endSample;
+}
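
To make the sample arithmetic concrete, here is a worked example for a hypothetical 48 kHz recording (the sample rate is an assumption for illustration; the constants are the ones defined above):

const sampleRate = 48000; // hypothetical
const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate; // 300 * 48000 = 14,400,000 samples
const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate; // 600 * 48000 = 28,800,000 samples
// For a slice starting at sample 0, findSlicePoint scans samples
// 14,400,000 .. 28,800,000 and returns the first one whose average level
// across channels drops below AUDIO_LEVEL_THRESHOLD (0.02); if no quiet
// sample is found, it falls back to the 10-minute cap at 28,800,000
// (assuming the recording is longer than 10 minutes).
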
+
+// Since the audio blob could be long and keep the transcription service busy,
+// we encode the audio blob into multiple Opus slices.
+// Slice logic:
+// 1. Max slice duration is 10 minutes
+// 2. Min slice duration is 5 minutes
+// 3. Once the current slice has reached the 5-minute minimum, we start a new
+//    slice at the first point where the audio level drops below the threshold
+// 4. While the audio level stays above the threshold, we continue the current slice
+export async function encodeAudioBlobToOpusSlices(
+  blob: Blob | ArrayBuffer | Uint8Array,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<Uint8Array[]> {
+  const audioContext = new AudioContext();
+  try {
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const slices: Uint8Array[] = [];
+
+    // Define slicing parameters
+    const sampleRate = audioBuffer.sampleRate;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Calculate sizes in samples
+    const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate;
+    const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate;
+    const totalSamples = audioBuffer.length;
+
+    // Start slicing
+    let startSample = 0;
+    while (startSample < totalSamples) {
+      // Determine the end sample for this slice
+      let endSample = Math.min(startSample + maxSliceSamples, totalSamples);
+
+      // Find the best slice point based on audio levels
+      endSample = findSlicePoint(
+        audioBuffer,
+        startSample,
+        endSample,
+        minSliceSamples
+      );
+
+      // Create a slice from startSample to endSample
+      const audioData = extractAudioData(audioBuffer, startSample, endSample);
+
+      // Encode this slice to Opus
+      const { encoder, encodedChunks } = createOpusEncoder({
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+      await encodeAudioFrames({
+        audioData,
+        numberOfChannels,
+        sampleRate,
+        encoder,
+      });
+
+      // Mux to WebM and add to slices
+      const webm = muxToWebM(encodedChunks, {
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+      slices.push(webm);
+
+      // Move to the next slice
+      startSample = endSample;
+    }
+
+    logger.debug(`Encoded audio blob to ${slices.length} Opus slices`);
+    return slices;
+  } finally {
+    await audioContext.close();
+  }
+}
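
A minimal usage sketch of the new entry point, mirroring what the getAudioFiles callback above does — the recording blob and baseName here are hypothetical:

// Hypothetical caller: slice a long recording and wrap each WebM/Opus slice
// in a File, the same way the getAudioFiles callback above does.
async function toOpusSliceFiles(recording: Blob, baseName: string): Promise<File[]> {
  const slices = await encodeAudioBlobToOpusSlices(recording, 64000);
  return slices.map(
    (slice, index) =>
      new File([new Blob([slice], { type: 'audio/opus' })], `${baseName}-${index}.opus`, {
        type: 'audio/opus',
      })
  );
}
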
 
 export const createStreamEncoder = (
   recordingId: number,
   codecs: {