From 34b6e7ef88f3258a4501b8782aabb731cd6196fa Mon Sep 17 00:00:00 2001
From: pengx17
Date: Wed, 9 Apr 2025 12:44:37 +0000
Subject: [PATCH] feat(core): support splitting audio blobs before submitting to backend (#11572)

fix AF-2484
---
 packages/common/graphql/export-gql-plugin.cjs |   5 +-
 packages/common/graphql/src/graphql/index.ts  |   1 +
 .../media/entities/audio-attachment-block.ts  |  16 +-
 .../entities/audio-transcription-job-store.ts |   6 +-
 .../media/entities/audio-transcription-job.ts |   4 +-
 .../frontend/core/src/utils/webm-encoding.ts  | 245 +++++++++++++++---
 6 files changed, 226 insertions(+), 51 deletions(-)

diff --git a/packages/common/graphql/export-gql-plugin.cjs b/packages/common/graphql/export-gql-plugin.cjs
index 665b845ce4..4822e43490 100644
--- a/packages/common/graphql/export-gql-plugin.cjs
+++ b/packages/common/graphql/export-gql-plugin.cjs
@@ -163,7 +163,10 @@ module.exports = {
 
     // parse 'file' fields
     const containsFile = node.variableDefinitions.some(def => {
-      const varType = def?.type?.type?.name?.value;
+      const varType =
+        def.type.kind === 'NamedType'
+          ? def.type.name.value
+          : def?.type?.type?.name?.value;
       const checkContainFile = type => {
         if (schema.getType(type)?.name === 'Upload') return true;
         const typeDef = schema.getType(type);
diff --git a/packages/common/graphql/src/graphql/index.ts b/packages/common/graphql/src/graphql/index.ts
index e31b68fe04..7276f57af5 100644
--- a/packages/common/graphql/src/graphql/index.ts
+++ b/packages/common/graphql/src/graphql/index.ts
@@ -612,6 +612,7 @@ export const submitAudioTranscriptionMutation = {
       status
     }
   }`,
+  file: true,
 };
 
 export const claimAudioTranscriptionMutation = {
diff --git a/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts b/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
index 0aa6732cc9..531432947c 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-attachment-block.ts
@@ -1,4 +1,4 @@
-import { encodeAudioBlobToOpus } from '@affine/core/utils/webm-encoding';
+import { encodeAudioBlobToOpusSlices } from '@affine/core/utils/webm-encoding';
 import { DebugLogger } from '@affine/debug';
 import { AiJobStatus } from '@affine/graphql';
 import track from '@affine/track';
@@ -115,17 +115,19 @@ export class AudioAttachmentBlock extends Entity {
     const job = this.framework.createEntity(AudioTranscriptionJob, {
       blobId: this.props.props.sourceId,
       blockProps: transcriptionBlockProps,
-      getAudioFile: async () => {
+      getAudioFiles: async () => {
         const buffer = await this.audioMedia.getBuffer();
         if (!buffer) {
           throw new Error('No audio buffer available');
         }
-        const encodedBuffer = await encodeAudioBlobToOpus(buffer, 64000);
-        const blob = new Blob([encodedBuffer], { type: this.props.props.type });
-        const file = new File([blob], this.props.props.name, {
-          type: this.props.props.type,
+        const slices = await encodeAudioBlobToOpusSlices(buffer, 64000);
+        const files = slices.map((slice, index) => {
+          const blob = new Blob([slice], { type: 'audio/opus' });
+          return new File([blob], this.props.props.name + `-${index}.opus`, {
+            type: 'audio/opus',
+          });
         });
-        return file;
+        return files;
       },
     });
diff --git a/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts b/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
index 3ef75aa6c7..8b510f3c6c 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-transcription-job-store.ts
@@ -12,7 +12,7 @@ import type { WorkspaceService } from '../../workspace';
 
 export class AudioTranscriptionJobStore extends Entity<{
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceService: WorkspaceService,
@@ -41,13 +41,13 @@
     if (!graphqlService) {
       throw new Error('No graphql service available');
     }
-    const file = await this.props.getAudioFile();
+    const files = await this.props.getAudioFiles();
     const response = await graphqlService.gql({
       query: submitAudioTranscriptionMutation,
       variables: {
         workspaceId: this.currentWorkspaceId,
         blobId: this.props.blobId,
-        blob: file,
+        blobs: files,
       },
     });
     if (!response.submitAudioTranscription?.id) {
diff --git a/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts b/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
index 22fef885c4..c73f3b5bc8 100644
--- a/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
+++ b/packages/frontend/core/src/modules/media/entities/audio-transcription-job.ts
@@ -46,7 +46,7 @@ const logger = new DebugLogger('audio-transcription-job');
 export class AudioTranscriptionJob extends Entity<{
   readonly blockProps: TranscriptionBlockProps;
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceServerService: WorkspaceServerService,
@@ -68,7 +68,7 @@ export class AudioTranscriptionJob extends Entity<{
       AudioTranscriptionJobStore,
       {
         blobId: this.props.blobId,
-        getAudioFile: this.props.getAudioFile,
+        getAudioFiles: this.props.getAudioFiles,
       }
     );
diff --git a/packages/frontend/core/src/utils/webm-encoding.ts b/packages/frontend/core/src/utils/webm-encoding.ts
index 63163fd32f..5863fe1727 100644
--- a/packages/frontend/core/src/utils/webm-encoding.ts
+++ b/packages/frontend/core/src/utils/webm-encoding.ts
@@ -8,8 +8,62 @@ interface AudioEncodingConfig {
   bitrate?: number;
 }
 
+interface AudioEncodingResult {
+  encodedChunks: EncodedAudioChunk[];
+  config: AudioEncodingConfig;
+}
+
 const logger = new DebugLogger('webm-encoding');
 
+// Constants
+const DEFAULT_BITRATE = 64000;
+const MAX_SLICE_DURATION_SECONDS = 10 * 60; // 10 minutes
+const MIN_SLICE_DURATION_SECONDS = 5 * 60; // 5 minutes
+const AUDIO_LEVEL_THRESHOLD = 0.02; // Threshold for "silence" detection
+
+/**
+ * Converts various blob formats to ArrayBuffer
+ */
+async function blobToArrayBuffer(
+  blob: Blob | ArrayBuffer | Uint8Array
+): Promise<ArrayBuffer> {
+  if (blob instanceof Blob) {
+    return await blob.arrayBuffer();
+  } else if (blob instanceof Uint8Array) {
+    return blob.buffer instanceof ArrayBuffer
+      ? blob.buffer
+      : blob.slice().buffer;
+  } else {
+    return blob;
+  }
+}
+
+/**
+ * Extracts a combined Float32Array from an AudioBuffer
+ */
+function extractAudioData(
+  audioBuffer: AudioBuffer,
+  startSample: number = 0,
+  endSample?: number
+): Float32Array {
+  const numberOfChannels = audioBuffer.numberOfChannels;
+  const sampleCount =
+    endSample !== undefined
+      ? endSample - startSample
+      : audioBuffer.length - startSample;
+
+  const audioData = new Float32Array(sampleCount * numberOfChannels);
+
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const channelData = audioBuffer.getChannelData(channel);
+    for (let i = 0; i < sampleCount; i++) {
+      audioData[i * numberOfChannels + channel] = channelData[startSample + i];
+    }
+  }
+
+  return audioData;
+}
+
 /**
  * Creates and configures an Opus encoder with the given settings
  */
@@ -31,7 +85,7 @@ export function createOpusEncoder(config: AudioEncodingConfig): {
     codec: 'opus',
     sampleRate: config.sampleRate,
     numberOfChannels: config.numberOfChannels,
-    bitrate: config.bitrate ?? 64000,
+    bitrate: config.bitrate ?? DEFAULT_BITRATE,
   });
 
   return { encoder, encodedChunks };
@@ -104,6 +158,32 @@ export function muxToWebM(
   return new Uint8Array(target.buffer);
 }
 
+/**
+ * Process and encode audio data to Opus chunks
+ */
+async function encodeAudioBufferToOpus(
+  audioBuffer: AudioBuffer,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<AudioEncodingResult> {
+  const config: AudioEncodingConfig = {
+    sampleRate: audioBuffer.sampleRate,
+    numberOfChannels: audioBuffer.numberOfChannels,
+    bitrate: targetBitrate,
+  };
+
+  const { encoder, encodedChunks } = createOpusEncoder(config);
+  const audioData = extractAudioData(audioBuffer);
+
+  await encodeAudioFrames({
+    audioData,
+    numberOfChannels: config.numberOfChannels,
+    sampleRate: config.sampleRate,
+    encoder,
+  });
+
+  return { encodedChunks, config };
+}
+
 /**
  * Encodes raw audio data to Opus in WebM container.
  */
@@ -170,49 +250,18 @@ export async function encodeRawBufferToOpus({
  */
 export async function encodeAudioBlobToOpus(
   blob: Blob | ArrayBuffer | Uint8Array,
-  targetBitrate: number = 64000
+  targetBitrate: number = DEFAULT_BITRATE
): Promise<Uint8Array> {
   const audioContext = new AudioContext();
   logger.debug('Encoding audio blob to Opus');
   try {
-    let buffer: ArrayBuffer;
-    if (blob instanceof Blob) {
-      buffer = await blob.arrayBuffer();
-    } else if (blob instanceof Uint8Array) {
-      buffer =
-        blob.buffer instanceof ArrayBuffer ? blob.buffer : blob.slice().buffer;
-    } else {
-      buffer = blob;
-    }
-
-    const audioBuffer = await audioContext.decodeAudioData(buffer);
-
-    const config: AudioEncodingConfig = {
-      sampleRate: audioBuffer.sampleRate,
-      numberOfChannels: audioBuffer.numberOfChannels,
-      bitrate: targetBitrate,
-    };
-
-    const { encoder, encodedChunks } = createOpusEncoder(config);
-
-    // Combine all channels into a single Float32Array
-    const audioData = new Float32Array(
-      audioBuffer.length * config.numberOfChannels
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const { encodedChunks, config } = await encodeAudioBufferToOpus(
+      audioBuffer,
+      targetBitrate
     );
-    for (let channel = 0; channel < config.numberOfChannels; channel++) {
-      const channelData = audioBuffer.getChannelData(channel);
-      for (let i = 0; i < channelData.length; i++) {
-        audioData[i * config.numberOfChannels + channel] = channelData[i];
-      }
-    }
-
-    await encodeAudioFrames({
-      audioData,
-      numberOfChannels: config.numberOfChannels,
-      sampleRate: config.sampleRate,
-      encoder,
-    });
 
     const webm = muxToWebM(encodedChunks, config);
     logger.debug('Encoded audio blob to Opus');
@@ -222,6 +271,126 @@
   }
 }
 
+/**
+ * Finds the best slice point based on audio level
+ */
+function findSlicePoint(
+  audioBuffer: AudioBuffer,
+  startSample: number,
+  endSample: number,
+  minSliceSamples: number
+): number {
+  // If we have more than min slice duration and not at the end,
+  // look for a good splitting point (low audio level)
+  if (
+    endSample < audioBuffer.length &&
+    endSample - startSample > minSliceSamples
+  ) {
+    // Start checking from min slice duration point
+    const checkStartSample = startSample + minSliceSamples;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Scan forward for a good split point (low audio level)
+    for (let i = checkStartSample; i < endSample; i++) {
+      // Calculate average level across all channels at this sample
+      let level = 0;
+      for (let channel = 0; channel < numberOfChannels; channel++) {
+        const data = audioBuffer.getChannelData(channel);
+        level += Math.abs(data[i]);
+      }
+      level /= numberOfChannels;
+
+      // If we found a quiet spot, use it as the split point
+      if (level < AUDIO_LEVEL_THRESHOLD) {
+        return i;
+      }
+    }
+  }
+
+  // If no good splitting point is found, use the original end sample
+  return endSample;
+}
+
+// Since the audio blob could be long and make the transcribe service busy,
+// we need to encode the audio blob to opus slices
+// Slice logic:
+// 1. Max slice duration is 10 minutes
+// 2. Min slice duration is 5 minutes
+// 3. If a new slice begins and the duration reached 5 minutes
+//    we start a new slice when the audio level value is below the threshold
+// 4. If the audio level value is above the threshold, we continue the current slice
+export async function encodeAudioBlobToOpusSlices(
+  blob: Blob | ArrayBuffer | Uint8Array,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<Uint8Array[]> {
+  const audioContext = new AudioContext();
+
+  try {
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const slices: Uint8Array[] = [];
+
+    // Define slicing parameters
+    const sampleRate = audioBuffer.sampleRate;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Calculate sizes in samples
+    const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate;
+    const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate;
+    const totalSamples = audioBuffer.length;
+
+    // Start slicing
+    let startSample = 0;
+
+    while (startSample < totalSamples) {
+      // Determine end sample for this slice
+      let endSample = Math.min(startSample + maxSliceSamples, totalSamples);
+
+      // Find the best slice point based on audio levels
+      endSample = findSlicePoint(
+        audioBuffer,
+        startSample,
+        endSample,
+        minSliceSamples
+      );
+
+      // Create a slice from startSample to endSample
+      const audioData = extractAudioData(audioBuffer, startSample, endSample);
+
+      // Encode this slice to Opus
+      const { encoder, encodedChunks } = createOpusEncoder({
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+
+      await encodeAudioFrames({
+        audioData,
+        numberOfChannels,
+        sampleRate,
+        encoder,
+      });
+
+      // Mux to WebM and add to slices
+      const webm = muxToWebM(encodedChunks, {
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+
+      slices.push(webm);
+
+      // Move to next slice
+      startSample = endSample;
+    }
+
+    logger.debug(`Encoded audio blob to ${slices.length} Opus slices`);
+    return slices;
+  } finally {
+    await audioContext.close();
+  }
+}
+
 export const createStreamEncoder = (
   recordingId: number,
   codecs: {