feat(core): support splitting audio blobs before submitting to backend (#11572)

fix AF-2484
pengx17
2025-04-09 12:44:37 +00:00
parent ba875a120f
commit 34b6e7ef88
6 changed files with 226 additions and 51 deletions


@@ -1,4 +1,4 @@
-import { encodeAudioBlobToOpus } from '@affine/core/utils/webm-encoding';
+import { encodeAudioBlobToOpusSlices } from '@affine/core/utils/webm-encoding';
 import { DebugLogger } from '@affine/debug';
 import { AiJobStatus } from '@affine/graphql';
 import track from '@affine/track';
@@ -115,17 +115,19 @@ export class AudioAttachmentBlock extends Entity<AttachmentBlockModel> {
     const job = this.framework.createEntity(AudioTranscriptionJob, {
       blobId: this.props.props.sourceId,
       blockProps: transcriptionBlockProps,
-      getAudioFile: async () => {
+      getAudioFiles: async () => {
         const buffer = await this.audioMedia.getBuffer();
         if (!buffer) {
           throw new Error('No audio buffer available');
         }
-        const encodedBuffer = await encodeAudioBlobToOpus(buffer, 64000);
-        const blob = new Blob([encodedBuffer], { type: this.props.props.type });
-        const file = new File([blob], this.props.props.name, {
-          type: this.props.props.type,
+        const slices = await encodeAudioBlobToOpusSlices(buffer, 64000);
+        const files = slices.map((slice, index) => {
+          const blob = new Blob([slice], { type: 'audio/opus' });
+          return new File([blob], this.props.props.name + `-${index}.opus`, {
+            type: 'audio/opus',
+          });
         });
-        return file;
+        return files;
       },
     });


@@ -12,7 +12,7 @@ import type { WorkspaceService } from '../../workspace';
 export class AudioTranscriptionJobStore extends Entity<{
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceService: WorkspaceService,
@@ -41,13 +41,13 @@ export class AudioTranscriptionJobStore extends Entity<{
       if (!graphqlService) {
         throw new Error('No graphql service available');
       }
-      const file = await this.props.getAudioFile();
+      const files = await this.props.getAudioFiles();
       const response = await graphqlService.gql({
         query: submitAudioTranscriptionMutation,
         variables: {
           workspaceId: this.currentWorkspaceId,
           blobId: this.props.blobId,
-          blob: file,
+          blobs: files,
         },
       });
       if (!response.submitAudioTranscription?.id) {


@@ -46,7 +46,7 @@ const logger = new DebugLogger('audio-transcription-job');
 export class AudioTranscriptionJob extends Entity<{
   readonly blockProps: TranscriptionBlockProps;
   readonly blobId: string;
-  readonly getAudioFile: () => Promise<File>;
+  readonly getAudioFiles: () => Promise<File[]>;
 }> {
   constructor(
     private readonly workspaceServerService: WorkspaceServerService,
@@ -68,7 +68,7 @@ export class AudioTranscriptionJob extends Entity<{
       AudioTranscriptionJobStore,
       {
         blobId: this.props.blobId,
-        getAudioFile: this.props.getAudioFile,
+        getAudioFiles: this.props.getAudioFiles,
       }
     );


@@ -8,8 +8,62 @@ interface AudioEncodingConfig {
   bitrate?: number;
 }
 
+interface AudioEncodingResult {
+  encodedChunks: EncodedAudioChunk[];
+  config: AudioEncodingConfig;
+}
+
 const logger = new DebugLogger('webm-encoding');
 
+// Constants
+const DEFAULT_BITRATE = 64000;
+const MAX_SLICE_DURATION_SECONDS = 10 * 60; // 10 minutes
+const MIN_SLICE_DURATION_SECONDS = 5 * 60; // 5 minutes
+const AUDIO_LEVEL_THRESHOLD = 0.02; // Threshold for "silence" detection
+
+/**
+ * Converts various blob formats to ArrayBuffer
+ */
+async function blobToArrayBuffer(
+  blob: Blob | ArrayBuffer | Uint8Array
+): Promise<ArrayBuffer> {
+  if (blob instanceof Blob) {
+    return await blob.arrayBuffer();
+  } else if (blob instanceof Uint8Array) {
+    return blob.buffer instanceof ArrayBuffer
+      ? blob.buffer
+      : blob.slice().buffer;
+  } else {
+    return blob;
+  }
+}
+
+/**
+ * Extracts a combined Float32Array from an AudioBuffer
+ */
+function extractAudioData(
+  audioBuffer: AudioBuffer,
+  startSample: number = 0,
+  endSample?: number
+): Float32Array {
+  const numberOfChannels = audioBuffer.numberOfChannels;
+  const sampleCount =
+    endSample !== undefined
+      ? endSample - startSample
+      : audioBuffer.length - startSample;
+
+  const audioData = new Float32Array(sampleCount * numberOfChannels);
+  for (let channel = 0; channel < numberOfChannels; channel++) {
+    const channelData = audioBuffer.getChannelData(channel);
+    for (let i = 0; i < sampleCount; i++) {
+      audioData[i * numberOfChannels + channel] = channelData[startSample + i];
+    }
+  }
+
+  return audioData;
+}
+
 /**
  * Creates and configures an Opus encoder with the given settings
  */
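
For reference, a minimal sketch of the channel-interleaved layout that extractAudioData produces — illustrative only, using two made-up channel arrays instead of a real AudioBuffer:

// Interleaving two hypothetical channel arrays the same way extractAudioData
// does, frame by frame: [L0, R0, L1, R1, ...]
const left = new Float32Array([0.1, 0.2, 0.3]);
const right = new Float32Array([-0.1, -0.2, -0.3]);
const numberOfChannels = 2;

const interleaved = new Float32Array(left.length * numberOfChannels);
for (let i = 0; i < left.length; i++) {
  interleaved[i * numberOfChannels + 0] = left[i];
  interleaved[i * numberOfChannels + 1] = right[i];
}
// interleaved ≈ [0.1, -0.1, 0.2, -0.2, 0.3, -0.3]
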
@@ -31,7 +85,7 @@ export function createOpusEncoder(config: AudioEncodingConfig): {
     codec: 'opus',
     sampleRate: config.sampleRate,
     numberOfChannels: config.numberOfChannels,
-    bitrate: config.bitrate ?? 64000,
+    bitrate: config.bitrate ?? DEFAULT_BITRATE,
   });
 
   return { encoder, encodedChunks };
@@ -104,6 +158,32 @@ export function muxToWebM(
   return new Uint8Array(target.buffer);
 }
 
+/**
+ * Process and encode audio data to Opus chunks
+ */
+async function encodeAudioBufferToOpus(
+  audioBuffer: AudioBuffer,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<AudioEncodingResult> {
+  const config: AudioEncodingConfig = {
+    sampleRate: audioBuffer.sampleRate,
+    numberOfChannels: audioBuffer.numberOfChannels,
+    bitrate: targetBitrate,
+  };
+
+  const { encoder, encodedChunks } = createOpusEncoder(config);
+  const audioData = extractAudioData(audioBuffer);
+
+  await encodeAudioFrames({
+    audioData,
+    numberOfChannels: config.numberOfChannels,
+    sampleRate: config.sampleRate,
+    encoder,
+  });
+
+  return { encodedChunks, config };
+}
+
 /**
  * Encodes raw audio data to Opus in WebM container.
  */
@@ -170,49 +250,18 @@ export async function encodeRawBufferToOpus({
  */
 export async function encodeAudioBlobToOpus(
   blob: Blob | ArrayBuffer | Uint8Array,
-  targetBitrate: number = 64000
+  targetBitrate: number = DEFAULT_BITRATE
 ): Promise<Uint8Array> {
   const audioContext = new AudioContext();
   logger.debug('Encoding audio blob to Opus');
   try {
-    let buffer: ArrayBuffer;
-    if (blob instanceof Blob) {
-      buffer = await blob.arrayBuffer();
-    } else if (blob instanceof Uint8Array) {
-      buffer =
-        blob.buffer instanceof ArrayBuffer ? blob.buffer : blob.slice().buffer;
-    } else {
-      buffer = blob;
-    }
-    const audioBuffer = await audioContext.decodeAudioData(buffer);
-    const config: AudioEncodingConfig = {
-      sampleRate: audioBuffer.sampleRate,
-      numberOfChannels: audioBuffer.numberOfChannels,
-      bitrate: targetBitrate,
-    };
-    const { encoder, encodedChunks } = createOpusEncoder(config);
-    // Combine all channels into a single Float32Array
-    const audioData = new Float32Array(
-      audioBuffer.length * config.numberOfChannels
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const { encodedChunks, config } = await encodeAudioBufferToOpus(
+      audioBuffer,
+      targetBitrate
     );
-    for (let channel = 0; channel < config.numberOfChannels; channel++) {
-      const channelData = audioBuffer.getChannelData(channel);
-      for (let i = 0; i < channelData.length; i++) {
-        audioData[i * config.numberOfChannels + channel] = channelData[i];
-      }
-    }
-    await encodeAudioFrames({
-      audioData,
-      numberOfChannels: config.numberOfChannels,
-      sampleRate: config.sampleRate,
-      encoder,
-    });
     const webm = muxToWebM(encodedChunks, config);
     logger.debug('Encoded audio blob to Opus');
@@ -222,6 +271,126 @@ export async function encodeAudioBlobToOpus(
   }
 }
 
+/**
+ * Finds the best slice point based on audio level
+ */
+function findSlicePoint(
+  audioBuffer: AudioBuffer,
+  startSample: number,
+  endSample: number,
+  minSliceSamples: number
+): number {
+  // If we have more than min slice duration and not at the end,
+  // look for a good splitting point (low audio level)
+  if (
+    endSample < audioBuffer.length &&
+    endSample - startSample > minSliceSamples
+  ) {
+    // Start checking from the min slice duration point
+    const checkStartSample = startSample + minSliceSamples;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Scan forward for a good split point (low audio level)
+    for (let i = checkStartSample; i < endSample; i++) {
+      // Calculate the average level across all channels at this sample
+      let level = 0;
+      for (let channel = 0; channel < numberOfChannels; channel++) {
+        const data = audioBuffer.getChannelData(channel);
+        level += Math.abs(data[i]);
+      }
+      level /= numberOfChannels;
+
+      // If we found a quiet spot, use it as the split point
+      if (level < AUDIO_LEVEL_THRESHOLD) {
+        return i;
+      }
+    }
+  }
+
+  // If no good splitting point is found, use the original end sample
+  return endSample;
+}
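
To make the sample arithmetic concrete, here is a worked example for a hypothetical 48 kHz recording (the sample rate is an assumption for illustration; the constants are the ones defined above):

const sampleRate = 48000; // hypothetical
const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate; // 300 * 48000 = 14,400,000 samples
const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate; // 600 * 48000 = 28,800,000 samples
// For a slice starting at sample 0, findSlicePoint scans samples
// 14,400,000 .. 28,800,000 and returns the first one whose average level
// across channels drops below AUDIO_LEVEL_THRESHOLD (0.02); if no quiet
// sample is found, it falls back to the 10-minute cap at 28,800,000
// (assuming the recording is longer than 10 minutes).
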
+
+// Since the audio blob could be long and keep the transcription service busy,
+// we encode the audio blob into multiple Opus slices.
+// Slice logic:
+// 1. Max slice duration is 10 minutes
+// 2. Min slice duration is 5 minutes
+// 3. Once the current slice has reached the 5-minute minimum, we start a new
+//    slice at the first point where the audio level drops below the threshold
+// 4. While the audio level stays above the threshold, we continue the current slice
+export async function encodeAudioBlobToOpusSlices(
+  blob: Blob | ArrayBuffer | Uint8Array,
+  targetBitrate: number = DEFAULT_BITRATE
+): Promise<Uint8Array[]> {
+  const audioContext = new AudioContext();
+  try {
+    const arrayBuffer = await blobToArrayBuffer(blob);
+    const audioBuffer = await audioContext.decodeAudioData(arrayBuffer);
+    const slices: Uint8Array[] = [];
+
+    // Define slicing parameters
+    const sampleRate = audioBuffer.sampleRate;
+    const numberOfChannels = audioBuffer.numberOfChannels;
+
+    // Calculate sizes in samples
+    const maxSliceSamples = MAX_SLICE_DURATION_SECONDS * sampleRate;
+    const minSliceSamples = MIN_SLICE_DURATION_SECONDS * sampleRate;
+    const totalSamples = audioBuffer.length;
+
+    // Start slicing
+    let startSample = 0;
+    while (startSample < totalSamples) {
+      // Determine the end sample for this slice
+      let endSample = Math.min(startSample + maxSliceSamples, totalSamples);
+
+      // Find the best slice point based on audio levels
+      endSample = findSlicePoint(
+        audioBuffer,
+        startSample,
+        endSample,
+        minSliceSamples
+      );
+
+      // Create a slice from startSample to endSample
+      const audioData = extractAudioData(audioBuffer, startSample, endSample);
+
+      // Encode this slice to Opus
+      const { encoder, encodedChunks } = createOpusEncoder({
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+      await encodeAudioFrames({
+        audioData,
+        numberOfChannels,
+        sampleRate,
+        encoder,
+      });
+
+      // Mux to WebM and add to slices
+      const webm = muxToWebM(encodedChunks, {
+        sampleRate,
+        numberOfChannels,
+        bitrate: targetBitrate,
+      });
+      slices.push(webm);
+
+      // Move to the next slice
+      startSample = endSample;
+    }
+
+    logger.debug(`Encoded audio blob to ${slices.length} Opus slices`);
+    return slices;
+  } finally {
+    await audioContext.close();
+  }
+}
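
A minimal usage sketch of the new entry point, mirroring what the getAudioFiles callback above does — the recording blob and baseName here are hypothetical:

// Hypothetical caller: slice a long recording and wrap each WebM/Opus slice
// in a File, the same way the getAudioFiles callback above does.
async function toOpusSliceFiles(recording: Blob, baseName: string): Promise<File[]> {
  const slices = await encodeAudioBlobToOpusSlices(recording, 64000);
  return slices.map(
    (slice, index) =>
      new File([new Blob([slice], { type: 'audio/opus' })], `${baseName}-${index}.opus`, {
        type: 'audio/opus',
      })
  );
}
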
 
 export const createStreamEncoder = (
   recordingId: number,
   codecs: {