mirror of
https://github.com/toeverything/AFFiNE.git
synced 2026-02-13 12:55:00 +00:00
Need to encode the audio based on the sample's sample rate & channels. Also fixed that global audio tap not receiving any samples at all.
201 lines
5.2 KiB
TypeScript
201 lines
5.2 KiB
TypeScript
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
import {
|
|
GoogleAIFileManager,
|
|
type UploadFileResponse,
|
|
} from '@google/generative-ai/server';
|
|
|
|
// Model used when the caller does not specify one via `options.model`.
const DEFAULT_MODEL = 'gemini-2.0-flash';
|
|
|
|
/**
 * Structured result of processing an audio recording: a diarized transcript
 * plus a generated title and markdown summary.
 *
 * NOTE(review): the transcription pass only produces `segments`; `title` and
 * `summary` are filled in by the summarization pass. A transcription-only
 * result will not carry those fields — confirm against callers.
 */
export interface TranscriptionResult {
  /** Short title of the recording (produced by the summary pass). */
  title: string;
  /** Markdown summary of the conversation (produced by the summary pass). */
  summary: string;
  /** Diarized transcript segments. */
  segments: {
    /** Speaker label, e.g. "Speaker A", or a real name when identifiable. */
    speaker: string;
    /** Segment start timestamp in "MM:SS" format. */
    start_time: string;
    /** Segment end timestamp in "MM:SS" format. */
    end_time: string;
    /** Transcribed text for this segment. */
    transcription: string;
  }[];
}
|
|
|
|
// Prompt for the first pass: diarized transcription of the uploaded audio,
// returned as JSON. Fix: the example previously showed a trailing comma
// after the "segments" array (invalid JSON), which contradicts the
// `application/json` response constraint and can steer the model toward
// malformed output.
const PROMPT_TRANSCRIPTION = `
Generate audio transcription and diarization for the recording.
The recording source is most likely from a video call with multiple speakers.
Output in JSON format with the following structure:

{
  "segments": [
    {
      "speaker": "Speaker A",
      "start_time": "MM:SS",
      "end_time": "MM:SS",
      "transcription": "..."
    },
    ...
  ]
}
- Use consistent speaker labels throughout
- Accurate timestamps in MM:SS format
- Clean transcription with proper punctuation
- Identify speakers by name if possible, otherwise use "Speaker A/B/C"
`;
|
|
|
|
// Prompt for the second pass: turns the transcript JSON into a title and a
// markdown summary. Fixes: "sumary" typo, and the input-format example
// previously showed a trailing comma after "segments" — JSON.stringify never
// produces one, so the example misdescribed the actual input.
const PROMPT_SUMMARY = `
Generate a short title and summary of the conversation. The input is in the following JSON format:
{
  "segments": [
    {
      "speaker": "Speaker A",
      "start_time": "MM:SS",
      "end_time": "MM:SS",
      "transcription": "..."
    },
    ...
  ]
}

Output in JSON format with the following structure:

{
  "title": "Title of the recording",
  "summary": "Summary of the conversation in markdown format"
}

1. Summary Structure:
- The summary should be inferred from the speakers' language and context
- All insights should be derived directly from speakers' language and context
- Use hierarchical organization for clear information structure
- Use markdown format for the summary. Use bullet points, lists and other markdown styles when appropriate

2. Title:
- Come up with a title for the recording.
- The title should be a short description of the recording.
- The title should be a single sentence or a few words.
`;
|
|
|
|
export async function gemini(
|
|
audioFilePath: string,
|
|
options?: {
|
|
model?: 'gemini-2.0-flash' | 'gemini-1.5-flash';
|
|
mode?: 'transcript' | 'summary';
|
|
}
|
|
) {
|
|
if (!process.env.GOOGLE_GEMINI_API_KEY) {
|
|
console.error('Missing GOOGLE_GEMINI_API_KEY environment variable');
|
|
throw new Error('GOOGLE_GEMINI_API_KEY is not set');
|
|
}
|
|
|
|
// Initialize GoogleGenerativeAI and FileManager with your API_KEY
|
|
const genAI = new GoogleGenerativeAI(process.env.GOOGLE_GEMINI_API_KEY);
|
|
const fileManager = new GoogleAIFileManager(
|
|
process.env.GOOGLE_GEMINI_API_KEY
|
|
);
|
|
|
|
async function transcribe(
|
|
audioFilePath: string
|
|
): Promise<TranscriptionResult | null> {
|
|
let uploadResult: UploadFileResponse | null = null;
|
|
|
|
try {
|
|
// Upload the audio file
|
|
uploadResult = await fileManager.uploadFile(audioFilePath, {
|
|
mimeType: 'audio/mp3',
|
|
displayName: 'audio_transcription.mp3',
|
|
});
|
|
console.log('File uploaded:', uploadResult.file.uri);
|
|
|
|
// Initialize a Gemini model appropriate for your use case.
|
|
const model = genAI.getGenerativeModel({
|
|
model: options?.model || DEFAULT_MODEL,
|
|
generationConfig: {
|
|
responseMimeType: 'application/json',
|
|
},
|
|
});
|
|
|
|
// Generate content using a prompt and the uploaded file
|
|
const result = await model.generateContent([
|
|
{
|
|
fileData: {
|
|
fileUri: uploadResult.file.uri,
|
|
mimeType: uploadResult.file.mimeType,
|
|
},
|
|
},
|
|
{
|
|
text: PROMPT_TRANSCRIPTION,
|
|
},
|
|
]);
|
|
|
|
const text = result.response.text();
|
|
|
|
try {
|
|
const parsed = JSON.parse(text);
|
|
return parsed;
|
|
} catch (e) {
|
|
console.error('Failed to parse transcription JSON:', e);
|
|
console.error('Raw text that failed to parse:', text);
|
|
return null;
|
|
}
|
|
} catch (e) {
|
|
console.error('Error during transcription:', e);
|
|
return null;
|
|
} finally {
|
|
if (uploadResult) {
|
|
await fileManager.deleteFile(uploadResult.file.name);
|
|
}
|
|
}
|
|
}
|
|
|
|
async function summarize(transcription: TranscriptionResult) {
|
|
try {
|
|
const model = genAI.getGenerativeModel({
|
|
model: options?.model || DEFAULT_MODEL,
|
|
generationConfig: {
|
|
responseMimeType: 'application/json',
|
|
},
|
|
});
|
|
|
|
const result = await model.generateContent([
|
|
{
|
|
text: PROMPT_SUMMARY + '\n\n' + JSON.stringify(transcription),
|
|
},
|
|
]);
|
|
|
|
const text = result.response.text();
|
|
|
|
try {
|
|
const parsed = JSON.parse(text);
|
|
return parsed;
|
|
} catch (e) {
|
|
console.error('Failed to parse summary JSON:', e);
|
|
console.error('Raw text that failed to parse:', text);
|
|
return null;
|
|
}
|
|
} catch (e) {
|
|
console.error('Error during summarization:', e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
const transcription = await transcribe(audioFilePath);
|
|
if (!transcription) {
|
|
console.error('Transcription failed');
|
|
return null;
|
|
}
|
|
|
|
const summary = await summarize(transcription);
|
|
if (!summary) {
|
|
console.error('Summary generation failed');
|
|
return transcription;
|
|
}
|
|
|
|
const result = {
|
|
...transcription,
|
|
...summary,
|
|
};
|
|
console.log('Processing completed:', {
|
|
title: result.title,
|
|
segmentsCount: result.segments?.length,
|
|
});
|
|
|
|
return result;
|
|
}
|