Whisper Speech-to-Text Integration Skill
Overview
This skill provides ready-to-use code for integrating Whisper STT into your applications.
Service: Whisper (faster-whisper-large-v3) on Ubuntu Server
Access: Job-based API via Ubuntu backend
Languages: 100+ languages, including Arabic (optimized for the Egyptian dialect)
Speed: Real-time factor 0.15-0.35x, i.e. processing finishes faster than the audio plays (a 10-minute recording typically transcribes in roughly 1.5-3.5 minutes)
Service Details
| Feature | Specification |
|---|---|
| Model | faster-whisper-large-v3 |
| Arabic Support | ✅ Excellent (including dialects) |
| GPU Accelerated | ✅ CUDA (RTX 3060) |
| Max Audio Length | No hard limit (longer = slower) |
| Audio Formats | WAV, MP3, M4A, FLAC, OGG |
| Timestamp Support | ✅ Word-level and segment-level |
| Translation | ✅ To English |
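Before uploading, a quick client-side check against the formats listed above can catch obvious mistakes early. This is a minimal sketch based only on the file extension; the backend remains the source of truth.
const SUPPORTED_EXTENSIONS = ['wav', 'mp3', 'm4a', 'flac', 'ogg'];

// Cheap pre-flight check using the file extension only; the server still validates the actual content.
function isSupportedAudioFile(file: File): boolean {
  const ext = file.name.split('.').pop()?.toLowerCase() ?? '';
  return SUPPORTED_EXTENSIONS.includes(ext);
}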
TypeScript/JavaScript Implementation
Basic STT Client
import { JobClient } from './job-management-skill';
interface WhisperParams {
audioPath: string;
language?: string | null; // 'ar', 'en', or null for auto-detect
task?: 'transcribe' | 'translate';
temperature?: number;
beam_size?: number;
vad_filter?: boolean;
word_timestamps?: boolean;
}
interface TranscriptionResult {
text: string;
language: string;
language_probability: number;
duration: number;
segments: Array<{
start: number;
end: number;
text: string;
words?: Array<{
word: string;
start: number;
end: number;
probability: number;
}>;
}>;
}
class WhisperClient {
private jobClient: JobClient;
private baseURL: string;
private token: string;
constructor(token: string, baseURL: string = 'https://api.proyaro.com') {
this.baseURL = baseURL;
this.token = token;
this.jobClient = new JobClient(baseURL, token);
}
async transcribe(params: WhisperParams): Promise<TranscriptionResult> {
// Create STT job
const job = await this.jobClient.createJob({
job_type: 'speech_to_text',
parameters: {
audio_path: params.audioPath,
language: params.language || null,
task: params.task || 'transcribe',
temperature: params.temperature || 0.0,
beam_size: params.beam_size || 5,
vad_filter: params.vad_filter !== false,
word_timestamps: params.word_timestamps || false,
},
});
console.log(`STT job created: ${job.id}`);
// Wait for completion
const result = await this.jobClient.waitForJob(job.id);
return result.result_data as TranscriptionResult;
}
// Upload audio file first
async uploadAudio(audioFile: File | Blob): Promise<string> {
const formData = new FormData();
formData.append('audio', audioFile);
const response = await fetch(`${this.baseURL}/upload-audio`, {
method: 'POST',
headers: {
'Authorization': `Bearer ${this.token}`,
},
body: formData,
});
if (!response.ok) {
throw new Error(`Audio upload failed: ${response.statusText}`);
}
const data = await response.json();
return data.audio_path;
}
// Complete workflow: upload + transcribe
async transcribeFile(
audioFile: File | Blob,
options: Omit<WhisperParams, 'audioPath'> = {}
): Promise<TranscriptionResult> {
const audioPath = await this.uploadAudio(audioFile);
return this.transcribe({ ...options, audioPath });
}
}
// Usage Example
async function example() {
const client = new WhisperClient('your-jwt-token');
// From file input
const fileInput = document.querySelector<HTMLInputElement>('input[type="file"]');
const audioFile = fileInput?.files?.[0];
if (audioFile) {
const result = await client.transcribeFile(audioFile, {
language: 'ar',
task: 'transcribe',
word_timestamps: true,
});
console.log('Transcription:', result.text);
console.log('Language:', result.language);
console.log('Confidence:', result.language_probability);
console.log('Duration:', result.duration, 'seconds');
// Show segments with timestamps
result.segments.forEach((segment, i) => {
console.log(
`[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s]: ${segment.text}`
);
});
}
}
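When word_timestamps is enabled, each segment carries a words array. A small helper (a sketch against the TranscriptionResult interface above) can flatten those into a single timed word list, which is convenient for highlighting or search:
interface TimedWord {
  word: string;
  start: number;
  end: number;
}

// Flatten word-level timestamps from all segments into one ordered list.
function flattenWords(result: TranscriptionResult): TimedWord[] {
  return result.segments.flatMap(segment =>
    (segment.words ?? []).map(({ word, start, end }) => ({ word, start, end }))
  );
}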
React Hook
import { useState, useCallback } from 'react';
import { useJob } from './job-management-skill';
interface UseWhisperOptions {
token: string;
baseURL?: string;
enableWebSocket?: boolean;
}
export function useWhisper(options: UseWhisperOptions) {
const [transcribing, setTranscribing] = useState(false);
const [result, setResult] = useState<TranscriptionResult | null>(null);
const [error, setError] = useState<string | null>(null);
const { createJob, waitForJob } = useJob({
token: options.token,
baseURL: options.baseURL,
enableWebSocket: options.enableWebSocket,
});
const transcribe = useCallback(async (
audioFile: File | Blob,
params: Omit<WhisperParams, 'audioPath'> = {}
) => {
setTranscribing(true);
setError(null);
setResult(null);
try {
// Upload audio
const formData = new FormData();
formData.append('audio', audioFile);
const uploadResponse = await fetch(
`${options.baseURL || 'https://api.proyaro.com'}/upload-audio`,
{
method: 'POST',
headers: { 'Authorization': `Bearer ${options.token}` },
body: formData,
}
);
if (!uploadResponse.ok) {
throw new Error('Audio upload failed');
}
const { audio_path } = await uploadResponse.json();
// Create transcription job
const job = await createJob({
job_type: 'speech_to_text',
parameters: {
audio_path,
language: params.language || 'ar',
task: params.task || 'transcribe',
temperature: params.temperature || 0.0,
beam_size: params.beam_size || 5,
vad_filter: params.vad_filter !== false,
word_timestamps: params.word_timestamps || false,
},
});
// Wait for result
const jobResult = await waitForJob(job.id);
setResult(jobResult.result_data as TranscriptionResult);
return jobResult.result_data as TranscriptionResult;
} catch (err) {
const message = err instanceof Error ? err.message : 'Transcription failed';
setError(message);
throw err;
} finally {
setTranscribing(false);
}
}, [options.token, options.baseURL, createJob, waitForJob]);
return {
transcribe,
transcribing,
result,
error,
};
}
// Usage in Component
function VoiceRecorder() {
const { transcribe, transcribing, result, error } = useWhisper({
token: 'your-jwt-token',
enableWebSocket: true,
});
const [recording, setRecording] = useState(false);
const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);
const startRecording = async () => {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
const recorder = new MediaRecorder(stream);
const chunks: Blob[] = [];
recorder.ondataavailable = (e) => chunks.push(e.data);
recorder.onstop = async () => {
// MediaRecorder output is typically WebM/Ogg (Opus), not WAV; label the blob with its real type
const audioBlob = new Blob(chunks, { type: recorder.mimeType || 'audio/webm' });
await transcribe(audioBlob, {
language: 'ar',
word_timestamps: true,
});
stream.getTracks().forEach(track => track.stop());
};
recorder.start();
setMediaRecorder(recorder);
setRecording(true);
};
const stopRecording = () => {
mediaRecorder?.stop();
setRecording(false);
};
return (
<div>
<button
onClick={recording ? stopRecording : startRecording}
disabled={transcribing}
>
{recording ? 'Stop Recording' : 'Start Recording'}
</button>
{transcribing && <div>Transcribing...</div>}
{error && <div className="error">{error}</div>}
{result && (
<div>
<h3>Transcription ({result.language})</h3>
<p>{result.text}</p>
<small>Confidence: {(result.language_probability * 100).toFixed(1)}%</small>
<h4>Segments:</h4>
{result.segments.map((segment, i) => (
<div key={i}>
<span>[{segment.start.toFixed(2)}s - {segment.end.toFixed(2)}s]:</span>
<span>{segment.text}</span>
</div>
))}
</div>
)}
</div>
);
}
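Note that MediaRecorder usually produces WebM or Ogg (Opus) rather than WAV. Below is a small sketch for picking a container the browser supports; OGG appears in the formats table above, while WebM acceptance by the backend is an assumption worth verifying.
// Pick a recording container the browser actually supports.
// 'audio/ogg' matches the supported-formats table; WebM support on the backend is assumed, not documented.
function pickRecorderMimeType(): string | undefined {
  const candidates = ['audio/ogg;codecs=opus', 'audio/webm;codecs=opus', 'audio/webm'];
  return candidates.find(type => MediaRecorder.isTypeSupported(type));
}

// Usage: const recorder = new MediaRecorder(stream, { mimeType: pickRecorderMimeType() });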
Python Implementation
import requests
from typing import Dict, Any, Optional
class WhisperClient:
"""Client for Whisper STT via Ubuntu backend"""
def __init__(self, token: str, base_url: str = "https://api.proyaro.com"):
self.token = token
self.base_url = base_url
self.headers = {
"Authorization": f"Bearer {token}"
}
def upload_audio(self, audio_path: str) -> str:
"""
Upload audio file to server
Args:
audio_path: Path to local audio file
Returns:
Server path to uploaded audio
"""
with open(audio_path, 'rb') as f:
files = {'audio': f}
response = requests.post(
f"{self.base_url}/upload-audio",
headers=self.headers,
files=files,
timeout=60
)
response.raise_for_status()
return response.json()['audio_path']
def transcribe(
self,
audio_path: str,
language: Optional[str] = 'ar',
task: str = 'transcribe',
temperature: float = 0.0,
beam_size: int = 5,
vad_filter: bool = True,
word_timestamps: bool = False,
) -> Dict[str, Any]:
"""
Transcribe audio file
Args:
audio_path: Server path to audio file
language: Language code ('ar', 'en', or None for auto-detect)
task: 'transcribe' or 'translate' (to English)
temperature: Sampling temperature (0.0 = deterministic)
beam_size: Beam search size (higher = more accurate, slower)
vad_filter: Remove silence with VAD
word_timestamps: Include word-level timestamps
Returns:
Transcription result dict
"""
# Create job
from job_management_skill import JobClient
job_client = JobClient(self.base_url, self.token)
job = job_client.create_job(
job_type="speech_to_text",
parameters={
"audio_path": audio_path,
"language": language,
"task": task,
"temperature": temperature,
"beam_size": beam_size,
"vad_filter": vad_filter,
"word_timestamps": word_timestamps,
}
)
print(f"STT job created: {job['id']}")
# Wait for result
result = job_client.wait_for_job(job['id'])
return result['result_data']
def transcribe_file(
self,
local_audio_path: str,
**kwargs
) -> Dict[str, Any]:
"""
Upload and transcribe audio file
Args:
local_audio_path: Path to local audio file
**kwargs: Additional transcription parameters
Returns:
Transcription result dict
"""
# Upload
print(f"Uploading {local_audio_path}...")
server_path = self.upload_audio(local_audio_path)
# Transcribe
print("Transcribing...")
return self.transcribe(server_path, **kwargs)
# Usage Example
if __name__ == "__main__":
client = WhisperClient(token="your-jwt-token")
# Transcribe local file
result = client.transcribe_file(
"recording.wav",
language="ar",
task="transcribe",
word_timestamps=True,
)
print(f"Transcription: {result['text']}")
print(f"Language: {result['language']} ({result['language_probability']:.2%})")
print(f"Duration: {result['duration']:.2f}s")
# Show segments
for i, segment in enumerate(result['segments']):
print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}")
# Show word timestamps if available
if result['segments'][0].get('words'):
for word in result['segments'][0]['words']:
print(f" {word['word']} [{word['start']:.2f}s - {word['end']:.2f}s]")
Language Support
Recommended Settings by Language
const languageConfigs = {
arabic: {
language: 'ar',
beam_size: 5,
temperature: 0.0,
vad_filter: true, // Good for Arabic
},
english: {
language: 'en',
beam_size: 5,
temperature: 0.0,
vad_filter: true,
},
autoDetect: {
language: null, // Auto-detect
beam_size: 5,
temperature: 0.0,
vad_filter: true,
},
translateToEnglish: {
language: 'ar', // Source language
task: 'translate', // Translate to English
beam_size: 5,
},
};
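For example, translating an Arabic recording directly into English text reuses the same transcribe call with task: 'translate' (a short sketch using the WhisperClient defined earlier; audioPath is assumed to be an already-uploaded file):
const translation = await client.transcribe({
  audioPath,
  language: 'ar',     // source language
  task: 'translate',  // output text is English
});
console.log('English translation:', translation.text);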
Supported Languages
Most common: Arabic (ar), English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Russian (ru), Japanese (ja), Korean (ko), Chinese (zh)
Total: 100+ languages supported
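When relying on auto-detection, language_probability doubles as a confidence score. The sketch below falls back to an explicit language when confidence is low; the 0.5 threshold and the 'ar' fallback are illustrative choices, not recommendations from the service.
// Auto-detect first; re-run with an explicit language if detection confidence is low.
async function transcribeWithFallback(client: WhisperClient, audioPath: string) {
  const detected = await client.transcribe({ audioPath, language: null });
  if (detected.language_probability >= 0.5) return detected;
  return client.transcribe({ audioPath, language: 'ar' }); // fallback language is illustrative
}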
Best Practices
Audio Quality
// Good audio characteristics
const goodAudio = {
format: 'WAV or M4A',
sampleRate: '16kHz or higher',
channels: 'Mono preferred',
bitrate: '128kbps minimum',
noise: 'Low background noise',
speaker: 'Clear speech, not too fast',
};
// Processing tips: prepareAudio is a placeholder hook for client-side cleanup
// (convert to WAV/16 kHz mono, reduce background noise, normalize volume).
async function prepareAudio(file: File): Promise<Blob> {
// Currently a pass-through; see the Web Audio sketch below for a concrete version.
return file;
}
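One concrete, browser-only way to fill in those steps is the Web Audio API: decode the file, downmix to mono, resample to 16 kHz, and re-encode as 16-bit PCM WAV. The sketch below does only that (noise reduction and loudness normalization are left out):
// Decode, downmix to mono, resample to 16 kHz, and re-encode as 16-bit PCM WAV.
async function prepareAudioWav(file: File): Promise<Blob> {
  const ctx = new AudioContext();
  const decoded = await ctx.decodeAudioData(await file.arrayBuffer());
  await ctx.close();

  // Render through an OfflineAudioContext to resample and downmix in one pass.
  const targetRate = 16000;
  const offline = new OfflineAudioContext(1, Math.ceil(decoded.duration * targetRate), targetRate);
  const source = offline.createBufferSource();
  source.buffer = decoded;
  source.connect(offline.destination);
  source.start();
  const rendered = await offline.startRendering();

  return encodeWav(rendered.getChannelData(0), targetRate);
}

// Encode mono float samples as a 16-bit PCM WAV blob.
function encodeWav(samples: Float32Array, sampleRate: number): Blob {
  const view = new DataView(new ArrayBuffer(44 + samples.length * 2));
  const writeString = (offset: number, s: string) => {
    for (let i = 0; i < s.length; i++) view.setUint8(offset + i, s.charCodeAt(i));
  };
  writeString(0, 'RIFF');
  view.setUint32(4, 36 + samples.length * 2, true);
  writeString(8, 'WAVE');
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);              // fmt chunk size
  view.setUint16(20, 1, true);               // PCM
  view.setUint16(22, 1, true);               // mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true);  // byte rate (mono, 16-bit)
  view.setUint16(32, 2, true);               // block align
  view.setUint16(34, 16, true);              // bits per sample
  writeString(36, 'data');
  view.setUint32(40, samples.length * 2, true);
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
  }
  return new Blob([view], { type: 'audio/wav' });
}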
Performance Optimization
// Use VAD filter to remove silence (faster processing)
const result = await client.transcribe({
audioPath,
vad_filter: true, // ✅ Faster
});
// Only use word timestamps when needed
const result = await client.transcribe({
audioPath,
word_timestamps: false, // ✅ Faster
});
// Use appropriate beam size
const configs = {
fast: { beam_size: 1 }, // Fastest, less accurate
balanced: { beam_size: 5 }, // Good balance (recommended)
accurate: { beam_size: 10 }, // Slowest, most accurate
};
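If the audio duration is known up front, the preset can be chosen automatically; the thresholds below are illustrative rather than benchmarked.
// Illustrative: trade accuracy for speed as recordings get longer.
function pickConfig(durationSeconds: number) {
  if (durationSeconds > 600) return configs.fast;     // >10 min: favor speed
  if (durationSeconds > 60) return configs.balanced;  // 1-10 min: balanced
  return configs.accurate;                            // short clips: accuracy is cheap
}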
Error Handling
async function safeTranscribe(
client: WhisperClient,
audioFile: File
): Promise<TranscriptionResult | null> {
try {
return await client.transcribeFile(audioFile);
} catch (error) {
// `error` is `unknown` under strict TypeScript; normalize it to a message first
const message = error instanceof Error ? error.message : String(error);
if (message.includes('upload')) {
console.error('File upload failed. Check file size and format.');
return null;
}
if (message.includes('timeout')) {
console.error('Transcription took too long. Try shorter audio.');
return null;
}
if (message.includes('language')) {
console.error('Language detection failed. Specify the language manually.');
return null;
}
throw error;
}
}
Advanced Features
Segment-by-Segment Processing
function processSegments(result: TranscriptionResult) {
return result.segments.map(segment => ({
timestamp: `${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s`,
text: segment.text.trim(),
duration: segment.end - segment.start,
}));
}
// Create subtitles (SRT format)
function generateSRT(result: TranscriptionResult): string {
return result.segments.map((segment, i) => {
const start = formatSRTTime(segment.start);
const end = formatSRTTime(segment.end);
return `${i + 1}\n${start} --> ${end}\n${segment.text}\n`;
}).join('\n');
}
function formatSRTTime(seconds: number): string {
const h = Math.floor(seconds / 3600);
const m = Math.floor((seconds % 3600) / 60);
const s = Math.floor(seconds % 60);
const ms = Math.floor((seconds % 1) * 1000);
return `${h.toString().padStart(2, '0')}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')},${ms.toString().padStart(3, '0')}`;
}
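A small browser helper (hypothetical, not part of the client) can then offer the generated SRT as a download:
// Offer the generated SRT as a file download in the browser.
function downloadSRT(result: TranscriptionResult, filename = 'transcript.srt') {
  const blob = new Blob([generateSRT(result)], { type: 'text/plain' });
  const url = URL.createObjectURL(blob);
  const a = document.createElement('a');
  a.href = url;
  a.download = filename;
  a.click();
  URL.revokeObjectURL(url);
}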
Testing
async function testWhisper() {
const client = new WhisperClient('test-token');
// Test with sample audio
console.log('Testing Whisper STT...');
const testFile = new File(
[await fetch('/test-audio.wav').then(r => r.blob())],
'test.wav'
);
const result = await client.transcribeFile(testFile, {
language: 'ar',
task: 'transcribe',
});
console.assert(result.text.length > 0, 'Should return transcription');
console.assert(result.language === 'ar', 'Should detect Arabic');
console.assert(result.segments.length > 0, 'Should have segments');
console.log('All tests passed!');
}
Skill Version: 1.0 • Last Updated: 2025-01-01
ProYaro AI Infrastructure Documentation • Version 1.2