Whisper Speech-to-Text Integration Skill

Overview

This skill provides ready-to-use code for integrating Whisper STT into your applications.

Service: Whisper (faster-whisper-large-v3) on Ubuntu Server
Access: Job-based API via Ubuntu backend
Languages: 100+ languages, including Arabic (optimized for Egyptian dialect)
Speed: Real-time factor 0.15-0.35x (faster than audio length)
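
As a rough illustration of what that real-time factor means in practice (the 0.15-0.35x range above is the only input here; the helper itself is ours, not part of the API):

// Illustrative only: estimate the processing-time window for a clip from the
// published real-time factor range (0.15-0.35x of audio length).
function estimateProcessingSeconds(audioSeconds: number): { min: number; max: number } {
  return { min: audioSeconds * 0.15, max: audioSeconds * 0.35 };
}

// A 10-minute recording should finish in roughly 1.5 to 3.5 minutes.
const estimate = estimateProcessingSeconds(600);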


Service Details

| Feature | Specification |
|---|---|
| Model | faster-whisper-large-v3 |
| Arabic Support | ✅ Excellent (including dialects) |
| GPU Accelerated | ✅ CUDA (RTX 3060) |
| Max Audio Length | No hard limit (longer = slower) |
| Audio Formats | WAV, MP3, M4A, FLAC, OGG |
| Timestamp Support | ✅ Word-level and segment-level |
| Translation | ✅ To English |
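
The supported formats can be checked client-side before spending an upload; a minimal sketch (the extension list mirrors the Audio Formats row above, the helper is ours):

// Reject unsupported files early, before uploading.
const SUPPORTED_EXTENSIONS = ['wav', 'mp3', 'm4a', 'flac', 'ogg'];

function isSupportedAudio(fileName: string): boolean {
  const ext = fileName.split('.').pop()?.toLowerCase();
  return ext !== undefined && SUPPORTED_EXTENSIONS.includes(ext);
}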

TypeScript/JavaScript Implementation

Basic STT Client

import { JobClient } from './job-management-skill';

interface WhisperParams {
  audioPath: string;
  language?: string | null;  // 'ar', 'en', or null for auto-detect
  task?: 'transcribe' | 'translate';
  temperature?: number;
  beam_size?: number;
  vad_filter?: boolean;
  word_timestamps?: boolean;
}

interface TranscriptionResult {
  text: string;
  language: string;
  language_probability: number;
  duration: number;
  segments: Array<{
    start: number;
    end: number;
    text: string;
    words?: Array<{
      word: string;
      start: number;
      end: number;
      probability: number;
    }>;
  }>;
}

class WhisperClient {
  private jobClient: JobClient;
  private baseURL: string;
  private token: string;

  constructor(token: string, baseURL: string = 'https://api.proyaro.com') {
    this.baseURL = baseURL;
    this.token = token;
    this.jobClient = new JobClient(baseURL, token);
  }

  async transcribe(params: WhisperParams): Promise<TranscriptionResult> {
    // Create STT job
    const job = await this.jobClient.createJob({
      job_type: 'speech_to_text',
      parameters: {
        audio_path: params.audioPath,
        language: params.language || null,
        task: params.task || 'transcribe',
        temperature: params.temperature || 0.0,
        beam_size: params.beam_size || 5,
        vad_filter: params.vad_filter !== false,
        word_timestamps: params.word_timestamps || false,
      },
    });

    console.log(`STT job created: ${job.id}`);

    // Wait for completion
    const result = await this.jobClient.waitForJob(job.id);

    return result.result_data as TranscriptionResult;
  }

  // Upload audio file first
  async uploadAudio(audioFile: File | Blob): Promise<string> {
    const formData = new FormData();
    formData.append('audio', audioFile);

    const response = await fetch(`${this.baseURL}/upload-audio`, {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${this.token}`,
      },
      body: formData,
    });

    if (!response.ok) {
      throw new Error(`Audio upload failed: ${response.statusText}`);
    }

    const data = await response.json();
    return data.audio_path;
  }

  // Complete workflow: upload + transcribe
  async transcribeFile(
    audioFile: File | Blob,
    options: Omit<WhisperParams, 'audioPath'> = {}
  ): Promise<TranscriptionResult> {
    const audioPath = await this.uploadAudio(audioFile);
    return this.transcribe({ ...options, audioPath });
  }
}

// Usage Example
async function example() {
  const client = new WhisperClient('your-jwt-token');

  // From file input
  const fileInput = document.querySelector<HTMLInputElement>('input[type="file"]');
  const audioFile = fileInput?.files?.[0];

  if (audioFile) {
    const result = await client.transcribeFile(audioFile, {
      language: 'ar',
      task: 'transcribe',
      word_timestamps: true,
    });

    console.log('Transcription:', result.text);
    console.log('Language:', result.language);
    console.log('Confidence:', result.language_probability);
    console.log('Duration:', result.duration, 'seconds');

    // Show segments with timestamps
    result.segments.forEach((segment) => {
      console.log(
        `[${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s]: ${segment.text}`
      );
    });
  }
}
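
The example above is browser-oriented. The same client should also work from Node.js 18+, where fetch, FormData, and Blob are global; a hedged sketch (the path and token are placeholders):

import { readFile } from 'node:fs/promises';

// Read a local file into a Blob and reuse the browser workflow unchanged.
async function transcribeFromDisk(path: string): Promise<TranscriptionResult> {
  const client = new WhisperClient('your-jwt-token');
  const data = await readFile(path);
  const blob = new Blob([data], { type: 'audio/wav' });
  return client.transcribeFile(blob, { language: 'ar', task: 'transcribe' });
}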

React Hook

import { useState, useCallback } from 'react';
import { useJob } from './job-management-skill';

interface UseWhisperOptions {
  token: string;
  baseURL?: string;
  enableWebSocket?: boolean;
}

export function useWhisper(options: UseWhisperOptions) {
  const [transcribing, setTranscribing] = useState(false);
  const [result, setResult] = useState<TranscriptionResult | null>(null);
  const [error, setError] = useState<string | null>(null);

  const { createJob, waitForJob } = useJob({
    token: options.token,
    baseURL: options.baseURL,
    enableWebSocket: options.enableWebSocket,
  });

  const transcribe = useCallback(async (
    audioFile: File | Blob,
    params: Omit<WhisperParams, 'audioPath'> = {}
  ) => {
    setTranscribing(true);
    setError(null);
    setResult(null);

    try {
      // Upload audio
      const formData = new FormData();
      formData.append('audio', audioFile);

      const uploadResponse = await fetch(
        `${options.baseURL || 'https://api.proyaro.com'}/upload-audio`,
        {
          method: 'POST',
          headers: { 'Authorization': `Bearer ${options.token}` },
          body: formData,
        }
      );

      if (!uploadResponse.ok) {
        throw new Error('Audio upload failed');
      }

      const { audio_path } = await uploadResponse.json();

      // Create transcription job
      const job = await createJob({
        job_type: 'speech_to_text',
        parameters: {
          audio_path,
          language: params.language === undefined ? 'ar' : params.language,  // default Arabic; pass null explicitly for auto-detect
          task: params.task || 'transcribe',
          temperature: params.temperature || 0.0,
          beam_size: params.beam_size || 5,
          vad_filter: params.vad_filter !== false,
          word_timestamps: params.word_timestamps || false,
        },
      });

      // Wait for result
      const jobResult = await waitForJob(job.id);
      setResult(jobResult.result_data as TranscriptionResult);

      return jobResult.result_data as TranscriptionResult;
    } catch (err) {
      const message = err instanceof Error ? err.message : 'Transcription failed';
      setError(message);
      throw err;
    } finally {
      setTranscribing(false);
    }
  }, [options.token, options.baseURL, createJob, waitForJob]);

  return {
    transcribe,
    transcribing,
    result,
    error,
  };
}

// Usage in Component
function VoiceRecorder() {
  const { transcribe, transcribing, result, error } = useWhisper({
    token: 'your-jwt-token',
    enableWebSocket: true,
  });

  const [recording, setRecording] = useState(false);
  const [mediaRecorder, setMediaRecorder] = useState<MediaRecorder | null>(null);

  const startRecording = async () => {
    const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
    const recorder = new MediaRecorder(stream);
    const chunks: Blob[] = [];

    recorder.ondataavailable = (e) => chunks.push(e.data);
    recorder.onstop = async () => {
      const audioBlob = new Blob(chunks, { type: recorder.mimeType || 'audio/webm' });  // MediaRecorder emits WebM/Ogg, not WAV
      await transcribe(audioBlob, {
        language: 'ar',
        word_timestamps: true,
      });
      stream.getTracks().forEach(track => track.stop());
    };

    recorder.start();
    setMediaRecorder(recorder);
    setRecording(true);
  };

  const stopRecording = () => {
    mediaRecorder?.stop();
    setRecording(false);
  };

  return (
    <div>
      <button
        onClick={recording ? stopRecording : startRecording}
        disabled={transcribing}
      >
        {recording ? 'Stop Recording' : 'Start Recording'}
      </button>

      {transcribing && <div>Transcribing...</div>}
      {error && <div className="error">{error}</div>}

      {result && (
        <div>
          <h3>Transcription ({result.language})</h3>
          <p>{result.text}</p>
          <small>Confidence: {(result.language_probability * 100).toFixed(1)}%</small>

          <h4>Segments:</h4>
          {result.segments.map((segment, i) => (
            <div key={i}>
              <span>[{segment.start.toFixed(2)}s - {segment.end.toFixed(2)}s]:</span>
              <span>{segment.text}</span>
            </div>
          ))}
        </div>
      )}
    </div>
  );
}
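
MediaRecorder output formats vary by browser, so it can help to probe for a supported MIME type before recording. A small sketch (the candidate list is illustrative):

// Return the first container/codec combination this browser can record.
function pickRecorderMimeType(): string | undefined {
  const candidates = ['audio/webm;codecs=opus', 'audio/webm', 'audio/ogg'];
  return candidates.find(type => MediaRecorder.isTypeSupported(type));
}

// Usage inside startRecording:
// const recorder = new MediaRecorder(stream, { mimeType: pickRecorderMimeType() });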

Python Implementation

import requests
from typing import Any, Dict, Optional

from job_management_skill import JobClient


class WhisperClient:
    """Client for Whisper STT via Ubuntu backend"""

    def __init__(self, token: str, base_url: str = "https://api.proyaro.com"):
        self.token = token
        self.base_url = base_url
        self.headers = {
            "Authorization": f"Bearer {token}"
        }

    def upload_audio(self, audio_path: str) -> str:
        """
        Upload audio file to server

        Args:
            audio_path: Path to local audio file

        Returns:
            Server path to uploaded audio
        """
        with open(audio_path, 'rb') as f:
            files = {'audio': f}
            response = requests.post(
                f"{self.base_url}/upload-audio",
                headers=self.headers,
                files=files,
                timeout=60
            )
            response.raise_for_status()

        return response.json()['audio_path']

    def transcribe(
        self,
        audio_path: str,
        language: Optional[str] = 'ar',
        task: str = 'transcribe',
        temperature: float = 0.0,
        beam_size: int = 5,
        vad_filter: bool = True,
        word_timestamps: bool = False,
    ) -> Dict[str, Any]:
        """
        Transcribe audio file

        Args:
            audio_path: Server path to audio file
            language: Language code ('ar', 'en', or None for auto-detect)
            task: 'transcribe' or 'translate' (to English)
            temperature: Sampling temperature (0.0 = deterministic)
            beam_size: Beam search size (higher = more accurate, slower)
            vad_filter: Remove silence with VAD
            word_timestamps: Include word-level timestamps

        Returns:
            Transcription result dict
        """
        # Create job via the shared JobClient
        job_client = JobClient(self.base_url, self.token)

        job = job_client.create_job(
            job_type="speech_to_text",
            parameters={
                "audio_path": audio_path,
                "language": language,
                "task": task,
                "temperature": temperature,
                "beam_size": beam_size,
                "vad_filter": vad_filter,
                "word_timestamps": word_timestamps,
            }
        )

        print(f"STT job created: {job['id']}")

        # Wait for result
        result = job_client.wait_for_job(job['id'])
        return result['result_data']

    def transcribe_file(
        self,
        local_audio_path: str,
        **kwargs
    ) -> Dict[str, Any]:
        """
        Upload and transcribe audio file

        Args:
            local_audio_path: Path to local audio file
            **kwargs: Additional transcription parameters

        Returns:
            Transcription result dict
        """
        # Upload
        print(f"Uploading {local_audio_path}...")
        server_path = self.upload_audio(local_audio_path)

        # Transcribe
        print("Transcribing...")
        return self.transcribe(server_path, **kwargs)


# Usage Example
if __name__ == "__main__":
    client = WhisperClient(token="your-jwt-token")

    # Transcribe local file
    result = client.transcribe_file(
        "recording.wav",
        language="ar",
        task="transcribe",
        word_timestamps=True,
    )

    print(f"Transcription: {result['text']}")
    print(f"Language: {result['language']} ({result['language_probability']:.2%})")
    print(f"Duration: {result['duration']:.2f}s")

    # Show segments
    for i, segment in enumerate(result['segments']):
        print(f"[{segment['start']:.2f}s - {segment['end']:.2f}s]: {segment['text']}")

    # Show word timestamps if available (guard against an empty segment list)
    if result['segments'] and result['segments'][0].get('words'):
        for word in result['segments'][0]['words']:
            print(f"  {word['word']} [{word['start']:.2f}s - {word['end']:.2f}s]")

Language Support

Recommended Settings by Language

const languageConfigs = {
  arabic: {
    language: 'ar',
    beam_size: 5,
    temperature: 0.0,
    vad_filter: true,  // Good for Arabic
  },
  english: {
    language: 'en',
    beam_size: 5,
    temperature: 0.0,
    vad_filter: true,
  },
  autoDetect: {
    language: null,  // Auto-detect
    beam_size: 5,
    temperature: 0.0,
    vad_filter: true,
  },
  translateToEnglish: {
    language: 'ar',  // Source language
    task: 'translate',  // Translate to English
    beam_size: 5,
  },
};
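
These presets can be spread directly into a transcribe call:

// Apply a preset, overriding individual fields as needed.
const result = await client.transcribe({
  audioPath,
  ...languageConfigs.arabic,
  word_timestamps: true,
});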

Supported Languages

Most common: Arabic (ar), English (en), Spanish (es), French (fr), German (de), Italian (it), Portuguese (pt), Russian (ru), Japanese (ja), Korean (ko), Chinese (zh)

Total: 100+ languages supported


Best Practices

Audio Quality

// Good audio characteristics
const goodAudio = {
  format: 'WAV or M4A',
  sampleRate: '16kHz or higher',
  channels: 'Mono preferred',
  bitrate: '128kbps minimum',
  noise: 'Low background noise',
  speaker: 'Clear speech, not too fast',
};

// Processing tips: convert, denoise, and normalize before upload when possible
// (one concrete approach is sketched below).
async function prepareAudio(file: File): Promise<Blob> {
  // Placeholder: convert to WAV, reduce noise, normalize volume as needed.
  return file;
}
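
prepareAudio above deliberately leaves the conversion open. One way to fill it in is the Web Audio API; the sketch below (browser-only, assuming AudioContext and OfflineAudioContext are available; it is our suggestion, not part of the service) resamples arbitrary input to 16 kHz mono and re-encodes it as 16-bit PCM WAV:

// Decode any supported input, resample to 16 kHz mono, re-encode as WAV.
async function toWav16kMono(file: File | Blob): Promise<Blob> {
  const arrayBuffer = await file.arrayBuffer();
  const decodeCtx = new AudioContext();
  const decoded = await decodeCtx.decodeAudioData(arrayBuffer);
  await decodeCtx.close();

  // Resample by rendering through an OfflineAudioContext at the target rate;
  // a mono destination downmixes multi-channel input automatically.
  const targetRate = 16000;
  const offline = new OfflineAudioContext(1, Math.ceil(decoded.duration * targetRate), targetRate);
  const source = offline.createBufferSource();
  source.buffer = decoded;
  source.connect(offline.destination);
  source.start();
  const rendered = await offline.startRendering();

  return encodeWav(rendered.getChannelData(0), targetRate);
}

// Write a 44-byte RIFF/WAVE header followed by 16-bit little-endian PCM.
function encodeWav(samples: Float32Array, sampleRate: number): Blob {
  const buffer = new ArrayBuffer(44 + samples.length * 2);
  const view = new DataView(buffer);
  const writeString = (offset: number, s: string) => {
    for (let i = 0; i < s.length; i++) view.setUint8(offset + i, s.charCodeAt(i));
  };
  writeString(0, 'RIFF');
  view.setUint32(4, 36 + samples.length * 2, true);
  writeString(8, 'WAVE');
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);              // fmt chunk size
  view.setUint16(20, 1, true);               // PCM
  view.setUint16(22, 1, true);               // mono
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, sampleRate * 2, true);  // byte rate
  view.setUint16(32, 2, true);               // block align
  view.setUint16(34, 16, true);              // bits per sample
  writeString(36, 'data');
  view.setUint32(40, samples.length * 2, true);
  for (let i = 0; i < samples.length; i++) {
    const s = Math.max(-1, Math.min(1, samples[i]));
    view.setInt16(44 + i * 2, s < 0 ? s * 0x8000 : s * 0x7fff, true);
  }
  return new Blob([buffer], { type: 'audio/wav' });
}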

Performance Optimization

// Use the VAD filter to strip silence (faster processing)
const withVad = await client.transcribe({
  audioPath,
  vad_filter: true,  // ✅ Faster
});

// Only request word timestamps when needed
const withoutWordTimestamps = await client.transcribe({
  audioPath,
  word_timestamps: false,  // ✅ Faster
});

// Use appropriate beam size
const configs = {
  fast: { beam_size: 1 },      // Fastest, less accurate
  balanced: { beam_size: 5 },  // Good balance (recommended)
  accurate: { beam_size: 10 }, // Slowest, most accurate
};

Error Handling

async function safeTranscribe(
  client: WhisperClient,
  audioFile: File
): Promise<TranscriptionResult | null> {
  try {
    return await client.transcribeFile(audioFile);
  } catch (error) {
    // Narrow the unknown catch binding before reading .message
    const message = error instanceof Error ? error.message : String(error);
    if (message.includes('upload')) {
      console.error('File upload failed. Check file size and format.');
      return null;
    }
    if (message.includes('timeout')) {
      console.error('Transcription took too long. Try shorter audio.');
      return null;
    }
    if (message.includes('language')) {
      console.error('Language detection failed. Specify the language manually.');
      return null;
    }
    throw error;
  }
}
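
For transient failures such as network hiccups or upload timeouts, a retry wrapper can sit on top of this. A sketch (the delays and attempt count are illustrative, not service limits):

// Retry with exponential backoff: 1s, 2s, 4s between attempts.
async function transcribeWithRetry(
  client: WhisperClient,
  audioFile: File,
  attempts: number = 3
): Promise<TranscriptionResult> {
  let lastError: unknown;
  for (let i = 0; i < attempts; i++) {
    try {
      return await client.transcribeFile(audioFile);
    } catch (err) {
      lastError = err;
      await new Promise(resolve => setTimeout(resolve, 1000 * 2 ** i));
    }
  }
  throw lastError;
}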

Advanced Features

Segment-by-Segment Processing

function processSegments(result: TranscriptionResult) {
  return result.segments.map(segment => ({
    timestamp: `${segment.start.toFixed(2)}s - ${segment.end.toFixed(2)}s`,
    text: segment.text.trim(),
    duration: segment.end - segment.start,
  }));
}

// Create subtitles (SRT format)
function generateSRT(result: TranscriptionResult): string {
  return result.segments.map((segment, i) => {
    const start = formatSRTTime(segment.start);
    const end = formatSRTTime(segment.end);
    return `${i + 1}\n${start} --> ${end}\n${segment.text}\n`;
  }).join('\n');
}

function formatSRTTime(seconds: number): string {
  const h = Math.floor(seconds / 3600);
  const m = Math.floor((seconds % 3600) / 60);
  const s = Math.floor(seconds % 60);
  const ms = Math.floor((seconds % 1) * 1000);
  return `${h.toString().padStart(2, '0')}:${m.toString().padStart(2, '0')}:${s.toString().padStart(2, '0')},${ms.toString().padStart(3, '0')}`;
}
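
A small usage sketch: turn a transcription into a downloadable .srt in the browser (the object-URL handling is illustrative):

// Build an .srt blob and return an object URL for an <a download> link.
function srtDownloadUrl(result: TranscriptionResult): string {
  const srt = generateSRT(result);
  const blob = new Blob([srt], { type: 'text/plain' });
  return URL.createObjectURL(blob);
}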

Testing

async function testWhisper() {
  const client = new WhisperClient('test-token');

  // Test with sample audio
  console.log('Testing Whisper STT...');

  const testFile = new File(
    [await fetch('/test-audio.wav').then(r => r.blob())],
    'test.wav'
  );

  const result = await client.transcribeFile(testFile, {
    language: 'ar',
    task: 'transcribe',
  });

  console.assert(result.text.length > 0, 'Should return transcription');
  console.assert(result.language === 'ar', 'Should detect Arabic');
  console.assert(result.segments.length > 0, 'Should have segments');

  // console.assert logs failures without throwing, so check the console output.
  console.log('Whisper STT checks complete.');
}

Skill Version: 1.0
Last Updated: 2025-01-01

ProYaro AI Infrastructure Documentation • Version 1.2