| | """ |
| | Advanced Speech Recognition Module for Multilingual Audio Intelligence System |
| | |
| | This module implements state-of-the-art automatic speech recognition using openai-whisper |
| | with integrated language identification capabilities. Designed for maximum performance |
| | on CPU-constrained environments while maintaining SOTA accuracy. |
| | |
| | Key Features: |
| | - OpenAI Whisper with optimized backend for speed improvement |
| | - Integrated Language Identification (no separate LID module needed) |
| | - VAD-based batching for real-time performance on CPU |
| | - Word-level timestamps for interactive UI synchronization |
| | - Robust error handling and multilingual support |
| | - CPU and GPU optimization paths |
| | |
| | Model: openai/whisper-small (optimized for speed/accuracy balance) |
| | Dependencies: openai-whisper, torch, numpy |
| | """ |
| |
|
| | import os |
| | import logging |
| | import warnings |
| | import numpy as np |
| | import torch |
| | from typing import List, Dict, Optional, Tuple, Union |
| | import tempfile |
| | from dataclasses import dataclass |
| | import time |
| |
|
# Optional dependency: transcription itself requires openai-whisper at runtime,
# but the module stays importable without it so callers can degrade gracefully.
try:
    import whisper
except ImportError:
    WHISPER_AVAILABLE = False
    logging.warning("openai-whisper not available. Install with: pip install openai-whisper")
else:
    WHISPER_AVAILABLE = True

# Module-level logger following the standard `__name__` convention.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Third-party audio/ML libraries emit noisy deprecation chatter; silence the
# two categories that dominate so application logs stay readable.
for _warning_category in (UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_warning_category)
| |
|
| |
|
@dataclass
class TranscriptionSegment:
    """One transcribed stretch of speech plus its recognition metadata.

    The six required fields come straight from the Whisper decoder output;
    the optional fields are populated by downstream pipeline stages
    (speaker diarization, word-level alignment, confidence scoring).
    """

    start: float                    # segment start time, in seconds
    end: float                      # segment end time, in seconds
    text: str                       # transcribed text for this span
    language: str                   # detected (or forced) language code
    language_probability: float     # confidence of the language label
    no_speech_probability: float    # likelihood the span contains no speech
    words: Optional[List[Dict]] = None            # per-word dicts, when requested
    speaker_id: Optional[str] = None              # diarization label, when known
    confidence: Optional[float] = None            # overall segment confidence
    word_timestamps: Optional[List[Dict]] = None  # mirror of `words` in this pipeline
| |
|
| |
|
| | class SpeechRecognizer: |
| | """ |
| | Advanced Speech Recognition Engine using OpenAI Whisper. |
| | |
| | This class provides high-performance speech recognition with integrated language |
| | identification, optimized for both CPU and GPU environments. |
| | """ |
| | |
| | def __init__(self, model_size: str = "small", device: str = "auto", |
| | compute_type: str = "int8", language: Optional[str] = None): |
| | """ |
| | Initialize the Speech Recognizer. |
| | |
| | Args: |
| | model_size: Whisper model size (tiny, base, small, medium, large) |
| | device: Device to use (auto, cpu, cuda) |
| | compute_type: Computation precision (int8, float16, float32) |
| | language: Target language code (None for auto-detection) |
| | """ |
| | self.model_size = model_size |
| | self.device = self._determine_device(device) |
| | self.compute_type = compute_type |
| | self.language = language |
| | self.model = None |
| | self._initialize_model() |
| | |
| | def _determine_device(self, device: str) -> str: |
| | """Determine the best available device.""" |
| | if device == "auto": |
| | if torch.cuda.is_available(): |
| | return "cuda" |
| | elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): |
| | return "mps" |
| | else: |
| | return "cpu" |
| | return device |
| | |
| | def _initialize_model(self): |
| | """Initialize the Whisper model.""" |
| | if not WHISPER_AVAILABLE: |
| | raise ImportError("openai-whisper is required. Install with: pip install openai-whisper") |
| | |
| | try: |
| | logger.info(f"Loading {self.model_size} Whisper model...") |
| | self.model = whisper.load_model(self.model_size, device=self.device) |
| | logger.info(f"Speech recognition models loaded on {self.device}") |
| | except Exception as e: |
| | logger.error(f"Failed to load Whisper model: {e}") |
| | raise |
| | |
| | def transcribe_audio(self, audio_data: np.ndarray, sample_rate: int = 16000, |
| | language: Optional[str] = None, |
| | initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]: |
| | """ |
| | Transcribe audio data with language identification. |
| | |
| | Args: |
| | audio_data: Audio data as numpy array |
| | sample_rate: Sample rate of the audio |
| | language: Language code (None for auto-detection) |
| | initial_prompt: Initial prompt for better transcription |
| | |
| | Returns: |
| | List of TranscriptionSegment objects |
| | """ |
| | if self.model is None: |
| | raise RuntimeError("Model not initialized") |
| | |
| | try: |
| | |
| | if sample_rate != 16000: |
| | import librosa |
| | audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
| | |
| | |
| | result = self.model.transcribe( |
| | audio_data, |
| | language=language or self.language, |
| | initial_prompt=initial_prompt, |
| | word_timestamps=True, |
| | verbose=False |
| | ) |
| | |
| | |
| | segments = [] |
| | for segment in result["segments"]: |
| | words = [] |
| | if "words" in segment: |
| | for word in segment["words"]: |
| | words.append({ |
| | "word": word["word"], |
| | "start": word["start"], |
| | "end": word["end"], |
| | "probability": word.get("probability", 1.0) |
| | }) |
| | |
| | segments.append(TranscriptionSegment( |
| | start=segment["start"], |
| | end=segment["end"], |
| | text=segment["text"].strip(), |
| | language=result.get("language", "unknown"), |
| | language_probability=result.get("language_probability", 1.0), |
| | no_speech_probability=segment.get("no_speech_prob", 0.0), |
| | words=words, |
| | speaker_id=None, |
| | confidence=1.0 - segment.get("no_speech_prob", 0.0), |
| | word_timestamps=words |
| | )) |
| | |
| | return segments |
| | |
| | except Exception as e: |
| | logger.error(f"Transcription failed: {e}") |
| | raise |
| | |
| | def transcribe_file(self, file_path: str, language: Optional[str] = None, |
| | initial_prompt: Optional[str] = None) -> List[TranscriptionSegment]: |
| | """ |
| | Transcribe an audio file. |
| | |
| | Args: |
| | file_path: Path to audio file |
| | language: Language code (None for auto-detection) |
| | initial_prompt: Initial prompt for better transcription |
| | |
| | Returns: |
| | List of TranscriptionSegment objects |
| | """ |
| | try: |
| | |
| | import librosa |
| | audio_data, sample_rate = librosa.load(file_path, sr=16000) |
| | |
| | return self.transcribe_audio(audio_data, sample_rate, language, initial_prompt) |
| | |
| | except Exception as e: |
| | logger.error(f"File transcription failed: {e}") |
| | raise |
| | |
| | def transcribe_segments(self, audio_data: np.ndarray, sample_rate: int, |
| | speaker_segments: List[Tuple[float, float, str]], |
| | word_timestamps: bool = True) -> List[TranscriptionSegment]: |
| | """ |
| | Transcribe audio segments with speaker information. |
| | |
| | Args: |
| | audio_data: Audio data as numpy array |
| | sample_rate: Sample rate of the audio |
| | speaker_segments: List of (start_time, end_time, speaker_id) tuples |
| | word_timestamps: Whether to include word-level timestamps |
| | |
| | Returns: |
| | List of TranscriptionSegment objects with speaker information |
| | """ |
| | if self.model is None: |
| | raise RuntimeError("Model not initialized") |
| | |
| | try: |
| | |
| | if sample_rate != 16000: |
| | import librosa |
| | audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
| | |
| | |
| | result = self.model.transcribe( |
| | audio_data, |
| | language=self.language, |
| | word_timestamps=word_timestamps, |
| | verbose=False |
| | ) |
| | |
| | |
| | segments = [] |
| | for segment in result["segments"]: |
| | |
| | speaker_id = "Unknown" |
| | for start_time, end_time, spk_id in speaker_segments: |
| | if (segment["start"] >= start_time and segment["end"] <= end_time): |
| | speaker_id = spk_id |
| | break |
| | |
| | words = [] |
| | if word_timestamps and "words" in segment: |
| | for word in segment["words"]: |
| | words.append({ |
| | "word": word["word"], |
| | "start": word["start"], |
| | "end": word["end"], |
| | "probability": word.get("probability", 1.0) |
| | }) |
| | |
| | segments.append(TranscriptionSegment( |
| | start=segment["start"], |
| | end=segment["end"], |
| | text=segment["text"].strip(), |
| | language=result.get("language", "unknown"), |
| | language_probability=result.get("language_probability", 1.0), |
| | no_speech_probability=segment.get("no_speech_prob", 0.0), |
| | words=words, |
| | speaker_id=speaker_id, |
| | confidence=1.0 - segment.get("no_speech_prob", 0.0), |
| | word_timestamps=words |
| | )) |
| | |
| | return segments |
| | |
| | except Exception as e: |
| | logger.error(f"Segment transcription failed: {e}") |
| | raise |
| |
|
| | def get_supported_languages(self) -> List[str]: |
| | """Get list of supported language codes.""" |
| | return [ |
| | "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su" |
| | ] |
| | |
| | def detect_language(self, audio_data: np.ndarray, sample_rate: int = 16000) -> Tuple[str, float]: |
| | """ |
| | Detect the language of audio data. |
| | |
| | Args: |
| | audio_data: Audio data as numpy array |
| | sample_rate: Sample rate of the audio |
| | |
| | Returns: |
| | Tuple of (language_code, confidence) |
| | """ |
| | try: |
| | |
| | if sample_rate != 16000: |
| | import librosa |
| | audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000) |
| | |
| | |
| | result = self.model.transcribe(audio_data, language=None, verbose=False) |
| | |
| | return result.get("language", "unknown"), result.get("language_probability", 0.0) |
| | |
| | except Exception as e: |
| | logger.error(f"Language detection failed: {e}") |
| | return "unknown", 0.0 |
| |
|
| |
|
def create_speech_recognizer(model_size: str = "small", device: str = "auto",
                             compute_type: str = "int8", language: Optional[str] = None) -> SpeechRecognizer:
    """Build and return a ready-to-use SpeechRecognizer.

    Thin factory wrapper so callers can construct the recognizer without
    importing the class directly.

    Args:
        model_size: Whisper model size
        device: Device to use
        compute_type: Computation precision
        language: Target language code

    Returns:
        SpeechRecognizer instance
    """
    return SpeechRecognizer(
        model_size=model_size,
        device=device,
        compute_type=compute_type,
        language=language,
    )