"""Whisper transcription service — CPU-only, async-ready.""" import io import tempfile import time from pathlib import Path _model = None _model_name = None def _load_model(name: str = "base"): """Lazy-load Whisper model (downloads on first use).""" global _model, _model_name import whisper if _model is None or _model_name != name: _model = whisper.load_model(name) _model_name = name return _model def transcribe_bytes(audio_bytes: bytes, model_name: str = "base") -> dict: """Transcribe audio from bytes. Returns {"text": "...", "segments": [...], "language": "..."}""" t0 = time.time() model = _load_model(model_name) # Write to temp file (whisper needs a file path or numpy array) suffix = ".wav" with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as tmp: tmp.write(audio_bytes) tmp_path = tmp.name try: result = model.transcribe(tmp_path, fp16=False) # fp16=False for CPU finally: Path(tmp_path).unlink(missing_ok=True) elapsed = round(time.time() - t0, 1) return { "text": result["text"].strip(), "segments": [ { "start": round(seg["start"], 2), "end": round(seg["end"], 2), "text": seg["text"].strip(), } for seg in result.get("segments", []) ], "language": result.get("language", "unknown"), "duration_seconds": elapsed, "model": model_name, }