Add /api/transcribe endpoint with Whisper

This commit is contained in:
2026-06-05 03:02:42 +02:00
parent b2973b63ee
commit 46265a6f89
6 changed files with 168 additions and 7 deletions
+55 -1
View File
@@ -2,10 +2,15 @@
from datetime import datetime, timezone
from fastapi import APIRouter
from fastapi import APIRouter, File, HTTPException, UploadFile
from app.services.transcriber import transcribe_bytes
# Router for this module; every route declared on it is served under /api.
router = APIRouter(prefix="/api")
# Upload size limit in bytes (10 MB); /transcribe answers 413 above this.
MAX_UPLOAD_SIZE = 10 * 1024 * 1024
@router.get("/health")
async def health():
@@ -16,3 +21,52 @@ async def health():
"version": "0.1.0",
"timestamp": datetime.now(timezone.utc).isoformat(),
}
@router.post("/transcribe")
async def transcribe_audio(
    file: UploadFile = File(...),
    model: str = "base",
):
    """Transcribe an uploaded audio file using Whisper.

    Supported formats: wav, mp3, m4a, ogg, flac, webm.
    Model options: tiny, base, small, medium (default: base).

    Returns the uploaded file's name merged with the dict produced by
    ``transcribe_bytes``.

    Raises:
        HTTPException 400: unknown model, unsupported content type, or
            empty upload.
        HTTPException 413: upload larger than ``MAX_UPLOAD_SIZE``.
        HTTPException 500: the transcription call raised.
    """
    # Validate model
    valid_models = {"tiny", "base", "small", "medium"}
    if model not in valid_models:
        raise HTTPException(
            status_code=400,
            detail=f"Invalid model '{model}'. Use: {', '.join(sorted(valid_models))}",
        )

    # Validate file type
    allowed = {
        "audio/wav",
        "audio/mpeg",
        "audio/mp4",
        "audio/x-m4a",
        "audio/ogg",
        "audio/flac",
        "audio/webm",
        "audio/x-wav",
    }
    # Clients may append parameters ("audio/webm;codecs=opus") or vary the
    # case; compare only the bare, lower-cased media type.
    media_type = (file.content_type or "").split(";", 1)[0].strip().lower()
    if media_type and media_type not in allowed:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported format: {file.content_type}. Supported: wav, mp3, m4a, ogg, flac, webm",
        )

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    contents = await file.read(MAX_UPLOAD_SIZE + 1)
    if len(contents) > MAX_UPLOAD_SIZE:
        raise HTTPException(
            status_code=413,
            detail=f"File too large. Max {MAX_UPLOAD_SIZE // (1024*1024)} MB.",
        )
    if not contents:
        raise HTTPException(400, detail="Empty file.")

    # Transcribe
    # NOTE(review): transcribe_bytes is a plain synchronous call, so this
    # handler occupies the event loop until it returns. Consider offloading
    # it to a worker thread once the service's model cache is confirmed safe
    # for concurrent use.
    try:
        result = transcribe_bytes(contents, model_name=model)
    except Exception as e:
        # Boundary handler: surface any service failure as a 500 while
        # keeping the original exception chained for server-side tracebacks.
        raise HTTPException(
            status_code=500, detail=f"Transcription failed: {str(e)}"
        ) from e

    return {
        "filename": file.filename,
        **result,
    }
View File
+55
View File
@@ -0,0 +1,55 @@
"""Whisper transcription service — CPU-only, async-ready."""
import io
import tempfile
import time
from pathlib import Path
# Single-slot cache: the most recently loaded Whisper model and its name.
_model = None
_model_name = None


def _load_model(name: str = "base"):
    """Return the Whisper model called *name*, loading it lazily.

    The model is loaded on first use (which may download it) and kept in a
    one-entry module-level cache. Asking for a different name replaces the
    cached model.

    Args:
        name: Whisper model name passed to ``whisper.load_model``.

    Returns:
        The loaded model object.
    """
    global _model, _model_name
    # Imported here so that importing this module does not import whisper.
    import whisper

    if _model is not None and _model_name == name:
        return _model

    # Drop the previous model before loading the replacement so the two are
    # not both kept alive for the duration of the load. If the load raises,
    # the cache is simply empty and the next call retries.
    _model = None
    _model_name = None
    loaded = whisper.load_model(name)
    _model, _model_name = loaded, name
    return loaded
def transcribe_bytes(audio_bytes: bytes, model_name: str = "base") -> dict:
    """Transcribe audio held in memory with Whisper.

    Args:
        audio_bytes: Raw contents of the audio file.
        model_name: Whisper model name handed to ``_load_model``.

    Returns:
        A dict with the keys:
            "text": full transcript, stripped of surrounding whitespace.
            "segments": list of {"start", "end", "text"} dicts, with times
                rounded to two decimals.
            "language": language reported by the model, or "unknown".
            "duration_seconds": time spent in this call (model loading
                included), rounded to one decimal; this is processing time,
                not the length of the audio.
            "model": the model name that was requested.
    """
    # Monotonic clock: elapsed time must not jump if the wall clock is
    # adjusted while a long transcription is running.
    started = time.perf_counter()
    model = _load_model(model_name)

    # Write to temp file (whisper needs a file path or numpy array).
    # NOTE(review): the suffix is ".wav" whatever the real container is;
    # presumably the decoder sniffs the content rather than the extension;
    # confirm.
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_path = tmp.name
    try:
        # The write sits inside the try so a failed write (e.g. disk full)
        # still removes the file; delete=False means nothing else will.
        with tmp:
            tmp.write(audio_bytes)
        # The handle is closed by now, so the path can be reopened by name.
        result = model.transcribe(tmp_path, fp16=False)  # fp16=False for CPU
    finally:
        Path(tmp_path).unlink(missing_ok=True)

    elapsed = round(time.perf_counter() - started, 1)
    segments = [
        {
            "start": round(seg["start"], 2),
            "end": round(seg["end"], 2),
            "text": seg["text"].strip(),
        }
        for seg in result.get("segments", [])
    ]
    return {
        "text": result["text"].strip(),
        "segments": segments,
        "language": result.get("language", "unknown"),
        "duration_seconds": elapsed,
        "model": model_name,
    }