feat: add chunked ASR for long audio with env-configurable chunk duration

- Integrate chunking into the LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks, transcribe each chunk, and merge the segments with corrected timestamps.
- Auto-enable chunking when the audio duration exceeds LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300 s).
- Add environment variables:
  - LOCALAI_CHUNK_DURATION (default 180)
  - LOCALAI_CHUNK_OVERLAP (default 2)
  - LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300)
- Add unit and integration tests for the chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on an 88-minute file.
2026-06-18 17:46:29 +00:00
parent 59363c5dcd
commit 6640bc050d
4 changed files with 737 additions and 0 deletions
@@ -7,13 +7,21 @@ Simplified audio processor for ScrAIbe.
 Previously this used torch and pyannote-style processing. In the LocalAI-backed
 version, we primarily pass files to the API, but we keep a lightweight helper
 for backward compatibility.
+
+Now also includes utilities for chunking long audio into smaller segments
+to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
 """

+import json
+import os
+import tempfile
 from subprocess import CalledProcessError, run
 import numpy as np

 SAMPLE_RATE = 16000
 NORMALIZATION_FACTOR = 32768.0
+DEFAULT_CHUNK_DURATION = 180.0  # seconds
+DEFAULT_CHUNK_OVERLAP = 2.0    # seconds


 class AudioProcessor:
@@ -106,3 +114,109 @@ class AudioProcessor:

    def __repr__(self) -> str:
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
+
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Get the duration of an audio file in seconds using ffprobe.
+
+    Args:
+        file_path: Path to the audio file.
+
+    Returns:
+        Duration in seconds as a float.
+
+    Raises:
+        RuntimeError: If ffprobe cannot be started, exits with an error, or
+            does not report a parseable duration.
+    """
+    cmd = [
+        "ffprobe",
+        "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "json",
+        file_path,
+    ]
+    try:
+        result = run(cmd, capture_output=True, text=True, check=True)
+        data = json.loads(result.stdout)
+        return float(data["format"]["duration"])
+    except CalledProcessError as e:
+        # str(e) only carries the exit status; ffprobe's own explanation is
+        # on stderr (already a str because of text=True).
+        detail = (e.stderr or "").strip() or str(e)
+        raise RuntimeError(
+            f"Failed to get audio duration for {file_path}: {detail}"
+        ) from e
+    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
+        # OSError: the ffprobe binary is missing or not executable.
+        # KeyError/TypeError/ValueError: the JSON lacks a numeric
+        # format.duration entry (e.g. the value is "N/A").
+        raise RuntimeError(
+            f"Failed to get audio duration for {file_path}: {e}"
+        ) from e
+
+
+def split_audio_into_chunks(
+    input_path: str,
+    max_duration: float = DEFAULT_CHUNK_DURATION,
+    overlap: float = DEFAULT_CHUNK_OVERLAP,
+    output_format: str = "wav",
+    sample_rate: int = 24000,
+) -> list:
+    """
+    Split a long audio file into overlapping chunks using ffmpeg.
+
+    Each chunk is written to a new temporary file as mono 16-bit PCM at
+    ``sample_rate``. Consecutive chunks start ``max_duration - overlap``
+    seconds apart, and splitting stops with the chunk that reaches the end
+    of the file.
+
+    Args:
+        input_path: Path to the input audio file.
+        max_duration: Maximum duration of each chunk in seconds.
+        overlap: Overlap duration in seconds between consecutive chunks.
+            Must be >= 0 and smaller than ``max_duration`` when the file
+            actually needs splitting.
+        output_format: Output format (e.g., 'wav'); used as the file suffix.
+        sample_rate: Sample rate for output chunks.
+
+    Returns:
+        List of dicts:
+            [{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
+        If the file is not longer than ``max_duration`` the list holds a
+        single entry whose "path" is ``input_path`` itself; no temporary
+        file is created, so the caller must NOT delete that path.
+        Otherwise every "path" is a temporary file that must be cleaned up
+        by the caller.
+
+    Raises:
+        ValueError: If the file needs splitting but ``overlap`` is negative
+            or not smaller than ``max_duration``.
+        RuntimeError: If ffprobe/ffmpeg cannot be run or fails. Temporary
+            chunk files created so far are removed before raising.
+    """
+    duration = get_audio_duration(input_path)
+
+    # If file is shorter than max_duration, no need to split
+    if duration <= max_duration:
+        return [{"path": input_path, "start": 0.0, "end": duration}]
+
+    # A non-positive step would never advance through the file (endless
+    # loop creating temp files); a negative overlap would leave gaps of
+    # untranscribed audio between chunks.
+    if overlap < 0 or overlap >= max_duration:
+        raise ValueError(
+            f"overlap ({overlap}) must be >= 0 and smaller than "
+            f"max_duration ({max_duration})"
+        )
+    step = max_duration - overlap
+
+    chunks = []
+    chunk_id = 0
+    try:
+        while True:
+            # Derive the start from the index instead of accumulating, so
+            # float rounding errors do not add up over many chunks.
+            start = float(chunk_id * step)
+            chunk_end = min(start + max_duration, duration)
+            chunk_duration = chunk_end - start
+
+            tmp = tempfile.NamedTemporaryFile(
+                delete=False,
+                suffix=f".{output_format}",
+                prefix="scraibe_chunk_",
+            )
+            chunk_path = tmp.name
+            tmp.close()
+
+            # Register the chunk before running ffmpeg so the cleanup
+            # below also covers the file currently being written.
+            chunks.append({
+                "path": chunk_path,
+                "start": start,
+                "end": chunk_end,
+            })
+
+            cmd = [
+                "ffmpeg",
+                "-y",
+                "-nostdin",
+                "-ss", str(start),
+                "-i", input_path,
+                "-t", str(chunk_duration),
+                "-ar", str(sample_rate),
+                "-ac", "1",
+                "-c:a", "pcm_s16le",
+                chunk_path,
+            ]
+            try:
+                run(cmd, capture_output=True, check=True)
+            except CalledProcessError as e:
+                # stderr is bytes here (no text=True); decode leniently so
+                # odd bytes in ffmpeg output cannot mask the real error.
+                stderr = (e.stderr or b"").decode(errors="replace")
+                raise RuntimeError(
+                    f"Failed to create audio chunk {chunk_id} for {input_path}: {stderr}"
+                ) from e
+            except OSError as e:
+                # ffmpeg binary missing or not executable.
+                raise RuntimeError(
+                    f"Failed to create audio chunk {chunk_id} for {input_path}: {e}"
+                ) from e
+
+            # This chunk reaches the end of the file; another one would
+            # contain nothing but already-covered overlap audio.
+            if chunk_end >= duration:
+                break
+            chunk_id += 1
+    except BaseException:
+        # The caller never receives the list on failure, so remove every
+        # temp file created so far, then re-raise the original error.
+        for chunk in chunks:
+            try:
+                os.remove(chunk["path"])
+            except OSError:
+                pass
+        raise
+
+    return chunks