feat: add chunked ASR for long audio with env-configurable chunk duration

- Integrate chunking into LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks; transcribe each chunk; merge segments with corrected timestamps.
- Auto-enable chunking when audio duration > LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add env variables: LOCALAI_CHUNK_DURATION (default 180), LOCALAI_CHUNK_OVERLAP (default 2), LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300).
- Add unit and integration tests for chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on an 88-minute file.
2026-06-18 17:46:29 +00:00
parent 59363c5dcd
commit 6640bc050d
4 changed files with 737 additions and 0 deletions
@@ -0,0 +1,86 @@
+import os
+import subprocess
+import tempfile
+import pytest
+
+from scraibe.audio import (
+    get_audio_duration,
+    split_audio_into_chunks,
+)
+
+TEST_AUDIO_1 = "tests/audio_test_1.mp4"
+TEST_AUDIO_2 = "tests/audio_test_2.mp4"
+
+
+@pytest.fixture(params=[TEST_AUDIO_1, TEST_AUDIO_2])
+def test_audio_path(request):
+    return request.param
+
+
+def test_get_audio_duration(test_audio_path):
+    dur = get_audio_duration(test_audio_path)
+    assert isinstance(dur, float)
+    assert dur > 0
+
+
+def test_split_audio_into_chunks_no_split_short(test_audio_path):
+    # For short files, should return the same file with no extra chunks
+    chunks = split_audio_into_chunks(
+        input_path=test_audio_path,
+        max_duration=600.0,  # larger than both test files
+        overlap=2.0,
+    )
+    assert len(chunks) == 1
+    assert chunks[0]["path"] == test_audio_path
+    assert chunks[0]["start"] == 0.0
+    dur = get_audio_duration(test_audio_path)
+    assert abs(chunks[0]["end"] - dur) < 0.05
+
+
+def test_split_audio_into_chunks_creates_chunks(tmp_path):
+    # Use a small chunk duration to force splitting
+    chunks = split_audio_into_chunks(
+        input_path=TEST_AUDIO_1,
+        max_duration=2.0,
+        overlap=0.5,
+    )
+    assert len(chunks) > 1
+
+    # Check that each chunk file exists and is non-empty
+    for c in chunks:
+        assert os.path.exists(c["path"])
+        assert os.path.getsize(c["path"]) > 0
+
+    # Check time ordering and overlap
+    for i in range(1, len(chunks)):
+        prev = chunks[i - 1]
+        curr = chunks[i]
+        assert curr["start"] >= prev["start"]
+        assert curr["start"] < prev["end"]  # overlap
+
+    # Cleanup
+    for c in chunks:
+        if os.path.exists(c["path"]):
+            os.remove(c["path"])
+
+
+def test_split_audio_into_chunks_total_coverage(test_audio_path):
+    dur = get_audio_duration(test_audio_path)
+
+    # Use small chunks to ensure coverage
+    chunks = split_audio_into_chunks(
+        input_path=test_audio_path,
+        max_duration=2.0,
+        overlap=0.5,
+    )
+
+    # First chunk starts at 0
+    assert chunks[0]["start"] == 0.0
+
+    # Last chunk end should cover the duration
+    assert chunks[-1]["end"] >= dur - 0.05
+
+    # Cleanup
+    for c in chunks:
+        if os.path.exists(c["path"]):
+            os.remove(c["path"])