feat: add chunked ASR for long audio with env-configurable chunk duration
- Integrate chunking into LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks; transcribe each chunk; merge segments with corrected timestamps.
- Auto-enable chunking when audio duration > LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add env variables:
LOCALAI_CHUNK_DURATION (default 180)
LOCALAI_CHUNK_OVERLAP (default 2)
LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300)
- Add unit and integration tests for chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on 88-minute file.
This commit is contained in:
@@ -0,0 +1,86 @@
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import pytest
|
||||
|
||||
from scraibe.audio import (
|
||||
get_audio_duration,
|
||||
split_audio_into_chunks,
|
||||
)
|
||||
|
||||
# First sample media file used by these tests. The path is relative, so it is
# resolved against the working directory pytest is started from.
TEST_AUDIO_1 = "tests/audio_test_1.mp4"
|
||||
# Second sample media file used by these tests. The path is relative, so it is
# resolved against the working directory pytest is started from.
TEST_AUDIO_2 = "tests/audio_test_2.mp4"
|
||||
|
||||
|
||||
@pytest.fixture(params=[TEST_AUDIO_1, TEST_AUDIO_2])
def test_audio_path(request):
    """Provide one sample media path per parametrized run.

    Any test requesting this fixture is executed once for each entry in
    ``params``; the current entry is exposed by pytest as ``request.param``.
    """
    sample_path = request.param
    return sample_path
|
||||
|
||||
|
||||
def test_get_audio_duration(test_audio_path):
    """Check that ``get_audio_duration`` yields a positive float for a sample file."""
    duration = get_audio_duration(test_audio_path)

    # Type first, then value: a non-float result fails with the clearer message.
    assert isinstance(duration, float)
    assert duration > 0
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_no_split_short(test_audio_path):
    """A file shorter than ``max_duration`` must come back as one chunk: itself.

    The single returned chunk is expected to reference the input path
    unchanged, start at 0.0, and end at the file's measured duration
    (within a 0.05 tolerance).
    """
    # max_duration is chosen larger than both test files, so no split is expected.
    chunks = split_audio_into_chunks(
        input_path=test_audio_path, max_duration=600.0, overlap=2.0
    )
    assert len(chunks) == 1

    only_chunk = chunks[0]
    assert only_chunk["path"] == test_audio_path
    assert only_chunk["start"] == 0.0

    full_duration = get_audio_duration(test_audio_path)
    assert abs(only_chunk["end"] - full_duration) < 0.05
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_creates_chunks(tmp_path):
    """Force a split with a small ``max_duration`` and validate the chunks.

    Verifies that more than one chunk is produced, that every chunk file
    exists and is non-empty, and that consecutive chunks are ordered by
    start time and overlap (each chunk starts before the previous one ends).

    ``tmp_path`` is requested but not used: the call below does not pass an
    output location, so chunk files are placed wherever
    ``split_audio_into_chunks`` decides and must be removed by hand.
    """
    # Use a small chunk duration to force splitting.
    chunks = split_audio_into_chunks(
        input_path=TEST_AUDIO_1,
        max_duration=2.0,
        overlap=0.5,
    )

    try:
        assert len(chunks) > 1

        # Each chunk file must exist and be non-empty.
        for chunk in chunks:
            assert os.path.exists(chunk["path"])
            assert os.path.getsize(chunk["path"]) > 0

        # Time ordering and overlap between neighbouring chunks.
        for prev, curr in zip(chunks, chunks[1:]):
            assert curr["start"] >= prev["start"]
            assert curr["start"] < prev["end"]  # overlap
    finally:
        # Clean up even when an assertion above fails, so chunk files are not
        # leaked. When no split happens the splitter may hand back the input
        # path itself; never delete the checked-in sample file.
        source = os.path.abspath(TEST_AUDIO_1)
        for chunk in chunks:
            chunk_path = chunk["path"]
            if os.path.abspath(chunk_path) == source:
                continue
            if os.path.exists(chunk_path):
                os.remove(chunk_path)
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_total_coverage(test_audio_path):
    """Check that the chunks span the whole file from 0.0 to its duration.

    The first chunk must start at exactly 0.0 and the last chunk must end no
    earlier than the measured duration minus a 0.05 tolerance.
    """
    dur = get_audio_duration(test_audio_path)

    # Use small chunks to ensure coverage.
    chunks = split_audio_into_chunks(
        input_path=test_audio_path,
        max_duration=2.0,
        overlap=0.5,
    )

    try:
        # First chunk starts at 0.
        assert chunks[0]["start"] == 0.0

        # Last chunk end should cover the duration.
        assert chunks[-1]["end"] >= dur - 0.05
    finally:
        # Clean up even when an assertion above fails, so chunk files are not
        # leaked. If the input was not split, the returned chunk can be the
        # input path itself; removing it would destroy the sample file.
        source = os.path.abspath(test_audio_path)
        for chunk in chunks:
            chunk_path = chunk["path"]
            if os.path.abspath(chunk_path) == source:
                continue
            if os.path.exists(chunk_path):
                os.remove(chunk_path)
|
||||
Reference in New Issue
Block a user