feat: add chunked ASR for long audio with env-configurable chunk duration
- Integrate chunking into LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks; transcribe each chunk; merge segments with corrected timestamps.
- Auto-enable chunking when audio duration > LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add env variables:
LOCALAI_CHUNK_DURATION (default 180)
LOCALAI_CHUNK_OVERLAP (default 2)
LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300)
- Add unit and integration tests for chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on 88-minute file.
This commit is contained in:
@@ -7,13 +7,21 @@ Simplified audio processor for ScrAIbe.
|
||||
Previously this used torch and pyannote-style processing. In the LocalAI-backed
|
||||
version, we primarily pass files to the API, but we keep a lightweight helper
|
||||
for backward compatibility.
|
||||
|
||||
Now also includes utilities for chunking long audio into smaller segments
|
||||
to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from subprocess import CalledProcessError, run
|
||||
import numpy as np
|
||||
|
||||
SAMPLE_RATE = 16000  # samples per second (Hz)
|
||||
NORMALIZATION_FACTOR = 32768.0  # 2**15; presumably the int16 full-scale divisor for PCM -> float conversion (confirm against AudioProcessor)
|
||||
DEFAULT_CHUNK_DURATION = 180.0 # seconds
|
||||
DEFAULT_CHUNK_OVERLAP = 2.0 # seconds
|
||||
|
||||
|
||||
class AudioProcessor:
|
||||
@@ -106,3 +114,109 @@ class AudioProcessor:
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
|
||||
|
||||
|
||||
def get_audio_duration(file_path: str) -> float:
    """
    Get the duration of an audio file in seconds using ffprobe.

    Args:
        file_path: Path to the audio file.

    Returns:
        Duration in seconds as a float.

    Raises:
        RuntimeError: If ffprobe cannot be launched (e.g. it is not
            installed), exits with a non-zero status, or its output does
            not contain a parseable ``format.duration`` value. The original
            exception is chained as ``__cause__``.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        file_path,
    ]
    try:
        result = run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])
    except CalledProcessError as e:
        # ffprobe's own diagnostics are far more useful than the generic
        # "returned non-zero exit status" text, so prefer stderr when present.
        detail = (e.stderr or "").strip() or str(e)
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {detail}"
        ) from e
    except (OSError, ValueError, KeyError, TypeError) as e:
        # OSError: ffprobe binary missing or not executable.
        # ValueError: invalid JSON (JSONDecodeError) or non-numeric duration.
        # KeyError / TypeError: JSON does not have the expected shape.
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {e}"
        ) from e
|
||||
|
||||
|
||||
def split_audio_into_chunks(
|
||||
input_path: str,
|
||||
max_duration: float = DEFAULT_CHUNK_DURATION,
|
||||
overlap: float = DEFAULT_CHUNK_OVERLAP,
|
||||
output_format: str = "wav",
|
||||
sample_rate: int = 24000,
|
||||
) -> list:
|
||||
"""
|
||||
Split a long audio file into overlapping chunks using ffmpeg.
|
||||
|
||||
Args:
|
||||
input_path: Path to the input audio file.
|
||||
max_duration: Maximum duration of each chunk in seconds.
|
||||
overlap: Overlap duration in seconds between consecutive chunks.
|
||||
output_format: Output format (e.g., 'wav').
|
||||
sample_rate: Sample rate for output chunks.
|
||||
|
||||
Returns:
|
||||
List of dicts:
|
||||
[{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
|
||||
Files must be cleaned up by the caller.
|
||||
"""
|
||||
duration = get_audio_duration(input_path)
|
||||
|
||||
# If file is shorter than max_duration, no need to split
|
||||
if duration <= max_duration:
|
||||
return [{"path": input_path, "start": 0.0, "end": duration}]
|
||||
|
||||
chunks = []
|
||||
start = 0.0
|
||||
chunk_id = 0
|
||||
|
||||
while start < duration:
|
||||
chunk_end = min(start + max_duration, duration)
|
||||
chunk_duration = chunk_end - start
|
||||
|
||||
tmp = tempfile.NamedTemporaryFile(
|
||||
delete=False,
|
||||
suffix=f".{output_format}",
|
||||
prefix="scraibe_chunk_",
|
||||
)
|
||||
chunk_path = tmp.name
|
||||
tmp.close()
|
||||
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-y",
|
||||
"-nostdin",
|
||||
"-ss", str(start),
|
||||
"-i", input_path,
|
||||
"-t", str(chunk_duration),
|
||||
"-ar", str(sample_rate),
|
||||
"-ac", "1",
|
||||
"-c:a", "pcm_s16le",
|
||||
chunk_path,
|
||||
]
|
||||
try:
|
||||
run(cmd, capture_output=True, check=True)
|
||||
except CalledProcessError as e:
|
||||
# Clean up on error
|
||||
if os.path.exists(chunk_path):
|
||||
os.remove(chunk_path)
|
||||
raise RuntimeError(
|
||||
f"Failed to create audio chunk {chunk_id} for {input_path}: {e.stderr.decode()}"
|
||||
)
|
||||
|
||||
chunks.append({
|
||||
"path": chunk_path,
|
||||
"start": start,
|
||||
"end": chunk_end,
|
||||
})
|
||||
|
||||
start += max_duration - overlap
|
||||
chunk_id += 1
|
||||
|
||||
return chunks
|
||||
|
||||
Reference in New Issue
Block a user