Files
admin 6640bc050d
Mirror and run GitLab CI / build (push) Waiting to run
Ruff / ruff (push) Waiting to run
feat: add chunked ASR for long audio with env-configurable chunk duration
- Integrate chunking into LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks; transcribe each chunk; merge segments with corrected timestamps.
- Auto-enable chunking when audio duration > LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add env variables:
    LOCALAI_CHUNK_DURATION (default 180)
    LOCALAI_CHUNK_OVERLAP (default 2)
    LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300)
- Add unit and integration tests for chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on 88-minute file.
2026-06-18 17:46:29 +00:00

223 lines
6.1 KiB
Python

"""
Audio Processor Module
=======================
Simplified audio processor for ScrAIbe.
Previously this used torch and pyannote-style processing. In the LocalAI-backed
version, we primarily pass files to the API, but we keep a lightweight helper
for backward compatibility.
Now also includes utilities for chunking long audio into smaller segments
to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
"""
import json
import os
import tempfile
from subprocess import CalledProcessError, run
import numpy as np
# Default sample rate (Hz) that AudioProcessor.load_audio asks ffmpeg to
# resample to.
SAMPLE_RATE = 16000
# Divisor that maps signed 16-bit PCM samples onto float32 in [-1.0, 1.0).
NORMALIZATION_FACTOR = 32768.0
# Default length of each chunk produced by split_audio_into_chunks.
DEFAULT_CHUNK_DURATION = 180.0 # seconds
# Default overlap between consecutive chunks in split_audio_into_chunks.
DEFAULT_CHUNK_OVERLAP = 2.0 # seconds
class AudioProcessor:
    """
    Lightweight audio processor for loading and cutting audio.

    Attributes:
        waveform (np.ndarray): The audio waveform (float32 when produced
            by ``load_audio``).
        sr (int): The sample rate of the audio.
    """

    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        """
        Store the waveform and its sample rate.

        Args:
            waveform (np.ndarray): The audio samples.
            sr (int, optional): The sample rate of ``waveform``.

        Raises:
            ValueError: If ``sr`` is not an ``int``.
        """
        if not isinstance(sr, int):
            # Report the offending value itself; calling len() on it would
            # raise TypeError for unsized values such as floats and hide
            # the intended ValueError.
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}"
            )
        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args, **kwargs: Forwarded unchanged to ``load_audio``.

        Returns:
            AudioProcessor: Instance with loaded audio.
        """
        audio, sr = cls.load_audio(file, *args, **kwargs)
        return cls(audio, sr)

    def cut(self, start: float, end: float) -> np.ndarray:
        """
        Cut a segment from the audio waveform.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
            np.ndarray: The cut waveform segment. Times before the start of
            the audio are clamped to 0; an ``end`` past the last sample is
            truncated by slicing.
        """
        # Clamp at zero: a negative index would otherwise be interpreted by
        # Python slicing as an offset from the END of the waveform.
        start_idx = max(0, int(start * self.sr))
        end_idx = max(0, int(np.ceil(end * self.sr)))
        return self.waveform[start_idx:end_idx]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
        Load an audio file as a mono waveform, resampling if necessary.

        Requires ffmpeg in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate.

        Returns:
            tuple: (waveform as np.ndarray[float32], sample rate)

        Raises:
            RuntimeError: If failed to load audio.
        """
        # Decode to raw signed 16-bit little-endian mono PCM on stdout.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-",
        ]
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # errors="replace": stderr may contain non-UTF-8 bytes (e.g. from
            # file names); a decode failure must not mask the real error.
            raise RuntimeError(
                f"Failed to load audio: {e.stderr.decode(errors='replace')}"
            ) from e

        # frombuffer already yields a 1-D array; astype makes the writable
        # float32 copy, which is then scaled into [-1.0, 1.0).
        waveform = (
            np.frombuffer(out, np.int16).astype(np.float32)
            / NORMALIZATION_FACTOR
        )
        return waveform, sr

    def __repr__(self) -> str:
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
def get_audio_duration(file_path: str) -> float:
    """
    Get the duration of an audio file in seconds using ffprobe.

    Requires ffprobe in PATH.

    Args:
        file_path: Path to the audio file.

    Returns:
        Duration in seconds as a float.

    Raises:
        RuntimeError: If ffprobe exits with an error, or if its output does
            not contain a parseable ``format.duration`` value (for example
            when ffprobe reports ``"N/A"``).
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        file_path,
    ]
    try:
        result = run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])
    except (CalledProcessError, KeyError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError (a subclass) as well as a
        # non-numeric duration such as "N/A"; TypeError covers JSON whose
        # shape is not the expected {"format": {"duration": ...}}.
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {e}"
        ) from e
def split_audio_into_chunks(
    input_path: str,
    max_duration: float = DEFAULT_CHUNK_DURATION,
    overlap: float = DEFAULT_CHUNK_OVERLAP,
    output_format: str = "wav",
    sample_rate: int = 24000,
) -> list:
    """
    Split a long audio file into overlapping chunks using ffmpeg.

    Requires ffmpeg and ffprobe in PATH.

    Args:
        input_path: Path to the input audio file.
        max_duration: Maximum duration of each chunk in seconds.
        overlap: Overlap duration in seconds between consecutive chunks.
        output_format: File extension used for the chunk files (e.g. 'wav').
            The audio is always encoded as mono 16-bit PCM.
        sample_rate: Sample rate for output chunks.

    Returns:
        List of dicts:
            [{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
        ``start``/``end`` are offsets in seconds within the input file.
        Chunk files are temporary files that must be cleaned up by the
        caller. If the input is not longer than ``max_duration`` the single
        entry's ``path`` is ``input_path`` itself, not a temporary copy.

    Raises:
        RuntimeError: If the duration cannot be determined or ffmpeg fails
            to write a chunk. Chunk files created so far are removed.
        ValueError: If splitting is required but ``max_duration`` is not
            positive or ``overlap`` is not smaller than ``max_duration``.
    """
    duration = get_audio_duration(input_path)

    # If file is shorter than max_duration, no need to split
    if duration <= max_duration:
        return [{"path": input_path, "start": 0.0, "end": duration}]

    # Each chunk starts `step` seconds after the previous one. A
    # non-positive step would never advance and loop forever, creating
    # temp files without bound.
    step = max_duration - overlap
    if max_duration <= 0 or step <= 0:
        raise ValueError(
            "max_duration must be positive and overlap must be smaller than "
            f"max_duration (got max_duration={max_duration}, overlap={overlap})"
        )

    chunks = []
    created_paths = []
    start = 0.0
    chunk_id = 0
    try:
        while start < duration:
            chunk_end = min(start + max_duration, duration)
            chunk_duration = chunk_end - start

            # Reserve a unique file name; ffmpeg overwrites it via -y.
            tmp = tempfile.NamedTemporaryFile(
                delete=False,
                suffix=f".{output_format}",
                prefix="scraibe_chunk_",
            )
            chunk_path = tmp.name
            created_paths.append(chunk_path)
            tmp.close()

            cmd = [
                "ffmpeg",
                "-y",
                "-nostdin",
                "-ss", str(start),
                "-i", input_path,
                "-t", str(chunk_duration),
                "-ar", str(sample_rate),
                "-ac", "1",
                "-c:a", "pcm_s16le",
                chunk_path,
            ]
            try:
                run(cmd, capture_output=True, check=True)
            except CalledProcessError as e:
                raise RuntimeError(
                    f"Failed to create audio chunk {chunk_id} for {input_path}: "
                    f"{e.stderr.decode(errors='replace')}"
                ) from e

            chunks.append({
                "path": chunk_path,
                "start": start,
                "end": chunk_end,
            })

            # This chunk already reaches the end of the file. Stepping again
            # would only emit a short chunk lying entirely inside this
            # chunk's overlap tail, i.e. pure duplicate audio.
            if chunk_end >= duration:
                break

            start += step
            chunk_id += 1
    except BaseException:
        # On any failure (ffmpeg error, ffmpeg missing, interrupt) the caller
        # never receives the list, so remove every chunk file created so far.
        for path in created_paths:
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup; the original error is what matters.
                pass
        raise

    return chunks