6640bc050d
- Integrate chunking into LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks; transcribe each chunk; merge segments with corrected timestamps.
- Auto-enable chunking when audio duration > LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add env variables:
LOCALAI_CHUNK_DURATION (default 180)
LOCALAI_CHUNK_OVERLAP (default 2)
LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300)
- Add unit and integration tests for chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on 88-minute file.
223 lines
6.1 KiB
Python
223 lines
6.1 KiB
Python
"""
|
|
Audio Processor Module
======================

Simplified audio processor for ScrAIbe.

Previously this used torch and pyannote-style processing. In the LocalAI-backed
version, we primarily pass files to the API, but we keep a lightweight helper
for backward compatibility.

Now also includes utilities for chunking long audio into smaller segments
to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import tempfile
|
|
from subprocess import CalledProcessError, run
|
|
import numpy as np
|
|
|
|
# Sample rate (Hz) that AudioProcessor and load_audio use when none is given.
SAMPLE_RATE = 16_000

# Divisor applied to signed 16-bit PCM samples to scale them into [-1.0, 1.0).
NORMALIZATION_FACTOR = 32_768.0

# Default length of one chunk when splitting long audio, in seconds.
DEFAULT_CHUNK_DURATION = 180.0

# Default amount of audio shared by two consecutive chunks, in seconds.
DEFAULT_CHUNK_OVERLAP = 2.0
|
|
|
|
|
|
class AudioProcessor:
    """
    Lightweight audio processor for loading and cutting audio.

    Attributes:
        waveform (np.ndarray): The audio waveform (float32 when produced by
            ``load_audio``).
        sr (int): The sample rate of the audio.
    """

    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        """
        Store a waveform together with its sample rate.

        Args:
            waveform (np.ndarray): The audio samples.
            sr (int, optional): Sample rate of ``waveform``.

        Raises:
            ValueError: If ``sr`` is not an ``int``.
        """
        if not isinstance(sr, int):
            # Report the offending value with repr(); calling len() on it (as
            # an earlier version did) raises TypeError for floats/None and
            # hides the intended ValueError.
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}"
            )

        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args: Extra positional arguments forwarded to ``load_audio``.
            **kwargs: Extra keyword arguments forwarded to ``load_audio``.

        Returns:
            AudioProcessor: Instance with loaded audio.
        """
        audio, sr = cls.load_audio(file, *args, **kwargs)
        return cls(audio, sr)

    def cut(self, start: float, end: float) -> np.ndarray:
        """
        Cut a segment from the audio waveform.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
            np.ndarray: The cut waveform segment.
        """
        # Clamp at zero: a negative time would otherwise become a negative
        # index and silently slice from the END of the waveform.
        start_idx = max(0, int(start * self.sr))
        # Round the end up so the sample containing ``end`` is included.
        end_idx = max(0, int(np.ceil(end * self.sr)))
        return self.waveform[start_idx:end_idx]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
        Load an audio file as a mono waveform, resampling if necessary.
        Requires ffmpeg in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate.

        Returns:
            tuple: (waveform as np.ndarray[float32], sample rate)

        Raises:
            RuntimeError: If failed to load audio (ffmpeg exited with an
                error or could not be started).
        """
        # Decode to raw signed 16-bit little-endian mono PCM on stdout.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-",
        ]
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # errors="replace" keeps a non-UTF-8 ffmpeg message from raising
            # UnicodeDecodeError while we are building the real error.
            raise RuntimeError(
                f"Failed to load audio: {e.stderr.decode(errors='replace')}"
            ) from e
        except OSError as e:
            # Raised when the ffmpeg executable itself cannot be launched.
            raise RuntimeError(f"Failed to load audio: {e}") from e

        # int16 PCM -> float32 scaled by NORMALIZATION_FACTOR.
        waveform = np.frombuffer(out, np.int16).flatten().astype(
            np.float32
        ) / NORMALIZATION_FACTOR

        return waveform, sr

    def __repr__(self) -> str:
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
|
|
|
|
|
|
def get_audio_duration(file_path: str) -> float:
    """
    Get the duration of an audio file in seconds using ffprobe.

    Args:
        file_path: Path to the audio file.

    Returns:
        Duration in seconds as a float.

    Raises:
        RuntimeError: If ffprobe fails, cannot be started, or its output does
            not contain a numeric ``format.duration`` value. The underlying
            exception is attached as ``__cause__``.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        file_path,
    ]
    try:
        result = run(cmd, capture_output=True, text=True, check=True)
        return float(json.loads(result.stdout)["format"]["duration"])
    except CalledProcessError as e:
        # Prefer ffprobe's own stderr text; fall back to the generic
        # "returned non-zero exit status" message when it printed nothing.
        detail = (e.stderr or "").strip() or str(e)
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {detail}"
        ) from e
    except (OSError, KeyError, TypeError, ValueError) as e:
        # OSError: ffprobe executable could not be launched.
        # ValueError: invalid JSON (JSONDecodeError) or a non-numeric
        #   duration such as "N/A".
        # KeyError/TypeError: JSON without the expected format/duration shape.
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {e}"
        ) from e
|
|
|
|
|
|
def split_audio_into_chunks(
    input_path: str,
    max_duration: float = DEFAULT_CHUNK_DURATION,
    overlap: float = DEFAULT_CHUNK_OVERLAP,
    output_format: str = "wav",
    sample_rate: int = 24000,
) -> list:
    """
    Split a long audio file into overlapping chunks using ffmpeg.

    Args:
        input_path: Path to the input audio file.
        max_duration: Maximum duration of each chunk in seconds.
        overlap: Overlap duration in seconds between consecutive chunks.
        output_format: Output format (e.g., 'wav').
        sample_rate: Sample rate for output chunks.

    Returns:
        List of dicts:
            [{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
        Chunk files are temporary files that must be cleaned up by the
        caller. When the input is not longer than ``max_duration`` the single
        entry's "path" is ``input_path`` itself (no temporary file is
        created), so the caller must not delete it as if it were a chunk.

    Raises:
        ValueError: If splitting is required and ``max_duration`` is not
            positive or ``overlap`` is not in ``[0, max_duration)``.
        RuntimeError: If the duration cannot be determined or ffmpeg fails
            to produce a chunk. Chunk files created so far are removed.
    """
    duration = get_audio_duration(input_path)

    # If file is shorter than max_duration, no need to split
    if duration <= max_duration:
        return [{"path": input_path, "start": 0.0, "end": duration}]

    # Each iteration advances by (max_duration - overlap); a non-positive
    # step would loop forever, and a negative overlap would leave gaps.
    if max_duration <= 0 or not 0 <= overlap < max_duration:
        raise ValueError(
            "overlap must satisfy 0 <= overlap < max_duration "
            f"(got max_duration={max_duration}, overlap={overlap})"
        )

    step = max_duration - overlap
    chunks = []
    start = 0.0
    chunk_id = 0

    try:
        while start < duration:
            chunk_end = min(start + max_duration, duration)
            chunk_duration = chunk_end - start

            # Only reserve a unique file name here; ffmpeg writes the data.
            with tempfile.NamedTemporaryFile(
                delete=False,
                suffix=f".{output_format}",
                prefix="scraibe_chunk_",
            ) as tmp:
                chunk_path = tmp.name

            # Record the chunk before running ffmpeg so the cleanup below
            # also covers the file belonging to a failed invocation.
            chunks.append({
                "path": chunk_path,
                "start": start,
                "end": chunk_end,
            })

            cmd = [
                "ffmpeg",
                "-y",
                "-nostdin",
                "-ss", str(start),
                "-i", input_path,
                "-t", str(chunk_duration),
                "-ar", str(sample_rate),
                "-ac", "1",
                "-c:a", "pcm_s16le",
                chunk_path,
            ]
            try:
                run(cmd, capture_output=True, check=True)
            except CalledProcessError as e:
                raise RuntimeError(
                    f"Failed to create audio chunk {chunk_id} for {input_path}: "
                    f"{e.stderr.decode(errors='replace')}"
                ) from e
            except OSError as e:
                # Raised when the ffmpeg executable cannot be launched.
                raise RuntimeError(
                    f"Failed to create audio chunk {chunk_id} for {input_path}: {e}"
                ) from e

            # This chunk already reaches the end of the file; another pass
            # would only re-encode audio that lies inside the overlap.
            if chunk_end >= duration:
                break

            start += step
            chunk_id += 1
    except BaseException:
        # Do not leak temporary files when any chunk fails (or on interrupt).
        for chunk in chunks:
            try:
                os.remove(chunk["path"])
            except OSError:
                pass
        raise

    return chunks
|