Files
scribe/scraibe/audio.py
T
admin 574124558b
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled
Initial commit: LocalAI-backed ScrAIbe with summarization
2026-06-13 16:38:59 +00:00

109 lines
3.0 KiB
Python

"""
Audio Processor Module
=======================
Simplified audio processor for ScrAIbe.
Previously this used torch and pyannote-style processing. In the LocalAI-backed
version, we primarily pass files to the API, but we keep a lightweight helper
for backward compatibility.
"""
from subprocess import CalledProcessError, run
import numpy as np
# Default sample rate (Hz) requested from ffmpeg when decoding audio.
SAMPLE_RATE = 16000
# Divisor that maps signed 16-bit PCM samples into the [-1.0, 1.0) float range.
NORMALIZATION_FACTOR = 32768.0


class AudioProcessor:
    """
    Lightweight audio processor for loading and cutting audio.

    Attributes:
        waveform (np.ndarray): The audio waveform (float32 when produced by
            ``load_audio``).
        sr (int): The sample rate of the audio.
    """

    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        """
        Store a waveform together with its sample rate.

        Args:
            waveform (np.ndarray): The audio samples.
            sr (int, optional): The sample rate of ``waveform``. NumPy integer
                scalars are accepted and converted to a plain ``int``.

        Raises:
            ValueError: If ``sr`` is not an integer.
        """
        # NumPy integer scalars are not instances of ``int``; normalise them so
        # a rate taken from an array or another library is not rejected.
        if isinstance(sr, np.integer):
            sr = int(sr)
        if not isinstance(sr, int):
            # Do not call len() here: the offending value may be a float or
            # None, for which len() itself raises TypeError and hides this
            # error.
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}"
            )
        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args: Extra positional arguments forwarded to ``load_audio``.
            **kwargs: Extra keyword arguments forwarded to ``load_audio``.

        Returns:
            AudioProcessor: Instance with loaded audio.

        Raises:
            RuntimeError: If the audio could not be loaded.
        """
        audio, sr = cls.load_audio(file, *args, **kwargs)
        return cls(audio, sr)

    def cut(self, start: float, end: float) -> np.ndarray:
        """
        Cut a segment from the audio waveform.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
            np.ndarray: The cut waveform segment. Times before the beginning
            of the audio are clamped to 0; an ``end`` past the end of the
            audio simply yields everything up to the last sample.
        """
        # Clamp to 0: a negative index would otherwise be interpreted by the
        # slice as "count from the end" and return the wrong part of the audio.
        start_idx = max(0, int(start * self.sr))
        # Round the end up so a partially covered sample is still included.
        end_idx = max(0, int(np.ceil(end * self.sr)))
        return self.waveform[start_idx:end_idx]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
        Load an audio file as a mono waveform, resampling if necessary.

        Requires ffmpeg in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate.

        Returns:
            tuple: (waveform as np.ndarray[float32], sample rate)

        Raises:
            RuntimeError: If failed to load audio, including when the ffmpeg
                executable cannot be found.
        """
        # Decode to raw little-endian signed 16-bit PCM, downmixed to one
        # channel at the requested rate, written to stdout ("-").
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-",
        ]
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except FileNotFoundError as e:
            # Raised when the ffmpeg binary itself is missing; surface it as
            # the documented RuntimeError.
            raise RuntimeError(
                "Failed to load audio: ffmpeg executable not found in PATH"
            ) from e
        except CalledProcessError as e:
            # ffmpeg may echo non-UTF-8 bytes (e.g. file names); never let the
            # decoding of its diagnostics mask the real failure.
            stderr_text = e.stderr.decode(errors="replace")
            raise RuntimeError(
                f"Failed to load audio: {stderr_text}"
            ) from e

        # astype() already returns a fresh, writable 1-D array, so no extra
        # flatten/copy is needed before scaling.
        samples = np.frombuffer(out, dtype=np.int16)
        waveform = samples.astype(np.float32) / NORMALIZATION_FACTOR
        return waveform, sr

    def __repr__(self) -> str:
        """Return a short summary with the waveform length and sample rate."""
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"