""" Audio Processor Module ======================= Simplified audio processor for ScrAIbe. Previously this used torch and pyannote-style processing. In the LocalAI-backed version, we primarily pass files to the API, but we keep a lightweight helper for backward compatibility. """ from subprocess import CalledProcessError, run import numpy as np SAMPLE_RATE = 16000 NORMALIZATION_FACTOR = 32768.0 class AudioProcessor: """ Lightweight audio processor for loading and cutting audio. Attributes: waveform (np.ndarray): The audio waveform as float32. sr (int): The sample rate of the audio. """ def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE): self.waveform = waveform self.sr = sr if not isinstance(self.sr, int): raise ValueError( "Sample rate should be a single value of type int, " f"not {len(self.sr)} and type {type(self.sr)}" ) @classmethod def from_file(cls, file: str, *args, **kwargs): """ Create an AudioProcessor instance from an audio file. Args: file (str): The audio file path. Returns: AudioProcessor: Instance with loaded audio. """ audio, sr = cls.load_audio(file, *args, **kwargs) return cls(audio, sr) def cut(self, start: float, end: float) -> np.ndarray: """ Cut a segment from the audio waveform. Args: start (float): Start time in seconds. end (float): End time in seconds. Returns: np.ndarray: The cut waveform segment. """ start_idx = int(start * self.sr) end_idx = int(np.ceil(end * self.sr)) return self.waveform[start_idx:end_idx] @staticmethod def load_audio(file: str, sr: int = SAMPLE_RATE): """ Load an audio file as a mono waveform, resampling if necessary. Requires ffmpeg in PATH. Args: file (str): The audio file to open. sr (int, optional): The desired sample rate. Returns: tuple: (waveform as np.ndarray[float32], sample rate) Raises: RuntimeError: If failed to load audio. """ cmd = [ "ffmpeg", "-nostdin", "-threads", "0", "-i", file, "-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le", "-ar", str(sr), "-", ] try: out = run(cmd, capture_output=True, check=True).stdout except CalledProcessError as e: raise RuntimeError( f"Failed to load audio: {e.stderr.decode()}" ) from e waveform = np.frombuffer(out, np.int16).flatten().astype( np.float32 ) / NORMALIZATION_FACTOR return waveform, sr def __repr__(self) -> str: return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"