# scribe/scraibe/audio.py

"""
Audio Processor Module
=======================

Simplified audio processor for ScrAIbe.

Previously this used torch and pyannote-style processing. In the LocalAI-backed
version, we primarily pass files to the API, but we keep a lightweight helper
for backward compatibility.
"""

from subprocess import CalledProcessError, run
import numpy as np

# Default sample rate (Hz) requested from ffmpeg when decoding audio.
SAMPLE_RATE = 16_000
# Divisor applied to signed 16-bit PCM samples (2**15) to scale them to floats.
NORMALIZATION_FACTOR = float(1 << 15)


class AudioProcessor:
    """
    Lightweight audio processor for loading and cutting audio.

    Attributes:
        waveform (np.ndarray): The audio waveform (float32 when produced by
            ``load_audio``; not enforced by the constructor).
        sr (int): The sample rate of the audio.
    """

    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        """
        Store a waveform together with its sample rate.

        Args:
            waveform (np.ndarray): The audio samples.
            sr (int, optional): The sample rate of ``waveform``.

        Raises:
            ValueError: If ``sr`` is not an ``int``.
        """
        # Validate first and never call len() on the value: a float such as
        # 16000.0 has no length, and doing so would mask the intended
        # ValueError with a TypeError.
        if not isinstance(sr, int):
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}"
            )

        self.waveform = waveform
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            *args: Extra positional arguments forwarded to ``load_audio``.
            **kwargs: Extra keyword arguments forwarded to ``load_audio``.

        Returns:
            AudioProcessor: Instance with loaded audio.
        """
        audio, sr = cls.load_audio(file, *args, **kwargs)
        return cls(audio, sr)

    def cut(self, start: float, end: float) -> np.ndarray:
        """
        Cut a segment from the audio waveform.

        Times before the beginning of the audio are clamped to 0 so that a
        negative ``start`` or ``end`` cannot wrap around and select samples
        counted from the end of the waveform. Times past the end of the audio
        are truncated by normal slicing.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
            np.ndarray: The cut waveform segment (empty if ``end <= start``).
        """
        # Start index truncates, end index rounds up, so the slice covers the
        # requested interval.
        start_idx = max(0, int(start * self.sr))
        end_idx = max(0, int(np.ceil(end * self.sr)))
        return self.waveform[start_idx:end_idx]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
        Load an audio file as a mono waveform, resampling if necessary.
        Requires ffmpeg in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate.

        Returns:
            tuple: (waveform as np.ndarray[float32], sample rate)

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status. Other
                errors raised while launching the process are not caught
                here and propagate unchanged.
        """
        # Ask ffmpeg to decode to raw mono signed 16-bit little-endian PCM at
        # the requested rate and write it to stdout.
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-",
        ]
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # ffmpeg's stderr is not guaranteed to be valid UTF-8; a strict
            # decode could raise UnicodeDecodeError and hide the real error.
            detail = e.stderr.decode(errors="replace")
            raise RuntimeError(f"Failed to load audio: {detail}") from e

        # frombuffer already yields a 1-D array; astype makes the writable
        # float32 copy, which is then scaled by the int16 full-scale value.
        waveform = (
            np.frombuffer(out, np.int16).astype(np.float32)
            / NORMALIZATION_FACTOR
        )

        return waveform, sr

    def __repr__(self) -> str:
        """Return a short summary with the waveform length and sample rate."""
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"