Initial commit: LocalAI-backed ScrAIbe with summarization

2026-06-13 16:38:59 +00:00
parent 46d119b63b
commit 574124558b
10 changed files with 992 additions and 594 deletions
@@ -2,28 +2,15 @@
 Audio Processor Module
 =======================

-This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
-It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
-flexible audio processing.
+Simplified audio processor for ScrAIbe.

-Available Classes:
- AudioProcessor: Processes audio waveforms and provides methods for loading, 
-                    cutting, and handling audio.
-
-Usage:
-    from .audio_import AudioProcessor
-
-    processor = AudioProcessor.from_file("path/to/audiofile.wav")
-    cut_waveform = processor.cut(start=1.0, end=5.0)
-
-Constants:
- SAMPLE_RATE (int): Default sample rate for processing.
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
+Previously this used torch and pyannote-style processing. In the LocalAI-backed
+version, we primarily pass files to the API, but we keep a lightweight helper
+for backward compatibility.
 """

 from subprocess import CalledProcessError, run
 import numpy as np
-import torch

 SAMPLE_RATE = 16000
 NORMALIZATION_FACTOR = 32768.0
@@ -31,38 +18,25 @@ NORMALIZATION_FACTOR = 32768.0

 class AudioProcessor:
    """
-    Audio Processor class that leverages PyTorchaudio to provide functionalities
-    for loading, cutting, and handling audio waveforms.
+    Lightweight audio processor for loading and cutting audio.

    Attributes:
-        waveform: torch.Tensor
-            The audio waveform tensor.
-        sr: int
-            The sample rate of the audio.
+        waveform (np.ndarray): The audio waveform as float32.
+        sr (int): The sample rate of the audio.
    """

-    def __init__(self, waveform: torch.Tensor,
-                 sr: int = SAMPLE_RATE) -> None:
-        """
-        Initialize the AudioProcessor object.
-
-        Args:
-            waveform (torch.Tensor): The audio waveform tensor.
-            sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.
-
-        Raises:
-            ValueError: If the provided sample rate is not of type int.
-        """
-
+    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        self.waveform = waveform
        self.sr = sr

        if not isinstance(self.sr, int):
-            raise ValueError("Sample rate should be a single value of type int,"
-                             f"not {len(self.sr)} and type {type(self.sr)}")
+            raise ValueError(
+                "Sample rate should be a single value of type int, "
+                f"not {self.sr!r} and type {type(self.sr)}"  # repr, not len(): len() raises TypeError on float/None
+            )

    @classmethod
-    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
+    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.

@@ -70,55 +44,42 @@ class AudioProcessor:
            file (str): The audio file path.

        Returns:
-            AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.
+            AudioProcessor: Instance with loaded audio.
        """
-
        audio, sr = cls.load_audio(file, *args, **kwargs)
-
-        audio = torch.from_numpy(audio)
-
        return cls(audio, sr)

-    def cut(self, start: float, end: float) -> torch.Tensor:
+    def cut(self, start: float, end: float) -> np.ndarray:
        """
-        Cut a segment from the audio waveform between the specified start and end times.
+        Cut a segment from the audio waveform.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
-            torch.Tensor: The cut waveform segment.
+            np.ndarray: The cut waveform segment.
        """
-
-        start = int(start * self.sr)
-        if (isinstance(end, float) or isinstance(end, int)) and isinstance(self.sr, int):
-            end = int(np.ceil(end * self.sr))
-        else:
-            end = int(torch.ceil(end * self.sr))
-        return self.waveform[start:end]
+        start_idx = int(start * self.sr)
+        end_idx = int(np.ceil(end * self.sr))
+        return self.waveform[start_idx:end_idx]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
-        Open an audio file and read it as a mono waveform, resampling if necessary.
-        This method ensures compatibility with pyannote.audio
-        and requires the ffmpeg CLI in PATH.
+        Load an audio file as a mono waveform, resampling if necessary.
+        Requires ffmpeg in PATH.

        Args:
            file (str): The audio file to open.
-            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
+            sr (int, optional): The desired sample rate.

        Returns:
-            tuple: A NumPy array containing the audio waveform in float32 dtype
-                    and the sample rate.
+            tuple: (waveform as np.ndarray[float32], sample rate)

        Raises:
            RuntimeError: If failed to load audio.
        """
-        # This launches a subprocess to decode audio while down-mixing
-        # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
-        # fmt: off
        cmd = [
            "ffmpeg",
            "-nostdin",
@@ -128,19 +89,20 @@ class AudioProcessor:
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
-            "-"
+            "-",
        ]
-        # fmt: on
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            raise RuntimeError(
-                f"Failed to load audio: {e.stderr.decode()}") from e
+                f"Failed to load audio: {e.stderr.decode()}"
+            ) from e

-        out = np.frombuffer(out, np.int16).flatten().astype(
-            np.float32) / NORMALIZATION_FACTOR
+        waveform = np.frombuffer(out, np.int16).flatten().astype(
+            np.float32
+        ) / NORMALIZATION_FACTOR
+
+        return waveform, sr

-        return out, sr
-    
    def __repr__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"