Initial commit: LocalAI-backed ScrAIbe with summarization
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-13 16:38:59 +00:00
parent 46d119b63b
commit 574124558b
10 changed files with 992 additions and 594 deletions
+33 -71
View File
@@ -2,28 +2,15 @@
Audio Processor Module
=======================
This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
flexible audio processing.
Simplified audio processor for ScrAIbe.
Available Classes:
- AudioProcessor: Processes audio waveforms and provides methods for loading,
cutting, and handling audio.
Usage:
from .audio_import AudioProcessor
processor = AudioProcessor.from_file("path/to/audiofile.wav")
cut_waveform = processor.cut(start=1.0, end=5.0)
Constants:
- SAMPLE_RATE (int): Default sample rate for processing.
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
Previously this used torch and pyannote-style processing. In the LocalAI-backed
version, we primarily pass files to the API, but we keep a lightweight helper
for backward compatibility.
"""
from subprocess import CalledProcessError, run
import numpy as np
import torch
SAMPLE_RATE = 16000
NORMALIZATION_FACTOR = 32768.0
@@ -31,38 +18,25 @@ NORMALIZATION_FACTOR = 32768.0
class AudioProcessor:
"""
Audio Processor class that leverages PyTorchaudio to provide functionalities
for loading, cutting, and handling audio waveforms.
Lightweight audio processor for loading and cutting audio.
Attributes:
waveform: torch.Tensor
The audio waveform tensor.
sr: int
The sample rate of the audio.
waveform (np.ndarray): The audio waveform as float32.
sr (int): The sample rate of the audio.
"""
def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE) -> None:
    """
    Initialize the AudioProcessor object.

    Args:
        waveform (np.ndarray): The audio waveform samples.
        sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.

    Raises:
        ValueError: If the provided sample rate is not of type int.
    """
    self.waveform = waveform
    self.sr = sr

    if not isinstance(self.sr, int):
        # Describe the offending value via repr()/type name only: calling
        # len() on it would raise TypeError for unsized values such as a
        # float and hide the intended ValueError.
        raise ValueError(
            "Sample rate should be a single value of type int, "
            f"not {self.sr!r} of type {type(self.sr).__name__}"
        )
@classmethod
def from_file(cls, file: str, *args, **kwargs) -> "AudioProcessor":
    """
    Create an AudioProcessor instance from an audio file.

    Args:
        file (str): The audio file path.
        *args: Extra positional arguments forwarded to ``load_audio``.
        **kwargs: Extra keyword arguments forwarded to ``load_audio``.

    Returns:
        AudioProcessor: Instance with loaded audio.
    """
    # load_audio already yields a NumPy array, so the waveform is passed on
    # as-is (no tensor conversion).
    waveform, sample_rate = cls.load_audio(file, *args, **kwargs)
    return cls(waveform, sample_rate)
def cut(self, start: float, end: float) -> np.ndarray:
    """
    Cut a segment from the audio waveform.

    The start time is truncated to a sample index and the end time is
    rounded up, so the returned slice covers every sample that overlaps
    the requested interval. Negative times are clamped to the beginning
    of the waveform.

    Args:
        start (float): Start time in seconds.
        end (float): End time in seconds.

    Returns:
        np.ndarray: The cut waveform segment.
    """
    # Clamp at 0: a negative index would otherwise be interpreted by the
    # slice as an offset from the END of the waveform.
    start_idx = max(0, int(start * self.sr))
    end_idx = max(0, int(np.ceil(end * self.sr)))
    return self.waveform[start_idx:end_idx]
@staticmethod
def load_audio(file: str, sr: int = SAMPLE_RATE):
"""
Open an audio file and read it as a mono waveform, resampling if necessary.
This method ensures compatibility with pyannote.audio
and requires the ffmpeg CLI in PATH.
Load an audio file as a mono waveform, resampling if necessary.
Requires ffmpeg in PATH.
Args:
file (str): The audio file to open.
sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
sr (int, optional): The desired sample rate.
Returns:
tuple: A NumPy array containing the audio waveform in float32 dtype
and the sample rate.
tuple: (waveform as np.ndarray[float32], sample rate)
Raises:
RuntimeError: If failed to load audio.
"""
# This launches a subprocess to decode audio while down-mixing
# and resampling as necessary. Requires the ffmpeg CLI in PATH.
# fmt: off
cmd = [
"ffmpeg",
"-nostdin",
@@ -128,19 +89,20 @@ class AudioProcessor:
"-ac", "1",
"-acodec", "pcm_s16le",
"-ar", str(sr),
"-"
"-",
]
# fmt: on
try:
out = run(cmd, capture_output=True, check=True).stdout
except CalledProcessError as e:
raise RuntimeError(
f"Failed to load audio: {e.stderr.decode()}") from e
f"Failed to load audio: {e.stderr.decode()}"
) from e
out = np.frombuffer(out, np.int16).flatten().astype(
np.float32) / NORMALIZATION_FACTOR
waveform = np.frombuffer(out, np.int16).flatten().astype(
np.float32
) / NORMALIZATION_FACTOR
return waveform, sr
return out, sr
def __repr__(self) -> str:
    """Return a short description showing the waveform length and sample rate."""
    return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"