Initial commit: LocalAI-backed ScrAIbe with summarization
This commit is contained in:
+33
-71
@@ -2,28 +2,15 @@
|
||||
Audio Processor Module
|
||||
=======================
|
||||
|
||||
This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
|
||||
It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
|
||||
flexible audio processing.
|
||||
Simplified audio processor for ScrAIbe.
|
||||
|
||||
Available Classes:
|
||||
- AudioProcessor: Processes audio waveforms and provides methods for loading,
|
||||
cutting, and handling audio.
|
||||
|
||||
Usage:
|
||||
from .audio_import AudioProcessor
|
||||
|
||||
processor = AudioProcessor.from_file("path/to/audiofile.wav")
|
||||
cut_waveform = processor.cut(start=1.0, end=5.0)
|
||||
|
||||
Constants:
|
||||
- SAMPLE_RATE (int): Default sample rate for processing.
|
||||
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
|
||||
Previously this used torch and pyannote-style processing. In the LocalAI-backed
|
||||
version, we primarily pass files to the API, but we keep a lightweight helper
|
||||
for backward compatibility.
|
||||
"""
|
||||
|
||||
from subprocess import CalledProcessError, run
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
SAMPLE_RATE = 16000
|
||||
NORMALIZATION_FACTOR = 32768.0
|
||||
@@ -31,38 +18,25 @@ NORMALIZATION_FACTOR = 32768.0
|
||||
|
||||
class AudioProcessor:
|
||||
"""
|
||||
Audio Processor class that leverages PyTorchaudio to provide functionalities
|
||||
for loading, cutting, and handling audio waveforms.
|
||||
Lightweight audio processor for loading and cutting audio.
|
||||
|
||||
Attributes:
|
||||
waveform: torch.Tensor
|
||||
The audio waveform tensor.
|
||||
sr: int
|
||||
The sample rate of the audio.
|
||||
waveform (np.ndarray): The audio waveform as float32.
|
||||
sr (int): The sample rate of the audio.
|
||||
"""
|
||||
|
||||
def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE) -> None:
    """
    Initialize the AudioProcessor object.

    Args:
        waveform (np.ndarray): The audio waveform.
        sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.

    Raises:
        ValueError: If the provided sample rate is not of type int.
    """
    self.waveform = waveform
    self.sr = sr

    if not isinstance(self.sr, int):
        # Do not call len() on the rejected value: most non-int inputs
        # (e.g. a float) have no length, and the resulting TypeError would
        # hide the ValueError this check is meant to raise.
        raise ValueError(
            "Sample rate should be a single value of type int, "
            f"not {self.sr!r} of type {type(self.sr)}"
        )
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
|
||||
def from_file(cls, file: str, *args, **kwargs):
|
||||
"""
|
||||
Create an AudioProcessor instance from an audio file.
|
||||
|
||||
@@ -70,55 +44,42 @@ class AudioProcessor:
|
||||
file (str): The audio file path.
|
||||
|
||||
Returns:
|
||||
AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.
|
||||
AudioProcessor: Instance with loaded audio.
|
||||
"""
|
||||
|
||||
audio, sr = cls.load_audio(file, *args, **kwargs)
|
||||
|
||||
audio = torch.from_numpy(audio)
|
||||
|
||||
return cls(audio, sr)
|
||||
|
||||
def cut(self, start: float, end: float) -> np.ndarray:
    """
    Cut a segment from the audio waveform.

    Args:
        start (float): Start time in seconds.
        end (float): End time in seconds.

    Returns:
        np.ndarray: The cut waveform segment. Empty when the requested
            span contains no samples.
    """
    # Truncate the start and round the end up so that the slice covers
    # every sample touched by the requested time span.
    # Clamp to zero: a negative index would otherwise wrap around and
    # silently return audio from the tail of the waveform.
    start_idx = max(int(start * self.sr), 0)
    # Never let the end fall before the start (covers negative end times).
    end_idx = max(int(np.ceil(end * self.sr)), start_idx)
    return self.waveform[start_idx:end_idx]
|
||||
|
||||
@staticmethod
|
||||
def load_audio(file: str, sr: int = SAMPLE_RATE):
|
||||
"""
|
||||
Open an audio file and read it as a mono waveform, resampling if necessary.
|
||||
This method ensures compatibility with pyannote.audio
|
||||
and requires the ffmpeg CLI in PATH.
|
||||
Load an audio file as a mono waveform, resampling if necessary.
|
||||
Requires ffmpeg in PATH.
|
||||
|
||||
Args:
|
||||
file (str): The audio file to open.
|
||||
sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
|
||||
sr (int, optional): The desired sample rate.
|
||||
|
||||
Returns:
|
||||
tuple: A NumPy array containing the audio waveform in float32 dtype
|
||||
and the sample rate.
|
||||
tuple: (waveform as np.ndarray[float32], sample rate)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If failed to load audio.
|
||||
"""
|
||||
# This launches a subprocess to decode audio while down-mixing
|
||||
# and resampling as necessary. Requires the ffmpeg CLI in PATH.
|
||||
# fmt: off
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-nostdin",
|
||||
@@ -128,19 +89,20 @@ class AudioProcessor:
|
||||
"-ac", "1",
|
||||
"-acodec", "pcm_s16le",
|
||||
"-ar", str(sr),
|
||||
"-"
|
||||
"-",
|
||||
]
|
||||
# fmt: on
|
||||
try:
|
||||
out = run(cmd, capture_output=True, check=True).stdout
|
||||
except CalledProcessError as e:
|
||||
raise RuntimeError(
|
||||
f"Failed to load audio: {e.stderr.decode()}") from e
|
||||
f"Failed to load audio: {e.stderr.decode()}"
|
||||
) from e
|
||||
|
||||
out = np.frombuffer(out, np.int16).flatten().astype(
|
||||
np.float32) / NORMALIZATION_FACTOR
|
||||
waveform = np.frombuffer(out, np.int16).flatten().astype(
|
||||
np.float32
|
||||
) / NORMALIZATION_FACTOR
|
||||
|
||||
return waveform, sr
|
||||
|
||||
return out, sr
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
||||
return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
|
||||
|
||||
Reference in New Issue
Block a user