147 lines
4.6 KiB
Python
147 lines
4.6 KiB
Python
"""
|
|
Audio Processor Module
|
|
=======================
|
|
|
|
This module provides the AudioProcessor class, which decodes audio files with the ffmpeg CLI and stores them as PyTorch tensors.
|
|
It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
|
|
flexible audio processing.
|
|
|
|
Available Classes:
|
|
- AudioProcessor: Processes audio waveforms and provides methods for loading,
|
|
cutting, and handling audio.
|
|
|
|
Usage:
|
|
from .audio_import import AudioProcessor
|
|
|
|
processor = AudioProcessor.from_file("path/to/audiofile.wav")
|
|
cut_waveform = processor.cut(start=1.0, end=5.0)
|
|
|
|
Constants:
|
|
- SAMPLE_RATE (int): Default sample rate for processing.
|
|
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
|
|
"""
|
|
|
|
from subprocess import CalledProcessError, run
|
|
import numpy as np
|
|
import torch
|
|
|
|
# Default sample rate: used as the default `sr` of AudioProcessor and as the
# default target rate passed to ffmpeg ("-ar") when loading audio.
SAMPLE_RATE = 16000
|
|
# Divisor applied to the decoded signed 16-bit PCM samples (2**15) when they
# are converted to float32 in AudioProcessor.load_audio.
NORMALIZATION_FACTOR = 32768.0
|
|
|
|
class AudioProcessor:
    """
    Hold an audio waveform as a torch tensor and provide helpers for
    loading and cutting it.

    Audio files are decoded by the ffmpeg CLI (see ``load_audio``), so ffmpeg
    must be available in PATH in order to use ``from_file``.

    Attributes:
        waveform: torch.Tensor
            The audio waveform tensor, moved to the selected device.
        sr: int
            The sample rate of the audio.
    """

    def __init__(self, waveform: torch.Tensor, sr: int = SAMPLE_RATE,
                 *args, **kwargs) -> None:
        """
        Initialize the AudioProcessor object.

        Args:
            waveform (torch.Tensor): The audio waveform tensor.
            sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.
            args: Additional positional arguments (accepted but unused).
            kwargs: Additional keyword arguments. Only ``device`` is read: the
                device the waveform is moved to. When it is missing or None,
                "cuda" is used if CUDA is available, otherwise "cpu".

        Raises:
            ValueError: If the provided sample rate is not of type int.
        """
        # Validate first so that a bad sample rate fails before the (possibly
        # expensive) device transfer. The message must not call len() on the
        # value: a float or None has no length and would raise TypeError
        # instead of the documented ValueError.
        if not isinstance(sr, int):
            raise ValueError(
                "Sample rate should be a single value of type int, "
                f"not {sr!r} of type {type(sr)}"
            )

        device = kwargs.get("device")
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        self.waveform = waveform.to(device)
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
        """
        Create an AudioProcessor instance from an audio file.

        Args:
            file (str): The audio file path.
            args: Positional arguments forwarded to ``load_audio`` (i.e. the
                desired sample rate).
            kwargs: ``sr`` is forwarded to ``load_audio``; every other keyword
                (e.g. ``device``) is forwarded to the constructor.

        Returns:
            AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.

        Raises:
            RuntimeError: If ffmpeg fails to decode the file.
        """
        # load_audio only understands `sr`; anything else (such as `device`)
        # belongs to the constructor and would make load_audio raise TypeError.
        load_kwargs = {}
        if "sr" in kwargs:
            load_kwargs["sr"] = kwargs.pop("sr")

        audio, sr = cls.load_audio(file, *args, **load_kwargs)
        return cls(torch.from_numpy(audio), sr, **kwargs)

    def cut(self, start: float, end: float) -> torch.Tensor:
        """
        Cut a segment from the audio waveform between the specified start and end times.

        The start index is truncated and the end index is rounded up, so the
        returned segment fully covers the requested time span.

        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.

        Returns:
            torch.Tensor: The cut waveform segment.
        """
        first = int(start * self.sr)
        # torch.ceil only accepts tensors, so it fails on a plain float;
        # float() + np.ceil handles Python numbers and 0-dim tensors alike.
        last = int(np.ceil(float(end) * self.sr))
        return self.waveform[first:last]

    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE) -> "tuple[np.ndarray, int]":
        """
        Open an audio file and read it as a mono waveform, resampling if necessary.
        This method requires the ffmpeg CLI in PATH.

        Args:
            file (str): The audio file to open.
            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.

        Returns:
            tuple: A NumPy array containing the audio waveform in float32 dtype
            and the sample rate.

        Raises:
            RuntimeError: If ffmpeg exits with a non-zero status.
        """
        # This launches a subprocess to decode audio while down-mixing
        # and resampling as necessary. Requires the ffmpeg CLI in PATH.
        # fmt: off
        cmd = [
            "ffmpeg",
            "-nostdin",
            "-threads", "0",
            "-i", file,
            "-f", "s16le",
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
            "-"
        ]
        # fmt: on
        try:
            raw = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            # errors="replace" keeps a non-UTF-8 ffmpeg message from masking
            # the real failure with a UnicodeDecodeError.
            message = e.stderr.decode(errors="replace")
            raise RuntimeError(f"Failed to load audio: {message}") from e

        # The raw bytes are signed 16-bit PCM; np.frombuffer already yields a
        # 1-D array, which is then scaled to float32.
        samples = np.frombuffer(raw, np.int16).astype(np.float32) / NORMALIZATION_FACTOR
        return samples, sr

    def __repr__(self) -> str:
        """Return a short description with the waveform length and sample rate."""
        return f'{type(self).__name__}(waveform={len(self.waveform)}, sr={int(self.sr)})'