Removed pydub and switched to ffmpeg to reduce dependencies.

Dropped pydub functionality to focus on core components instead.
This commit is contained in:
Jaikinator
2023-06-16 11:28:55 +02:00
parent 07acbc9464
commit edd6a0104c
+49 -141
View File
@@ -1,109 +1,13 @@
import os import os
from warnings import warn from warnings import warn
import numpy as np
import torch import torch
from pydub import AudioSegment import ffmpeg
from torchaudio import load, save
SAMPLE_RATE = 16000
class AudioProcessor: class AudioProcessor:
def __init__(self, audio_file:str):
self.audio = AudioSegment.from_file(audio_file,
format=audio_file.split('.')[-1])
self.audio_file_path = audio_file
self.waveform = self.pydub_to_tensor[0]
self.sr = self.pydub_to_tensor[1]
@property
def pydub_to_tensor(self):
"""
Converts pydub audio segment into np.float32 of shape
[duration_in_seconds*sample_rate, channels],
where each value is in range [-1.0, 1.0].
Returns tuple (audio_np_array, sample_rate).
"""
audio = self.audio
x = torch.Tensor(audio.get_array_of_samples()
).reshape((-1, audio.channels))
y = (1 << (8 * audio.sample_width - 1))
return x / y, audio.frame_rate
def convert_audio(self, path: str, remove_orginal: bool = False,
*args, **kwargs) -> None:
"""
Convert and saves video file or other audio files to a different file type,
Can be used to ensure that the audio file is in the correct format
for the Whisper model.
:param path : path to save file
:param remove_orginal: remove original file
:param args: arguments for pydub.AudioSegment.export
:param kwargs: keyword arguments for pydub.AudioSegment.export
e.g. format
:return: None
"""
self.audio.export(path, *args, **kwargs)
if remove_orginal:
os.remove(self.audio_file_path)
print(f'File {self.audio_file_path} removed')
self.audio_file_path = path
def to_mp3(self, *args, **kwargs) -> None:
"""
Convert audio file to mp3 file
:param file: audio file
:param remove_orginal: remove original file
:return: mp3 file path
"""
warn(DeprecationWarning, "This function is deprecated," \
"please use convert_audio instead")
if "mp3" not in kwargs["format"]:
kwargs["format"] = "mp3"
self.convert_audio(*args, **kwargs)
def to_wav(self,*args, **kwargs) -> None:
"""
Convert audio file to wav file
:param file: audio file
:param remove_orginal: remove original file
:return: wav file path
"""
warn(DeprecationWarning, "This function is deprecated," \
"please use convert_audio instead")
if "wav" not in kwargs["format"]:
kwargs["format"] = "wav"
self.convert_audio(*args, **kwargs)
def slower_mp3(self, path: str,
speed: float = 0.75,
type: str = "mp3") -> None:
"""
Slow down mp3 file
:param file: mp3 file
:param speed: speed
:return: None
"""
sound = self.audio_file
slow_sound = sound._spawn(sound.raw_data, overrides={
"frame_rate": int(sound.frame_rate * speed)
})
slow_sound.export(path, format=type)
return slow_sound
class TorchAudioProcessor:
""" """
Audio Processor using PyTorchaudio instead of PyDub Audio Processor using PyTorchaudio instead of PyDub
""" """
@@ -114,54 +18,27 @@ class TorchAudioProcessor:
:param waveform: waveform :param waveform: waveform
:param sr: sample rate :param sr: sample rate
""" """
self.waveform = waveform.reshape(-1) self.waveform = waveform
self.sr = sr self.sr = sr
if not isinstance(self.sr, int): if not isinstance(self.sr, int):
raise ValueError("Sample rate should be a single value of type int," \ raise ValueError("Sample rate should be a single value of type int," \
f"not {len(self.sr)} and type {type(self.sr)}") f"not {len(self.sr)} and type {type(self.sr)}")
@classmethod @classmethod
def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
""" """
Load audio file Load audio file
:param file: audio file :param file: audio file
:return: AudioProcessor :return: AudioProcessor
""" """
if not os.path.exists(file):
raise FileNotFoundError(f'File {file} not found')
if "format" not in kwargs: audio, sr = cls.load_audio(file , *args, **kwargs)
kwargs["format"] = file.split('.')[-1]
audio = torch.from_numpy(audio)
audio, sr = load(file , *args, **kwargs)
return cls(audio, sr) return cls(audio, sr)
@classmethod
def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
"""
Initialise audio processor using pydub audio segment.
pydub uses ffmped instead of SoX (which is used by torchaudio)
:param file: audio file
:return: TorchAudioProcessor
"""
audio = AudioProcessor(file)
return cls(audio.waveform, audio.sr)
@classmethod
def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor':
"""
Initialise audio processor using pydub audio segment.
:param audio_processor: AudioProcessor object
:type audio_processor: AudioProcessor
:return: TorchAudioProcessor
:rtype: TorchAudioProcessor
"""
return cls(audio_processor.waveform, audio_processor.sr)
def cut(self, start: float, end: float) -> torch.Tensor: def cut(self, start: float, end: float) -> torch.Tensor:
""" """
@@ -182,21 +59,52 @@ class TorchAudioProcessor:
end = torch.ceil(end * sr) end = torch.ceil(end * sr)
return self.waveform[start:end.to(int)] return self.waveform[start:end.to(int)]
def save(self, path: str, *args, **kwargs) -> None: @staticmethod
def load_audio(file: str, sr: int = SAMPLE_RATE):
""" """
Save audio file Open an audio file and read as mono waveform, resampling as necessary
:param path: path to save file
:return: None Changed from original function at whisper.audio.load_audio to ensure compatibility
with pyannote.audio
Parameters
----------
file: str
The audio file to open
sr: int
The sample rate to resample the audio if necessary
Returns
-------
A NumPy array containing the audio waveform, in float32 dtype.
""" """
if "format" not in kwargs: try:
kwargs["format"] = path.split('.')[-1] # This launches a subprocess to decode audio while down-mixing
# and resampling as necessary.
save(path, self.waveform, self.sr, *args, **kwargs) # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
out, _ = (
ffmpeg.input(file, threads=0)
.output("-", format="s16le", acodec="pcm_s16le",
ac=1, ar=sr)
.run(cmd=["ffmpeg", "-nostdin"],
capture_stdout=True, capture_stderr=True)
)
except ffmpeg.Error as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
return out , sr
def __repr__(self) -> str: def __repr__(self) -> str:
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
def __str__(self) -> str: def __str__(self) -> str:
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
if __name__ == "__main__":
print("Testing AudioProcessor")
print(AudioProcessor.from_file("tests/test.wav"))