Removed pydub and switched to ffmpeg to reduce dependencies.
Dropped pydub functionality to focus on core components instead.
This commit is contained in:
+45
-137
@@ -1,109 +1,13 @@
|
|||||||
import os
|
import os
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from pydub import AudioSegment
|
import ffmpeg
|
||||||
from torchaudio import load, save
|
|
||||||
|
|
||||||
|
SAMPLE_RATE = 16000
|
||||||
|
|
||||||
class AudioProcessor:
|
class AudioProcessor:
|
||||||
def __init__(self, audio_file:str):
|
|
||||||
|
|
||||||
self.audio = AudioSegment.from_file(audio_file,
|
|
||||||
format=audio_file.split('.')[-1])
|
|
||||||
self.audio_file_path = audio_file
|
|
||||||
self.waveform = self.pydub_to_tensor[0]
|
|
||||||
self.sr = self.pydub_to_tensor[1]
|
|
||||||
|
|
||||||
@property
|
|
||||||
def pydub_to_tensor(self):
|
|
||||||
"""
|
|
||||||
Converts pydub audio segment into np.float32 of shape
|
|
||||||
[duration_in_seconds*sample_rate, channels],
|
|
||||||
where each value is in range [-1.0, 1.0].
|
|
||||||
Returns tuple (audio_np_array, sample_rate).
|
|
||||||
"""
|
|
||||||
audio = self.audio
|
|
||||||
x = torch.Tensor(audio.get_array_of_samples()
|
|
||||||
).reshape((-1, audio.channels))
|
|
||||||
y = (1 << (8 * audio.sample_width - 1))
|
|
||||||
return x / y, audio.frame_rate
|
|
||||||
|
|
||||||
def convert_audio(self, path: str, remove_orginal: bool = False,
|
|
||||||
*args, **kwargs) -> None:
|
|
||||||
"""
|
|
||||||
Convert and saves video file or other audio files to a different file type,
|
|
||||||
Can be used to ensure that the audio file is in the correct format
|
|
||||||
for the Whisper model.
|
|
||||||
:param path : path to save file
|
|
||||||
:param remove_orginal: remove original file
|
|
||||||
:param args: arguments for pydub.AudioSegment.export
|
|
||||||
:param kwargs: keyword arguments for pydub.AudioSegment.export
|
|
||||||
e.g. format
|
|
||||||
:return: None
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.audio.export(path, *args, **kwargs)
|
|
||||||
|
|
||||||
if remove_orginal:
|
|
||||||
os.remove(self.audio_file_path)
|
|
||||||
print(f'File {self.audio_file_path} removed')
|
|
||||||
|
|
||||||
self.audio_file_path = path
|
|
||||||
|
|
||||||
|
|
||||||
def to_mp3(self, *args, **kwargs) -> None:
|
|
||||||
"""
|
|
||||||
Convert audio file to mp3 file
|
|
||||||
:param file: audio file
|
|
||||||
:param remove_orginal: remove original file
|
|
||||||
:return: mp3 file path
|
|
||||||
"""
|
|
||||||
|
|
||||||
warn(DeprecationWarning, "This function is deprecated," \
|
|
||||||
"please use convert_audio instead")
|
|
||||||
|
|
||||||
if "mp3" not in kwargs["format"]:
|
|
||||||
kwargs["format"] = "mp3"
|
|
||||||
|
|
||||||
self.convert_audio(*args, **kwargs)
|
|
||||||
|
|
||||||
def to_wav(self,*args, **kwargs) -> None:
|
|
||||||
"""
|
|
||||||
Convert audio file to wav file
|
|
||||||
:param file: audio file
|
|
||||||
:param remove_orginal: remove original file
|
|
||||||
:return: wav file path
|
|
||||||
"""
|
|
||||||
warn(DeprecationWarning, "This function is deprecated," \
|
|
||||||
"please use convert_audio instead")
|
|
||||||
|
|
||||||
if "wav" not in kwargs["format"]:
|
|
||||||
kwargs["format"] = "wav"
|
|
||||||
|
|
||||||
self.convert_audio(*args, **kwargs)
|
|
||||||
|
|
||||||
def slower_mp3(self, path: str,
|
|
||||||
speed: float = 0.75,
|
|
||||||
type: str = "mp3") -> None:
|
|
||||||
"""
|
|
||||||
Slow down mp3 file
|
|
||||||
:param file: mp3 file
|
|
||||||
:param speed: speed
|
|
||||||
:return: None
|
|
||||||
"""
|
|
||||||
|
|
||||||
sound = self.audio_file
|
|
||||||
slow_sound = sound._spawn(sound.raw_data, overrides={
|
|
||||||
"frame_rate": int(sound.frame_rate * speed)
|
|
||||||
})
|
|
||||||
|
|
||||||
slow_sound.export(path, format=type)
|
|
||||||
|
|
||||||
return slow_sound
|
|
||||||
|
|
||||||
|
|
||||||
class TorchAudioProcessor:
|
|
||||||
"""
|
"""
|
||||||
Audio Processor using PyTorchaudio instead of PyDub
|
Audio Processor using PyTorchaudio instead of PyDub
|
||||||
"""
|
"""
|
||||||
@@ -114,54 +18,27 @@ class TorchAudioProcessor:
|
|||||||
:param waveform: waveform
|
:param waveform: waveform
|
||||||
:param sr: sample rate
|
:param sr: sample rate
|
||||||
"""
|
"""
|
||||||
self.waveform = waveform.reshape(-1)
|
self.waveform = waveform
|
||||||
self.sr = sr
|
self.sr = sr
|
||||||
|
|
||||||
if not isinstance(self.sr, int):
|
if not isinstance(self.sr, int):
|
||||||
raise ValueError("Sample rate should be a single value of type int," \
|
raise ValueError("Sample rate should be a single value of type int," \
|
||||||
f"not {len(self.sr)} and type {type(self.sr)}")
|
f"not {len(self.sr)} and type {type(self.sr)}")
|
||||||
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
|
def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
|
||||||
"""
|
"""
|
||||||
Load audio file
|
Load audio file
|
||||||
:param file: audio file
|
:param file: audio file
|
||||||
:return: AudioProcessor
|
:return: AudioProcessor
|
||||||
"""
|
"""
|
||||||
if not os.path.exists(file):
|
|
||||||
raise FileNotFoundError(f'File {file} not found')
|
|
||||||
|
|
||||||
if "format" not in kwargs:
|
audio, sr = cls.load_audio(file , *args, **kwargs)
|
||||||
kwargs["format"] = file.split('.')[-1]
|
|
||||||
|
|
||||||
audio, sr = load(file , *args, **kwargs)
|
audio = torch.from_numpy(audio)
|
||||||
|
|
||||||
return cls(audio, sr)
|
return cls(audio, sr)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
|
|
||||||
"""
|
|
||||||
Initialise audio processor using pydub audio segment.
|
|
||||||
pydub uses ffmped instead of SoX (which is used by torchaudio)
|
|
||||||
:param file: audio file
|
|
||||||
:return: TorchAudioProcessor
|
|
||||||
"""
|
|
||||||
audio = AudioProcessor(file)
|
|
||||||
|
|
||||||
return cls(audio.waveform, audio.sr)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor':
|
|
||||||
"""
|
|
||||||
Initialise audio processor using pydub audio segment.
|
|
||||||
|
|
||||||
:param audio_processor: AudioProcessor object
|
|
||||||
:type audio_processor: AudioProcessor
|
|
||||||
:return: TorchAudioProcessor
|
|
||||||
:rtype: TorchAudioProcessor
|
|
||||||
"""
|
|
||||||
return cls(audio_processor.waveform, audio_processor.sr)
|
|
||||||
|
|
||||||
def cut(self, start: float, end: float) -> torch.Tensor:
|
def cut(self, start: float, end: float) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
@@ -183,20 +60,51 @@ class TorchAudioProcessor:
|
|||||||
|
|
||||||
return self.waveform[start:end.to(int)]
|
return self.waveform[start:end.to(int)]
|
||||||
|
|
||||||
def save(self, path: str, *args, **kwargs) -> None:
|
@staticmethod
|
||||||
|
def load_audio(file: str, sr: int = SAMPLE_RATE):
|
||||||
"""
|
"""
|
||||||
Save audio file
|
Open an audio file and read as mono waveform, resampling as necessary
|
||||||
:param path: path to save file
|
|
||||||
:return: None
|
Changed from original function at whisper.audio.load_audio to ensure compatibility
|
||||||
|
with pyannote.audio
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
file: str
|
||||||
|
The audio file to open
|
||||||
|
|
||||||
|
sr: int
|
||||||
|
The sample rate to resample the audio if necessary
|
||||||
|
|
||||||
|
Returns
|
||||||
|
-------
|
||||||
|
A NumPy array containing the audio waveform, in float32 dtype.
|
||||||
"""
|
"""
|
||||||
if "format" not in kwargs:
|
try:
|
||||||
kwargs["format"] = path.split('.')[-1]
|
# This launches a subprocess to decode audio while down-mixing
|
||||||
|
# and resampling as necessary.
|
||||||
|
# Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
|
||||||
|
out, _ = (
|
||||||
|
ffmpeg.input(file, threads=0)
|
||||||
|
.output("-", format="s16le", acodec="pcm_s16le",
|
||||||
|
ac=1, ar=sr)
|
||||||
|
.run(cmd=["ffmpeg", "-nostdin"],
|
||||||
|
capture_stdout=True, capture_stderr=True)
|
||||||
|
)
|
||||||
|
except ffmpeg.Error as e:
|
||||||
|
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
|
||||||
|
|
||||||
save(path, self.waveform, self.sr, *args, **kwargs)
|
out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
|
||||||
|
|
||||||
|
return out , sr
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
||||||
|
|
||||||
def __str__(self) -> str:
|
def __str__(self) -> str:
|
||||||
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
print("Testing AudioProcessor")
|
||||||
|
print(AudioProcessor.from_file("tests/test.wav"))
|
||||||
Reference in New Issue
Block a user