import os from warnings import warn import torch from pydub import AudioSegment from torchaudio import load, save class AudioProcessor: def __init__(self, audio_file:str): self.audio = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1]) self.audio_file_path = audio_file self.waveform = self.pydub_to_tensor[0] self.sr = self.pydub_to_tensor[1] @property def pydub_to_tensor(self): """ Converts pydub audio segment into np.float32 of shape [duration_in_seconds*sample_rate, channels], where each value is in range [-1.0, 1.0]. Returns tuple (audio_np_array, sample_rate). """ audio = self.audio x = torch.Tensor(audio.get_array_of_samples() ).reshape((-1, audio.channels)) y = (1 << (8 * audio.sample_width - 1)) return x / y, audio.frame_rate def convert_audio(self, path: str, remove_orginal: bool = False, *args, **kwargs) -> None: """ Convert and saves video file or other audio files to a different file type, Can be used to ensure that the audio file is in the correct format for the Whisper model. :param path : path to save file :param remove_orginal: remove original file :param args: arguments for pydub.AudioSegment.export :param kwargs: keyword arguments for pydub.AudioSegment.export e.g. format :return: None """ self.audio.export(path, *args, **kwargs) if remove_orginal: os.remove(self.audio_file_path) print(f'File {self.audio_file_path} removed') self.audio_file_path = path def to_mp3(self, *args, **kwargs) -> None: """ Convert audio file to mp3 file :param file: audio file :param remove_orginal: remove original file :return: mp3 file path """ warn(DeprecationWarning, "This function is deprecated," \ "please use convert_audio instead") if "mp3" not in kwargs["format"]: kwargs["format"] = "mp3" self.convert_audio(*args, **kwargs) def to_wav(self,*args, **kwargs) -> None: """ Convert audio file to wav file :param file: audio file :param remove_orginal: remove original file :return: wav file path """ warn(DeprecationWarning, "This function is deprecated," \ "please use convert_audio instead") if "wav" not in kwargs["format"]: kwargs["format"] = "wav" self.convert_audio(*args, **kwargs) def slower_mp3(self, path: str, speed: float = 0.75, type: str = "mp3") -> None: """ Slow down mp3 file :param file: mp3 file :param speed: speed :return: None """ sound = self.audio_file slow_sound = sound._spawn(sound.raw_data, overrides={ "frame_rate": int(sound.frame_rate * speed) }) slow_sound.export(path, format=type) return slow_sound class TorchAudioProcessor: """ Audio Processor using PyTorchaudio instead of PyDub """ def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None: """ Initialise audio processor :param waveform: waveform :param sr: sample rate """ self.waveform = waveform.reshape(-1) self.sr = sr if not isinstance(self.sr, int): raise ValueError("Sample rate should be a single value of type int," \ f"not {len(self.sr)} and type {type(self.sr)}") @classmethod def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': """ Load audio file :param file: audio file :return: AudioProcessor """ if not os.path.exists(file): raise FileNotFoundError(f'File {file} not found') if "format" not in kwargs: kwargs["format"] = file.split('.')[-1] audio, sr = load(file , *args, **kwargs) return cls(audio, sr) @classmethod def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': """ Initialise audio processor using pydub audio segment. pydub uses ffmped instead of SoX (which is used by torchaudio) :param file: audio file :return: TorchAudioProcessor """ audio = AudioProcessor(file) return cls(audio.waveform, audio.sr) @classmethod def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor': """ Initialise audio processor using pydub audio segment. :param audio_processor: AudioProcessor object :type audio_processor: AudioProcessor :return: TorchAudioProcessor :rtype: TorchAudioProcessor """ return cls(audio_processor.waveform, audio_processor.sr) def cut(self, start: float, end: float) -> torch.Tensor: """ Cut audio file :param start: start time in seconds :param end: end time in seconds :return: AudioProcessor """ if isinstance(start, float): start = torch.Tensor([start]) if isinstance(end, float): end = torch.Tensor([end]) sr = torch.Tensor([self.sr]) start = int(start * sr) end = torch.ceil(end * sr) return self.waveform[start:end.to(int)] def save(self, path: str, *args, **kwargs) -> None: """ Save audio file :param path: path to save file :return: None """ if "format" not in kwargs: kwargs["format"] = path.split('.')[-1] save(path, self.waveform, self.sr, *args, **kwargs) def __repr__(self) -> str: return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' def __str__(self) -> str: return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'