# Reconstructed from the patch "audio processing" (commit 854469fb,
# Jaikinator, 2023-06-14), which creates autotranscript/audio.py.
"""Audio loading, conversion and slicing helpers built on pydub and torchaudio."""

import math
import os
from typing import Optional
from warnings import warn

import torch
from pydub import AudioSegment
from torchaudio import load, save


def _file_extension(path: str) -> Optional[str]:
    """Return the extension of *path* without the leading dot, or None.

    ``os.path.splitext`` is used instead of ``path.split('.')[-1]`` so that
    paths without an extension, or with a dot in a directory name, do not
    produce a bogus format string.
    """
    extension = os.path.splitext(path)[1]
    return extension[1:] or None


class AudioProcessor:
    """
    Audio processor backed by a pydub ``AudioSegment``.

    Attributes set by ``__init__``:
        audio: the loaded ``AudioSegment``
        audio_file_path: path of the file currently associated with the audio
        waveform: samples as a tensor (see ``pydub_to_tensor``)
        sr: frame rate reported by pydub
    """

    def __init__(self, audio_file: str):
        """
        Load an audio file with pydub.

        :param audio_file: path to the audio (or video) file; the file
                           extension, if any, is passed to pydub as the format
        """
        self.audio = AudioSegment.from_file(audio_file,
                                            format=_file_extension(audio_file))
        self.audio_file_path = audio_file
        # Evaluate the property once: every access re-converts all samples.
        self.waveform, self.sr = self.pydub_to_tensor

    @property
    def pydub_to_tensor(self):
        """
        Convert the pydub audio segment into a torch tensor of shape
        [number_of_frames, channels], scaled by the maximum magnitude of the
        segment's sample width.

        :return: tuple (waveform_tensor, frame_rate)
        """
        audio = self.audio
        samples = torch.Tensor(audio.get_array_of_samples()
                               ).reshape((-1, audio.channels))
        # 2 ** (bits_per_sample - 1): full-scale value of a signed sample.
        full_scale = 1 << (8 * audio.sample_width - 1)
        return samples / full_scale, audio.frame_rate

    def convert_audio(self, path: str, remove_orginal: bool = False,
                      *args, **kwargs) -> None:
        """
        Convert and save a video file or other audio file to a different
        file type. Can be used to ensure that the audio file is in the
        correct format for the Whisper model.

        :param path: path to save file
        :param remove_orginal: remove the original file after exporting
        :param args: arguments for pydub.AudioSegment.export
        :param kwargs: keyword arguments for pydub.AudioSegment.export,
                       e.g. format
        :return: None
        """
        self.audio.export(path, *args, **kwargs)

        if remove_orginal:
            source = self.audio_file_path
            # If the export overwrote the original in place, deleting it now
            # would destroy the file that was just written.
            if os.path.abspath(path) != os.path.abspath(source):
                os.remove(source)
                print(f'File {source} removed')

        self.audio_file_path = path

    def to_mp3(self, *args, **kwargs) -> None:
        """
        Convert the audio to an mp3 file (deprecated, use convert_audio).

        :param args: positional arguments for convert_audio (path first)
        :param kwargs: keyword arguments for convert_audio; ``format`` is
                       always set to "mp3"
        :return: None
        """
        warn("This function is deprecated, "
             "please use convert_audio instead",
             DeprecationWarning, stacklevel=2)

        kwargs["format"] = "mp3"

        self.convert_audio(*args, **kwargs)

    def to_wav(self, *args, **kwargs) -> None:
        """
        Convert the audio to a wav file (deprecated, use convert_audio).

        :param args: positional arguments for convert_audio (path first)
        :param kwargs: keyword arguments for convert_audio; ``format`` is
                       always set to "wav"
        :return: None
        """
        warn("This function is deprecated, "
             "please use convert_audio instead",
             DeprecationWarning, stacklevel=2)

        kwargs["format"] = "wav"

        self.convert_audio(*args, **kwargs)

    def slower_mp3(self, path: str,
                   speed: float = 0.75,
                   type: str = "mp3") -> AudioSegment:
        """
        Export a copy of the audio whose frame rate is scaled by ``speed``.

        The raw sample data is reused unchanged; only the frame rate of the
        new segment differs.

        :param path: path to save the resulting file
        :param speed: factor applied to the frame rate
        :param type: export format passed to pydub
        :return: the new audio segment
        """
        sound = self.audio
        slow_sound = sound._spawn(sound.raw_data, overrides={
            "frame_rate": int(sound.frame_rate * speed)
        })

        slow_sound.export(path, format=type)

        return slow_sound


class TorchAudioProcessor:
    """
    Audio Processor using PyTorchaudio instead of PyDub
    """

    def __init__(self, waveform: torch.Tensor, sr: int) -> None:
        """
        Initialise audio processor

        :param waveform: waveform; it is flattened to a 1-D tensor, so the
                         samples of all channels end up in a single dimension
        :param sr: sample rate
        :raises ValueError: if ``sr`` is not an int
        """
        if not isinstance(sr, int):
            raise ValueError("Sample rate should be a single value of type int, "
                             f"not type {type(sr)}")

        self.waveform = waveform.reshape(-1)
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
        """
        Load audio file with torchaudio.

        :param file: audio file
        :param args: arguments for torchaudio.load
        :param kwargs: keyword arguments for torchaudio.load; ``format``
                       defaults to the file extension when there is one
        :return: TorchAudioProcessor
        :raises FileNotFoundError: if ``file`` does not exist
        """
        if not os.path.exists(file):
            raise FileNotFoundError(f'File {file} not found')

        if "format" not in kwargs:
            extension = _file_extension(file)
            if extension is not None:
                kwargs["format"] = extension

        audio, sr = load(file, *args, **kwargs)

        return cls(audio, sr)

    @classmethod
    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
        """
        Initialise audio processor using a pydub audio segment.
        pydub uses ffmpeg instead of SoX (which is used by torchaudio).

        :param file: audio file
        :param args: accepted for signature compatibility; currently unused
        :param kwargs: accepted for signature compatibility; currently unused
        :return: TorchAudioProcessor
        """
        audio = AudioProcessor(file)

        return cls(audio.waveform, audio.sr)

    @classmethod
    def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor':
        """
        Initialise audio processor from an existing AudioProcessor.

        :param audio_processor: AudioProcessor object
        :type audio_processor: AudioProcessor
        :return: TorchAudioProcessor
        :rtype: TorchAudioProcessor
        """
        return cls(audio_processor.waveform, audio_processor.sr)

    def cut(self, start: float, end: float) -> torch.Tensor:
        """
        Return the part of the waveform between two points in time.

        :param start: start time in seconds
        :param end: end time in seconds
        :return: slice of the 1-D waveform tensor
        """
        # Plain Python floats (double precision) are used for the index
        # arithmetic; float32 tensors cannot represent sample positions
        # above 2**24 exactly. float() also accepts single-element tensors.
        first_sample = int(float(start) * self.sr)
        last_sample = math.ceil(float(end) * self.sr)

        return self.waveform[first_sample:last_sample]

    def save(self, path: str, *args, **kwargs) -> None:
        """
        Save audio file with torchaudio.

        :param path: path to save file
        :param args: arguments for torchaudio.save
        :param kwargs: keyword arguments for torchaudio.save; ``format``
                       defaults to the extension of ``path`` when there is one
        :return: None
        """
        if "format" not in kwargs:
            extension = _file_extension(path)
            if extension is not None:
                kwargs["format"] = extension

        waveform = self.waveform
        if waveform.ndim == 1:
            # torchaudio.save requires a 2-D tensor; the stored waveform is
            # flattened in __init__, so add a leading dimension of size one.
            waveform = waveform.unsqueeze(0)

        save(path, waveform, self.sr, *args, **kwargs)

    def __repr__(self) -> str:
        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'

    def __str__(self) -> str:
        return repr(self)