Files
scribe/autotranscript/audio.py
T
2023-06-14 16:30:57 +02:00

203 lines
6.3 KiB
Python

import math
import os
from warnings import warn

import torch
from pydub import AudioSegment
from torchaudio import load, save
class AudioProcessor:
    """
    Audio processor built on top of pydub.

    Loads an audio file into a ``pydub.AudioSegment`` and keeps a scaled
    torch tensor of the samples together with the sample rate.
    """

    def __init__(self, audio_file: str):
        """
        Load an audio file and cache its tensor representation.
        :param audio_file: path to the audio file
        """
        # Use the file extension as format hint. When the path has no
        # extension pass None so that pydub does not receive a bogus format.
        extension = os.path.splitext(audio_file)[1].lstrip('.')
        self.audio = AudioSegment.from_file(audio_file,
                                            format=extension or None)
        self.audio_file_path = audio_file
        # Evaluate the property only once; it converts the whole segment.
        self.waveform, self.sr = self.pydub_to_tensor

    @property
    def pydub_to_tensor(self) -> 'tuple[torch.Tensor, int]':
        """
        Converts the pydub audio segment into a torch tensor of shape
        [number_of_frames, channels].
        The samples are divided by the full-scale value of the segment's
        sample width (2 ** (8 * sample_width - 1)), so signed integer
        samples end up in the range [-1.0, 1.0).
        :return: tuple (waveform_tensor, sample_rate)
        """
        audio = self.audio
        samples = torch.Tensor(audio.get_array_of_samples()
                               ).reshape((-1, audio.channels))
        full_scale = 1 << (8 * audio.sample_width - 1)
        return samples / full_scale, audio.frame_rate

    def convert_audio(self, path: str, remove_orginal: bool = False,
                      *args, **kwargs) -> None:
        """
        Convert and save the loaded audio to a different file / file type.
        Can be used to ensure that the audio file is in the correct format
        for the Whisper model.
        :param path: path to save file
        :param remove_orginal: remove original file after exporting
        :param args: arguments for pydub.AudioSegment.export
        :param kwargs: keyword arguments for pydub.AudioSegment.export
                    e.g. format
        :return: None
        """
        self.audio.export(path, *args, **kwargs)

        if remove_orginal:
            same_file = (os.path.abspath(path)
                         == os.path.abspath(self.audio_file_path))
            # When exporting in place the "original" is the file that was
            # just written, so deleting it would throw away the result.
            if not same_file:
                os.remove(self.audio_file_path)
                print(f'File {self.audio_file_path} removed')

        # From now on the processor refers to the exported file.
        self.audio_file_path = path

    def _deprecated_convert(self, target_format: str,
                            args: tuple, kwargs: dict) -> None:
        """
        Shared implementation of the deprecated to_mp3 / to_wav wrappers.
        :param target_format: format enforced on the export, e.g. "mp3"
        :param args: positional arguments forwarded to convert_audio
        :param kwargs: keyword arguments forwarded to convert_audio
        :return: None
        """
        # warnings.warn takes (message, category); stacklevel=3 attributes
        # the warning to the caller of to_mp3 / to_wav.
        warn("This function is deprecated, "
             "please use convert_audio instead",
             DeprecationWarning, stacklevel=3)

        requested = kwargs.get("format")
        # Enforce the target format when none (or a different one) is given.
        if not requested or target_format not in requested:
            kwargs["format"] = target_format

        self.convert_audio(*args, **kwargs)

    def to_mp3(self, *args, **kwargs) -> None:
        """
        Convert audio file to mp3 file.
        Deprecated, use convert_audio instead.
        :param args: arguments for convert_audio (path, remove_orginal, ...)
        :param kwargs: keyword arguments for convert_audio
        :return: None
        """
        self._deprecated_convert("mp3", args, kwargs)

    def to_wav(self, *args, **kwargs) -> None:
        """
        Convert audio file to wav file.
        Deprecated, use convert_audio instead.
        :param args: arguments for convert_audio (path, remove_orginal, ...)
        :param kwargs: keyword arguments for convert_audio
        :return: None
        """
        self._deprecated_convert("wav", args, kwargs)

    def slower_mp3(self, path: str,
                   speed: float = 0.75,
                   type: str = "mp3") -> 'AudioSegment':
        """
        Slow down (or speed up) the loaded audio and export it.
        The raw samples are kept and only the frame rate is scaled by
        ``speed``.
        :param path: path to save the resulting file
        :param speed: factor applied to the frame rate
        :param type: export format handed to pydub, e.g. "mp3"
        :return: the newly created audio segment
        """
        sound = self.audio
        scaled_rate = int(sound.frame_rate * speed)
        slow_sound = sound._spawn(sound.raw_data,
                                  overrides={"frame_rate": scaled_rate})
        slow_sound.export(path, format=type)
        return slow_sound
class TorchAudioProcessor:
    """
    Audio Processor using PyTorchaudio instead of PyDub.

    The waveform is stored flattened to one dimension, so the samples of a
    multi-dimensional input are not kept as separate channels.
    """

    def __init__(self, waveform: torch.Tensor, sr: int) -> None:
        """
        Initialise audio processor
        :param waveform: waveform, flattened to 1-D on storage
        :param sr: sample rate, has to be a plain int
        :raises ValueError: if sr is not an int
        """
        if not isinstance(sr, int):
            # Do not call len() here: floats and 0-d tensors have no length
            # and would hide this error behind a TypeError.
            raise ValueError("Sample rate should be a single value of type int, "
                             f"not {sr!r} of type {type(sr)}")
        self.waveform = waveform.reshape(-1)
        self.sr = sr

    @classmethod
    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
        """
        Load audio file using torchaudio
        :param file: audio file
        :param args: arguments for torchaudio.load
        :param kwargs: keyword arguments for torchaudio.load; "format"
                    defaults to the file extension when there is one
        :return: TorchAudioProcessor
        :raises FileNotFoundError: if file does not exist
        """
        if not os.path.exists(file):
            raise FileNotFoundError(f'File {file} not found')

        if "format" not in kwargs:
            extension = os.path.splitext(file)[1].lstrip('.')
            # Only give a hint when the path really has an extension.
            if extension:
                kwargs["format"] = extension

        audio, sr = load(file, *args, **kwargs)
        return cls(audio, sr)

    @classmethod
    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
        """
        Initialise audio processor using pydub audio segment.
        pydub uses ffmpeg instead of SoX (which is used by torchaudio)
        :param file: audio file
        :param args: currently unused
        :param kwargs: currently unused
        :return: TorchAudioProcessor
        """
        audio = AudioProcessor(file)
        return cls(audio.waveform, audio.sr)

    @classmethod
    def from_audio_processor(cls, audio_processor: 'AudioProcessor') -> 'TorchAudioProcessor':
        """
        Initialise audio processor from an existing AudioProcessor.
        :param audio_processor: AudioProcessor object
        :type audio_processor: AudioProcessor
        :return: TorchAudioProcessor
        :rtype: TorchAudioProcessor
        """
        return cls(audio_processor.waveform, audio_processor.sr)

    def cut(self, start: float, end: float) -> torch.Tensor:
        """
        Cut a segment out of the waveform
        :param start: start time in seconds (sample index is truncated)
        :param end: end time in seconds (sample index is rounded up)
        :return: slice of the waveform between start and end
        """
        # Work in Python floats (double precision). float32 tensors cannot
        # represent sample indices above 2**24 exactly, which is reached
        # after a few minutes of audio at common sample rates.
        first = int(float(start) * self.sr)
        last = math.ceil(float(end) * self.sr)
        return self.waveform[first:last]

    def save(self, path: str, *args, **kwargs) -> None:
        """
        Save audio file
        :param path: path to save file
        :param args: arguments for torchaudio.save
        :param kwargs: keyword arguments for torchaudio.save; "format"
                    defaults to the extension of path
        :return: None
        """
        if "format" not in kwargs:
            kwargs["format"] = path.split('.')[-1]

        waveform = self.waveform
        # torchaudio.save needs a 2-D tensor, the stored waveform is 1-D.
        if waveform.dim() == 1:
            waveform = waveform.reshape(1, -1)

        save(path, waveform, self.sr, *args, **kwargs)

    def __repr__(self) -> str:
        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'

    def __str__(self) -> str:
        return repr(self)