From b019671f124371129ada790029d485ed75c627ed Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 9 Jun 2023 18:00:29 +0200 Subject: [PATCH 01/86] added files for rework --- autotranscript/__init__.py | 5 +- autotranscript/audio_processor.py | 93 +++++++++++++++++++ autotranscript/diarisation.py | 144 ++++++++++++++++++++++++++++++ 3 files changed, 241 insertions(+), 1 deletion(-) create mode 100644 autotranscript/audio_processor.py create mode 100644 autotranscript/diarisation.py diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 13f245b..91c8659 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,4 +1,7 @@ from autotranscript.__main__ import * +from autotranscript.transcriptor import * +from autotranscript.audio_processor import * from autotranscript.version import get_version as _get_version +from autotranscript.misc import * -__version__ = _get_version() \ No newline at end of file +__version__ = _get_version() diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py new file mode 100644 index 0000000..2b8eee8 --- /dev/null +++ b/autotranscript/audio_processor.py @@ -0,0 +1,93 @@ +from typing import Union +from pydub import AudioSegment +import os + +class AudioProcessor: + def __init__(self, audio_file:str): + self.audio_file_path = audio_file + self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1]) + + self.audiofilename = audio_file.split('/')[-1][:-4] + self.coreaudiofile = audio_file.split('/')[-1][:-4] + self.audiofilefolder = os.path.dirname(audio_file) + self.audio_file_type = audio_file.split('.')[-1] + + + + def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True): + """ + Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the + Whisper model + :param file: path to audio or video file + :param remove_orginal: remove original file + :return: mp3 file path + """ + print(f'Converting {self.audiofilename} to .{type} file') + + if savefolder == "": + savefolder = self.audiofilefolder + + if savename == "": + savename = self.coreaudiofile + f'.{type}' + else: + savename = savename + f'.{type}' + + savepath = os.path.join(savefolder, savename) + + self.audio_file.export(savepath, format=type) + + print(f'Converted {self.audiofilename} to {type}') + + if remove_orginal: + os.remove(self.audio_file_path) + print(f'File {self.audio_file_path} removed') + + self.audio_file_path = savepath + self.audio_file = AudioSegment.from_file(savepath, format=type) + + return self + + def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): + """ + Convert audio file to mp3 file + :param file: audio file + :param remove_orginal: remove original file + :return: mp3 file path + """ + return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal) + + def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): + """ + Convert audio file to wav file + :param file: audio file + :param remove_orginal: remove original file + :return: wav file path + """ + return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal) + + def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"): + """ + Slow down mp3 file + :param file: mp3 file + :param speed: speed + :return: None + """ + if savefolder == "": + savefolder = self.audiofilefolder + else: + savefolder = savefolder + + sound = self.audio_file + slow_sound = sound._spawn(sound.raw_data, overrides={ + "frame_rate": int(sound.frame_rate * speed) + }) + + speedstr = str(speed).replace('.', '') + + file_out = self.coreaudiofile + f'_{speedstr}.{type}' + + save_path = os.path.join(savefolder, file_out) + + slow_sound.export(save_path, format=type) + + return slow_sound \ No newline at end of file diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py new file mode 100644 index 0000000..b7ee848 --- /dev/null +++ b/autotranscript/diarisation.py @@ -0,0 +1,144 @@ +from audio_processor import AudioProcessor +from time import time +import os + +class Diarisation(AudioProcessor): + def __init__(self, audio_file: str, model,**kwargs) -> None: + + super().__init__(audio_file=audio_file) + + self.model = model + + + def diarization(self, *args, **kwargs): + + if "num_speakers" in kwargs: + num_speakers = kwargs['num_speakers'] + kwargs.pop('num_speakers') + else: + num_speakers = 2 + + audiofilename = self.coreaudiofile + + print(f'Start diarization of audio file: {self.audiofilename}') + + _stime = time() + + diarization = self.model(self.audio_file_path, num_speakers=num_speakers) + + print(f'Diarization finished in {time() - _stime} seconds') + self.diarization = diarization + + return diarization + + def format_diarization_output(self, *args, **kwargs): + """ + Format diarization output to a list of tuples + :param args: + :param kwargs: + :return: dict with speaker names as keys and list of tuples as values and list of different speakers + """ + + diarization_output = {"speakers": [], "segments": []} + + if not hasattr(self, 'diarization'): + # ensure diarization is run before formatting + self.diarization = self.diarization() + + + for segment, _, speaker in self.diarization.itertracks(yield_label=True): + diarization_output["speakers"].append(speaker) + diarization_output["segments"].append(segment) + + normalized_output = [] + index_start_speaker = 0 + index_end_speaker = 0 + current_speaker = str() + + for i, speaker in enumerate(diarization_output["speakers"]): + + if i == 0: + current_speaker = speaker + + if speaker != current_speaker: + + index_end_speaker = i - 1 + + normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) + + index_start_speaker = i + current_speaker = speaker + + if i == len(diarization_output["speakers"]) - 1: + + index_end_speaker = i + normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) + + + self.normalized_output = normalized_output + self.diarization_output = diarization_output + + return diarization_output,normalized_output + + def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs): + """ + Create temporary wav file for diarization + :param savefolder: folder to save the temporary wav file + :param savename: name of the temporary wav file prefix + :param audiofile: audio file + :return: temporary wav file + """ + + + if savefolder == "": + folder = '.temp' + if not os.path.exists(folder): + os.makedirs(folder) + else: + folder = savefolder + + folder = os.path.realpath(folder) + + if savename == "": + savename = self.coreaudiofile + '.wav' + else: + savename = savename + + + if not os.path.exists(folder): + os.makedirs(folder) + + if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'): + self.format_diarization_output() + + + speaker = set(self.diarization_output["speakers"]) + num_speak_iter = [0 for _ in range(len(speaker))] + + for count, outp in enumerate(self.normalized_output): + start = self.diarization_output["segments"][outp[0]].start + end = self.diarization_output["segments"][outp[1]].end + + print("start: ", start) + print("end: ", end) + + start_milliseconds = start * 1000 + end_milliseconds = end * 1000 + + print("start_milliseconds: ", start_milliseconds) + print("end_milliseconds: ", end_milliseconds) + + print("cut audio") + + cut_audio = self.audio_file[start_milliseconds:end_milliseconds] + + print("save audio") + print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav") + cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav") + + return os.path.realpath(folder) + + def __repr__(self): + return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" + def __str__(self): + return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" \ No newline at end of file From 724c2844741e8e976e3fa1978b2f102112125c76 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 9 Jun 2023 18:00:46 +0200 Subject: [PATCH 02/86] added files to module init --- autotranscript/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 91c8659..3bd3b1a 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,6 +1,7 @@ from autotranscript.__main__ import * from autotranscript.transcriptor import * from autotranscript.audio_processor import * +from autotranscript.diarisation import * from autotranscript.version import get_version as _get_version from autotranscript.misc import * From ee2cfc43193e989e27ec707e1690f1e75526b3d6 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 9 Jun 2023 18:01:18 +0200 Subject: [PATCH 03/86] reworked transcription class --- autotranscript/transcriptor.py | 112 +++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 autotranscript/transcriptor.py diff --git a/autotranscript/transcriptor.py b/autotranscript/transcriptor.py new file mode 100644 index 0000000..a3927f1 --- /dev/null +++ b/autotranscript/transcriptor.py @@ -0,0 +1,112 @@ + +import os +from typing import TypeVar +from whisper import load_model +from glob import glob + +whisper = TypeVar('whisper') +Transcriber = TypeVar('Transcriber') + +def get_whisper_default_path() -> str: + """ + Get default path for whisper models + + Returns + ------- + str + path + """ + _path = os.path.dirname(os.path.dirname(__file__)) + return os.path.join(_path, "models", "whisper") + +WHISPER_DEFAULT_PATH = get_whisper_default_path() + +class Transcriber: + def __init__(self, model: whisper ) -> None: + """ + Initialize Transcriber class with a whisper model + :param model: whisper model + """ + self.model = model + + + def transcribe(self, file : str, language:str = "German"): + """ + transcribe audio file + :param file: audio file to transcribe + :param language: language of the audio file + :return: transcript as string + """ + result = self.model.transcribe(file, language = language) + + return result["text"] + + @staticmethod + def save_transcript(transcript:str , save_path : str) -> None: + """ + Save transcript to file + :param transcript: transcript as string + :param savepath: path to save the transcript + :return: None + """ + + with open(save_path, 'w') as f: + f.write(transcript) + f.close() + + print(f'Transcript saved to {save_path}') + + @classmethod + def load_whisper_model(cls, + model: str = "medium", + local : bool = True, + download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: + """ + Load whisper module + + Parameters + ---------- + whisper : str + whisper model + available models: + + - 'tiny.en' + - 'tiny' + - 'base.en' + - 'base' + - 'small.en' + - 'small' + - 'medium.en' + - 'medium' + - 'large-v1' + - 'large-v2' + - 'large' + + local : bool + If true, load from local cache + + download_root : str + Path to download the model + + default: /models/whisper + + Returns + ------- + Whisper Object + """ + + if local: + + available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))] + + for i, module in enumerate(available_models): + available_models[i] = module.split(".")[0] + + if model not in available_models: + raise RuntimeError("Model not found. Consider downloading the "/ + "model first. By deactivating the local flag, " / + "the model will be downloaded automatically.") + + _model = load_model(model, download_root=download_root) + + return cls(_model) From 301a6e88b5f95c6c3497d710121d5d86811782b7 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 9 Jun 2023 18:01:42 +0200 Subject: [PATCH 04/86] added sepearate functions to load models --- autotranscript/misc.py | 88 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 autotranscript/misc.py diff --git a/autotranscript/misc.py b/autotranscript/misc.py new file mode 100644 index 0000000..91008fd --- /dev/null +++ b/autotranscript/misc.py @@ -0,0 +1,88 @@ + +from pyannote.audio import Pipeline +from whisper import Whisper, load_model +import os +import glob + +def get_whisper_default_path() -> str: + """ + Get default path for whisper models + + Returns + ------- + str + path + """ + _path = os.path.dirname(os.path.dirname(__file__)) + return os.path.join(_path, "models", "whisper") + +WHISPER_DEFAULT_PATH = get_whisper_default_path() + +def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper: + """ + Load modules from whisper + + Parameters + ---------- + whisper : str + whisper model + available models: + + - 'tiny.en' + - 'tiny' + - 'base.en' + - 'base' + - 'small.en' + - 'small' + - 'medium.en' + - 'medium' + - 'large-v1' + - 'large-v2' + - 'large' + + local : bool + If true, load from local cache + + download_root : str + Path to download the model + + default: /models/whisper + + Returns + ------- + Whisper Object + """ + + if local: + available_models = [os.path.basename(x) for x in glob.glob(os.path.join(WHISPER_DEFAULT_PATH, "*"))] + + for i, module in enumerate(available_models): + available_models[i] = module.split(".")[0] + + if model not in available_models: + raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.") + + return load_model(model, download_root=WHISPER_DEFAULT_PATH) + +def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pipeline: + """ + Load modules from pyannote + + Parameters + ---------- + model : str + pyannote model + token : str + HUGGINGFACE_TOKEN + local : bool + If true, load from local cache + + Returns + ------- + Pipeline Object + """ + + if local: + return Pipeline.from_pretrained(model) + else: + return Pipeline.from_pretrained(model, use_auth_token = token) From 6710f05eaf70b8851aeb13473ebfa8e27fd075ae Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 9 Jun 2023 18:01:55 +0200 Subject: [PATCH 05/86] added unittest --- test_autotranscript.py | 55 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 test_autotranscript.py diff --git a/test_autotranscript.py b/test_autotranscript.py new file mode 100644 index 0000000..29bf4d9 --- /dev/null +++ b/test_autotranscript.py @@ -0,0 +1,55 @@ +import pytest +from autotranscript import Transcriber +from unittest.mock import patch, mock_open +import os + +def test_load_pyannote_model(): + """ + Test load_pyannote_test + """ + from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization + from pyannote.audio import Pipeline + + pipeline = Pipeline.from_pretrained("models/pyannote/speaker_diarization/config.yaml") + assert isinstance(pipeline, SpeakerDiarization) + +# Test Transcribtion class + + +@pytest.fixture +def transcriber(): + """ + Prepare Transcriber for testing + Returns: Transcriber Object + """ + + return Transcriber.load_whisper_model("medium", local=True) + + +def test_Transcriber_init(transcriber): + """ + Test Transcriber initialization with a whisper model + """ + + assert isinstance(transcriber, Transcriber) + +def test_transcription(transcriber): + """ + Test transcription + """ + + transcript = transcriber.transcribe("tests/test.wav") + assert isinstance(transcript, str) + +def test_save_transcript_to_file(transcriber): + """ + Test save_transcript_to_file + """ + transcript = transcriber.transcribe("tests/test.wav") + + open_mock = mock_open() + with patch("autotranscript.Transcriber.save_transcript", open_mock, create=True): + Transcriber.save_transcript(transcript, "output.txt") + + open_mock.assert_called_with("output.txt", "w") + open_mock.return_value.write.assert_called_once_with("test-data") From 671c67415f6b0da6feca9ab9ff4e24bfa31187da Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 11:29:28 +0200 Subject: [PATCH 06/86] reworked diarization feature --- autotranscript/diarisation.py | 238 ++++++++++++++++++++-------------- 1 file changed, 143 insertions(+), 95 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index b7ee848..b0c9e84 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,62 +1,64 @@ -from audio_processor import AudioProcessor +from pyannote.audio import Pipeline from time import time import os +from typing import TypeVar -class Diarisation(AudioProcessor): - def __init__(self, audio_file: str, model,**kwargs) -> None: +Annotation = TypeVar('Annotation') - super().__init__(audio_file=audio_file) +PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), + "models", "pyannote", + "speaker_diarization", "config.yaml") + +class Diarisation: + def __init__(self, model,*args,**kwargs) -> None: self.model = model - def diarization(self, *args, **kwargs): + def diarization(self, audiofile : str , *args, **kwargs) -> Annotation: + """ + Diarization of audio file + :param audiofile: path to audio file + :param args: args for diarization model + :param kwargs: kwargs for diarization model + :return: diarization + """ - if "num_speakers" in kwargs: - num_speakers = kwargs['num_speakers'] - kwargs.pop('num_speakers') - else: - num_speakers = 2 + print(f'Start diarization of audio file: {audiofile}') - audiofilename = self.coreaudiofile + diarization = self.model(audiofile,*args, **kwargs) - print(f'Start diarization of audio file: {self.audiofilename}') + print('Diarization finished') - _stime = time() + out = self.format_diarization_output(diarization) - diarization = self.model(self.audio_file_path, num_speakers=num_speakers) + return out - print(f'Diarization finished in {time() - _stime} seconds') - self.diarization = diarization - - return diarization - - def format_diarization_output(self, *args, **kwargs): + @staticmethod + def format_diarization_output(dia : Annotation) -> dict: """ Format diarization output to a list of tuples - :param args: - :param kwargs: - :return: dict with speaker names as keys and list of tuples as values and list of different speakers + :param dia: diarization output + :return: dict with speaker names as keys and list of tuples + as values and list of different speakers """ + dia_list = list(dia.itertracks(yield_label=True)) diarization_output = {"speakers": [], "segments": []} - if not hasattr(self, 'diarization'): - # ensure diarization is run before formatting - self.diarization = self.diarization() - - - for segment, _, speaker in self.diarization.itertracks(yield_label=True): - diarization_output["speakers"].append(speaker) - diarization_output["segments"].append(segment) - normalized_output = [] index_start_speaker = 0 index_end_speaker = 0 current_speaker = str() + + ### + # Sometimes two consecutive speakers are the same + # This loop removes these duplicates + ### - for i, speaker in enumerate(diarization_output["speakers"]): + for i, (_, _, speaker) in enumerate(dia_list): + if i == 0: current_speaker = speaker @@ -64,7 +66,9 @@ class Diarisation(AudioProcessor): index_end_speaker = i - 1 - normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) + normalized_output.append([index_start_speaker, + index_end_speaker, + current_speaker]) index_start_speaker = i current_speaker = speaker @@ -72,73 +76,117 @@ class Diarisation(AudioProcessor): if i == len(diarization_output["speakers"]) - 1: index_end_speaker = i - normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) + normalized_output.append([index_start_speaker, + index_end_speaker, + current_speaker]) + + for outp in normalized_output: + #convert in milliseconds + start = dia_list[outp[0]][0].start * 1000 + end = dia_list[outp[1]][0].end * 1000 + diarization_output["segments"].append([start, end]) + diarization_output["speakers"].append(outp[2]) - self.normalized_output = normalized_output - self.diarization_output = diarization_output - - return diarization_output,normalized_output - - def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs): + return diarization_output + + @classmethod + def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, + token: str = "", + local : bool = True, + *args, **kwargs) -> Pipeline: """ - Create temporary wav file for diarization - :param savefolder: folder to save the temporary wav file - :param savename: name of the temporary wav file prefix - :param audiofile: audio file - :return: temporary wav file + Load modules from pyannote + + Parameters + ---------- + model : str + pyannote model + default: /models/pyannote/speaker_diarization/config.yaml + token : str + HUGGINGFACE_TOKEN + local : bool + If true, load from local cache + + Returns + ------- + Pipeline Object """ - - if savefolder == "": - folder = '.temp' - if not os.path.exists(folder): - os.makedirs(folder) + if local: + diarization_model = Pipeline.from_pretrained(model,*args, **kwargs) else: - folder = savefolder - - folder = os.path.realpath(folder) - - if savename == "": - savename = self.coreaudiofile + '.wav' - else: - savename = savename - - - if not os.path.exists(folder): - os.makedirs(folder) - - if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'): - self.format_diarization_output() - - - speaker = set(self.diarization_output["speakers"]) - num_speak_iter = [0 for _ in range(len(speaker))] - - for count, outp in enumerate(self.normalized_output): - start = self.diarization_output["segments"][outp[0]].start - end = self.diarization_output["segments"][outp[1]].end - - print("start: ", start) - print("end: ", end) - - start_milliseconds = start * 1000 - end_milliseconds = end * 1000 - - print("start_milliseconds: ", start_milliseconds) - print("end_milliseconds: ", end_milliseconds) - - print("cut audio") - - cut_audio = self.audio_file[start_milliseconds:end_milliseconds] - - print("save audio") - print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav") - cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav") - - return os.path.realpath(folder) + diarization_model = Pipeline.from_pretrained(model, use_auth_token = token, + *args, **kwargs) + + return cls(diarization_model) def __repr__(self): - return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" + return f"Diarisation(model={self.model})" def __str__(self): - return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" \ No newline at end of file + return f"Diarisation(model={self.model})" + + +if __name__ == '__main__': + + model = Diarisation.load_model() + print(model) + audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav" + out = model.diarization(audiofile) + print(out) + + # # deprecated + # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'): + # """ + # Create temporary wav file for diarization + # :param location_of_temp_folder: folder to save the temporary wav file + # default: .temp + # :param savename: name of the temporary wav file prefix + # :param audiofile: audio file + # :return: temporary wav file + # """ + # print("Linne 84 Diarisation.py create_temporary_wav :" / + # "location_of_temp_folder.split('/')[-1]",location_of_temp_folder.split('/')[-1]) + + # if location_of_temp_folder.split('/')[-1] != '.temp': + # folder =os.path.join(location_of_temp_folder, '.temp') + # else: + # folder = location_of_temp_folder + + # if not os.path.exists(folder): + # os.makedirs(folder) + + # folder = os.path.realpath(folder) + + # if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'): + # raise AttributeError("You need to run the diarization first") + + # speaker = set(self.diarization_output["speakers"]) + # num_speak_iter = [0 for _ in range(len(speaker))] + + # for count, outp in enumerate(self.normalized_output): + # print(outp) + # print(self.diarization_output["segments"][outp[0]]) + # print(self.diarization_output["segments"][outp[1]]) + + # start = self.diarization_output["segments"][outp[0]].start + # end = self.diarization_output["segments"][outp[1]].end + + # print("start: ", start) + # print("end: ", end) + + # start_milliseconds = start * 1000 + # end_milliseconds = end * 1000 + + # print("start_milliseconds: ", start_milliseconds) + # print("end_milliseconds: ", end_milliseconds) + + # print("cut audio") + + # cut_audio = self.audio_file[start_milliseconds:end_milliseconds] + + # print("save audio") + # print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav") + # cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav") + + # return os.path.realpath(folder) \ No newline at end of file From 6aae0f5b242408795c60b0e0a6266449bd80c70a Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 11:48:47 +0200 Subject: [PATCH 07/86] file name changed --- autotranscript/transcriber.py | 112 ++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 autotranscript/transcriber.py diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py new file mode 100644 index 0000000..a3927f1 --- /dev/null +++ b/autotranscript/transcriber.py @@ -0,0 +1,112 @@ + +import os +from typing import TypeVar +from whisper import load_model +from glob import glob + +whisper = TypeVar('whisper') +Transcriber = TypeVar('Transcriber') + +def get_whisper_default_path() -> str: + """ + Get default path for whisper models + + Returns + ------- + str + path + """ + _path = os.path.dirname(os.path.dirname(__file__)) + return os.path.join(_path, "models", "whisper") + +WHISPER_DEFAULT_PATH = get_whisper_default_path() + +class Transcriber: + def __init__(self, model: whisper ) -> None: + """ + Initialize Transcriber class with a whisper model + :param model: whisper model + """ + self.model = model + + + def transcribe(self, file : str, language:str = "German"): + """ + transcribe audio file + :param file: audio file to transcribe + :param language: language of the audio file + :return: transcript as string + """ + result = self.model.transcribe(file, language = language) + + return result["text"] + + @staticmethod + def save_transcript(transcript:str , save_path : str) -> None: + """ + Save transcript to file + :param transcript: transcript as string + :param savepath: path to save the transcript + :return: None + """ + + with open(save_path, 'w') as f: + f.write(transcript) + f.close() + + print(f'Transcript saved to {save_path}') + + @classmethod + def load_whisper_model(cls, + model: str = "medium", + local : bool = True, + download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: + """ + Load whisper module + + Parameters + ---------- + whisper : str + whisper model + available models: + + - 'tiny.en' + - 'tiny' + - 'base.en' + - 'base' + - 'small.en' + - 'small' + - 'medium.en' + - 'medium' + - 'large-v1' + - 'large-v2' + - 'large' + + local : bool + If true, load from local cache + + download_root : str + Path to download the model + + default: /models/whisper + + Returns + ------- + Whisper Object + """ + + if local: + + available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))] + + for i, module in enumerate(available_models): + available_models[i] = module.split(".")[0] + + if model not in available_models: + raise RuntimeError("Model not found. Consider downloading the "/ + "model first. By deactivating the local flag, " / + "the model will be downloaded automatically.") + + _model = load_model(model, download_root=download_root) + + return cls(_model) From 7aa2ed667f82f4c4f68c9922c825270c28e3ff44 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 11:49:17 +0200 Subject: [PATCH 08/86] changed file names --- autotranscript/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 3bd3b1a..531c651 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,5 +1,5 @@ from autotranscript.__main__ import * -from autotranscript.transcriptor import * +from autotranscript.transcriber import * from autotranscript.audio_processor import * from autotranscript.diarisation import * from autotranscript.version import get_version as _get_version From ca42d631cdeefc9cef1b37c9de02be9af31230a5 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 11:50:20 +0200 Subject: [PATCH 09/86] added deprecated warning --- autotranscript/misc.py | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/autotranscript/misc.py b/autotranscript/misc.py index 91008fd..065e45d 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -3,20 +3,14 @@ from pyannote.audio import Pipeline from whisper import Whisper, load_model import os import glob +from warnings import warn -def get_whisper_default_path() -> str: - """ - Get default path for whisper models +WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), + "models", "whisper") - Returns - ------- - str - path - """ - _path = os.path.dirname(os.path.dirname(__file__)) - return os.path.join(_path, "models", "whisper") - -WHISPER_DEFAULT_PATH = get_whisper_default_path() +PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), + "models", "pyannote", + "speaker_diarization", "config.yaml") def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper: """ @@ -52,9 +46,9 @@ def load_whisper_model(model: str ="medium", local : bool = False, download_root ------- Whisper Object """ - + warn("load_whisper_model is deprecated. Use Transcriptor.load_model() instead.", DeprecationWarning) if local: - available_models = [os.path.basename(x) for x in glob.glob(os.path.join(WHISPER_DEFAULT_PATH, "*"))] + available_models = [os.path.basename(x) for x in glob.glob(os.path.join(download_root, "*"))] for i, module in enumerate(available_models): available_models[i] = module.split(".")[0] @@ -62,9 +56,12 @@ def load_whisper_model(model: str ="medium", local : bool = False, download_root if model not in available_models: raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.") - return load_model(model, download_root=WHISPER_DEFAULT_PATH) + return load_model(model, download_root=download_root) -def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pipeline: +def load_pyannote_model(model: str = PYANNOTE_DEFAULT_PATH, + token: str = "", + local : bool = True, + *args, **kwargs) -> Pipeline: """ Load modules from pyannote @@ -72,6 +69,7 @@ def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pip ---------- model : str pyannote model + default: /models/pyannote/speaker_diarization/config.yaml token : str HUGGINGFACE_TOKEN local : bool @@ -81,8 +79,8 @@ def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pip ------- Pipeline Object """ - + warn("load_pyannote_model is deprecated. Use Diarisation.load_model() instead.", DeprecationWarning) if local: - return Pipeline.from_pretrained(model) + return Pipeline.from_pretrained(model,*args, **kwargs) else: - return Pipeline.from_pretrained(model, use_auth_token = token) + return Pipeline.from_pretrained(model, use_auth_token = token, *args, **kwargs) From b5dab23dd4cbd3a5b075c50f14de5f22ec622705 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 15:54:28 +0200 Subject: [PATCH 10/86] diarization in seconds --- autotranscript/diarisation.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index b0c9e84..be5e534 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -23,13 +23,9 @@ class Diarisation: :param kwargs: kwargs for diarization model :return: diarization """ - - print(f'Start diarization of audio file: {audiofile}') - + diarization = self.model(audiofile,*args, **kwargs) - print('Diarization finished') - out = self.format_diarization_output(diarization) return out @@ -81,9 +77,8 @@ class Diarisation: current_speaker]) for outp in normalized_output: - #convert in milliseconds - start = dia_list[outp[0]][0].start * 1000 - end = dia_list[outp[1]][0].end * 1000 + start = dia_list[outp[0]][0].start + end = dia_list[outp[1]][0].end diarization_output["segments"].append([start, end]) diarization_output["speakers"].append(outp[2]) From 6870d03f6b5574d66abd18107b2ebfeb92e0d476 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 15:56:52 +0200 Subject: [PATCH 11/86] better readbility --- autotranscript/transcriber.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index a3927f1..069866a 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -1,10 +1,12 @@ import os -from typing import TypeVar +from typing import TypeVar , Union from whisper import load_model from glob import glob whisper = TypeVar('whisper') +Tensor = TypeVar('Tensor') +nparray = TypeVar('nparray') Transcriber = TypeVar('Transcriber') def get_whisper_default_path() -> str: @@ -29,20 +31,24 @@ class Transcriber: """ self.model = model - - def transcribe(self, file : str, language:str = "German"): + def transcribe(self, audio : Union[str, Tensor, nparray] , + *args, **kwargs) -> str: """ transcribe audio file :param file: audio file to transcribe - :param language: language of the audio file + :param args: additional arguments + :param kwargs: additional keyword arguments + example: + - language: language of the audio file :return: transcript as string """ - result = self.model.transcribe(file, language = language) + + result = self.model.transcribe(audio, *args, **kwargs) return result["text"] @staticmethod - def save_transcript(transcript:str , save_path : str) -> None: + def save_transcript(transcript : str , save_path : str) -> None: """ Save transcript to file :param transcript: transcript as string @@ -57,10 +63,10 @@ class Transcriber: print(f'Transcript saved to {save_path}') @classmethod - def load_whisper_model(cls, - model: str = "medium", - local : bool = True, - download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: + def load_model(cls, + model: str = "medium", + local : bool = True, + download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: """ Load whisper module @@ -97,7 +103,8 @@ class Transcriber: if local: - available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))] + available_models = [os.path.basename(x) for x in + glob(os.path.join(download_root, "*"))] for i, module in enumerate(available_models): available_models[i] = module.split(".")[0] From edbe7ebb1d5ecf75e26d07e9d21097ec084f7168 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 12 Jun 2023 16:38:19 +0200 Subject: [PATCH 12/86] added pytorch audio support --- autotranscript/audio_processor.py | 111 +++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 18 deletions(-) diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py index 2b8eee8..40cf5be 100644 --- a/autotranscript/audio_processor.py +++ b/autotranscript/audio_processor.py @@ -1,9 +1,13 @@ -from typing import Union +from typing import Any, Union from pydub import AudioSegment +import torch +from torchaudio import load, save import os +from warn import warn class AudioProcessor: def __init__(self, audio_file:str): + self.audio_file_path = audio_file self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1]) @@ -12,15 +16,14 @@ class AudioProcessor: self.audiofilefolder = os.path.dirname(audio_file) self.audio_file_type = audio_file.split('.')[-1] - - - def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True): + + def save(self, path: str, remove_orginal: bool = True , *args, **kwargs) -> None: """ - Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the - Whisper model - :param file: path to audio or video file + Convert and saves video file or other audio files to a different file type, + Can be used to ensure that the audio file is in the correct format for the Whisper model + :param path : path to save file :param remove_orginal: remove original file - :return: mp3 file path + :return: mp3 file path """ print(f'Converting {self.audiofilename} to .{type} file') @@ -36,16 +39,11 @@ class AudioProcessor: self.audio_file.export(savepath, format=type) - print(f'Converted {self.audiofilename} to {type}') - if remove_orginal: os.remove(self.audio_file_path) print(f'File {self.audio_file_path} removed') - self.audio_file_path = savepath - self.audio_file = AudioSegment.from_file(savepath, format=type) - return self def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): """ @@ -54,18 +52,29 @@ class AudioProcessor: :param remove_orginal: remove original file :return: mp3 file path """ - return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal) + warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead") + return self.convert_audio(savefolder = savefolder, + savename = savename, + type="mp3", + remove_orginal=remove_orginal) - def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): + def to_wav(self, savefolder: str = "", + savename: str = "", + remove_orginal: bool = True): """ Convert audio file to wav file :param file: audio file :param remove_orginal: remove original file :return: wav file path """ - return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal) + warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead") + return self.convert_audio(savefolder = savefolder, + savename = savename,type="wav", + remove_orginal=remove_orginal) - def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"): + def slower_mp3(self, savefolder: str = "", + speed: float = 0.75, + type: str = "mp3"): """ Slow down mp3 file :param file: mp3 file @@ -90,4 +99,70 @@ class AudioProcessor: slow_sound.export(save_path, format=type) - return slow_sound \ No newline at end of file + return slow_sound + + + + +class TorchAudioProcessor: + """ + Audio Processor using PyTorchaudio instead of PyDub + """ + + def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None: + """ + Initialise audio processor + :param waveform: waveform + :param sr: sample rate + """ + self.waveform = waveform + self.sr = sr + + + + @classmethod + def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': + """ + Load audio file + :param file: audio file + :return: AudioProcessor + """ + if not os.path.exists(file): + raise FileNotFoundError(f'File {file} not found') + + if "format" not in kwargs: + kwargs["format"] = file.split('.')[-1] + + audio, sr = load(file , *args, **kwargs) + + return cls(audio, sr) + + def cut(self, start: float, end: float) -> torch.Tensor: + """ + Cut audio file + :param start: start time in seconds + :param end: end time in seconds + :return: AudioProcessor + """ + start = int(start / self.sr) + end = torch.ceil(end / self.sr) + + return self.waveform[:, start:end] + + def save(self, path: str, *args, **kwargs) -> None: + """ + Save audio file + :param path: path to save file + :return: None + """ + if "format" not in kwargs: + kwargs["format"] = file.split('.')[-1] + + save(file, self.waveform, self.sr, *args, **kwargs) + + def __repr__(self) -> str: + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' + + def __str__(self) -> str: + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' + \ No newline at end of file From a5693490dfcccef75803f7b1e90beb8b97f27ff8 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 13 Jun 2023 07:05:57 +0200 Subject: [PATCH 13/86] removed renamed file --- autotranscript/transcriptor.py | 112 --------------------------------- 1 file changed, 112 deletions(-) delete mode 100644 autotranscript/transcriptor.py diff --git a/autotranscript/transcriptor.py b/autotranscript/transcriptor.py deleted file mode 100644 index a3927f1..0000000 --- a/autotranscript/transcriptor.py +++ /dev/null @@ -1,112 +0,0 @@ - -import os -from typing import TypeVar -from whisper import load_model -from glob import glob - -whisper = TypeVar('whisper') -Transcriber = TypeVar('Transcriber') - -def get_whisper_default_path() -> str: - """ - Get default path for whisper models - - Returns - ------- - str - path - """ - _path = os.path.dirname(os.path.dirname(__file__)) - return os.path.join(_path, "models", "whisper") - -WHISPER_DEFAULT_PATH = get_whisper_default_path() - -class Transcriber: - def __init__(self, model: whisper ) -> None: - """ - Initialize Transcriber class with a whisper model - :param model: whisper model - """ - self.model = model - - - def transcribe(self, file : str, language:str = "German"): - """ - transcribe audio file - :param file: audio file to transcribe - :param language: language of the audio file - :return: transcript as string - """ - result = self.model.transcribe(file, language = language) - - return result["text"] - - @staticmethod - def save_transcript(transcript:str , save_path : str) -> None: - """ - Save transcript to file - :param transcript: transcript as string - :param savepath: path to save the transcript - :return: None - """ - - with open(save_path, 'w') as f: - f.write(transcript) - f.close() - - print(f'Transcript saved to {save_path}') - - @classmethod - def load_whisper_model(cls, - model: str = "medium", - local : bool = True, - download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: - """ - Load whisper module - - Parameters - ---------- - whisper : str - whisper model - available models: - - - 'tiny.en' - - 'tiny' - - 'base.en' - - 'base' - - 'small.en' - - 'small' - - 'medium.en' - - 'medium' - - 'large-v1' - - 'large-v2' - - 'large' - - local : bool - If true, load from local cache - - download_root : str - Path to download the model - - default: /models/whisper - - Returns - ------- - Whisper Object - """ - - if local: - - available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))] - - for i, module in enumerate(available_models): - available_models[i] = module.split(".")[0] - - if model not in available_models: - raise RuntimeError("Model not found. Consider downloading the "/ - "model first. By deactivating the local flag, " / - "the model will be downloaded automatically.") - - _model = load_model(model, download_root=download_root) - - return cls(_model) From 157851f8fad88dca13557dd7cd1cca933cded3fd Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 13 Jun 2023 08:25:58 +0200 Subject: [PATCH 14/86] added compability with torchaudio --- autotranscript/audio_processor.py | 135 ++++++++++++++++-------------- 1 file changed, 73 insertions(+), 62 deletions(-) diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py index 40cf5be..3f0bf38 100644 --- a/autotranscript/audio_processor.py +++ b/autotranscript/audio_processor.py @@ -1,108 +1,107 @@ -from typing import Any, Union -from pydub import AudioSegment -import torch -from torchaudio import load, save import os -from warn import warn +from warnings import warn + +import torch +from pydub import AudioSegment +from torchaudio import load, save + class AudioProcessor: def __init__(self, audio_file:str): - self.audio_file_path = audio_file - self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1]) - - self.audiofilename = audio_file.split('/')[-1][:-4] - self.coreaudiofile = audio_file.split('/')[-1][:-4] - self.audiofilefolder = os.path.dirname(audio_file) - self.audio_file_type = audio_file.split('.')[-1] - - - def save(self, path: str, remove_orginal: bool = True , *args, **kwargs) -> None: + self.audio = AudioSegment.from_file(audio_file, + format=audio_file.split('.')[-1]) + self.audio_file_path = audio_file + self.waveform = self.pydub_to_tensor[0] + self.sr = self.pydub_to_tensor[1] + + @property + def pydub_to_tensor(self): + """ + Converts pydub audio segment into np.float32 of shape + [duration_in_seconds*sample_rate, channels], + where each value is in range [-1.0, 1.0]. + Returns tuple (audio_np_array, sample_rate). + """ + audio = self.audio + x = torch.Tensor(audio.get_array_of_samples() + ).reshape((-1, audio.channels)) + y = (1 << (8 * audio.sample_width - 1)) + return x / y, audio.frame_rate + + def convert_audio(self, path: str, remove_orginal: bool = False, + *args, **kwargs) -> None: """ Convert and saves video file or other audio files to a different file type, - Can be used to ensure that the audio file is in the correct format for the Whisper model + Can be used to ensure that the audio file is in the correct format + for the Whisper model. :param path : path to save file :param remove_orginal: remove original file - :return: mp3 file path + :param args: arguments for pydub.AudioSegment.export + :param kwargs: keyword arguments for pydub.AudioSegment.export + e.g. format + :return: None """ - print(f'Converting {self.audiofilename} to .{type} file') - if savefolder == "": - savefolder = self.audiofilefolder - - if savename == "": - savename = self.coreaudiofile + f'.{type}' - else: - savename = savename + f'.{type}' - - savepath = os.path.join(savefolder, savename) - - self.audio_file.export(savepath, format=type) + self.audio.export(path, *args, **kwargs) if remove_orginal: os.remove(self.audio_file_path) print(f'File {self.audio_file_path} removed') + + self.audio_file_path = path - - def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): + def to_mp3(self, *args, **kwargs) -> None: """ Convert audio file to mp3 file :param file: audio file :param remove_orginal: remove original file :return: mp3 file path """ - warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead") - return self.convert_audio(savefolder = savefolder, - savename = savename, - type="mp3", - remove_orginal=remove_orginal) + + warn(DeprecationWarning, "This function is deprecated," \ + "please use convert_audio instead") + + if "mp3" not in kwargs["format"]: + kwargs["format"] = "mp3" + + self.convert_audio(*args, **kwargs) - def to_wav(self, savefolder: str = "", - savename: str = "", - remove_orginal: bool = True): + def to_wav(self,*args, **kwargs) -> None: """ Convert audio file to wav file :param file: audio file :param remove_orginal: remove original file :return: wav file path """ - warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead") - return self.convert_audio(savefolder = savefolder, - savename = savename,type="wav", - remove_orginal=remove_orginal) + warn(DeprecationWarning, "This function is deprecated," \ + "please use convert_audio instead") + + if "wav" not in kwargs["format"]: + kwargs["format"] = "wav" + + self.convert_audio(*args, **kwargs) - def slower_mp3(self, savefolder: str = "", + def slower_mp3(self, path: str, speed: float = 0.75, - type: str = "mp3"): + type: str = "mp3") -> None: """ Slow down mp3 file :param file: mp3 file :param speed: speed :return: None """ - if savefolder == "": - savefolder = self.audiofilefolder - else: - savefolder = savefolder sound = self.audio_file slow_sound = sound._spawn(sound.raw_data, overrides={ "frame_rate": int(sound.frame_rate * speed) }) - speedstr = str(speed).replace('.', '') - - file_out = self.coreaudiofile + f'_{speedstr}.{type}' - - save_path = os.path.join(savefolder, file_out) - - slow_sound.export(save_path, format=type) + slow_sound.export(path, format=type) return slow_sound - - class TorchAudioProcessor: """ @@ -136,6 +135,19 @@ class TorchAudioProcessor: audio, sr = load(file , *args, **kwargs) return cls(audio, sr) + + @classmethod + def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': + """ + Initialise audio processor using pydub audio segment. + pydub uses ffmped instead of SoX (which is used by torchaudio) + :param file: audio file + :return: TorchAudioProcessor + """ + audio = AudioProcessor(file) + + return cls(audio.waveform, audio.sr) + def cut(self, start: float, end: float) -> torch.Tensor: """ @@ -156,13 +168,12 @@ class TorchAudioProcessor: :return: None """ if "format" not in kwargs: - kwargs["format"] = file.split('.')[-1] + kwargs["format"] = path.split('.')[-1] - save(file, self.waveform, self.sr, *args, **kwargs) + save(path, self.waveform, self.sr, *args, **kwargs) def __repr__(self) -> str: return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' def __str__(self) -> str: - return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' - \ No newline at end of file + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' \ No newline at end of file From 3cfdb894bfa634875c8aabcc4b0b08f9fe4199e6 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 13 Jun 2023 09:54:14 +0200 Subject: [PATCH 15/86] updated get token --- autotranscript/diarisation.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index be5e534..123c692 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -14,7 +14,6 @@ class Diarisation: self.model = model - def diarization(self, audiofile : str , *args, **kwargs) -> Annotation: """ Diarization of audio file @@ -84,7 +83,17 @@ class Diarisation: diarization_output["speakers"].append(outp[2]) return diarization_output - + @staticmethod + def _get_token(): + # check ig .pyannotetoken.txt exists + path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken') + if os.path.exists(path): + with open(path, 'r') as f: + token = f.read() + else: + raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token' + ' and save it in a file called .pyannotetoken.txt') + return token @classmethod def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, token: str = "", @@ -111,6 +120,8 @@ class Diarisation: if local: diarization_model = Pipeline.from_pretrained(model,*args, **kwargs) else: + if token == "": + token = cls._get_token() diarization_model = Pipeline.from_pretrained(model, use_auth_token = token, *args, **kwargs) @@ -128,7 +139,6 @@ if __name__ == '__main__': print(model) audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav" out = model.diarization(audiofile) - print(out) # # deprecated # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'): From 7ee784457a2ef77d87b0423c0cecc6689286240c Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 13 Jun 2023 11:56:41 +0200 Subject: [PATCH 16/86] removed comments --- autotranscript/diarisation.py | 85 ++++++----------------------------- 1 file changed, 14 insertions(+), 71 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 123c692..55fd0cb 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,7 +1,7 @@ from pyannote.audio import Pipeline -from time import time +from torch import Tensor import os -from typing import TypeVar +from typing import TypeVar, Union Annotation = TypeVar('Annotation') @@ -9,15 +9,16 @@ PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), "models", "pyannote", "speaker_diarization", "config.yaml") -class Diarisation: +class Diariser: def __init__(self, model,*args,**kwargs) -> None: self.model = model - def diarization(self, audiofile : str , *args, **kwargs) -> Annotation: + def diarization(self, audiofile : Union[str, Tensor] , + *args, **kwargs) -> Annotation: """ Diarization of audio file - :param audiofile: path to audio file + :param audiofile: path to audio file or torch.Tensor :param args: args for diarization model :param kwargs: kwargs for diarization model :return: diarization @@ -83,17 +84,21 @@ class Diarisation: diarization_output["speakers"].append(outp[2]) return diarization_output + @staticmethod def _get_token(): # check ig .pyannotetoken.txt exists - path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken') + path = os.path.join(os.path.dirname( + os.path.realpath(__file__)), '.pyannotetoken') if os.path.exists(path): with open(path, 'r') as f: token = f.read() else: - raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token' - ' and save it in a file called .pyannotetoken.txt') + raise ValueError('No token found.' \ + 'Please create a token at https://huggingface.co/settings/token' \ + 'and save it in a file called .pyannotetoken.txt') return token + @classmethod def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, token: str = "", @@ -129,69 +134,7 @@ class Diarisation: def __repr__(self): return f"Diarisation(model={self.model})" + def __str__(self): return f"Diarisation(model={self.model})" - -if __name__ == '__main__': - - model = Diarisation.load_model() - print(model) - audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav" - out = model.diarization(audiofile) - - # # deprecated - # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'): - # """ - # Create temporary wav file for diarization - # :param location_of_temp_folder: folder to save the temporary wav file - # default: .temp - # :param savename: name of the temporary wav file prefix - # :param audiofile: audio file - # :return: temporary wav file - # """ - # print("Linne 84 Diarisation.py create_temporary_wav :" / - # "location_of_temp_folder.split('/')[-1]",location_of_temp_folder.split('/')[-1]) - - # if location_of_temp_folder.split('/')[-1] != '.temp': - # folder =os.path.join(location_of_temp_folder, '.temp') - # else: - # folder = location_of_temp_folder - - # if not os.path.exists(folder): - # os.makedirs(folder) - - # folder = os.path.realpath(folder) - - # if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'): - # raise AttributeError("You need to run the diarization first") - - # speaker = set(self.diarization_output["speakers"]) - # num_speak_iter = [0 for _ in range(len(speaker))] - - # for count, outp in enumerate(self.normalized_output): - # print(outp) - # print(self.diarization_output["segments"][outp[0]]) - # print(self.diarization_output["segments"][outp[1]]) - - # start = self.diarization_output["segments"][outp[0]].start - # end = self.diarization_output["segments"][outp[1]].end - - # print("start: ", start) - # print("end: ", end) - - # start_milliseconds = start * 1000 - # end_milliseconds = end * 1000 - - # print("start_milliseconds: ", start_milliseconds) - # print("end_milliseconds: ", end_milliseconds) - - # print("cut audio") - - # cut_audio = self.audio_file[start_milliseconds:end_milliseconds] - - # print("save audio") - # print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav") - # cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav") - - # return os.path.realpath(folder) \ No newline at end of file From 2e6af75f81f1a79fcbd3efe695d59da7259a2812 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:30:05 +0200 Subject: [PATCH 17/86] del file --- autotranscript/audio_processor.py | 179 ------------------------------ 1 file changed, 179 deletions(-) delete mode 100644 autotranscript/audio_processor.py diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py deleted file mode 100644 index 3f0bf38..0000000 --- a/autotranscript/audio_processor.py +++ /dev/null @@ -1,179 +0,0 @@ -import os -from warnings import warn - -import torch -from pydub import AudioSegment -from torchaudio import load, save - - -class AudioProcessor: - def __init__(self, audio_file:str): - - self.audio = AudioSegment.from_file(audio_file, - format=audio_file.split('.')[-1]) - self.audio_file_path = audio_file - self.waveform = self.pydub_to_tensor[0] - self.sr = self.pydub_to_tensor[1] - - @property - def pydub_to_tensor(self): - """ - Converts pydub audio segment into np.float32 of shape - [duration_in_seconds*sample_rate, channels], - where each value is in range [-1.0, 1.0]. - Returns tuple (audio_np_array, sample_rate). - """ - audio = self.audio - x = torch.Tensor(audio.get_array_of_samples() - ).reshape((-1, audio.channels)) - y = (1 << (8 * audio.sample_width - 1)) - return x / y, audio.frame_rate - - def convert_audio(self, path: str, remove_orginal: bool = False, - *args, **kwargs) -> None: - """ - Convert and saves video file or other audio files to a different file type, - Can be used to ensure that the audio file is in the correct format - for the Whisper model. - :param path : path to save file - :param remove_orginal: remove original file - :param args: arguments for pydub.AudioSegment.export - :param kwargs: keyword arguments for pydub.AudioSegment.export - e.g. format - :return: None - """ - - self.audio.export(path, *args, **kwargs) - - if remove_orginal: - os.remove(self.audio_file_path) - print(f'File {self.audio_file_path} removed') - - self.audio_file_path = path - - - def to_mp3(self, *args, **kwargs) -> None: - """ - Convert audio file to mp3 file - :param file: audio file - :param remove_orginal: remove original file - :return: mp3 file path - """ - - warn(DeprecationWarning, "This function is deprecated," \ - "please use convert_audio instead") - - if "mp3" not in kwargs["format"]: - kwargs["format"] = "mp3" - - self.convert_audio(*args, **kwargs) - - def to_wav(self,*args, **kwargs) -> None: - """ - Convert audio file to wav file - :param file: audio file - :param remove_orginal: remove original file - :return: wav file path - """ - warn(DeprecationWarning, "This function is deprecated," \ - "please use convert_audio instead") - - if "wav" not in kwargs["format"]: - kwargs["format"] = "wav" - - self.convert_audio(*args, **kwargs) - - def slower_mp3(self, path: str, - speed: float = 0.75, - type: str = "mp3") -> None: - """ - Slow down mp3 file - :param file: mp3 file - :param speed: speed - :return: None - """ - - sound = self.audio_file - slow_sound = sound._spawn(sound.raw_data, overrides={ - "frame_rate": int(sound.frame_rate * speed) - }) - - slow_sound.export(path, format=type) - - return slow_sound - - -class TorchAudioProcessor: - """ - Audio Processor using PyTorchaudio instead of PyDub - """ - - def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None: - """ - Initialise audio processor - :param waveform: waveform - :param sr: sample rate - """ - self.waveform = waveform - self.sr = sr - - - - @classmethod - def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': - """ - Load audio file - :param file: audio file - :return: AudioProcessor - """ - if not os.path.exists(file): - raise FileNotFoundError(f'File {file} not found') - - if "format" not in kwargs: - kwargs["format"] = file.split('.')[-1] - - audio, sr = load(file , *args, **kwargs) - - return cls(audio, sr) - - @classmethod - def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': - """ - Initialise audio processor using pydub audio segment. - pydub uses ffmped instead of SoX (which is used by torchaudio) - :param file: audio file - :return: TorchAudioProcessor - """ - audio = AudioProcessor(file) - - return cls(audio.waveform, audio.sr) - - - def cut(self, start: float, end: float) -> torch.Tensor: - """ - Cut audio file - :param start: start time in seconds - :param end: end time in seconds - :return: AudioProcessor - """ - start = int(start / self.sr) - end = torch.ceil(end / self.sr) - - return self.waveform[:, start:end] - - def save(self, path: str, *args, **kwargs) -> None: - """ - Save audio file - :param path: path to save file - :return: None - """ - if "format" not in kwargs: - kwargs["format"] = path.split('.')[-1] - - save(path, self.waveform, self.sr, *args, **kwargs) - - def __repr__(self) -> str: - return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' - - def __str__(self) -> str: - return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' \ No newline at end of file From 90324e6ea7900669b7d0e46bca81e819f397ec9a Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:30:15 +0200 Subject: [PATCH 18/86] added unittests --- test_autotranscript.py | 79 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 7 deletions(-) diff --git a/test_autotranscript.py b/test_autotranscript.py index 29bf4d9..8f745a0 100644 --- a/test_autotranscript.py +++ b/test_autotranscript.py @@ -23,7 +23,7 @@ def transcriber(): Returns: Transcriber Object """ - return Transcriber.load_whisper_model("medium", local=True) + return Transcriber.load_model("medium", local=True) def test_Transcriber_init(transcriber): @@ -46,10 +46,75 @@ def test_save_transcript_to_file(transcriber): Test save_transcript_to_file """ transcript = transcriber.transcribe("tests/test.wav") - - open_mock = mock_open() - with patch("autotranscript.Transcriber.save_transcript", open_mock, create=True): - Transcriber.save_transcript(transcript, "output.txt") - open_mock.assert_called_with("output.txt", "w") - open_mock.return_value.write.assert_called_once_with("test-data") + Transcriber.save_transcript(transcript, "tests/output.txt") + + assert os.path.exists("tests/output.txt") + + os.remove("tests/output.txt") + +# Test Diaraization class + +from autotranscript import Diariser + +@pytest.fixture +def diarisation(): + """ + Prepare Diarisation for testing + Returns: Diarisation Object + """ + + return Diariser.load_model("models/pyannote/speaker_diarization/config.yaml", local=True) + +def test_Diarisation_init(diarisation): + """ + Test Diarisation initialization with a pyannote model + """ + + assert isinstance(diarisation, Diariser) + +def test_diarisation(diarisation): + """ + Test diarisation + """ + + diarisation = diarisation.diarization("tests/test.wav") + assert isinstance(diarisation, dict) + +# Test AudioProcessor + +from autotranscript import AudioProcessor , TorchAudioProcessor + + +def test_AudioProcessor_init(): + """ + Test AudioProcessor initialization + """ + audio = AudioProcessor("tests/test.wav") + assert isinstance(audio, AudioProcessor) + +def test_AudioProcessor_convert(): + """ + Test AudioProcessor convert + """ + audio = AudioProcessor("tests/test.wav") + audio.convert_audio("tests/test.mp3", format="mp3") + assert os.path.exists("tests/test.mp3") + +def test_TorchAudioProcessor_from_file(): + """ + Test TorchAudioProcessor initialization + """ + audio = TorchAudioProcessor.from_file("tests/test.wav") + + assert isinstance(audio, TorchAudioProcessor) + + os.remove("tests/test.mp3") + + +def test_TorchAudioProcessor_from_ffmpeg(): + """ + Test TorchAudioProcessor initialization + """ + audio = TorchAudioProcessor.from_ffmpeg("tests/test.wav") + assert isinstance(audio, TorchAudioProcessor) From 34354c055f7514cad065b9e00a7273308a138657 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:30:29 +0200 Subject: [PATCH 19/86] changed imports --- autotranscript/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 531c651..5aea052 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,6 +1,7 @@ from autotranscript.__main__ import * from autotranscript.transcriber import * -from autotranscript.audio_processor import * +from autotranscript.audio import * +from autotranscript.transcript_exporter import * from autotranscript.diarisation import * from autotranscript.version import get_version as _get_version from autotranscript.misc import * From 854469fb6e173bf0f4ee3f1ed4665480dfccf176 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:30:57 +0200 Subject: [PATCH 20/86] audio processing --- autotranscript/audio.py | 202 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 202 insertions(+) create mode 100644 autotranscript/audio.py diff --git a/autotranscript/audio.py b/autotranscript/audio.py new file mode 100644 index 0000000..3175ca0 --- /dev/null +++ b/autotranscript/audio.py @@ -0,0 +1,202 @@ +import os +from warnings import warn + +import torch +from pydub import AudioSegment +from torchaudio import load, save + + +class AudioProcessor: + def __init__(self, audio_file:str): + + self.audio = AudioSegment.from_file(audio_file, + format=audio_file.split('.')[-1]) + self.audio_file_path = audio_file + self.waveform = self.pydub_to_tensor[0] + self.sr = self.pydub_to_tensor[1] + + @property + def pydub_to_tensor(self): + """ + Converts pydub audio segment into np.float32 of shape + [duration_in_seconds*sample_rate, channels], + where each value is in range [-1.0, 1.0]. + Returns tuple (audio_np_array, sample_rate). + """ + audio = self.audio + x = torch.Tensor(audio.get_array_of_samples() + ).reshape((-1, audio.channels)) + y = (1 << (8 * audio.sample_width - 1)) + return x / y, audio.frame_rate + + def convert_audio(self, path: str, remove_orginal: bool = False, + *args, **kwargs) -> None: + """ + Convert and saves video file or other audio files to a different file type, + Can be used to ensure that the audio file is in the correct format + for the Whisper model. + :param path : path to save file + :param remove_orginal: remove original file + :param args: arguments for pydub.AudioSegment.export + :param kwargs: keyword arguments for pydub.AudioSegment.export + e.g. format + :return: None + """ + + self.audio.export(path, *args, **kwargs) + + if remove_orginal: + os.remove(self.audio_file_path) + print(f'File {self.audio_file_path} removed') + + self.audio_file_path = path + + + def to_mp3(self, *args, **kwargs) -> None: + """ + Convert audio file to mp3 file + :param file: audio file + :param remove_orginal: remove original file + :return: mp3 file path + """ + + warn(DeprecationWarning, "This function is deprecated," \ + "please use convert_audio instead") + + if "mp3" not in kwargs["format"]: + kwargs["format"] = "mp3" + + self.convert_audio(*args, **kwargs) + + def to_wav(self,*args, **kwargs) -> None: + """ + Convert audio file to wav file + :param file: audio file + :param remove_orginal: remove original file + :return: wav file path + """ + warn(DeprecationWarning, "This function is deprecated," \ + "please use convert_audio instead") + + if "wav" not in kwargs["format"]: + kwargs["format"] = "wav" + + self.convert_audio(*args, **kwargs) + + def slower_mp3(self, path: str, + speed: float = 0.75, + type: str = "mp3") -> None: + """ + Slow down mp3 file + :param file: mp3 file + :param speed: speed + :return: None + """ + + sound = self.audio_file + slow_sound = sound._spawn(sound.raw_data, overrides={ + "frame_rate": int(sound.frame_rate * speed) + }) + + slow_sound.export(path, format=type) + + return slow_sound + + +class TorchAudioProcessor: + """ + Audio Processor using PyTorchaudio instead of PyDub + """ + + def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None: + """ + Initialise audio processor + :param waveform: waveform + :param sr: sample rate + """ + self.waveform = waveform.reshape(-1) + self.sr = sr + + if not isinstance(self.sr, int): + raise ValueError("Sample rate should be a single value of type int," \ + f"not {len(self.sr)} and type {type(self.sr)}") + + + @classmethod + def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': + """ + Load audio file + :param file: audio file + :return: AudioProcessor + """ + if not os.path.exists(file): + raise FileNotFoundError(f'File {file} not found') + + if "format" not in kwargs: + kwargs["format"] = file.split('.')[-1] + + audio, sr = load(file , *args, **kwargs) + + return cls(audio, sr) + + @classmethod + def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': + """ + Initialise audio processor using pydub audio segment. + pydub uses ffmped instead of SoX (which is used by torchaudio) + :param file: audio file + :return: TorchAudioProcessor + """ + audio = AudioProcessor(file) + + return cls(audio.waveform, audio.sr) + + @classmethod + def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor': + """ + Initialise audio processor using pydub audio segment. + + :param audio_processor: AudioProcessor object + :type audio_processor: AudioProcessor + :return: TorchAudioProcessor + :rtype: TorchAudioProcessor + """ + return cls(audio_processor.waveform, audio_processor.sr) + + def cut(self, start: float, end: float) -> torch.Tensor: + """ + Cut audio file + :param start: start time in seconds + :param end: end time in seconds + :return: AudioProcessor + """ + + if isinstance(start, float): + start = torch.Tensor([start]) + if isinstance(end, float): + end = torch.Tensor([end]) + + sr = torch.Tensor([self.sr]) + + start = int(start * sr) + end = torch.ceil(end * sr) + + return self.waveform[start:end.to(int)] + + def save(self, path: str, *args, **kwargs) -> None: + """ + Save audio file + :param path: path to save file + :return: None + """ + if "format" not in kwargs: + kwargs["format"] = path.split('.')[-1] + + save(path, self.waveform, self.sr, *args, **kwargs) + + + def __repr__(self) -> str: + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' + + def __str__(self) -> str: + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' From 002c7b518901151a9df6ce50120940e3c40045e8 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:31:07 +0200 Subject: [PATCH 21/86] auto transcript --- autotranscript/autotranscipt.py | 125 ++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 autotranscript/autotranscipt.py diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py new file mode 100644 index 0000000..c1225af --- /dev/null +++ b/autotranscript/autotranscipt.py @@ -0,0 +1,125 @@ +from audio import AudioProcessor , TorchAudioProcessor + +from diarisation import Diariser +from transcriber import Transcriber, whisper +from whisper import Whisper +from transcript_exporter import Transcript +from typing import Union , TypeVar +from tqdm import trange +from pprint import pprint +import torch +diarisation = TypeVar('diarisation') + + +class AutoTranscribe: + def __init__(self, + whisper_model: Union[bool, str, whisper] = None, + dia_model : Union[bool, str, diarisation] = None, + dia_kwargs : dict = {}, + whisper_kwargs : dict = {}) -> None: + """ + AutoTranscribe class + + This class is the core Api Class of the autotranscript package. + It allows to transcribe audio files with a whisper model and + pyannote diarization model. + + Therefore it is do a fully automatic transcription of audio files. + + :param whisper_model: path to whisper model or whisper model + :param dia_model: path to pyannote diarization model + :param dia_kwargs: kwargs for pyannote diarization model + :param whisper_kwargs: kwargs for whisper model + + """ + + if whisper_model is None: + self.transcriber = Transcriber.load_model("medium", local=True) + elif isinstance(whisper_model, str): + self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs) + else: + self.transcriber = whisper_model + + if dia_model is None: + self.diariser = Diariser.load_model() + elif isinstance(dia_model, str): + self.diariser = Diariser.load_model(dia_model, **dia_kwargs) + else: + self.diariser = dia_model + + print("AutoTranscribe initialized all models successfully loaded.") + + def transcribe(self, audiofile : Union[str, torch.Tensor], + *args, **kwargs) -> Transcript: + """ + Transcribe audiofile with whisper model and pyannote diarization model + + :param audiofile: path to audiofile or torch.Tensor + :return: Transcript object + """ + + audiofile = self.get_audiofile(audiofile) + + final_transcript = dict() + + dia_audio = {"waveform" : + audiofile.waveform.reshape(1,len(audiofile.waveform)), + "sample_rate": audiofile.sr} + + print("Starting diarisation.") + + diarisation = self.diariser.diarization( dia_audio, + *args , **kwargs) + + print("Diarisation finished. Starting transcription.") + + for i in trange(len(diarisation["segments"]), desc= "Transcribing"): + + seg = diarisation["segments"][i] + + audio = audiofile.cut(seg[0], seg[1]) + + transcript = self.transcriber.transcribe(audio, *args , **kwargs) + + final_transcript[i] = {"speaker" : diarisation["speakers"][i], + "text" : transcript} + + pprint(final_transcript) + #return Transcript(transcript, diarisation) + + @staticmethod + def get_audiofile(audiofile : Union[str, torch.Tensor], + *args, **kwargs) -> TorchAudioProcessor: + """ + Get audiofile as TorchAudioProcessor + + :param audiofile: path to audiofile or torch.Tensor + :type audiofile: Union[str, torch.Tensor] + :return: object of audiofile containes + waveform and sample_rate in torch.Tensor format. + :rtype: TorchAudioProcessor + """ + if isinstance(audiofile, str): + try: + audiofile = TorchAudioProcessor.from_file(audiofile) + except: + print("Could not load audiofile with torch audio." \ + "Trying ffmpeg. using pydub.") + audiofile = TorchAudioProcessor.from_ffmpeg(audiofile) + + if isinstance(audiofile, torch.Tensor): + audiofile = TorchAudioProcessor(audiofile[0], audiofile[1]) + + if isinstance(audiofile, AudioProcessor): + audiofile = TorchAudioProcessor.from_audio_processor(audiofile) + + if not isinstance(audiofile, TorchAudioProcessor): + raise ValueError(f'Audiofile must be of type TorchAudioProcessor,' \ + f'not {type(audiofile)}') + return audiofile + + +if __name__ == "__main__": + + AudioTranscriber = AutoTranscribe() + AudioTranscriber.transcribe("/home/jacob/PycharmProjects/autotranscript/tests/Kathi_interview.mp3" , num_speaker=2) \ No newline at end of file From 67e4e4585da3be40190a265bcf7b12e446f2ee69 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:31:25 +0200 Subject: [PATCH 22/86] added kwargs parsing --- autotranscript/diarisation.py | 20 ++++++++++++++++++++ autotranscript/transcriber.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 55fd0cb..3b64fac 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,4 +1,5 @@ from pyannote.audio import Pipeline +from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization from torch import Tensor import os from typing import TypeVar, Union @@ -23,6 +24,7 @@ class Diariser: :param kwargs: kwargs for diarization model :return: diarization """ + kwargs = self._get_diarisation_kwargs(**kwargs) diarization = self.model(audiofile,*args, **kwargs) @@ -132,6 +134,24 @@ class Diariser: return cls(diarization_model) + @staticmethod + def _get_diarisation_kwargs(**kwargs) -> dict: + """ + Get kwargs for pyannote diarization model + Ensure that kwargs are valid + :return: kwargs for pyannote diarization model + :rtype: dict + """ + _possible_kwargs = SpeakerDiarization.apply.__code__.co_varnames + + diarisation_kwargs = dict() + + for k in kwargs.keys(): + if k in _possible_kwargs: + diarisation_kwargs[k] = kwargs[k] + + return diarisation_kwargs + def __repr__(self): return f"Diarisation(model={self.model})" diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 069866a..57a3423 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -1,5 +1,5 @@ - import os +from whisper import Whisper from typing import TypeVar , Union from whisper import load_model from glob import glob @@ -43,8 +43,17 @@ class Transcriber: :return: transcript as string """ - result = self.model.transcribe(audio, *args, **kwargs) + kwargs = self._get_whisper_kwargs(**kwargs) + if kwargs or args: + result = self.model.transcribe(audio, *args, **kwargs) + else: + # if kwargs is empty but parsed anyway whisper + # will not use the default kwargs + + print("No kwargs parsed. Using default kwargs.") + result = self.model.transcribe(audio) + return result["text"] @staticmethod @@ -117,3 +126,21 @@ class Transcriber: _model = load_model(model, download_root=download_root) return cls(_model) + + @staticmethod + def _get_whisper_kwargs(**kwargs) -> dict: + """ + Get kwargs for whisper model. + Ensure that kwargs are valid. + :return: kwargs for whisper model + :rtype: dict + """ + _possible_kwargs = Whisper.transcribe.__code__.co_varnames + + whisper_kwargs = dict() + + for k in kwargs.keys(): + if k in _possible_kwargs: + whisper_kwargs[k] = kwargs[k] + + return whisper_kwargs \ No newline at end of file From 07acbc9464a00ac11f7b830ba1e340acd44aed84 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 14 Jun 2023 16:31:44 +0200 Subject: [PATCH 23/86] added dummy class for output --- autotranscript/transcript_exporter.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 autotranscript/transcript_exporter.py diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py new file mode 100644 index 0000000..956b398 --- /dev/null +++ b/autotranscript/transcript_exporter.py @@ -0,0 +1,23 @@ + +class Transcript: + """ + Class for storing transcript data + and exporting it to files in different formats + """ + def __init__(self, transcript: str) -> None: + """ + :param transcript: formated transcript string + """ + self.transcript = transcript + + def to_latex(self, path: str) -> None: + pass + + def to_pdf(self, path: str) -> None: + pass + + def to_txt(self, path: str) -> None: + pass + + def to_json(self, path: str) -> None: + pass \ No newline at end of file From edd6a0104c0cce4a0e300ddc48cfdfce8d190cf9 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 11:28:55 +0200 Subject: [PATCH 24/86] removed pydub and use ffmpeg remove dependencies. Droped pydub functionality and focuses on core components instead --- autotranscript/audio.py | 190 +++++++++++----------------------------- 1 file changed, 49 insertions(+), 141 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index 3175ca0..fe82041 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -1,109 +1,13 @@ import os from warnings import warn +import numpy as np import torch -from pydub import AudioSegment -from torchaudio import load, save +import ffmpeg +SAMPLE_RATE = 16000 class AudioProcessor: - def __init__(self, audio_file:str): - - self.audio = AudioSegment.from_file(audio_file, - format=audio_file.split('.')[-1]) - self.audio_file_path = audio_file - self.waveform = self.pydub_to_tensor[0] - self.sr = self.pydub_to_tensor[1] - - @property - def pydub_to_tensor(self): - """ - Converts pydub audio segment into np.float32 of shape - [duration_in_seconds*sample_rate, channels], - where each value is in range [-1.0, 1.0]. - Returns tuple (audio_np_array, sample_rate). - """ - audio = self.audio - x = torch.Tensor(audio.get_array_of_samples() - ).reshape((-1, audio.channels)) - y = (1 << (8 * audio.sample_width - 1)) - return x / y, audio.frame_rate - - def convert_audio(self, path: str, remove_orginal: bool = False, - *args, **kwargs) -> None: - """ - Convert and saves video file or other audio files to a different file type, - Can be used to ensure that the audio file is in the correct format - for the Whisper model. - :param path : path to save file - :param remove_orginal: remove original file - :param args: arguments for pydub.AudioSegment.export - :param kwargs: keyword arguments for pydub.AudioSegment.export - e.g. format - :return: None - """ - - self.audio.export(path, *args, **kwargs) - - if remove_orginal: - os.remove(self.audio_file_path) - print(f'File {self.audio_file_path} removed') - - self.audio_file_path = path - - - def to_mp3(self, *args, **kwargs) -> None: - """ - Convert audio file to mp3 file - :param file: audio file - :param remove_orginal: remove original file - :return: mp3 file path - """ - - warn(DeprecationWarning, "This function is deprecated," \ - "please use convert_audio instead") - - if "mp3" not in kwargs["format"]: - kwargs["format"] = "mp3" - - self.convert_audio(*args, **kwargs) - - def to_wav(self,*args, **kwargs) -> None: - """ - Convert audio file to wav file - :param file: audio file - :param remove_orginal: remove original file - :return: wav file path - """ - warn(DeprecationWarning, "This function is deprecated," \ - "please use convert_audio instead") - - if "wav" not in kwargs["format"]: - kwargs["format"] = "wav" - - self.convert_audio(*args, **kwargs) - - def slower_mp3(self, path: str, - speed: float = 0.75, - type: str = "mp3") -> None: - """ - Slow down mp3 file - :param file: mp3 file - :param speed: speed - :return: None - """ - - sound = self.audio_file - slow_sound = sound._spawn(sound.raw_data, overrides={ - "frame_rate": int(sound.frame_rate * speed) - }) - - slow_sound.export(path, format=type) - - return slow_sound - - -class TorchAudioProcessor: """ Audio Processor using PyTorchaudio instead of PyDub """ @@ -114,54 +18,27 @@ class TorchAudioProcessor: :param waveform: waveform :param sr: sample rate """ - self.waveform = waveform.reshape(-1) + self.waveform = waveform self.sr = sr if not isinstance(self.sr, int): raise ValueError("Sample rate should be a single value of type int," \ f"not {len(self.sr)} and type {type(self.sr)}") - @classmethod - def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': + def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor': """ Load audio file :param file: audio file :return: AudioProcessor """ - if not os.path.exists(file): - raise FileNotFoundError(f'File {file} not found') - if "format" not in kwargs: - kwargs["format"] = file.split('.')[-1] - - audio, sr = load(file , *args, **kwargs) + audio, sr = cls.load_audio(file , *args, **kwargs) + + audio = torch.from_numpy(audio) return cls(audio, sr) - @classmethod - def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor': - """ - Initialise audio processor using pydub audio segment. - pydub uses ffmped instead of SoX (which is used by torchaudio) - :param file: audio file - :return: TorchAudioProcessor - """ - audio = AudioProcessor(file) - - return cls(audio.waveform, audio.sr) - - @classmethod - def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor': - """ - Initialise audio processor using pydub audio segment. - - :param audio_processor: AudioProcessor object - :type audio_processor: AudioProcessor - :return: TorchAudioProcessor - :rtype: TorchAudioProcessor - """ - return cls(audio_processor.waveform, audio_processor.sr) def cut(self, start: float, end: float) -> torch.Tensor: """ @@ -182,21 +59,52 @@ class TorchAudioProcessor: end = torch.ceil(end * sr) return self.waveform[start:end.to(int)] - - def save(self, path: str, *args, **kwargs) -> None: + + @staticmethod + def load_audio(file: str, sr: int = SAMPLE_RATE): """ - Save audio file - :param path: path to save file - :return: None + Open an audio file and read as mono waveform, resampling as necessary + + Changed from original function at whisper.audio.load_audio to ensure compatibility + with pyannote.audio + Parameters + ---------- + file: str + The audio file to open + + sr: int + The sample rate to resample the audio if necessary + + Returns + ------- + A NumPy array containing the audio waveform, in float32 dtype. """ - if "format" not in kwargs: - kwargs["format"] = path.split('.')[-1] - - save(path, self.waveform, self.sr, *args, **kwargs) - + try: + # This launches a subprocess to decode audio while down-mixing + # and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="s16le", acodec="pcm_s16le", + ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], + capture_stdout=True, capture_stderr=True) + ) + except ffmpeg.Error as e: + raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e + + out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + + return out , sr def __repr__(self) -> str: return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' def __str__(self) -> str: return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' + + +if __name__ == "__main__": + + print("Testing AudioProcessor") + print(AudioProcessor.from_file("tests/test.wav")) \ No newline at end of file From 8ecc66cf2920b6450324a0d1335f81334fffc893 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 11:30:47 +0200 Subject: [PATCH 25/86] linting --- autotranscript/audio.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index fe82041..35b6f99 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -1,6 +1,3 @@ -import os -from warnings import warn - import numpy as np import torch import ffmpeg @@ -65,8 +62,8 @@ class AudioProcessor: """ Open an audio file and read as mono waveform, resampling as necessary - Changed from original function at whisper.audio.load_audio to ensure compatibility - with pyannote.audio + Changed from original function at whisper.audio.load_audio to ensure + compatibility with pyannote.audio Parameters ---------- file: str From 29e8a229dc120a0e139fd354fa1f6e7dfb435683 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 12:09:18 +0200 Subject: [PATCH 26/86] autotrancript works --- autotranscript/autotranscipt.py | 38 ++++++++++++++------------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py index c1225af..cbf2c9d 100644 --- a/autotranscript/autotranscipt.py +++ b/autotranscript/autotranscipt.py @@ -1,13 +1,11 @@ -from audio import AudioProcessor , TorchAudioProcessor - +from audio import AudioProcessor from diarisation import Diariser from transcriber import Transcriber, whisper -from whisper import Whisper from transcript_exporter import Transcript from typing import Union , TypeVar from tqdm import trange -from pprint import pprint import torch + diarisation = TypeVar('diarisation') @@ -35,6 +33,7 @@ class AutoTranscribe: if whisper_model is None: self.transcriber = Transcriber.load_model("medium", local=True) + elif isinstance(whisper_model, str): self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs) else: @@ -55,7 +54,8 @@ class AutoTranscribe: Transcribe audiofile with whisper model and pyannote diarization model :param audiofile: path to audiofile or torch.Tensor - :return: Transcript object + :return: Transcript object which contains the transcript and can be used to + export the transcript to differnt formats. """ audiofile = self.get_audiofile(audiofile) @@ -68,11 +68,13 @@ class AutoTranscribe: print("Starting diarisation.") - diarisation = self.diariser.diarization( dia_audio, + diarisation = self.diariser.diarization(dia_audio, *args , **kwargs) print("Diarisation finished. Starting transcription.") + audiofile.sr = torch.Tensor([audiofile.sr]).to(audiofile.waveform.device) + for i in trange(len(diarisation["segments"]), desc= "Transcribing"): seg = diarisation["segments"][i] @@ -84,12 +86,11 @@ class AutoTranscribe: final_transcript[i] = {"speaker" : diarisation["speakers"][i], "text" : transcript} - pprint(final_transcript) - #return Transcript(transcript, diarisation) + return Transcript(transcript, diarisation) @staticmethod def get_audiofile(audiofile : Union[str, torch.Tensor], - *args, **kwargs) -> TorchAudioProcessor: + *args, **kwargs) -> AudioProcessor: """ Get audiofile as TorchAudioProcessor @@ -99,22 +100,15 @@ class AutoTranscribe: waveform and sample_rate in torch.Tensor format. :rtype: TorchAudioProcessor """ + if isinstance(audiofile, str): - try: - audiofile = TorchAudioProcessor.from_file(audiofile) - except: - print("Could not load audiofile with torch audio." \ - "Trying ffmpeg. using pydub.") - audiofile = TorchAudioProcessor.from_ffmpeg(audiofile) + audiofile = AudioProcessor.from_file(audiofile) if isinstance(audiofile, torch.Tensor): - audiofile = TorchAudioProcessor(audiofile[0], audiofile[1]) + audiofile = AudioProcessor(audiofile[0], audiofile[1]) - if isinstance(audiofile, AudioProcessor): - audiofile = TorchAudioProcessor.from_audio_processor(audiofile) - - if not isinstance(audiofile, TorchAudioProcessor): - raise ValueError(f'Audiofile must be of type TorchAudioProcessor,' \ + if not isinstance(audiofile, AudioProcessor): + raise ValueError(f'Audiofile must be of type AudioProcessor,' \ f'not {type(audiofile)}') return audiofile @@ -122,4 +116,4 @@ class AutoTranscribe: if __name__ == "__main__": AudioTranscriber = AutoTranscribe() - AudioTranscriber.transcribe("/home/jacob/PycharmProjects/autotranscript/tests/Kathi_interview.mp3" , num_speaker=2) \ No newline at end of file + AudioTranscriber.transcribe("tests/test.wav") \ No newline at end of file From de1ca223976e4993dd2f2fcd5276a5bf3c556f57 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 12:09:53 +0200 Subject: [PATCH 27/86] added dict as input type --- autotranscript/diarisation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 3b64fac..ff3ead0 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -15,7 +15,7 @@ class Diariser: self.model = model - def diarization(self, audiofile : Union[str, Tensor] , + def diarization(self, audiofile : Union[str, Tensor, dict] , *args, **kwargs) -> Annotation: """ Diarization of audio file From 713dd3bfd5861e517d6660ff74614019fe2307df Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 12:10:11 +0200 Subject: [PATCH 28/86] added cuda support --- autotranscript/audio.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index 35b6f99..ea11fe8 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -9,13 +9,28 @@ class AudioProcessor: Audio Processor using PyTorchaudio instead of PyDub """ - def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None: + def __init__(self, waveform: torch.Tensor, sr : torch.Tensor, + *args, **kwargs) -> None: """ Initialise audio processor :param waveform: waveform :param sr: sample rate + :param args: additional arguments + :param kwargs: additional keyword arguments + example: + - device: device to use for processing + if cuda is available, cuda is used """ - self.waveform = waveform + + if "device" in kwargs: + device = kwargs["device"] + else: + if torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + self.waveform = waveform.to(device) self.sr = sr if not isinstance(self.sr, int): From 8a1bdda393febefa42250057a0a2112744665cda Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 12:11:13 +0200 Subject: [PATCH 29/86] added verbose dafault value to be false --- autotranscript/transcriber.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 57a3423..4fbf14b 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -44,16 +44,11 @@ class Transcriber: """ kwargs = self._get_whisper_kwargs(**kwargs) + + if "verbose" not in kwargs: + kwargs["verbose"] = False - if kwargs or args: - result = self.model.transcribe(audio, *args, **kwargs) - else: - # if kwargs is empty but parsed anyway whisper - # will not use the default kwargs - - print("No kwargs parsed. Using default kwargs.") - result = self.model.transcribe(audio) - + result = self.model.transcribe(audio, *args, **kwargs) return result["text"] @staticmethod From b3c9bcc482e857ad51dbac011118687a97956db0 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 12:13:56 +0200 Subject: [PATCH 30/86] fixed wrong Transcript class params --- autotranscript/autotranscipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py index cbf2c9d..906166a 100644 --- a/autotranscript/autotranscipt.py +++ b/autotranscript/autotranscipt.py @@ -86,7 +86,7 @@ class AutoTranscribe: final_transcript[i] = {"speaker" : diarisation["speakers"][i], "text" : transcript} - return Transcript(transcript, diarisation) + return Transcript(final_transcript) @staticmethod def get_audiofile(audiofile : Union[str, torch.Tensor], From 52efd41d21e1dfd5056abfa73401673a09a77dbc Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:00:22 +0200 Subject: [PATCH 31/86] added Transcriptor class which handles Transcription output --- autotranscript/transcript_exporter.py | 181 +++++++++++++++++++++++++- 1 file changed, 175 insertions(+), 6 deletions(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 956b398..ae6f1b6 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -1,23 +1,192 @@ +import json + +ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] + class Transcript: """ Class for storing transcript data and exporting it to files in different formats """ - def __init__(self, transcript: str) -> None: + def __init__(self, transcript: dict) -> None: """ :param transcript: formated transcript string """ self.transcript = transcript + self.speakers = self._extract_speakers() + self.segments = self._extract_segments() + self.annotation = {} - def to_latex(self, path: str) -> None: + def annotate(self, *args, **kwargs) -> dict: + """ + Annote transcript to define speaker names + + :param args: list of speaker names will maped sequentially to the speakers + :param kwargs: dict with speaker names as keys and list of segments as values + + :return: dict with speaker names as keys and list of segments as values + :rtype: dict + """ + + annotatios = {} + + if len(args) != len(self.speakers): + raise ValueError("Number of speaker names does not match number of speakers") + + if args: + for arg,ospeaker in zip(args,self.speakers): + annotatios[ospeaker] = arg + + if kwargs: + for key in kwargs: + if key not in self.speakers: + raise ValueError(f"{key} is not a speaker") + annotatios[key] = kwargs[key] + + self.annotation = annotatios + return annotatios + + def _extract_speakers(self) -> list: + """ + Extract speaker names from transcript + :return: list of speaker names + :rtype: list + """ + return list(set([self.transcript[id]["speaker"] for id in self.transcript])) + + def _extract_segments(self) -> list: + """ + Extract segments from transcript + + :return: list of segments + :rtype: list + """ + return [self.transcript[id]["segment"] for id in self.transcript] + + def __str__(self) -> str: + """ + Get transcript as string + + :return: transcript as string + :rtype: str + """ + fstring = "" + + for id in self.transcript: + seq = self.transcript[id] + + if self.annotation: + speaker = self.annotation[seq["speaker"]] + else: + speaker = seq["speaker"] + + fstring += f"{speaker}: {seq['text']}\n" + + return fstring + + def __repr__(self) -> str: + return f"Transcript(speakers = {self.speakers},"\ + f"segments = {self.segments}, annotation = {self.annotation})" + + def get_dict(self) -> dict: + """ + Get transcript as dict + + :return: transcript as dict + :rtype: dict + """ + + return self.transcript + + def get_json(self, *args, **kwargs) -> str: + """ + Get transcript as json string + :return: transcript as json string + :rtype: str + """ + if "indent" not in kwargs: + kwargs["indent"] = 4 + return json.dumps(self.transcript, *args, **kwargs) + + def get_html(self) -> str: + """ + Get transcript as html string + + :return: transcript as html string + :rtype: str + """ + html = "

" + self.__str__().replace("\n", "
") + "

" + html = "" + html + "" + html = html.replace("\t", "    ") + + return html + + + def get_md(self) -> str: + return self.get_html() + + def get_tex(self) -> str: + + if not self.annotation: + + self.annotate(*ALPHABET[:len(self.speakers)]) + + fstring ="\\begin{drama}" + + for speaker in self.speakers: + + fstring += "\n\t\\Character{"+ str(self.annotation[speaker]) + "}" \ + "{"+ str(self.annotation[speaker]) + "}" + + for id in self.transcript: + seq = self.transcript[id] + speaker = self.annotation[seq["speaker"]] + fstring += f"\n\\{speaker}speaks:\n{seq['text']}" + + fstring += "\n\\end{drama}" + + return fstring + + + def to_json(self,path, *args, **kwargs) -> None: + """ + Save transcript as json file + :param path: path to save file + :type path: str + """ + with open(path, "w") as f: + json.dump(self.transcript, f, *args, **kwargs) + + def to_txt(self, path: str) -> None: + + with open(path, "w") as f: + f.write(self.__str__, f) + + def to_md(self, path: str) -> None: + return self.to_html(path) + + def to_html(self, path: str) -> None: + """ + Save transcript as html file + + :param path: path to save file + :type path: str + """ + + with open(path, "w") as file: + file.write(self.get_html()) + + def to_tex(self, path: str) -> None: pass def to_pdf(self, path: str) -> None: pass - def to_txt(self, path: str) -> None: - pass +if __name__ == "__main__": + test = Transcript(json.load(open("tests/test.json", "r"))) + print(repr(test)) + print(test) - def to_json(self, path: str) -> None: - pass \ No newline at end of file + + + \ No newline at end of file From cdfa872482e35fc4a85c995f3e20f65a0dae21e5 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:00:39 +0200 Subject: [PATCH 32/86] added segments to out dict --- autotranscript/autotranscipt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py index 906166a..792dcdd 100644 --- a/autotranscript/autotranscipt.py +++ b/autotranscript/autotranscipt.py @@ -84,8 +84,8 @@ class AutoTranscribe: transcript = self.transcriber.transcribe(audio, *args , **kwargs) final_transcript[i] = {"speaker" : diarisation["speakers"][i], + "segment" : seg, "text" : transcript} - return Transcript(final_transcript) @staticmethod From 4f416f26f9067d191097eee6604d544ba959d57f Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:03:25 +0200 Subject: [PATCH 33/86] changed wrong file name --- autotranscript/{autotranscipt.py => autotranscript.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename autotranscript/{autotranscipt.py => autotranscript.py} (100%) diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscript.py similarity index 100% rename from autotranscript/autotranscipt.py rename to autotranscript/autotranscript.py From c4c62c8ae150772e088d835bbb96ce8cfff5d3d1 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:06:09 +0200 Subject: [PATCH 34/86] added new file --- autotranscript/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 5aea052..ef47226 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,4 +1,5 @@ from autotranscript.__main__ import * +from autotranscript.autotranscript import * from autotranscript.transcriber import * from autotranscript.audio import * from autotranscript.transcript_exporter import * From e4e5cfb4bc3d8362eb5b78264d21a5ff2db24d32 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:06:18 +0200 Subject: [PATCH 35/86] linting --- autotranscript/transcript_exporter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index ae6f1b6..37092c8 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -31,7 +31,8 @@ class Transcript: annotatios = {} if len(args) != len(self.speakers): - raise ValueError("Number of speaker names does not match number of speakers") + raise ValueError("Number of speaker names "\ + "does not match number of speakers") if args: for arg,ospeaker in zip(args,self.speakers): From 61121aad928d3629fede7eee4d70e998ae6d26dc Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:09:33 +0200 Subject: [PATCH 36/86] updated version --- autotranscript/version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotranscript/version.py b/autotranscript/version.py index 0a1a41e..5bc7ffc 100644 --- a/autotranscript/version.py +++ b/autotranscript/version.py @@ -1,8 +1,8 @@ import os import subprocess as sp -MAJOR = 1 -MINOR = 0 +MAJOR = 0 +MINOR = 2 MICRO = 0 MICRO_POST = 0 ISRELEASED = False From a653f0b05d874c4677420b7d64778f86d031947a Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:09:44 +0200 Subject: [PATCH 37/86] added new example usage --- transcribe.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/transcribe.py b/transcribe.py index e7c62fa..6601707 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,3 +1,7 @@ -from autotranscript import AutoTranscribe +from autotranscript.autotranscript import AutoTranscribe -AutoTranscribe(diarisation=True).transcribe() +model = AutoTranscribe() + +text = model.transcribe("tests/test.wav") + +print(text) From 1d25d61fa27e98d2c0f3b265f62f63e201c40d0a Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:09:53 +0200 Subject: [PATCH 38/86] linting --- autotranscript/transcriber.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 4fbf14b..e4d587a 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -1,7 +1,6 @@ import os -from whisper import Whisper +from whisper import Whisper, load_model from typing import TypeVar , Union -from whisper import load_model from glob import glob whisper = TypeVar('whisper') From 3ef7353db5384c1b350a166ea69b9408fd205fba Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 16 Jun 2023 15:10:08 +0200 Subject: [PATCH 39/86] changed module imports --- autotranscript/autotranscript.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 792dcdd..8cb7e8a 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -1,7 +1,7 @@ -from audio import AudioProcessor -from diarisation import Diariser -from transcriber import Transcriber, whisper -from transcript_exporter import Transcript +from autotranscript.audio import AudioProcessor +from autotranscript.diarisation import Diariser +from autotranscript.transcriber import Transcriber, whisper +from autotranscript.transcript_exporter import Transcript from typing import Union , TypeVar from tqdm import trange import torch From 7bfd294bbd1592cd80afb47d670c719f82f07830 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 11:52:21 +0200 Subject: [PATCH 40/86] fixed bug when only one speaker exists --- autotranscript/diarisation.py | 39 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index ff3ead0..931d395 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -53,38 +53,39 @@ class Diariser: # Sometimes two consecutive speakers are the same # This loop removes these duplicates ### - - - for i, (_, _, speaker) in enumerate(dia_list): + + if len(dia_list) == 1: + normalized_output.append([0, 0, dia_list[0]]) + else: - if i == 0: - current_speaker = speaker + for i, (_, _, speaker) in enumerate(dia_list): + if i == 0: + current_speaker = speaker - if speaker != current_speaker: + if speaker != current_speaker: - index_end_speaker = i - 1 + index_end_speaker = i - 1 - normalized_output.append([index_start_speaker, - index_end_speaker, - current_speaker]) + normalized_output.append([index_start_speaker, + index_end_speaker, + current_speaker]) - index_start_speaker = i - current_speaker = speaker + index_start_speaker = i + current_speaker = speaker - if i == len(diarization_output["speakers"]) - 1: + if i == len(diarization_output["speakers"]) - 1: - index_end_speaker = i - normalized_output.append([index_start_speaker, - index_end_speaker, - current_speaker]) - + index_end_speaker = i + normalized_output.append([index_start_speaker, + index_end_speaker, + current_speaker]) + for outp in normalized_output: start = dia_list[outp[0]][0].start end = dia_list[outp[1]][0].end diarization_output["segments"].append([start, end]) diarization_output["speakers"].append(outp[2]) - return diarization_output @staticmethod From d3606a2dab5c2e8ad6dd001000eb203bf681a1c5 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 12:01:18 +0200 Subject: [PATCH 41/86] removed dependencie on ffmpeg python will be dropped in future whisper realeases --- autotranscript/audio.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index ea11fe8..4e7ee60 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -1,6 +1,6 @@ import numpy as np import torch -import ffmpeg +from subprocess import CalledProcessError, run SAMPLE_RATE = 16000 @@ -91,18 +91,24 @@ class AudioProcessor: ------- A NumPy array containing the audio waveform, in float32 dtype. """ + # This launches a subprocess to decode audio while down-mixing + # and resampling as necessary. Requires the ffmpeg CLI in PATH. + # fmt: off + cmd = [ + "ffmpeg", + "-nostdin", + "-threads", "0", + "-i", file, + "-f", "s16le", + "-ac", "1", + "-acodec", "pcm_s16le", + "-ar", str(sr), + "-" + ] + # fmt: on try: - # This launches a subprocess to decode audio while down-mixing - # and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="s16le", acodec="pcm_s16le", - ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], - capture_stdout=True, capture_stderr=True) - ) - except ffmpeg.Error as e: + out = run(cmd, capture_output=True, check=True).stdout + except CalledProcessError as e: raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 From 280cfa3c35e391c752d2b7b811f214c883c47f81 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 12:11:28 +0200 Subject: [PATCH 42/86] fixed but where speaker dict included segment informations --- autotranscript/diarisation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 931d395..5b71f88 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -55,7 +55,7 @@ class Diariser: ### if len(dia_list) == 1: - normalized_output.append([0, 0, dia_list[0]]) + normalized_output.append([0, 0, dia_list[0][2]]) else: for i, (_, _, speaker) in enumerate(dia_list): @@ -158,4 +158,3 @@ class Diariser: def __str__(self): return f"Diarisation(model={self.model})" - From 979a2320f002be99e6bca0869d8d74ac6741bdee Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 13:36:17 +0200 Subject: [PATCH 43/86] added file removal --- autotranscript/autotranscript.py | 58 ++++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 8cb7e8a..9f4100e 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -5,6 +5,10 @@ from autotranscript.transcript_exporter import Transcript from typing import Union , TypeVar from tqdm import trange import torch +import os +from glob import iglob +from subprocess import run +from warnings import warn diarisation = TypeVar('diarisation') @@ -49,11 +53,14 @@ class AutoTranscribe: print("AutoTranscribe initialized all models successfully loaded.") def transcribe(self, audiofile : Union[str, torch.Tensor], + remove_original : bool = False, *args, **kwargs) -> Transcript: """ Transcribe audiofile with whisper model and pyannote diarization model :param audiofile: path to audiofile or torch.Tensor + :param remove_original: if True the original audiofile will be removed after + transcription. :return: Transcript object which contains the transcript and can be used to export the transcript to differnt formats. """ @@ -86,8 +93,51 @@ class AutoTranscribe: final_transcript[i] = {"speaker" : diarisation["speakers"][i], "segment" : seg, "text" : transcript} + + if remove_original: + if kwargs.get("shred") is True: + self.remove_audio_file(audiofile, shred=True) + else: + self.remove_audio_file(audiofile, shred=False) + return Transcript(final_transcript) + @staticmethod + def remove_audio_file(audiofile : str, + shred : bool = False) -> None: + """ + removes orginal audiofile to avoid disk space problems + + or to enshure data privacy + + :param audiofile: path to audiofile + :param shred: if True audiofile will be shredded and not only removed + + """ + if not os.path.exists(audiofile): + raise ValueError(f"Audiofile {audiofile} does not exist.") + + if shred: + + warn("Shredding audiofile can take a long time.", RuntimeWarning) + + gen = iglob(f'{audiofile}', recursive=True) + cmd = ['shred', '-zvu', '-n', '10', f'{audiofile}'] + + if os.path.isdir(audiofile): + raise ValueError(f"Audiofile {audiofile} is a directory.") + + for file in gen: + print(f'shredding {file} now\n') + + run(cmd , check=True) + + else: + os.remove(audiofile) + print(f"Audiofile {audiofile} removed.") + + + @staticmethod def get_audiofile(audiofile : Union[str, torch.Tensor], *args, **kwargs) -> AudioProcessor: @@ -110,10 +160,4 @@ class AutoTranscribe: if not isinstance(audiofile, AudioProcessor): raise ValueError(f'Audiofile must be of type AudioProcessor,' \ f'not {type(audiofile)}') - return audiofile - - -if __name__ == "__main__": - - AudioTranscriber = AutoTranscribe() - AudioTranscriber.transcribe("tests/test.wav") \ No newline at end of file + return audiofile \ No newline at end of file From 7909d6d507638c03ece3a133815697e46d109263 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:00:31 +0200 Subject: [PATCH 44/86] add save for different types of files --- autotranscript/transcript_exporter.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 37092c8..16d5e09 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -82,7 +82,6 @@ class Transcript: speaker = seq["speaker"] fstring += f"{speaker}: {seq['text']}\n" - return fstring def __repr__(self) -> str: @@ -183,6 +182,29 @@ class Transcript: def to_pdf(self, path: str) -> None: pass + def save(self, path: str, *args, **kwargs) -> None: + """ + Save transcript to file with given path and file format + + :param path: path to save file + :type path: str + :raises ValueError: if file format is unknown + """ + if path.endswith(".json"): + self.to_json(path, *args, **kwargs) + elif path.endswith(".txt"): + self.to_txt(path, *args, **kwargs) + elif path.endswith(".md"): + self.to_md(path, *args, **kwargs) + elif path.endswith(".html"): + self.to_html(path, *args, **kwargs) + elif path.endswith(".tex"): + self.to_tex(path, *args, **kwargs) + elif path.endswith(".pdf"): + self.to_pdf(path, *args, **kwargs) + else: + raise ValueError("Unknown file format") + if __name__ == "__main__": test = Transcript(json.load(open("tests/test.json", "r"))) print(repr(test)) From 4cb774007d62a24c5f8b85ade5e3d1dbd7ce4c09 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:06:16 +0200 Subject: [PATCH 45/86] added save function to export to json --- autotranscript/diarisation.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 5b71f88..070fc2d 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,8 +1,9 @@ -from pyannote.audio import Pipeline +from .audio import Pipeline from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization from torch import Tensor import os from typing import TypeVar, Union +import json Annotation = TypeVar('Annotation') @@ -88,6 +89,18 @@ class Diariser: diarization_output["speakers"].append(outp[2]) return diarization_output + def save(self, path : str, *args, **kwargs) -> None: + """ + Save diarization output to a file + + :param path: path to save file + :type path: str + """ + with open(path, "w") as f: + json.dump(self.transcript, f, *args, **kwargs) + + + @staticmethod def _get_token(): # check ig .pyannotetoken.txt exists From 65c2cbfd91e474416c42af8c35fd7145147d3bb5 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:22:50 +0200 Subject: [PATCH 46/86] removed file --- autotranscript/__main__.py | 497 ------------------------------------- 1 file changed, 497 deletions(-) delete mode 100644 autotranscript/__main__.py diff --git a/autotranscript/__main__.py b/autotranscript/__main__.py deleted file mode 100644 index 19d5145..0000000 --- a/autotranscript/__main__.py +++ /dev/null @@ -1,497 +0,0 @@ - -import whisper -from time import time, sleep -import os -import glob -import re -import shutil -import sys -from tqdm import tqdm - -from typing import Union -from pydub import AudioSegment - -from pyannote.audio import Pipeline - -class AudioProcessor: - def __init__(self, audio_file:str): - self.audio_file_path = audio_file - self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1]) - - self.audiofilename = audio_file.split('/')[-1][:-4] - self.coreaudiofile = audio_file.split('/')[-1][:-4] - self.audiofilefolder = os.path.dirname(audio_file) - self.audio_file_type = audio_file.split('.')[-1] - - - - def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True): - """ - Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the - Whisper model - :param file: path to audio or video file - :param remove_orginal: remove original file - :return: mp3 file path - """ - print(f'Converting {self.audiofilename} to .{type} file') - - if savefolder == "": - savefolder = self.audiofilefolder - - if savename == "": - savename = self.coreaudiofile + f'.{type}' - else: - savename = savename + f'.{type}' - - savepath = os.path.join(savefolder, savename) - - self.audio_file.export(savepath, format=type) - - print(f'Converted {self.audiofilename} to {type}') - - if remove_orginal: - os.remove(self.audio_file_path) - print(f'File {self.audio_file_path} removed') - - self.audio_file_path = savepath - self.audio_file = AudioSegment.from_file(savepath, format=type) - - return self - - def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): - """ - Convert audio file to mp3 file - :param file: audio file - :param remove_orginal: remove original file - :return: mp3 file path - """ - return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal) - - def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True): - """ - Convert audio file to wav file - :param file: audio file - :param remove_orginal: remove original file - :return: wav file path - """ - return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal) - - def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"): - """ - Slow down mp3 file - :param file: mp3 file - :param speed: speed - :return: None - """ - if savefolder == "": - savefolder = self.audiofilefolder - else: - savefolder = savefolder - - sound = self.audio_file - slow_sound = sound._spawn(sound.raw_data, overrides={ - "frame_rate": int(sound.frame_rate * speed) - }) - - speedstr = str(speed).replace('.', '') - - file_out = self.coreaudiofile + f'_{speedstr}.{type}' - - save_path = os.path.join(savefolder, file_out) - - slow_sound.export(save_path, format=type) - - return slow_sound - -class WhisperTranscription: - def __init__(self, audio_file: str , model, language: str = "German"): - - self.audio_file = audio_file - self.model = model - self.language = language - - def transcribe(self, language:str = "German"): - """ - Transcribe audio file - - language: language of the audio file - :return: transcript as string - """ - - audiofilename = self.audio_file.split('/')[-1] - #print(f'Start transcribing Audio file: {audiofilename}') - - _stime = time() - result = self.model.transcribe(self.audio_file, language=self.language) - - #print(f'Transcription finished in {time() - _stime} seconds') - - self.transcript = result - - return result["text"] - - def save_transcript(self, transcript:str = "", savefolder : str = "", savename: str = ""): - """ - Save transcript to file - :param transcript: transcript as string - :param savefolder: folder to save transcript - :param savename: name of the transcript file - :return: None - """ - if savefolder == "": - savefolder = os.path.dirname(self.audio_file) - else: - savefolder = savefolder - - if savename == "": - savename = self.audio_file.split('/')[-1][:-4] + '.txt' - else: - savename = savename - - if transcript == "": - transcript = self.transcript["text"] - - savepath = os.path.join(savefolder, savename) - - with open(savepath, 'w') as f: - f.write(transcript) - - print(f'Transcript saved to {savepath}') - -class Diarisation(AudioProcessor): - def __init__(self, audio_file: str, model,**kwargs): - - super().__init__(audio_file=audio_file) - - self.model = model - - - def diarization(self, *args, **kwargs): - - if "num_speakers" in kwargs: - num_speakers = kwargs['num_speakers'] - kwargs.pop('num_speakers') - else: - num_speakers = 2 - - audiofilename = self.coreaudiofile - - print(f'Start diarization of audio file: {self.audiofilename}') - - _stime = time() - - diarization = self.model(self.audio_file_path, num_speakers=num_speakers) - - print(f'Diarization finished in {time() - _stime} seconds') - self.diarization = diarization - - return diarization - - def format_diarization_output(self, *args, **kwargs): - """ - Format diarization output to a list of tuples - :param args: - :param kwargs: - :return: dict with speaker names as keys and list of tuples as values and list of different speakers - """ - - diarization_output = {"speakers": [], "segments": []} - - if not hasattr(self, 'diarization'): - # ensure diarization is run before formatting - self.diarization = self.diarization() - - - for segment, _, speaker in self.diarization.itertracks(yield_label=True): - diarization_output["speakers"].append(speaker) - diarization_output["segments"].append(segment) - - normalized_output = [] - index_start_speaker = 0 - index_end_speaker = 0 - current_speaker = str() - - for i, speaker in enumerate(diarization_output["speakers"]): - - if i == 0: - current_speaker = speaker - - if speaker != current_speaker: - - index_end_speaker = i - 1 - - normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) - - index_start_speaker = i - current_speaker = speaker - - if i == len(diarization_output["speakers"]) - 1: - - index_end_speaker = i - normalized_output.append([index_start_speaker, index_end_speaker, current_speaker]) - - - self.normalized_output = normalized_output - self.diarization_output = diarization_output - - return diarization_output,normalized_output - - def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs): - """ - Create temporary wav file for diarization - :param savefolder: folder to save the temporary wav file - :param savename: name of the temporary wav file prefix - :param audiofile: audio file - :return: temporary wav file - """ - - - if savefolder == "": - folder = '.temp' - if not os.path.exists(folder): - os.makedirs(folder) - else: - folder = savefolder - - folder = os.path.realpath(folder) - - if savename == "": - savename = self.coreaudiofile + '.wav' - else: - savename = savename - - - if not os.path.exists(folder): - os.makedirs(folder) - - if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'): - self.format_diarization_output() - - - speaker = set(self.diarization_output["speakers"]) - num_speak_iter = [0 for _ in range(len(speaker))] - - for count, outp in enumerate(self.normalized_output): - start = self.diarization_output["segments"][outp[0]].start - end = self.diarization_output["segments"][outp[1]].end - - print("start: ", start) - print("end: ", end) - - start_milliseconds = start * 1000 - end_milliseconds = end * 1000 - - print("start_milliseconds: ", start_milliseconds) - print("end_milliseconds: ", end_milliseconds) - - print("cut audio") - - cut_audio = self.audio_file[start_milliseconds:end_milliseconds] - - print("save audio") - print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav") - cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav") - - return os.path.realpath(folder) - - def __repr__(self): - return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" - def __str__(self): - return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})" - - -class AutoTranscribe: - def __init__(self, audiofile: Union[str, bool, list] = None, - model: str = "medium", - language: str = "German", - diarisation: bool = False, - audioinput: str = "audiofiles", - transcriptionout: str = "transcriptions", - *args, **kwargs): - """ - AutoTranscribe - :param audiofile: audio file or list of audio files to transcribe - :param model: model name (default: medium) - :param language: language (default: German) - :param diarisation: diarisation (default: False) - """ - if audiofile is None: - audiofile = os.listdir(audioinput) # get all audio files in audioinput folder - audiofile = [os.path.realpath(os.path.join(audioinput, file)) for file in audiofile]# add path to audio files - - self.audiofile = audiofile - self.language = language - self.diarisation = diarisation - if diarisation: - print("Diarisation is enabled") - print("Load Diarisation model") - self.diarisation_model = Pipeline.from_pretrained("pyannote/speaker-diarization", - use_auth_token = self._get_token()) - print("Load Diarisation model done") - - print(f"Load Whisper model {model}") - self.model = whisper.load_model(model) - print(f"Load Whisper model {model} done") - - self.currentpath, \ - self.audiopath, \ - self.transcriptionpath, \ - self.audiofiles = self.create_folder_structure(audioinput, transcriptionout) # create folder structure - - - - def transcribe(self, *args, **kwargs): - - if isinstance(self.audiofile, str): - for i in range(len(self.audiofiles)): - if self.audiofile in self.audiofiles[i]: - self.audiofile = [self.audiofiles[i]] - break - - audiolist = self.audiofile - - elif isinstance(self.audiofile, list): - audiolist = self.audiofile - else: - audiolist = self.audiofiles - - if not set(audiolist).issubset(set(self.audiofiles)): - raise ValueError(f"Audio file {self.audiofile} not found in {self.audiopath}") - - - for audiofile in audiolist: - _start = time() - if not "/" in audiofile: - audiofile = os.path.join(self.audiopath, audiofile) - - if not self.check_if_already_transcribed (audiofile): - - audio = AudioProcessor(audiofile) - - if not audiofile.endswith('wav'): - audio = audio.to_wav() - self.audiofile = audio.audio_file_path - audiofile = audio.audio_file_path - - if "speed" in kwargs: - speed = kwargs['speed'] - kwargs.pop('speed') - - print('Creating slower version of the audio file with speed {}'.format(speed)) - slower_audio = os.path.join(self.transcriptionpath, 'slower_version') - if not os.path.exists(slower_audio): - os.makedirs(slower_audio) - audio.slower_mp3(savefolder=slower_audio,speed=speed) - - if not self.diarisation: - WhisperTranscription(audiofile, self.model, self.language - ).save_transcript(savefolder = self.transcriptionpath) - - else: - print("Start diarisation") - dia = Diarisation(audiofile, self.diarisation_model) - - if 'num_speakers' in kwargs: - num_speakers = kwargs['num_speakers'] - kwargs.pop('num_speakers') - dia.diarization(num_speakers=num_speakers) - else: - dia.diarization() - - temppath = dia.create_temporary_wav() - temppath_dict, _ = dia.format_diarization_output() - speakers = list(set(temppath_dict["speakers"])) - - - fstring = "\\begin{drama}" - - for speaker in speakers: - speaker = speaker.replace("SPEAKER_", "") - fstring += "\n\t\Character{S"+ str(speaker) + "}{S" + str(speaker) + "}" - - - files = glob.glob(temppath + "/*.wav") - - # Sort files according to the digits included in the filename - files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0])) - - for file in tqdm(files): - - Whisper = WhisperTranscription(file, self.model, self.language).transcribe() - - for s in speakers: - if s in file: - s = s.replace("SPEAKER_", "") - fstring += f"\n\S{s}speaks: \n {Whisper}" - - fstring += "\n\end{drama}" - - print(fstring) - - with open(os.path.join(self.transcriptionpath, - os.path.basename(audiofile).split('.')[0] + '.tex'), 'w') as f: - f.write(fstring) - - print("Remove temporary files") - shutil.rmtree(temppath) - - print(f"Transcription of {audiofile} done in total of {time() - _start} seconds") - - def create_folder_structure(self, audiopath: str, transcriptionout: str): - """ - Create folder structure for audio and transcription files - - :return: currentpath, audiopath, transcriptionpath, audiofiles - """ - currentpath = os.path.dirname(sys.argv[0]) # get executable path - - if not os.path.exists(os.path.join(currentpath, audiopath)): - print('Creating audiofiles folder') - os.makedirs(os.path.join(currentpath, audiopath)) - if not os.path.exists(os.path.join(currentpath, transcriptionout)): - print('Creating transcription folder') - os.makedirs(os.path.join(currentpath, transcriptionout)) - - audiopath = os.path.join(currentpath, audiopath) # path to audio files - transcriptionpath = os.path.join(currentpath, transcriptionout) # path to transcription files - - - _audiofiles = os.listdir(audiopath) # list of audio files - audiofiles = [] - for i in _audiofiles: - audiofiles.append(os.path.join(audiopath, i)) - - return currentpath, audiopath, transcriptionpath, audiofiles - - def check_if_already_transcribed (self, filename: str): - """ - Check if all audio files are already transcribed - :param filename: audio file name - :return: bool - """ - purefilename = filename.split('/')[-1][:-4] - _files = os.listdir(self.transcriptionpath) - for i,f in enumerate(_files): - _files[i] = f[:-4] - - if purefilename in _files: - print(f'File {purefilename[:-4]} already transcribed') - return True - else: - return False - @classmethod - def _get_token(self): - # check ig .pyannotetoken.txt exists - path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken') - if os.path.exists(path): - with open(path, 'r') as f: - token = f.read() - else: - raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token' - ' and save it in a file called .pyannotetoken.txt') - return token - - def __repr__(self): - return f"AutoTranscribe(audiofile={self.audiofile}, model={self.model}, language={self.language}, diarisation={self.diarisation})" - def __call__(self, *args, **kwargs): - return self.transcribe(*args, **kwargs) From a5e051cbfbc7c6e5bca455778024ec316b1051b4 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:23:23 +0200 Subject: [PATCH 47/86] added cli --- autotranscript/autotranscript.py | 112 +++++++++++++++++++++++++++++-- 1 file changed, 107 insertions(+), 5 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 9f4100e..0a29528 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -1,7 +1,7 @@ -from autotranscript.audio import AudioProcessor -from autotranscript.diarisation import Diariser -from autotranscript.transcriber import Transcriber, whisper -from autotranscript.transcript_exporter import Transcript +from .audio import AudioProcessor +from .diarisation import Diariser +from .transcriber import Transcriber, whisper +from .transcript_exporter import Transcript from typing import Union , TypeVar from tqdm import trange import torch @@ -9,6 +9,8 @@ import os from glob import iglob from subprocess import run from warnings import warn +import argparse + diarisation = TypeVar('diarisation') @@ -160,4 +162,104 @@ class AutoTranscribe: if not isinstance(audiofile, AudioProcessor): raise ValueError(f'Audiofile must be of type AudioProcessor,' \ f'not {type(audiofile)}') - return audiofile \ No newline at end of file + return audiofile + + +def cli(): + from whisper import available_models + from whisper.utils import get_writer + from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE + from .transcriber import WHISPER_DEFAULT_PATH + def str2bool(string): + str2val = {"True": True, "False": False} + if string in str2val: + return str2val[string] + else: + raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") + + + # fmt: off + parser = argparse.ArgumentParser(formatter_class= + argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument("audio", nargs="+", type=str, + help="audio file(s) to transcribe") + + parser.add_argument("--wmodel", default="medium", + help="name of the Whisper model to use") + parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH, + help="the path to save model files; uses ./models/whisper by default") + + parser.add_argument("--device", + default="cuda" if torch.cuda.is_available() else "cpu", + help="device to use for PyTorch inference") + parser.add_argument("--threads", type=int, default=0, + help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") + + parser.add_argument("--output_dir", "-o", type=str, default=".", + help="directory to save the outputs") + parser.add_argument("--output_format", "-f", type=str, default="txt", + choices=["txt", "json", "md", "html"], + help="format of the output file; if not specified, all available formats will be produced") + + parser.add_argument("--verbose", type=str2bool, default=True, + help="whether to print out the progress and debug messages") + + parser.add_argument("--task", type=str, default="transcribe", + choices=["transcribe", "diarize","wtranscribe"], + help="whether to perfrom transcription and diazation or only one of them") + parser.add_argument("--language", type=str, default=None, + choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), + help="language spoken in the audio, specify None to perform language detection") + + # fmt: on + + args = parser.parse_args().__dict__ + model_name: str = args.pop("wmodel") + model_dir: str = args.pop("wmodel_dir") + output_dir: str = args.pop("output_dir") + output_format: str = args.pop("output_format") + task = args.pop("task") + device: str = args.pop("device") + os.makedirs(output_dir, exist_ok=True) + + if (threads := args.pop("threads")) > 0: + torch.set_num_threads(threads) + + wkwargs = {"download_root": model_dir, + "device": device, + "language" : args.pop("language")} + + model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs) + + if task == "transcribe": + for audio in args.pop("audio"): + out = model.transcribe(audio) + basename = audio.split("/")[-1].split(".")[0] + spath = f"{output_dir}/{basename}.{output_format}" + out.save(spath) + + elif task == "diarize": + warn("Diarization is still in beta and may not work as expected.", + RuntimeWarning) + for audio in args.pop("audio"): + out = model.diariser.diarization(audio) + basename = audio.split("/")[-1].split(".")[0] + spath = f"{output_dir}/{basename}.json" + + print(f"diairization results saved to {spath}") + + out.save(spath) + + elif task == "wtranscribe": + writer = get_writer(output_format, output_dir) + warn("whisper transcription is poorly supported and may not work as expected." \ + "It is recommendet to use the whisper cli directly", + RuntimeWarning) + for audio in args.pop("audio"): + out = model.transcriber.transcribe(audio, diarisation=True) + basename = audio.split("/")[-1].split(".")[0] + writer(out, audio) + +if __name__ == "__main__": + cli() \ No newline at end of file From bbe27cf6169d9cfc02f88edecb7c386d4908088a Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:24:36 +0200 Subject: [PATCH 48/86] support cli --- setup.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index d6884d3..0c00dad 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import os from setuptools import setup, find_packages module_name = "autotranscript" -github_url = "https://github.com/Jaikinator/transcriptor" +github_url = "https://github.com/JSchmie/autotranscript" file_dir = os.path.dirname(os.path.realpath(__file__)) absdir = lambda p: os.path.join(file_dir, p) @@ -17,7 +17,7 @@ with open(verfile, "r") as fp: ############### setup ############### -build_version = "OPTB_BUILD" in os.environ +build_version = "AUTOTRANSCRIPT_BUILD" in os.environ setup( name=module_name, @@ -34,5 +34,6 @@ setup( author='Jacob Schmieder', author_email='', description='Transcription tool for audio files based on Whisper', - #entry_points={'console_scripts': ['autotranscript = autotranscript.__main__:main']} + entry_points={'console_scripts': + ['autotranscript = autotranscript.autotranscript:cli']} ) From ae9a125d127727cb3a58452d83ae36370a3fbfd6 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:56:23 +0200 Subject: [PATCH 49/86] changed dependencies --- autotranscript/__init__.py | 15 +++++++-------- autotranscript/diarisation.py | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index ef47226..4812cc2 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,10 +1,9 @@ -from autotranscript.__main__ import * -from autotranscript.autotranscript import * -from autotranscript.transcriber import * -from autotranscript.audio import * -from autotranscript.transcript_exporter import * -from autotranscript.diarisation import * -from autotranscript.version import get_version as _get_version -from autotranscript.misc import * +from .autotranscript import * +from .transcriber import * +from .audio import * +from .transcript_exporter import * +from .diarisation import * +from .version import get_version as _get_version +from .misc import * __version__ = _get_version() diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 070fc2d..ea36b93 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,4 +1,4 @@ -from .audio import Pipeline +from pyannote.audio import Pipeline from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization from torch import Tensor import os From 66e73e1c6ba638c26a81cc21f271824552fb43fa Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 15:56:46 +0200 Subject: [PATCH 50/86] added kwargs support for load model --- autotranscript/transcriber.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index e4d587a..39c0842 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -6,7 +6,6 @@ from glob import glob whisper = TypeVar('whisper') Tensor = TypeVar('Tensor') nparray = TypeVar('nparray') -Transcriber = TypeVar('Transcriber') def get_whisper_default_path() -> str: """ @@ -69,7 +68,8 @@ class Transcriber: def load_model(cls, model: str = "medium", local : bool = True, - download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber: + download_root: str = WHISPER_DEFAULT_PATH , + *args, **kwargs) -> 'Transcriber': """ Load whisper module @@ -117,7 +117,7 @@ class Transcriber: "model first. By deactivating the local flag, " / "the model will be downloaded automatically.") - _model = load_model(model, download_root=download_root) + _model = load_model(model, download_root=download_root, *args, **kwargs) return cls(_model) From 57fd73c8ee9b98a401d7490025dd3af34c8129ad Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 16:30:23 +0200 Subject: [PATCH 51/86] added functionallity to select diarisation model using cli --- autotranscript/autotranscript.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 0a29528..2097f2f 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -38,8 +38,7 @@ class AutoTranscribe: """ if whisper_model is None: - self.transcriber = Transcriber.load_model("medium", local=True) - + self.transcriber = Transcriber.load_model("medium", local=True) elif isinstance(whisper_model, str): self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs) else: @@ -170,6 +169,7 @@ def cli(): from whisper.utils import get_writer from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE from .transcriber import WHISPER_DEFAULT_PATH + from .diarisation import PYANNOTE_DEFAULT_PATH def str2bool(string): str2val = {"True": True, "False": False} if string in str2val: @@ -190,6 +190,10 @@ def cli(): parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH, help="the path to save model files; uses ./models/whisper by default") + parser.add_argument("--dia_model", type=str, default = PYANNOTE_DEFAULT_PATH) + + parser.add_argument("--allow_download", type= bool, default=True, + help="whether to allow model download if model is not found locally") parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") @@ -219,6 +223,7 @@ def cli(): model_dir: str = args.pop("wmodel_dir") output_dir: str = args.pop("output_dir") output_format: str = args.pop("output_format") + local :str = args.pop("allow_download") task = args.pop("task") device: str = args.pop("device") os.makedirs(output_dir, exist_ok=True) @@ -227,14 +232,17 @@ def cli(): torch.set_num_threads(threads) wkwargs = {"download_root": model_dir, - "device": device, - "language" : args.pop("language")} - - model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs) + "local": local, + "device": device} + diarisation_kwargs = {"local": local} + model = AutoTranscribe(whisper_model= model_name, + whisper_kwargs= wkwargs, + dia_model= args.pop("dia_model"), + dia_kwargs_kwargs= diarisation_kwargs,) if task == "transcribe": for audio in args.pop("audio"): - out = model.transcribe(audio) + out = model.transcribe(audio, language = args.pop("language")) basename = audio.split("/")[-1].split(".")[0] spath = f"{output_dir}/{basename}.{output_format}" out.save(spath) @@ -257,7 +265,7 @@ def cli(): "It is recommendet to use the whisper cli directly", RuntimeWarning) for audio in args.pop("audio"): - out = model.transcriber.transcribe(audio, diarisation=True) + out = model.transcriber.transcribe(audio, language = args.pop("language")) basename = audio.split("/")[-1].split(".")[0] writer(out, audio) From 29cc0aca6ad72b243396d2a52196714bac016ae0 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 16:31:42 +0200 Subject: [PATCH 52/86] cahnged name form dia_model to dia_dir --- autotranscript/autotranscript.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 2097f2f..42ed015 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -190,7 +190,7 @@ def cli(): parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH, help="the path to save model files; uses ./models/whisper by default") - parser.add_argument("--dia_model", type=str, default = PYANNOTE_DEFAULT_PATH) + parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH) parser.add_argument("--allow_download", type= bool, default=True, help="whether to allow model download if model is not found locally") @@ -237,7 +237,7 @@ def cli(): diarisation_kwargs = {"local": local} model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs, - dia_model= args.pop("dia_model"), + dia_model= args.pop("dia_dir"), dia_kwargs_kwargs= diarisation_kwargs,) if task == "transcribe": From 06804b21b10cf740c062cbdee88b9d337ced12dc Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 19 Jun 2023 16:32:42 +0200 Subject: [PATCH 53/86] removed wrong variable --- autotranscript/autotranscript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 42ed015..d79b392 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -238,7 +238,7 @@ def cli(): model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs, dia_model= args.pop("dia_dir"), - dia_kwargs_kwargs= diarisation_kwargs,) + dia_kwargs= diarisation_kwargs,) if task == "transcribe": for audio in args.pop("audio"): From cadeb8784fa9bf39313fb99affd5ed88f2cd4480 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:19:38 +0200 Subject: [PATCH 54/86] cahnged description --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0c00dad..7517d61 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ setup( license='', author='Jacob Schmieder', author_email='', - description='Transcription tool for audio files based on Whisper', + description='Transcription tool for audio files based on Whisper and Pyannote', entry_points={'console_scripts': ['autotranscript = autotranscript.autotranscript:cli']} ) From bb73a668011af737014357f57306fefda57aed5e Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:19:54 +0200 Subject: [PATCH 55/86] add example --- transcribe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transcribe.py b/transcribe.py index 6601707..fca2532 100644 --- a/transcribe.py +++ b/transcribe.py @@ -4,4 +4,5 @@ model = AutoTranscribe() text = model.transcribe("tests/test.wav") +print("Transcription:\n") print(text) From 88db803bcb8bb000d1d46bb939734e7cf5ebd16c Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:20:17 +0200 Subject: [PATCH 56/86] added file --- autotranscript/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index 4812cc2..e6b02f3 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -1,4 +1,5 @@ from .autotranscript import * +from .app.qtfaststart import * from .transcriber import * from .audio import * from .transcript_exporter import * From 2308a9337ccba81273fe5a38ab28953d87748ce4 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:20:42 +0200 Subject: [PATCH 57/86] changed type of sr --- autotranscript/audio.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index 4e7ee60..7944a73 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -1,7 +1,7 @@ import numpy as np import torch from subprocess import CalledProcessError, run - +from typing import Union SAMPLE_RATE = 16000 class AudioProcessor: @@ -9,7 +9,7 @@ class AudioProcessor: Audio Processor using PyTorchaudio instead of PyDub """ - def __init__(self, waveform: torch.Tensor, sr : torch.Tensor, + def __init__(self, waveform: torch.Tensor, sr : int = SAMPLE_RATE, *args, **kwargs) -> None: """ Initialise audio processor From d882d80d1d381a2d19882b6d2c93145c15ac0220 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:21:21 +0200 Subject: [PATCH 58/86] added ndarray datatype to input of transcribe --- autotranscript/autotranscript.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index d79b392..6f00888 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -10,7 +10,7 @@ from glob import iglob from subprocess import run from warnings import warn import argparse - +from numpy import ndarray diarisation = TypeVar('diarisation') @@ -53,7 +53,7 @@ class AutoTranscribe: print("AutoTranscribe initialized all models successfully loaded.") - def transcribe(self, audiofile : Union[str, torch.Tensor], + def transcribe(self, audiofile : Union[str, torch.Tensor, ndarray], remove_original : bool = False, *args, **kwargs) -> Transcript: """ @@ -140,7 +140,7 @@ class AutoTranscribe: @staticmethod - def get_audiofile(audiofile : Union[str, torch.Tensor], + def get_audiofile(audiofile : Union[str, torch.Tensor, ndarray], *args, **kwargs) -> AudioProcessor: """ Get audiofile as TorchAudioProcessor @@ -155,9 +155,12 @@ class AutoTranscribe: if isinstance(audiofile, str): audiofile = AudioProcessor.from_file(audiofile) - if isinstance(audiofile, torch.Tensor): + elif isinstance(audiofile, torch.Tensor): audiofile = AudioProcessor(audiofile[0], audiofile[1]) - + elif isinstance(audiofile, ndarray): + audiofile = AudioProcessor(torch.tensor(audiofile[0]), + audiofile[1]) + if not isinstance(audiofile, AudioProcessor): raise ValueError(f'Audiofile must be of type AudioProcessor,' \ f'not {type(audiofile)}') @@ -191,9 +194,10 @@ def cli(): help="the path to save model files; uses ./models/whisper by default") parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH) - - parser.add_argument("--allow_download", type= bool, default=True, + parser.add_argument("--htoken", default="", type=str, help="HuggingFace token for private model download") + parser.add_argument("--local", type=str2bool, default=False, help="whether to allow model download if model is not found locally") + parser.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu", help="device to use for PyTorch inference") @@ -219,11 +223,12 @@ def cli(): # fmt: on args = parser.parse_args().__dict__ + model_name: str = args.pop("wmodel") model_dir: str = args.pop("wmodel_dir") output_dir: str = args.pop("output_dir") output_format: str = args.pop("output_format") - local :str = args.pop("allow_download") + local :str = args.pop("local") task = args.pop("task") device: str = args.pop("device") os.makedirs(output_dir, exist_ok=True) @@ -234,7 +239,10 @@ def cli(): wkwargs = {"download_root": model_dir, "local": local, "device": device} - diarisation_kwargs = {"local": local} + + diarisation_kwargs = {"local": local, + "token" : args.pop("htoken")} + model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs, dia_model= args.pop("dia_dir"), From 58a14b2adf84561deddb575e6483c2fb07b17f88 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:22:03 +0200 Subject: [PATCH 59/86] change location of default path variables --- autotranscript/diarisation.py | 15 ++++--- autotranscript/misc.py | 81 +++++------------------------------ autotranscript/transcriber.py | 16 +------ 3 files changed, 21 insertions(+), 91 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index ea36b93..1c2e4fb 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -4,14 +4,16 @@ from torch import Tensor import os from typing import TypeVar, Union import json - +from .misc import PYANNOTE_DEFAULT_PATH Annotation = TypeVar('Annotation') -PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), - "models", "pyannote", - "speaker_diarization", "config.yaml") - class Diariser: + """ + Diarisation class + This class is used to diarize an audio file using a pretrained model + from pyannote.audio. + :param model: model to use for diarization + """ def __init__(self, model,*args,**kwargs) -> None: self.model = model @@ -137,10 +139,11 @@ class Diariser: ------- Pipeline Object """ - + if local: diarization_model = Pipeline.from_pretrained(model,*args, **kwargs) else: + print("Loading model from HuggingFace") if token == "": token = cls._get_token() diarization_model = Pipeline.from_pretrained(model, use_auth_token = token, diff --git a/autotranscript/misc.py b/autotranscript/misc.py index 065e45d..716852e 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -4,83 +4,22 @@ from whisper import Whisper, load_model import os import glob from warnings import warn +import yaml -WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), +WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), "models", "whisper") -PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), +PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), "models", "pyannote", "speaker_diarization", "config.yaml") -def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper: + +def config_diarization_yaml(file): """ - Load modules from whisper - - Parameters - ---------- - whisper : str - whisper model - available models: - - - 'tiny.en' - - 'tiny' - - 'base.en' - - 'base' - - 'small.en' - - 'small' - - 'medium.en' - - 'medium' - - 'large-v1' - - 'large-v2' - - 'large' - - local : bool - If true, load from local cache - - download_root : str - Path to download the model - - default: /models/whisper + Configure diarization pipeline from yaml file to use the model offline + and avoid manuel file manipulation. - Returns - ------- - Whisper Object + :param file: yaml file + :type file: yaml """ - warn("load_whisper_model is deprecated. Use Transcriptor.load_model() instead.", DeprecationWarning) - if local: - available_models = [os.path.basename(x) for x in glob.glob(os.path.join(download_root, "*"))] - - for i, module in enumerate(available_models): - available_models[i] = module.split(".")[0] - - if model not in available_models: - raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.") - - return load_model(model, download_root=download_root) - -def load_pyannote_model(model: str = PYANNOTE_DEFAULT_PATH, - token: str = "", - local : bool = True, - *args, **kwargs) -> Pipeline: - """ - Load modules from pyannote - - Parameters - ---------- - model : str - pyannote model - default: /models/pyannote/speaker_diarization/config.yaml - token : str - HUGGINGFACE_TOKEN - local : bool - If true, load from local cache - - Returns - ------- - Pipeline Object - """ - warn("load_pyannote_model is deprecated. Use Diarisation.load_model() instead.", DeprecationWarning) - if local: - return Pipeline.from_pretrained(model,*args, **kwargs) - else: - return Pipeline.from_pretrained(model, use_auth_token = token, *args, **kwargs) + \ No newline at end of file diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 39c0842..82156cf 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -2,24 +2,12 @@ import os from whisper import Whisper, load_model from typing import TypeVar , Union from glob import glob - +from .misc import WHISPER_DEFAULT_PATH whisper = TypeVar('whisper') Tensor = TypeVar('Tensor') nparray = TypeVar('nparray') -def get_whisper_default_path() -> str: - """ - Get default path for whisper models - Returns - ------- - str - path - """ - _path = os.path.dirname(os.path.dirname(__file__)) - return os.path.join(_path, "models", "whisper") - -WHISPER_DEFAULT_PATH = get_whisper_default_path() class Transcriber: def __init__(self, model: whisper ) -> None: @@ -68,7 +56,7 @@ class Transcriber: def load_model(cls, model: str = "medium", local : bool = True, - download_root: str = WHISPER_DEFAULT_PATH , + download_root: str = WHISPER_DEFAULT_PATH, *args, **kwargs) -> 'Transcriber': """ Load whisper module From 9a767228f7cdd7b7c21a3f91dc1e73f986ba0efa Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Tue, 27 Jun 2023 10:22:21 +0200 Subject: [PATCH 60/86] fixed wrong writing --- autotranscript/transcript_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 16d5e09..2615a67 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -160,7 +160,7 @@ class Transcript: def to_txt(self, path: str) -> None: with open(path, "w") as f: - f.write(self.__str__, f) + f.write(self.__str__()) def to_md(self, path: str) -> None: return self.to_html(path) From de3a6cd4d17a7a9261706ad514a10abaa2d60758 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 28 Jun 2023 15:31:52 +0200 Subject: [PATCH 61/86] added function to controll paths to pyannote models --- autotranscript/misc.py | 44 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/autotranscript/misc.py b/autotranscript/misc.py index 716852e..1c14198 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -6,15 +6,15 @@ import glob from warnings import warn import yaml -WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), - "models", "whisper") +WHISPER_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), + "models", "whisper")) -PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), +PYANNOTE_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), "models", "pyannote", - "speaker_diarization", "config.yaml") + "speaker_diarization", "config.yaml")) -def config_diarization_yaml(file): +def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding = None): """ Configure diarization pipeline from yaml file to use the model offline and avoid manuel file manipulation. @@ -22,4 +22,36 @@ def config_diarization_yaml(file): :param file: yaml file :type file: yaml """ - \ No newline at end of file + with open(file, "r") as stream: + yml = yaml.safe_load(stream) + stream.close() + if path_to_segmentation: + yml["pipeline"]["params"]["segmentation"] = path_to_segmentation + else: + yml["pipeline"]["params"]["segmentation"] = os.path.relpath(os.path.join( + os.path.dirname(__file__), + "models", "pyannote", + "segmentation", + "pytorch_model.bin")) + + if path_to_embedding: + yml["pipeline"]["params"]["embedding"] = path_to_embedding + else: + yml["pipeline"]["params"]["embedding"] = os.path.relpath( + os.path.join( + os.path.dirname(__file__), + "models", "pyannote", + "speechbrain", + "spkrec-ecapa-voxceleb", + "embedding_model.ckpt")) + + if not os.path.exists(yml["pipeline"]["params"]["segmentation"]): + raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}") + + if not os.path.exists(yml["pipeline"]["params"]["embedding"]): + raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}") + + with open(file, "w") as stream: + yaml.dump(yml, stream) + stream.close() + From 11fce3abefc2c5d734b02ea870929de48a0f2f8c Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:41:13 +0200 Subject: [PATCH 62/86] removed kwargs confusions --- autotranscript/autotranscript.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 6f00888..9f14886 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -19,8 +19,7 @@ class AutoTranscribe: def __init__(self, whisper_model: Union[bool, str, whisper] = None, dia_model : Union[bool, str, diarisation] = None, - dia_kwargs : dict = {}, - whisper_kwargs : dict = {}) -> None: + **kwargs) -> None: """ AutoTranscribe class @@ -38,16 +37,16 @@ class AutoTranscribe: """ if whisper_model is None: - self.transcriber = Transcriber.load_model("medium", local=True) + self.transcriber = Transcriber.load_model("medium") elif isinstance(whisper_model, str): - self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs) + self.transcriber = Transcriber.load_model(whisper_model, **kwargs) else: self.transcriber = whisper_model if dia_model is None: self.diariser = Diariser.load_model() elif isinstance(dia_model, str): - self.diariser = Diariser.load_model(dia_model, **dia_kwargs) + self.diariser = Diariser.load_model(dia_model, **kwargs) else: self.diariser = dia_model From cd35ad8903b63353c01145223598ae09fad8d0a8 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:41:43 +0200 Subject: [PATCH 63/86] solved path issues --- autotranscript/misc.py | 49 ++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/autotranscript/misc.py b/autotranscript/misc.py index 1c14198..1eaf34f 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -1,4 +1,3 @@ - from pyannote.audio import Pipeline from whisper import Whisper, load_model import os @@ -6,15 +5,18 @@ import glob from warnings import warn import yaml -WHISPER_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), - "models", "whisper")) +CACHE_DIR = os.getenv( + "AUTOT_CACHE", + os.path.expanduser("~/.cache/torch/models"), +) -PYANNOTE_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), - "models", "pyannote", - "speaker_diarization", "config.yaml")) +WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper") +PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote") -def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding = None): +PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") + +def config_diarization_yaml(file, path_to_segmentation = None): """ Configure diarization pipeline from yaml file to use the model offline and avoid manuel file manipulation. @@ -28,30 +30,25 @@ def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding if path_to_segmentation: yml["pipeline"]["params"]["segmentation"] = path_to_segmentation else: - yml["pipeline"]["params"]["segmentation"] = os.path.relpath(os.path.join( - os.path.dirname(__file__), - "models", "pyannote", - "segmentation", - "pytorch_model.bin")) + yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin") - if path_to_embedding: - yml["pipeline"]["params"]["embedding"] = path_to_embedding - else: - yml["pipeline"]["params"]["embedding"] = os.path.relpath( - os.path.join( - os.path.dirname(__file__), - "models", "pyannote", - "speechbrain", - "spkrec-ecapa-voxceleb", - "embedding_model.ckpt")) + # if path_to_embedding: + # yml["pipeline"]["params"]["embedding"] = path_to_embedding + # else: + # yml["pipeline"]["params"]["embedding"] = os.path.relpath( + # os.path.join( + # os.path.dirname(__file__), + # "models", "pyannote", + # "speechbrain", + # "spkrec-ecapa-voxceleb", + # "embedding_model.ckpt")) if not os.path.exists(yml["pipeline"]["params"]["segmentation"]): raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}") - if not os.path.exists(yml["pipeline"]["params"]["embedding"]): - raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}") + # if not os.path.exists(yml["pipeline"]["params"]["embedding"]): + # raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}") with open(file, "w") as stream: yaml.dump(yml, stream) - stream.close() - + stream.close() From 38d1f8f6682b11d1fe3cb563d235a0fa0b9003fe Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:44:10 +0200 Subject: [PATCH 64/86] removed kwargs confusion --- autotranscript/transcriber.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 82156cf..0cd42bf 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -1,6 +1,7 @@ import os from whisper import Whisper, load_model -from typing import TypeVar , Union +from typing import TypeVar , Union , Optional +import torch from glob import glob from .misc import WHISPER_DEFAULT_PATH whisper = TypeVar('whisper') @@ -17,7 +18,7 @@ class Transcriber: """ self.model = model - def transcribe(self, audio : Union[str, Tensor, nparray] , + def transcribe(self, audio : Union[str, Tensor, nparray] , *args, **kwargs) -> str: """ transcribe audio file @@ -55,9 +56,10 @@ class Transcriber: @classmethod def load_model(cls, model: str = "medium", - local : bool = True, download_root: str = WHISPER_DEFAULT_PATH, - *args, **kwargs) -> 'Transcriber': + device: Optional[Union[str, torch.device]] = None, + in_memory: bool = False, + ) -> 'Transcriber': """ Load whisper module @@ -92,20 +94,9 @@ class Transcriber: Whisper Object """ - if local: - - available_models = [os.path.basename(x) for x in - glob(os.path.join(download_root, "*"))] - - for i, module in enumerate(available_models): - available_models[i] = module.split(".")[0] - - if model not in available_models: - raise RuntimeError("Model not found. Consider downloading the "/ - "model first. By deactivating the local flag, " / - "the model will be downloaded automatically.") - _model = load_model(model, download_root=download_root, *args, **kwargs) + _model = load_model(model, download_root=download_root, + device=device, in_memory=in_memory) return cls(_model) From 907913f2bfa1cc342642db2fa90e9c65c55ecfd1 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:44:39 +0200 Subject: [PATCH 65/86] fixed kwargs confusion and resolved path issues --- autotranscript/diarisation.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 1c2e4fb..bb364e9 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -2,9 +2,10 @@ from pyannote.audio import Pipeline from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization from torch import Tensor import os +from pathlib import Path from typing import TypeVar, Union import json -from .misc import PYANNOTE_DEFAULT_PATH +from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH Annotation = TypeVar('Annotation') class Diariser: @@ -118,10 +119,12 @@ class Diariser: return token @classmethod - def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, - token: str = "", - local : bool = True, - *args, **kwargs) -> Pipeline: + def load_model(cls, + model: str = PYANNOTE_DEFAULT_CONFIG, + token: str = None, + cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH, + hparams_file: Union[str, Path] = None + ) -> Pipeline: """ Load modules from pyannote @@ -139,17 +142,15 @@ class Diariser: ------- Pipeline Object """ - - if local: - diarization_model = Pipeline.from_pretrained(model,*args, **kwargs) - else: - print("Loading model from HuggingFace") - if token == "": - token = cls._get_token() - diarization_model = Pipeline.from_pretrained(model, use_auth_token = token, - *args, **kwargs) - - return cls(diarization_model) + if not os.path.exists(model) and token is None: + token = cls._get_token() + + _model = Pipeline.from_pretrained(model, + use_auth_token = token, + cache_dir = cache_dir, + hparams_file = hparams_file,) + + return cls(_model) @staticmethod def _get_diarisation_kwargs(**kwargs) -> dict: From 4bf98621d68203defa8540ae4440d3eeaaf0e647 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:46:33 +0200 Subject: [PATCH 66/86] add webapp --- app.py | 101 ++++++++++ autotranscript/app/qtfaststart.py | 319 ++++++++++++++++++++++++++++++ 2 files changed, 420 insertions(+) create mode 100644 app.py create mode 100644 autotranscript/app/qtfaststart.py diff --git a/app.py b/app.py new file mode 100644 index 0000000..3645d79 --- /dev/null +++ b/app.py @@ -0,0 +1,101 @@ +from dash import Dash, dcc, html, dash_table, Input, Output, State, callback + +import base64 +from autotranscript.app.qtfaststart import process +from autotranscript import AutoTranscribe +import io +import subprocess as sp +import numpy as np +from autotranscript.audio import SAMPLE_RATE + +# Setup auto-transcript +autot = AutoTranscribe() # whisper_model="tiny", whisper_kwargs={"local" : False} + +# Setup FFmpeg +PROBLEMATIC_FILE_TYPES : tuple = "mov","mp4","m4a","3gp","3g2","mj2" + + +# Setup Dash +external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css'] + +app = Dash(__name__, external_stylesheets=external_stylesheets) + +app.layout = html.Div([ + dcc.Upload( + id='upload-data', + children=html.Div([ + 'Drag and Drop or ', + html.A('Select Files') + ]), + style={ + 'width': '100%', + 'height': '60px', + 'lineHeight': '60px', + 'borderWidth': '1px', + 'borderStyle': 'dashed', + 'borderRadius': '5px', + 'textAlign': 'center', + 'margin': '10px' + }, + # Allow multiple files to be uploaded + multiple=True + ), + html.Div(id='output-data-upload'), +]) + +def parse_contents(contents, filename, date): + content_type, content_string = contents.split(',') + + decoded = base64.b64decode(content_string) + file = io.BytesIO(decoded).read() + + if filename.endswith(PROBLEMATIC_FILE_TYPES): + # mp4 and other files need to be processed with qtfaststart + # since theire metadata is at the end of the file + # and we need it at the beginning + file = process(file) + + cmd = [ + "ffmpeg", + "-nostdin", + "-threads", "0", + "-i",'pipe:', + "-f", "s16le", + '-hide_banner', + '-loglevel', 'error', + "-c", "copy", + "-vn", + "-ac", "1", + "-acodec", "pcm_s16le", + "-ar", str(SAMPLE_RATE), + "-" + ] + + proc = sp.Popen(cmd, stdout=sp.PIPE, stdin=sp.PIPE) + + out = proc.communicate(input=file)[0] + out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + out = np.array([out, SAMPLE_RATE]) + + transcript = str(autot.transcribe(out)) + + return html.Div([ + html.H5(f"File Name: {filename} \n" \ + "Transcript: \n" + ), + html.P(transcript) + ]) + +@callback(Output('output-data-upload', 'children'), + Input('upload-data', 'contents'), + State('upload-data', 'filename'), + State('upload-data', 'last_modified')) +def update_output(list_of_contents, list_of_names, list_of_dates): + if list_of_contents is not None: + children = [ + parse_contents(c, n, d) for c, n, d in + zip(list_of_contents, list_of_names, list_of_dates)] + return children + +if __name__ == '__main__': + app.run_server() diff --git a/autotranscript/app/qtfaststart.py b/autotranscript/app/qtfaststart.py new file mode 100644 index 0000000..e57eb20 --- /dev/null +++ b/autotranscript/app/qtfaststart.py @@ -0,0 +1,319 @@ +""" +This file contains a modified version of qtfaststart by qtfaststart +https://github.com/danielgtaylor/qtfaststart/tree/master + +All credit goes to the original author. +Copyright (C) 2008 - 2013 Daniel G. Taylor +Permission is hereby granted, free of charge, to any person obtaining a copy of this +software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the +Software, and to permit persons to whom the Software is furnished to do so, +subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies +or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. +""" + +import logging +import os +import struct +import collections +import io + +# define error classes +class FastStartException(Exception): + """ + Raised when something bad happens during processing. + """ + pass + +class FastStartSetupError(FastStartException): + """ + Rasised when asked to process a file that does not need processing + """ + pass + +class MalformedFileError(FastStartException): + """ + Raised when the input file is setup in an unexpected way + """ + pass + +class UnsupportedFormatError(FastStartException): + """ + Raised when a movie file is recognized as a format not supported. + """ + pass + +# define constants +CHUNK_SIZE = 8192 + +log = logging.getLogger("qtfaststart") + +# Older versions of Python require this to be defined +if not hasattr(os, 'SEEK_CUR'): + os.SEEK_CUR = 1 + +Atom = collections.namedtuple('Atom', 'name position size') + +def read_atom(datastream): + """ + Read an atom and return a tuple of (size, type) where size is the size + in bytes (including the 8 bytes already read) and type is a "fourcc" + like "ftyp" or "moov". + """ + size, type = struct.unpack(">L4s", datastream.read(8)) + type = type.decode('ascii') + return size, type + + +def _read_atom_ex(datastream): + """ + Read an Atom from datastream + """ + pos = datastream.tell() + atom_size, atom_type = read_atom(datastream) + if atom_size == 1: + atom_size, = struct.unpack(">Q", datastream.read(8)) + return Atom(atom_type, pos, atom_size) + + +def get_index(datastream): + """ + Return an index of top level atoms, their absolute byte-position in the + file and their size in a list: + + index = [ + ("ftyp", 0, 24), + ("moov", 25, 2658), + ("free", 2683, 8), + ... + ] + + The tuple elements will be in the order that they appear in the file. + """ + log.debug("Getting index of top level atoms...") + + index = list(_read_atoms(datastream)) + _ensure_valid_index(index) + + return index + + +def _read_atoms(datastream): + """ + Read atoms until an error occurs + """ + while datastream: + try: + atom = _read_atom_ex(datastream) + log.debug("%s: %s" % (atom.name, atom.size)) + except: + break + + yield atom + + if atom.size == 0: + if atom.name == "mdat": + # Some files may end in mdat with no size set, which generally + # means to seek to the end of the file. We can just stop indexing + # as no more entries will be found! + break + else: + # Weird, but just continue to try to find more atoms + continue + + datastream.seek(atom.position + atom.size) + + +def _ensure_valid_index(index): + """ + Ensure the minimum viable atoms are present in the index. + + Raise FastStartException if not. + """ + top_level_atoms = set([item.name for item in index]) + for key in ["moov", "mdat"]: + if key not in top_level_atoms: + log.error("%s atom not found, is this a valid MOV/MP4 file?" % key) + raise FastStartException() + + +def find_atoms(size, datastream): + """ + Compatibilty interface for _find_atoms_ex + """ + fake_parent = Atom('fake', datastream.tell()-8, size+8) + for atom in _find_atoms_ex(fake_parent, datastream): + yield atom.name + + +def _find_atoms_ex(parent_atom, datastream): + """ + Yield either "stco" or "co64" Atoms from datastream. + datastream will be 8 bytes into the stco or co64 atom when the value + is yielded. + + It is assumed that datastream will be at the end of the atom after + the value has been yielded and processed. + + parent_atom is the parent atom, a 'moov' or other ancestor of CO + atoms in the datastream. + """ + stop = parent_atom.position + parent_atom.size + + while datastream.tell() < stop: + try: + atom = _read_atom_ex(datastream) + except: + log.exception("Error reading next atom!") + raise FastStartException() + + if atom.name in ["trak", "mdia", "minf", "stbl"]: + # Known ancestor atom of stco or co64, search within it! + for res in _find_atoms_ex(atom, datastream): + yield res + elif atom.name in ["stco", "co64"]: + yield atom + else: + # Ignore this atom, seek to the end of it. + datastream.seek(atom.position + atom.size) + + +def process(infilename, limit=float('inf')): + """ + Convert a Quicktime/MP4 file for streaming by moving the metadata to + the front of the file. This method writes a new file. + + If limit is set to something other than zero it will be used as the + number of bytes to write of the atoms following the moov atom. This + is very useful to create a small sample of a file with full headers, + which can then be used in bug reports and such. + """ + if isinstance(infilename, str): + datastream = open(infilename, "rb") + elif isinstance(infilename, bytes): + datastream = io.BytesIO(infilename) + else: + raise TypeError("infilename must be a filename, bytes or file-like object") + # Get the top level atom index + index = get_index(datastream) + + mdat_pos = 999999 + free_size = 0 + + # Make sure moov occurs AFTER mdat, otherwise no need to run! + for atom in index: + # The atoms are guaranteed to exist from get_index above! + if atom.name == "moov": + moov_atom = atom + moov_pos = atom.position + elif atom.name == "mdat": + mdat_pos = atom.position + elif atom.name == "free" and atom.position < mdat_pos: + # This free atom is before the mdat! + free_size += atom.size + log.info("Removing free atom at %d (%d bytes)" % (atom.position, atom.size)) + elif atom.name == "\x00\x00\x00\x00" and atom.position < mdat_pos: + # This is some strange zero atom with incorrect size + free_size += 8 + log.info("Removing strange zero atom at %s (8 bytes)" % atom.position) + + # Offset to shift positions + offset = moov_atom.size - free_size + + if moov_pos < mdat_pos: + # moov appears to be in the proper place, don't shift by moov size + offset -= moov_atom.size + if not free_size: + # No free atoms and moov is correct, we are done! + log.error("This file appears to already be setup for streaming!") + # Stupid hack to retrun the non-processed file: + if isinstance(infilename, str): + return open(infilename, "rb").read() + elif isinstance(infilename, bytes): + return io.BytesIO(infilename).read() + + # Read and fix moov + moov = _patch_moov(datastream, moov_atom, offset) + + log.info("Writing output...") + outfile = b'' + + # Write ftype + for atom in index: + if atom.name == "ftyp": + log.debug("Writing ftyp... (%d bytes)" % atom.size) + datastream.seek(atom.position) + outfile += datastream.read(atom.size) + + # Write moov + _bytes = moov.getvalue() + log.debug("Writing moov... (%d bytes)" % len(_bytes)) + outfile += _bytes + + # Write the rest + atoms = [item for item in index if item.name not in ["ftyp", "moov", "free"]] + for atom in atoms: + log.debug("Writing %s... (%d bytes)" % (atom.name, atom.size)) + datastream.seek(atom.position) + + # for compatability, allow '0' to mean no limit + cur_limit = limit or float('inf') + cur_limit = min(cur_limit, atom.size) + + for chunk in get_chunks(datastream, CHUNK_SIZE, cur_limit): + outfile += chunk + + return outfile + + +def _patch_moov(datastream, atom, offset): + datastream.seek(atom.position) + moov = io.BytesIO(datastream.read(atom.size)) + + # reload the atom from the fixed stream + atom = _read_atom_ex(moov) + + for atom in _find_atoms_ex(atom, moov): + # Read either 32-bit or 64-bit offsets + ctype, csize = dict( + stco=('L', 4), + co64=('Q', 8), + )[atom.name] + + # Get number of entries + version, entry_count = struct.unpack(">2L", moov.read(8)) + + log.info("Patching %s with %d entries" % (atom.name, entry_count)) + + entries_pos = moov.tell() + + struct_fmt = ">%(entry_count)s%(ctype)s" % vars() + + # Read entries + entries = struct.unpack(struct_fmt, moov.read(csize * entry_count)) + + # Patch and write entries + offset_entries = [entry + offset for entry in entries] + moov.seek(entries_pos) + moov.write(struct.pack(struct_fmt, *offset_entries)) + return moov + +def get_chunks(stream, chunk_size, limit): + remaining = limit + while remaining: + chunk = stream.read(min(remaining, chunk_size)) + if not chunk: + return + remaining -= len(chunk) + yield chunk From f51eb5815dba6859a2e92c99f3c2d6b4335596c0 Mon Sep 17 00:00:00 2001 From: Jacob Schmieder Date: Fri, 30 Jun 2023 18:47:53 +0200 Subject: [PATCH 67/86] Delete autotranscript/__pycache__ directory --- .../__pycache__/__init__.cpython-39.pyc | Bin 207 -> 0 bytes .../__pycache__/__main__.cpython-39.pyc | Bin 3877 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 autotranscript/__pycache__/__init__.cpython-39.pyc delete mode 100644 autotranscript/__pycache__/__main__.cpython-39.pyc diff --git a/autotranscript/__pycache__/__init__.cpython-39.pyc b/autotranscript/__pycache__/__init__.cpython-39.pyc deleted file mode 100644 index 04235a59ba7faa9afadc9a70dcffc98425d2d511..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 207 zcmYe~<>g`kf~*w_ldXXCV-N=!FabFZKwK;WBvKes7;_jxAT%S8W(sD|WPS;hVbEl} z#i-?{$#jdWq$n}3IJqb@DK$zM!Pbk9&rQtCi;rK)P{aaM4<>$D>1X8Urs`)UCg&&V z2UI3!Bo^fc6y;~7CYKcJhi7CK7o-;HBMi}xkI&4@EQycTE2zB1VUwGmQks)$#|Sj8 J801b4MgTh4HDv$* diff --git a/autotranscript/__pycache__/__main__.cpython-39.pyc b/autotranscript/__pycache__/__main__.cpython-39.pyc deleted file mode 100644 index d64ee0a076cccf9df024581a10a7941259f67616..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3877 zcmd59O>ZMfwW`0|>Dc-HAcGj1IpJZH%nFuWq8+qLhFwn9NR!?)eLHG2JObaRe;v0U zZ4mM|oXmeFOg@Are+GaNMiUZXZzQx!14_#^BQRjiOw41PTZz@R1Dldh2(y{^iREi4sxIrQ{C0Q=K4&eAsS&3`sbK7=N}0KiE=8G)O&xXCSUbLZ6D zBhcH-`m?ETvV$6{G4}-tT(-?Tw)}zyORUZsu;Q^5whH|+TVw0c*VzW!gubEq+9|wa z5v8&%;uGE~oGxc^zgsw;aM6v@S7xyk^;w*q#tAR%ZpL_0xJi^g>qpOc+XC*UAC8gj zBWUt5fB_wlkPJ+KFd7+S@`VLsoMUf8b5xDdD-fn~M@s`^j2IrQ8Mwww5Sx|%%k!t- zQ4&e%tISsw$G+yyk5cBJ$Bbt_GAjLxSe^OmjQgLT#j?kRuQ{(M9rhv-bxT}CIGm8d zv7ZS)iKX(h)45f76IT;FEa@UkeaZb(K%KJIQ=r$INK?Tb`qL5Kf_#?G!22tlypd!P zi_>T2C3#(6>Ej}2t>&^=0wIfK0L5-w@JR9SG)ov4p;V&ZR(-*X<#t~Po~mA?&L*1F z!}SF=^^#ZVSr&Fr#HlK37p3$94=)Y%{Q}*lX%&{_$!W;~u4q195wQj1H}4*tWnF&I ziQ3u8!IR7OStPnoMAqSLB@ZSVIY18D2VvNa;xr8RdzXb9hH)CJFvN#Rq(ofDaF(3^ zIMl+Q;nF(IH;*gr?@uw>6i-h+v_c1(w#2`{K#>t0lOa{4V~k7GI3m0j8! zoN^ZTz%CatOpTEMhNZ1`9z*X-_abQ z70!-K*b5p9eB|G$jxMZEPB-1c$YkL>e4N$p|L_&4A02#&Dqa4 z_|0(}!~gWd$X0`{TBe1CAt$JpJE0Dtfz<}0HzZrE z=)F*7CH?n-phRgC5U>x<>CrWs@6CyJ@%(~=x`2j0t_AwUfK)*`kU#rW{1Zpe=s6Mh zmC-RrkSaQmDmnxlh^DTBYR$a@ht*saVKh70oto4eU}Wail?=K z1svIGePFZ3mGQ^BLucTOHU=hJ*#q@c@;TsI@Vhc8N!Q*X^6w7`)xTc?yryXjUj?s* zv0PJ|I8$3!#h2*nX+Q?=}7M@ysGXW;>}`JJiJ>y!@{rm;NZ%#TwaoL2I7mNNN<9=ryhek<;R-2)fuxhm)d)PnrE z%J*B7+e>btU}?vvF_5^u6v=ny$^}_IOy73z7T%*InLjNsQNBiyTUdMK8)f>KJ(#n6 z=k^yhcglRP1MgyN(P6ppK8un*|G5yEC|nM&#xhKY8}U83QGlltT|X3_s&aT)!I+Th z3S0J)Sn23qSXqy!g{?zc;l(merGkQi7ZzrP!c2JDbj!Sep(Jpi>Vdi%gPiOqDp&>y zV{A*KE)QH8opYUV=xIUY8?jB?2Lc~KlfM8!3=gV|EyIQ0r5j)jdmJx#WoW=miNDNtBVdfZ$=fukvfVc|i|nz4R@<9Zl=*^iQPM_{2)SSMMQ zh#w&KLj+if=(?oz0kk#oV+21zaEM?gC_U1Z3>LBY42cDgOYiQvJKmD}z6*9KaIaYt z@V9m1Q|OGr3RDL%?_KUg^`yW99d+4*=%u;RD7W2HxF3{{`6=3v#8MuMY!`lY Date: Fri, 30 Jun 2023 18:50:17 +0200 Subject: [PATCH 68/86] updated requirements --- requirements.txt | 171 ++++++----------------------------------------- 1 file changed, 20 insertions(+), 151 deletions(-) diff --git a/requirements.txt b/requirements.txt index 619d0c4..ecfbf11 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,152 +1,21 @@ -absl-py==1.3.0 -aiohttp==3.8.3 -aiosignal==1.3.1 -alembic==1.9.1 -antlr4-python3-runtime==4.9.3 -appdirs==1.4.4 -asteroid-filterbanks==0.4.0 -async-timeout==4.0.2 -attrs==22.2.0 -audioread==3.0.0 -autopage==0.5.1 -backports.cached-property==1.0.2 -brotlipy==0.7.0 -cachetools==5.2.0 -certifi==2023.5.7 -cffi==1.15.1 -charset-normalizer==2.1.1 -click==8.1.3 -cliff==4.1.0 -cmaes==0.9.0 -cmake==3.26.4 -cmd2==2.4.2 -colorama==0.4.6 -colorlog==6.7.0 -commonmark==0.9.1 -contourpy==1.0.6 -cryptography==39.0.1 -cycler==0.11.0 -decorator==4.4.2 -docopt==0.6.2 -einops==0.3.2 -ffmpeg-python==0.2.0 -filelock==3.8.0 -flit_core==3.8.0 -fonttools==4.38.0 -frozenlist==1.3.3 -fsspec==2022.11.0 -future==0.18.2 -google-auth==2.15.0 -google-auth-oauthlib==0.4.6 -greenlet==2.0.1 -grpcio==1.51.1 -hmmlearn==0.2.8 -huggingface-hub==0.11.0 -HyperPyYAML==1.1.0 -idna==3.4 -imageio==2.23.0 -imageio-ffmpeg==0.4.7 -importlib-metadata==4.13.0 -joblib==1.2.0 -julius==0.2.7 -kiwisolver==1.4.4 -librosa==0.9.2 -lit==16.0.5.post0 -llvmlite==0.39.1 -Mako==1.2.4 -Markdown==3.4.1 -MarkupSafe==2.1.1 -matplotlib==3.6.2 -mkl-fft==1.3.1 -mkl-random==1.2.2 -mkl-service==2.4.0 -more-itertools==9.0.0 -moviepy==1.0.3 -mpmath==1.2.1 -multidict==6.0.4 -networkx==2.8.8 -numba==0.56.4 -numpy==1.23.5 -oauthlib==3.2.2 -omegaconf==2.3.0 openai-whisper==20230314 -optuna==3.0.5 -packaging==21.3 -pandas==1.5.2 -pbr==5.11.0 -Pillow==9.4.0 -pip==23.0.1 -pooch==1.6.0 -prettytable==3.5.0 -primePy==1.3 -proglog==0.1.10 -protobuf==3.20.1 -pyannote.audio==2.1.1 -pyannote.core==4.5 -pyannote.database==4.1.3 -pyannote.metrics==3.2.1 -pyannote.pipeline==2.3 -pyasn1==0.4.8 -pyasn1-modules==0.2.8 -pycparser==2.21 -pyDeprecate==0.3.2 -pydub==0.25.1 -Pygments==2.13.0 -pyOpenSSL==23.0.0 -pyparsing==3.0.9 -pyperclip==1.8.2 -PySocks==1.7.1 -python-dateutil==2.8.2 -pytorch-lightning==1.6.5 -pytorch-metric-learning==1.6.3 -pytz==2022.7 -PyYAML==6.0 -regex==2022.10.31 -requests==2.28.1 -requests-oauthlib==1.3.1 -resampy==0.4.2 -rich==12.6.0 -rsa==4.9 -ruamel.yaml==0.17.21 -ruamel.yaml.clib==0.2.7 -scikit-learn==1.2.0 -scipy==1.8.1 -semantic-version==2.10.0 -semver==2.13.0 -sentencepiece==0.1.97 -setuptools==65.6.3 -setuptools-rust==1.5.2 -shellingham==1.5.0 -simplejson==3.18.0 -singledispatchmethod==1.0 -six==1.16.0 -sortedcontainers==2.4.0 -SoundFile==0.10.3.post1 -speechbrain==0.5.13 -SQLAlchemy==1.4.45 -stevedore==4.1.1 -sympy==1.11.1 -tabulate==0.9.0 -tensorboard==2.11.0 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -threadpoolctl==3.1.0 -tiktoken==0.3.1 -tokenizers==0.13.2 -torch==1.11.0 -torch-audiomentations==0.11.0 -torch-pitch-shift==1.2.2 -torchaudio==0.11.0 -torchmetrics==0.11.0 -torchvision==0.12.0 -tqdm==4.65.0 -transformers==4.24.0 -triton==2.0.0 -typer==0.7.0 -typing_extensions==4.4.0 -urllib3==1.26.15 -wcwidth==0.2.5 -Werkzeug==2.2.2 -wheel==0.38.4 -yarl==1.8.2 -zipp==3.11.0 + +pyannote.audio~=2.1.1 +pyannote.core~=4.5 +pyannote.database~=4.1.3 +pyannote.metrics~=3.2.1 +pyannote.pipeline~=2.3 + +setuptools~=65.6.3 +setuptools-rust~=1.5.2 + +torch~=1.11.0 +torchaudio~=0.11.0 +torchmetrics~=0.11.0 +torchvision~=0.12.0 +tqdm>=4.65.0 + +#optional: +#dash~=2.10.2 + + From 9c78cdd230b737203766aacbace94368171450bc Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 30 Jun 2023 18:53:47 +0200 Subject: [PATCH 69/86] updated file --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index ecfbf11..433b3c1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ torchmetrics~=0.11.0 torchvision~=0.12.0 tqdm>=4.65.0 -#optional: +#optional: #dash~=2.10.2 From 22b5b28f2115744ccbc1cda9b8fcee41e261ce35 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 7 Jul 2023 12:57:31 +0200 Subject: [PATCH 70/86] updated Readme --- README.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 999dba3..8ffe9d1 100644 --- a/README.md +++ b/README.md @@ -1 +1,47 @@ -# transcriptor \ No newline at end of file + +# `AutoTranscript`: Fully Automated Transcription using AI + +`AutoTranscript` is a [PyTorch](https://pytorch.org/) based interface for. To enable fully auomated Transcription using AI models containing speaker diarization models: + +- [whisper](https://github.com/openai/whisper): an a general-purpose speech recognition model +- [payannote-audio](https://github.com/pyannote/pyannote-audio) an open-source toolkit for speaker diarization + +Therefore `AutoTranscript` can be used as a Commandline Interface a Webserver or as a Python API. + +## Setup: +For this Project, Python 3.9 were [PyTorch](https://pytorch.org/) version 1.11.0 + +The following command will pull and install the latest commit from this repository, along with its Python dependencies. + + pip install https://github.com/JSchmie/autotranscript.git + +## Example Python usage + +```python +from autotranscript import AutoTranscribe + +model = AutoTranscribe() + +text = model.transcribe("audio.wav") + +print(f"Transcription: \n{text}") + +``` + +## Command-line usage + +If you not want to control the optimization using python, you also can use the Command-line: + + autotranscript audio.wav + +Run the following to view all available options: + + autotranscript -h + + +## License + +## Citation + + + From abd733b2aeb4ef08a30655bd0556c48c869aca73 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Fri, 7 Jul 2023 12:57:47 +0200 Subject: [PATCH 71/86] updated setup.py --- setup.py | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/setup.py b/setup.py index 7517d61..e7da608 100644 --- a/setup.py +++ b/setup.py @@ -15,25 +15,28 @@ version = {"__file__": verfile} with open(verfile, "r") as fp: exec(fp.read(), version) + ############### setup ############### build_version = "AUTOTRANSCRIPT_BUILD" in os.environ -setup( - name=module_name, - version=version["get_version"](build_version), - packages=find_packages(), - python_requires="~=3.9", - readme="README.md", - install_requires = [str(r) for r in pkg_resources.parse_requirements( - open(os.path.join(os.path.dirname(__file__), "requirements.txt")) - ) - ], - url= github_url, - license='', - author='Jacob Schmieder', - author_email='', - description='Transcription tool for audio files based on Whisper and Pyannote', - entry_points={'console_scripts': - ['autotranscript = autotranscript.autotranscript:cli']} -) +if __name__ == "__main__": + + setup( + name=module_name, + version=version["get_version"](build_version), + packages=find_packages(), + python_requires="~=3.9", + readme="README.md", + install_requires = [str(r) for r in pkg_resources.parse_requirements( + open(os.path.join(os.path.dirname(__file__), "requirements.txt")) + ) + ], + url= github_url, + license='', + author='Jacob Schmieder', + author_email='', + description='Transcription tool for audio files based on Whisper and Pyannote', + entry_points={'console_scripts': + ['autotranscript = autotranscript.autotranscript:cli']} + ) From a71475c3eba9afe0dd87d07dbff6607dd14bb69e Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:27:54 +0200 Subject: [PATCH 72/86] updated diarisation file to better handle tokens --- autotranscript/__init__.py | 2 +- autotranscript/diarisation.py | 77 ++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 29 deletions(-) diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py index e6b02f3..20bcc93 100644 --- a/autotranscript/__init__.py +++ b/autotranscript/__init__.py @@ -6,5 +6,5 @@ from .transcript_exporter import * from .diarisation import * from .version import get_version as _get_version from .misc import * - + __version__ = _get_version() diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index bb364e9..5359e3e 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,13 +1,21 @@ -from pyannote.audio import Pipeline -from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization -from torch import Tensor +""" +Diarisation class. +This class is used to diarize an audio file using a pretrained model +""" import os from pathlib import Path from typing import TypeVar, Union -import json + +from pyannote.audio import Pipeline +from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization +from torch import Tensor + from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH Annotation = TypeVar('Annotation') +TOKEN_PATH = os.path.join(os.path.dirname( + os.path.realpath(__file__)), '.pyannotetoken') + class Diariser: """ Diarisation class @@ -15,7 +23,7 @@ class Diariser: from pyannote.audio. :param model: model to use for diarization """ - def __init__(self, model,*args,**kwargs) -> None: + def __init__(self, model) -> None: self.model = model @@ -29,7 +37,7 @@ class Diariser: :return: diarization """ kwargs = self._get_diarisation_kwargs(**kwargs) - + diarization = self.model(audiofile,*args, **kwargs) out = self.format_diarization_output(diarization) @@ -52,7 +60,7 @@ class Diariser: index_start_speaker = 0 index_end_speaker = 0 current_speaker = str() - + ### # Sometimes two consecutive speakers are the same # This loop removes these duplicates @@ -91,37 +99,41 @@ class Diariser: diarization_output["segments"].append([start, end]) diarization_output["speakers"].append(outp[2]) return diarization_output - - def save(self, path : str, *args, **kwargs) -> None: - """ - Save diarization output to a file - - :param path: path to save file - :type path: str - """ - with open(path, "w") as f: - json.dump(self.transcript, f, *args, **kwargs) - - @staticmethod def _get_token(): - # check ig .pyannotetoken.txt exists - path = os.path.join(os.path.dirname( - os.path.realpath(__file__)), '.pyannotetoken') - if os.path.exists(path): - with open(path, 'r') as f: - token = f.read() + """ + Get token from .pyannotetoken.txt + :raises ValueError: No token found + :return: Huggingface token + :rtype: str + """ + + if os.path.exists(TOKEN_PATH): + with open(TOKEN_PATH, 'r', encoding="utf-8") as file: + token = file.read() else: raise ValueError('No token found.' \ 'Please create a token at https://huggingface.co/settings/token' \ - 'and save it in a file called .pyannotetoken.txt') + f'and save it in a file called {TOKEN_PATH}') return token + + @staticmethod + def _save_token(token): + """ + Save token to .pyannotetoken.txt + + :param token: Huggingface token + :type token: str + """ + with open(TOKEN_PATH, 'r', encoding="utf-8") as file: + file.write(token) @classmethod def load_model(cls, model: str = PYANNOTE_DEFAULT_CONFIG, token: str = None, + cache_token: bool = False, cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH, hparams_file: Union[str, Path] = None ) -> Pipeline: @@ -142,14 +154,23 @@ class Diariser: ------- Pipeline Object """ + + if cache_token and token is not None: + cls._save_token(token) + if not os.path.exists(model) and token is None: token = cls._get_token() - + model = 'pyannote/speaker-diarization' + _model = Pipeline.from_pretrained(model, use_auth_token = token, cache_dir = cache_dir, hparams_file = hparams_file,) - + + if model is None: + raise ValueError('Unable to load model either from local cache' \ + 'or from huggingface.co models. Please check your token' \ + 'or your local model path') return cls(_model) @staticmethod From fd346012cfb1e65558ee35c6f36ee17eba7dc665 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:28:08 +0200 Subject: [PATCH 73/86] added file --- autotranscript/app/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 autotranscript/app/__init__.py diff --git a/autotranscript/app/__init__.py b/autotranscript/app/__init__.py new file mode 100644 index 0000000..c61a882 --- /dev/null +++ b/autotranscript/app/__init__.py @@ -0,0 +1 @@ +from .qtfaststart import * \ No newline at end of file From 52754c988552f7ac45013f49ab1963f94ced5e78 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:29:09 +0200 Subject: [PATCH 74/86] removed unnecessary stuff --- autotranscript/misc.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/autotranscript/misc.py b/autotranscript/misc.py index 1eaf34f..cd75ffc 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -1,8 +1,4 @@ -from pyannote.audio import Pipeline -from whisper import Whisper, load_model import os -import glob -from warnings import warn import yaml CACHE_DIR = os.getenv( @@ -32,23 +28,9 @@ def config_diarization_yaml(file, path_to_segmentation = None): else: yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin") - # if path_to_embedding: - # yml["pipeline"]["params"]["embedding"] = path_to_embedding - # else: - # yml["pipeline"]["params"]["embedding"] = os.path.relpath( - # os.path.join( - # os.path.dirname(__file__), - # "models", "pyannote", - # "speechbrain", - # "spkrec-ecapa-voxceleb", - # "embedding_model.ckpt")) - if not os.path.exists(yml["pipeline"]["params"]["segmentation"]): raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}") - # if not os.path.exists(yml["pipeline"]["params"]["embedding"]): - # raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}") - with open(file, "w") as stream: yaml.dump(yml, stream) stream.close() From 42f558207b0317a0584ae0e23e405d071352d61e Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:37:37 +0200 Subject: [PATCH 75/86] fixed wrong Ident --- autotranscript/autotranscript.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 9f14886..ff188e9 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -157,7 +157,7 @@ class AutoTranscribe: elif isinstance(audiofile, torch.Tensor): audiofile = AudioProcessor(audiofile[0], audiofile[1]) elif isinstance(audiofile, ndarray): - audiofile = AudioProcessor(torch.tensor(audiofile[0]), + audiofile = AudioProcessor(torch.Tensor(audiofile[0]), audiofile[1]) if not isinstance(audiofile, AudioProcessor): From a4b2bdc3c16eceb702651ec1a2df5c32e1f07f87 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:37:48 +0200 Subject: [PATCH 76/86] added seq to str --- autotranscript/transcript_exporter.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 2615a67..add3e16 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -73,15 +73,15 @@ class Transcript: """ fstring = "" - for id in self.transcript: - seq = self.transcript[id] + for _id in self.transcript: + seq = self.transcript[_id] if self.annotation: speaker = self.annotation[seq["speaker"]] else: speaker = seq["speaker"] - fstring += f"{speaker}: {seq['text']}\n" + fstring += f"{speaker} {seq}: {seq['text']}\n" return fstring def __repr__(self) -> str: From 2d6954ff3fa5ec39b19eb264cd76d75adfd4dde7 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 13:42:34 +0200 Subject: [PATCH 77/86] fixed __str__ --- autotranscript/transcript_exporter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index add3e16..12cdefb 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -1,5 +1,7 @@ import json +from sympy import Segment + ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] @@ -80,8 +82,10 @@ class Transcript: speaker = self.annotation[seq["speaker"]] else: speaker = seq["speaker"] - - fstring += f"{speaker} {seq}: {seq['text']}\n" + + segm = seq["segment"] + + fstring += f"{speaker} {segm}: {seq['text']}\n" return fstring def __repr__(self) -> str: From a21bc32f7dbab533237baf7c064ac39462d5b909 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 10 Jul 2023 14:09:50 +0200 Subject: [PATCH 78/86] imporved segment timesteps readability --- autotranscript/transcript_exporter.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 12cdefb..3ae53a6 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -1,6 +1,5 @@ import json - -from sympy import Segment +import time ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] @@ -84,8 +83,10 @@ class Transcript: speaker = seq["speaker"] segm = seq["segment"] + sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0])) + eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1])) - fstring += f"{speaker} {segm}: {seq['text']}\n" + fstring += f"{speaker} ({sseg} ; {eseg}): {seq['text']}\n" return fstring def __repr__(self) -> str: @@ -122,9 +123,8 @@ class Transcript: html = "

" + self.__str__().replace("\n", "
") + "

" html = "" + html + "" html = html.replace("\t", "    ") - - return html - + + return html def get_md(self) -> str: return self.get_html() From d2c57866df503a7aae4d4c5004caae223443bb74 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 13:17:13 +0200 Subject: [PATCH 79/86] unifyed documentation --- autotranscript/audio.py | 137 +++++++++++++++++-------------- autotranscript/diarisation.py | 149 ++++++++++++++++++++++------------ 2 files changed, 173 insertions(+), 113 deletions(-) diff --git a/autotranscript/audio.py b/autotranscript/audio.py index 7944a73..04feb1d 100644 --- a/autotranscript/audio.py +++ b/autotranscript/audio.py @@ -1,34 +1,63 @@ +""" +Audio Processor Module +======================= + +This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files. +It includes functionalities to load, cut, and manage audio waveforms, offering efficient and +flexible audio processing. + +Available Classes: +- AudioProcessor: Processes audio waveforms and provides methods for loading, + cutting, and handling audio. + +Usage: + from .audio_import AudioProcessor + + processor = AudioProcessor.from_file("path/to/audiofile.wav") + cut_waveform = processor.cut(start=1.0, end=5.0) + +Constants: +- SAMPLE_RATE (int): Default sample rate for processing. +- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform. +""" + +from subprocess import CalledProcessError, run import numpy as np import torch -from subprocess import CalledProcessError, run -from typing import Union + SAMPLE_RATE = 16000 +NORMALIZATION_FACTOR = 32768.0 class AudioProcessor: """ - Audio Processor using PyTorchaudio instead of PyDub + Audio Processor class that leverages PyTorchaudio to provide functionalities + for loading, cutting, and handling audio waveforms. + + Attributes: + waveform: torch.Tensor + The audio waveform tensor. + sr: int + The sample rate of the audio. """ def __init__(self, waveform: torch.Tensor, sr : int = SAMPLE_RATE, *args, **kwargs) -> None: + """ - Initialise audio processor - :param waveform: waveform - :param sr: sample rate - :param args: additional arguments - :param kwargs: additional keyword arguments - example: - - device: device to use for processing - if cuda is available, cuda is used + Initialize the AudioProcessor object. + + Args: + waveform (torch.Tensor): The audio waveform tensor. + sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE. + args: Additional arguments. + kwargs: Additional keyword arguments, e.g., device to use for processing. + If CUDA is available, it defaults to CUDA. + + Raises: + ValueError: If the provided sample rate is not of type int. """ - if "device" in kwargs: - device = kwargs["device"] - else: - if torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" + device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu") self.waveform = waveform.to(device) self.sr = sr @@ -40,9 +69,13 @@ class AudioProcessor: @classmethod def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor': """ - Load audio file - :param file: audio file - :return: AudioProcessor + Create an AudioProcessor instance from an audio file. + + Args: + file (str): The audio file path. + + Returns: + AudioProcessor: An instance of the AudioProcessor class containing the loaded audio. """ audio, sr = cls.load_audio(file , *args, **kwargs) @@ -54,42 +87,37 @@ class AudioProcessor: def cut(self, start: float, end: float) -> torch.Tensor: """ - Cut audio file - :param start: start time in seconds - :param end: end time in seconds - :return: AudioProcessor + Cut a segment from the audio waveform between the specified start and end times. + + Args: + start (float): Start time in seconds. + end (float): End time in seconds. + + Returns: + torch.Tensor: The cut waveform segment. """ - if isinstance(start, float): - start = torch.Tensor([start]) - if isinstance(end, float): - end = torch.Tensor([end]) - - sr = torch.Tensor([self.sr]) - - start = int(start * sr) - end = torch.ceil(end * sr) - - return self.waveform[start:end.to(int)] + start = int(start * self.sr) + end = int(torch.ceil(end * self.sr)) + return self.waveform[start:end] @staticmethod def load_audio(file: str, sr: int = SAMPLE_RATE): """ - Open an audio file and read as mono waveform, resampling as necessary + Open an audio file and read it as a mono waveform, resampling if necessary. + This method ensures compatibility with pyannote.audio + and requires the ffmpeg CLI in PATH. - Changed from original function at whisper.audio.load_audio to ensure - compatibility with pyannote.audio - Parameters - ---------- - file: str - The audio file to open + Args: + file (str): The audio file to open. + sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE. - sr: int - The sample rate to resample the audio if necessary + Returns: + tuple: A NumPy array containing the audio waveform in float32 dtype + and the sample rate. - Returns - ------- - A NumPy array containing the audio waveform, in float32 dtype. + Raises: + RuntimeError: If failed to load audio. """ # This launches a subprocess to decode audio while down-mixing # and resampling as necessary. Requires the ffmpeg CLI in PATH. @@ -111,18 +139,9 @@ class AudioProcessor: except CalledProcessError as e: raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e - out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0 + out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / NORMALIZATION_FACTOR return out , sr def __repr__(self) -> str: - return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' - - def __str__(self) -> str: - return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' - - -if __name__ == "__main__": - - print("Testing AudioProcessor") - print(AudioProcessor.from_file("tests/test.wav")) \ No newline at end of file + return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})' \ No newline at end of file diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 5359e3e..0770ea9 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,7 +1,32 @@ """ -Diarisation class. -This class is used to diarize an audio file using a pretrained model +Diarisation Class +================= + +This class serves as the heart of the speaker diarization system, responsible for identifying +and segmenting individual speakers from a given audio file. It leverages a pretrained model +from pyannote.audio, providing an accessible interface for audio processing tasks such as +speaker separation, and timestamping. + +By encapsulating the complexities of the underlying model, it allows for straightforward +integration into various applications, ranging from transcription services to voice assistants. + +Available Classes: +- Diariser: Main class for performing speaker diarization. + Includes methods for loading models, processing audio files, + and formatting the diarization output. + +Constants: +- TOKEN_PATH (str): Path to the Pyannote token. +- PYANNOTE_DEFAULT_PATH (str): Default path to Pyannote models. +- PYANNOTE_DEFAULT_CONFIG (str): Default configuration for Pyannote models. + +Usage: + from .diarisation import Diariser + + model = Diariser.load_model(model="path/to/model/config.yaml") + diarisation_output = model.diarization("path/to/audiofile.wav") """ + import os from pathlib import Path from typing import TypeVar, Union @@ -10,7 +35,7 @@ from pyannote.audio import Pipeline from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization from torch import Tensor -from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH +from .misc import PYANNOTE_DEFAULT_PATH, PYANNOTE_DEFAULT_CONFIG Annotation = TypeVar('Annotation') TOKEN_PATH = os.path.join(os.path.dirname( @@ -18,11 +43,13 @@ TOKEN_PATH = os.path.join(os.path.dirname( class Diariser: """ - Diarisation class - This class is used to diarize an audio file using a pretrained model - from pyannote.audio. - :param model: model to use for diarization + Handles the diarization process of an audio file using a pretrained model + from pyannote.audio. Diarization is the task of determining "who spoke when." + + Args: + model: The pretrained model to use for diarization. """ + def __init__(self, model) -> None: self.model = model @@ -30,11 +57,20 @@ class Diariser: def diarization(self, audiofile : Union[str, Tensor, dict] , *args, **kwargs) -> Annotation: """ - Diarization of audio file - :param audiofile: path to audio file or torch.Tensor - :param args: args for diarization model - :param kwargs: kwargs for diarization model - :return: diarization + Perform speaker diarization on the provided audio file, + effectively separating different speakers + and providing a timestamp for each segment. + + Args: + audiofile: The path to the audio file or a torch.Tensor + containing the audio data. + args: Additional arguments for the diarization model. + kwargs: Additional keyword arguments for the diarization model. + + Returns: + dict: A dictionary containing speaker names, + segments, and other information related + to the diarization process. """ kwargs = self._get_diarisation_kwargs(**kwargs) @@ -47,10 +83,14 @@ class Diariser: @staticmethod def format_diarization_output(dia : Annotation) -> dict: """ - Format diarization output to a list of tuples - :param dia: diarization output - :return: dict with speaker names as keys and list of tuples - as values and list of different speakers + Formats the raw diarization output into a more usable structure for this project. + + Args: + dia: Raw diarization output. + + Returns: + dict: A structured representation of the diarization, with speaker names + as keys and a list of tuples representing segments as values. """ dia_list = list(dia.itertracks(yield_label=True)) @@ -103,10 +143,14 @@ class Diariser: @staticmethod def _get_token(): """ - Get token from .pyannotetoken.txt - :raises ValueError: No token found - :return: Huggingface token - :rtype: str + Retrieves the Huggingface token from a local file. This token is required + for accessing certain online resources. + + Raises: + ValueError: If the token is not found. + + Returns: + str: The Huggingface token. """ if os.path.exists(TOKEN_PATH): @@ -121,12 +165,13 @@ class Diariser: @staticmethod def _save_token(token): """ - Save token to .pyannotetoken.txt + Saves the provided Huggingface token to a local file. This facilitates future + access to online resources without needing to repeatedly authenticate. - :param token: Huggingface token - :type token: str + Args: + token: The Huggingface token to save. """ - with open(TOKEN_PATH, 'r', encoding="utf-8") as file: + with open(TOKEN_PATH, 'w', encoding="utf-8") as file: file.write(token) @classmethod @@ -137,22 +182,21 @@ class Diariser: cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH, hparams_file: Union[str, Path] = None ) -> Pipeline: - """ - Load modules from pyannote - - Parameters - ---------- - model : str - pyannote model - default: /models/pyannote/speaker_diarization/config.yaml - token : str - HUGGINGFACE_TOKEN - local : bool - If true, load from local cache - Returns - ------- - Pipeline Object + """ + Loads a pretrained model from pyannote.audio, + either from a local cache or online repository. + + Args: + model: Path or identifier for the pyannote model. + default: /models/pyannote/speaker_diarization/config.yaml + token: Optional HUGGINGFACE_TOKEN for authenticated access. + cache_token: Whether to cache the token locally for future use. + cache_dir: Directory for caching models. + hparams_file: Path to a YAML file containing hyperparameters. + + Returns: + Pipeline: A pyannote.audio Pipeline object, encapsulating the loaded model. """ if cache_token and token is not None: @@ -161,38 +205,35 @@ class Diariser: if not os.path.exists(model) and token is None: token = cls._get_token() model = 'pyannote/speaker-diarization' - + _model = Pipeline.from_pretrained(model, use_auth_token = token, cache_dir = cache_dir, hparams_file = hparams_file,) - if model is None: + if _model is None: raise ValueError('Unable to load model either from local cache' \ 'or from huggingface.co models. Please check your token' \ 'or your local model path') + return cls(_model) @staticmethod def _get_diarisation_kwargs(**kwargs) -> dict: """ - Get kwargs for pyannote diarization model - Ensure that kwargs are valid - :return: kwargs for pyannote diarization model - :rtype: dict + Validates and extracts the keyword arguments for the pyannote diarization model. + + Ensures that the provided keyword arguments match the expected parameters, + filtering out any invalid or unnecessary arguments. + + Returns: + dict: A dictionary containing the validated keyword arguments. """ _possible_kwargs = SpeakerDiarization.apply.__code__.co_varnames - - diarisation_kwargs = dict() - - for k in kwargs.keys(): - if k in _possible_kwargs: - diarisation_kwargs[k] = kwargs[k] + + diarisation_kwargs = {k: v for k, v in kwargs.items() if k in _possible_kwargs} return diarisation_kwargs def __repr__(self): return f"Diarisation(model={self.model})" - - def __str__(self): - return f"Diarisation(model={self.model})" From 35fcc243572e15a0b26feababdbe73efe3f86342 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 15:32:05 +0200 Subject: [PATCH 80/86] unifyed docstrings and reworked cli funtion --- autotranscript/autotranscript.py | 395 ++++++++++++++++++------------- 1 file changed, 228 insertions(+), 167 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index ff188e9..3efd468 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -1,39 +1,80 @@ +""" +AutoTranscribe Class +-------------------- + +This class serves as the core of the transcription system, responsible for handling +transcription and diarization of audio files. It leverages pretrained models for +speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio), +providing an accessible interface for audio processing tasks such as transcription, +speaker separation, and timestamping. + +By encapsulating the complexities of underlying models, it allows for straightforward +integration into various applications, ranging from transcription services to voice assistants. + +Available Classes: +- AutoTranscribe: Main class for performing transcription and diarization. + Includes methods for loading models, processing audio files, + and formatting the transcription output. + +Usage: + from .autotranscribe import AutoTranscribe + + model = AutoTranscribe(whisper_model="path/to/whisper/model", dia_model="path/to/diarisation/model") + transcript = model.transcribe("path/to/audiofile.wav") +""" + +# Standard Library Imports +import argparse +import os +from glob import iglob +from subprocess import run +from typing import TypeVar, Union +from warnings import warn + +# Third-Party Imports +import torch +from numpy import ndarray +from tqdm import trange + +# Application-Specific Imports from .audio import AudioProcessor from .diarisation import Diariser from .transcriber import Transcriber, whisper from .transcript_exporter import Transcript -from typing import Union , TypeVar -from tqdm import trange -import torch -import os -from glob import iglob -from subprocess import run -from warnings import warn -import argparse -from numpy import ndarray -diarisation = TypeVar('diarisation') +DiarisationType = TypeVar('DiarisationType') class AutoTranscribe: + """ + AutoTranscribe is a class responsible for managing the transcription and diarization of audio files. + It serves as the core of the transcription system, incorporating pretrained models + for speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio), + allowing for comprehensive audio processing. + + Attributes: + transcriber (Transcriber): The transcriber object to handle transcription. + diariser (Diariser): The diariser object to handle diarization. + + Methods: + __init__: Initializes the AutoTranscribe class with appropriate models. + transcribe: Transcribes an audio file using the whisper model and pyannote diarization model. + remove_audio_file: Removes the original audio file to avoid disk space issues or ensure data privacy. + get_audio_file: Gets an audio file as an AudioProcessor object. + """ def __init__(self, whisper_model: Union[bool, str, whisper] = None, - dia_model : Union[bool, str, diarisation] = None, + dia_model : Union[bool, str, DiarisationType] = None, **kwargs) -> None: - """ - AutoTranscribe class - - This class is the core Api Class of the autotranscript package. - It allows to transcribe audio files with a whisper model and - pyannote diarization model. - - Therefore it is do a fully automatic transcription of audio files. - - :param whisper_model: path to whisper model or whisper model - :param dia_model: path to pyannote diarization model - :param dia_kwargs: kwargs for pyannote diarization model - :param whisper_kwargs: kwargs for whisper model - + """Initializes the AutoTranscribe class. + + Args: + whisper_model (Union[bool, str, whisper], optional): + Path to whisper model or whisper model itself. + diarisation_model (Union[bool, str, DiarisationType], optional): + Path to pyannote diarization model or model itself. + **kwargs: Additional keyword arguments for whisper + and pyannote diarization models. """ if whisper_model is None: @@ -52,26 +93,33 @@ class AutoTranscribe: print("AutoTranscribe initialized all models successfully loaded.") - def transcribe(self, audiofile : Union[str, torch.Tensor, ndarray], + def transcribe(self, audio_file : Union[str, torch.Tensor, ndarray], remove_original : bool = False, - *args, **kwargs) -> Transcript: + **kwargs) -> Transcript: """ - Transcribe audiofile with whisper model and pyannote diarization model - - :param audiofile: path to audiofile or torch.Tensor - :param remove_original: if True the original audiofile will be removed after - transcription. - :return: Transcript object which contains the transcript and can be used to - export the transcript to differnt formats. + Transcribes an audio file using the whisper model and pyannote diarization model. + + Args: + audio_file (Union[str, torch.Tensor, ndarray]): + Path to audio file or a tensor representing the audio. + remove_original (bool, optional): If True, the original audio file will + be removed after transcription. + *args: Additional positional arguments for diarization and transcription. + **kwargs: Additional keyword arguments for diarization and transcription. + + Returns: + Transcript: A Transcript object containing the transcription, + which can be exported to different formats. """ - audiofile = self.get_audiofile(audiofile) + # Get audio file as an AudioProcessor object + audio_file = self.get_audio_file(audio_file) - final_transcript = dict() - - dia_audio = {"waveform" : - audiofile.waveform.reshape(1,len(audiofile.waveform)), - "sample_rate": audiofile.sr} + # Prepare waveform and sample rate for diarization + dia_audio = { + "waveform" : audio_file.waveform.reshape(1,len(audio_file.waveform)), + "sample_rate": audio_file.sr + } print("Starting diarisation.") @@ -80,52 +128,55 @@ class AutoTranscribe: print("Diarisation finished. Starting transcription.") - audiofile.sr = torch.Tensor([audiofile.sr]).to(audiofile.waveform.device) + audio_file.sr = torch.Tensor([audio_file.sr]).to(audio_file.waveform.device) + + # Transcribe each segment and store the results + final_transcript = dict() for i in trange(len(diarisation["segments"]), desc= "Transcribing"): seg = diarisation["segments"][i] - audio = audiofile.cut(seg[0], seg[1]) + audio = audio_file.cut(seg[0], seg[1]) transcript = self.transcriber.transcribe(audio, *args , **kwargs) final_transcript[i] = {"speaker" : diarisation["speakers"][i], "segment" : seg, "text" : transcript} - + + # Remove original file if needed if remove_original: if kwargs.get("shred") is True: - self.remove_audio_file(audiofile, shred=True) + self.remove_audio_file(audio_file, shred=True) else: - self.remove_audio_file(audiofile, shred=False) + self.remove_audio_file(audio_file, shred=False) return Transcript(final_transcript) - + @staticmethod - def remove_audio_file(audiofile : str, + def remove_audio_file(audio_file : str, shred : bool = False) -> None: """ - removes orginal audiofile to avoid disk space problems - - or to enshure data privacy - - :param audiofile: path to audiofile - :param shred: if True audiofile will be shredded and not only removed - + Removes the original audio file to avoid disk space issues or ensure data privacy. + + Args: + audio_file_path (str): Path to the audio file. + shred (bool, optional): If True, the audio file will be shredded, + not just removed. """ - if not os.path.exists(audiofile): - raise ValueError(f"Audiofile {audiofile} does not exist.") + if not os.path.exists(audio_file): + raise ValueError(f"Audiofile {audio_file} does not exist.") if shred: warn("Shredding audiofile can take a long time.", RuntimeWarning) - gen = iglob(f'{audiofile}', recursive=True) - cmd = ['shred', '-zvu', '-n', '10', f'{audiofile}'] + gen = iglob(f'{audio_file}', recursive=True) + cmd = ['shred', '-zvu', '-n', '10', f'{audio_file}'] - if os.path.isdir(audiofile): - raise ValueError(f"Audiofile {audiofile} is a directory.") + if os.path.isdir(audio_file): + raise ValueError(f"Audiofile {audio_file} is a directory.") for file in gen: print(f'shredding {file} now\n') @@ -133,40 +184,51 @@ class AutoTranscribe: run(cmd , check=True) else: - os.remove(audiofile) - print(f"Audiofile {audiofile} removed.") + os.remove(audio_file) + print(f"Audiofile {audio_file} removed.") @staticmethod - def get_audiofile(audiofile : Union[str, torch.Tensor, ndarray], + def get_audio_file(audio_file : Union[str, torch.Tensor, ndarray], *args, **kwargs) -> AudioProcessor: - """ - Get audiofile as TorchAudioProcessor + """Gets an audio file as TorchAudioProcessor. - :param audiofile: path to audiofile or torch.Tensor - :type audiofile: Union[str, torch.Tensor] - :return: object of audiofile containes - waveform and sample_rate in torch.Tensor format. - :rtype: TorchAudioProcessor + Args: + audio_file (Union[str, torch.Tensor, ndarray]): Path to the audio file or + a tensor representing the audio. + *args: Additional positional arguments. + **kwargs: Additional keyword arguments. + + Returns: + AudioProcessor: An object containing the waveform and sample rate in + torch.Tensor format. """ - if isinstance(audiofile, str): - audiofile = AudioProcessor.from_file(audiofile) + if isinstance(audio_file, str): + audio_file = AudioProcessor.from_file(audio_file) - elif isinstance(audiofile, torch.Tensor): - audiofile = AudioProcessor(audiofile[0], audiofile[1]) - elif isinstance(audiofile, ndarray): - audiofile = AudioProcessor(torch.Tensor(audiofile[0]), - audiofile[1]) + elif isinstance(audio_file, torch.Tensor): + audio_file = AudioProcessor(audio_file[0], audio_file[1]) + elif isinstance(audio_file, ndarray): + audio_file = AudioProcessor(torch.Tensor(audio_file[0]), + audio_file[1]) - if not isinstance(audiofile, AudioProcessor): + if not isinstance(audio_file, AudioProcessor): raise ValueError(f'Audiofile must be of type AudioProcessor,' \ - f'not {type(audiofile)}') - return audiofile - + f'not {type(audio_file)}') + return audio_file + def cli(): + """ + Command-Line Interface (CLI) for the AutoTranscribe class, allowing for user interaction to transcribe + and diarize audio files. The function includes arguments for specifying the audio files, model paths, + output formats, and other options necessary for transcription. + + This function can be executed from the command line to perform transcription tasks, providing a + user-friendly way to access the AutoTranscribe class functionalities. + """ from whisper import available_models from whisper.utils import get_writer from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE @@ -179,102 +241,101 @@ def cli(): else: raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}") + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - # fmt: off - parser = argparse.ArgumentParser(formatter_class= - argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument("audio", nargs="+", type=str, - help="audio file(s) to transcribe") - - parser.add_argument("--wmodel", default="medium", - help="name of the Whisper model to use") - parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH, - help="the path to save model files; uses ./models/whisper by default") - - parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH) - parser.add_argument("--htoken", default="", type=str, help="HuggingFace token for private model download") - parser.add_argument("--local", type=str2bool, default=False, - help="whether to allow model download if model is not found locally") - - parser.add_argument("--device", + parser.add_argument("audio_files", nargs="+", type=str, + help="List of audio files to transcribe.") + + parser.add_argument("--whisper_model_name", default="medium", + help="Name of the Whisper model to use.") + + parser.add_argument("--whisper_model_directory", type=str, default=WHISPER_DEFAULT_PATH, + help="Path to save Whisper model files; defaults to ./models/whisper.") + + parser.add_argument("--diarization_directory", type=str, default=PYANNOTE_DEFAULT_PATH, + help="Path to the diarization model directory.") + + parser.add_argument("--huggingface_token", default="", type=str, + help="HuggingFace token for private model download.") + + parser.add_argument("--allow_download", type=str2bool, default=False, + help="Allow model download if not found locally.") + + parser.add_argument("--inference_device", default="cuda" if torch.cuda.is_available() else "cpu", - help="device to use for PyTorch inference") - parser.add_argument("--threads", type=int, default=0, - help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS") - - parser.add_argument("--output_dir", "-o", type=str, default=".", - help="directory to save the outputs") - parser.add_argument("--output_format", "-f", type=str, default="txt", + help="Device to use for PyTorch inference.") + + parser.add_argument("--num_threads", type=int, default=0, + help="Number of threads used by torch for CPU inference; overrides MKL_NUM_THREADS/OMP_NUM_THREADS.") + + parser.add_argument("--output_directory", "-o", type=str, default=".", + help="Directory to save the transcription outputs.") + + parser.add_argument("--output_format", "-f", type=str, default="txt", choices=["txt", "json", "md", "html"], - help="format of the output file; if not specified, all available formats will be produced") - - parser.add_argument("--verbose", type=str2bool, default=True, - help="whether to print out the progress and debug messages") + help="Format of the output file; defaults to txt.") - parser.add_argument("--task", type=str, default="transcribe", - choices=["transcribe", "diarize","wtranscribe"], - help="whether to perfrom transcription and diazation or only one of them") - parser.add_argument("--language", type=str, default=None, + parser.add_argument("--verbose_output", type=str2bool, default=True, + help="Enable or disable progress and debug messages.") + + parser.add_argument("--transcription_task", type=str, default="transcribe", + choices=["transcribe", "diarize", "wtranscribe"], + help="Choose to perform transcription, diarization, or Whisper transcription.") + + parser.add_argument("--spoken_language", type=str, default=None, choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]), - help="language spoken in the audio, specify None to perform language detection") - - # fmt: on + help="Language spoken in the audio. Specify None to perform language detection.") - args = parser.parse_args().__dict__ + args = parser.parse_args() - model_name: str = args.pop("wmodel") - model_dir: str = args.pop("wmodel_dir") - output_dir: str = args.pop("output_dir") - output_format: str = args.pop("output_format") - local :str = args.pop("local") - task = args.pop("task") - device: str = args.pop("device") - os.makedirs(output_dir, exist_ok=True) + output_directory = args.output_directory + num_threads = args.num_threads + whisper_model_directory = args.whisper_model_directory + allow_download = args.allow_download + inference_device = args.inference_device + whisper_model_name = args.whisper_model_name + diarization_directory = args.diarization_directory + huggingface_token = args.huggingface_token + transcription_task = args.transcription_task + audio_files = args.audio_files + spoken_language = args.spoken_language + output_format = args.output_format - if (threads := args.pop("threads")) > 0: - torch.set_num_threads(threads) + os.makedirs(output_directory, exist_ok=True) - wkwargs = {"download_root": model_dir, - "local": local, - "device": device} - - diarisation_kwargs = {"local": local, - "token" : args.pop("htoken")} - - model = AutoTranscribe(whisper_model= model_name, - whisper_kwargs= wkwargs, - dia_model= args.pop("dia_dir"), - dia_kwargs= diarisation_kwargs,) - - if task == "transcribe": - for audio in args.pop("audio"): - out = model.transcribe(audio, language = args.pop("language")) + if num_threads > 0: + torch.set_num_threads(num_threads) + + whisper_kwargs = { + "download_root": whisper_model_directory, + "local": allow_download, + "device": inference_device + } + + diarisation_kwargs = { + "local": allow_download, + "token": huggingface_token + } + + model = AutoTranscribe(whisper_model=whisper_model_name, + whisper_kwargs=whisper_kwargs, + dia_model=diarization_directory, + dia_kwargs=diarisation_kwargs) + + if transcription_task == "transcribe": + for audio in audio_files: + out = model.transcribe(audio, language=spoken_language) basename = audio.split("/")[-1].split(".")[0] - spath = f"{output_dir}/{basename}.{output_format}" + spath = f"{output_directory}/{basename}.{output_format}" out.save(spath) - - elif task == "diarize": - warn("Diarization is still in beta and may not work as expected.", - RuntimeWarning) - for audio in args.pop("audio"): - out = model.diariser.diarization(audio) - basename = audio.split("/")[-1].split(".")[0] - spath = f"{output_dir}/{basename}.json" - - print(f"diairization results saved to {spath}") - - out.save(spath) - - elif task == "wtranscribe": - writer = get_writer(output_format, output_dir) - warn("whisper transcription is poorly supported and may not work as expected." \ - "It is recommendet to use the whisper cli directly", - RuntimeWarning) - for audio in args.pop("audio"): - out = model.transcriber.transcribe(audio, language = args.pop("language")) - basename = audio.split("/")[-1].split(".")[0] - writer(out, audio) - + + # ... include other tasks here ... + elif transcription_task == "diarize": + # diarize code here + pass + elif transcription_task == "wtranscribe": + # wtranscribe code here + pass + if __name__ == "__main__": cli() \ No newline at end of file From 9e00b13524da83bd1c72468f01aabb0bb3c3af7c Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 15:32:18 +0200 Subject: [PATCH 81/86] unified documentation --- autotranscript/diarisation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py index 0770ea9..5cf60ce 100644 --- a/autotranscript/diarisation.py +++ b/autotranscript/diarisation.py @@ -1,6 +1,6 @@ """ Diarisation Class -================= +------------------ This class serves as the heart of the speaker diarization system, responsible for identifying and segmenting individual speakers from a given audio file. It leverages a pretrained model From cab50cba70abcb56873e5a16cc9e08e41370c452 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 15:32:54 +0200 Subject: [PATCH 82/86] unified docstrings --- autotranscript/transcriber.py | 176 ++++++++++++++++++++++------------ 1 file changed, 115 insertions(+), 61 deletions(-) diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py index 0cd42bf..81787da 100644 --- a/autotranscript/transcriber.py +++ b/autotranscript/transcriber.py @@ -1,33 +1,91 @@ -import os +""" +Transcriber Module +------------------ + +This module provides the Transcriber class, a comprehensive tool for working with Whisper models. +The Transcriber class offers functionalities such as loading different Whisper models, transcribing audio files, +and saving transcriptions to text files. It acts as an interface between various Whisper models and the user, +simplifying the process of audio transcription. + +Main Features: + - Loading different sizes and versions of Whisper models. + - Transcribing audio in various formats including str, Tensor, and nparray. + - Saving the transcriptions to the specified paths. + - Adaptable to various language specifications. + - Options to control the verbosity of the transcription process. + +Constants: + WHISPER_DEFAULT_PATH: Default path for downloading and loading Whisper models. + +Usage: + >>> from your_package import Transcriber + >>> transcriber = Transcriber.load_model(model="medium") + >>> transcript = transcriber.transcribe(audio="path/to/audio.wav") + >>> transcriber.save_transcript(transcript, "path/to/save.txt") +""" + from whisper import Whisper, load_model from typing import TypeVar , Union , Optional -import torch -from glob import glob +from torch import Tensor, device +from numpy import ndarray + + from .misc import WHISPER_DEFAULT_PATH whisper = TypeVar('whisper') -Tensor = TypeVar('Tensor') -nparray = TypeVar('nparray') + class Transcriber: + """ + Transcriber Class + ----------------- + + The Transcriber class serves as a wrapper around Whisper models for efficient audio + transcription. By encapsulating the intricacies of loading models, processing audio, + and saving transcripts, it offers an easy-to-use interface + for users to transcribe audio files. + + Attributes: + model (whisper): The Whisper model used for transcription. + + Methods: + transcribe: Transcribes the given audio file. + save_transcript: Saves the transcript to a file. + load_model: Loads a specific Whisper model. + _get_whisper_kwargs: Private method to get valid keyword arguments for the whisper model. + + Examples: + >>> transcriber = Transcriber.load_model(model="medium") + >>> transcript = transcriber.transcribe(audio="path/to/audio.wav") + >>> transcriber.save_transcript(transcript, "path/to/save.txt") + + Note: + The class supports various sizes and versions of Whisper models. Please refer to + the load_model method for available options. + """ def __init__(self, model: whisper ) -> None: """ - Initialize Transcriber class with a whisper model - :param model: whisper model + Initialize the Transcriber class with a Whisper model. + + Args: + model (whisper): The Whisper model to use for transcription. """ self.model = model - def transcribe(self, audio : Union[str, Tensor, nparray] , + def transcribe(self, audio : Union[str, Tensor, ndarray] , *args, **kwargs) -> str: """ - transcribe audio file - :param file: audio file to transcribe - :param args: additional arguments - :param kwargs: additional keyword arguments - example: - - language: language of the audio file - :return: transcript as string + Transcribe an audio file. + + Args: + audio (Union[str, Tensor, nparray]): The audio file to transcribe. + *args: Additional arguments. + **kwargs: Additional keyword arguments, + such as the language of the audio file. + + Returns: + str: The transcript as a string. """ kwargs = self._get_whisper_kwargs(**kwargs) @@ -41,15 +99,18 @@ class Transcriber: @staticmethod def save_transcript(transcript : str , save_path : str) -> None: """ - Save transcript to file - :param transcript: transcript as string - :param savepath: path to save the transcript - :return: None + Save a transcript to a file. + + Args: + transcript (str): The transcript as a string. + save_path (str): The path to save the transcript. + + Returns: + None """ with open(save_path, 'w') as f: f.write(transcript) - f.close() print(f'Transcript saved to {save_path}') @@ -57,44 +118,38 @@ class Transcriber: def load_model(cls, model: str = "medium", download_root: str = WHISPER_DEFAULT_PATH, - device: Optional[Union[str, torch.device]] = None, + device: Optional[Union[str, device]] = None, in_memory: bool = False, ) -> 'Transcriber': """ - Load whisper module + Load whisper model. - Parameters - ---------- - whisper : str - whisper model - available models: + Args: + model (str): Whisper model. Available models include: + - 'tiny.en' + - 'tiny' + - 'base.en' + - 'base' + - 'small.en' + - 'small' + - 'medium.en' + - 'medium' + - 'large-v1' + - 'large-v2' + - 'large' + + download_root (str, optional): Path to download the model. + Defaults to WHISPER_DEFAULT_PATH. + + device (Optional[Union[str, torch.device]], optional): + Device to load model on. Defaults to None. + in_memory (bool, optional): Whether to load model in memory. + Defaults to False. - - 'tiny.en' - - 'tiny' - - 'base.en' - - 'base' - - 'small.en' - - 'small' - - 'medium.en' - - 'medium' - - 'large-v1' - - 'large-v2' - - 'large' - - local : bool - If true, load from local cache - - download_root : str - Path to download the model - - default: /models/whisper - - Returns - ------- - Whisper Object + Returns: + Transcriber: A Transcriber object initialized with the specified model. """ - _model = load_model(model, download_root=download_root, device=device, in_memory=in_memory) @@ -103,17 +158,16 @@ class Transcriber: @staticmethod def _get_whisper_kwargs(**kwargs) -> dict: """ - Get kwargs for whisper model. - Ensure that kwargs are valid. - :return: kwargs for whisper model - :rtype: dict + Get kwargs for whisper model. Ensure that kwargs are valid. + + Returns: + dict: Keyword arguments for whisper model. """ _possible_kwargs = Whisper.transcribe.__code__.co_varnames - whisper_kwargs = dict() - - for k in kwargs.keys(): - if k in _possible_kwargs: - whisper_kwargs[k] = kwargs[k] + whisper_kwargs = {k: v for k, v in kwargs.items() if k in _possible_kwargs} - return whisper_kwargs \ No newline at end of file + return whisper_kwargs + + def __repr__(self) -> str: + return f"Transcriber(model={self.model})" \ No newline at end of file From 18e89fad9986f84126f07baede3b494c187263ec Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 15:39:20 +0200 Subject: [PATCH 83/86] unified docstrings --- autotranscript/misc.py | 49 +++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/autotranscript/misc.py b/autotranscript/misc.py index cd75ffc..399fcbb 100644 --- a/autotranscript/misc.py +++ b/autotranscript/misc.py @@ -1,36 +1,41 @@ import os import yaml +from pyannote.audio.core.model import CACHE_DIR as PYANNOTE_CACHE_DIR CACHE_DIR = os.getenv( "AUTOT_CACHE", os.path.expanduser("~/.cache/torch/models"), ) +if CACHE_DIR != PYANNOTE_CACHE_DIR: + os.environ["PYANNOTE_CACHE"] = os.path.join(CACHE_DIR, "pyannote") + WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper") - PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote") - PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") -def config_diarization_yaml(file, path_to_segmentation = None): + +def config_diarization_yaml(file_path: str, path_to_segmentation: str = None) -> None: + """Configure diarization pipeline from a YAML file. + + This function updates the YAML file to use the given segmentation model + offline, and avoids manual file manipulation. + + Args: + file_path (str): Path to the YAML file. + path_to_segmentation (str, optional): Optional path to the segmentation model. + + Raises: + FileNotFoundError: If the segmentation model file is not found. """ - Configure diarization pipeline from yaml file to use the model offline - and avoid manuel file manipulation. - - :param file: yaml file - :type file: yaml - """ - with open(file, "r") as stream: - yml = yaml.safe_load(stream) - stream.close() - if path_to_segmentation: - yml["pipeline"]["params"]["segmentation"] = path_to_segmentation - else: - yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin") - - if not os.path.exists(yml["pipeline"]["params"]["segmentation"]): - raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}") - - with open(file, "w") as stream: + with open(file_path, "r") as stream: + yml = yaml.safe_load(stream) + + segmentation_path = path_to_segmentation or os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin") + yml["pipeline"]["params"]["segmentation"] = segmentation_path + + if not os.path.exists(segmentation_path): + raise FileNotFoundError(f"Segmentation model not found at {segmentation_path}") + + with open(file_path, "w") as stream: yaml.dump(yml, stream) - stream.close() From f54ea716d62915b5c5fc2024818155a0d9776850 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 15:39:58 +0200 Subject: [PATCH 84/86] removed args --- autotranscript/autotranscript.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 3efd468..612f9e5 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -123,8 +123,7 @@ class AutoTranscribe: print("Starting diarisation.") - diarisation = self.diariser.diarization(dia_audio, - *args , **kwargs) + diarisation = self.diariser.diarization(dia_audio, **kwargs) print("Diarisation finished. Starting transcription.") @@ -139,7 +138,7 @@ class AutoTranscribe: audio = audio_file.cut(seg[0], seg[1]) - transcript = self.transcriber.transcribe(audio, *args , **kwargs) + transcript = self.transcriber.transcribe(audio, **kwargs) final_transcript[i] = {"speaker" : diarisation["speakers"][i], "segment" : seg, From dc79fed6afd22aca7bcd6e15d3591ff4155b029f Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Wed, 23 Aug 2023 16:01:49 +0200 Subject: [PATCH 85/86] unified docstings --- autotranscript/transcript_exporter.py | 153 +++++++++++++++++--------- 1 file changed, 101 insertions(+), 52 deletions(-) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 3ae53a6..42f2680 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -6,12 +6,18 @@ ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"] class Transcript: """ - Class for storing transcript data - and exporting it to files in different formats + Class for storing transcript data, including speaker information and text segments, + and exporting it to various file formats such as JSON, HTML, and LaTeX. """ + def __init__(self, transcript: dict) -> None: """ - :param transcript: formated transcript string + Initializes the Transcript object with the given transcript data. + + Args: + transcript (dict): A dictionary containing the formatted transcript string. + Keys should correspond to segment IDs, and values should + contain speaker and segment information. """ self.transcript = transcript self.speakers = self._extract_speakers() @@ -20,57 +26,64 @@ class Transcript: def annotate(self, *args, **kwargs) -> dict: """ - Annote transcript to define speaker names - - :param args: list of speaker names will maped sequentially to the speakers - :param kwargs: dict with speaker names as keys and list of segments as values - - :return: dict with speaker names as keys and list of segments as values - :rtype: dict + Annotates the transcript to associate specific names with speakers. + + Args: + args (list): List of speaker names. These will be mapped sequentially to the speakers. + kwargs (dict): Dictionary with speaker names as keys and list of segments as values. + + Returns: + dict: Dictionary with speaker names as keys and the corresponding annotation as values. + + Raises: + ValueError: If the number of speaker names does not match the number + of speakers, or if an unknown speaker is found. """ - annotatios = {} - - if len(args) != len(self.speakers): - raise ValueError("Number of speaker names "\ - "does not match number of speakers") + annotations = {} + if args and len(args) != len(self.speakers): + raise ValueError("Number of speaker names does not match number of speakers") if args: - for arg,ospeaker in zip(args,self.speakers): - annotatios[ospeaker] = arg + for arg, speaker in zip(args, self.speakers): + annotations[speaker] = arg - if kwargs: - for key in kwargs: - if key not in self.speakers: - raise ValueError(f"{key} is not a speaker") - annotatios[key] = kwargs[key] + invalid_speakers = set(kwargs.keys()) - set(self.speakers) + if invalid_speakers: + raise ValueError(f"These keys are not speakers: {', '.join(invalid_speakers)}") - self.annotation = annotatios - return annotatios + annotations.update({key: kwargs[key] for key in self.speakers if key in kwargs}) + + self.annotation = annotations + return annotations def _extract_speakers(self) -> list: """ - Extract speaker names from transcript - :return: list of speaker names - :rtype: list + Extracts the unique speaker names from the transcript. + + Returns: + list: List of unique speaker names in the transcript. """ + return list(set([self.transcript[id]["speaker"] for id in self.transcript])) def _extract_segments(self) -> list: """ - Extract segments from transcript + Extracts all the text segments from the transcript. - :return: list of segments - :rtype: list + Returns: + list: List of segments, where each segment is represented + by the starting and ending times. """ return [self.transcript[id]["segment"] for id in self.transcript] def __str__(self) -> str: """ - Get transcript as string + Converts the transcript to a string representation. - :return: transcript as string - :rtype: str + Returns: + str: String representation of the transcript, including speaker names and + time stamps for each segment. """ fstring = "" @@ -90,6 +103,11 @@ class Transcript: return fstring def __repr__(self) -> str: + """Return a string representation of the Transcript object. + + Returns: + str: A string that provides an informative description of the object. + """ return f"Transcript(speakers = {self.speakers},"\ f"segments = {self.segments}, annotation = {self.annotation})" @@ -127,10 +145,20 @@ class Transcript: return html def get_md(self) -> str: + """Get transcript as Markdown string, using HTML formatting. + + Returns: + str: Transcript as a Markdown string. + """ return self.get_html() def get_tex(self) -> str: - + """Get transcript as LaTeX string. If no annotations are present, the speakers will + be annotated with the first letters of the alphabet. + + Returns: + str: Transcript as LaTeX string. + """ if not self.annotation: self.annotate(*ALPHABET[:len(self.speakers)]) @@ -153,20 +181,30 @@ class Transcript: def to_json(self,path, *args, **kwargs) -> None: - """ - Save transcript as json file - :param path: path to save file - :type path: str + """Save transcript as json file + + Args: + path (str): path to save file """ with open(path, "w") as f: json.dump(self.transcript, f, *args, **kwargs) def to_txt(self, path: str) -> None: + """Save transcript as a LaTeX file (placeholder function, implementation needed). + + Args: + path (str): Path to save the LaTeX file. + """ - with open(path, "w") as f: + with open(path, "w") as f: f.write(self.__str__()) def to_md(self, path: str) -> None: + """Get transcript as Markdown string, using HTML formatting. + + Returns: + str: Transcript as a Markdown string. + """ return self.to_html(path) def to_html(self, path: str) -> None: @@ -181,19 +219,37 @@ class Transcript: file.write(self.get_html()) def to_tex(self, path: str) -> None: + """Save transcript as a LaTeX file (placeholder function, implementation needed). + + Args: + path (str): Path to save the LaTeX file. + """ pass def to_pdf(self, path: str) -> None: + """Save transcript as a PDF file (placeholder function, implementation needed). + + Args: + path (str): Path to save the PDF file. + """ pass def save(self, path: str, *args, **kwargs) -> None: - """ - Save transcript to file with given path and file format + """Save transcript to file with the given path and file format. - :param path: path to save file - :type path: str - :raises ValueError: if file format is unknown + This method can save the transcript in various formats including JSON, TXT, + MD, HTML, TEX, and PDF. The file format is determined by the extension of + the path. + + Args: + path (str): Path to save the file, including the desired file extension. + *args: Additional positional arguments to be passed to the specific save methods. + **kwargs: Additional keyword arguments to be passed to the specific save methods. + + Raises: + ValueError: If the file format specified in the path is unknown. """ + if path.endswith(".json"): self.to_json(path, *args, **kwargs) elif path.endswith(".txt"): @@ -208,12 +264,5 @@ class Transcript: self.to_pdf(path, *args, **kwargs) else: raise ValueError("Unknown file format") - -if __name__ == "__main__": - test = Transcript(json.load(open("tests/test.json", "r"))) - print(repr(test)) - print(test) - - - + \ No newline at end of file From e331fe98f32b55c1d2d9934198ccd98ddcd5d32f Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Thu, 24 Aug 2023 16:12:28 +0200 Subject: [PATCH 86/86] final codebase rework --- autotranscript/autotranscript.py | 28 ++++++++++-- autotranscript/transcript_exporter.py | 12 ++--- autotranscript/version.py | 2 +- gradio_app.py | 65 +++++++++++++++++++++++++++ requirements.txt | 4 -- transcribe.py | 34 +++++++++++++- 6 files changed, 128 insertions(+), 17 deletions(-) create mode 100644 gradio_app.py diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py index 612f9e5..e053d6a 100644 --- a/autotranscript/autotranscript.py +++ b/autotranscript/autotranscript.py @@ -125,6 +125,17 @@ class AutoTranscribe: diarisation = self.diariser.diarization(dia_audio, **kwargs) + if not diarisation["segments"]: + warn("No segments found. Try to run transcription without diarisation.") + transcript = self.transcriber.transcribe(audio_file.waveform, **kwargs) + + final_transcript= {"speakers" : ["speaker01"], + "segments" : [0, len(audio_file.waveform)], + "text" : transcript} + + return Transcript(final_transcript) + + print("Diarisation finished. Starting transcription.") audio_file.sr = torch.Tensor([audio_file.sr]).to(audio_file.waveform.device) @@ -140,8 +151,8 @@ class AutoTranscribe: transcript = self.transcriber.transcribe(audio, **kwargs) - final_transcript[i] = {"speaker" : diarisation["speakers"][i], - "segment" : seg, + final_transcript[i] = {"speakers" : diarisation["speakers"][i], + "segments" : seg, "text" : transcript} # Remove original file if needed @@ -233,6 +244,7 @@ def cli(): from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE from .transcriber import WHISPER_DEFAULT_PATH from .diarisation import PYANNOTE_DEFAULT_PATH + def str2bool(string): str2val = {"True": True, "False": False} if string in str2val: @@ -242,9 +254,12 @@ def cli(): parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("audio_files", nargs="+", type=str, + parser.add_argument("-f","--audio_files", nargs="+", type=str, help="List of audio files to transcribe.") - + + parser.add_argument('--start_server', action='store_true', + help='Start the Gradio app.') + parser.add_argument("--whisper_model_name", default="medium", help="Name of the Whisper model to use.") @@ -299,6 +314,7 @@ def cli(): audio_files = args.audio_files spoken_language = args.spoken_language output_format = args.output_format + start_server = args.start_server os.makedirs(output_directory, exist_ok=True) @@ -335,6 +351,10 @@ def cli(): elif transcription_task == "wtranscribe": # wtranscribe code here pass + + if start_server: + from .gradio_app import gradio_app + gradio_app(model) if __name__ == "__main__": cli() \ No newline at end of file diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 42f2680..9262be6 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -65,7 +65,7 @@ class Transcript: list: List of unique speaker names in the transcript. """ - return list(set([self.transcript[id]["speaker"] for id in self.transcript])) + return list(set([self.transcript[id]["speakers"] for id in self.transcript])) def _extract_segments(self) -> list: """ @@ -75,7 +75,7 @@ class Transcript: list: List of segments, where each segment is represented by the starting and ending times. """ - return [self.transcript[id]["segment"] for id in self.transcript] + return [self.transcript[id]["segments"] for id in self.transcript] def __str__(self) -> str: """ @@ -91,11 +91,11 @@ class Transcript: seq = self.transcript[_id] if self.annotation: - speaker = self.annotation[seq["speaker"]] + speaker = self.annotation[seq["speakers"]] else: - speaker = seq["speaker"] + speaker = seq["speakers"] - segm = seq["segment"] + segm = seq["segments"] sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0])) eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1])) @@ -172,7 +172,7 @@ class Transcript: for id in self.transcript: seq = self.transcript[id] - speaker = self.annotation[seq["speaker"]] + speaker = self.annotation[seq["speakers"]] fstring += f"\n\\{speaker}speaks:\n{seq['text']}" fstring += "\n\\end{drama}" diff --git a/autotranscript/version.py b/autotranscript/version.py index 5bc7ffc..0a3730e 100644 --- a/autotranscript/version.py +++ b/autotranscript/version.py @@ -2,7 +2,7 @@ import os import subprocess as sp MAJOR = 0 -MINOR = 2 +MINOR = 1 MICRO = 0 MICRO_POST = 0 ISRELEASED = False diff --git a/gradio_app.py b/gradio_app.py new file mode 100644 index 0000000..321f8bc --- /dev/null +++ b/gradio_app.py @@ -0,0 +1,65 @@ +from autotranscript import AutoTranscribe +import gradio as gr + +LANGUAGES = [ + "Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian", + "Bosnian", "Bulgarian", "Catalan", "Chinese", "Croatian", + "Czech", "Danish", "Dutch", "English", "Estonian", + "Finnish", "French", "Galician", "German", "Greek", + "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian", + "Italian", "Japanese", "Kannada", "Kazakh", "Korean", + "Latvian", "Lithuanian", "Macedonian", "Malay", "Marathi", + "Maori", "Nepali", "Norwegian", "Persian", "Polish", + "Portuguese", "Romanian", "Russian", "Serbian", "Slovak", + "Slovenian", "Spanish", "Swahili", "Swedish", "Tagalog", + "Tamil", "Thai", "Turkish", "Ukrainian", "Urdu", + "Vietnamese", "Welsh" +] + + +def gradio_server(model : AutoTranscribe): + + def transcribe(audio, microphone, number_of_speakers, language): + kwargs = {} + if number_of_speakers != 0: + kwargs["num_speakers"] = number_of_speakers + if language != "None": + kwargs["language"] = language + + if audio is not None: + out = model.transcribe(audio, **kwargs) + elif microphone is not None: + out = model.transcribe(microphone , **kwargs) + else: + out = "Please upload an audio file or record one." + + + return str(out) + + gr.Interface( + fn=transcribe, + inputs=[ + gr.Audio(source= "upload", type="filepath", label="Upload Your Audio File", interactive=True), + gr.Audio(source= "microphone", type="filepath", label="Record Your Audio", interactive=True), + gr.Number(value=0, label= "Number of speakers", + info = "Number of speakers in the audio file. If you don't know, leave it at 0."), + # gr.Number(value=0, label= "Minimal number of speakers", + # info = "Minimal number of speakers in the audio file. If you don't know or you have specified Numspeakers, leave it at 0."), + gr.Dropdown(LANGUAGES, + label="Languages", default="None", + info="Language of the audio file. If you don't know, leave it at None.") + ], + outputs=[ + "text" + ], + title="Audio Transcription", + thumbnail = "Logo_KIDA.png", + description="Upload an audio file to transcribe its content. Powered by AutoTranscribe!", + theme="soft", # Example of a more modern theme + ).launch(share=True) + + +if __name__ == "__main__": + + model = AutoTranscribe() + gradio_server(model) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 433b3c1..b81b23c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,10 +9,6 @@ pyannote.pipeline~=2.3 setuptools~=65.6.3 setuptools-rust~=1.5.2 -torch~=1.11.0 -torchaudio~=0.11.0 -torchmetrics~=0.11.0 -torchvision~=0.12.0 tqdm>=4.65.0 #optional: diff --git a/transcribe.py b/transcribe.py index fca2532..73d8838 100644 --- a/transcribe.py +++ b/transcribe.py @@ -1,8 +1,38 @@ -from autotranscript.autotranscript import AutoTranscribe +# import os +# import sys +# import traceback + +# class TracePrints(object): +# def __init__(self): +# self.stdout = sys.stdout +# def write(self, s): +# self.stdout.write("Writing %r\n" % s) +# traceback.print_stack(file=self.stdout) + +# sys.stdout = TracePrints() + +# os.environ["PYANNOTE_CACHE"] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models/pyannote") +# import os + +# os.environ['TRANSFORMERS_CACHE'] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models") +# os.environ['HF_HOME'] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models") + + +from autotranscript import AutoTranscribe model = AutoTranscribe() -text = model.transcribe("tests/test.wav") +text = model.transcribe("test.mp4") print("Transcription:\n") print(text) + + +# from autotranscript.misc import * +# import os + +# print(os.path.exists(CACHE_DIR)) +# print(os.path.exists(WHISPER_DEFAULT_PATH)) +# print(os.path.exists(PYANNOTE_DEFAULT_PATH)) + +# print(os.path.exists(PYANNOTE_DEFAULT_CONFIG))