From b019671f124371129ada790029d485ed75c627ed Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 9 Jun 2023 18:00:29 +0200
Subject: [PATCH 01/86] added files for rework

---
 autotranscript/__init__.py        |   5 +-
 autotranscript/audio_processor.py |  93 +++++++++++++++++++
 autotranscript/diarisation.py     | 144 ++++++++++++++++++++++++++++++
 3 files changed, 241 insertions(+), 1 deletion(-)
 create mode 100644 autotranscript/audio_processor.py
 create mode 100644 autotranscript/diarisation.py

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 13f245b..91c8659 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,4 +1,7 @@
 from autotranscript.__main__ import *
+from autotranscript.transcriptor import *
+from autotranscript.audio_processor import *
 from autotranscript.version import get_version as _get_version
+from autotranscript.misc import *
 
-__version__ = _get_version()
\ No newline at end of file
+__version__ = _get_version()
diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py
new file mode 100644
index 0000000..2b8eee8
--- /dev/null
+++ b/autotranscript/audio_processor.py
@@ -0,0 +1,93 @@
+from typing import Union
+from pydub import AudioSegment
+import os
+
+class AudioProcessor:
+    def __init__(self, audio_file:str):
+        self.audio_file_path = audio_file
+        self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1])
+
+        self.audiofilename = audio_file.split('/')[-1][:-4]
+        self.coreaudiofile =  audio_file.split('/')[-1][:-4]
+        self.audiofilefolder = os.path.dirname(audio_file)
+        self.audio_file_type = audio_file.split('.')[-1]
+
+
+
+    def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True):
+        """
+        Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
+        Whisper model
+        :param file: path to audio or video file
+        :param remove_orginal: remove original file
+        :return: mp3 file path
+        """
+        print(f'Converting {self.audiofilename} to .{type} file')
+
+        if savefolder == "":
+            savefolder = self.audiofilefolder
+
+        if savename == "":
+            savename = self.coreaudiofile + f'.{type}'
+        else:
+            savename = savename + f'.{type}'
+
+        savepath = os.path.join(savefolder, savename)
+
+        self.audio_file.export(savepath, format=type)
+
+        print(f'Converted {self.audiofilename} to {type}')
+
+        if remove_orginal:
+            os.remove(self.audio_file_path)
+            print(f'File {self.audio_file_path} removed')
+
+        self.audio_file_path = savepath
+        self.audio_file = AudioSegment.from_file(savepath, format=type)
+
+        return self
+
+    def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
+        """
+        Convert audio file to mp3 file
+        :param file: audio file
+        :param remove_orginal: remove original file
+        :return: mp3 file path
+        """
+        return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal)
+
+    def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
+        """
+        Convert audio file to wav file
+        :param file: audio file
+        :param remove_orginal: remove original file
+        :return: wav file path
+        """
+        return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal)
+
+    def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"):
+        """
+        Slow down mp3 file
+        :param file: mp3 file
+        :param speed: speed
+        :return: None
+        """
+        if savefolder == "":
+            savefolder = self.audiofilefolder
+        else:
+            savefolder = savefolder
+
+        sound = self.audio_file
+        slow_sound = sound._spawn(sound.raw_data, overrides={
+            "frame_rate": int(sound.frame_rate * speed)
+        })
+
+        speedstr = str(speed).replace('.', '')
+
+        file_out = self.coreaudiofile + f'_{speedstr}.{type}'
+
+        save_path = os.path.join(savefolder, file_out)
+
+        slow_sound.export(save_path, format=type)
+
+        return slow_sound
\ No newline at end of file
diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
new file mode 100644
index 0000000..b7ee848
--- /dev/null
+++ b/autotranscript/diarisation.py
@@ -0,0 +1,144 @@
+from audio_processor import AudioProcessor
+from time import time
+import os
+
+class Diarisation(AudioProcessor):
+    def __init__(self, audio_file: str, model,**kwargs) -> None:
+
+        super().__init__(audio_file=audio_file)
+
+        self.model = model
+
+
+    def diarization(self, *args, **kwargs):
+
+        if "num_speakers" in kwargs:
+            num_speakers = kwargs['num_speakers']
+            kwargs.pop('num_speakers')
+        else:
+            num_speakers = 2
+
+        audiofilename = self.coreaudiofile
+
+        print(f'Start diarization of audio file: {self.audiofilename}')
+
+        _stime = time()
+
+        diarization = self.model(self.audio_file_path, num_speakers=num_speakers)
+
+        print(f'Diarization finished in {time() - _stime} seconds')
+        self.diarization = diarization
+
+        return diarization
+
+    def format_diarization_output(self, *args, **kwargs):
+        """
+        Format diarization output to a list of tuples
+        :param args:
+        :param kwargs:
+        :return: dict with speaker names as keys and list of tuples as values and list of different speakers
+        """
+
+        diarization_output = {"speakers": [], "segments": []}
+
+        if not hasattr(self, 'diarization'):
+            # ensure diarization is run before formatting
+            self.diarization = self.diarization()
+
+
+        for segment, _, speaker in self.diarization.itertracks(yield_label=True):
+            diarization_output["speakers"].append(speaker)
+            diarization_output["segments"].append(segment)
+
+        normalized_output = []
+        index_start_speaker = 0
+        index_end_speaker = 0
+        current_speaker = str()
+
+        for i, speaker in enumerate(diarization_output["speakers"]):
+
+            if i == 0:
+                current_speaker = speaker
+
+            if speaker != current_speaker:
+
+                index_end_speaker = i - 1
+
+                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
+
+                index_start_speaker = i
+                current_speaker = speaker
+
+            if i == len(diarization_output["speakers"]) - 1:
+
+                index_end_speaker = i
+                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
+
+
+        self.normalized_output = normalized_output
+        self.diarization_output = diarization_output
+
+        return diarization_output,normalized_output
+
+    def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs):
+        """
+        Create temporary wav file for diarization
+        :param savefolder: folder to save the temporary wav file
+        :param savename: name of the temporary wav file prefix
+        :param audiofile: audio file
+        :return: temporary wav file
+        """
+
+
+        if savefolder == "":
+            folder = '.temp'
+            if not os.path.exists(folder):
+                os.makedirs(folder)
+        else:
+            folder = savefolder
+
+        folder = os.path.realpath(folder)
+
+        if savename == "":
+            savename = self.coreaudiofile + '.wav'
+        else:
+            savename = savename
+
+
+        if not os.path.exists(folder):
+            os.makedirs(folder)
+
+        if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'):
+            self.format_diarization_output()
+
+
+        speaker = set(self.diarization_output["speakers"])
+        num_speak_iter = [0 for _ in range(len(speaker))]
+
+        for count, outp in enumerate(self.normalized_output):
+            start = self.diarization_output["segments"][outp[0]].start
+            end = self.diarization_output["segments"][outp[1]].end
+
+            print("start: ", start)
+            print("end: ", end)
+
+            start_milliseconds = start * 1000
+            end_milliseconds = end * 1000
+
+            print("start_milliseconds: ", start_milliseconds)
+            print("end_milliseconds: ", end_milliseconds)
+
+            print("cut audio")
+
+            cut_audio = self.audio_file[start_milliseconds:end_milliseconds]
+
+            print("save audio")
+            print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav")
+            cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav")
+
+        return os.path.realpath(folder)
+
+    def __repr__(self):
+        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
+    def __str__(self):
+        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
\ No newline at end of file

From 724c2844741e8e976e3fa1978b2f102112125c76 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 9 Jun 2023 18:00:46 +0200
Subject: [PATCH 02/86] added files to module init

---
 autotranscript/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 91c8659..3bd3b1a 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,6 +1,7 @@
 from autotranscript.__main__ import *
 from autotranscript.transcriptor import *
 from autotranscript.audio_processor import *
+from autotranscript.diarisation import *
 from autotranscript.version import get_version as _get_version
 from autotranscript.misc import *
 

From ee2cfc43193e989e27ec707e1690f1e75526b3d6 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 9 Jun 2023 18:01:18 +0200
Subject: [PATCH 03/86] reworked transcription class

---
 autotranscript/transcriptor.py | 112 +++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 autotranscript/transcriptor.py

diff --git a/autotranscript/transcriptor.py b/autotranscript/transcriptor.py
new file mode 100644
index 0000000..a3927f1
--- /dev/null
+++ b/autotranscript/transcriptor.py
@@ -0,0 +1,112 @@
+
+import os
+from typing import TypeVar
+from whisper import load_model
+from glob import glob
+
+whisper = TypeVar('whisper') 
+Transcriber = TypeVar('Transcriber')
+
+def get_whisper_default_path() -> str:
+    """
+    Get default path for whisper models
+
+    Returns
+    -------
+    str
+        path
+    """
+    _path = os.path.dirname(os.path.dirname(__file__))
+    return os.path.join(_path, "models", "whisper")
+
+WHISPER_DEFAULT_PATH = get_whisper_default_path()
+
+class Transcriber:
+    def __init__(self, model: whisper ) -> None:
+        """
+        Initialize Transcriber class with a whisper model
+        :param model: whisper model
+        """
+        self.model = model
+
+
+    def transcribe(self, file : str, language:str = "German"):
+        """
+        transcribe audio file
+        :param file: audio file to transcribe
+        :param language: language of the audio file
+        :return: transcript as string
+        """
+        result = self.model.transcribe(file, language = language)
+
+        return result["text"]
+    
+    @staticmethod
+    def save_transcript(transcript:str , save_path : str) -> None:
+        """
+        Save transcript to file
+        :param transcript: transcript as string
+        :param savepath: path to save the transcript
+        :return: None
+        """
+
+        with open(save_path, 'w') as f:
+            f.write(transcript)
+            f.close()
+            
+        print(f'Transcript saved to {save_path}')
+
+    @classmethod
+    def load_whisper_model(cls,
+                            model: str = "medium", 
+                            local : bool = True,
+                            download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
+        """
+        Load whisper module
+
+        Parameters
+        ----------
+        whisper : str
+            whisper model
+            available models:
+
+                - 'tiny.en'
+                - 'tiny'
+                - 'base.en'
+                - 'base'
+                - 'small.en'
+                - 'small'
+                - 'medium.en'
+                - 'medium'
+                - 'large-v1'
+                - 'large-v2'
+                - 'large' 
+
+        local : bool
+            If true, load from local cache
+
+        download_root : str
+            Path to download the model
+
+            default: /models/whisper
+        
+        Returns
+        -------
+        Whisper Object
+        """
+
+        if local:
+            
+            available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))]
+            
+            for i, module in enumerate(available_models):
+                available_models[i] = module.split(".")[0]
+            
+            if model not in available_models:
+                raise RuntimeError("Model not found. Consider downloading the "/
+                                   "model first. By deactivating the local flag, " /
+                                    "the model will be downloaded automatically.")
+
+        _model = load_model(model, download_root=download_root)
+
+        return cls(_model)

From 301a6e88b5f95c6c3497d710121d5d86811782b7 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 9 Jun 2023 18:01:42 +0200
Subject: [PATCH 04/86] added separate functions to load models

---
 autotranscript/misc.py | 88 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 autotranscript/misc.py

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
new file mode 100644
index 0000000..91008fd
--- /dev/null
+++ b/autotranscript/misc.py
@@ -0,0 +1,88 @@
+
+from pyannote.audio import Pipeline
+from whisper import Whisper, load_model
+import os
+import glob
+
+def get_whisper_default_path() -> str:
+    """
+    Get default path for whisper models
+
+    Returns
+    -------
+    str
+        path
+    """
+    _path = os.path.dirname(os.path.dirname(__file__))
+    return os.path.join(_path, "models", "whisper")
+
+WHISPER_DEFAULT_PATH = get_whisper_default_path()
+
+def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper:
+    """
+    Load modules from whisper
+
+    Parameters
+    ----------
+    whisper : str
+        whisper model
+        available models:
+
+            - 'tiny.en'
+            - 'tiny'
+            - 'base.en'
+            - 'base'
+            - 'small.en'
+            - 'small'
+            - 'medium.en'
+            - 'medium'
+            - 'large-v1'
+            - 'large-v2'
+            - 'large' 
+
+    local : bool
+        If true, load from local cache
+
+    download_root : str
+        Path to download the model
+
+        default: /models/whisper
+    
+    Returns
+    -------
+    Whisper Object
+    """
+    
+    if local:
+        available_models = [os.path.basename(x) for x in glob.glob(os.path.join(WHISPER_DEFAULT_PATH, "*"))]
+        
+        for i, module in enumerate(available_models):
+            available_models[i] = module.split(".")[0]
+        
+        if model not in available_models:
+            raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.")
+
+    return load_model(model, download_root=WHISPER_DEFAULT_PATH)
+
+def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pipeline:
+    """
+    Load modules from pyannote
+
+    Parameters
+    ----------
+    model : str
+        pyannote model 
+    token : str
+        HUGGINGFACE_TOKEN
+    local : bool
+        If true, load from local cache
+    
+    Returns
+    -------
+    Pipeline Object
+    """
+
+    if local:
+        return Pipeline.from_pretrained(model)
+    else:
+        return Pipeline.from_pretrained(model, use_auth_token = token)

From 6710f05eaf70b8851aeb13473ebfa8e27fd075ae Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 9 Jun 2023 18:01:55 +0200
Subject: [PATCH 05/86] added unittest

---
 test_autotranscript.py | 55 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 test_autotranscript.py

diff --git a/test_autotranscript.py b/test_autotranscript.py
new file mode 100644
index 0000000..29bf4d9
--- /dev/null
+++ b/test_autotranscript.py
@@ -0,0 +1,55 @@
+import pytest
+from autotranscript import Transcriber
+from unittest.mock import patch, mock_open
+import os
+
+def test_load_pyannote_model():
+    """
+    Test load_pyannote_test
+    """
+    from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
+    from pyannote.audio import Pipeline
+
+    pipeline = Pipeline.from_pretrained("models/pyannote/speaker_diarization/config.yaml")
+    assert isinstance(pipeline, SpeakerDiarization)
+
+# Test Transcribtion class
+
+
+@pytest.fixture
+def transcriber():
+    """
+    Prepare Transcriber for testing
+    Returns: Transcriber Object
+    """
+
+    return Transcriber.load_whisper_model("medium", local=True)
+
+
+def test_Transcriber_init(transcriber):
+    """
+    Test Transcriber initialization with a whisper model 
+    """
+    
+    assert isinstance(transcriber, Transcriber)
+
+def test_transcription(transcriber):
+    """
+    Test transcription
+    """
+
+    transcript = transcriber.transcribe("tests/test.wav") 
+    assert isinstance(transcript, str)
+    
+def test_save_transcript_to_file(transcriber):
+    """
+    Test save_transcript_to_file
+    """
+    transcript = transcriber.transcribe("tests/test.wav")
+    
+    open_mock = mock_open()
+    with patch("autotranscript.Transcriber.save_transcript", open_mock, create=True):
+        Transcriber.save_transcript(transcript, "output.txt")
+
+    open_mock.assert_called_with("output.txt", "w")
+    open_mock.return_value.write.assert_called_once_with("test-data")

From 671c67415f6b0da6feca9ab9ff4e24bfa31187da Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 11:29:28 +0200
Subject: [PATCH 06/86] reworked diarization feature

---
 autotranscript/diarisation.py | 238 ++++++++++++++++++++--------------
 1 file changed, 143 insertions(+), 95 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index b7ee848..b0c9e84 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,62 +1,64 @@
-from audio_processor import AudioProcessor
+from pyannote.audio import Pipeline
 from time import time
 import os
+from typing import TypeVar
 
-class Diarisation(AudioProcessor):
-    def __init__(self, audio_file: str, model,**kwargs) -> None:
+Annotation = TypeVar('Annotation') 
 
-        super().__init__(audio_file=audio_file)
+PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 
+                                     "models", "pyannote", 
+                                     "speaker_diarization", "config.yaml")
+
+class Diarisation:
+    def __init__(self, model,*args,**kwargs) -> None:
 
         self.model = model
 
 
-    def diarization(self, *args, **kwargs):
+    def diarization(self, audiofile : str , *args, **kwargs) -> Annotation:
+        """
+        Diarization of audio file
+        :param audiofile: path to audio file
+        :param args: args for diarization model 
+        :param kwargs: kwargs for diarization model
+        :return: diarization
+        """
 
-        if "num_speakers" in kwargs:
-            num_speakers = kwargs['num_speakers']
-            kwargs.pop('num_speakers')
-        else:
-            num_speakers = 2
+        print(f'Start diarization of audio file: {audiofile}')
 
-        audiofilename = self.coreaudiofile
+        diarization = self.model(audiofile,*args, **kwargs)
 
-        print(f'Start diarization of audio file: {self.audiofilename}')
+        print('Diarization finished')
 
-        _stime = time()
+        out = self.format_diarization_output(diarization)
 
-        diarization = self.model(self.audio_file_path, num_speakers=num_speakers)
+        return out
 
-        print(f'Diarization finished in {time() - _stime} seconds')
-        self.diarization = diarization
-
-        return diarization
-
-    def format_diarization_output(self, *args, **kwargs):
+    @staticmethod
+    def format_diarization_output(dia : Annotation) -> dict:
         """
         Format diarization output to a list of tuples
-        :param args:
-        :param kwargs:
-        :return: dict with speaker names as keys and list of tuples as values and list of different speakers
+        :param dia: diarization output
+        :return: dict with speaker names as keys and list of tuples
+                 as values and list of different speakers
         """
 
+        dia_list  = list(dia.itertracks(yield_label=True))
         diarization_output = {"speakers": [], "segments": []}
 
-        if not hasattr(self, 'diarization'):
-            # ensure diarization is run before formatting
-            self.diarization = self.diarization()
-
-
-        for segment, _, speaker in self.diarization.itertracks(yield_label=True):
-            diarization_output["speakers"].append(speaker)
-            diarization_output["segments"].append(segment)
-
         normalized_output = []
         index_start_speaker = 0
         index_end_speaker = 0
         current_speaker = str()
+        
+        ###
+        # Sometimes two consecutive speakers are the same
+        # This loop removes these duplicates
+        ###
 
-        for i, speaker in enumerate(diarization_output["speakers"]):
 
+        for i, (_, _, speaker) in enumerate(dia_list):
+            
             if i == 0:
                 current_speaker = speaker
 
@@ -64,7 +66,9 @@ class Diarisation(AudioProcessor):
 
                 index_end_speaker = i - 1
 
-                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
+                normalized_output.append([index_start_speaker,
+                                           index_end_speaker,
+                                           current_speaker])
 
                 index_start_speaker = i
                 current_speaker = speaker
@@ -72,73 +76,117 @@ class Diarisation(AudioProcessor):
             if i == len(diarization_output["speakers"]) - 1:
 
                 index_end_speaker = i
-                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
+                normalized_output.append([index_start_speaker, 
+                                          index_end_speaker, 
+                                          current_speaker])
+       
+        for outp in normalized_output:
+            #convert in milliseconds
+            start =  dia_list[outp[0]][0].start * 1000
+            end =  dia_list[outp[1]][0].end * 1000
 
+            diarization_output["segments"].append([start, end])
+            diarization_output["speakers"].append(outp[2])
 
-        self.normalized_output = normalized_output
-        self.diarization_output = diarization_output
-
-        return diarization_output,normalized_output
-
-    def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs):
+        return diarization_output
+    
+    @classmethod
+    def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, 
+                        token: str = "",
+                        local : bool = True,
+                        *args, **kwargs) -> Pipeline:
         """
-        Create temporary wav file for diarization
-        :param savefolder: folder to save the temporary wav file
-        :param savename: name of the temporary wav file prefix
-        :param audiofile: audio file
-        :return: temporary wav file
+        Load modules from pyannote
+
+        Parameters
+        ----------
+        model : str
+            pyannote model 
+            default: /models/pyannote/speaker_diarization/config.yaml
+        token : str
+            HUGGINGFACE_TOKEN
+        local : bool
+            If true, load from local cache
+        
+        Returns
+        -------
+        Pipeline Object
         """
 
-
-        if savefolder == "":
-            folder = '.temp'
-            if not os.path.exists(folder):
-                os.makedirs(folder)
+        if local:
+            diarization_model =  Pipeline.from_pretrained(model,*args, **kwargs)
         else:
-            folder = savefolder
-
-        folder = os.path.realpath(folder)
-
-        if savename == "":
-            savename = self.coreaudiofile + '.wav'
-        else:
-            savename = savename
-
-
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-
-        if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'):
-            self.format_diarization_output()
-
-
-        speaker = set(self.diarization_output["speakers"])
-        num_speak_iter = [0 for _ in range(len(speaker))]
-
-        for count, outp in enumerate(self.normalized_output):
-            start = self.diarization_output["segments"][outp[0]].start
-            end = self.diarization_output["segments"][outp[1]].end
-
-            print("start: ", start)
-            print("end: ", end)
-
-            start_milliseconds = start * 1000
-            end_milliseconds = end * 1000
-
-            print("start_milliseconds: ", start_milliseconds)
-            print("end_milliseconds: ", end_milliseconds)
-
-            print("cut audio")
-
-            cut_audio = self.audio_file[start_milliseconds:end_milliseconds]
-
-            print("save audio")
-            print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav")
-            cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav")
-
-        return os.path.realpath(folder)
+            diarization_model =  Pipeline.from_pretrained(model, use_auth_token = token,
+                                                           *args, **kwargs)
+        
+        return cls(diarization_model)
 
     def __repr__(self):
-        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
+        return f"Diarisation(model={self.model})"
     def __str__(self):
-        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
\ No newline at end of file
+        return f"Diarisation(model={self.model})"
+
+
+if __name__ == '__main__':
+
+    model = Diarisation.load_model()
+    print(model)
+    audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav"
+    out = model.diarization(audiofile)
+    print(out)
+
+    # # deprecated
+    # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'):
+    #     """
+    #     Create temporary wav file for diarization
+    #     :param location_of_temp_folder: folder to save the temporary wav file
+    #         default: .temp
+    #     :param savename: name of the temporary wav file prefix
+    #     :param audiofile: audio file
+    #     :return: temporary wav file
+    #     """
+    #     print("Linne 84 Diarisation.py create_temporary_wav :" /
+    #            "location_of_temp_folder.split('/')[-1]",location_of_temp_folder.split('/')[-1])
+        
+    #     if location_of_temp_folder.split('/')[-1] != '.temp':
+    #         folder =os.path.join(location_of_temp_folder, '.temp')
+    #     else:
+    #         folder = location_of_temp_folder
+        
+    #     if not os.path.exists(folder):
+    #             os.makedirs(folder)
+        
+    #     folder = os.path.realpath(folder)
+
+    #     if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'):
+    #         raise AttributeError("You need to run the diarization first")
+        
+    #     speaker = set(self.diarization_output["speakers"])
+    #     num_speak_iter = [0 for _ in range(len(speaker))]
+
+    #     for count, outp in enumerate(self.normalized_output):
+    #         print(outp)
+    #         print(self.diarization_output["segments"][outp[0]])
+    #         print(self.diarization_output["segments"][outp[1]])
+
+    #         start = self.diarization_output["segments"][outp[0]].start
+    #         end = self.diarization_output["segments"][outp[1]].end
+
+    #         print("start: ", start)
+    #         print("end: ", end)
+
+    #         start_milliseconds = start * 1000
+    #         end_milliseconds = end * 1000
+
+    #         print("start_milliseconds: ", start_milliseconds)
+    #         print("end_milliseconds: ", end_milliseconds)
+
+    #         print("cut audio")
+
+    #         cut_audio = self.audio_file[start_milliseconds:end_milliseconds]
+
+    #         print("save audio")
+    #         print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav")
+    #         cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav")
+
+    #     return os.path.realpath(folder)
\ No newline at end of file

From 6aae0f5b242408795c60b0e0a6266449bd80c70a Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 11:48:47 +0200
Subject: [PATCH 07/86] file name changed

---
 autotranscript/transcriber.py | 112 ++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 autotranscript/transcriber.py

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
new file mode 100644
index 0000000..a3927f1
--- /dev/null
+++ b/autotranscript/transcriber.py
@@ -0,0 +1,112 @@
+
+import os
+from typing import TypeVar
+from whisper import load_model
+from glob import glob
+
+whisper = TypeVar('whisper') 
+Transcriber = TypeVar('Transcriber')
+
+def get_whisper_default_path() -> str:
+    """
+    Get default path for whisper models
+
+    Returns
+    -------
+    str
+        path
+    """
+    _path = os.path.dirname(os.path.dirname(__file__))
+    return os.path.join(_path, "models", "whisper")
+
+WHISPER_DEFAULT_PATH = get_whisper_default_path()
+
+class Transcriber:
+    def __init__(self, model: whisper ) -> None:
+        """
+        Initialize Transcriber class with a whisper model
+        :param model: whisper model
+        """
+        self.model = model
+
+
+    def transcribe(self, file : str, language:str = "German"):
+        """
+        transcribe audio file
+        :param file: audio file to transcribe
+        :param language: language of the audio file
+        :return: transcript as string
+        """
+        result = self.model.transcribe(file, language = language)
+
+        return result["text"]
+    
+    @staticmethod
+    def save_transcript(transcript:str , save_path : str) -> None:
+        """
+        Save transcript to file
+        :param transcript: transcript as string
+        :param savepath: path to save the transcript
+        :return: None
+        """
+
+        with open(save_path, 'w') as f:
+            f.write(transcript)
+            f.close()
+            
+        print(f'Transcript saved to {save_path}')
+
+    @classmethod
+    def load_whisper_model(cls,
+                            model: str = "medium", 
+                            local : bool = True,
+                            download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
+        """
+        Load whisper module
+
+        Parameters
+        ----------
+        whisper : str
+            whisper model
+            available models:
+
+                - 'tiny.en'
+                - 'tiny'
+                - 'base.en'
+                - 'base'
+                - 'small.en'
+                - 'small'
+                - 'medium.en'
+                - 'medium'
+                - 'large-v1'
+                - 'large-v2'
+                - 'large' 
+
+        local : bool
+            If true, load from local cache
+
+        download_root : str
+            Path to download the model
+
+            default: /models/whisper
+        
+        Returns
+        -------
+        Whisper Object
+        """
+
+        if local:
+            
+            available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))]
+            
+            for i, module in enumerate(available_models):
+                available_models[i] = module.split(".")[0]
+            
+            if model not in available_models:
+                raise RuntimeError("Model not found. Consider downloading the "
+                                   "model first. By deactivating the local flag, "
+                                   "the model will be downloaded automatically.")
+
+        _model = load_model(model, download_root=download_root)
+
+        return cls(_model)

From 7aa2ed667f82f4c4f68c9922c825270c28e3ff44 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 11:49:17 +0200
Subject: [PATCH 08/86] changed file names

---
 autotranscript/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 3bd3b1a..531c651 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,5 +1,5 @@
 from autotranscript.__main__ import *
-from autotranscript.transcriptor import *
+from autotranscript.transcriber import *
 from autotranscript.audio_processor import *
 from autotranscript.diarisation import *
 from autotranscript.version import get_version as _get_version

From ca42d631cdeefc9cef1b37c9de02be9af31230a5 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 11:50:20 +0200
Subject: [PATCH 09/86] added deprecated warning

---
 autotranscript/misc.py | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index 91008fd..065e45d 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -3,20 +3,14 @@ from pyannote.audio import Pipeline
 from whisper import Whisper, load_model
 import os
 import glob
+from warnings import warn
 
-def get_whisper_default_path() -> str:
-    """
-    Get default path for whisper models
+WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)),
+                                     "models", "whisper")
 
-    Returns
-    -------
-    str
-        path
-    """
-    _path = os.path.dirname(os.path.dirname(__file__))
-    return os.path.join(_path, "models", "whisper")
-
-WHISPER_DEFAULT_PATH = get_whisper_default_path()
+PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 
+                                     "models", "pyannote", 
+                                     "speaker_diarization", "config.yaml")
 
 def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper:
     """
@@ -52,9 +46,9 @@ def load_whisper_model(model: str ="medium", local : bool = False, download_root
     -------
     Whisper Object
     """
-    
+    warn("load_whisper_model is deprecated. Use Transcriptor.load_model() instead.", DeprecationWarning)
     if local:
-        available_models = [os.path.basename(x) for x in glob.glob(os.path.join(WHISPER_DEFAULT_PATH, "*"))]
+        available_models = [os.path.basename(x) for x in glob.glob(os.path.join(download_root, "*"))]
         
         for i, module in enumerate(available_models):
             available_models[i] = module.split(".")[0]
@@ -62,9 +56,12 @@ def load_whisper_model(model: str ="medium", local : bool = False, download_root
         if model not in available_models:
             raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.")
 
-    return load_model(model, download_root=WHISPER_DEFAULT_PATH)
+    return load_model(model, download_root=download_root)
 
-def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pipeline:
+def load_pyannote_model(model: str = PYANNOTE_DEFAULT_PATH, 
+                        token: str = "",
+                        local : bool = True,
+                        *args, **kwargs) -> Pipeline:
     """
     Load modules from pyannote
 
@@ -72,6 +69,7 @@ def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pip
     ----------
     model : str
         pyannote model 
+        default: /models/pyannote/speaker_diarization/config.yaml
     token : str
         HUGGINGFACE_TOKEN
     local : bool
@@ -81,8 +79,8 @@ def load_pyannote_model(model: str, token: str = "", local : bool = True) -> Pip
     -------
     Pipeline Object
     """
-
+    warn("load_pyannote_model is deprecated. Use Diarisation.load_model() instead.", DeprecationWarning)
     if local:
-        return Pipeline.from_pretrained(model)
+        return Pipeline.from_pretrained(model,*args, **kwargs)
     else:
-        return Pipeline.from_pretrained(model, use_auth_token = token)
+        return Pipeline.from_pretrained(model, use_auth_token = token, *args, **kwargs)

From b5dab23dd4cbd3a5b075c50f14de5f22ec622705 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 15:54:28 +0200
Subject: [PATCH 10/86] diarization in seconds

---
 autotranscript/diarisation.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index b0c9e84..be5e534 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -23,13 +23,9 @@ class Diarisation:
         :param kwargs: kwargs for diarization model
         :return: diarization
         """
-
-        print(f'Start diarization of audio file: {audiofile}')
-
+        
         diarization = self.model(audiofile,*args, **kwargs)
 
-        print('Diarization finished')
-
         out = self.format_diarization_output(diarization)
 
         return out
@@ -81,9 +77,8 @@ class Diarisation:
                                           current_speaker])
        
         for outp in normalized_output:
-            #convert in milliseconds
-            start =  dia_list[outp[0]][0].start * 1000
-            end =  dia_list[outp[1]][0].end * 1000
+            start =  dia_list[outp[0]][0].start 
+            end =  dia_list[outp[1]][0].end
 
             diarization_output["segments"].append([start, end])
             diarization_output["speakers"].append(outp[2])

From 6870d03f6b5574d66abd18107b2ebfeb92e0d476 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 15:56:52 +0200
Subject: [PATCH 11/86] better readability

---
 autotranscript/transcriber.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index a3927f1..069866a 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -1,10 +1,12 @@
 
 import os
-from typing import TypeVar
+from typing import TypeVar , Union
 from whisper import load_model
 from glob import glob
 
 whisper = TypeVar('whisper') 
+Tensor = TypeVar('Tensor')
+nparray = TypeVar('nparray')
 Transcriber = TypeVar('Transcriber')
 
 def get_whisper_default_path() -> str:
@@ -29,20 +31,24 @@ class Transcriber:
         """
         self.model = model
 
-
-    def transcribe(self, file : str, language:str = "German"):
+    def transcribe(self, audio : Union[str, Tensor, nparray]  ,
+                   *args, **kwargs) -> str:
         """
         transcribe audio file
         :param file: audio file to transcribe
-        :param language: language of the audio file
+        :param args: additional arguments
+        :param kwargs: additional keyword arguments
+            example:
+                - language: language of the audio file    
         :return: transcript as string
         """
-        result = self.model.transcribe(file, language = language)
+        
+        result = self.model.transcribe(audio, *args, **kwargs)
 
         return result["text"]
     
     @staticmethod
-    def save_transcript(transcript:str , save_path : str) -> None:
+    def save_transcript(transcript : str , save_path : str) -> None:
         """
         Save transcript to file
         :param transcript: transcript as string
@@ -57,10 +63,10 @@ class Transcriber:
         print(f'Transcript saved to {save_path}')
 
     @classmethod
-    def load_whisper_model(cls,
-                            model: str = "medium", 
-                            local : bool = True,
-                            download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
+    def load_model(cls,
+                    model: str = "medium", 
+                    local : bool = True,
+                    download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
         """
         Load whisper module
 
@@ -97,7 +103,8 @@ class Transcriber:
 
         if local:
             
-            available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))]
+            available_models = [os.path.basename(x) for x in 
+                                glob(os.path.join(download_root, "*"))]
             
             for i, module in enumerate(available_models):
                 available_models[i] = module.split(".")[0]

From edbe7ebb1d5ecf75e26d07e9d21097ec084f7168 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 12 Jun 2023 16:38:19 +0200
Subject: [PATCH 12/86] added pytorch audio support

---
 autotranscript/audio_processor.py | 111 +++++++++++++++++++++++++-----
 1 file changed, 93 insertions(+), 18 deletions(-)

diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py
index 2b8eee8..40cf5be 100644
--- a/autotranscript/audio_processor.py
+++ b/autotranscript/audio_processor.py
@@ -1,9 +1,13 @@
-from typing import Union
+from typing import Any, Union
 from pydub import AudioSegment
+import torch
+from torchaudio import load, save
 import os
+from warn import warn
 
 class AudioProcessor:
     def __init__(self, audio_file:str):
+        
         self.audio_file_path = audio_file
         self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1])
 
@@ -12,15 +16,14 @@ class AudioProcessor:
         self.audiofilefolder = os.path.dirname(audio_file)
         self.audio_file_type = audio_file.split('.')[-1]
 
-
-
-    def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True):
+    
+    def save(self, path: str, remove_orginal: bool = True , *args, **kwargs) -> None:
         """
-        Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
-        Whisper model
-        :param file: path to audio or video file
+        Convert and saves video file or other audio files to a different file type,
+         Can be used to ensure that the audio file is in the correct format for the Whisper model
+        :param path : path to save file
         :param remove_orginal: remove original file
-        :return: mp3 file path
+                :return: mp3 file path
         """
         print(f'Converting {self.audiofilename} to .{type} file')
 
@@ -36,16 +39,11 @@ class AudioProcessor:
 
         self.audio_file.export(savepath, format=type)
 
-        print(f'Converted {self.audiofilename} to {type}')
-
         if remove_orginal:
             os.remove(self.audio_file_path)
             print(f'File {self.audio_file_path} removed')
 
-        self.audio_file_path = savepath
-        self.audio_file = AudioSegment.from_file(savepath, format=type)
 
-        return self
 
     def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
         """
@@ -54,18 +52,29 @@ class AudioProcessor:
         :param remove_orginal: remove original file
         :return: mp3 file path
         """
-        return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal)
+        warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead")
+        return self.convert_audio(savefolder = savefolder,
+                                   savename = savename,
+                                   type="mp3",
+                                   remove_orginal=remove_orginal)
 
-    def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
+    def to_wav(self, savefolder: str = "",
+                savename: str = "",
+                remove_orginal: bool = True):
         """
         Convert audio file to wav file
         :param file: audio file
         :param remove_orginal: remove original file
         :return: wav file path
         """
-        return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal)
+        warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead")
+        return self.convert_audio(savefolder = savefolder, 
+                                  savename = savename,type="wav",
+                                  remove_orginal=remove_orginal)
 
-    def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"):
+    def slower_mp3(self, savefolder: str = "",
+                    speed: float = 0.75,
+                    type: str = "mp3"):
         """
         Slow down mp3 file
         :param file: mp3 file
@@ -90,4 +99,70 @@ class AudioProcessor:
 
         slow_sound.export(save_path, format=type)
 
-        return slow_sound
\ No newline at end of file
+        return slow_sound
+    
+    
+
+
+class TorchAudioProcessor:
+    """
+    Audio Processor using PyTorchaudio instead of PyDub
+    """
+    
+    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None:
+        """
+        Initialise audio processor
+        :param waveform: waveform
+        :param sr: sample rate
+        """
+        self.waveform = waveform
+        self.sr = sr
+    
+    
+        
+    @classmethod
+    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
+        """
+        Load audio file
+        :param file: audio file
+        :return: AudioProcessor
+        """
+        if not os.path.exists(file):
+            raise FileNotFoundError(f'File {file} not found')
+        
+        if "format" not in kwargs:
+            kwargs["format"] = file.split('.')[-1]
+            
+        audio, sr = load(file , *args, **kwargs)
+        
+        return cls(audio, sr)
+
+    def cut(self, start: float, end: float) -> torch.Tensor:
+        """
+        Cut the waveform between two points in time
+        :param start: start time in seconds
+        :param end: end time in seconds
+        :return: slice of self.waveform along its second axis (start..end)
+        """
+        start = int(start * self.sr)  # seconds -> sample index (rounded down)
+        end = int(torch.ceil(torch.as_tensor(end * self.sr, dtype=torch.float64)))
+        
+        return self.waveform[:, start:end]
+    
+    def save(self, path: str, *args, **kwargs) -> None:
+        """
+        Save audio file
+        :param path: path to save file
+        :return: None
+        """
+        if "format" not in kwargs:
+            kwargs["format"] = file.split('.')[-1]
+            
+        save(file, self.waveform, self.sr, *args, **kwargs)
+    
+    def __repr__(self) -> str:
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+    
+    def __str__(self) -> str:
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+    
\ No newline at end of file

From a5693490dfcccef75803f7b1e90beb8b97f27ff8 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 13 Jun 2023 07:05:57 +0200
Subject: [PATCH 13/86] removed renamed file

---
 autotranscript/transcriptor.py | 112 ---------------------------------
 1 file changed, 112 deletions(-)
 delete mode 100644 autotranscript/transcriptor.py

diff --git a/autotranscript/transcriptor.py b/autotranscript/transcriptor.py
deleted file mode 100644
index a3927f1..0000000
--- a/autotranscript/transcriptor.py
+++ /dev/null
@@ -1,112 +0,0 @@
-
-import os
-from typing import TypeVar
-from whisper import load_model
-from glob import glob
-
-whisper = TypeVar('whisper') 
-Transcriber = TypeVar('Transcriber')
-
-def get_whisper_default_path() -> str:
-    """
-    Get default path for whisper models
-
-    Returns
-    -------
-    str
-        path
-    """
-    _path = os.path.dirname(os.path.dirname(__file__))
-    return os.path.join(_path, "models", "whisper")
-
-WHISPER_DEFAULT_PATH = get_whisper_default_path()
-
-class Transcriber:
-    def __init__(self, model: whisper ) -> None:
-        """
-        Initialize Transcriber class with a whisper model
-        :param model: whisper model
-        """
-        self.model = model
-
-
-    def transcribe(self, file : str, language:str = "German"):
-        """
-        transcribe audio file
-        :param file: audio file to transcribe
-        :param language: language of the audio file
-        :return: transcript as string
-        """
-        result = self.model.transcribe(file, language = language)
-
-        return result["text"]
-    
-    @staticmethod
-    def save_transcript(transcript:str , save_path : str) -> None:
-        """
-        Save transcript to file
-        :param transcript: transcript as string
-        :param savepath: path to save the transcript
-        :return: None
-        """
-
-        with open(save_path, 'w') as f:
-            f.write(transcript)
-            f.close()
-            
-        print(f'Transcript saved to {save_path}')
-
-    @classmethod
-    def load_whisper_model(cls,
-                            model: str = "medium", 
-                            local : bool = True,
-                            download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
-        """
-        Load whisper module
-
-        Parameters
-        ----------
-        whisper : str
-            whisper model
-            available models:
-
-                - 'tiny.en'
-                - 'tiny'
-                - 'base.en'
-                - 'base'
-                - 'small.en'
-                - 'small'
-                - 'medium.en'
-                - 'medium'
-                - 'large-v1'
-                - 'large-v2'
-                - 'large' 
-
-        local : bool
-            If true, load from local cache
-
-        download_root : str
-            Path to download the model
-
-            default: /models/whisper
-        
-        Returns
-        -------
-        Whisper Object
-        """
-
-        if local:
-            
-            available_models = [os.path.basename(x) for x in glob(os.path.join(download_root, "*"))]
-            
-            for i, module in enumerate(available_models):
-                available_models[i] = module.split(".")[0]
-            
-            if model not in available_models:
-                raise RuntimeError("Model not found. Consider downloading the "/
-                                   "model first. By deactivating the local flag, " /
-                                    "the model will be downloaded automatically.")
-
-        _model = load_model(model, download_root=download_root)
-
-        return cls(_model)

From 157851f8fad88dca13557dd7cd1cca933cded3fd Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 13 Jun 2023 08:25:58 +0200
Subject: [PATCH 14/86] added compatibility with torchaudio

---
 autotranscript/audio_processor.py | 135 ++++++++++++++++--------------
 1 file changed, 73 insertions(+), 62 deletions(-)

diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py
index 40cf5be..3f0bf38 100644
--- a/autotranscript/audio_processor.py
+++ b/autotranscript/audio_processor.py
@@ -1,108 +1,107 @@
-from typing import Any, Union
-from pydub import AudioSegment
-import torch
-from torchaudio import load, save
 import os
-from warn import warn
+from warnings import warn
+
+import torch
+from pydub import AudioSegment
+from torchaudio import load, save
+
 
 class AudioProcessor:
     def __init__(self, audio_file:str):
         
-        self.audio_file_path = audio_file
-        self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1])
-
-        self.audiofilename = audio_file.split('/')[-1][:-4]
-        self.coreaudiofile =  audio_file.split('/')[-1][:-4]
-        self.audiofilefolder = os.path.dirname(audio_file)
-        self.audio_file_type = audio_file.split('.')[-1]
-
-    
-    def save(self, path: str, remove_orginal: bool = True , *args, **kwargs) -> None:
+        self.audio = AudioSegment.from_file(audio_file, 
+                                            format=audio_file.split('.')[-1])
+        self.audio_file_path = audio_file 
+        self.waveform = self.pydub_to_tensor[0]
+        self.sr = self.pydub_to_tensor[1]
+        
+    @property
+    def pydub_to_tensor(self):
+        """
+        Converts the pydub audio segment into a torch.Tensor of shape
+        [number_of_frames, channels], with every sample divided by
+        2 ** (8 * sample_width - 1), i.e. roughly the range [-1.0, 1.0].
+        Returns tuple (waveform_tensor, frame_rate).
+        """
+        audio = self.audio
+        x = torch.Tensor(audio.get_array_of_samples()
+                         ).reshape((-1, audio.channels))
+        y = (1 << (8 * audio.sample_width - 1))
+        return x / y, audio.frame_rate
+        
+    def convert_audio(self, path: str, remove_orginal: bool = False, 
+                      *args, **kwargs) ->  None:
         """
         Convert and saves video file or other audio files to a different file type,
-         Can be used to ensure that the audio file is in the correct format for the Whisper model
+        Can be used to ensure that the audio file is in the correct format
+        for the Whisper model.
         :param path : path to save file
         :param remove_orginal: remove original file
-                :return: mp3 file path
+        :param args: arguments for pydub.AudioSegment.export
+        :param kwargs: keyword arguments for pydub.AudioSegment.export
+            e.g. format
+        :return: None
         """
-        print(f'Converting {self.audiofilename} to .{type} file')
 
-        if savefolder == "":
-            savefolder = self.audiofilefolder
-
-        if savename == "":
-            savename = self.coreaudiofile + f'.{type}'
-        else:
-            savename = savename + f'.{type}'
-
-        savepath = os.path.join(savefolder, savename)
-
-        self.audio_file.export(savepath, format=type)
+        self.audio.export(path, *args, **kwargs)
 
         if remove_orginal:
             os.remove(self.audio_file_path)
             print(f'File {self.audio_file_path} removed')
+        
+        self.audio_file_path = path
 
 
-
-    def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
+    def to_mp3(self, *args, **kwargs) -> None:
         """
         Convert audio file to mp3 file
         :param file: audio file
         :param remove_orginal: remove original file
         :return: mp3 file path
         """
-        warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead")
-        return self.convert_audio(savefolder = savefolder,
-                                   savename = savename,
-                                   type="mp3",
-                                   remove_orginal=remove_orginal)
+        
+        warn("This function is deprecated, "
+             "please use convert_audio instead", DeprecationWarning)
+        
+        if "mp3" not in (kwargs.get("format") or ""):  # format is optional
+            kwargs["format"] = "mp3"
+            
+        self.convert_audio(*args, **kwargs)
 
-    def to_wav(self, savefolder: str = "",
-                savename: str = "",
-                remove_orginal: bool = True):
+    def to_wav(self,*args, **kwargs) -> None:
         """
         Convert audio file to wav file
         :param file: audio file
         :param remove_orginal: remove original file
         :return: wav file path
         """
-        warn(DeprecationWarning, "This function is deprecated, please use convert_audio instead")
-        return self.convert_audio(savefolder = savefolder, 
-                                  savename = savename,type="wav",
-                                  remove_orginal=remove_orginal)
+        warn("This function is deprecated, "
+             "please use convert_audio instead", DeprecationWarning)
+        
+        if "wav" not in (kwargs.get("format") or ""):  # format is optional
+            kwargs["format"] = "wav"
+            
+        self.convert_audio(*args, **kwargs)
 
-    def slower_mp3(self, savefolder: str = "",
+    def slower_mp3(self, path: str,
                     speed: float = 0.75,
-                    type: str = "mp3"):
+                    type: str = "mp3") -> None:
         """
         Slow down mp3 file
         :param file: mp3 file
         :param speed: speed
         :return: None
         """
-        if savefolder == "":
-            savefolder = self.audiofilefolder
-        else:
-            savefolder = savefolder
 
         sound = self.audio_file
         slow_sound = sound._spawn(sound.raw_data, overrides={
             "frame_rate": int(sound.frame_rate * speed)
         })
 
-        speedstr = str(speed).replace('.', '')
-
-        file_out = self.coreaudiofile + f'_{speedstr}.{type}'
-
-        save_path = os.path.join(savefolder, file_out)
-
-        slow_sound.export(save_path, format=type)
+        slow_sound.export(path, format=type)
 
         return slow_sound
     
-    
-
 
 class TorchAudioProcessor:
     """
@@ -136,6 +135,19 @@ class TorchAudioProcessor:
         audio, sr = load(file , *args, **kwargs)
         
         return cls(audio, sr)
+    
+    @classmethod
+    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
+        """
+        Initialise audio processor using pydub audio segment.
+        pydub uses ffmpeg instead of SoX (which is used by torchaudio)
+        :param file: audio file
+        :return: TorchAudioProcessor built from AudioProcessor's waveform and sr
+        """
+        audio = AudioProcessor(file)
+        
+        return cls(audio.waveform, audio.sr)
+        
 
     def cut(self, start: float, end: float) -> torch.Tensor:
         """
@@ -156,13 +168,12 @@ class TorchAudioProcessor:
         :return: None
         """
         if "format" not in kwargs:
-            kwargs["format"] = file.split('.')[-1]
+            kwargs["format"] = path.split('.')[-1]
             
-        save(file, self.waveform, self.sr, *args, **kwargs)
+        save(path, self.waveform, self.sr, *args, **kwargs)
     
     def __repr__(self) -> str:
         return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
     
     def __str__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
-    
\ No newline at end of file
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
\ No newline at end of file

From 3cfdb894bfa634875c8aabcc4b0b08f9fe4199e6 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 13 Jun 2023 09:54:14 +0200
Subject: [PATCH 15/86] updated get token

---
 autotranscript/diarisation.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index be5e534..123c692 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -14,7 +14,6 @@ class Diarisation:
 
         self.model = model
 
-
     def diarization(self, audiofile : str , *args, **kwargs) -> Annotation:
         """
         Diarization of audio file
@@ -84,7 +83,17 @@ class Diarisation:
             diarization_output["speakers"].append(outp[2])
 
         return diarization_output
-    
+    @staticmethod
+    def _get_token():
+        # check ig .pyannotetoken.txt exists
+        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken')
+        if os.path.exists(path):
+            with open(path, 'r') as f:
+                token = f.read()
+        else:
+            raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token'
+                             ' and save it in a file called .pyannotetoken.txt')
+        return token
     @classmethod
     def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, 
                         token: str = "",
@@ -111,6 +120,8 @@ class Diarisation:
         if local:
             diarization_model =  Pipeline.from_pretrained(model,*args, **kwargs)
         else:
+            if token == "":
+                token = cls._get_token()
             diarization_model =  Pipeline.from_pretrained(model, use_auth_token = token,
                                                            *args, **kwargs)
         
@@ -128,7 +139,6 @@ if __name__ == '__main__':
     print(model)
     audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav"
     out = model.diarization(audiofile)
-    print(out)
 
     # # deprecated
     # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'):

From 7ee784457a2ef77d87b0423c0cecc6689286240c Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 13 Jun 2023 11:56:41 +0200
Subject: [PATCH 16/86] removed comments

---
 autotranscript/diarisation.py | 85 ++++++-----------------------------
 1 file changed, 14 insertions(+), 71 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 123c692..55fd0cb 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,7 +1,7 @@
 from pyannote.audio import Pipeline
-from time import time
+from torch import Tensor
 import os
-from typing import TypeVar
+from typing import TypeVar, Union
 
 Annotation = TypeVar('Annotation') 
 
@@ -9,15 +9,16 @@ PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                                      "models", "pyannote", 
                                      "speaker_diarization", "config.yaml")
 
-class Diarisation:
+class Diariser:
     def __init__(self, model,*args,**kwargs) -> None:
 
         self.model = model
 
-    def diarization(self, audiofile : str , *args, **kwargs) -> Annotation:
+    def diarization(self, audiofile : Union[str, Tensor] ,
+                    *args, **kwargs) -> Annotation:
         """
         Diarization of audio file
-        :param audiofile: path to audio file
+        :param audiofile: path to audio file or torch.Tensor
         :param args: args for diarization model 
         :param kwargs: kwargs for diarization model
         :return: diarization
@@ -83,17 +84,21 @@ class Diarisation:
             diarization_output["speakers"].append(outp[2])
 
         return diarization_output
+    
     @staticmethod
     def _get_token():
         # check ig .pyannotetoken.txt exists
-        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken')
+        path = os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), '.pyannotetoken')
         if os.path.exists(path):
             with open(path, 'r') as f:
                 token = f.read()
         else:
-            raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token'
-                             ' and save it in a file called .pyannotetoken.txt')
+            raise ValueError('No token found.' \
+                'Please create a token at https://huggingface.co/settings/token' \
+                'and save it in a file called .pyannotetoken.txt')
         return token
+    
     @classmethod
     def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, 
                         token: str = "",
@@ -129,69 +134,7 @@ class Diarisation:
 
     def __repr__(self):
         return f"Diarisation(model={self.model})"
+    
     def __str__(self):
         return f"Diarisation(model={self.model})"
 
-
-if __name__ == '__main__':
-
-    model = Diarisation.load_model()
-    print(model)
-    audiofile = "/home/jacob/PycharmProjects/autotranscript/tests/test.wav"
-    out = model.diarization(audiofile)
-
-    # # deprecated
-    # def create_temporary_wav(self, location_of_temp_folder : str = '.temp'):
-    #     """
-    #     Create temporary wav file for diarization
-    #     :param location_of_temp_folder: folder to save the temporary wav file
-    #         default: .temp
-    #     :param savename: name of the temporary wav file prefix
-    #     :param audiofile: audio file
-    #     :return: temporary wav file
-    #     """
-    #     print("Linne 84 Diarisation.py create_temporary_wav :" /
-    #            "location_of_temp_folder.split('/')[-1]",location_of_temp_folder.split('/')[-1])
-        
-    #     if location_of_temp_folder.split('/')[-1] != '.temp':
-    #         folder =os.path.join(location_of_temp_folder, '.temp')
-    #     else:
-    #         folder = location_of_temp_folder
-        
-    #     if not os.path.exists(folder):
-    #             os.makedirs(folder)
-        
-    #     folder = os.path.realpath(folder)
-
-    #     if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'):
-    #         raise AttributeError("You need to run the diarization first")
-        
-    #     speaker = set(self.diarization_output["speakers"])
-    #     num_speak_iter = [0 for _ in range(len(speaker))]
-
-    #     for count, outp in enumerate(self.normalized_output):
-    #         print(outp)
-    #         print(self.diarization_output["segments"][outp[0]])
-    #         print(self.diarization_output["segments"][outp[1]])
-
-    #         start = self.diarization_output["segments"][outp[0]].start
-    #         end = self.diarization_output["segments"][outp[1]].end
-
-    #         print("start: ", start)
-    #         print("end: ", end)
-
-    #         start_milliseconds = start * 1000
-    #         end_milliseconds = end * 1000
-
-    #         print("start_milliseconds: ", start_milliseconds)
-    #         print("end_milliseconds: ", end_milliseconds)
-
-    #         print("cut audio")
-
-    #         cut_audio = self.audio_file[start_milliseconds:end_milliseconds]
-
-    #         print("save audio")
-    #         print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav")
-    #         cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav")
-
-    #     return os.path.realpath(folder)
\ No newline at end of file

From 2e6af75f81f1a79fcbd3efe695d59da7259a2812 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:30:05 +0200
Subject: [PATCH 17/86] deleted audio_processor.py

---
 autotranscript/audio_processor.py | 179 ------------------------------
 1 file changed, 179 deletions(-)
 delete mode 100644 autotranscript/audio_processor.py

diff --git a/autotranscript/audio_processor.py b/autotranscript/audio_processor.py
deleted file mode 100644
index 3f0bf38..0000000
--- a/autotranscript/audio_processor.py
+++ /dev/null
@@ -1,179 +0,0 @@
-import os
-from warnings import warn
-
-import torch
-from pydub import AudioSegment
-from torchaudio import load, save
-
-
-class AudioProcessor:
-    def __init__(self, audio_file:str):
-        
-        self.audio = AudioSegment.from_file(audio_file, 
-                                            format=audio_file.split('.')[-1])
-        self.audio_file_path = audio_file 
-        self.waveform = self.pydub_to_tensor[0]
-        self.sr = self.pydub_to_tensor[1]
-        
-    @property
-    def pydub_to_tensor(self):
-        """
-        Converts pydub audio segment into np.float32 of shape 
-        [duration_in_seconds*sample_rate, channels],
-        where each value is in range [-1.0, 1.0]. 
-        Returns tuple (audio_np_array, sample_rate).
-        """
-        audio = self.audio
-        x = torch.Tensor(audio.get_array_of_samples()
-                         ).reshape((-1, audio.channels))
-        y = (1 << (8 * audio.sample_width - 1))
-        return x / y, audio.frame_rate
-        
-    def convert_audio(self, path: str, remove_orginal: bool = False, 
-                      *args, **kwargs) ->  None:
-        """
-        Convert and saves video file or other audio files to a different file type,
-        Can be used to ensure that the audio file is in the correct format
-        for the Whisper model.
-        :param path : path to save file
-        :param remove_orginal: remove original file
-        :param args: arguments for pydub.AudioSegment.export
-        :param kwargs: keyword arguments for pydub.AudioSegment.export
-            e.g. format
-        :return: None
-        """
-
-        self.audio.export(path, *args, **kwargs)
-
-        if remove_orginal:
-            os.remove(self.audio_file_path)
-            print(f'File {self.audio_file_path} removed')
-        
-        self.audio_file_path = path
-
-
-    def to_mp3(self, *args, **kwargs) -> None:
-        """
-        Convert audio file to mp3 file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: mp3 file path
-        """
-        
-        warn(DeprecationWarning, "This function is deprecated," \
-             "please use convert_audio instead")
-        
-        if "mp3" not in kwargs["format"]:
-            kwargs["format"] = "mp3"
-            
-        self.convert_audio(*args, **kwargs)
-
-    def to_wav(self,*args, **kwargs) -> None:
-        """
-        Convert audio file to wav file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: wav file path
-        """
-        warn(DeprecationWarning, "This function is deprecated," \
-             "please use convert_audio instead")
-        
-        if "wav" not in kwargs["format"]:
-            kwargs["format"] = "wav"
-            
-        self.convert_audio(*args, **kwargs)
-
-    def slower_mp3(self, path: str,
-                    speed: float = 0.75,
-                    type: str = "mp3") -> None:
-        """
-        Slow down mp3 file
-        :param file: mp3 file
-        :param speed: speed
-        :return: None
-        """
-
-        sound = self.audio_file
-        slow_sound = sound._spawn(sound.raw_data, overrides={
-            "frame_rate": int(sound.frame_rate * speed)
-        })
-
-        slow_sound.export(path, format=type)
-
-        return slow_sound
-    
-
-class TorchAudioProcessor:
-    """
-    Audio Processor using PyTorchaudio instead of PyDub
-    """
-    
-    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None:
-        """
-        Initialise audio processor
-        :param waveform: waveform
-        :param sr: sample rate
-        """
-        self.waveform = waveform
-        self.sr = sr
-    
-    
-        
-    @classmethod
-    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
-        """
-        Load audio file
-        :param file: audio file
-        :return: AudioProcessor
-        """
-        if not os.path.exists(file):
-            raise FileNotFoundError(f'File {file} not found')
-        
-        if "format" not in kwargs:
-            kwargs["format"] = file.split('.')[-1]
-            
-        audio, sr = load(file , *args, **kwargs)
-        
-        return cls(audio, sr)
-    
-    @classmethod
-    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
-        """
-        Initialise audio processor using pydub audio segment.
-        pydub uses ffmped instead of SoX (which is used by torchaudio)
-        :param file: audio file
-        :return: TorchAudioProcessor
-        """
-        audio = AudioProcessor(file)
-        
-        return cls(audio.waveform, audio.sr)
-        
-
-    def cut(self, start: float, end: float) -> torch.Tensor:
-        """
-        Cut audio file
-        :param start: start time in seconds
-        :param end: end time in seconds
-        :return: AudioProcessor
-        """
-        start = int(start / self.sr)
-        end = torch.ceil(end / self.sr)
-        
-        return self.waveform[:, start:end]
-    
-    def save(self, path: str, *args, **kwargs) -> None:
-        """
-        Save audio file
-        :param path: path to save file
-        :return: None
-        """
-        if "format" not in kwargs:
-            kwargs["format"] = path.split('.')[-1]
-            
-        save(path, self.waveform, self.sr, *args, **kwargs)
-    
-    def __repr__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
-    
-    def __str__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
\ No newline at end of file

From 90324e6ea7900669b7d0e46bca81e819f397ec9a Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:30:15 +0200
Subject: [PATCH 18/86] added unittests

---
 test_autotranscript.py | 79 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 72 insertions(+), 7 deletions(-)

diff --git a/test_autotranscript.py b/test_autotranscript.py
index 29bf4d9..8f745a0 100644
--- a/test_autotranscript.py
+++ b/test_autotranscript.py
@@ -23,7 +23,7 @@ def transcriber():
     Returns: Transcriber Object
     """
 
-    return Transcriber.load_whisper_model("medium", local=True)
+    return Transcriber.load_model("medium", local=True)
 
 
 def test_Transcriber_init(transcriber):
@@ -46,10 +46,75 @@ def test_save_transcript_to_file(transcriber):
     Test save_transcript_to_file
     """
     transcript = transcriber.transcribe("tests/test.wav")
-    
-    open_mock = mock_open()
-    with patch("autotranscript.Transcriber.save_transcript", open_mock, create=True):
-        Transcriber.save_transcript(transcript, "output.txt")
 
-    open_mock.assert_called_with("output.txt", "w")
-    open_mock.return_value.write.assert_called_once_with("test-data")
+    Transcriber.save_transcript(transcript, "tests/output.txt")
+    
+    assert os.path.exists("tests/output.txt")
+
+    os.remove("tests/output.txt")
+    
+# Test Diaraization class
+
+from autotranscript import Diariser
+
+@pytest.fixture
+def diarisation():
+    """
+    Prepare Diarisation for testing
+    Returns: Diarisation Object
+    """
+
+    return Diariser.load_model("models/pyannote/speaker_diarization/config.yaml", local=True)
+
+def test_Diarisation_init(diarisation):
+    """
+    Test Diarisation initialization with a pyannote model 
+    """
+    
+    assert isinstance(diarisation, Diariser)
+
+def test_diarisation(diarisation):
+    """
+    Test diarisation
+    """
+
+    diarisation = diarisation.diarization("tests/test.wav") 
+    assert isinstance(diarisation, dict)
+
+# Test AudioProcessor
+
+from autotranscript import AudioProcessor , TorchAudioProcessor
+
+
+def test_AudioProcessor_init():
+    """
+    Test AudioProcessor initialization
+    """
+    audio = AudioProcessor("tests/test.wav")
+    assert isinstance(audio, AudioProcessor)
+
+def test_AudioProcessor_convert():
+    """
+    Test AudioProcessor convert
+    """
+    audio = AudioProcessor("tests/test.wav")
+    audio.convert_audio("tests/test.mp3", format="mp3")
+    assert os.path.exists("tests/test.mp3")
+    
+def test_TorchAudioProcessor_from_file():
+    """
+    Test TorchAudioProcessor initialization
+    """
+    audio = TorchAudioProcessor.from_file("tests/test.wav")
+    
+    assert isinstance(audio, TorchAudioProcessor)
+    
+    os.remove("tests/test.mp3")
+
+
+def test_TorchAudioProcessor_from_ffmpeg():
+    """
+    Test TorchAudioProcessor initialization
+    """
+    audio = TorchAudioProcessor.from_ffmpeg("tests/test.wav")
+    assert isinstance(audio, TorchAudioProcessor)

From 34354c055f7514cad065b9e00a7273308a138657 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:30:29 +0200
Subject: [PATCH 19/86] changed imports

---
 autotranscript/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 531c651..5aea052 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,6 +1,7 @@
 from autotranscript.__main__ import *
 from autotranscript.transcriber import *
-from autotranscript.audio_processor import *
+from autotranscript.audio import *
+from autotranscript.transcript_exporter import *
 from autotranscript.diarisation import *
 from autotranscript.version import get_version as _get_version
 from autotranscript.misc import *

From 854469fb6e173bf0f4ee3f1ed4665480dfccf176 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:30:57 +0200
Subject: [PATCH 20/86] audio processing

---
 autotranscript/audio.py | 202 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 202 insertions(+)
 create mode 100644 autotranscript/audio.py

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
new file mode 100644
index 0000000..3175ca0
--- /dev/null
+++ b/autotranscript/audio.py
@@ -0,0 +1,202 @@
+import os
+from warnings import warn
+
+import torch
+from pydub import AudioSegment
+from torchaudio import load, save
+
+
+class AudioProcessor:
+    def __init__(self, audio_file:str):
+        
+        self.audio = AudioSegment.from_file(audio_file, 
+                                            format=audio_file.split('.')[-1])
+        self.audio_file_path = audio_file 
+        self.waveform = self.pydub_to_tensor[0]
+        self.sr = self.pydub_to_tensor[1]
+        
+    @property
+    def pydub_to_tensor(self):
+        """
+        Converts pydub audio segment into np.float32 of shape 
+        [duration_in_seconds*sample_rate, channels],
+        where each value is in range [-1.0, 1.0]. 
+        Returns tuple (audio_np_array, sample_rate).
+        """
+        audio = self.audio
+        x = torch.Tensor(audio.get_array_of_samples()
+                         ).reshape((-1, audio.channels))
+        y = (1 << (8 * audio.sample_width - 1))
+        return x / y, audio.frame_rate
+        
+    def convert_audio(self, path: str, remove_orginal: bool = False, 
+                      *args, **kwargs) ->  None:
+        """
+        Convert and saves video file or other audio files to a different file type,
+        Can be used to ensure that the audio file is in the correct format
+        for the Whisper model.
+        :param path : path to save file
+        :param remove_orginal: remove original file
+        :param args: arguments for pydub.AudioSegment.export
+        :param kwargs: keyword arguments for pydub.AudioSegment.export
+            e.g. format
+        :return: None
+        """
+
+        self.audio.export(path, *args, **kwargs)
+
+        if remove_orginal:
+            os.remove(self.audio_file_path)
+            print(f'File {self.audio_file_path} removed')
+        
+        self.audio_file_path = path
+
+
+    def to_mp3(self, *args, **kwargs) -> None:
+        """
+        Convert audio file to mp3 file
+        :param file: audio file
+        :param remove_orginal: remove original file
+        :return: mp3 file path
+        """
+        
+        warn(DeprecationWarning, "This function is deprecated," \
+             "please use convert_audio instead")
+        
+        if "mp3" not in kwargs["format"]:
+            kwargs["format"] = "mp3"
+            
+        self.convert_audio(*args, **kwargs)
+
+    def to_wav(self,*args, **kwargs) -> None:
+        """
+        Convert audio file to wav file
+        :param file: audio file
+        :param remove_orginal: remove original file
+        :return: wav file path
+        """
+        warn(DeprecationWarning, "This function is deprecated," \
+             "please use convert_audio instead")
+        
+        if "wav" not in kwargs["format"]:
+            kwargs["format"] = "wav"
+            
+        self.convert_audio(*args, **kwargs)
+
+    def slower_mp3(self, path: str,
+                    speed: float = 0.75,
+                    type: str = "mp3") -> None:
+        """
+        Slow down mp3 file
+        :param file: mp3 file
+        :param speed: speed
+        :return: None
+        """
+
+        sound = self.audio_file
+        slow_sound = sound._spawn(sound.raw_data, overrides={
+            "frame_rate": int(sound.frame_rate * speed)
+        })
+
+        slow_sound.export(path, format=type)
+
+        return slow_sound
+    
+
+class TorchAudioProcessor:
+    """
+    Audio Processor using PyTorchaudio instead of PyDub
+    """
+    
+    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None:
+        """
+        Initialise audio processor
+        :param waveform: waveform
+        :param sr: sample rate
+        """
+        self.waveform = waveform.reshape(-1)
+        self.sr = sr
+        
+        if not isinstance(self.sr, int):
+            raise ValueError("Sample rate should be a single value of type int," \
+                             f"not {len(self.sr)} and type {type(self.sr)}")
+    
+        
+    @classmethod
+    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
+        """
+        Load audio file
+        :param file: audio file
+        :return: AudioProcessor
+        """
+        if not os.path.exists(file):
+            raise FileNotFoundError(f'File {file} not found')
+        
+        if "format" not in kwargs:
+            kwargs["format"] = file.split('.')[-1]
+        
+        audio, sr = load(file , *args, **kwargs)
+        
+        return cls(audio, sr)
+    
+    @classmethod
+    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
+        """
+        Initialise audio processor using pydub audio segment.
+        pydub uses ffmped instead of SoX (which is used by torchaudio)
+        :param file: audio file
+        :return: TorchAudioProcessor
+        """
+        audio = AudioProcessor(file)
+        
+        return cls(audio.waveform, audio.sr)
+    
+    @classmethod
+    def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor':
+        """
+        Initialise audio processor using pydub audio segment.
+
+        :param audio_processor: AudioProcessor object
+        :type audio_processor: AudioProcessor
+        :return: TorchAudioProcessor
+        :rtype: TorchAudioProcessor
+        """
+        return cls(audio_processor.waveform, audio_processor.sr)    
+    
+    def cut(self, start: float, end: float) -> torch.Tensor:
+        """
+        Cut audio file
+        :param start: start time in seconds
+        :param end: end time in seconds
+        :return: AudioProcessor
+        """
+        
+        if isinstance(start, float):
+            start = torch.Tensor([start])
+        if isinstance(end, float):
+            end = torch.Tensor([end])
+        
+        sr = torch.Tensor([self.sr])
+            
+        start = int(start * sr)
+        end = torch.ceil(end * sr)
+        
+        return self.waveform[start:end.to(int)]
+    
+    def save(self, path: str, *args, **kwargs) -> None:
+        """
+        Save audio file
+        :param path: path to save file
+        :return: None
+        """
+        if "format" not in kwargs:
+            kwargs["format"] = path.split('.')[-1]
+            
+        save(path, self.waveform, self.sr, *args, **kwargs)
+    
+    
+    def __repr__(self) -> str:
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+    
+    def __str__(self) -> str:
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'

From 002c7b518901151a9df6ce50120940e3c40045e8 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:31:07 +0200
Subject: [PATCH 21/86] auto transcript

---
 autotranscript/autotranscipt.py | 125 ++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 autotranscript/autotranscipt.py

diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py
new file mode 100644
index 0000000..c1225af
--- /dev/null
+++ b/autotranscript/autotranscipt.py
@@ -0,0 +1,125 @@
+from audio import AudioProcessor , TorchAudioProcessor
+
+from diarisation import Diariser
+from transcriber import Transcriber, whisper
+from whisper import Whisper
+from transcript_exporter import Transcript
+from typing import Union , TypeVar
+from tqdm import trange
+from pprint import pprint
+import torch
+diarisation = TypeVar('diarisation')
+
+
+class AutoTranscribe:
+    def __init__(self,
+                whisper_model: Union[bool, str, whisper] = None,
+                dia_model : Union[bool, str, diarisation] = None,
+                dia_kwargs : dict = {},
+                whisper_kwargs : dict = {}) -> None:
+        """
+        AutoTranscribe class
+        
+        This class is the core Api Class of the autotranscript package.
+        It allows to transcribe audio files with a whisper model and
+        pyannote diarization model. 
+        
+        Therefore it is do a fully automatic transcription of audio files.
+        
+        :param whisper_model: path to whisper model or whisper model
+        :param dia_model: path to pyannote diarization model
+        :param dia_kwargs: kwargs for pyannote diarization model
+        :param whisper_kwargs: kwargs for whisper model      
+        
+        """
+        
+        if whisper_model is None:
+            self.transcriber = Transcriber.load_model("medium", local=True)
+        elif isinstance(whisper_model, str):
+            self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs)
+        else:
+            self.transcriber = whisper_model
+
+        if dia_model is None:
+            self.diariser = Diariser.load_model()
+        elif isinstance(dia_model, str):
+            self.diariser = Diariser.load_model(dia_model, **dia_kwargs)
+        else:
+            self.diariser = dia_model
+
+        print("AutoTranscribe initialized all models successfully loaded.")
+            
+    def transcribe(self, audiofile : Union[str, torch.Tensor],
+                   *args, **kwargs) -> Transcript:
+        """
+        Transcribe audiofile with whisper model and pyannote diarization model
+        
+        :param audiofile: path to audiofile or torch.Tensor
+        :return: Transcript object
+        """
+        
+        audiofile = self.get_audiofile(audiofile)
+        
+        final_transcript = dict()
+        
+        dia_audio = {"waveform" : 
+                        audiofile.waveform.reshape(1,len(audiofile.waveform)), 
+                    "sample_rate": audiofile.sr}
+       
+        print("Starting diarisation.")
+        
+        diarisation = self.diariser.diarization( dia_audio,
+                                                *args , **kwargs)
+        
+        print("Diarisation finished. Starting transcription.")
+        
+        for i in trange(len(diarisation["segments"]), desc= "Transcribing"):
+            
+            seg = diarisation["segments"][i]
+            
+            audio = audiofile.cut(seg[0], seg[1])
+            
+            transcript = self.transcriber.transcribe(audio, *args , **kwargs)
+            
+            final_transcript[i] = {"speaker" : diarisation["speakers"][i],
+                                   "text" : transcript}
+
+        pprint(final_transcript)   
+        #return Transcript(transcript, diarisation)
+    
+    @staticmethod
+    def get_audiofile(audiofile : Union[str, torch.Tensor],
+                        *args, **kwargs) -> TorchAudioProcessor:
+        """
+        Get audiofile as TorchAudioProcessor
+
+        :param audiofile: path to audiofile or torch.Tensor
+            :type audiofile: Union[str, torch.Tensor]
+        :return: object of audiofile containes
+                 waveform and sample_rate in torch.Tensor format.
+            :rtype: TorchAudioProcessor
+        """
+        if isinstance(audiofile, str):
+            try:
+               audiofile = TorchAudioProcessor.from_file(audiofile)   
+            except: 
+                print("Could not load audiofile with torch audio." \
+                        "Trying ffmpeg. using pydub.")
+                audiofile = TorchAudioProcessor.from_ffmpeg(audiofile)
+        
+        if isinstance(audiofile, torch.Tensor):
+            audiofile = TorchAudioProcessor(audiofile[0], audiofile[1])
+        
+        if isinstance(audiofile, AudioProcessor):
+            audiofile = TorchAudioProcessor.from_audio_processor(audiofile)
+        
+        if not isinstance(audiofile, TorchAudioProcessor):
+            raise ValueError(f'Audiofile must be of type TorchAudioProcessor,' \
+                             f'not {type(audiofile)}')     
+        return audiofile
+    
+
+if __name__ == "__main__":
+    
+    AudioTranscriber = AutoTranscribe()
+    AudioTranscriber.transcribe("/home/jacob/PycharmProjects/autotranscript/tests/Kathi_interview.mp3" , num_speaker=2)
\ No newline at end of file

From 67e4e4585da3be40190a265bcf7b12e446f2ee69 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:31:25 +0200
Subject: [PATCH 22/86] added kwargs parsing

---
 autotranscript/diarisation.py | 20 ++++++++++++++++++++
 autotranscript/transcriber.py | 31 +++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 55fd0cb..3b64fac 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,4 +1,5 @@
 from pyannote.audio import Pipeline
+from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from torch import Tensor
 import os
 from typing import TypeVar, Union
@@ -23,6 +24,7 @@ class Diariser:
         :param kwargs: kwargs for diarization model
         :return: diarization
         """
+        kwargs = self._get_diarisation_kwargs(**kwargs)
         
         diarization = self.model(audiofile,*args, **kwargs)
 
@@ -132,6 +134,24 @@ class Diariser:
         
         return cls(diarization_model)
 
+    @staticmethod
+    def _get_diarisation_kwargs(**kwargs) -> dict:
+        """
+        Get kwargs for pyannote diarization model
+        Ensure that kwargs are valid
+        :return: kwargs for pyannote diarization model
+            :rtype: dict
+        """
+        _possible_kwargs = SpeakerDiarization.apply.__code__.co_varnames
+        
+        diarisation_kwargs = dict()
+        
+        for k in kwargs.keys():
+            if k in _possible_kwargs:
+               diarisation_kwargs[k] = kwargs[k]
+            
+        return diarisation_kwargs
+    
     def __repr__(self):
         return f"Diarisation(model={self.model})"
     
diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 069866a..57a3423 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -1,5 +1,5 @@
-
 import os
+from whisper import Whisper
 from typing import TypeVar , Union
 from whisper import load_model
 from glob import glob
@@ -43,8 +43,17 @@ class Transcriber:
         :return: transcript as string
         """
         
-        result = self.model.transcribe(audio, *args, **kwargs)
+        kwargs = self._get_whisper_kwargs(**kwargs)
 
+        if kwargs or args: 
+            result = self.model.transcribe(audio, *args, **kwargs)
+        else:
+            # if kwargs is empty but parsed anyway whisper
+            # will not use the default kwargs
+            
+            print("No kwargs parsed. Using default kwargs.")
+            result = self.model.transcribe(audio)
+            
         return result["text"]
     
     @staticmethod
@@ -117,3 +126,21 @@ class Transcriber:
         _model = load_model(model, download_root=download_root)
 
         return cls(_model)
+
+    @staticmethod
+    def _get_whisper_kwargs(**kwargs) -> dict:
+        """
+        Get kwargs for whisper model.
+        Ensure that kwargs are valid.
+        :return: kwargs for whisper model
+            :rtype: dict
+        """
+        _possible_kwargs = Whisper.transcribe.__code__.co_varnames
+        
+        whisper_kwargs = dict()
+        
+        for k in kwargs.keys():
+            if k in _possible_kwargs:
+                whisper_kwargs[k] = kwargs[k]
+            
+        return whisper_kwargs
\ No newline at end of file

From 07acbc9464a00ac11f7b830ba1e340acd44aed84 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 14 Jun 2023 16:31:44 +0200
Subject: [PATCH 23/86] added dummy class for output

---
 autotranscript/transcript_exporter.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 autotranscript/transcript_exporter.py

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
new file mode 100644
index 0000000..956b398
--- /dev/null
+++ b/autotranscript/transcript_exporter.py
@@ -0,0 +1,23 @@
+
+class Transcript:
+    """
+    Class for storing transcript data
+    and exporting it to files in different formats
+    """
+    def __init__(self, transcript: str) -> None:
+        """
+        :param transcript: formated transcript string
+        """
+        self.transcript = transcript
+    
+    def to_latex(self, path: str) -> None:
+        pass
+    
+    def to_pdf(self, path: str) -> None:
+        pass
+    
+    def to_txt(self, path: str) -> None:
+        pass
+    
+    def to_json(self, path: str) -> None:
+        pass
\ No newline at end of file

From edd6a0104c0cce4a0e300ddc48cfdfce8d190cf9 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 11:28:55 +0200
Subject: [PATCH 24/86] removed pydub and use ffmpeg to remove dependencies.
 Dropped pydub functionality and focus on core components instead

---
 autotranscript/audio.py | 190 +++++++++++-----------------------------
 1 file changed, 49 insertions(+), 141 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index 3175ca0..fe82041 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -1,109 +1,13 @@
 import os
 from warnings import warn
 
+import numpy as np
 import torch
-from pydub import AudioSegment
-from torchaudio import load, save
+import ffmpeg
 
+SAMPLE_RATE = 16000
 
 class AudioProcessor:
-    def __init__(self, audio_file:str):
-        
-        self.audio = AudioSegment.from_file(audio_file, 
-                                            format=audio_file.split('.')[-1])
-        self.audio_file_path = audio_file 
-        self.waveform = self.pydub_to_tensor[0]
-        self.sr = self.pydub_to_tensor[1]
-        
-    @property
-    def pydub_to_tensor(self):
-        """
-        Converts pydub audio segment into np.float32 of shape 
-        [duration_in_seconds*sample_rate, channels],
-        where each value is in range [-1.0, 1.0]. 
-        Returns tuple (audio_np_array, sample_rate).
-        """
-        audio = self.audio
-        x = torch.Tensor(audio.get_array_of_samples()
-                         ).reshape((-1, audio.channels))
-        y = (1 << (8 * audio.sample_width - 1))
-        return x / y, audio.frame_rate
-        
-    def convert_audio(self, path: str, remove_orginal: bool = False, 
-                      *args, **kwargs) ->  None:
-        """
-        Convert and saves video file or other audio files to a different file type,
-        Can be used to ensure that the audio file is in the correct format
-        for the Whisper model.
-        :param path : path to save file
-        :param remove_orginal: remove original file
-        :param args: arguments for pydub.AudioSegment.export
-        :param kwargs: keyword arguments for pydub.AudioSegment.export
-            e.g. format
-        :return: None
-        """
-
-        self.audio.export(path, *args, **kwargs)
-
-        if remove_orginal:
-            os.remove(self.audio_file_path)
-            print(f'File {self.audio_file_path} removed')
-        
-        self.audio_file_path = path
-
-
-    def to_mp3(self, *args, **kwargs) -> None:
-        """
-        Convert audio file to mp3 file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: mp3 file path
-        """
-        
-        warn(DeprecationWarning, "This function is deprecated," \
-             "please use convert_audio instead")
-        
-        if "mp3" not in kwargs["format"]:
-            kwargs["format"] = "mp3"
-            
-        self.convert_audio(*args, **kwargs)
-
-    def to_wav(self,*args, **kwargs) -> None:
-        """
-        Convert audio file to wav file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: wav file path
-        """
-        warn(DeprecationWarning, "This function is deprecated," \
-             "please use convert_audio instead")
-        
-        if "wav" not in kwargs["format"]:
-            kwargs["format"] = "wav"
-            
-        self.convert_audio(*args, **kwargs)
-
-    def slower_mp3(self, path: str,
-                    speed: float = 0.75,
-                    type: str = "mp3") -> None:
-        """
-        Slow down mp3 file
-        :param file: mp3 file
-        :param speed: speed
-        :return: None
-        """
-
-        sound = self.audio_file
-        slow_sound = sound._spawn(sound.raw_data, overrides={
-            "frame_rate": int(sound.frame_rate * speed)
-        })
-
-        slow_sound.export(path, format=type)
-
-        return slow_sound
-    
-
-class TorchAudioProcessor:
     """
     Audio Processor using PyTorchaudio instead of PyDub
     """
@@ -114,54 +18,27 @@ class TorchAudioProcessor:
         :param waveform: waveform
         :param sr: sample rate
         """
-        self.waveform = waveform.reshape(-1)
+        self.waveform = waveform
         self.sr = sr
         
         if not isinstance(self.sr, int):
             raise ValueError("Sample rate should be a single value of type int," \
                              f"not {len(self.sr)} and type {type(self.sr)}")
-    
         
     @classmethod
-    def from_file(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
+    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
         """
         Load audio file
         :param file: audio file
         :return: AudioProcessor
         """
-        if not os.path.exists(file):
-            raise FileNotFoundError(f'File {file} not found')
         
-        if "format" not in kwargs:
-            kwargs["format"] = file.split('.')[-1]
-        
-        audio, sr = load(file , *args, **kwargs)
+        audio, sr = cls.load_audio(file , *args, **kwargs)
+
+        audio = torch.from_numpy(audio)
         
         return cls(audio, sr)
     
-    @classmethod
-    def from_ffmpeg(cls, file: str, *args, **kwargs) -> 'TorchAudioProcessor':
-        """
-        Initialise audio processor using pydub audio segment.
-        pydub uses ffmped instead of SoX (which is used by torchaudio)
-        :param file: audio file
-        :return: TorchAudioProcessor
-        """
-        audio = AudioProcessor(file)
-        
-        return cls(audio.waveform, audio.sr)
-    
-    @classmethod
-    def from_audio_processor(cls, audio_processor: AudioProcessor) -> 'TorchAudioProcessor':
-        """
-        Initialise audio processor using pydub audio segment.
-
-        :param audio_processor: AudioProcessor object
-        :type audio_processor: AudioProcessor
-        :return: TorchAudioProcessor
-        :rtype: TorchAudioProcessor
-        """
-        return cls(audio_processor.waveform, audio_processor.sr)    
     
     def cut(self, start: float, end: float) -> torch.Tensor:
         """
@@ -182,21 +59,52 @@ class TorchAudioProcessor:
         end = torch.ceil(end * sr)
         
         return self.waveform[start:end.to(int)]
-    
-    def save(self, path: str, *args, **kwargs) -> None:
+
+    @staticmethod
+    def load_audio(file: str, sr: int = SAMPLE_RATE):
         """
-        Save audio file
-        :param path: path to save file
-        :return: None
+        Open an audio file and read as mono waveform, resampling as necessary
+
+        Changed from original function at whisper.audio.load_audio to ensure compatibility
+        with pyannote.audio
+        Parameters
+        ----------
+        file: str
+            The audio file to open
+
+        sr: int
+            The sample rate to resample the audio if necessary
+
+        Returns
+        -------
+        A NumPy array containing the audio waveform, in float32 dtype.
         """
-        if "format" not in kwargs:
-            kwargs["format"] = path.split('.')[-1]
-            
-        save(path, self.waveform, self.sr, *args, **kwargs)
-    
+        try:
+            # This launches a subprocess to decode audio while down-mixing 
+            # and resampling as necessary.
+            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
+            out, _ = (
+                ffmpeg.input(file, threads=0)
+                .output("-", format="s16le", acodec="pcm_s16le",
+                        ac=1, ar=sr)
+                .run(cmd=["ffmpeg", "-nostdin"],
+                     capture_stdout=True, capture_stderr=True)
+            )
+        except ffmpeg.Error as e:
+            raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
+
+        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        
+        return out , sr
     
     def __repr__(self) -> str:
         return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
     
     def __str__(self) -> str:
         return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+
+    
+if __name__ == "__main__":
+    
+    print("Testing AudioProcessor")
+    print(AudioProcessor.from_file("tests/test.wav"))
\ No newline at end of file

From 8ecc66cf2920b6450324a0d1335f81334fffc893 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 11:30:47 +0200
Subject: [PATCH 25/86] linting

---
 autotranscript/audio.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index fe82041..35b6f99 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -1,6 +1,3 @@
-import os
-from warnings import warn
-
 import numpy as np
 import torch
 import ffmpeg
@@ -65,8 +62,8 @@ class AudioProcessor:
         """
         Open an audio file and read as mono waveform, resampling as necessary
 
-        Changed from original function at whisper.audio.load_audio to ensure compatibility
-        with pyannote.audio
+        Changed from original function at whisper.audio.load_audio to ensure 
+        compatibility with pyannote.audio
         Parameters
         ----------
         file: str

From 29e8a229dc120a0e139fd354fa1f6e7dfb435683 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 12:09:18 +0200
Subject: [PATCH 26/86] autotranscript works

---
 autotranscript/autotranscipt.py | 38 ++++++++++++++-------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py
index c1225af..cbf2c9d 100644
--- a/autotranscript/autotranscipt.py
+++ b/autotranscript/autotranscipt.py
@@ -1,13 +1,11 @@
-from audio import AudioProcessor , TorchAudioProcessor
-
+from audio import AudioProcessor
 from diarisation import Diariser
 from transcriber import Transcriber, whisper
-from whisper import Whisper
 from transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
-from pprint import pprint
 import torch
+
 diarisation = TypeVar('diarisation')
 
 
@@ -35,6 +33,7 @@ class AutoTranscribe:
         
         if whisper_model is None:
             self.transcriber = Transcriber.load_model("medium", local=True)
+            
         elif isinstance(whisper_model, str):
             self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs)
         else:
@@ -55,7 +54,8 @@ class AutoTranscribe:
         Transcribe audiofile with whisper model and pyannote diarization model
         
         :param audiofile: path to audiofile or torch.Tensor
-        :return: Transcript object
+        :return: Transcript object which contains the transcript and can be used to 
+                export the transcript to differnt formats.
         """
         
         audiofile = self.get_audiofile(audiofile)
@@ -68,11 +68,13 @@ class AutoTranscribe:
        
         print("Starting diarisation.")
         
-        diarisation = self.diariser.diarization( dia_audio,
+        diarisation = self.diariser.diarization(dia_audio,
                                                 *args , **kwargs)
         
         print("Diarisation finished. Starting transcription.")
         
+        audiofile.sr = torch.Tensor([audiofile.sr]).to(audiofile.waveform.device)
+        
         for i in trange(len(diarisation["segments"]), desc= "Transcribing"):
             
             seg = diarisation["segments"][i]
@@ -84,12 +86,11 @@ class AutoTranscribe:
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                    "text" : transcript}
 
-        pprint(final_transcript)   
-        #return Transcript(transcript, diarisation)
+        return Transcript(transcript, diarisation)
     
     @staticmethod
     def get_audiofile(audiofile : Union[str, torch.Tensor],
-                        *args, **kwargs) -> TorchAudioProcessor:
+                        *args, **kwargs) -> AudioProcessor:
         """
         Get audiofile as TorchAudioProcessor
 
@@ -99,22 +100,15 @@ class AutoTranscribe:
                  waveform and sample_rate in torch.Tensor format.
             :rtype: TorchAudioProcessor
         """
+        
         if isinstance(audiofile, str):
-            try:
-               audiofile = TorchAudioProcessor.from_file(audiofile)   
-            except: 
-                print("Could not load audiofile with torch audio." \
-                        "Trying ffmpeg. using pydub.")
-                audiofile = TorchAudioProcessor.from_ffmpeg(audiofile)
+            audiofile = AudioProcessor.from_file(audiofile)   
         
         if isinstance(audiofile, torch.Tensor):
-            audiofile = TorchAudioProcessor(audiofile[0], audiofile[1])
+            audiofile = AudioProcessor(audiofile[0], audiofile[1])
         
-        if isinstance(audiofile, AudioProcessor):
-            audiofile = TorchAudioProcessor.from_audio_processor(audiofile)
-        
-        if not isinstance(audiofile, TorchAudioProcessor):
-            raise ValueError(f'Audiofile must be of type TorchAudioProcessor,' \
+        if not isinstance(audiofile, AudioProcessor):
+            raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                              f'not {type(audiofile)}')     
         return audiofile
     
@@ -122,4 +116,4 @@ class AutoTranscribe:
 if __name__ == "__main__":
     
     AudioTranscriber = AutoTranscribe()
-    AudioTranscriber.transcribe("/home/jacob/PycharmProjects/autotranscript/tests/Kathi_interview.mp3" , num_speaker=2)
\ No newline at end of file
+    AudioTranscriber.transcribe("tests/test.wav")
\ No newline at end of file

From de1ca223976e4993dd2f2fcd5276a5bf3c556f57 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 12:09:53 +0200
Subject: [PATCH 27/86] added dict as input type

---
 autotranscript/diarisation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 3b64fac..ff3ead0 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -15,7 +15,7 @@ class Diariser:
 
         self.model = model
 
-    def diarization(self, audiofile : Union[str, Tensor] ,
+    def diarization(self, audiofile : Union[str, Tensor, dict] ,
                     *args, **kwargs) -> Annotation:
         """
         Diarization of audio file

From 713dd3bfd5861e517d6660ff74614019fe2307df Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 12:10:11 +0200
Subject: [PATCH 28/86] added cuda support

---
 autotranscript/audio.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index 35b6f99..ea11fe8 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -9,13 +9,28 @@ class AudioProcessor:
     Audio Processor using PyTorchaudio instead of PyDub
     """
     
-    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor) -> None:
+    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor,
+                 *args, **kwargs) -> None:
         """
         Initialise audio processor
         :param waveform: waveform
         :param sr: sample rate
+        :param args: additional arguments
+        :param kwargs: additional keyword arguments
+            example:
+                - device: device to use for processing
+                          if cuda is available, cuda is used 
         """
-        self.waveform = waveform
+        
+        if "device" in kwargs:
+            device = kwargs["device"]
+        else:
+            if torch.cuda.is_available():
+                device = "cuda"
+            else:
+                device = "cpu"
+                
+        self.waveform = waveform.to(device)
         self.sr = sr
         
         if not isinstance(self.sr, int):

From 8a1bdda393febefa42250057a0a2112744665cda Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 12:11:13 +0200
Subject: [PATCH 29/86] added verbose default value to be false

---
 autotranscript/transcriber.py | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 57a3423..4fbf14b 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -44,16 +44,11 @@ class Transcriber:
         """
         
         kwargs = self._get_whisper_kwargs(**kwargs)
+        
+        if "verbose" not in kwargs:
+            kwargs["verbose"] = False    
 
-        if kwargs or args: 
-            result = self.model.transcribe(audio, *args, **kwargs)
-        else:
-            # if kwargs is empty but parsed anyway whisper
-            # will not use the default kwargs
-            
-            print("No kwargs parsed. Using default kwargs.")
-            result = self.model.transcribe(audio)
-            
+        result = self.model.transcribe(audio, *args, **kwargs)
         return result["text"]
     
     @staticmethod

From b3c9bcc482e857ad51dbac011118687a97956db0 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 12:13:56 +0200
Subject: [PATCH 30/86] fixed wrong Transcript class params

---
 autotranscript/autotranscipt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py
index cbf2c9d..906166a 100644
--- a/autotranscript/autotranscipt.py
+++ b/autotranscript/autotranscipt.py
@@ -86,7 +86,7 @@ class AutoTranscribe:
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                    "text" : transcript}
 
-        return Transcript(transcript, diarisation)
+        return Transcript(final_transcript)
     
     @staticmethod
     def get_audiofile(audiofile : Union[str, torch.Tensor],

From 52efd41d21e1dfd5056abfa73401673a09a77dbc Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:00:22 +0200
Subject: [PATCH 31/86] added Transcript class which handles transcription
 output

---
 autotranscript/transcript_exporter.py | 181 +++++++++++++++++++++++++-
 1 file changed, 175 insertions(+), 6 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 956b398..ae6f1b6 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -1,23 +1,192 @@
+import json
+
+ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]
+
 
 class Transcript:
     """
     Class for storing transcript data
     and exporting it to files in different formats
     """
-    def __init__(self, transcript: str) -> None:
+    def __init__(self, transcript: dict) -> None:
         """
         :param transcript: formated transcript string
         """
         self.transcript = transcript
+        self.speakers = self._extract_speakers()
+        self.segments = self._extract_segments()
+        self.annotation = {}
     
-    def to_latex(self, path: str) -> None:
+    def annotate(self, *args, **kwargs) -> dict:
+        """
+        Annote transcript to define speaker names
+        
+        :param args: list of speaker names will maped sequentially to the speakers
+        :param kwargs: dict with speaker names as keys and list of segments as values
+        
+        :return: dict with speaker names as keys and list of segments as values
+        :rtype: dict
+        """
+        
+        annotatios = {}
+
+        if len(args) != len(self.speakers):
+            raise ValueError("Number of speaker names does not match number of speakers")
+        
+        if args:
+            for arg,ospeaker in zip(args,self.speakers):
+                annotatios[ospeaker] = arg
+        
+        if kwargs:
+            for key in kwargs:
+                if key not in self.speakers:
+                    raise ValueError(f"{key} is not a speaker")
+                annotatios[key] = kwargs[key]
+
+        self.annotation = annotatios
+        return annotatios
+    
+    def _extract_speakers(self) -> list:
+        """
+        Extract speaker names from transcript
+        :return: list of speaker names
+        :rtype: list
+        """
+        return list(set([self.transcript[id]["speaker"] for id in self.transcript]))
+    
+    def _extract_segments(self) -> list:
+        """
+        Extract segments from transcript
+
+        :return: list of segments
+        :rtype: list
+        """
+        return [self.transcript[id]["segment"] for id in self.transcript]
+
+    def __str__(self) -> str:
+        """
+        Get transcript as string
+
+        :return: transcript as string
+        :rtype: str
+        """
+        fstring = ""
+        
+        for id in self.transcript:
+            seq = self.transcript[id]
+            
+            if self.annotation:
+                speaker = self.annotation[seq["speaker"]]
+            else:
+                speaker = seq["speaker"]
+                
+            fstring += f"{speaker}: {seq['text']}\n"
+
+        return fstring
+    
+    def __repr__(self) -> str:
+        return f"Transcript(speakers = {self.speakers},"\
+                f"segments = {self.segments}, annotation = {self.annotation})"
+    
+    def get_dict(self) -> dict:
+        """
+        Get transcript as dict
+
+        :return: transcript as dict
+        :rtype: dict
+        """
+        
+        return self.transcript
+    
+    def get_json(self, *args, **kwargs) -> str:
+        """
+        Get transcript as json string
+        :return: transcript as json string
+        :rtype: str
+        """
+        if "indent" not in kwargs:
+            kwargs["indent"] = 4
+        return json.dumps(self.transcript, *args, **kwargs)
+    
+    def get_html(self) -> str:
+        """
+        Get transcript as html string
+
+        :return: transcript as html string
+        :rtype: str
+        """
+        html = "<p>" + self.__str__().replace("\n", "<br>") + "</p>"
+        html = "<html><body>" + html + "</body></html>"
+        html = html.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
+        
+        return html
+        
+    
+    def get_md(self) -> str:
+        return self.get_html()
+    
+    def get_tex(self) -> str:
+        
+        if not self.annotation:
+
+            self.annotate(*ALPHABET[:len(self.speakers)])
+        
+        fstring ="\\begin{drama}"
+        
+        for speaker in self.speakers:
+            
+            fstring += "\n\t\\Character{"+ str(self.annotation[speaker]) + "}" \
+                "{"+ str(self.annotation[speaker]) + "}"
+        
+        for id in self.transcript:
+            seq = self.transcript[id]
+            speaker = self.annotation[seq["speaker"]]
+            fstring += f"\n\\{speaker}speaks:\n{seq['text']}"
+        
+        fstring += "\n\\end{drama}"
+        
+        return fstring
+        
+            
+    def to_json(self,path, *args, **kwargs) -> None:
+        """
+        Save transcript as json file
+        :param path: path to save file
+        :type path: str
+        """
+        with open(path, "w") as f:
+            json.dump(self.transcript, f, *args, **kwargs)
+    
+    def to_txt(self, path: str) -> None:
+        
+       with open(path, "w") as f:
+            f.write(self.__str__, f)
+    
+    def to_md(self, path: str) -> None:
+        return self.to_html(path)
+    
+    def to_html(self, path: str) -> None:
+        """
+        Save transcript as html file
+
+        :param path: path to save file
+        :type path: str
+        """
+        
+        with open(path, "w") as file:
+            file.write(self.get_html())
+    
+    def to_tex(self, path: str) -> None:
         pass
     
     def to_pdf(self, path: str) -> None:
         pass
     
-    def to_txt(self, path: str) -> None:
-        pass
+if __name__ == "__main__":
+    test = Transcript(json.load(open("tests/test.json", "r")))
+    print(repr(test))
+    print(test)
     
-    def to_json(self, path: str) -> None:
-        pass
\ No newline at end of file
+    
+    
+    
\ No newline at end of file

From cdfa872482e35fc4a85c995f3e20f65a0dae21e5 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:00:39 +0200
Subject: [PATCH 32/86] added segments to out dict

---
 autotranscript/autotranscipt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscipt.py
index 906166a..792dcdd 100644
--- a/autotranscript/autotranscipt.py
+++ b/autotranscript/autotranscipt.py
@@ -84,8 +84,8 @@ class AutoTranscribe:
             transcript = self.transcriber.transcribe(audio, *args , **kwargs)
             
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
+                                   "segment" : seg,
                                    "text" : transcript}
-
         return Transcript(final_transcript)
     
     @staticmethod

From 4f416f26f9067d191097eee6604d544ba959d57f Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:03:25 +0200
Subject: [PATCH 33/86] changed wrong file name

---
 autotranscript/{autotranscipt.py => autotranscript.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename autotranscript/{autotranscipt.py => autotranscript.py} (100%)

diff --git a/autotranscript/autotranscipt.py b/autotranscript/autotranscript.py
similarity index 100%
rename from autotranscript/autotranscipt.py
rename to autotranscript/autotranscript.py

From c4c62c8ae150772e088d835bbb96ce8cfff5d3d1 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:06:09 +0200
Subject: [PATCH 34/86] added new file

---
 autotranscript/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 5aea052..ef47226 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,4 +1,5 @@
 from autotranscript.__main__ import *
+from autotranscript.autotranscript import *
 from autotranscript.transcriber import *
 from autotranscript.audio import *
 from autotranscript.transcript_exporter import *

From e4e5cfb4bc3d8362eb5b78264d21a5ff2db24d32 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:06:18 +0200
Subject: [PATCH 35/86] linting

---
 autotranscript/transcript_exporter.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index ae6f1b6..37092c8 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -31,7 +31,8 @@ class Transcript:
         annotatios = {}
 
         if len(args) != len(self.speakers):
-            raise ValueError("Number of speaker names does not match number of speakers")
+            raise ValueError("Number of speaker names "\
+                "does not match number of speakers")
         
         if args:
             for arg,ospeaker in zip(args,self.speakers):

From 61121aad928d3629fede7eee4d70e998ae6d26dc Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:09:33 +0200
Subject: [PATCH 36/86] updated version

---
 autotranscript/version.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autotranscript/version.py b/autotranscript/version.py
index 0a1a41e..5bc7ffc 100644
--- a/autotranscript/version.py
+++ b/autotranscript/version.py
@@ -1,8 +1,8 @@
 import os
 import subprocess as sp
 
-MAJOR = 1
-MINOR = 0
+MAJOR = 0
+MINOR = 2
 MICRO = 0
 MICRO_POST = 0
 ISRELEASED = False

From a653f0b05d874c4677420b7d64778f86d031947a Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:09:44 +0200
Subject: [PATCH 37/86] added new example usage

---
 transcribe.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/transcribe.py b/transcribe.py
index e7c62fa..6601707 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -1,3 +1,7 @@
-from autotranscript import AutoTranscribe
+from autotranscript.autotranscript import AutoTranscribe
 
-AutoTranscribe(diarisation=True).transcribe()
+model = AutoTranscribe()
+
+text = model.transcribe("tests/test.wav")
+
+print(text)

From 1d25d61fa27e98d2c0f3b265f62f63e201c40d0a Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:09:53 +0200
Subject: [PATCH 38/86] linting

---
 autotranscript/transcriber.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 4fbf14b..e4d587a 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -1,7 +1,6 @@
 import os
-from whisper import Whisper
+from whisper import Whisper, load_model
 from typing import TypeVar , Union
-from whisper import load_model
 from glob import glob
 
 whisper = TypeVar('whisper') 

From 3ef7353db5384c1b350a166ea69b9408fd205fba Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 16 Jun 2023 15:10:08 +0200
Subject: [PATCH 39/86] changed module imports

---
 autotranscript/autotranscript.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 792dcdd..8cb7e8a 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -1,7 +1,7 @@
-from audio import AudioProcessor
-from diarisation import Diariser
-from transcriber import Transcriber, whisper
-from transcript_exporter import Transcript
+from autotranscript.audio import AudioProcessor
+from autotranscript.diarisation import Diariser
+from autotranscript.transcriber import Transcriber, whisper
+from autotranscript.transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
 import torch

From 7bfd294bbd1592cd80afb47d670c719f82f07830 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 11:52:21 +0200
Subject: [PATCH 40/86] fixed bug when only one speaker exists

---
 autotranscript/diarisation.py | 39 ++++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index ff3ead0..931d395 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -53,38 +53,39 @@ class Diariser:
         # Sometimes two consecutive speakers are the same
         # This loop removes these duplicates
         ###
-
-
-        for i, (_, _, speaker) in enumerate(dia_list):
+        
+        if len(dia_list) == 1:
+            normalized_output.append([0, 0, dia_list[0]])
+        else:
             
-            if i == 0:
-                current_speaker = speaker
+            for i, (_, _, speaker) in enumerate(dia_list):
+                if i == 0:
+                    current_speaker = speaker
 
-            if speaker != current_speaker:
+                if speaker != current_speaker:
 
-                index_end_speaker = i - 1
+                    index_end_speaker = i - 1
 
-                normalized_output.append([index_start_speaker,
-                                           index_end_speaker,
-                                           current_speaker])
+                    normalized_output.append([index_start_speaker,
+                                            index_end_speaker,
+                                            current_speaker])
 
-                index_start_speaker = i
-                current_speaker = speaker
+                    index_start_speaker = i
+                    current_speaker = speaker
 
-            if i == len(diarization_output["speakers"]) - 1:
+                if i == len(diarization_output["speakers"]) - 1:
 
-                index_end_speaker = i
-                normalized_output.append([index_start_speaker, 
-                                          index_end_speaker, 
-                                          current_speaker])
-       
+                    index_end_speaker = i
+                    normalized_output.append([index_start_speaker, 
+                                            index_end_speaker, 
+                                            current_speaker])
+        
         for outp in normalized_output:
             start =  dia_list[outp[0]][0].start 
             end =  dia_list[outp[1]][0].end
 
             diarization_output["segments"].append([start, end])
             diarization_output["speakers"].append(outp[2])
-
         return diarization_output
     
     @staticmethod

From d3606a2dab5c2e8ad6dd001000eb203bf681a1c5 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 12:01:18 +0200
Subject: [PATCH 41/86] removed dependency on ffmpeg-python; it will be dropped
 in future whisper releases

---
 autotranscript/audio.py | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index ea11fe8..4e7ee60 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -1,6 +1,6 @@
 import numpy as np
 import torch
-import ffmpeg
+from subprocess import CalledProcessError, run
 
 SAMPLE_RATE = 16000
 
@@ -91,18 +91,24 @@ class AudioProcessor:
         -------
         A NumPy array containing the audio waveform, in float32 dtype.
         """
+        # This launches a subprocess to decode audio while down-mixing
+        # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
+        # fmt: off
+        cmd = [
+            "ffmpeg",
+            "-nostdin",
+            "-threads", "0",
+            "-i", file,
+            "-f", "s16le",
+            "-ac", "1",
+            "-acodec", "pcm_s16le",
+            "-ar", str(sr),
+            "-"
+        ]
+        # fmt: on
         try:
-            # This launches a subprocess to decode audio while down-mixing 
-            # and resampling as necessary.
-            # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed.
-            out, _ = (
-                ffmpeg.input(file, threads=0)
-                .output("-", format="s16le", acodec="pcm_s16le",
-                        ac=1, ar=sr)
-                .run(cmd=["ffmpeg", "-nostdin"],
-                     capture_stdout=True, capture_stderr=True)
-            )
-        except ffmpeg.Error as e:
+            out = run(cmd, capture_output=True, check=True).stdout
+        except CalledProcessError as e:
             raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
         out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

From 280cfa3c35e391c752d2b7b811f214c883c47f81 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 12:11:28 +0200
Subject: [PATCH 42/86] fixed bug where speaker dict included segment
 information

---
 autotranscript/diarisation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 931d395..5b71f88 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -55,7 +55,7 @@ class Diariser:
         ###
         
         if len(dia_list) == 1:
-            normalized_output.append([0, 0, dia_list[0]])
+            normalized_output.append([0, 0, dia_list[0][2]])
         else:
             
             for i, (_, _, speaker) in enumerate(dia_list):
@@ -158,4 +158,3 @@ class Diariser:
     
     def __str__(self):
         return f"Diarisation(model={self.model})"
-

From 979a2320f002be99e6bca0869d8d74ac6741bdee Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 13:36:17 +0200
Subject: [PATCH 43/86] added file removal

---
 autotranscript/autotranscript.py | 58 ++++++++++++++++++++++++++++----
 1 file changed, 51 insertions(+), 7 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 8cb7e8a..9f4100e 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -5,6 +5,10 @@ from autotranscript.transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
 import torch
+import os
+from glob import iglob
+from subprocess import run
+from warnings import warn
 
 diarisation = TypeVar('diarisation')
 
@@ -49,11 +53,14 @@ class AutoTranscribe:
         print("AutoTranscribe initialized all models successfully loaded.")
             
     def transcribe(self, audiofile : Union[str, torch.Tensor],
+                   remove_original : bool = False,
                    *args, **kwargs) -> Transcript:
         """
         Transcribe audiofile with whisper model and pyannote diarization model
         
         :param audiofile: path to audiofile or torch.Tensor
+        :param remove_original: if True the original audiofile will be removed after
+                                transcription.
         :return: Transcript object which contains the transcript and can be used to 
                 export the transcript to differnt formats.
         """
@@ -86,8 +93,51 @@ class AutoTranscribe:
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                    "segment" : seg,
                                    "text" : transcript}
+            
+        if remove_original:
+            if kwargs.get("shred") is True:
+                self.remove_audio_file(audiofile, shred=True)
+            else:
+                self.remove_audio_file(audiofile, shred=False)
+            
         return Transcript(final_transcript)
     
+    @staticmethod
+    def remove_audio_file(audiofile : str,
+                          shred : bool = False) -> None:
+        """
+        removes orginal audiofile to avoid disk space problems
+        
+        or to enshure data privacy
+        
+        :param audiofile: path to audiofile
+        :param shred: if True audiofile will be shredded and not only removed
+        
+        """
+        if not os.path.exists(audiofile):
+            raise ValueError(f"Audiofile {audiofile} does not exist.")
+        
+        if shred:
+            
+            warn("Shredding audiofile can take a long time.", RuntimeWarning)
+            
+            gen = iglob(f'{audiofile}', recursive=True)
+            cmd = ['shred', '-zvu', '-n', '10', f'{audiofile}']
+            
+            if os.path.isdir(audiofile):
+                raise ValueError(f"Audiofile {audiofile} is a directory.")
+            
+            for file in gen:
+                print(f'shredding {file} now\n')
+                
+                run(cmd , check=True)
+
+        else:
+            os.remove(audiofile)
+            print(f"Audiofile {audiofile} removed.")
+        
+        
+    
     @staticmethod
     def get_audiofile(audiofile : Union[str, torch.Tensor],
                         *args, **kwargs) -> AudioProcessor:
@@ -110,10 +160,4 @@ class AutoTranscribe:
         if not isinstance(audiofile, AudioProcessor):
             raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                              f'not {type(audiofile)}')     
-        return audiofile
-    
-
-if __name__ == "__main__":
-    
-    AudioTranscriber = AutoTranscribe()
-    AudioTranscriber.transcribe("tests/test.wav")
\ No newline at end of file
+        return audiofile
\ No newline at end of file

From 7909d6d507638c03ece3a133815697e46d109263 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:00:31 +0200
Subject: [PATCH 44/86] add save for different types of files

---
 autotranscript/transcript_exporter.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 37092c8..16d5e09 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -82,7 +82,6 @@ class Transcript:
                 speaker = seq["speaker"]
                 
             fstring += f"{speaker}: {seq['text']}\n"
-
         return fstring
     
     def __repr__(self) -> str:
@@ -183,6 +182,29 @@ class Transcript:
     def to_pdf(self, path: str) -> None:
         pass
     
+    def save(self, path: str, *args, **kwargs) -> None:
+        """
+        Save transcript to file with given path and file format
+
+        :param path: path to save file
+        :type path: str
+        :raises ValueError: if file format is unknown
+        """
+        if path.endswith(".json"):
+            self.to_json(path, *args, **kwargs)
+        elif path.endswith(".txt"):
+            self.to_txt(path, *args, **kwargs)
+        elif path.endswith(".md"):
+            self.to_md(path, *args, **kwargs)
+        elif path.endswith(".html"):
+            self.to_html(path, *args, **kwargs)
+        elif path.endswith(".tex"):
+            self.to_tex(path, *args, **kwargs)
+        elif path.endswith(".pdf"):
+            self.to_pdf(path, *args, **kwargs)
+        else:
+            raise ValueError("Unknown file format")
+    
 if __name__ == "__main__":
     test = Transcript(json.load(open("tests/test.json", "r")))
     print(repr(test))

From 4cb774007d62a24c5f8b85ade5e3d1dbd7ce4c09 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:06:16 +0200
Subject: [PATCH 45/86] added save function to export to json

---
 autotranscript/diarisation.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 5b71f88..070fc2d 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,8 +1,9 @@
-from pyannote.audio import Pipeline
+from .audio import Pipeline
 from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from torch import Tensor
 import os
 from typing import TypeVar, Union
+import json
 
 Annotation = TypeVar('Annotation') 
 
@@ -88,6 +89,18 @@ class Diariser:
             diarization_output["speakers"].append(outp[2])
         return diarization_output
     
+    def save(self, path : str, *args, **kwargs) -> None:
+        """
+        Save diarization output to a file
+
+        :param path: path to save file
+        :type path: str
+        """
+        with open(path, "w") as f:
+            json.dump(self.transcript, f, *args, **kwargs)
+        
+        
+    
     @staticmethod
     def _get_token():
         # check ig .pyannotetoken.txt exists

From 65c2cbfd91e474416c42af8c35fd7145147d3bb5 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:22:50 +0200
Subject: [PATCH 46/86] removed file

---
 autotranscript/__main__.py | 497 -------------------------------------
 1 file changed, 497 deletions(-)
 delete mode 100644 autotranscript/__main__.py

diff --git a/autotranscript/__main__.py b/autotranscript/__main__.py
deleted file mode 100644
index 19d5145..0000000
--- a/autotranscript/__main__.py
+++ /dev/null
@@ -1,497 +0,0 @@
-
-import whisper
-from time import time, sleep
-import os
-import glob
-import re
-import shutil
-import sys
-from tqdm import tqdm
-
-from typing import Union
-from pydub import AudioSegment
-
-from pyannote.audio import Pipeline
-
-class AudioProcessor:
-    def __init__(self, audio_file:str):
-        self.audio_file_path = audio_file
-        self.audio_file = AudioSegment.from_file(audio_file, format=audio_file.split('.')[-1])
-
-        self.audiofilename = audio_file.split('/')[-1][:-4]
-        self.coreaudiofile =  audio_file.split('/')[-1][:-4]
-        self.audiofilefolder = os.path.dirname(audio_file)
-        self.audio_file_type = audio_file.split('.')[-1]
-
-
-
-    def convert_audio(self, savefolder: str = "", savename: str = "", type: str = "wav", remove_orginal: bool = True):
-        """
-        Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
-        Whisper model
-        :param file: path to audio or video file
-        :param remove_orginal: remove original file
-        :return: mp3 file path
-        """
-        print(f'Converting {self.audiofilename} to .{type} file')
-
-        if savefolder == "":
-            savefolder = self.audiofilefolder
-
-        if savename == "":
-            savename = self.coreaudiofile + f'.{type}'
-        else:
-            savename = savename + f'.{type}'
-
-        savepath = os.path.join(savefolder, savename)
-
-        self.audio_file.export(savepath, format=type)
-
-        print(f'Converted {self.audiofilename} to {type}')
-
-        if remove_orginal:
-            os.remove(self.audio_file_path)
-            print(f'File {self.audio_file_path} removed')
-
-        self.audio_file_path = savepath
-        self.audio_file = AudioSegment.from_file(savepath, format=type)
-
-        return self
-
-    def to_mp3(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
-        """
-        Convert audio file to mp3 file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: mp3 file path
-        """
-        return self.convert_audio(savefolder = savefolder, savename = savename, type="mp3", remove_orginal=remove_orginal)
-
-    def to_wav(self, savefolder: str = "", savename: str = "", remove_orginal: bool = True):
-        """
-        Convert audio file to wav file
-        :param file: audio file
-        :param remove_orginal: remove original file
-        :return: wav file path
-        """
-        return self.convert_audio(savefolder = savefolder, savename = savename,type="wav", remove_orginal=remove_orginal)
-
-    def slower_mp3(self, savefolder: str = "", savename: str = "", speed: float = 0.75, type: str = "mp3"):
-        """
-        Slow down mp3 file
-        :param file: mp3 file
-        :param speed: speed
-        :return: None
-        """
-        if savefolder == "":
-            savefolder = self.audiofilefolder
-        else:
-            savefolder = savefolder
-
-        sound = self.audio_file
-        slow_sound = sound._spawn(sound.raw_data, overrides={
-            "frame_rate": int(sound.frame_rate * speed)
-        })
-
-        speedstr = str(speed).replace('.', '')
-
-        file_out = self.coreaudiofile + f'_{speedstr}.{type}'
-
-        save_path = os.path.join(savefolder, file_out)
-
-        slow_sound.export(save_path, format=type)
-
-        return slow_sound
-
-class WhisperTranscription:
-    def __init__(self, audio_file: str , model, language: str = "German"):
-
-        self.audio_file = audio_file
-        self.model = model
-        self.language = language
-
-    def transcribe(self, language:str = "German"):
-        """
-        Transcribe audio file
-
-        language: language of the audio file
-        :return: transcript as string
-        """
-
-        audiofilename = self.audio_file.split('/')[-1]
-        #print(f'Start transcribing Audio file: {audiofilename}')
-
-        _stime = time()
-        result = self.model.transcribe(self.audio_file, language=self.language)
-
-        #print(f'Transcription finished in {time() - _stime} seconds')
-
-        self.transcript = result
-
-        return result["text"]
-
-    def save_transcript(self, transcript:str = "", savefolder : str = "", savename: str = ""):
-        """
-        Save transcript to file
-        :param transcript: transcript as string
-        :param savefolder: folder to save transcript
-        :param savename: name of the transcript file
-        :return: None
-        """
-        if savefolder == "":
-            savefolder = os.path.dirname(self.audio_file)
-        else:
-            savefolder = savefolder
-
-        if savename == "":
-            savename = self.audio_file.split('/')[-1][:-4] + '.txt'
-        else:
-            savename = savename
-
-        if transcript == "":
-            transcript = self.transcript["text"]
-
-        savepath = os.path.join(savefolder, savename)
-
-        with open(savepath, 'w') as f:
-            f.write(transcript)
-
-        print(f'Transcript saved to {savepath}')
-
-class Diarisation(AudioProcessor):
-    def __init__(self, audio_file: str, model,**kwargs):
-
-        super().__init__(audio_file=audio_file)
-
-        self.model = model
-
-
-    def diarization(self, *args, **kwargs):
-
-        if "num_speakers" in kwargs:
-            num_speakers = kwargs['num_speakers']
-            kwargs.pop('num_speakers')
-        else:
-            num_speakers = 2
-
-        audiofilename = self.coreaudiofile
-
-        print(f'Start diarization of audio file: {self.audiofilename}')
-
-        _stime = time()
-
-        diarization = self.model(self.audio_file_path, num_speakers=num_speakers)
-
-        print(f'Diarization finished in {time() - _stime} seconds')
-        self.diarization = diarization
-
-        return diarization
-
-    def format_diarization_output(self, *args, **kwargs):
-        """
-        Format diarization output to a list of tuples
-        :param args:
-        :param kwargs:
-        :return: dict with speaker names as keys and list of tuples as values and list of different speakers
-        """
-
-        diarization_output = {"speakers": [], "segments": []}
-
-        if not hasattr(self, 'diarization'):
-            # ensure diarization is run before formatting
-            self.diarization = self.diarization()
-
-
-        for segment, _, speaker in self.diarization.itertracks(yield_label=True):
-            diarization_output["speakers"].append(speaker)
-            diarization_output["segments"].append(segment)
-
-        normalized_output = []
-        index_start_speaker = 0
-        index_end_speaker = 0
-        current_speaker = str()
-
-        for i, speaker in enumerate(diarization_output["speakers"]):
-
-            if i == 0:
-                current_speaker = speaker
-
-            if speaker != current_speaker:
-
-                index_end_speaker = i - 1
-
-                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
-
-                index_start_speaker = i
-                current_speaker = speaker
-
-            if i == len(diarization_output["speakers"]) - 1:
-
-                index_end_speaker = i
-                normalized_output.append([index_start_speaker, index_end_speaker, current_speaker])
-
-
-        self.normalized_output = normalized_output
-        self.diarization_output = diarization_output
-
-        return diarization_output,normalized_output
-
-    def create_temporary_wav(self,savefolder: str = "", savename: str = "", *args, **kwargs):
-        """
-        Create temporary wav file for diarization
-        :param savefolder: folder to save the temporary wav file
-        :param savename: name of the temporary wav file prefix
-        :param audiofile: audio file
-        :return: temporary wav file
-        """
-
-
-        if savefolder == "":
-            folder = '.temp'
-            if not os.path.exists(folder):
-                os.makedirs(folder)
-        else:
-            folder = savefolder
-
-        folder = os.path.realpath(folder)
-
-        if savename == "":
-            savename = self.coreaudiofile + '.wav'
-        else:
-            savename = savename
-
-
-        if not os.path.exists(folder):
-            os.makedirs(folder)
-
-        if not hasattr(self, 'normalized_output') or not hasattr(self, 'diarization_output'):
-            self.format_diarization_output()
-
-
-        speaker = set(self.diarization_output["speakers"])
-        num_speak_iter = [0 for _ in range(len(speaker))]
-
-        for count, outp in enumerate(self.normalized_output):
-            start = self.diarization_output["segments"][outp[0]].start
-            end = self.diarization_output["segments"][outp[1]].end
-
-            print("start: ", start)
-            print("end: ", end)
-
-            start_milliseconds = start * 1000
-            end_milliseconds = end * 1000
-
-            print("start_milliseconds: ", start_milliseconds)
-            print("end_milliseconds: ", end_milliseconds)
-
-            print("cut audio")
-
-            cut_audio = self.audio_file[start_milliseconds:end_milliseconds]
-
-            print("save audio")
-            print(f".temp/{count}_speaker_" + str(outp[2]) + ".wav")
-            cut_audio.export(f".temp/{count}_speaker_" + str(outp[2]) + ".wav", format="wav")
-
-        return os.path.realpath(folder)
-
-    def __repr__(self):
-        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
-    def __str__(self):
-        return f"Diarization(audiofile={self.audiofile}, model={self.model}, language={self.language})"
-
-
-class AutoTranscribe:
-    def __init__(self, audiofile: Union[str, bool, list] = None,
-                 model: str = "medium",
-                 language: str = "German",
-                 diarisation: bool = False,
-                 audioinput: str = "audiofiles",
-                 transcriptionout: str = "transcriptions",
-                 *args, **kwargs):
-        """
-        AutoTranscribe
-        :param audiofile: audio file or list of audio files to transcribe
-        :param model: model name (default: medium)
-        :param language: language (default: German)
-        :param diarisation: diarisation (default: False)
-        """
-        if audiofile is None:
-            audiofile = os.listdir(audioinput) # get all audio files in audioinput folder
-            audiofile = [os.path.realpath(os.path.join(audioinput, file)) for file in audiofile]# add path to audio files
-
-        self.audiofile = audiofile
-        self.language = language
-        self.diarisation = diarisation
-        if diarisation:
-            print("Diarisation is enabled")
-            print("Load Diarisation model")
-            self.diarisation_model = Pipeline.from_pretrained("pyannote/speaker-diarization",
-                                             use_auth_token = self._get_token())
-            print("Load Diarisation model done")
-
-        print(f"Load Whisper model {model}")
-        self.model = whisper.load_model(model)
-        print(f"Load Whisper model {model} done")
-
-        self.currentpath, \
-            self.audiopath, \
-            self.transcriptionpath, \
-            self.audiofiles = self.create_folder_structure(audioinput, transcriptionout)  # create folder structure
-
-
-
-    def transcribe(self, *args, **kwargs):
-
-        if isinstance(self.audiofile, str):
-            for i in range(len(self.audiofiles)):
-                if self.audiofile in self.audiofiles[i]:
-                    self.audiofile = [self.audiofiles[i]]
-                    break
-
-            audiolist = self.audiofile
-
-        elif isinstance(self.audiofile, list):
-            audiolist = self.audiofile
-        else:
-            audiolist = self.audiofiles
-
-        if not set(audiolist).issubset(set(self.audiofiles)):
-            raise ValueError(f"Audio file {self.audiofile} not found in {self.audiopath}")
-
-
-        for audiofile in audiolist:
-            _start = time()
-            if not "/" in audiofile:
-                audiofile = os.path.join(self.audiopath, audiofile)
-
-            if not self.check_if_already_transcribed (audiofile):
-
-                audio = AudioProcessor(audiofile)
-
-                if not audiofile.endswith('wav'):
-                    audio = audio.to_wav()
-                    self.audiofile = audio.audio_file_path
-                    audiofile = audio.audio_file_path
-
-                if "speed" in kwargs:
-                    speed = kwargs['speed']
-                    kwargs.pop('speed')
-
-                    print('Creating slower version of the audio file with speed {}'.format(speed))
-                    slower_audio = os.path.join(self.transcriptionpath, 'slower_version')
-                    if not os.path.exists(slower_audio):
-                        os.makedirs(slower_audio)
-                    audio.slower_mp3(savefolder=slower_audio,speed=speed)
-
-                if not self.diarisation:
-                    WhisperTranscription(audiofile, self.model, self.language
-                                         ).save_transcript(savefolder = self.transcriptionpath)
-
-                else:
-                    print("Start diarisation")
-                    dia = Diarisation(audiofile, self.diarisation_model)
-
-                    if 'num_speakers' in kwargs:
-                        num_speakers = kwargs['num_speakers']
-                        kwargs.pop('num_speakers')
-                        dia.diarization(num_speakers=num_speakers)
-                    else:
-                        dia.diarization()
-
-                    temppath = dia.create_temporary_wav()
-                    temppath_dict, _ = dia.format_diarization_output()
-                    speakers = list(set(temppath_dict["speakers"]))
-
-
-                    fstring = "\\begin{drama}"
-
-                    for speaker in speakers:
-                        speaker = speaker.replace("SPEAKER_", "")
-                        fstring += "\n\t\Character{S"+ str(speaker) + "}{S" + str(speaker) + "}"
-
-
-                    files = glob.glob(temppath + "/*.wav")
-
-                    # Sort files according to the digits included in the filename
-                    files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0]))
-
-                    for file in tqdm(files):
-
-                            Whisper = WhisperTranscription(file, self.model, self.language).transcribe()
-
-                            for s in speakers:
-                                if s in file:
-                                    s = s.replace("SPEAKER_", "")
-                                    fstring += f"\n\S{s}speaks: \n {Whisper}"
-
-                    fstring += "\n\end{drama}"
-
-                    print(fstring)
-
-                    with open(os.path.join(self.transcriptionpath,
-                                           os.path.basename(audiofile).split('.')[0] + '.tex'), 'w') as f:
-                        f.write(fstring)
-
-                    print("Remove temporary files")
-                    shutil.rmtree(temppath)
-
-                print(f"Transcription of {audiofile} done in total of {time() - _start} seconds")
-
-    def create_folder_structure(self, audiopath: str, transcriptionout: str):
-        """
-        Create folder structure for audio and transcription files
-
-        :return:  currentpath, audiopath, transcriptionpath, audiofiles
-        """
-        currentpath = os.path.dirname(sys.argv[0]) # get executable path
-
-        if not os.path.exists(os.path.join(currentpath, audiopath)):
-            print('Creating audiofiles folder')
-            os.makedirs(os.path.join(currentpath, audiopath))
-        if not os.path.exists(os.path.join(currentpath, transcriptionout)):
-            print('Creating transcription folder')
-            os.makedirs(os.path.join(currentpath, transcriptionout))
-
-        audiopath = os.path.join(currentpath, audiopath)  # path to audio files
-        transcriptionpath = os.path.join(currentpath, transcriptionout)  # path to transcription files
-
-
-        _audiofiles =  os.listdir(audiopath) # list of audio files
-        audiofiles = []
-        for i in _audiofiles:
-                audiofiles.append(os.path.join(audiopath, i))
-
-        return currentpath, audiopath, transcriptionpath, audiofiles
-
-    def check_if_already_transcribed (self, filename: str):
-        """
-        Check if all audio files are already transcribed
-        :param filename: audio file name
-        :return: bool
-        """
-        purefilename = filename.split('/')[-1][:-4]
-        _files = os.listdir(self.transcriptionpath)
-        for i,f in enumerate(_files):
-            _files[i] = f[:-4]
-
-        if purefilename in _files:
-            print(f'File {purefilename[:-4]} already transcribed')
-            return True
-        else:
-            return False
-    @classmethod
-    def _get_token(self):
-        # check ig .pyannotetoken.txt exists
-        path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '.pyannotetoken')
-        if os.path.exists(path):
-            with open(path, 'r') as f:
-                token = f.read()
-        else:
-            raise ValueError('No token found. Please create a token at https://huggingface.co/settings/token'
-                             ' and save it in a file called .pyannotetoken.txt')
-        return token
-
-    def __repr__(self):
-        return f"AutoTranscribe(audiofile={self.audiofile}, model={self.model}, language={self.language}, diarisation={self.diarisation})"
-    def __call__(self, *args, **kwargs):
-        return self.transcribe(*args, **kwargs)

From a5e051cbfbc7c6e5bca455778024ec316b1051b4 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:23:23 +0200
Subject: [PATCH 47/86] added cli

---
 autotranscript/autotranscript.py | 112 +++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 5 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 9f4100e..0a29528 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -1,7 +1,7 @@
-from autotranscript.audio import AudioProcessor
-from autotranscript.diarisation import Diariser
-from autotranscript.transcriber import Transcriber, whisper
-from autotranscript.transcript_exporter import Transcript
+from .audio import AudioProcessor
+from .diarisation import Diariser
+from .transcriber import Transcriber, whisper
+from .transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
 import torch
@@ -9,6 +9,8 @@ import os
 from glob import iglob
 from subprocess import run
 from warnings import warn
+import argparse
+
 
 diarisation = TypeVar('diarisation')
 
@@ -160,4 +162,104 @@ class AutoTranscribe:
         if not isinstance(audiofile, AudioProcessor):
             raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                              f'not {type(audiofile)}')     
-        return audiofile
\ No newline at end of file
+        return audiofile
+    
+
+def cli():
+    from whisper import available_models
+    from whisper.utils import get_writer
+    from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
+    from .transcriber import WHISPER_DEFAULT_PATH
+    def str2bool(string):
+        str2val = {"True": True, "False": False}
+        if string in str2val:
+            return str2val[string]
+        else:
+            raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
+
+
+    # fmt: off
+    parser = argparse.ArgumentParser(formatter_class=
+                                     argparse.ArgumentDefaultsHelpFormatter)
+    
+    parser.add_argument("audio", nargs="+", type=str,
+                        help="audio file(s) to transcribe")
+    
+    parser.add_argument("--wmodel", default="medium",
+                        help="name of the Whisper model to use")
+    parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH,
+                        help="the path to save model files; uses ./models/whisper by default")
+    
+    parser.add_argument("--device", 
+                        default="cuda" if torch.cuda.is_available() else "cpu",
+                        help="device to use for PyTorch inference")
+    parser.add_argument("--threads", type=int, default=0,
+                        help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
+    
+    parser.add_argument("--output_dir", "-o", type=str, default=".",
+                        help="directory to save the outputs")
+    parser.add_argument("--output_format", "-f", type=str, default="txt", 
+                        choices=["txt", "json", "md", "html"],
+                        help="format of the output file; if not specified, all available formats will be produced")
+    
+    parser.add_argument("--verbose", type=str2bool, default=True, 
+                        help="whether to print out the progress and debug messages")
+
+    parser.add_argument("--task", type=str, default="transcribe", 
+                        choices=["transcribe", "diarize","wtranscribe"],
+                        help="whether to perfrom transcription and diazation or only one of them")
+    parser.add_argument("--language", type=str, default=None,
+                        choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
+                        help="language spoken in the audio, specify None to perform language detection")
+    
+    # fmt: on
+
+    args = parser.parse_args().__dict__
+    model_name: str = args.pop("wmodel")
+    model_dir: str = args.pop("wmodel_dir")
+    output_dir: str = args.pop("output_dir")
+    output_format: str = args.pop("output_format")
+    task = args.pop("task")
+    device: str = args.pop("device")
+    os.makedirs(output_dir, exist_ok=True)
+
+    if (threads := args.pop("threads")) > 0:
+        torch.set_num_threads(threads)
+
+    wkwargs = {"download_root": model_dir,
+               "device": device,
+               "language" : args.pop("language")}
+    
+    model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs)
+    
+    if task == "transcribe":
+        for audio in args.pop("audio"):
+            out  = model.transcribe(audio)
+            basename = audio.split("/")[-1].split(".")[0]
+            spath = f"{output_dir}/{basename}.{output_format}"
+            out.save(spath)
+            
+    elif task == "diarize":
+        warn("Diarization is still in beta and may not work as expected.",
+             RuntimeWarning)
+        for audio in args.pop("audio"):
+            out = model.diariser.diarization(audio)
+            basename = audio.split("/")[-1].split(".")[0]
+            spath = f"{output_dir}/{basename}.json"
+            
+            print(f"diairization results saved to {spath}")
+            
+            out.save(spath)
+            
+    elif task == "wtranscribe":
+        writer = get_writer(output_format, output_dir)
+        warn("whisper transcription is poorly supported and may not work as expected." \
+             "It is recommendet to use the whisper cli directly",
+             RuntimeWarning)
+        for audio in args.pop("audio"):
+            out = model.transcriber.transcribe(audio, diarisation=True)
+            basename = audio.split("/")[-1].split(".")[0]
+            writer(out, audio)
+            
+if __name__ == "__main__":
+    cli()
\ No newline at end of file

From bbe27cf6169d9cfc02f88edecb7c386d4908088a Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:24:36 +0200
Subject: [PATCH 48/86] support cli

---
 setup.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index d6884d3..0c00dad 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@ import os
 from setuptools import setup, find_packages
 
 module_name = "autotranscript"
-github_url = "https://github.com/Jaikinator/transcriptor"
+github_url = "https://github.com/JSchmie/autotranscript"
 
 file_dir = os.path.dirname(os.path.realpath(__file__))
 absdir = lambda p: os.path.join(file_dir, p)
@@ -17,7 +17,7 @@ with open(verfile, "r") as fp:
 
 ############### setup ###############
 
-build_version = "OPTB_BUILD" in os.environ
+build_version = "AUTOTRANSCRIPT_BUILD" in os.environ
 
 setup(
     name=module_name,
@@ -34,5 +34,6 @@ setup(
     author='Jacob Schmieder',
     author_email='',
     description='Transcription tool for audio files based on Whisper',
-    #entry_points={'console_scripts': ['autotranscript = autotranscript.__main__:main']}
+    entry_points={'console_scripts':
+        ['autotranscript = autotranscript.autotranscript:cli']}
 )

From ae9a125d127727cb3a58452d83ae36370a3fbfd6 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:56:23 +0200
Subject: [PATCH 49/86] changed dependencies

---
 autotranscript/__init__.py    | 15 +++++++--------
 autotranscript/diarisation.py |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index ef47226..4812cc2 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,10 +1,9 @@
-from autotranscript.__main__ import *
-from autotranscript.autotranscript import *
-from autotranscript.transcriber import *
-from autotranscript.audio import *
-from autotranscript.transcript_exporter import *
-from autotranscript.diarisation import *
-from autotranscript.version import get_version as _get_version
-from autotranscript.misc import *
+from .autotranscript import *
+from .transcriber import *
+from .audio import *
+from .transcript_exporter import *
+from .diarisation import *
+from .version import get_version as _get_version
+from .misc import *
 
 __version__ = _get_version()
diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 070fc2d..ea36b93 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,4 +1,4 @@
-from .audio import Pipeline
+from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from torch import Tensor
 import os

From 66e73e1c6ba638c26a81cc21f271824552fb43fa Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 15:56:46 +0200
Subject: [PATCH 50/86] added kwargs support for load model

---
 autotranscript/transcriber.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index e4d587a..39c0842 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -6,7 +6,6 @@ from glob import glob
 whisper = TypeVar('whisper') 
 Tensor = TypeVar('Tensor')
 nparray = TypeVar('nparray')
-Transcriber = TypeVar('Transcriber')
 
 def get_whisper_default_path() -> str:
     """
@@ -69,7 +68,8 @@ class Transcriber:
     def load_model(cls,
                     model: str = "medium", 
                     local : bool = True,
-                    download_root: str = WHISPER_DEFAULT_PATH) -> Transcriber:
+                    download_root: str = WHISPER_DEFAULT_PATH ,
+                    *args, **kwargs) -> 'Transcriber':
         """
         Load whisper module
 
@@ -117,7 +117,7 @@ class Transcriber:
                                    "model first. By deactivating the local flag, " /
                                     "the model will be downloaded automatically.")
 
-        _model = load_model(model, download_root=download_root)
+        _model = load_model(model, download_root=download_root, *args, **kwargs)
 
         return cls(_model)
 

From 57fd73c8ee9b98a401d7490025dd3af34c8129ad Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 16:30:23 +0200
Subject: [PATCH 51/86] added functionality to select diarisation model using
 cli

---
 autotranscript/autotranscript.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 0a29528..2097f2f 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -38,8 +38,7 @@ class AutoTranscribe:
         """
         
         if whisper_model is None:
-            self.transcriber = Transcriber.load_model("medium", local=True)
-            
+            self.transcriber = Transcriber.load_model("medium", local=True)    
         elif isinstance(whisper_model, str):
             self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs)
         else:
@@ -170,6 +169,7 @@ def cli():
     from whisper.utils import get_writer
     from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
     from .transcriber import WHISPER_DEFAULT_PATH
+    from .diarisation import PYANNOTE_DEFAULT_PATH
     def str2bool(string):
         str2val = {"True": True, "False": False}
         if string in str2val:
@@ -190,6 +190,10 @@ def cli():
     parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH,
                         help="the path to save model files; uses ./models/whisper by default")
     
+    parser.add_argument("--dia_model", type=str, default = PYANNOTE_DEFAULT_PATH)
+    
+    parser.add_argument("--allow_download", type= bool, default=True,
+                        help="whether to allow model download if model is not found locally")
     parser.add_argument("--device", 
                         default="cuda" if torch.cuda.is_available() else "cpu",
                         help="device to use for PyTorch inference")
@@ -219,6 +223,7 @@ def cli():
     model_dir: str = args.pop("wmodel_dir")
     output_dir: str = args.pop("output_dir")
     output_format: str = args.pop("output_format")
+    local :str = args.pop("allow_download")
     task = args.pop("task")
     device: str = args.pop("device")
     os.makedirs(output_dir, exist_ok=True)
@@ -227,14 +232,17 @@ def cli():
         torch.set_num_threads(threads)
 
     wkwargs = {"download_root": model_dir,
-               "device": device,
-               "language" : args.pop("language")}
-    
-    model = AutoTranscribe(whisper_model= model_name, whisper_kwargs= wkwargs)
+               "local": local,
+               "device": device}
+    diarisation_kwargs = {"local": local}    
+    model = AutoTranscribe(whisper_model= model_name,
+                           whisper_kwargs= wkwargs,
+                           dia_model= args.pop("dia_model"),
+                           dia_kwargs_kwargs= diarisation_kwargs,)
     
     if task == "transcribe":
         for audio in args.pop("audio"):
-            out  = model.transcribe(audio)
+            out  = model.transcribe(audio, language = args.pop("language"))
             basename = audio.split("/")[-1].split(".")[0]
             spath = f"{output_dir}/{basename}.{output_format}"
             out.save(spath)
@@ -257,7 +265,7 @@ def cli():
              "It is recommendet to use the whisper cli directly",
              RuntimeWarning)
         for audio in args.pop("audio"):
-            out = model.transcriber.transcribe(audio, diarisation=True)
+            out = model.transcriber.transcribe(audio, language = args.pop("language"))
             basename = audio.split("/")[-1].split(".")[0]
             writer(out, audio)
             

From 29cc0aca6ad72b243396d2a52196714bac016ae0 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 16:31:42 +0200
Subject: [PATCH 52/86] changed name from dia_model to dia_dir

---
 autotranscript/autotranscript.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 2097f2f..42ed015 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -190,7 +190,7 @@ def cli():
     parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH,
                         help="the path to save model files; uses ./models/whisper by default")
     
-    parser.add_argument("--dia_model", type=str, default = PYANNOTE_DEFAULT_PATH)
+    parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH)
     
     parser.add_argument("--allow_download", type= bool, default=True,
                         help="whether to allow model download if model is not found locally")
@@ -237,7 +237,7 @@ def cli():
     diarisation_kwargs = {"local": local}    
     model = AutoTranscribe(whisper_model= model_name,
                            whisper_kwargs= wkwargs,
-                           dia_model= args.pop("dia_model"),
+                           dia_model= args.pop("dia_dir"),
                            dia_kwargs_kwargs= diarisation_kwargs,)
     
     if task == "transcribe":

From 06804b21b10cf740c062cbdee88b9d337ced12dc Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 19 Jun 2023 16:32:42 +0200
Subject: [PATCH 53/86] renamed wrong keyword dia_kwargs_kwargs to dia_kwargs

---
 autotranscript/autotranscript.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 42ed015..d79b392 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -238,7 +238,7 @@ def cli():
     model = AutoTranscribe(whisper_model= model_name,
                            whisper_kwargs= wkwargs,
                            dia_model= args.pop("dia_dir"),
-                           dia_kwargs_kwargs= diarisation_kwargs,)
+                           dia_kwargs= diarisation_kwargs,)
     
     if task == "transcribe":
         for audio in args.pop("audio"):

From cadeb8784fa9bf39313fb99affd5ed88f2cd4480 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:19:38 +0200
Subject: [PATCH 54/86] changed description

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0c00dad..7517d61 100644
--- a/setup.py
+++ b/setup.py
@@ -33,7 +33,7 @@ setup(
     license='',
     author='Jacob Schmieder',
     author_email='',
-    description='Transcription tool for audio files based on Whisper',
+    description='Transcription tool for audio files based on Whisper and Pyannote',
     entry_points={'console_scripts':
         ['autotranscript = autotranscript.autotranscript:cli']}
 )

From bb73a668011af737014357f57306fefda57aed5e Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:19:54 +0200
Subject: [PATCH 55/86] add example

---
 transcribe.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transcribe.py b/transcribe.py
index 6601707..fca2532 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -4,4 +4,5 @@ model = AutoTranscribe()
 
 text = model.transcribe("tests/test.wav")
 
+print("Transcription:\n")
 print(text)

From 88db803bcb8bb000d1d46bb939734e7cf5ebd16c Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:20:17 +0200
Subject: [PATCH 56/86] added file

---
 autotranscript/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index 4812cc2..e6b02f3 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -1,4 +1,5 @@
 from .autotranscript import *
+from .app.qtfaststart import *
 from .transcriber import *
 from .audio import *
 from .transcript_exporter import *

From 2308a9337ccba81273fe5a38ab28953d87748ce4 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:20:42 +0200
Subject: [PATCH 57/86] changed type of sr

---
 autotranscript/audio.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index 4e7ee60..7944a73 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -1,7 +1,7 @@
 import numpy as np
 import torch
 from subprocess import CalledProcessError, run
-
+from typing import Union
 SAMPLE_RATE = 16000
 
 class AudioProcessor:
@@ -9,7 +9,7 @@ class AudioProcessor:
     Audio Processor using PyTorchaudio instead of PyDub
     """
     
-    def __init__(self, waveform: torch.Tensor, sr : torch.Tensor,
+    def __init__(self, waveform: torch.Tensor, sr : int = SAMPLE_RATE,
                  *args, **kwargs) -> None:
         """
         Initialise audio processor

From d882d80d1d381a2d19882b6d2c93145c15ac0220 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:21:21 +0200
Subject: [PATCH 58/86] added ndarray datatype to input of transcribe

---
 autotranscript/autotranscript.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index d79b392..6f00888 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -10,7 +10,7 @@ from glob import iglob
 from subprocess import run
 from warnings import warn
 import argparse
-
+from numpy import ndarray
 
 diarisation = TypeVar('diarisation')
 
@@ -53,7 +53,7 @@ class AutoTranscribe:
 
         print("AutoTranscribe initialized all models successfully loaded.")
             
-    def transcribe(self, audiofile : Union[str, torch.Tensor],
+    def transcribe(self, audiofile : Union[str, torch.Tensor, ndarray],
                    remove_original : bool = False,
                    *args, **kwargs) -> Transcript:
         """
@@ -140,7 +140,7 @@ class AutoTranscribe:
         
     
     @staticmethod
-    def get_audiofile(audiofile : Union[str, torch.Tensor],
+    def get_audiofile(audiofile : Union[str, torch.Tensor, ndarray],
                         *args, **kwargs) -> AudioProcessor:
         """
         Get audiofile as TorchAudioProcessor
@@ -155,9 +155,12 @@ class AutoTranscribe:
         if isinstance(audiofile, str):
             audiofile = AudioProcessor.from_file(audiofile)   
         
-        if isinstance(audiofile, torch.Tensor):
+        elif isinstance(audiofile, torch.Tensor):
             audiofile = AudioProcessor(audiofile[0], audiofile[1])
-        
+        elif isinstance(audiofile, ndarray):
+            audiofile = AudioProcessor(torch.tensor(audiofile[0]),
+                                       audiofile[1])
+            
         if not isinstance(audiofile, AudioProcessor):
             raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                              f'not {type(audiofile)}')     
@@ -191,9 +194,10 @@ def cli():
                         help="the path to save model files; uses ./models/whisper by default")
     
     parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH)
-    
-    parser.add_argument("--allow_download", type= bool, default=True,
+    parser.add_argument("--htoken", default="", type=str, help="HuggingFace token for private model download")
+    parser.add_argument("--local", type=str2bool, default=False,
                         help="whether to allow model download if model is not found locally")
+    
     parser.add_argument("--device", 
                         default="cuda" if torch.cuda.is_available() else "cpu",
                         help="device to use for PyTorch inference")
@@ -219,11 +223,12 @@ def cli():
     # fmt: on
 
     args = parser.parse_args().__dict__
+
     model_name: str = args.pop("wmodel")
     model_dir: str = args.pop("wmodel_dir")
     output_dir: str = args.pop("output_dir")
     output_format: str = args.pop("output_format")
-    local :str = args.pop("allow_download")
+    local :str = args.pop("local")
     task = args.pop("task")
     device: str = args.pop("device")
     os.makedirs(output_dir, exist_ok=True)
@@ -234,7 +239,10 @@ def cli():
     wkwargs = {"download_root": model_dir,
                "local": local,
                "device": device}
-    diarisation_kwargs = {"local": local}    
+    
+    diarisation_kwargs = {"local": local,
+                          "token" : args.pop("htoken")}  
+    
     model = AutoTranscribe(whisper_model= model_name,
                            whisper_kwargs= wkwargs,
                            dia_model= args.pop("dia_dir"),

From 58a14b2adf84561deddb575e6483c2fb07b17f88 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:22:03 +0200
Subject: [PATCH 59/86] change location of default path variables

---
 autotranscript/diarisation.py | 15 ++++---
 autotranscript/misc.py        | 81 +++++------------------------------
 autotranscript/transcriber.py | 16 +------
 3 files changed, 21 insertions(+), 91 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index ea36b93..1c2e4fb 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -4,14 +4,16 @@ from torch import Tensor
 import os
 from typing import TypeVar, Union
 import json
-
+from .misc import PYANNOTE_DEFAULT_PATH
 Annotation = TypeVar('Annotation') 
 
-PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 
-                                     "models", "pyannote", 
-                                     "speaker_diarization", "config.yaml")
-
 class Diariser:
+    """
+    Diarisation class
+    This class is used to diarize an audio file using a pretrained model
+    from pyannote.audio.
+    :param model: model to use for diarization
+    """
     def __init__(self, model,*args,**kwargs) -> None:
 
         self.model = model
@@ -137,10 +139,11 @@ class Diariser:
         -------
         Pipeline Object
         """
-
+        
         if local:
             diarization_model =  Pipeline.from_pretrained(model,*args, **kwargs)
         else:
+            print("Loading model from HuggingFace")
             if token == "":
                 token = cls._get_token()
             diarization_model =  Pipeline.from_pretrained(model, use_auth_token = token,
diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index 065e45d..716852e 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -4,83 +4,22 @@ from whisper import Whisper, load_model
 import os
 import glob
 from warnings import warn
+import yaml
 
-WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)),
+WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(__file__),
                                      "models", "whisper")
 
-PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.dirname(__file__)), 
+PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), 
                                      "models", "pyannote", 
                                      "speaker_diarization", "config.yaml")
 
-def load_whisper_model(model: str ="medium", local : bool = False, download_root: str = WHISPER_DEFAULT_PATH) -> Whisper:
+
+def config_diarization_yaml(file):
     """
-    Load modules from whisper
-
-    Parameters
-    ----------
-    whisper : str
-        whisper model
-        available models:
-
-            - 'tiny.en'
-            - 'tiny'
-            - 'base.en'
-            - 'base'
-            - 'small.en'
-            - 'small'
-            - 'medium.en'
-            - 'medium'
-            - 'large-v1'
-            - 'large-v2'
-            - 'large' 
-
-    local : bool
-        If true, load from local cache
-
-    download_root : str
-        Path to download the model
-
-        default: /models/whisper
+    Configure diarization pipeline from yaml file to use the model offline
+    and avoid manuel file manipulation.
     
-    Returns
-    -------
-    Whisper Object
+    :param file: yaml file
+    :type file: yaml
     """
-    warn("load_whisper_model is deprecated. Use Transcriptor.load_model() instead.", DeprecationWarning)
-    if local:
-        available_models = [os.path.basename(x) for x in glob.glob(os.path.join(download_root, "*"))]
-        
-        for i, module in enumerate(available_models):
-            available_models[i] = module.split(".")[0]
-        
-        if model not in available_models:
-            raise RuntimeError("Model not found. Consider downloading the model first. By deactivating the local flag, the model will be downloaded automatically.")
-
-    return load_model(model, download_root=download_root)
-
-def load_pyannote_model(model: str = PYANNOTE_DEFAULT_PATH, 
-                        token: str = "",
-                        local : bool = True,
-                        *args, **kwargs) -> Pipeline:
-    """
-    Load modules from pyannote
-
-    Parameters
-    ----------
-    model : str
-        pyannote model 
-        default: /models/pyannote/speaker_diarization/config.yaml
-    token : str
-        HUGGINGFACE_TOKEN
-    local : bool
-        If true, load from local cache
-    
-    Returns
-    -------
-    Pipeline Object
-    """
-    warn("load_pyannote_model is deprecated. Use Diarisation.load_model() instead.", DeprecationWarning)
-    if local:
-        return Pipeline.from_pretrained(model,*args, **kwargs)
-    else:
-        return Pipeline.from_pretrained(model, use_auth_token = token, *args, **kwargs)
+    
\ No newline at end of file
diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 39c0842..82156cf 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -2,24 +2,12 @@ import os
 from whisper import Whisper, load_model
 from typing import TypeVar , Union
 from glob import glob
-
+from .misc import WHISPER_DEFAULT_PATH
 whisper = TypeVar('whisper') 
 Tensor = TypeVar('Tensor')
 nparray = TypeVar('nparray')
 
-def get_whisper_default_path() -> str:
-    """
-    Get default path for whisper models
 
-    Returns
-    -------
-    str
-        path
-    """
-    _path = os.path.dirname(os.path.dirname(__file__))
-    return os.path.join(_path, "models", "whisper")
-
-WHISPER_DEFAULT_PATH = get_whisper_default_path()
 
 class Transcriber:
     def __init__(self, model: whisper ) -> None:
@@ -68,7 +56,7 @@ class Transcriber:
     def load_model(cls,
                     model: str = "medium", 
                     local : bool = True,
-                    download_root: str = WHISPER_DEFAULT_PATH ,
+                    download_root: str = WHISPER_DEFAULT_PATH,
                     *args, **kwargs) -> 'Transcriber':
         """
         Load whisper module

From 9a767228f7cdd7b7c21a3f91dc1e73f986ba0efa Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Tue, 27 Jun 2023 10:22:21 +0200
Subject: [PATCH 60/86] fixed to_txt to write the string returned by __str__()

---
 autotranscript/transcript_exporter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 16d5e09..2615a67 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -160,7 +160,7 @@ class Transcript:
     def to_txt(self, path: str) -> None:
         
        with open(path, "w") as f:
-            f.write(self.__str__, f)
+            f.write(self.__str__())
     
     def to_md(self, path: str) -> None:
         return self.to_html(path)

From de3a6cd4d17a7a9261706ad514a10abaa2d60758 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 28 Jun 2023 15:31:52 +0200
Subject: [PATCH 61/86] added function to control paths to pyannote models

---
 autotranscript/misc.py | 44 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index 716852e..1c14198 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -6,15 +6,15 @@ import glob
 from warnings import warn
 import yaml
 
-WHISPER_DEFAULT_PATH = os.path.join(os.path.dirname(__file__),
-                                     "models", "whisper")
+WHISPER_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__),
+                                     "models", "whisper"))
 
-PYANNOTE_DEFAULT_PATH = os.path.join(os.path.dirname(__file__), 
+PYANNOTE_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), 
                                      "models", "pyannote", 
-                                     "speaker_diarization", "config.yaml")
+                                     "speaker_diarization", "config.yaml"))
 
 
-def config_diarization_yaml(file):
+def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding = None):
     """
     Configure diarization pipeline from yaml file to use the model offline
     and avoid manuel file manipulation.
@@ -22,4 +22,36 @@ def config_diarization_yaml(file):
     :param file: yaml file
     :type file: yaml
     """
-    
\ No newline at end of file
+    with open(file, "r") as stream:
+            yml = yaml.safe_load(stream)
+            stream.close()
+    if path_to_segmentation:
+        yml["pipeline"]["params"]["segmentation"] = path_to_segmentation
+    else:
+        yml["pipeline"]["params"]["segmentation"] = os.path.relpath(os.path.join(
+                                                                    os.path.dirname(__file__),
+                                                                    "models", "pyannote",
+                                                                    "segmentation",
+                                                                    "pytorch_model.bin"))
+                                                 
+    if path_to_embedding:
+        yml["pipeline"]["params"]["embedding"] = path_to_embedding
+    else:
+        yml["pipeline"]["params"]["embedding"] = os.path.relpath(
+                                                            os.path.join(
+                                                            os.path.dirname(__file__),
+                                                            "models", "pyannote",
+                                                            "speechbrain",
+                                                            "spkrec-ecapa-voxceleb",
+                                                            "embedding_model.ckpt"))
+    
+    if not os.path.exists(yml["pipeline"]["params"]["segmentation"]):
+        raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}")
+    
+    if not os.path.exists(yml["pipeline"]["params"]["embedding"]):
+        raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}")
+    
+    with open(file, "w") as stream:
+        yaml.dump(yml, stream)
+        stream.close()
+                                                             

From 11fce3abefc2c5d734b02ea870929de48a0f2f8c Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:41:13 +0200
Subject: [PATCH 62/86] removed kwargs confusions

---
 autotranscript/autotranscript.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 6f00888..9f14886 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -19,8 +19,7 @@ class AutoTranscribe:
     def __init__(self,
                 whisper_model: Union[bool, str, whisper] = None,
                 dia_model : Union[bool, str, diarisation] = None,
-                dia_kwargs : dict = {},
-                whisper_kwargs : dict = {}) -> None:
+                **kwargs) -> None:
         """
         AutoTranscribe class
         
@@ -38,16 +37,16 @@ class AutoTranscribe:
         """
         
         if whisper_model is None:
-            self.transcriber = Transcriber.load_model("medium", local=True)    
+            self.transcriber = Transcriber.load_model("medium")    
         elif isinstance(whisper_model, str):
-            self.transcriber = Transcriber.load_model(whisper_model, **whisper_kwargs)
+            self.transcriber = Transcriber.load_model(whisper_model, **kwargs)
         else:
             self.transcriber = whisper_model
 
         if dia_model is None:
             self.diariser = Diariser.load_model()
         elif isinstance(dia_model, str):
-            self.diariser = Diariser.load_model(dia_model, **dia_kwargs)
+            self.diariser = Diariser.load_model(dia_model, **kwargs)
         else:
             self.diariser = dia_model
 

From cd35ad8903b63353c01145223598ae09fad8d0a8 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:41:43 +0200
Subject: [PATCH 63/86] solved path issues

---
 autotranscript/misc.py | 49 ++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 26 deletions(-)

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index 1c14198..1eaf34f 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -1,4 +1,3 @@
-
 from pyannote.audio import Pipeline
 from whisper import Whisper, load_model
 import os
@@ -6,15 +5,18 @@ import glob
 from warnings import warn
 import yaml
 
-WHISPER_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__),
-                                     "models", "whisper"))
+CACHE_DIR = os.getenv(
+    "AUTOT_CACHE",
+    os.path.expanduser("~/.cache/torch/models"),
+)
 
-PYANNOTE_DEFAULT_PATH = os.path.relpath(os.path.join(os.path.dirname(__file__), 
-                                     "models", "pyannote", 
-                                     "speaker_diarization", "config.yaml"))
+WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper")
 
+PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
 
-def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding = None):
+PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
+
+def config_diarization_yaml(file, path_to_segmentation = None):
     """
     Configure diarization pipeline from yaml file to use the model offline
     and avoid manuel file manipulation.
@@ -28,30 +30,25 @@ def config_diarization_yaml(file, path_to_segmentation = None, path_to_embedding
     if path_to_segmentation:
         yml["pipeline"]["params"]["segmentation"] = path_to_segmentation
     else:
-        yml["pipeline"]["params"]["segmentation"] = os.path.relpath(os.path.join(
-                                                                    os.path.dirname(__file__),
-                                                                    "models", "pyannote",
-                                                                    "segmentation",
-                                                                    "pytorch_model.bin"))
+        yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
                                                  
-    if path_to_embedding:
-        yml["pipeline"]["params"]["embedding"] = path_to_embedding
-    else:
-        yml["pipeline"]["params"]["embedding"] = os.path.relpath(
-                                                            os.path.join(
-                                                            os.path.dirname(__file__),
-                                                            "models", "pyannote",
-                                                            "speechbrain",
-                                                            "spkrec-ecapa-voxceleb",
-                                                            "embedding_model.ckpt"))
+    # if path_to_embedding:
+    #     yml["pipeline"]["params"]["embedding"] = path_to_embedding
+    # else:
+    #     yml["pipeline"]["params"]["embedding"] = os.path.relpath(
+    #                                                         os.path.join(
+    #                                                         os.path.dirname(__file__),
+    #                                                         "models", "pyannote",
+    #                                                         "speechbrain",
+    #                                                         "spkrec-ecapa-voxceleb",
+    #                                                         "embedding_model.ckpt"))
     
     if not os.path.exists(yml["pipeline"]["params"]["segmentation"]):
         raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}")
     
-    if not os.path.exists(yml["pipeline"]["params"]["embedding"]):
-        raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}")
+    # if not os.path.exists(yml["pipeline"]["params"]["embedding"]):
+    #     raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}")
     
     with open(file, "w") as stream:
         yaml.dump(yml, stream)
-        stream.close()
-                                                             
+        stream.close()                               

From 38d1f8f6682b11d1fe3cb563d235a0fa0b9003fe Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:44:10 +0200
Subject: [PATCH 64/86] removed kwargs confusion

---
 autotranscript/transcriber.py | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 82156cf..0cd42bf 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -1,6 +1,7 @@
 import os
 from whisper import Whisper, load_model
-from typing import TypeVar , Union
+from typing import TypeVar , Union , Optional
+import torch
 from glob import glob
 from .misc import WHISPER_DEFAULT_PATH
 whisper = TypeVar('whisper') 
@@ -17,7 +18,7 @@ class Transcriber:
         """
         self.model = model
 
-    def transcribe(self, audio : Union[str, Tensor, nparray]  ,
+    def transcribe(self, audio : Union[str, Tensor, nparray] ,
                    *args, **kwargs) -> str:
         """
         transcribe audio file
@@ -55,9 +56,10 @@ class Transcriber:
     @classmethod
     def load_model(cls,
                     model: str = "medium", 
-                    local : bool = True,
                     download_root: str = WHISPER_DEFAULT_PATH,
-                    *args, **kwargs) -> 'Transcriber':
+                    device: Optional[Union[str, torch.device]] = None,
+                    in_memory: bool = False,
+                    ) -> 'Transcriber':
         """
         Load whisper module
 
@@ -92,20 +94,9 @@ class Transcriber:
         Whisper Object
         """
 
-        if local:
-            
-            available_models = [os.path.basename(x) for x in 
-                                glob(os.path.join(download_root, "*"))]
-            
-            for i, module in enumerate(available_models):
-                available_models[i] = module.split(".")[0]
-            
-            if model not in available_models:
-                raise RuntimeError("Model not found. Consider downloading the "/
-                                   "model first. By deactivating the local flag, " /
-                                    "the model will be downloaded automatically.")
 
-        _model = load_model(model, download_root=download_root, *args, **kwargs)
+        _model = load_model(model, download_root=download_root,
+                            device=device, in_memory=in_memory)
 
         return cls(_model)
 

From 907913f2bfa1cc342642db2fa90e9c65c55ecfd1 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:44:39 +0200
Subject: [PATCH 65/86] fixed kwargs confusion and resolved path issues

---
 autotranscript/diarisation.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 1c2e4fb..bb364e9 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -2,9 +2,10 @@ from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from torch import Tensor
 import os
+from pathlib import Path
 from typing import TypeVar, Union
 import json
-from .misc import PYANNOTE_DEFAULT_PATH
+from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH
 Annotation = TypeVar('Annotation') 
 
 class Diariser:
@@ -118,10 +119,12 @@ class Diariser:
         return token
     
     @classmethod
-    def load_model(cls, model: str = PYANNOTE_DEFAULT_PATH, 
-                        token: str = "",
-                        local : bool = True,
-                        *args, **kwargs) -> Pipeline:
+    def load_model(cls, 
+                    model: str = PYANNOTE_DEFAULT_CONFIG, 
+                    token: str = None,
+                    cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH,
+                    hparams_file: Union[str, Path] = None
+                    ) -> Pipeline:
         """
         Load modules from pyannote
 
@@ -139,17 +142,15 @@ class Diariser:
         -------
         Pipeline Object
         """
-        
-        if local:
-            diarization_model =  Pipeline.from_pretrained(model,*args, **kwargs)
-        else:
-            print("Loading model from HuggingFace")
-            if token == "":
-                token = cls._get_token()
-            diarization_model =  Pipeline.from_pretrained(model, use_auth_token = token,
-                                                           *args, **kwargs)
-        
-        return cls(diarization_model)
+        if not os.path.exists(model) and token is None:
+            token = cls._get_token()
+            
+        _model =  Pipeline.from_pretrained(model,
+                                           use_auth_token = token,
+                                           cache_dir = cache_dir,
+                                           hparams_file = hparams_file,)
+
+        return cls(_model)
 
     @staticmethod
     def _get_diarisation_kwargs(**kwargs) -> dict:

From 4bf98621d68203defa8540ae4440d3eeaaf0e647 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:46:33 +0200
Subject: [PATCH 66/86] add webapp

---
 app.py                            | 101 ++++++++++
 autotranscript/app/qtfaststart.py | 319 ++++++++++++++++++++++++++++++
 2 files changed, 420 insertions(+)
 create mode 100644 app.py
 create mode 100644 autotranscript/app/qtfaststart.py

diff --git a/app.py b/app.py
new file mode 100644
index 0000000..3645d79
--- /dev/null
+++ b/app.py
@@ -0,0 +1,101 @@
+from dash import Dash, dcc, html, dash_table, Input, Output, State, callback
+
+import base64
+from autotranscript.app.qtfaststart import process
+from autotranscript import AutoTranscribe
+import io
+import subprocess as sp
+import numpy as np
+from autotranscript.audio import SAMPLE_RATE
+
+# Setup auto-transcript
+autot = AutoTranscribe() # whisper_model="tiny", whisper_kwargs={"local" : False}
+
+# Setup FFmpeg
+PROBLEMATIC_FILE_TYPES : tuple = "mov","mp4","m4a","3gp","3g2","mj2"
+
+
+# Setup Dash
+external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
+
+app = Dash(__name__, external_stylesheets=external_stylesheets)
+
+app.layout = html.Div([
+    dcc.Upload(
+        id='upload-data',
+        children=html.Div([
+            'Drag and Drop or ',
+            html.A('Select Files')
+        ]),
+        style={
+            'width': '100%',
+            'height': '60px',
+            'lineHeight': '60px',
+            'borderWidth': '1px',
+            'borderStyle': 'dashed',
+            'borderRadius': '5px',
+            'textAlign': 'center',
+            'margin': '10px'
+        },
+        # Allow multiple files to be uploaded
+        multiple=True
+    ),
+    html.Div(id='output-data-upload'),
+])
+
+def parse_contents(contents, filename, date):
+    content_type, content_string = contents.split(',')
+
+    decoded = base64.b64decode(content_string)
+    file = io.BytesIO(decoded).read()
+    
+    if filename.endswith(PROBLEMATIC_FILE_TYPES):
+        # mp4 and other files need to be processed with qtfaststart
+        # since their metadata is at the end of the file
+        # and we need it at the beginning
+        file = process(file) 
+
+    cmd = [
+            "ffmpeg",
+            "-nostdin",
+            "-threads", "0",
+            "-i",'pipe:',
+            "-f", "s16le",
+            '-hide_banner',
+            '-loglevel', 'error',
+            "-c", "copy",
+            "-vn",
+            "-ac", "1",
+            "-acodec", "pcm_s16le",
+            "-ar", str(SAMPLE_RATE),
+            "-"
+        ]
+    
+    proc = sp.Popen(cmd, stdout=sp.PIPE, stdin=sp.PIPE)
+    
+    out = proc.communicate(input=file)[0]
+    out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+    out = np.array([out, SAMPLE_RATE])
+    
+    transcript = str(autot.transcribe(out))
+    
+    return html.Div([
+        html.H5(f"File Name: {filename} \n" \
+                "Transcript: \n"
+                ),
+        html.P(transcript)
+    ])
+
+@callback(Output('output-data-upload', 'children'),
+              Input('upload-data', 'contents'),
+              State('upload-data', 'filename'),
+              State('upload-data', 'last_modified'))
+def update_output(list_of_contents, list_of_names, list_of_dates):
+    if list_of_contents is not None:
+        children = [
+            parse_contents(c, n, d) for c, n, d in
+            zip(list_of_contents, list_of_names, list_of_dates)]
+        return children
+
+if __name__ == '__main__':
+    app.run_server()
diff --git a/autotranscript/app/qtfaststart.py b/autotranscript/app/qtfaststart.py
new file mode 100644
index 0000000..e57eb20
--- /dev/null
+++ b/autotranscript/app/qtfaststart.py
@@ -0,0 +1,319 @@
+"""
+This file contains a modified version of qtfaststart by Daniel G. Taylor
+https://github.com/danielgtaylor/qtfaststart/tree/master
+
+All credit goes to the original author.
+Copyright (C) 2008 - 2013 Daniel G. Taylor <dan@programmer-art.org>
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 
+Software, and to permit persons to whom the Software is furnished to do so, 
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies
+or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
+IN THE SOFTWARE.
+"""
+
+import logging
+import os
+import struct
+import collections
+import io
+
+# define error classes
+class FastStartException(Exception):
+    """
+    Raised when something bad happens during processing.
+    """
+    pass
+
+class FastStartSetupError(FastStartException):
+    """
+    Raised when asked to process a file that does not need processing
+    """
+    pass
+
+class MalformedFileError(FastStartException):
+    """
+    Raised when the input file is setup in an unexpected way
+    """
+    pass
+
+class UnsupportedFormatError(FastStartException):
+    """
+    Raised when a movie file is recognized as a format not supported.
+    """
+    pass
+
+# define constants
+CHUNK_SIZE = 8192
+
+log = logging.getLogger("qtfaststart")
+
+# Older versions of Python require this to be defined
+if not hasattr(os, 'SEEK_CUR'):
+    os.SEEK_CUR = 1
+
+Atom = collections.namedtuple('Atom', 'name position size')
+
+def read_atom(datastream):
+    """
+        Read an atom and return a tuple of (size, type) where size is the size
+        in bytes (including the 8 bytes already read) and type is a "fourcc"
+        like "ftyp" or "moov".
+    """
+    size, type = struct.unpack(">L4s", datastream.read(8))
+    type = type.decode('ascii')
+    return size, type
+
+
+def _read_atom_ex(datastream):
+    """
+    Read an Atom from datastream
+    """
+    pos = datastream.tell()
+    atom_size, atom_type = read_atom(datastream)
+    if atom_size == 1:
+        atom_size, = struct.unpack(">Q", datastream.read(8))
+    return Atom(atom_type, pos, atom_size)
+
+
+def get_index(datastream):
+    """
+        Return an index of top level atoms, their absolute byte-position in the
+        file and their size in a list:
+
+        index = [
+            ("ftyp", 0, 24),
+            ("moov", 25, 2658),
+            ("free", 2683, 8),
+            ...
+        ]
+
+        The tuple elements will be in the order that they appear in the file.
+    """
+    log.debug("Getting index of top level atoms...")
+
+    index = list(_read_atoms(datastream))
+    _ensure_valid_index(index)
+
+    return index
+
+
+def _read_atoms(datastream):
+    """
+    Read atoms until an error occurs
+    """
+    while datastream:
+        try:
+            atom = _read_atom_ex(datastream)
+            log.debug("%s: %s" % (atom.name, atom.size))
+        except:
+            break
+
+        yield atom
+
+        if atom.size == 0:
+            if atom.name == "mdat":
+                # Some files may end in mdat with no size set, which generally
+                # means to seek to the end of the file. We can just stop indexing
+                # as no more entries will be found!
+                break
+            else:
+                # Weird, but just continue to try to find more atoms
+                continue
+
+        datastream.seek(atom.position + atom.size)
+
+
+def _ensure_valid_index(index):
+    """
+    Ensure the minimum viable atoms are present in the index.
+
+    Raise FastStartException if not.
+    """
+    top_level_atoms = set([item.name for item in index])
+    for key in ["moov", "mdat"]:
+        if key not in top_level_atoms:
+            log.error("%s atom not found, is this a valid MOV/MP4 file?" % key)
+            raise FastStartException()
+
+
+def find_atoms(size, datastream):
+    """
+    Compatibility interface for _find_atoms_ex
+    """
+    fake_parent = Atom('fake', datastream.tell()-8, size+8)
+    for atom in _find_atoms_ex(fake_parent, datastream):
+        yield atom.name
+
+
+def _find_atoms_ex(parent_atom, datastream):
+    """
+        Yield either "stco" or "co64" Atoms from datastream.
+        datastream will be 8 bytes into the stco or co64 atom when the value
+        is yielded.
+
+        It is assumed that datastream will be at the end of the atom after
+        the value has been yielded and processed.
+
+        parent_atom is the parent atom, a 'moov' or other ancestor of CO
+        atoms in the datastream.
+    """
+    stop = parent_atom.position + parent_atom.size
+
+    while datastream.tell() < stop:
+        try:
+            atom = _read_atom_ex(datastream)
+        except:
+            log.exception("Error reading next atom!")
+            raise FastStartException()
+
+        if atom.name in ["trak", "mdia", "minf", "stbl"]:
+            # Known ancestor atom of stco or co64, search within it!
+            for res in _find_atoms_ex(atom, datastream):
+                yield res
+        elif atom.name in ["stco", "co64"]:
+            yield atom
+        else:
+            # Ignore this atom, seek to the end of it.
+            datastream.seek(atom.position + atom.size)
+
+
+def process(infilename, limit=float('inf')):
+    """
+        Convert a Quicktime/MP4 file for streaming by moving the metadata to
+        the front of the file. The converted data is returned as bytes.
+
+        If limit is set to something other than zero it will be used as the
+        number of bytes to write of the atoms following the moov atom. This
+        is very useful to create a small sample of a file with full headers,
+        which can then be used in bug reports and such.
+    """
+    if isinstance(infilename, str):
+        datastream = open(infilename, "rb")
+    elif isinstance(infilename, bytes):
+        datastream = io.BytesIO(infilename)
+    else:
+        raise TypeError("infilename must be a filename, bytes or file-like object")
+    # Get the top level atom index
+    index = get_index(datastream)
+
+    mdat_pos = 999999
+    free_size = 0
+
+    # Make sure moov occurs AFTER mdat, otherwise no need to run!
+    for atom in index:
+        # The atoms are guaranteed to exist from get_index above!
+        if atom.name == "moov":
+            moov_atom = atom
+            moov_pos = atom.position
+        elif atom.name == "mdat":
+            mdat_pos = atom.position
+        elif atom.name == "free" and atom.position < mdat_pos:
+            # This free atom is before the mdat!
+            free_size += atom.size
+            log.info("Removing free atom at %d (%d bytes)" % (atom.position, atom.size))
+        elif atom.name == "\x00\x00\x00\x00" and atom.position < mdat_pos:
+            # This is some strange zero atom with incorrect size
+            free_size += 8
+            log.info("Removing strange zero atom at %s (8 bytes)" % atom.position)
+
+    # Offset to shift positions
+    offset = moov_atom.size - free_size
+
+    if moov_pos < mdat_pos:
+        # moov appears to be in the proper place, don't shift by moov size
+        offset -= moov_atom.size
+        if not free_size:
+            # No free atoms and moov is correct, we are done!
+            log.error("This file appears to already be setup for streaming!")
+            # Stupid hack to return the non-processed file:
+            if isinstance(infilename, str):
+                return open(infilename, "rb").read()
+            elif isinstance(infilename, bytes):
+                return io.BytesIO(infilename).read()
+            
+    # Read and fix moov
+    moov = _patch_moov(datastream, moov_atom, offset)
+
+    log.info("Writing output...")
+    outfile = b''
+
+    # Write ftype
+    for atom in index:
+        if atom.name == "ftyp":
+            log.debug("Writing ftyp... (%d bytes)" % atom.size)
+            datastream.seek(atom.position)
+            outfile += datastream.read(atom.size)
+
+    # Write moov
+    _bytes = moov.getvalue()
+    log.debug("Writing moov... (%d bytes)" % len(_bytes))
+    outfile += _bytes
+
+    # Write the rest
+    atoms = [item for item in index if item.name not in ["ftyp", "moov", "free"]]
+    for atom in atoms:
+        log.debug("Writing %s... (%d bytes)" % (atom.name, atom.size))
+        datastream.seek(atom.position)
+
+        # for compatibility, allow '0' to mean no limit
+        cur_limit = limit or float('inf')
+        cur_limit = min(cur_limit, atom.size)
+
+        for chunk in get_chunks(datastream, CHUNK_SIZE, cur_limit):
+            outfile += chunk
+
+    return outfile
+
+
+def _patch_moov(datastream, atom, offset):
+    datastream.seek(atom.position)
+    moov = io.BytesIO(datastream.read(atom.size))
+
+    # reload the atom from the fixed stream
+    atom = _read_atom_ex(moov)
+
+    for atom in _find_atoms_ex(atom, moov):
+        # Read either 32-bit or 64-bit offsets
+        ctype, csize = dict(
+            stco=('L', 4),
+            co64=('Q', 8),
+        )[atom.name]
+
+        # Get number of entries
+        version, entry_count = struct.unpack(">2L", moov.read(8))
+
+        log.info("Patching %s with %d entries" % (atom.name, entry_count))
+
+        entries_pos = moov.tell()
+
+        struct_fmt = ">%(entry_count)s%(ctype)s" % vars()
+
+        # Read entries
+        entries = struct.unpack(struct_fmt, moov.read(csize * entry_count))
+
+        # Patch and write entries
+        offset_entries = [entry + offset for entry in entries]
+        moov.seek(entries_pos)
+        moov.write(struct.pack(struct_fmt, *offset_entries))
+    return moov
+
+def get_chunks(stream, chunk_size, limit):
+    remaining = limit
+    while remaining:
+        chunk = stream.read(min(remaining, chunk_size))
+        if not chunk:
+            return
+        remaining -= len(chunk)
+        yield chunk

From f51eb5815dba6859a2e92c99f3c2d6b4335596c0 Mon Sep 17 00:00:00 2001
From: Jacob Schmieder <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:47:53 +0200
Subject: [PATCH 67/86] Delete autotranscript/__pycache__ directory

---
 .../__pycache__/__init__.cpython-39.pyc          | Bin 207 -> 0 bytes
 .../__pycache__/__main__.cpython-39.pyc          | Bin 3877 -> 0 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 autotranscript/__pycache__/__init__.cpython-39.pyc
 delete mode 100644 autotranscript/__pycache__/__main__.cpython-39.pyc

diff --git a/autotranscript/__pycache__/__init__.cpython-39.pyc b/autotranscript/__pycache__/__init__.cpython-39.pyc
deleted file mode 100644
index 04235a59ba7faa9afadc9a70dcffc98425d2d511..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 207
zcmYe~<>g`kf~*w_ldXXCV-N=!FabFZKwK;WBvKes7;_jxAT%S8W(sD|WPS;hVbEl}
z#i-?{$#jdWq$n}3IJqb@DK$zM!Pbk9&rQtCi;rK)P{aaM4<>$D>1X8Urs`)UCg&&V
z2UI3!Bo^fc6y;~7CYKcJhi7CK7o-;HBMi}xkI&4@EQycTE2zB1VUwGmQks)$#|Sj8
J801b4MgTh4HDv$*

diff --git a/autotranscript/__pycache__/__main__.cpython-39.pyc b/autotranscript/__pycache__/__main__.cpython-39.pyc
deleted file mode 100644
index d64ee0a076cccf9df024581a10a7941259f67616..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3877
zcmd59O>ZMfwW`0|>Dc-HAcGj1IpJZH%nFuWq8+qLhFwn9NR!<at%X*{RhhJDcYCYK
z$)vF_WH=&n;I<61PiX#9U%4=MgwO~yyjN|v?QHhMg|_O|=X>?)eLHG2JObaRe;v0U
zZ4mM|oXmeFOg@Are+GaNMiUZXZzQx!14_#^BQRjiOw41PTZz@R1Dldh2(y{<f-vXG
z2pmSAl4k8S&>^iREi4sxIrQ{C0Q=K4&eAsS&3`sbK7=N}0KiE=8G)O&xXCSUbLZ6D
zBhcH-`m?ETvV$6{G4}-tT(-?Tw)}zyORUZsu;Q^5whH|+TVw0c*VzW!gubEq+9|wa
z5v8&%;uGE~oGxc^zgsw;aM6v@S7xyk^;w*q#tAR%ZpL_0xJi^g>qpOc+XC*UAC8gj
zBWUt5fB_wlkPJ+KFd7+S@`VLsoMUf8b5xDdD-fn~M@s`^j2IrQ8Mwww5Sx|%%k!t-
zQ4&e%tISsw$G+yyk5cBJ$Bbt_GAjLxSe^OmjQgLT#j?kRuQ{(M9rhv-bxT}CIGm8d
zv7ZS)iKX(h)45f76IT;FEa@UkeaZb(K%KJIQ=r$INK?Tb`qL5Kf_#?G!22tlypd!P
zi_>T2C3#(6>Ej}2t>&^=0wIfK0L5-w@JR9SG)ov4p;V&ZR(-*X<#t~Po~mA?&L*1F
z!}SF=^^#ZVSr&Fr#HlK37p3$94=)Y%{Q}*lX%&{_$!W;~u4q195wQj1H}4*tWnF&I
ziQ3u8!IR7OStPnoMAqSLB@ZSVIY18D2VvNa;xr8RdzXb9hH)CJFvN#Rq(ofDaF(3^
zIMl+Q;nF(IH;*gr?@uw>6i-h+v_c1(w#2`{K#>t0lOa{4V~k7<tpR0p3>GI3m0j8!
zoN^ZTz%CatOpTEMhNZ1`9z*X<FecQ8Mnw;Hhj8<#hE_g={ec0VVVzoF<97b1O6f;h
zkzh%vNWW5~Hm#YdFKDkcd9NaIC^+aoJ@oxK<v%Db3^n~=(MHRqner<-Vhu!>-_abQ
z70!-K*b5p9eB|G$jxMZEPB-1c$YkL><EnkZ3QHTE!=Hnz$-?SnaVpl~!ouxFzkwJe
zWKlymgi+HLo3JAq2v!iRB0%=Ey_H}ar*#1LpvfHo#Bq%s>e4N$p|L_&4A02#&Dqa4
z_|0(}!~gWd$X0`<?djJrP!QX$=+GRP9Xc|U*|A18*b^j)-y3_7B6i7`jv;~#4Dqot
zN6tXcAg-AM`W2*!m!`I!Tk+5yTCe~8_U+r;+VA#$m^+{8xR$%V;N9$;v!)v~^xDS|
zEsm177uZ@g!Rm#GAqZ4@61%{w*hBDb0EJT$&|+>{TBe1CAt$JpJE0Dtfz<}0HzZrE
z=)F*7CH?n-phRgC5U>x<>CrWs@6CyJ@%(~=x`2j0t_AwUfK)*`kU#rW{1Zpe=s6Mh
zmC-RrkSaQmDmnxlh^DTBY<fk1M+bBSX<|%8OSw#o6H?3)3L4zpW#C|6k(WA+DDR34
z4WO{xse_V`R)aZzFu@&NR>R$a@ht*saVKh70oto<X-t_1$|P(Vm3l>4eU}Wail?=K
z1svIGePFZ3mGQ^BLucTOHU=hJ*#q@c@;TsI@Vhc8N!Q*X^6w7`)xTc?yryXjUj?s*
zv0PJ|I8$3!#<d|n0M1s+w5YaW9bOFlt-;8KSCz)&vUQy#H-<Gl@tFMTAMo;-#2~m0
zD=A%5k^W!HXX71mZAfokLW3IHe2)xk${p0;^;>h2*nX+Q?=}7M@ysGXW;<FU1YBsh
zm2Z87b&C#;ewrzWLj9C!pZZb0ca){)Tqq1{+D<WpYro0A`&30jRUz^OkIX0?`o~S9
zr~&RtCb>>}`JJiJ>y!@{rm;NZ%#TwaoL2I7mNNN<9=ryhek<;R-2)fuxhm)d)PnrE
z%J*B7+e>btU}?vvF_5^u6v=ny$^}_IOy73z7T%*InLjNsQNBiyTUdMK8)f>KJ(#n6
z=k^yhcglRP1MgyN(P6ppK8un*|G5yEC|nM&#xhKY8}U83QGlltT|X3_s&aT)!I+Th
z3S0J)Sn23qSXqy!g{?zc;l(merGkQi7ZzrP!c2JDbj!Sep(Jpi>Vdi%gPiOqDp&>y
zV{A*KE)QH8opYUV=xIUY8?jB?2Lc~KlfM8!3=gV|EyIQ0r5<Dr2WC6QU1QCt(}(m9
ztYU+Zr&lH?==8)sR4Kmc8u|8AiYI)3_S}U)h<ZSw=s<{35uB8a(SRxZQ_y_9qU9s8
z9GWFMtd-iJ+AzF?b%`{{&C00U|Jr?3CPPbPP;qxK_+fy=AOqET$AI2?S$j%e*{%6k
ze80*pk8<}xS(QG@mmgHc?4vidDexKMT?AieS@50+!{5T@?JSr3{|n3FJ-qoFd8mb0
z9Eag>j)jdmJx#WoW=miNDNtBVdfZ$=fukvfVc|i|nz4R@<9Zl=*^iQPM_{2)SSMMQ
zh#w&KLj+if=(?oz0kk#oV+21zaEM?gC_U1Z3>LBY42cDgOYiQvJKmD}z6*9KaIaYt
z@V9m1Q|OGr3RDL%?_KUg^`yW99d+4*=%u;RD7W2HxF3{{`6=3v#8MuMY!`lY<G;i{
B*bo2!


From deb3e8bc8ad90dc6b3b00d769b9fec6bbde3283d Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:50:17 +0200
Subject: [PATCH 68/86] updated requirements

---
 requirements.txt | 171 ++++++-----------------------------------------
 1 file changed, 20 insertions(+), 151 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 619d0c4..ecfbf11 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,152 +1,21 @@
-absl-py==1.3.0
-aiohttp==3.8.3
-aiosignal==1.3.1
-alembic==1.9.1
-antlr4-python3-runtime==4.9.3
-appdirs==1.4.4
-asteroid-filterbanks==0.4.0
-async-timeout==4.0.2
-attrs==22.2.0
-audioread==3.0.0
-autopage==0.5.1
-backports.cached-property==1.0.2
-brotlipy==0.7.0
-cachetools==5.2.0
-certifi==2023.5.7
-cffi==1.15.1
-charset-normalizer==2.1.1
-click==8.1.3
-cliff==4.1.0
-cmaes==0.9.0
-cmake==3.26.4
-cmd2==2.4.2
-colorama==0.4.6
-colorlog==6.7.0
-commonmark==0.9.1
-contourpy==1.0.6
-cryptography==39.0.1
-cycler==0.11.0
-decorator==4.4.2
-docopt==0.6.2
-einops==0.3.2
-ffmpeg-python==0.2.0
-filelock==3.8.0
-flit_core==3.8.0
-fonttools==4.38.0
-frozenlist==1.3.3
-fsspec==2022.11.0
-future==0.18.2
-google-auth==2.15.0
-google-auth-oauthlib==0.4.6
-greenlet==2.0.1
-grpcio==1.51.1
-hmmlearn==0.2.8
-huggingface-hub==0.11.0
-HyperPyYAML==1.1.0
-idna==3.4
-imageio==2.23.0
-imageio-ffmpeg==0.4.7
-importlib-metadata==4.13.0
-joblib==1.2.0
-julius==0.2.7
-kiwisolver==1.4.4
-librosa==0.9.2
-lit==16.0.5.post0
-llvmlite==0.39.1
-Mako==1.2.4
-Markdown==3.4.1
-MarkupSafe==2.1.1
-matplotlib==3.6.2
-mkl-fft==1.3.1
-mkl-random==1.2.2
-mkl-service==2.4.0
-more-itertools==9.0.0
-moviepy==1.0.3
-mpmath==1.2.1
-multidict==6.0.4
-networkx==2.8.8
-numba==0.56.4
-numpy==1.23.5
-oauthlib==3.2.2
-omegaconf==2.3.0
 openai-whisper==20230314
-optuna==3.0.5
-packaging==21.3
-pandas==1.5.2
-pbr==5.11.0
-Pillow==9.4.0
-pip==23.0.1
-pooch==1.6.0
-prettytable==3.5.0
-primePy==1.3
-proglog==0.1.10
-protobuf==3.20.1
-pyannote.audio==2.1.1
-pyannote.core==4.5
-pyannote.database==4.1.3
-pyannote.metrics==3.2.1
-pyannote.pipeline==2.3
-pyasn1==0.4.8
-pyasn1-modules==0.2.8
-pycparser==2.21
-pyDeprecate==0.3.2
-pydub==0.25.1
-Pygments==2.13.0
-pyOpenSSL==23.0.0
-pyparsing==3.0.9
-pyperclip==1.8.2
-PySocks==1.7.1
-python-dateutil==2.8.2
-pytorch-lightning==1.6.5
-pytorch-metric-learning==1.6.3
-pytz==2022.7
-PyYAML==6.0
-regex==2022.10.31
-requests==2.28.1
-requests-oauthlib==1.3.1
-resampy==0.4.2
-rich==12.6.0
-rsa==4.9
-ruamel.yaml==0.17.21
-ruamel.yaml.clib==0.2.7
-scikit-learn==1.2.0
-scipy==1.8.1
-semantic-version==2.10.0
-semver==2.13.0
-sentencepiece==0.1.97
-setuptools==65.6.3
-setuptools-rust==1.5.2
-shellingham==1.5.0
-simplejson==3.18.0
-singledispatchmethod==1.0
-six==1.16.0
-sortedcontainers==2.4.0
-SoundFile==0.10.3.post1
-speechbrain==0.5.13
-SQLAlchemy==1.4.45
-stevedore==4.1.1
-sympy==1.11.1
-tabulate==0.9.0
-tensorboard==2.11.0
-tensorboard-data-server==0.6.1
-tensorboard-plugin-wit==1.8.1
-threadpoolctl==3.1.0
-tiktoken==0.3.1
-tokenizers==0.13.2
-torch==1.11.0
-torch-audiomentations==0.11.0
-torch-pitch-shift==1.2.2
-torchaudio==0.11.0
-torchmetrics==0.11.0
-torchvision==0.12.0
-tqdm==4.65.0
-transformers==4.24.0
-triton==2.0.0
-typer==0.7.0
-typing_extensions==4.4.0
-urllib3==1.26.15
-wcwidth==0.2.5
-Werkzeug==2.2.2
-wheel==0.38.4
-yarl==1.8.2
-zipp==3.11.0
+
+pyannote.audio~=2.1.1
+pyannote.core~=4.5
+pyannote.database~=4.1.3
+pyannote.metrics~=3.2.1
+pyannote.pipeline~=2.3
+
+setuptools~=65.6.3
+setuptools-rust~=1.5.2
+
+torch~=1.11.0
+torchaudio~=0.11.0
+torchmetrics~=0.11.0
+torchvision~=0.12.0
+tqdm>=4.65.0
+
+#optional:
+#dash~=2.10.2
+
+

From 9c78cdd230b737203766aacbace94368171450bc Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 30 Jun 2023 18:53:47 +0200
Subject: [PATCH 69/86] updated file

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index ecfbf11..433b3c1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,7 +15,7 @@ torchmetrics~=0.11.0
 torchvision~=0.12.0
 tqdm>=4.65.0
 
-#optional:
+#optional: 
 #dash~=2.10.2
 
 
From 22b5b28f2115744ccbc1cda9b8fcee41e261ce35 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 7 Jul 2023 12:57:31 +0200
Subject: [PATCH 70/86] updated Readme

---
 README.md | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 47 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 999dba3..8ffe9d1 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,47 @@
-# transcriptor
\ No newline at end of file
+
+# `AutoTranscript`: Fully Automated Transcription using AI 
+
+`AutoTranscript` is a [PyTorch](https://pytorch.org/)-based interface for fully automated transcription, including speaker diarization. It combines the following AI models:
+
+- [whisper](https://github.com/openai/whisper): a general-purpose speech recognition model
+- [pyannote-audio](https://github.com/pyannote/pyannote-audio): an open-source toolkit for speaker diarization
+
+`AutoTranscript` can therefore be used as a command-line interface, as a web server, or as a Python API.
+
+## Setup: 
+This project uses Python 3.9 and [PyTorch](https://pytorch.org/) version 1.11.0.
+
+The following command will pull and install the latest commit from this repository, along with its Python dependencies.
+
+    pip install git+https://github.com/JSchmie/autotranscript.git
+  
+## Example Python usage
+
+```python
+from autotranscript import AutoTranscribe
+
+model = AutoTranscribe()
+
+text = model.transcribe("audio.wav")
+
+print(f"Transcription: \n{text}")
+
+```
+
+## Command-line usage
+
+If you do not want to control the transcription from Python, you can also use the command line:
+
+	autotranscript audio.wav
+
+Run the following to view all available options:
+		
+	autotranscript -h
+
+
+## License 
+
+## Citation
+
+
+

From abd733b2aeb4ef08a30655bd0556c48c869aca73 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Fri, 7 Jul 2023 12:57:47 +0200
Subject: [PATCH 71/86] updated setup.py

---
 setup.py | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/setup.py b/setup.py
index 7517d61..e7da608 100644
--- a/setup.py
+++ b/setup.py
@@ -15,25 +15,28 @@ version = {"__file__": verfile}
 with open(verfile, "r") as fp:
     exec(fp.read(), version)
 
+
 ############### setup ###############
 
 build_version = "AUTOTRANSCRIPT_BUILD" in os.environ
 
-setup(
-    name=module_name,
-    version=version["get_version"](build_version),
-    packages=find_packages(),
-    python_requires="~=3.9",
-    readme="README.md",
-    install_requires = [str(r) for r in pkg_resources.parse_requirements(
-            open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
-        )
-    ],
-    url= github_url,
-    license='',
-    author='Jacob Schmieder',
-    author_email='',
-    description='Transcription tool for audio files based on Whisper and Pyannote',
-    entry_points={'console_scripts':
-        ['autotranscript = autotranscript.autotranscript:cli']}
-)
+if __name__ == "__main__":
+
+    setup(
+        name=module_name,
+        version=version["get_version"](build_version),
+        packages=find_packages(),
+        python_requires="~=3.9",
+        readme="README.md",
+        install_requires = [str(r) for r in pkg_resources.parse_requirements(
+                open(os.path.join(os.path.dirname(__file__), "requirements.txt"))
+            )
+        ],
+        url= github_url,
+        license='',
+        author='Jacob Schmieder',
+        author_email='',
+        description='Transcription tool for audio files based on Whisper and Pyannote',
+        entry_points={'console_scripts':
+            ['autotranscript = autotranscript.autotranscript:cli']}
+    )

From a71475c3eba9afe0dd87d07dbff6607dd14bb69e Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:27:54 +0200
Subject: [PATCH 72/86] updated diarisation file to better handle tokens

---
 autotranscript/__init__.py    |  2 +-
 autotranscript/diarisation.py | 77 ++++++++++++++++++++++-------------
 2 files changed, 50 insertions(+), 29 deletions(-)

diff --git a/autotranscript/__init__.py b/autotranscript/__init__.py
index e6b02f3..20bcc93 100644
--- a/autotranscript/__init__.py
+++ b/autotranscript/__init__.py
@@ -6,5 +6,5 @@ from .transcript_exporter import *
 from .diarisation import *
 from .version import get_version as _get_version
 from .misc import *
-
+ 
 __version__ = _get_version()
diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index bb364e9..5359e3e 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,13 +1,21 @@
-from pyannote.audio import Pipeline
-from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
-from torch import Tensor
+"""
+Diarisation class.
+This class is used to diarize an audio file using a pretrained model
+"""
 import os
 from pathlib import Path
 from typing import TypeVar, Union
-import json
+
+from pyannote.audio import Pipeline
+from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
+from torch import Tensor
+
 from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH
 Annotation = TypeVar('Annotation') 
 
+TOKEN_PATH = os.path.join(os.path.dirname(
+            os.path.realpath(__file__)), '.pyannotetoken')
+
 class Diariser:
     """
     Diarisation class
@@ -15,7 +23,7 @@ class Diariser:
     from pyannote.audio.
     :param model: model to use for diarization
     """
-    def __init__(self, model,*args,**kwargs) -> None:
+    def __init__(self, model) -> None:
 
         self.model = model
 
@@ -29,7 +37,7 @@ class Diariser:
         :return: diarization
         """
         kwargs = self._get_diarisation_kwargs(**kwargs)
-        
+            
         diarization = self.model(audiofile,*args, **kwargs)
 
         out = self.format_diarization_output(diarization)
@@ -52,7 +60,7 @@ class Diariser:
         index_start_speaker = 0
         index_end_speaker = 0
         current_speaker = str()
-        
+       
         ###
         # Sometimes two consecutive speakers are the same
         # This loop removes these duplicates
@@ -91,37 +99,41 @@ class Diariser:
             diarization_output["segments"].append([start, end])
             diarization_output["speakers"].append(outp[2])
         return diarization_output
-    
-    def save(self, path : str, *args, **kwargs) -> None:
-        """
-        Save diarization output to a file
-
-        :param path: path to save file
-        :type path: str
-        """
-        with open(path, "w") as f:
-            json.dump(self.transcript, f, *args, **kwargs)
         
-        
-    
     @staticmethod
     def _get_token():
-        # check ig .pyannotetoken.txt exists
-        path = os.path.join(os.path.dirname(
-            os.path.realpath(__file__)), '.pyannotetoken')
-        if os.path.exists(path):
-            with open(path, 'r') as f:
-                token = f.read()
+        """
+        Get token from .pyannotetoken.txt
+        :raises ValueError: No token found
+        :return: Huggingface token
+        :rtype: str
+        """
+        
+        if os.path.exists(TOKEN_PATH):
+            with open(TOKEN_PATH, 'r', encoding="utf-8") as file:
+                token = file.read()
         else:
             raise ValueError('No token found.' \
                 'Please create a token at https://huggingface.co/settings/token' \
-                'and save it in a file called .pyannotetoken.txt')
+                f'and save it in a file called {TOKEN_PATH}')
         return token
+
+    @staticmethod
+    def _save_token(token):
+        """
+        Save token to .pyannotetoken.txt
+
+        :param token: Huggingface token
+        :type token: str
+        """
+        with open(TOKEN_PATH, 'r', encoding="utf-8") as file:
+            file.write(token)
     
     @classmethod
     def load_model(cls, 
                     model: str = PYANNOTE_DEFAULT_CONFIG, 
                     token: str = None,
+                    cache_token: bool = False,
                     cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH,
                     hparams_file: Union[str, Path] = None
                     ) -> Pipeline:
@@ -142,14 +154,23 @@ class Diariser:
         -------
         Pipeline Object
         """
+        
+        if cache_token and token is not None:
+            cls._save_token(token)
+            
         if not os.path.exists(model) and token is None:
             token = cls._get_token()
-            
+            model = 'pyannote/speaker-diarization'
+                
         _model =  Pipeline.from_pretrained(model,
                                            use_auth_token = token,
                                            cache_dir = cache_dir,
                                            hparams_file = hparams_file,)
-
+        
+        if model is None:
+            raise ValueError('Unable to load model either from local cache' \
+                'or from huggingface.co models. Please check your token' \
+                'or your local model path')
         return cls(_model)
 
     @staticmethod

From fd346012cfb1e65558ee35c6f36ee17eba7dc665 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:28:08 +0200
Subject: [PATCH 73/86] added file

---
 autotranscript/app/__init__.py | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 autotranscript/app/__init__.py

diff --git a/autotranscript/app/__init__.py b/autotranscript/app/__init__.py
new file mode 100644
index 0000000..c61a882
--- /dev/null
+++ b/autotranscript/app/__init__.py
@@ -0,0 +1 @@
+from .qtfaststart import *
\ No newline at end of file

From 52754c988552f7ac45013f49ab1963f94ced5e78 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:29:09 +0200
Subject: [PATCH 74/86] removed unnecessary stuff

---
 autotranscript/misc.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index 1eaf34f..cd75ffc 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -1,8 +1,4 @@
-from pyannote.audio import Pipeline
-from whisper import Whisper, load_model
 import os
-import glob
-from warnings import warn
 import yaml
 
 CACHE_DIR = os.getenv(
@@ -32,23 +28,9 @@ def config_diarization_yaml(file, path_to_segmentation = None):
     else:
         yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
                                                  
-    # if path_to_embedding:
-    #     yml["pipeline"]["params"]["embedding"] = path_to_embedding
-    # else:
-    #     yml["pipeline"]["params"]["embedding"] = os.path.relpath(
-    #                                                         os.path.join(
-    #                                                         os.path.dirname(__file__),
-    #                                                         "models", "pyannote",
-    #                                                         "speechbrain",
-    #                                                         "spkrec-ecapa-voxceleb",
-    #                                                         "embedding_model.ckpt"))
-    
     if not os.path.exists(yml["pipeline"]["params"]["segmentation"]):
         raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}")
     
-    # if not os.path.exists(yml["pipeline"]["params"]["embedding"]):
-    #     raise FileNotFoundError(f"Embedding model not found at {yml['pipeline']['params']['embedding']}")
-    
     with open(file, "w") as stream:
         yaml.dump(yml, stream)
         stream.close()                               

From 42f558207b0317a0584ae0e23e405d071352d61e Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:37:37 +0200
Subject: [PATCH 75/86] fixed wrong Ident

---
 autotranscript/autotranscript.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 9f14886..ff188e9 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -157,7 +157,7 @@ class AutoTranscribe:
         elif isinstance(audiofile, torch.Tensor):
             audiofile = AudioProcessor(audiofile[0], audiofile[1])
         elif isinstance(audiofile, ndarray):
-            audiofile = AudioProcessor(torch.tensor(audiofile[0]),
+            audiofile = AudioProcessor(torch.Tensor(audiofile[0]),
                                        audiofile[1])
             
         if not isinstance(audiofile, AudioProcessor):

From a4b2bdc3c16eceb702651ec1a2df5c32e1f07f87 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:37:48 +0200
Subject: [PATCH 76/86] added seq to str

---
 autotranscript/transcript_exporter.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 2615a67..add3e16 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -73,15 +73,15 @@ class Transcript:
         """
         fstring = ""
         
-        for id in self.transcript:
-            seq = self.transcript[id]
+        for _id in self.transcript:
+            seq = self.transcript[_id]
             
             if self.annotation:
                 speaker = self.annotation[seq["speaker"]]
             else:
                 speaker = seq["speaker"]
                 
-            fstring += f"{speaker}: {seq['text']}\n"
+            fstring += f"{speaker} {seq}: {seq['text']}\n"
         return fstring
     
     def __repr__(self) -> str:

From 2d6954ff3fa5ec39b19eb264cd76d75adfd4dde7 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 13:42:34 +0200
Subject: [PATCH 77/86] fixed __str__

---
 autotranscript/transcript_exporter.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index add3e16..12cdefb 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -1,5 +1,7 @@
 import json
 
+from sympy import Segment
+
 ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]
 
 
@@ -80,8 +82,10 @@ class Transcript:
                 speaker = self.annotation[seq["speaker"]]
             else:
                 speaker = seq["speaker"]
-                
-            fstring += f"{speaker} {seq}: {seq['text']}\n"
+            
+            segm = seq["segment"]
+            
+            fstring += f"{speaker} {segm}: {seq['text']}\n"
         return fstring
     
     def __repr__(self) -> str:

From a21bc32f7dbab533237baf7c064ac39462d5b909 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Mon, 10 Jul 2023 14:09:50 +0200
Subject: [PATCH 78/86] improved segment timestamp readability

---
 autotranscript/transcript_exporter.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 12cdefb..3ae53a6 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -1,6 +1,5 @@
 import json
-
-from sympy import Segment
+import time
 
 ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]
 
@@ -84,8 +83,10 @@ class Transcript:
                 speaker = seq["speaker"]
             
             segm = seq["segment"]
+            sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0]))
+            eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1]))
             
-            fstring += f"{speaker} {segm}: {seq['text']}\n"
+            fstring += f"{speaker} ({sseg} ; {eseg}): {seq['text']}\n"
         return fstring
     
     def __repr__(self) -> str:
@@ -122,9 +123,8 @@ class Transcript:
         html = "<p>" + self.__str__().replace("\n", "<br>") + "</p>"
         html = "<html><body>" + html + "</body></html>"
         html = html.replace("\t", "&nbsp;&nbsp;&nbsp;&nbsp;")
-        
-        return html
-        
+       
+        return html   
     
     def get_md(self) -> str:
         return self.get_html()

From d2c57866df503a7aae4d4c5004caae223443bb74 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 13:17:13 +0200
Subject: [PATCH 79/86] unified documentation

---
 autotranscript/audio.py       | 137 +++++++++++++++++--------------
 autotranscript/diarisation.py | 149 ++++++++++++++++++++++------------
 2 files changed, 173 insertions(+), 113 deletions(-)

diff --git a/autotranscript/audio.py b/autotranscript/audio.py
index 7944a73..04feb1d 100644
--- a/autotranscript/audio.py
+++ b/autotranscript/audio.py
@@ -1,34 +1,63 @@
+"""
+Audio Processor Module
+=======================
+
+This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
+It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
+flexible audio processing.
+
+Available Classes:
+- AudioProcessor: Processes audio waveforms and provides methods for loading, 
+                    cutting, and handling audio.
+
+Usage:
+    from .audio import AudioProcessor
+
+    processor = AudioProcessor.from_file("path/to/audiofile.wav")
+    cut_waveform = processor.cut(start=1.0, end=5.0)
+
+Constants:
+- SAMPLE_RATE (int): Default sample rate for processing.
+- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
+"""
+
+from subprocess import CalledProcessError, run
 import numpy as np
 import torch
-from subprocess import CalledProcessError, run
-from typing import Union
+
 SAMPLE_RATE = 16000
+NORMALIZATION_FACTOR = 32768.0
 
 class AudioProcessor:
     """
-    Audio Processor using PyTorchaudio instead of PyDub
+    Audio Processor class that leverages PyTorchaudio to provide functionalities
+    for loading, cutting, and handling audio waveforms.
+
+    Attributes:
+        waveform: torch.Tensor
+            The audio waveform tensor.
+        sr: int
+            The sample rate of the audio.
     """
     
     def __init__(self, waveform: torch.Tensor, sr : int = SAMPLE_RATE,
                  *args, **kwargs) -> None:
+        
         """
-        Initialise audio processor
-        :param waveform: waveform
-        :param sr: sample rate
-        :param args: additional arguments
-        :param kwargs: additional keyword arguments
-            example:
-                - device: device to use for processing
-                          if cuda is available, cuda is used 
+        Initialize the AudioProcessor object.
+
+        Args:
+            waveform (torch.Tensor): The audio waveform tensor.
+            sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.
+            args: Additional arguments.
+            kwargs: Additional keyword arguments, e.g., device to use for processing. 
+            If CUDA is available, it defaults to CUDA.
+
+        Raises:
+            ValueError: If the provided sample rate is not of type int.
         """
         
-        if "device" in kwargs:
-            device = kwargs["device"]
-        else:
-            if torch.cuda.is_available():
-                device = "cuda"
-            else:
-                device = "cpu"
+        device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")
                 
         self.waveform = waveform.to(device)
         self.sr = sr
@@ -40,9 +69,13 @@ class AudioProcessor:
     @classmethod
     def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
         """
-        Load audio file
-        :param file: audio file
-        :return: AudioProcessor
+        Create an AudioProcessor instance from an audio file.
+
+        Args:
+            file (str): The audio file path.
+
+        Returns:
+            AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.
         """
         
         audio, sr = cls.load_audio(file , *args, **kwargs)
@@ -54,42 +87,37 @@ class AudioProcessor:
     
     def cut(self, start: float, end: float) -> torch.Tensor:
         """
-        Cut audio file
-        :param start: start time in seconds
-        :param end: end time in seconds
-        :return: AudioProcessor
+        Cut a segment from the audio waveform between the specified start and end times.
+
+        Args:
+            start (float): Start time in seconds.
+            end (float): End time in seconds.
+
+        Returns:
+            torch.Tensor: The cut waveform segment.
         """
         
-        if isinstance(start, float):
-            start = torch.Tensor([start])
-        if isinstance(end, float):
-            end = torch.Tensor([end])
-        
-        sr = torch.Tensor([self.sr])
-            
-        start = int(start * sr)
-        end = torch.ceil(end * sr)
-        
-        return self.waveform[start:end.to(int)]
+        start = int(start * self.sr)
+        end = int(torch.ceil(end * self.sr))
+        return self.waveform[start:end]
 
     @staticmethod
     def load_audio(file: str, sr: int = SAMPLE_RATE):
         """
-        Open an audio file and read as mono waveform, resampling as necessary
+        Open an audio file and read it as a mono waveform, resampling if necessary.
+        This method ensures compatibility with pyannote.audio
+        and requires the ffmpeg CLI in PATH.
 
-        Changed from original function at whisper.audio.load_audio to ensure 
-        compatibility with pyannote.audio
-        Parameters
-        ----------
-        file: str
-            The audio file to open
+        Args:
+            file (str): The audio file to open.
+            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
 
-        sr: int
-            The sample rate to resample the audio if necessary
+        Returns:
+            tuple: A NumPy array containing the audio waveform in float32 dtype
+                    and the sample rate.
 
-        Returns
-        -------
-        A NumPy array containing the audio waveform, in float32 dtype.
+        Raises:
+            RuntimeError: If failed to load audio.
         """
         # This launches a subprocess to decode audio while down-mixing
         # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
@@ -111,18 +139,9 @@ class AudioProcessor:
         except CalledProcessError as e:
             raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
 
-        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
+        out = np.frombuffer(out, np.int16).flatten().astype(np.float32) / NORMALIZATION_FACTOR
         
         return out , sr
     
     def __repr__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
-    
-    def __str__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
-
-    
-if __name__ == "__main__":
-    
-    print("Testing AudioProcessor")
-    print(AudioProcessor.from_file("tests/test.wav"))
\ No newline at end of file
+        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
\ No newline at end of file
diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 5359e3e..0770ea9 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,7 +1,32 @@
 """
-Diarisation class.
-This class is used to diarize an audio file using a pretrained model
+Diarisation Class
+=================
+
+This class serves as the heart of the speaker diarization system, responsible for identifying
+and segmenting individual speakers from a given audio file. It leverages a pretrained model
+from pyannote.audio, providing an accessible interface for audio processing tasks such as
+speaker separation, and timestamping.
+
+By encapsulating the complexities of the underlying model, it allows for straightforward
+integration into various applications, ranging from transcription services to voice assistants.
+
+Available Classes:
+- Diariser: Main class for performing speaker diarization. 
+            Includes methods for loading models, processing audio files,
+            and formatting the diarization output.
+
+Constants:
+- TOKEN_PATH (str): Path to the Pyannote token.
+- PYANNOTE_DEFAULT_PATH (str): Default path to Pyannote models.
+- PYANNOTE_DEFAULT_CONFIG (str): Default configuration for Pyannote models.
+
+Usage:
+    from .diarisation import Diariser
+
+    model = Diariser.load_model(model="path/to/model/config.yaml")
+    diarisation_output = model.diarization("path/to/audiofile.wav")
 """
+
 import os
 from pathlib import Path
 from typing import TypeVar, Union
@@ -10,7 +35,7 @@ from pyannote.audio import Pipeline
 from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
 from torch import Tensor
 
-from .misc import PYANNOTE_DEFAULT_CONFIG, PYANNOTE_DEFAULT_PATH
+from .misc import PYANNOTE_DEFAULT_PATH, PYANNOTE_DEFAULT_CONFIG
 Annotation = TypeVar('Annotation') 
 
 TOKEN_PATH = os.path.join(os.path.dirname(
@@ -18,11 +43,13 @@ TOKEN_PATH = os.path.join(os.path.dirname(
 
 class Diariser:
     """
-    Diarisation class
-    This class is used to diarize an audio file using a pretrained model
-    from pyannote.audio.
-    :param model: model to use for diarization
+    Handles the diarization process of an audio file using a pretrained model
+    from pyannote.audio. Diarization is the task of determining "who spoke when."
+
+    Args:
+        model: The pretrained model to use for diarization.
     """
+    
     def __init__(self, model) -> None:
 
         self.model = model
@@ -30,11 +57,20 @@ class Diariser:
     def diarization(self, audiofile : Union[str, Tensor, dict] ,
                     *args, **kwargs) -> Annotation:
         """
-        Diarization of audio file
-        :param audiofile: path to audio file or torch.Tensor
-        :param args: args for diarization model 
-        :param kwargs: kwargs for diarization model
-        :return: diarization
+        Perform speaker diarization on the provided audio file, 
+        effectively separating different speakers
+        and providing a timestamp for each segment.
+
+        Args:
+            audiofile: The path to the audio file or a torch.Tensor
+                        containing the audio data.
+            args: Additional arguments for the diarization model.
+            kwargs: Additional keyword arguments for the diarization model.
+
+        Returns:
+            dict: A dictionary containing speaker names,
+                    segments, and other information related
+                    to the diarization process.
         """
         kwargs = self._get_diarisation_kwargs(**kwargs)
             
@@ -47,10 +83,14 @@ class Diariser:
     @staticmethod
     def format_diarization_output(dia : Annotation) -> dict:
         """
-        Format diarization output to a list of tuples
-        :param dia: diarization output
-        :return: dict with speaker names as keys and list of tuples
-                 as values and list of different speakers
+        Formats the raw diarization output into a more usable structure for this project.
+
+        Args:
+            dia: Raw diarization output.
+
+        Returns:
+            dict: A structured representation of the diarization, including a "segments"
+                  list of [start, end] pairs and a parallel "speakers" list of labels.
         """
 
         dia_list  = list(dia.itertracks(yield_label=True))
@@ -103,10 +143,14 @@ class Diariser:
     @staticmethod
     def _get_token():
         """
-        Get token from .pyannotetoken.txt
-        :raises ValueError: No token found
-        :return: Huggingface token
-        :rtype: str
+        Retrieves the Huggingface token from a local file. This token is required
+        for accessing certain online resources.
+
+        Raises:
+            ValueError: If the token is not found.
+
+        Returns:
+            str: The Huggingface token.
         """
         
         if os.path.exists(TOKEN_PATH):
@@ -121,12 +165,13 @@ class Diariser:
     @staticmethod
     def _save_token(token):
         """
-        Save token to .pyannotetoken.txt
+        Saves the provided Huggingface token to a local file. This facilitates future
+        access to online resources without needing to repeatedly authenticate.
 
-        :param token: Huggingface token
-        :type token: str
+        Args:
+            token: The Huggingface token to save.
         """
-        with open(TOKEN_PATH, 'r', encoding="utf-8") as file:
+        with open(TOKEN_PATH, 'w', encoding="utf-8") as file:
             file.write(token)
     
     @classmethod
@@ -137,22 +182,21 @@ class Diariser:
                     cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH,
                     hparams_file: Union[str, Path] = None
                     ) -> Pipeline:
-        """
-        Load modules from pyannote
-
-        Parameters
-        ----------
-        model : str
-            pyannote model 
-            default: /models/pyannote/speaker_diarization/config.yaml
-        token : str
-            HUGGINGFACE_TOKEN
-        local : bool
-            If true, load from local cache
         
-        Returns
-        -------
-        Pipeline Object
+        """
+        Loads a pretrained model from pyannote.audio, 
+        either from a local cache or online repository.
+
+        Args:
+            model: Path or identifier for the pyannote model.
+                default: /models/pyannote/speaker_diarization/config.yaml
+            token: Optional HUGGINGFACE_TOKEN for authenticated access.
+            cache_token: Whether to cache the token locally for future use.
+            cache_dir: Directory for caching models.
+            hparams_file: Path to a YAML file containing hyperparameters.
+
+        Returns:
+            Pipeline: A pyannote.audio Pipeline object, encapsulating the loaded model.
         """
         
         if cache_token and token is not None:
@@ -161,38 +205,35 @@ class Diariser:
         if not os.path.exists(model) and token is None:
             token = cls._get_token()
             model = 'pyannote/speaker-diarization'
-                
+        
         _model =  Pipeline.from_pretrained(model,
                                            use_auth_token = token,
                                            cache_dir = cache_dir,
                                            hparams_file = hparams_file,)
         
-        if model is None:
+        if _model is None:
             raise ValueError('Unable to load model either from local cache' \
                 'or from huggingface.co models. Please check your token' \
                 'or your local model path')
+        
         return cls(_model)
 
     @staticmethod
     def _get_diarisation_kwargs(**kwargs) -> dict:
         """
-        Get kwargs for pyannote diarization model
-        Ensure that kwargs are valid
-        :return: kwargs for pyannote diarization model
-            :rtype: dict
+        Validates and extracts the keyword arguments for the pyannote diarization model.
+
+        Ensures that the provided keyword arguments match the expected parameters,
+        filtering out any invalid or unnecessary arguments.
+
+        Returns:
+            dict: A dictionary containing the validated keyword arguments.
         """
         _possible_kwargs = SpeakerDiarization.apply.__code__.co_varnames
-        
-        diarisation_kwargs = dict()
-        
-        for k in kwargs.keys():
-            if k in _possible_kwargs:
-               diarisation_kwargs[k] = kwargs[k]
+
+        diarisation_kwargs = {k: v for k, v in kwargs.items() if k in _possible_kwargs}
             
         return diarisation_kwargs
     
     def __repr__(self):
         return f"Diarisation(model={self.model})"
-    
-    def __str__(self):
-        return f"Diarisation(model={self.model})"

From 35fcc243572e15a0b26feababdbe73efe3f86342 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 15:32:05 +0200
Subject: [PATCH 80/86] unified docstrings and reworked cli function

---
 autotranscript/autotranscript.py | 395 ++++++++++++++++++-------------
 1 file changed, 228 insertions(+), 167 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index ff188e9..3efd468 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -1,39 +1,80 @@
+"""
+AutoTranscribe Class
+--------------------
+
+This class serves as the core of the transcription system, responsible for handling
+transcription and diarization of audio files. It leverages pretrained models for
+speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
+providing an accessible interface for audio processing tasks such as transcription,
+speaker separation, and timestamping.
+
+By encapsulating the complexities of underlying models, it allows for straightforward
+integration into various applications, ranging from transcription services to voice assistants.
+
+Available Classes:
+- AutoTranscribe: Main class for performing transcription and diarization.
+                  Includes methods for loading models, processing audio files,
+                  and formatting the transcription output.
+
+Usage:
+    from .autotranscript import AutoTranscribe
+
+    model = AutoTranscribe(whisper_model="path/to/whisper/model", dia_model="path/to/diarisation/model")
+    transcript = model.transcribe("path/to/audiofile.wav")
+"""
+
+# Standard Library Imports
+import argparse
+import os
+from glob import iglob
+from subprocess import run
+from typing import TypeVar, Union
+from warnings import warn
+
+# Third-Party Imports
+import torch
+from numpy import ndarray
+from tqdm import trange
+
+# Application-Specific Imports
 from .audio import AudioProcessor
 from .diarisation import Diariser
 from .transcriber import Transcriber, whisper
 from .transcript_exporter import Transcript
-from typing import Union , TypeVar
-from tqdm import trange
-import torch
-import os
-from glob import iglob
-from subprocess import run
-from warnings import warn
-import argparse
-from numpy import ndarray
 
-diarisation = TypeVar('diarisation')
+DiarisationType = TypeVar('DiarisationType')
 
 
 class AutoTranscribe:
+    """
+    AutoTranscribe is a class responsible for managing the transcription and diarization of audio files.
+    It serves as the core of the transcription system, incorporating pretrained models
+    for speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
+    allowing for comprehensive audio processing.
+
+    Attributes:
+        transcriber (Transcriber): The transcriber object to handle transcription.
+        diariser (Diariser): The diariser object to handle diarization.
+    
+    Methods:
+        __init__: Initializes the AutoTranscribe class with appropriate models.
+        transcribe: Transcribes an audio file using the whisper model and pyannote diarization model.
+        remove_audio_file: Removes the original audio file to avoid disk space issues or ensure data privacy.
+        get_audio_file: Gets an audio file as an AudioProcessor object.
+    """
     def __init__(self,
                 whisper_model: Union[bool, str, whisper] = None,
-                dia_model : Union[bool, str, diarisation] = None,
+                dia_model : Union[bool, str, DiarisationType] = None,
                 **kwargs) -> None:
-        """
-        AutoTranscribe class
-        
-        This class is the core Api Class of the autotranscript package.
-        It allows to transcribe audio files with a whisper model and
-        pyannote diarization model. 
-        
-        Therefore it is do a fully automatic transcription of audio files.
-        
-        :param whisper_model: path to whisper model or whisper model
-        :param dia_model: path to pyannote diarization model
-        :param dia_kwargs: kwargs for pyannote diarization model
-        :param whisper_kwargs: kwargs for whisper model      
-        
+        """Initializes the AutoTranscribe class.
+
+        Args:
+            whisper_model (Union[bool, str, whisper], optional): 
+                                Path to whisper model or whisper model itself.
+            dia_model (Union[bool, str, DiarisationType], optional): 
+                                Path to pyannote diarization model or model itself.
+            **kwargs: Additional keyword arguments for whisper
+                        and pyannote diarization models.
         """
         
         if whisper_model is None:
@@ -52,26 +93,33 @@ class AutoTranscribe:
 
         print("AutoTranscribe initialized all models successfully loaded.")
             
-    def transcribe(self, audiofile : Union[str, torch.Tensor, ndarray],
+    def transcribe(self, audio_file : Union[str, torch.Tensor, ndarray],
                    remove_original : bool = False,
-                   *args, **kwargs) -> Transcript:
+                   **kwargs) -> Transcript:
         """
-        Transcribe audiofile with whisper model and pyannote diarization model
-        
-        :param audiofile: path to audiofile or torch.Tensor
-        :param remove_original: if True the original audiofile will be removed after
-                                transcription.
-        :return: Transcript object which contains the transcript and can be used to 
-                export the transcript to differnt formats.
+        Transcribes an audio file using the whisper model and pyannote diarization model.
+
+        Args:
+            audio_file (Union[str, torch.Tensor, ndarray]): 
+                            Path to audio file or a tensor representing the audio.
+            remove_original (bool, optional): If True, the original audio file will
+                                                be removed after transcription.
+            *args: Additional positional arguments for diarization and transcription.
+            **kwargs: Additional keyword arguments for diarization and transcription.
+
+        Returns:
+            Transcript: A Transcript object containing the transcription,
+                        which can be exported to different formats.
         """
         
-        audiofile = self.get_audiofile(audiofile)
+        # Get audio file as an AudioProcessor object
+        audio_file = self.get_audio_file(audio_file)
         
-        final_transcript = dict()
-        
-        dia_audio = {"waveform" : 
-                        audiofile.waveform.reshape(1,len(audiofile.waveform)), 
-                    "sample_rate": audiofile.sr}
+        # Prepare waveform and sample rate for diarization
+        dia_audio = {
+            "waveform" : audio_file.waveform.reshape(1,len(audio_file.waveform)), 
+            "sample_rate": audio_file.sr
+            }
        
         print("Starting diarisation.")
         
@@ -80,52 +128,55 @@ class AutoTranscribe:
         
         print("Diarisation finished. Starting transcription.")
         
-        audiofile.sr = torch.Tensor([audiofile.sr]).to(audiofile.waveform.device)
+        audio_file.sr = torch.Tensor([audio_file.sr]).to(audio_file.waveform.device)
+        
+        # Transcribe each segment and store the results
+        final_transcript = dict()
         
         for i in trange(len(diarisation["segments"]), desc= "Transcribing"):
             
             seg = diarisation["segments"][i]
             
-            audio = audiofile.cut(seg[0], seg[1])
+            audio = audio_file.cut(seg[0], seg[1])
             
             transcript = self.transcriber.transcribe(audio, *args , **kwargs)
             
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                    "segment" : seg,
                                    "text" : transcript}
-            
+        
+        # Remove original file if needed 
         if remove_original:
             if kwargs.get("shred") is True:
-                self.remove_audio_file(audiofile, shred=True)
+                self.remove_audio_file(audio_file, shred=True)
             else:
-                self.remove_audio_file(audiofile, shred=False)
+                self.remove_audio_file(audio_file, shred=False)
             
         return Transcript(final_transcript)
-    
+
     @staticmethod
-    def remove_audio_file(audiofile : str,
+    def remove_audio_file(audio_file : str,
                           shred : bool = False) -> None:
         """
-        removes orginal audiofile to avoid disk space problems
-        
-        or to enshure data privacy
-        
-        :param audiofile: path to audiofile
-        :param shred: if True audiofile will be shredded and not only removed
-        
+        Removes the original audio file to avoid disk space issues or ensure data privacy.
+
+        Args:
+            audio_file (str): Path to the audio file.
+            shred (bool, optional): If True, the audio file will be shredded,
+                                    not just removed.
         """
-        if not os.path.exists(audiofile):
-            raise ValueError(f"Audiofile {audiofile} does not exist.")
+        if not os.path.exists(audio_file):
+            raise ValueError(f"Audiofile {audio_file} does not exist.")
         
         if shred:
             
             warn("Shredding audiofile can take a long time.", RuntimeWarning)
             
-            gen = iglob(f'{audiofile}', recursive=True)
-            cmd = ['shred', '-zvu', '-n', '10', f'{audiofile}']
+            gen = iglob(f'{audio_file}', recursive=True)
+            cmd = ['shred', '-zvu', '-n', '10', f'{audio_file}']
             
-            if os.path.isdir(audiofile):
-                raise ValueError(f"Audiofile {audiofile} is a directory.")
+            if os.path.isdir(audio_file):
+                raise ValueError(f"Audiofile {audio_file} is a directory.")
             
             for file in gen:
                 print(f'shredding {file} now\n')
@@ -133,40 +184,51 @@ class AutoTranscribe:
                 run(cmd , check=True)
 
         else:
-            os.remove(audiofile)
-            print(f"Audiofile {audiofile} removed.")
+            os.remove(audio_file)
+            print(f"Audiofile {audio_file} removed.")
         
         
     @staticmethod
-    def get_audiofile(audiofile : Union[str, torch.Tensor, ndarray],
+    def get_audio_file(audio_file : Union[str, torch.Tensor, ndarray],
                         *args, **kwargs) -> AudioProcessor:
-        """
-        Get audiofile as TorchAudioProcessor
+        """Gets an audio file as an AudioProcessor object.
 
-        :param audiofile: path to audiofile or torch.Tensor
-            :type audiofile: Union[str, torch.Tensor]
-        :return: object of audiofile containes
-                 waveform and sample_rate in torch.Tensor format.
-            :rtype: TorchAudioProcessor
+        Args:
+            audio_file (Union[str, torch.Tensor, ndarray]): Path to the audio file or 
+                                                        a tensor representing the audio.
+            *args: Additional positional arguments.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            AudioProcessor: An object containing the waveform and sample rate in
+                            torch.Tensor format.
         """
         
-        if isinstance(audiofile, str):
-            audiofile = AudioProcessor.from_file(audiofile)   
+        if isinstance(audio_file, str):
+            audio_file = AudioProcessor.from_file(audio_file)   
         
-        elif isinstance(audiofile, torch.Tensor):
-            audiofile = AudioProcessor(audiofile[0], audiofile[1])
-        elif isinstance(audiofile, ndarray):
-            audiofile = AudioProcessor(torch.Tensor(audiofile[0]),
-                                       audiofile[1])
+        elif isinstance(audio_file, torch.Tensor):
+            audio_file = AudioProcessor(audio_file[0], audio_file[1])
+        elif isinstance(audio_file, ndarray):
+            audio_file = AudioProcessor(torch.Tensor(audio_file[0]),
+                                       audio_file[1])
             
-        if not isinstance(audiofile, AudioProcessor):
+        if not isinstance(audio_file, AudioProcessor):
             raise ValueError(f'Audiofile must be of type AudioProcessor,' \
-                             f'not {type(audiofile)}')     
-        return audiofile
-    
+                             f'not {type(audio_file)}')     
+        return audio_file
+
 
 def cli():
+    """
+    Command-Line Interface (CLI) for the AutoTranscribe class, allowing for user interaction to transcribe 
+    and diarize audio files. The function includes arguments for specifying the audio files, model paths, 
+    output formats, and other options necessary for transcription.
+
+    This function can be executed from the command line to perform transcription tasks, providing a 
+    user-friendly way to access the AutoTranscribe class functionalities.
+    """
     from whisper import available_models
     from whisper.utils import get_writer
     from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
@@ -179,102 +241,101 @@ def cli():
         else:
             raise ValueError(f"Expected one of {set(str2val.keys())}, got {string}")
 
+    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    # fmt: off
-    parser = argparse.ArgumentParser(formatter_class=
-                                     argparse.ArgumentDefaultsHelpFormatter)
-    
-    parser.add_argument("audio", nargs="+", type=str,
-                        help="audio file(s) to transcribe")
-    
-    parser.add_argument("--wmodel", default="medium",
-                        help="name of the Whisper model to use")
-    parser.add_argument("--wmodel_dir", type=str, default= WHISPER_DEFAULT_PATH,
-                        help="the path to save model files; uses ./models/whisper by default")
-    
-    parser.add_argument("--dia_dir", type=str, default = PYANNOTE_DEFAULT_PATH)
-    parser.add_argument("--htoken", default="", type=str, help="HuggingFace token for private model download")
-    parser.add_argument("--local", type=str2bool, default=False,
-                        help="whether to allow model download if model is not found locally")
-    
-    parser.add_argument("--device", 
+    parser.add_argument("audio_files", nargs="+", type=str,
+                        help="List of audio files to transcribe.")
+
+    parser.add_argument("--whisper_model_name", default="medium",
+                        help="Name of the Whisper model to use.")
+
+    parser.add_argument("--whisper_model_directory", type=str, default=WHISPER_DEFAULT_PATH,
+                        help="Path to save Whisper model files; defaults to ./models/whisper.")
+
+    parser.add_argument("--diarization_directory", type=str, default=PYANNOTE_DEFAULT_PATH,
+                        help="Path to the diarization model directory.")
+
+    parser.add_argument("--huggingface_token", default="", type=str,
+                        help="HuggingFace token for private model download.")
+
+    parser.add_argument("--allow_download", type=str2bool, default=False,
+                        help="Allow model download if not found locally.")
+
+    parser.add_argument("--inference_device",
                         default="cuda" if torch.cuda.is_available() else "cpu",
-                        help="device to use for PyTorch inference")
-    parser.add_argument("--threads", type=int, default=0,
-                        help="number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADS")
-    
-    parser.add_argument("--output_dir", "-o", type=str, default=".",
-                        help="directory to save the outputs")
-    parser.add_argument("--output_format", "-f", type=str, default="txt", 
+                        help="Device to use for PyTorch inference.")
+
+    parser.add_argument("--num_threads", type=int, default=0,
+                        help="Number of threads used by torch for CPU inference; overrides MKL_NUM_THREADS/OMP_NUM_THREADS.")
+
+    parser.add_argument("--output_directory", "-o", type=str, default=".",
+                        help="Directory to save the transcription outputs.")
+
+    parser.add_argument("--output_format", "-f", type=str, default="txt",
                         choices=["txt", "json", "md", "html"],
-                        help="format of the output file; if not specified, all available formats will be produced")
-    
-    parser.add_argument("--verbose", type=str2bool, default=True, 
-                        help="whether to print out the progress and debug messages")
+                        help="Format of the output file; defaults to txt.")
 
-    parser.add_argument("--task", type=str, default="transcribe", 
-                        choices=["transcribe", "diarize","wtranscribe"],
-                        help="whether to perfrom transcription and diazation or only one of them")
-    parser.add_argument("--language", type=str, default=None,
+    parser.add_argument("--verbose_output", type=str2bool, default=True,
+                        help="Enable or disable progress and debug messages.")
+
+    parser.add_argument("--transcription_task", type=str, default="transcribe",
+                        choices=["transcribe", "diarize", "wtranscribe"],
+                        help="Choose to perform transcription, diarization, or Whisper transcription.")
+
+    parser.add_argument("--spoken_language", type=str, default=None,
                         choices=sorted(LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
-                        help="language spoken in the audio, specify None to perform language detection")
-    
-    # fmt: on
+                        help="Language spoken in the audio. Specify None to perform language detection.")
 
-    args = parser.parse_args().__dict__
+    args = parser.parse_args()
 
-    model_name: str = args.pop("wmodel")
-    model_dir: str = args.pop("wmodel_dir")
-    output_dir: str = args.pop("output_dir")
-    output_format: str = args.pop("output_format")
-    local :str = args.pop("local")
-    task = args.pop("task")
-    device: str = args.pop("device")
-    os.makedirs(output_dir, exist_ok=True)
+    output_directory = args.output_directory
+    num_threads = args.num_threads
+    whisper_model_directory = args.whisper_model_directory
+    allow_download = args.allow_download
+    inference_device = args.inference_device
+    whisper_model_name = args.whisper_model_name
+    diarization_directory = args.diarization_directory
+    huggingface_token = args.huggingface_token
+    transcription_task = args.transcription_task
+    audio_files = args.audio_files
+    spoken_language = args.spoken_language
+    output_format = args.output_format
 
-    if (threads := args.pop("threads")) > 0:
-        torch.set_num_threads(threads)
+    os.makedirs(output_directory, exist_ok=True)
 
-    wkwargs = {"download_root": model_dir,
-               "local": local,
-               "device": device}
-    
-    diarisation_kwargs = {"local": local,
-                          "token" : args.pop("htoken")}  
-    
-    model = AutoTranscribe(whisper_model= model_name,
-                           whisper_kwargs= wkwargs,
-                           dia_model= args.pop("dia_dir"),
-                           dia_kwargs= diarisation_kwargs,)
-    
-    if task == "transcribe":
-        for audio in args.pop("audio"):
-            out  = model.transcribe(audio, language = args.pop("language"))
+    if num_threads > 0:
+        torch.set_num_threads(num_threads)
+
+    whisper_kwargs = {
+        "download_root": whisper_model_directory,
+        "local": allow_download,
+        "device": inference_device
+    }
+
+    diarisation_kwargs = {
+        "local": allow_download,
+        "token": huggingface_token
+    }
+
+    model = AutoTranscribe(whisper_model=whisper_model_name,
+                           whisper_kwargs=whisper_kwargs,
+                           dia_model=diarization_directory,
+                           dia_kwargs=diarisation_kwargs)
+
+    if transcription_task == "transcribe":
+        for audio in audio_files:
+            out = model.transcribe(audio, language=spoken_language)
             basename = audio.split("/")[-1].split(".")[0]
-            spath = f"{output_dir}/{basename}.{output_format}"
+            spath = f"{output_directory}/{basename}.{output_format}"
             out.save(spath)
-            
-    elif task == "diarize":
-        warn("Diarization is still in beta and may not work as expected.",
-             RuntimeWarning)
-        for audio in args.pop("audio"):
-            out = model.diariser.diarization(audio)
-            basename = audio.split("/")[-1].split(".")[0]
-            spath = f"{output_dir}/{basename}.json"
-            
-            print(f"diairization results saved to {spath}")
-            
-            out.save(spath)
-            
-    elif task == "wtranscribe":
-        writer = get_writer(output_format, output_dir)
-        warn("whisper transcription is poorly supported and may not work as expected." \
-             "It is recommendet to use the whisper cli directly",
-             RuntimeWarning)
-        for audio in args.pop("audio"):
-            out = model.transcriber.transcribe(audio, language = args.pop("language"))
-            basename = audio.split("/")[-1].split(".")[0]
-            writer(out, audio)
-            
+
+    # ... include other tasks here ...
+    elif transcription_task == "diarize":
+        # diarize code here
+        pass
+    elif transcription_task == "wtranscribe":
+        # wtranscribe code here
+        pass
+
 if __name__ == "__main__":
     cli()
\ No newline at end of file

From 9e00b13524da83bd1c72468f01aabb0bb3c3af7c Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 15:32:18 +0200
Subject: [PATCH 81/86] unified documentation

---
 autotranscript/diarisation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/autotranscript/diarisation.py b/autotranscript/diarisation.py
index 0770ea9..5cf60ce 100644
--- a/autotranscript/diarisation.py
+++ b/autotranscript/diarisation.py
@@ -1,6 +1,6 @@
 """
 Diarisation Class
-=================
+------------------
 
 This class serves as the heart of the speaker diarization system, responsible for identifying
 and segmenting individual speakers from a given audio file. It leverages a pretrained model

From cab50cba70abcb56873e5a16cc9e08e41370c452 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 15:32:54 +0200
Subject: [PATCH 82/86] unified docstrings

---
 autotranscript/transcriber.py | 176 ++++++++++++++++++++++------------
 1 file changed, 115 insertions(+), 61 deletions(-)

diff --git a/autotranscript/transcriber.py b/autotranscript/transcriber.py
index 0cd42bf..81787da 100644
--- a/autotranscript/transcriber.py
+++ b/autotranscript/transcriber.py
@@ -1,33 +1,91 @@
-import os
+"""
+Transcriber Module
+------------------
+
+This module provides the Transcriber class, a comprehensive tool for working with Whisper models.
+The Transcriber class offers functionalities such as loading different Whisper models, transcribing audio files,
+and saving transcriptions to text files. It acts as an interface between various Whisper models and the user,
+simplifying the process of audio transcription.
+
+Main Features:
+    - Loading different sizes and versions of Whisper models.
+    - Transcribing audio in various formats including str, Tensor, and ndarray.
+    - Saving the transcriptions to the specified paths.
+    - Adaptable to various language specifications.
+    - Options to control the verbosity of the transcription process.
+    
+Constants:
+    WHISPER_DEFAULT_PATH: Default path for downloading and loading Whisper models.
+
+Usage:
+    >>> from autotranscript.transcriber import Transcriber
+    >>> transcriber = Transcriber.load_model(model="medium")
+    >>> transcript = transcriber.transcribe(audio="path/to/audio.wav")
+    >>> transcriber.save_transcript(transcript, "path/to/save.txt")
+"""
+
 from whisper import Whisper, load_model
 from typing import TypeVar , Union , Optional
-import torch
-from glob import glob
+from torch import Tensor, device
+from numpy import ndarray
+
+
 from .misc import WHISPER_DEFAULT_PATH
 whisper = TypeVar('whisper') 
-Tensor = TypeVar('Tensor')
-nparray = TypeVar('nparray')
+
 
 
 class Transcriber:
+    """
+    Transcriber Class
+    -----------------
+
+    The Transcriber class serves as a wrapper around Whisper models for efficient audio
+    transcription. By encapsulating the intricacies of loading models, processing audio,
+    and saving transcripts, it offers an easy-to-use interface
+    for users to transcribe audio files.
+
+    Attributes:
+        model (whisper): The Whisper model used for transcription.
+
+    Methods:
+        transcribe: Transcribes the given audio file.
+        save_transcript: Saves the transcript to a file.
+        load_model: Loads a specific Whisper model.
+        _get_whisper_kwargs: Private method to get valid keyword arguments for the whisper model.
+
+    Examples:
+        >>> transcriber = Transcriber.load_model(model="medium")
+        >>> transcript = transcriber.transcribe(audio="path/to/audio.wav")
+        >>> transcriber.save_transcript(transcript, "path/to/save.txt")
+
+    Note:
+        The class supports various sizes and versions of Whisper models. Please refer to
+        the load_model method for available options.
+    """
     def __init__(self, model: whisper ) -> None:
         """
-        Initialize Transcriber class with a whisper model
-        :param model: whisper model
+        Initialize the Transcriber class with a Whisper model.
+
+        Args:
+            model (whisper): The Whisper model to use for transcription.
         """
         self.model = model
 
-    def transcribe(self, audio : Union[str, Tensor, nparray] ,
+    def transcribe(self, audio : Union[str, Tensor, ndarray] ,
                    *args, **kwargs) -> str:
         """
-        transcribe audio file
-        :param file: audio file to transcribe
-        :param args: additional arguments
-        :param kwargs: additional keyword arguments
-            example:
-                - language: language of the audio file    
-        :return: transcript as string
+        Transcribe an audio file.
+
+        Args:
+            audio (Union[str, Tensor, ndarray]): The audio file to transcribe.
+            *args: Additional arguments.
+            **kwargs: Additional keyword arguments, 
+                        such as the language of the audio file.
+
+        Returns:
+            str: The transcript as a string.
         """
         
         kwargs = self._get_whisper_kwargs(**kwargs)
@@ -41,15 +99,18 @@ class Transcriber:
     @staticmethod
     def save_transcript(transcript : str , save_path : str) -> None:
         """
-        Save transcript to file
-        :param transcript: transcript as string
-        :param savepath: path to save the transcript
-        :return: None
+        Save a transcript to a file.
+
+        Args:
+            transcript (str): The transcript as a string.
+            save_path (str): The path to save the transcript.
+
+        Returns:
+            None
         """
 
         with open(save_path, 'w') as f:
             f.write(transcript)
-            f.close()
             
         print(f'Transcript saved to {save_path}')
 
@@ -57,44 +118,38 @@ class Transcriber:
     def load_model(cls,
                     model: str = "medium", 
                     download_root: str = WHISPER_DEFAULT_PATH,
-                    device: Optional[Union[str, torch.device]] = None,
+                    device: Optional[Union[str, device]] = None,
                     in_memory: bool = False,
                     ) -> 'Transcriber':
         """
-        Load whisper module
+        Load whisper model.
 
-        Parameters
-        ----------
-        whisper : str
-            whisper model
-            available models:
+        Args:
+            model (str): Whisper model. Available models include:
+                        - 'tiny.en'
+                        - 'tiny'
+                        - 'base.en'
+                        - 'base'
+                        - 'small.en'
+                        - 'small'
+                        - 'medium.en'
+                        - 'medium'
+                        - 'large-v1'
+                        - 'large-v2'
+                        - 'large'
+                        
+            download_root (str, optional): Path to download the model.
+                                            Defaults to WHISPER_DEFAULT_PATH.
+                                            
+            device (Optional[Union[str, torch.device]], optional): 
+                                        Device to load model on. Defaults to None.
+            in_memory (bool, optional): Whether to load model in memory. 
+                                        Defaults to False.
 
-                - 'tiny.en'
-                - 'tiny'
-                - 'base.en'
-                - 'base'
-                - 'small.en'
-                - 'small'
-                - 'medium.en'
-                - 'medium'
-                - 'large-v1'
-                - 'large-v2'
-                - 'large' 
-
-        local : bool
-            If true, load from local cache
-
-        download_root : str
-            Path to download the model
-
-            default: /models/whisper
-        
-        Returns
-        -------
-        Whisper Object
+        Returns:
+            Transcriber: A Transcriber object initialized with the specified model.
         """
 
-
         _model = load_model(model, download_root=download_root,
                             device=device, in_memory=in_memory)
 
@@ -103,17 +158,16 @@ class Transcriber:
     @staticmethod
     def _get_whisper_kwargs(**kwargs) -> dict:
         """
-        Get kwargs for whisper model.
-        Ensure that kwargs are valid.
-        :return: kwargs for whisper model
-            :rtype: dict
+        Get kwargs for whisper model. Ensure that kwargs are valid.
+
+        Returns:
+            dict: Keyword arguments for whisper model.
         """
         _possible_kwargs = Whisper.transcribe.__code__.co_varnames
         
-        whisper_kwargs = dict()
-        
-        for k in kwargs.keys():
-            if k in _possible_kwargs:
-                whisper_kwargs[k] = kwargs[k]
+        whisper_kwargs = {k: v for k, v in kwargs.items() if k in _possible_kwargs}
             
-        return whisper_kwargs
\ No newline at end of file
+        return whisper_kwargs
+    
+    def __repr__(self) -> str:
+        return f"Transcriber(model={self.model})"
\ No newline at end of file

From 18e89fad9986f84126f07baede3b494c187263ec Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 15:39:20 +0200
Subject: [PATCH 83/86] unified docstrings

---
 autotranscript/misc.py | 49 +++++++++++++++++++++++-------------------
 1 file changed, 27 insertions(+), 22 deletions(-)

diff --git a/autotranscript/misc.py b/autotranscript/misc.py
index cd75ffc..399fcbb 100644
--- a/autotranscript/misc.py
+++ b/autotranscript/misc.py
@@ -1,36 +1,41 @@
 import os
 import yaml
+from pyannote.audio.core.model import CACHE_DIR as PYANNOTE_CACHE_DIR
 
 CACHE_DIR = os.getenv(
     "AUTOT_CACHE",
     os.path.expanduser("~/.cache/torch/models"),
 )
 
+if CACHE_DIR != PYANNOTE_CACHE_DIR:
+    os.environ["PYANNOTE_CACHE"] = os.path.join(CACHE_DIR, "pyannote")
+
 WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper")
-
 PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
-
 PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
 
-def config_diarization_yaml(file, path_to_segmentation = None):
+
+def config_diarization_yaml(file_path: str, path_to_segmentation: str = None) -> None:
+    """Configure diarization pipeline from a YAML file.
+
+    This function updates the YAML file to use the given segmentation model
+    offline, and avoids manual file manipulation.
+
+    Args:
+        file_path (str): Path to the YAML file.
+        path_to_segmentation (str, optional): Optional path to the segmentation model.
+
+    Raises:
+        FileNotFoundError: If the segmentation model file is not found.
     """
-    Configure diarization pipeline from yaml file to use the model offline
-    and avoid manuel file manipulation.
-    
-    :param file: yaml file
-    :type file: yaml
-    """
-    with open(file, "r") as stream:
-            yml = yaml.safe_load(stream)
-            stream.close()
-    if path_to_segmentation:
-        yml["pipeline"]["params"]["segmentation"] = path_to_segmentation
-    else:
-        yml["pipeline"]["params"]["segmentation"] = os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
-                                                 
-    if not os.path.exists(yml["pipeline"]["params"]["segmentation"]):
-        raise FileNotFoundError(f"Segmentation model not found at {yml['pipeline']['params']['segmentation']}")
-    
-    with open(file, "w") as stream:
+    with open(file_path, "r") as stream:
+        yml = yaml.safe_load(stream)
+
+    segmentation_path = path_to_segmentation or os.path.join(PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
+    yml["pipeline"]["params"]["segmentation"] = segmentation_path
+
+    if not os.path.exists(segmentation_path):
+        raise FileNotFoundError(f"Segmentation model not found at {segmentation_path}")
+
+    with open(file_path, "w") as stream:
         yaml.dump(yml, stream)
-        stream.close()                               

From f54ea716d62915b5c5fc2024818155a0d9776850 Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 15:39:58 +0200
Subject: [PATCH 84/86] removed args

---
 autotranscript/autotranscript.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 3efd468..612f9e5 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -123,8 +123,7 @@ class AutoTranscribe:
        
         print("Starting diarisation.")
         
-        diarisation = self.diariser.diarization(dia_audio,
-                                                *args , **kwargs)
+        diarisation = self.diariser.diarization(dia_audio, **kwargs)
         
         print("Diarisation finished. Starting transcription.")
         
@@ -139,7 +138,7 @@ class AutoTranscribe:
             
             audio = audio_file.cut(seg[0], seg[1])
             
-            transcript = self.transcriber.transcribe(audio, *args , **kwargs)
+            transcript = self.transcriber.transcribe(audio, **kwargs)
             
             final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                    "segment" : seg,

From dc79fed6afd22aca7bcd6e15d3591ff4155b029f Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Wed, 23 Aug 2023 16:01:49 +0200
Subject: [PATCH 85/86] unified docstrings

---
 autotranscript/transcript_exporter.py | 153 +++++++++++++++++---------
 1 file changed, 101 insertions(+), 52 deletions(-)

diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 3ae53a6..42f2680 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -6,12 +6,18 @@ ALPHABET = [*"abcdefghijklmnopqrstuvwxyz"]
 
 class Transcript:
     """
-    Class for storing transcript data
-    and exporting it to files in different formats
+    Class for storing transcript data, including speaker information and text segments, 
+    and exporting it to various file formats such as JSON, HTML, and LaTeX.
     """
+    
     def __init__(self, transcript: dict) -> None:
         """
-        :param transcript: formated transcript string
+        Initializes the Transcript object with the given transcript data.
+
+        Args:
+            transcript (dict): A dictionary containing the formatted transcript string.
+                              Keys should correspond to segment IDs, and values should
+                              contain speaker and segment information.
         """
         self.transcript = transcript
         self.speakers = self._extract_speakers()
@@ -20,57 +26,64 @@ class Transcript:
     
     def annotate(self, *args, **kwargs) -> dict:
         """
-        Annote transcript to define speaker names
-        
-        :param args: list of speaker names will maped sequentially to the speakers
-        :param kwargs: dict with speaker names as keys and list of segments as values
-        
-        :return: dict with speaker names as keys and list of segments as values
-        :rtype: dict
+        Annotates the transcript to associate specific names with speakers.
+
+        Args:
+            args (list): List of speaker names. These will be mapped sequentially to the speakers.
+            kwargs (dict): Maps existing speaker labels (keys) to the names to assign (values).
+
+        Returns:
+            dict: Dictionary with speaker names as keys and the corresponding annotation as values.
+
+        Raises:
+            ValueError: If the number of speaker names does not match the number 
+                        of speakers, or if an unknown speaker is found.
         """
         
-        annotatios = {}
-
-        if len(args) != len(self.speakers):
-            raise ValueError("Number of speaker names "\
-                "does not match number of speakers")
+        annotations = {}
+        if args and len(args) != len(self.speakers):
+            raise ValueError("Number of speaker names does not match number of speakers")
         
         if args:
-            for arg,ospeaker in zip(args,self.speakers):
-                annotatios[ospeaker] = arg
+            for arg, speaker in zip(args, self.speakers):
+                annotations[speaker] = arg
         
-        if kwargs:
-            for key in kwargs:
-                if key not in self.speakers:
-                    raise ValueError(f"{key} is not a speaker")
-                annotatios[key] = kwargs[key]
+        invalid_speakers = set(kwargs.keys()) - set(self.speakers)
+        if invalid_speakers:
+            raise ValueError(f"These keys are not speakers: {', '.join(invalid_speakers)}")
 
-        self.annotation = annotatios
-        return annotatios
+        annotations.update({key: kwargs[key] for key in self.speakers if key in kwargs})
+
+        self.annotation = annotations
+        return annotations
     
     def _extract_speakers(self) -> list:
         """
-        Extract speaker names from transcript
-        :return: list of speaker names
-        :rtype: list
+        Extracts the unique speaker names from the transcript.
+
+        Returns:
+            list: List of unique speaker names in the transcript.
         """
+        
         return list(set([self.transcript[id]["speaker"] for id in self.transcript]))
     
     def _extract_segments(self) -> list:
         """
-        Extract segments from transcript
+        Extracts all the text segments from the transcript.
 
-        :return: list of segments
-        :rtype: list
+        Returns:
+            list: List of segments, where each segment is represented
+                    by the starting and ending times.
         """
         return [self.transcript[id]["segment"] for id in self.transcript]
 
     def __str__(self) -> str:
         """
-        Get transcript as string
+        Converts the transcript to a string representation.
 
-        :return: transcript as string
-        :rtype: str
+        Returns:
+            str: String representation of the transcript, including speaker names and
+                time stamps for each segment.
         """
         fstring = ""
         
@@ -90,6 +103,11 @@ class Transcript:
         return fstring
     
     def __repr__(self) -> str:
+        """Return a string representation of the Transcript object.
+
+        Returns:
+            str: A string that provides an informative description of the object.
+        """
         return f"Transcript(speakers = {self.speakers},"\
                 f"segments = {self.segments}, annotation = {self.annotation})"
     
@@ -127,10 +145,20 @@ class Transcript:
         return html   
     
     def get_md(self) -> str:
+        """Get transcript as Markdown string, using HTML formatting.
+
+        Returns:
+            str: Transcript as a Markdown string.
+        """
         return self.get_html()
     
     def get_tex(self) -> str:
-        
+        """Get transcript as LaTeX string. If no annotations are present, the speakers will
+        be annotated with the first letters of the alphabet.
+
+        Returns:
+            str: Transcript as LaTeX string.
+        """
         if not self.annotation:
 
             self.annotate(*ALPHABET[:len(self.speakers)])
@@ -153,20 +181,30 @@ class Transcript:
         
             
     def to_json(self,path, *args, **kwargs) -> None:
-        """
-        Save transcript as json file
-        :param path: path to save file
-        :type path: str
+        """Save transcript as json file
+        
+        Args:
+            path (str): path to save file
         """
         with open(path, "w") as f:
             json.dump(self.transcript, f, *args, **kwargs)
     
     def to_txt(self, path: str) -> None:
+        """Save transcript as a plain-text file.
+
+        Args:
+            path (str): Path to save the text file.
+        """
         
-       with open(path, "w") as f:
+        with open(path, "w") as f:
             f.write(self.__str__())
     
     def to_md(self, path: str) -> None:
+        """Save transcript as a Markdown file, using HTML formatting.
+
+        Args:
+            path (str): Path to save the Markdown file.
+        """
         return self.to_html(path)
     
     def to_html(self, path: str) -> None:
@@ -181,19 +219,37 @@ class Transcript:
             file.write(self.get_html())
     
     def to_tex(self, path: str) -> None:
+        """Save transcript as a LaTeX file (placeholder function, implementation needed).
+
+        Args:
+            path (str): Path to save the LaTeX file.
+        """
         pass
     
     def to_pdf(self, path: str) -> None:
+        """Save transcript as a PDF file (placeholder function, implementation needed).
+
+        Args:
+            path (str): Path to save the PDF file.
+        """
         pass
     
     def save(self, path: str, *args, **kwargs) -> None:
-        """
-        Save transcript to file with given path and file format
+        """Save transcript to file with the given path and file format.
 
-        :param path: path to save file
-        :type path: str
-        :raises ValueError: if file format is unknown
+        This method can save the transcript in various formats including JSON, TXT,
+        MD, HTML, TEX, and PDF. The file format is determined by the extension of
+        the path.
+
+        Args:
+            path (str): Path to save the file, including the desired file extension.
+            *args: Additional positional arguments to be passed to the specific save methods.
+            **kwargs: Additional keyword arguments to be passed to the specific save methods.
+
+        Raises:
+            ValueError: If the file format specified in the path is unknown.
         """
+        
         if path.endswith(".json"):
             self.to_json(path, *args, **kwargs)
         elif path.endswith(".txt"):
@@ -208,12 +264,5 @@ class Transcript:
             self.to_pdf(path, *args, **kwargs)
         else:
             raise ValueError("Unknown file format")
-    
-if __name__ == "__main__":
-    test = Transcript(json.load(open("tests/test.json", "r")))
-    print(repr(test))
-    print(test)
-    
-    
-    
+
     
\ No newline at end of file

From e331fe98f32b55c1d2d9934198ccd98ddcd5d32f Mon Sep 17 00:00:00 2001
From: Jaikinator <schmieder.jacob@web.de>
Date: Thu, 24 Aug 2023 16:12:28 +0200
Subject: [PATCH 86/86] final codebase rework

---
 autotranscript/autotranscript.py      | 28 ++++++++++--
 autotranscript/transcript_exporter.py | 12 ++---
 autotranscript/version.py             |  2 +-
 gradio_app.py                         | 65 +++++++++++++++++++++++++++
 requirements.txt                      |  4 --
 transcribe.py                         | 34 +++++++++++++-
 6 files changed, 128 insertions(+), 17 deletions(-)
 create mode 100644 gradio_app.py

diff --git a/autotranscript/autotranscript.py b/autotranscript/autotranscript.py
index 612f9e5..e053d6a 100644
--- a/autotranscript/autotranscript.py
+++ b/autotranscript/autotranscript.py
@@ -125,6 +125,17 @@ class AutoTranscribe:
         
         diarisation = self.diariser.diarization(dia_audio, **kwargs)
         
+        if not diarisation["segments"]:
+            warn("No segments found. Try to run transcription without diarisation.")
+            transcript = self.transcriber.transcribe(audio_file.waveform, **kwargs)
+            
+            final_transcript = {0: {"speakers" : "speaker01",  # one entry keyed by segment id
+                                    "segments" : [0, len(audio_file.waveform)],
+                                    "text" : transcript}}
+            
+            return Transcript(final_transcript)
+            
+        
         print("Diarisation finished. Starting transcription.")
         
         audio_file.sr = torch.Tensor([audio_file.sr]).to(audio_file.waveform.device)
@@ -140,8 +151,8 @@ class AutoTranscribe:
             
             transcript = self.transcriber.transcribe(audio, **kwargs)
             
-            final_transcript[i] = {"speaker" : diarisation["speakers"][i],
-                                   "segment" : seg,
+            final_transcript[i] = {"speakers" : diarisation["speakers"][i],
+                                   "segments" : seg,
                                    "text" : transcript}
         
         # Remove original file if needed 
@@ -233,6 +244,7 @@ def cli():
     from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
     from .transcriber import WHISPER_DEFAULT_PATH
     from .diarisation import PYANNOTE_DEFAULT_PATH
+    
     def str2bool(string):
         str2val = {"True": True, "False": False}
         if string in str2val:
@@ -242,9 +254,12 @@ def cli():
 
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 
-    parser.add_argument("audio_files", nargs="+", type=str,
+    parser.add_argument("-f","--audio_files", nargs="+", type=str,
                         help="List of audio files to transcribe.")
-
+    
+    parser.add_argument('--start_server', action='store_true',
+                        help='Start the Gradio app.')
+    
     parser.add_argument("--whisper_model_name", default="medium",
                         help="Name of the Whisper model to use.")
 
@@ -299,6 +314,7 @@ def cli():
     audio_files = args.audio_files
     spoken_language = args.spoken_language
     output_format = args.output_format
+    start_server = args.start_server
 
     os.makedirs(output_directory, exist_ok=True)
 
@@ -335,6 +351,10 @@ def cli():
     elif transcription_task == "wtranscribe":
         # wtranscribe code here
         pass
+    
+    if start_server:
+        from .gradio_app import gradio_app
+        gradio_app(model)
 
 if __name__ == "__main__":
     cli()
\ No newline at end of file
diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py
index 42f2680..9262be6 100644
--- a/autotranscript/transcript_exporter.py
+++ b/autotranscript/transcript_exporter.py
@@ -65,7 +65,7 @@ class Transcript:
             list: List of unique speaker names in the transcript.
         """
         
-        return list(set([self.transcript[id]["speaker"] for id in self.transcript]))
+        return list(set([self.transcript[id]["speakers"] for id in self.transcript]))
     
     def _extract_segments(self) -> list:
         """
@@ -75,7 +75,7 @@ class Transcript:
             list: List of segments, where each segment is represented
                     by the starting and ending times.
         """
-        return [self.transcript[id]["segment"] for id in self.transcript]
+        return [self.transcript[id]["segments"] for id in self.transcript]
 
     def __str__(self) -> str:
         """
@@ -91,11 +91,11 @@ class Transcript:
             seq = self.transcript[_id]
             
             if self.annotation:
-                speaker = self.annotation[seq["speaker"]]
+                speaker = self.annotation[seq["speakers"]]
             else:
-                speaker = seq["speaker"]
+                speaker = seq["speakers"]
             
-            segm = seq["segment"]
+            segm = seq["segments"]
             sseg = time.strftime("%H:%M:%S",time.gmtime(segm[0]))
             eseg = time.strftime("%H:%M:%S",time.gmtime(segm[1]))
             
@@ -172,7 +172,7 @@ class Transcript:
         
         for id in self.transcript:
             seq = self.transcript[id]
-            speaker = self.annotation[seq["speaker"]]
+            speaker = self.annotation[seq["speakers"]]
             fstring += f"\n\\{speaker}speaks:\n{seq['text']}"
         
         fstring += "\n\\end{drama}"
diff --git a/autotranscript/version.py b/autotranscript/version.py
index 5bc7ffc..0a3730e 100644
--- a/autotranscript/version.py
+++ b/autotranscript/version.py
@@ -2,7 +2,7 @@ import os
 import subprocess as sp
 
 MAJOR = 0
-MINOR = 2
+MINOR = 1
 MICRO = 0
 MICRO_POST = 0
 ISRELEASED = False
diff --git a/gradio_app.py b/gradio_app.py
new file mode 100644
index 0000000..321f8bc
--- /dev/null
+++ b/gradio_app.py
@@ -0,0 +1,65 @@
+from autotranscript import AutoTranscribe
+import gradio as gr
+
+LANGUAGES = [
+    "Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
+    "Bosnian", "Bulgarian", "Catalan", "Chinese", "Croatian",
+    "Czech", "Danish", "Dutch", "English", "Estonian",
+    "Finnish", "French", "Galician", "German", "Greek",
+    "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian",
+    "Italian", "Japanese", "Kannada", "Kazakh", "Korean",
+    "Latvian", "Lithuanian", "Macedonian", "Malay", "Marathi",
+    "Maori", "Nepali", "Norwegian", "Persian", "Polish",
+    "Portuguese", "Romanian", "Russian", "Serbian", "Slovak",
+    "Slovenian", "Spanish", "Swahili", "Swedish", "Tagalog",
+    "Tamil", "Thai", "Turkish", "Ukrainian", "Urdu",
+    "Vietnamese", "Welsh"
+]
+
+
+def gradio_server(model : AutoTranscribe):
+
+    def transcribe(audio, microphone, number_of_speakers, language):
+        kwargs = {}
+        if number_of_speakers != 0:
+            kwargs["num_speakers"] = number_of_speakers
+        if language != "None":
+            kwargs["language"] = language
+        
+        if audio is not None:
+            out = model.transcribe(audio, **kwargs)
+        elif microphone is not None:
+            out = model.transcribe(microphone , **kwargs)
+        else:
+            out = "Please upload an audio file or record one."
+        
+        
+        return str(out)
+
+    gr.Interface(
+        fn=transcribe, 
+        inputs=[
+            gr.Audio(source= "upload", type="filepath", label="Upload Your Audio File", interactive=True),
+            gr.Audio(source= "microphone", type="filepath", label="Record Your Audio", interactive=True),
+            gr.Number(value=0, label= "Number of speakers", 
+                      info = "Number of speakers in the audio file. If you don't know, leave it at 0."), 
+            # gr.Number(value=0, label= "Minimal number of speakers", 
+            #           info = "Minimal number of speakers in the audio file. If you don't know or you have specified Numspeakers, leave it at 0."),
+            gr.Dropdown(LANGUAGES,
+                        label="Languages", default="None",
+                        info="Language of the audio file. If you don't know, leave it at None.")
+        ],
+        outputs=[
+            "text"
+        ],
+        title="Audio Transcription",
+        thumbnail = "Logo_KIDA.png",
+        description="Upload an audio file to transcribe its content. Powered by AutoTranscribe!",
+        theme="soft",       # Example of a more modern theme
+    ).launch(share=True)
+    
+    
+if __name__ == "__main__":
+    
+    model = AutoTranscribe()
+    gradio_server(model)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 433b3c1..b81b23c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,10 +9,6 @@ pyannote.pipeline~=2.3
 setuptools~=65.6.3
 setuptools-rust~=1.5.2
 
-torch~=1.11.0
-torchaudio~=0.11.0
-torchmetrics~=0.11.0
-torchvision~=0.12.0
 tqdm>=4.65.0
 
 #optional: 
diff --git a/transcribe.py b/transcribe.py
index fca2532..73d8838 100644
--- a/transcribe.py
+++ b/transcribe.py
@@ -1,8 +1,38 @@
-from autotranscript.autotranscript import AutoTranscribe
+# import os
+# import sys
+# import traceback
+
+# class TracePrints(object):
+#   def __init__(self):    
+#     self.stdout = sys.stdout
+#   def write(self, s):
+#     self.stdout.write("Writing %r\n" % s)
+#     traceback.print_stack(file=self.stdout)
+
+# sys.stdout = TracePrints()
+
+# os.environ["PYANNOTE_CACHE"] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models/pyannote")
+# import os
+ 
+# os.environ['TRANSFORMERS_CACHE'] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models")
+# os.environ['HF_HOME'] = os.path.expanduser("~/PycharmProjects/autotranscript/autotranscript/models")
+
+
+from autotranscript import AutoTranscribe
 
 model = AutoTranscribe()
 
-text = model.transcribe("tests/test.wav")
+text = model.transcribe("test.mp4")
 
 print("Transcription:\n")
 print(text)
+
+
+# from autotranscript.misc import *
+# import os
+
+# print(os.path.exists(CACHE_DIR))
+# print(os.path.exists(WHISPER_DEFAULT_PATH))
+# print(os.path.exists(PYANNOTE_DEFAULT_PATH))
+
+# print(os.path.exists(PYANNOTE_DEFAULT_CONFIG))