From ba058c3e021f08d22cf4c1ed0701bcca3d6c9071 Mon Sep 17 00:00:00 2001
From: Marko Henning <marko.henning@posteo.de>
Date: Tue, 18 Jun 2024 17:21:34 +0200
Subject: [PATCH 1/6] Implemented faster-whisper, removed WhisperX

---
 pyproject.toml            |  2 +-
 requirements.txt          |  2 +-
 scraibe/autotranscript.py |  2 +-
 scraibe/cli.py            |  4 ++--
 scraibe/misc.py           |  2 +-
 scraibe/transcriber.py    | 39 +++++++++++++++++++--------------------
 6 files changed, 25 insertions(+), 26 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8c46bdb..caf02a2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ python = "^3.9"
 tqdm = "^4.66.4"
 numpy = "^1.26.4"
 openai-whisper = "^20231117"
-whisperx = "^3.1.3"
+faster-whisper = "^1.0.1"
 "pyannote.audio" = "^3.1.1"
 torch = "^2.3.0"
 
diff --git a/requirements.txt b/requirements.txt
index f08e2e6..94ee85a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ tqdm>=4.65.0
 numpy>=1.26.4
 
 openai-whisper==20231117
-whisperx~=3.1.3
+faster-whisper~=1.0.1
 
 pyannote.audio~=3.1.1
 pyannote.core~=5.0.0
diff --git a/scraibe/autotranscript.py b/scraibe/autotranscript.py
index 7391f1a..43dedc2 100644
--- a/scraibe/autotranscript.py
+++ b/scraibe/autotranscript.py
@@ -74,7 +74,7 @@ class Scraibe:
             whisper_model (Union[bool, str, whisper], optional): 
                                 Path to whisper model or whisper model itself.
             whisper_type (str):
-                                Type of whisper model to load. "whisper" or "whisperx".
+                                Type of whisper model to load. "whisper" or "faster-whisper".
             diarisation_model (Union[bool, str, DiarisationType], optional): 
                                 Path to pyannote diarization model or model itself.
             **kwargs: Additional keyword arguments for whisper
diff --git a/scraibe/cli.py b/scraibe/cli.py
index ee40c8b..a234132 100644
--- a/scraibe/cli.py
+++ b/scraibe/cli.py
@@ -36,8 +36,8 @@ def cli():
                         help="List of audio files to transcribe.")
 
     parser.add_argument("--whisper-type", type=str, default="whisper",
-                        choices=["whisper", "whisperx"],
-                        help="Type of Whisper model to use ('whisper' or 'whisperx').")
+                        choices=["whisper", "faster-whisper"],
+                        help="Type of Whisper model to use ('whisper' or 'faster-whisper').")
     
     parser.add_argument("--whisper-model-name", default="medium",
                         help="Name of the Whisper model to use.")
diff --git a/scraibe/misc.py b/scraibe/misc.py
index f12335f..56e9f3a 100644
--- a/scraibe/misc.py
+++ b/scraibe/misc.py
@@ -16,7 +16,7 @@ WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper")
 PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
 PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") \
     if os.path.exists(os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")) \
-    else ('jaikinator/scraibe', 'pyannote/speaker-diarization-3.1')
+    else ('Jaikinator/ScrAIbe', 'pyannote/speaker-diarization-3.1')
 
 
 def config_diarization_yaml(file_path: str, path_to_segmentation: str = None) -> None:
diff --git a/scraibe/transcriber.py b/scraibe/transcriber.py
index 0301955..cea7274 100644
--- a/scraibe/transcriber.py
+++ b/scraibe/transcriber.py
@@ -26,8 +26,7 @@ Usage:
 
 from whisper import Whisper
 from whisper import load_model as whisper_load_model
-from whisperx.asr import WhisperModel
-from whisperx import load_model as whisperx_load_model
+from faster_whisper import WhisperModel as FasterWhisperModel
 from typing import TypeVar, Union, Optional
 from torch import Tensor, device
 from torch.cuda import is_available as cuda_is_available
@@ -145,7 +144,7 @@ class Transcriber:
                         - 'large-v3'
                         - 'large'
             whisper_type (str):
-                                Type of whisper model to load. "whisper" or "whisperx".
+                                Type of whisper model to load. "whisper" or "faster-whisper".
             download_root (str, optional): Path to download the model.
                                             Defaults to WHISPER_DEFAULT_PATH.
             device (Optional[Union[str, torch.device]], optional): 
@@ -272,7 +271,7 @@ class WhisperTranscriber(Transcriber):
         return f"WhisperTranscriber(model_name={self.model_name}, model={self.model})"
 
 
-class WhisperXTranscriber(Transcriber):
+class FasterWhisperTranscriber(Transcriber):
     def __init__(self, model: whisper, model_name: str) -> None:
         super().__init__(model, model_name)
 
@@ -294,10 +293,10 @@ class WhisperXTranscriber(Transcriber):
 
         if isinstance(audio, Tensor):
             audio = audio.cpu().numpy()
-        result = self.model.transcribe(audio, *args, **kwargs)
+        result, _ = self.model.transcribe(audio, *args, **kwargs)
         text = ""
-        for seg in result['segments']:
-            text += seg['text']
+        for seg in result:
+            text += seg.text
         return text
 
     @classmethod
@@ -306,7 +305,7 @@ class WhisperXTranscriber(Transcriber):
                    download_root: str = WHISPER_DEFAULT_PATH,
                    device: Optional[Union[str, device]] = None,
                    *args, **kwargs
-                   ) -> 'WhisperXTranscriber':
+                   ) -> 'FasterWhisperTranscriber':
         """
         Load whisper model.
 
@@ -347,8 +346,8 @@ class WhisperXTranscriber(Transcriber):
             warnings.warn(f'Compute type {compute_type} not compatible with '
                           f'device {device}! Changing compute type to int8.')
             compute_type = 'int8'
-        _model = whisperx_load_model(model, download_root=download_root,
-                                     device=device, compute_type=compute_type)
+        _model = FasterWhisperModel(model, download_root=download_root,
+                                    device=device, compute_type=compute_type)
 
         return cls(_model, model_name=model)
 
@@ -361,7 +360,7 @@ class WhisperXTranscriber(Transcriber):
             dict: Keyword arguments for whisper model.
         """
         # _possible_kwargs = WhisperModel.transcribe.__code__.co_varnames
-        _possible_kwargs = signature(WhisperModel.transcribe).parameters.keys()
+        _possible_kwargs = signature(FasterWhisperModel.transcribe).parameters.keys()
 
         whisper_kwargs = {k: v for k,
                           v in kwargs.items() if k in _possible_kwargs}
@@ -375,7 +374,7 @@ class WhisperXTranscriber(Transcriber):
         return whisper_kwargs
 
     def __repr__(self) -> str:
-        return f"WhisperXTranscriber(model_name={self.model_name}, model={self.model})"
+        return f"FasterWhisperTranscriber(model_name={self.model_name}, model={self.model})"
 
 
 def load_transcriber(model: str = "medium",
@@ -384,7 +383,7 @@ def load_transcriber(model: str = "medium",
                      device: Optional[Union[str, device]] = None,
                      in_memory: bool = False,
                      *args, **kwargs
-                     ) -> Union[WhisperTranscriber, WhisperXTranscriber]:
+                     ) -> Union[WhisperTranscriber, FasterWhisperTranscriber]:
     """
     Load whisper model.
 
@@ -403,28 +402,28 @@ def load_transcriber(model: str = "medium",
                     - 'large-v3'
                     - 'large'
         whisper_type (str):
-                            Type of whisper model to load. "whisper" or "whisperx".
+                            Type of whisper model to load. "whisper" or "faster-whisper".
         download_root (str, optional): Path to download the model.
                                         Defaults to WHISPER_DEFAULT_PATH.
-        device (Optional[Union[str, torch.device]], optional): 
+        device (Optional[Union[str, torch.device]], optional):
                                     Device to load model on. Defaults to None.
-        in_memory (bool, optional): Whether to load model in memory. 
+        in_memory (bool, optional): Whether to load model in memory.
                                     Defaults to False.
         args: Additional arguments only to avoid errors.
         kwargs: Additional keyword arguments only to avoid errors.
 
     Returns:
-        Union[WhisperTranscriber, WhisperXTranscriber]: 
+        Union[WhisperTranscriber, FasterWhisperTranscriber]:
         One of the Whisper variants as Transcrbier object initialized with the specified model.
     """
     if whisper_type.lower() == 'whisper':
         _model = WhisperTranscriber.load_model(
             model, download_root, device, in_memory, *args, **kwargs)
         return _model
-    elif whisper_type.lower() == 'whisperx':
-        _model = WhisperXTranscriber.load_model(
+    elif whisper_type.lower() == 'faster-whisper':
+        _model = FasterWhisperTranscriber.load_model(
             model, download_root, device, *args, **kwargs)
         return _model
     else:
         raise ValueError(f'Model type not recognized, exptected "whisper" '
-                         f'or "whisperx", got {whisper_type}.')
+                         f'or "faster-whisper", got {whisper_type}.')

From 53e57a06d70263a08467ecea9063d44738b9c0c7 Mon Sep 17 00:00:00 2001
From: Marko Henning <marko.henning@posteo.de>
Date: Mon, 9 Sep 2024 12:25:14 +0200
Subject: [PATCH 2/6] Added tests for faster-whisper

---
 test/test_transcriber.py  | 18 +++++++++---------
 tests/test_diarization.py | 10 ++++++++++
 2 files changed, 19 insertions(+), 9 deletions(-)
 create mode 100644 tests/test_diarization.py

diff --git a/test/test_transcriber.py b/test/test_transcriber.py
index 31765f6..bd1e9f5 100644
--- a/test/test_transcriber.py
+++ b/test/test_transcriber.py
@@ -1,6 +1,6 @@
 import pytest
 from scraibe import (Transcriber, WhisperTranscriber,
-                     WhisperXTranscriber, load_transcriber)
+                     FasterWhisperTranscriber, load_transcriber)
 import torch
 
 
@@ -35,24 +35,24 @@ def whisper_instance():
 
 
 @pytest.fixture
-def whisperx_instance():
-    return load_transcriber('medium', whisper_type='whisperx')
+def faster_whisper_instance():
+    return load_transcriber('medium', whisper_type='faster-whisper')
 
 
 def test_whisper_base_initialization(whisper_instance):
     assert isinstance(whisper_instance, Transcriber)
 
 
-def test_whisperx_base_initialization(whisperx_instance):
-    assert isinstance(whisperx_instance, Transcriber)
+def test_faster_whisper_base_initialization(faster_whisper_instance):
+    assert isinstance(faster_whisper_instance, Transcriber)
 
 
 def test_whisper_transcriber_initialization(whisper_instance):
     assert isinstance(whisper_instance, WhisperTranscriber)
 
 
-def test_whisperx_transcriber_initialization(whisperx_instance):
-    assert isinstance(whisperx_instance, WhisperXTranscriber)
+def test_faster_whisper_transcriber_initialization(faster_whisper_instance):
+    assert isinstance(faster_whisper_instance, FasterWhisperTranscriber)
 
 
 def test_wrong_transcriber_initialization():
@@ -73,8 +73,8 @@ def test_whisper_transcribe(whisper_instance):
     assert isinstance(transcript, str)
 
 
-def test_whisperx_transcribe(whisperx_instance):
-    model = whisperx_instance
+def test_faster_whisper_transcribe(faster_whisper_instance):
+    model = faster_whisper_instance
     # mocker.patch.object(transcriber_instance.model, 'transcribe', return_value={'Hello, World !'} )
     transcript = model.transcribe('test/audio_test_2.mp4')
     assert isinstance(transcript, str)
diff --git a/tests/test_diarization.py b/tests/test_diarization.py
new file mode 100644
index 0000000..f9e81a5
--- /dev/null
+++ b/tests/test_diarization.py
@@ -0,0 +1,10 @@
+from os import environ
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).resolve().parents[1]  # tests/ -> repository root
+environ["AUTOT_CACHE"] = str(REPO_ROOT / "tests")  # set before scraibe is imported
+
+def test_faster_whisper_autotranscribe():
+    from scraibe import Scraibe  # deferred so the cache variable above is in place
+    scraibe = Scraibe(whisper_type="faster-whisper", whisper_model="tiny")
+    print(scraibe.autotranscribe(str(REPO_ROOT / "test" / "audio_test_1.mp4")))

From de9c81b3136652012cf92b345fe6b9621a670798 Mon Sep 17 00:00:00 2001
From: "Schmieder, Jacob" <jacob.schmieder@dbfz.de>
Date: Tue, 10 Sep 2024 09:01:59 +0000
Subject: [PATCH 3/6] added language to code support for faster whisper

---
 scraibe/transcriber.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/scraibe/transcriber.py b/scraibe/transcriber.py
index cea7274..abf1ace 100644
--- a/scraibe/transcriber.py
+++ b/scraibe/transcriber.py
@@ -26,7 +26,9 @@ Usage:
 
 from whisper import Whisper
 from whisper import load_model as whisper_load_model
+from whisper.tokenizer import TO_LANGUAGE_CODE
 from faster_whisper import WhisperModel as FasterWhisperModel
+from faster_whisper.tokenizer import _LANGUAGE_CODES as FASTER_WHISPER_LANGUAGE_CODES
 from typing import TypeVar, Union, Optional
 from torch import Tensor, device
 from torch.cuda import is_available as cuda_is_available
@@ -369,14 +371,44 @@ class FasterWhisperTranscriber(Transcriber):
             whisper_kwargs["task"] = task
 
         if (language := kwargs.get("language")):
+            language = FasterWhisperTranscriber.convert_to_language_code(language)
             whisper_kwargs["language"] = language
 
         return whisper_kwargs
 
+    @staticmethod
+    def convert_to_language_code(lang: str) -> str:
+        """
+        Convert a language name or language code to a language code.
+
+        Args:
+            lang (str): language as code or language name (case-insensitive)
+
+        Returns:
+            str: the language code for ``lang``
+        """
+        # Normalize before any lookup: both tables are lower-case, so an
+        # input such as "EN" would otherwise miss the code check below.
+        normalized = lang.lower()
+
+        # If the input is already in FASTER_WHISPER_LANGUAGE_CODES, return it directly
+        if normalized in FASTER_WHISPER_LANGUAGE_CODES:
+            return normalized
+
+        # Check if the language name is in the TO_LANGUAGE_CODE mapping
+        if normalized in TO_LANGUAGE_CODE:
+            return TO_LANGUAGE_CODE[normalized]
+
+        # If the language is not recognized, raise a ValueError with the available options
+        available_codes = ', '.join(FASTER_WHISPER_LANGUAGE_CODES)
+        raise ValueError(f"Language '{normalized}' is not a valid language code or name. "
+                        f"Available language codes are: {available_codes}.")
+
     def __repr__(self) -> str:
         return f"FasterWhisperTranscriber(model_name={self.model_name}, model={self.model})"
 
 
+
 def load_transcriber(model: str = "medium",
                      whisper_type: str = 'whisper',
                      download_root: str = WHISPER_DEFAULT_PATH,

From 5c0386edaca8e22d141b7eb9b94f60498cb7f7fe Mon Sep 17 00:00:00 2001
From: "Schmieder, Jacob" <jacob.schmieder@dbfz.de>
Date: Tue, 10 Sep 2024 15:07:57 +0000
Subject: [PATCH 4/6] define new Versions of pyannote and faster-whisper

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index caf02a2..2c346a3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,8 +34,8 @@ python = "^3.9"
 tqdm = "^4.66.4"
 numpy = "^1.26.4"
 openai-whisper = "^20231117"
-faster-whisper = "^1.0.1"
-"pyannote.audio" = "^3.1.1"
+faster-whisper = "^1.0.3"
+"pyannote.audio" = "^3.3.1"
 torch = "^2.3.0"
 
 [tool.poetry.group.dev.dependencies]

From 51bf211d27469735aef86fd8c5ff78ec492042d8 Mon Sep 17 00:00:00 2001
From: "Schmieder, Jacob" <jacob.schmieder@dbfz.de>
Date: Tue, 10 Sep 2024 15:09:35 +0000
Subject: [PATCH 5/6] updated deps

---
 requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 94ee85a..66d7857 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,9 +2,9 @@ tqdm>=4.65.0
 numpy>=1.26.4
 
 openai-whisper==20231117
-faster-whisper~=1.0.1
+faster-whisper~=1.0.3
 
-pyannote.audio~=3.1.1
+pyannote.audio~=3.3.1
 pyannote.core~=5.0.0
 pyannote.database~=5.0.1 
 pyannote.metrics~=3.2.1

From ab7b43ac489cef8967137b05162c7382d7247169 Mon Sep 17 00:00:00 2001
From: "Schmieder, Jacob" <jacob.schmieder@dbfz.de>
Date: Tue, 10 Sep 2024 15:22:18 +0000
Subject: [PATCH 6/6] set test whisper model to tiny

---
 test/test_transcriber.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/test_transcriber.py b/test/test_transcriber.py
index bd1e9f5..5bfe3cf 100644
--- a/test/test_transcriber.py
+++ b/test/test_transcriber.py
@@ -31,12 +31,12 @@ def test_transcriber(mock_load_model, audio_file, expected_transcription):
 
 @pytest.fixture
 def whisper_instance():
-    return load_transcriber('medium', whisper_type='whisper')
+    return load_transcriber('tiny', whisper_type='whisper')
 
 
 @pytest.fixture
 def faster_whisper_instance():
-    return load_transcriber('medium', whisper_type='faster-whisper')
+    return load_transcriber('tiny', whisper_type='faster-whisper')
 
 
 def test_whisper_base_initialization(whisper_instance):
@@ -57,7 +57,7 @@ def test_faster_whisper_transcriber_initialization(faster_whisper_instance):
 
 def test_wrong_transcriber_initialization():
     with pytest.raises(ValueError):
-        load_transcriber('medium', whisper_type='wrong_whisper')
+        load_transcriber('tiny', whisper_type='wrong_whisper')
 
 
 def test_get_whisper_kwargs():