Implemented faster-whisper, removed WhisperX

2024-06-18 17:21:34 +02:00
parent 6783352ea3
commit ba058c3e02
6 changed files with 25 additions and 26 deletions
@@ -26,8 +26,7 @@ Usage:

 from whisper import Whisper
 from whisper import load_model as whisper_load_model
-from whisperx.asr import WhisperModel
-from whisperx import load_model as whisperx_load_model
+from faster_whisper import WhisperModel as FasterWhisperModel
 from typing import TypeVar, Union, Optional
 from torch import Tensor, device
 from torch.cuda import is_available as cuda_is_available
@@ -145,7 +144,7 @@ class Transcriber:
                        - 'large-v3'
                        - 'large'
            whisper_type (str):
-                                Type of whisper model to load. "whisper" or "whisperx".
+                                Type of whisper model to load. "whisper" or "faster-whisper".
            download_root (str, optional): Path to download the model.
                                            Defaults to WHISPER_DEFAULT_PATH.
            device (Optional[Union[str, torch.device]], optional): 
@@ -272,7 +271,7 @@ class WhisperTranscriber(Transcriber):
        return f"WhisperTranscriber(model_name={self.model_name}, model={self.model})"


-class WhisperXTranscriber(Transcriber):
+class FasterWhisperTranscriber(Transcriber):
    def __init__(self, model: whisper, model_name: str) -> None:
        super().__init__(model, model_name)

@@ -294,10 +293,10 @@ class WhisperXTranscriber(Transcriber):

        if isinstance(audio, Tensor):
            audio = audio.cpu().numpy()
-        result = self.model.transcribe(audio, *args, **kwargs)
+        result, _ = self.model.transcribe(audio, *args, **kwargs)
        text = ""
-        for seg in result['segments']:
-            text += seg['text']
+        for seg in result:
+            text += seg.text
        return text

    @classmethod
@@ -306,7 +305,7 @@ class WhisperXTranscriber(Transcriber):
                   download_root: str = WHISPER_DEFAULT_PATH,
                   device: Optional[Union[str, device]] = None,
                   *args, **kwargs
-                   ) -> 'WhisperXTranscriber':
+                   ) -> 'FasterWhisperModel':
        """
        Load whisper model.

@@ -347,8 +346,8 @@ class WhisperXTranscriber(Transcriber):
            warnings.warn(f'Compute type {compute_type} not compatible with '
                          f'device {device}! Changing compute type to int8.')
            compute_type = 'int8'
-        _model = whisperx_load_model(model, download_root=download_root,
-                                     device=device, compute_type=compute_type)
+        _model = FasterWhisperModel(model, download_root=download_root,
+                                    device=device, compute_type=compute_type)

        return cls(_model, model_name=model)

@@ -361,7 +360,7 @@ class WhisperXTranscriber(Transcriber):
            dict: Keyword arguments for whisper model.
        """
        # _possible_kwargs = WhisperModel.transcribe.__code__.co_varnames
-        _possible_kwargs = signature(WhisperModel.transcribe).parameters.keys()
+        _possible_kwargs = signature(FasterWhisperModel.transcribe).parameters.keys()

        whisper_kwargs = {k: v for k,
                          v in kwargs.items() if k in _possible_kwargs}
@@ -375,7 +374,7 @@ class WhisperXTranscriber(Transcriber):
        return whisper_kwargs

    def __repr__(self) -> str:
-        return f"WhisperXTranscriber(model_name={self.model_name}, model={self.model})"
+        return f"FasterWhisperTranscriber(model_name={self.model_name}, model={self.model})"


 def load_transcriber(model: str = "medium",
@@ -384,7 +383,7 @@ def load_transcriber(model: str = "medium",
                     device: Optional[Union[str, device]] = None,
                     in_memory: bool = False,
                     *args, **kwargs
-                     ) -> Union[WhisperTranscriber, WhisperXTranscriber]:
+                     ) -> Union[WhisperTranscriber, FasterWhisperTranscriber]:
    """
    Load whisper model.

@@ -403,28 +402,28 @@ def load_transcriber(model: str = "medium",
                    - 'large-v3'
                    - 'large'
        whisper_type (str):
-                            Type of whisper model to load. "whisper" or "whisperx".
+                            Type of whisper model to load. "whisper" or "faster-whisper".
        download_root (str, optional): Path to download the model.
                                        Defaults to WHISPER_DEFAULT_PATH.
-        device (Optional[Union[str, torch.device]], optional): 
+        device (Optional[Union[str, torch.device]], optional):
                                    Device to load model on. Defaults to None.
-        in_memory (bool, optional): Whether to load model in memory. 
+        in_memory (bool, optional): Whether to load model in memory.
                                    Defaults to False.
        args: Additional arguments only to avoid errors.
        kwargs: Additional keyword arguments only to avoid errors.

    Returns:
-        Union[WhisperTranscriber, WhisperXTranscriber]: 
+        Union[WhisperTranscriber, FasterWhisperTranscriber]:
        One of the Whisper variants as Transcriber object initialized with the specified model.
    """
    if whisper_type.lower() == 'whisper':
        _model = WhisperTranscriber.load_model(
            model, download_root, device, in_memory, *args, **kwargs)
        return _model
-    elif whisper_type.lower() == 'whisperx':
-        _model = WhisperXTranscriber.load_model(
+    elif whisper_type.lower() == 'faster-whisper':
+        _model = FasterWhisperTranscriber.load_model(
            model, download_root, device, *args, **kwargs)
        return _model
    else:
        raise ValueError(f'Model type not recognized, exptected "whisper" '
-                         f'or "whisperx", got {whisper_type}.')
+                         f'or "faster-whisper", got {whisper_type}.')