added check for already transcribed files

2022-12-21 18:17:26 +01:00
parent d3c93e2356
commit e91fcccf17
1 changed files with 48 additions and 29 deletions
@@ -1,6 +1,6 @@
 import whisper
-from time import time
+from time import time, sleep
 import os
 from typing import Union
@@ -32,6 +32,8 @@ class Transcribe:
        self.model = whisper.load_model(model)  # load model
        print("model loaded")
    def create_folder_structure(self):
        """
        Create folder structure for audio and transcription files
@@ -53,7 +55,18 @@ class Transcribe:
        audiofiles = os.listdir(audiopath) # list of audio files
        return currentpath, audiopath, transcriptionpath, audiofiles
-
+    def check_if_allready_transcribed(self, filename):
        """
        Check if all audio files are already transcribed
        :param filename: audio file name
        :return: bool
        """
        purefilename = filename.split('/')[-1][:-4] + '.txt'
        if purefilename in os.listdir(self.transcriptionpath):
            print(f'File {purefilename[:-4]} already transcribed')
            return True
        else:
            return False
    def to_mp3(self,file,  remove_orginal=True):
        """
        Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
@@ -79,6 +92,8 @@ class Transcribe:
            else:
                raise ValueError('Audio file not found')
            if not self.check_if_allready_transcribed(self.audiofile):
                if not audiofile.endswith('.mp3'):
                    print('Converting video to audio')
                    audiofile = self.to_mp3(audiofile)
@@ -95,6 +110,7 @@ class Transcribe:
                with open(savepath, 'w') as f:
                    f.write(result["text"])
        elif self.audiofile is None or isinstance(self.audiofile, list):
            print('No audio file specified or list of audio files')
            print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
@@ -104,6 +120,8 @@ class Transcribe:
                audiofile = os.path.join(self.audiopath, audiofile)
                if not self.check_if_allready_transcribed(audiofile):
                    if not audiofile.endswith('.mp3'):
                        audiofile = self.to_mp3(audiofile)
@@ -133,3 +151,4 @@ class Transcribe:
        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
    def __str__(self):
        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"