added check for already transcribed files

2022-12-21 18:17:26 +01:00
parent d3c93e2356
commit e91fcccf17
1 changed files with 48 additions and 29 deletions
@@ -1,6 +1,6 @@

 import whisper
-from time import time
+from time import time, sleep
 import os

 from typing import Union
@@ -32,6 +32,8 @@ class Transcribe:
        self.model = whisper.load_model(model)  # load model
        print("model loaded")

+
+
    def create_folder_structure(self):
        """
        Create folder structure for audio and transcription files
@@ -53,7 +55,18 @@ class Transcribe:
        audiofiles = os.listdir(audiopath) # list of audio files

        return currentpath, audiopath, transcriptionpath, audiofiles
-
+    def check_if_allready_transcribed(self, filename):
+        """
+        Check whether the given audio file already has a matching .txt transcript in the transcription folder
+        :param filename: audio file name
+        :return: bool
+        """
+        purefilename = filename.split('/')[-1][:-4] + '.txt'
+        if purefilename in os.listdir(self.transcriptionpath):
+            print(f'File {purefilename[:-4]} already transcribed')
+            return True
+        else:
+            return False
    def to_mp3(self,file,  remove_orginal=True):
        """
        Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
@@ -79,6 +92,8 @@ class Transcribe:
            else:
                raise ValueError('Audio file not found')

+            if not self.check_if_allready_transcribed(self.audiofile):
+
                if not audiofile.endswith('.mp3'):
                    print('Converting video to audio')
                    audiofile = self.to_mp3(audiofile)
@@ -95,6 +110,7 @@ class Transcribe:

                with open(savepath, 'w') as f:
                    f.write(result["text"])
+
        elif self.audiofile is None or isinstance(self.audiofile, list):
            print('No audio file specified or list of audio files')
            print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
@@ -104,6 +120,8 @@ class Transcribe:

                audiofile = os.path.join(self.audiopath, audiofile)

+                if not self.check_if_allready_transcribed(audiofile):
+
                    if not audiofile.endswith('.mp3'):
                        audiofile = self.to_mp3(audiofile)

@@ -133,3 +151,4 @@ class Transcribe:
        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
    def __str__(self):
        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
+