added check for already transcribed files

This commit is contained in:
Jaikinator
2022-12-21 18:17:26 +01:00
parent d3c93e2356
commit e91fcccf17
+21 -2
View File
@@ -1,6 +1,6 @@
import whisper
from time import time
from time import time, sleep
import os
from typing import Union
@@ -32,6 +32,8 @@ class Transcribe:
self.model = whisper.load_model(model) # load model
print("model loaded")
def create_folder_structure(self):
"""
Create folder structure for audio and transcription files
@@ -53,7 +55,18 @@ class Transcribe:
audiofiles = os.listdir(audiopath) # list of audio files
return currentpath, audiopath, transcriptionpath, audiofiles
def check_if_allready_transcribed(self, filename):
    """
    Check whether a transcription already exists for a single audio file.

    The transcript name looked for is the audio file's base name with its
    extension replaced by ``.txt``; it is searched for among the entries of
    ``self.transcriptionpath``.

    :param filename: audio file name or path
    :return: True if a matching ``.txt`` entry exists in the transcription
             folder, otherwise False
    """
    # os.path copes with the platform's path separators and with extensions
    # of any length (e.g. ``.flac``, ``.webm``); slicing off the last four
    # characters only worked for three-letter extensions.
    stem = os.path.splitext(os.path.basename(filename))[0]
    if f'{stem}.txt' in os.listdir(self.transcriptionpath):
        print(f'File {stem} already transcribed')
        return True
    return False
def to_mp3(self,file, remove_orginal=True):
"""
Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
@@ -79,6 +92,8 @@ class Transcribe:
else:
raise ValueError('Audio file not found')
if not self.check_if_allready_transcribed(self.audiofile):
if not audiofile.endswith('.mp3'):
print('Converting video to audio')
audiofile = self.to_mp3(audiofile)
@@ -95,6 +110,7 @@ class Transcribe:
with open(savepath, 'w') as f:
f.write(result["text"])
elif self.audiofile is None or isinstance(self.audiofile, list):
print('No audio file specified or list of audio files')
print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
@@ -104,6 +120,8 @@ class Transcribe:
audiofile = os.path.join(self.audiopath, audiofile)
if not self.check_if_allready_transcribed(audiofile):
if not audiofile.endswith('.mp3'):
audiofile = self.to_mp3(audiofile)
@@ -133,3 +151,4 @@ class Transcribe:
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
def __str__(self):
    """Return a readable summary listing the audio file, model and language of this instance."""
    # Build each "name=value" part separately, then join them in the fixed order.
    parts = (
        f"audiofile={self.audiofile}",
        f"model={self.model}",
        f"language={self.language}",
    )
    return "Transcribe(" + ", ".join(parts) + ")"