added check for already transcribed files

This commit is contained in:
Jaikinator
2022-12-21 18:17:26 +01:00
parent d3c93e2356
commit e91fcccf17
+21 -2
View File
@@ -1,6 +1,6 @@
import whisper import whisper
from time import time from time import time, sleep
import os import os
from typing import Union from typing import Union
@@ -32,6 +32,8 @@ class Transcribe:
self.model = whisper.load_model(model) # load model self.model = whisper.load_model(model) # load model
print("model loaded") print("model loaded")
def create_folder_structure(self): def create_folder_structure(self):
""" """
Create folder structure for audio and transcription files Create folder structure for audio and transcription files
@@ -53,7 +55,18 @@ class Transcribe:
audiofiles = os.listdir(audiopath) # list of audio files audiofiles = os.listdir(audiopath) # list of audio files
return currentpath, audiopath, transcriptionpath, audiofiles return currentpath, audiopath, transcriptionpath, audiofiles
def check_if_allready_transcribed(self, filename):
    """
    Check whether a transcript already exists for a single audio/video file.

    Looks in ``self.transcriptionpath`` for a file named after ``filename``
    with its extension replaced by ``.txt``.

    :param filename: audio file name or path
    :return: bool, True if the matching ``.txt`` file exists, else False
    """
    # Strip the directory and the real extension instead of slicing a fixed
    # four characters: '[:-4]' only works for three-letter extensions, so
    # 'song.flac' or 'clip.webm' were never matched against 'song.txt'.
    stem = os.path.splitext(os.path.basename(filename))[0]
    transcript = os.path.join(self.transcriptionpath, stem + '.txt')
    # isfile() is False when the transcript folder does not exist, so a
    # missing folder means "not transcribed" rather than an exception.
    if os.path.isfile(transcript):
        print(f'File {stem} already transcribed')
        return True
    return False
def to_mp3(self,file, remove_orginal=True): def to_mp3(self,file, remove_orginal=True):
""" """
Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
@@ -79,6 +92,8 @@ class Transcribe:
else: else:
raise ValueError('Audio file not found') raise ValueError('Audio file not found')
if not self.check_if_allready_transcribed(self.audiofile):
if not audiofile.endswith('.mp3'): if not audiofile.endswith('.mp3'):
print('Converting video to audio') print('Converting video to audio')
audiofile = self.to_mp3(audiofile) audiofile = self.to_mp3(audiofile)
@@ -95,6 +110,7 @@ class Transcribe:
with open(savepath, 'w') as f: with open(savepath, 'w') as f:
f.write(result["text"]) f.write(result["text"])
elif self.audiofile is None or isinstance(self.audiofile, list): elif self.audiofile is None or isinstance(self.audiofile, list):
print('No audio file specified or list of audio files') print('No audio file specified or list of audio files')
print(f"{len(self.audiofiles)} audio files found in {self.audiopath}") print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
@@ -104,6 +120,8 @@ class Transcribe:
audiofile = os.path.join(self.audiopath, audiofile) audiofile = os.path.join(self.audiopath, audiofile)
if not self.check_if_allready_transcribed(audiofile):
if not audiofile.endswith('.mp3'): if not audiofile.endswith('.mp3'):
audiofile = self.to_mp3(audiofile) audiofile = self.to_mp3(audiofile)
@@ -133,3 +151,4 @@ class Transcribe:
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})" return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
def __str__(self): def __str__(self):
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})" return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"