added check for already transcribed files
This commit is contained in:
@@ -1,6 +1,6 @@
|
|||||||
|
|
||||||
import whisper
|
import whisper
|
||||||
from time import time
|
from time import time, sleep
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from typing import Union
|
from typing import Union
|
||||||
@@ -32,6 +32,8 @@ class Transcribe:
|
|||||||
self.model = whisper.load_model(model) # load model
|
self.model = whisper.load_model(model) # load model
|
||||||
print("model loaded")
|
print("model loaded")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def create_folder_structure(self):
|
def create_folder_structure(self):
|
||||||
"""
|
"""
|
||||||
Create folder structure for audio and transcription files
|
Create folder structure for audio and transcription files
|
||||||
@@ -53,7 +55,18 @@ class Transcribe:
|
|||||||
audiofiles = os.listdir(audiopath) # list of audio files
|
audiofiles = os.listdir(audiopath) # list of audio files
|
||||||
|
|
||||||
return currentpath, audiopath, transcriptionpath, audiofiles
|
return currentpath, audiopath, transcriptionpath, audiofiles
|
||||||
|
def check_if_allready_transcribed(self, filename):
    """
    Check whether a transcription already exists for a single audio file.

    The expected transcription name is the audio file's base name with its
    extension replaced by ``.txt``; it is looked up in
    ``self.transcriptionpath``.

    :param filename: audio file name or path
    :return: True if a matching ``.txt`` file is present, otherwise False
    """
    # splitext copes with extensions of any length (.flac, .webm, ...),
    # where slicing off the last four characters only worked for
    # three-letter extensions. basename also handles the platform's own
    # path separator instead of only '/'.
    stem = os.path.splitext(os.path.basename(filename))[0]
    purefilename = stem + '.txt'

    if purefilename in os.listdir(self.transcriptionpath):
        print(f'File {stem} already transcribed')
        return True
    return False
|
||||||
def to_mp3(self,file, remove_orginal=True):
|
def to_mp3(self,file, remove_orginal=True):
|
||||||
"""
|
"""
|
||||||
Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
|
Convert video file or other audio files to mp3 file, ensures that the audio file is in the correct format for the
|
||||||
@@ -79,6 +92,8 @@ class Transcribe:
|
|||||||
else:
|
else:
|
||||||
raise ValueError('Audio file not found')
|
raise ValueError('Audio file not found')
|
||||||
|
|
||||||
|
if not self.check_if_allready_transcribed(self.audiofile):
|
||||||
|
|
||||||
if not audiofile.endswith('.mp3'):
|
if not audiofile.endswith('.mp3'):
|
||||||
print('Converting video to audio')
|
print('Converting video to audio')
|
||||||
audiofile = self.to_mp3(audiofile)
|
audiofile = self.to_mp3(audiofile)
|
||||||
@@ -95,6 +110,7 @@ class Transcribe:
|
|||||||
|
|
||||||
with open(savepath, 'w') as f:
|
with open(savepath, 'w') as f:
|
||||||
f.write(result["text"])
|
f.write(result["text"])
|
||||||
|
|
||||||
elif self.audiofile is None or isinstance(self.audiofile, list):
|
elif self.audiofile is None or isinstance(self.audiofile, list):
|
||||||
print('No audio file specified or list of audio files')
|
print('No audio file specified or list of audio files')
|
||||||
print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
|
print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
|
||||||
@@ -104,6 +120,8 @@ class Transcribe:
|
|||||||
|
|
||||||
audiofile = os.path.join(self.audiopath, audiofile)
|
audiofile = os.path.join(self.audiopath, audiofile)
|
||||||
|
|
||||||
|
if not self.check_if_allready_transcribed(audiofile):
|
||||||
|
|
||||||
if not audiofile.endswith('.mp3'):
|
if not audiofile.endswith('.mp3'):
|
||||||
audiofile = self.to_mp3(audiofile)
|
audiofile = self.to_mp3(audiofile)
|
||||||
|
|
||||||
@@ -133,3 +151,4 @@ class Transcribe:
|
|||||||
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
|
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
|
return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user