removed useless prints

added tqdm progress bar for the Whisper transcription loop
added recognition for multiple speakers
This commit is contained in:
Jaikinator
2023-04-05 20:04:29 +02:00
parent e12a4f4967
commit 30859b0f4e
+37 -24
View File
@@ -6,6 +6,7 @@ import glob
import re
import shutil
import sys
from tqdm import tqdm
from typing import Union
from pydub import AudioSegment
@@ -41,10 +42,10 @@ class AudioProcessor:
savename = self.coreaudiofile + f'.{type}'
else:
savename = savename + f'.{type}'
print(savefolder, savename)
savepath = os.path.join(savefolder, savename)
self.audio_file.export(savepath, format=type)
print(savefolder, savename)
savepath = os.path.join(savefolder, savename)
print(f'Converted {self.audiofilename} to {type}')
@@ -118,12 +119,12 @@ class WhisperTranscription:
"""
audiofilename = self.audio_file.split('/')[-1]
print(f'Start transcribing Audio file: {audiofilename}')
#print(f'Start transcribing Audio file: {audiofilename}')
_stime = time()
result = self.model.transcribe(self.audio_file, verbose=True, language=self.language)
result = self.model.transcribe(self.audio_file, language=self.language)
print(f'Transcription finished in {time() - _stime} seconds')
#print(f'Transcription finished in {time() - _stime} seconds')
self.transcript = result
@@ -169,6 +170,7 @@ class Diarisation(AudioProcessor):
if "num_speakers" in kwargs:
num_speakers = kwargs['num_speakers']
kwargs.pop('num_speakers')
else:
num_speakers = 2
@@ -210,12 +212,11 @@ class Diarisation(AudioProcessor):
current_speaker = str()
for i, speaker in enumerate(diarization_output["speakers"]):
print(i, speaker)
if i == 0:
current_speaker = speaker
if speaker != current_speaker:
print("Speaker change")
index_end_speaker = i - 1
@@ -316,8 +317,7 @@ class AutoTranscribe:
"""
if audiofile is None:
audiofile = os.listdir(audioinput) # get all audio files in audioinput folder
for i in range(len(audiofile)):
audiofile[i] = os.path.realpath(audiofile[i])
audiofile = [os.path.realpath(os.path.join(audioinput, file)) for file in audiofile]# add path to audio files
self.audiofile = audiofile
self.language = language
@@ -371,9 +371,12 @@ class AutoTranscribe:
if not audiofile.endswith('wav'):
audio = audio.to_wav()
self.audiofile = audio.audio_file_path
audiofile = audio.audio_file_path
if "speed" in kwargs:
speed = kwargs['speed']
kwargs.pop('speed')
print('Creating slower version of the audio file with speed {}'.format(speed))
slower_audio = os.path.join(self.transcriptionpath, 'slower_version')
if not os.path.exists(slower_audio):
@@ -387,29 +390,39 @@ class AutoTranscribe:
else:
print("Start diarisation")
dia = Diarisation(audiofile, self.diarisation_model)
dia.diarization()
temppath = dia.create_temporary_wav()
for file in sorted(os.listdir(temppath)):
print(file )
fstring = "\\begin{drama}" \
"\n\t\Character{F}{Frage}" \
"\n\t\Character{A1}{Antwort}\n" \
if 'num_speakers' in kwargs:
num_speakers = kwargs['num_speakers']
kwargs.pop('num_speakers')
dia.diarization(num_speakers=num_speakers)
else:
dia.diarization()
temppath = dia.create_temporary_wav()
temppath_dict, _ = dia.format_diarization_output()
speakers = list(set(temppath_dict["speakers"]))
fstring = "\\begin{drama}"
for speaker in speakers:
speaker = speaker.replace("SPEAKER_", "")
fstring += "\n\t\Character{S"+ str(speaker) + "}{S" + str(speaker) + "}"
files = glob.glob(temppath + "/*.wav")
# Sort files according to the digits included in the filename
files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0]))
for file in files:
print("Start Whisper")
for file in tqdm(files):
Whisper = WhisperTranscription(file, self.model, self.language).transcribe()
if "SPEAKER_00" in file:
fstring += f"\n\Fragespeaks: \n {Whisper}"
elif "SPEAKER_01" in file:
fstring += f"\n\Antwortspeaks: \n {Whisper}"
for s in speakers:
if s in file:
s = s.replace("SPEAKER_", "")
fstring += f"\n\S{s}speaks: \n {Whisper}"
fstring += "\n\end{drama}"