Removed useless prints

Added tqdm progress bar for the transcription loop
Added recognition for multiple speakers
This commit is contained in:
Jaikinator
2023-04-05 20:04:29 +02:00
parent e12a4f4967
commit 30859b0f4e
+37 -24
View File
@@ -6,6 +6,7 @@ import glob
import re import re
import shutil import shutil
import sys import sys
from tqdm import tqdm
from typing import Union from typing import Union
from pydub import AudioSegment from pydub import AudioSegment
@@ -41,10 +42,10 @@ class AudioProcessor:
savename = self.coreaudiofile + f'.{type}' savename = self.coreaudiofile + f'.{type}'
else: else:
savename = savename + f'.{type}' savename = savename + f'.{type}'
print(savefolder, savename)
savepath = os.path.join(savefolder, savename)
self.audio_file.export(savepath, format=type) print(savefolder, savename)
savepath = os.path.join(savefolder, savename)
print(f'Converted {self.audiofilename} to {type}') print(f'Converted {self.audiofilename} to {type}')
@@ -118,12 +119,12 @@ class WhisperTranscription:
""" """
audiofilename = self.audio_file.split('/')[-1] audiofilename = self.audio_file.split('/')[-1]
print(f'Start transcribing Audio file: {audiofilename}') #print(f'Start transcribing Audio file: {audiofilename}')
_stime = time() _stime = time()
result = self.model.transcribe(self.audio_file, verbose=True, language=self.language) result = self.model.transcribe(self.audio_file, language=self.language)
print(f'Transcription finished in {time() - _stime} seconds') #print(f'Transcription finished in {time() - _stime} seconds')
self.transcript = result self.transcript = result
@@ -169,6 +170,7 @@ class Diarisation(AudioProcessor):
if "num_speakers" in kwargs: if "num_speakers" in kwargs:
num_speakers = kwargs['num_speakers'] num_speakers = kwargs['num_speakers']
kwargs.pop('num_speakers')
else: else:
num_speakers = 2 num_speakers = 2
@@ -210,12 +212,11 @@ class Diarisation(AudioProcessor):
current_speaker = str() current_speaker = str()
for i, speaker in enumerate(diarization_output["speakers"]): for i, speaker in enumerate(diarization_output["speakers"]):
print(i, speaker)
if i == 0: if i == 0:
current_speaker = speaker current_speaker = speaker
if speaker != current_speaker: if speaker != current_speaker:
print("Speaker change")
index_end_speaker = i - 1 index_end_speaker = i - 1
@@ -316,8 +317,7 @@ class AutoTranscribe:
""" """
if audiofile is None: if audiofile is None:
audiofile = os.listdir(audioinput) # get all audio files in audioinput folder audiofile = os.listdir(audioinput) # get all audio files in audioinput folder
for i in range(len(audiofile)): audiofile = [os.path.realpath(os.path.join(audioinput, file)) for file in audiofile]# add path to audio files
audiofile[i] = os.path.realpath(audiofile[i])
self.audiofile = audiofile self.audiofile = audiofile
self.language = language self.language = language
@@ -371,9 +371,12 @@ class AutoTranscribe:
if not audiofile.endswith('wav'): if not audiofile.endswith('wav'):
audio = audio.to_wav() audio = audio.to_wav()
self.audiofile = audio.audio_file_path self.audiofile = audio.audio_file_path
audiofile = audio.audio_file_path
if "speed" in kwargs: if "speed" in kwargs:
speed = kwargs['speed'] speed = kwargs['speed']
kwargs.pop('speed')
print('Creating slower version of the audio file with speed {}'.format(speed)) print('Creating slower version of the audio file with speed {}'.format(speed))
slower_audio = os.path.join(self.transcriptionpath, 'slower_version') slower_audio = os.path.join(self.transcriptionpath, 'slower_version')
if not os.path.exists(slower_audio): if not os.path.exists(slower_audio):
@@ -387,29 +390,39 @@ class AutoTranscribe:
else: else:
print("Start diarisation") print("Start diarisation")
dia = Diarisation(audiofile, self.diarisation_model) dia = Diarisation(audiofile, self.diarisation_model)
dia.diarization()
temppath = dia.create_temporary_wav()
for file in sorted(os.listdir(temppath)): if 'num_speakers' in kwargs:
print(file ) num_speakers = kwargs['num_speakers']
fstring = "\\begin{drama}" \ kwargs.pop('num_speakers')
"\n\t\Character{F}{Frage}" \ dia.diarization(num_speakers=num_speakers)
"\n\t\Character{A1}{Antwort}\n" \ else:
dia.diarization()
temppath = dia.create_temporary_wav()
temppath_dict, _ = dia.format_diarization_output()
speakers = list(set(temppath_dict["speakers"]))
fstring = "\\begin{drama}"
for speaker in speakers:
speaker = speaker.replace("SPEAKER_", "")
fstring += "\n\t\Character{S"+ str(speaker) + "}{S" + str(speaker) + "}"
files = glob.glob(temppath + "/*.wav") files = glob.glob(temppath + "/*.wav")
# Sort files according to the digits included in the filename # Sort files according to the digits included in the filename
files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0])) files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0]))
for file in files: for file in tqdm(files):
print("Start Whisper")
Whisper = WhisperTranscription(file, self.model, self.language).transcribe() Whisper = WhisperTranscription(file, self.model, self.language).transcribe()
if "SPEAKER_00" in file: for s in speakers:
fstring += f"\n\Fragespeaks: \n {Whisper}" if s in file:
s = s.replace("SPEAKER_", "")
elif "SPEAKER_01" in file: fstring += f"\n\S{s}speaks: \n {Whisper}"
fstring += f"\n\Antwortspeaks: \n {Whisper}"
fstring += "\n\end{drama}" fstring += "\n\end{drama}"