removed useless prints

added tqdm progress bar; added recognition for multiple speakers
2023-04-05 20:04:29 +02:00
parent e12a4f4967
commit 30859b0f4e
1 changed files with 37 additions and 24 deletions
@@ -6,6 +6,7 @@ import glob
 import re
 import shutil
 import sys
+from tqdm import tqdm

 from typing import Union
 from pydub import AudioSegment
@@ -41,10 +42,10 @@ class AudioProcessor:
            savename = self.coreaudiofile + f'.{type}'
        else:
            savename = savename + f'.{type}'
-        print(savefolder, savename)
-        savepath = os.path.join(savefolder, savename)

-        self.audio_file.export(savepath, format=type)
+        print(savefolder, savename)
+
+        savepath = os.path.join(savefolder, savename)

        print(f'Converted {self.audiofilename} to {type}')

@@ -118,12 +119,12 @@ class WhisperTranscription:
        """

        audiofilename = self.audio_file.split('/')[-1]
-        print(f'Start transcribing Audio file: {audiofilename}')
+        #print(f'Start transcribing Audio file: {audiofilename}')

        _stime = time()
-        result = self.model.transcribe(self.audio_file, verbose=True, language=self.language)
+        result = self.model.transcribe(self.audio_file, language=self.language)

-        print(f'Transcription finished in {time() - _stime} seconds')
+        #print(f'Transcription finished in {time() - _stime} seconds')

        self.transcript = result

@@ -169,6 +170,7 @@ class Diarisation(AudioProcessor):

        if "num_speakers" in kwargs:
            num_speakers = kwargs['num_speakers']
+            kwargs.pop('num_speakers')
        else:
            num_speakers = 2

@@ -210,12 +212,11 @@ class Diarisation(AudioProcessor):
        current_speaker = str()

        for i, speaker in enumerate(diarization_output["speakers"]):
-            print(i, speaker)
+
            if i == 0:
                current_speaker = speaker

            if speaker != current_speaker:
-                print("Speaker change")

                index_end_speaker = i - 1

@@ -316,8 +317,7 @@ class AutoTranscribe:
        """
        if audiofile is None:
            audiofile = os.listdir(audioinput) # get all audio files in audioinput folder
-            for i in range(len(audiofile)):
-                audiofile[i] =  os.path.realpath(audiofile[i])
+            audiofile = [os.path.realpath(os.path.join(audioinput, file)) for file in audiofile]# add path to audio files

        self.audiofile = audiofile
        self.language = language
@@ -371,9 +371,12 @@ class AutoTranscribe:
                if not audiofile.endswith('wav'):
                    audio = audio.to_wav()
                    self.audiofile = audio.audio_file_path
+                    audiofile = audio.audio_file_path

                if "speed" in kwargs:
                    speed = kwargs['speed']
+                    kwargs.pop('speed')
+
                    print('Creating slower version of the audio file with speed {}'.format(speed))
                    slower_audio = os.path.join(self.transcriptionpath, 'slower_version')
                    if not os.path.exists(slower_audio):
@@ -387,29 +390,39 @@ class AutoTranscribe:
                else:
                    print("Start diarisation")
                    dia = Diarisation(audiofile, self.diarisation_model)
-                    dia.diarization()
-                    temppath = dia.create_temporary_wav()

-                    for file in sorted(os.listdir(temppath)):
-                        print(file )
-                    fstring = "\\begin{drama}" \
-                              "\n\t\Character{F}{Frage}" \
-                              "\n\t\Character{A1}{Antwort}\n" \
+                    if 'num_speakers' in kwargs:
+                        num_speakers = kwargs['num_speakers']
+                        kwargs.pop('num_speakers')
+                        dia.diarization(num_speakers=num_speakers)
+                    else:
+                        dia.diarization()
+
+                    temppath = dia.create_temporary_wav()
+                    temppath_dict, _ = dia.format_diarization_output()
+                    speakers = list(set(temppath_dict["speakers"]))
+
+
+                    fstring = "\\begin{drama}"
+
+                    for speaker in speakers:
+                        speaker = speaker.replace("SPEAKER_", "")
+                        fstring += "\n\t\Character{S"+ str(speaker) + "}{S" + str(speaker) + "}"
+

                    files = glob.glob(temppath + "/*.wav")

                    # Sort files according to the digits included in the filename
                    files = sorted(files, key=lambda x: float(re.findall("(\d+)", x)[0]))

-                    for file in files:
-                            print("Start Whisper")
+                    for file in tqdm(files):
+
                            Whisper = WhisperTranscription(file, self.model, self.language).transcribe()

-                            if "SPEAKER_00" in file:
-                                fstring += f"\n\Fragespeaks: \n {Whisper}"
-
-                            elif "SPEAKER_01" in file:
-                                fstring += f"\n\Antwortspeaks: \n {Whisper}"
+                            for s in speakers:
+                                if s in file:
+                                    s = s.replace("SPEAKER_", "")
+                                    fstring += f"\n\S{s}speaks: \n {Whisper}"

                    fstring += "\n\end{drama}"