From e76b7b51a55b7df2a2a5a4755b4bb58ff40a3039 Mon Sep 17 00:00:00 2001 From: Jaikinator Date: Mon, 18 Sep 2023 14:39:34 +0200 Subject: [PATCH] rewored webapp --- autotranscript/app/Logo_KIDA_bmel_green.svg | 171 ++++++++++ autotranscript/app/gradio_app.py | 342 +++++++++++++++++--- autotranscript/app/header.html | 66 ++++ autotranscript/cli.py | 3 +- autotranscript/transcript_exporter.py | 2 + setup.py | 3 +- 6 files changed, 533 insertions(+), 54 deletions(-) create mode 100644 autotranscript/app/Logo_KIDA_bmel_green.svg create mode 100644 autotranscript/app/header.html diff --git a/autotranscript/app/Logo_KIDA_bmel_green.svg b/autotranscript/app/Logo_KIDA_bmel_green.svg new file mode 100644 index 0000000..c59c351 --- /dev/null +++ b/autotranscript/app/Logo_KIDA_bmel_green.svg @@ -0,0 +1,171 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/autotranscript/app/gradio_app.py b/autotranscript/app/gradio_app.py index d6aade8..13a6ee1 100644 --- a/autotranscript/app/gradio_app.py +++ b/autotranscript/app/gradio_app.py @@ -15,8 +15,34 @@ Usage: """ -from autotranscript import AutoTranscribe +""" +Gradio Audio Transcription App. +-------------------------------- + +This module provides an interface to transcribe audio files using the +AutoTranscribe model. Users can either upload an audio file or record their speech +live for transcription. The application supports multiple languages and provides +options to specify the number of speakers and the language of the audio. + +Attributes: + LANGUAGES (list): A list of supported languages for transcription. + +Usage: + Run this script to start the Gradio web interface for audio transcription. + +""" + +import json + import gradio as gr +from autotranscript import AutoTranscribe, Transcript + + +theme = gr.themes.Soft( + primary_hue="green", + secondary_hue='orange', + neutral_hue="gray", +) LANGUAGES = [ "Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian", @@ -33,70 +59,282 @@ LANGUAGES = [ "Vietnamese", "Welsh" ] - -def gradio_server(model : AutoTranscribe): +class GradioTranscriptionInterface: """ - Sets up and launches the Gradio interface for audio transcription. - - Args: - model (AutoTranscribe): An instance of the AutoTranscribe model for transcription. + Interface handling the interaction between Gradio UI and the Audio Transcription system. """ - def transcribe(audio, microphone, number_of_speakers, language): + + def __init__(self, model: AutoTranscribe): """ - Transcribes the provided audio input based on the given parameters. + Initializes the GradioTranscriptionInterface with a transcription model. Args: - audio (str): Filepath to the uploaded audio file. - microphone (str): Filepath to the recorded audio. - number_of_speakers (int): Number of speakers in the audio. - language (str): Language of the audio content. + model (AutoTranscribe): Model responsible for audio transcription tasks. + """ + self.model = model + + def auto_transcribe(self, source, + num_speakers : int, + translation : bool, + language : str): + """ + Shortcut method for the AutoTranscribe task. Returns: tuple: Transcribed text (str), JSON output (dict) """ - kwargs = {} - if number_of_speakers != 0: - kwargs["num_speakers"] = number_of_speakers - if language != "None": - kwargs["language"] = language - - print() - if audio is not None: - out = model.transcribe(audio, **kwargs) - elif microphone is not None: - out = model.transcribe(microphone , **kwargs) - else: - out = "Please upload an audio file or record one." + kwargs = { + "num_speakers": num_speakers if num_speakers != 0 else None, + "language": language if language != "None" else None, + "task": 'translate' if translation else None + } - return str(out), out.get_json(), out.get_md() + try: + result = self.model.autotranscribe(source, **kwargs) + except ValueError: + raise gr.Error("Couldn't detect any speech in the provided audio. \ + Please try again!") + return str(result), result.get_json() - gr.Interface( - fn=transcribe, - inputs=[ - gr.Audio(source= "upload", type="filepath", label="Upload Your Audio File", - interactive=True), - gr.Audio(source= "microphone", type="filepath", label="Record Your Audio", - interactive=True, container= False), - gr.Number(value=0, label= "Number of speakers (optional)", - info = "Number of speakers in the audio file. If you don't know, leave it at 0."), - gr.Dropdown(LANGUAGES, - label="Language (optional)", value = "None", - info="Language of the audio file. If you don't know, leave it at None.") - ], - outputs=[ - gr.Textbox(label="Transcription"), - gr.JSON(label="Raw Output", container= False), - ], - title="Audio Transcription", - description="Upload an audio file to transcribe its content. Powered by AutoTranscribe!", - theme="soft", # Example of a more modern theme - server_port=7860, - server_name="autotranscribe", - ).queue().launch() + + def transcribe(self, source, translation, language): + """ + Shortcut method for the Transcribe task. + + Returns: + str: Transcribed text. + """ + kwargs = { + "language": language if language != "None" else None, + "task": 'translate' if translation == "Yes" else None + } + + result = self.model.transcribe(source, **kwargs) + return str(result) + + def perform_diarisation(self, source, num_speakers): + """ + Shortcut method for the Diarisation task. + + Returns: + str: JSON output of diarisation result. + """ + kwargs = { + "num_speakers": num_speakers if num_speakers != 0 else None, + } + + + try: + result = self.model.diarization(source, **kwargs) + except ValueError: + raise gr.Error("Couldn't detect any speech in the provided audio. \ + Please try again!") + return json.dumps(result, indent=2) + +#### +# Gradio Interface +#### + +def gradio_Interface(model : AutoTranscribe = None): + if model is None: + model = AutoTranscribe() + + pipe = GradioTranscriptionInterface(model) + + def select_task(choice): + if choice == 'Auto Transcribe': + + return (gr.update(visible = True), + gr.update(visible = True), + gr.update(visible = True)) + + + elif choice == 'Transcribe': + + return (gr.update(visible = False), + gr.update(visible = True), + gr.update(visible = True)) + + + elif choice == 'Diarisation': + + return (gr.update(visible = True), + gr.update(visible = False), + gr.update(visible = False)) + + def select_origin(choice): + if choice == "Upload Audio": + + return (gr.update(visible = True), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None)) + + elif choice == "Record Audio": + + return (gr.update(visible = False, value = None), + gr.update(visible = True), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None)) + + elif choice == "Upload Video": + + return (gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = True), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None)) + + elif choice == "Record Video": + + return (gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = True), + gr.update(visible = False, value = None)) + + elif choice == "File": + + return (gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = False, value = None), + gr.update(visible = True)) + + def run_scribe(task, num_speakers, translate, language, audio1, audio2, video1, video2, file_in, progress = gr.Progress(track_tqdm= True)): + # get *args which are not None + progress(0, desc='Starting task...') + source = audio1 or audio2 or video1 or video2 or file_in + + if task == 'Auto Transcribe': + + out_str , out_json = pipe.auto_transcribe(source = source, + num_speakers = num_speakers, + translation = translate, + language = language) + + return (gr.update(value = out_str, visible = True), + gr.update(value = out_json, visible = True), + gr.update(visible = True), + gr.update(visible = True)) + + elif task == 'Transcribe': + + out = pipe.transcribe(source = source, + translation = translate, + language = language) + + return (gr.update(value = out, visible = True), + gr.update(value = None, visible = False), + gr.update(visible = False), + gr.update(visible = False)) + + elif task == 'Diarisation': + + out = pipe.perform_diarisation(source = source, + num_speakers = num_speakers) + + return (gr.update(value = None, visible = False), + gr.update(value = out, visible = True), + gr.update(visible = False), + gr.update(visible = False)) + + def annotate_output(annoation : str, out_json : dict): + # get *args which are not None + + trans = Transcript.from_json(out_json) + trans = trans.annotate(*annoation.split(",")) + + return gr.update(value = str(trans)),gr.update(value = trans.get_json()) + + + with gr.Blocks(theme=theme,title='ScrAIbe: Automatic Audio Transcription') as demo: + + # Define components + header = open("header.html", "r").read() + gr.HTML(header, visible= True, show_label=False) + + with gr.Row(): + + with gr.Column(): + + task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"], label="Task", + value= 'Auto Transcribe') + + num_speakers = gr.Number(value=0, label= "Number of speakers (optional)", + info = "Number of speakers in the audio file. If you don't know,\ + leave it at 0.", visible= True) + + translate = gr.Checkbox(label="Translation", choices=[True, False], value = False, + info="Select 'Yes' to have the output translated into English.", + visible= True) + + language = gr.Dropdown(LANGUAGES, + label="Language (optional)", value = "None", + info="Language of the audio file. If you don't know,\ + leave it at None.", visible= True) + + input = gr.Radio(["Upload Audio", "Record Audio", "Upload Video","Record Video" + ,"File"], label="Input Type", value="Upload Audio") + + audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio", + interactive= True, visible= True) + audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath", + interactive= True, visible= False) + video1 = gr.Video(source="upload", type="filepath", label="Upload Video", + interactive= True, visible= False) + video2 = gr.Video(source="webcam", label="Record Video", type="filepath", + interactive= True, visible= False) + file_in = gr.File(label="Upload File", interactive= True, visible= False) + + submit = gr.Button() + + with gr.Column(): + + out_txt = gr.Textbox(label="Output", + visible= True, show_copy_button=True) + + out_json = gr.JSON(label="JSON Output", + visible= False, show_copy_button=True) + + annoation = gr.Textbox(label="Name your speaker's", + info= "Please provide a list of the speakers arranged \ + in the order in which they appear in the input. Use comma ',' \ + as a seperator. Be aware that the first name is given \ + to SPEAKER_00 the second to SPEAKER_01 and so on.", + visible= False, interactive= True) + + annotate = gr.Button(value="Annotate", visible= False, interactive= True) + + # Define usage of components + input.change(fn=select_origin, inputs=[input], + outputs=[audio1, audio2, video1, video2, file_in]) + + task.change(fn=select_task, inputs=[task], + outputs=[num_speakers, translate, language]) + + translate.change(fn= lambda x : gr.update(value = x), + inputs=[translate], outputs=[translate]) + num_speakers.change(fn= lambda x : gr.update(value = x), + inputs=[num_speakers], outputs=[num_speakers]) + language.change(fn= lambda x : gr.update(value = x), + inputs=[language], outputs=[language]) + + submit.click(fn = run_scribe, + inputs=[task, num_speakers, translate, language, audio1, + audio2, video1, video2, file_in], + outputs=[out_txt, out_json, annoation, annotate]) + + annotate.click(fn = annotate_output, inputs=[annoation, out_json], + outputs=[out_txt, out_json]) + + return demo + if __name__ == "__main__": - model = AutoTranscribe() - gradio_server(model) \ No newline at end of file + gradio_Interface().queue().launch() \ No newline at end of file diff --git a/autotranscript/app/header.html b/autotranscript/app/header.html new file mode 100644 index 0000000..f174bfd --- /dev/null +++ b/autotranscript/app/header.html @@ -0,0 +1,66 @@ + + + + + +
+

ScrAIbe

+
+ + + +
+
+
+

+ Upload, record, or provide a video with audio for transcription. Our toolkit is designed to transcribe content from multiple languages accurately. The integrated speaker diarisation feature identifies different speakers, ensuring a smooth transcription experience. For optimal results, indicate the number of speakers and the original language of the content. +

+

What would you like to do next?

+
diff --git a/autotranscript/cli.py b/autotranscript/cli.py index 48a4fb0..b9da56d 100644 --- a/autotranscript/cli.py +++ b/autotranscript/cli.py @@ -11,6 +11,7 @@ import json from sympy import use from .autotranscript import AutoTranscribe +from .app.gradio_app import gradio_Interface from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE from torch.cuda import is_available @@ -160,7 +161,7 @@ def cli(): if start_server: # unfinished code - from .app.gradio_app import gradio_Interface + gradio_Interface(model).queue().launch(server_port=args.port, server_name=args.server_name) diff --git a/autotranscript/transcript_exporter.py b/autotranscript/transcript_exporter.py index 999383d..ac037a1 100644 --- a/autotranscript/transcript_exporter.py +++ b/autotranscript/transcript_exporter.py @@ -1,5 +1,6 @@ import json import time +from traceback import print_stack from typing import Union @@ -50,6 +51,7 @@ class Transcript: if args: for arg, speaker in zip(args, sorted(self.speakers)): + annotations[speaker] = arg invalid_speakers = set(kwargs.keys()) - set(self.speakers) diff --git a/setup.py b/setup.py index f5a4351..05a7f77 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ if __name__ == "__main__": author='Jacob Schmieder', author_email='Jacob.Schmieder@dbfz.de', description='Transcription tool for audio files based on Whisper and Pyannote', + package_data={ "header" : ["app/header.html"], "logo" : ["app/Logo_KIDA_bmel_green.svg"]}, entry_points={'console_scripts': - ['autotranscript = autotranscript.autotranscript:cli']} + ['autotranscript = autotranscript.cli:cli']} )