reworked webapp

2023-09-18 14:39:34 +02:00
parent fd80276dd6
commit e76b7b51a5
6 changed files with 533 additions and 54 deletions
@@ -15,8 +15,34 @@ Usage:
 """
-from autotranscript import AutoTranscribe
+"""
 Gradio Audio Transcription App.
 --------------------------------
 This module provides an interface to transcribe audio files using the 
 AutoTranscribe model. Users can either upload an audio file or record their speech 
 live for transcription. The application supports multiple languages and provides 
 options to specify the number of speakers and the language of the audio.
 Attributes:
    LANGUAGES (list): A list of supported languages for transcription.
 Usage:
    Run this script to start the Gradio web interface for audio transcription.
 """
 import json
 import gradio as gr
 from autotranscript import AutoTranscribe, Transcript
 theme = gr.themes.Soft(
    primary_hue="green",
    secondary_hue='orange',
    neutral_hue="gray",  
 )
 LANGUAGES = [
    "Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
@@ -33,70 +59,282 @@ LANGUAGES = [
    "Vietnamese", "Welsh"
 ]
-
+class GradioTranscriptionInterface:
 def gradio_server(model : AutoTranscribe):
    """
-    Sets up and launches the Gradio interface for audio transcription.
+    Interface handling the interaction between Gradio UI and the Audio Transcription system.
    Args:
        model (AutoTranscribe): An instance of the AutoTranscribe model for transcription.
    """
-    def transcribe(audio, microphone, number_of_speakers, language):
+
    def __init__(self, model: AutoTranscribe):
        """
-        Transcribes the provided audio input based on the given parameters.
+        Initializes the GradioTranscriptionInterface with a transcription model.
        Args:
-            audio (str): Filepath to the uploaded audio file.
+            model (AutoTranscribe): Model responsible for audio transcription tasks.
-            microphone (str): Filepath to the recorded audio.
+        """
-            number_of_speakers (int): Number of speakers in the audio.
+        self.model = model
-            language (str): Language of the audio content.
+
    def auto_transcribe(self, source,
                        num_speakers : int,
                        translation : bool,
                        language : str):
        """
        Shortcut method for the AutoTranscribe task.
        Returns:
            tuple: Transcribed text (str), JSON output (dict)
        """
        kwargs = {}
        if number_of_speakers != 0:
            kwargs["num_speakers"] = number_of_speakers
        if language != "None":
            kwargs["language"] = language
-        print()
+        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
            "language": language if language != "None" else None,
            "task": 'translate' if translation else None
        }
-        if audio is not None:
+        try:
-            out = model.transcribe(audio, **kwargs)
+            result = self.model.autotranscribe(source, **kwargs)
-        elif microphone is not None:
+        except ValueError:
-            out = model.transcribe(microphone , **kwargs)
+            raise gr.Error("Couldn't detect any speech in the provided audio. \
-        else:
+                    Please try again!")
-            out = "Please upload an audio file or record one."
+        return str(result), result.get_json()
        return str(out), out.get_json(), out.get_md()
-    gr.Interface(
+    def transcribe(self, source, translation, language):
-        fn=transcribe, 
+        """
-        inputs=[
+        Shortcut method for the Transcribe task.
-            gr.Audio(source= "upload", type="filepath", label="Upload Your Audio File",
+
-                     interactive=True),
+        Returns:
-            gr.Audio(source= "microphone", type="filepath", label="Record Your Audio",
+            str: Transcribed text.
-                     interactive=True, container= False),
+        """
-            gr.Number(value=0, label= "Number of speakers (optional)", 
+        kwargs = {
-                      info = "Number of speakers in the audio file. If you don't know, leave it at 0."), 
+            "language": language if language != "None" else None,
-            gr.Dropdown(LANGUAGES,
+            "task": 'translate' if translation == "Yes" else None
-                        label="Language (optional)", value = "None",
+        }
-                        info="Language of the audio file. If you don't know, leave it at None.")
+        
-        ],
+        result = self.model.transcribe(source, **kwargs)
-        outputs=[
+        return str(result)
-            gr.Textbox(label="Transcription"),
+
-            gr.JSON(label="Raw Output", container= False),
+    def perform_diarisation(self, source, num_speakers):
-        ],
+        """
-        title="Audio Transcription",
+        Shortcut method for the Diarisation task.
-        description="Upload an audio file to transcribe its content. Powered by AutoTranscribe!",
+
-        theme="soft",       # Example of a more modern theme
+        Returns:
-        server_port=7860,
+            str: JSON output of diarisation result.
-        server_name="autotranscribe",   
+        """
-    ).queue().launch() 
+        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
        }
        try:
            result = self.model.diarization(source, **kwargs)
        except ValueError:
            raise gr.Error("Couldn't detect any speech in the provided audio. \
                    Please try again!")
        return json.dumps(result, indent=2)
 ####
 # Gradio Interface
 ####
 def gradio_Interface(model : AutoTranscribe = None):
    """
    Builds the Gradio Blocks interface for transcription and diarisation.

    The interface lets the user pick a task ('Auto Transcribe', 'Transcribe'
    or 'Diarisation'), choose an input type, run the task and, for
    'Auto Transcribe', rename the detected speakers afterwards.

    Args:
        model (AutoTranscribe): Model wrapped by GradioTranscriptionInterface.
            If None, a default ``AutoTranscribe()`` is created.

    Returns:
        The assembled ``gr.Blocks`` object. It is not launched here; the
        caller is expected to call ``.queue().launch(...)`` on it.
    """
    if model is None:
        model = AutoTranscribe()

    pipe = GradioTranscriptionInterface(model)

    # Input types, in the same order as the five input components
    # (audio1, audio2, video1, video2, file_in) that they toggle.
    origins = ["Upload Audio", "Record Audio", "Upload Video", "Record Video", "File"]

    def select_task(choice):
        """Return visibility updates for (num_speakers, translate, language)."""
        visibility = {
            'Auto Transcribe': (True, True, True),
            'Transcribe': (False, True, True),
            'Diarisation': (True, False, False),
        }
        if choice not in visibility:
            return None
        return tuple(gr.update(visible = flag) for flag in visibility[choice])

    def select_origin(choice):
        """Show the input component matching `choice`; hide and clear the others."""
        if choice not in origins:
            return None
        return tuple(gr.update(visible = True) if origin == choice
                     else gr.update(visible = False, value = None)
                     for origin in origins)

    def run_scribe(task, num_speakers, translate, language, audio1, audio2, video1, video2, file_in, progress = gr.Progress(track_tqdm= True)):
        """
        Run the selected task on whichever input component holds a value.

        Returns updates for (out_txt, out_json, annoation, annotate).

        Raises:
            gr.Error: If none of the input components holds a value.
        """
        progress(0, desc='Starting task...')

        # select_origin clears every hidden input, so at most one of these is set.
        source = audio1 or audio2 or video1 or video2 or file_in
        if not source:
            # Fail with a readable message instead of handing None to the model.
            raise gr.Error("Please upload or record an input before submitting.")

        if task == 'Auto Transcribe':
            out_str , out_json = pipe.auto_transcribe(source = source,
                                num_speakers = num_speakers,
                                translation = translate,
                                language = language)
            # Only this task yields speaker labels, so only here the
            # annotation widgets are revealed.
            return (gr.update(value = out_str, visible = True),
                    gr.update(value = out_json, visible = True),
                    gr.update(visible = True),
                    gr.update(visible = True))
        elif task == 'Transcribe':
            # NOTE(review): `translate` is the Checkbox value, while
            # GradioTranscriptionInterface.transcribe compares it to "Yes";
            # verify that translation is actually triggered for this task.
            out = pipe.transcribe(source = source,
                                translation = translate,
                                language = language)
            return (gr.update(value = out, visible = True),
                    gr.update(value = None, visible = False),
                    gr.update(visible = False),
                    gr.update(visible = False))
        elif task == 'Diarisation':
            out = pipe.perform_diarisation(source = source,
                                num_speakers = num_speakers)
            return (gr.update(value = None, visible = False),
                    gr.update(value = out, visible = True),
                    gr.update(visible = False),
                    gr.update(visible = False))

    def annotate_output(annoation : str, out_json : dict):
        """
        Rename the speakers of the current result.

        `annoation` is a comma-separated list of names; surrounding whitespace
        of each name is removed so "Alice, Bob" yields "Alice" and "Bob".

        Returns updates for (out_txt, out_json).
        """
        names = [name.strip() for name in annoation.split(",")]
        trans = Transcript.from_json(out_json)
        trans = trans.annotate(*names)
        return gr.update(value = str(trans)),gr.update(value = trans.get_json())

    with gr.Blocks(theme=theme,title='ScrAIbe: Automatic Audio Transcription') as demo:

        # Define components
        # NOTE(review): the path is relative to the current working directory,
        # not to this module - confirm this is intended for installed use.
        with open("header.html", "r", encoding="utf-8") as header_file:
            header = header_file.read()

        gr.HTML(header, visible= True, show_label=False)

        with gr.Row():

            with gr.Column():

                task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"], label="Task",
                                value= 'Auto Transcribe')

                num_speakers = gr.Number(value=0, label= "Number of speakers (optional)", 
                                info = "Number of speakers in the audio file. If you don't know,\
                                    leave it at 0.", visible= True)

                # `choices` is not a Checkbox parameter, so it is not passed here.
                translate = gr.Checkbox(label="Translation", value = False,
                                info="Select 'Yes' to have the output translated into English.",
                                visible= True)

                language = gr.Dropdown(LANGUAGES,
                                label="Language (optional)", value = "None",
                                info="Language of the audio file. If you don't know,\
                                    leave it at None.", visible= True)

                # Named input_type to avoid shadowing the builtin `input`.
                input_type = gr.Radio(origins, label="Input Type", value="Upload Audio")

                audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio",
                                    interactive= True, visible= True)
                audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath",
                                    interactive= True, visible= False)
                video1 = gr.Video(source="upload", type="filepath", label="Upload Video",
                                    interactive= True, visible= False)
                video2 = gr.Video(source="webcam", label="Record Video", type="filepath",
                                    interactive= True, visible= False)
                file_in = gr.File(label="Upload File", interactive= True, visible= False)

                submit = gr.Button()

            with gr.Column():

                out_txt = gr.Textbox(label="Output",
                                        visible= True, show_copy_button=True)
                out_json = gr.JSON(label="JSON Output",
                                    visible= False, show_copy_button=True)

                annoation = gr.Textbox(label="Name your speaker's",
                                    info= "Please provide a list of the speakers arranged \
                                    in the order in which they appear in the input. Use comma ',' \
                                    as a seperator. Be aware that the first name is given \
                                        to SPEAKER_00 the second to SPEAKER_01 and so on.",
                                    visible= False, interactive= True)

                annotate = gr.Button(value="Annotate", visible= False, interactive= True)

        # Define usage of components
        input_type.change(fn=select_origin, inputs=[input_type],
                        outputs=[audio1, audio2, video1, video2, file_in])

        task.change(fn=select_task, inputs=[task],
                    outputs=[num_speakers, translate, language])

        # Each of these handlers writes the component's new value back to itself.
        translate.change(fn= lambda x : gr.update(value = x),
                            inputs=[translate], outputs=[translate])
        num_speakers.change(fn= lambda x : gr.update(value = x),
                            inputs=[num_speakers], outputs=[num_speakers])
        language.change(fn= lambda x : gr.update(value = x), 
                        inputs=[language], outputs=[language])

        submit.click(fn = run_scribe, 
                        inputs=[task, num_speakers, translate, language, audio1,
                                audio2, video1, video2, file_in],
                        outputs=[out_txt, out_json, annoation, annotate])

        annotate.click(fn = annotate_output, inputs=[annoation, out_json],
                        outputs=[out_txt, out_json])

    return demo
 if __name__ == "__main__":
-    model = AutoTranscribe()
+    gradio_Interface().queue().launch()
    gradio_server(model)
@@ -0,0 +1,66 @@
 <!-- Importing Cormorant Garamond font from Google Fonts -->
 <link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;700&display=swap" rel="stylesheet">
 <style>
    .header-container {
        display: flex;
        align-items: center;
        justify-content: center;
        position: relative;
        padding-top: 30px;
    }
    .logo-container {
        position: absolute;
        top: 50%;
        right: 20px;
        transform: translateY(-50%);
        width: 300px;
    }
    .logo {
        width: 100%;
        height: auto;
    }
    h1 {
        font-family: 'Cormorant Garamond', serif;
        font-size: 50px !important; /* Increased font size */
        font-weight: bold;
        color: #50AF31;
        margin: 0;
        position: relative;
        padding: 0.5em 0;
    }
    h1::before, h1::after {
        content: "";
        position: absolute;
        height: 2px;
        width: 80%;
        background-color: #50AF31;
        left: 10%;
    }
    h1::before {
        top: 0.5em;
    }
    h1::after {
        bottom: 0.5em;
    }
    p, h2 {
        font-size: 16px;
        margin: 10px 0;
        line-height: 1.4;
    }
 </style>
 <div class="header-container">
    <h1>ScrAIbe</h1>
    <div class="logo-container">
        <a href="https://www.kida-bmel.de/"> <!-- The logo links to the KIDA website -->
            <img src="file/Logo_KIDA_bmel_green.svg" alt="KIDA Logo" class="logo">
        </a>
    </div>
 </div>
 <div style="text-align: center; padding: 20px 10%;">
    <p>
        Upload, record, or provide a video with audio for transcription. Our toolkit is designed to transcribe content from multiple languages accurately. The integrated speaker diarisation feature identifies different speakers, ensuring a smooth transcription experience. For optimal results, indicate the number of speakers and the original language of the content.
    </p>
    <h2 style="font-weight: bold; color: #50AF31;">What would you like to do next?</h2>
 </div>
@@ -11,6 +11,7 @@ import json
 from sympy import use
 from .autotranscript import AutoTranscribe
 from .app.gradio_app import gradio_Interface
 from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
 from torch.cuda import is_available
@@ -160,7 +161,7 @@ def cli():
    if start_server: # unfinished code
-        from .app.gradio_app import gradio_Interface
+        
        gradio_Interface(model).queue().launch(server_port=args.port, server_name=args.server_name)
@@ -1,5 +1,6 @@
 import json
 import time
 from traceback import print_stack
 from typing import Union
@@ -50,6 +51,7 @@ class Transcript:
        if args:
            for arg, speaker in zip(args, sorted(self.speakers)):
                annotations[speaker] = arg
        invalid_speakers = set(kwargs.keys()) - set(self.speakers)
@@ -40,6 +40,7 @@ if __name__ == "__main__":
        author='Jacob Schmieder',
        author_email='Jacob.Schmieder@dbfz.de',
        description='Transcription tool for audio files based on Whisper and Pyannote',
        package_data={ "header" : ["app/header.html"], "logo" : ["app/Logo_KIDA_bmel_green.svg"]},
        entry_points={'console_scripts':
-            ['autotranscript = autotranscript.autotranscript:cli']}
+            ['autotranscript = autotranscript.cli:cli']}
    )