reworked webapp
This commit is contained in:
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 38 KiB |
@@ -15,8 +15,34 @@ Usage:
|
||||
|
||||
"""
|
||||
|
||||
from autotranscript import AutoTranscribe
|
||||
"""
|
||||
Gradio Audio Transcription App.
|
||||
--------------------------------
|
||||
|
||||
This module provides an interface to transcribe audio files using the
|
||||
AutoTranscribe model. Users can either upload an audio file or record their speech
|
||||
live for transcription. The application supports multiple languages and provides
|
||||
options to specify the number of speakers and the language of the audio.
|
||||
|
||||
Attributes:
|
||||
LANGUAGES (list): A list of supported languages for transcription.
|
||||
|
||||
Usage:
|
||||
Run this script to start the Gradio web interface for audio transcription.
|
||||
|
||||
"""
|
||||
|
||||
import json
|
||||
|
||||
import gradio as gr
|
||||
from autotranscript import AutoTranscribe, Transcript
|
||||
|
||||
|
||||
theme = gr.themes.Soft(
|
||||
primary_hue="green",
|
||||
secondary_hue='orange',
|
||||
neutral_hue="gray",
|
||||
)
|
||||
|
||||
LANGUAGES = [
|
||||
"Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
|
||||
@@ -33,70 +59,282 @@ LANGUAGES = [
|
||||
"Vietnamese", "Welsh"
|
||||
]
|
||||
|
||||
|
||||
def gradio_server(model : AutoTranscribe):
|
||||
class GradioTranscriptionInterface:
|
||||
"""
|
||||
Sets up and launches the Gradio interface for audio transcription.
|
||||
|
||||
Args:
|
||||
model (AutoTranscribe): An instance of the AutoTranscribe model for transcription.
|
||||
Interface handling the interaction between Gradio UI and the Audio Transcription system.
|
||||
"""
|
||||
def transcribe(audio, microphone, number_of_speakers, language):
|
||||
|
||||
def __init__(self, model: AutoTranscribe):
|
||||
"""
|
||||
Transcribes the provided audio input based on the given parameters.
|
||||
Initializes the GradioTranscriptionInterface with a transcription model.
|
||||
|
||||
Args:
|
||||
audio (str): Filepath to the uploaded audio file.
|
||||
microphone (str): Filepath to the recorded audio.
|
||||
number_of_speakers (int): Number of speakers in the audio.
|
||||
language (str): Language of the audio content.
|
||||
model (AutoTranscribe): Model responsible for audio transcription tasks.
|
||||
"""
|
||||
self.model = model
|
||||
|
||||
def auto_transcribe(self, source,
|
||||
num_speakers : int,
|
||||
translation : bool,
|
||||
language : str):
|
||||
"""
|
||||
Shortcut method for the AutoTranscribe task.
|
||||
|
||||
Returns:
|
||||
tuple: Transcribed text (str), JSON output (dict)
|
||||
"""
|
||||
kwargs = {}
|
||||
if number_of_speakers != 0:
|
||||
kwargs["num_speakers"] = number_of_speakers
|
||||
if language != "None":
|
||||
kwargs["language"] = language
|
||||
|
||||
print()
|
||||
|
||||
if audio is not None:
|
||||
out = model.transcribe(audio, **kwargs)
|
||||
elif microphone is not None:
|
||||
out = model.transcribe(microphone , **kwargs)
|
||||
else:
|
||||
out = "Please upload an audio file or record one."
|
||||
kwargs = {
|
||||
"num_speakers": num_speakers if num_speakers != 0 else None,
|
||||
"language": language if language != "None" else None,
|
||||
"task": 'translate' if translation else None
|
||||
}
|
||||
|
||||
return str(out), out.get_json(), out.get_md()
|
||||
try:
|
||||
result = self.model.autotranscribe(source, **kwargs)
|
||||
except ValueError:
|
||||
raise gr.Error("Couldn't detect any speech in the provided audio. \
|
||||
Please try again!")
|
||||
return str(result), result.get_json()
|
||||
|
||||
gr.Interface(
|
||||
fn=transcribe,
|
||||
inputs=[
|
||||
gr.Audio(source= "upload", type="filepath", label="Upload Your Audio File",
|
||||
interactive=True),
|
||||
gr.Audio(source= "microphone", type="filepath", label="Record Your Audio",
|
||||
interactive=True, container= False),
|
||||
gr.Number(value=0, label= "Number of speakers (optional)",
|
||||
info = "Number of speakers in the audio file. If you don't know, leave it at 0."),
|
||||
gr.Dropdown(LANGUAGES,
|
||||
label="Language (optional)", value = "None",
|
||||
info="Language of the audio file. If you don't know, leave it at None.")
|
||||
],
|
||||
outputs=[
|
||||
gr.Textbox(label="Transcription"),
|
||||
gr.JSON(label="Raw Output", container= False),
|
||||
],
|
||||
title="Audio Transcription",
|
||||
description="Upload an audio file to transcribe its content. Powered by AutoTranscribe!",
|
||||
theme="soft", # Example of a more modern theme
|
||||
server_port=7860,
|
||||
server_name="autotranscribe",
|
||||
).queue().launch()
|
||||
|
||||
def transcribe(self, source, translation, language):
    """
    Shortcut method for the Transcribe task.

    Args:
        source: Audio input, handed unchanged to ``self.model.transcribe``.
        translation: Truthy value (the UI checkbox sends a bool) or the
            string "Yes" to request the 'translate' task. Any other string
            (e.g. "No") means no translation.
        language (str): Language name, or the string "None" to leave the
            language unspecified.

    Returns:
        str: Transcribed text.
    """
    # The "Translation" checkbox delivers a bool, so comparing against "Yes"
    # alone never enabled translation. Accept both forms; strings keep their
    # original meaning so a caller passing "No" is not treated as truthy.
    if isinstance(translation, str):
        wants_translation = translation == "Yes"
    else:
        wants_translation = bool(translation)

    kwargs = {
        "language": language if language != "None" else None,
        "task": "translate" if wants_translation else None,
    }

    result = self.model.transcribe(source, **kwargs)
    return str(result)
|
||||
|
||||
def perform_diarisation(self, source, num_speakers):
    """
    Shortcut method for the Diarisation task.

    Args:
        source: Audio input, handed unchanged to ``self.model.diarization``.
        num_speakers: Number of speakers. 0 (or None) means "not specified"
            and is forwarded to the model as None.

    Returns:
        str: JSON output of diarisation result.

    Raises:
        gr.Error: If the model raises ValueError; the original exception is
            chained as the cause.
    """
    kwargs = {
        # A numeric UI field may deliver a float such as 2.0; forward a real
        # integer count so the model never receives a fractional value.
        "num_speakers": int(num_speakers) if num_speakers else None,
    }

    try:
        result = self.model.diarization(source, **kwargs)
    except ValueError as err:
        # Adjacent literals instead of a backslash continuation: the latter
        # embeds the next line's indentation in the user-visible message.
        raise gr.Error(
            "Couldn't detect any speech in the provided audio. "
            "Please try again!"
        ) from err

    return json.dumps(result, indent=2)
|
||||
|
||||
####
|
||||
# Gradio Interface
|
||||
####
|
||||
|
||||
def gradio_Interface(model : AutoTranscribe = None):
    """
    Build the Gradio Blocks interface for transcription and diarisation.

    The interface offers three tasks ("Auto Transcribe", "Transcribe",
    "Diarisation"), five input kinds (uploaded/recorded audio or video, or a
    file), and an optional step that renames the speakers of a finished
    transcript.

    Args:
        model (AutoTranscribe, optional): Model used to run the tasks. A new
            ``AutoTranscribe()`` is created when omitted.

    Returns:
        gr.Blocks: The assembled interface. It is not launched here; callers
        chain ``.queue().launch(...)`` themselves.
    """
    import os

    if model is None:
        model = AutoTranscribe()

    pipe = GradioTranscriptionInterface(model)

    # Visibility of (num_speakers, translate, language) for each task.
    task_visibility = {
        'Auto Transcribe': (True, True, True),
        'Transcribe': (False, True, True),
        'Diarisation': (True, False, False),
    }

    # Order matters: it mirrors [audio1, audio2, video1, video2, file_in].
    input_choices = ["Upload Audio", "Record Audio", "Upload Video",
                     "Record Video", "File"]

    def select_task(choice):
        """Show only the option widgets that the chosen task uses."""
        flags = task_visibility.get(choice, (True, True, True))
        return tuple(gr.update(visible=flag) for flag in flags)

    def select_origin(choice):
        """Show the input widget matching `choice`; hide and clear the rest."""
        return tuple(
            gr.update(visible=True) if choice == name
            else gr.update(visible=False, value=None)
            for name in input_choices
        )

    def run_scribe(task, num_speakers, translate, language, audio1, audio2,
                   video1, video2, file_in,
                   progress=gr.Progress(track_tqdm=True)):
        """Run the selected task on whichever input widget holds a value.

        Returns updates for (out_txt, out_json, annoation, annotate).
        """
        progress(0, desc='Starting task...')

        # Only one input widget is visible at a time and the hidden ones are
        # cleared by select_origin, so at most one of these is set.
        # NOTE(review): file_in may be a file object rather than a path,
        # depending on the Gradio version — confirm the model accepts it.
        source = audio1 or audio2 or video1 or video2 or file_in
        if not source:
            raise gr.Error("Please provide an audio, video or file input first.")

        if task == 'Auto Transcribe':
            out_str, out_json = pipe.auto_transcribe(source=source,
                                                     num_speakers=num_speakers,
                                                     translation=translate,
                                                     language=language)

            return (gr.update(value=out_str, visible=True),
                    gr.update(value=out_json, visible=True),
                    gr.update(visible=True),
                    gr.update(visible=True))

        if task == 'Transcribe':
            out = pipe.transcribe(source=source,
                                  translation=translate,
                                  language=language)

            return (gr.update(value=out, visible=True),
                    gr.update(value=None, visible=False),
                    gr.update(visible=False),
                    gr.update(visible=False))

        if task == 'Diarisation':
            out = pipe.perform_diarisation(source=source,
                                           num_speakers=num_speakers)

            return (gr.update(value=None, visible=False),
                    gr.update(value=out, visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False))

        # Falling through would hand None to four output components.
        raise gr.Error(f"Unknown task: {task}")

    def annotate_output(annoation : str, out_json : dict):
        """Rename the speakers of the transcript held in `out_json`.

        `annoation` is a comma-separated list of names, applied in order.
        """
        # Strip so that "Alice, Bob" does not yield a name with a leading
        # space. Positions are kept, since names are assigned by order.
        names = [name.strip() for name in annoation.split(",")]

        trans = Transcript.from_json(out_json)
        trans = trans.annotate(*names)

        return gr.update(value=str(trans)), gr.update(value=trans.get_json())

    # Look for the header next to this module first so the app also works
    # when started from another working directory (presumably header.html
    # ships alongside this file — verify); otherwise keep the original
    # cwd-relative lookup.
    header_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "header.html")
    if not os.path.isfile(header_path):
        header_path = "header.html"
    with open(header_path, "r", encoding="utf-8") as header_file:
        header = header_file.read()

    # NOTE(review): the Row/Column nesting below is reconstructed from the
    # statement order — confirm it matches the intended layout.
    with gr.Blocks(theme=theme, title='ScrAIbe: Automatic Audio Transcription') as demo:

        # Define components
        gr.HTML(header, visible=True, show_label=False)

        with gr.Row():

            with gr.Column():

                task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"],
                                label="Task", value='Auto Transcribe')

                num_speakers = gr.Number(
                    value=0, label="Number of speakers (optional)",
                    info="Number of speakers in the audio file. If you don't know, "
                         "leave it at 0.",
                    visible=True)

                # A checkbox has no `choices`; the argument was dropped.
                translate = gr.Checkbox(
                    label="Translation", value=False,
                    info="Select 'Yes' to have the output translated into English.",
                    visible=True)

                language = gr.Dropdown(
                    LANGUAGES,
                    label="Language (optional)", value="None",
                    info="Language of the audio file. If you don't know, "
                         "leave it at None.",
                    visible=True)

                # Named input_type to avoid shadowing the builtin `input`.
                input_type = gr.Radio(input_choices, label="Input Type",
                                      value="Upload Audio")

                audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio",
                                  interactive=True, visible=True)
                audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath",
                                  interactive=True, visible=False)
                video1 = gr.Video(source="upload", type="filepath", label="Upload Video",
                                  interactive=True, visible=False)
                video2 = gr.Video(source="webcam", label="Record Video", type="filepath",
                                  interactive=True, visible=False)
                file_in = gr.File(label="Upload File", interactive=True, visible=False)

                submit = gr.Button()

            with gr.Column():

                out_txt = gr.Textbox(label="Output",
                                     visible=True, show_copy_button=True)

                out_json = gr.JSON(label="JSON Output",
                                   visible=False, show_copy_button=True)

                annoation = gr.Textbox(
                    label="Name your speaker's",
                    info="Please provide a list of the speakers arranged "
                         "in the order in which they appear in the input. Use comma ',' "
                         "as a seperator. Be aware that the first name is given "
                         "to SPEAKER_00 the second to SPEAKER_01 and so on.",
                    visible=False, interactive=True)

                annotate = gr.Button(value="Annotate", visible=False, interactive=True)

        # Define usage of components
        input_type.change(fn=select_origin, inputs=[input_type],
                          outputs=[audio1, audio2, video1, video2, file_in])

        task.change(fn=select_task, inputs=[task],
                    outputs=[num_speakers, translate, language])

        translate.change(fn=lambda x: gr.update(value=x),
                         inputs=[translate], outputs=[translate])
        num_speakers.change(fn=lambda x: gr.update(value=x),
                            inputs=[num_speakers], outputs=[num_speakers])
        language.change(fn=lambda x: gr.update(value=x),
                        inputs=[language], outputs=[language])

        submit.click(fn=run_scribe,
                     inputs=[task, num_speakers, translate, language, audio1,
                             audio2, video1, video2, file_in],
                     outputs=[out_txt, out_json, annoation, annotate])

        annotate.click(fn=annotate_output, inputs=[annoation, out_json],
                       outputs=[out_txt, out_json])

    return demo
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
model = AutoTranscribe()
|
||||
gradio_server(model)
|
||||
gradio_Interface().queue().launch()
|
||||
@@ -0,0 +1,66 @@
|
||||
<!-- Importing Cormorant Garamond font from Google Fonts -->
|
||||
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;700&display=swap" rel="stylesheet">
|
||||
|
||||
<style>
|
||||
.header-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
position: relative;
|
||||
padding-top: 30px;
|
||||
}
|
||||
.logo-container {
|
||||
position: absolute;
|
||||
top: 50%;
|
||||
right: 20px;
|
||||
transform: translateY(-50%);
|
||||
width: 300px;
|
||||
}
|
||||
.logo {
|
||||
width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
h1 {
|
||||
font-family: 'Cormorant Garamond', serif;
|
||||
font-size: 50px !important; /* Increased font size */
|
||||
font-weight: bold;
|
||||
color: #50AF31;
|
||||
margin: 0;
|
||||
position: relative;
|
||||
padding: 0.5em 0;
|
||||
}
|
||||
h1::before, h1::after {
|
||||
content: "";
|
||||
position: absolute;
|
||||
height: 2px;
|
||||
width: 80%;
|
||||
background-color: #50AF31;
|
||||
left: 10%;
|
||||
}
|
||||
h1::before {
|
||||
top: 0.5em;
|
||||
}
|
||||
h1::after {
|
||||
bottom: 0.5em;
|
||||
}
|
||||
p, h2 {
|
||||
font-size: 16px;
|
||||
margin: 10px 0;
|
||||
line-height: 1.4;
|
||||
}
|
||||
</style>
|
||||
|
||||
<div class="header-container">
|
||||
<h1>ScrAIbe</h1>
|
||||
<div class="logo-container">
|
||||
<a href="https://www.kida-bmel.de/"> <!-- Replace with your actual URL -->
|
||||
<img src="file/Logo_KIDA_bmel_green.svg" alt="KIDA Logo" class="logo">
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
<div style="text-align: center; padding: 20px 10%;">
|
||||
<p>
|
||||
Upload, record, or provide a video with audio for transcription. Our toolkit is designed to transcribe content from multiple languages accurately. The integrated speaker diarisation feature identifies different speakers, ensuring a smooth transcription experience. For optimal results, indicate the number of speakers and the original language of the content.
|
||||
</p>
|
||||
<h2 style="font-weight: bold; color: #50AF31;">What would you like to do next?</h2>
|
||||
</div>
|
||||
@@ -11,6 +11,7 @@ import json
|
||||
from sympy import use
|
||||
|
||||
from .autotranscript import AutoTranscribe
|
||||
from .app.gradio_app import gradio_Interface
|
||||
|
||||
from whisper.tokenizer import LANGUAGES , TO_LANGUAGE_CODE
|
||||
from torch.cuda import is_available
|
||||
@@ -160,7 +161,7 @@ def cli():
|
||||
|
||||
|
||||
if start_server: # unfinished code
|
||||
from .app.gradio_app import gradio_Interface
|
||||
|
||||
gradio_Interface(model).queue().launch(server_port=args.port, server_name=args.server_name)
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import json
|
||||
import time
|
||||
from traceback import print_stack
|
||||
|
||||
|
||||
from typing import Union
|
||||
@@ -50,6 +51,7 @@ class Transcript:
|
||||
|
||||
if args:
|
||||
for arg, speaker in zip(args, sorted(self.speakers)):
|
||||
|
||||
annotations[speaker] = arg
|
||||
|
||||
invalid_speakers = set(kwargs.keys()) - set(self.speakers)
|
||||
|
||||
Reference in New Issue
Block a user