rework structure of gradio app
This commit is contained in:
+1
-2
@@ -7,8 +7,7 @@ from .diarisation import *
|
|||||||
from .version import get_version as _get_version
|
from .version import get_version as _get_version
|
||||||
from .misc import *
|
from .misc import *
|
||||||
|
|
||||||
from .app.gradio_app import *
|
from .app import *
|
||||||
from .app.qtfaststart import *
|
|
||||||
|
|
||||||
from .cli import *
|
from .cli import *
|
||||||
|
|
||||||
|
|||||||
@@ -1,2 +1,7 @@
|
|||||||
from .qtfaststart import *
|
from .qtfaststart import *
|
||||||
from .gradio_app import *
|
from .activity_tracker import *
|
||||||
|
from .interface import *
|
||||||
|
from .stg import *
|
||||||
|
from .interactions import *
|
||||||
|
from .global_var import *
|
||||||
|
from .app import *
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
"""
|
||||||
|
This file contains the functions which are related to monitoring the actual app usage.
|
||||||
|
This allows the app to use its resources more efficiently,
|
||||||
|
for example by unloading or reloading the model.
|
||||||
|
"""
|
||||||
|
import time
|
||||||
|
import threading
|
||||||
|
import torch
|
||||||
|
import gc
|
||||||
|
import gradio as gr
|
||||||
|
|
||||||
|
|
||||||
|
# Default pause between two activity checks, in seconds.
timeout = 30
# Activity flag polled (and cleared) by ``monitor_activity``.
USER_ACTIVE = True
# Held while USER_ACTIVE is inspected or changed (the author marked it "dummy for now").
user_active_lock = threading.Lock()


def monitor_activity(model, pipe, timeout=timeout):
    """
    Poll the module-level ``USER_ACTIVE`` flag and clean up once it stays unset.

    Each round sleeps ``timeout`` seconds and then looks at the flag while
    holding ``user_active_lock``:

    * flag set   -> it is cleared and the loop goes on to the next round;
    * flag unset -> the local ``model`` / ``pipe`` references are deleted, the
      garbage collector is run, the CUDA cache is emptied, a message is
      printed, a Gradio warning is issued and the function returns.

    NOTE(review): ``del`` only removes this function's own references. Objects
    that are still referenced elsewhere (for example by the caller) stay alive.

    :param model: object whose local reference is dropped on inactivity
    :param pipe: object whose local reference is dropped on inactivity
    :param timeout: seconds to sleep between two checks
    """
    global USER_ACTIVE

    while True:
        time.sleep(timeout)

        with user_active_lock:
            if USER_ACTIVE:
                # Activity was seen during the last period: re-arm and wait again.
                USER_ACTIVE = False
                continue

            del model, pipe

            gc.collect()
            torch.cuda.empty_cache()

            print("Model deleted empty memory")
            gr.Warning("Model unloaded due to inactivity. Please reload the model to continue.")
            return
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
"""
|
||||||
|
Stores global variables for the app.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Slot for the model object shared across the app modules; empty (None) until assigned.
MODEL = None

# Flag describing whether user activity has been registered; starts out as False.
USER_ACTIVE = False
|
||||||
@@ -1,504 +0,0 @@
|
|||||||
"""
|
|
||||||
Gradio Audio Transcription App.
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
This module provides an interface to transcribe audio files using the
|
|
||||||
Scraibe model. Users can either upload an audio file or record their speech
|
|
||||||
live for transcription. The application supports multiple languages and provides
|
|
||||||
options to specify the number of speakers and the language of the audio.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
LANGUAGES (list): A list of supported languages for transcription.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
Run this script to start the Gradio web interface for audio transcription.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
"""
|
|
||||||
Gradio Audio Transcription App.
|
|
||||||
--------------------------------
|
|
||||||
|
|
||||||
This module provides an interface to transcribe audio files using the
|
|
||||||
Scraibe model. Users can either upload an audio file or record their speech
|
|
||||||
live for transcription. The application supports multiple languages and provides
|
|
||||||
options to specify the number of speakers and the language of the audio.
|
|
||||||
|
|
||||||
Attributes:
|
|
||||||
LANGUAGES (list): A list of supported languages for transcription.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
Run this script to start the Gradio web interface for audio transcription.
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
import json
|
|
||||||
from math import pi
|
|
||||||
import os
|
|
||||||
|
|
||||||
import gradio as gr
|
|
||||||
import threading
|
|
||||||
from tqdm import tqdm
|
|
||||||
|
|
||||||
import time
|
|
||||||
from scraibe import Scraibe, Transcript
|
|
||||||
|
|
||||||
theme = gr.themes.Soft(
|
|
||||||
primary_hue="green",
|
|
||||||
secondary_hue='orange',
|
|
||||||
neutral_hue="gray",
|
|
||||||
)
|
|
||||||
|
|
||||||
LANGUAGES = [
|
|
||||||
"Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
|
|
||||||
"Bosnian", "Bulgarian", "Catalan", "Chinese", "Croatian",
|
|
||||||
"Czech", "Danish", "Dutch", "English", "Estonian",
|
|
||||||
"Finnish", "French", "Galician", "German", "Greek",
|
|
||||||
"Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian",
|
|
||||||
"Italian", "Japanese", "Kannada", "Kazakh", "Korean",
|
|
||||||
"Latvian", "Lithuanian", "Macedonian", "Malay", "Marathi",
|
|
||||||
"Maori", "Nepali", "Norwegian", "Persian", "Polish",
|
|
||||||
"Portuguese", "Romanian", "Russian", "Serbian", "Slovak",
|
|
||||||
"Slovenian", "Spanish", "Swahili", "Swedish", "Tagalog",
|
|
||||||
"Tamil", "Thai", "Turkish", "Ukrainian", "Urdu",
|
|
||||||
"Vietnamese", "Welsh"
|
|
||||||
]
|
|
||||||
|
|
||||||
CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
|
|
||||||
|
|
||||||
|
|
||||||
# Module-wide activity flag; True means the app was used recently.
USER_ACTIVE = True

# Serialises every read/write of USER_ACTIVE.
user_active_lock = threading.Lock()


def reset_user_activity():
    """Mark the app as in use by setting ``USER_ACTIVE`` to True under the lock."""
    global USER_ACTIVE
    with user_active_lock:
        USER_ACTIVE = True
|
|
||||||
|
|
||||||
class GradioTranscriptionInterface:
    """
    Interface handling the interaction between Gradio UI and the Audio Transcription system.

    Every task accepts either a single path (``str``) or a list of paths. For a
    list the per-file results are combined into one output, keyed or headed by
    the file name (the part of the path after the last ``/``).
    """

    def __init__(self, model: "Scraibe"):
        """
        Initializes the GradioTranscriptionInterface with a transcription model.

        Args:
            model (Scraibe): Model responsible for audio transcription tasks.
        """
        self.model = model

    def auto_transcribe(self, source,
                        num_speakers: int,
                        translation: bool,
                        language: str):
        """
        Shortcut method for the Scraibe task.

        Args:
            source: Path of one audio file, or a list of such paths.
            num_speakers: Number of speakers; ``0`` is forwarded as ``None``.
            translation: Truthy to forward ``task='translate'`` to the model.
            language: Language name; the string ``"None"`` is forwarded as ``None``.

        Returns:
            tuple: Transcribed text (str) and the JSON output. For a single
            file the JSON output is whatever ``result.get_json()`` returns; for
            a list it is a JSON string mapping file names to their results.

        Raises:
            gr.Error: If no speech is detected in a single file, or if
                ``source`` is neither a ``str`` nor a ``list``.
        """
        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
            "language": language if language != "None" else None,
            "task": 'translate' if translation else None
        }

        if isinstance(source, str):
            try:
                result = self.model.autotranscribe(source, **kwargs)
            except ValueError:
                raise gr.Error("Couldn't detect any speech in the provided audio. "
                               "Please try again!")

            return str(result), result.get_json()

        elif isinstance(source, list):
            source_names = [s.split("/")[-1] for s in source]
            result = []
            for s in tqdm(source, total=len(source), desc="Transcribing audio files"):
                try:
                    res = self.model.autotranscribe(s, **kwargs)
                except ValueError:
                    # Keep going with the remaining files; a placeholder marks the gap.
                    _name = s.split("/")[-1]
                    res = f"NO TRANSCRIPT FOUND FOR {_name}"
                    gr.Warning(f"Couldn't detect any speech in {_name} will skip this file.")
                result.append(res)

            out = ''
            out_dict = {}
            for name, r in zip(source_names, result):
                out += f"TRANSCRIPT FOR {name}:\n\n"
                out += str(r)
                out += "\n\n"

                # Placeholders are plain strings; real results expose get_dict().
                if isinstance(r, str):
                    out_dict[name] = r
                else:
                    out_dict[name] = r.get_dict()

            return out, json.dumps(out_dict, indent=4)

        else:
            raise gr.Error("Please provide a valid audio file.")

    def transcribe(self, source, translation, language):
        """
        Shortcut method for the Transcribe task.

        Args:
            source: Path of one audio file, or a list of such paths.
            translation: ``True`` (checkbox value) or the legacy string
                ``"Yes"`` to forward ``task='translate'`` to the model.
            language: Language name; the string ``"None"`` is forwarded as ``None``.

        Returns:
            str: Transcribed text.

        Raises:
            gr.Error: If ``source`` is neither a ``str`` nor a ``list``.
        """
        kwargs = {
            "language": language if language != "None" else None,
            # The UI delivers a boolean; comparing only against "Yes" meant the
            # translation was never requested. Both forms are accepted.
            "task": 'translate' if translation in (True, "Yes") else None
        }

        if isinstance(source, str):
            result = self.model.transcribe(source, **kwargs)

            return str(result)

        elif isinstance(source, list):
            source_names = [s.split("/")[-1] for s in source]
            result = []
            for s in tqdm(source, total=len(source), desc="Transcribing audio files"):
                res = self.model.transcribe(s, **kwargs)
                result.append(res)

            out = ''
            for name, res in zip(source_names, result):
                out += f"TRANSCRIPT FOR {name}:\n\n"
                out += str(res)
                out += "\n\n"

            return out

        else:
            raise gr.Error("Please provide a valid audio file.")

    def perform_diarisation(self, source, num_speakers):
        """
        Shortcut method for the Diarisation task.

        Args:
            source: Path of one audio file, or a list of such paths.
            num_speakers: Number of speakers; ``0`` is forwarded as ``None``.

        Returns:
            str: JSON output of diarisation result.

        Raises:
            gr.Error: If no speech is detected in a single file, or if
                ``source`` is neither a ``str`` nor a ``list``.
        """
        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
        }

        if isinstance(source, str):
            try:
                result = self.model.diarization(source, **kwargs)
            except ValueError:
                raise gr.Error("Couldn't detect any speech in the provided audio. "
                               "Please try again!")

            return json.dumps(result, indent=2)

        elif isinstance(source, list):
            source_names = [s.split("/")[-1] for s in source]
            result = []
            for s in tqdm(source, total=len(source), desc="Performing diarisation"):
                try:
                    res = self.model.diarization(s, **kwargs)
                except ValueError:
                    res = f"NO DIARISATION FOUND FOR {s}"
                    gr.Warning(f"Couldn't detect any speech in {s} will skip this file.")
                result.append(res)

            out = dict(zip(source_names, result))

            return json.dumps(out, indent=4)

        else:
            # The error has to be raised; merely constructing it returned None.
            raise gr.Error("Please provide a valid audio file.")
|
|
||||||
|
|
||||||
####
|
|
||||||
# Gradio Interface
|
|
||||||
####
|
|
||||||
|
|
||||||
def gradio_Interface(model : Scraibe = None, timeout = 1):
    """
    Gradio Web interface for audio transcription.

    A daemon thread checks every ``timeout`` seconds whether any callback ran
    since the previous check. If not, the model and its wrapper are released;
    the next submitted task rebuilds them from the saved model parameters.

    :param model: Scraibe model, defaults to None (a default ``Scraibe()`` is created)
    :type model: Scraibe, optional
    :param timeout: Seconds between two inactivity checks, defaults to 1
    :type timeout: int, optional
    :return: Gradio Blocks app
    :rtype: gradio.Blocks
    """

    if model is None:
        model = Scraibe()

    # Kept so that the model can be rebuilt after it has been unloaded.
    save_model_params = model.params

    # The callbacks and the monitor thread share the model through this holder.
    # Clearing its entries is what actually releases the objects; deleting a
    # function-local name (as the monitor did before) never affected the
    # references held elsewhere.
    state = {"model": model, "pipe": GradioTranscriptionInterface(model)}
    del model

    def select_task(choice):
        # tell the app that it is still in use
        reset_user_activity()

        # Visibility of (num_speakers, translate, language) for each task.
        layout = (('Auto Transcribe', (True, True, True)),
                  ('Transcribe', (False, True, True)),
                  ('Diarisation', (True, False, False)))

        for task_name, flags in layout:
            if choice == task_name:
                return tuple(gr.update(visible = flag) for flag in flags)

    def select_origin(choice):
        # tell the app that it is still in use
        reset_user_activity()

        # Same order as the outputs: audio1, audio2, video1, video2, file_in.
        origins = ("Upload Audio", "Record Audio", "Upload Video",
                   "Record Video", "File or Files")

        if choice in origins:
            # Show the chosen input; hide and clear all others.
            return tuple(gr.update(visible = True) if origin == choice
                         else gr.update(visible = False, value = None)
                         for origin in origins)

    def monitor_activity(timeout=timeout):
        """Release the model once a whole period passes without activity."""
        global USER_ACTIVE

        while True:
            time.sleep(timeout)
            with user_active_lock:
                if USER_ACTIVE:
                    # Activity seen: re-arm the flag and check again later.
                    USER_ACTIVE = False
                    continue
                state["model"] = None
                state["pipe"] = None

            print("Model deleted empty memory")
            gr.Warning("Model unloaded due to inactivity. Please reload the model to continue.")
            return

    def start_monitor():
        """Start a daemon thread running ``monitor_activity``."""
        activity_thread = threading.Thread(target=monitor_activity)
        activity_thread.daemon = True
        activity_thread.start()

    def run_scribe(task,
                   num_speakers,
                   translate,
                   language,
                   audio1,
                   audio2,
                   video1,
                   video2,
                   file_in,
                   progress = gr.Progress(track_tqdm= True)):

        # tell the app that it is still in use
        reset_user_activity()

        # The previous check `"model" in locals()` was always False, because
        # `model` was assigned inside this function and therefore local, so the
        # model was rebuilt on every request. The shared holder is used instead.
        reloaded = False
        with user_active_lock:
            if state["pipe"] is None:
                gr.Warning("Model unloaded due to inactivity. Reloading the model, please wait.")
                state["model"] = Scraibe(**save_model_params)
                state["pipe"] = GradioTranscriptionInterface(state["model"])
                reloaded = True
            # A local reference keeps the pipeline alive for this request even
            # if the monitor clears the holder in the meantime.
            pipe = state["pipe"]

        if reloaded:
            # The previous monitor thread ended when it unloaded the model.
            start_monitor()

        progress(0, desc='Starting task...')
        # First input that is not None/empty.
        source = audio1 or audio2 or video1 or video2 or file_in

        if isinstance(source, list):
            source = [s.name for s in source]
            if len(source) == 1:
                source = source[0]

        if task == 'Auto Transcribe':

            out_str , out_json = pipe.auto_transcribe(source = source,
                                                      num_speakers = num_speakers,
                                                      translation = translate,
                                                      language = language)

            # The annotation widgets are only offered for a single transcript.
            single = isinstance(source, str)
            return (gr.update(value = out_str, visible = True),
                    gr.update(value = out_json, visible = True),
                    gr.update(visible = single),
                    gr.update(visible = single))

        elif task == 'Transcribe':

            out = pipe.transcribe(source = source,
                                  translation = translate,
                                  language = language)

            return (gr.update(value = out, visible = True),
                    gr.update(value = None, visible = False),
                    gr.update(visible = False),
                    gr.update(visible = False))

        elif task == 'Diarisation':

            out = pipe.perform_diarisation(source = source,
                                           num_speakers = num_speakers)

            return (gr.update(value = None, visible = False),
                    gr.update(value = out, visible = True),
                    gr.update(visible = False),
                    gr.update(visible = False))

    def annotate_output(annoation : str, out_json : dict):
        # Rebuild the transcript from the JSON output and apply the speaker names.
        trans = Transcript.from_json(out_json)
        trans = trans.annotate(*annoation.split(","))

        return gr.update(value = str(trans)),gr.update(value = trans.get_json())

    # Start the monitoring thread
    start_monitor()

    with gr.Blocks(theme=theme,title='ScrAIbe: Automatic Audio Transcription') as demo:

        # Define components
        hname = os.path.join(CURRENT_PATH, "header.html")
        with open(hname, "r") as header_file:
            header = header_file.read()

        # ugly hack to get the logo to work
        header = header.replace("/file=logo.svg", f"/file={CURRENT_PATH}/logo.svg" )

        gr.HTML(header, visible= True, show_label=False)

        with gr.Row():

            with gr.Column():

                task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"], label="Task",
                                value= 'Auto Transcribe')

                num_speakers = gr.Number(value=0, label= "Number of speakers (optional)",
                                         info = "Number of speakers in the audio file. If you don't know, "
                                                "leave it at 0.", visible= True)

                translate = gr.Checkbox(label="Translation", choices=[True, False], value = False,
                                        info="Select 'Yes' to have the output translated into English.",
                                        visible= True)

                language = gr.Dropdown(LANGUAGES,
                                       label="Language (optional)", value = "None",
                                       info="Language of the audio file. If you don't know, "
                                            "leave it at None.", visible= True)

                input_type = gr.Radio(["Upload Audio", "Record Audio", "Upload Video","Record Video"
                                       ,"File or Files"], label="Input Type", value="Upload Audio")

                audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio",
                                  interactive= True, visible= True)
                audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath",
                                  interactive= True, visible= False)
                video1 = gr.Video(source="upload", type="filepath", label="Upload Video",
                                  interactive= True, visible= False)
                video2 = gr.Video(source="webcam", label="Record Video", type="filepath",include_audio= True,
                                  interactive= True, visible= False)
                file_in = gr.Files(label="Upload File or Files", interactive= True, visible= False)

                submit = gr.Button()

            with gr.Column():

                out_txt = gr.Textbox(label="Output",
                                     visible= True, show_copy_button=True)

                out_json = gr.JSON(label="JSON Output",
                                   visible= False, show_copy_button=True)

                annoation = gr.Textbox(label="Name your speaker's",
                                       info= "Please provide a list of the speakers arranged "
                                             "in the order in which they appear in the input. Use comma ',' "
                                             "as a seperator. Be aware that the first name is given "
                                             "to SPEAKER_00 the second to SPEAKER_01 and so on.",
                                       visible= False, interactive= True)

                annotate = gr.Button(value="Annotate", visible= False, interactive= True)

        # Define usage of components
        input_type.change(fn=select_origin, inputs=[input_type],
                          outputs=[audio1, audio2, video1, video2, file_in])

        task.change(fn=select_task, inputs=[task],
                    outputs=[num_speakers, translate, language])

        translate.change(fn= lambda x : gr.update(value = x),
                         inputs=[translate], outputs=[translate])
        num_speakers.change(fn= lambda x : gr.update(value = x),
                            inputs=[num_speakers], outputs=[num_speakers])
        language.change(fn= lambda x : gr.update(value = x),
                        inputs=[language], outputs=[language])

        submit.click(fn = run_scribe,
                     inputs=[task, num_speakers, translate, language, audio1,
                             audio2, video1, video2, file_in],
                     outputs=[out_txt, out_json, annoation, annotate])

        annotate.click(fn = annotate_output, inputs=[annoation, out_json],
                       outputs=[out_txt, out_json])

    return demo
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: build the app, enable its queue, then launch it.
    app = gradio_Interface()
    queued_app = app.queue()
    queued_app.launch()
|
|
||||||
@@ -0,0 +1,145 @@
|
|||||||
|
"""
|
||||||
|
This file contains every function that will be called when the user interacts with the
|
||||||
|
UI like pressing a button or uploading a file.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from math import pi
|
||||||
|
import gradio as gr
|
||||||
|
import scraibe.app.global_var as gv
|
||||||
|
from scraibe import Transcript
|
||||||
|
|
||||||
|
def select_task(choice):
    """
    Build the visibility updates that belong to the chosen task.

    :param choice: one of 'Auto Transcribe', 'Transcribe' or 'Diarisation'
    :return: tuple of three ``gr.update`` objects, or None for any other choice
    """
    # Three visibility flags per task, in the order the updates are returned.
    layout = (
        ('Auto Transcribe', (True, True, True)),
        ('Transcribe', (False, True, True)),
        ('Diarisation', (True, False, False)),
    )

    for task_name, flags in layout:
        if choice == task_name:
            return tuple(gr.update(visible = flag) for flag in flags)

    return None
|
||||||
|
|
||||||
|
def select_origin(choice):
    """
    Build the updates that show the chosen input widget and hide the rest.

    The result has one entry per input type, in the order "Upload Audio",
    "Record Audio", "Upload Video", "Record Video", "File or Files". The entry
    of the chosen type is made visible; every other entry is hidden and its
    value is reset to None.

    :param choice: name of the selected input type
    :return: tuple of five ``gr.update`` objects, or None for any other choice
    """
    origins = ("Upload Audio", "Record Audio", "Upload Video",
               "Record Video", "File or Files")

    for selected in origins:
        if choice == selected:
            return tuple(
                gr.update(visible = True) if origin == selected
                else gr.update(visible = False, value = None)
                for origin in origins
            )

    return None
|
||||||
|
|
||||||
|
def run_scraibe(task,
                num_speakers,
                translate,
                language,
                audio1,
                audio2,
                video1,
                video2,
                file_in,
                progress = gr.Progress(track_tqdm= True)):
    """
    Run the selected task on whichever input was provided and build the UI updates.

    The pipeline is read from ``gv.MODEL``. The first non-empty value among
    ``audio1``, ``audio2``, ``video1``, ``video2`` and ``file_in`` is used as
    source; a list is reduced to the ``.name`` of each entry and, when it holds
    exactly one entry, to that single name.

    :param task: 'Auto Transcribe', 'Transcribe' or 'Diarisation'
    :param num_speakers: forwarded to the pipeline for tasks that use it
    :param translate: forwarded to the pipeline as ``translation``
    :param language: forwarded to the pipeline as ``language``
    :param progress: Gradio progress tracker (also tracks tqdm bars)
    :return: four ``gr.update`` objects (text output, JSON output and the two
        annotation widgets); None if ``task`` is none of the known tasks
    :raises gr.Error: if no pipeline is stored in ``gv.MODEL`` or no input was provided
    """
    pipe = gv.MODEL
    if pipe is None:
        # Without this check the first pipeline call fails with an opaque
        # AttributeError on None.
        raise gr.Error("No model is loaded. Please load a model and try again.")

    progress(0, desc='Starting task...')
    source = audio1 or audio2 or video1 or video2 or file_in

    if isinstance(source, list):
        source = [s.name for s in source]
        if len(source) == 1:
            source = source[0]

    if not source:
        # Nothing uploaded or recorded (None or an empty list).
        raise gr.Error("Please provide a valid audio file.")

    if task == 'Auto Transcribe':

        out_str , out_json = pipe.auto_transcribe(source = source,
                                                  num_speakers = num_speakers,
                                                  translation = translate,
                                                  language = language)

        # The annotation widgets are only shown for a single transcript.
        single = isinstance(source, str)
        return (gr.update(value = out_str, visible = True),
                gr.update(value = out_json, visible = True),
                gr.update(visible = single),
                gr.update(visible = single))

    elif task == 'Transcribe':

        out = pipe.transcribe(source = source,
                              translation = translate,
                              language = language)

        return (gr.update(value = out, visible = True),
                gr.update(value = None, visible = False),
                gr.update(visible = False),
                gr.update(visible = False))

    elif task == 'Diarisation':

        out = pipe.perform_diarisation(source = source,
                                       num_speakers = num_speakers)

        return (gr.update(value = None, visible = False),
                gr.update(value = out, visible = True),
                gr.update(visible = False),
                gr.update(visible = False))
|
||||||
|
|
||||||
|
def annotate_output(annoation : str, out_json : dict):
    """
    Apply the user's speaker names to a transcript and refresh both outputs.

    :param annoation: comma separated names, passed to ``annotate`` in the given order
    :param out_json: JSON output from which the transcript is rebuilt
    :return: two ``gr.update`` objects: the transcript as text and its JSON form
    """
    transcript = Transcript.from_json(out_json)
    speaker_names = annoation.split(",")
    annotated = transcript.annotate(*speaker_names)

    text_update = gr.update(value = str(annotated))
    json_update = gr.update(value = annotated.get_json())
    return text_update, json_update
|
||||||
|
|
||||||
@@ -0,0 +1,129 @@
|
|||||||
|
"""
|
||||||
|
This file contains the actual gradio Interface which is used to interact with the user.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import gradio as gr
|
||||||
|
import os
|
||||||
|
|
||||||
|
import scraibe.app.global_var as gv
|
||||||
|
from .interactions import *
|
||||||
|
from .stg import *
|
||||||
|
|
||||||
|
from scraibe import Scraibe
|
||||||
|
|
||||||
|
theme = gr.themes.Soft(
|
||||||
|
primary_hue="green",
|
||||||
|
secondary_hue='orange',
|
||||||
|
neutral_hue="gray",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
LANGUAGES = [
|
||||||
|
"Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
|
||||||
|
"Bosnian", "Bulgarian", "Catalan", "Chinese", "Croatian",
|
||||||
|
"Czech", "Danish", "Dutch", "English", "Estonian",
|
||||||
|
"Finnish", "French", "Galician", "German", "Greek",
|
||||||
|
"Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian",
|
||||||
|
"Italian", "Japanese", "Kannada", "Kazakh", "Korean",
|
||||||
|
"Latvian", "Lithuanian", "Macedonian", "Malay", "Marathi",
|
||||||
|
"Maori", "Nepali", "Norwegian", "Persian", "Polish",
|
||||||
|
"Portuguese", "Romanian", "Russian", "Serbian", "Slovak",
|
||||||
|
"Slovenian", "Spanish", "Swahili", "Swedish", "Tagalog",
|
||||||
|
"Tamil", "Thai", "Turkish", "Ukrainian", "Urdu",
|
||||||
|
"Vietnamese", "Welsh"
|
||||||
|
]
|
||||||
|
|
||||||
|
CURRENT_PATH = os.path.dirname(os.path.realpath(__file__))
|
||||||
|
|
||||||
|
|
||||||
|
def gradio_Interface(pipe : Scraibe = None):
    """
    Build the Gradio Blocks app for audio transcription.

    :param pipe: Scraibe model; when given it is wrapped in a
        ``GradioTranscriptionInterface`` and stored in ``gv.MODEL``. When None,
        ``gv.MODEL`` is left unchanged.
    :type pipe: Scraibe, optional
    :return: the assembled (not yet launched) app
    :rtype: gradio.Blocks
    """

    if pipe is not None:
        gv.MODEL = GradioTranscriptionInterface(pipe)

    with gr.Blocks(theme=theme,title='ScrAIbe: Automatic Audio Transcription') as demo:

        # Define components
        hname = os.path.join(CURRENT_PATH, "header.html")
        # Read through a context manager so the file handle is closed again.
        with open(hname, "r") as header_file:
            header = header_file.read()

        # ugly hack to get the logo to work
        header = header.replace("/file=logo.svg", f"/file={CURRENT_PATH}/logo.svg" )

        gr.HTML(header, visible= True, show_label=False)

        with gr.Row():

            with gr.Column():

                task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"], label="Task",
                                value= 'Auto Transcribe')

                num_speakers = gr.Number(value=0, label= "Number of speakers (optional)",
                                         info = "Number of speakers in the audio file. If you don't know, "
                                                "leave it at 0.", visible= True)

                translate = gr.Checkbox(label="Translation", choices=[True, False], value = False,
                                        info="Select 'Yes' to have the output translated into English.",
                                        visible= True)

                language = gr.Dropdown(LANGUAGES,
                                       label="Language (optional)", value = "None",
                                       info="Language of the audio file. If you don't know, "
                                            "leave it at None.", visible= True)

                # Named input_type so the builtin input() is not shadowed.
                input_type = gr.Radio(["Upload Audio", "Record Audio", "Upload Video","Record Video"
                                       ,"File or Files"], label="Input Type", value="Upload Audio")

                audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio",
                                  interactive= True, visible= True)
                audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath",
                                  interactive= True, visible= False)
                video1 = gr.Video(source="upload", type="filepath", label="Upload Video",
                                  interactive= True, visible= False)
                video2 = gr.Video(source="webcam", label="Record Video", type="filepath",include_audio= True,
                                  interactive= True, visible= False)
                file_in = gr.Files(label="Upload File or Files", interactive= True, visible= False)

                submit = gr.Button()

            with gr.Column():

                out_txt = gr.Textbox(label="Output",
                                     visible= True, show_copy_button=True)

                out_json = gr.JSON(label="JSON Output",
                                   visible= False, show_copy_button=True)

                annoation = gr.Textbox(label="Name your speaker's",
                                       info= "Please provide a list of the speakers arranged "
                                             "in the order in which they appear in the input. Use comma ',' "
                                             "as a seperator. Be aware that the first name is given "
                                             "to SPEAKER_00 the second to SPEAKER_01 and so on.",
                                       visible= False, interactive= True)

                annotate = gr.Button(value="Annotate", visible= False, interactive= True)

        # Define usage of components
        input_type.change(fn=select_origin, inputs=[input_type],
                          outputs=[audio1, audio2, video1, video2, file_in])

        task.change(fn=select_task, inputs=[task],
                    outputs=[num_speakers, translate, language])

        translate.change(fn= lambda x : gr.update(value = x),
                         inputs=[translate], outputs=[translate])
        num_speakers.change(fn= lambda x : gr.update(value = x),
                            inputs=[num_speakers], outputs=[num_speakers])
        language.change(fn= lambda x : gr.update(value = x),
                        inputs=[language], outputs=[language])

        submit.click(fn = run_scraibe,
                     inputs=[task, num_speakers, translate, language, audio1,
                             audio2, video1, video2, file_in],
                     outputs=[out_txt, out_json, annoation, annotate])

        annotate.click(fn = annotate_output, inputs=[annoation, out_json],
                       outputs=[out_txt, out_json])

    return demo
|
||||||
@@ -0,0 +1,157 @@
|
|||||||
|
"""
|
||||||
|
stg - scraibe to gradio interface
|
||||||
|
|
||||||
|
This file contains the code for the scraibe to gradio interface.
|
||||||
|
It adds Gradio interactions to the Scraibe class running in the backend.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import gradio as gr
|
||||||
|
from tqdm import tqdm
|
||||||
|
from scraibe import Scraibe
|
||||||
|
|
||||||
|
|
||||||
|
class GradioTranscriptionInterface:
    """
    Interface handling the interaction between Gradio UI and the Audio Transcription system.

    Every public method accepts either a single source (``str``) or several
    sources (``list`` of ``str``) and forwards each one to the wrapped model.
    Any other source type results in a ``gr.Error``.
    """

    def __init__(self, model: "Scraibe"):
        """
        Initializes the GradioTranscriptionInterface with a transcription model.

        Args:
            model (Scraibe): Model responsible for audio transcription tasks.
        """
        self.model = model

    @staticmethod
    def _translation_task(translation):
        """
        Map the UI translation flag to the ``task`` keyword given to the model.

        Both representations used in this class are accepted: a boolean-like
        flag and the strings "Yes"/"No". Strings are compared
        case-insensitively so that "No" is never mistaken for a truthy value.

        Returns:
            str | None: 'translate' when translation is requested, else None.
        """
        if isinstance(translation, str):
            requested = translation.strip().lower() == "yes"
        else:
            requested = bool(translation)
        return 'translate' if requested else None

    @staticmethod
    def _display_name(source):
        """Return the part of ``source`` after the last '/' (used as label/key)."""
        return source.split("/")[-1]

    def auto_transcribe(self, source,
                        num_speakers : int,
                        translation : bool,
                        language : str):
        """
        Shortcut method for the Scraibe task.

        Args:
            source (str | list[str]): One source or a list of sources that is
                handed to ``model.autotranscribe``.
            num_speakers (int): Number of speakers; 0 is forwarded as None.
            translation (bool): Whether the 'translate' task is requested
                (the strings "Yes"/"No" are accepted as well).
            language (str): Language; the string "None" is forwarded as None.

        Returns:
            tuple: Transcribed text (str), JSON output (dict) for a single
                source. For a list of sources: the concatenated transcripts
                (str) and a JSON string keyed by file name.

        Raises:
            gr.Error: If the model raises ValueError for a single source, or
                if ``source`` is neither a str nor a list.
        """
        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
            "language": language if language != "None" else None,
            "task": self._translation_task(translation),
        }

        if isinstance(source, str):
            try:
                result = self.model.autotranscribe(source, **kwargs)
            except ValueError as err:
                raise gr.Error("Couldn't detect any speech in the provided audio. "
                               "Please try again!") from err

            return str(result), result.get_json()

        if isinstance(source, list):
            out_parts = []
            out_dict = {}

            for s in tqdm(source, total=len(source), desc="Transcribing audio files"):
                name = self._display_name(s)
                try:
                    res = self.model.autotranscribe(s, **kwargs)
                except ValueError:
                    # A failing file must not abort the batch: keep a
                    # placeholder and only warn the user.
                    res = f"NO TRANSCRIPT FOUND FOR {name}"
                    gr.Warning(f"Couldn't detect any speech in {name} will skip this file.")

                out_parts.append(f"TRANSCRIPT FOR {name}:\n\n{res!s}\n\n")
                # Placeholders are plain strings; real results are serialised.
                out_dict[name] = res if isinstance(res, str) else res.get_dict()

            return "".join(out_parts), json.dumps(out_dict, indent=4)

        raise gr.Error("Please provide a valid audio file.")

    def transcribe(self, source, translation, language):
        """
        Shortcut method for the Transcribe task.

        Args:
            source (str | list[str]): One source or a list of sources that is
                handed to ``model.transcribe``.
            translation (bool | str): Translation flag; ``True`` or "Yes"
                selects the 'translate' task.
            language (str): Language; the string "None" is forwarded as None.

        Returns:
            str: Transcribed text. For a list of sources the transcripts are
                concatenated, each preceded by a header naming the file.

        Raises:
            gr.Error: If ``source`` is neither a str nor a list.
        """
        kwargs = {
            "language": language if language != "None" else None,
            "task": self._translation_task(translation),
        }

        if isinstance(source, str):
            return str(self.model.transcribe(source, **kwargs))

        if isinstance(source, list):
            out_parts = []
            for s in tqdm(source, total=len(source), desc="Transcribing audio files"):
                res = self.model.transcribe(s, **kwargs)
                out_parts.append(f"TRANSCRIPT FOR {self._display_name(s)}:\n\n{res!s}\n\n")

            return "".join(out_parts)

        raise gr.Error("Please provide a valid audio file.")

    def perform_diarisation(self, source, num_speakers):
        """
        Shortcut method for the Diarisation task.

        Args:
            source (str | list[str]): One source or a list of sources that is
                handed to ``model.diarization``.
            num_speakers (int): Number of speakers; 0 is forwarded as None.

        Returns:
            str: JSON output of diarisation result. For a list of sources the
                JSON object is keyed by file name.

        Raises:
            gr.Error: If the model raises ValueError for a single source, or
                if ``source`` is neither a str nor a list.
        """
        kwargs = {
            "num_speakers": num_speakers if num_speakers != 0 else None,
        }

        if isinstance(source, str):
            try:
                result = self.model.diarization(source, **kwargs)
            except ValueError as err:
                raise gr.Error("Couldn't detect any speech in the provided audio. "
                               "Please try again!") from err

            return json.dumps(result, indent=2)

        if isinstance(source, list):
            out = {}
            for s in tqdm(source, total=len(source), desc="Performing diarisation"):
                try:
                    res = self.model.diarization(s, **kwargs)
                except ValueError:
                    # Keep going with the remaining files; record a placeholder.
                    res = f"NO DIARISATION FOUND FOR {s}"
                    gr.Warning(f"Couldn't detect any speech in {s} will skip this file.")
                out[self._display_name(s)] = res

            return json.dumps(out, indent=4)

        # The error has to be raised; merely constructing it would silently
        # return None to the caller.
        raise gr.Error("Please provide a valid audio file.")
|
||||||
Reference in New Issue
Block a user