renamed module

2023-09-18 15:29:09 +02:00
parent e76b7b51a5
commit 5385e266cc
21 changed files with 399 additions and 86 deletions
@@ -0,0 +1,2 @@
+from .qtfaststart import *
+from .gradio_app import *
@@ -0,0 +1,340 @@
+"""
+Gradio Audio Transcription App.
+--------------------------------
+
+This module provides an interface to transcribe audio files using the
+AutoTranscribe model. Users can either upload an audio file or record their speech
+live for transcription. The application supports multiple languages and provides
+options to specify the number of speakers and the language of the audio.
+
+Attributes:
+    LANGUAGES (list): A list of supported languages for transcription.
+
+Usage:
+    Run this script to start the Gradio web interface for audio transcription.
+
+"""
+
+import json
+
+import gradio as gr
+from scraibe import AutoTranscribe, Transcript
+
+
+# Colour theme passed to gr.Blocks when the interface is built.
+theme = gr.themes.Soft(
+    primary_hue="green",
+    secondary_hue='orange',
+    neutral_hue="gray",  
+)
+
+# Language names offered in the "Language (optional)" dropdown.
+# NOTE: the dropdown's "None" placeholder is not part of this list.
+LANGUAGES = [
+    "Afrikaans", "Arabic", "Armenian", "Azerbaijani", "Belarusian",
+    "Bosnian", "Bulgarian", "Catalan", "Chinese", "Croatian",
+    "Czech", "Danish", "Dutch", "English", "Estonian",
+    "Finnish", "French", "Galician", "German", "Greek",
+    "Hebrew", "Hindi", "Hungarian", "Icelandic", "Indonesian",
+    "Italian", "Japanese", "Kannada", "Kazakh", "Korean",
+    "Latvian", "Lithuanian", "Macedonian", "Malay", "Marathi",
+    "Maori", "Nepali", "Norwegian", "Persian", "Polish",
+    "Portuguese", "Romanian", "Russian", "Serbian", "Slovak",
+    "Slovenian", "Spanish", "Swahili", "Swedish", "Tagalog",
+    "Tamil", "Thai", "Turkish", "Ukrainian", "Urdu",
+    "Vietnamese", "Welsh"
+]
+
+class GradioTranscriptionInterface:
+    """
+    Interface handling the interaction between Gradio UI and the Audio Transcription system.
+    """
+
+    def __init__(self, model: AutoTranscribe):
+        """
+        Initializes the GradioTranscriptionInterface with a transcription model.
+
+        Args:
+            model (AutoTranscribe): Model responsible for audio transcription tasks.
+        """
+        self.model = model
+
+    def auto_transcribe(self, source,
+                        num_speakers : int,
+                        translation : bool,
+                        language : str):
+        """
+        Shortcut method for the AutoTranscribe task.
+
+        Returns:
+            tuple: Transcribed text (str), JSON output (dict)
+        """
+        
+        kwargs = {
+            "num_speakers": num_speakers if num_speakers != 0 else None,
+            "language": language if language != "None" else None,
+            "task": 'translate' if translation else None
+        }
+        
+        try:
+            result = self.model.autotranscribe(source, **kwargs)
+        except ValueError:
+            raise gr.Error("Couldn't detect any speech in the provided audio. \
+                    Please try again!")
+        return str(result), result.get_json()
+
+
+    def transcribe(self, source, translation, language):
+        """
+        Shortcut method for the Transcribe task.
+
+        Returns:
+            str: Transcribed text.
+        """
+        kwargs = {
+            "language": language if language != "None" else None,
+            "task": 'translate' if translation == "Yes" else None
+        }
+        
+        result = self.model.transcribe(source, **kwargs)
+        return str(result)
+
+    def perform_diarisation(self, source, num_speakers):
+        """
+        Shortcut method for the Diarisation task.
+
+        Returns:
+            str: JSON output of diarisation result.
+        """
+        kwargs = {
+            "num_speakers": num_speakers if num_speakers != 0 else None,
+        }
+        
+        
+        try:
+            result = self.model.diarization(source, **kwargs)
+        except ValueError:
+            raise gr.Error("Couldn't detect any speech in the provided audio. \
+                    Please try again!")
+        return json.dumps(result, indent=2)
+
+####
+# Gradio Interface
+####
+
+def gradio_Interface(model : AutoTranscribe = None):
+    
+    if model is None:
+        model = AutoTranscribe()
+        
+    pipe = GradioTranscriptionInterface(model)
+
+    def select_task(choice):
+        if choice == 'Auto Transcribe':
+            
+            return (gr.update(visible = True),
+                    gr.update(visible = True),
+                    gr.update(visible = True))
+                    
+            
+        elif choice == 'Transcribe':
+            
+            return (gr.update(visible = False),
+                    gr.update(visible = True),
+                    gr.update(visible = True))
+
+            
+        elif choice == 'Diarisation':
+            
+            return (gr.update(visible = True),
+                    gr.update(visible = False),
+                    gr.update(visible = False))
+        
+    def select_origin(choice):
+        if choice == "Upload Audio":
+            
+            return (gr.update(visible = True),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None))
+        
+        elif choice == "Record Audio":
+            
+            return (gr.update(visible = False, value = None),
+                    gr.update(visible = True),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None))
+
+        elif choice == "Upload Video":
+            
+            return (gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = True),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None))
+        
+        elif choice == "Record Video":
+            
+            return (gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = True),
+                    gr.update(visible = False, value = None))
+            
+        elif choice == "File":
+            
+            return (gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = False, value = None),
+                    gr.update(visible = True))
+
+    def run_scribe(task, num_speakers, translate, language, audio1, audio2, video1, video2, file_in, progress = gr.Progress(track_tqdm= True)):
+        # get *args which are not None
+        progress(0, desc='Starting task...')
+        source = audio1 or audio2 or video1 or video2 or file_in
+        
+        if task == 'Auto Transcribe':
+            
+            out_str , out_json = pipe.auto_transcribe(source = source,
+                                num_speakers = num_speakers,
+                                translation = translate,
+                                language = language)
+            
+            return (gr.update(value = out_str, visible = True),
+                    gr.update(value = out_json, visible = True),
+                    gr.update(visible = True),
+                    gr.update(visible = True))        
+            
+        elif task == 'Transcribe':
+            
+            out = pipe.transcribe(source = source,
+                                translation = translate,
+                                language = language)
+            
+            return (gr.update(value = out, visible = True),
+                    gr.update(value = None, visible = False),
+                    gr.update(visible = False),
+                    gr.update(visible = False))
+            
+        elif task == 'Diarisation':
+            
+            out = pipe.perform_diarisation(source = source,
+                                num_speakers = num_speakers)
+            
+            return (gr.update(value = None, visible = False),
+                    gr.update(value = out, visible = True),
+                    gr.update(visible = False),
+                    gr.update(visible = False))
+        
+    def annotate_output(annoation : str, out_json : dict):
+        # get *args which are not None
+        
+        trans = Transcript.from_json(out_json)
+        trans = trans.annotate(*annoation.split(","))
+
+        return gr.update(value = str(trans)),gr.update(value = trans.get_json())
+        
+        
+    with gr.Blocks(theme=theme,title='ScrAIbe: Automatic Audio Transcription') as demo:
+            
+        # Define components
+        header = open("header.html", "r").read()
+        gr.HTML(header, visible= True, show_label=False)
+        
+        with gr.Row():
+            
+            with gr.Column():
+            
+                task = gr.Radio(["Auto Transcribe", "Transcribe", "Diarisation"], label="Task",
+                                value= 'Auto Transcribe')
+                
+                num_speakers = gr.Number(value=0, label= "Number of speakers (optional)", 
+                                info = "Number of speakers in the audio file. If you don't know,\
+                                    leave it at 0.", visible= True)
+                
+                translate = gr.Checkbox(label="Translation", choices=[True, False], value = False,
+                                info="Select 'Yes' to have the output translated into English.",
+                                visible= True)
+                
+                language = gr.Dropdown(LANGUAGES,
+                                label="Language (optional)", value = "None",
+                                info="Language of the audio file. If you don't know,\
+                                    leave it at None.", visible= True)
+                
+                input = gr.Radio(["Upload Audio", "Record Audio", "Upload Video","Record Video" 
+                                    ,"File"], label="Input Type", value="Upload Audio")
+                
+                audio1 = gr.Audio(source="upload", type="filepath", label="Upload Audio",
+                                    interactive= True, visible= True)
+                audio2 = gr.Audio(source="microphone", label="Record Audio", type="filepath",
+                                    interactive= True, visible= False)
+                video1 = gr.Video(source="upload", type="filepath", label="Upload Video",
+                                    interactive= True, visible= False)
+                video2 = gr.Video(source="webcam", label="Record Video", type="filepath",
+                                    interactive= True, visible= False)
+                file_in = gr.File(label="Upload File", interactive= True, visible= False)
+                
+                submit = gr.Button()
+            
+            with gr.Column():
+                
+                out_txt = gr.Textbox(label="Output",
+                                        visible= True, show_copy_button=True)
+                
+                out_json = gr.JSON(label="JSON Output",
+                                    visible= False, show_copy_button=True)
+                
+                annoation = gr.Textbox(label="Name your speaker's",
+                                    info= "Please provide a list of the speakers arranged \
+                                    in the order in which they appear in the input. Use comma ',' \
+                                    as a seperator. Be aware that the first name is given \
+                                        to SPEAKER_00 the second to SPEAKER_01 and so on.",
+                                    visible= False, interactive= True)
+                
+                annotate = gr.Button(value="Annotate", visible= False, interactive= True)
+            
+        # Define usage of components
+        input.change(fn=select_origin, inputs=[input],
+                        outputs=[audio1, audio2, video1, video2, file_in])
+        
+        task.change(fn=select_task, inputs=[task],
+                    outputs=[num_speakers, translate, language])
+        
+        translate.change(fn= lambda x : gr.update(value = x),
+                            inputs=[translate], outputs=[translate])
+        num_speakers.change(fn= lambda x : gr.update(value = x),
+                            inputs=[num_speakers], outputs=[num_speakers])
+        language.change(fn= lambda x : gr.update(value = x), 
+                        inputs=[language], outputs=[language])
+        
+        submit.click(fn = run_scribe, 
+                        inputs=[task, num_speakers, translate, language, audio1,
+                                audio2, video1, video2, file_in],
+                        outputs=[out_txt, out_json, annoation, annotate])
+        
+        annotate.click(fn = annotate_output, inputs=[annoation, out_json],
+                        outputs=[out_txt, out_json])
+        
+    return demo
+
+    
+# Script entry point: build the interface with a default model, enable
+# Gradio's queue and launch the app.
+if __name__ == "__main__":
+    
+    gradio_Interface().queue().launch()
@@ -0,0 +1,66 @@
+<!-- Importing Cormorant Garamond font from Google Fonts -->
+<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;700&display=swap" rel="stylesheet">
+
+<style>
+    .header-container {
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        position: relative;
+        padding-top: 30px;
+    }
+    .logo-container {
+        position: absolute;
+        top: 50%;
+        right: 20px;
+        transform: translateY(-50%);
+        width: 300px;
+    }
+    .logo {
+        width: 100%;
+        height: auto;
+    }
+    h1 {
+        font-family: 'Cormorant Garamond', serif;
+        font-size: 50px !important; /* Increased font size */
+        font-weight: bold;
+        color: #50AF31;
+        margin: 0;
+        position: relative;
+        padding: 0.5em 0;
+    }
+    h1::before, h1::after {
+        content: "";
+        position: absolute;
+        height: 2px;
+        width: 80%;
+        background-color: #50AF31;
+        left: 10%;
+    }
+    h1::before {
+        top: 0.5em;
+    }
+    h1::after {
+        bottom: 0.5em;
+    }
+    p, h2 {
+        font-size: 16px;
+        margin: 10px 0;
+        line-height: 1.4;
+    }
+</style>
+
+<div class="header-container">
+    <h1>ScrAIbe</h1>
+    <div class="logo-container">
+        <a href="https://www.kida-bmel.de/"> <!-- KIDA website -->
+            <img src="file/Logo_KIDA_bmel_green.svg" alt="KIDA Logo" class="logo">
+        </a>
+    </div>
+</div>
+<div style="text-align: center; padding: 20px 10%;">
+    <p>
+        Upload, record, or provide a video with audio for transcription. Our toolkit is designed to transcribe content from multiple languages accurately. The integrated speaker diarisation feature identifies different speakers, ensuring a smooth transcription experience. For optimal results, indicate the number of speakers and the original language of the content.
+    </p>
+    <h2 style="font-weight: bold; color: #50AF31;">What would you like to do next?</h2>
+</div>
@@ -0,0 +1,319 @@
+"""
+This file contains a modified version of qtfaststart by Daniel G. Taylor:
+https://github.com/danielgtaylor/qtfaststart/tree/master
+
+All credit goes to the original author.
+Copyright (C) 2008 - 2013 Daniel G. Taylor <dan@programmer-art.org>
+Permission is hereby granted, free of charge, to any person obtaining a copy of this
+software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the 
+Software, and to permit persons to whom the Software is furnished to do so, 
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies
+or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
+IN THE SOFTWARE.
+"""
+
+import logging
+import os
+import struct
+import collections
+import io
+
+# define error classes
+class FastStartException(Exception):
+    """
+    Raised when something bad happens during processing.
+    """
+    pass
+
+class FastStartSetupError(FastStartException):
+    """
+    Rasised when asked to process a file that does not need processing
+    """
+    pass
+
+class MalformedFileError(FastStartException):
+    """
+    Raised when the input file is setup in an unexpected way
+    """
+    pass
+
+class UnsupportedFormatError(FastStartException):
+    """
+    Raised when a movie file is recognized as a format not supported.
+    """
+    pass
+
+# define constants
+# Number of bytes requested per read when atom payloads are copied
+# (passed to get_chunks by process).
+CHUNK_SIZE = 8192
+
+# Module-level logger used for all progress and error messages.
+log = logging.getLogger("qtfaststart")
+
+# Older versions of Python require this to be defined
+if not hasattr(os, 'SEEK_CUR'):
+    os.SEEK_CUR = 1
+
+# Record describing one atom: its fourcc name, the stream offset at which
+# its header starts, and its size in bytes.
+Atom = collections.namedtuple('Atom', 'name position size')
+
+def read_atom(datastream):
+    """
+        Read an atom and return a tuple of (size, type) where size is the size
+        in bytes (including the 8 bytes already read) and type is a "fourcc"
+        like "ftyp" or "moov".
+    """
+    size, type = struct.unpack(">L4s", datastream.read(8))
+    type = type.decode('ascii')
+    return size, type
+
+
+def _read_atom_ex(datastream):
+    """
+    Read an Atom from datastream
+    """
+    pos = datastream.tell()
+    atom_size, atom_type = read_atom(datastream)
+    if atom_size == 1:
+        atom_size, = struct.unpack(">Q", datastream.read(8))
+    return Atom(atom_type, pos, atom_size)
+
+
+def get_index(datastream):
+    """
+        Return an index of top level atoms, their absolute byte-position in the
+        file and their size in a list:
+
+        index = [
+            ("ftyp", 0, 24),
+            ("moov", 25, 2658),
+            ("free", 2683, 8),
+            ...
+        ]
+
+        The tuple elements will be in the order that they appear in the file.
+    """
+    log.debug("Getting index of top level atoms...")
+
+    index = list(_read_atoms(datastream))
+    _ensure_valid_index(index)
+
+    return index
+
+
+def _read_atoms(datastream):
+    """
+    Read atoms until an error occurs
+    """
+    while datastream:
+        try:
+            atom = _read_atom_ex(datastream)
+            log.debug("%s: %s" % (atom.name, atom.size))
+        except:
+            break
+
+        yield atom
+
+        if atom.size == 0:
+            if atom.name == "mdat":
+                # Some files may end in mdat with no size set, which generally
+                # means to seek to the end of the file. We can just stop indexing
+                # as no more entries will be found!
+                break
+            else:
+                # Weird, but just continue to try to find more atoms
+                continue
+
+        datastream.seek(atom.position + atom.size)
+
+
+def _ensure_valid_index(index):
+    """
+    Ensure the minimum viable atoms are present in the index.
+
+    Raise FastStartException if not.
+    """
+    top_level_atoms = set([item.name for item in index])
+    for key in ["moov", "mdat"]:
+        if key not in top_level_atoms:
+            log.error("%s atom not found, is this a valid MOV/MP4 file?" % key)
+            raise FastStartException()
+
+
+def find_atoms(size, datastream):
+    """
+    Compatibilty interface for _find_atoms_ex
+    """
+    fake_parent = Atom('fake', datastream.tell()-8, size+8)
+    for atom in _find_atoms_ex(fake_parent, datastream):
+        yield atom.name
+
+
+def _find_atoms_ex(parent_atom, datastream):
+    """
+        Yield either "stco" or "co64" Atoms from datastream.
+        datastream will be 8 bytes into the stco or co64 atom when the value
+        is yielded.
+
+        It is assumed that datastream will be at the end of the atom after
+        the value has been yielded and processed.
+
+        parent_atom is the parent atom, a 'moov' or other ancestor of CO
+        atoms in the datastream.
+    """
+    stop = parent_atom.position + parent_atom.size
+
+    while datastream.tell() < stop:
+        try:
+            atom = _read_atom_ex(datastream)
+        except:
+            log.exception("Error reading next atom!")
+            raise FastStartException()
+
+        if atom.name in ["trak", "mdia", "minf", "stbl"]:
+            # Known ancestor atom of stco or co64, search within it!
+            for res in _find_atoms_ex(atom, datastream):
+                yield res
+        elif atom.name in ["stco", "co64"]:
+            yield atom
+        else:
+            # Ignore this atom, seek to the end of it.
+            datastream.seek(atom.position + atom.size)
+
+
+def process(infilename, limit=float('inf')):
+    """
+        Convert a Quicktime/MP4 file for streaming by moving the metadata to
+        the front of the file. This method writes a new file.
+
+        If limit is set to something other than zero it will be used as the
+        number of bytes to write of the atoms following the moov atom. This
+        is very useful to create a small sample of a file with full headers,
+        which can then be used in bug reports and such.
+    """
+    if isinstance(infilename, str):
+        datastream = open(infilename, "rb")
+    elif isinstance(infilename, bytes):
+        datastream = io.BytesIO(infilename)
+    else:
+        raise TypeError("infilename must be a filename, bytes or file-like object")
+    # Get the top level atom index
+    index = get_index(datastream)
+
+    mdat_pos = 999999
+    free_size = 0
+
+    # Make sure moov occurs AFTER mdat, otherwise no need to run!
+    for atom in index:
+        # The atoms are guaranteed to exist from get_index above!
+        if atom.name == "moov":
+            moov_atom = atom
+            moov_pos = atom.position
+        elif atom.name == "mdat":
+            mdat_pos = atom.position
+        elif atom.name == "free" and atom.position < mdat_pos:
+            # This free atom is before the mdat!
+            free_size += atom.size
+            log.info("Removing free atom at %d (%d bytes)" % (atom.position, atom.size))
+        elif atom.name == "\x00\x00\x00\x00" and atom.position < mdat_pos:
+            # This is some strange zero atom with incorrect size
+            free_size += 8
+            log.info("Removing strange zero atom at %s (8 bytes)" % atom.position)
+
+    # Offset to shift positions
+    offset = moov_atom.size - free_size
+
+    if moov_pos < mdat_pos:
+        # moov appears to be in the proper place, don't shift by moov size
+        offset -= moov_atom.size
+        if not free_size:
+            # No free atoms and moov is correct, we are done!
+            log.error("This file appears to already be setup for streaming!")
+            # Stupid hack to retrun the non-processed file:
+            if isinstance(infilename, str):
+                return open(infilename, "rb").read()
+            elif isinstance(infilename, bytes):
+                return io.BytesIO(infilename).read()
+            
+    # Read and fix moov
+    moov = _patch_moov(datastream, moov_atom, offset)
+
+    log.info("Writing output...")
+    outfile = b''
+
+    # Write ftype
+    for atom in index:
+        if atom.name == "ftyp":
+            log.debug("Writing ftyp... (%d bytes)" % atom.size)
+            datastream.seek(atom.position)
+            outfile += datastream.read(atom.size)
+
+    # Write moov
+    _bytes = moov.getvalue()
+    log.debug("Writing moov... (%d bytes)" % len(_bytes))
+    outfile += _bytes
+
+    # Write the rest
+    atoms = [item for item in index if item.name not in ["ftyp", "moov", "free"]]
+    for atom in atoms:
+        log.debug("Writing %s... (%d bytes)" % (atom.name, atom.size))
+        datastream.seek(atom.position)
+
+        # for compatability, allow '0' to mean no limit
+        cur_limit = limit or float('inf')
+        cur_limit = min(cur_limit, atom.size)
+
+        for chunk in get_chunks(datastream, CHUNK_SIZE, cur_limit):
+            outfile += chunk
+
+    return outfile
+
+
+def _patch_moov(datastream, atom, offset):
+    datastream.seek(atom.position)
+    moov = io.BytesIO(datastream.read(atom.size))
+
+    # reload the atom from the fixed stream
+    atom = _read_atom_ex(moov)
+
+    for atom in _find_atoms_ex(atom, moov):
+        # Read either 32-bit or 64-bit offsets
+        ctype, csize = dict(
+            stco=('L', 4),
+            co64=('Q', 8),
+        )[atom.name]
+
+        # Get number of entries
+        version, entry_count = struct.unpack(">2L", moov.read(8))
+
+        log.info("Patching %s with %d entries" % (atom.name, entry_count))
+
+        entries_pos = moov.tell()
+
+        struct_fmt = ">%(entry_count)s%(ctype)s" % vars()
+
+        # Read entries
+        entries = struct.unpack(struct_fmt, moov.read(csize * entry_count))
+
+        # Patch and write entries
+        offset_entries = [entry + offset for entry in entries]
+        moov.seek(entries_pos)
+        moov.write(struct.pack(struct_fmt, *offset_entries))
+    return moov
+
+def get_chunks(stream, chunk_size, limit):
+    remaining = limit
+    while remaining:
+        chunk = stream.read(min(remaining, chunk_size))
+        if not chunk:
+            return
+        remaining -= len(chunk)
+        yield chunk