Add files via upload

2022-12-20 13:54:29 +01:00
parent 085b3180e5
commit 60ef9c0db8
9 changed files with 340 additions and 0 deletions
@@ -0,0 +1,4 @@
+from autotranscript.__main__ import *
+from autotranscript.version import get_version as _get_version
+
+__version__ = _get_version()  # package version string, computed once when the package is imported
@@ -0,0 +1,126 @@
+
+import whisper
+from time import time
+import os
+from moviepy.editor import *
+from typing import Union
+
+class Transcribe:
+    def __init__(self, audiofile : Union[bool, str, list] = None, model : str =  "medium", language :str =  "German"):
+        """
+         Class to autotranscript audio and video files with the Whisper model
+        :param audiofile: audio file or list of audio files
+        :param model: model to use for transcription
+        :param language: language of the audio file
+        """
+
+        self.audiofile = audiofile
+
+        self.language = language
+
+        """
+        Create folder structure
+        """
+
+        self.currentpath,\
+            self.audiopath,\
+            self.transcriptionpath,\
+            self.audiofiles = self.create_folder_structure() # create folder structure
+
+        print("loading model")
+        self.model = whisper.load_model(model)  # load model
+        print("model loaded")
+
+    def create_folder_structure(self):
+        """
+        Create folder structure for audio and transcription files
+
+        :return:  currentpath, audiopath, transcriptionpath, audiofiles
+        """
+        currentpath = os.getcwd() # get current path
+
+        if not os.path.exists(os.path.join(currentpath, 'audiofiles')):
+            print('Creating audiofiles folder')
+            os.makedirs(os.path.join(currentpath, 'audiofiles'))
+        if not os.path.exists(os.path.join(currentpath, 'transcription')):
+            print('Creating transcription folder')
+            os.makedirs(os.path.join(currentpath, 'transcription'))
+
+        audiopath = os.path.join(currentpath, 'audiofiles')  # path to audio files
+        transcriptionpath = os.path.join(currentpath, 'transcription') # path to transcription files
+
+        audiofiles = os.listdir(audiopath) # list of audio files
+
+        return currentpath, audiopath, transcriptionpath, audiofiles
+
+    def video_to_audio(self,file,  remove_video=True):
+        clip = VideoFileClip(file)
+        clip.audio.write_audiofile(os.path.join(file[:-4] + '.mp3'))
+        if remove_video:
+            os.remove(file)
+            print(f'Video {file} removed')
+        return os.path.join(file[:-4] + '.mp3')
+
+
+    def transcribe(self):
+
+        if self.audiofile is not None:
+            if self.audiofile in self.audiofiles:
+                audiofile = os.path.join(self.audiopath, self.audiofile)
+            else:
+                raise ValueError('Audio file not found')
+
+            if audiofile.endswith('.mp4'):
+                print('Converting video to audio')
+                audiofile = self.video_to_audio(audiofile)
+
+            print(f'Start transcribing Audio file: {audiofile}')
+            _stime = time()
+            result = self.model.transcribe(audiofile, verbose=True, language= self.language)
+
+            print(f'Transcription finished in {time() - _stime} seconds')
+
+            txtfilename = str(audiofile.split('/')[-1][:-4]) + '.txt'
+
+            savepath = os.path.join(self.transcriptionpath, txtfilename)
+
+            with open(savepath, 'w') as f:
+                f.write(result["text"])
+        elif self.audiofile is None or isinstance(self.audiofile, list):
+            print('No audio file specified or list of audio files')
+            print(f"{len(self.audiofiles)} audio files found in {self.audiopath}")
+            print("Start transcribing all audio files")
+            i = 0
+            for audiofile in self.audiofiles:
+
+                audiofile = os.path.join(self.audiopath, audiofile)
+
+                if audiofile.endswith('.mp4'):
+                    audiofile = self.video_to_audio(audiofile)
+
+                print(f'Start transcribing Audio file: {audiofile}')
+                _stime = time()
+                result = self.model.transcribe(audiofile, verbose=True, language=self.language)
+                print(f'Transcription finished in {time() - _stime} seconds')
+
+                txtfilename = str(audiofile.split('/')[-1][:-4]) + '.txt'
+
+                savepath = os.path.join(self.transcriptionpath, txtfilename)
+
+                with open(savepath, 'w') as f:
+                    f.write(result["text"])
+
+                i += 1
+                print(f'{i} of {len(self.audiofiles)} files transcribed')
+
+        else:
+            raise ValueError('Audio file not found')
+
+        print('Transcription finished')
+
+    def __call__(self):
+        return self.transcribe()
+    def __repr__(self):
+        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
+    def __str__(self):
+        return f"Transcribe(audiofile={self.audiofile}, model={self.model}, language={self.language})"
@@ -0,0 +1,69 @@
+import os
+import subprocess as sp
+
+MAJOR = 1
+MINOR = 0
+MICRO = 0
+MICRO_POST = 0
+ISRELEASED = False
+VERSION = '%d.%d.%d.%d' % (MAJOR, MINOR, MICRO, MICRO_POST)
+
+# Return the git revision as a string
+# taken from numpy/numpy
+def git_version():
+    def _minimal_ext_cmd(cmd):
+        # construct minimal environment
+        env = {}
+        for k in ['SYSTEMROOT', 'PATH', 'HOME']:
+            v = os.environ.get(k)
+            if v is not None:
+                env[k] = v
+
+        # LANGUAGE is used on win32
+        env['LANGUAGE'] = 'C'
+        env['LANG'] = 'C'
+        env['LC_ALL'] = 'C'
+
+        out = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE, env=env).communicate()[0]
+        return out
+
+    try:
+        out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD'])
+        GIT_REVISION = out.strip().decode('ascii')
+    except OSError:
+        GIT_REVISION = "Unknown"
+
+    return GIT_REVISION
+
+def _get_git_version():
+    cwd = os.getcwd()
+
+    # go to the main directory
+    fdir = os.path.dirname(os.path.abspath(__file__))
+    maindir = os.path.abspath(os.path.join(fdir, ".."))
+    # maindir = fdir # os.path.join(fdir, "..")
+    os.chdir(maindir)
+
+    # get git version
+    res = git_version()
+
+    # restore the cwd
+    os.chdir(cwd)
+    return res
+
+def get_version(build_version=False):
+    if ISRELEASED:
+        return VERSION
+
+    # unreleased version
+    GIT_REVISION = _get_git_version()
+
+    if build_version:
+        import datetime as dt
+        date = dt.date.strftime(dt.datetime.now(), "%Y%m%d%H%M%S")
+        return VERSION + ".dev" + date
+    else:
+        return VERSION + ".dev0+" + GIT_REVISION[:7]
+
+
+