added file removal

2023-06-19 13:36:17 +02:00
parent 280cfa3c35
commit 979a2320f0
1 changed files with 51 additions and 7 deletions
@@ -5,6 +5,10 @@ from autotranscript.transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
 import torch
 import os
 from glob import iglob
 from subprocess import run
 from warnings import warn
 diarisation = TypeVar('diarisation')
@@ -49,11 +53,14 @@ class AutoTranscribe:
        print("AutoTranscribe initialized all models successfully loaded.")
    def transcribe(self, audiofile : Union[str, torch.Tensor],
                   remove_original : bool = False,
                   *args, **kwargs) -> Transcript:
        """
        Transcribe audiofile with whisper model and pyannote diarization model
        :param audiofile: path to audiofile or torch.Tensor
        :param remove_original: if True the original audiofile will be removed after
                                transcription.
        :return: Transcript object which contains the transcript and can be used to 
                export the transcript to differnt formats.
        """
@@ -86,8 +93,51 @@ class AutoTranscribe:
            final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                   "segment" : seg,
                                   "text" : transcript}
        if remove_original:
            if kwargs.get("shred") is True:
                self.remove_audio_file(audiofile, shred=True)
            else:
                self.remove_audio_file(audiofile, shred=False)
        return Transcript(final_transcript)
    @staticmethod
    def remove_audio_file(audiofile : str,
                          shred : bool = False) -> None:
        """
        removes orginal audiofile to avoid disk space problems
        or to enshure data privacy
        :param audiofile: path to audiofile
        :param shred: if True audiofile will be shredded and not only removed
        """
        if not os.path.exists(audiofile):
            raise ValueError(f"Audiofile {audiofile} does not exist.")
        if shred:
            warn("Shredding audiofile can take a long time.", RuntimeWarning)
            gen = iglob(f'{audiofile}', recursive=True)
            cmd = ['shred', '-zvu', '-n', '10', f'{audiofile}']
            if os.path.isdir(audiofile):
                raise ValueError(f"Audiofile {audiofile} is a directory.")
            for file in gen:
                print(f'shredding {file} now\n')
                run(cmd , check=True)
        else:
            os.remove(audiofile)
            print(f"Audiofile {audiofile} removed.")
    @staticmethod
    def get_audiofile(audiofile : Union[str, torch.Tensor],
                        *args, **kwargs) -> AudioProcessor:
@@ -110,10 +160,4 @@ class AutoTranscribe:
        if not isinstance(audiofile, AudioProcessor):
            raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                             f'not {type(audiofile)}')     
-        return audiofile
+        return audiofile
 if __name__ == "__main__":
    AudioTranscriber = AutoTranscribe()
    AudioTranscriber.transcribe("tests/test.wav")