added file removal

2023-06-19 13:36:17 +02:00
parent 280cfa3c35
commit 979a2320f0
1 changed files with 51 additions and 7 deletions
@@ -5,6 +5,10 @@ from autotranscript.transcript_exporter import Transcript
 from typing import Union , TypeVar
 from tqdm import trange
 import torch
+import os
+from glob import iglob
+from subprocess import run
+from warnings import warn

 diarisation = TypeVar('diarisation')

@@ -49,11 +53,14 @@ class AutoTranscribe:
        print("AutoTranscribe initialized all models successfully loaded.")
            
    def transcribe(self, audiofile : Union[str, torch.Tensor],
+                   remove_original : bool = False,
                   *args, **kwargs) -> Transcript:
        """
        Transcribe audiofile with whisper model and pyannote diarization model
        
        :param audiofile: path to audiofile or torch.Tensor
+        :param remove_original: if True the original audiofile will be removed after
+                                transcription.
        :return: Transcript object which contains the transcript and can be used to 
                export the transcript to different formats.
        """
@@ -86,8 +93,51 @@ class AutoTranscribe:
            final_transcript[i] = {"speaker" : diarisation["speakers"][i],
                                   "segment" : seg,
                                   "text" : transcript}
+            
+        if remove_original:
+            if kwargs.get("shred") is True:
+                self.remove_audio_file(audiofile, shred=True)
+            else:
+                self.remove_audio_file(audiofile, shred=False)
+            
        return Transcript(final_transcript)
    
+    @staticmethod
+    def remove_audio_file(audiofile : str,
+                          shred : bool = False) -> None:
+        """
+        Remove the original audiofile to avoid disk space problems
+        or to ensure data privacy.
+
+        :param audiofile: path to audiofile; it is used literally and is
+                          never expanded as a glob pattern
+        :param shred: if True the audiofile is overwritten by the external
+                      ``shred`` command before it is deleted, instead of
+                      only being removed
+        :raises ValueError: if audiofile does not exist, or if shred is True
+                            and audiofile is a directory
+        :raises subprocess.CalledProcessError: if ``shred`` exits with a
+                                               non-zero status
+        """
+        if not os.path.exists(audiofile):
+            raise ValueError(f"Audiofile {audiofile} does not exist.")
+
+        if not shred:
+            os.remove(audiofile)
+            print(f"Audiofile {audiofile} removed.")
+            return
+
+        # Reject directories before warning or spawning a process.
+        if os.path.isdir(audiofile):
+            raise ValueError(f"Audiofile {audiofile} is a directory.")
+
+        warn("Shredding audiofile can take a long time.", RuntimeWarning)
+
+        # The path is passed to shred as-is. Globbing it first would make
+        # names containing '[', '*' or '?' match nothing (leaving the file
+        # on disk un-shredded) or match several files.
+        # '--' ends option parsing so a name starting with '-' is still
+        # treated as a file.
+        # -z: final overwrite with zeros, -v: verbose, -u: remove the file
+        # afterwards, -n 10: ten overwrite passes.
+        cmd = ['shred', '-zvu', '-n', '10', '--', audiofile]
+
+        print(f'shredding {audiofile} now\n')
+
+        # check=True turns a failed shred into an exception instead of
+        # silently leaving the data on disk.
+        run(cmd, check=True)
+        
+        
+    
    @staticmethod
    def get_audiofile(audiofile : Union[str, torch.Tensor],
                        *args, **kwargs) -> AudioProcessor:
@@ -110,10 +160,4 @@ class AutoTranscribe:
        if not isinstance(audiofile, AudioProcessor):
            raise ValueError(f'Audiofile must be of type AudioProcessor,' \
                             f'not {type(audiofile)}')     
-        return audiofile
-    
-
-if __name__ == "__main__":
-    
-    AudioTranscriber = AutoTranscribe()
-    AudioTranscriber.transcribe("tests/test.wav")
+        return audiofile