added file removal

This commit is contained in:
Jaikinator
2023-06-19 13:36:17 +02:00
parent 280cfa3c35
commit 979a2320f0
+51 -7
View File
@@ -5,6 +5,10 @@ from autotranscript.transcript_exporter import Transcript
from typing import Union , TypeVar from typing import Union , TypeVar
from tqdm import trange from tqdm import trange
import torch import torch
import os
from glob import iglob
from subprocess import run
from warnings import warn
diarisation = TypeVar('diarisation') diarisation = TypeVar('diarisation')
@@ -49,11 +53,14 @@ class AutoTranscribe:
print("AutoTranscribe initialized all models successfully loaded.") print("AutoTranscribe initialized all models successfully loaded.")
def transcribe(self, audiofile : Union[str, torch.Tensor], def transcribe(self, audiofile : Union[str, torch.Tensor],
remove_original : bool = False,
*args, **kwargs) -> Transcript: *args, **kwargs) -> Transcript:
""" """
Transcribe audiofile with whisper model and pyannote diarization model Transcribe audiofile with whisper model and pyannote diarization model
:param audiofile: path to audiofile or torch.Tensor :param audiofile: path to audiofile or torch.Tensor
:param remove_original: if True the original audiofile will be removed after
transcription.
:return: Transcript object which contains the transcript and can be used to :return: Transcript object which contains the transcript and can be used to
export the transcript to differnt formats. export the transcript to differnt formats.
""" """
@@ -86,8 +93,51 @@ class AutoTranscribe:
final_transcript[i] = {"speaker" : diarisation["speakers"][i], final_transcript[i] = {"speaker" : diarisation["speakers"][i],
"segment" : seg, "segment" : seg,
"text" : transcript} "text" : transcript}
if remove_original:
if kwargs.get("shred") is True:
self.remove_audio_file(audiofile, shred=True)
else:
self.remove_audio_file(audiofile, shred=False)
return Transcript(final_transcript) return Transcript(final_transcript)
@staticmethod
def remove_audio_file(audiofile : str,
                      shred : bool = False) -> None:
    """
    Remove the original audiofile to avoid disk space problems
    or to ensure data privacy.

    The path is always treated literally (no glob expansion), so file
    names containing characters such as ``[``, ``*`` or ``?`` are handled
    correctly.

    :param audiofile: path to audiofile
    :param shred: if True audiofile will be shredded (overwritten with the
                  external ``shred`` command and then unlinked) and not
                  only removed
    :raises ValueError: if audiofile does not exist or is a directory
    :raises subprocess.CalledProcessError: if the ``shred`` command exits
                                           with a non-zero status
    """
    if not os.path.exists(audiofile):
        raise ValueError(f"Audiofile {audiofile} does not exist.")

    # Reject directories up front for both modes, before any warning is
    # emitted or any destructive action is attempted.
    if os.path.isdir(audiofile):
        raise ValueError(f"Audiofile {audiofile} is a directory.")

    if shred:
        warn("Shredding audiofile can take a long time.", RuntimeWarning)
        print(f'shredding {audiofile} now\n')
        # '--' ends option parsing so a path beginning with '-' is not
        # mistaken for a shred option. check=True surfaces a failed shred
        # instead of silently reporting success below.
        run(['shred', '-zvu', '-n', '10', '--', audiofile], check=True)
    else:
        os.remove(audiofile)

    print(f"Audiofile {audiofile} removed.")
@staticmethod @staticmethod
def get_audiofile(audiofile : Union[str, torch.Tensor], def get_audiofile(audiofile : Union[str, torch.Tensor],
*args, **kwargs) -> AudioProcessor: *args, **kwargs) -> AudioProcessor:
@@ -110,10 +160,4 @@ class AutoTranscribe:
if not isinstance(audiofile, AudioProcessor): if not isinstance(audiofile, AudioProcessor):
raise ValueError(f'Audiofile must be of type AudioProcessor,' \ raise ValueError(f'Audiofile must be of type AudioProcessor,' \
f'not {type(audiofile)}') f'not {type(audiofile)}')
return audiofile return audiofile
if __name__ == "__main__":
    # Manual run when the module is executed as a script: construct an
    # AutoTranscribe instance and transcribe "tests/test.wav".
    AudioTranscriber = AutoTranscribe()
    AudioTranscriber.transcribe(audiofile="tests/test.wav")