Add cleanup of temp and upload files after transcription job
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-14 15:55:44 +00:00
parent 63cd620b79
commit 1dea51f1f9
+33 -1
View File
@@ -17,6 +17,19 @@ from .email_sender import create_transcript_docx, create_summary_docx
logger = logging.getLogger("scraibe.tasks") logger = logging.getLogger("scraibe.tasks")
def _remove_file(path: str) -> None:
    """
    Remove a file if it exists. Best-effort; logs but never raises.

    Args:
        path: Filesystem path of the file to delete. Falsy values
            (``None`` or an empty string) are ignored.

    Returns:
        None. A failed removal is reported through the module logger as a
        warning instead of being raised to the caller.
    """
    if not path:
        return
    try:
        # Attempt the removal directly instead of checking os.path.exists()
        # first: the check-then-remove sequence is racy (the file may vanish
        # between the two calls) and skips dangling symlinks.
        os.remove(path)
    except FileNotFoundError:
        # Already gone -- exactly the state cleanup is after, so stay quiet.
        return
    except Exception as e:
        # Deliberately broad: cleanup must never mask the outcome of the
        # surrounding job, so every other failure is only logged.
        logger.warning("Failed to remove file %s: %s", path, e)
def get_queue_position(task_id: str) -> int: def get_queue_position(task_id: str) -> int:
""" """
Estimate the job's position in the queue. Estimate the job's position in the queue.
@@ -200,6 +213,7 @@ def process_transcription_task(
): ):
""" """
Async task: transcribe audio, optionally summarize, then email results. Async task: transcribe audio, optionally summarize, then email results.
Cleans up temporary files after completion.
""" """
task_id = self.request.id task_id = self.request.id
@@ -207,6 +221,10 @@ def process_transcription_task(
log_level = os.getenv("LOG_LEVEL", "INFO") log_level = os.getenv("LOG_LEVEL", "INFO")
setup_logging(level=log_level) setup_logging(level=log_level)
# Track all temporary files to clean up later
temp_files = []
try:
# 1) Determine queue position and send initial email # 1) Determine queue position and send initial email
queue_pos = get_queue_position(task_id) queue_pos = get_queue_position(task_id)
send_initial_email(to=email_to, queue_pos=queue_pos) send_initial_email(to=email_to, queue_pos=queue_pos)
@@ -222,7 +240,6 @@ def process_transcription_task(
) )
raise raise
try:
# 3) Perform transcription # 3) Perform transcription
if task_type == "transcript_and_summarize": if task_type == "transcript_and_summarize":
result = scraibe.transcript_and_summarize( result = scraibe.transcript_and_summarize(
@@ -258,11 +275,13 @@ def process_transcription_task(
f.write("# Transcript\n\n") f.write("# Transcript\n\n")
f.write(transcript_text) f.write(transcript_text)
attachments.append(md_transcript_path) attachments.append(md_transcript_path)
temp_files.append(md_transcript_path)
# Transcript as .docx # Transcript as .docx
docx_transcript_path = tempfile.mktemp(suffix=".docx") docx_transcript_path = tempfile.mktemp(suffix=".docx")
create_transcript_docx(transcript_text, docx_transcript_path) create_transcript_docx(transcript_text, docx_transcript_path)
attachments.append(docx_transcript_path) attachments.append(docx_transcript_path)
temp_files.append(docx_transcript_path)
# JSON with diarization # JSON with diarization
json_data = { json_data = {
@@ -285,6 +304,7 @@ def process_transcription_task(
with open(json_path, "w", encoding="utf-8") as f: with open(json_path, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=2, ensure_ascii=False) json.dump(json_data, f, indent=2, ensure_ascii=False)
attachments.append(json_path) attachments.append(json_path)
temp_files.append(json_path)
# Summary as .md (only when summary is available) # Summary as .md (only when summary is available)
if summary_text: if summary_text:
@@ -293,11 +313,13 @@ def process_transcription_task(
f.write("# Summary\n\n") f.write("# Summary\n\n")
f.write(summary_text) f.write(summary_text)
attachments.append(md_summary_path) attachments.append(md_summary_path)
temp_files.append(md_summary_path)
# Summary as .docx # Summary as .docx
docx_summary_path = tempfile.mktemp(suffix=".docx") docx_summary_path = tempfile.mktemp(suffix=".docx")
create_summary_docx(summary_text, docx_summary_path) create_summary_docx(summary_text, docx_summary_path)
attachments.append(docx_summary_path) attachments.append(docx_summary_path)
temp_files.append(docx_summary_path)
# 5) Send success email # 5) Send success email
send_success_email( send_success_email(
@@ -318,3 +340,13 @@ def process_transcription_task(
task_id=task_id, task_id=task_id,
) )
raise e raise e
finally:
# 6) Cleanup temporary files (best-effort)
for path in temp_files:
_remove_file(path)
# Also remove uploaded audio file
if audio_path:
_remove_file(audio_path)
logger.info("Cleanup completed for job %s.", task_id)