From 237bd4b37c1d9acbd4648c3a4138f6b287ea3a12 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 15 Jun 2026 03:16:53 +0000 Subject: [PATCH] Refactor PDF generation and attachment logic - Generate PDFs by: - Creating individual .docx components (cover, transcript, summary) - Converting each .docx to PDF - Merging PDFs in correct order - Adding page numbers to final PDFs - Transcribe & Summarize: - Attach: JSON, transcript MD, summary MD, TRANSCRIPT.pdf, SUMMARY.pdf, COMBINED.pdf - Transcribe only: - Attach: JSON, transcript MD, TRANSCRIPT.pdf - Ensure transcript line numbering is isolated to its own .docx before PDF merge --- scraibe/email_sender.py | 134 +++++++++++++++++++++------------------- scraibe/tasks.py | 14 ++++- 2 files changed, 81 insertions(+), 67 deletions(-) diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index 5041107..a3b75e7 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -472,7 +472,7 @@ def _add_summary_content(doc, text): def _docx_to_pdf(docx_path: str, output_dir: str) -> str: """ Convert a .docx file to PDF using LibreOffice. - Returns the path of the generated PDF. + Returns the path of the generated PDF, or None on failure. """ try: # Use LibreOffice in headless mode @@ -486,7 +486,7 @@ def _docx_to_pdf(docx_path: str, output_dir: str) -> str: ], capture_output=True, text=True, - timeout=60, + timeout=120, ) if result.returncode != 0: @@ -689,10 +689,24 @@ def generate_pdf_documents( summary_cover_desc: str, ) -> Dict[str, str]: """ - Generate PDF documents for transcript, summary, and combined. + Generate PDF documents by: + - Creating individual .docx components (cover pages, transcript, summary) + - Converting each .docx to PDF + - Merging PDFs in the correct order + - Adding page numbers to the final PDFs + + Behavior: + - Always: + - Generate TRANSCRIPT.pdf: + - transcript_cover.pdf + transcript_with_line_numbers.pdf + - If summary_text is provided: + - Generate SUMMARY.pdf: + - summary_cover.pdf + summary.pdf + - Generate COMBINED.pdf: + - transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf Returns a dict with paths: - - transcript_pdf + - transcript_pdf (always) - summary_pdf (if summary_text provided) - combined_pdf (if summary_text provided) """ @@ -705,6 +719,7 @@ def generate_pdf_documents( # Create temporary directory for intermediate files with tempfile.TemporaryDirectory() as tmpdir: # 1) Generate constituent DOCX files + # Transcript cover transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx") doc = Document() @@ -713,13 +728,15 @@ def generate_pdf_documents( _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc) doc.save(transcript_cover_docx) - # Summary cover - summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc) - doc.save(summary_cover_docx) + # Summary cover (only if summary is requested) + summary_cover_docx = None + if summary_text: + summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx") + doc = Document() + _configure_base_font(doc) + _configure_section_margins(doc) + _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc) + doc.save(summary_cover_docx) # Transcript (with line numbering) transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx") @@ -731,73 +748,60 @@ def generate_pdf_documents( doc.save(transcript_docx) # Summary (no line numbering) - summary_docx = os.path.join(tmpdir, "SUMMARY.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _add_summary_content(doc, summary_text) - doc.save(summary_docx) - - # 2) Convert DOCX to PDF - transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir) - summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir) - transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir) - summary_pdf = _docx_to_pdf(summary_docx, tmpdir) - - # 3) Assemble final PDFs - - # Transcript PDF: cover + transcript + page numbers - transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf") - merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf") - if _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript): - _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf) - result["transcript_pdf"] = transcript_output_pdf - - # Summary PDF: cover + summary + page numbers + summary_docx = None if summary_text: - summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf") - merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf") - if _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary): - _add_page_numbers_to_pdf(merged_summary, summary_output_pdf) - result["summary_pdf"] = summary_output_pdf - - # Combined PDF: transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers - if summary_text: - # Create a document with "TRANSCRIPT" header before transcript - transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx") + summary_docx = os.path.join(tmpdir, "SUMMARY.docx") doc = Document() _configure_base_font(doc) _configure_section_margins(doc) + _add_summary_content(doc, summary_text) + doc.save(summary_docx) - # Add "TRANSCRIPT" header - p = doc.add_paragraph() - p.alignment = WD_ALIGN_PARAGRAPH.CENTER - run = p.add_run("TRANSCRIPT") - run.bold = True - run.font.name = "Courier" - run.font.size = Pt(12) + # 2) Convert DOCX to PDF - # Page break - doc.add_page_break() + transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir) + transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir) - # Enable line numbering for transcript section - _enable_line_numbering(doc.sections[0]) + summary_cover_pdf = None + summary_pdf = None + if summary_text: + summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir) + summary_pdf = _docx_to_pdf(summary_docx, tmpdir) - # Transcript content (with line numbering) - _add_transcript_content(doc, transcript_text) - doc.save(transcript_header_docx) + # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers + transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf") + merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf") + if ( + transcript_cover_pdf + and transcript_pdf + and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript) + and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf) + ): + result["transcript_pdf"] = transcript_output_pdf - # Convert to PDF - transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir) + # 4) If summary is provided: + # - SUMMARY.pdf: summary_cover + summary + page numbers + # - COMBINED.pdf: transcript_cover + summary + transcript + page numbers + if summary_text and summary_pdf and summary_cover_pdf: + # SUMMARY.pdf + summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf") + merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf") + if ( + _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary) + and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf) + ): + result["summary_pdf"] = summary_output_pdf - # Assemble combined PDF + # COMBINED.pdf combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf") merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf") - if _merge_pdfs( - [transcript_cover_pdf, summary_pdf, transcript_header_pdf], - merged_combined, + if ( + _merge_pdfs( + [transcript_cover_pdf, summary_pdf, transcript_pdf], + merged_combined, + ) + and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf) ): - _add_page_numbers_to_pdf(merged_combined, combined_output_pdf) result["combined_pdf"] = combined_output_pdf return result diff --git a/scraibe/tasks.py b/scraibe/tasks.py index 5e2c823..c4a6da4 100644 --- a/scraibe/tasks.py +++ b/scraibe/tasks.py @@ -546,12 +546,16 @@ def process_transcription_task( temp_files.append(docx_combined_path) # 4b) Generate PDF documents + + # Always use transcript text; use summary_text only if include_summary is True + pdf_summary_text = summary_text if include_summary else "" + pdf_files = {} try: pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_") pdf_files = generate_pdf_documents( transcript_text=transcript_text, - summary_text=summary_text if include_summary else "", + summary_text=pdf_summary_text, output_dir=pdf_output_dir, transcript_cover_date=today_str, transcript_cover_desc=transcript_cover_desc, @@ -565,17 +569,23 @@ def process_transcription_task( logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e) # 5) Build attachments list + + # Base attachments: JSON, transcript MD, transcript DOCX attachments = [ md_transcript_path, docx_transcript_path, json_path, ] + + # If summary is present, add summary MD and DOCXs if summary_text: attachments += [md_summary_path, docx_summary_path] if docx_combined_path: attachments.append(docx_combined_path) - # Add PDFs to attachments + # Add PDFs based on mode: + # - Always: TRANSCRIPT.pdf + # - If summary: SUMMARY.pdf and COMBINED.pdf if pdf_files.get("transcript_pdf"): attachments.append(pdf_files["transcript_pdf"]) if pdf_files.get("summary_pdf"):