Refactor PDF generation and attachment logic
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Generate PDFs by:
  - Creating individual .docx components (cover, transcript, summary)
  - Converting each .docx to PDF
  - Merging the PDFs in the correct order
  - Adding page numbers to final PDFs

- Transcribe & Summarize:
  - Attach: JSON, transcript MD, summary MD, transcript/summary/combined DOCX, TRANSCRIPT.pdf, SUMMARY.pdf, COMBINED.pdf

- Transcribe only:
  - Attach: JSON, transcript MD, transcript DOCX, TRANSCRIPT.pdf

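- Ensure transcript line numbering is confined to the transcript's own .docx: COMBINED.pdf now reuses the already-converted transcript PDF instead of building a separate header-plus-transcript document before the PDF merge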
This commit is contained in:
admin
2026-06-15 03:16:53 +00:00
parent 7ece1a50c2
commit 237bd4b37c
2 changed files with 81 additions and 67 deletions
+69 -65
View File
@@ -472,7 +472,7 @@ def _add_summary_content(doc, text):
def _docx_to_pdf(docx_path: str, output_dir: str) -> str: def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
""" """
Convert a .docx file to PDF using LibreOffice. Convert a .docx file to PDF using LibreOffice.
Returns the path of the generated PDF. Returns the path of the generated PDF, or None on failure.
""" """
try: try:
# Use LibreOffice in headless mode # Use LibreOffice in headless mode
@@ -486,7 +486,7 @@ def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
], ],
capture_output=True, capture_output=True,
text=True, text=True,
timeout=60, timeout=120,
) )
if result.returncode != 0: if result.returncode != 0:
@@ -689,10 +689,24 @@ def generate_pdf_documents(
summary_cover_desc: str, summary_cover_desc: str,
) -> Dict[str, str]: ) -> Dict[str, str]:
""" """
Generate PDF documents for transcript, summary, and combined. Generate PDF documents by:
- Creating individual .docx components (cover pages, transcript, summary)
- Converting each .docx to PDF
- Merging PDFs in the correct order
- Adding page numbers to the final PDFs
Behavior:
- Always:
- Generate TRANSCRIPT.pdf:
- transcript_cover.pdf + transcript_with_line_numbers.pdf
- If summary_text is provided:
- Generate SUMMARY.pdf:
- summary_cover.pdf + summary.pdf
- Generate COMBINED.pdf:
- transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf
Returns a dict with paths: Returns a dict with paths:
- transcript_pdf - transcript_pdf (always)
- summary_pdf (if summary_text provided) - summary_pdf (if summary_text provided)
- combined_pdf (if summary_text provided) - combined_pdf (if summary_text provided)
""" """
@@ -705,6 +719,7 @@ def generate_pdf_documents(
# Create temporary directory for intermediate files # Create temporary directory for intermediate files
with tempfile.TemporaryDirectory() as tmpdir: with tempfile.TemporaryDirectory() as tmpdir:
# 1) Generate constituent DOCX files # 1) Generate constituent DOCX files
# Transcript cover # Transcript cover
transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx") transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
doc = Document() doc = Document()
@@ -713,13 +728,15 @@ def generate_pdf_documents(
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc) _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
doc.save(transcript_cover_docx) doc.save(transcript_cover_docx)
# Summary cover # Summary cover (only if summary is requested)
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx") summary_cover_docx = None
doc = Document() if summary_text:
_configure_base_font(doc) summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
_configure_section_margins(doc) doc = Document()
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc) _configure_base_font(doc)
doc.save(summary_cover_docx) _configure_section_margins(doc)
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
doc.save(summary_cover_docx)
# Transcript (with line numbering) # Transcript (with line numbering)
transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx") transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
@@ -731,73 +748,60 @@ def generate_pdf_documents(
doc.save(transcript_docx) doc.save(transcript_docx)
# Summary (no line numbering) # Summary (no line numbering)
summary_docx = os.path.join(tmpdir, "SUMMARY.docx") summary_docx = None
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
_add_summary_content(doc, summary_text)
doc.save(summary_docx)
# 2) Convert DOCX to PDF
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
# 3) Assemble final PDFs
# Transcript PDF: cover + transcript + page numbers
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
if _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript):
_add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
result["transcript_pdf"] = transcript_output_pdf
# Summary PDF: cover + summary + page numbers
if summary_text: if summary_text:
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf") summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
if _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary):
_add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
result["summary_pdf"] = summary_output_pdf
# Combined PDF: transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers
if summary_text:
# Create a document with "TRANSCRIPT" header before transcript
transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx")
doc = Document() doc = Document()
_configure_base_font(doc) _configure_base_font(doc)
_configure_section_margins(doc) _configure_section_margins(doc)
_add_summary_content(doc, summary_text)
doc.save(summary_docx)
# Add "TRANSCRIPT" header # 2) Convert DOCX to PDF
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("TRANSCRIPT")
run.bold = True
run.font.name = "Courier"
run.font.size = Pt(12)
# Page break transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
doc.add_page_break() transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
# Enable line numbering for transcript section summary_cover_pdf = None
_enable_line_numbering(doc.sections[0]) summary_pdf = None
if summary_text:
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
# Transcript content (with line numbering) # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
_add_transcript_content(doc, transcript_text) transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
doc.save(transcript_header_docx) merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
if (
transcript_cover_pdf
and transcript_pdf
and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript)
and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
):
result["transcript_pdf"] = transcript_output_pdf
# Convert to PDF # 4) If summary is provided:
transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir) # - SUMMARY.pdf: summary_cover + summary + page numbers
# - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
if summary_text and summary_pdf and summary_cover_pdf:
# SUMMARY.pdf
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
if (
_merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary)
and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
):
result["summary_pdf"] = summary_output_pdf
# Assemble combined PDF # COMBINED.pdf
combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf") combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf") merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
if _merge_pdfs( if (
[transcript_cover_pdf, summary_pdf, transcript_header_pdf], _merge_pdfs(
merged_combined, [transcript_cover_pdf, summary_pdf, transcript_pdf],
merged_combined,
)
and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
): ):
_add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
result["combined_pdf"] = combined_output_pdf result["combined_pdf"] = combined_output_pdf
return result return result
+12 -2
View File
@@ -546,12 +546,16 @@ def process_transcription_task(
temp_files.append(docx_combined_path) temp_files.append(docx_combined_path)
# 4b) Generate PDF documents # 4b) Generate PDF documents
# Always use transcript text; use summary_text only if include_summary is True
pdf_summary_text = summary_text if include_summary else ""
pdf_files = {} pdf_files = {}
try: try:
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_") pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
pdf_files = generate_pdf_documents( pdf_files = generate_pdf_documents(
transcript_text=transcript_text, transcript_text=transcript_text,
summary_text=summary_text if include_summary else "", summary_text=pdf_summary_text,
output_dir=pdf_output_dir, output_dir=pdf_output_dir,
transcript_cover_date=today_str, transcript_cover_date=today_str,
transcript_cover_desc=transcript_cover_desc, transcript_cover_desc=transcript_cover_desc,
@@ -565,17 +569,23 @@ def process_transcription_task(
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e) logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
# 5) Build attachments list # 5) Build attachments list
# Base attachments: JSON, transcript MD, transcript DOCX
attachments = [ attachments = [
md_transcript_path, md_transcript_path,
docx_transcript_path, docx_transcript_path,
json_path, json_path,
] ]
# If summary is present, add summary MD and DOCXs
if summary_text: if summary_text:
attachments += [md_summary_path, docx_summary_path] attachments += [md_summary_path, docx_summary_path]
if docx_combined_path: if docx_combined_path:
attachments.append(docx_combined_path) attachments.append(docx_combined_path)
# Add PDFs to attachments # Add PDFs based on mode:
# - Always: TRANSCRIPT.pdf
# - If summary: SUMMARY.pdf and COMBINED.pdf
if pdf_files.get("transcript_pdf"): if pdf_files.get("transcript_pdf"):
attachments.append(pdf_files["transcript_pdf"]) attachments.append(pdf_files["transcript_pdf"])
if pdf_files.get("summary_pdf"): if pdf_files.get("summary_pdf"):