Refactor PDF generation and attachment logic
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Generate PDFs by:
  - Creating individual .docx components (cover, transcript, summary)
  - Converting each .docx to PDF
  - Merging PDFs in the correct order
  - Adding page numbers to final PDFs

- Transcribe & Summarize:
  - Attach: JSON, transcript MD, summary MD, TRANSCRIPT.pdf, SUMMARY.pdf, COMBINED.pdf

- Transcribe only:
  - Attach: JSON, transcript MD, TRANSCRIPT.pdf

- Ensure transcript line numbering is isolated to the transcript's own .docx, so that after the PDF merge it applies only to the transcript pages
This commit is contained in:
admin
2026-06-15 03:16:53 +00:00
parent 7ece1a50c2
commit 237bd4b37c
2 changed files with 81 additions and 67 deletions
+69 -65
View File
@@ -472,7 +472,7 @@ def _add_summary_content(doc, text):
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
"""
Convert a .docx file to PDF using LibreOffice.
Returns the path of the generated PDF.
Returns the path of the generated PDF, or None on failure.
"""
try:
# Use LibreOffice in headless mode
@@ -486,7 +486,7 @@ def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
],
capture_output=True,
text=True,
timeout=60,
timeout=120,
)
if result.returncode != 0:
@@ -689,10 +689,24 @@ def generate_pdf_documents(
summary_cover_desc: str,
) -> Dict[str, str]:
"""
Generate PDF documents for transcript, summary, and combined.
Generate PDF documents by:
- Creating individual .docx components (cover pages, transcript, summary)
- Converting each .docx to PDF
- Merging PDFs in the correct order
- Adding page numbers to the final PDFs
Behavior:
- Always:
- Generate TRANSCRIPT.pdf:
- transcript_cover.pdf + transcript_with_line_numbers.pdf
- If summary_text is provided:
- Generate SUMMARY.pdf:
- summary_cover.pdf + summary.pdf
- Generate COMBINED.pdf:
- transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf
Returns a dict with paths:
- transcript_pdf
- transcript_pdf (always)
- summary_pdf (if summary_text provided)
- combined_pdf (if summary_text provided)
"""
@@ -705,6 +719,7 @@ def generate_pdf_documents(
# Create temporary directory for intermediate files
with tempfile.TemporaryDirectory() as tmpdir:
# 1) Generate constituent DOCX files
# Transcript cover
transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
doc = Document()
@@ -713,13 +728,15 @@ def generate_pdf_documents(
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
doc.save(transcript_cover_docx)
# Summary cover
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
doc.save(summary_cover_docx)
# Summary cover (only if summary is requested)
summary_cover_docx = None
if summary_text:
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
doc.save(summary_cover_docx)
# Transcript (with line numbering)
transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
@@ -731,73 +748,60 @@ def generate_pdf_documents(
doc.save(transcript_docx)
# Summary (no line numbering)
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
_add_summary_content(doc, summary_text)
doc.save(summary_docx)
# 2) Convert DOCX to PDF
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
# 3) Assemble final PDFs
# Transcript PDF: cover + transcript + page numbers
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
if _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript):
_add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
result["transcript_pdf"] = transcript_output_pdf
# Summary PDF: cover + summary + page numbers
summary_docx = None
if summary_text:
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
if _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary):
_add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
result["summary_pdf"] = summary_output_pdf
# Combined PDF: transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers
if summary_text:
# Create a document with "TRANSCRIPT" header before transcript
transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx")
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
_add_summary_content(doc, summary_text)
doc.save(summary_docx)
# Add "TRANSCRIPT" header
p = doc.add_paragraph()
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
run = p.add_run("TRANSCRIPT")
run.bold = True
run.font.name = "Courier"
run.font.size = Pt(12)
# 2) Convert DOCX to PDF
# Page break
doc.add_page_break()
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
# Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
summary_cover_pdf = None
summary_pdf = None
if summary_text:
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
# Transcript content (with line numbering)
_add_transcript_content(doc, transcript_text)
doc.save(transcript_header_docx)
# 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
if (
transcript_cover_pdf
and transcript_pdf
and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript)
and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
):
result["transcript_pdf"] = transcript_output_pdf
# Convert to PDF
transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir)
# 4) If summary is provided:
# - SUMMARY.pdf: summary_cover + summary + page numbers
# - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
if summary_text and summary_pdf and summary_cover_pdf:
# SUMMARY.pdf
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
if (
_merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary)
and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
):
result["summary_pdf"] = summary_output_pdf
# Assemble combined PDF
# COMBINED.pdf
combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
if _merge_pdfs(
[transcript_cover_pdf, summary_pdf, transcript_header_pdf],
merged_combined,
if (
_merge_pdfs(
[transcript_cover_pdf, summary_pdf, transcript_pdf],
merged_combined,
)
and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
):
_add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
result["combined_pdf"] = combined_output_pdf
return result