Refactor PDF generation and attachment logic
- Generate PDFs by:
  - Creating individual .docx components (cover, transcript, summary)
  - Converting each .docx to PDF
  - Merging PDFs in correct order
  - Adding page numbers to final PDFs
- Transcribe & Summarize:
  - Attach: JSON, transcript MD, summary MD, TRANSCRIPT.pdf, SUMMARY.pdf, COMBINED.pdf
- Transcribe only:
  - Attach: JSON, transcript MD, TRANSCRIPT.pdf
- Ensure transcript line numbering is isolated to its own .docx before PDF merge
This commit is contained in:
+69
-65
@@ -472,7 +472,7 @@ def _add_summary_content(doc, text):
|
||||
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
|
||||
"""
|
||||
Convert a .docx file to PDF using LibreOffice.
|
||||
Returns the path of the generated PDF.
|
||||
Returns the path of the generated PDF, or None on failure.
|
||||
"""
|
||||
try:
|
||||
# Use LibreOffice in headless mode
|
||||
@@ -486,7 +486,7 @@ def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
|
||||
],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
timeout=120,
|
||||
)
|
||||
|
||||
if result.returncode != 0:
|
||||
@@ -689,10 +689,24 @@ def generate_pdf_documents(
|
||||
summary_cover_desc: str,
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
Generate PDF documents for transcript, summary, and combined.
|
||||
Generate PDF documents by:
|
||||
- Creating individual .docx components (cover pages, transcript, summary)
|
||||
- Converting each .docx to PDF
|
||||
- Merging PDFs in the correct order
|
||||
- Adding page numbers to the final PDFs
|
||||
|
||||
Behavior:
|
||||
- Always:
|
||||
- Generate TRANSCRIPT.pdf:
|
||||
- transcript_cover.pdf + transcript_with_line_numbers.pdf
|
||||
- If summary_text is provided:
|
||||
- Generate SUMMARY.pdf:
|
||||
- summary_cover.pdf + summary.pdf
|
||||
- Generate COMBINED.pdf:
|
||||
- transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf
|
||||
|
||||
Returns a dict with paths:
|
||||
- transcript_pdf
|
||||
- transcript_pdf (always)
|
||||
- summary_pdf (if summary_text provided)
|
||||
- combined_pdf (if summary_text provided)
|
||||
"""
|
||||
@@ -705,6 +719,7 @@ def generate_pdf_documents(
|
||||
# Create temporary directory for intermediate files
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# 1) Generate constituent DOCX files
|
||||
|
||||
# Transcript cover
|
||||
transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
|
||||
doc = Document()
|
||||
@@ -713,13 +728,15 @@ def generate_pdf_documents(
|
||||
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
|
||||
doc.save(transcript_cover_docx)
|
||||
|
||||
# Summary cover
|
||||
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
|
||||
doc = Document()
|
||||
_configure_base_font(doc)
|
||||
_configure_section_margins(doc)
|
||||
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
|
||||
doc.save(summary_cover_docx)
|
||||
# Summary cover (only if summary is requested)
|
||||
summary_cover_docx = None
|
||||
if summary_text:
|
||||
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
|
||||
doc = Document()
|
||||
_configure_base_font(doc)
|
||||
_configure_section_margins(doc)
|
||||
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
|
||||
doc.save(summary_cover_docx)
|
||||
|
||||
# Transcript (with line numbering)
|
||||
transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
|
||||
@@ -731,73 +748,60 @@ def generate_pdf_documents(
|
||||
doc.save(transcript_docx)
|
||||
|
||||
# Summary (no line numbering)
|
||||
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
|
||||
doc = Document()
|
||||
_configure_base_font(doc)
|
||||
_configure_section_margins(doc)
|
||||
_add_summary_content(doc, summary_text)
|
||||
doc.save(summary_docx)
|
||||
|
||||
# 2) Convert DOCX to PDF
|
||||
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
|
||||
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
|
||||
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
|
||||
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
|
||||
|
||||
# 3) Assemble final PDFs
|
||||
|
||||
# Transcript PDF: cover + transcript + page numbers
|
||||
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
|
||||
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
|
||||
if _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript):
|
||||
_add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
|
||||
result["transcript_pdf"] = transcript_output_pdf
|
||||
|
||||
# Summary PDF: cover + summary + page numbers
|
||||
summary_docx = None
|
||||
if summary_text:
|
||||
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
|
||||
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
|
||||
if _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary):
|
||||
_add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
|
||||
result["summary_pdf"] = summary_output_pdf
|
||||
|
||||
# Combined PDF: transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers
|
||||
if summary_text:
|
||||
# Create a document with "TRANSCRIPT" header before transcript
|
||||
transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx")
|
||||
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
|
||||
doc = Document()
|
||||
_configure_base_font(doc)
|
||||
_configure_section_margins(doc)
|
||||
_add_summary_content(doc, summary_text)
|
||||
doc.save(summary_docx)
|
||||
|
||||
# Add "TRANSCRIPT" header
|
||||
p = doc.add_paragraph()
|
||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||
run = p.add_run("TRANSCRIPT")
|
||||
run.bold = True
|
||||
run.font.name = "Courier"
|
||||
run.font.size = Pt(12)
|
||||
# 2) Convert DOCX to PDF
|
||||
|
||||
# Page break
|
||||
doc.add_page_break()
|
||||
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
|
||||
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
|
||||
|
||||
# Enable line numbering for transcript section
|
||||
_enable_line_numbering(doc.sections[0])
|
||||
summary_cover_pdf = None
|
||||
summary_pdf = None
|
||||
if summary_text:
|
||||
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
|
||||
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
|
||||
|
||||
# Transcript content (with line numbering)
|
||||
_add_transcript_content(doc, transcript_text)
|
||||
doc.save(transcript_header_docx)
|
||||
# 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
|
||||
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
|
||||
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
|
||||
if (
|
||||
transcript_cover_pdf
|
||||
and transcript_pdf
|
||||
and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript)
|
||||
and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
|
||||
):
|
||||
result["transcript_pdf"] = transcript_output_pdf
|
||||
|
||||
# Convert to PDF
|
||||
transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir)
|
||||
# 4) If summary is provided:
|
||||
# - SUMMARY.pdf: summary_cover + summary + page numbers
|
||||
# - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
|
||||
if summary_text and summary_pdf and summary_cover_pdf:
|
||||
# SUMMARY.pdf
|
||||
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
|
||||
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
|
||||
if (
|
||||
_merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary)
|
||||
and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
|
||||
):
|
||||
result["summary_pdf"] = summary_output_pdf
|
||||
|
||||
# Assemble combined PDF
|
||||
# COMBINED.pdf
|
||||
combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
|
||||
merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
|
||||
if _merge_pdfs(
|
||||
[transcript_cover_pdf, summary_pdf, transcript_header_pdf],
|
||||
merged_combined,
|
||||
if (
|
||||
_merge_pdfs(
|
||||
[transcript_cover_pdf, summary_pdf, transcript_pdf],
|
||||
merged_combined,
|
||||
)
|
||||
and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
|
||||
):
|
||||
_add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
|
||||
result["combined_pdf"] = combined_output_pdf
|
||||
|
||||
return result
|
||||
|
||||
+12
-2
@@ -546,12 +546,16 @@ def process_transcription_task(
|
||||
temp_files.append(docx_combined_path)
|
||||
|
||||
# 4b) Generate PDF documents
|
||||
|
||||
# Always use transcript text; use summary_text only if include_summary is True
|
||||
pdf_summary_text = summary_text if include_summary else ""
|
||||
|
||||
pdf_files = {}
|
||||
try:
|
||||
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
|
||||
pdf_files = generate_pdf_documents(
|
||||
transcript_text=transcript_text,
|
||||
summary_text=summary_text if include_summary else "",
|
||||
summary_text=pdf_summary_text,
|
||||
output_dir=pdf_output_dir,
|
||||
transcript_cover_date=today_str,
|
||||
transcript_cover_desc=transcript_cover_desc,
|
||||
@@ -565,17 +569,23 @@ def process_transcription_task(
|
||||
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
|
||||
|
||||
# 5) Build attachments list
|
||||
|
||||
# Base attachments: JSON, transcript MD, transcript DOCX
|
||||
attachments = [
|
||||
md_transcript_path,
|
||||
docx_transcript_path,
|
||||
json_path,
|
||||
]
|
||||
|
||||
# If summary is present, add summary MD and DOCXs
|
||||
if summary_text:
|
||||
attachments += [md_summary_path, docx_summary_path]
|
||||
if docx_combined_path:
|
||||
attachments.append(docx_combined_path)
|
||||
|
||||
# Add PDFs to attachments
|
||||
# Add PDFs based on mode:
|
||||
# - Always: TRANSCRIPT.pdf
|
||||
# - If summary: SUMMARY.pdf and COMBINED.pdf
|
||||
if pdf_files.get("transcript_pdf"):
|
||||
attachments.append(pdf_files["transcript_pdf"])
|
||||
if pdf_files.get("summary_pdf"):
|
||||
|
||||
Reference in New Issue
Block a user