Refactor PDF generation and attachment logic
- Generate PDFs by:
  - Creating individual .docx components (cover, transcript, summary)
  - Converting each .docx to PDF
  - Merging the PDFs in the correct order
  - Adding page numbers to the final PDFs
- Transcribe & Summarize:
  - Attach: JSON, transcript MD, summary MD, TRANSCRIPT.pdf, SUMMARY.pdf, COMBINED.pdf
- Transcribe only:
  - Attach: JSON, transcript MD, TRANSCRIPT.pdf
- Ensure transcript line numbering is isolated to its own .docx before the PDF merge
This commit is contained in:
+69
-65
@@ -472,7 +472,7 @@ def _add_summary_content(doc, text):
|
|||||||
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
|
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
|
||||||
"""
|
"""
|
||||||
Convert a .docx file to PDF using LibreOffice.
|
Convert a .docx file to PDF using LibreOffice.
|
||||||
Returns the path of the generated PDF.
|
Returns the path of the generated PDF, or None on failure.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Use LibreOffice in headless mode
|
# Use LibreOffice in headless mode
|
||||||
@@ -486,7 +486,7 @@ def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
|
|||||||
],
|
],
|
||||||
capture_output=True,
|
capture_output=True,
|
||||||
text=True,
|
text=True,
|
||||||
timeout=60,
|
timeout=120,
|
||||||
)
|
)
|
||||||
|
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
@@ -689,10 +689,24 @@ def generate_pdf_documents(
|
|||||||
summary_cover_desc: str,
|
summary_cover_desc: str,
|
||||||
) -> Dict[str, str]:
|
) -> Dict[str, str]:
|
||||||
"""
|
"""
|
||||||
Generate PDF documents for transcript, summary, and combined.
|
Generate PDF documents by:
|
||||||
|
- Creating individual .docx components (cover pages, transcript, summary)
|
||||||
|
- Converting each .docx to PDF
|
||||||
|
- Merging PDFs in the correct order
|
||||||
|
- Adding page numbers to the final PDFs
|
||||||
|
|
||||||
|
Behavior:
|
||||||
|
- Always:
|
||||||
|
- Generate TRANSCRIPT.pdf:
|
||||||
|
- transcript_cover.pdf + transcript_with_line_numbers.pdf
|
||||||
|
- If summary_text is provided:
|
||||||
|
- Generate SUMMARY.pdf:
|
||||||
|
- summary_cover.pdf + summary.pdf
|
||||||
|
- Generate COMBINED.pdf:
|
||||||
|
- transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf
|
||||||
|
|
||||||
Returns a dict with paths:
|
Returns a dict with paths:
|
||||||
- transcript_pdf
|
- transcript_pdf (always)
|
||||||
- summary_pdf (if summary_text provided)
|
- summary_pdf (if summary_text provided)
|
||||||
- combined_pdf (if summary_text provided)
|
- combined_pdf (if summary_text provided)
|
||||||
"""
|
"""
|
||||||
@@ -705,6 +719,7 @@ def generate_pdf_documents(
|
|||||||
# Create temporary directory for intermediate files
|
# Create temporary directory for intermediate files
|
||||||
with tempfile.TemporaryDirectory() as tmpdir:
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
# 1) Generate constituent DOCX files
|
# 1) Generate constituent DOCX files
|
||||||
|
|
||||||
# Transcript cover
|
# Transcript cover
|
||||||
transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
|
transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
|
||||||
doc = Document()
|
doc = Document()
|
||||||
@@ -713,13 +728,15 @@ def generate_pdf_documents(
|
|||||||
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
|
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
|
||||||
doc.save(transcript_cover_docx)
|
doc.save(transcript_cover_docx)
|
||||||
|
|
||||||
# Summary cover
|
# Summary cover (only if summary is requested)
|
||||||
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
|
summary_cover_docx = None
|
||||||
doc = Document()
|
if summary_text:
|
||||||
_configure_base_font(doc)
|
summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
|
||||||
_configure_section_margins(doc)
|
doc = Document()
|
||||||
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
|
_configure_base_font(doc)
|
||||||
doc.save(summary_cover_docx)
|
_configure_section_margins(doc)
|
||||||
|
_add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
|
||||||
|
doc.save(summary_cover_docx)
|
||||||
|
|
||||||
# Transcript (with line numbering)
|
# Transcript (with line numbering)
|
||||||
transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
|
transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
|
||||||
@@ -731,73 +748,60 @@ def generate_pdf_documents(
|
|||||||
doc.save(transcript_docx)
|
doc.save(transcript_docx)
|
||||||
|
|
||||||
# Summary (no line numbering)
|
# Summary (no line numbering)
|
||||||
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
|
summary_docx = None
|
||||||
doc = Document()
|
|
||||||
_configure_base_font(doc)
|
|
||||||
_configure_section_margins(doc)
|
|
||||||
_add_summary_content(doc, summary_text)
|
|
||||||
doc.save(summary_docx)
|
|
||||||
|
|
||||||
# 2) Convert DOCX to PDF
|
|
||||||
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
|
|
||||||
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
|
|
||||||
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
|
|
||||||
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
|
|
||||||
|
|
||||||
# 3) Assemble final PDFs
|
|
||||||
|
|
||||||
# Transcript PDF: cover + transcript + page numbers
|
|
||||||
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
|
|
||||||
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
|
|
||||||
if _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript):
|
|
||||||
_add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
|
|
||||||
result["transcript_pdf"] = transcript_output_pdf
|
|
||||||
|
|
||||||
# Summary PDF: cover + summary + page numbers
|
|
||||||
if summary_text:
|
if summary_text:
|
||||||
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
|
summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
|
||||||
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
|
|
||||||
if _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary):
|
|
||||||
_add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
|
|
||||||
result["summary_pdf"] = summary_output_pdf
|
|
||||||
|
|
||||||
# Combined PDF: transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers
|
|
||||||
if summary_text:
|
|
||||||
# Create a document with "TRANSCRIPT" header before transcript
|
|
||||||
transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx")
|
|
||||||
doc = Document()
|
doc = Document()
|
||||||
_configure_base_font(doc)
|
_configure_base_font(doc)
|
||||||
_configure_section_margins(doc)
|
_configure_section_margins(doc)
|
||||||
|
_add_summary_content(doc, summary_text)
|
||||||
|
doc.save(summary_docx)
|
||||||
|
|
||||||
# Add "TRANSCRIPT" header
|
# 2) Convert DOCX to PDF
|
||||||
p = doc.add_paragraph()
|
|
||||||
p.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
|
||||||
run = p.add_run("TRANSCRIPT")
|
|
||||||
run.bold = True
|
|
||||||
run.font.name = "Courier"
|
|
||||||
run.font.size = Pt(12)
|
|
||||||
|
|
||||||
# Page break
|
transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
|
||||||
doc.add_page_break()
|
transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
|
||||||
|
|
||||||
# Enable line numbering for transcript section
|
summary_cover_pdf = None
|
||||||
_enable_line_numbering(doc.sections[0])
|
summary_pdf = None
|
||||||
|
if summary_text:
|
||||||
|
summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
|
||||||
|
summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
|
||||||
|
|
||||||
# Transcript content (with line numbering)
|
# 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
|
||||||
_add_transcript_content(doc, transcript_text)
|
transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
|
||||||
doc.save(transcript_header_docx)
|
merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
|
||||||
|
if (
|
||||||
|
transcript_cover_pdf
|
||||||
|
and transcript_pdf
|
||||||
|
and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript)
|
||||||
|
and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
|
||||||
|
):
|
||||||
|
result["transcript_pdf"] = transcript_output_pdf
|
||||||
|
|
||||||
# Convert to PDF
|
# 4) If summary is provided:
|
||||||
transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir)
|
# - SUMMARY.pdf: summary_cover + summary + page numbers
|
||||||
|
# - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
|
||||||
|
if summary_text and summary_pdf and summary_cover_pdf:
|
||||||
|
# SUMMARY.pdf
|
||||||
|
summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
|
||||||
|
merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
|
||||||
|
if (
|
||||||
|
_merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary)
|
||||||
|
and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
|
||||||
|
):
|
||||||
|
result["summary_pdf"] = summary_output_pdf
|
||||||
|
|
||||||
# Assemble combined PDF
|
# COMBINED.pdf
|
||||||
combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
|
combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
|
||||||
merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
|
merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
|
||||||
if _merge_pdfs(
|
if (
|
||||||
[transcript_cover_pdf, summary_pdf, transcript_header_pdf],
|
_merge_pdfs(
|
||||||
merged_combined,
|
[transcript_cover_pdf, summary_pdf, transcript_pdf],
|
||||||
|
merged_combined,
|
||||||
|
)
|
||||||
|
and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
|
||||||
):
|
):
|
||||||
_add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
|
|
||||||
result["combined_pdf"] = combined_output_pdf
|
result["combined_pdf"] = combined_output_pdf
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|||||||
+12
-2
@@ -546,12 +546,16 @@ def process_transcription_task(
|
|||||||
temp_files.append(docx_combined_path)
|
temp_files.append(docx_combined_path)
|
||||||
|
|
||||||
# 4b) Generate PDF documents
|
# 4b) Generate PDF documents
|
||||||
|
|
||||||
|
# Always use transcript text; use summary_text only if include_summary is True
|
||||||
|
pdf_summary_text = summary_text if include_summary else ""
|
||||||
|
|
||||||
pdf_files = {}
|
pdf_files = {}
|
||||||
try:
|
try:
|
||||||
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
|
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
|
||||||
pdf_files = generate_pdf_documents(
|
pdf_files = generate_pdf_documents(
|
||||||
transcript_text=transcript_text,
|
transcript_text=transcript_text,
|
||||||
summary_text=summary_text if include_summary else "",
|
summary_text=pdf_summary_text,
|
||||||
output_dir=pdf_output_dir,
|
output_dir=pdf_output_dir,
|
||||||
transcript_cover_date=today_str,
|
transcript_cover_date=today_str,
|
||||||
transcript_cover_desc=transcript_cover_desc,
|
transcript_cover_desc=transcript_cover_desc,
|
||||||
@@ -565,17 +569,23 @@ def process_transcription_task(
|
|||||||
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
|
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
|
||||||
|
|
||||||
# 5) Build attachments list
|
# 5) Build attachments list
|
||||||
|
|
||||||
|
# Base attachments: JSON, transcript MD, transcript DOCX
|
||||||
attachments = [
|
attachments = [
|
||||||
md_transcript_path,
|
md_transcript_path,
|
||||||
docx_transcript_path,
|
docx_transcript_path,
|
||||||
json_path,
|
json_path,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# If summary is present, add summary MD and DOCXs
|
||||||
if summary_text:
|
if summary_text:
|
||||||
attachments += [md_summary_path, docx_summary_path]
|
attachments += [md_summary_path, docx_summary_path]
|
||||||
if docx_combined_path:
|
if docx_combined_path:
|
||||||
attachments.append(docx_combined_path)
|
attachments.append(docx_combined_path)
|
||||||
|
|
||||||
# Add PDFs to attachments
|
# Add PDFs based on mode:
|
||||||
|
# - Always: TRANSCRIPT.pdf
|
||||||
|
# - If summary: SUMMARY.pdf and COMBINED.pdf
|
||||||
if pdf_files.get("transcript_pdf"):
|
if pdf_files.get("transcript_pdf"):
|
||||||
attachments.append(pdf_files["transcript_pdf"])
|
attachments.append(pdf_files["transcript_pdf"])
|
||||||
if pdf_files.get("summary_pdf"):
|
if pdf_files.get("summary_pdf"):
|
||||||
|
|||||||
Reference in New Issue
Block a user