Revert PDF generation; simplify to DOCX + MD + JSON only
- Remove PDF helpers, LibreOffice, PyPDF2, reportlab
- Transcript DOCX: standalone, no cover page, with line numbering
- Summary DOCX: standalone, no cover page, no line numbering
- Attachments:
  - Transcribe: JSON, transcript MD, transcript DOCX
  - Transcribe & Summarize: JSON, transcript MD, transcript DOCX, summary MD, summary DOCX
This commit is contained in:
+7
-357
@@ -13,8 +13,6 @@ import logging
|
||||
import os
|
||||
import re
|
||||
import smtplib
|
||||
import subprocess
|
||||
import tempfile
|
||||
from email import encoders
|
||||
from email.mime.base import MIMEBase
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
@@ -336,58 +334,6 @@ def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
|
||||
sectPr.append(lnNumType)
|
||||
|
||||
|
||||
def _add_cover_page(doc, doc_type, date, description):
    """
    Append a cover block to ``doc`` inside a single-cell table.

    The cell is given a width of 6.5 inches and ends up holding, in order:
      - the cell's initial paragraph, centered and cleared (left empty)
      - ``doc_type`` in bold, centered
      - ``date``, centered
      - three empty spacer paragraphs
      - ``description``, centered

    Every text run is set to 12pt "Courier". Only horizontal centering is
    applied here; no vertical alignment is set on the cell.
    """
    cover_table = doc.add_table(rows=1, cols=1)
    cover_table.autofit = False
    cover_cell = cover_table.cell(0, 0)
    cover_cell.width = Inches(6.5)

    # Center whatever paragraphs the new cell starts with, then empty the first.
    for existing in cover_cell.paragraphs:
        existing.alignment = WD_ALIGN_PARAGRAPH.CENTER
    cover_cell.paragraphs[0].clear()

    def _centered_line(content, bold=False):
        # One centered paragraph with a single 12pt Courier run.
        para = cover_cell.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.add_run(content)
        if bold:
            run.bold = True
        run.font.name = "Courier"
        run.font.size = Pt(12)

    _centered_line(doc_type, bold=True)
    _centered_line(date)

    # Spacer paragraphs between the date and the description.
    for _ in range(3):
        cover_cell.add_paragraph()

    _centered_line(description)
|
||||
|
||||
|
||||
def _add_transcript_content(doc, text):
|
||||
"""
|
||||
Add transcript lines to the document with formatting.
|
||||
@@ -467,146 +413,27 @@ def _add_summary_content(doc, text):
|
||||
p.paragraph_format.space_after = Pt(4)
|
||||
|
||||
|
||||
# ------------ PDF helpers ------------
|
||||
|
||||
# The return annotation is quoted so it is never evaluated at definition time
# and does not depend on which typing names the module imports.
def _docx_to_pdf(docx_path: str, output_dir: str) -> "Optional[str]":
    """
    Convert a .docx file to PDF by running LibreOffice in headless mode.

    Args:
        docx_path: Path of the .docx file to convert.
        output_dir: Directory passed to LibreOffice as ``--outdir``.

    Returns:
        The path ``<output_dir>/<docx base name>.pdf`` if that file exists
        after the conversion attempt, otherwise ``None``. ``None`` is also
        returned when launching LibreOffice raises (for example the binary
        is missing or the 120 second timeout expires).
    """
    try:
        completed = subprocess.run(
            [
                "libreoffice",
                "--headless",
                "--convert-to", "pdf",
                "--outdir", output_dir,
                docx_path,
            ],
            capture_output=True,
            text=True,
            timeout=120,
        )

        # A non-zero exit is only logged: the presence of the output file
        # below is what decides success.
        if completed.returncode != 0:
            logger.warning("LibreOffice conversion failed: %s", completed.stderr)

        # LibreOffice creates a PDF with the same base name as the input.
        base = os.path.splitext(os.path.basename(docx_path))[0]
        pdf_path = os.path.join(output_dir, f"{base}.pdf")

        if os.path.exists(pdf_path):
            return pdf_path

        logger.warning("Converted PDF not found at: %s", pdf_path)
        return None

    except Exception as e:
        # Best-effort boundary: callers only check for None, so log with the
        # traceback and report failure instead of propagating.
        logger.exception("Error converting DOCX to PDF: %s", e)
        return None
|
||||
|
||||
|
||||
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
    """
    Merge several PDF files, in the given order, into ``output_pdf`` using PyPDF2.

    Args:
        input_pdfs: Paths of the PDFs to concatenate.
        output_pdf: Path the merged PDF is written to.

    Returns:
        True when the merged file was written. False when an entry of
        ``input_pdfs`` is empty/None, when none of the inputs exist on disk,
        or when PyPDF2 raises while importing, appending or writing.

    Inputs that are named but missing on disk are skipped with a warning,
    as long as at least one input exists.
    """
    # An empty/None entry means an earlier step failed; refuse to build a
    # partial document from it.
    if any(not pdf for pdf in input_pdfs):
        logger.error("Error merging PDFs: %s", "input list contains an empty path")
        return False

    available = [pdf for pdf in input_pdfs if os.path.exists(pdf)]
    for pdf in input_pdfs:
        if pdf not in available:
            logger.warning("Skipping missing PDF during merge: %s", pdf)

    # Writing a PDF with no pages and reporting success would hide the failure.
    if not available:
        logger.error("Error merging PDFs: %s", "no input PDFs found")
        return False

    try:
        from PyPDF2 import PdfMerger

        merger = PdfMerger()
        try:
            for pdf in available:
                merger.append(pdf)
            merger.write(output_pdf)
        finally:
            # Release the merger's file handles even when append/write raises.
            merger.close()
        return True

    except Exception as e:
        logger.error("Error merging PDFs: %s", e)
        return False
|
||||
|
||||
|
||||
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
    """
    Write a copy of ``input_pdf`` to ``output_pdf`` with "Page X of Y" stamped
    on every page.

    The label is drawn in 10pt Courier, right-aligned one inch from the right
    edge and one inch above the bottom edge of each page. An overlay PDF with
    one page per source page is drawn with reportlab and then merged onto the
    source pages with PyPDF2, so the original page content is preserved.

    Args:
        input_pdf: Path of the PDF to number.
        output_pdf: Path the numbered PDF is written to.

    Returns:
        True on success; False if the input has no pages or any step raises.
    """
    try:
        import io

        from PyPDF2 import PdfReader, PdfWriter
        from reportlab.pdfgen import canvas

        # "Courier" is one of the standard PDF fonts, so no TTF registration
        # is needed for it.
        font_name = "Courier"
        font_size = 10
        margin = 72  # 1 inch, in points

        reader = PdfReader(input_pdf)
        num_pages = len(reader.pages)
        if num_pages == 0:
            logger.error("Error adding page numbers to PDF: %s", "input has no pages")
            return False

        # 1) Draw the page-number overlay: a single canvas, one page per
        #    source page, each sized like the page it will be stamped on.
        overlay_buffer = io.BytesIO()
        overlay_canvas = canvas.Canvas(overlay_buffer)
        for page_index, page in enumerate(reader.pages, start=1):
            # NOTE(review): assumes the media box origin is (0, 0) and the
            # page is not rotated — confirm for PDFs from other producers.
            page_width = float(page.mediabox.width)
            page_height = float(page.mediabox.height)
            overlay_canvas.setPageSize((page_width, page_height))

            overlay_canvas.setFont(font_name, font_size)
            page_text = f"Page {page_index} of {num_pages}"
            text_width = overlay_canvas.stringWidth(page_text, font_name, font_size)
            overlay_canvas.drawString(page_width - margin - text_width, margin, page_text)
            overlay_canvas.showPage()
        overlay_canvas.save()
        overlay_buffer.seek(0)

        # 2) Stamp each overlay page onto the matching source page.
        overlay_reader = PdfReader(overlay_buffer)
        writer = PdfWriter()
        for page, stamp in zip(reader.pages, overlay_reader.pages):
            page.merge_page(stamp)
            writer.add_page(page)

        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)

        return True

    except Exception as e:
        logger.error("Error adding page numbers to PDF: %s", e)
        return False
|
||||
|
||||
|
||||
# ------------ Public DOCX/PDF functions ------------
|
||||
# ------------ Public DOCX functions ------------
|
||||
|
||||
def create_transcript_docx(
    text: str,
    filename: str,
    include_cover: bool = False,
    cover_date: str = "",
    cover_desc: str = "",
):
    """
    Write a transcript to ``filename`` as a .docx file.

    The document gets the shared base font and section margins, and line
    numbering is switched on for the document's first section before the
    transcript text is added.

    A cover page titled "TRANSCRIPT", followed by a page break, is placed
    ahead of the transcript only when ``include_cover`` is true and both
    ``cover_date`` and ``cover_desc`` are non-empty; otherwise the file
    holds the transcript alone.
    """
    document = Document()
    _configure_base_font(document)
    _configure_section_margins(document)

    # The cover is skipped unless it was requested AND both texts are present.
    cover_requested = include_cover and cover_date and cover_desc
    if cover_requested:
        _add_cover_page(document, "TRANSCRIPT", cover_date, cover_desc)
        document.add_page_break()

    first_section = document.sections[0]
    _enable_line_numbering(first_section)

    _add_transcript_content(document, text)

    document.save(filename)
|
||||
@@ -615,198 +442,21 @@ def create_transcript_docx(
|
||||
def create_summary_docx(
    text: str,
    filename: str,
    include_cover: bool = False,
    cover_date: str = "",
    cover_desc: str = "",
):
    """
    Write a summary to ``filename`` as a .docx file.

    The document gets the shared base font and section margins; line
    numbering is never enabled here.

    A cover page titled "SUMMARY", followed by a page break, is placed ahead
    of the summary only when ``include_cover`` is true and both
    ``cover_date`` and ``cover_desc`` are non-empty.
    """
    document = Document()
    _configure_base_font(document)
    _configure_section_margins(document)

    # The cover is skipped unless it was requested AND both texts are present.
    cover_requested = include_cover and cover_date and cover_desc
    if cover_requested:
        _add_cover_page(document, "SUMMARY", cover_date, cover_desc)
        document.add_page_break()

    _add_summary_content(document, text)

    document.save(filename)
|
||||
|
||||
|
||||
def create_combined_docx(
    transcript_text: str,
    summary_text: str,
    filename: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
):
    """
    Create a combined .docx containing, in order:
      1) Transcript cover page
      2) Page break
      3) Summary content
      4) Section break (new page)
      5) Transcript content

    Line numbering is enabled only on the section that holds the transcript,
    so the cover page and the summary are left unnumbered.

    ``summary_cover_date`` and ``summary_cover_desc`` are accepted for
    signature compatibility but are not used: the combined document has a
    single (transcript) cover page.
    """
    # Imported locally so this function does not depend on the module's
    # top-level imports for the section-break enum.
    from docx.enum.section import WD_SECTION

    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

    # 1-2) Transcript cover page, then move to a fresh page.
    _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
    doc.add_page_break()

    # 3) Summary content (stays in the first, unnumbered section).
    _add_summary_content(doc, summary_text)

    # 4) Start a new section on a new page for the transcript. Line numbering
    #    is a per-section setting, so enabling it on the first section would
    #    also number the summary above.
    transcript_section = doc.add_section(WD_SECTION.NEW_PAGE)
    _enable_line_numbering(transcript_section)

    # 5) Transcript content (inside the numbered section).
    _add_transcript_content(doc, transcript_text)

    doc.save(filename)
|
||||
|
||||
|
||||
def _write_component_docx(path: str, populate, line_numbers: bool = False) -> str:
    """Build one intermediate DOCX with the shared font/margin setup, save it to ``path`` and return ``path``."""
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)
    if line_numbers:
        _enable_line_numbering(doc.sections[0])
    populate(doc)
    doc.save(path)
    return path


def _merge_and_number(parts, merged_pdf: str, output_pdf: str) -> bool:
    """Merge ``parts`` into ``merged_pdf`` and write a page-numbered copy to ``output_pdf``; False if any part is missing or a step fails."""
    if not all(parts):
        return False
    return _merge_pdfs(list(parts), merged_pdf) and _add_page_numbers_to_pdf(
        merged_pdf, output_pdf
    )


def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents by:
      - Creating individual .docx components (cover pages, transcript, summary)
        in a temporary directory
      - Converting each .docx to PDF
      - Merging the PDFs in the correct order
      - Adding page numbers to the final PDFs in ``output_dir``

    Outputs attempted:
      - TRANSCRIPT.pdf: transcript cover + line-numbered transcript
      - If ``summary_text`` is non-empty:
          - SUMMARY.pdf: summary cover + summary
          - COMBINED.pdf: transcript cover + summary + line-numbered transcript

    Returns:
        A dict holding a path only for each output that was actually built:
          - "transcript_pdf"
          - "summary_pdf"
          - "combined_pdf"
        An output is left out when any of its components failed to convert,
        or when merging or page numbering failed.
    """
    os.makedirs(output_dir, exist_ok=True)

    result: Dict[str, str] = {}

    # Intermediate DOCX/PDF files live in a temp dir that is removed on exit;
    # only the final numbered PDFs are written to output_dir.
    with tempfile.TemporaryDirectory() as tmpdir:

        def in_tmp(name: str) -> str:
            return os.path.join(tmpdir, name)

        # 1) Generate constituent DOCX files
        transcript_cover_docx = _write_component_docx(
            in_tmp("TRANSCRIPT_COVER.docx"),
            lambda d: _add_cover_page(
                d, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc
            ),
        )
        transcript_docx = _write_component_docx(
            in_tmp("TRANSCRIPT.docx"),
            lambda d: _add_transcript_content(d, transcript_text),
            line_numbers=True,
        )

        summary_cover_docx = None
        summary_docx = None
        if summary_text:
            summary_cover_docx = _write_component_docx(
                in_tmp("SUMMARY_COVER.docx"),
                lambda d: _add_cover_page(
                    d, "SUMMARY", summary_cover_date, summary_cover_desc
                ),
            )
            summary_docx = _write_component_docx(
                in_tmp("SUMMARY.docx"),
                lambda d: _add_summary_content(d, summary_text),
            )

        # 2) Convert DOCX to PDF (each result is None when conversion failed)
        transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
        transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)

        summary_cover_pdf = None
        summary_pdf = None
        if summary_text:
            summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
            summary_pdf = _docx_to_pdf(summary_docx, tmpdir)

        # 3) TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
        transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
        if _merge_and_number(
            [transcript_cover_pdf, transcript_pdf],
            in_tmp("TRANSCRIPT_MERGED.pdf"),
            transcript_output_pdf,
        ):
            result["transcript_pdf"] = transcript_output_pdf

        if summary_text:
            # 4a) SUMMARY.pdf: summary_cover + summary + page numbers
            summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
            if _merge_and_number(
                [summary_cover_pdf, summary_pdf],
                in_tmp("SUMMARY_MERGED.pdf"),
                summary_output_pdf,
            ):
                result["summary_pdf"] = summary_output_pdf

            # 4b) COMBINED.pdf: transcript_cover + summary + transcript + page
            #     numbers. It depends on exactly these three parts, not on the
            #     summary cover.
            combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
            if _merge_and_number(
                [transcript_cover_pdf, summary_pdf, transcript_pdf],
                in_tmp("COMBINED_MERGED.pdf"),
                combined_output_pdf,
            ):
                result["combined_pdf"] = combined_output_pdf

    return result
|
||||
|
||||
|
||||
def send_success_email(
|
||||
to: str,
|
||||
transcript_text: str,
|
||||
|
||||
Reference in New Issue
Block a user