Revert PDF generation; simplify to DOCX + MD + JSON only
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Remove PDF helpers, LibreOffice, PyPDF2, reportlab
- Transcript DOCX: standalone, no cover page, with line numbering
- Summary DOCX: standalone, no cover page, no line numbering
- Attachments:
  - Transcribe: JSON, transcript MD, transcript DOCX
  - Transcribe & Summarize: JSON, transcript MD, transcript DOCX, summary MD, summary DOCX
This commit is contained in:
admin
2026-06-15 03:38:12 +00:00
parent 6233a41f61
commit e27e5b8522
4 changed files with 14 additions and 480 deletions
+7 -357
View File
@@ -13,8 +13,6 @@ import logging
import os
import re
import smtplib
import subprocess
import tempfile
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
@@ -336,58 +334,6 @@ def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
sectPr.append(lnNumType)
def _add_cover_page(doc, doc_type, date, description):
    """
    Append a cover block to ``doc`` inside a single-cell table.

    The cell receives, in order:
      - the (cleared) default paragraph the cell starts with
      - the document type, bold
      - the date string (e.g. "June 14, 2026")
      - three empty spacer paragraphs
      - the one-sentence description

    Every text line is centre-aligned and set in 12pt Courier. Only
    horizontal centring is applied here; nothing in this function sets the
    cell's vertical alignment.
    """
    # A 1x1 table acts as the container for the cover text.
    cover_table = doc.add_table(rows=1, cols=1)
    cover_table.autofit = False
    cell = cover_table.cell(0, 0)
    # Approximate usable page width.
    cell.width = Inches(6.5)

    # Centre whatever paragraphs the new cell already holds, then blank the
    # first one so it carries no text.
    for existing in cell.paragraphs:
        existing.alignment = WD_ALIGN_PARAGRAPH.CENTER
    cell.paragraphs[0].clear()

    def _centered_line(content, emphasize=False):
        """Add one centred 12pt Courier paragraph containing ``content``."""
        para = cell.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.add_run(content)
        if emphasize:
            run.bold = True
        run.font.name = "Courier"
        run.font.size = Pt(12)

    # Line 1: document type (bold); line 2: date.
    _centered_line(doc_type, emphasize=True)
    _centered_line(date)

    # Lines 3-5: vertical spacing between the date and the description.
    for _ in range(3):
        cell.add_paragraph()

    # Line 6: description.
    _centered_line(description)
def _add_transcript_content(doc, text):
"""
Add transcript lines to the document with formatting.
@@ -467,146 +413,27 @@ def _add_summary_content(doc, text):
p.paragraph_format.space_after = Pt(4)
# ------------ PDF helpers ------------
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
    """
    Convert a .docx file to PDF by invoking headless LibreOffice.

    The PDF is expected at ``<output_dir>/<docx base name>.pdf``. A non-zero
    LibreOffice exit code is logged as a warning but does not by itself abort
    the lookup of that file.

    Returns the PDF path when the file exists afterwards; returns None when
    it does not exist or when any exception (including the 120 s timeout) is
    raised along the way.
    """
    command = [
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        "--outdir", output_dir,
        docx_path,
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if completed.returncode != 0:
            logger.warning("LibreOffice conversion failed: %s", completed.stderr)

        # LibreOffice names the output after the input file's base name.
        stem, _ext = os.path.splitext(os.path.basename(docx_path))
        expected_pdf = os.path.join(output_dir, f"{stem}.pdf")
        if not os.path.exists(expected_pdf):
            logger.warning("Converted PDF not found at: %s", expected_pdf)
            return None
        return expected_pdf
    except Exception as e:
        logger.error("Error converting DOCX to PDF: %s", e)
        return None
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
    """
    Merge multiple PDF files, in the given order, into ``output_pdf`` using
    PyPDF2.

    Input paths that do not exist on disk are skipped with a warning. If no
    input could be appended at all, nothing is written and False is returned,
    so an empty merge is never reported as success.

    Returns True on success, False on failure (any exception is logged and
    swallowed).
    """
    try:
        from PyPDF2 import PdfMerger

        merger = PdfMerger()
        try:
            appended = 0
            for pdf in input_pdfs:
                if os.path.exists(pdf):
                    merger.append(pdf)
                    appended += 1
                else:
                    logger.warning("Skipping missing PDF during merge: %s", pdf)

            if not appended:
                logger.error("Error merging PDFs: %s", "no input PDFs found")
                return False

            merger.write(output_pdf)
        finally:
            # Always release the file handles the merger holds, even when
            # append() or write() raised.
            merger.close()
        return True
    except Exception as e:
        logger.error("Error merging PDFs: %s", e)
        return False
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
    """
    Stamp "Page X of Y" onto every page of ``input_pdf`` and write the result
    to ``output_pdf``.

    The label is drawn with reportlab one inch from the right edge and one
    inch from the bottom of each page, sized from that page's own media box.
    The labels are rendered into an in-memory overlay PDF whose pages are
    then merged onto the corresponding source pages with PyPDF2, so the
    original page content is preserved.

    Returns True on success, False on failure (any exception is logged and
    swallowed).
    """
    try:
        import io

        from PyPDF2 import PdfReader, PdfWriter
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont
        from reportlab.pdfgen import canvas

        font_name = "Courier"
        font_size = 10
        margin = 1 * 72  # 1 inch, in PDF points

        # Best effort: register DejaVu Sans Mono under the "Courier" name.
        # If the font file is unavailable, reportlab's built-in Courier is
        # used instead.
        try:
            pdfmetrics.registerFont(
                TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
            )
        except Exception as font_err:
            logger.debug("Using built-in Courier for page numbers: %s", font_err)

        reader = PdfReader(input_pdf)
        num_pages = len(reader.pages)

        # Build ONE overlay document with one label page per source page.
        overlay_buffer = io.BytesIO()
        overlay = canvas.Canvas(overlay_buffer)
        for page_num, page in enumerate(reader.pages, start=1):
            page_width = float(page.mediabox.width)
            page_height = float(page.mediabox.height)
            overlay.setPageSize((page_width, page_height))
            overlay.setFont(font_name, font_size)
            page_text = f"Page {page_num} of {num_pages}"
            text_width = overlay.stringWidth(page_text, font_name, font_size)
            x = page_width - margin - text_width  # 1 inch from right
            y = margin  # 1 inch from bottom
            overlay.drawString(x, y, page_text)
            overlay.showPage()
        overlay.save()

        # Merge each label page onto its source page and write the result.
        overlay_buffer.seek(0)
        overlay_reader = PdfReader(overlay_buffer)
        writer = PdfWriter()
        for page, stamp in zip(reader.pages, overlay_reader.pages):
            page.merge_page(stamp)
            writer.add_page(page)

        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)
        return True
    except Exception as e:
        logger.error("Error adding page numbers to PDF: %s", e)
        return False
# ------------ Public DOCX/PDF functions ------------
# ------------ Public DOCX functions ------------
def create_transcript_docx(
    text: str,
    filename: str,
    include_cover: bool = False,
    cover_date: str = "",
    cover_desc: str = "",
):
    """
    Create a .docx transcript and save it to ``filename``.

    - Base font and section margins come from _configure_base_font and
      _configure_section_margins.
    - Continuous line numbering on the left, for the transcript content only.
    - No cover page by default (standalone transcript). A cover page is
      added only when ``include_cover`` is true AND both ``cover_date`` and
      ``cover_desc`` are non-empty; it is placed in its own section so it is
      not line-numbered.
    """
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

    # Without a cover, the transcript lives in the document's only section.
    transcript_section = doc.sections[0]

    if include_cover and cover_date and cover_desc:
        from docx.enum.section import WD_SECTION

        _add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
        # Start the transcript in a NEW section (on a new page) instead of
        # after a plain page break: line numbering is a per-section setting,
        # so sharing a section with the cover would number the cover too.
        # NOTE(review): the new section is expected to inherit the margins
        # configured above - confirm against python-docx add_section.
        transcript_section = doc.add_section(WD_SECTION.NEW_PAGE)

    # Enable line numbering for the transcript section only
    _enable_line_numbering(transcript_section)

    # Transcript content (with line numbering)
    _add_transcript_content(doc, text)

    doc.save(filename)
@@ -615,198 +442,21 @@ def create_transcript_docx(
def create_summary_docx(
    text: str,
    filename: str,
    include_cover: bool = False,
    cover_date: str = "",
    cover_desc: str = "",
):
    """
    Create a .docx summary and save it to ``filename``.

    No line numbering is applied. By default there is no cover page; one is
    prepended (followed by a page break) only when ``include_cover`` is true
    and both ``cover_date`` and ``cover_desc`` are non-empty.
    """
    summary_doc = Document()
    _configure_base_font(summary_doc)
    _configure_section_margins(summary_doc)

    # Optional cover page, separated from the body by a page break.
    cover_requested = include_cover and cover_date and cover_desc
    if cover_requested:
        _add_cover_page(summary_doc, "SUMMARY", cover_date, cover_desc)
        summary_doc.add_page_break()

    # Summary body (no line numbering).
    _add_summary_content(summary_doc, text)

    summary_doc.save(filename)
def create_combined_docx(
    transcript_text: str,
    summary_text: str,
    filename: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
):
    """
    Create a combined .docx and save it to ``filename``, laid out as:
    1) Transcript cover page (no line numbering)
    2) Page break
    3) Summary content (no line numbering)
    4) Section break to a new page
    5) Transcript content (line numbering enabled only here)

    Line numbering is a per-section setting, so the transcript is placed in
    its own section; the cover page and summary stay in the first,
    un-numbered section.

    ``summary_cover_date`` and ``summary_cover_desc`` are accepted for
    signature compatibility but are not used: the combined document has no
    separate summary cover page.
    """
    from docx.enum.section import WD_SECTION

    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

    # 1) Transcript cover page (no line numbering)
    _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)

    # 2) Page break
    doc.add_page_break()

    # 3) Summary content (no line numbering)
    _add_summary_content(doc, summary_text)

    # 4) Start a new section on a new page for the transcript. This replaces
    #    the plain page break so numbering can be limited to this section.
    # NOTE(review): the new section is expected to inherit the margins
    # configured above - confirm against python-docx add_section.
    transcript_section = doc.add_section(WD_SECTION.NEW_PAGE)
    _enable_line_numbering(transcript_section)

    # 5) Transcript content (with line numbering)
    _add_transcript_content(doc, transcript_text)

    doc.save(filename)
def _write_component_docx(path, populate, line_numbering=False):
    """Save a one-section .docx at ``path`` with the shared font/margin setup, filled by ``populate(doc)``."""
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)
    if line_numbering:
        _enable_line_numbering(doc.sections[0])
    populate(doc)
    doc.save(path)
    return path


def _assemble_numbered_pdf(parts, merged_path, final_path):
    """Merge ``parts`` into ``merged_path`` and write a page-numbered copy to ``final_path``; True on success."""
    # A part is None when its DOCX->PDF conversion failed; refuse up front
    # instead of letting the merge step trip over a None path.
    if not all(parts):
        return False
    return bool(
        _merge_pdfs(parts, merged_path)
        and _add_page_numbers_to_pdf(merged_path, final_path)
    )


def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents by:
    - Creating individual .docx components (cover pages, transcript, summary)
      in a temporary directory
    - Converting each .docx to PDF
    - Merging PDFs in the correct order
    - Adding page numbers to the final PDFs, written into ``output_dir``
      (created if missing)

    Outputs attempted:
    - Always:
      - TRANSCRIPT.pdf = transcript cover + line-numbered transcript
    - If summary_text is truthy and both summary components converted:
      - SUMMARY.pdf  = summary cover + summary
      - COMBINED.pdf = transcript cover + summary + line-numbered transcript

    Returns a dict containing a path ONLY for each output that was actually
    produced; a key is absent when any step for that output failed:
    - transcript_pdf
    - summary_pdf
    - combined_pdf
    """
    os.makedirs(output_dir, exist_ok=True)

    result: Dict[str, str] = {}

    # Intermediate DOCX/PDF files live in a temp dir that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        # 1) Generate constituent DOCX files
        transcript_cover_docx = _write_component_docx(
            os.path.join(tmpdir, "TRANSCRIPT_COVER.docx"),
            lambda doc: _add_cover_page(
                doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc
            ),
        )

        # Summary cover (only if summary is requested)
        summary_cover_docx = None
        if summary_text:
            summary_cover_docx = _write_component_docx(
                os.path.join(tmpdir, "SUMMARY_COVER.docx"),
                lambda doc: _add_cover_page(
                    doc, "SUMMARY", summary_cover_date, summary_cover_desc
                ),
            )

        # Transcript (with line numbering)
        transcript_docx = _write_component_docx(
            os.path.join(tmpdir, "TRANSCRIPT.docx"),
            lambda doc: _add_transcript_content(doc, transcript_text),
            line_numbering=True,
        )

        # Summary (no line numbering)
        summary_docx = None
        if summary_text:
            summary_docx = _write_component_docx(
                os.path.join(tmpdir, "SUMMARY.docx"),
                lambda doc: _add_summary_content(doc, summary_text),
            )

        # 2) Convert DOCX to PDF (each result is None if conversion failed)
        transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
        transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)

        summary_cover_pdf = None
        summary_pdf = None
        if summary_text:
            summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
            summary_pdf = _docx_to_pdf(summary_docx, tmpdir)

        # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
        transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
        if _assemble_numbered_pdf(
            [transcript_cover_pdf, transcript_pdf],
            os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf"),
            transcript_output_pdf,
        ):
            result["transcript_pdf"] = transcript_output_pdf

        # 4) If summary is provided:
        #    - SUMMARY.pdf: summary_cover + summary + page numbers
        #    - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
        if summary_text and summary_pdf and summary_cover_pdf:
            # SUMMARY.pdf
            summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
            if _assemble_numbered_pdf(
                [summary_cover_pdf, summary_pdf],
                os.path.join(tmpdir, "SUMMARY_MERGED.pdf"),
                summary_output_pdf,
            ):
                result["summary_pdf"] = summary_output_pdf

            # COMBINED.pdf
            combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
            if _assemble_numbered_pdf(
                [transcript_cover_pdf, summary_pdf, transcript_pdf],
                os.path.join(tmpdir, "COMBINED_MERGED.pdf"),
                combined_output_pdf,
            ):
                result["combined_pdf"] = combined_output_pdf

    return result
def send_success_email(
to: str,
transcript_text: str,