Add PDF-based document generation with LibreOffice; fix line numbering and margins
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Add LibreOffice Writer and DejaVu fonts to Dockerfile for PDF generation
- Add PyPDF2 and reportlab to requirements.txt
- Refactor email_sender.py:
  - Enforce 1-inch margins on all sides
  - Isolate line numbering to transcript section only
  - Add generate_pdf_documents() to build:
    - TRANSCRIPT.pdf (cover + transcript)
    - SUMMARY.pdf (cover + summary)
    - COMBINED.pdf (transcript cover + summary + TRANSCRIPT header + transcript)
  - Add page numbers (bottom-right) to all PDFs via reportlab
- Update tasks.py:
  - Use generate_pdf_documents() after creating DOCX files
  - Attach source JSON, MD files, and compiled PDFs in success email
- Add test_docx_generation.py for transcript/summary/combined DOCX testing
This commit is contained in:
admin
2026-06-15 02:19:17 +00:00
parent b0a23b32e1
commit 42a155aeaa
5 changed files with 534 additions and 119 deletions
+398 -114
View File
@@ -8,20 +8,23 @@ Template placeholders are primarily filled via environment variables.
"""
import base64
import json
import logging
import os
import re
import smtplib
import logging
import subprocess
import tempfile
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import List, Optional, Dict, Any
from typing import Any, Dict, List, Optional
from docx import Document
from docx.shared import Inches, Pt
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
logger = logging.getLogger("scraibe.email_sender")
@@ -221,12 +224,12 @@ def send_email(
msg["To"] = ", ".join(to_list)
if cc_list:
msg["Cc"] = ", ".join(cc_list)
# Ensure subject is never blank
if not subject or not subject.strip():
logger.warning("Subject was blank; using default subject.")
subject = "ScrAIbe: Your transcript is ready"
msg["Subject"] = subject.strip()
# Attach plain text
@@ -291,98 +294,44 @@ def send_email(
raise EmailError(f"Failed to send email: {e}")
def _remove_line_numbering(section):
    """
    Drop the ``w:lnNumType`` child from the section's properties element.

    Sections that carry no such child are left untouched.
    """
    section_props = section._sectPr
    numbering = section_props.find(qn("w:lnNumType"))
    if numbering is None:
        return
    section_props.remove(numbering)
# ------------ DOCX helpers ------------
def _enable_line_numbering(section):
def _configure_base_font(doc):
"""
Enable continuous line numbering for a section.
Set base document font to 12pt Courier.
"""
sectPr = section._sectPr
lnNumType = sectPr.find(qn("w:lnNumType"))
if lnNumType is None:
lnNumType = OxmlElement("w:lnNumType")
sectPr.append(lnNumType)
lnNumType.set(qn("w:start"), "continuous")
lnNumType.set(qn("w:countBy"), "1")
def _setup_docx_style(doc, enable_line_numbering=False):
"""
Base document setup (margins, font, footer).
Line numbering is optional and applied to the first section only.
"""
section = doc.sections[0]
section.left_margin = Inches(1.5)
section.right_margin = Inches(1.0)
section.top_margin = Inches(1.0)
section.bottom_margin = Inches(1.0)
# Line numbering (only for transcript sections)
if enable_line_numbering:
_enable_line_numbering(section)
else:
_remove_line_numbering(section)
# Default font
style = doc.styles["Normal"]
font = style.font
font.name = "Courier"
font.size = Pt(12)
style.font.name = "Courier"
style.font.size = Pt(12)
# Add "Page X of Y" footer, right-aligned
footer = section.footer
footer.is_linked_to_previous = False
p = footer.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
# PAGE field (current page number)
run_page = p.add_run()
r = run_page._r
r.clear()
def _configure_section_margins(doc, margin=1.0):
    """
    Apply the same margin, given in inches, to all four sides of every
    section in the document (1 inch unless told otherwise).
    """
    sides = ("left_margin", "right_margin", "top_margin", "bottom_margin")
    for section in doc.sections:
        for side in sides:
            setattr(section, side, Inches(margin))
fldChar_begin = OxmlElement("w:fldChar")
fldChar_begin.set(qn("w:fldCharType"), "begin")
r.append(fldChar_begin)
instrText = OxmlElement("w:instrText")
instrText.set(qn("xml:space"), "preserve")
instrText.text = "PAGE"
r.append(instrText)
def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
"""
Enable line numbering for a specific section.
"""
sectPr = section._sectPr
fldChar_end = OxmlElement("w:fldChar")
fldChar_end.set(qn("w:fldCharType"), "end")
r.append(fldChar_end)
# Create the line numbering element: <w:lnNumType>
lnNumType = OxmlElement("w:lnNumType")
lnNumType.set(qn("w:start"), str(start_at))
lnNumType.set(qn("w:countBy"), str(count_by))
# " of " text
run_of = p.add_run(" of ")
run_of.font.name = "Courier"
run_of.font.size = Pt(10)
# 'eachPage' restarts numbering on every page; 'continuous' keeps it going
restart_mode = "eachPage" if restart else "continuous"
lnNumType.set(qn("w:restart"), restart_mode)
# NUMPAGES field (total pages)
run_numpages = p.add_run()
r = run_numpages._r
r.clear()
fldChar_begin = OxmlElement("w:fldChar")
fldChar_begin.set(qn("w:fldCharType"), "begin")
r.append(fldChar_begin)
instrText = OxmlElement("w:instrText")
instrText.set(qn("xml:space"), "preserve")
instrText.text = "NUMPAGES"
r.append(instrText)
fldChar_end = OxmlElement("w:fldChar")
fldChar_end.set(qn("w:fldCharType"), "end")
r.append(fldChar_end)
sectPr.append(lnNumType)
def _add_cover_page(doc, doc_type, date, description):
@@ -394,14 +343,13 @@ def _add_cover_page(doc, doc_type, date, description):
2) Date (e.g. "June 14, 2026")
3-5) Empty space
6) One-sentence description
- Then page break.
"""
# Create a full-page table to center content vertically and horizontally
table = doc.add_table(rows=1, cols=1)
table.autofit = False
cell = table.cell(0, 0)
# Make table span full page height (approx)
# Make table span full page width (approx)
cell.width = Inches(6.5)
# Center content inside the cell
@@ -437,11 +385,11 @@ def _add_cover_page(doc, doc_type, date, description):
run_desc.font.name = "Courier"
run_desc.font.size = Pt(12)
# Page break after cover page
doc.add_page_break()
def _add_transcript_content(doc, text):
"""
Add transcript lines to the document with formatting.
"""
lines = text.strip().split("\n")
for line in lines:
line = line.strip()
@@ -479,6 +427,9 @@ def _add_transcript_content(doc, text):
def _add_summary_content(doc, text):
"""
Add summary content with heading styles.
"""
heading_count = 0
for line in text.splitlines():
stripped = line.strip()
@@ -514,6 +465,119 @@ def _add_summary_content(doc, text):
p.paragraph_format.space_after = Pt(4)
# ------------ PDF helpers ------------
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
"""
Convert a .docx file to PDF using LibreOffice.
Returns the path of the generated PDF.
"""
try:
# Use LibreOffice in headless mode
result = subprocess.run(
[
"libreoffice",
"--headless",
"--convert-to", "pdf",
"--outdir", output_dir,
docx_path,
],
capture_output=True,
text=True,
timeout=60,
)
if result.returncode != 0:
logger.warning("LibreOffice conversion failed: %s", result.stderr)
# LibreOffice creates a PDF with the same base name
base = os.path.splitext(os.path.basename(docx_path))[0]
pdf_path = os.path.join(output_dir, f"{base}.pdf")
if os.path.exists(pdf_path):
return pdf_path
else:
logger.warning("Converted PDF not found at: %s", pdf_path)
return None
except Exception as e:
logger.error("Error converting DOCX to PDF: %s", e)
return None
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
"""
Merge multiple PDF files into a single PDF using PyPDF2.
Returns True on success, False on failure.
"""
try:
from PyPDF2 import PdfMerger
merger = PdfMerger()
for pdf in input_pdfs:
if os.path.exists(pdf):
merger.append(pdf)
merger.write(output_pdf)
merger.close()
return True
except Exception as e:
logger.error("Error merging PDFs: %s", e)
return False
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
"""
Add page numbers to a PDF using reportlab.
Page numbers appear at the bottom-right of each page.
"""
try:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from PyPDF2 import PdfReader
# Try to use a standard font
font_name = "Courier"
try:
pdfmetrics.registerFont(
TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
)
except Exception:
pass
reader = PdfReader(input_pdf)
num_pages = len(reader.pages)
# Create a canvas to add page numbers
c = canvas.Canvas(output_pdf, pagesize=letter)
for page_num in range(num_pages):
page = reader.pages[page_num]
c = canvas.Canvas(output_pdf, pagesize=letter)
# Add page number
c.setFont(font_name, 10)
page_text = f"Page {page_num + 1} of {num_pages}"
text_width = c.stringWidth(page_text, font_name, 10)
x = letter[0] - 1 * 72 - text_width # 1 inch from right
y = 1 * 72 # 1 inch from bottom
c.drawString(x, y, page_text)
c.showPage()
c.save()
return True
except Exception as e:
logger.error("Error adding page numbers to PDF: %s", e)
return False
# ------------ Public DOCX/PDF functions ------------
def create_transcript_docx(
text: str,
filename: str,
@@ -523,19 +587,26 @@ def create_transcript_docx(
):
"""
Create a .docx transcript with:
- 1.5" left margin, 1" right margin
- 1" margins on all sides
- 12pt Courier
- Continuous line numbering on the left (for transcript content only)
- Optional cover page with type, date, and AI-generated description.
"""
doc = Document()
# Enable line numbering for transcript
_setup_docx_style(doc, enable_line_numbering=True)
_configure_base_font(doc)
_configure_section_margins(doc)
# 1) Optional cover page (no line numbering)
if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
doc.add_page_break()
# 2) Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
# 3) Transcript content (with line numbering)
_add_transcript_content(doc, text)
doc.save(filename)
@@ -552,13 +623,17 @@ def create_summary_docx(
No line numbering.
"""
doc = Document()
# No line numbering for summary
_setup_docx_style(doc, enable_line_numbering=False)
_configure_base_font(doc)
_configure_section_margins(doc)
# 1) Optional cover page
if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
doc.add_page_break()
# 2) Summary content (no line numbering)
_add_summary_content(doc, text)
doc.save(filename)
@@ -577,34 +652,243 @@ def create_combined_docx(
2) Page break
3) Summary content (no line numbering)
4) Page break
5) Transcript content (line numbering enabled)
5) Transcript content (line numbering enabled only here)
Line numbering is restricted to the transcript section only.
"""
doc = Document()
# Start with no line numbering (for cover and summary)
_setup_docx_style(doc, enable_line_numbering=False)
_configure_base_font(doc)
_configure_section_margins(doc)
# 1) Transcript cover page (includes trailing page break)
# 1) Transcript cover page (no line numbering)
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
doc.add_page_break()
# 3) Summary content (no line numbering)
_add_summary_content(doc, summary_text)
# 4) Page break before transcript
doc.add_page_break()
# Enable line numbering for transcript section
# We create a new section for transcript so line numbering applies only there
section_transcript = doc.add_section()
# Apply same margins
section_transcript.left_margin = Inches(1.5)
section_transcript.right_margin = Inches(1.0)
section_transcript.top_margin = Inches(1.0)
section_transcript.bottom_margin = Inches(1.0)
# Enable line numbering in transcript section
_enable_line_numbering(section_transcript)
# 4) Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
# 5) Transcript content (with line numbering)
_add_transcript_content(doc, transcript_text)
doc.save(filename)
def _new_pdf_source_document():
    """Return a blank Document with the shared base font and margins applied."""
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)
    return doc


def _build_numbered_pdf(parts, merged_path, output_path):
    """Merge ``parts`` in order and stamp page numbers; True only if ``output_path`` was produced."""
    if not all(parts):
        # A None entry means a DOCX -> PDF conversion failed; do not ship a
        # document that is missing one of its pieces.
        logger.warning("Skipping %s: a source PDF could not be generated", output_path)
        return False
    if not _merge_pdfs(list(parts), merged_path):
        return False
    if not _add_page_numbers_to_pdf(merged_path, output_path):
        logger.warning("Skipping %s: page numbering failed", output_path)
        return False
    return True


def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents for transcript, summary, and combined.

    Each piece (covers, transcript, summary) is built as a DOCX in a temporary
    directory, converted to PDF with LibreOffice, merged, and stamped with
    page numbers. Final files are written to ``output_dir`` (created if
    missing) as TRANSCRIPT.pdf, SUMMARY.pdf and COMBINED.pdf.

    Args:
        transcript_text: Transcript body; rendered with line numbering.
        summary_text: Summary body; when empty/None no summary or combined
            PDF is produced.
        output_dir: Directory receiving the final PDFs.
        transcript_cover_date: Date shown on the transcript cover page.
        transcript_cover_desc: Description shown on the transcript cover page.
        summary_cover_date: Date shown on the summary cover page.
        summary_cover_desc: Description shown on the summary cover page.

    Returns:
        A dict containing only the documents that were actually produced:
          - transcript_pdf
          - summary_pdf (if summary_text provided)
          - combined_pdf (if summary_text provided)
        A key is omitted when any step for that document failed, so every
        returned path refers to a file that was written.
    """
    os.makedirs(output_dir, exist_ok=True)

    result: Dict[str, str] = {}

    # Create temporary directory for intermediate files
    with tempfile.TemporaryDirectory() as tmpdir:
        # ---- Transcript pieces ----
        transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
        doc = _new_pdf_source_document()
        _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
        doc.save(transcript_cover_docx)

        # Transcript (with line numbering)
        transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
        doc = _new_pdf_source_document()
        _enable_line_numbering(doc.sections[0])
        _add_transcript_content(doc, transcript_text)
        doc.save(transcript_docx)

        transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
        transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)

        # Transcript PDF: cover + transcript + page numbers
        transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
        if _build_numbered_pdf(
            [transcript_cover_pdf, transcript_pdf],
            os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf"),
            transcript_output_pdf,
        ):
            result["transcript_pdf"] = transcript_output_pdf

        # Everything below needs a summary; skip the extra LibreOffice runs
        # (and a crash on a None summary) when there is none.
        if not summary_text:
            return result

        # ---- Summary pieces ----
        summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
        doc = _new_pdf_source_document()
        _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
        doc.save(summary_cover_docx)

        # Summary (no line numbering)
        summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
        doc = _new_pdf_source_document()
        _add_summary_content(doc, summary_text)
        doc.save(summary_docx)

        summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
        summary_pdf = _docx_to_pdf(summary_docx, tmpdir)

        # Summary PDF: cover + summary + page numbers
        summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
        if _build_numbered_pdf(
            [summary_cover_pdf, summary_pdf],
            os.path.join(tmpdir, "SUMMARY_MERGED.pdf"),
            summary_output_pdf,
        ):
            result["summary_pdf"] = summary_output_pdf

        # ---- Combined PDF ----
        # transcript cover + summary + "TRANSCRIPT" header + transcript + page numbers
        transcript_header_docx = os.path.join(tmpdir, "TRANSCRIPT_HEADER.docx")
        doc = _new_pdf_source_document()

        # Add "TRANSCRIPT" header
        p = doc.add_paragraph()
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = p.add_run("TRANSCRIPT")
        run.bold = True
        run.font.name = "Courier"
        run.font.size = Pt(12)

        # Page break
        doc.add_page_break()

        # Enable line numbering for transcript section
        _enable_line_numbering(doc.sections[0])

        # Transcript content (with line numbering)
        _add_transcript_content(doc, transcript_text)
        doc.save(transcript_header_docx)

        transcript_header_pdf = _docx_to_pdf(transcript_header_docx, tmpdir)

        combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
        if _build_numbered_pdf(
            [transcript_cover_pdf, summary_pdf, transcript_header_pdf],
            os.path.join(tmpdir, "COMBINED_MERGED.pdf"),
            combined_output_pdf,
        ):
            result["combined_pdf"] = combined_output_pdf

    return result
def send_success_email(
    to: str,
    transcript_text: str,
    summary_text: str,
    attachments: List[str],
    task_id: str,
):
    """
    Notify the recipient that a transcription finished, with attachments.

    get_email_config() is called first; an EmailError from it is logged and
    re-raised. The plain-text body lists the attached documents (summary
    entries only when summary_text is non-empty). The HTML part comes from
    "success_template.html", fed with 500-character previews of the
    transcript and summary; if loading it raises EmailError the email is
    sent without an HTML part.
    """
    try:
        get_email_config()
    except EmailError as config_error:
        logger.error("Email configuration error: %s", config_error)
        raise

    # Plain-text part
    plain_body = f"""
Your transcription is complete.
Task ID: {task_id}
Please find the attached documents:
- Transcript (MD)
- Transcript (DOCX)
- Source JSON
"""
    if summary_text:
        plain_body = plain_body + "- Summary (MD)\n- Summary (DOCX)\n"

    # HTML part (optional)
    try:
        template_fields = {
            "task_id": task_id,
            "transcript_text": transcript_text[:500],
            "summary_text": summary_text[:500] if summary_text else "",
        }
        html_body = load_template("success_template.html", **template_fields)
    except EmailError:
        html_body = None

    send_email(
        to=to,
        subject=f"Transcription Complete - Task {task_id}",
        body=plain_body,
        html=html_body,
        attachments=attachments,
    )
def send_error_email(
    to: str,
    error_message: str,
    task_id: str,
):
    """
    Notify the recipient that processing a transcription failed.

    get_email_config() is called first; an EmailError from it is logged and
    re-raised. The HTML part comes from "error_notification_template.html";
    if loading it raises EmailError the email is sent without an HTML part.
    No attachments are included.
    """
    try:
        get_email_config()
    except EmailError as config_error:
        logger.error("Email configuration error: %s", config_error)
        raise

    # Plain-text part
    plain_body = f"""
There was an error processing your transcription.
Task ID: {task_id}
Error: {error_message}
"""

    # HTML part (optional)
    try:
        template_fields = {
            "task_id": task_id,
            "error_message": error_message,
        }
        html_body = load_template("error_notification_template.html", **template_fields)
    except EmailError:
        html_body = None

    send_email(
        to=to,
        subject=f"Transcription Error - Task {task_id}",
        body=plain_body,
        html=html_body,
        attachments=[],
    )