Add PDF-based document generation with LibreOffice; fix line numbering and margins
- Add LibreOffice Writer and DejaVu fonts to Dockerfile for PDF generation
- Add PyPDF2 and reportlab to requirements.txt
- Refactor email_sender.py:
  - Enforce 1-inch margins on all sides
  - Isolate line numbering to the transcript section only
  - Add generate_pdf_documents() to build:
    - TRANSCRIPT.pdf (cover + transcript)
    - SUMMARY.pdf (cover + summary)
    - COMBINED.pdf (transcript cover + summary + TRANSCRIPT header + transcript)
  - Add page numbers (bottom-right) to all PDFs via reportlab
- Update tasks.py:
  - Use generate_pdf_documents() after creating DOCX files
  - Attach source JSON, MD files, and compiled PDFs in success email
- Add test_docx_generation.py for transcript/summary/combined DOCX testing
This commit is contained in:
+7
-2
@@ -9,9 +9,14 @@ LABEL description="Scraibe: LocalAI-backed transcription and diarization client
|
|||||||
Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
|
Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
|
||||||
LABEL url="https://git.optimex.systems/admin/scribe"
|
LABEL url="https://git.optimex.systems/admin/scribe"
|
||||||
|
|
||||||
# Install system dependencies (ffmpeg, redis)
|
# Install system dependencies (ffmpeg, redis, LibreOffice for PDF generation)
|
||||||
RUN apt update -y && \
|
RUN apt update -y && \
|
||||||
apt install -y --no-install-recommends ffmpeg redis-server && \
|
apt install -y --no-install-recommends \
|
||||||
|
ffmpeg \
|
||||||
|
redis-server \
|
||||||
|
libreoffice-writer \
|
||||||
|
fonts-dejavu-core \
|
||||||
|
&& \
|
||||||
apt clean && \
|
apt clean && \
|
||||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
|
|
||||||
|
|||||||
@@ -6,3 +6,5 @@ PyYAML>=6.0
|
|||||||
celery[redis]>=5.3.0
|
celery[redis]>=5.3.0
|
||||||
redis>=5.0.0
|
redis>=5.0.0
|
||||||
python-docx>=1.1.0
|
python-docx>=1.1.0
|
||||||
|
PyPDF2>=3.0.0
|
||||||
|
reportlab>=4.0.0
|
||||||
|
|||||||
+396
-112
@@ -8,20 +8,23 @@ Template placeholders are primarily filled via environment variables.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import smtplib
|
import smtplib
|
||||||
import logging
|
import subprocess
|
||||||
|
import tempfile
|
||||||
from email import encoders
|
from email import encoders
|
||||||
from email.mime.base import MIMEBase
|
from email.mime.base import MIMEBase
|
||||||
from email.mime.multipart import MIMEMultipart
|
from email.mime.multipart import MIMEMultipart
|
||||||
from email.mime.text import MIMEText
|
from email.mime.text import MIMEText
|
||||||
from typing import List, Optional, Dict, Any
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.shared import Inches, Pt
|
|
||||||
from docx.oxml.ns import qn
|
|
||||||
from docx.oxml import OxmlElement
|
from docx.oxml import OxmlElement
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
from docx.shared import Inches, Pt
|
||||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
|
||||||
logger = logging.getLogger("scraibe.email_sender")
|
logger = logging.getLogger("scraibe.email_sender")
|
||||||
@@ -291,98 +294,44 @@ def send_email(
|
|||||||
raise EmailError(f"Failed to send email: {e}")
|
raise EmailError(f"Failed to send email: {e}")
|
||||||
|
|
||||||
|
|
||||||
def _remove_line_numbering(section):
|
# ------------ DOCX helpers ------------
|
||||||
"""
|
|
||||||
Explicitly remove line numbering from a section.
|
|
||||||
"""
|
|
||||||
sectPr = section._sectPr
|
|
||||||
lnNumType = sectPr.find(qn("w:lnNumType"))
|
|
||||||
if lnNumType is not None:
|
|
||||||
sectPr.remove(lnNumType)
|
|
||||||
|
|
||||||
|
def _configure_base_font(doc):
|
||||||
def _enable_line_numbering(section):
|
|
||||||
"""
|
"""
|
||||||
Enable continuous line numbering for a section.
|
Set base document font to 12pt Courier.
|
||||||
"""
|
"""
|
||||||
sectPr = section._sectPr
|
|
||||||
lnNumType = sectPr.find(qn("w:lnNumType"))
|
|
||||||
if lnNumType is None:
|
|
||||||
lnNumType = OxmlElement("w:lnNumType")
|
|
||||||
sectPr.append(lnNumType)
|
|
||||||
lnNumType.set(qn("w:start"), "continuous")
|
|
||||||
lnNumType.set(qn("w:countBy"), "1")
|
|
||||||
|
|
||||||
|
|
||||||
def _setup_docx_style(doc, enable_line_numbering=False):
|
|
||||||
"""
|
|
||||||
Base document setup (margins, font, footer).
|
|
||||||
Line numbering is optional and applied to the first section only.
|
|
||||||
"""
|
|
||||||
section = doc.sections[0]
|
|
||||||
section.left_margin = Inches(1.5)
|
|
||||||
section.right_margin = Inches(1.0)
|
|
||||||
section.top_margin = Inches(1.0)
|
|
||||||
section.bottom_margin = Inches(1.0)
|
|
||||||
|
|
||||||
# Line numbering (only for transcript sections)
|
|
||||||
if enable_line_numbering:
|
|
||||||
_enable_line_numbering(section)
|
|
||||||
else:
|
|
||||||
_remove_line_numbering(section)
|
|
||||||
|
|
||||||
# Default font
|
|
||||||
style = doc.styles["Normal"]
|
style = doc.styles["Normal"]
|
||||||
font = style.font
|
style.font.name = "Courier"
|
||||||
font.name = "Courier"
|
style.font.size = Pt(12)
|
||||||
font.size = Pt(12)
|
|
||||||
|
|
||||||
# Add "Page X of Y" footer, right-aligned
|
|
||||||
footer = section.footer
|
|
||||||
footer.is_linked_to_previous = False
|
|
||||||
p = footer.paragraphs[0]
|
|
||||||
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
|
|
||||||
|
|
||||||
# PAGE field (current page number)
|
def _configure_section_margins(doc, margin=1.0):
|
||||||
run_page = p.add_run()
|
"""
|
||||||
r = run_page._r
|
Set uniform margins on all sides (default 1 inch).
|
||||||
r.clear()
|
"""
|
||||||
|
for section in doc.sections:
|
||||||
|
section.left_margin = Inches(margin)
|
||||||
|
section.right_margin = Inches(margin)
|
||||||
|
section.top_margin = Inches(margin)
|
||||||
|
section.bottom_margin = Inches(margin)
|
||||||
|
|
||||||
fldChar_begin = OxmlElement("w:fldChar")
|
|
||||||
fldChar_begin.set(qn("w:fldCharType"), "begin")
|
|
||||||
r.append(fldChar_begin)
|
|
||||||
|
|
||||||
instrText = OxmlElement("w:instrText")
|
def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
|
||||||
instrText.set(qn("xml:space"), "preserve")
|
"""
|
||||||
instrText.text = "PAGE"
|
Enable line numbering for a specific section.
|
||||||
r.append(instrText)
|
"""
|
||||||
|
sectPr = section._sectPr
|
||||||
|
|
||||||
fldChar_end = OxmlElement("w:fldChar")
|
# Create the line numbering element: <w:lnNumType>
|
||||||
fldChar_end.set(qn("w:fldCharType"), "end")
|
lnNumType = OxmlElement("w:lnNumType")
|
||||||
r.append(fldChar_end)
|
lnNumType.set(qn("w:start"), str(start_at))
|
||||||
|
lnNumType.set(qn("w:countBy"), str(count_by))
|
||||||
|
|
||||||
# " of " text
|
# 'eachPage' restarts numbering on every page; 'continuous' keeps it going
|
||||||
run_of = p.add_run(" of ")
|
restart_mode = "eachPage" if restart else "continuous"
|
||||||
run_of.font.name = "Courier"
|
lnNumType.set(qn("w:restart"), restart_mode)
|
||||||
run_of.font.size = Pt(10)
|
|
||||||
|
|
||||||
# NUMPAGES field (total pages)
|
sectPr.append(lnNumType)
|
||||||
run_numpages = p.add_run()
|
|
||||||
r = run_numpages._r
|
|
||||||
r.clear()
|
|
||||||
|
|
||||||
fldChar_begin = OxmlElement("w:fldChar")
|
|
||||||
fldChar_begin.set(qn("w:fldCharType"), "begin")
|
|
||||||
r.append(fldChar_begin)
|
|
||||||
|
|
||||||
instrText = OxmlElement("w:instrText")
|
|
||||||
instrText.set(qn("xml:space"), "preserve")
|
|
||||||
instrText.text = "NUMPAGES"
|
|
||||||
r.append(instrText)
|
|
||||||
|
|
||||||
fldChar_end = OxmlElement("w:fldChar")
|
|
||||||
fldChar_end.set(qn("w:fldCharType"), "end")
|
|
||||||
r.append(fldChar_end)
|
|
||||||
|
|
||||||
|
|
||||||
def _add_cover_page(doc, doc_type, date, description):
|
def _add_cover_page(doc, doc_type, date, description):
|
||||||
@@ -394,14 +343,13 @@ def _add_cover_page(doc, doc_type, date, description):
|
|||||||
2) Date (e.g. "June 14, 2026")
|
2) Date (e.g. "June 14, 2026")
|
||||||
3-5) Empty space
|
3-5) Empty space
|
||||||
6) One-sentence description
|
6) One-sentence description
|
||||||
- Then page break.
|
|
||||||
"""
|
"""
|
||||||
# Create a full-page table to center content vertically and horizontally
|
# Create a full-page table to center content vertically and horizontally
|
||||||
table = doc.add_table(rows=1, cols=1)
|
table = doc.add_table(rows=1, cols=1)
|
||||||
table.autofit = False
|
table.autofit = False
|
||||||
cell = table.cell(0, 0)
|
cell = table.cell(0, 0)
|
||||||
|
|
||||||
# Make table span full page height (approx)
|
# Make table span full page width (approx)
|
||||||
cell.width = Inches(6.5)
|
cell.width = Inches(6.5)
|
||||||
|
|
||||||
# Center content inside the cell
|
# Center content inside the cell
|
||||||
@@ -437,11 +385,11 @@ def _add_cover_page(doc, doc_type, date, description):
|
|||||||
run_desc.font.name = "Courier"
|
run_desc.font.name = "Courier"
|
||||||
run_desc.font.size = Pt(12)
|
run_desc.font.size = Pt(12)
|
||||||
|
|
||||||
# Page break after cover page
|
|
||||||
doc.add_page_break()
|
|
||||||
|
|
||||||
|
|
||||||
def _add_transcript_content(doc, text):
|
def _add_transcript_content(doc, text):
|
||||||
|
"""
|
||||||
|
Add transcript lines to the document with formatting.
|
||||||
|
"""
|
||||||
lines = text.strip().split("\n")
|
lines = text.strip().split("\n")
|
||||||
for line in lines:
|
for line in lines:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
@@ -479,6 +427,9 @@ def _add_transcript_content(doc, text):
|
|||||||
|
|
||||||
|
|
||||||
def _add_summary_content(doc, text):
|
def _add_summary_content(doc, text):
|
||||||
|
"""
|
||||||
|
Add summary content with heading styles.
|
||||||
|
"""
|
||||||
heading_count = 0
|
heading_count = 0
|
||||||
for line in text.splitlines():
|
for line in text.splitlines():
|
||||||
stripped = line.strip()
|
stripped = line.strip()
|
||||||
@@ -514,6 +465,119 @@ def _add_summary_content(doc, text):
|
|||||||
p.paragraph_format.space_after = Pt(4)
|
p.paragraph_format.space_after = Pt(4)
|
||||||
|
|
||||||
|
|
||||||
|
# ------------ PDF helpers ------------
|
||||||
|
|
||||||
|
def _docx_to_pdf(docx_path: str, output_dir: str) -> Optional[str]:
    """
    Convert a .docx file to PDF using LibreOffice in headless mode.

    Args:
        docx_path: Path of the .docx file to convert.
        output_dir: Directory LibreOffice is told to write the PDF into.

    Returns:
        Path of the generated PDF (``<output_dir>/<docx base name>.pdf``),
        or None when the conversion failed, timed out, or produced no file.
        This function never raises; failures are logged instead.
    """
    try:
        result = subprocess.run(
            [
                "libreoffice",
                "--headless",
                "--convert-to", "pdf",
                "--outdir", output_dir,
                docx_path,
            ],
            capture_output=True,
            text=True,
            timeout=60,
        )

        # A non-zero exit code is only logged: the existence check below is
        # the real success criterion.
        if result.returncode != 0:
            logger.warning("LibreOffice conversion failed: %s", result.stderr)

        # LibreOffice creates a PDF with the same base name as the input.
        base = os.path.splitext(os.path.basename(docx_path))[0]
        pdf_path = os.path.join(output_dir, f"{base}.pdf")

        if os.path.exists(pdf_path):
            return pdf_path

        logger.warning("Converted PDF not found at: %s", pdf_path)
        return None

    except subprocess.TimeoutExpired:
        logger.error("Timed out converting DOCX to PDF: %s", docx_path)
        return None
    except Exception as e:
        # Best-effort by design: callers treat None as "no PDF available".
        logger.error("Error converting DOCX to PDF: %s", e)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
    """
    Merge multiple PDF files, in order, into a single PDF using PyPDF2.

    Args:
        input_pdfs: Paths of the PDFs to concatenate. Entries may be None
            (e.g. the result of a failed conversion).
        output_pdf: Path the merged PDF is written to.

    Returns:
        True when every input was merged and the output written.
        False when the list is empty, any input is None or missing on disk,
        or PyPDF2 raised. In the first two cases nothing is written, so a
        partial document is never reported as a success.
    """
    missing = [pdf for pdf in input_pdfs if not pdf or not os.path.exists(pdf)]
    if missing or not input_pdfs:
        logger.error(
            "Cannot merge PDFs into %s: missing input(s): %s", output_pdf, missing
        )
        return False

    try:
        from PyPDF2 import PdfMerger

        merger = PdfMerger()
        try:
            for pdf in input_pdfs:
                merger.append(pdf)
            merger.write(output_pdf)
        finally:
            # Release the input file handles even when append/write fails.
            merger.close()
        return True

    except Exception as e:
        logger.error("Error merging PDFs: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
    """
    Stamp "Page X of Y" at the bottom-right of every page of a PDF.

    A one-label-per-page overlay is drawn with reportlab, then merged onto
    the corresponding page of ``input_pdf`` with PyPDF2, so the original
    page content is preserved in ``output_pdf``.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the numbered PDF is written to.

    Returns:
        True on success; False when the input has no pages or any step
        raised (the error is logged).
    """
    try:
        import io

        from PyPDF2 import PdfReader, PdfWriter
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont
        from reportlab.pdfgen import canvas

        font_size = 10
        # Built-in Courier is always available; prefer DejaVu Sans Mono when
        # the font file is installed.
        font_name = "Courier"
        try:
            pdfmetrics.registerFont(
                TTFont(
                    "DejaVuSansMono",
                    "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf",
                )
            )
            font_name = "DejaVuSansMono"
        except Exception as e:
            logger.debug("DejaVu Sans Mono unavailable, using Courier: %s", e)

        reader = PdfReader(input_pdf)
        num_pages = len(reader.pages)
        if num_pages == 0:
            logger.warning("No pages to number in PDF: %s", input_pdf)
            return False

        # Build ONE overlay document with a page per source page, each sized
        # like the page it will be stamped onto.
        overlay_buf = io.BytesIO()
        c = canvas.Canvas(overlay_buf)
        for index, page in enumerate(reader.pages, start=1):
            width = float(page.mediabox.width)
            height = float(page.mediabox.height)
            c.setPageSize((width, height))
            # Font state is reset by showPage(), so set it for every page.
            c.setFont(font_name, font_size)
            # Right edge 1 inch from the page edge; baseline half an inch up
            # so the label sits inside the 1-inch bottom margin instead of
            # on top of the last body line.
            c.drawRightString(width - 72, 36, f"Page {index} of {num_pages}")
            c.showPage()
        c.save()

        overlay_buf.seek(0)
        overlay = PdfReader(overlay_buf)

        writer = PdfWriter()
        for page, stamp in zip(reader.pages, overlay.pages):
            page.merge_page(stamp)
            writer.add_page(page)

        with open(output_pdf, "wb") as fh:
            writer.write(fh)

        return True

    except Exception as e:
        logger.error("Error adding page numbers to PDF: %s", e)
        return False
|
||||||
|
|
||||||
|
|
||||||
|
# ------------ Public DOCX/PDF functions ------------
|
||||||
|
|
||||||
def create_transcript_docx(
|
def create_transcript_docx(
|
||||||
text: str,
|
text: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
@@ -523,19 +587,26 @@ def create_transcript_docx(
|
|||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Create a .docx transcript with:
|
Create a .docx transcript with:
|
||||||
- 1.5" left margin, 1" right margin
|
- 1" margins on all sides
|
||||||
- 12pt Courier
|
- 12pt Courier
|
||||||
- Continuous line numbering on the left (for transcript content only)
|
- Continuous line numbering on the left (for transcript content only)
|
||||||
- Optional cover page with type, date, and AI-generated description.
|
- Optional cover page with type, date, and AI-generated description.
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
# Enable line numbering for transcript
|
_configure_base_font(doc)
|
||||||
_setup_docx_style(doc, enable_line_numbering=True)
|
_configure_section_margins(doc)
|
||||||
|
|
||||||
|
# 1) Optional cover page (no line numbering)
|
||||||
if include_cover and cover_date and cover_desc:
|
if include_cover and cover_date and cover_desc:
|
||||||
_add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
|
_add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
|
||||||
|
doc.add_page_break()
|
||||||
|
|
||||||
|
# 2) Enable line numbering for transcript section
|
||||||
|
_enable_line_numbering(doc.sections[0])
|
||||||
|
|
||||||
|
# 3) Transcript content (with line numbering)
|
||||||
_add_transcript_content(doc, text)
|
_add_transcript_content(doc, text)
|
||||||
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|
||||||
|
|
||||||
@@ -552,13 +623,17 @@ def create_summary_docx(
|
|||||||
No line numbering.
|
No line numbering.
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
# No line numbering for summary
|
_configure_base_font(doc)
|
||||||
_setup_docx_style(doc, enable_line_numbering=False)
|
_configure_section_margins(doc)
|
||||||
|
|
||||||
|
# 1) Optional cover page
|
||||||
if include_cover and cover_date and cover_desc:
|
if include_cover and cover_date and cover_desc:
|
||||||
_add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
|
_add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
|
||||||
|
doc.add_page_break()
|
||||||
|
|
||||||
|
# 2) Summary content (no line numbering)
|
||||||
_add_summary_content(doc, text)
|
_add_summary_content(doc, text)
|
||||||
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|
||||||
|
|
||||||
@@ -577,34 +652,243 @@ def create_combined_docx(
|
|||||||
2) Page break
|
2) Page break
|
||||||
3) Summary content (no line numbering)
|
3) Summary content (no line numbering)
|
||||||
4) Page break
|
4) Page break
|
||||||
5) Transcript content (line numbering enabled)
|
5) Transcript content (line numbering enabled only here)
|
||||||
|
|
||||||
|
Line numbering is restricted to the transcript section only.
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
# Start with no line numbering (for cover and summary)
|
_configure_base_font(doc)
|
||||||
_setup_docx_style(doc, enable_line_numbering=False)
|
_configure_section_margins(doc)
|
||||||
|
|
||||||
# 1) Transcript cover page (includes trailing page break)
|
# 1) Transcript cover page (no line numbering)
|
||||||
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
|
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
|
||||||
|
doc.add_page_break()
|
||||||
|
|
||||||
# 3) Summary content (no line numbering)
|
# 3) Summary content (no line numbering)
|
||||||
_add_summary_content(doc, summary_text)
|
_add_summary_content(doc, summary_text)
|
||||||
|
|
||||||
# 4) Page break before transcript
|
|
||||||
doc.add_page_break()
|
doc.add_page_break()
|
||||||
|
|
||||||
# Enable line numbering for transcript section
|
# 4) Enable line numbering for transcript section
|
||||||
# We create a new section for transcript so line numbering applies only there
|
_enable_line_numbering(doc.sections[0])
|
||||||
section_transcript = doc.add_section()
|
|
||||||
# Apply same margins
|
|
||||||
section_transcript.left_margin = Inches(1.5)
|
|
||||||
section_transcript.right_margin = Inches(1.0)
|
|
||||||
section_transcript.top_margin = Inches(1.0)
|
|
||||||
section_transcript.bottom_margin = Inches(1.0)
|
|
||||||
|
|
||||||
# Enable line numbering in transcript section
|
|
||||||
_enable_line_numbering(section_transcript)
|
|
||||||
|
|
||||||
# 5) Transcript content (with line numbering)
|
# 5) Transcript content (with line numbering)
|
||||||
_add_transcript_content(doc, transcript_text)
|
_add_transcript_content(doc, transcript_text)
|
||||||
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents for transcript, summary, and combined.

    Each constituent is built as a .docx in a temporary directory, converted
    to PDF with LibreOffice, merged, and stamped with page numbers.

    Args:
        transcript_text: Transcript body text.
        summary_text: Summary body text; when empty, only the transcript
            PDF is produced.
        output_dir: Directory the final PDFs are written to (created if
            missing).
        transcript_cover_date: Date shown on the transcript cover page.
        transcript_cover_desc: Description shown on the transcript cover page.
        summary_cover_date: Date shown on the summary cover page.
        summary_cover_desc: Description shown on the summary cover page.

    Returns:
        A dict of paths. A key is present only when that PDF was fully
        assembled (conversion, merge, and page numbering all succeeded):
        - transcript_pdf
        - summary_pdf (if summary_text provided)
        - combined_pdf (if summary_text provided)
    """
    os.makedirs(output_dir, exist_ok=True)

    result: Dict[str, str] = {}

    # Intermediate .docx / .pdf files live here and are removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:

        def _render(name, populate):
            """Build <name>.docx via populate(doc) and convert it to PDF."""
            docx_path = os.path.join(tmpdir, f"{name}.docx")
            doc = Document()
            _configure_base_font(doc)
            _configure_section_margins(doc)
            populate(doc)
            doc.save(docx_path)
            return _docx_to_pdf(docx_path, tmpdir)

        def _assemble(key, name, parts):
            """Merge parts into <name>.pdf with page numbers; record on success."""
            if not all(parts):
                logger.warning("Skipping %s.pdf: a constituent PDF is missing", name)
                return
            merged = os.path.join(tmpdir, f"{name}_MERGED.pdf")
            final = os.path.join(output_dir, f"{name}.pdf")
            if _merge_pdfs(parts, merged) and _add_page_numbers_to_pdf(merged, final):
                result[key] = final

        def _transcript_body(doc):
            # Line numbering applies to the transcript document only.
            _enable_line_numbering(doc.sections[0])
            _add_transcript_content(doc, transcript_text)

        def _headed_transcript_body(doc):
            # Centered "TRANSCRIPT" title page, then the numbered transcript.
            p = doc.add_paragraph()
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = p.add_run("TRANSCRIPT")
            run.bold = True
            run.font.name = "Courier"
            run.font.size = Pt(12)
            doc.add_page_break()
            _transcript_body(doc)

        # Transcript PDF: cover + transcript + page numbers
        transcript_cover_pdf = _render(
            "TRANSCRIPT_COVER",
            lambda doc: _add_cover_page(
                doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc
            ),
        )
        transcript_pdf = _render("TRANSCRIPT", _transcript_body)
        _assemble("transcript_pdf", "TRANSCRIPT", [transcript_cover_pdf, transcript_pdf])

        # Summary-dependent documents are built only when there is a summary.
        if summary_text:
            summary_cover_pdf = _render(
                "SUMMARY_COVER",
                lambda doc: _add_cover_page(
                    doc, "SUMMARY", summary_cover_date, summary_cover_desc
                ),
            )
            summary_pdf = _render(
                "SUMMARY", lambda doc: _add_summary_content(doc, summary_text)
            )

            # Summary PDF: cover + summary + page numbers
            _assemble("summary_pdf", "SUMMARY", [summary_cover_pdf, summary_pdf])

            # Combined PDF: transcript cover + summary + "TRANSCRIPT" header
            # + transcript + page numbers
            transcript_header_pdf = _render("TRANSCRIPT_HEADER", _headed_transcript_body)
            _assemble(
                "combined_pdf",
                "COMBINED",
                [transcript_cover_pdf, summary_pdf, transcript_header_pdf],
            )

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def send_success_email(
    to: str,
    transcript_text: str,
    summary_text: str,
    attachments: List[str],
    task_id: str,
):
    """
    Send the "transcription complete" email with its attachments.

    Args:
        to: Recipient address.
        transcript_text: Full transcript; its first 500 characters are passed
            to the HTML template.
        summary_text: Summary text (may be empty); its first 500 characters
            are passed to the HTML template.
        attachments: Paths of the files to attach.
        task_id: Task identifier shown in the subject and body.

    Raises:
        EmailError: Re-raised when the email configuration cannot be loaded.
    """
    # Fail early (and log) when the email configuration is unusable.
    try:
        get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise

    # Plain-text body
    body = f"""
Your transcription is complete.

Task ID: {task_id}

Please find the attached documents:
- Transcript (MD)
- Transcript (DOCX)
- Source JSON
"""
    if summary_text:
        body += "- Summary (MD)\n- Summary (DOCX)\n"

    # HTML body is optional: fall back to plain text if the template fails.
    html = None
    try:
        html = load_template(
            "success_template.html",
            task_id=task_id,
            transcript_text=transcript_text[:500],
            summary_text=summary_text[:500] if summary_text else "",
        )
    except EmailError:
        html = None

    send_email(
        to=to,
        subject=f"Transcription Complete - Task {task_id}",
        body=body,
        html=html,
        attachments=attachments,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def send_error_email(
    to: str,
    error_message: str,
    task_id: str,
):
    """
    Send the "transcription failed" email (no attachments).

    Args:
        to: Recipient address.
        error_message: Error text shown in the body and HTML template.
        task_id: Task identifier shown in the subject and body.

    Raises:
        EmailError: Re-raised when the email configuration cannot be loaded.
    """
    # Fail early (and log) when the email configuration is unusable.
    try:
        get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise

    # Plain-text body
    body = f"""
There was an error processing your transcription.

Task ID: {task_id}
Error: {error_message}
"""

    # HTML body is optional: fall back to plain text if the template fails.
    html = None
    try:
        html = load_template(
            "error_notification_template.html",
            task_id=task_id,
            error_message=error_message,
        )
    except EmailError:
        html = None

    send_email(
        to=to,
        subject=f"Transcription Error - Task {task_id}",
        body=body,
        html=html,
        attachments=[],
    )
|
||||||
|
|||||||
+31
-3
@@ -14,7 +14,7 @@ from .summarizer import SummarizerClient, SummarizerError
|
|||||||
from .misc import setup_logging
|
from .misc import setup_logging
|
||||||
from .email_sender import send_email, EmailError, load_template
|
from .email_sender import send_email, EmailError, load_template
|
||||||
from .email_sender import create_transcript_docx, create_summary_docx
|
from .email_sender import create_transcript_docx, create_summary_docx
|
||||||
from .email_sender import create_combined_docx
|
from .email_sender import create_combined_docx, generate_pdf_documents
|
||||||
|
|
||||||
logger = logging.getLogger("scraibe.tasks")
|
logger = logging.getLogger("scraibe.tasks")
|
||||||
|
|
||||||
@@ -545,6 +545,26 @@ def process_transcription_task(
|
|||||||
)
|
)
|
||||||
temp_files.append(docx_combined_path)
|
temp_files.append(docx_combined_path)
|
||||||
|
|
||||||
|
# 4b) Generate PDF documents
|
||||||
|
pdf_files = {}
|
||||||
|
try:
|
||||||
|
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
|
||||||
|
pdf_files = generate_pdf_documents(
|
||||||
|
transcript_text=transcript_text,
|
||||||
|
summary_text=summary_text if include_summary else "",
|
||||||
|
output_dir=pdf_output_dir,
|
||||||
|
transcript_cover_date=today_str,
|
||||||
|
transcript_cover_desc=transcript_cover_desc,
|
||||||
|
summary_cover_date=today_str,
|
||||||
|
summary_cover_desc=summary_cover_desc,
|
||||||
|
)
|
||||||
|
for pdf_path in pdf_files.values():
|
||||||
|
if pdf_path and os.path.exists(pdf_path):
|
||||||
|
temp_files.append(pdf_path)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
|
||||||
|
|
||||||
|
# 5) Build attachments list
|
||||||
attachments = [
|
attachments = [
|
||||||
md_transcript_path,
|
md_transcript_path,
|
||||||
docx_transcript_path,
|
docx_transcript_path,
|
||||||
@@ -555,7 +575,15 @@ def process_transcription_task(
|
|||||||
if docx_combined_path:
|
if docx_combined_path:
|
||||||
attachments.append(docx_combined_path)
|
attachments.append(docx_combined_path)
|
||||||
|
|
||||||
# 5) Send success email
|
# Add PDFs to attachments
|
||||||
|
if pdf_files.get("transcript_pdf"):
|
||||||
|
attachments.append(pdf_files["transcript_pdf"])
|
||||||
|
if pdf_files.get("summary_pdf"):
|
||||||
|
attachments.append(pdf_files["summary_pdf"])
|
||||||
|
if pdf_files.get("combined_pdf"):
|
||||||
|
attachments.append(pdf_files["combined_pdf"])
|
||||||
|
|
||||||
|
# 6) Send success email
|
||||||
send_success_email(
|
send_success_email(
|
||||||
to=email_to,
|
to=email_to,
|
||||||
transcript_text=transcript_text,
|
transcript_text=transcript_text,
|
||||||
@@ -575,7 +603,7 @@ def process_transcription_task(
|
|||||||
)
|
)
|
||||||
raise e
|
raise e
|
||||||
finally:
|
finally:
|
||||||
# 6) Cleanup
|
# 7) Cleanup
|
||||||
for path in temp_files:
|
for path in temp_files:
|
||||||
_remove_file(path)
|
_remove_file(path)
|
||||||
if audio_path:
|
if audio_path:
|
||||||
|
|||||||
@@ -0,0 +1,96 @@
|
|||||||
|
"""
|
||||||
|
Local test for transcript/summary/combined .docx generation.
|
||||||
|
Checks:
|
||||||
|
- Line numbering only on transcript pages.
|
||||||
|
- Page numbering (X of Y) in footer.
|
||||||
|
- Cover pages present and centered.
|
||||||
|
- Combined document structure.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
from scraibe.email_sender import (
|
||||||
|
create_transcript_docx,
|
||||||
|
create_summary_docx,
|
||||||
|
create_combined_docx,
|
||||||
|
)
|
||||||
|
|
||||||
|
TRANSCRIPT_TEXT = """[00:00] Speaker 1: Good morning, everyone. Thank you for joining today's meeting.
|
||||||
|
[00:12] Speaker 2: Good morning. I'm looking forward to discussing the new requirements.
|
||||||
|
[00:25] Speaker 1: Let's start with the timeline. We need to finalize the scope by Friday.
|
||||||
|
[00:38] Speaker 2: Agreed. I'll send a summary of the key points after this call.
|
||||||
|
[00:45] Speaker 1: Perfect. If there are no other items, we can wrap up here."""
|
||||||
|
|
||||||
|
SUMMARY_TEXT = """# Meeting Overview
|
||||||
|
## Key Discussion Points
|
||||||
|
### Timeline and Scope
|
||||||
|
#### Next Steps"""
|
||||||
|
|
||||||
|
COVER_DATE = "June 14, 2026"
|
||||||
|
TRANSCRIPT_DESC = "Transcript of a project planning meeting discussing timelines and scope."
|
||||||
|
SUMMARY_DESC = "Summary of a project planning meeting covering key decisions and next steps."
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Generate the transcript, summary, and combined .docx files in a
    temporary directory, print each file's size, and list the points to
    verify by hand in Word.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        print("Using temp directory:", tmpdir)

        transcript_path = os.path.join(tmpdir, "TRANSCRIPT_TEST.docx")
        summary_path = os.path.join(tmpdir, "SUMMARY_TEST.docx")
        combined_path = os.path.join(tmpdir, "COMBINED_TEST.docx")

        # (label, output path, builder, keyword arguments), in run order
        jobs = [
            (
                "transcript-only",
                transcript_path,
                create_transcript_docx,
                dict(
                    text=TRANSCRIPT_TEXT,
                    filename=transcript_path,
                    include_cover=True,
                    cover_date=COVER_DATE,
                    cover_desc=TRANSCRIPT_DESC,
                ),
            ),
            (
                "summary-only",
                summary_path,
                create_summary_docx,
                dict(
                    text=SUMMARY_TEXT,
                    filename=summary_path,
                    include_cover=True,
                    cover_date=COVER_DATE,
                    cover_desc=SUMMARY_DESC,
                ),
            ),
            (
                "combined",
                combined_path,
                create_combined_docx,
                dict(
                    transcript_text=TRANSCRIPT_TEXT,
                    summary_text=SUMMARY_TEXT,
                    filename=combined_path,
                    transcript_cover_date=COVER_DATE,
                    transcript_cover_desc=TRANSCRIPT_DESC,
                    summary_cover_date=COVER_DATE,
                    summary_cover_desc=SUMMARY_DESC,
                ),
            ),
        ]

        for label, out_path, build, kwargs in jobs:
            print(f"Creating {label} docx:", out_path)
            build(**kwargs)
            print(f"OK: {label} created.")

        # Basic size sanity checks
        for _, out_path, _, _ in jobs:
            size = os.path.getsize(out_path)
            print(f"File: {os.path.basename(out_path)} - size: {size} bytes")
            if size < 10000:
                print("WARNING: File seems unusually small:", out_path)

        for message in (
            "\nAll .docx files generated successfully.",
            "Please open them in Word to verify:",
            "- Only transcript pages have line numbers.",
            "- Footer shows 'X of Y' on all pages.",
            "- Cover pages are centered and use the correct date format.",
            "- Combined doc order: cover, page break, summary, page break, transcript.",
        ):
            print(message)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user