Add PDF-based document generation with LibreOffice; fix line numbering and margins
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Add LibreOffice Writer and DejaVu fonts to Dockerfile for PDF generation
- Add PyPDF2 and reportlab to requirements.txt
- Refactor email_sender.py:
  - Enforce 1-inch margins on all sides
  - Isolate line numbering to transcript section only
  - Add generate_pdf_documents() to build:
    - TRANSCRIPT.pdf (cover + transcript)
    - SUMMARY.pdf (cover + summary)
    - COMBINED.pdf (transcript cover + summary + TRANSCRIPT header + transcript)
  - Add page numbers (bottom-right) to all PDFs via reportlab
- Update tasks.py:
  - Use generate_pdf_documents() after creating DOCX files
  - Attach source JSON, MD files, and compiled PDFs in success email
- Add test_docx_generation.py for transcript/summary/combined DOCX testing
This commit is contained in:
admin
2026-06-15 02:19:17 +00:00
parent b0a23b32e1
commit 42a155aeaa
5 changed files with 534 additions and 119 deletions
+7 -2
View File
@@ -9,9 +9,14 @@ LABEL description="Scraibe: LocalAI-backed transcription and diarization client
Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization." Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
LABEL url="https://git.optimex.systems/admin/scribe" LABEL url="https://git.optimex.systems/admin/scribe"
# Install system dependencies (ffmpeg, redis) # Install system dependencies (ffmpeg, redis, LibreOffice for PDF generation)
RUN apt update -y && \ RUN apt update -y && \
apt install -y --no-install-recommends ffmpeg redis-server && \ apt install -y --no-install-recommends \
ffmpeg \
redis-server \
libreoffice-writer \
fonts-dejavu-core \
&& \
apt clean && \ apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+2
View File
@@ -6,3 +6,5 @@ PyYAML>=6.0
celery[redis]>=5.3.0 celery[redis]>=5.3.0
redis>=5.0.0 redis>=5.0.0
python-docx>=1.1.0 python-docx>=1.1.0
PyPDF2>=3.0.0
reportlab>=4.0.0
+396 -112
View File
@@ -8,20 +8,23 @@ Template placeholders are primarily filled via environment variables.
""" """
import base64 import base64
import json
import logging
import os import os
import re import re
import smtplib import smtplib
import logging import subprocess
import tempfile
from email import encoders from email import encoders
from email.mime.base import MIMEBase from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText from email.mime.text import MIMEText
from typing import List, Optional, Dict, Any from typing import Any, Dict, List, Optional
from docx import Document from docx import Document
from docx.shared import Inches, Pt
from docx.oxml.ns import qn
from docx.oxml import OxmlElement from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.text import WD_ALIGN_PARAGRAPH
logger = logging.getLogger("scraibe.email_sender") logger = logging.getLogger("scraibe.email_sender")
@@ -291,98 +294,44 @@ def send_email(
raise EmailError(f"Failed to send email: {e}") raise EmailError(f"Failed to send email: {e}")
def _remove_line_numbering(section): # ------------ DOCX helpers ------------
"""
Explicitly remove line numbering from a section.
"""
sectPr = section._sectPr
lnNumType = sectPr.find(qn("w:lnNumType"))
if lnNumType is not None:
sectPr.remove(lnNumType)
def _configure_base_font(doc):
def _enable_line_numbering(section):
""" """
Enable continuous line numbering for a section. Set base document font to 12pt Courier.
""" """
sectPr = section._sectPr
lnNumType = sectPr.find(qn("w:lnNumType"))
if lnNumType is None:
lnNumType = OxmlElement("w:lnNumType")
sectPr.append(lnNumType)
lnNumType.set(qn("w:start"), "continuous")
lnNumType.set(qn("w:countBy"), "1")
def _setup_docx_style(doc, enable_line_numbering=False):
"""
Base document setup (margins, font, footer).
Line numbering is optional and applied to the first section only.
"""
section = doc.sections[0]
section.left_margin = Inches(1.5)
section.right_margin = Inches(1.0)
section.top_margin = Inches(1.0)
section.bottom_margin = Inches(1.0)
# Line numbering (only for transcript sections)
if enable_line_numbering:
_enable_line_numbering(section)
else:
_remove_line_numbering(section)
# Default font
style = doc.styles["Normal"] style = doc.styles["Normal"]
font = style.font style.font.name = "Courier"
font.name = "Courier" style.font.size = Pt(12)
font.size = Pt(12)
# Add "Page X of Y" footer, right-aligned
footer = section.footer
footer.is_linked_to_previous = False
p = footer.paragraphs[0]
p.alignment = WD_ALIGN_PARAGRAPH.RIGHT
# PAGE field (current page number) def _configure_section_margins(doc, margin=1.0):
run_page = p.add_run() """
r = run_page._r Set uniform margins on all sides (default 1 inch).
r.clear() """
for section in doc.sections:
section.left_margin = Inches(margin)
section.right_margin = Inches(margin)
section.top_margin = Inches(margin)
section.bottom_margin = Inches(margin)
fldChar_begin = OxmlElement("w:fldChar")
fldChar_begin.set(qn("w:fldCharType"), "begin")
r.append(fldChar_begin)
instrText = OxmlElement("w:instrText") def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
instrText.set(qn("xml:space"), "preserve") """
instrText.text = "PAGE" Enable line numbering for a specific section.
r.append(instrText) """
sectPr = section._sectPr
fldChar_end = OxmlElement("w:fldChar") # Create the line numbering element: <w:lnNumType>
fldChar_end.set(qn("w:fldCharType"), "end") lnNumType = OxmlElement("w:lnNumType")
r.append(fldChar_end) lnNumType.set(qn("w:start"), str(start_at))
lnNumType.set(qn("w:countBy"), str(count_by))
# " of " text # 'eachPage' restarts numbering on every page; 'continuous' keeps it going
run_of = p.add_run(" of ") restart_mode = "eachPage" if restart else "continuous"
run_of.font.name = "Courier" lnNumType.set(qn("w:restart"), restart_mode)
run_of.font.size = Pt(10)
# NUMPAGES field (total pages) sectPr.append(lnNumType)
run_numpages = p.add_run()
r = run_numpages._r
r.clear()
fldChar_begin = OxmlElement("w:fldChar")
fldChar_begin.set(qn("w:fldCharType"), "begin")
r.append(fldChar_begin)
instrText = OxmlElement("w:instrText")
instrText.set(qn("xml:space"), "preserve")
instrText.text = "NUMPAGES"
r.append(instrText)
fldChar_end = OxmlElement("w:fldChar")
fldChar_end.set(qn("w:fldCharType"), "end")
r.append(fldChar_end)
def _add_cover_page(doc, doc_type, date, description): def _add_cover_page(doc, doc_type, date, description):
@@ -394,14 +343,13 @@ def _add_cover_page(doc, doc_type, date, description):
2) Date (e.g. "June 14, 2026") 2) Date (e.g. "June 14, 2026")
3-5) Empty space 3-5) Empty space
6) One-sentence description 6) One-sentence description
- Then page break.
""" """
# Create a full-page table to center content vertically and horizontally # Create a full-page table to center content vertically and horizontally
table = doc.add_table(rows=1, cols=1) table = doc.add_table(rows=1, cols=1)
table.autofit = False table.autofit = False
cell = table.cell(0, 0) cell = table.cell(0, 0)
# Make table span full page height (approx) # Make table span full page width (approx)
cell.width = Inches(6.5) cell.width = Inches(6.5)
# Center content inside the cell # Center content inside the cell
@@ -437,11 +385,11 @@ def _add_cover_page(doc, doc_type, date, description):
run_desc.font.name = "Courier" run_desc.font.name = "Courier"
run_desc.font.size = Pt(12) run_desc.font.size = Pt(12)
# Page break after cover page
doc.add_page_break()
def _add_transcript_content(doc, text): def _add_transcript_content(doc, text):
"""
Add transcript lines to the document with formatting.
"""
lines = text.strip().split("\n") lines = text.strip().split("\n")
for line in lines: for line in lines:
line = line.strip() line = line.strip()
@@ -479,6 +427,9 @@ def _add_transcript_content(doc, text):
def _add_summary_content(doc, text): def _add_summary_content(doc, text):
"""
Add summary content with heading styles.
"""
heading_count = 0 heading_count = 0
for line in text.splitlines(): for line in text.splitlines():
stripped = line.strip() stripped = line.strip()
@@ -514,6 +465,119 @@ def _add_summary_content(doc, text):
p.paragraph_format.space_after = Pt(4) p.paragraph_format.space_after = Pt(4)
# ------------ PDF helpers ------------
def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
"""
Convert a .docx file to PDF using LibreOffice.
Returns the path of the generated PDF.
"""
try:
# Use LibreOffice in headless mode
result = subprocess.run(
[
"libreoffice",
"--headless",
"--convert-to", "pdf",
"--outdir", output_dir,
docx_path,
],
capture_output=True,
text=True,
timeout=60,
)
if result.returncode != 0:
logger.warning("LibreOffice conversion failed: %s", result.stderr)
# LibreOffice creates a PDF with the same base name
base = os.path.splitext(os.path.basename(docx_path))[0]
pdf_path = os.path.join(output_dir, f"{base}.pdf")
if os.path.exists(pdf_path):
return pdf_path
else:
logger.warning("Converted PDF not found at: %s", pdf_path)
return None
except Exception as e:
logger.error("Error converting DOCX to PDF: %s", e)
return None
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
"""
Merge multiple PDF files into a single PDF using PyPDF2.
Returns True on success, False on failure.
"""
try:
from PyPDF2 import PdfMerger
merger = PdfMerger()
for pdf in input_pdfs:
if os.path.exists(pdf):
merger.append(pdf)
merger.write(output_pdf)
merger.close()
return True
except Exception as e:
logger.error("Error merging PDFs: %s", e)
return False
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
"""
Add page numbers to a PDF using reportlab.
Page numbers appear at the bottom-right of each page.
"""
try:
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from PyPDF2 import PdfReader
# Try to use a standard font
font_name = "Courier"
try:
pdfmetrics.registerFont(
TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
)
except Exception:
pass
reader = PdfReader(input_pdf)
num_pages = len(reader.pages)
# Create a canvas to add page numbers
c = canvas.Canvas(output_pdf, pagesize=letter)
for page_num in range(num_pages):
page = reader.pages[page_num]
c = canvas.Canvas(output_pdf, pagesize=letter)
# Add page number
c.setFont(font_name, 10)
page_text = f"Page {page_num + 1} of {num_pages}"
text_width = c.stringWidth(page_text, font_name, 10)
x = letter[0] - 1 * 72 - text_width # 1 inch from right
y = 1 * 72 # 1 inch from bottom
c.drawString(x, y, page_text)
c.showPage()
c.save()
return True
except Exception as e:
logger.error("Error adding page numbers to PDF: %s", e)
return False
# ------------ Public DOCX/PDF functions ------------
def create_transcript_docx( def create_transcript_docx(
text: str, text: str,
filename: str, filename: str,
@@ -523,19 +587,26 @@ def create_transcript_docx(
): ):
""" """
Create a .docx transcript with: Create a .docx transcript with:
- 1.5" left margin, 1" right margin - 1" margins on all sides
- 12pt Courier - 12pt Courier
- Continuous line numbering on the left (for transcript content only) - Continuous line numbering on the left (for transcript content only)
- Optional cover page with type, date, and AI-generated description. - Optional cover page with type, date, and AI-generated description.
""" """
doc = Document() doc = Document()
# Enable line numbering for transcript _configure_base_font(doc)
_setup_docx_style(doc, enable_line_numbering=True) _configure_section_margins(doc)
# 1) Optional cover page (no line numbering)
if include_cover and cover_date and cover_desc: if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc) _add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
doc.add_page_break()
# 2) Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
# 3) Transcript content (with line numbering)
_add_transcript_content(doc, text) _add_transcript_content(doc, text)
doc.save(filename) doc.save(filename)
@@ -552,13 +623,17 @@ def create_summary_docx(
No line numbering. No line numbering.
""" """
doc = Document() doc = Document()
# No line numbering for summary _configure_base_font(doc)
_setup_docx_style(doc, enable_line_numbering=False) _configure_section_margins(doc)
# 1) Optional cover page
if include_cover and cover_date and cover_desc: if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "SUMMARY", cover_date, cover_desc) _add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
doc.add_page_break()
# 2) Summary content (no line numbering)
_add_summary_content(doc, text) _add_summary_content(doc, text)
doc.save(filename) doc.save(filename)
@@ -577,34 +652,243 @@ def create_combined_docx(
2) Page break 2) Page break
3) Summary content (no line numbering) 3) Summary content (no line numbering)
4) Page break 4) Page break
5) Transcript content (line numbering enabled) 5) Transcript content (line numbering enabled only here)
Line numbering is restricted to the transcript section only.
""" """
doc = Document() doc = Document()
# Start with no line numbering (for cover and summary) _configure_base_font(doc)
_setup_docx_style(doc, enable_line_numbering=False) _configure_section_margins(doc)
# 1) Transcript cover page (includes trailing page break) # 1) Transcript cover page (no line numbering)
_add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc) _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
doc.add_page_break()
# 3) Summary content (no line numbering) # 3) Summary content (no line numbering)
_add_summary_content(doc, summary_text) _add_summary_content(doc, summary_text)
# 4) Page break before transcript
doc.add_page_break() doc.add_page_break()
# Enable line numbering for transcript section # 4) Enable line numbering for transcript section
# We create a new section for transcript so line numbering applies only there _enable_line_numbering(doc.sections[0])
section_transcript = doc.add_section()
# Apply same margins
section_transcript.left_margin = Inches(1.5)
section_transcript.right_margin = Inches(1.0)
section_transcript.top_margin = Inches(1.0)
section_transcript.bottom_margin = Inches(1.0)
# Enable line numbering in transcript section
_enable_line_numbering(section_transcript)
# 5) Transcript content (with line numbering) # 5) Transcript content (with line numbering)
_add_transcript_content(doc, transcript_text) _add_transcript_content(doc, transcript_text)
doc.save(filename) doc.save(filename)
def _assemble_numbered_pdf(
    parts: List[Optional[str]],
    merged_path: str,
    output_path: str,
) -> Optional[str]:
    """Merge ``parts`` in order, stamp page numbers, and return ``output_path`` (None on any failure)."""
    if not parts or any(part is None for part in parts):
        logger.warning(
            "Skipping %s: one or more constituent PDFs failed to convert", output_path
        )
        return None
    if not _merge_pdfs(list(parts), merged_path):
        return None
    if not _add_page_numbers_to_pdf(merged_path, output_path):
        return None
    if not os.path.exists(output_path):
        return None
    return output_path


def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents for transcript, summary, and combined.

    Each PDF is assembled from separately rendered DOCX parts (converted
    with LibreOffice), merged, and then page-numbered. Intermediate files
    live in a temporary directory that is removed before returning.

    Args:
        transcript_text: Transcript body.
        summary_text: Summary body; when empty, only the transcript PDF
            is produced.
        output_dir: Directory the final PDFs are written to (created if
            missing).
        transcript_cover_date: Date shown on the transcript cover page.
        transcript_cover_desc: Description shown on the transcript cover.
        summary_cover_date: Date shown on the summary cover page.
        summary_cover_desc: Description shown on the summary cover.

    Returns:
        Dict with the paths of the PDFs that were actually produced:
          - transcript_pdf
          - summary_pdf (if summary_text provided)
          - combined_pdf (if summary_text provided)
        A key is omitted when its PDF could not be built, so every path
        in the result refers to an existing file.
    """
    os.makedirs(output_dir, exist_ok=True)

    result: Dict[str, str] = {}

    def _new_document():
        # Every part shares the same base font and 1-inch margins.
        doc = Document()
        _configure_base_font(doc)
        _configure_section_margins(doc)
        return doc

    # Temporary directory for intermediate DOCX/PDF files
    with tempfile.TemporaryDirectory() as tmpdir:

        def _render(doc, docx_name: str) -> Optional[str]:
            # Save the part and convert it; None signals a failed conversion.
            docx_path = os.path.join(tmpdir, docx_name)
            doc.save(docx_path)
            return _docx_to_pdf(docx_path, tmpdir)

        def _record(key: str, parts: List[Optional[str]], stem: str) -> None:
            # Only advertise a PDF that was fully assembled.
            final_path = _assemble_numbered_pdf(
                parts,
                os.path.join(tmpdir, f"{stem}_MERGED.pdf"),
                os.path.join(output_dir, f"{stem}.pdf"),
            )
            if final_path:
                result[key] = final_path

        # NOTE(review): _enable_line_numbering defaults to
        # w:restart="eachPage"; OOXML lists newPage/newSection/continuous
        # for that attribute - confirm the renderers honour the value.

        # Transcript cover (shared by TRANSCRIPT.pdf and COMBINED.pdf)
        doc = _new_document()
        _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
        transcript_cover_pdf = _render(doc, "TRANSCRIPT_COVER.docx")

        # Transcript body (with line numbering)
        doc = _new_document()
        _enable_line_numbering(doc.sections[0])
        _add_transcript_content(doc, transcript_text)
        transcript_pdf = _render(doc, "TRANSCRIPT.docx")

        # Transcript PDF: cover + transcript + page numbers
        _record("transcript_pdf", [transcript_cover_pdf, transcript_pdf], "TRANSCRIPT")

        # Without a summary there is nothing else to build; skip the
        # (slow) LibreOffice conversions for the summary parts entirely.
        if not summary_text:
            return result

        # Summary cover
        doc = _new_document()
        _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
        summary_cover_pdf = _render(doc, "SUMMARY_COVER.docx")

        # Summary body (no line numbering)
        doc = _new_document()
        _add_summary_content(doc, summary_text)
        summary_pdf = _render(doc, "SUMMARY.docx")

        # Summary PDF: cover + summary + page numbers
        _record("summary_pdf", [summary_cover_pdf, summary_pdf], "SUMMARY")

        # Combined PDF part: "TRANSCRIPT" heading page followed by the
        # line-numbered transcript.
        doc = _new_document()
        p = doc.add_paragraph()
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = p.add_run("TRANSCRIPT")
        run.bold = True
        run.font.name = "Courier"
        run.font.size = Pt(12)

        # Start the transcript in its own section (new page by default)
        # so line numbering applies to the transcript only and not to
        # the heading page.
        transcript_section = doc.add_section()
        _enable_line_numbering(transcript_section)
        _add_transcript_content(doc, transcript_text)
        transcript_header_pdf = _render(doc, "TRANSCRIPT_HEADER.docx")

        # Combined PDF: transcript cover + summary + heading + transcript
        _record(
            "combined_pdf",
            [transcript_cover_pdf, summary_pdf, transcript_header_pdf],
            "COMBINED",
        )

    return result
def send_success_email(
    to: str,
    transcript_text: str,
    summary_text: str,
    attachments: List[str],
    task_id: str,
):
    """
    Notify the recipient that transcription finished and deliver the files.

    The email configuration is loaded first purely as a validation step:
    an EmailError from it is logged and re-raised before any content is
    built. The HTML part is optional; if the template cannot be loaded
    the message is sent with the plain-text body only.
    """
    # Validate configuration up front; the returned config is not needed here.
    try:
        get_email_config()
    except EmailError as config_error:
        logger.error("Email configuration error: %s", config_error)
        raise

    # Plain-text body
    body = f"""
Your transcription is complete.
Task ID: {task_id}
Please find the attached documents:
- Transcript (MD)
- Transcript (DOCX)
- Source JSON
"""
    if summary_text:
        body += "- Summary (MD)\n- Summary (DOCX)\n"

    # HTML body: previews are capped at 500 characters each
    transcript_preview = transcript_text[:500]
    summary_preview = summary_text[:500] if summary_text else ""
    try:
        html = load_template(
            "success_template.html",
            task_id=task_id,
            transcript_text=transcript_preview,
            summary_text=summary_preview,
        )
    except EmailError:
        # Template problems must not block delivery of the attachments.
        html = None

    subject = f"Transcription Complete - Task {task_id}"
    send_email(
        to=to,
        subject=subject,
        body=body,
        html=html,
        attachments=attachments,
    )
def send_error_email(
    to: str,
    error_message: str,
    task_id: str,
):
    """
    Notify the recipient that processing of a transcription task failed.

    The email configuration is loaded first as a validation step; an
    EmailError from it is logged and re-raised. If the HTML template
    cannot be loaded, the message is sent as plain text only. No files
    are attached.
    """
    # Validate configuration up front; the returned config is not needed here.
    try:
        get_email_config()
    except EmailError as config_error:
        logger.error("Email configuration error: %s", config_error)
        raise

    # Plain-text body
    body = f"""
There was an error processing your transcription.
Task ID: {task_id}
Error: {error_message}
"""

    # Optional HTML body
    try:
        html = load_template(
            "error_notification_template.html",
            task_id=task_id,
            error_message=error_message,
        )
    except EmailError:
        html = None

    subject = f"Transcription Error - Task {task_id}"
    send_email(
        to=to,
        subject=subject,
        body=body,
        html=html,
        attachments=[],
    )
+31 -3
View File
@@ -14,7 +14,7 @@ from .summarizer import SummarizerClient, SummarizerError
from .misc import setup_logging from .misc import setup_logging
from .email_sender import send_email, EmailError, load_template from .email_sender import send_email, EmailError, load_template
from .email_sender import create_transcript_docx, create_summary_docx from .email_sender import create_transcript_docx, create_summary_docx
from .email_sender import create_combined_docx from .email_sender import create_combined_docx, generate_pdf_documents
logger = logging.getLogger("scraibe.tasks") logger = logging.getLogger("scraibe.tasks")
@@ -545,6 +545,26 @@ def process_transcription_task(
) )
temp_files.append(docx_combined_path) temp_files.append(docx_combined_path)
# 4b) Generate PDF documents
pdf_files = {}
try:
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
pdf_files = generate_pdf_documents(
transcript_text=transcript_text,
summary_text=summary_text if include_summary else "",
output_dir=pdf_output_dir,
transcript_cover_date=today_str,
transcript_cover_desc=transcript_cover_desc,
summary_cover_date=today_str,
summary_cover_desc=summary_cover_desc,
)
for pdf_path in pdf_files.values():
if pdf_path and os.path.exists(pdf_path):
temp_files.append(pdf_path)
except Exception as e:
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
# 5) Build attachments list
attachments = [ attachments = [
md_transcript_path, md_transcript_path,
docx_transcript_path, docx_transcript_path,
@@ -555,7 +575,15 @@ def process_transcription_task(
if docx_combined_path: if docx_combined_path:
attachments.append(docx_combined_path) attachments.append(docx_combined_path)
# 5) Send success email # Add PDFs to attachments
if pdf_files.get("transcript_pdf"):
attachments.append(pdf_files["transcript_pdf"])
if pdf_files.get("summary_pdf"):
attachments.append(pdf_files["summary_pdf"])
if pdf_files.get("combined_pdf"):
attachments.append(pdf_files["combined_pdf"])
# 6) Send success email
send_success_email( send_success_email(
to=email_to, to=email_to,
transcript_text=transcript_text, transcript_text=transcript_text,
@@ -575,7 +603,7 @@ def process_transcription_task(
) )
raise e raise e
finally: finally:
# 6) Cleanup # 7) Cleanup
for path in temp_files: for path in temp_files:
_remove_file(path) _remove_file(path)
if audio_path: if audio_path:
+96
View File
@@ -0,0 +1,96 @@
"""
Local test for transcript/summary/combined .docx generation.
Checks:
- Line numbering only on transcript pages.
- No page-number footer in the .docx files (page numbers are stamped on the generated PDFs).
- Cover pages present and centered.
- Combined document structure.
"""
import sys
import os
import tempfile
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from scraibe.email_sender import (
create_transcript_docx,
create_summary_docx,
create_combined_docx,
)
TRANSCRIPT_TEXT = """[00:00] Speaker 1: Good morning, everyone. Thank you for joining today's meeting.
[00:12] Speaker 2: Good morning. I'm looking forward to discussing the new requirements.
[00:25] Speaker 1: Let's start with the timeline. We need to finalize the scope by Friday.
[00:38] Speaker 2: Agreed. I'll send a summary of the key points after this call.
[00:45] Speaker 1: Perfect. If there are no other items, we can wrap up here."""
SUMMARY_TEXT = """# Meeting Overview
## Key Discussion Points
### Timeline and Scope
#### Next Steps"""
COVER_DATE = "June 14, 2026"
TRANSCRIPT_DESC = "Transcript of a project planning meeting discussing timelines and scope."
SUMMARY_DESC = "Summary of a project planning meeting covering key decisions and next steps."
def main():
    """
    Generate transcript, summary and combined .docx files for manual review.

    The files are written to a fresh directory created with
    ``tempfile.mkdtemp`` and deliberately left on disk: the script ends by
    asking the user to open them in Word, which is impossible if the
    directory is deleted when the function returns.
    """
    tmpdir = tempfile.mkdtemp(prefix="scraibe_docx_test_")
    print("Using temp directory:", tmpdir)

    # 1) Transcript-only
    transcript_path = os.path.join(tmpdir, "TRANSCRIPT_TEST.docx")
    print("Creating transcript-only docx:", transcript_path)
    create_transcript_docx(
        text=TRANSCRIPT_TEXT,
        filename=transcript_path,
        include_cover=True,
        cover_date=COVER_DATE,
        cover_desc=TRANSCRIPT_DESC,
    )
    print("OK: transcript-only created.")

    # 2) Summary-only
    summary_path = os.path.join(tmpdir, "SUMMARY_TEST.docx")
    print("Creating summary-only docx:", summary_path)
    create_summary_docx(
        text=SUMMARY_TEXT,
        filename=summary_path,
        include_cover=True,
        cover_date=COVER_DATE,
        cover_desc=SUMMARY_DESC,
    )
    print("OK: summary-only created.")

    # 3) Combined
    combined_path = os.path.join(tmpdir, "COMBINED_TEST.docx")
    print("Creating combined docx:", combined_path)
    create_combined_docx(
        transcript_text=TRANSCRIPT_TEXT,
        summary_text=SUMMARY_TEXT,
        filename=combined_path,
        transcript_cover_date=COVER_DATE,
        transcript_cover_desc=TRANSCRIPT_DESC,
        summary_cover_date=COVER_DATE,
        summary_cover_desc=SUMMARY_DESC,
    )
    print("OK: combined created.")

    # Basic size sanity checks
    for path in [transcript_path, summary_path, combined_path]:
        size = os.path.getsize(path)
        print(f"File: {os.path.basename(path)} - size: {size} bytes")
        if size < 10000:
            print("WARNING: File seems unusually small:", path)

    print("\nAll .docx files generated successfully.")
    print("Files were kept for inspection in:", tmpdir)
    print("Please open them in Word to verify:")
    print("- Only transcript pages have line numbers.")
    # The DOCX footer was removed; page numbers are stamped on the PDFs only.
    print("- No page-number footer in the .docx files (page numbers are added to the PDFs).")
    print("- Cover pages are centered and use the correct date format.")
    print("- Combined doc order: cover, page break, summary, page break, transcript.")
if __name__ == "__main__":
main()