Use structured filenames and formal DOCX transcript styling

2026-06-14 16:20:10 +00:00
parent 2dce9b43c9
commit a8f48b9e58
2 changed files with 121 additions and 27 deletions
@@ -9,6 +9,7 @@ Template placeholders are primarily filled via environment variables.

 import base64
 import os
+import re
 import smtplib
 import logging
 from email import encoders
@@ -17,6 +18,11 @@ from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from typing import List, Optional, Dict, Any

+from docx import Document
+from docx.shared import Inches, Pt
+from docx.oxml.ns import qn
+from docx.oxml import OxmlElement
+
 logger = logging.getLogger("scraibe.email_sender")


@@ -280,30 +286,82 @@ def send_email(

 def create_transcript_docx(text: str, filename: str):
    """
-    Create a .docx file from plain/markdown transcript text.
+    Create a .docx transcript with:
+      - 1.5" left margin, 1" right margin
+      - 12pt Courier
+      - Continuous line numbering on the left
+      - Speaker names capitalized and indented; spoken text further indented
    """
-    from docx import Document
-    from docx.shared import Pt
-
    doc = Document()
-    doc.add_heading("Transcript", level=1)
+    section = doc.sections[0]

-    for line in text.splitlines():
-        p = doc.add_paragraph(line)
-        p.paragraph_format.space_after = Pt(4)
+    # Margins
+    section.left_margin = Inches(1.5)
+    section.right_margin = Inches(1.0)
+    section.top_margin = Inches(1.0)
+    section.bottom_margin = Inches(1.0)
+
+    # Line numbering (continuous, left side). python-docx has no public API
+    # for this, so add <w:lnNumType> to the section's <w:sectPr> directly.
+    sect_pr = section._sectPr
+    for stale in sect_pr.findall(qn("w:lnNumType")):
+        sect_pr.remove(stale)
+    line_num = OxmlElement("w:lnNumType")
+    line_num.set(qn("w:countBy"), "1")
+    line_num.set(qn("w:restart"), "continuous")
+    # Schema order matters to Word: lnNumType must follow pgMar (set above).
+    sect_pr.find(qn("w:pgMar")).addnext(line_num)
+
+    # Default font
+    style = doc.styles["Normal"]
+    font = style.font
+    font.name = "Courier"
+    font.size = Pt(12)
+
+    # Parse lines
+    lines = text.strip().split("\n")
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        # Try to parse: [00:00] SPEAKER: text
+        m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
+        if m:
+            ts, speaker, content = m.groups()
+            # Speaker line
+            p_spk = doc.add_paragraph()
+            p_spk.paragraph_format.left_indent = Inches(0.25)
+            run_spk = p_spk.add_run(f"[{ts}] {speaker.upper()}")
+            run_spk.bold = True
+            run_spk.font.name = "Courier"
+            run_spk.font.size = Pt(12)
+
+            # Spoken text line
+            p_txt = doc.add_paragraph()
+            p_txt.paragraph_format.left_indent = Inches(0.5)
+            run_txt = p_txt.add_run(content.strip())
+            run_txt.font.name = "Courier"
+            run_txt.font.size = Pt(12)
+        else:
+            # Fallback for non-standard lines
+            p = doc.add_paragraph()
+            run = p.add_run(line)
+            run.font.name = "Courier"
+            run.font.size = Pt(12)

    doc.save(filename)


 def create_summary_docx(text: str, filename: str):
    """
-    Create a .docx file from summary text.
+    Create a .docx summary with consistent font.
    """
-    from docx import Document
-    from docx.shared import Pt
-
    doc = Document()
-    doc.add_heading("Summary", level=1)
+    style = doc.styles["Normal"]
+    font = style.font
+    font.name = "Courier"
+    font.size = Pt(12)

    for line in text.splitlines():
        p = doc.add_paragraph(line)