Rebuild transcript DOCX generation flow

- Clean, single-pass implementation for transcript and summary DOCX
- Explicit margins, font, line numbering per OOXML spec
- Disable docGrid to prevent off-by-one line numbering
- Ensure first content line is line 1
2026-06-16 16:54:48 +00:00
parent 0c0e52dfb8
commit 03d66219d9
1 changed file with 111 additions and 127 deletions
@@ -304,173 +304,157 @@ def send_email(

 # ------------ DOCX helpers ------------

-def _configure_base_font(doc):
-    """
-    Set base document font to 12pt Courier.
-    """
-    style = doc.styles["Normal"]
-    style.font.name = "Courier"
-    style.font.size = Pt(12)
+# Namespaces
+W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


-def _configure_section_margins(doc, margin=1.0):
-    """
-    Set uniform margins on all sides (default 1 inch).
-    """
-    for section in doc.sections:
-        section.left_margin = Inches(margin)
-        section.right_margin = Inches(margin)
-        section.top_margin = Inches(margin)
-        section.bottom_margin = Inches(margin)
+def _set_element_attr(elem, attr, value):
+    elem.set(f"{{{W_NS}}}{attr}", str(value))


-def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
+def _create_transcript_section_properties(section):
    """
-    Enable line numbering for a specific section.
-    Disables the document grid so Word counts lines based on content, not grid lines.
+    Configure the section properties for transcript DOCX:
+    - Margins: 1 inch all sides
+    - Line numbering: start=1, countBy=1, restart each page
+    - Disable document grid to avoid off-by-one line numbering
    """
    sectPr = section._sectPr

-    # Disable document grid to avoid Word treating a grid line as line 1
-    docGrid = sectPr.find(qn("w:docGrid"))
+    # Margins: 1 inch = 1440 twips
+    pgMar = sectPr.find(f"{{{W_NS}}}pgMar") or OxmlElement("w:pgMar")
+    _set_element_attr(pgMar, "top", "1440")
+    _set_element_attr(pgMar, "right", "1440")
+    _set_element_attr(pgMar, "bottom", "1440")
+    _set_element_attr(pgMar, "left", "1440")
+    _set_element_attr(pgMar, "header", "720")
+    _set_element_attr(pgMar, "footer", "720")
+    _set_element_attr(pgMar, "gutter", "0")
+    sectPr.append(pgMar)
+
+    # Disable document grid to avoid Word counting phantom grid lines
+    docGrid = sectPr.find(f"{{{W_NS}}}docGrid")
    if docGrid is not None:
-        docGrid.set(qn("w:type"), "none")
+        _set_element_attr(docGrid, "type", "none")

-    # Create the line numbering element: <w:lnNumType>
+    # Line numbering: start=1, countBy=1, restart each page
    lnNumType = OxmlElement("w:lnNumType")
-    lnNumType.set(qn("w:start"), str(start_at))
-    lnNumType.set(qn("w:countBy"), str(count_by))
-
-    # 'newPage' restarts numbering on every page; 'continuous' keeps it going
-    restart_mode = "newPage" if restart else "continuous"
-    lnNumType.set(qn("w:restart"), restart_mode)
-
+    _set_element_attr(lnNumType, "start", "1")
+    _set_element_attr(lnNumType, "countBy", "1")
+    _set_element_attr(lnNumType, "restart", "newPage")
    sectPr.append(lnNumType)


-def _add_transcript_content(doc, text):
+def _add_transcript_paragraph(doc, line_text):
    """
-    Add transcript lines to the document with formatting.
+    Add a single transcript line as a paragraph.
+    If it matches timestamp + speaker format, format the label differently.
    """
-    lines = text.strip().split("\n")
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
+    line_text = line_text.strip()
+    if not line_text:
+        return

-        m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
-        if m:
-            ts, speaker, content = m.groups()
    p = doc.add_paragraph()
    p.paragraph_format.left_indent = Inches(0.25)

-            run_label = p.add_run(f"[{ts}] {speaker.upper()}:")
-            run_label.bold = False
+    # Try to match: [00:00] SPEAKER 1: content
+    m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
+    if m:
+        ts, speaker, content = m.groups()
+        label_text = f"[{ts}] {speaker.upper()}:"
+
+        # Label run (underline)
+        run_label = p.add_run(label_text)
        run_label.underline = True
        run_label.font.name = "Courier"
        run_label.font.size = Pt(12)

+        # Space run (no underline)
        run_space = p.add_run(" ")
-            run_space.bold = False
        run_space.underline = False
        run_space.font.name = "Courier"
        run_space.font.size = Pt(12)

+        # Content run (no underline)
        run_txt = p.add_run(content.strip())
-            run_txt.bold = False
        run_txt.underline = False
        run_txt.font.name = "Courier"
        run_txt.font.size = Pt(12)
    else:
-            p = doc.add_paragraph()
-            run = p.add_run(line)
+        # Non-standard line: plain text
+        run = p.add_run(line_text)
+        run.underline = False
        run.font.name = "Courier"
        run.font.size = Pt(12)


-def _add_summary_content(doc, text):
-    """
-    Add summary content with heading styles.
-    """
-    heading_count = 0
-    for line in text.splitlines():
-        stripped = line.strip()
-        if not stripped:
-            continue
-
-        # Detect markdown-style headings: #, ##, ###, #### at start of line
-        m = re.match(r"^(#{1,4})\s+(.*)", stripped)
-        if m:
-            heading_count += 1
-            content = m.group(2).strip()
-
-            p = doc.add_paragraph()
-            p.paragraph_format.space_after = Pt(4)
-
-            run = p.add_run(content)
-            run.font.name = "Courier"
-            run.font.size = Pt(12)
-
-            # Apply formatting based on this heading's ordinal position
-            if heading_count == 1:
-                run.bold = True
-            elif heading_count == 2:
-                run.italic = True
-            elif heading_count == 3:
-                run.underline = True
-            elif heading_count >= 4:
-                run.italic = True
-                run.underline = True
-        else:
-            # Normal text line
-            p = doc.add_paragraph(stripped)
-            p.paragraph_format.space_after = Pt(4)
-
-
 # ------------ Public DOCX functions ------------

-def create_transcript_docx(
-    text: str,
-    filename: str,
-):
+def create_transcript_docx(text: str, filename: str):
    """
-    Create a .docx transcript with:
+    Create a transcript DOCX with:
    - 1" margins on all sides
-      - 12pt Courier
-      - Continuous line numbering on the left
-      - No cover page (standalone transcript only).
+    - 12pt Courier font
+    - Line numbering starting at 1 on the first page
+    - Proper formatting for timestamps and speaker labels
    """
    doc = Document()
-    _configure_base_font(doc)
-    _configure_section_margins(doc)

-    # Fully clear default paragraphs so Word's line numbering starts at line 1
+    # Set base font (Normal style)
+    style = doc.styles["Normal"]
+    style.font.name = "Courier"
+    style.font.size = Pt(12)
+
+    # Remove any default paragraphs (ensure no phantom first line)
    body = doc.element.body
-    for p in body.findall(qn('w:p')):
+    for p in list(body.findall(f"{{{W_NS}}}p")):
        body.remove(p)

-    # Enable line numbering for transcript section
-    _enable_line_numbering(doc.sections[0])
+    # Configure section properties (margins, line numbering, grid)
+    _create_transcript_section_properties(doc.sections[0])

-    # Transcript content (with line numbering)
-    _add_transcript_content(doc, text)
+    # Add transcript lines
+    lines = text.strip().splitlines()
+    for line in lines:
+        _add_transcript_paragraph(doc, line)

+    # Save
    doc.save(filename)


-def create_summary_docx(
-    text: str,
-    filename: str,
-):
+def create_summary_docx(text: str, filename: str):
    """
-    Create a .docx summary with consistent font and heading styles.
-    No cover page, no line numbering.
+    Create a summary DOCX with:
+    - 1" margins on all sides
+    - 12pt Courier font
+    - No line numbering
    """
    doc = Document()
-    _configure_base_font(doc)
-    _configure_section_margins(doc)

-    # Summary content (no line numbering)
-    _add_summary_content(doc, text)
+    # Base font
+    style = doc.styles["Normal"]
+    style.font.name = "Courier"
+    style.font.size = Pt(12)
+
+    # Margins: 1 inch all sides
+    for section in doc.sections:
+        section.left_margin = Inches(1.0)
+        section.right_margin = Inches(1.0)
+        section.top_margin = Inches(1.0)
+        section.bottom_margin = Inches(1.0)
+
+    # Remove default paragraph
+    body = doc.element.body
+    for p in list(body.findall(f"{{{W_NS}}}p")):
+        body.remove(p)
+
+    # Add summary content
+    lines = text.strip().splitlines()
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        p = doc.add_paragraph(line)
+        p.paragraph_format.space_after = Pt(4)

    doc.save(filename)