Rewrite transcript DOCX logic for correctness

- Prepare the transcript into pages of 29 lines each before writing.
- Each line is at most 60 chars total (48 content + line number + spacing).
- Words are preserved (no clipping); lines wrap at word boundaries.
- A page break is inserted after every 29 lines.
- No distinction is made between logical and visual lines.
2026-06-19 16:07:20 +00:00
parent 49f3cdc407
commit 2112b8c7e2
1 changed file with 66 additions and 119 deletions
@@ -444,144 +444,92 @@ def create_transcript_docx(text: str, filename: str):
    Create a transcript DOCX with:
    - 1" margins on all sides
    - 12pt Courier font
-    - Embedded line numbers starting at 1 on the first page
-      (portable across Word, LibreOffice, Google Docs)
-    - Line numbers reflect visual lines on the page, not speaker turns.
-    - Proper formatting for timestamps and speaker labels
+    - Each page has exactly 29 numbered lines of text
+    - Max 60 characters per line (including number and spaces)
+    - Words preserved (no clipping or omission)
+    - Blank spacing between number and text preserved
+    - Page break after every 29 lines
+    - Centered footer: "X of Y"
    """
-    doc = Document()
+    # Step 1: Prepare transcript into pages of 29 lines each
+    # Each line <= 60 chars total, words preserved, no clipping
+    # Structure: nested list of paragraphs (pages -> lines)
+    prepared_pages = []
+    current_page = []
+    line_count = 0

-    # Set base font (Normal style)
+    # 48 chars content + 2 digits + 1 tab + 9 spaces = 60 max
+    MAX_CONTENT_LEN = 48
+
+    for raw_line in text.strip().splitlines():
+        raw_line = raw_line.strip()
+        if not raw_line:
+            continue
+
+        # Wrap into segments without clipping words
+        words = raw_line.split()
+        segments = []
+        current = ""
+        for w in words:
+            if not current:
+                current = w
+            elif len(current) + 1 + len(w) <= MAX_CONTENT_LEN:
+                current += " " + w
+            else:
+                segments.append(current)
+                current = w
+        if current:
+            segments.append(current)
+
+        # Add segments to pages, enforcing 29 lines per page
+        for seg in segments:
+            if line_count == 29:
+                prepared_pages.append(current_page)
+                current_page = []
+                line_count = 0
+            current_page.append(seg)
+            line_count += 1
+
+    if current_page:
+        prepared_pages.append(current_page)
+
+    # Step 2: Create DOCX
+    doc = Document()
    style = doc.styles["Normal"]
    style.font.name = "Courier"
    style.font.size = Pt(12)

-    # Remove any default paragraphs (ensure no phantom first line)
    body = doc.element.body
    for p in list(body.findall(f"{{{W_NS}}}p")):
        body.remove(p)

-    # Configure section properties (margins, no built-in line numbering)
    _create_transcript_section_properties(doc.sections[0])

-    # Max characters per visual line (content only; total line including line number and spaces <= 60)
-    max_chars = 58
+    # Step 3: Write prepared pages into DOCX
+    for page_idx, page_lines in enumerate(prepared_pages):
+        # Insert page break between pages
+        if page_idx > 0:
+            p_break = doc.add_paragraph()
+            pPr = p_break._p.get_or_add_pPr()
+            for child in list(pPr):
+                tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
+                if tag in ("tabs", "spacing", "ind"):
+                    pPr.remove(child)
+            page_break = OxmlElement("w:pageBreak")
+            page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
+            pPr.append(page_break)

-    # Lines per page before restarting numbering
-    lines_per_page = 29
+        # Write each line with its number (1-29)
+        for line_num, line_text in enumerate(page_lines, start=1):
+            _add_transcript_paragraph(doc, line_text, line_number=line_num)

-    # Current line counter for visual lines
-    line_number = 0
-
-    # Split transcript into logical lines
-    logical_lines = text.strip().splitlines()
-
-    def insert_page_break():
-        nonlocal line_number
-        p_break = doc.add_paragraph()
-        pPr = p_break._p.get_or_add_pPr()
-        for child in list(pPr):
-            tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
-            if tag in ("tabs", "spacing", "ind"):
-                pPr.remove(child)
-        page_break = OxmlElement("w:pageBreak")
-        page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
-        pPr.append(page_break)
-        line_number = 0
-
-    for line in logical_lines:
-        line = line.strip()
-        if not line:
-            continue
-
-        # Try to match: [00:00] SPEAKER 1: content
-        m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
-        if m:
-            ts, speaker, content = m.groups()
-            label_text = f"[{ts}] {speaker.upper()}:"
-            content = content.strip()
-        else:
-            label_text = ""
-            content = line.strip()
-
-        # Wrap content into visual lines (whole words, max_chars)
-        content_lines = []
-        words = content.split()
-        current = ""
-        for w in words:
-            if len(current) == 0:
-                current = w
-            elif len(current) + 1 + len(w) <= max_chars:
-                current += " " + w
-            else:
-                content_lines.append(current)
-                current = w
-        if current:
-            content_lines.append(current)
-
-        # Prepare first visual line with label if present
-        visual_lines = []
-        if content_lines:
-            first_content = content_lines.pop(0)
-            if label_text:
-                prefix = label_text + " "
-                if len(prefix) + len(first_content) > max_chars:
-                    allowed = max_chars - len(prefix)
-                    if allowed < 1:
-                        allowed = 1
-                    candidate = first_content[:allowed]
-                    last_space = candidate.rfind(" ")
-                    if last_space > 0:
-                        kept = candidate[:last_space]
-                        rest = first_content[last_space:].strip()
-                    else:
-                        kept = candidate
-                        rest = first_content[allowed:].strip()
-
-                    visual_lines.append(prefix + kept)
-                    if rest:
-                        extra_words = rest.split()
-                        new_lines = []
-                        buf = ""
-                        for ew in extra_words:
-                            if len(buf) == 0:
-                                buf = ew
-                            elif len(buf) + 1 + len(ew) <= max_chars:
-                                buf += " " + ew
-                            else:
-                                new_lines.append(buf)
-                                buf = ew
-                        if buf:
-                            new_lines.append(buf)
-                        content_lines = new_lines + content_lines
-                else:
-                    visual_lines.append(prefix + first_content)
-            else:
-                visual_lines.append(first_content)
-
-            visual_lines.extend(content_lines)
-        else:
-            # No content_lines but maybe label_text only
-            if label_text:
-                visual_lines.append(label_text)
-
-        # If adding these visual lines would exceed lines_per_page, insert page break first
-        if line_number + len(visual_lines) > lines_per_page:
-            insert_page_break()
-
-        # Write visual lines
-        for vl in visual_lines:
-            line_number += 1
-            _add_transcript_paragraph(doc, vl, line_number=line_number)
-
-    # Add page numbers to footer: "X of Y" (bottom left)
+    # Step 4: Add footer: "X of Y" centered
    section = doc.sections[0]
    footer = section.footer
    footer.is_linked_to_previous = False
    footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
    footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

-    # Clear any existing content
    for r in footer_para.runs:
        r.text = ""

@@ -607,7 +555,6 @@ def create_transcript_docx(text: str, filename: str):
    run_total = footer_para.add_run()
    add_field(run_total, " NUMPAGES ")

-    # Save
    doc.save(filename)