Rewrite transcript DOCX logic for correctness

- Prepare transcript into pages of 29 lines each before writing.
- Each line max 60 chars total (48 content + number + spaces).
- Words preserved (no clipping); wrap at word boundaries.
- Page break after every 29 lines.
- No distinction between logical/visual lines.
2026-06-19 16:07:20 +00:00
parent 49f3cdc407
commit 2112b8c7e2
1 changed files with 66 additions and 119 deletions
@@ -444,40 +444,71 @@ def create_transcript_docx(text: str, filename: str):
    Create a transcript DOCX with:
    - 1" margins on all sides
    - 12pt Courier font
-    - Embedded line numbers starting at 1 on the first page
+    - Each page has exactly 29 numbered lines of text
-      (portable across Word, LibreOffice, Google Docs)
+    - Max 60 characters per line (including number and spaces)
-    - Line numbers reflect visual lines on the page, not speaker turns.
+    - Words preserved (no clipping or omission)
-    - Proper formatting for timestamps and speaker labels
+    - Blank spacing between number and text preserved
    - Page break after every 29 lines
    - Centered footer: "X of Y"
    """
-    doc = Document()
+    # Step 1: Prepare transcript into pages of 29 lines each
    # Each line <= 60 chars total, words preserved, no clipping
    # Structure: nested list of paragraphs (pages -> lines)
    prepared_pages = []
    current_page = []
    line_count = 0
-    # Set base font (Normal style)
+    # 48 chars content + 2 digits + 1 tab + 9 spaces = 60 max
    MAX_CONTENT_LEN = 48
    for raw_line in text.strip().splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        # Wrap into segments without clipping words
        words = raw_line.split()
        segments = []
        current = ""
        for w in words:
            if not current:
                current = w
            elif len(current) + 1 + len(w) <= MAX_CONTENT_LEN:
                current += " " + w
            else:
                segments.append(current)
                current = w
        if current:
            segments.append(current)
        # Add segments to pages, enforcing 29 lines per page
        for seg in segments:
            if line_count == 29:
                prepared_pages.append(current_page)
                current_page = []
                line_count = 0
            current_page.append(seg)
            line_count += 1
    if current_page:
        prepared_pages.append(current_page)
    # Step 2: Create DOCX
    doc = Document()
    style = doc.styles["Normal"]
    style.font.name = "Courier"
    style.font.size = Pt(12)
    # Remove any default paragraphs (ensure no phantom first line)
    body = doc.element.body
    for p in list(body.findall(f"{{{W_NS}}}p")):
        body.remove(p)
    # Configure section properties (margins, no built-in line numbering)
    _create_transcript_section_properties(doc.sections[0])
-    # Max characters per visual line (content only; total line including line number and spaces <= 60)
+    # Step 3: Write prepared pages into DOCX
-    max_chars = 58
+    for page_idx, page_lines in enumerate(prepared_pages):
-
+        # Insert page break between pages
-    # Lines per page before restarting numbering
+        if page_idx > 0:
    lines_per_page = 29
    # Current line counter for visual lines
    line_number = 0
    # Split transcript into logical lines
    logical_lines = text.strip().splitlines()
    def insert_page_break():
        nonlocal line_number
            p_break = doc.add_paragraph()
            pPr = p_break._p.get_or_add_pPr()
            for child in list(pPr):
@@ -487,101 +518,18 @@ def create_transcript_docx(text: str, filename: str):
            page_break = OxmlElement("w:pageBreak")
            page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
            pPr.append(page_break)
        line_number = 0
-    for line in logical_lines:
+        # Write each line with its number (1-29)
-        line = line.strip()
+        for line_num, line_text in enumerate(page_lines, start=1):
-        if not line:
+            _add_transcript_paragraph(doc, line_text, line_number=line_num)
            continue
-        # Try to match: [00:00] SPEAKER 1: content
+    # Step 4: Add footer: "X of Y" centered
        m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
        if m:
            ts, speaker, content = m.groups()
            label_text = f"[{ts}] {speaker.upper()}:"
            content = content.strip()
        else:
            label_text = ""
            content = line.strip()
        # Wrap content into visual lines (whole words, max_chars)
        content_lines = []
        words = content.split()
        current = ""
        for w in words:
            if len(current) == 0:
                current = w
            elif len(current) + 1 + len(w) <= max_chars:
                current += " " + w
            else:
                content_lines.append(current)
                current = w
        if current:
            content_lines.append(current)
        # Prepare first visual line with label if present
        visual_lines = []
        if content_lines:
            first_content = content_lines.pop(0)
            if label_text:
                prefix = label_text + " "
                if len(prefix) + len(first_content) > max_chars:
                    allowed = max_chars - len(prefix)
                    if allowed < 1:
                        allowed = 1
                    candidate = first_content[:allowed]
                    last_space = candidate.rfind(" ")
                    if last_space > 0:
                        kept = candidate[:last_space]
                        rest = first_content[last_space:].strip()
                    else:
                        kept = candidate
                        rest = first_content[allowed:].strip()
                    visual_lines.append(prefix + kept)
                    if rest:
                        extra_words = rest.split()
                        new_lines = []
                        buf = ""
                        for ew in extra_words:
                            if len(buf) == 0:
                                buf = ew
                            elif len(buf) + 1 + len(ew) <= max_chars:
                                buf += " " + ew
                            else:
                                new_lines.append(buf)
                                buf = ew
                        if buf:
                            new_lines.append(buf)
                        content_lines = new_lines + content_lines
                else:
                    visual_lines.append(prefix + first_content)
            else:
                visual_lines.append(first_content)
            visual_lines.extend(content_lines)
        else:
            # No content_lines but maybe label_text only
            if label_text:
                visual_lines.append(label_text)
        # If adding these visual lines would exceed lines_per_page, insert page break first
        if line_number + len(visual_lines) > lines_per_page:
            insert_page_break()
        # Write visual lines
        for vl in visual_lines:
            line_number += 1
            _add_transcript_paragraph(doc, vl, line_number=line_number)
    # Add page numbers to footer: "X of Y" (bottom left)
    section = doc.sections[0]
    footer = section.footer
    footer.is_linked_to_previous = False
    footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
    footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    # Clear any existing content
    for r in footer_para.runs:
        r.text = ""
@@ -607,7 +555,6 @@ def create_transcript_docx(text: str, filename: str):
    run_total = footer_para.add_run()
    add_field(run_total, " NUMPAGES ")
    # Save
    doc.save(filename)