From 83f3c0921811ed684f74e56169c94642ad0211ea Mon Sep 17 00:00:00 2001
From: admin <admin@git.optimex.systems>
Date: Tue, 16 Jun 2026 19:21:04 +0000
Subject: [PATCH] Make line numbers reflect visual lines, not speaker turns

- Split long lines into multiple visual lines at word boundaries (content wrapped at 72 characters).
- Each visual line is its own paragraph with its own embedded line number.
- Continuous numbering across speakers and pages; blank lines are skipped and not numbered.
- Portable across Word, LibreOffice, Google Docs.
---
 scraibe/email_sender.py | 56 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py
index faeb715..8cd9488 100644
--- a/scraibe/email_sender.py
+++ b/scraibe/email_sender.py
@@ -439,6 +439,7 @@ def create_transcript_docx(text: str, filename: str):
     - 12pt Courier font
     - Embedded line numbers starting at 1 on the first page
       (portable across Word, LibreOffice, Google Docs)
+    - Line numbers reflect visual lines on the page, not speaker turns.
     - Proper formatting for timestamps and speaker labels
     """
     doc = Document()
@@ -456,10 +457,57 @@ def create_transcript_docx(text: str, filename: str):
     # Configure section properties (margins, no built-in line numbering)
     _create_transcript_section_properties(doc.sections[0])
 
-    # Add transcript lines with embedded line numbers
-    lines = text.strip().splitlines()
-    for idx, line in enumerate(lines, start=1):
-        _add_transcript_paragraph(doc, line, line_number=idx)
+    # Max characters of content per visual line (label not counted; for 12pt Courier, 1" margins)
+    max_chars = 72
+
+    # Global line counter for visual lines
+    line_number = 0
+
+    # Split transcript into logical lines
+    logical_lines = text.strip().splitlines()
+
+    for line in logical_lines:
+        line = line.strip()
+        if not line:
+            continue
+
+        # Try to match: [00:00] SPEAKER 1: content
+        m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
+        if m:
+            ts, speaker, content = m.groups()
+            label_text = f"[{ts}] {speaker.upper()}:"
+            content = content.strip()
+        else:
+            label_text = ""
+            content = line.strip()
+
+        # Split content into visual lines at word boundaries
+        content_lines = []
+        words = content.split()
+        current = ""
+        for w in words:
+            if len(current) == 0:
+                current = w
+            elif len(current) + 1 + len(w) <= max_chars:
+                current += " " + w
+            else:
+                content_lines.append(current)
+                current = w
+        if current:
+            content_lines.append(current)
+
+        # First visual line: include label if present
+        if content_lines:
+            first_line_text = (label_text + " " if label_text else "") + content_lines[0]
+            line_number += 1
+            _add_transcript_paragraph(doc, first_line_text, line_number=line_number)
+            # Drop the first content line: it was just emitted above, together with the label
+            content_lines = content_lines[1:]
+
+        # Subsequent visual lines: no label, just content
+        for cl in content_lines:
+            line_number += 1
+            _add_transcript_paragraph(doc, cl, line_number=line_number)
 
     # Save
     doc.save(filename)