From 83f3c0921811ed684f74e56169c94642ad0211ea Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 16 Jun 2026 19:21:04 +0000 Subject: [PATCH] Make line numbers reflect visual lines, not speaker turns - Split long lines into multiple visual lines at word boundaries. - Each visual line is its own paragraph with its own embedded line number. - Continuous numbering across speakers and pages. - Portable across Word, LibreOffice, Google Docs. --- scraibe/email_sender.py | 56 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 4 deletions(-) diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index faeb715..8cd9488 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -439,6 +439,7 @@ def create_transcript_docx(text: str, filename: str): - 12pt Courier font - Embedded line numbers starting at 1 on the first page (portable across Word, LibreOffice, Google Docs) + - Line numbers reflect visual lines on the page, not speaker turns. - Proper formatting for timestamps and speaker labels """ doc = Document() @@ -456,10 +457,57 @@ def create_transcript_docx(text: str, filename: str): # Configure section properties (margins, no built-in line numbering) _create_transcript_section_properties(doc.sections[0]) - # Add transcript lines with embedded line numbers - lines = text.strip().splitlines() - for idx, line in enumerate(lines, start=1): - _add_transcript_paragraph(doc, line, line_number=idx) + # Max characters per visual line (for 12pt Courier, 1" margins) + max_chars = 72 + + # Global line counter for visual lines + line_number = 0 + + # Split transcript into logical lines + logical_lines = text.strip().splitlines() + + for line in logical_lines: + line = line.strip() + if not line: + continue + + # Try to match: [00:00] SPEAKER 1: content + m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line) + if m: + ts, speaker, content = m.groups() + label_text = f"[{ts}] {speaker.upper()}:" + content = content.strip() + else: + 
label_text = "" + content = line.strip() + + # Split content into visual lines at word boundaries + content_lines = [] + words = content.split() + current = "" + for w in words: + if len(current) == 0: + current = w + elif len(current) + 1 + len(w) <= max_chars: + current += " " + w + else: + content_lines.append(current) + current = w + if current: + content_lines.append(current) + + # First visual line: include label if present + if content_lines: + first_line_text = (label_text + " " if label_text else "") + content_lines[0] + line_number += 1 + _add_transcript_paragraph(doc, first_line_text, line_number=line_number) + # Drop the first wrapped line; it was already emitted above (with the label, if any) + content_lines = content_lines[1:] + + # Subsequent visual lines: no label, just content + for cl in content_lines: + line_number += 1 + _add_transcript_paragraph(doc, cl, line_number=line_number) # Save doc.save(filename)