diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index faeb715..8cd9488 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -439,6 +439,7 @@ def create_transcript_docx(text: str, filename: str): - 12pt Courier font - Embedded line numbers starting at 1 on the first page (portable across Word, LibreOffice, Google Docs) + - Line numbers reflect visual lines on the page, not speaker turns. - Proper formatting for timestamps and speaker labels """ doc = Document() @@ -456,10 +457,57 @@ def create_transcript_docx(text: str, filename: str): # Configure section properties (margins, no built-in line numbering) _create_transcript_section_properties(doc.sections[0]) - # Add transcript lines with embedded line numbers - lines = text.strip().splitlines() - for idx, line in enumerate(lines, start=1): - _add_transcript_paragraph(doc, line, line_number=idx) + # Max characters per visual line (for 12pt Courier, 1" margins) + max_chars = 72 + + # Global line counter for visual lines + line_number = 0 + + # Split transcript into logical lines + logical_lines = text.strip().splitlines() + + for line in logical_lines: + line = line.strip() + if not line: + continue + + # Try to match: [00:00] SPEAKER 1: content + m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line) + if m: + ts, speaker, content = m.groups() + label_text = f"[{ts}] {speaker.upper()}:" + content = content.strip() + else: + label_text = "" + content = line.strip() + + # Split content into visual lines at word boundaries + content_lines = [] + words = content.split() + current = "" + for w in words: + if len(current) == 0: + current = w + elif len(current) + 1 + len(w) <= max_chars: + current += " " + w + else: + content_lines.append(current) + current = w + if current: + content_lines.append(current) + + # First visual line: include label if present + if content_lines: + first_line_text = (label_text + " " if label_text else "") + content_lines[0] + line_number += 1 + 
_add_transcript_paragraph(doc, first_line_text, line_number=line_number) + # Drop the first visual line (already emitted above); only continuation lines remain + content_lines = content_lines[1:] + + # Subsequent visual lines: no label, just content + for cl in content_lines: + line_number += 1 + _add_transcript_paragraph(doc, cl, line_number=line_number) # Save doc.save(filename)