Make line numbers reflect visual lines, not speaker turns
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Split long lines into multiple visual lines at word boundaries.
- Each visual line is its own paragraph with its own embedded line number.
- Continuous numbering across speakers and pages.
- Portable across Word, LibreOffice, Google Docs.
This commit is contained in:
admin
2026-06-16 19:21:04 +00:00
parent d828a91bf3
commit 83f3c09218
+52 -4
View File
@@ -439,6 +439,7 @@ def create_transcript_docx(text: str, filename: str):
- 12pt Courier font - 12pt Courier font
- Embedded line numbers starting at 1 on the first page - Embedded line numbers starting at 1 on the first page
(portable across Word, LibreOffice, Google Docs) (portable across Word, LibreOffice, Google Docs)
- Line numbers reflect visual lines on the page, not speaker turns.
- Proper formatting for timestamps and speaker labels - Proper formatting for timestamps and speaker labels
""" """
doc = Document() doc = Document()
@@ -456,10 +457,57 @@ def create_transcript_docx(text: str, filename: str):
# Configure section properties (margins, no built-in line numbering) # Configure section properties (margins, no built-in line numbering)
_create_transcript_section_properties(doc.sections[0]) _create_transcript_section_properties(doc.sections[0])
# Add transcript lines with embedded line numbers # Max characters per visual line (for 12pt Courier, 1" margins)
lines = text.strip().splitlines() max_chars = 72
for idx, line in enumerate(lines, start=1):
_add_transcript_paragraph(doc, line, line_number=idx) # Global line counter for visual lines
line_number = 0
# Split transcript into logical lines
logical_lines = text.strip().splitlines()
for line in logical_lines:
line = line.strip()
if not line:
continue
# Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
if m:
ts, speaker, content = m.groups()
label_text = f"[{ts}] {speaker.upper()}:"
content = content.strip()
else:
label_text = ""
content = line.strip()
# Split content into visual lines at word boundaries
content_lines = []
words = content.split()
current = ""
for w in words:
if len(current) == 0:
current = w
elif len(current) + 1 + len(w) <= max_chars:
current += " " + w
else:
content_lines.append(current)
current = w
if current:
content_lines.append(current)
# First visual line: include label if present
if content_lines:
first_line_text = (label_text + " " if label_text else "") + content_lines[0]
line_number += 1
_add_transcript_paragraph(doc, first_line_text, line_number=line_number)
# Drop the first visual line from the list; it was already emitted above together with the label
content_lines = content_lines[1:]
# Subsequent visual lines: no label, just content
for cl in content_lines:
line_number += 1
_add_transcript_paragraph(doc, cl, line_number=line_number)
# Save # Save
doc.save(filename) doc.save(filename)