Make line numbers reflect visual lines, not speaker turns
- Split long lines into multiple visual lines at word boundaries.
- Each visual line is its own paragraph with its own embedded line number.
- Numbering is continuous across speakers and pages.
- Portable across Word, LibreOffice, and Google Docs.
This commit is contained in:
+52
-4
@@ -439,6 +439,7 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
- 12pt Courier font
|
- 12pt Courier font
|
||||||
- Embedded line numbers starting at 1 on the first page
|
- Embedded line numbers starting at 1 on the first page
|
||||||
(portable across Word, LibreOffice, Google Docs)
|
(portable across Word, LibreOffice, Google Docs)
|
||||||
|
- Line numbers reflect visual lines on the page, not speaker turns.
|
||||||
- Proper formatting for timestamps and speaker labels
|
- Proper formatting for timestamps and speaker labels
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
@@ -456,10 +457,57 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
# Configure section properties (margins, no built-in line numbering)
|
# Configure section properties (margins, no built-in line numbering)
|
||||||
_create_transcript_section_properties(doc.sections[0])
|
_create_transcript_section_properties(doc.sections[0])
|
||||||
|
|
||||||
# Add transcript lines with embedded line numbers
|
# Max characters per visual line (for 12pt Courier, 1" margins)
|
||||||
lines = text.strip().splitlines()
|
max_chars = 72
|
||||||
for idx, line in enumerate(lines, start=1):
|
|
||||||
_add_transcript_paragraph(doc, line, line_number=idx)
|
# Global line counter for visual lines
|
||||||
|
line_number = 0
|
||||||
|
|
||||||
|
# Split transcript into logical lines
|
||||||
|
logical_lines = text.strip().splitlines()
|
||||||
|
|
||||||
|
for line in logical_lines:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Try to match: [00:00] SPEAKER 1: content
|
||||||
|
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
|
||||||
|
if m:
|
||||||
|
ts, speaker, content = m.groups()
|
||||||
|
label_text = f"[{ts}] {speaker.upper()}:"
|
||||||
|
content = content.strip()
|
||||||
|
else:
|
||||||
|
label_text = ""
|
||||||
|
content = line.strip()
|
||||||
|
|
||||||
|
# Split content into visual lines at word boundaries
|
||||||
|
content_lines = []
|
||||||
|
words = content.split()
|
||||||
|
current = ""
|
||||||
|
for w in words:
|
||||||
|
if len(current) == 0:
|
||||||
|
current = w
|
||||||
|
elif len(current) + 1 + len(w) <= max_chars:
|
||||||
|
current += " " + w
|
||||||
|
else:
|
||||||
|
content_lines.append(current)
|
||||||
|
current = w
|
||||||
|
if current:
|
||||||
|
content_lines.append(current)
|
||||||
|
|
||||||
|
# First visual line: include label if present
|
||||||
|
if content_lines:
|
||||||
|
first_line_text = (label_text + " " if label_text else "") + content_lines[0]
|
||||||
|
line_number += 1
|
||||||
|
_add_transcript_paragraph(doc, first_line_text, line_number=line_number)
|
||||||
|
# Drop the first visual line from the list; it was already emitted above with the label
|
||||||
|
content_lines = content_lines[1:]
|
||||||
|
|
||||||
|
# Subsequent visual lines: no label, just content
|
||||||
|
for cl in content_lines:
|
||||||
|
line_number += 1
|
||||||
|
_add_transcript_paragraph(doc, cl, line_number=line_number)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|||||||
Reference in New Issue
Block a user