Fix transcript DOCX line numbering (spacing and column fixes)
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Ensure single column layout (cols num='1')
- Set explicit single line spacing (before/after=0, line=240 with lineRule=auto, i.e. 240/240ths of a line = 1.0 line)
- Prevents Word from counting extra lines due to spacing/columns
This commit is contained in:
admin
2026-06-16 18:08:46 +00:00
parent 0e6bc53cf8
commit f20102d564
+17
View File
@@ -316,6 +316,7 @@ def _create_transcript_section_properties(section):
"""
Configure the section properties for transcript DOCX:
- Margins: 1 inch all sides
- Single column layout
- Line numbering: start=1, countBy=1, restart each page
- Disable document grid to avoid off-by-one line numbering
"""
@@ -334,6 +335,12 @@ def _create_transcript_section_properties(section):
_set_element_attr(pgMar, "footer", "720")
_set_element_attr(pgMar, "gutter", "0")
# Ensure single column (no multi-column layout that can interfere with line numbering)
cols = sectPr.find(f"{{{W_NS}}}cols")
if cols is not None:
_set_element_attr(cols, "num", "1")
_set_element_attr(cols, "space", "720")
# Disable document grid to avoid Word counting phantom grid lines
docGrid = sectPr.find(f"{{{W_NS}}}docGrid")
if docGrid is not None:
@@ -351,6 +358,7 @@ def _add_transcript_paragraph(doc, line_text):
"""
Add a single transcript line as a paragraph.
If it matches timestamp + speaker format, format the label differently.
Uses single line spacing and zero before/after spacing so each paragraph = 1 line.
"""
line_text = line_text.strip()
if not line_text:
@@ -359,6 +367,15 @@ def _add_transcript_paragraph(doc, line_text):
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.25)
# Enforce single line spacing with no extra before/after
pPr = p._p.get_or_add_pPr()
spacing = OxmlElement("w:spacing")
_set_element_attr(spacing, "before", "0")
_set_element_attr(spacing, "after", "0")
_set_element_attr(spacing, "line", "240") # with lineRule="auto" the unit is 240ths of a line, so 240 = single spacing
_set_element_attr(spacing, "lineRule", "auto")
pPr.append(spacing)
# Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
if m: