Use embedded line numbers instead of built-in line numbering
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Remove w:lnNumType; line numbers are now plain text in each paragraph.
- Ensure the first line is always numbered '1' in Word, LibreOffice, and Google Docs.
- Each paragraph: line number + tab + content.
This commit is contained in:
admin
2026-06-16 19:15:47 +00:00
parent 670c6d3e2b
commit d828a91bf3
+43 -22
View File
@@ -317,7 +317,7 @@ def _create_transcript_section_properties(section):
Configure the section properties for transcript DOCX:
- Margins: 1 inch all sides
- Single column layout
- Line numbering: start=1, countBy=1, restart each page
- No built-in line numbering (we embed line numbers as text for portability)
- Remove document grid to avoid off-by-one line numbering
"""
sectPr = section._sectPr
@@ -335,43 +335,50 @@ def _create_transcript_section_properties(section):
_set_element_attr(pgMar, "footer", "720")
_set_element_attr(pgMar, "gutter", "0")
# Ensure single column (no multi-column layout that can interfere with line numbering)
# Ensure single column (no multi-column layout)
cols = sectPr.find(f"{{{W_NS}}}cols")
if cols is not None:
_set_element_attr(cols, "num", "1")
_set_element_attr(cols, "space", "720")
# Remove document grid entirely to avoid phantom grid-based line numbering
# Remove document grid entirely
for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"):
sectPr.remove(docGrid)
# Ensure exactly one lnNumType element (remove any existing)
# Remove any built-in line numbering; we will use text-based line numbers
for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"):
sectPr.remove(lnNumType)
# Line numbering: start=1, countBy=1, restart each page
lnNumType = OxmlElement("w:lnNumType")
_set_element_attr(lnNumType, "start", "1")
_set_element_attr(lnNumType, "countBy", "1")
_set_element_attr(lnNumType, "restart", "newPage")
sectPr.append(lnNumType)
def _add_transcript_paragraph(doc, line_text):
def _add_transcript_paragraph(doc, line_text, line_number):
"""
Add a single transcript line as a paragraph.
If it matches timestamp + speaker format, format the label differently.
Uses single line spacing and zero before/after spacing so each paragraph = 1 line.
Add a single transcript line as a paragraph with an embedded line number.
The number is written as plain text at the start of the paragraph, and a
left tab stop positions the content after it, so numbering does not rely on
built-in line numbering and behaves consistently across Word, LibreOffice,
Google Docs, etc.
"""
line_text = line_text.strip()
if not line_text:
return
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.25)
# Enforce single line spacing with no extra before/after
# Set up paragraph formatting:
# - No left indent; we control spacing via tab stop
# - Single line spacing, no extra before/after
pPr = p._p.get_or_add_pPr()
# Remove any default indent
pPr.find(f"{{{W_NS}}}ind") and pPr.remove(pPr.find(f"{{{W_NS}}}ind"))
# Define a left tab stop for line numbers (e.g. 360 twips ≈ 0.25")
tabs = OxmlElement("w:tabs")
tab = OxmlElement("w:tab")
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "left")
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pos", "360")
tabs.append(tab)
pPr.append(tabs)
spacing = OxmlElement("w:spacing")
_set_element_attr(spacing, "before", "0")
_set_element_attr(spacing, "after", "0")
@@ -381,6 +388,19 @@ def _add_transcript_paragraph(doc, line_text):
# Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
# Line number run (no underline)
run_ln = p.add_run(str(line_number))
run_ln.font.name = "Courier"
run_ln.font.size = Pt(12)
run_ln.underline = False
# Tab between line number and content
run_tab = p.add_run("\t")
run_tab.font.name = "Courier"
run_tab.font.size = Pt(12)
run_tab.underline = False
if m:
ts, speaker, content = m.groups()
label_text = f"[{ts}] {speaker.upper()}:"
@@ -417,7 +437,8 @@ def create_transcript_docx(text: str, filename: str):
Create a transcript DOCX with:
- 1" margins on all sides
- 12pt Courier font
- Line numbering starting at 1 on the first page
- Embedded line numbers starting at 1 on the first page
(portable across Word, LibreOffice, Google Docs)
- Proper formatting for timestamps and speaker labels
"""
doc = Document()
@@ -432,13 +453,13 @@ def create_transcript_docx(text: str, filename: str):
for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p)
# Configure section properties (margins, line numbering, grid)
# Configure section properties (margins, no built-in line numbering)
_create_transcript_section_properties(doc.sections[0])
# Add transcript lines
# Add transcript lines with embedded line numbers
lines = text.strip().splitlines()
for line in lines:
_add_transcript_paragraph(doc, line)
for idx, line in enumerate(lines, start=1):
_add_transcript_paragraph(doc, line, line_number=idx)
# Save
doc.save(filename)