Use embedded line numbers instead of built-in line numbering
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Remove w:lnNumType; line numbers are now plain text in each paragraph.
- Ensures the first line is always numbered '1' across Word, LibreOffice, and Google Docs.
- Each paragraph: line number + tab + content.
This commit is contained in:
admin
2026-06-16 19:15:47 +00:00
parent 670c6d3e2b
commit d828a91bf3
+43 -22
View File
@@ -317,7 +317,7 @@ def _create_transcript_section_properties(section):
Configure the section properties for transcript DOCX: Configure the section properties for transcript DOCX:
- Margins: 1 inch all sides - Margins: 1 inch all sides
- Single column layout - Single column layout
- Line numbering: start=1, countBy=1, restart each page - No built-in line numbering (we embed line numbers as text for portability)
- Remove document grid to avoid off-by-one line numbering - Remove document grid to avoid off-by-one line numbering
""" """
sectPr = section._sectPr sectPr = section._sectPr
@@ -335,43 +335,50 @@ def _create_transcript_section_properties(section):
_set_element_attr(pgMar, "footer", "720") _set_element_attr(pgMar, "footer", "720")
_set_element_attr(pgMar, "gutter", "0") _set_element_attr(pgMar, "gutter", "0")
# Ensure single column (no multi-column layout that can interfere with line numbering) # Ensure single column (no multi-column layout)
cols = sectPr.find(f"{{{W_NS}}}cols") cols = sectPr.find(f"{{{W_NS}}}cols")
if cols is not None: if cols is not None:
_set_element_attr(cols, "num", "1") _set_element_attr(cols, "num", "1")
_set_element_attr(cols, "space", "720") _set_element_attr(cols, "space", "720")
# Remove document grid entirely to avoid phantom grid-based line numbering # Remove document grid entirely
for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"): for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"):
sectPr.remove(docGrid) sectPr.remove(docGrid)
# Ensure exactly one lnNumType element (remove any existing) # Remove any built-in line numbering; we will use text-based line numbers
for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"): for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"):
sectPr.remove(lnNumType) sectPr.remove(lnNumType)
# Line numbering: start=1, countBy=1, restart each page
lnNumType = OxmlElement("w:lnNumType")
_set_element_attr(lnNumType, "start", "1")
_set_element_attr(lnNumType, "countBy", "1")
_set_element_attr(lnNumType, "restart", "newPage")
sectPr.append(lnNumType)
def _add_transcript_paragraph(doc, line_text, line_number):
def _add_transcript_paragraph(doc, line_text):
""" """
Add a single transcript line as a paragraph. Add a single transcript line as a paragraph with an embedded line number.
If it matches timestamp + speaker format, format the label differently. Uses a left tab stop so the line number appears in the left margin area,
Uses single line spacing and zero before/after spacing so each paragraph = 1 line. independent of built-in line numbering, ensuring consistent behavior
across Word, LibreOffice, Google Docs, etc.
""" """
line_text = line_text.strip() line_text = line_text.strip()
if not line_text: if not line_text:
return return
p = doc.add_paragraph() p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.25)
# Enforce single line spacing with no extra before/after # Set up paragraph formatting:
# - No left indent; we control spacing via tab stop
# - Single line spacing, no extra before/after
pPr = p._p.get_or_add_pPr() pPr = p._p.get_or_add_pPr()
# Remove any default indent
pPr.find(f"{{{W_NS}}}ind") and pPr.remove(pPr.find(f"{{{W_NS}}}ind"))
# Define a left tab stop for line numbers (360 twips = 0.25"; 1440 twips per inch)
tabs = OxmlElement("w:tabs")
tab = OxmlElement("w:tab")
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "left")
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pos", "360")
tabs.append(tab)
pPr.append(tabs)
spacing = OxmlElement("w:spacing") spacing = OxmlElement("w:spacing")
_set_element_attr(spacing, "before", "0") _set_element_attr(spacing, "before", "0")
_set_element_attr(spacing, "after", "0") _set_element_attr(spacing, "after", "0")
@@ -381,6 +388,19 @@ def _add_transcript_paragraph(doc, line_text):
# Try to match: [00:00] SPEAKER 1: content # Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text) m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
# Line number run (no underline)
run_ln = p.add_run(str(line_number))
run_ln.font.name = "Courier"
run_ln.font.size = Pt(12)
run_ln.underline = False
# Tab between line number and content
run_tab = p.add_run("\t")
run_tab.font.name = "Courier"
run_tab.font.size = Pt(12)
run_tab.underline = False
if m: if m:
ts, speaker, content = m.groups() ts, speaker, content = m.groups()
label_text = f"[{ts}] {speaker.upper()}:" label_text = f"[{ts}] {speaker.upper()}:"
@@ -417,7 +437,8 @@ def create_transcript_docx(text: str, filename: str):
Create a transcript DOCX with: Create a transcript DOCX with:
- 1" margins on all sides - 1" margins on all sides
- 12pt Courier font - 12pt Courier font
- Line numbering starting at 1 on the first page - Embedded line numbers starting at 1 on the first page
(portable across Word, LibreOffice, Google Docs)
- Proper formatting for timestamps and speaker labels - Proper formatting for timestamps and speaker labels
""" """
doc = Document() doc = Document()
@@ -432,13 +453,13 @@ def create_transcript_docx(text: str, filename: str):
for p in list(body.findall(f"{{{W_NS}}}p")): for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p) body.remove(p)
# Configure section properties (margins, line numbering, grid) # Configure section properties (margins, no built-in line numbering)
_create_transcript_section_properties(doc.sections[0]) _create_transcript_section_properties(doc.sections[0])
# Add transcript lines # Add transcript lines with embedded line numbers
lines = text.strip().splitlines() lines = text.strip().splitlines()
for line in lines: for idx, line in enumerate(lines, start=1):
_add_transcript_paragraph(doc, line) _add_transcript_paragraph(doc, line, line_number=idx)
# Save # Save
doc.save(filename) doc.save(filename)