Use embedded line numbers instead of built-in line numbering
- Remove w:lnNumType; line numbers are now plain text in each paragraph.
- Ensure the first line is always '1' across Word, LibreOffice, and Google Docs.
- Each paragraph consists of: line number + tab + content.
This commit is contained in:
+43
-22
@@ -317,7 +317,7 @@ def _create_transcript_section_properties(section):
|
||||
Configure the section properties for transcript DOCX:
|
||||
- Margins: 1 inch all sides
|
||||
- Single column layout
|
||||
- Line numbering: start=1, countBy=1, restart each page
|
||||
- No built-in line numbering (we embed line numbers as text for portability)
|
||||
- Remove document grid to avoid off-by-one line numbering
|
||||
"""
|
||||
sectPr = section._sectPr
|
||||
@@ -335,43 +335,50 @@ def _create_transcript_section_properties(section):
|
||||
_set_element_attr(pgMar, "footer", "720")
|
||||
_set_element_attr(pgMar, "gutter", "0")
|
||||
|
||||
# Ensure single column (no multi-column layout that can interfere with line numbering)
|
||||
# Ensure single column (no multi-column layout)
|
||||
cols = sectPr.find(f"{{{W_NS}}}cols")
|
||||
if cols is not None:
|
||||
_set_element_attr(cols, "num", "1")
|
||||
_set_element_attr(cols, "space", "720")
|
||||
|
||||
# Remove document grid entirely to avoid phantom grid-based line numbering
|
||||
# Remove document grid entirely
|
||||
for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"):
|
||||
sectPr.remove(docGrid)
|
||||
|
||||
# Ensure exactly one lnNumType element (remove any existing)
|
||||
# Remove any built-in line numbering; we will use text-based line numbers
|
||||
for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"):
|
||||
sectPr.remove(lnNumType)
|
||||
|
||||
# Line numbering: start=1, countBy=1, restart each page
|
||||
lnNumType = OxmlElement("w:lnNumType")
|
||||
_set_element_attr(lnNumType, "start", "1")
|
||||
_set_element_attr(lnNumType, "countBy", "1")
|
||||
_set_element_attr(lnNumType, "restart", "newPage")
|
||||
sectPr.append(lnNumType)
|
||||
|
||||
|
||||
def _add_transcript_paragraph(doc, line_text):
|
||||
def _add_transcript_paragraph(doc, line_text, line_number):
|
||||
"""
|
||||
Add a single transcript line as a paragraph.
|
||||
If it matches timestamp + speaker format, format the label differently.
|
||||
Uses single line spacing and zero before/after spacing so each paragraph = 1 line.
|
||||
Add a single transcript line as a paragraph with an embedded line number.
|
||||
Uses a left tab stop so the line number sits at the start of the line and the content begins at the tab stop,
|
||||
independent of built-in line numbering, ensuring consistent behavior
|
||||
across Word, LibreOffice, Google Docs, etc.
|
||||
"""
|
||||
line_text = line_text.strip()
|
||||
if not line_text:
|
||||
return
|
||||
|
||||
p = doc.add_paragraph()
|
||||
p.paragraph_format.left_indent = Inches(0.25)
|
||||
|
||||
# Enforce single line spacing with no extra before/after
|
||||
# Set up paragraph formatting:
|
||||
# - No left indent; we control spacing via tab stop
|
||||
# - Single line spacing, no extra before/after
|
||||
pPr = p._p.get_or_add_pPr()
|
||||
|
||||
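# Remove any default indent.
# NOTE(review): lxml elements with no children are falsy, so the `find(...) and remove(...)` guard below never removes a childless <w:ind>; compare the result of find() against None instead.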
|
||||
pPr.find(f"{{{W_NS}}}ind") and pPr.remove(pPr.find(f"{{{W_NS}}}ind"))
|
||||
|
||||
# Define a left tab stop that separates the line number from the content (360 twips = 0.25"; 1440 twips per inch)
|
||||
tabs = OxmlElement("w:tabs")
|
||||
tab = OxmlElement("w:tab")
|
||||
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "left")
|
||||
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pos", "360")
|
||||
tabs.append(tab)
|
||||
pPr.append(tabs)
|
||||
|
||||
spacing = OxmlElement("w:spacing")
|
||||
_set_element_attr(spacing, "before", "0")
|
||||
_set_element_attr(spacing, "after", "0")
|
||||
@@ -381,6 +388,19 @@ def _add_transcript_paragraph(doc, line_text):
|
||||
|
||||
# Try to match: [00:00] SPEAKER 1: content
|
||||
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
|
||||
|
||||
# Line number run (no underline)
|
||||
run_ln = p.add_run(str(line_number))
|
||||
run_ln.font.name = "Courier"
|
||||
run_ln.font.size = Pt(12)
|
||||
run_ln.underline = False
|
||||
|
||||
# Tab between line number and content
|
||||
run_tab = p.add_run("\t")
|
||||
run_tab.font.name = "Courier"
|
||||
run_tab.font.size = Pt(12)
|
||||
run_tab.underline = False
|
||||
|
||||
if m:
|
||||
ts, speaker, content = m.groups()
|
||||
label_text = f"[{ts}] {speaker.upper()}:"
|
||||
@@ -417,7 +437,8 @@ def create_transcript_docx(text: str, filename: str):
|
||||
Create a transcript DOCX with:
|
||||
- 1" margins on all sides
|
||||
- 12pt Courier font
|
||||
- Line numbering starting at 1 on the first page
|
||||
- Embedded line numbers starting at 1 on the first page
|
||||
(portable across Word, LibreOffice, Google Docs)
|
||||
- Proper formatting for timestamps and speaker labels
|
||||
"""
|
||||
doc = Document()
|
||||
@@ -432,13 +453,13 @@ def create_transcript_docx(text: str, filename: str):
|
||||
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||
body.remove(p)
|
||||
|
||||
# Configure section properties (margins, line numbering, grid)
|
||||
# Configure section properties (margins, no built-in line numbering)
|
||||
_create_transcript_section_properties(doc.sections[0])
|
||||
|
||||
# Add transcript lines
|
||||
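# Add transcript lines with embedded line numbers
# NOTE(review): blank lines are skipped by _add_transcript_paragraph but still advance idx, so the printed numbers can have gaps; confirm this is intended.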
|
||||
lines = text.strip().splitlines()
|
||||
for line in lines:
|
||||
_add_transcript_paragraph(doc, line)
|
||||
for idx, line in enumerate(lines, start=1):
|
||||
_add_transcript_paragraph(doc, line, line_number=idx)
|
||||
|
||||
# Save
|
||||
doc.save(filename)
|
||||
|
||||
Reference in New Issue
Block a user