Use embedded line numbers instead of built-in line numbering
- Remove w:lnNumType; line numbers are now plain text in each paragraph.
- Ensures the first line is always '1' across Word, LibreOffice, and Google Docs.
- Each paragraph: line number + tab + content.
This commit is contained in:
+43
-22
@@ -317,7 +317,7 @@ def _create_transcript_section_properties(section):
|
|||||||
Configure the section properties for transcript DOCX:
|
Configure the section properties for transcript DOCX:
|
||||||
- Margins: 1 inch all sides
|
- Margins: 1 inch all sides
|
||||||
- Single column layout
|
- Single column layout
|
||||||
- Line numbering: start=1, countBy=1, restart each page
|
- No built-in line numbering (we embed line numbers as text for portability)
|
||||||
- Remove document grid to avoid off-by-one line numbering
|
- Remove document grid to avoid off-by-one line numbering
|
||||||
"""
|
"""
|
||||||
sectPr = section._sectPr
|
sectPr = section._sectPr
|
||||||
@@ -335,43 +335,50 @@ def _create_transcript_section_properties(section):
|
|||||||
_set_element_attr(pgMar, "footer", "720")
|
_set_element_attr(pgMar, "footer", "720")
|
||||||
_set_element_attr(pgMar, "gutter", "0")
|
_set_element_attr(pgMar, "gutter", "0")
|
||||||
|
|
||||||
# Ensure single column (no multi-column layout that can interfere with line numbering)
|
# Ensure single column (no multi-column layout)
|
||||||
cols = sectPr.find(f"{{{W_NS}}}cols")
|
cols = sectPr.find(f"{{{W_NS}}}cols")
|
||||||
if cols is not None:
|
if cols is not None:
|
||||||
_set_element_attr(cols, "num", "1")
|
_set_element_attr(cols, "num", "1")
|
||||||
_set_element_attr(cols, "space", "720")
|
_set_element_attr(cols, "space", "720")
|
||||||
|
|
||||||
# Remove document grid entirely to avoid phantom grid-based line numbering
|
# Remove document grid entirely
|
||||||
for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"):
|
for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"):
|
||||||
sectPr.remove(docGrid)
|
sectPr.remove(docGrid)
|
||||||
|
|
||||||
# Ensure exactly one lnNumType element (remove any existing)
|
# Remove any built-in line numbering; we will use text-based line numbers
|
||||||
for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"):
|
for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"):
|
||||||
sectPr.remove(lnNumType)
|
sectPr.remove(lnNumType)
|
||||||
|
|
||||||
# Line numbering: start=1, countBy=1, restart each page
|
|
||||||
lnNumType = OxmlElement("w:lnNumType")
|
|
||||||
_set_element_attr(lnNumType, "start", "1")
|
|
||||||
_set_element_attr(lnNumType, "countBy", "1")
|
|
||||||
_set_element_attr(lnNumType, "restart", "newPage")
|
|
||||||
sectPr.append(lnNumType)
|
|
||||||
|
|
||||||
|
def _add_transcript_paragraph(doc, line_text, line_number):
|
||||||
def _add_transcript_paragraph(doc, line_text):
|
|
||||||
"""
|
"""
|
||||||
Add a single transcript line as a paragraph.
|
Add a single transcript line as a paragraph with an embedded line number.
|
||||||
If it matches timestamp + speaker format, format the label differently.
|
Uses a left tab stop so the line number appears in the left margin area,
|
||||||
Uses single line spacing and zero before/after spacing so each paragraph = 1 line.
|
independent of built-in line numbering, ensuring consistent behavior
|
||||||
|
across Word, LibreOffice, Google Docs, etc.
|
||||||
"""
|
"""
|
||||||
line_text = line_text.strip()
|
line_text = line_text.strip()
|
||||||
if not line_text:
|
if not line_text:
|
||||||
return
|
return
|
||||||
|
|
||||||
p = doc.add_paragraph()
|
p = doc.add_paragraph()
|
||||||
p.paragraph_format.left_indent = Inches(0.25)
|
|
||||||
|
|
||||||
# Enforce single line spacing with no extra before/after
|
# Set up paragraph formatting:
|
||||||
|
# - No left indent; we control spacing via tab stop
|
||||||
|
# - Single line spacing, no extra before/after
|
||||||
pPr = p._p.get_or_add_pPr()
|
pPr = p._p.get_or_add_pPr()
|
||||||
|
|
||||||
|
# Remove any default indent
|
||||||
|
pPr.find(f"{{{W_NS}}}ind") and pPr.remove(pPr.find(f"{{{W_NS}}}ind"))
|
||||||
|
|
||||||
|
# Define a left tab stop for line numbers (e.g. 360 twips ≈ 0.25")
|
||||||
|
tabs = OxmlElement("w:tabs")
|
||||||
|
tab = OxmlElement("w:tab")
|
||||||
|
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "left")
|
||||||
|
tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pos", "360")
|
||||||
|
tabs.append(tab)
|
||||||
|
pPr.append(tabs)
|
||||||
|
|
||||||
spacing = OxmlElement("w:spacing")
|
spacing = OxmlElement("w:spacing")
|
||||||
_set_element_attr(spacing, "before", "0")
|
_set_element_attr(spacing, "before", "0")
|
||||||
_set_element_attr(spacing, "after", "0")
|
_set_element_attr(spacing, "after", "0")
|
||||||
@@ -381,6 +388,19 @@ def _add_transcript_paragraph(doc, line_text):
|
|||||||
|
|
||||||
# Try to match: [00:00] SPEAKER 1: content
|
# Try to match: [00:00] SPEAKER 1: content
|
||||||
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
|
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
|
||||||
|
|
||||||
|
# Line number run (no underline)
|
||||||
|
run_ln = p.add_run(str(line_number))
|
||||||
|
run_ln.font.name = "Courier"
|
||||||
|
run_ln.font.size = Pt(12)
|
||||||
|
run_ln.underline = False
|
||||||
|
|
||||||
|
# Tab between line number and content
|
||||||
|
run_tab = p.add_run("\t")
|
||||||
|
run_tab.font.name = "Courier"
|
||||||
|
run_tab.font.size = Pt(12)
|
||||||
|
run_tab.underline = False
|
||||||
|
|
||||||
if m:
|
if m:
|
||||||
ts, speaker, content = m.groups()
|
ts, speaker, content = m.groups()
|
||||||
label_text = f"[{ts}] {speaker.upper()}:"
|
label_text = f"[{ts}] {speaker.upper()}:"
|
||||||
@@ -417,7 +437,8 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
Create a transcript DOCX with:
|
Create a transcript DOCX with:
|
||||||
- 1" margins on all sides
|
- 1" margins on all sides
|
||||||
- 12pt Courier font
|
- 12pt Courier font
|
||||||
- Line numbering starting at 1 on the first page
|
- Embedded line numbers starting at 1 on the first page
|
||||||
|
(portable across Word, LibreOffice, Google Docs)
|
||||||
- Proper formatting for timestamps and speaker labels
|
- Proper formatting for timestamps and speaker labels
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
@@ -432,13 +453,13 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
for p in list(body.findall(f"{{{W_NS}}}p")):
|
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||||
body.remove(p)
|
body.remove(p)
|
||||||
|
|
||||||
# Configure section properties (margins, line numbering, grid)
|
# Configure section properties (margins, no built-in line numbering)
|
||||||
_create_transcript_section_properties(doc.sections[0])
|
_create_transcript_section_properties(doc.sections[0])
|
||||||
|
|
||||||
# Add transcript lines
|
# Add transcript lines with embedded line numbers
|
||||||
lines = text.strip().splitlines()
|
lines = text.strip().splitlines()
|
||||||
for line in lines:
|
for idx, line in enumerate(lines, start=1):
|
||||||
_add_transcript_paragraph(doc, line)
|
_add_transcript_paragraph(doc, line, line_number=idx)
|
||||||
|
|
||||||
# Save
|
# Save
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|||||||
Reference in New Issue
Block a user