From d828a91bf3e70ea565f2859b9a344d11c4c84eb2 Mon Sep 17 00:00:00 2001 From: admin Date: Tue, 16 Jun 2026 19:15:47 +0000 Subject: [PATCH] Use embedded line numbers instead of built-in line numbering - Remove w:lnNumType; line numbers are now plain text in each paragraph. - Ensures first line is always '1' across Word, LibreOffice, Google Docs. - Each paragraph: line number + tab + content. --- scraibe/email_sender.py | 65 +++++++++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 22 deletions(-) diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index bed38b5..faeb715 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -317,7 +317,7 @@ def _create_transcript_section_properties(section): Configure the section properties for transcript DOCX: - Margins: 1 inch all sides - Single column layout - - Line numbering: start=1, countBy=1, restart each page + - No built-in line numbering (we embed line numbers as text for portability) - Remove document grid to avoid off-by-one line numbering """ sectPr = section._sectPr @@ -335,43 +335,50 @@ def _create_transcript_section_properties(section): _set_element_attr(pgMar, "footer", "720") _set_element_attr(pgMar, "gutter", "0") - # Ensure single column (no multi-column layout that can interfere with line numbering) + # Ensure single column (no multi-column layout) cols = sectPr.find(f"{{{W_NS}}}cols") if cols is not None: _set_element_attr(cols, "num", "1") _set_element_attr(cols, "space", "720") - # Remove document grid entirely to avoid phantom grid-based line numbering + # Remove document grid entirely for docGrid in sectPr.findall(f"{{{W_NS}}}docGrid"): sectPr.remove(docGrid) - # Ensure exactly one lnNumType element (remove any existing) + # Remove any built-in line numbering; we will use text-based line numbers for lnNumType in sectPr.findall(f"{{{W_NS}}}lnNumType"): sectPr.remove(lnNumType) - # Line numbering: start=1, countBy=1, restart each page - lnNumType = OxmlElement("w:lnNumType") - _set_element_attr(lnNumType, "start", "1") - _set_element_attr(lnNumType, "countBy", "1") - _set_element_attr(lnNumType, "restart", "newPage") - sectPr.append(lnNumType) - -def _add_transcript_paragraph(doc, line_text): +def _add_transcript_paragraph(doc, line_text, line_number): """ - Add a single transcript line as a paragraph. - If it matches timestamp + speaker format, format the label differently. - Uses single line spacing and zero before/after spacing so each paragraph = 1 line. + Add a single transcript line as a paragraph with an embedded line number. + Uses a left tab stop so the line number appears in the left margin area, + independent of built-in line numbering, ensuring consistent behavior + across Word, LibreOffice, Google Docs, etc. """ line_text = line_text.strip() if not line_text: return p = doc.add_paragraph() - p.paragraph_format.left_indent = Inches(0.25) - # Enforce single line spacing with no extra before/after + # Set up paragraph formatting: + # - No left indent; we control spacing via tab stop + # - Single line spacing, no extra before/after pPr = p._p.get_or_add_pPr() + + # Remove any default indent + pPr.find(f"{{{W_NS}}}ind") and pPr.remove(pPr.find(f"{{{W_NS}}}ind")) + + # Define a left tab stop for line numbers (e.g. 360 twips ≈ 0.25") + tabs = OxmlElement("w:tabs") + tab = OxmlElement("w:tab") + tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "left") + tab.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}pos", "360") + tabs.append(tab) + pPr.append(tabs) + spacing = OxmlElement("w:spacing") _set_element_attr(spacing, "before", "0") _set_element_attr(spacing, "after", "0") @@ -381,6 +388,19 @@ def _add_transcript_paragraph(doc, line_text): # Try to match: [00:00] SPEAKER 1: content m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text) + + # Line number run (no underline) + run_ln = p.add_run(str(line_number)) + run_ln.font.name = "Courier" + run_ln.font.size = Pt(12) + run_ln.underline = False + + # Tab between line number and content + run_tab = p.add_run("\t") + run_tab.font.name = "Courier" + run_tab.font.size = Pt(12) + run_tab.underline = False + if m: ts, speaker, content = m.groups() label_text = f"[{ts}] {speaker.upper()}:" @@ -417,7 +437,8 @@ def create_transcript_docx(text: str, filename: str): Create a transcript DOCX with: - 1" margins on all sides - 12pt Courier font - - Line numbering starting at 1 on the first page + - Embedded line numbers starting at 1 on the first page + (portable across Word, LibreOffice, Google Docs) - Proper formatting for timestamps and speaker labels """ doc = Document() @@ -432,13 +453,13 @@ def create_transcript_docx(text: str, filename: str): for p in list(body.findall(f"{{{W_NS}}}p")): body.remove(p) - # Configure section properties (margins, line numbering, grid) + # Configure section properties (margins, no built-in line numbering) _create_transcript_section_properties(doc.sections[0]) - # Add transcript lines + # Add transcript lines with embedded line numbers lines = text.strip().splitlines() - for line in lines: - _add_transcript_paragraph(doc, line) + for idx, line in enumerate(lines, start=1): + _add_transcript_paragraph(doc, line, line_number=idx) # Save doc.save(filename)