Rebuild transcript DOCX generation flow
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Clean, single-pass implementation for transcript and summary DOCX
- Explicit margins, font, line numbering per OOXML spec
- Disable docGrid to prevent off-by-one line numbering
- Ensure first content line is line 1
This commit is contained in:
admin
2026-06-16 16:54:48 +00:00
parent 0c0e52dfb8
commit 03d66219d9
+95 -111
View File
@@ -304,173 +304,157 @@ def send_email(
# ------------ DOCX helpers ------------
def _configure_base_font(doc):
    """
    Make 12pt Courier the font of the document's "Normal" style.

    The style object is modified in place; nothing is returned.
    """
    normal_font = doc.styles["Normal"].font
    normal_font.name = "Courier"
    normal_font.size = Pt(12)
# Namespaces
# WordprocessingML main namespace URI. The helpers below wrap it in braces
# ("{namespace}name") to build qualified element and attribute names.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
def _configure_section_margins(doc, margin=1.0):
    """
    Give every section of the document the same margin on all four sides.

    `margin` is expressed in inches and defaults to 1.0.
    """
    sides = ("left_margin", "right_margin", "top_margin", "bottom_margin")
    for sect in doc.sections:
        for side in sides:
            setattr(sect, side, Inches(margin))
def _set_element_attr(elem, attr, value):
    """
    Set attribute `attr`, qualified with the W_NS namespace, on `elem`.

    `value` is converted with str() before being stored.
    """
    qualified_name = "{{{}}}{}".format(W_NS, attr)
    text_value = str(value)
    elem.set(qualified_name, text_value)
# NOTE(review): this span comes from a flattened diff. It interleaves the
# removed `_enable_line_numbering` with the added
# `_create_transcript_section_properties`; lines are kept verbatim and
# indentation is absent in the source text.
def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
def _create_transcript_section_properties(section):
"""
Enable line numbering for a specific section.
Disables the document grid so Word counts lines based on content, not grid lines.
Configure the section properties for transcript DOCX:
- Margins: 1 inch all sides
- Line numbering: start=1, countBy=1, restart each page
- Disable document grid to avoid off-by-one line numbering
"""
sectPr = section._sectPr
# Disable document grid to avoid Word treating a grid line as line 1
docGrid = sectPr.find(qn("w:docGrid"))
# Margins: 1 inch = 1440 twips
# NOTE(review): presumably `sectPr` is an lxml element. An lxml element with
# no children evaluates as falsy, so this `or` can build a new pgMar even when
# find() returned an existing one. An explicit `is None` check would avoid
# that — confirm.
pgMar = sectPr.find(f"{{{W_NS}}}pgMar") or OxmlElement("w:pgMar")
_set_element_attr(pgMar, "top", "1440")
_set_element_attr(pgMar, "right", "1440")
_set_element_attr(pgMar, "bottom", "1440")
_set_element_attr(pgMar, "left", "1440")
# Header/footer distance: 720 twips, i.e. half an inch by the conversion above.
_set_element_attr(pgMar, "header", "720")
_set_element_attr(pgMar, "footer", "720")
_set_element_attr(pgMar, "gutter", "0")
# NOTE(review): append() puts pgMar after every existing child of sectPr;
# verify that no second pgMar results and that Word accepts this child order.
sectPr.append(pgMar)
# Disable document grid to avoid Word counting phantom grid lines
docGrid = sectPr.find(f"{{{W_NS}}}docGrid")
if docGrid is not None:
docGrid.set(qn("w:type"), "none")
# NOTE(review): the OOXML schema appears to list default / lines /
# linesAndChars / snapToChars as docGrid types; "none" is not among them —
# verify.
_set_element_attr(docGrid, "type", "none")
# Create the line numbering element: <w:lnNumType>
# Line numbering: start=1, countBy=1, restart each page
lnNumType = OxmlElement("w:lnNumType")
lnNumType.set(qn("w:start"), str(start_at))
lnNumType.set(qn("w:countBy"), str(count_by))
# 'newPage' restarts numbering on every page; 'continuous' keeps it going
restart_mode = "newPage" if restart else "continuous"
lnNumType.set(qn("w:restart"), restart_mode)
_set_element_attr(lnNumType, "start", "1")
_set_element_attr(lnNumType, "countBy", "1")
_set_element_attr(lnNumType, "restart", "newPage")
# NOTE(review): appended last, i.e. after any docGrid already present. The
# schema sequence places lnNumType before docGrid — confirm Word tolerates it.
sectPr.append(lnNumType)
# NOTE(review): this span comes from a flattened diff. It interleaves the
# removed `_add_transcript_content` (loops over all lines of `text`) with the
# added `_add_transcript_paragraph` (handles one line); lines are kept verbatim.
def _add_transcript_content(doc, text):
def _add_transcript_paragraph(doc, line_text):
"""
Add transcript lines to the document with formatting.
Add a single transcript line as a paragraph.
If it matches timestamp + speaker format, format the label differently.
"""
lines = text.strip().split("\n")
for line in lines:
line = line.strip()
if not line:
continue
# Blank (whitespace-only) input adds nothing to the document.
line_text = line_text.strip()
if not line_text:
return
# Pattern: a bracketed timestamp of two or three colon-separated numbers,
# a speaker label ending at the first following colon, then the remaining text.
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
if m:
ts, speaker, content = m.groups()
# NOTE(review): two `p = doc.add_paragraph()` lines appear in this span (here
# and in the else branch). Confirm the post-change function creates exactly
# one paragraph per input line and that `p` is bound on both branches.
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.25)
run_label = p.add_run(f"[{ts}] {speaker.upper()}:")
run_label.bold = False
# Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
if m:
ts, speaker, content = m.groups()
# The speaker name is upper-cased; the timestamp text is kept as captured.
label_text = f"[{ts}] {speaker.upper()}:"
# Label run (underline)
run_label = p.add_run(label_text)
run_label.underline = True
run_label.font.name = "Courier"
run_label.font.size = Pt(12)
# Space run (no underline)
# A separate run keeps the gap between label and content free of underline.
run_space = p.add_run(" ")
run_space.bold = False
run_space.underline = False
run_space.font.name = "Courier"
run_space.font.size = Pt(12)
# Content run (no underline)
run_txt = p.add_run(content.strip())
run_txt.bold = False
run_txt.underline = False
run_txt.font.name = "Courier"
run_txt.font.size = Pt(12)
else:
p = doc.add_paragraph()
run = p.add_run(line)
# Non-standard line: plain text
run = p.add_run(line_text)
run.underline = False
run.font.name = "Courier"
run.font.size = Pt(12)
def _add_summary_content(doc, text):
    """
    Write summary text into the document, one paragraph per non-blank line.

    A line that starts with one to four '#' characters followed by whitespace
    is treated as a heading and rendered in 12pt Courier without the marker.
    The emphasis depends on how many headings have been seen so far, not on
    the number of '#' characters: 1st bold, 2nd italic, 3rd underlined, 4th
    and later italic plus underlined. Every other line becomes a plain
    paragraph. All paragraphs get 4pt of space after them.
    """
    # Run attributes switched on for the first three headings, by ordinal.
    emphasis_by_ordinal = {
        1: ("bold",),
        2: ("italic",),
        3: ("underline",),
    }
    later_emphasis = ("italic", "underline")

    headings_seen = 0
    for raw_line in text.splitlines():
        body = raw_line.strip()
        if not body:
            continue

        # Detect markdown-style headings: #, ##, ###, #### at start of line
        heading = re.match(r"^(#{1,4})\s+(.*)", body)
        if not heading:
            # Normal text line
            para = doc.add_paragraph(body)
            para.paragraph_format.space_after = Pt(4)
            continue

        headings_seen += 1
        title = heading.group(2).strip()

        para = doc.add_paragraph()
        para.paragraph_format.space_after = Pt(4)

        title_run = para.add_run(title)
        title_run.font.name = "Courier"
        title_run.font.size = Pt(12)

        # Only the listed attributes are set; the rest keep their defaults.
        for attr_name in emphasis_by_ordinal.get(headings_seen, later_emphasis):
            setattr(title_run, attr_name, True)
# ------------ Public DOCX functions ------------
# NOTE(review): this span comes from a flattened diff. It interleaves the
# removed and added versions of `create_transcript_docx`; lines are kept
# verbatim.
def create_transcript_docx(
text: str,
filename: str,
):
def create_transcript_docx(text: str, filename: str):
"""
Create a .docx transcript with:
Create a transcript DOCX with:
- 1" margins on all sides
- 12pt Courier
- Continuous line numbering on the left
- No cover page (standalone transcript only).
- 12pt Courier font
- Line numbering starting at 1 on the first page
- Proper formatting for timestamps and speaker labels
"""
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# Fully clear default paragraphs so Word's line numbering starts at line 1
# Set base font (Normal style)
style = doc.styles["Normal"]
style.font.name = "Courier"
style.font.size = Pt(12)
# Remove any default paragraphs (ensure no phantom first line)
body = doc.element.body
for p in body.findall(qn('w:p')):
# list(...) snapshots the matched paragraphs before any of them is removed.
for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p)
# Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
# Configure section properties (margins, line numbering, grid)
# Only the first section (index 0) is configured here.
_create_transcript_section_properties(doc.sections[0])
# Transcript content (with line numbering)
_add_transcript_content(doc, text)
# Add transcript lines
# strip() drops leading/trailing whitespace of the whole text before the
# split, so leading blank lines never reach the per-line helper.
lines = text.strip().splitlines()
for line in lines:
_add_transcript_paragraph(doc, line)
# Save
doc.save(filename)
# NOTE(review): this span comes from a flattened diff. It interleaves the
# removed and added versions of `create_summary_docx`; lines are kept verbatim.
def create_summary_docx(
text: str,
filename: str,
):
def create_summary_docx(text: str, filename: str):
"""
Create a .docx summary with consistent font and heading styles.
No cover page, no line numbering.
Create a summary DOCX with:
- 1" margins on all sides
- 12pt Courier font
- No line numbering
"""
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# Summary content (no line numbering)
_add_summary_content(doc, text)
# Base font
style = doc.styles["Normal"]
style.font.name = "Courier"
style.font.size = Pt(12)
# Margins: 1 inch all sides
for section in doc.sections:
section.left_margin = Inches(1.0)
section.right_margin = Inches(1.0)
section.top_margin = Inches(1.0)
section.bottom_margin = Inches(1.0)
# Remove default paragraph
body = doc.element.body
# list(...) snapshots the matched paragraphs before any of them is removed.
for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p)
# Add summary content
# NOTE(review): this inline loop adds every non-blank line as a plain
# paragraph. Lines starting with '#' keep their markers and get no heading
# emphasis, unlike the `_add_summary_content(doc, text)` call above — confirm
# this is intended.
lines = text.strip().splitlines()
for line in lines:
line = line.strip()
if not line:
continue
p = doc.add_paragraph(line)
p.paragraph_format.space_after = Pt(4)
doc.save(filename)