Rebuild transcript DOCX generation flow
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Clean, single-pass implementation for transcript and summary DOCX
- Explicit margins, font, line numbering per OOXML spec
- Disable docGrid to prevent off-by-one line numbering
- Ensure first content line is line 1
This commit is contained in:
admin
2026-06-16 16:54:48 +00:00
parent 0c0e52dfb8
commit 03d66219d9
+111 -127
View File
@@ -304,173 +304,157 @@ def send_email(
# ------------ DOCX helpers ------------ # ------------ DOCX helpers ------------
def _configure_base_font(doc): # Namespaces
""" W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
Set base document font to 12pt Courier.
"""
style = doc.styles["Normal"]
style.font.name = "Courier"
style.font.size = Pt(12)
def _configure_section_margins(doc, margin=1.0): def _set_element_attr(elem, attr, value):
""" elem.set(f"{{{W_NS}}}{attr}", str(value))
Set uniform margins on all sides (default 1 inch).
"""
for section in doc.sections:
section.left_margin = Inches(margin)
section.right_margin = Inches(margin)
section.top_margin = Inches(margin)
section.bottom_margin = Inches(margin)
def _enable_line_numbering(section, start_at=1, count_by=1, restart=True): def _create_transcript_section_properties(section):
""" """
Enable line numbering for a specific section. Configure the section properties for transcript DOCX:
Disables the document grid so Word counts lines based on content, not grid lines. - Margins: 1 inch all sides
- Line numbering: start=1, countBy=1, restart each page
- Disable document grid to avoid off-by-one line numbering
""" """
sectPr = section._sectPr sectPr = section._sectPr
# Disable document grid to avoid Word treating a grid line as line 1 # Margins: 1 inch = 1440 twips
docGrid = sectPr.find(qn("w:docGrid")) pgMar = sectPr.find(f"{{{W_NS}}}pgMar") or OxmlElement("w:pgMar")
_set_element_attr(pgMar, "top", "1440")
_set_element_attr(pgMar, "right", "1440")
_set_element_attr(pgMar, "bottom", "1440")
_set_element_attr(pgMar, "left", "1440")
_set_element_attr(pgMar, "header", "720")
_set_element_attr(pgMar, "footer", "720")
_set_element_attr(pgMar, "gutter", "0")
sectPr.append(pgMar)
# Disable document grid to avoid Word counting phantom grid lines
docGrid = sectPr.find(f"{{{W_NS}}}docGrid")
if docGrid is not None: if docGrid is not None:
docGrid.set(qn("w:type"), "none") _set_element_attr(docGrid, "type", "none")
# Create the line numbering element: <w:lnNumType> # Line numbering: start=1, countBy=1, restart each page
lnNumType = OxmlElement("w:lnNumType") lnNumType = OxmlElement("w:lnNumType")
lnNumType.set(qn("w:start"), str(start_at)) _set_element_attr(lnNumType, "start", "1")
lnNumType.set(qn("w:countBy"), str(count_by)) _set_element_attr(lnNumType, "countBy", "1")
_set_element_attr(lnNumType, "restart", "newPage")
# 'newPage' restarts numbering on every page; 'continuous' keeps it going
restart_mode = "newPage" if restart else "continuous"
lnNumType.set(qn("w:restart"), restart_mode)
sectPr.append(lnNumType) sectPr.append(lnNumType)
def _add_transcript_content(doc, text): def _add_transcript_paragraph(doc, line_text):
""" """
Add transcript lines to the document with formatting. Add a single transcript line as a paragraph.
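If the line matches the "[timestamp] speaker: content" pattern, the "[timestamp] SPEAKER:" label (speaker upper-cased) is underlined and the content after it is left plain.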
""" """
lines = text.strip().split("\n") line_text = line_text.strip()
for line in lines: if not line_text:
line = line.strip() return
if not line:
continue
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line) p = doc.add_paragraph()
if m: p.paragraph_format.left_indent = Inches(0.25)
ts, speaker, content = m.groups()
p = doc.add_paragraph()
p.paragraph_format.left_indent = Inches(0.25)
run_label = p.add_run(f"[{ts}] {speaker.upper()}:") # Try to match: [00:00] SPEAKER 1: content
run_label.bold = False m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
run_label.underline = True if m:
run_label.font.name = "Courier" ts, speaker, content = m.groups()
run_label.font.size = Pt(12) label_text = f"[{ts}] {speaker.upper()}:"
run_space = p.add_run(" ") # Label run (underline)
run_space.bold = False run_label = p.add_run(label_text)
run_space.underline = False run_label.underline = True
run_space.font.name = "Courier" run_label.font.name = "Courier"
run_space.font.size = Pt(12) run_label.font.size = Pt(12)
run_txt = p.add_run(content.strip()) # Space run (no underline)
run_txt.bold = False run_space = p.add_run(" ")
run_txt.underline = False run_space.underline = False
run_txt.font.name = "Courier" run_space.font.name = "Courier"
run_txt.font.size = Pt(12) run_space.font.size = Pt(12)
else:
p = doc.add_paragraph()
run = p.add_run(line)
run.font.name = "Courier"
run.font.size = Pt(12)
# Content run (no underline)
def _add_summary_content(doc, text): run_txt = p.add_run(content.strip())
""" run_txt.underline = False
Add summary content with heading styles. run_txt.font.name = "Courier"
""" run_txt.font.size = Pt(12)
heading_count = 0 else:
for line in text.splitlines(): # Non-standard line: plain text
stripped = line.strip() run = p.add_run(line_text)
if not stripped: run.underline = False
continue run.font.name = "Courier"
run.font.size = Pt(12)
# Detect markdown-style headings: #, ##, ###, #### at start of line
m = re.match(r"^(#{1,4})\s+(.*)", stripped)
if m:
heading_count += 1
content = m.group(2).strip()
p = doc.add_paragraph()
p.paragraph_format.space_after = Pt(4)
run = p.add_run(content)
run.font.name = "Courier"
run.font.size = Pt(12)
# Apply formatting based on this heading's ordinal position
if heading_count == 1:
run.bold = True
elif heading_count == 2:
run.italic = True
elif heading_count == 3:
run.underline = True
elif heading_count >= 4:
run.italic = True
run.underline = True
else:
# Normal text line
p = doc.add_paragraph(stripped)
p.paragraph_format.space_after = Pt(4)
# ------------ Public DOCX functions ------------ # ------------ Public DOCX functions ------------
def create_transcript_docx( def create_transcript_docx(text: str, filename: str):
text: str,
filename: str,
):
""" """
Create a .docx transcript with: Create a transcript DOCX with:
- 1" margins on all sides - 1" margins on all sides
- 12pt Courier - 12pt Courier font
- Continuous line numbering on the left - Line numbering starting at 1 and restarting on each page
- No cover page (standalone transcript only). - Proper formatting for timestamps and speaker labels
""" """
doc = Document() doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# Fully clear default paragraphs so Word's line numbering starts at line 1 # Set base font (Normal style)
style = doc.styles["Normal"]
style.font.name = "Courier"
style.font.size = Pt(12)
# Remove any default paragraphs (ensure no phantom first line)
body = doc.element.body body = doc.element.body
for p in body.findall(qn('w:p')): for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p) body.remove(p)
# Enable line numbering for transcript section # Configure section properties (margins, line numbering, grid)
_enable_line_numbering(doc.sections[0]) _create_transcript_section_properties(doc.sections[0])
# Transcript content (with line numbering) # Add transcript lines
_add_transcript_content(doc, text) lines = text.strip().splitlines()
for line in lines:
_add_transcript_paragraph(doc, line)
# Save
doc.save(filename) doc.save(filename)
def create_summary_docx( def create_summary_docx(text: str, filename: str):
text: str,
filename: str,
):
""" """
Create a .docx summary with consistent font and heading styles. Create a summary DOCX with:
No cover page, no line numbering. - 1" margins on all sides
- 12pt Courier font
- No line numbering
""" """
doc = Document() doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# Summary content (no line numbering) # Base font
_add_summary_content(doc, text) style = doc.styles["Normal"]
style.font.name = "Courier"
style.font.size = Pt(12)
# Margins: 1 inch all sides
for section in doc.sections:
section.left_margin = Inches(1.0)
section.right_margin = Inches(1.0)
section.top_margin = Inches(1.0)
section.bottom_margin = Inches(1.0)
# Remove default paragraph
body = doc.element.body
for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p)
# Add summary content
lines = text.strip().splitlines()
for line in lines:
line = line.strip()
if not line:
continue
p = doc.add_paragraph(line)
p.paragraph_format.space_after = Pt(4)
doc.save(filename) doc.save(filename)