Rebuild transcript DOCX generation flow
- Clean, single-pass implementation for transcript and summary DOCX
- Explicit margins, font, and line numbering per the OOXML spec
- Disable docGrid to prevent off-by-one line numbering
- Ensure the first content line is numbered as line 1
This commit is contained in:
+95
-111
@@ -304,173 +304,157 @@ def send_email(
|
|||||||
|
|
||||||
# ------------ DOCX helpers ------------
|
# ------------ DOCX helpers ------------
|
||||||
|
|
||||||
def _configure_base_font(doc):
|
# Namespaces
|
||||||
"""
|
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||||
Set base document font to 12pt Courier.
|
|
||||||
"""
|
|
||||||
style = doc.styles["Normal"]
|
|
||||||
style.font.name = "Courier"
|
|
||||||
style.font.size = Pt(12)
|
|
||||||
|
|
||||||
|
|
||||||
def _configure_section_margins(doc, margin=1.0):
|
def _set_element_attr(elem, attr, value):
|
||||||
"""
|
elem.set(f"{{{W_NS}}}{attr}", str(value))
|
||||||
Set uniform margins on all sides (default 1 inch).
|
|
||||||
"""
|
|
||||||
for section in doc.sections:
|
|
||||||
section.left_margin = Inches(margin)
|
|
||||||
section.right_margin = Inches(margin)
|
|
||||||
section.top_margin = Inches(margin)
|
|
||||||
section.bottom_margin = Inches(margin)
|
|
||||||
|
|
||||||
|
|
||||||
def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
|
def _create_transcript_section_properties(section):
|
||||||
"""
|
"""
|
||||||
Enable line numbering for a specific section.
|
Configure the section properties for transcript DOCX:
|
||||||
Disables the document grid so Word counts lines based on content, not grid lines.
|
- Margins: 1 inch all sides
|
||||||
|
- Line numbering: start=1, countBy=1, restart each page
|
||||||
|
- Disable document grid to avoid off-by-one line numbering
|
||||||
"""
|
"""
|
||||||
sectPr = section._sectPr
|
sectPr = section._sectPr
|
||||||
|
|
||||||
# Disable document grid to avoid Word treating a grid line as line 1
|
# Margins: 1 inch = 1440 twips
|
||||||
docGrid = sectPr.find(qn("w:docGrid"))
|
pgMar = sectPr.find(f"{{{W_NS}}}pgMar") or OxmlElement("w:pgMar")
|
||||||
|
_set_element_attr(pgMar, "top", "1440")
|
||||||
|
_set_element_attr(pgMar, "right", "1440")
|
||||||
|
_set_element_attr(pgMar, "bottom", "1440")
|
||||||
|
_set_element_attr(pgMar, "left", "1440")
|
||||||
|
_set_element_attr(pgMar, "header", "720")
|
||||||
|
_set_element_attr(pgMar, "footer", "720")
|
||||||
|
_set_element_attr(pgMar, "gutter", "0")
|
||||||
|
sectPr.append(pgMar)
|
||||||
|
|
||||||
|
# Disable document grid to avoid Word counting phantom grid lines
|
||||||
|
docGrid = sectPr.find(f"{{{W_NS}}}docGrid")
|
||||||
if docGrid is not None:
|
if docGrid is not None:
|
||||||
docGrid.set(qn("w:type"), "none")
|
_set_element_attr(docGrid, "type", "none")
|
||||||
|
|
||||||
# Create the line numbering element: <w:lnNumType>
|
# Line numbering: start=1, countBy=1, restart each page
|
||||||
lnNumType = OxmlElement("w:lnNumType")
|
lnNumType = OxmlElement("w:lnNumType")
|
||||||
lnNumType.set(qn("w:start"), str(start_at))
|
_set_element_attr(lnNumType, "start", "1")
|
||||||
lnNumType.set(qn("w:countBy"), str(count_by))
|
_set_element_attr(lnNumType, "countBy", "1")
|
||||||
|
_set_element_attr(lnNumType, "restart", "newPage")
|
||||||
# 'newPage' restarts numbering on every page; 'continuous' keeps it going
|
|
||||||
restart_mode = "newPage" if restart else "continuous"
|
|
||||||
lnNumType.set(qn("w:restart"), restart_mode)
|
|
||||||
|
|
||||||
sectPr.append(lnNumType)
|
sectPr.append(lnNumType)
|
||||||
|
|
||||||
|
|
||||||
def _add_transcript_content(doc, text):
|
def _add_transcript_paragraph(doc, line_text):
|
||||||
"""
|
"""
|
||||||
Add transcript lines to the document with formatting.
|
Add a single transcript line as a paragraph.
|
||||||
|
If it matches timestamp + speaker format, format the label differently.
|
||||||
"""
|
"""
|
||||||
lines = text.strip().split("\n")
|
line_text = line_text.strip()
|
||||||
for line in lines:
|
if not line_text:
|
||||||
line = line.strip()
|
return
|
||||||
if not line:
|
|
||||||
continue
|
|
||||||
|
|
||||||
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
|
|
||||||
if m:
|
|
||||||
ts, speaker, content = m.groups()
|
|
||||||
p = doc.add_paragraph()
|
p = doc.add_paragraph()
|
||||||
p.paragraph_format.left_indent = Inches(0.25)
|
p.paragraph_format.left_indent = Inches(0.25)
|
||||||
|
|
||||||
run_label = p.add_run(f"[{ts}] {speaker.upper()}:")
|
# Try to match: [00:00] SPEAKER 1: content
|
||||||
run_label.bold = False
|
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)
|
||||||
|
if m:
|
||||||
|
ts, speaker, content = m.groups()
|
||||||
|
label_text = f"[{ts}] {speaker.upper()}:"
|
||||||
|
|
||||||
|
# Label run (underline)
|
||||||
|
run_label = p.add_run(label_text)
|
||||||
run_label.underline = True
|
run_label.underline = True
|
||||||
run_label.font.name = "Courier"
|
run_label.font.name = "Courier"
|
||||||
run_label.font.size = Pt(12)
|
run_label.font.size = Pt(12)
|
||||||
|
|
||||||
|
# Space run (no underline)
|
||||||
run_space = p.add_run(" ")
|
run_space = p.add_run(" ")
|
||||||
run_space.bold = False
|
|
||||||
run_space.underline = False
|
run_space.underline = False
|
||||||
run_space.font.name = "Courier"
|
run_space.font.name = "Courier"
|
||||||
run_space.font.size = Pt(12)
|
run_space.font.size = Pt(12)
|
||||||
|
|
||||||
|
# Content run (no underline)
|
||||||
run_txt = p.add_run(content.strip())
|
run_txt = p.add_run(content.strip())
|
||||||
run_txt.bold = False
|
|
||||||
run_txt.underline = False
|
run_txt.underline = False
|
||||||
run_txt.font.name = "Courier"
|
run_txt.font.name = "Courier"
|
||||||
run_txt.font.size = Pt(12)
|
run_txt.font.size = Pt(12)
|
||||||
else:
|
else:
|
||||||
p = doc.add_paragraph()
|
# Non-standard line: plain text
|
||||||
run = p.add_run(line)
|
run = p.add_run(line_text)
|
||||||
|
run.underline = False
|
||||||
run.font.name = "Courier"
|
run.font.name = "Courier"
|
||||||
run.font.size = Pt(12)
|
run.font.size = Pt(12)
|
||||||
|
|
||||||
|
|
||||||
def _add_summary_content(doc, text):
|
|
||||||
"""
|
|
||||||
Add summary content with heading styles.
|
|
||||||
"""
|
|
||||||
heading_count = 0
|
|
||||||
for line in text.splitlines():
|
|
||||||
stripped = line.strip()
|
|
||||||
if not stripped:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Detect markdown-style headings: #, ##, ###, #### at start of line
|
|
||||||
m = re.match(r"^(#{1,4})\s+(.*)", stripped)
|
|
||||||
if m:
|
|
||||||
heading_count += 1
|
|
||||||
content = m.group(2).strip()
|
|
||||||
|
|
||||||
p = doc.add_paragraph()
|
|
||||||
p.paragraph_format.space_after = Pt(4)
|
|
||||||
|
|
||||||
run = p.add_run(content)
|
|
||||||
run.font.name = "Courier"
|
|
||||||
run.font.size = Pt(12)
|
|
||||||
|
|
||||||
# Apply formatting based on this heading's ordinal position
|
|
||||||
if heading_count == 1:
|
|
||||||
run.bold = True
|
|
||||||
elif heading_count == 2:
|
|
||||||
run.italic = True
|
|
||||||
elif heading_count == 3:
|
|
||||||
run.underline = True
|
|
||||||
elif heading_count >= 4:
|
|
||||||
run.italic = True
|
|
||||||
run.underline = True
|
|
||||||
else:
|
|
||||||
# Normal text line
|
|
||||||
p = doc.add_paragraph(stripped)
|
|
||||||
p.paragraph_format.space_after = Pt(4)
|
|
||||||
|
|
||||||
|
|
||||||
# ------------ Public DOCX functions ------------
|
# ------------ Public DOCX functions ------------
|
||||||
|
|
||||||
def create_transcript_docx(
|
def create_transcript_docx(text: str, filename: str):
|
||||||
text: str,
|
|
||||||
filename: str,
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Create a .docx transcript with:
|
Create a transcript DOCX with:
|
||||||
- 1" margins on all sides
|
- 1" margins on all sides
|
||||||
- 12pt Courier
|
- 12pt Courier font
|
||||||
- Continuous line numbering on the left
|
- Line numbering starting at 1 on the first page
|
||||||
- No cover page (standalone transcript only).
|
- Proper formatting for timestamps and speaker labels
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
_configure_base_font(doc)
|
|
||||||
_configure_section_margins(doc)
|
|
||||||
|
|
||||||
# Fully clear default paragraphs so Word's line numbering starts at line 1
|
# Set base font (Normal style)
|
||||||
|
style = doc.styles["Normal"]
|
||||||
|
style.font.name = "Courier"
|
||||||
|
style.font.size = Pt(12)
|
||||||
|
|
||||||
|
# Remove any default paragraphs (ensure no phantom first line)
|
||||||
body = doc.element.body
|
body = doc.element.body
|
||||||
for p in body.findall(qn('w:p')):
|
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||||
body.remove(p)
|
body.remove(p)
|
||||||
|
|
||||||
# Enable line numbering for transcript section
|
# Configure section properties (margins, line numbering, grid)
|
||||||
_enable_line_numbering(doc.sections[0])
|
_create_transcript_section_properties(doc.sections[0])
|
||||||
|
|
||||||
# Transcript content (with line numbering)
|
# Add transcript lines
|
||||||
_add_transcript_content(doc, text)
|
lines = text.strip().splitlines()
|
||||||
|
for line in lines:
|
||||||
|
_add_transcript_paragraph(doc, line)
|
||||||
|
|
||||||
|
# Save
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|
||||||
|
|
||||||
def create_summary_docx(
|
def create_summary_docx(text: str, filename: str):
|
||||||
text: str,
|
|
||||||
filename: str,
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Create a .docx summary with consistent font and heading styles.
|
Create a summary DOCX with:
|
||||||
No cover page, no line numbering.
|
- 1" margins on all sides
|
||||||
|
- 12pt Courier font
|
||||||
|
- No line numbering
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
doc = Document()
|
||||||
_configure_base_font(doc)
|
|
||||||
_configure_section_margins(doc)
|
|
||||||
|
|
||||||
# Summary content (no line numbering)
|
# Base font
|
||||||
_add_summary_content(doc, text)
|
style = doc.styles["Normal"]
|
||||||
|
style.font.name = "Courier"
|
||||||
|
style.font.size = Pt(12)
|
||||||
|
|
||||||
|
# Margins: 1 inch all sides
|
||||||
|
for section in doc.sections:
|
||||||
|
section.left_margin = Inches(1.0)
|
||||||
|
section.right_margin = Inches(1.0)
|
||||||
|
section.top_margin = Inches(1.0)
|
||||||
|
section.bottom_margin = Inches(1.0)
|
||||||
|
|
||||||
|
# Remove default paragraph
|
||||||
|
body = doc.element.body
|
||||||
|
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||||
|
body.remove(p)
|
||||||
|
|
||||||
|
# Add summary content
|
||||||
|
lines = text.strip().splitlines()
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
p = doc.add_paragraph(line)
|
||||||
|
p.paragraph_format.space_after = Pt(4)
|
||||||
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|||||||
Reference in New Issue
Block a user