Rewrite transcript DOCX logic for correctness
Mirror and run GitLab CI / build (push) Waiting to run
Ruff / ruff (push) Waiting to run

- Prepare transcript into pages of 29 lines each before writing.
- Each line is at most 60 characters in total (up to 48 characters of content, plus the line number and spacing).
- Words preserved (no clipping); wrap at word boundaries.
- Page break after every 29 lines.
- No distinction between logical and visual lines: every wrapped line is numbered and counts toward the 29-line page.
This commit is contained in:
admin
2026-06-19 16:07:20 +00:00
parent 49f3cdc407
commit 2112b8c7e2
+66 -119
View File
@@ -444,144 +444,92 @@ def create_transcript_docx(text: str, filename: str):
Create a transcript DOCX with: Create a transcript DOCX with:
- 1" margins on all sides - 1" margins on all sides
- 12pt Courier font - 12pt Courier font
- Embedded line numbers starting at 1 on the first page - Each page has exactly 29 numbered lines of text
(portable across Word, LibreOffice, Google Docs) - Max 60 characters per line (including number and spaces)
- Line numbers reflect visual lines on the page, not speaker turns. - Words preserved (no clipping or omission)
- Proper formatting for timestamps and speaker labels - Blank spacing between number and text preserved
- Page break after every 29 lines
- Centered footer: "X of Y"
""" """
doc = Document() # Step 1: Prepare transcript into pages of 29 lines each
# Each line <= 60 chars total, words preserved, no clipping
# Structure: nested list of paragraphs (pages -> lines)
prepared_pages = []
current_page = []
line_count = 0
# Set base font (Normal style) # 48 chars content + 2 digits + 1 tab + 9 spaces = 60 max
MAX_CONTENT_LEN = 48
for raw_line in text.strip().splitlines():
raw_line = raw_line.strip()
if not raw_line:
continue
# Wrap into segments without clipping words
words = raw_line.split()
segments = []
current = ""
for w in words:
if not current:
current = w
elif len(current) + 1 + len(w) <= MAX_CONTENT_LEN:
current += " " + w
else:
segments.append(current)
current = w
if current:
segments.append(current)
# Add segments to pages, enforcing 29 lines per page
for seg in segments:
if line_count == 29:
prepared_pages.append(current_page)
current_page = []
line_count = 0
current_page.append(seg)
line_count += 1
if current_page:
prepared_pages.append(current_page)
# Step 2: Create DOCX
doc = Document()
style = doc.styles["Normal"] style = doc.styles["Normal"]
style.font.name = "Courier" style.font.name = "Courier"
style.font.size = Pt(12) style.font.size = Pt(12)
# Remove any default paragraphs (ensure no phantom first line)
body = doc.element.body body = doc.element.body
for p in list(body.findall(f"{{{W_NS}}}p")): for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p) body.remove(p)
# Configure section properties (margins, no built-in line numbering)
_create_transcript_section_properties(doc.sections[0]) _create_transcript_section_properties(doc.sections[0])
# Max characters per visual line (content only; total line including line number and spaces <= 60) # Step 3: Write prepared pages into DOCX
max_chars = 58 for page_idx, page_lines in enumerate(prepared_pages):
# Insert page break between pages
if page_idx > 0:
p_break = doc.add_paragraph()
pPr = p_break._p.get_or_add_pPr()
for child in list(pPr):
tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
if tag in ("tabs", "spacing", "ind"):
pPr.remove(child)
page_break = OxmlElement("w:pageBreak")
page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
pPr.append(page_break)
# Lines per page before restarting numbering # Write each line with its number (1-29)
lines_per_page = 29 for line_num, line_text in enumerate(page_lines, start=1):
_add_transcript_paragraph(doc, line_text, line_number=line_num)
# Current line counter for visual lines # Step 4: Add footer: "X of Y" centered
line_number = 0
# Split transcript into logical lines
logical_lines = text.strip().splitlines()
def insert_page_break():
nonlocal line_number
p_break = doc.add_paragraph()
pPr = p_break._p.get_or_add_pPr()
for child in list(pPr):
tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag
if tag in ("tabs", "spacing", "ind"):
pPr.remove(child)
page_break = OxmlElement("w:pageBreak")
page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
pPr.append(page_break)
line_number = 0
for line in logical_lines:
line = line.strip()
if not line:
continue
# Try to match: [00:00] SPEAKER 1: content
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
if m:
ts, speaker, content = m.groups()
label_text = f"[{ts}] {speaker.upper()}:"
content = content.strip()
else:
label_text = ""
content = line.strip()
# Wrap content into visual lines (whole words, max_chars)
content_lines = []
words = content.split()
current = ""
for w in words:
if len(current) == 0:
current = w
elif len(current) + 1 + len(w) <= max_chars:
current += " " + w
else:
content_lines.append(current)
current = w
if current:
content_lines.append(current)
# Prepare first visual line with label if present
visual_lines = []
if content_lines:
first_content = content_lines.pop(0)
if label_text:
prefix = label_text + " "
if len(prefix) + len(first_content) > max_chars:
allowed = max_chars - len(prefix)
if allowed < 1:
allowed = 1
candidate = first_content[:allowed]
last_space = candidate.rfind(" ")
if last_space > 0:
kept = candidate[:last_space]
rest = first_content[last_space:].strip()
else:
kept = candidate
rest = first_content[allowed:].strip()
visual_lines.append(prefix + kept)
if rest:
extra_words = rest.split()
new_lines = []
buf = ""
for ew in extra_words:
if len(buf) == 0:
buf = ew
elif len(buf) + 1 + len(ew) <= max_chars:
buf += " " + ew
else:
new_lines.append(buf)
buf = ew
if buf:
new_lines.append(buf)
content_lines = new_lines + content_lines
else:
visual_lines.append(prefix + first_content)
else:
visual_lines.append(first_content)
visual_lines.extend(content_lines)
else:
# No content_lines but maybe label_text only
if label_text:
visual_lines.append(label_text)
# If adding these visual lines would exceed lines_per_page, insert page break first
if line_number + len(visual_lines) > lines_per_page:
insert_page_break()
# Write visual lines
for vl in visual_lines:
line_number += 1
_add_transcript_paragraph(doc, vl, line_number=line_number)
# Add page numbers to footer: "X of Y" (bottom left)
section = doc.sections[0] section = doc.sections[0]
footer = section.footer footer = section.footer
footer.is_linked_to_previous = False footer.is_linked_to_previous = False
footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph() footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
# Clear any existing content
for r in footer_para.runs: for r in footer_para.runs:
r.text = "" r.text = ""
@@ -607,7 +555,6 @@ def create_transcript_docx(text: str, filename: str):
run_total = footer_para.add_run() run_total = footer_para.add_run()
add_field(run_total, " NUMPAGES ") add_field(run_total, " NUMPAGES ")
# Save
doc.save(filename) doc.save(filename)