Rewrite transcript DOCX logic for correctness
- Prepare the transcript into pages of 29 lines each before writing.
- Limit each line to 60 characters total (48 of content plus the line number and spacing).
- Preserve whole words (no clipping); wrap at word boundaries.
- Insert a page break after every 29 lines.
- Make no distinction between logical and visual lines.
This commit is contained in:
+57
-110
@@ -444,40 +444,71 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
Create a transcript DOCX with:
|
Create a transcript DOCX with:
|
||||||
- 1" margins on all sides
|
- 1" margins on all sides
|
||||||
- 12pt Courier font
|
- 12pt Courier font
|
||||||
- Embedded line numbers starting at 1 on the first page
|
- Each page has exactly 29 numbered lines of text
|
||||||
(portable across Word, LibreOffice, Google Docs)
|
- Max 60 characters per line (including number and spaces)
|
||||||
- Line numbers reflect visual lines on the page, not speaker turns.
|
- Words preserved (no clipping or omission)
|
||||||
- Proper formatting for timestamps and speaker labels
|
- Blank spacing between number and text preserved
|
||||||
|
- Page break after every 29 lines
|
||||||
|
- Centered footer: "X of Y"
|
||||||
"""
|
"""
|
||||||
doc = Document()
|
# Step 1: Prepare transcript into pages of 29 lines each
|
||||||
|
# Each line <= 60 chars total, words preserved, no clipping
|
||||||
|
# Structure: nested list of paragraphs (pages -> lines)
|
||||||
|
prepared_pages = []
|
||||||
|
current_page = []
|
||||||
|
line_count = 0
|
||||||
|
|
||||||
# Set base font (Normal style)
|
# 48 chars content + 2 digits + 1 tab + 9 spaces = 60 max
|
||||||
|
MAX_CONTENT_LEN = 48
|
||||||
|
|
||||||
|
for raw_line in text.strip().splitlines():
|
||||||
|
raw_line = raw_line.strip()
|
||||||
|
if not raw_line:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Wrap into segments without clipping words
|
||||||
|
words = raw_line.split()
|
||||||
|
segments = []
|
||||||
|
current = ""
|
||||||
|
for w in words:
|
||||||
|
if not current:
|
||||||
|
current = w
|
||||||
|
elif len(current) + 1 + len(w) <= MAX_CONTENT_LEN:
|
||||||
|
current += " " + w
|
||||||
|
else:
|
||||||
|
segments.append(current)
|
||||||
|
current = w
|
||||||
|
if current:
|
||||||
|
segments.append(current)
|
||||||
|
|
||||||
|
# Add segments to pages, enforcing 29 lines per page
|
||||||
|
for seg in segments:
|
||||||
|
if line_count == 29:
|
||||||
|
prepared_pages.append(current_page)
|
||||||
|
current_page = []
|
||||||
|
line_count = 0
|
||||||
|
current_page.append(seg)
|
||||||
|
line_count += 1
|
||||||
|
|
||||||
|
if current_page:
|
||||||
|
prepared_pages.append(current_page)
|
||||||
|
|
||||||
|
# Step 2: Create DOCX
|
||||||
|
doc = Document()
|
||||||
style = doc.styles["Normal"]
|
style = doc.styles["Normal"]
|
||||||
style.font.name = "Courier"
|
style.font.name = "Courier"
|
||||||
style.font.size = Pt(12)
|
style.font.size = Pt(12)
|
||||||
|
|
||||||
# Remove any default paragraphs (ensure no phantom first line)
|
|
||||||
body = doc.element.body
|
body = doc.element.body
|
||||||
for p in list(body.findall(f"{{{W_NS}}}p")):
|
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||||
body.remove(p)
|
body.remove(p)
|
||||||
|
|
||||||
# Configure section properties (margins, no built-in line numbering)
|
|
||||||
_create_transcript_section_properties(doc.sections[0])
|
_create_transcript_section_properties(doc.sections[0])
|
||||||
|
|
||||||
# Max characters per visual line (content only; total line including line number and spaces <= 60)
|
# Step 3: Write prepared pages into DOCX
|
||||||
max_chars = 58
|
for page_idx, page_lines in enumerate(prepared_pages):
|
||||||
|
# Insert page break between pages
|
||||||
# Lines per page before restarting numbering
|
if page_idx > 0:
|
||||||
lines_per_page = 29
|
|
||||||
|
|
||||||
# Current line counter for visual lines
|
|
||||||
line_number = 0
|
|
||||||
|
|
||||||
# Split transcript into logical lines
|
|
||||||
logical_lines = text.strip().splitlines()
|
|
||||||
|
|
||||||
def insert_page_break():
|
|
||||||
nonlocal line_number
|
|
||||||
p_break = doc.add_paragraph()
|
p_break = doc.add_paragraph()
|
||||||
pPr = p_break._p.get_or_add_pPr()
|
pPr = p_break._p.get_or_add_pPr()
|
||||||
for child in list(pPr):
|
for child in list(pPr):
|
||||||
@@ -487,101 +518,18 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
page_break = OxmlElement("w:pageBreak")
|
page_break = OxmlElement("w:pageBreak")
|
||||||
page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
|
page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1")
|
||||||
pPr.append(page_break)
|
pPr.append(page_break)
|
||||||
line_number = 0
|
|
||||||
|
|
||||||
for line in logical_lines:
|
# Write each line with its number (1-29)
|
||||||
line = line.strip()
|
for line_num, line_text in enumerate(page_lines, start=1):
|
||||||
if not line:
|
_add_transcript_paragraph(doc, line_text, line_number=line_num)
|
||||||
continue
|
|
||||||
|
|
||||||
# Try to match: [00:00] SPEAKER 1: content
|
# Step 4: Add footer: "X of Y" centered
|
||||||
m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line)
|
|
||||||
if m:
|
|
||||||
ts, speaker, content = m.groups()
|
|
||||||
label_text = f"[{ts}] {speaker.upper()}:"
|
|
||||||
content = content.strip()
|
|
||||||
else:
|
|
||||||
label_text = ""
|
|
||||||
content = line.strip()
|
|
||||||
|
|
||||||
# Wrap content into visual lines (whole words, max_chars)
|
|
||||||
content_lines = []
|
|
||||||
words = content.split()
|
|
||||||
current = ""
|
|
||||||
for w in words:
|
|
||||||
if len(current) == 0:
|
|
||||||
current = w
|
|
||||||
elif len(current) + 1 + len(w) <= max_chars:
|
|
||||||
current += " " + w
|
|
||||||
else:
|
|
||||||
content_lines.append(current)
|
|
||||||
current = w
|
|
||||||
if current:
|
|
||||||
content_lines.append(current)
|
|
||||||
|
|
||||||
# Prepare first visual line with label if present
|
|
||||||
visual_lines = []
|
|
||||||
if content_lines:
|
|
||||||
first_content = content_lines.pop(0)
|
|
||||||
if label_text:
|
|
||||||
prefix = label_text + " "
|
|
||||||
if len(prefix) + len(first_content) > max_chars:
|
|
||||||
allowed = max_chars - len(prefix)
|
|
||||||
if allowed < 1:
|
|
||||||
allowed = 1
|
|
||||||
candidate = first_content[:allowed]
|
|
||||||
last_space = candidate.rfind(" ")
|
|
||||||
if last_space > 0:
|
|
||||||
kept = candidate[:last_space]
|
|
||||||
rest = first_content[last_space:].strip()
|
|
||||||
else:
|
|
||||||
kept = candidate
|
|
||||||
rest = first_content[allowed:].strip()
|
|
||||||
|
|
||||||
visual_lines.append(prefix + kept)
|
|
||||||
if rest:
|
|
||||||
extra_words = rest.split()
|
|
||||||
new_lines = []
|
|
||||||
buf = ""
|
|
||||||
for ew in extra_words:
|
|
||||||
if len(buf) == 0:
|
|
||||||
buf = ew
|
|
||||||
elif len(buf) + 1 + len(ew) <= max_chars:
|
|
||||||
buf += " " + ew
|
|
||||||
else:
|
|
||||||
new_lines.append(buf)
|
|
||||||
buf = ew
|
|
||||||
if buf:
|
|
||||||
new_lines.append(buf)
|
|
||||||
content_lines = new_lines + content_lines
|
|
||||||
else:
|
|
||||||
visual_lines.append(prefix + first_content)
|
|
||||||
else:
|
|
||||||
visual_lines.append(first_content)
|
|
||||||
|
|
||||||
visual_lines.extend(content_lines)
|
|
||||||
else:
|
|
||||||
# No content_lines but maybe label_text only
|
|
||||||
if label_text:
|
|
||||||
visual_lines.append(label_text)
|
|
||||||
|
|
||||||
# If adding these visual lines would exceed lines_per_page, insert page break first
|
|
||||||
if line_number + len(visual_lines) > lines_per_page:
|
|
||||||
insert_page_break()
|
|
||||||
|
|
||||||
# Write visual lines
|
|
||||||
for vl in visual_lines:
|
|
||||||
line_number += 1
|
|
||||||
_add_transcript_paragraph(doc, vl, line_number=line_number)
|
|
||||||
|
|
||||||
# Add page numbers to footer: "X of Y" (bottom left)
|
|
||||||
section = doc.sections[0]
|
section = doc.sections[0]
|
||||||
footer = section.footer
|
footer = section.footer
|
||||||
footer.is_linked_to_previous = False
|
footer.is_linked_to_previous = False
|
||||||
footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
|
footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
|
||||||
footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
|
||||||
|
|
||||||
# Clear any existing content
|
|
||||||
for r in footer_para.runs:
|
for r in footer_para.runs:
|
||||||
r.text = ""
|
r.text = ""
|
||||||
|
|
||||||
@@ -607,7 +555,6 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
run_total = footer_para.add_run()
|
run_total = footer_para.add_run()
|
||||||
add_field(run_total, " NUMPAGES ")
|
add_field(run_total, " NUMPAGES ")
|
||||||
|
|
||||||
# Save
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user