From 2112b8c7e24df5e726c0e7f3972506388b057441 Mon Sep 17 00:00:00 2001 From: admin Date: Fri, 19 Jun 2026 16:07:20 +0000 Subject: [PATCH] Rewrite transcript DOCX logic for correctness - Prepare transcript into pages of 29 lines each before writing. - Each line max 60 chars total (48 content + number + spaces). - Words preserved (no clipping); wrap at word boundaries. - Page break after every 29 lines. - No distinction between logical/visual lines. --- scraibe/email_sender.py | 185 ++++++++++++++-------------------------- 1 file changed, 66 insertions(+), 119 deletions(-) diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index 2277186..72b7db7 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -444,144 +444,92 @@ def create_transcript_docx(text: str, filename: str): Create a transcript DOCX with: - 1" margins on all sides - 12pt Courier font - - Embedded line numbers starting at 1 on the first page - (portable across Word, LibreOffice, Google Docs) - - Line numbers reflect visual lines on the page, not speaker turns. - - Proper formatting for timestamps and speaker labels + - Each page has exactly 29 numbered lines of text + - Max 60 characters per line (including number and spaces) + - Words preserved (no clipping or omission) + - Blank spacing between number and text preserved + - Page break after every 29 lines + - Centered footer: "X of Y" """ - doc = Document() + # Step 1: Prepare transcript into pages of 29 lines each + # Each line <= 60 chars total, words preserved, no clipping + # Structure: nested list of paragraphs (pages -> lines) + prepared_pages = [] + current_page = [] + line_count = 0 - # Set base font (Normal style) + # 48 chars content + 2 digits + 1 tab + 9 spaces = 60 max + MAX_CONTENT_LEN = 48 + + for raw_line in text.strip().splitlines(): + raw_line = raw_line.strip() + if not raw_line: + continue + + # Wrap into segments without clipping words + words = raw_line.split() + segments = [] + current = "" + for w in words: + if not current: + current = w + elif len(current) + 1 + len(w) <= MAX_CONTENT_LEN: + current += " " + w + else: + segments.append(current) + current = w + if current: + segments.append(current) + + # Add segments to pages, enforcing 29 lines per page + for seg in segments: + if line_count == 29: + prepared_pages.append(current_page) + current_page = [] + line_count = 0 + current_page.append(seg) + line_count += 1 + + if current_page: + prepared_pages.append(current_page) + + # Step 2: Create DOCX + doc = Document() style = doc.styles["Normal"] style.font.name = "Courier" style.font.size = Pt(12) - # Remove any default paragraphs (ensure no phantom first line) body = doc.element.body for p in list(body.findall(f"{{{W_NS}}}p")): body.remove(p) - # Configure section properties (margins, no built-in line numbering) _create_transcript_section_properties(doc.sections[0]) - # Max characters per visual line (content only; total line including line number and spaces <= 60) - max_chars = 58 + # Step 3: Write prepared pages into DOCX + for page_idx, page_lines in enumerate(prepared_pages): + # Insert page break between pages + if page_idx > 0: + p_break = doc.add_paragraph() + pPr = p_break._p.get_or_add_pPr() + for child in list(pPr): + tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag + if tag in ("tabs", "spacing", "ind"): + pPr.remove(child) + page_break = OxmlElement("w:pageBreak") + page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1") + pPr.append(page_break) - # Lines per page before restarting numbering - lines_per_page = 29 + # Write each line with its number (1-29) + for line_num, line_text in enumerate(page_lines, start=1): + _add_transcript_paragraph(doc, line_text, line_number=line_num) - # Current line counter for visual lines - line_number = 0 - - # Split transcript into logical lines - logical_lines = text.strip().splitlines() - - def insert_page_break(): - nonlocal line_number - p_break = doc.add_paragraph() - pPr = p_break._p.get_or_add_pPr() - for child in list(pPr): - tag = child.tag.split("}")[-1] if "}" in child.tag else child.tag - if tag in ("tabs", "spacing", "ind"): - pPr.remove(child) - page_break = OxmlElement("w:pageBreak") - page_break.set("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val", "1") - pPr.append(page_break) - line_number = 0 - - for line in logical_lines: - line = line.strip() - if not line: - continue - - # Try to match: [00:00] SPEAKER 1: content - m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line) - if m: - ts, speaker, content = m.groups() - label_text = f"[{ts}] {speaker.upper()}:" - content = content.strip() - else: - label_text = "" - content = line.strip() - - # Wrap content into visual lines (whole words, max_chars) - content_lines = [] - words = content.split() - current = "" - for w in words: - if len(current) == 0: - current = w - elif len(current) + 1 + len(w) <= max_chars: - current += " " + w - else: - content_lines.append(current) - current = w - if current: - content_lines.append(current) - - # Prepare first visual line with label if present - visual_lines = [] - if content_lines: - first_content = content_lines.pop(0) - if label_text: - prefix = label_text + " " - if len(prefix) + len(first_content) > max_chars: - allowed = max_chars - len(prefix) - if allowed < 1: - allowed = 1 - candidate = first_content[:allowed] - last_space = candidate.rfind(" ") - if last_space > 0: - kept = candidate[:last_space] - rest = first_content[last_space:].strip() - else: - kept = candidate - rest = first_content[allowed:].strip() - - visual_lines.append(prefix + kept) - if rest: - extra_words = rest.split() - new_lines = [] - buf = "" - for ew in extra_words: - if len(buf) == 0: - buf = ew - elif len(buf) + 1 + len(ew) <= max_chars: - buf += " " + ew - else: - new_lines.append(buf) - buf = ew - if buf: - new_lines.append(buf) - content_lines = new_lines + content_lines - else: - visual_lines.append(prefix + first_content) - else: - visual_lines.append(first_content) - - visual_lines.extend(content_lines) - else: - # No content_lines but maybe label_text only - if label_text: - visual_lines.append(label_text) - - # If adding these visual lines would exceed lines_per_page, insert page break first - if line_number + len(visual_lines) > lines_per_page: - insert_page_break() - - # Write visual lines - for vl in visual_lines: - line_number += 1 - _add_transcript_paragraph(doc, vl, line_number=line_number) - - # Add page numbers to footer: "X of Y" (bottom left) + # Step 4: Add footer: "X of Y" centered section = doc.sections[0] footer = section.footer footer.is_linked_to_previous = False footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph() footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER - # Clear any existing content for r in footer_para.runs: r.text = "" @@ -607,7 +555,6 @@ def create_transcript_docx(text: str, filename: str): run_total = footer_para.add_run() add_field(run_total, " NUMPAGES ") - # Save doc.save(filename)