# ------------ DOCX helpers ------------

# WordprocessingML main namespace (the "w:" prefix used in document.xml).
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

# Typography shared by the Normal style and by every transcript run.
_FONT_NAME = "Courier"
_FONT_SIZE_PT = 12

# "[MM:SS] Speaker: content" or "[HH:MM:SS] Speaker: content".
_TRANSCRIPT_LINE_PATTERN = r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)"

# Children of <w:sectPr> that must come AFTER <w:lnNumType> / <w:pgMar>.
# New elements are inserted before the first of these that is present so the
# section properties stay in schema order instead of being appended at the end.
_AFTER_LN_NUM_TYPE = (
    "pgNumType",
    "cols",
    "formProt",
    "vAlign",
    "noEndnote",
    "titlePg",
    "textDirection",
    "bidi",
    "rtlGutter",
    "docGrid",
    "printerSettings",
    "sectPrChange",
)
_AFTER_PG_MAR = ("paperSrc", "pgBorders", "lnNumType") + _AFTER_LN_NUM_TYPE

# Page margins in twips (1 inch = 1440 twips).
_TRANSCRIPT_PAGE_MARGINS = (
    ("top", 1440),
    ("right", 1440),
    ("bottom", 1440),
    ("left", 1440),
    ("header", 720),
    ("footer", 720),
    ("gutter", 0),
)


def _w(tag):
    """Return *tag* qualified with the WordprocessingML namespace."""
    return f"{{{W_NS}}}{tag}"


def _set_element_attr(elem, attr, value):
    """Set the ``w:``-namespaced attribute *attr* on *elem* to ``str(value)``."""
    elem.set(_w(attr), str(value))


def _get_or_insert_child(parent, tag, successors):
    """
    Return the ``w:<tag>`` child of *parent*, creating it if it is missing.

    An existing child is reused, so repeated calls never produce duplicates.
    A new child is inserted before the first existing sibling named in
    *successors* (local tag names), or appended when none of them is present.
    """
    child = parent.find(_w(tag))
    # Compare against None explicitly: a childless XML element is falsy, so
    # ``find(...) or create()`` would throw away an element that exists.
    if child is not None:
        return child

    child = OxmlElement(f"w:{tag}")
    successor_tags = {_w(name) for name in successors}
    for index, sibling in enumerate(parent):
        if sibling.tag in successor_tags:
            parent.insert(index, child)
            break
    else:
        parent.append(child)
    return child


def _create_transcript_section_properties(section):
    """
    Configure the section properties for transcript DOCX:
    - Margins: 1 inch all sides (header/footer 0.5 inch, no gutter)
    - Line numbering: start=1, countBy=1, restart each page
    - Document grid type overridden when a grid element is present

    Existing ``pgMar`` / ``lnNumType`` elements are updated in place; missing
    ones are created at their schema position inside ``<w:sectPr>``.
    """
    sectPr = section._sectPr

    pgMar = _get_or_insert_child(sectPr, "pgMar", _AFTER_PG_MAR)
    for attr, twips in _TRANSCRIPT_PAGE_MARGINS:
        _set_element_attr(pgMar, attr, twips)

    # Disable document grid to avoid Word counting phantom grid lines.
    # NOTE(review): "none" does not appear among the ST_DocGrid values in
    # ECMA-376 (default, lines, linesAndChars, snapToChars); the value is kept
    # unchanged here to preserve current output -- confirm against Word.
    docGrid = sectPr.find(_w("docGrid"))
    if docGrid is not None:
        _set_element_attr(docGrid, "type", "none")

    lnNumType = _get_or_insert_child(sectPr, "lnNumType", _AFTER_LN_NUM_TYPE)
    _set_element_attr(lnNumType, "start", "1")
    _set_element_attr(lnNumType, "countBy", "1")
    _set_element_attr(lnNumType, "restart", "newPage")


def _apply_base_font(doc):
    """Set the document's Normal style to the shared transcript font."""
    normal = doc.styles["Normal"]
    normal.font.name = _FONT_NAME
    normal.font.size = Pt(_FONT_SIZE_PT)


def _remove_body_paragraphs(doc):
    """Remove every top-level paragraph so no phantom first line remains."""
    body = doc.element.body
    # findall() returns a list, so removing while looping over it is safe.
    for paragraph in body.findall(_w("p")):
        body.remove(paragraph)


def _add_courier_run(paragraph, text, underline=False):
    """Append *text* to *paragraph* as a run in the transcript font."""
    run = paragraph.add_run(text)
    run.underline = underline
    run.font.name = _FONT_NAME
    run.font.size = Pt(_FONT_SIZE_PT)
    return run


def _add_transcript_paragraph(doc, line_text):
    """
    Add a single transcript line as a paragraph.

    Blank lines are skipped. A line of the form ``[timestamp] speaker: text``
    gets an underlined, upper-cased ``[timestamp] SPEAKER:`` label followed by
    the text; any other line is added as plain text.
    """
    line_text = line_text.strip()
    if not line_text:
        return

    p = doc.add_paragraph()
    p.paragraph_format.left_indent = Inches(0.25)

    m = re.match(_TRANSCRIPT_LINE_PATTERN, line_text)
    if m is None:
        # Non-standard line: plain text.
        _add_courier_run(p, line_text)
        return

    ts, speaker, content = m.groups()
    _add_courier_run(p, f"[{ts}] {speaker.upper()}:", underline=True)
    # Separate run for the space so the underline stops at the colon.
    _add_courier_run(p, " ")
    _add_courier_run(p, content.strip())


# ------------ Public DOCX functions ------------

def create_transcript_docx(text: str, filename: str) -> None:
    """
    Create a transcript DOCX with:
    - 1" margins on all sides
    - 12pt Courier font
    - Line numbering starting at 1 and restarting on each page
    - Underlined timestamp/speaker labels

    ``None`` or empty *text* yields a document with no transcript lines.
    """
    doc = Document()
    _apply_base_font(doc)
    _remove_body_paragraphs(doc)

    # Margins, line numbering and grid for the single default section.
    _create_transcript_section_properties(doc.sections[0])

    for line in (text or "").splitlines():
        _add_transcript_paragraph(doc, line)

    doc.save(filename)


def create_summary_docx(text: str, filename: str) -> None:
    """
    Create a summary DOCX with:
    - 1" margins on all sides
    - 12pt Courier font
    - No line numbering

    Each non-blank line of *text* becomes one paragraph. ``None`` or empty
    *text* yields a document with no paragraphs.
    """
    doc = Document()
    _apply_base_font(doc)

    for section in doc.sections:
        section.left_margin = Inches(1.0)
        section.right_margin = Inches(1.0)
        section.top_margin = Inches(1.0)
        section.bottom_margin = Inches(1.0)

    _remove_body_paragraphs(doc)

    for line in (text or "").splitlines():
        line = line.strip()
        if not line:
            continue
        p = doc.add_paragraph(line)
        p.paragraph_format.space_after = Pt(4)

    doc.save(filename)