Files
scribe/scraibe/docx_styles.py
T
admin 7a31be9de5
Mirror and run GitLab CI / build (push) Waiting to run
Ruff / ruff (push) Waiting to run
Improve summary prompt, add markdown-to-DOCX styling, and add cover pages
- Configurable summary prompts via ENV or file; stronger default prompt.
- New docx_styles.py: converts markdown (headings, bullets, bold/italic) to DOCX.
- Updated create_summary_docx to use markdown-aware styling.
- New docx_cover.py: reusable cover page for transcript and summary.
- Cover pages enabled when COVER_PAGE_ENABLED=true.
2026-06-19 17:16:46 +00:00

150 lines
4.3 KiB
Python

"""
Utility module for applying styles and converting simple markdown
into styled DOCX paragraphs/runs for summaries.
"""
import re
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
    """
    Return the paragraph style called ``name``, creating it first if needed.

    A style that already exists in ``doc`` is returned untouched;
    ``font_name``, ``font_size`` and ``based_on`` are applied only to a
    style created by this call. A falsy ``based_on`` leaves the new style
    without an explicit base style.
    """
    registry = doc.styles
    already_defined = any(existing.name == name for existing in registry)
    if not already_defined:
        created = registry.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
        created.font.name = font_name
        created.font.size = font_size
        if based_on:
            created.base_style = registry[based_on]
    return registry[name]
def apply_heading_style(paragraph, level: int):
    """
    Apply the summary heading style for ``level`` to ``paragraph``.

    Level 1 uses "SummaryHeading1" (16 pt), level 2 uses "SummaryHeading2"
    (14 pt) and every other level uses "SummaryHeading3" (12 pt). The style
    is looked up in, or added to, the paragraph's document via
    ``_ensure_style``. The paragraph also gets 4 pt of space before and
    2 pt of space after.
    """
    if level == 1:
        style_name, size = "SummaryHeading1", Pt(16)
    elif level == 2:
        style_name, size = "SummaryHeading2", Pt(14)
    else:
        style_name, size = "SummaryHeading3", Pt(12)

    # A python-docx Paragraph has no ``document`` attribute, so reading it
    # raises AttributeError; the owning document is reached through the
    # paragraph's part instead. An object that does provide ``document``
    # keeps working as before.
    doc = getattr(paragraph, "document", None)
    if doc is None:
        doc = paragraph.part.document

    paragraph.style = _ensure_style(doc, style_name, font_size=size)
    spacing = paragraph.paragraph_format
    spacing.space_before = Pt(4)
    spacing.space_after = Pt(2)
def apply_bullet_style(paragraph):
    """
    Apply the "SummaryBullet" paragraph style and a left tab stop.

    The style is looked up in, or added to, the paragraph's document via
    ``_ensure_style``. A left-aligned tab stop at position 360 (``w:pos`` is
    measured in twentieths of a point) is then written into the paragraph
    properties. Calling this again on the same paragraph does not add a
    second ``<w:tabs>`` container or a second stop at that position.
    """
    # A python-docx Paragraph has no ``document`` attribute, so reading it
    # raises AttributeError; the owning document is reached through the
    # paragraph's part instead. An object that does provide ``document``
    # keeps working as before.
    doc = getattr(paragraph, "document", None)
    if doc is None:
        doc = paragraph.part.document

    paragraph.style = _ensure_style(doc, "SummaryBullet")

    pPr = paragraph._p.get_or_add_pPr()
    # Reuse an existing container rather than appending another one.
    tabs = pPr.find(qn("w:tabs"))
    if tabs is None:
        tabs = OxmlElement("w:tabs")
        pPr.append(tabs)
    if not any(stop.get(qn("w:pos")) == "360" for stop in tabs):
        tab = OxmlElement("w:tab")
        tab.set(qn("w:val"), "left")
        tab.set(qn("w:pos"), "360")
        tabs.append(tab)
def parse_simple_md_to_paragraphs(doc, text: str):
    """
    Convert simple markdown text into styled paragraphs appended to ``doc``.

    Each non-blank line becomes exactly one paragraph; blank lines are
    skipped. Supported constructs:
    - ``#`` to ``######`` headings (levels above 3 use the level-3 style)
    - ``-`` / ``*`` bullet items, written as a bullet character followed by
      a tab and the item text
    - ``**bold**`` and ``*italic*`` inside any line

    ``None`` or an empty ``text`` adds nothing. This is intentionally simple
    and robust for legal/business summaries.
    """
    for raw_line in (text or "").splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Headings: deeper levels are accepted so that "#### Title" is not
        # emitted as literal text, then clamped to the three styles on offer.
        heading_match = re.match(r"^(#{1,6})\s+(.*)", stripped)
        if heading_match:
            level = min(len(heading_match.group(1)), 3)
            paragraph = doc.add_paragraph()
            apply_heading_style(paragraph, level)
            _add_run_with_inline_md(paragraph, heading_match.group(2).strip())
            continue

        # Bullet list: the markdown marker is dropped by the regex, so a
        # visible bullet plus a tab is written in its place; otherwise the
        # item would be indistinguishable from a normal paragraph.
        bullet_match = re.match(r"^[-*]\s+(.*)", stripped)
        if bullet_match:
            paragraph = doc.add_paragraph()
            apply_bullet_style(paragraph)
            _add_run_with_inline_md(paragraph, "\u2022\t")
            _add_run_with_inline_md(paragraph, bullet_match.group(1).strip())
            continue

        # Normal paragraph
        paragraph = doc.add_paragraph()
        _add_run_with_inline_md(paragraph, stripped)
def _add_run_with_inline_md(paragraph, text: str):
    """
    Add runs to a paragraph, interpreting **bold** and *italic*.

    ``text`` is split on ``***x***``, ``**x**`` and ``*x*`` spans. Each
    non-empty piece becomes one run with the markers removed: bold for a
    ``**`` wrapper, italic for a ``*`` wrapper, both for ``***``. Pieces
    without markers are added unchanged.

    Every run gets an explicit font. Run-level font settings take precedence
    over the paragraph style, so the font name and size defined by the
    paragraph's style are used when present; "Courier" at 12 pt is the
    fallback when the style leaves them unset. Forcing 12 pt unconditionally
    would flatten the larger sizes of the heading styles.
    """
    style_font = getattr(getattr(paragraph, "style", None), "font", None)
    font_name = getattr(style_font, "name", None) or "Courier"
    font_size = getattr(style_font, "size", None) or Pt(12)

    for part in re.split(r"(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*)", text):
        if not part:
            continue

        # Peel the bold wrapper first: "***x***" matches it with "*x*" as the
        # content, which the italic check below then unwraps as well.
        bold_match = re.fullmatch(r"\*\*(.+?)\*\*", part)
        if bold_match:
            part = bold_match.group(1)
        italic_match = re.fullmatch(r"\*(.+?)\*", part)
        if italic_match:
            part = italic_match.group(1)

        # Create the run once, with its final (marker-free) text.
        run = paragraph.add_run(part)
        run.font.name = font_name
        run.font.size = font_size
        # Only switch emphasis on; leaving it unset lets the run inherit.
        if bold_match:
            run.bold = True
        if italic_match:
            run.italic = True