Improve summary prompt, add markdown-to-DOCX styling, and add cover pages
- Configurable summary prompts via ENV or file; stronger default prompt. - New docx_styles.py: converts markdown (headings, bullets, bold/italic) to DOCX. - Updated create_summary_docx to use markdown-aware styling. - New docx_cover.py: reusable cover page for transcript and summary. - Cover pages enabled when COVER_PAGE_ENABLED=true.
This commit is contained in:
@@ -0,0 +1,118 @@
|
|||||||
|
"""
|
||||||
|
Reusable cover-page generator for transcript and summary DOCX files.
|
||||||
|
|
||||||
|
Configuration (env):
|
||||||
|
- COVER_PAGE_ENABLED: "true"/"false" (default: false)
|
||||||
|
- COVER_PAGE_ORGANIZATION: e.g., "A.P.Strom"
|
||||||
|
- COVER_PAGE_TITLE_PREFIX: e.g., "TRANSCRIPT" or "SUMMARY" (reserved; not read by this module yet, callers pass the title explicitly)
|
||||||
|
- COVER_PAGE_LOGO_URL: optional URL (reserved; no image is inserted yet)
|
||||||
|
- COVER_PAGE_LOGO_PATH: optional local path (reserved; no image is inserted yet)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Pt, Inches
|
||||||
|
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||||
|
from docx.oxml import OxmlElement
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
|
|
||||||
|
def _add_page_break(doc: Document):
    """Append a hard page break to the end of *doc*.

    The break is created through python-docx's ``Document.add_page_break()``,
    which adds a paragraph whose single run holds ``<w:br w:type="page"/>``.

    The earlier implementation appended a hand-built ``<w:pageBreak>``
    element to the paragraph properties. That element is not defined by
    WordprocessingML, so Word cannot be relied on to honour it.
    """
    doc.add_page_break()
|
||||||
|
|
||||||
|
|
||||||
|
def _cover_paragraph(doc, space_after):
    """Append an empty, centred paragraph with the given space after it."""
    paragraph = doc.add_paragraph()
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
    paragraph.paragraph_format.space_after = space_after
    return paragraph


def _cover_lines(paragraph, lines, size):
    """Add every non-empty entry of *lines* as a Courier run, one per line."""
    is_first = True
    for line in lines:
        if not line:
            continue
        if not is_first:
            # python-docx renders "\n" in run text as a line break.
            paragraph.add_run("\n")
        run = paragraph.add_run(str(line))
        run.font.name = "Courier"
        run.font.size = size
        is_first = False


def add_cover_page(
    doc: Document,
    title: str,
    subtitle: Optional[str] = None,
    metadata: Optional[dict] = None,
    include_logo: bool = False,
):
    """
    Append a cover page to *doc*, followed by a page break.

    - title: e.g., "TRANSCRIPT" or "SUMMARY"; written upper-cased, bold, 18 pt
    - subtitle: e.g., "Meeting of 16 June 2026"; skipped when empty
    - metadata: optional dict with keys like:
        - "Organization"
        - "Date"
        - "Prepared by"
        - "Reference"
    - include_logo: when true, an empty centred paragraph is added as a
      placeholder; no image is inserted yet.

    The organization line comes from the COVER_PAGE_ORGANIZATION environment
    variable when it is set and non-blank, otherwise from
    metadata["Organization"]. The environment variable is honoured even when
    *metadata* is None.
    """
    meta = metadata or {}

    # The previous one-liner parsed as `(env or meta_org) if metadata else None`,
    # which threw the env value away whenever no metadata dict was passed.
    org = (
        (os.getenv("COVER_PAGE_ORGANIZATION") or "").strip()
        or meta.get("Organization")
        or ""
    )
    date = meta.get("Date") or ""
    prepared_by = meta.get("Prepared by") or ""
    reference = meta.get("Reference") or ""

    # Title
    title_run = _cover_paragraph(doc, Pt(6)).add_run(title.upper())
    title_run.bold = True
    title_run.font.name = "Courier"
    title_run.font.size = Pt(18)

    # Subtitle
    if subtitle:
        _cover_lines(_cover_paragraph(doc, Pt(12)), [subtitle], Pt(14))

    # Logo placeholder: only vertical space is reserved for now.
    if include_logo:
        _cover_paragraph(doc, Pt(12))

    # Organization / date block
    if org or date or prepared_by or reference:
        _cover_lines(_cover_paragraph(doc, Pt(4)), [org, date], Pt(12))

    # Author / reference block
    if prepared_by or reference:
        _cover_lines(
            _cover_paragraph(doc, Pt(4)),
            [
                f"Prepared by: {prepared_by}" if prepared_by else "",
                f"Reference: {reference}" if reference else "",
            ],
            Pt(11),
        )

    # Page break after cover page
    _add_page_break(doc)
|
||||||
@@ -0,0 +1,149 @@
|
|||||||
|
"""
|
||||||
|
Utility module for applying styles and converting simple markdown
|
||||||
|
into styled DOCX paragraphs/runs for summaries.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from docx import Document
|
||||||
|
from docx.shared import Pt
|
||||||
|
from docx.oxml import OxmlElement
|
||||||
|
from docx.oxml.ns import qn
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
    """
    Return the style called *name* from *doc*, creating it first if needed.

    A newly created style is a paragraph style that gets *font_name* and
    *font_size*, and inherits from *based_on* when that argument is truthy.
    A style that already exists is returned untouched.
    """
    style_table = doc.styles
    already_defined = any(existing.name == name for existing in style_table)

    if not already_defined:
        created = style_table.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
        created.font.name = font_name
        created.font.size = font_size
        if based_on:
            created.base_style = style_table[based_on]

    return style_table[name]
|
||||||
|
|
||||||
|
|
||||||
|
def apply_heading_style(paragraph, level: int):
    """
    Apply a summary heading style to *paragraph*.

    Level 1 uses "SummaryHeading1" (16 pt) and level 2 uses "SummaryHeading2"
    (14 pt). Any other level uses "SummaryHeading3" (12 pt). The style is
    created in the owning document on first use, and the paragraph gets 4 pt
    of space before and 2 pt after.
    """
    heading_specs = {
        1: ("SummaryHeading1", Pt(16)),
        2: ("SummaryHeading2", Pt(14)),
    }
    style_name, size = heading_specs.get(level, ("SummaryHeading3", Pt(12)))

    # python-docx paragraphs have no `.document` attribute; the owning
    # document is reached through the part that contains the paragraph.
    doc = paragraph.part.document
    paragraph.style = _ensure_style(doc, style_name, font_size=size)
    paragraph.paragraph_format.space_before = Pt(4)
    paragraph.paragraph_format.space_after = Pt(2)
|
||||||
|
|
||||||
|
|
||||||
|
def apply_bullet_style(paragraph):
    """
    Apply the "SummaryBullet" paragraph style and a left tab stop.

    The style is created in the owning document on first use. The tab stop
    sits at 18 pt (360 twips), the same position as before.
    """
    # python-docx paragraphs have no `.document` attribute; the owning
    # document is reached through the part that contains the paragraph.
    doc = paragraph.part.document
    paragraph.style = _ensure_style(doc, "SummaryBullet")

    # Use the tab-stop API instead of appending raw <w:tabs> XML, so the
    # element is placed in schema order inside <w:pPr>.
    paragraph.paragraph_format.tab_stops.add_tab_stop(Pt(18))
|
||||||
|
|
||||||
|
|
||||||
|
def parse_simple_md_to_paragraphs(doc, text: str):
    """
    Convert simple markdown text into DOCX paragraphs with styles.

    Each non-blank line becomes one paragraph appended to *doc*; blank lines
    are skipped.

    Supported:
    - # through ###### for headings (levels above 3 use the level-3 style)
    - - / * for bullet lists (written as a bullet glyph, a tab, then the text)
    - **bold** and *italic*

    This is intentionally simple and robust for legal/business summaries.
    """
    heading_re = re.compile(r"^(#{1,6})\s+(.*)")
    bullet_re = re.compile(r"^[-*]\s+(.*)")

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Headings
        heading = heading_re.match(stripped)
        if heading:
            paragraph = doc.add_paragraph()
            apply_heading_style(paragraph, len(heading.group(1)))
            _add_run_with_inline_md(paragraph, heading.group(2).strip())
            continue

        # Bullet list
        bullet = bullet_re.match(stripped)
        if bullet:
            paragraph = doc.add_paragraph()
            apply_bullet_style(paragraph)
            # The markdown marker is stripped above, so write a visible
            # bullet; otherwise list items look like ordinary paragraphs.
            _add_run_with_inline_md(paragraph, "\u2022\t")
            _add_run_with_inline_md(paragraph, bullet.group(1).strip())
            continue

        # Normal paragraph
        _add_run_with_inline_md(doc.add_paragraph(), stripped)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_run_with_inline_md(paragraph, text: str):
    """
    Add runs to a paragraph, interpreting **bold** and *italic*.

    *text* is split on ``***x***``, ``**x**`` and ``*x*`` spans. Each piece
    becomes its own Courier run with the markers removed; ``***x***`` is both
    bold and italic.

    A run is set to 12 pt only when the paragraph's style defines no font
    size. Forcing 12 pt on every run would override the larger sizes that the
    heading styles carry.
    """
    style = paragraph.style
    style_sets_size = style is not None and style.font.size is not None

    for part in re.split(r"(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*)", text):
        if not part:
            continue

        # Peel the bold markers first, so "***x***" leaves "*x*" for italic.
        bold_match = re.fullmatch(r"\*\*(.+?)\*\*", part)
        if bold_match:
            part = bold_match.group(1)

        italic_match = re.fullmatch(r"\*(.+?)\*", part)
        if italic_match:
            part = italic_match.group(1)

        run = paragraph.add_run(part)
        run.font.name = "Courier"
        if not style_sets_size:
            run.font.size = Pt(12)
        if bold_match:
            run.bold = True
        if italic_match:
            run.italic = True
|
||||||
+32
-11
@@ -505,7 +505,19 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
|
|
||||||
_create_transcript_section_properties(doc.sections[0])
|
_create_transcript_section_properties(doc.sections[0])
|
||||||
|
|
||||||
# Step 3: Write prepared pages into DOCX
|
# Step 3: Optionally add cover page
|
||||||
|
from . import docx_cover
|
||||||
|
cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
|
||||||
|
if cover_enabled:
|
||||||
|
docx_cover.add_cover_page(
|
||||||
|
doc,
|
||||||
|
title="TRANSCRIPT",
|
||||||
|
subtitle=None,
|
||||||
|
metadata=None,
|
||||||
|
include_logo=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Step 4: Write prepared pages into DOCX
|
||||||
for page_idx, page_lines in enumerate(prepared_pages):
|
for page_idx, page_lines in enumerate(prepared_pages):
|
||||||
# Insert page break between pages
|
# Insert page break between pages
|
||||||
if page_idx > 0:
|
if page_idx > 0:
|
||||||
@@ -523,7 +535,7 @@ def create_transcript_docx(text: str, filename: str):
|
|||||||
for line_num, line_text in enumerate(page_lines, start=1):
|
for line_num, line_text in enumerate(page_lines, start=1):
|
||||||
_add_transcript_paragraph(doc, line_text, line_number=line_num)
|
_add_transcript_paragraph(doc, line_text, line_number=line_num)
|
||||||
|
|
||||||
# Step 4: Add footer: "X of Y" centered
|
# Step 5: Add footer: "X of Y" centered
|
||||||
section = doc.sections[0]
|
section = doc.sections[0]
|
||||||
footer = section.footer
|
footer = section.footer
|
||||||
footer.is_linked_to_previous = False
|
footer.is_linked_to_previous = False
|
||||||
@@ -563,8 +575,10 @@ def create_summary_docx(text: str, filename: str):
|
|||||||
Create a summary DOCX with:
|
Create a summary DOCX with:
|
||||||
- 1" margins on all sides
|
- 1" margins on all sides
|
||||||
- 12pt Courier font
|
- 12pt Courier font
|
||||||
- No line numbering
|
- Markdown-aware WYSIWYG styling (headings, bullets, bold/italic)
|
||||||
"""
|
"""
|
||||||
|
from . import docx_styles
|
||||||
|
|
||||||
doc = Document()
|
doc = Document()
|
||||||
|
|
||||||
# Base font
|
# Base font
|
||||||
@@ -584,13 +598,20 @@ def create_summary_docx(text: str, filename: str):
|
|||||||
for p in list(body.findall(f"{{{W_NS}}}p")):
|
for p in list(body.findall(f"{{{W_NS}}}p")):
|
||||||
body.remove(p)
|
body.remove(p)
|
||||||
|
|
||||||
# Add summary content
|
# Optionally add cover page
|
||||||
lines = text.strip().splitlines()
|
from . import docx_cover
|
||||||
for line in lines:
|
cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
|
||||||
line = line.strip()
|
if cover_enabled:
|
||||||
if not line:
|
docx_cover.add_cover_page(
|
||||||
continue
|
doc,
|
||||||
p = doc.add_paragraph(line)
|
title="SUMMARY",
|
||||||
p.paragraph_format.space_after = Pt(4)
|
subtitle=None,
|
||||||
|
metadata=None,
|
||||||
|
include_logo=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Add summary content using markdown-aware styling
|
||||||
|
if text.strip():
|
||||||
|
docx_styles.parse_simple_md_to_paragraphs(doc, text.strip())
|
||||||
|
|
||||||
doc.save(filename)
|
doc.save(filename)
|
||||||
|
|||||||
+70
-26
@@ -148,19 +148,76 @@ class SummarizerClient:
|
|||||||
start = break_pos
|
start = break_pos
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
def _load_summary_prompt(self, role: str) -> str:
|
||||||
|
"""
|
||||||
|
Load summary prompt for the given role: 'chunk' or 'combined'.
|
||||||
|
|
||||||
|
Priority:
|
||||||
|
1) SUMMARY_PROMPT_{ROLE} (env)
|
||||||
|
2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
|
||||||
|
3) Built-in default prompt
|
||||||
|
"""
|
||||||
|
role_upper = role.upper()
|
||||||
|
|
||||||
|
# 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
|
||||||
|
env_key = f"SUMMARY_PROMPT_{role_upper}"
|
||||||
|
env_prompt = (os.getenv(env_key) or "").strip()
|
||||||
|
if env_prompt:
|
||||||
|
return env_prompt
|
||||||
|
|
||||||
|
# 2) File-based prompt with sections
|
||||||
|
prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
|
||||||
|
if prompt_file and os.path.exists(prompt_file):
|
||||||
|
try:
|
||||||
|
with open(prompt_file, "r", encoding="utf-8") as f:
|
||||||
|
content = f.read()
|
||||||
|
# Simple section parser: [chunk], [combined]
|
||||||
|
import re
|
||||||
|
pattern = re.compile(
|
||||||
|
r"\[" + role + r"\]\s*\n(.*?)(?=\n\[|$)",
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
m = pattern.search(content)
|
||||||
|
if m:
|
||||||
|
text = m.group(1).strip()
|
||||||
|
if text:
|
||||||
|
return text
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)
|
||||||
|
|
||||||
|
# 3) Default prompts
|
||||||
|
if role == "chunk":
|
||||||
|
return (
|
||||||
|
"You are an expert legal and business meeting summarizer. "
|
||||||
|
"You will receive a segment of a longer transcript. "
|
||||||
|
"Provide a detailed, structured summary of this segment, focusing on: "
|
||||||
|
"- Topics discussed\n"
|
||||||
|
"- Key points and arguments\n"
|
||||||
|
"- Decisions and agreements\n"
|
||||||
|
"- Action items and responsibilities\n"
|
||||||
|
"- Any risks, conflicts, or open issues\n\n"
|
||||||
|
"Be concise but complete. Use bullet points where helpful. "
|
||||||
|
"Do not add information that is not present in the transcript."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
return (
|
||||||
|
"You are an expert legal and business meeting summarizer. "
|
||||||
|
"You will receive several intermediate summaries of a longer conversation. "
|
||||||
|
"Produce a single, comprehensive summary that makes it clear: "
|
||||||
|
"- The overall purpose and context of the discussion\n"
|
||||||
|
"- The main issues and topics addressed\n"
|
||||||
|
"- Key arguments and positions (briefly)\n"
|
||||||
|
"- Decisions and outcomes\n"
|
||||||
|
"- Action items, responsibilities, and next steps\n"
|
||||||
|
"- Any unresolved issues or risks\n\n"
|
||||||
|
"The summary should be detailed enough that a reader who was not present "
|
||||||
|
"can understand what happened and what is expected going forward. "
|
||||||
|
"Use clear, concise language and bullet points where appropriate. "
|
||||||
|
"Use markdown formatting (headings, lists, bold) to structure the summary."
|
||||||
|
)
|
||||||
|
|
||||||
def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
|
def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
|
||||||
system_prompt = (
|
system_prompt = self._load_summary_prompt("chunk")
|
||||||
"You are an expert legal and business meeting summarizer. "
|
|
||||||
"You will receive a segment of a longer transcript. "
|
|
||||||
"Provide a detailed, structured summary of this segment, focusing on: "
|
|
||||||
"- Topics discussed\n"
|
|
||||||
"- Key points and arguments\n"
|
|
||||||
"- Decisions and agreements\n"
|
|
||||||
"- Action items and responsibilities\n"
|
|
||||||
"- Any risks, conflicts, or open issues\n\n"
|
|
||||||
"Be concise but complete. Use bullet points when helpful. "
|
|
||||||
"Do not add information that is not present in the transcript."
|
|
||||||
)
|
|
||||||
|
|
||||||
user_prompt = (
|
user_prompt = (
|
||||||
f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
|
f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
|
||||||
@@ -170,20 +227,7 @@ class SummarizerClient:
|
|||||||
return self._chat_completion(system_prompt, user_prompt)
|
return self._chat_completion(system_prompt, user_prompt)
|
||||||
|
|
||||||
def _summarize_combined(self, combined_summaries: str) -> str:
|
def _summarize_combined(self, combined_summaries: str) -> str:
|
||||||
system_prompt = (
|
system_prompt = self._load_summary_prompt("combined")
|
||||||
"You are an expert legal and business meeting summarizer. "
|
|
||||||
"You will receive several intermediate summaries of a longer conversation. "
|
|
||||||
"Produce a single, comprehensive summary that makes it clear: "
|
|
||||||
"- The overall purpose and context of the discussion\n"
|
|
||||||
"- The main issues and topics addressed\n"
|
|
||||||
"- Key arguments and positions (briefly)\n"
|
|
||||||
"- Decisions and outcomes\n"
|
|
||||||
"- Action items, responsibilities, and next steps\n"
|
|
||||||
"- Any unresolved issues or risks\n\n"
|
|
||||||
"The summary should be detailed enough that a reader who was not present "
|
|
||||||
"can understand what happened and what is expected going forward. "
|
|
||||||
"Use clear, concise language and bullet points where appropriate."
|
|
||||||
)
|
|
||||||
|
|
||||||
user_prompt = (
|
user_prompt = (
|
||||||
"Here are the intermediate summaries from different parts of the same conversation:\n\n"
|
"Here are the intermediate summaries from different parts of the same conversation:\n\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user