Revert PDF generation; simplify to DOCX + MD + JSON only
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Remove PDF helpers, LibreOffice, PyPDF2, reportlab
- Transcript DOCX: standalone, no cover page, with line numbering
- Summary DOCX: standalone, no cover page, no line numbering
- Attachments:
  - Transcribe: JSON, transcript MD, transcript DOCX
  - Transcribe & Summarize: JSON, transcript MD, transcript DOCX, summary MD, summary DOCX
This commit is contained in:
admin
2026-06-15 03:38:12 +00:00
parent 6233a41f61
commit e27e5b8522
4 changed files with 14 additions and 480 deletions
+2 -7
View File
@@ -9,14 +9,9 @@ LABEL description="Scraibe: LocalAI-backed transcription and diarization client
Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
LABEL url="https://git.optimex.systems/admin/scribe"
# Install system dependencies (ffmpeg, redis, LibreOffice for PDF generation)
# Install system dependencies (ffmpeg, redis)
RUN apt update -y && \
apt install -y --no-install-recommends \
ffmpeg \
redis-server \
libreoffice-writer \
fonts-dejavu-core \
&& \
apt install -y --no-install-recommends ffmpeg redis-server && \
apt clean && \
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-2
View File
@@ -6,5 +6,3 @@ PyYAML>=6.0
celery[redis]>=5.3.0
redis>=5.0.0
python-docx>=1.1.0
PyPDF2>=3.0.0
reportlab>=4.0.0
+7 -357
View File
@@ -13,8 +13,6 @@ import logging
import os
import re
import smtplib
import subprocess
import tempfile
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
@@ -336,58 +334,6 @@ def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
sectPr.append(lnNumType)
def _add_cover_page(doc, doc_type, date, description):
    """
    Append a cover block to ``doc`` inside a one-cell table.

    The cell ends up holding, top to bottom:
      1) the cell's default paragraph, emptied and centered
      2) the document type (bold)
      3) the date (e.g. "June 14, 2026")
      4-6) three blank paragraphs used as spacing
      7) the one-sentence description

    Every text line is horizontally centered and set in 12pt Courier.
    Only horizontal alignment is applied here; no vertical alignment
    is set on the cell.
    """
    grid = doc.add_table(rows=1, cols=1)
    grid.autofit = False
    holder = grid.cell(0, 0)
    # Approximate the usable page width (8.5" page minus 1" margins).
    holder.width = Inches(6.5)

    # Center whatever paragraphs the new cell starts with, then empty the
    # first one so it carries no text.
    for existing in holder.paragraphs:
        existing.alignment = WD_ALIGN_PARAGRAPH.CENTER
    holder.paragraphs[0].clear()

    def _centered_line(text, bold=False):
        """Add one centered 12pt Courier paragraph containing ``text``."""
        para = holder.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.add_run(text)
        if bold:
            run.bold = True
        run.font.name = "Courier"
        run.font.size = Pt(12)

    _centered_line(doc_type, bold=True)
    _centered_line(date)

    # Spacer paragraphs between the date and the description.
    for _ in range(3):
        holder.add_paragraph()

    _centered_line(description)
def _add_transcript_content(doc, text):
"""
Add transcript lines to the document with formatting.
@@ -467,146 +413,27 @@ def _add_summary_content(doc, text):
p.paragraph_format.space_after = Pt(4)
# ------------ PDF helpers ------------
def _docx_to_pdf(docx_path: str, output_dir: str) -> "str | None":
    """
    Convert a .docx file to PDF using LibreOffice in headless mode.

    Args:
        docx_path: Path of the .docx file to convert.
        output_dir: Directory LibreOffice is told to write the PDF into.

    Returns:
        The path of the generated PDF (``<output_dir>/<docx basename>.pdf``),
        or None when the PDF is not present afterwards or any error
        (missing binary, timeout, ...) occurs. Failures are logged, never
        raised, so callers can treat PDF output as best-effort.
    """
    command = [
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        "--outdir", output_dir,
        docx_path,
    ]
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            # Not fatal on its own: the existence check below decides.
            logger.warning("LibreOffice conversion failed: %s", result.stderr)

        # LibreOffice names the PDF after the input file's base name.
        base = os.path.splitext(os.path.basename(docx_path))[0]
        pdf_path = os.path.join(output_dir, f"{base}.pdf")

        if os.path.exists(pdf_path):
            return pdf_path

        logger.warning("Converted PDF not found at: %s", pdf_path)
        return None
    except Exception as e:
        # Best-effort boundary: conversion problems must not abort the caller.
        logger.error("Error converting DOCX to PDF: %s", e)
        return None
def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
    """
    Merge multiple PDF files, in order, into a single PDF using PyPDF2.

    Args:
        input_pdfs: Paths of the PDFs to concatenate. Paths that do not
            exist on disk are skipped (with a warning) rather than failing
            the whole merge.
        output_pdf: Path the merged PDF is written to.

    Returns:
        True on success, False if PyPDF2 is unavailable or any step fails.
        Errors are logged, never raised.
    """
    merger = None
    try:
        from PyPDF2 import PdfMerger

        merger = PdfMerger()
        for pdf in input_pdfs:
            if os.path.exists(pdf):
                merger.append(pdf)
            else:
                logger.warning("Skipping missing PDF during merge: %s", pdf)
        merger.write(output_pdf)
        return True
    except Exception as e:
        logger.error("Error merging PDFs: %s", e)
        return False
    finally:
        # Release the file handles PyPDF2 keeps open for appended inputs,
        # including when append()/write() raised.
        if merger is not None:
            merger.close()
def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
    """
    Stamp "Page X of Y" onto every page of a PDF.

    A one-label-per-page overlay PDF is drawn in memory with reportlab and
    then merged page-by-page onto the pages of ``input_pdf`` with PyPDF2,
    so the output keeps the original content and gains the page numbers.
    The label sits 1 inch from the right and bottom edges of a letter page.

    Args:
        input_pdf: Path of the PDF to number.
        output_pdf: Path the numbered PDF is written to.

    Returns:
        True on success, False on any failure (logged, never raised).
    """
    try:
        import io

        from reportlab.pdfgen import canvas
        from reportlab.lib.pagesizes import letter
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.ttfonts import TTFont
        from PyPDF2 import PdfReader, PdfWriter

        font_name = "Courier"
        font_size = 10
        try:
            # Optional: back the "Courier" name with DejaVu Sans Mono when
            # the font file is installed; otherwise the name is used as-is.
            pdfmetrics.registerFont(
                TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
            )
        except Exception as font_err:
            logger.debug("Could not register DejaVu font for page numbers: %s", font_err)

        reader = PdfReader(input_pdf)
        num_pages = len(reader.pages)

        # Draw all labels on ONE canvas (one overlay page per source page).
        overlay_buffer = io.BytesIO()
        c = canvas.Canvas(overlay_buffer, pagesize=letter)
        for page_num in range(num_pages):
            c.setFont(font_name, font_size)
            page_text = f"Page {page_num + 1} of {num_pages}"
            text_width = c.stringWidth(page_text, font_name, font_size)
            x = letter[0] - 1 * 72 - text_width  # 1 inch from right
            y = 1 * 72  # 1 inch from bottom
            c.drawString(x, y, page_text)
            c.showPage()
        c.save()
        overlay_buffer.seek(0)

        # Lay each label page over the matching source page.
        overlay = PdfReader(overlay_buffer)
        writer = PdfWriter()
        for source_page, label_page in zip(reader.pages, overlay.pages):
            source_page.merge_page(label_page)
            writer.add_page(source_page)

        with open(output_pdf, "wb") as out_file:
            writer.write(out_file)
        return True
    except Exception as e:
        logger.error("Error adding page numbers to PDF: %s", e)
        return False
# ------------ Public DOCX/PDF functions ------------
# ------------ Public DOCX functions ------------
def create_transcript_docx(
text: str,
filename: str,
include_cover: bool = False,
cover_date: str = "",
cover_desc: str = "",
):
"""
Create a .docx transcript with:
- 1" margins on all sides
- 12pt Courier
- Continuous line numbering on the left (for transcript content only)
- Optional cover page with type, date, and AI-generated description.
- Continuous line numbering on the left
- No cover page (standalone transcript only).
"""
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# 1) Optional cover page (no line numbering)
if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
doc.add_page_break()
# 2) Enable line numbering for transcript section
# Enable line numbering for transcript section
_enable_line_numbering(doc.sections[0])
# 3) Transcript content (with line numbering)
# Transcript content (with line numbering)
_add_transcript_content(doc, text)
doc.save(filename)
@@ -615,198 +442,21 @@ def create_transcript_docx(
def create_summary_docx(
text: str,
filename: str,
include_cover: bool = False,
cover_date: str = "",
cover_desc: str = "",
):
"""
Create a .docx summary with consistent font and heading styles.
Optional cover page with type, date, and AI-generated description.
No line numbering.
No cover page, no line numbering.
"""
doc = Document()
_configure_base_font(doc)
_configure_section_margins(doc)
# 1) Optional cover page
if include_cover and cover_date and cover_desc:
_add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
doc.add_page_break()
# 2) Summary content (no line numbering)
# Summary content (no line numbering)
_add_summary_content(doc, text)
doc.save(filename)
def create_combined_docx(
    transcript_text: str,
    summary_text: str,
    filename: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
):
    """
    Create a combined .docx with:
    1) Transcript cover page (no line numbering)
    2) Page break
    3) Summary content (no line numbering)
    4) Section break onto a new page
    5) Transcript content (line numbering enabled only here)

    Line numbering is a per-section property, so the transcript is placed
    in its own section and numbering is enabled on that section only.
    Enabling it on the document's first (and only) section would number
    the cover page and the summary as well.

    ``summary_cover_date`` and ``summary_cover_desc`` are accepted for
    signature compatibility but are not used: the combined document has a
    single (transcript) cover page.
    """
    # Local import: only this function needs the section start-type enum.
    from docx.enum.section import WD_SECTION

    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

    # 1-2) Transcript cover page, then start a fresh page
    _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
    doc.add_page_break()

    # 3) Summary content (no line numbering)
    _add_summary_content(doc, summary_text)

    # 4) New section on a new page; it inherits the page setup (margins)
    #    of the previous section, and only it gets line numbering.
    transcript_section = doc.add_section(WD_SECTION.NEW_PAGE)
    _enable_line_numbering(transcript_section)

    # 5) Transcript content (with line numbering)
    _add_transcript_content(doc, transcript_text)

    doc.save(filename)
def _save_component_docx(path: str, populate) -> str:
    """Create a base-formatted document, let ``populate(doc)`` fill it, save it to ``path`` and return ``path``."""
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)
    populate(doc)
    doc.save(path)
    return path


def _assemble_numbered_pdf(parts, merged_path: str, output_path: str) -> bool:
    """Merge ``parts`` in order into ``merged_path`` and write a page-numbered copy to ``output_path``; False if any part is missing or a step fails."""
    if not all(parts):
        # A failed conversion yields None; never hand that to the merger.
        return False
    return _merge_pdfs(list(parts), merged_path) and _add_page_numbers_to_pdf(
        merged_path, output_path
    )


def generate_pdf_documents(
    transcript_text: str,
    summary_text: str,
    output_dir: str,
    transcript_cover_date: str,
    transcript_cover_desc: str,
    summary_cover_date: str,
    summary_cover_desc: str,
) -> Dict[str, str]:
    """
    Generate PDF documents by:
    - Creating individual .docx components (cover pages, transcript, summary)
    - Converting each .docx to PDF
    - Merging PDFs in the correct order
    - Adding page numbers to the final PDFs

    Behavior:
    - Always attempted:
        - TRANSCRIPT.pdf: transcript cover + transcript with line numbers
    - If summary_text is non-empty, additionally attempted:
        - SUMMARY.pdf: summary cover + summary
        - COMBINED.pdf: transcript cover + summary + transcript with line numbers

    Intermediate files live in a temporary directory that is removed before
    returning; final PDFs are written to ``output_dir`` (created if needed).

    Returns a dict containing only the outputs that were actually produced:
    - "transcript_pdf"
    - "summary_pdf"
    - "combined_pdf"
    A conversion, merge or numbering failure simply leaves the
    corresponding key out.
    """
    os.makedirs(output_dir, exist_ok=True)

    result = {}

    with tempfile.TemporaryDirectory() as tmpdir:

        def _tmp(name):
            return os.path.join(tmpdir, name)

        def _fill_transcript(doc):
            # Line numbering applies to the transcript component only.
            _enable_line_numbering(doc.sections[0])
            _add_transcript_content(doc, transcript_text)

        # 1) Generate constituent DOCX files
        transcript_cover_docx = _save_component_docx(
            _tmp("TRANSCRIPT_COVER.docx"),
            lambda doc: _add_cover_page(
                doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc
            ),
        )

        # Summary cover (only if summary is requested)
        summary_cover_docx = None
        if summary_text:
            summary_cover_docx = _save_component_docx(
                _tmp("SUMMARY_COVER.docx"),
                lambda doc: _add_cover_page(
                    doc, "SUMMARY", summary_cover_date, summary_cover_desc
                ),
            )

        # Transcript (with line numbering)
        transcript_docx = _save_component_docx(_tmp("TRANSCRIPT.docx"), _fill_transcript)

        # Summary (no line numbering)
        summary_docx = None
        if summary_text:
            summary_docx = _save_component_docx(
                _tmp("SUMMARY.docx"),
                lambda doc: _add_summary_content(doc, summary_text),
            )

        # 2) Convert DOCX to PDF (each result is a path, or None on failure)
        transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
        transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)

        summary_cover_pdf = None
        summary_pdf = None
        if summary_text:
            summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
            summary_pdf = _docx_to_pdf(summary_docx, tmpdir)

        # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
        transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
        if _assemble_numbered_pdf(
            [transcript_cover_pdf, transcript_pdf],
            _tmp("TRANSCRIPT_MERGED.pdf"),
            transcript_output_pdf,
        ):
            result["transcript_pdf"] = transcript_output_pdf

        # 4) If summary is provided:
        #    - SUMMARY.pdf: summary_cover + summary + page numbers
        #    - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
        if summary_text and summary_pdf and summary_cover_pdf:
            summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
            if _assemble_numbered_pdf(
                [summary_cover_pdf, summary_pdf],
                _tmp("SUMMARY_MERGED.pdf"),
                summary_output_pdf,
            ):
                result["summary_pdf"] = summary_output_pdf

            combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
            if _assemble_numbered_pdf(
                [transcript_cover_pdf, summary_pdf, transcript_pdf],
                _tmp("COMBINED_MERGED.pdf"),
                combined_output_pdf,
            ):
                result["combined_pdf"] = combined_output_pdf

    return result
def send_success_email(
to: str,
transcript_text: str,
+5 -114
View File
@@ -14,7 +14,6 @@ from .summarizer import SummarizerClient, SummarizerError
from .misc import setup_logging
from .email_sender import send_email, EmailError, load_template
from .email_sender import create_transcript_docx, create_summary_docx
from .email_sender import create_combined_docx, generate_pdf_documents
logger = logging.getLogger("scraibe.tasks")
@@ -93,7 +92,7 @@ def get_queue_position(task_id: str) -> int:
if found:
break
# If not found in reserved, it may already be active or not yet visible.
# If not found in reserved, it may already be active or not yet visible.
# In that case, treat it as position 1.
if found:
return max(ahead + 1, 1)
@@ -415,59 +414,6 @@ def process_transcription_task(
)
speaker_map = {}
# 3c) Generate short cover-page descriptions using summarizer
transcript_cover_desc = ""
summary_cover_desc = ""
dt = datetime.utcnow()
today_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}"
try:
scraibe._ensure_summarizer()
summarizer = scraibe._summarizer
# Transcript description
prompt = (
"In one short sentence (max 25 words), describe the content of this transcript "
"for use on a cover page. Do not include speaker names. Output only the sentence.\n\n"
+ transcript_text
)
response = summarizer._chat_completion(
messages=[{"role": "user", "content": prompt}],
temperature=0.3,
max_tokens=60,
)
transcript_cover_desc = (
(response or {}).get("choices", [{}])[0].get("message", {}).get("content", "")
).strip()
if not transcript_cover_desc:
transcript_cover_desc = "Transcript of the recorded conversation."
logger.info("Transcript cover description: %s", transcript_cover_desc)
# Summary description (if summary requested)
if include_summary:
prompt = (
"In one short sentence (max 25 words), describe the content of this summary "
"for use on a cover page. Output only the sentence.\n\n"
+ summary_text
)
response = summarizer._chat_completion(
messages=[{"role": "user", "content": prompt}],
temperature=0.3,
max_tokens=60,
)
summary_cover_desc = (
(response or {}).get("choices", [{}])[0].get("message", {}).get("content", "")
).strip()
if not summary_cover_desc:
summary_cover_desc = "Summary of the recorded conversation."
logger.info("Summary cover description: %s", summary_cover_desc)
except (SummarizerError, Exception) as e:
logger.warning("Failed to generate cover descriptions: %s", e)
transcript_cover_desc = transcript_cover_desc or "Transcript of the recorded conversation."
if include_summary:
summary_cover_desc = summary_cover_desc or "Summary of the recorded conversation."
# 4) Prepare files
# Transcript .md
@@ -477,14 +423,11 @@ def process_transcription_task(
f.write(transcript_text)
temp_files.append(md_transcript_path)
# Transcript .docx (standalone, with cover page)
# Transcript .docx (standalone, no cover page)
docx_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".docx")
create_transcript_docx(
transcript_text,
docx_transcript_path,
include_cover=True,
cover_date=today_str,
cover_desc=transcript_cover_desc,
)
temp_files.append(docx_transcript_path)
@@ -511,7 +454,6 @@ def process_transcription_task(
# Summary files (if present)
md_summary_path = None
docx_summary_path = None
docx_combined_path = None
if summary_text:
# Summary .md
@@ -521,77 +463,26 @@ def process_transcription_task(
f.write(summary_text)
temp_files.append(md_summary_path)
# Summary .docx (standalone, with cover page)
# Summary .docx (standalone, no cover page)
docx_summary_path = _safe_filename("SUMMARY", local, date_tag, ".docx")
create_summary_docx(
summary_text,
docx_summary_path,
include_cover=True,
cover_date=today_str,
cover_desc=summary_cover_desc,
)
temp_files.append(docx_summary_path)
# Combined .docx: Transcript + Summary with cover pages
docx_combined_path = _safe_filename("TRANSCRIPT_AND_SUMMARY", local, date_tag, ".docx")
create_combined_docx(
transcript_text=transcript_text,
summary_text=summary_text,
filename=docx_combined_path,
transcript_cover_date=today_str,
transcript_cover_desc=transcript_cover_desc,
summary_cover_date=today_str,
summary_cover_desc=summary_cover_desc,
)
temp_files.append(docx_combined_path)
# 4b) Generate PDF documents
# Always use transcript text; use summary_text only if include_summary is True
pdf_summary_text = summary_text if include_summary else ""
pdf_files = {}
try:
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
pdf_files = generate_pdf_documents(
transcript_text=transcript_text,
summary_text=pdf_summary_text,
output_dir=pdf_output_dir,
transcript_cover_date=today_str,
transcript_cover_desc=transcript_cover_desc,
summary_cover_date=today_str,
summary_cover_desc=summary_cover_desc,
)
for pdf_path in pdf_files.values():
if pdf_path and os.path.exists(pdf_path):
temp_files.append(pdf_path)
except Exception as e:
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
# 5) Build attachments list
# Base attachments: JSON, transcript MD, transcript DOCX
# Always: JSON, transcript MD, transcript DOCX
attachments = [
md_transcript_path,
docx_transcript_path,
json_path,
]
# If summary is present, add summary MD and DOCXs
# If summary is present, add summary MD and DOCX
if summary_text:
attachments += [md_summary_path, docx_summary_path]
if docx_combined_path:
attachments.append(docx_combined_path)
# Add PDFs based on mode:
# - Always: TRANSCRIPT.pdf
# - If summary: SUMMARY.pdf and COMBINED.pdf
if pdf_files.get("transcript_pdf"):
attachments.append(pdf_files["transcript_pdf"])
if pdf_files.get("summary_pdf"):
attachments.append(pdf_files["summary_pdf"])
if pdf_files.get("combined_pdf"):
attachments.append(pdf_files["combined_pdf"])
# 6) Send success email
send_success_email(