Add PDF-based document generation with LibreOffice; fix line numbering and margins
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

- Add LibreOffice Writer and DejaVu fonts to Dockerfile for PDF generation
- Add PyPDF2 and reportlab to requirements.txt
- Refactor email_sender.py:
  - Enforce 1-inch margins on all sides
  - Restrict line numbering to the transcript section only
  - Add generate_pdf_documents() to build:
    - TRANSCRIPT.pdf (cover + transcript)
    - SUMMARY.pdf (cover + summary)
    - COMBINED.pdf (transcript cover + summary + TRANSCRIPT header + transcript)
  - Add page numbers (bottom-right) to all PDFs via reportlab
- Update tasks.py:
  - Use generate_pdf_documents() after creating DOCX files
  - Attach the source JSON, MD files, and compiled PDFs to the success email
- Add test_docx_generation.py for transcript/summary/combined DOCX testing
This commit is contained in:
admin
2026-06-15 02:19:17 +00:00
parent b0a23b32e1
commit 42a155aeaa
5 changed files with 534 additions and 119 deletions
+31 -3
View File
@@ -14,7 +14,7 @@ from .summarizer import SummarizerClient, SummarizerError
from .misc import setup_logging
from .email_sender import send_email, EmailError, load_template
from .email_sender import create_transcript_docx, create_summary_docx
from .email_sender import create_combined_docx
from .email_sender import create_combined_docx, generate_pdf_documents
logger = logging.getLogger("scraibe.tasks")
@@ -545,6 +545,26 @@ def process_transcription_task(
)
temp_files.append(docx_combined_path)
# 4b) Generate PDF documents
pdf_files = {}
try:
pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
pdf_files = generate_pdf_documents(
transcript_text=transcript_text,
summary_text=summary_text if include_summary else "",
output_dir=pdf_output_dir,
transcript_cover_date=today_str,
transcript_cover_desc=transcript_cover_desc,
summary_cover_date=today_str,
summary_cover_desc=summary_cover_desc,
)
for pdf_path in pdf_files.values():
if pdf_path and os.path.exists(pdf_path):
temp_files.append(pdf_path)
except Exception as e:
logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
# 5) Build attachments list
attachments = [
md_transcript_path,
docx_transcript_path,
@@ -555,7 +575,15 @@ def process_transcription_task(
if docx_combined_path:
attachments.append(docx_combined_path)
# 5) Send success email
# Add PDFs to attachments
if pdf_files.get("transcript_pdf"):
attachments.append(pdf_files["transcript_pdf"])
if pdf_files.get("summary_pdf"):
attachments.append(pdf_files["summary_pdf"])
if pdf_files.get("combined_pdf"):
attachments.append(pdf_files["combined_pdf"])
# 6) Send success email
send_success_email(
to=email_to,
transcript_text=transcript_text,
@@ -575,7 +603,7 @@ def process_transcription_task(
)
raise e
finally:
# 6) Cleanup
# 7) Cleanup
for path in temp_files:
_remove_file(path)
if audio_path: