Compare commits
146 Commits
b07f593fab
..
dev
| Author | SHA1 | Date | |
|---|---|---|---|
| cd0c730abe | |||
| 2bd6ee1567 | |||
| 4bc9f82ee7 | |||
| bdd0a80d8d | |||
| 7a31be9de5 | |||
| 54414def26 | |||
| 111d1ea18b | |||
| cb27ba80a1 | |||
| 2112b8c7e2 | |||
| 49f3cdc407 | |||
| 2c0998579c | |||
| 327c05ea16 | |||
| dabb5970ba | |||
| 36b0b6f241 | |||
| 6640bc050d | |||
| 59363c5dcd | |||
| 0e27537a68 | |||
| 0947e91f15 | |||
| 1d447f2836 | |||
| 49e607e1e1 | |||
| bd4393addc | |||
| f5836d83f3 | |||
| b2dce9e048 | |||
| 4d9414fee9 | |||
| d4ed84f68d | |||
| eb83a37f02 | |||
| e7aa5ebf25 | |||
| 1265a664cd | |||
| 83f3c09218 | |||
| d828a91bf3 | |||
| 670c6d3e2b | |||
| f20102d564 | |||
| 0e6bc53cf8 | |||
| c43076efd4 | |||
| 03d66219d9 | |||
| 0c0e52dfb8 | |||
| 604bfa3f41 | |||
| 8ff473f3e6 | |||
| 0b3f737e5b | |||
| 598f8630de | |||
| 7fac0e7d9c | |||
| 5dd56a3368 | |||
| 7364d572d5 | |||
| d51b006a19 | |||
| ea5a0752df | |||
| b0a1bc059b | |||
| e27e5b8522 | |||
| 6233a41f61 | |||
| 237bd4b37c | |||
| 7ece1a50c2 | |||
| 46fbcf80af | |||
| 42a155aeaa | |||
| b0a23b32e1 | |||
| 2e2bc3fb29 | |||
| 2f9299389b | |||
| e0d2fd6963 | |||
| 4651c5f8b2 | |||
| 6c11a8f19a | |||
| 2a2a5e024c | |||
| 7adca3d921 | |||
| efb34dd9ff | |||
| 11e5309a8e | |||
| a3ca1f3505 | |||
| 154cac6c7b | |||
| 18f4a4e8de | |||
| 2f304e3ed1 | |||
| fd94e2daa0 | |||
| e74bc04cb3 | |||
| c792fa17e8 | |||
| e55f36a131 | |||
| 572587bb85 | |||
| cfc38b21ed | |||
| 1582b90ddb | |||
| 9ec4c4ccba | |||
| 8ecae8f648 | |||
| 49e999f0ee | |||
| eb9b2f9126 | |||
| 50c7ec90a0 | |||
| f7c9c70bfc | |||
| a8f48b9e58 | |||
| 2dce9b43c9 | |||
| 1dea51f1f9 | |||
| 63cd620b79 | |||
| dc20e9cff0 | |||
| fb1dc3324d | |||
| 917a7b8f2f | |||
| 85cdd9216a | |||
| 2803c81b44 | |||
| b9d25a39dd | |||
| f6db48b1d0 | |||
| 37d30e0ee2 | |||
| d854d498cd | |||
| 1eb88d27ba | |||
| 2ea46ada42 | |||
| 47b3304297 | |||
| 4822ef28e8 | |||
| 574124558b | |||
| 46d119b63b | |||
| d00ec2d44f | |||
| de883bc062 | |||
| 663675c7b2 | |||
| e5d189fdd0 | |||
| 9528468ebb | |||
| 3fe13803b9 | |||
| 81fefd5568 | |||
| fec46aa563 | |||
| 3851311ffc | |||
| f0989a574b | |||
| de9071762e | |||
| 08f14883e2 | |||
| 101e913f84 | |||
| e7c1a5a2b0 | |||
| af99a655e5 | |||
| 44ff678e06 | |||
| 8813662d4d | |||
| 6fadf3d851 | |||
| ce2f3ebde2 | |||
| fa1dad69d1 | |||
| 575a8de48d | |||
| a4b8546033 | |||
| 81fb9af461 | |||
| 6326d0f156 | |||
| 5f6f681edf | |||
| 2adbfaef51 | |||
| 9df05033da | |||
| df9c5109f3 | |||
| ab7b43ac48 | |||
| 929f916077 | |||
| 51bf211d27 | |||
| 5c0386edac | |||
| 18666adda4 | |||
| 9ce47ac4c2 | |||
| 95c145c74a | |||
| 4e7b7e748b | |||
| ae1bae750f | |||
| 885d0c864e | |||
| de9c81b313 | |||
| 5b56b54da2 | |||
| 53e57a06d7 | |||
| 129f0ce390 | |||
| d25fda5802 | |||
| 533b199f4c | |||
| cf63ac8e2e | |||
| 0ddb52cc95 | |||
| f5ef26432b | |||
| ba058c3e02 |
@@ -0,0 +1,95 @@
|
||||
# This workflow uses actions that are not certified by GitHub.
|
||||
# They are provided by a third-party and are governed by
|
||||
# separate terms of service, privacy policy, and support
|
||||
# documentation.
|
||||
|
||||
# GitHub recommends pinning actions to a commit SHA.
|
||||
# To get a newer version, you will need to update the SHA.
|
||||
# You can also reference a tag or branch, but the action may change without warning.
|
||||
|
||||
name: Publish Docker image
|
||||
|
||||
on:
|
||||
|
||||
push:
|
||||
tags:
|
||||
- v*
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
env:
|
||||
image: hadr0n/scraibe
|
||||
|
||||
jobs:
|
||||
push_to_registry:
|
||||
name: Push Docker image to Docker Hub
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
packages: write
|
||||
contents: read
|
||||
security-events: write
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-tags: true
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Get Version Tag
|
||||
id: version
|
||||
run: |
|
||||
echo "tag=$(git describe --tags --abbrev=0)" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Overwrite label tag
|
||||
run: sed -i 's/LABEL version=".*"/LABEL version="'${{ steps.version.outputs.tag }}'"/' Dockerfile
|
||||
|
||||
- name: Test name and tag
|
||||
run: |
|
||||
echo "${{ env.image }}:latest,${{ env.image }}:${{ steps.version.outputs.tag }}"
|
||||
|
||||
- name: Log in to Docker Hub
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
||||
- name: Build and push Docker image
|
||||
id: push
|
||||
uses: docker/build-push-action@v5
|
||||
with:
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
push: true
|
||||
tags: "${{ env.image }}:latest,${{ env.image }}:${{ steps.version.outputs.tag }}"
|
||||
|
||||
- name: SBOM Generation
|
||||
uses: anchore/sbom-action@v0
|
||||
with:
|
||||
image: ${{ env.image }}:latest
|
||||
|
||||
- name: Scan image
|
||||
id: scan
|
||||
uses: anchore/scan-action@v3
|
||||
with:
|
||||
image: ${{ env.image }}:latest
|
||||
fail-build: false
|
||||
|
||||
- name: upload Anchore scan SARIF report
|
||||
uses: github/codeql-action/upload-sarif@v3
|
||||
with:
|
||||
sarif_file: ${{ steps.scan.outputs.sarif }}
|
||||
|
||||
# - name: Inspect action SARIF report
|
||||
# run: cat ${{ steps.scan.outputs.sarif }}
|
||||
|
||||
- uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: SARIF report
|
||||
path: ${{ steps.scan.outputs.sarif }}
|
||||
|
||||
# - name: Generate artifact attestation
|
||||
# uses: actions/attest-build-provenance@v1
|
||||
# with:
|
||||
# subject-name: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME}}
|
||||
# subject-digest: ${{ steps.push.outputs.digest }}
|
||||
# push-to-registry: false
|
||||
+11
-33
@@ -1,18 +1,14 @@
|
||||
name: Publish Python 🐍 distribution 📦 to PyPI and TestPyPI
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
branches:
|
||||
- develop
|
||||
types:
|
||||
- closed
|
||||
paths:
|
||||
- scraibe/**
|
||||
- pyproject.toml
|
||||
|
||||
push:
|
||||
tags:
|
||||
- 'v*.*.*'
|
||||
branches:
|
||||
- "develop"
|
||||
paths:
|
||||
- "scraibe/**"
|
||||
- "pyproject.toml"
|
||||
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
@@ -27,13 +23,7 @@ on:
|
||||
|
||||
jobs:
|
||||
Build-and-publish-to-Test-PyPI:
|
||||
if: |
|
||||
(github.event_name == 'workflow_dispatch' &&
|
||||
github.event.inputs.test == 'true') ||
|
||||
(github.event_name == 'pull_request_target' &&
|
||||
github.event.pull_request.merged &&
|
||||
contains(github.event.pull_request.labels.*.name, 'release')) ||
|
||||
(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/'))
|
||||
if: github.event_name != 'workflow_dispatch' || github.event.inputs.test == 'true'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
@@ -72,28 +62,16 @@ jobs:
|
||||
needs: Test-PyPi-install
|
||||
runs-on: ubuntu-latest
|
||||
if: |
|
||||
always() &&
|
||||
(( needs.Build-and-publish-to-Test-PyPI.result != 'failure' &&
|
||||
needs.Test-PyPi-install.result != 'failure' ) &&
|
||||
((github.event_name == 'workflow_dispatch' &&
|
||||
github.event.inputs.publish_to_pypi == 'true') ||
|
||||
(github.event_name == 'pull_request_target' &&
|
||||
github.event.pull_request.merged &&
|
||||
contains(github.event.pull_request.labels.*.name, 'release')) ||
|
||||
(github.event_name == 'push' && startsWith(github.ref, 'refs/tags/'))))
|
||||
always() &&
|
||||
(( needs.Build-and-publish-to-Test-PyPI.result != 'failure' &&
|
||||
needs.Test-PyPi-install.result != 'failure' ) ||
|
||||
((github.event_name == 'workflow_dispatch' &&
|
||||
github.event.inputs.publish_to_pypi == 'true')))
|
||||
steps:
|
||||
- name: Checkout Repository Tags
|
||||
uses: actions/checkout@v4
|
||||
if: github.ref == 'refs/heads/main'
|
||||
with:
|
||||
fetch-depth: '0'
|
||||
branch: 'main'
|
||||
- name: Checkout Repository (Develop)
|
||||
uses: actions/checkout@v4
|
||||
if: github.ref == 'refs/heads/develop'
|
||||
with:
|
||||
fetch-depth: '0'
|
||||
branch: 'develop'
|
||||
- name: Set up Poetry 📦
|
||||
uses: JRubics/poetry-publish@v1.16
|
||||
with:
|
||||
|
||||
+48
-35
@@ -1,46 +1,59 @@
|
||||
#pytorch Image
|
||||
FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
|
||||
# Lightweight Python base image (no GPU/PyTorch needed)
|
||||
FROM python:3.11-slim
|
||||
|
||||
# Labels
|
||||
|
||||
LABEL maintainer="Jacob Schmieder"
|
||||
LABEL email="Jacob.Schmieder@dbfz.de"
|
||||
LABEL version="0.1.1.dev"
|
||||
LABEL description="Scraibe is a tool for automatic speech recognition and speaker diarization. \
|
||||
It is based on the Hugging Face Transformers library and the Pyannote library. \
|
||||
It is designed to be used with the Whisper model, a lightweight model for automatic \
|
||||
speech recognition and speaker diarization."
|
||||
LABEL url="https://github.com/JSchmie/ScrAIbe"
|
||||
LABEL description="Scraibe: LocalAI-backed transcription and diarization client with summarization and custom Web GUI. \
|
||||
Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
|
||||
LABEL url="https://git.optimex.systems/admin/scribe"
|
||||
|
||||
# Install dependencies
|
||||
WORKDIR /app
|
||||
ARG model_name=medium
|
||||
# Environment dependencies
|
||||
ENV TRANSFORMERS_CACHE /app/models
|
||||
ENV HF_HOME /app/models
|
||||
ENV AUTOT_CACHE /app/models
|
||||
ENV PYANNOTE_CACHE /app/models/pyannote
|
||||
#Copy all necessary files
|
||||
COPY requirements.txt /app/requirements.txt
|
||||
COPY README.md /app/README.md
|
||||
COPY models /app/models
|
||||
COPY scraibe /app/scraibe
|
||||
COPY setup.py /app/setup.py
|
||||
# Install system dependencies (ffmpeg, redis)
|
||||
RUN apt update -y && \
|
||||
apt install -y --no-install-recommends ffmpeg redis-server && \
|
||||
apt clean && \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||
|
||||
#Installing all necessary Dependencies and Running the Application with a personalised Hugging-Face-Token
|
||||
RUN apt update && apt-get install -y libsm6 libxrender1 libfontconfig1
|
||||
RUN conda update --all
|
||||
# Working directory
|
||||
WORKDIR /app/src
|
||||
|
||||
RUN conda install pip
|
||||
RUN conda install -y ffmpeg
|
||||
RUN conda install -c conda-forge libsndfile
|
||||
RUN pip install torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
|
||||
RUN pip install -r requirements.txt
|
||||
RUN pip install markupsafe==2.0.1 --force-reinstall
|
||||
# Environment variables for LocalAI (transcription/diarization)
|
||||
ENV LOCALAI_API_URL=http://localhost:8080
|
||||
ENV LOCALAI_API_KEY=
|
||||
ENV LOCALAI_MODEL=vibevoice-cpp-asr
|
||||
|
||||
RUN python3 -m 'scraibe.cli' --whisper-model-name $model_name
|
||||
# Expose port
|
||||
# Environment variables for Summarizer LLM
|
||||
ENV SUMMARIZER_API_URL=http://localhost:8080
|
||||
ENV SUMMARIZER_API_KEY=
|
||||
ENV SUMMARIZER_MODEL=qwen3-14b
|
||||
|
||||
# Gradio / Web GUI
|
||||
ENV GRADIO_SERVER_NAME=0.0.0.0
|
||||
|
||||
# Async processing (Celery + Redis)
|
||||
ENV CELERY_BROKER_URL=redis://localhost:6379/0
|
||||
ENV CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||
ENV SCRAIBE_UPLOAD_DIR=/tmp/scraibe_uploads
|
||||
|
||||
# Email and template configuration
|
||||
ENV EMAIL_CONTACT_ADDRESS=support@example.com
|
||||
ENV EMAIL_CSS_PATH=
|
||||
ENV SCRAIBE_TEMPLATES_DIR=/app/src/misc
|
||||
ENV SCRABIE_VERSION=0.1.1.dev
|
||||
|
||||
# Copy and install Python dependencies
|
||||
COPY requirements.txt /app/src/requirements.txt
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application code
|
||||
COPY scraibe /app/src/scraibe
|
||||
|
||||
# Copy custom Web GUI assets (header, footer, templates, logos, config)
|
||||
COPY misc /app/src/misc
|
||||
|
||||
# Expose ports
|
||||
EXPOSE 7860
|
||||
# Run the application
|
||||
|
||||
ENTRYPOINT ["python3", "-m", "scraibe.cli" ,"--whisper-model-name", "$model_name"]
|
||||
# Run the Web GUI and Celery worker (with Redis) by default
|
||||
CMD ["/bin/bash", "-c", "redis-server --daemonize yes && celery -A scraibe.celery_app worker -Q transcription -l info & python3 -m scraibe"]
|
||||
|
||||
@@ -1,173 +1,370 @@
|
||||
# `ScrAIbe: Streamlined Conversation Recording with Automated Intelligence Based Environment` 🎙️🧠
|
||||
# ScrAIbe – LocalAI-Backed Transcription and Summarization
|
||||
|
||||
ScrAIbe is a transcription and summarization service that:
|
||||
|
||||
- Sends audio to a LocalAI server running vibevoice.cpp for transcription and speaker diarization.
|
||||
- Optionally uses a second LLM to generate a structured summary.
|
||||
- Provides:
|
||||
- A web GUI for uploading audio and receiving transcripts via email.
|
||||
- A CLI and Python API for direct integration.
|
||||
- An MCP-style HTTP API (OpenAPI) for LLMs and external systems.
|
||||
- A watch-folder mode for automatic transcription, summarization, and email delivery.
|
||||
|
||||
No local speech models or heavy dependencies are required. ScrAIbe is designed as a thin client in front of your own AI services.
|
||||
|
||||
For more information: https://apstrom.ca
|
||||
|
||||
## Features
|
||||
|
||||
- Transcription with speaker diarization via LocalAI:
|
||||
- Uses the /v1/audio/diarization endpoint.
|
||||
- Compatible with vibevoice.cpp and other diarization-capable backends.
|
||||
- Optional AI-powered summarization:
|
||||
- Task: transcript_and_summarize
|
||||
- Highlights:
|
||||
- Main topics and discussion points
|
||||
- Key decisions and outcomes
|
||||
- Action items and responsibilities
|
||||
- Open issues and risks
|
||||
- Improved, configurable summary prompts (via environment or file).
|
||||
- Async web GUI (always enabled):
|
||||
- Upload audio via browser.
|
||||
- Jobs are queued and processed in the background (Celery + Redis).
|
||||
- Emails:
|
||||
- Immediate confirmation with queue position.
|
||||
- Final transcript (MD + DOCX + JSON) when ready.
|
||||
- Summary as MD + DOCX (if requested).
|
||||
- Error notification if processing fails.
|
||||
- MCP-style HTTP API (optional):
|
||||
- Exposes an OpenAPI-compliant REST endpoint for external LLMs or services.
|
||||
- Allows:
|
||||
- Audio upload for transcription.
|
||||
- Job status checks.
|
||||
- Retrieval of transcript JSON (no summary).
|
||||
- Enabled via MCP_SERVER_ENABLED=true.
|
||||
- Watch-folder mode (optional):
|
||||
- Monitors a directory for audio files.
|
||||
- For each file:
|
||||
- Transcribes and summarizes.
|
||||
- Emails transcript + summary + JSON to a configured address.
|
||||
- Deletes the source file after successful processing (configurable).
|
||||
- Enabled via WATCH_ENABLED=true.
|
||||
- File formats:
|
||||
- Transcript:
|
||||
- .md
|
||||
- .docx (line-numbered, 30 lines per page, optional cover page)
|
||||
- Summary (if requested):
|
||||
- .md
|
||||
- .docx (markdown-aware WYSIWYG styling, optional cover page)
|
||||
- Full structured output: .json
|
||||
- Customizable branding:
|
||||
- Web GUI title, logo, and accent color via environment variables.
|
||||
- Email logo, accent color, and subject lines via environment variables.
|
||||
- Optional cover pages for transcript and summary DOCX.
|
||||
- CLI and Python API:
|
||||
- Simple command-line interface.
|
||||
- Drop-in Scraibe class for integration into other tools.
|
||||
- Docker-ready:
|
||||
- Lightweight container, configured via environment variables.
|
||||
|
||||
## Architecture
|
||||
|
||||
- LocalAI (vibevoice.cpp):
|
||||
- Handles audio → transcript + speaker segments.
|
||||
- Summarizer LLM (OpenAI-compatible chat endpoint):
|
||||
- Handles transcript → structured summary.
|
||||
- ScrAIbe:
|
||||
- Orchestrates:
|
||||
- File upload to LocalAI
|
||||
- Transcript assembly
|
||||
- Chunked summarization
|
||||
- Output formatting (e.g., .md with transcript + summary)
|
||||
- Runs:
|
||||
- Web GUI (Gradio) – always enabled
|
||||
- MCP-style HTTP API (FastAPI) – optional
|
||||
- Watch-folder mode – optional
|
||||
- Celery worker (async processing)
|
||||
- Redis (in-container by default)
|
||||
|
||||
## Quick Start (Web GUI in Docker)
|
||||
|
||||
Run the container with your LocalAI and summarizer endpoints:
|
||||
|
||||
- docker run -d \
|
||||
-p 7860:7860 \
|
||||
-e LOCALAI_API_URL=http://localai:8080 \
|
||||
-e SUMMARIZER_API_URL=http://llm:8080 \
|
||||
-e EMAIL_SMTP_HOST=smtp.your-domain.com \
|
||||
-e EMAIL_SMTP_PORT=587 \
|
||||
-e EMAIL_SMTP_USER=transcribe@your-domain.com \
|
||||
-e EMAIL_SMTP_PASSWORD=your_password \
|
||||
-e EMAIL_FROM_ADDRESS="ScrAIbe <transcribe@your-domain.com>" \
|
||||
-e EMAIL_CONTACT_ADDRESS=support@your-domain.com \
|
||||
-e WEBUI_TITLE="Your Transcription Service" \
|
||||
-e WEBUI_LOGO_URL="https://your-domain.com/logo.png" \
|
||||
-e EMAIL_LOGO_URL="https://your-domain.com/logo.png" \
|
||||
-e EMAIL_ACCENT_COLOR="#7C6DA0" \
|
||||
scraibe:latest
|
||||
|
||||
Then open: http://<host>:7860
|
||||
|
||||
## Quick Start (CLI)
|
||||
|
||||
Basic usage:
|
||||
|
||||
- Transcribe:
|
||||
|
||||
- python3 -m scraibe.cli -f "audio.wav" -o "./output" -of txt
|
||||
|
||||
- Transcribe and summarize:
|
||||
|
||||
- python3 -m scraibe.cli -f "audio.wav" -o "./output" --task transcript_and_summarize
|
||||
|
||||
Environment variables must be set to point to your LocalAI and summarizer LLM.
|
||||
|
||||
## Python API
|
||||
|
||||
Example: transcribe only
|
||||
|
||||
- from scraibe import Scraibe
|
||||
|
||||
- client = Scraibe()
|
||||
- text = client.transcribe("audio.wav")
|
||||
- print(text)
|
||||
|
||||
Example: transcribe and summarize
|
||||
|
||||
- from scraibe import Scraibe
|
||||
|
||||
- client = Scraibe()
|
||||
- result = client.transcript_and_summarize("audio.wav")
|
||||
- transcript = result["transcript"]
|
||||
- summary = result["summary"]
|
||||
|
||||
You can override endpoints and models via environment variables or constructor parameters if needed.
|
||||
|
||||
## Command-Line Options
|
||||
|
||||
Run:
|
||||
|
||||
- python3 -m scraibe.cli -h
|
||||
|
||||
Key options:
|
||||
|
||||
- -f / --audio-files:
|
||||
- One or more audio files to process.
|
||||
- --task:
|
||||
- transcribe (default)
|
||||
- transcript_and_summarize
|
||||
- -o / --output-directory:
|
||||
- Output folder for generated files.
|
||||
- -of / --output-format:
|
||||
- txt, json, md, html
|
||||
- For transcript_and_summarize, output is always saved as .md with:
|
||||
- # Transcript
|
||||
- # Summary
|
||||
|
||||
Other options (e.g., --language, --num-speakers) are accepted and forwarded where applicable; many legacy Whisper/Pyannote flags are kept for compatibility but ignored.
|
||||
|
||||
## Docker Usage
|
||||
|
||||
ScrAIbe is designed to run in Docker as a client to your LocalAI and summarizer LLM.
|
||||
|
||||
### Basic run (transcribe via CLI)
|
||||
|
||||
- docker run -it \
|
||||
-e LOCALAI_API_URL=http://localai:8080 \
|
||||
-v /path/to/audio:/audio \
|
||||
scraibe:latest \
|
||||
python3 -m scraibe.cli -f /audio/meeting.wav -o /audio/output -of txt
|
||||
|
||||
### Basic run (transcribe + summarize via CLI)
|
||||
|
||||
- docker run -it \
|
||||
-e LOCALAI_API_URL=http://localai:8080 \
|
||||
-e SUMMARIZER_API_URL=http://llm:8080 \
|
||||
-v /path/to/audio:/audio \
|
||||
scraibe:latest \
|
||||
python3 -m scraibe.cli -f /audio/meeting.wav -o /audio/output --task transcript_and_summarize
|
||||
|
||||
### Docker Environment Variables
|
||||
|
||||
The following environment variables configure ScrAIbe in Docker.
|
||||
|
||||
Transcription / Diarization (LocalAI):
|
||||
|
||||
- LOCALAI_API_URL:
|
||||
- Required.
|
||||
- Base URL of the LocalAI server.
|
||||
- Example: http://localai:8080
|
||||
- LOCALAI_API_KEY:
|
||||
- Optional.
|
||||
- API key for LocalAI, if configured.
|
||||
- LOCALAI_MODEL:
|
||||
- Optional (default: vibevoice-diarize; the Docker image presets vibevoice-cpp-asr).
|
||||
- Model name used for transcription/diarization.
|
||||
|
||||
Summarization LLM:
|
||||
|
||||
Welcome to `ScrAIbe`, a state-of-the-art, [PyTorch](https://pytorch.org/) based multilingual speech-to-text framework designed to generate fully automated transcriptions.
|
||||
|
||||
Beyond transcription, ScrAIbe supports advanced functions such as speaker diarization and speaker recognition. 🚀
|
||||
|
||||
Designed as a comprehensive AI toolkit, it uses multiple powerful AI models:
|
||||
|
||||
- **[Whisper](https://github.com/openai/whisper)**: A general-purpose speech recognition model.
|
||||
- **[WhisperX](https://github.com/m-bain/whisperX)**: A faster, quantized version of Whisper for enhanced performance on CPU. ⚡
|
||||
- **[Pyannote-Audio](https://github.com/pyannote/pyannote-audio)**: An open-source toolkit for speaker diarization. 🗣️
|
||||
|
||||
The framework utilizes a PyanNet-inspired pipeline, with the `Pyannote` library for speaker diarization and `VoxCeleb` for speaker embedding.
|
||||
|
||||
During post-diarization, each audio segment is processed by the OpenAI `Whisper` model in a transformer encoder-decoder structure. Initially, a CNN mitigates noise and enhances speech. Before transcription, `VoxLingua` identifies the language segment, facilitating Whisper's role in both transcription and text translation. 🌍✨
|
||||
|
||||
The following graphic illustrates the whole pipeline:
|
||||
|
||||
<div style="text-align:center;">
|
||||
<img src="./Pictures/pipeline.png#gh-dark-mode-only" style="width: 60%;" />
|
||||
<img src="./Pictures/pipeline_light.png#gh-light-mode-only" style="width: 60%;" />
|
||||
</div>
|
||||
|
||||
## Getting Started 🚀
|
||||
|
||||
### Prerequisites
|
||||
|
||||
Before installing ScrAIbe, ensure you have the following prerequisites:
|
||||
|
||||
- **Python**: Version 3.9 or later.
|
||||
- **PyTorch**: Version 2.0 or later.
|
||||
- **CUDA**: A compatible version with your PyTorch Version if you want to use GPU acceleration.
|
||||
|
||||
**Note:** PyTorch should be automatically installed with the pip installer. However, if you encounter any issues, you should consider installing it manually by following the instructions on the [PyTorch website](https://pytorch.org/get-started/locally/).
|
||||
|
||||
### Install ScrAIbe
|
||||
|
||||
Install ScrAIbe on your local machine with ease using PyPI.
|
||||
|
||||
```bash
|
||||
pip install scraibe
|
||||
```
|
||||
|
||||
If you want to install the development version, you can do so by installing it from GitHub:
|
||||
|
||||
```bash
|
||||
pip install git+https://github.com/JSchmie/ScrAIbe.git@develop
|
||||
```
|
||||
|
||||
or from PyPI using our latest pre-release:
|
||||
|
||||
```bash
|
||||
pip install --pre scraibe
|
||||
```
|
||||
|
||||
Get started with ScrAIbe today and experience seamless, automated transcription and diarization.
|
||||
|
||||
## Usage
|
||||
|
||||
We've developed ScrAIbe with several access points to cater to diverse user needs.
|
||||
|
||||
### Python Usage
|
||||
|
||||
Gain full control over the functionalities as well as process customization.
|
||||
|
||||
```python
|
||||
from scraibe import Scraibe
|
||||
|
||||
model = Scraibe()
|
||||
|
||||
text = model.autotranscribe("audio.wav")
|
||||
|
||||
print(f"Transcription: \n{text}")
|
||||
```
|
||||
|
||||
The `Scraibe` class ensures the models are properly loaded. You can customize the models with various keywords:
|
||||
|
||||
- **Whisper Models**: Use the `whisper_model` keyword to specify models like `tiny`, `base`, `small`, `medium`, or `large` (`large-v2`, `large-v3`) depending on your accuracy and speed needs.
|
||||
- **Pyannote Diarization Model**: Use the `dia_model` keyword to change the diarization model.
|
||||
- **WhisperX**: Set the `whisper_type` to `"whisperX"` for enhanced performance on CPU and use their enhanced models. (Model names are the same)
|
||||
- **Keyword Arguments**: A variety of different `kwargs` are available:
|
||||
- `use_auth_token`: Pass a Hugging Face token to the Pyannote backend if you want to use one of the models hosted on their Hugging Face.
|
||||
- `verbose`: Enable this to add an additional level of verbosity.
|
||||
|
||||
In general, you should be able to input any `kwargs` that you can input in the original Whisper (WhisperX) and Pyannote Python APIs.
|
||||
|
||||
As input, `autotranscribe` accepts every format compatible with [FFmpeg](https://ffmpeg.org/ffmpeg-formats.html). Examples include `.mp4`, `.mp3`, `.wav`, `.ogg`, `.flac`, and many more.
|
||||
|
||||
To further control the pipeline of `ScrAIbe`, you can pass almost any keyword argument that is accepted by `Whisper` or `Pyannote`. For more options, refer to the documentation of these frameworks, as their keywords are likely to work here as well.
|
||||
|
||||
Here are some examples regarding `diarization` (which relies on the `pyannote` pipeline):
|
||||
|
||||
- `num_speakers`: Number of speakers in the audio file
|
||||
- `min_speakers`: Minimum number of speakers in the audio file
|
||||
- `max_speakers`: Maximum number of speakers in the audio file
|
||||
|
||||
Then there are arguments for the transcription process, which uses the "Whisper" model:
|
||||
|
||||
- `language`: Specify the language ([list of supported languages](https://github.com/openai/whisper/blob/main/language-breakdown.svg))
|
||||
- `task`: Can be either `transcribe` or `translate`. If `translate` is selected, the transcribed audio will be translated to English.
|
||||
|
||||
For example:
|
||||
|
||||
```python
|
||||
text = model.autotranscribe("audio.wav", language="german", num_speakers = 2)
|
||||
```
|
||||
|
||||
`Scraibe` also contains the option to just do a transcription:
|
||||
|
||||
```python
|
||||
transcription = model.transcribe("audio.wav")
|
||||
```
|
||||
|
||||
or just do a diarization:
|
||||
|
||||
```python
|
||||
diarization = model.diarization("audio.wav")
|
||||
```
|
||||
|
||||
Start exploring the powerful features of ScrAIbe and customize it to fit your specific transcription and diarization needs!
|
||||
|
||||
### Command-line usage
|
||||
|
||||
In addition to the Python interface, you can also run ScrAIbe using the command-line interface:
|
||||
|
||||
```bash
|
||||
scraibe -f "audio.wav" --language "german" --num_speakers 2
|
||||
```
|
||||
|
||||
For the full list of options, run:
|
||||
|
||||
```bash
|
||||
scraibe -h
|
||||
```
|
||||
|
||||
This will display a comprehensive list of all command-line options, allowing you to tailor ScrAIbe’s functionality to your specific needs.
|
||||
|
||||
## Gradio App 🌐
|
||||
|
||||
The Gradio App is now part of ScrAIbe-WebUI! This user-friendly interface enables you to run the model without any coding knowledge. You can easily run the app in your browser and upload your audio files, or make the framework available on your network and run it on your local machine. 🚀
|
||||
|
||||
All functionalities previously available in the Gradio App are now part of the ScrAIbe-WebUI. For more information and detailed instructions, visit the [ScrAIbe-WebUI GitHub repository](https://github.com/JSchmie/ScrAIbe-WebUI).
|
||||
|
||||
## Docker Container 🐳
|
||||
|
||||
ScrAIbe's Docker containers have also moved to ScrAIbe-WebUI! This option is especially useful if you want to run the model on a server or if you would like to use the GPU without dealing with CUDA.
|
||||
|
||||
All Docker container functionalities are now part of ScrAIbe-WebUI. For more information and detailed instructions on how to use the Docker containers, please visit the [ScrAIbe-WebUI GitHub repository](https://github.com/JSchmie/ScrAIbe-WebUI).
|
||||
|
||||
---
|
||||
|
||||
With these changes, ScrAIbe focuses on its core functionalities while the enhanced Gradio App and related Docker containers are now part of ScrAIbe-WebUI. Enjoy a more streamlined and powerful transcription experience! 🎉
|
||||
|
||||
## Documentation 📚
|
||||
|
||||
For comprehensive guides, detailed instructions, and advanced usage tips, visit our [documentation page](https://jschmie.github.io/ScrAIbe/). Here, you will find everything you need to make the most out of ScrAIbe.
|
||||
|
||||
### Contributions 🤝
|
||||
|
||||
We warmly welcome contributions from the community! Whether you’re fixing bugs, adding new features, or improving documentation, your help is invaluable. Please see our [Contributing Guidelines](./CONTRIBUTING.md) for more information on how to get involved and make your mark on ScrAIbe-WebUI.
|
||||
|
||||
|
||||
### License 📜
|
||||
|
||||
ScrAIbe-WebUI is proudly open source and licensed under the GPL-3.0 license. This promotes a collaborative and transparent development process. For more details, see the [LICENSE](./LICENSE) file in this repository.
|
||||
|
||||
## Acknowledgments
|
||||
|
||||
Special thanks go to the [KIDA](https://www.kida-bmel.de/) project and the [BMEL (Bundesministerium für Ernährung und Landwirtschaft)](https://www.bmel.de/EN/Home/home_node.html), especially to the AI Consultancy Team.
|
||||
|
||||
---
|
||||
|
||||
Join us in making ScrAIbe even better! 🚀
|
||||
- SUMMARIZER_API_URL:
|
||||
- Required when using --task transcript_and_summarize.
|
||||
- Base URL of the summarization LLM (OpenAI-compatible /v1/chat/completions).
|
||||
- Example: http://llm:8080
|
||||
- SUMMARIZER_API_KEY:
|
||||
- Optional.
|
||||
- API key for the summarization LLM, if required.
|
||||
- SUMMARIZER_MODEL:
|
||||
- Optional (default: llama-3.1-8b-instruct; the Docker image presets qwen3-14b).
|
||||
- Model name used for summarization.
|
||||
|
||||
Web GUI and branding:
|
||||
|
||||
- WEBUI_TITLE:
|
||||
- Title shown in the web GUI (default: A.P.Strom Transcription).
|
||||
- WEBUI_LOGO_URL:
|
||||
- URL of the logo displayed in the web GUI header.
|
||||
- Example: https://your-domain.com/logo.png
|
||||
|
||||
Accent color (UI and emails):
|
||||
|
||||
- EMAIL_ACCENT_COLOR:
|
||||
- Accent color used in:
|
||||
- Web GUI buttons and accents
|
||||
- Email headings, links, and email addresses
|
||||
- Default: #7C6DA0
|
||||
|
||||
MCP-style HTTP API:
|
||||
|
||||
- MCP_SERVER_ENABLED:
|
||||
- Enable MCP-style HTTP API (default: false).
|
||||
- Values: true/false.
|
||||
- MCP_SERVER_HOST:
|
||||
- Bind address (default: 0.0.0.0).
|
||||
- MCP_SERVER_PORT:
|
||||
- Port (default: 8000).
|
||||
- MCP_USE_CELERY:
|
||||
- Use Celery for async transcription (default: true).
|
||||
- If false, transcription runs in-process.
|
||||
|
||||
Watch-folder mode:
|
||||
|
||||
- WATCH_ENABLED:
|
||||
- Enable watch-folder mode (default: false).
|
||||
- Values: true/false.
|
||||
- WATCH_DIR:
|
||||
- Directory to monitor for audio files (required if WATCH_ENABLED=true).
|
||||
- WATCH_EMAIL_TO:
|
||||
- Email address to send transcript and summary (required if WATCH_ENABLED=true).
|
||||
- WATCH_POLL_INTERVAL:
|
||||
- Seconds between scans (default: 10).
|
||||
- WATCH_DELETE_ON_SUCCESS:
|
||||
- Delete source file after successful processing (default: true).
|
||||
|
||||
Async processing (Celery + Redis):
|
||||
|
||||
- CELERY_BROKER_URL:
|
||||
- Redis broker URL (default: redis://localhost:6379/0).
|
||||
- CELERY_RESULT_BACKEND:
|
||||
- Redis backend URL (default: redis://localhost:6379/0).
|
||||
- SCRAIBE_UPLOAD_DIR:
|
||||
- Directory where uploaded audio is stored (default: /tmp/scraibe_uploads).
|
||||
|
||||
Email configuration:
|
||||
|
||||
- EMAIL_SMTP_HOST:
|
||||
- SMTP server host.
|
||||
- EMAIL_SMTP_PORT:
|
||||
- SMTP server port (e.g., 587).
|
||||
- EMAIL_SMTP_USER:
|
||||
- SMTP username.
|
||||
- EMAIL_SMTP_PASSWORD:
|
||||
- SMTP password.
|
||||
- EMAIL_SMTP_USE_TLS:
|
||||
- Use TLS (true/false; default: true).
|
||||
- EMAIL_FROM_ADDRESS:
|
||||
- Sender address (e.g., "ScrAIbe <transcribe@your-domain.com>").
|
||||
- EMAIL_CONTACT_ADDRESS:
|
||||
- Support contact address shown in email templates.
|
||||
- EMAIL_LOGO_URL:
|
||||
- URL of the logo used in emails (preferred).
|
||||
- EMAIL_LOGO_PATH:
|
||||
- Fallback local path for email logo (default: /app/src/misc/logo1.png).
|
||||
- EMAIL_CSS_PATH:
|
||||
- Path to the CSS used in emails (default: /app/src/misc/mail_style.css).
|
||||
|
||||
Email subject lines (customizable):
|
||||
|
||||
- EMAIL_SUBJECT_UPLOAD:
|
||||
- Subject for upload confirmation email.
|
||||
- Default: "ScrAIbe: Your transcription request has been received"
|
||||
- EMAIL_SUBJECT_SUCCESS:
|
||||
- Subject for transcript-ready email.
|
||||
- Default: "ScrAIbe: Your transcript is ready"
|
||||
- EMAIL_SUBJECT_ERROR:
|
||||
- Subject for error notification email.
|
||||
- Default: "ScrAIbe: Error with your transcription request"
|
||||
|
||||
Summary prompt customization:
|
||||
|
||||
- SUMMARY_PROMPT_CHUNK:
|
||||
- Override prompt used for each transcript chunk.
|
||||
- SUMMARY_PROMPT_COMBINED:
|
||||
- Override prompt used for the final combined summary.
|
||||
- SUMMARY_PROMPT_FILE:
|
||||
- Path to a file with prompts in sections:
|
||||
- [chunk]
|
||||
- [combined]
|
||||
|
||||
DOCX and cover pages:
|
||||
|
||||
- COVER_PAGE_ENABLED:
|
||||
- Add a cover page to transcript and summary DOCX files (default: false).
|
||||
- COVER_PAGE_ORGANIZATION:
|
||||
- Organization name shown on the cover page.
|
||||
- COVER_PAGE_TITLE_PREFIX:
|
||||
- Title prefix (e.g., "TRANSCRIPT" or "SUMMARY").
|
||||
- COVER_PAGE_LOGO_URL:
|
||||
- Logo URL to include on the cover page.
|
||||
- COVER_PAGE_LOGO_PATH:
|
||||
- Local logo path to include on the cover page.
|
||||
|
||||
Output files (async web GUI and watch-folder mode):
|
||||
|
||||
When a job completes, the user receives:
|
||||
|
||||
- Transcript:
|
||||
- .md file
|
||||
- .docx file (line-numbered, 30 lines per page, optional cover page)
|
||||
- Summary (if requested):
|
||||
- .md file
|
||||
- .docx file (markdown-aware styling, optional cover page)
|
||||
- JSON:
|
||||
- Structured transcript with diarization and metadata
|
||||
|
||||
All of these can also be overridden from the CLI when needed (e.g., --localai-api-url, --summarizer-api-url).
|
||||
|
||||
## Dependencies
|
||||
|
||||
Core runtime dependencies:
|
||||
|
||||
- Python 3.9+
|
||||
- httpx
|
||||
- numpy
|
||||
- tqdm
|
||||
- gradio
|
||||
- celery[redis]
|
||||
- redis
|
||||
- python-docx
|
||||
- fastapi
|
||||
- uvicorn
|
||||
- ffmpeg (for audio preprocessing)
|
||||
|
||||
No local Whisper, PyTorch, or Pyannote models are required.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome. Please refer to CONTRIBUTING.md for guidelines.
|
||||
|
||||
## License
|
||||
|
||||
This project is licensed under GPL-3.0. See LICENSE for details.
|
||||
|
||||
-256
@@ -1,256 +0,0 @@
|
||||
channels:
|
||||
- pytorch
|
||||
- defaults
|
||||
dependencies:
|
||||
- _libgcc_mutex=0.1=main
|
||||
- _openmp_mutex=5.1=1_gnu
|
||||
- blas=1.0=mkl
|
||||
- brotlipy=0.7.0=py39h27cfd23_1003
|
||||
- bzip2=1.0.8=h7b6447c_0
|
||||
- ca-certificates=2023.05.30=h06a4308_0
|
||||
- certifi=2023.5.7=py39h06a4308_0
|
||||
- cffi=1.15.1=py39h5eee18b_3
|
||||
- cryptography=39.0.1=py39h9ce1e76_2
|
||||
- cudatoolkit=11.3.1=h2bc3f7f_2
|
||||
- ffmpeg=4.2.2=h20bf706_0
|
||||
- flit-core=3.8.0=py39h06a4308_0
|
||||
- freetype=2.12.1=h4a9f257_0
|
||||
- giflib=5.2.1=h5eee18b_3
|
||||
- gmp=6.2.1=h295c915_3
|
||||
- gnutls=3.6.15=he1e5248_0
|
||||
- idna=3.4=py39h06a4308_0
|
||||
- intel-openmp=2021.4.0=h06a4308_3561
|
||||
- jpeg=9e=h5eee18b_1
|
||||
- lame=3.100=h7b6447c_0
|
||||
- lcms2=2.12=h3be6417_0
|
||||
- ld_impl_linux-64=2.38=h1181459_1
|
||||
- lerc=3.0=h295c915_0
|
||||
- libdeflate=1.17=h5eee18b_0
|
||||
- libffi=3.4.2=h6a678d5_6
|
||||
- libgcc-ng=11.2.0=h1234567_1
|
||||
- libgomp=11.2.0=h1234567_1
|
||||
- libidn2=2.3.2=h7f8727e_0
|
||||
- libopus=1.3.1=h7b6447c_0
|
||||
- libpng=1.6.39=h5eee18b_0
|
||||
- libstdcxx-ng=11.2.0=h1234567_1
|
||||
- libtasn1=4.16.0=h27cfd23_0
|
||||
- libtiff=4.5.0=h6a678d5_2
|
||||
- libunistring=0.9.10=h27cfd23_0
|
||||
- libuv=1.44.2=h5eee18b_0
|
||||
- libvpx=1.7.0=h439df22_0
|
||||
- libwebp=1.2.4=h11a3e52_1
|
||||
- libwebp-base=1.2.4=h5eee18b_1
|
||||
- lz4-c=1.9.4=h6a678d5_0
|
||||
- mkl=2021.4.0=h06a4308_640
|
||||
- mkl-service=2.4.0=py39h7f8727e_0
|
||||
- mkl_fft=1.3.1=py39hd3c417c_0
|
||||
- mkl_random=1.2.2=py39h51133e4_0
|
||||
- ncurses=6.4=h6a678d5_0
|
||||
- nettle=3.7.3=hbbd107a_1
|
||||
- numpy=1.23.5=py39h14f4228_0
|
||||
- numpy-base=1.23.5=py39h31eccc5_0
|
||||
- openh264=2.1.1=h4ff587b_0
|
||||
- openssl=3.0.9=h7f8727e_0
|
||||
- pillow=9.4.0=py39h6a678d5_0
|
||||
- pip=23.0.1=py39h06a4308_0
|
||||
- pycparser=2.21=pyhd3eb1b0_0
|
||||
- pyopenssl=23.0.0=py39h06a4308_0
|
||||
- pysocks=1.7.1=py39h06a4308_0
|
||||
- python=3.9.16=h955ad1f_3
|
||||
- pytorch=1.11.0=py3.9_cuda11.3_cudnn8.2.0_0
|
||||
- pytorch-mutex=1.0=cuda
|
||||
- readline=8.2=h5eee18b_0
|
||||
- requests=2.28.1=py39h06a4308_1
|
||||
- setuptools=65.6.3=py39h06a4308_0
|
||||
- six=1.16.0=pyhd3eb1b0_1
|
||||
- sqlite=3.41.2=h5eee18b_0
|
||||
- tk=8.6.12=h1ccaba5_0
|
||||
- torchaudio=0.11.0=py39_cu113
|
||||
- torchvision=0.12.0=py39_cu113
|
||||
- tzdata=2023c=h04d1e81_0
|
||||
- wheel=0.38.4=py39h06a4308_0
|
||||
- x264=1!157.20191217=h7b6447c_0
|
||||
- xz=5.4.2=h5eee18b_0
|
||||
- zlib=1.2.13=h5eee18b_0
|
||||
- zstd=1.5.4=hc292b87_0
|
||||
- pip:
|
||||
- absl-py==1.3.0
|
||||
- aiofiles==23.1.0
|
||||
- aiohttp==3.8.3
|
||||
- aiosignal==1.3.1
|
||||
- alembic==1.9.1
|
||||
- altair==5.0.1
|
||||
- annotated-types==0.5.0
|
||||
- ansi2html==1.8.0
|
||||
- antlr4-python3-runtime==4.9.3
|
||||
- anyio==3.7.1
|
||||
- appdirs==1.4.4
|
||||
- asteroid-filterbanks==0.4.0
|
||||
- async-timeout==4.0.2
|
||||
- attrs==22.2.0
|
||||
- audioread==3.0.0
|
||||
- autopage==0.5.1
|
||||
- backports-cached-property==1.0.2
|
||||
- cachetools==5.2.0
|
||||
- charset-normalizer==2.1.1
|
||||
- click==8.1.3
|
||||
- cliff==4.1.0
|
||||
- cmaes==0.9.0
|
||||
- cmake==3.26.4
|
||||
- cmd2==2.4.2
|
||||
- colorama==0.4.6
|
||||
- colorlog==6.7.0
|
||||
- commonmark==0.9.1
|
||||
- contourpy==1.0.6
|
||||
- cycler==0.11.0
|
||||
- dash==2.12.1
|
||||
- dash-core-components==2.0.0
|
||||
- dash-html-components==2.0.0
|
||||
- dash-table==5.0.0
|
||||
- decorator==4.4.2
|
||||
- docopt==0.6.2
|
||||
- einops==0.3.2
|
||||
- exceptiongroup==1.1.1
|
||||
- fastapi==0.100.0
|
||||
- ffmpeg-python==0.2.0
|
||||
- ffmpy==0.3.0
|
||||
- filelock==3.8.0
|
||||
- flask==2.2.5
|
||||
- fonttools==4.38.0
|
||||
- frozenlist==1.3.3
|
||||
- fsspec==2022.11.0
|
||||
- future==0.18.2
|
||||
- google-auth==2.15.0
|
||||
- google-auth-oauthlib==0.4.6
|
||||
- gradio==3.36.1
|
||||
- gradio-client==0.2.7
|
||||
- greenlet==2.0.1
|
||||
- grpcio==1.51.1
|
||||
- h11==0.14.0
|
||||
- hmmlearn==0.2.8
|
||||
- httpcore==0.17.3
|
||||
- httpx==0.24.1
|
||||
- huggingface-hub==0.16.4
|
||||
- humanize==4.7.0
|
||||
- hyperpyyaml==1.1.0
|
||||
- imageio==2.23.0
|
||||
- imageio-ffmpeg==0.4.7
|
||||
- importlib-metadata==4.13.0
|
||||
- importlib-resources==5.12.0
|
||||
- iniconfig==2.0.0
|
||||
- itsdangerous==2.1.2
|
||||
- jinja2==3.1.2
|
||||
- joblib==1.2.0
|
||||
- jsonschema==4.18.0
|
||||
- jsonschema-specifications==2023.6.1
|
||||
- julius==0.2.7
|
||||
- kiwisolver==1.4.4
|
||||
- librosa==0.9.2
|
||||
- linkify-it-py==2.0.2
|
||||
- lit==16.0.5.post0
|
||||
- llvmlite==0.39.1
|
||||
- mako==1.2.4
|
||||
- markdown==3.4.1
|
||||
- markdown-it-py==2.2.0
|
||||
- markupsafe==2.1.1
|
||||
- matplotlib==3.7.1
|
||||
- mdit-py-plugins==0.3.3
|
||||
- mdurl==0.1.2
|
||||
- more-itertools==9.0.0
|
||||
- moviepy==1.0.3
|
||||
- mpmath==1.2.1
|
||||
- multidict==6.0.4
|
||||
- nest-asyncio==1.5.7
|
||||
- networkx==2.8.8
|
||||
- numba==0.56.4
|
||||
- oauthlib==3.2.2
|
||||
- omegaconf==2.3.0
|
||||
- openai-whisper==20230314
|
||||
- optuna==3.0.5
|
||||
- orjson==3.9.2
|
||||
- packaging==21.3
|
||||
- pandas==1.5.2
|
||||
- pbr==5.11.0
|
||||
- plotly==5.15.0
|
||||
- pluggy==1.0.0
|
||||
- pooch==1.6.0
|
||||
- prettytable==3.5.0
|
||||
- primepy==1.3
|
||||
- proglog==0.1.10
|
||||
- protobuf==3.20.1
|
||||
- pyannote-audio==2.1.1
|
||||
- pyannote-core==4.5
|
||||
- pyannote-database==4.1.3
|
||||
- pyannote-metrics==3.2.1
|
||||
- pyannote-pipeline==2.3
|
||||
- pyasn1==0.4.8
|
||||
- pyasn1-modules==0.2.8
|
||||
- pydantic==2.0.2
|
||||
- pydantic-core==2.1.2
|
||||
- pydeprecate==0.3.2
|
||||
- pydub==0.25.1
|
||||
- pygments==2.13.0
|
||||
- pyparsing==3.0.9
|
||||
- pyperclip==1.8.2
|
||||
- pytest==7.3.1
|
||||
- python-dateutil==2.8.2
|
||||
- python-multipart==0.0.6
|
||||
- pytorch-lightning==1.6.5
|
||||
- pytorch-metric-learning==1.6.3
|
||||
- pytz==2022.7
|
||||
- pyyaml==6.0
|
||||
- qtfaststart==1.8
|
||||
- referencing==0.29.1
|
||||
- regex==2022.10.31
|
||||
- requests-oauthlib==1.3.1
|
||||
- resampy==0.4.2
|
||||
- retrying==1.3.4
|
||||
- rich==12.6.0
|
||||
- rpds-py==0.8.10
|
||||
- rsa==4.9
|
||||
- ruamel-yaml==0.17.21
|
||||
- ruamel-yaml-clib==0.2.7
|
||||
- ruff==0.0.272
|
||||
- scikit-learn==1.2.0
|
||||
- scipy==1.8.1
|
||||
- semantic-version==2.10.0
|
||||
- semver==2.13.0
|
||||
- sentencepiece==0.1.97
|
||||
- setuptools-rust==1.5.2
|
||||
- shellingham==1.5.0
|
||||
- simplejson==3.18.0
|
||||
- singledispatchmethod==1.0
|
||||
- sniffio==1.3.0
|
||||
- sortedcontainers==2.4.0
|
||||
- soundfile==0.10.3.post1
|
||||
- speechbrain==0.5.14
|
||||
- sqlalchemy==1.4.45
|
||||
- starlette==0.27.0
|
||||
- stevedore==4.1.1
|
||||
- sympy==1.11.1
|
||||
- tabulate==0.9.0
|
||||
- tenacity==8.2.2
|
||||
- tensorboard==2.11.0
|
||||
- tensorboard-data-server==0.6.1
|
||||
- tensorboard-plugin-wit==1.8.1
|
||||
- threadpoolctl==3.1.0
|
||||
- tiktoken==0.3.1
|
||||
- tokenizers==0.13.2
|
||||
- tomli==2.0.1
|
||||
- toolz==0.12.0
|
||||
- torch-audiomentations==0.11.0
|
||||
- torch-pitch-shift==1.2.2
|
||||
- torchmetrics==0.11.0
|
||||
- tqdm==4.64.1
|
||||
- transformers==4.24.0
|
||||
- triton==2.0.0
|
||||
- typer==0.7.0
|
||||
- typing-extensions==4.7.1
|
||||
- uc-micro-py==1.0.2
|
||||
- urllib3==1.26.12
|
||||
- uvicorn==0.22.0
|
||||
- wcwidth==0.2.5
|
||||
- websockets==11.0.3
|
||||
- werkzeug==2.2.2
|
||||
- yarl==1.8.2
|
||||
- zipp==3.11.0
|
||||
@@ -0,0 +1,101 @@
|
||||
## Custom configuration for A.P.Strom Transcription (LocalAI-backed)
|
||||
## Lines that start with ## are explanatory comments; lines that start with a single # are disabled (commented-out) options.
|
||||
|
||||
interface_type: async # async or simple (one does transcriptions, requires Email setup)
|
||||
|
||||
launch:
|
||||
## Gradio launch options (if using WebUI)
|
||||
# server_port: null
|
||||
# server_name: null # network address to bind (e.g. "0.0.0.0"), not a display name
|
||||
# inline: false
|
||||
# inbrowser: false
|
||||
# share: false
|
||||
# debug: false
|
||||
max_threads: 18
|
||||
# quiet: false
|
||||
# auth: null # tuple of username and password
|
||||
# auth_message: null
|
||||
# prevent_thread_lock: false
|
||||
# show_error: false
|
||||
# height: 500
|
||||
# width: 100%
|
||||
favicon_path: /app/src/misc/logo.png
|
||||
# ssl_keyfile: null
|
||||
# ssl_certfile: null
|
||||
# ssl_keyfile_password: null
|
||||
# ssl_verify: false
|
||||
# show_api: false
|
||||
# allowed_paths:
|
||||
# blocked_paths: null
|
||||
# root_path: null
|
||||
# app_kwargs: null
|
||||
# state_session_capacity: 10000
|
||||
# share_server_address: null
|
||||
# share_server_protocol: null
|
||||
# max_file_size: null
|
||||
# enable_monitoring: null
|
||||
|
||||
queue:
|
||||
## Queue configuration
|
||||
# status_update_rate: 'auto'
|
||||
# api_open: null
|
||||
max_size: 10
|
||||
# default_concurrency_limit:
|
||||
|
||||
layout:
|
||||
show_settings: false
|
||||
header: /app/src/misc/header.html
|
||||
header_format_options:
|
||||
header_logo_url: https://apstrom.ca/
|
||||
header_logo_src: /app/src/misc/logo.png
|
||||
footer: /app/src/misc/footer.html
|
||||
footer_format_options:
|
||||
# footer_css_path: /app/src/misc/footer_style.css
|
||||
footer_scraibe_webui_version: "0.1.1-dev"
|
||||
|
||||
scraibe_params:
|
||||
## LocalAI (transcription + diarization)
|
||||
localai_api_url: http://localhost:8080
|
||||
localai_api_key: ""
|
||||
localai_model: vibevoice-cpp-asr
|
||||
|
||||
## Summarizer LLM (for transcript_and_summarize)
|
||||
summarizer_api_url: http://localhost:8080
|
||||
summarizer_api_key: ""
|
||||
summarizer_model: qwen3-14b
|
||||
|
||||
## Legacy Whisper/Pyannote fields (ignored by LocalAI client; kept for compatibility)
|
||||
whisper_model: large-v3
|
||||
whisper_type: whisper
|
||||
dia_model: null
|
||||
use_auth_token: null
|
||||
device: cpu
|
||||
num_threads: 18
|
||||
|
||||
mail:
|
||||
sender_email: scribe@apstrom.ca
|
||||
smtp_server: mail.apstrom.ca
|
||||
smtp_port: 587
|
||||
sender_password: ""
|
||||
connection_type: TLS # 'SSL', 'TLS', or 'PLAIN'
|
||||
context: default
|
||||
default_subject: "A.P.Strom audio transcription"
|
||||
error_template: /app/src/misc/error_notification_template.html
|
||||
error_subject: "error"
|
||||
error_format_options:
|
||||
## exception is mandatory for your html; will be set to the related exception in the Code
|
||||
contact_email: support@apstrom.ca
|
||||
success_template: /app/src/misc/success_template.html
|
||||
success_subject: "ready"
|
||||
success_format_options:
|
||||
contact_email: info@apstrom.ca
|
||||
upload_notification_template: /app/src/misc/upload_notification_template.html
|
||||
upload_subject: "upload successful"
|
||||
upload_notification_format_options:
|
||||
queue_position: null
|
||||
contact_email: info@apstrom.ca
|
||||
# mail_css_path: /app/src/misc/mail_style.css
|
||||
|
||||
advanced:
|
||||
keep_model_alive: false # for sync interface only; keeps the model alive during a session
|
||||
concurrent_workers_async: 2 # number of concurrent working threads in the async interface
|
||||
@@ -0,0 +1,31 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Error Notification</title>
|
||||
<style>
|
||||
{email_css}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1 style="color:{accent_color};">Error Notification</h1>
|
||||
<p>Dear user,</p>
|
||||
<p>An error occurred while processing your audio file. This means that something went wrong during the processing of your file, and it could not be completed successfully.</p>
|
||||
<p class="error-message">Error details: {exception}</p>
|
||||
<p>Please check the file and try again. If the problem persists, our support team is here to help.</p>
|
||||
<div class="contact">
|
||||
<p>You can contact our support team at <a href="mailto:{contact_email}" style="color:{accent_color};">{contact_email}</a>. They are available to assist with any questions or issues you may have.</p>
|
||||
</div>
|
||||
<div class="disclaimer">
|
||||
<p>Please note that our support team does not have the ability to fix processing errors directly or access the files you have uploaded. They can provide guidance and help troubleshoot any issues you may encounter.</p>
|
||||
</div>
|
||||
<div class="signature">
|
||||
<p>Thank you for using our transcription service!</p>
|
||||
<p>A.P.Strom</p>
|
||||
</div>
|
||||
{email_logo}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,119 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<title>Footer</title>
|
||||
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/css/all.min.css">
|
||||
|
||||
<style>
|
||||
/* Styles from footer_style.css */
|
||||
|
||||
/* Resetting margins and paddings */
|
||||
html, body {{
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
}}
|
||||
body {{
|
||||
font-family: Arial, sans-serif;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between; /* Ensures footer stays at the bottom */
|
||||
}}
|
||||
.footer {{
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 20px;
|
||||
/* Removed background-color to inherit from parent */
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
width: 100%; /* Ensure footer is full width */
|
||||
box-sizing: border-box; /* Padding is included in the width */
|
||||
}}
|
||||
.footer > div:first-child {{
|
||||
margin-left: -20px;
|
||||
padding-left: 0; /* Adjust if necessary */
|
||||
}}
|
||||
.footer div, .footer a {{
|
||||
margin: 0 5px;
|
||||
}}
|
||||
.footer div {{
|
||||
text-align: left;
|
||||
}}
|
||||
.footer a {{
|
||||
color: {accent_color};
|
||||
transition: color 0.3s ease;
|
||||
}}
|
||||
.footer a:hover {{
|
||||
color: #50AF31;
|
||||
}}
|
||||
.foot-text {{
|
||||
text-align: center;
|
||||
width: 80%;
|
||||
margin-bottom: 15px;
|
||||
font-size: 14px;
|
||||
color: #333;
|
||||
}}
|
||||
.brand-section {{
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
text-align: center;
|
||||
}}
|
||||
.brand-icon a {{
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
width: 50px;
|
||||
height: 50px;
|
||||
border-radius: 50%;
|
||||
background-color: transparent; /* Ensure transparency */
|
||||
text-decoration: none !important;
|
||||
color: white;
|
||||
transition: background-color 0.3s ease, transform 0.3s ease;
|
||||
}}
|
||||
.brand-icon i {{
|
||||
font-size: 24px;
|
||||
}}
|
||||
.brand-icon a:hover, .brand-icon a:focus {{
|
||||
background-color: {accent_color};
|
||||
transform: scale(1.1);
|
||||
text-decoration: none;
|
||||
}}
|
||||
.brand-icon a, .brand-icon a:hover, .brand-icon a:active, .brand-icon a:visited {{
|
||||
text-decoration: none;
|
||||
}}
|
||||
.build-version {{
|
||||
margin-top: 8px;
|
||||
color: white; /* Adjust as needed */
|
||||
font-size: 12px;
|
||||
}}
|
||||
/* Removed dark mode media query to let Gradio handle theming */
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="footer">
|
||||
<div class="foot-text">
|
||||
<h2 style="font-weight: bold; color: {accent_color};">Disclaimer</h2>
|
||||
<p>The transcription completed by this application may contain errors.</p>
|
||||
<p>Users must take care to review transcripts before circulating to ensure that they are error-free and complete.</p>
|
||||
<p>The transcripts produced by this application do not replace a court reporter's transcription. The transcripts completed by this application are for the user's convenience only.</p>
|
||||
<h2 style="font-weight: bold; color: {accent_color};">Data retention</h2>
|
||||
<p>Audio or video files uploaded to this application are only retained for the time that it takes to complete the transcription. All transcripts are deleted after they are transmitted to the user.</p>
|
||||
</div>
|
||||
<div class="brand-section">
|
||||
<div class="brand-icon">
|
||||
<a href="https://apstrom.ca" aria-label="A.P.Strom">
|
||||
<i class="fas fa-globe"></i>
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
<div class="build-version">Build-Version: {footer_scraibe_webui_version}</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,100 @@
|
||||
/* footer_style.css */
|
||||
|
||||
/* Resetting margins and paddings */
|
||||
html, body {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
}
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
justify-content: space-between; /* Ensures footer stays at the bottom */
|
||||
}
|
||||
.footer {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 20px; /* Adjusted for demonstration */
|
||||
background-color: #F9FAFB; /* Fixed background color */
|
||||
font-size: 16px;
|
||||
color: #333;
|
||||
width: 100%; /* Ensure footer is full width */
|
||||
box-sizing: border-box; /* Padding is included in the width */
|
||||
}
|
||||
.footer > div:first-child {
|
||||
margin-left: -20px;
|
||||
padding-left: 0; /* Reducing or eliminating left padding if it's causing the shift */
|
||||
}
|
||||
.footer div, .footer a {
|
||||
margin: 0 5px;
|
||||
}
|
||||
.footer div {
|
||||
text-align: left;
|
||||
}
|
||||
.footer a {
|
||||
color: #333;
|
||||
transition: color 0.3s ease;
|
||||
}
|
||||
.footer a:hover {
|
||||
color: #50AF31;
|
||||
}
|
||||
.github-section {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
text-align: center;
|
||||
}
|
||||
/* Circular icon link; the background stays clear until hover/focus. */
.github-icon a {
    display: flex;
    align-items: center;
    justify-content: center;
    width: 50px;
    height: 50px;
    border-radius: 50%;
    /* "none" is not a valid colour value and the declaration was being dropped;
       "transparent" states the intended initial background explicitly. */
    background-color: transparent;
    text-decoration: none !important; /* Removes underline */
    color: white;
    transition: background-color 0.3s ease, transform 0.3s ease;
}
|
||||
.github-icon i {
|
||||
font-size: 24px;
|
||||
}
|
||||
.github-icon a:hover, .github-icon a:focus {
|
||||
background-color: #50AF31;
|
||||
transform: scale(1.1);
|
||||
text-decoration: none; /* Removes underline */
|
||||
}
|
||||
.github-icon a, .github-icon a:hover, .github-icon a:active, .github-icon a:visited {
|
||||
text-decoration: none;
|
||||
}
|
||||
.build-version {
|
||||
margin-top: 8px; /* Adjust spacing between the icon and the text as needed */
|
||||
color: white; /* Adjust text color as needed */
|
||||
font-size: 12px; /* Adjust font size as needed */
|
||||
}
|
||||
|
||||
/* Dark mode styles */
|
||||
@media (prefers-color-scheme: dark) {
|
||||
body {
|
||||
background-color: #121212;
|
||||
color: #FFFFFF;
|
||||
}
|
||||
.footer {
|
||||
background-color: transparent; /* Make footer background transparent */
|
||||
color: #FFFFFF;
|
||||
}
|
||||
.footer a {
|
||||
color: #FFFFFF;
|
||||
}
|
||||
.footer a:hover {
|
||||
color: #50AF31;
|
||||
}
|
||||
.build-version {
|
||||
color: #CCCCCC; /* Adjust text color for better contrast in dark mode */
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>{webui_title}</title>
|
||||
|
||||
<!-- Importing Cormorant Garamond font from Google Fonts -->
|
||||
<link href="https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;700&display=swap" rel="stylesheet">
|
||||
|
||||
<style>
|
||||
.header-wrapper {{
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
padding: 20px 20px 0;
|
||||
box-sizing: border-box;
|
||||
}}
|
||||
|
||||
.logo-container {{
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
margin-bottom: 10px;
|
||||
}}
|
||||
|
||||
.logo {{
|
||||
width: 75px;
|
||||
height: auto;
|
||||
display: block;
|
||||
}}
|
||||
|
||||
.header-title {{
|
||||
font-family: 'Cormorant Garamond', serif;
|
||||
font-size: 45px;
|
||||
font-weight: bold;
|
||||
color: {accent_color};
|
||||
margin: 0;
|
||||
position: relative;
|
||||
padding: 0.4em 0;
|
||||
text-align: center;
|
||||
max-width: 90%;
|
||||
}}
|
||||
|
||||
.header-title::before,
|
||||
.header-title::after {{
|
||||
content: "";
|
||||
position: absolute;
|
||||
height: 2px;
|
||||
width: 80%;
|
||||
background-color: {accent_color};
|
||||
left: 10%;
|
||||
}}
|
||||
|
||||
.header-title::before {{
|
||||
top: 0.4em;
|
||||
}}
|
||||
|
||||
.header-title::after {{
|
||||
bottom: 0.4em;
|
||||
}}
|
||||
|
||||
.header-description {{
|
||||
text-align: center;
|
||||
padding: 10px 40px 20px;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
}}
|
||||
|
||||
.header-description p,
|
||||
.header-description h2 {{
|
||||
font-size: 15px;
|
||||
margin: 8px 0;
|
||||
line-height: 1.5;
|
||||
}}
|
||||
|
||||
.header-description h2 {{
|
||||
font-weight: bold;
|
||||
color: {accent_color};
|
||||
}}
|
||||
|
||||
@media (max-width: 768px) {{
|
||||
.header-title {{
|
||||
font-size: 31px;
|
||||
}}
|
||||
|
||||
.header-title::before,
|
||||
.header-title::after {{
|
||||
width: 80%;
|
||||
left: 10%;
|
||||
}}
|
||||
|
||||
.logo {{
|
||||
width: 50px;
|
||||
}}
|
||||
|
||||
.header-description {{
|
||||
padding: 10px 20px 15px;
|
||||
}}
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header-wrapper">
|
||||
<div class="logo-container">
|
||||
<a href="{header_logo_url}">
|
||||
<img src="{header_logo_src}" alt="{webui_title}" class="logo">
|
||||
</a>
|
||||
</div>
|
||||
<h1 class="header-title">{webui_title}</h1>
|
||||
</div>
|
||||
<div class="header-description">
|
||||
<p>
|
||||
Upload, record, or provide a video with audio for transcription. Our toolkit is designed to transcribe content from multiple languages accurately. The integrated speaker diarisation feature identifies different speakers, ensuring a smooth transcription experience. For optimal results, indicate the number of speakers and the original language of the content.
|
||||
</p>
|
||||
<h2>Start your transcription below.</h2>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,58 @@
|
||||
/* header_style.css */
|
||||
|
||||
/* Importing Cormorant Garamond font from Google Fonts */
|
||||
@import url('https://fonts.googleapis.com/css2?family=Cormorant+Garamond:wght@400;700&display=swap');
|
||||
|
||||
.header-container {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
position: relative;
|
||||
padding-top: 30px;
|
||||
}
|
||||
|
||||
.logo-container {
|
||||
position: absolute;
|
||||
top: 50%;
|
||||
right: 20px;
|
||||
transform: translateY(-50%);
|
||||
width: 300px;
|
||||
}
|
||||
|
||||
.logo {
|
||||
width: 100%;
|
||||
height: auto;
|
||||
}
|
||||
|
||||
h1 {
|
||||
font-family: 'Cormorant Garamond', serif;
|
||||
font-size: 50px !important; /* Increased font size */
|
||||
font-weight: bold;
|
||||
color: #50AF31;
|
||||
margin: 0;
|
||||
position: relative;
|
||||
padding: 0.5em 0;
|
||||
}
|
||||
|
||||
h1::before, h1::after {
|
||||
content: "";
|
||||
position: absolute;
|
||||
height: 2px;
|
||||
width: 80%;
|
||||
background-color: #50AF31;
|
||||
left: 10%;
|
||||
}
|
||||
|
||||
h1::before {
|
||||
top: 0.5em;
|
||||
}
|
||||
|
||||
h1::after {
|
||||
bottom: 0.5em;
|
||||
}
|
||||
|
||||
p, h2 {
|
||||
font-size: 16px;
|
||||
margin: 10px 0;
|
||||
line-height: 1.4;
|
||||
}
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 152 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 111 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 111 KiB |
@@ -0,0 +1,61 @@
|
||||
body {
|
||||
font-family: Arial, sans-serif;
|
||||
line-height: 1.5;
|
||||
background-color: #ffffff;
|
||||
color: #333;
|
||||
margin: 0;
|
||||
padding: 20px;
|
||||
}
|
||||
.container {
|
||||
width: 100%;
|
||||
max-width: 600px;
|
||||
margin: 0 auto;
|
||||
padding: 20px;
|
||||
border: 1px solid #ddd;
|
||||
border-radius: 5px;
|
||||
}
|
||||
h1, h2, h3 {
|
||||
font-size: 1.5em;
|
||||
margin-top: 0;
|
||||
color: {accent_color};
|
||||
}
|
||||
p {
|
||||
margin: 10px 0;
|
||||
font-size: 1em;
|
||||
}
|
||||
.error-message, .success-message {
|
||||
padding: 10px 0;
|
||||
margin-bottom: 15px;
|
||||
font-size: 1em;
|
||||
}
|
||||
.error-message {
|
||||
color: #721c24;
|
||||
}
|
||||
.success-message {
|
||||
color: #155724;
|
||||
}
|
||||
.contact {
|
||||
margin-top: 15px;
|
||||
font-size: 0.9em;
|
||||
color: #555;
|
||||
}
|
||||
.contact a {
|
||||
color: {accent_color};
|
||||
text-decoration: none;
|
||||
}
|
||||
.contact a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
a {
|
||||
color: {accent_color};
|
||||
}
|
||||
.disclaimer {
|
||||
margin-top: 20px;
|
||||
font-size: 0.8em;
|
||||
color: #777;
|
||||
}
|
||||
.signature {
|
||||
margin-top: 20px;
|
||||
font-size: 0.8em;
|
||||
color: #555;
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Transcript Ready</title>
|
||||
<style>
|
||||
{email_css}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1 style="color:{accent_color};">Transcript Ready</h1>
|
||||
<p>Dear user,</p>
|
||||
<p>Your file has been successfully processed, and the transcript is now ready. The transcript of your audio or video file is attached to this email.</p>
|
||||
<p>We hope you find the transcript useful. If you have any questions or need further assistance, please do not hesitate to contact our support team.</p>
|
||||
<div class="contact">
|
||||
<p>You can reach our support team at <a href="mailto:{contact_email}" style="color:{accent_color};">{contact_email}</a>. They are available to help with any questions or issues you may have.</p>
|
||||
</div>
|
||||
<div class="disclaimer">
|
||||
<p>Please note that our support team cannot modify the content of the transcript. They can assist with any other questions or concerns you may have.</p>
|
||||
</div>
|
||||
<div class="signature">
|
||||
<p>Thank you for using our transcription service!</p>
|
||||
<p>A.P.Strom</p>
|
||||
</div>
|
||||
{email_logo}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
@@ -0,0 +1,30 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Upload Successful</title>
|
||||
<style>
|
||||
{email_css}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<h1 style="color:{accent_color};">Upload Successful</h1>
|
||||
<p>Dear user,</p>
|
||||
<p>Your file has been successfully uploaded and is now in our processing queue. This means that our system has received your file, and it is waiting to be processed. We will handle your file as soon as possible.</p>
|
||||
<p>We will notify you once your file has been processed. If you have any urgent needs or further questions, feel free to reach out to our support team.</p>
|
||||
<div class="contact">
|
||||
<p>You can contact our support team at <a href="mailto:{contact_email}" style="color:{accent_color};">{contact_email}</a>. Please note that our support team is here to help with any questions or issues you might have.</p>
|
||||
</div>
|
||||
<div class="disclaimer">
|
||||
<p>Please note that our support team does not have the ability to change your position in the queue or access the files you have uploaded. They are here to provide assistance and answer any questions you might have about the process.</p>
|
||||
</div>
|
||||
<div class="signature">
|
||||
<p>Thank you for using our transcription service!</p>
|
||||
<p>A.P.Strom</p>
|
||||
</div>
|
||||
{email_logo}
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
+31
-24
@@ -5,38 +5,42 @@ build-backend = "poetry_dynamic_versioning.backend"
|
||||
[tool.poetry]
|
||||
name = "scraibe"
|
||||
version = "0.0.0"
|
||||
description = "Transcription tool for audio files based on Whisper and Pyannote"
|
||||
description = "LocalAI-backed transcription and diarization client using vibevoice.cpp"
|
||||
authors = ["Schmieder, Jacob <jacob.schmieder@dbfz.de>"]
|
||||
license = "GPL-3.0-or-later"
|
||||
readme = ["README.md", "LICENSE"]
|
||||
repository = "https://github.com/JSchmie/ScrAIbe"
|
||||
documentation = "https://jschmie.github.io/ScrAIbe/"
|
||||
keywords = ["transcription", "audio", "whisper", "pyannote", "speech-to-text", "speech-recognition"]
|
||||
keywords = [
|
||||
"transcription",
|
||||
"audio",
|
||||
"diarization",
|
||||
"localai",
|
||||
"vibevoice",
|
||||
"speech-to-text",
|
||||
]
|
||||
classifiers = [
|
||||
'Development Status :: 4 - Beta',
|
||||
'Intended Audience :: Developers',
|
||||
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Programming Language :: Python :: 3.9',
|
||||
'Programming Language :: Python :: 3.10',
|
||||
'Programming Language :: Python :: 3.11',
|
||||
'Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1',
|
||||
'Topic :: Scientific/Engineering :: Artificial Intelligence'
|
||||
]
|
||||
"Development Status :: 4 - Beta",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Programming Language :: Python :: 3.11",
|
||||
"Programming Language :: Python :: 3.12",
|
||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||
]
|
||||
packages = [{include = "scraibe"}]
|
||||
exclude =[
|
||||
"__pycache__",
|
||||
"*.pyc",
|
||||
"test"
|
||||
]
|
||||
exclude = [
|
||||
"__pycache__",
|
||||
"*.pyc",
|
||||
"test",
|
||||
]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.9"
|
||||
tqdm = "^4.66.4"
|
||||
tqdm = "^4.66.5"
|
||||
numpy = "^1.26.4"
|
||||
openai-whisper = ">=20231117,<20240931"
|
||||
whisperx = "^3.1.3"
|
||||
"pyannote.audio" = "^3.1.1"
|
||||
torch = "^2.3.0"
|
||||
httpx = ">=0.28.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
pytest = "^8.1.1"
|
||||
@@ -57,7 +61,7 @@ format-jinja = """
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
sphinx = "^7.3.7"
|
||||
sphinx-rtd-theme = "^2.0.0"
|
||||
sphinx-rtd-theme = ">=2,<4"
|
||||
markdown-it-py = {version = "~3.0.0", extras = ["plugins"]}
|
||||
myst-parser = "^3.0.1"
|
||||
mdit-py-plugins = "^0.4.1"
|
||||
@@ -68,6 +72,9 @@ scraibe = "scraibe.cli:cli"
|
||||
[tool.poetry.extras]
|
||||
app = ["scraibe-webui"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 58
|
||||
|
||||
[tool.ruff.lint.extend-per-file-ignores]
|
||||
"__init__.py" = ["E402","F403",'F401']
|
||||
"__init__.py" = ["E402", "F403", "F401"]
|
||||
"scraibe/misc.py" = ["E722"]
|
||||
|
||||
+7
-13
@@ -1,14 +1,8 @@
|
||||
tqdm>=4.65.0
|
||||
tqdm>=4.66.5
|
||||
numpy>=1.26.4
|
||||
|
||||
openai-whisper==20231117
|
||||
whisperx~=3.1.3
|
||||
|
||||
pyannote.audio~=3.1.1
|
||||
pyannote.core~=5.0.0
|
||||
pyannote.database~=5.0.1
|
||||
pyannote.metrics~=3.2.1
|
||||
pyannote.pipeline~=3.0.1
|
||||
|
||||
torch>=2.0.0
|
||||
|
||||
httpx>=0.28.0
|
||||
gradio>=5.0.0
|
||||
PyYAML>=6.0
|
||||
celery[redis]>=5.3.0
|
||||
redis>=5.0.0
|
||||
python-docx>=1.1.0
|
||||
|
||||
+6
-9
@@ -1,11 +1,8 @@
|
||||
from .autotranscript import *
|
||||
from .transcriber import *
|
||||
from .audio import *
|
||||
from .transcript_exporter import *
|
||||
from .diarisation import *
|
||||
|
||||
from .misc import *
|
||||
|
||||
from .cli import *
|
||||
from .autotranscript import Scraibe
|
||||
from .localai_client import LocalAIClient, LocalAIError
|
||||
from .summarizer import SummarizerClient, SummarizerError
|
||||
from .audio import AudioProcessor
|
||||
from .transcript_exporter import Transcript
|
||||
from .misc import set_threads, ParseKwargs
|
||||
|
||||
from ._version import __version__
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
"""
|
||||
Entrypoint for running ScrAIbe as a module:
|
||||
|
||||
python -m scraibe
|
||||
|
||||
Always launches the Web GUI (Gradio).
|
||||
Optionally launches:
|
||||
- MCP-style API server
|
||||
- Watch-folder mode
|
||||
"""
|
||||
|
||||
import os
|
||||
import threading
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("scraibe.__main__")
|
||||
|
||||
from .webui import create_app
|
||||
|
||||
|
||||
def _run_mcp_server():
    """
    Serve the MCP API app with uvicorn (blocking call).

    Intended to be used as a thread target. The bind address and port are
    read from the MCP_SERVER_HOST and MCP_SERVER_PORT environment variables,
    falling back to "0.0.0.0" and 8000 when they are not set.
    """
    # Imported lazily so uvicorn and the MCP module are only loaded when
    # this function is actually invoked.
    import uvicorn
    from . import mcp_server

    bind_host = os.getenv("MCP_SERVER_HOST", "0.0.0.0")
    bind_port = int(os.getenv("MCP_SERVER_PORT", "8000"))

    uvicorn.run(mcp_server.app, host=bind_host, port=bind_port, log_level="info")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # The MCP server is opt-in: it is started only when MCP_SERVER_ENABLED
    # is "true", "1" or "yes" (case-insensitive, surrounding whitespace ignored).
    mcp_flag = os.getenv("MCP_SERVER_ENABLED", "false").strip().lower()
    if mcp_flag in ("true", "1", "yes"):
        try:
            # daemon=True: the server thread does not keep the process alive
            # once the main thread (the WebUI) exits.
            threading.Thread(target=_run_mcp_server, daemon=True).start()
            logger.info("MCP server started in background.")
        except Exception as e:
            logger.warning("Failed to start MCP server (WebUI will continue): %s", e)

    # The watcher is imported and started on every run; a failure here is
    # logged and does not prevent the WebUI from starting.
    # NOTE(review): presumably start_watcher() decides internally whether
    # watch-folder mode is enabled and returns without blocking -- confirm.
    try:
        from .watcher import start_watcher

        start_watcher()
        logger.info("Watch-folder mode started.")
    except Exception as e:
        logger.warning("Failed to start watch-folder mode (WebUI will continue): %s", e)

    # The WebUI (Gradio) is always started, regardless of the steps above.
    create_app()
|
||||
+146
-76
@@ -2,73 +2,49 @@
|
||||
Audio Processor Module
|
||||
=======================
|
||||
|
||||
This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
|
||||
It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
|
||||
flexible audio processing.
|
||||
Simplified audio processor for ScrAIbe.
|
||||
|
||||
Available Classes:
|
||||
- AudioProcessor: Processes audio waveforms and provides methods for loading,
|
||||
cutting, and handling audio.
|
||||
Previously this used torch and pyannote-style processing. In the LocalAI-backed
|
||||
version, we primarily pass files to the API, but we keep a lightweight helper
|
||||
for backward compatibility.
|
||||
|
||||
Usage:
|
||||
from .audio_import AudioProcessor
|
||||
|
||||
processor = AudioProcessor.from_file("path/to/audiofile.wav")
|
||||
cut_waveform = processor.cut(start=1.0, end=5.0)
|
||||
|
||||
Constants:
|
||||
- SAMPLE_RATE (int): Default sample rate for processing.
|
||||
- NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
|
||||
Now also includes utilities for chunking long audio into smaller segments
|
||||
to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from subprocess import CalledProcessError, run
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
SAMPLE_RATE = 16000
|
||||
NORMALIZATION_FACTOR = 32768.0
|
||||
DEFAULT_CHUNK_DURATION = 180.0 # seconds
|
||||
DEFAULT_CHUNK_OVERLAP = 2.0 # seconds
|
||||
|
||||
|
||||
class AudioProcessor:
|
||||
"""
|
||||
Audio Processor class that leverages PyTorchaudio to provide functionalities
|
||||
for loading, cutting, and handling audio waveforms.
|
||||
Lightweight audio processor for loading and cutting audio.
|
||||
|
||||
Attributes:
|
||||
waveform: torch.Tensor
|
||||
The audio waveform tensor.
|
||||
sr: int
|
||||
The sample rate of the audio.
|
||||
waveform (np.ndarray): The audio waveform as float32.
|
||||
sr (int): The sample rate of the audio.
|
||||
"""
|
||||
|
||||
def __init__(self, waveform: torch.Tensor, sr: int = SAMPLE_RATE,
|
||||
*args, **kwargs) -> None:
|
||||
"""
|
||||
Initialize the AudioProcessor object.
|
||||
|
||||
Args:
|
||||
waveform (torch.Tensor): The audio waveform tensor.
|
||||
sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.
|
||||
args: Additional arguments.
|
||||
kwargs: Additional keyword arguments, e.g., device to use for processing.
|
||||
If CUDA is available, it defaults to CUDA.
|
||||
|
||||
Raises:
|
||||
ValueError: If the provided sample rate is not of type int.
|
||||
"""
|
||||
|
||||
device = kwargs.get(
|
||||
"device", "cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
self.waveform = waveform.to(device)
|
||||
def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
|
||||
self.waveform = waveform
|
||||
self.sr = sr
|
||||
|
||||
if not isinstance(self.sr, int):
|
||||
raise ValueError("Sample rate should be a single value of type int,"
|
||||
f"not {len(self.sr)} and type {type(self.sr)}")
|
||||
raise ValueError(
|
||||
"Sample rate should be a single value of type int, "
|
||||
f"not {len(self.sr)} and type {type(self.sr)}"
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
|
||||
def from_file(cls, file: str, *args, **kwargs):
|
||||
"""
|
||||
Create an AudioProcessor instance from an audio file.
|
||||
|
||||
@@ -76,55 +52,42 @@ class AudioProcessor:
|
||||
file (str): The audio file path.
|
||||
|
||||
Returns:
|
||||
AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.
|
||||
AudioProcessor: Instance with loaded audio.
|
||||
"""
|
||||
|
||||
audio, sr = cls.load_audio(file, *args, **kwargs)
|
||||
|
||||
audio = torch.from_numpy(audio)
|
||||
|
||||
return cls(audio, sr)
|
||||
|
||||
def cut(self, start: float, end: float) -> torch.Tensor:
|
||||
def cut(self, start: float, end: float) -> np.ndarray:
|
||||
"""
|
||||
Cut a segment from the audio waveform between the specified start and end times.
|
||||
Cut a segment from the audio waveform.
|
||||
|
||||
Args:
|
||||
start (float): Start time in seconds.
|
||||
end (float): End time in seconds.
|
||||
|
||||
Returns:
|
||||
torch.Tensor: The cut waveform segment.
|
||||
np.ndarray: The cut waveform segment.
|
||||
"""
|
||||
|
||||
start = int(start * self.sr)
|
||||
if (isinstance(end, float) or isinstance(end, int)) and isinstance(self.sr, int):
|
||||
end = int(np.ceil(end * self.sr))
|
||||
else:
|
||||
end = int(torch.ceil(end * self.sr))
|
||||
return self.waveform[start:end]
|
||||
start_idx = int(start * self.sr)
|
||||
end_idx = int(np.ceil(end * self.sr))
|
||||
return self.waveform[start_idx:end_idx]
|
||||
|
||||
@staticmethod
|
||||
def load_audio(file: str, sr: int = SAMPLE_RATE):
|
||||
"""
|
||||
Open an audio file and read it as a mono waveform, resampling if necessary.
|
||||
This method ensures compatibility with pyannote.audio
|
||||
and requires the ffmpeg CLI in PATH.
|
||||
Load an audio file as a mono waveform, resampling if necessary.
|
||||
Requires ffmpeg in PATH.
|
||||
|
||||
Args:
|
||||
file (str): The audio file to open.
|
||||
sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
|
||||
sr (int, optional): The desired sample rate.
|
||||
|
||||
Returns:
|
||||
tuple: A NumPy array containing the audio waveform in float32 dtype
|
||||
and the sample rate.
|
||||
tuple: (waveform as np.ndarray[float32], sample rate)
|
||||
|
||||
Raises:
|
||||
RuntimeError: If failed to load audio.
|
||||
"""
|
||||
# This launches a subprocess to decode audio while down-mixing
|
||||
# and resampling as necessary. Requires the ffmpeg CLI in PATH.
|
||||
# fmt: off
|
||||
cmd = [
|
||||
"ffmpeg",
|
||||
"-nostdin",
|
||||
@@ -134,19 +97,126 @@ class AudioProcessor:
|
||||
"-ac", "1",
|
||||
"-acodec", "pcm_s16le",
|
||||
"-ar", str(sr),
|
||||
"-"
|
||||
"-",
|
||||
]
|
||||
# fmt: on
|
||||
try:
|
||||
out = run(cmd, capture_output=True, check=True).stdout
|
||||
except CalledProcessError as e:
|
||||
raise RuntimeError(
|
||||
f"Failed to load audio: {e.stderr.decode()}") from e
|
||||
f"Failed to load audio: {e.stderr.decode()}"
|
||||
) from e
|
||||
|
||||
out = np.frombuffer(out, np.int16).flatten().astype(
|
||||
np.float32) / NORMALIZATION_FACTOR
|
||||
waveform = np.frombuffer(out, np.int16).flatten().astype(
|
||||
np.float32
|
||||
) / NORMALIZATION_FACTOR
|
||||
|
||||
return out, sr
|
||||
return waveform, sr
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
|
||||
return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
|
||||
|
||||
|
||||
def get_audio_duration(file_path: str) -> float:
    """
    Get the duration of an audio file in seconds using ffprobe.

    Args:
        file_path: Path to the audio file.

    Returns:
        Duration in seconds as a float.

    Raises:
        RuntimeError: If ffprobe cannot be launched (e.g. it is not in PATH),
            exits with an error, or its output does not contain a usable
            numeric duration. The underlying exception is chained as the cause.
    """
    cmd = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "json",
        file_path,
    ]
    try:
        result = run(cmd, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
        return float(data["format"]["duration"])
    except CalledProcessError as e:
        # Surface ffprobe's own diagnostic when it printed one; the bare
        # exception text only carries the exit status.
        detail = (e.stderr or "").strip() or str(e)
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {detail}"
        ) from e
    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
        # OSError: the ffprobe binary is missing or not executable.
        # KeyError/TypeError/ValueError: JSON parsed but the duration field is
        # absent or not numeric (ffprobe can report e.g. "N/A").
        raise RuntimeError(
            f"Failed to get audio duration for {file_path}: {e}"
        ) from e
|
||||
|
||||
|
||||
def split_audio_into_chunks(
    input_path: str,
    max_duration: float = DEFAULT_CHUNK_DURATION,
    overlap: float = DEFAULT_CHUNK_OVERLAP,
    output_format: str = "wav",
    sample_rate: int = 24000,
) -> list:
    """
    Split a long audio file into overlapping chunks using ffmpeg.

    Each chunk is written to its own temporary file as mono 16-bit PCM at
    ``sample_rate``. Consecutive chunks start ``max_duration - overlap``
    seconds apart, and the last chunk ends at the end of the file.

    Args:
        input_path: Path to the input audio file.
        max_duration: Maximum duration of each chunk in seconds.
        overlap: Overlap duration in seconds between consecutive chunks.
            Must satisfy ``0 <= overlap < max_duration`` when splitting
            is actually required.
        output_format: Output format (e.g., 'wav').
        sample_rate: Sample rate for output chunks.

    Returns:
        List of dicts:
            [{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
        Files must be cleaned up by the caller. When the input is not longer
        than ``max_duration`` the single entry points at ``input_path``
        itself (no temporary file is created).

    Raises:
        ValueError: If splitting is needed but ``max_duration`` / ``overlap``
            would not advance through the file.
        RuntimeError: If the duration cannot be determined or ffmpeg fails
            to produce a chunk. Chunk files created so far are removed.
    """
    duration = get_audio_duration(input_path)

    # If file is shorter than max_duration, no need to split
    if duration <= max_duration:
        return [{"path": input_path, "start": 0.0, "end": duration}]

    # Validate only once splitting is required, so short inputs keep working
    # exactly as before. A non-positive step would never reach the end of the
    # file (endless loop, unbounded temp files); a negative overlap would
    # silently skip audio between chunks.
    step = max_duration - overlap
    if max_duration <= 0 or overlap < 0 or step <= 0:
        raise ValueError(
            "max_duration must be positive and overlap must satisfy "
            f"0 <= overlap < max_duration (got max_duration={max_duration}, "
            f"overlap={overlap})"
        )

    chunks = []
    start = 0.0
    chunk_id = 0

    try:
        while start < duration:
            chunk_end = min(start + max_duration, duration)
            chunk_duration = chunk_end - start

            tmp = tempfile.NamedTemporaryFile(
                delete=False,
                suffix=f".{output_format}",
                prefix="scraibe_chunk_",
            )
            chunk_path = tmp.name
            tmp.close()

            # Register the chunk before running ffmpeg so the cleanup below
            # also covers the file belonging to a failed conversion.
            chunks.append({
                "path": chunk_path,
                "start": start,
                "end": chunk_end,
            })

            cmd = [
                "ffmpeg",
                "-y",
                "-nostdin",
                "-ss", str(start),
                "-i", input_path,
                "-t", str(chunk_duration),
                "-ar", str(sample_rate),
                "-ac", "1",
                "-c:a", "pcm_s16le",
                chunk_path,
            ]
            try:
                run(cmd, capture_output=True, check=True)
            except CalledProcessError as e:
                stderr_text = (e.stderr or b"").decode(errors="replace")
                raise RuntimeError(
                    f"Failed to create audio chunk {chunk_id} for {input_path}: {stderr_text}"
                ) from e

            # This chunk already reaches the end of the file; another
            # iteration would only re-encode a tail it fully contains.
            if chunk_end >= duration:
                break

            start += step
            chunk_id += 1
    except Exception:
        # The caller never receives the list on failure, so remove every
        # temporary file created so far instead of leaking it.
        for chunk in chunks:
            try:
                os.remove(chunk["path"])
            except OSError:
                pass
        raise

    return chunks
|
||||
|
||||
+302
-297
@@ -1,357 +1,362 @@
|
||||
"""
|
||||
Scraibe Class
|
||||
--------------------
|
||||
Scraibe Class (LocalAI-backed)
|
||||
------------------------------
|
||||
|
||||
This class serves as the core of the transcription system, responsible for handling
|
||||
transcription and diarization of audio files. It leverages pretrained models for
|
||||
speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
|
||||
providing an accessible interface for audio processing tasks such as transcription,
|
||||
speaker separation, and timestamping.
|
||||
Core class for transcription and (optionally) summarization.
|
||||
|
||||
By encapsulating the complexities of underlying models, it allows for straightforward
|
||||
integration into various applications, ranging from transcription services to voice assistants.
|
||||
- Transcription and diarization are delegated to LocalAI (vibevoice.cpp).
|
||||
- Summarization is delegated to a separate LLM via /v1/chat/completions.
|
||||
|
||||
Available Classes:
|
||||
- Scraibe: Main class for performing transcription and diarization.
|
||||
Includes methods for loading models, processing audio files,
|
||||
and formatting the transcription output.
|
||||
Public tasks:
|
||||
- transcribe
|
||||
- transcript_and_summarize (transcribe + generate a detailed summary)
|
||||
|
||||
Usage:
|
||||
from scraibe import Scraibe
|
||||
|
||||
model = Scraibe()
|
||||
transcript = model.autotranscribe("path/to/audiofile.wav")
|
||||
Previous task/whisper/pyannote-specific settings are kept for compatibility
|
||||
but ignored when not relevant.
|
||||
"""
|
||||
|
||||
# Standard Library Imports
|
||||
import os
|
||||
from glob import iglob
|
||||
from subprocess import run
|
||||
from typing import TypeVar, Union
|
||||
from warnings import warn
|
||||
import logging
|
||||
from typing import Union, Optional, Dict, Any
|
||||
|
||||
# Third-Party Imports
|
||||
import torch
|
||||
from numpy import ndarray
|
||||
from tqdm import trange
|
||||
|
||||
# Application-Specific Imports
|
||||
from .audio import AudioProcessor
|
||||
from .diarisation import Diariser
|
||||
from .transcriber import Transcriber, load_transcriber, whisper
|
||||
from .localai_client import LocalAIClient, LocalAIError
|
||||
from .summarizer import SummarizerClient, SummarizerError
|
||||
from .transcript_exporter import Transcript
|
||||
|
||||
|
||||
DiarisationType = TypeVar('DiarisationType')
|
||||
logger = logging.getLogger("scraibe.autotranscript")
|
||||
|
||||
|
||||
class Scraibe:
|
||||
"""
|
||||
Scraibe is a class responsible for managing the transcription and diarization of audio files.
|
||||
It serves as the core of the transcription system, incorporating pretrained models
|
||||
for speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
|
||||
allowing for comprehensive audio processing.
|
||||
Scraibe now:
|
||||
- Uses LocalAI for transcription + diarization.
|
||||
- Uses a separate LLM for summarization (when requested).
|
||||
|
||||
Attributes:
|
||||
transcriber (Transcriber): The transcriber object to handle transcription.
|
||||
diariser (Diariser): The diariser object to handle diarization.
|
||||
|
||||
Methods:
|
||||
__init__: Initializes the Scraibe class with appropriate models.
|
||||
transcribe: Transcribes an audio file using the whisper model and pyannote diarization model.
|
||||
remove_audio_file: Removes the original audio file to avoid disk space issues or ensure data privacy.
|
||||
get_audio_file: Gets an audio file as an AudioProcessor object.
|
||||
Public methods:
|
||||
- transcribe(audio_file, ...)
|
||||
- transcript_and_summarize(audio_file, ...)
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
whisper_model: Union[bool, str, whisper] = None,
|
||||
whisper_type: str = "whisper",
|
||||
dia_model: Union[bool, str, DiarisationType] = None,
|
||||
**kwargs) -> None:
|
||||
"""Initializes the Scraibe class.
|
||||
def __init__(
|
||||
self,
|
||||
api_url: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
whisper_model: Union[bool, str] = None,
|
||||
whisper_type: str = "whisper",
|
||||
dia_model: Union[bool, str] = None,
|
||||
use_auth_token: str = None,
|
||||
verbose: bool = False,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize Scraibe with LocalAI client and summarizer client.
|
||||
|
||||
Args:
|
||||
whisper_model (Union[bool, str, whisper], optional):
|
||||
Path to whisper model or whisper model itself.
|
||||
whisper_type (str):
|
||||
Type of whisper model to load. "whisper" or "whisperx".
|
||||
diarisation_model (Union[bool, str, DiarisationType], optional):
|
||||
Path to pyannote diarization model or model itself.
|
||||
**kwargs: Additional keyword arguments for whisper
|
||||
and pyannote diarization models.
|
||||
e.g.:
|
||||
api_url: LocalAI server URL for transcription/diarization.
|
||||
Falls back to LOCALAI_API_URL env var.
|
||||
api_key: API key for LocalAI. Falls back to LOCALAI_API_KEY.
|
||||
model: Model name for LocalAI (e.g., vibevoice-diarize).
|
||||
Falls back to LOCALAI_MODEL env var.
|
||||
|
||||
- verbose: If True, the class will print additional information.
|
||||
- save_kwargs: If True, the keyword arguments will be saved
|
||||
for autotranscribe. So you can unload the class and reload it again.
|
||||
Summarizer uses:
|
||||
- SUMMARIZER_API_URL
|
||||
- SUMMARIZER_API_KEY
|
||||
- SUMMARIZER_MODEL
|
||||
These can be overridden via environment or via the transcript_and_summarize
|
||||
method if needed.
|
||||
|
||||
Backward-compat (ignored):
|
||||
- whisper_model, whisper_type, dia_model, use_auth_token, etc.
|
||||
"""
|
||||
self.verbose = verbose or kwargs.get("verbose", False)
|
||||
|
||||
if whisper_model is None:
|
||||
self.transcriber = load_transcriber(
|
||||
"medium", whisper_type, **kwargs)
|
||||
elif isinstance(whisper_model, str):
|
||||
self.transcriber = load_transcriber(
|
||||
whisper_model, whisper_type, **kwargs)
|
||||
else:
|
||||
self.transcriber = whisper_model
|
||||
logger.info("Initializing Scraibe.")
|
||||
|
||||
if dia_model is None:
|
||||
self.diariser = Diariser.load_model(**kwargs)
|
||||
elif isinstance(dia_model, str):
|
||||
self.diariser = Diariser.load_model(dia_model, **kwargs)
|
||||
else:
|
||||
self.diariser: Diariser = dia_model
|
||||
try:
|
||||
self.client = LocalAIClient(
|
||||
api_url=api_url,
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
)
|
||||
except LocalAIError as e:
|
||||
logger.error("Failed to initialize LocalAI client: %s", e)
|
||||
raise LocalAIError(f"Failed to initialize LocalAI client: {e}")
|
||||
|
||||
if kwargs.get("verbose"):
|
||||
print("Scraibe initialized all models successfully loaded.")
|
||||
self.verbose = True
|
||||
else:
|
||||
self.verbose = False
|
||||
|
||||
# Save kwargs for autotranscribe if you want to unload the class and load it again.
|
||||
if kwargs.get('save_setup'):
|
||||
self.params = dict(whisper_model=whisper_model,
|
||||
dia_model=dia_model,
|
||||
**kwargs)
|
||||
else:
|
||||
self.params = {}
|
||||
|
||||
def autotranscribe(self, audio_file: Union[str, torch.Tensor, ndarray],
|
||||
remove_original: bool = False,
|
||||
**kwargs) -> Transcript:
|
||||
"""
|
||||
Transcribes an audio file using the whisper model and pyannote diarization model.
|
||||
|
||||
Args:
|
||||
audio_file (Union[str, torch.Tensor, ndarray]):
|
||||
Path to audio file or a tensor representing the audio.
|
||||
remove_original (bool, optional): If True, the original audio file will
|
||||
be removed after transcription.
|
||||
*args: Additional positional arguments for diarization and transcription.
|
||||
**kwargs: Additional keyword arguments for diarization and transcription.
|
||||
|
||||
Returns:
|
||||
Transcript: A Transcript object containing the transcription,
|
||||
which can be exported to different formats.
|
||||
"""
|
||||
if kwargs.get("verbose"):
|
||||
self.verbose = kwargs.get("verbose")
|
||||
# Get audio file as an AudioProcessor object
|
||||
audio_file: AudioProcessor = self.get_audio_file(audio_file)
|
||||
|
||||
# Prepare waveform and sample rate for diarization
|
||||
dia_audio = {
|
||||
"waveform": audio_file.waveform.reshape(1, len(audio_file.waveform)),
|
||||
"sample_rate": audio_file.sr
|
||||
}
|
||||
# Summarizer is lazy-initialized if needed
|
||||
self._summarizer: Optional[SummarizerClient] = None
|
||||
|
||||
if self.verbose:
|
||||
print("Starting diarisation.")
|
||||
print("Scraibe initialized. Using LocalAI for transcription and diarization.")
|
||||
|
||||
diarisation = self.diariser.diarization(dia_audio, **kwargs)
|
||||
|
||||
if not diarisation["segments"]:
|
||||
print("No segments found. Try to run transcription without diarisation.")
|
||||
|
||||
transcript = self.transcriber.transcribe(
|
||||
audio_file.waveform, **kwargs)
|
||||
|
||||
final_transcript = {0: {"speakers": 'SPEAKER_01',
|
||||
"segments": [0, len(audio_file.waveform)],
|
||||
"text": transcript}}
|
||||
|
||||
return Transcript(final_transcript)
|
||||
|
||||
if self.verbose:
|
||||
print("Diarisation finished. Starting transcription.")
|
||||
|
||||
audio_file.sr = torch.Tensor([audio_file.sr]).to(
|
||||
audio_file.waveform.device)
|
||||
|
||||
# Transcribe each segment and store the results
|
||||
final_transcript = dict()
|
||||
|
||||
for i in trange(len(diarisation["segments"]), desc="Transcribing", disable=not self.verbose):
|
||||
|
||||
seg = diarisation["segments"][i]
|
||||
|
||||
audio = audio_file.cut(seg[0], seg[1])
|
||||
|
||||
transcript = self.transcriber.transcribe(audio, **kwargs)
|
||||
|
||||
final_transcript[i] = {"speakers": diarisation["speakers"][i],
|
||||
"segments": seg,
|
||||
"text": transcript}
|
||||
|
||||
# Remove original file if needed
|
||||
if remove_original:
|
||||
if kwargs.get("shred") is True:
|
||||
self.remove_audio_file(audio_file, shred=True)
|
||||
else:
|
||||
self.remove_audio_file(audio_file, shred=False)
|
||||
|
||||
return Transcript(final_transcript)
|
||||
|
||||
def diarization(self, audio_file: Union[str, torch.Tensor, ndarray],
|
||||
**kwargs) -> dict:
|
||||
def _ensure_summarizer(
|
||||
self,
|
||||
api_url: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
model: Optional[str] = None,
|
||||
) -> SummarizerClient:
|
||||
"""
|
||||
Perform diarization on an audio file using the pyannote diarization model.
|
||||
Lazy-init summarizer client.
|
||||
"""
|
||||
if self._summarizer is not None:
|
||||
return self._summarizer
|
||||
|
||||
Args:
|
||||
audio_file (Union[str, torch.Tensor, ndarray]):
|
||||
The audio source which can either be a path to the audio file or a tensor representation.
|
||||
**kwargs:
|
||||
Additional keyword arguments for diarization.
|
||||
logger.info("Initializing SummarizerClient (lazy).")
|
||||
try:
|
||||
self._summarizer = SummarizerClient(
|
||||
api_url=api_url,
|
||||
api_key=api_key,
|
||||
model=model,
|
||||
)
|
||||
except SummarizerError as e:
|
||||
logger.error("Failed to initialize Summarizer client: %s", e)
|
||||
raise SummarizerError(f"Failed to initialize Summarizer client: {e}")
|
||||
|
||||
return self._summarizer
|
||||
|
||||
# -----------------
|
||||
# Primary public API
|
||||
# -----------------
|
||||
|
||||
def transcribe(
|
||||
self,
|
||||
audio_file: str,
|
||||
*,
|
||||
for_export: bool = False,
|
||||
**kwargs,
|
||||
) -> Union[str, Dict[str, Any]]:
|
||||
"""
|
||||
Transcribe the provided audio file using LocalAI.
|
||||
|
||||
Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
|
||||
Returns:
|
||||
- If for_export=False: plain transcript text (str).
|
||||
- If for_export=True: dict with:
|
||||
- transcript: plain text
|
||||
- segments: list[segment] with speaker labels
|
||||
- raw_result: full verbose_json from LocalAI (if present)
|
||||
"""
|
||||
if isinstance(audio_file, str):
|
||||
if not os.path.exists(audio_file):
|
||||
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
||||
else:
|
||||
raise TypeError(
|
||||
"In LocalAI mode, audio_file must be a file path (str)."
|
||||
)
|
||||
|
||||
verbose = kwargs.pop("verbose", self.verbose)
|
||||
logger.info("transcribe called for: %s", audio_file)
|
||||
|
||||
try:
|
||||
result = self.client.diarize_and_transcribe(
|
||||
audio_path=audio_file,
|
||||
include_text=True,
|
||||
verbose=verbose,
|
||||
return_raw=True,
|
||||
**kwargs,
|
||||
)
|
||||
except LocalAIError as e:
|
||||
logger.error("Error during LocalAI transcription: %s", e)
|
||||
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
||||
|
||||
segments = result.get("segments", [])
|
||||
speakers = result.get("speakers", [])
|
||||
transcripts = result.get("transcripts", [])
|
||||
|
||||
# Build simple transcript text
|
||||
if for_export:
|
||||
# Include speaker-labeled transcript
|
||||
lines = []
|
||||
for seg, speaker, text in zip(segments, speakers, transcripts):
|
||||
start, end = seg
|
||||
ts = self._format_timestamp(start)
|
||||
line = f"[{ts}] {speaker}: {text.strip()}"
|
||||
lines.append(line)
|
||||
full_text = "\n\n".join(lines)
|
||||
else:
|
||||
# Legacy: space-joined text
|
||||
full_text = " ".join(t.strip() for t in transcripts if t.strip())
|
||||
|
||||
logger.info("transcribe completed, length=%d chars", len(full_text))
|
||||
|
||||
if for_export:
|
||||
# Return richer structure for JSON export
|
||||
raw_result = result.get("raw_result")
|
||||
return {
|
||||
"transcript": full_text,
|
||||
"segments": [
|
||||
{
|
||||
"id": i,
|
||||
"speaker": sp,
|
||||
"start": seg[0],
|
||||
"end": seg[1],
|
||||
"text": txt,
|
||||
}
|
||||
for i, (seg, sp, txt) in enumerate(
|
||||
zip(segments, speakers, transcripts)
|
||||
)
|
||||
],
|
||||
"raw_result": raw_result if raw_result is not None else None,
|
||||
}
|
||||
|
||||
return full_text
|
||||
|
||||
def transcript_and_summarize(
|
||||
self,
|
||||
audio_file: str,
|
||||
*,
|
||||
summarizer_api_url: Optional[str] = None,
|
||||
summarizer_api_key: Optional[str] = None,
|
||||
summarizer_model: Optional[str] = None,
|
||||
for_export: bool = False,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
"""
|
||||
Transcribe the audio file and generate a detailed summary.
|
||||
|
||||
Steps:
|
||||
- Transcribe via LocalAI (verbose_json).
|
||||
- Build a plain-text transcript (with speaker labels).
|
||||
- Summarize the transcript using the configured LLM.
|
||||
|
||||
Returns:
|
||||
dict:
|
||||
A dictionary containing the results of the diarization process.
|
||||
dict with:
|
||||
- transcript: full transcript text (with speaker labels)
|
||||
- summary: final detailed summary (markdown-ready)
|
||||
- segments: (if for_export) list[segment] with speaker labels
|
||||
- raw_result: (if for_export) full verbose_json from LocalAI
|
||||
"""
|
||||
if isinstance(audio_file, str):
|
||||
if not os.path.exists(audio_file):
|
||||
raise FileNotFoundError(f"Audio file not found: {audio_file}")
|
||||
else:
|
||||
raise TypeError(
|
||||
"In LocalAI mode, audio_file must be a file path (str)."
|
||||
)
|
||||
|
||||
# Get audio file as an AudioProcessor object
|
||||
audio_file: AudioProcessor = self.get_audio_file(audio_file)
|
||||
verbose = kwargs.pop("verbose", self.verbose)
|
||||
logger.info("transcript_and_summarize called for: %s", audio_file)
|
||||
|
||||
# Prepare waveform and sample rate for diarization
|
||||
dia_audio = {
|
||||
"waveform": audio_file.waveform.reshape(1, len(audio_file.waveform)),
|
||||
"sample_rate": audio_file.sr
|
||||
# 1) Get diarized + transcribed result
|
||||
try:
|
||||
result = self.client.diarize_and_transcribe(
|
||||
audio_path=audio_file,
|
||||
include_text=True,
|
||||
verbose=verbose,
|
||||
return_raw=True,
|
||||
**kwargs,
|
||||
)
|
||||
except LocalAIError as e:
|
||||
logger.error("Error during LocalAI transcription: %s", e)
|
||||
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
||||
|
||||
segments = result.get("segments", [])
|
||||
speakers = result.get("speakers", [])
|
||||
transcripts = result.get("transcripts", [])
|
||||
|
||||
if not segments:
|
||||
logger.warning("No segments returned; returning empty transcript/summary.")
|
||||
return {
|
||||
"transcript": "",
|
||||
"summary": "No transcript content to summarize.",
|
||||
}
|
||||
|
||||
# 2) Build full transcript text with speaker labels
|
||||
lines = []
|
||||
for seg, speaker, text in zip(segments, speakers, transcripts):
|
||||
start, end = seg
|
||||
ts = self._format_timestamp(start)
|
||||
line = f"[{ts}] {speaker}: {text.strip()}"
|
||||
lines.append(line)
|
||||
|
||||
full_transcript = "\n\n".join(lines)
|
||||
logger.info("Built full transcript, length=%d chars", len(full_transcript))
|
||||
|
||||
# 3) Summarize
|
||||
try:
|
||||
summarizer = self._ensure_summarizer(
|
||||
api_url=summarizer_api_url,
|
||||
api_key=summarizer_api_key,
|
||||
model=summarizer_model,
|
||||
)
|
||||
except SummarizerError as e:
|
||||
logger.error("Failed to initialize summarizer: %s", e)
|
||||
raise SummarizerError(f"Failed to initialize summarizer: {e}")
|
||||
|
||||
try:
|
||||
summary = summarizer.summarize_transcript(full_transcript)
|
||||
except SummarizerError as e:
|
||||
logger.error("Error during summarization: %s", e)
|
||||
raise SummarizerError(f"Error during summarization: {e}")
|
||||
|
||||
logger.info("transcript_and_summarize completed.")
|
||||
|
||||
out = {
|
||||
"transcript": full_transcript,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
print("Starting diarisation.")
|
||||
if for_export:
|
||||
# Add segments and raw_result for JSON export
|
||||
raw_result = result.get("raw_result")
|
||||
out["segments"] = [
|
||||
{
|
||||
"id": i,
|
||||
"speaker": sp,
|
||||
"start": seg[0],
|
||||
"end": seg[1],
|
||||
"text": txt,
|
||||
}
|
||||
for i, (seg, sp, txt) in enumerate(
|
||||
zip(segments, speakers, transcripts)
|
||||
)
|
||||
]
|
||||
out["raw_result"] = raw_result if raw_result is not None else None
|
||||
|
||||
diarisation = self.diariser.diarization(dia_audio, **kwargs)
|
||||
return out
|
||||
|
||||
return diarisation
|
||||
|
||||
def transcribe(self, audio_file: Union[str, torch.Tensor, ndarray],
|
||||
**kwargs):
|
||||
"""
|
||||
Transcribe the provided audio file.
|
||||
|
||||
Args:
|
||||
audio_file (Union[str, torch.Tensor, ndarray]):
|
||||
The audio source, which can either be a path or a tensor representation.
|
||||
**kwargs:
|
||||
Additional keyword arguments for transcription.
|
||||
|
||||
Returns:
|
||||
str:
|
||||
The transcribed text from the audio source.
|
||||
"""
|
||||
audio_file: AudioProcessor = self.get_audio_file(audio_file)
|
||||
|
||||
return self.transcriber.transcribe(audio_file.waveform, **kwargs)
|
||||
|
||||
def update_transcriber(self, whisper_model: Union[str, whisper], **kwargs) -> None:
|
||||
"""
|
||||
Update the transcriber model.
|
||||
|
||||
Args:
|
||||
whisper_model (Union[str, whisper]):
|
||||
The new whisper model to use for transcription.
|
||||
**kwargs:
|
||||
Additional keyword arguments for the transcriber model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
_old_model = self.transcriber.model_name
|
||||
|
||||
if isinstance(whisper_model, str):
|
||||
self.transcriber = load_transcriber(whisper_model, **kwargs)
|
||||
elif isinstance(whisper_model, Transcriber):
|
||||
self.transcriber = whisper_model
|
||||
else:
|
||||
warn(
|
||||
f"Invalid model type. Please provide a valid model. Fallback to old {_old_model} Model.", RuntimeWarning)
|
||||
|
||||
return None
|
||||
|
||||
def update_diariser(self, dia_model: Union[str, DiarisationType], **kwargs) -> None:
    """
    Swap the diariser used by this instance.

    Args:
        dia_model (Union[str, DiarisationType]):
            Either a model identifier, which is loaded through
            ``Diariser.load_model``, or an already constructed ``Diariser``.
        **kwargs:
            Passed to ``Diariser.load_model`` when an identifier is given.

    Returns:
        None. For any other argument type a ``RuntimeWarning`` is emitted
        and the current diariser is left in place.
    """
    if isinstance(dia_model, str):
        self.diariser = Diariser.load_model(dia_model, **kwargs)
        return None

    if isinstance(dia_model, Diariser):
        self.diariser = dia_model
        return None

    warn("Invalid model type. Please provide a valid model. Fallback to old Model.", RuntimeWarning)
    return None
|
||||
# -----------------
|
||||
# Helpers
|
||||
# -----------------
|
||||
|
||||
@staticmethod
|
||||
def remove_audio_file(audio_file: str,
|
||||
shred: bool = False) -> None:
|
||||
def _format_timestamp(seconds: float) -> str:
|
||||
"""
|
||||
Removes the original audio file to avoid disk space issues or ensure data privacy.
|
||||
Format seconds into MM:SS or HH:MM:SS.
|
||||
"""
|
||||
m, s = divmod(int(seconds), 60)
|
||||
h, m = divmod(m, 60)
|
||||
if h > 0:
|
||||
return f"{h:02d}:{m:02d}:{s:02d}"
|
||||
return f"{m:02d}:{s:02d}"
|
||||
|
||||
Args:
|
||||
audio_file_path (str): Path to the audio file.
|
||||
shred (bool, optional): If True, the audio file will be shredded,
|
||||
not just removed.
|
||||
@staticmethod
|
||||
def remove_audio_file(audio_file: str, shred: bool = False) -> None:
|
||||
"""
|
||||
Remove the original audio file.
|
||||
"""
|
||||
if not os.path.exists(audio_file):
|
||||
raise ValueError(f"Audiofile {audio_file} does not exist.")
|
||||
|
||||
if shred:
|
||||
import subprocess
|
||||
import warnings
|
||||
from glob import iglob
|
||||
|
||||
warn("Shredding audiofile can take a long time.", RuntimeWarning)
|
||||
warnings.warn("Shredding audiofile can take a long time.", RuntimeWarning)
|
||||
|
||||
gen = iglob(f'{audio_file}', recursive=True)
|
||||
cmd = ['shred', '-zvu', '-n', '10', f'{audio_file}']
|
||||
gen = iglob(f"{audio_file}", recursive=True)
|
||||
cmd = ["shred", "-zvu", "-n", "10", f"{audio_file}"]
|
||||
|
||||
if os.path.isdir(audio_file):
|
||||
raise ValueError(f"Audiofile {audio_file} is a directory.")
|
||||
|
||||
for file in gen:
|
||||
print(f'shredding {file} now\n')
|
||||
|
||||
run(cmd, check=True)
|
||||
|
||||
print(f"shredding {file} now\n")
|
||||
subprocess.run(cmd, check=True)
|
||||
else:
|
||||
os.remove(audio_file)
|
||||
print(f"Audiofile {audio_file} removed.")
|
||||
|
||||
@staticmethod
def get_audio_file(audio_file: Union[str, torch.Tensor, ndarray],
                   *args, **kwargs) -> AudioProcessor:
    """Normalise an audio input into an ``AudioProcessor``.

    Args:
        audio_file (Union[str, torch.Tensor, ndarray]): A file path, which is
            loaded with ``AudioProcessor.from_file``, or a tensor/array whose
            element 0 and element 1 are passed to ``AudioProcessor`` as its
            two positional arguments. An existing ``AudioProcessor`` is
            returned unchanged.
        *args: Accepted for interface compatibility; not used here.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        AudioProcessor: The wrapped audio.

    Raises:
        ValueError: If the input could not be turned into an ``AudioProcessor``.
    """
    if isinstance(audio_file, str):
        audio_file = AudioProcessor.from_file(audio_file)
    elif isinstance(audio_file, torch.Tensor):
        audio_file = AudioProcessor(audio_file[0], audio_file[1])
    elif isinstance(audio_file, ndarray):
        # NumPy data is converted to a tensor before wrapping.
        audio_file = AudioProcessor(torch.Tensor(audio_file[0]),
                                    audio_file[1])

    if not isinstance(audio_file, AudioProcessor):
        # The two literals used to be concatenated without a space
        # ("AudioProcessor,not ...").
        raise ValueError('Audiofile must be of type AudioProcessor, '
                         f'not {type(audio_file)}')

    return audio_file
|
||||
|
||||
def __repr__(self):
|
||||
return f"Scraibe(transcriber={self.transcriber}, diariser={self.diariser})"
|
||||
return "Scraibe(LocalAI-backed)"
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
"""
|
||||
Celery application for async transcription jobs.
|
||||
"""
|
||||
|
||||
import os
|
||||
from celery import Celery
|
||||
|
||||
# Broker and result backend are both taken from the environment; each falls
# back to database 0 of the Redis instance reachable as "redis:6379".
broker_url = os.getenv("CELERY_BROKER_URL", "redis://redis:6379/0")
result_backend = os.getenv("CELERY_RESULT_BACKEND", "redis://redis:6379/0")

celery_app = Celery("scraibe", broker=broker_url, backend=result_backend)

celery_app.conf.update(
    # Transcription jobs go to their own queue.
    task_routes={
        "scraibe.tasks.process_transcription_task": {"queue": "transcription"},
    },
    # JSON only, for both task payloads and results.
    task_serializer="json",
    result_serializer="json",
    accept_content=["json"],
    # All timestamps in UTC.
    timezone="UTC",
    enable_utc=True,
)

# NOTE(review): autodiscover_tasks() takes package names and looks for a
# "tasks" module inside each one; confirm that "scraibe.tasks" (rather than
# "scraibe") is the intended entry here.
celery_app.autodiscover_tasks(["scraibe.tasks"])
|
||||
+229
-89
@@ -3,152 +3,292 @@ Command-Line Interface (CLI) for the Scraibe class,
|
||||
allowing for user interaction to transcribe and diarize audio files.
|
||||
The function includes arguments for specifying the audio files, model paths,
|
||||
output formats, and other options necessary for transcription.
|
||||
|
||||
This version is adapted for LocalAI-based transcription and diarization.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
||||
from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
|
||||
from torch.cuda import is_available
|
||||
from torch import set_num_threads
|
||||
from .autotranscript import Scraibe
|
||||
from .misc import set_threads, setup_logging
|
||||
|
||||
|
||||
def cli():
    """
    Command-Line Interface (CLI) for the Scraibe class, allowing for user
    interaction to transcribe and diarize audio files via a LocalAI server.

    Parses the command line, configures logging and threading, builds a
    ``Scraibe`` instance and then, for every file given with
    ``-f/--audio-files``, runs the selected task:

    - ``transcribe``: writes the transcript to
      ``<output-directory>/<basename>.<output-format>``.
    - ``transcript_and_summarize``: writes transcript and summary to
      ``<output-directory>/<basename>.md``.

    The Whisper/pyannote related options are still accepted so existing
    scripts keep working; their help text marks them as ignored.
    """

    # Initialize logging (can be overridden via --log-level)
    setup_logging(level=os.getenv("LOG_LEVEL", "INFO"))
    logger = logging.getLogger("scraibe.cli")

    def str2bool(string):
        str2val = {"True": True, "False": False}
        if string in str2val:
            return str2val[string]
        raise ValueError(
            f"Expected one of {set(str2val.keys())}, got {string}"
        )

    def output_basename(audio):
        # os.path keeps inner dots ("a.b.wav" -> "a.b") and understands the
        # platform's path separator, so distinct inputs no longer collapse
        # onto the same output file.
        return os.path.splitext(os.path.basename(audio))[0]

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "-f",
        "--audio-files",
        nargs="+",
        type=str,
        default=None,
        help="List of audio files to transcribe.",
    )

    # LocalAI connection (env vars preferred, but CLI overrides allowed)
    parser.add_argument(
        "--localai-api-url",
        type=str,
        default=None,
        help="LocalAI server URL (e.g., http://localhost:8080). "
        "Overrides LOCALAI_API_URL env var if provided.",
    )
    parser.add_argument(
        "--localai-api-key",
        type=str,
        default=None,
        help="LocalAI API key. Overrides LOCALAI_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--localai-model",
        type=str,
        default=None,
        help="Model name to use on LocalAI (e.g., vibevoice-diarize). "
        "Overrides LOCALAI_MODEL env var if provided.",
    )

    # Summarizer overrides (env vars are primary)
    parser.add_argument(
        "--summarizer-api-url",
        type=str,
        default=None,
        help="Summarization LLM API URL (e.g., http://localhost:8080). "
        "Overrides SUMMARIZER_API_URL env var if provided.",
    )
    parser.add_argument(
        "--summarizer-api-key",
        type=str,
        default=None,
        help="Summarization LLM API key. Overrides SUMMARIZER_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--summarizer-model",
        type=str,
        default=None,
        help="Model name for summarization. Overrides SUMMARIZER_MODEL env var if provided.",
    )

    # Kept for backward compatibility with UI / existing scripts; ignored by LocalAI client.
    parser.add_argument(
        "--whisper-type",
        type=str,
        default="whisper",
        choices=["whisper", "faster-whisper"],
        help="[Backward compatibility] Type of Whisper model. Ignored when using LocalAI.",
    )
    parser.add_argument(
        "--whisper-model-name",
        default="medium",
        help="[Backward compatibility] Whisper model name. Ignored when using LocalAI.",
    )
    parser.add_argument(
        "--whisper-model-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Whisper model directory. Ignored when using LocalAI.",
    )
    parser.add_argument(
        "--diarization-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Diarization model directory. Ignored when using LocalAI.",
    )
    parser.add_argument(
        "--hf-token",
        default=None,
        type=str,
        help="[Backward compatibility] HuggingFace token. Ignored when using LocalAI.",
    )
    parser.add_argument(
        "--inference-device",
        default="cpu",
        help="[Backward compatibility] Device for inference. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=None,
        help="Number of threads used for CPU operations; overrides MKL_NUM_THREADS/OMP_NUM_THREADS.",
    )
    parser.add_argument(
        "--output-directory",
        "-o",
        type=str,
        default=".",
        help="Directory to save the transcription outputs.",
    )
    parser.add_argument(
        "--output-format",
        "-of",
        type=str,
        default="txt",
        choices=["txt", "json", "md", "html"],
        help="Format of the output file; defaults to txt.",
    )
    parser.add_argument(
        "--verbose-output",
        type=str2bool,
        default=True,
        help="Enable or disable progress and debug messages.",
    )
    parser.add_argument(
        "--task",
        type=str,
        default="transcribe",
        choices=[
            "transcribe",
            "transcript_and_summarize",
        ],
        help="Task to perform: 'transcribe' or 'transcript_and_summarize'.",
    )
    parser.add_argument(
        "--language",
        type=str,
        default=None,
        help="Language spoken in the audio. Specify None to perform language detection.",
    )
    parser.add_argument(
        "--num-speakers",
        type=int,
        default=None,
        help="Number of speakers in the audio.",
    )
    parser.add_argument(
        "--log-level",
        type=str,
        default=None,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Override LOG_LEVEL env var for logging verbosity.",
    )

    args = parser.parse_args()

    # Apply log-level override if provided
    log_level = args.log_level or os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)
    logger.info("CLI starting with log_level=%s", log_level)

    arg_dict = vars(args)

    # configure output
    out_folder = arg_dict.pop("output_directory")
    os.makedirs(out_folder, exist_ok=True)
    logger.info("Output directory: %s", out_folder)

    out_format = arg_dict.pop("output_format")
    task = arg_dict.pop("task")
    logger.info("Task: %s", task)
    logger.info("Output format: %s", out_format)

    set_threads(arg_dict.pop("num_threads"))

    # Read shared values once
    verbose = arg_dict.pop("verbose_output")
    language = arg_dict.pop("language")
    num_speakers = arg_dict.pop("num_speakers")

    # These are per-run, not per-file: popping them inside the file loop
    # raised KeyError as soon as a second audio file was processed.
    summarizer_kwargs = {
        "summarizer_api_url": arg_dict.pop("summarizer_api_url"),
        "summarizer_api_key": arg_dict.pop("summarizer_api_key"),
        "summarizer_model": arg_dict.pop("summarizer_model"),
    }

    # Build kwargs for Scraibe (LocalAI-backed)
    class_kwargs = {
        "api_url": arg_dict.pop("localai_api_url"),
        "api_key": arg_dict.pop("localai_api_key"),
        "model": arg_dict.pop("localai_model"),
        # kept for backward compatibility, but ignored:
        "whisper_model": arg_dict.pop("whisper_model_name"),
        "whisper_type": arg_dict.pop("whisper_type"),
        "dia_model": arg_dict.pop("diarization_directory"),
        "use_auth_token": arg_dict.pop("hf_token"),
        "verbose": verbose,
    }

    logger.info("LocalAI API URL: %s", class_kwargs["api_url"] or os.getenv("LOCALAI_API_URL", "<not set>"))
    logger.info("LocalAI Model: %s", class_kwargs["model"] or os.getenv("LOCALAI_MODEL", "<not set>"))

    model = Scraibe(**class_kwargs)

    audio_files = arg_dict.pop("audio_files")
    if not audio_files:
        logger.warning("No audio files provided; nothing to do.")
        return

    logger.info("Audio files: %s", audio_files)

    if task == "transcribe":
        for audio in audio_files:
            logger.info("Starting 'transcribe' for: %s", audio)
            out = model.transcribe(
                audio,
                language=language,
                verbose=verbose,
                num_speakers=num_speakers,
            )
            path = os.path.join(out_folder, f"{output_basename(audio)}.{out_format}")

            logger.info("Saving transcript to: %s", path)
            with open(path, "w", encoding="utf-8") as f:
                f.write(out)
            logger.info("Transcript saved: %s", path)

    elif task == "transcript_and_summarize":
        for audio in audio_files:
            logger.info("Starting 'transcript_and_summarize' for: %s", audio)
            result = model.transcript_and_summarize(
                audio,
                language=language,
                verbose=verbose,
                num_speakers=num_speakers,
                **summarizer_kwargs,
            )

            transcript_text = result.get("transcript", "")
            summary_text = result.get("summary", "")

            # Always use .md for transcript_and_summarize
            md_path = os.path.join(out_folder, f"{output_basename(audio)}.md")
            logger.info("Saving transcript + summary to: %s", md_path)

            with open(md_path, "w", encoding="utf-8") as f:
                f.write("# Transcript\n\n")
                f.write(transcript_text)
                f.write("\n\n# Summary\n\n")
                f.write(summary_text)

            logger.info("Transcript + summary saved: %s", md_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli()
|
||||
|
||||
@@ -37,11 +37,11 @@ from pyannote.audio import Pipeline
|
||||
from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
|
||||
from torch import Tensor
|
||||
from torch import device as torch_device
|
||||
from torch.cuda import is_available
|
||||
|
||||
from huggingface_hub import HfApi
|
||||
from huggingface_hub.utils import RepositoryNotFoundError
|
||||
|
||||
from .misc import PYANNOTE_DEFAULT_PATH, PYANNOTE_DEFAULT_CONFIG
|
||||
from .misc import PYANNOTE_DEFAULT_PATH, PYANNOTE_DEFAULT_CONFIG, SCRAIBE_TORCH_DEVICE
|
||||
Annotation = TypeVar('Annotation')
|
||||
|
||||
TOKEN_PATH = os.path.join(os.path.dirname(
|
||||
@@ -190,8 +190,7 @@ class Diariser:
|
||||
cache_token: bool = False,
|
||||
cache_dir: Union[Path, str] = PYANNOTE_DEFAULT_PATH,
|
||||
hparams_file: Union[str, Path] = None,
|
||||
device: str = None,
|
||||
*args, **kwargs
|
||||
device: str = SCRAIBE_TORCH_DEVICE,
|
||||
) -> Pipeline:
|
||||
"""
|
||||
Loads a pretrained model from pyannote.audio,
|
||||
@@ -283,10 +282,6 @@ class Diariser:
|
||||
'or from huggingface.co models. Please check your token'
|
||||
'or your local model path')
|
||||
|
||||
# try to move the model to the device
|
||||
if device is None:
|
||||
device = "cuda" if is_available() else "cpu"
|
||||
|
||||
# torch_device is renamed from torch.device to avoid name conflict
|
||||
_model = _model.to(torch_device(device))
|
||||
|
||||
|
||||
@@ -0,0 +1,118 @@
|
||||
"""
|
||||
Reusable cover-page generator for transcript and summary DOCX files.
|
||||
|
||||
Configuration (env):
|
||||
- COVER_PAGE_ENABLED: "true"/"false" (default: false)
|
||||
- COVER_PAGE_ORGANIZATION: e.g., "A.P.Strom"
|
||||
- COVER_PAGE_TITLE_PREFIX: e.g., "TRANSCRIPT" or "SUMMARY"
|
||||
- COVER_PAGE_LOGO_URL: optional URL
|
||||
- COVER_PAGE_LOGO_PATH: optional local path
|
||||
"""
|
||||
|
||||
import os
|
||||
from typing import Optional
|
||||
from docx import Document
|
||||
from docx.shared import Pt, Inches
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
|
||||
|
||||
def _add_page_break(doc: Document):
    """
    Append a page break to the document.

    Uses python-docx's own ``Document.add_page_break``, which adds a new
    paragraph containing only a page break. The previous implementation
    appended a ``w:pageBreak`` element to the paragraph properties; that is
    not a WordprocessingML element, so it did not produce a real break.

    Args:
        doc: The python-docx document to append to.
    """
    doc.add_page_break()
|
||||
|
||||
|
||||
def _cover_paragraph(doc, space_after):
    """Append an empty, centered paragraph with ``space_after`` points below it."""
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(space_after)
    return p


def _cover_run(paragraph, text, size, bold=False):
    """Append a Courier run of ``size`` points to a cover-page paragraph."""
    run = paragraph.add_run(text)
    if bold:
        run.bold = True
    run.font.name = "Courier"
    run.font.size = Pt(size)
    return run


def add_cover_page(
    doc: Document,
    title: str,
    subtitle: Optional[str] = None,
    metadata: Optional[dict] = None,
    include_logo: bool = False,
):
    """
    Append a cover page to ``doc``, followed by a page break.

    Args:
        doc: The python-docx document to append to.
        title: Cover title, e.g. "TRANSCRIPT" or "SUMMARY"; rendered upper-case.
        subtitle: Optional second line, e.g. "Meeting of 16 June 2026".
        metadata: Optional dict; the keys "Organization", "Date",
            "Prepared by" and "Reference" are read, others are ignored.
        include_logo: If True, an empty centered paragraph is reserved where
            a logo could go; no image is inserted yet.

    The organization line comes from the COVER_PAGE_ORGANIZATION environment
    variable when it is set and non-blank, otherwise from
    ``metadata["Organization"]``.
    """
    meta = metadata or {}

    def meta_text(key):
        # Coerce to str so values such as date objects can be written as a run.
        value = meta.get(key)
        return str(value) if value else ""

    # The environment wins over metadata. The earlier one-line expression
    # parsed as "(env or metadata value) if metadata else None", so the
    # environment value was discarded whenever no metadata dict was passed.
    org = (os.getenv("COVER_PAGE_ORGANIZATION") or "").strip() or meta_text("Organization")
    date = meta_text("Date")
    prepared_by = meta_text("Prepared by")
    reference = meta_text("Reference")

    # Title
    _cover_run(_cover_paragraph(doc, 6), title.upper(), 18, bold=True)

    # Subtitle
    if subtitle:
        _cover_run(_cover_paragraph(doc, 12), subtitle, 14)

    # Optional logo placeholder: only reserves space; image insertion can be added later.
    if include_logo:
        _cover_paragraph(doc, 12)

    # Metadata lines: organization and date share one paragraph.
    if org or date:
        p = _cover_paragraph(doc, 4)
        if org:
            _cover_run(p, org, 12)
        if date:
            if org:
                p.add_run("\n")
            _cover_run(p, date, 12)

    # "Prepared by" and "Reference" share a second paragraph.
    if prepared_by or reference:
        p = _cover_paragraph(doc, 4)
        if prepared_by:
            _cover_run(p, f"Prepared by: {prepared_by}", 11)
        if reference:
            if prepared_by:
                p.add_run("\n")
            _cover_run(p, f"Reference: {reference}", 11)

    # Page break after cover page
    _add_page_break(doc)
|
||||
@@ -0,0 +1,147 @@
|
||||
"""
|
||||
Utility module for applying styles and converting simple markdown
|
||||
into styled DOCX paragraphs/runs for summaries.
|
||||
"""
|
||||
|
||||
import re
|
||||
from docx import Document
|
||||
from docx.shared import Pt
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
|
||||
|
||||
def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
    """
    Return the style called ``name``, creating it as a paragraph style first
    if the document does not have one yet.

    An existing style is returned as-is; ``based_on``, ``font_name`` and
    ``font_size`` are applied only when the style is newly created.
    """
    styles = doc.styles
    known_names = {existing.name for existing in styles}

    if name in known_names:
        return styles[name]

    created = styles.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
    created.font.name = font_name
    created.font.size = font_size
    if based_on:
        created.base_style = styles[based_on]

    return styles[name]
|
||||
|
||||
|
||||
def apply_heading_style(doc, paragraph, level: int):
    """
    Give ``paragraph`` the summary heading style for ``level``.

    Level 1 and level 2 have their own styles; every other value is treated
    as level 3. The style is created on demand via ``_ensure_style`` and the
    paragraph gets 4pt of space before and 2pt after.
    """
    # (style name, point size) for level 1, level 2 and "anything else".
    variants = (
        ("SummaryHeading1", 16),
        ("SummaryHeading2", 14),
        ("SummaryHeading3", 12),
    )
    index = 0 if level == 1 else 1 if level == 2 else 2
    style_name, points = variants[index]

    paragraph.style = _ensure_style(doc, style_name, font_size=Pt(points))

    fmt = paragraph.paragraph_format
    fmt.space_before = Pt(4)
    fmt.space_after = Pt(2)
|
||||
|
||||
|
||||
def apply_bullet_style(doc, paragraph):
    """
    Give ``paragraph`` the "SummaryBullet" style and a left tab stop.

    The style is created on demand via ``_ensure_style``. A ``w:tabs``
    element holding one left-aligned tab at position 360 is appended to the
    paragraph properties.
    """
    paragraph.style = _ensure_style(doc, "SummaryBullet")

    # Build <w:tabs><w:tab w:val="left" w:pos="360"/></w:tabs>.
    tab_stop = OxmlElement("w:tab")
    tab_stop.set(qn("w:val"), "left")
    tab_stop.set(qn("w:pos"), "360")

    tab_list = OxmlElement("w:tabs")
    tab_list.append(tab_stop)

    paragraph._p.get_or_add_pPr().append(tab_list)
|
||||
|
||||
|
||||
def parse_simple_md_to_paragraphs(doc, text: str):
    """
    Convert simple markdown text into DOCX paragraphs with styles.

    Supported:
    - # / ## / ### for headings
    - - / * for bullet lists
    - **bold** and *italic*

    Every non-blank input line becomes its own paragraph appended to ``doc``;
    blank lines are skipped.

    This is intentionally simple and robust for legal/business summaries.

    Args:
        doc: The python-docx document to append to.
        text: Markdown source.
    """
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue

        # Headings
        heading_match = re.match(r"^(#{1,3})\s+(.*)", stripped)
        if heading_match:
            p = doc.add_paragraph()
            apply_heading_style(doc, p, len(heading_match.group(1)))
            _add_run_with_inline_md(p, heading_match.group(2).strip())
            # The inline helper pins every run to 12pt, which overrode the
            # 16pt/14pt heading styles. Clearing the run-level size lets the
            # heading style's own size apply.
            for run in p.runs:
                run.font.size = None
            continue

        # Bullet list: each item is its own bullet-styled paragraph.
        bullet_match = re.match(r"^[-*]\s+(.*)", stripped)
        if bullet_match:
            p = doc.add_paragraph()
            apply_bullet_style(doc, p)
            _add_run_with_inline_md(p, bullet_match.group(1).strip())
            continue

        # Normal paragraph
        _add_run_with_inline_md(doc.add_paragraph(), stripped)
|
||||
|
||||
|
||||
def _add_run_with_inline_md(paragraph, text: str):
    """
    Append ``text`` to ``paragraph`` as Courier 12pt runs, turning
    ``**bold**`` and ``*italic*`` markers into run formatting.

    The text is split on ``***…***``, ``**…**`` and ``*…*`` spans; each
    non-empty piece becomes one run with its markers removed.
    """
    marker_pattern = r"(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*)"

    for piece in re.split(marker_pattern, text):
        if not piece:
            continue

        # Peel the bold markers first, then the italic ones, so that
        # ***x*** ends up both bold and italic.
        content = piece
        as_bold = re.fullmatch(r"\*\*(.+?)\*\*", content)
        if as_bold:
            content = as_bold.group(1)
        as_italic = re.fullmatch(r"\*(.+?)\*", content)
        if as_italic:
            content = as_italic.group(1)

        run = paragraph.add_run(piece)
        run.font.name = "Courier"
        run.font.size = Pt(12)
        if as_bold:
            run.bold = True
        if as_italic:
            run.italic = True
        run.text = content
|
||||
@@ -0,0 +1,617 @@
|
||||
"""
|
||||
Email sender module for ScrAIbe.
|
||||
|
||||
Sends transcription outputs (TXT, JSON, etc.) via SMTP.
|
||||
All credentials are configured via environment variables.
|
||||
Supports both plain text and HTML email bodies.
|
||||
Template placeholders are primarily filled via environment variables.
|
||||
"""
|
||||
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import smtplib
|
||||
from email import encoders
|
||||
from email.mime.base import MIMEBase
|
||||
from email.mime.multipart import MIMEMultipart
|
||||
from email.mime.text import MIMEText
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from docx import Document
|
||||
from docx.oxml import OxmlElement
|
||||
from docx.oxml.ns import qn
|
||||
from docx.shared import Inches, Pt
|
||||
from docx.enum.text import WD_ALIGN_PARAGRAPH
|
||||
|
||||
logger = logging.getLogger("scraibe.email_sender")
|
||||
|
||||
|
||||
class EmailError(Exception):
    """Error type used by this module, e.g. for incomplete email configuration."""
|
||||
|
||||
|
||||
def get_email_config():
    """
    Read email configuration from environment variables.

    Reads EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER,
    EMAIL_SMTP_PASSWORD, EMAIL_FROM_ADDRESS and the optional
    EMAIL_SMTP_USE_TLS (TLS stays enabled unless the value is "false", "0"
    or "no", case-insensitively).

    Returns:
        dict: ``smtp_host``, ``smtp_port`` (int), ``smtp_user``,
        ``smtp_password``, ``from_address`` and ``use_tls`` (bool).

    Raises:
        EmailError: If a required variable is missing or empty, or if
            EMAIL_SMTP_PORT is not an integer.
    """
    smtp_host = os.getenv("EMAIL_SMTP_HOST")
    smtp_port = os.getenv("EMAIL_SMTP_PORT")
    smtp_user = os.getenv("EMAIL_SMTP_USER")
    smtp_password = os.getenv("EMAIL_SMTP_PASSWORD")
    from_address = os.getenv("EMAIL_FROM_ADDRESS")
    use_tls_str = os.getenv("EMAIL_SMTP_USE_TLS", "true").strip().lower()
    use_tls = use_tls_str not in ("false", "0", "no")

    if not all([smtp_host, smtp_port, smtp_user, smtp_password, from_address]):
        raise EmailError(
            "Email configuration incomplete. "
            "Ensure EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER, "
            "EMAIL_SMTP_PASSWORD, and EMAIL_FROM_ADDRESS are set."
        )

    # A malformed port is a configuration problem like a missing one, so it
    # is reported as EmailError instead of leaking a bare ValueError.
    try:
        port = int(smtp_port)
    except ValueError as exc:
        raise EmailError(
            f"EMAIL_SMTP_PORT must be an integer, got {smtp_port!r}."
        ) from exc

    return {
        "smtp_host": smtp_host,
        "smtp_port": port,
        "smtp_user": smtp_user,
        "smtp_password": smtp_password,
        "from_address": from_address,
        "use_tls": use_tls,
    }
|
||||
|
||||
|
||||
def _load_css(path: str) -> str:
|
||||
"""
|
||||
Load CSS file content if it exists.
|
||||
"""
|
||||
if not path or not os.path.exists(path):
|
||||
return ""
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def _email_logo_html() -> str:
|
||||
"""
|
||||
Return a subtle watermark-style logo for emails.
|
||||
|
||||
- Priority:
|
||||
1) EMAIL_LOGO_URL (direct URL)
|
||||
2) EMAIL_LOGO_PATH (local file as base64)
|
||||
- Style: small, faint, bottom-right, non-intrusive.
|
||||
"""
|
||||
logo_url = os.getenv("EMAIL_LOGO_URL")
|
||||
src = logo_url
|
||||
|
||||
if not logo_url:
|
||||
logo_path = os.getenv("EMAIL_LOGO_PATH", "/app/src/misc/logo1.png")
|
||||
if os.path.exists(logo_path):
|
||||
try:
|
||||
with open(logo_path, "rb") as f:
|
||||
b64 = base64.b64encode(f.read()).decode("utf-8")
|
||||
src = f"data:image/png;base64,{b64}"
|
||||
except Exception:
|
||||
src = None
|
||||
|
||||
if not src:
|
||||
return ""
|
||||
|
||||
# Watermark: bottom-right, low opacity, compact
|
||||
return (
|
||||
f'<div style="text-align: right; margin-top: 24px; opacity: 0.15;">'
|
||||
f'<img src="{src}" alt="Logo" style="max-width: 90px; height: auto; display: inline-block;" />'
|
||||
f'</div>'
|
||||
)
|
||||
|
||||
|
||||
def _accent_color() -> str:
|
||||
"""
|
||||
Accent color for UI and emails.
|
||||
Default: #7C6DA0
|
||||
"""
|
||||
return os.getenv("EMAIL_ACCENT_COLOR", "#7C6DA0")
|
||||
|
||||
|
||||
def build_template_context(**runtime_kwargs: Any) -> Dict[str, Any]:
    """
    Assemble the placeholder values handed to an email template.

    Environment-derived defaults are computed first; any keyword argument
    passed by the caller then overrides the default of the same name.

    Environment variables consulted:
      - EMAIL_CONTACT_ADDRESS: value for {contact_email}
        (default support@example.com)
      - EMAIL_CSS_PATH: stylesheet to inline as {email_css}
        (default /app/src/misc/mail_style.css)
      - EMAIL_LOGO_URL / EMAIL_LOGO_PATH: logo source for {email_logo}
      - EMAIL_ACCENT_COLOR: value for {accent_color}

    Returns:
        A dict with the keys contact_email, email_css, email_logo and
        accent_color, plus whatever ``runtime_kwargs`` supplied.
    """
    # The stylesheet is inlined so the email carries its own styling.
    stylesheet = _load_css(
        os.getenv("EMAIL_CSS_PATH", "/app/src/misc/mail_style.css")
    )

    context: Dict[str, Any] = {
        "contact_email": os.getenv("EMAIL_CONTACT_ADDRESS", "support@example.com"),
        "email_css": stylesheet,
        "email_logo": _email_logo_html(),
        "accent_color": _accent_color(),
    }

    # Caller-provided values take precedence over the environment defaults.
    context.update(runtime_kwargs)
    return context
|
||||
|
||||
|
||||
def load_template(template_name: str, **runtime_kwargs: Any) -> str:
    """
    Load an HTML email template and fill in its {placeholder} fields.

    The template is looked up as ``<SCRAIBE_TEMPLATES_DIR>/<template_name>``
    (directory defaults to /app/src/misc), e.g.:
        /app/src/misc/upload_notification_template.html
        /app/src/misc/success_template.html
        /app/src/misc/error_notification_template.html

    Args:
        template_name: File name of the template inside the templates dir.
        **runtime_kwargs: Values that override the environment-derived
            context built by ``build_template_context``.

    Returns:
        The rendered template text.

    Raises:
        EmailError: If the template file does not exist, a placeholder has
            no value, or the template contains braces that ``str.format``
            cannot interpret (positional fields, unbalanced braces).
    """
    base = os.getenv("SCRAIBE_TEMPLATES_DIR", "/app/src/misc")
    path = os.path.join(base, template_name)

    if not os.path.exists(path):
        raise EmailError(f"Email template not found: {path}")

    with open(path, "r", encoding="utf-8") as f:
        template = f.read()

    # Build context from env + runtime
    ctx = build_template_context(**runtime_kwargs)

    try:
        return template.format(**ctx)
    except KeyError as e:
        raise EmailError(f"Missing template variable: {e}") from e
    except (IndexError, ValueError) as e:
        # str.format raises these for "{}"/"{0}" fields and for stray or
        # unbalanced braces (typically literal CSS/JS in the template that
        # was not escaped as "{{" / "}}"). Surface them as EmailError so
        # callers see a single failure type for template problems.
        raise EmailError(f"Invalid template syntax in {path}: {e}") from e
|
||||
|
||||
|
||||
def send_email(
    to: str,
    subject: str,
    body: str,
    html: Optional[str],
    attachments: List[str],
    cc: Optional[str] = None,
) -> bool:
    """
    Send an email with optional HTML body and file attachments.

    Args:
        to: Comma-separated list of recipient email addresses.
        subject: Email subject. A blank subject is replaced by a default.
        body: Email body (plain text).
        html: Email body (HTML), or None.
        attachments: List of file paths to attach. Paths that are not
            regular files, or that cannot be read, are logged and skipped.
        cc: Comma-separated list of CC email addresses (optional).

    Returns:
        True if sent successfully.

    Raises:
        EmailError: If the configuration is incomplete, no valid "To"
            address is given, or the SMTP exchange fails.
    """
    try:
        cfg = get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise

    # Parse recipients
    to_list = [addr.strip() for addr in to.split(",") if addr.strip()]
    cc_list = [addr.strip() for addr in cc.split(",") if addr.strip()] if cc else []

    if not to_list:
        raise EmailError("No valid 'To' email addresses provided.")

    # Ensure subject is never blank
    if not subject or not subject.strip():
        logger.warning("Subject was blank or missing; using default subject.")
        subject = "ScrAIbe: Your transcript is ready"
    subject = subject.strip()

    # Text/HTML body as a multipart/alternative part
    alt = MIMEMultipart("alternative")
    alt.attach(MIMEText(body, "plain"))
    if html:
        alt.attach(MIMEText(html, "html"))

    if attachments:
        # With attachments the root is multipart/mixed wrapping the body.
        msg = MIMEMultipart("mixed")
        msg.attach(alt)

        for file_path in attachments:
            if not os.path.isfile(file_path):
                logger.warning("Attachment file not found, skipping: %s", file_path)
                continue

            try:
                with open(file_path, "rb") as f:
                    part = MIMEBase("application", "octet-stream")
                    part.set_payload(f.read())
                encoders.encode_base64(part)
                part.add_header(
                    "Content-Disposition",
                    "attachment",
                    filename=os.path.basename(file_path),
                )
                msg.attach(part)
            except Exception as e:
                # Deliberate best effort: one bad attachment must not
                # prevent the message itself from going out.
                logger.warning("Failed to attach file %s: %s", file_path, e)
    else:
        # No attachments: the alternative part is the root message.
        msg = alt

    # Headers are identical for both message shapes.
    msg["From"] = cfg["from_address"]
    msg["To"] = ", ".join(to_list)
    if cc_list:
        msg["Cc"] = ", ".join(cc_list)
    msg["Subject"] = subject

    # Connect and send. The context manager issues QUIT and closes the
    # socket even when starttls/login/sendmail raises, so a failed attempt
    # no longer leaks the connection.
    try:
        with smtplib.SMTP(cfg["smtp_host"], cfg["smtp_port"], timeout=30) as server:
            server.ehlo()
            if cfg["use_tls"]:
                server.starttls()
                # Re-identify over the now-encrypted channel.
                server.ehlo()

            server.login(cfg["smtp_user"], cfg["smtp_password"])
            server.sendmail(
                cfg["from_address"],
                to_list + cc_list,
                msg.as_string(),
            )
    except Exception as e:
        logger.error("Failed to send email: %s", e)
        raise EmailError(f"Failed to send email: {e}") from e

    logger.info(
        "Email sent to %s (CC: %s) with subject: %s",
        to_list,
        cc_list or "None",
        subject,
    )
    return True
|
||||
|
||||
|
||||
# ------------ DOCX helpers ------------
|
||||
|
||||
# Namespaces
|
||||
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
||||
|
||||
|
||||
def _set_element_attr(elem, attr, value):
    """
    Set ``attr`` on ``elem`` as an attribute qualified with the ``W_NS``
    namespace. ``value`` is converted with ``str`` before being stored.
    """
    qualified_name = "{%s}%s" % (W_NS, attr)
    elem.set(qualified_name, str(value))
|
||||
|
||||
|
||||
def _create_transcript_section_properties(section):
    """
    Apply the transcript page layout to a section's ``sectPr`` element.

    - Page margins: 1440 twips (1 inch) on every side, header/footer
      distance 720 twips, no gutter. A ``w:pgMar`` element is created when
      the section has none.
    - An existing ``w:cols`` element is forced to a single column with
      720 twips spacing; none is created if it is absent.
    - Every ``w:docGrid`` element is removed.
    - Every ``w:lnNumType`` element is removed; line numbers are written
      as text instead of using built-in line numbering.
    """
    sect_pr = section._sectPr

    def w_tag(local_name):
        # Clark-notation tag in the WordprocessingML namespace.
        return f"{{{W_NS}}}{local_name}"

    page_margins = sect_pr.find(w_tag("pgMar"))
    if page_margins is None:
        page_margins = OxmlElement("w:pgMar")
        sect_pr.append(page_margins)

    margin_settings = (
        ("top", "1440"),
        ("right", "1440"),
        ("bottom", "1440"),
        ("left", "1440"),
        ("header", "720"),
        ("footer", "720"),
        ("gutter", "0"),
    )
    for side, twips in margin_settings:
        _set_element_attr(page_margins, side, twips)

    columns = sect_pr.find(w_tag("cols"))
    if columns is not None:
        _set_element_attr(columns, "num", "1")
        _set_element_attr(columns, "space", "720")

    # Drop the document grid first, then any built-in line numbering.
    for unwanted in ("docGrid", "lnNumType"):
        for node in sect_pr.findall(w_tag(unwanted)):
            sect_pr.remove(node)
|
||||
|
||||
|
||||
def _add_transcript_paragraph(doc, line_text, line_number):
    """
    Add a single transcript line as a paragraph with an embedded line number.

    The number is written as ordinary text followed by a tab and padding
    spaces, rather than relying on built-in line numbering, so the layout
    does not depend on how a given word processor renders numbering.

    Args:
        doc: Document the paragraph is appended to.
        line_text: Text of the line. Surrounding whitespace is stripped;
            a blank line adds nothing.
        line_number: Number printed at the start of the line.

    Lines shaped like ``[00:00] SPEAKER 1: content`` get an upper-cased,
    underlined ``[timestamp] SPEAKER:`` label; any other line is written
    as plain text with a narrower pad (continuation line).
    """
    line_text = line_text.strip()
    if not line_text:
        return

    p = doc.add_paragraph()
    pPr = p._p.get_or_add_pPr()

    # Remove any default indent. The element must be compared against None:
    # an lxml element without children is falsy, so the previous
    # `find(...) and remove(...)` form never removed a (childless) w:ind.
    ind = pPr.find(f"{{{W_NS}}}ind")
    if ind is not None:
        pPr.remove(ind)

    # Left tab stop for the line-number column (360 twips = 0.25")
    tabs = OxmlElement("w:tabs")
    tab = OxmlElement("w:tab")
    _set_element_attr(tab, "val", "left")
    _set_element_attr(tab, "pos", "360")
    tabs.append(tab)
    pPr.append(tabs)

    # No space before/after. With lineRule="auto" the `line` value is in
    # 240ths of a line, so 360 means 1.5x line spacing.
    spacing = OxmlElement("w:spacing")
    _set_element_attr(spacing, "before", "0")
    _set_element_attr(spacing, "after", "0")
    _set_element_attr(spacing, "line", "360")
    _set_element_attr(spacing, "lineRule", "auto")
    pPr.append(spacing)

    # Try to match: [00:00] SPEAKER 1: content
    m = re.match(r"\[(\d+:\d+(?::\d+)?)\]\s*(.+?):\s*(.*)", line_text)

    def add_run(text, underline=False):
        # Every run shares the same face and size; only underline varies.
        run = p.add_run(text)
        run.font.name = "Courier"
        run.font.size = Pt(12)
        run.underline = underline
        return run

    add_run(str(line_number))

    # Tab + padding between the number and the content:
    # 2 base spaces, plus 7 for the first line of a speaker turn or
    # 3 for a continuation line. Built with explicit counts so the widths
    # cannot silently drift from what is documented here.
    base_spaces = 2
    extra_spaces = 7 if m else 3
    add_run("\t" + " " * (base_spaces + extra_spaces))

    if m:
        ts, speaker, content = m.groups()
        add_run(f"[{ts}] {speaker.upper()}:", underline=True)
        # The separating space is its own run so it is not underlined.
        add_run(" ")
        add_run(content.strip())
    else:
        # Non-standard line: plain text
        add_run(line_text)
|
||||
|
||||
|
||||
# ------------ Public DOCX functions ------------
|
||||
|
||||
def _wrap_transcript_line(raw_line, max_len):
    """Greedily wrap one line into segments of at most ``max_len`` characters
    without splitting words (a single over-long word stays whole)."""
    segments = []
    current = ""
    for word in raw_line.split():
        if not current:
            current = word
        elif len(current) + 1 + len(word) <= max_len:
            current += " " + word
        else:
            segments.append(current)
            current = word
    if current:
        segments.append(current)
    return segments


def create_transcript_docx(text: str, filename: str):
    """
    Create a transcript DOCX and save it to ``filename``.

    Layout:
    - 1" margins on all sides
    - 12pt Courier font
    - Each page holds at most 29 lines, numbered 1-29
    - Content wrapped at 52 characters per line; the line number, tab and
      padding spaces are added on top of that
    - Words preserved (no clipping or omission)
    - Explicit page break after every 29 lines
    - Centered footer: "X of Y"
    - Optional cover page when COVER_PAGE_ENABLED is true/1/yes

    Args:
        text: Transcript text, one logical line per input line. Blank
            lines are ignored.
        filename: Path the document is written to.
    """
    LINES_PER_PAGE = 29
    MAX_CONTENT_LEN = 52

    # Step 1: Wrap every non-blank input line, then cut the flat list of
    # wrapped lines into pages. (The previous loop flushed a page only once
    # it already held 30 lines, one more than the documented 29.)
    wrapped_lines = []
    for raw_line in text.strip().splitlines():
        wrapped_lines.extend(_wrap_transcript_line(raw_line.strip(), MAX_CONTENT_LEN))

    prepared_pages = [
        wrapped_lines[start:start + LINES_PER_PAGE]
        for start in range(0, len(wrapped_lines), LINES_PER_PAGE)
    ]

    # Step 2: Create DOCX
    doc = Document()
    style = doc.styles["Normal"]
    style.font.name = "Courier"
    style.font.size = Pt(12)

    # Remove any paragraph that ships with the default template.
    body = doc.element.body
    for p in list(body.findall(f"{{{W_NS}}}p")):
        body.remove(p)

    _create_transcript_section_properties(doc.sections[0])

    # Step 3: Optionally add cover page
    from . import docx_cover
    cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
    if cover_enabled:
        docx_cover.add_cover_page(
            doc,
            title="TRANSCRIPT",
            subtitle=None,
            metadata=None,
            include_logo=True,
        )

    # Step 4: Write prepared pages into DOCX
    for page_idx, page_lines in enumerate(prepared_pages):
        if page_idx > 0:
            # A real page break (a run containing <w:br w:type="page"/>).
            # The previous code appended a "w:pageBreak" element to the
            # paragraph properties, which is not part of WordprocessingML.
            doc.add_page_break()

        # Numbering restarts at 1 on every page.
        for line_num, line_text in enumerate(page_lines, start=1):
            _add_transcript_paragraph(doc, line_text, line_number=line_num)

    # Step 5: Add footer: "X of Y" centered
    section = doc.sections[0]
    footer = section.footer
    footer.is_linked_to_previous = False
    footer_para = footer.paragraphs[0] if footer.paragraphs else footer.add_paragraph()
    footer_para.alignment = WD_ALIGN_PARAGRAPH.CENTER

    for r in footer_para.runs:
        r.text = ""

    def add_field(run, code):
        # Field markup written into one run: begin marker, instruction
        # text, end marker.
        fldChar = OxmlElement("w:fldChar")
        fldChar.set(qn("w:fldCharType"), "begin")
        run._r.append(fldChar)

        instrText = OxmlElement("w:instrText")
        instrText.set(qn("xml:space"), "preserve")
        instrText.text = code
        run._r.append(instrText)

        fldCharEnd = OxmlElement("w:fldChar")
        fldCharEnd.set(qn("w:fldCharType"), "end")
        run._r.append(fldCharEnd)

    add_field(footer_para.add_run(), " PAGE ")
    footer_para.add_run(" of ")
    add_field(footer_para.add_run(), " NUMPAGES ")

    doc.save(filename)
|
||||
|
||||
|
||||
def create_summary_docx(text: str, filename: str):
    """
    Write a summary document to ``filename``.

    The document uses 12pt Courier as its base font and 1" margins on all
    sides. When COVER_PAGE_ENABLED is true/1/yes a "SUMMARY" cover page is
    added first. Non-blank ``text`` is then rendered through
    ``docx_styles.parse_simple_md_to_paragraphs``.
    """
    from . import docx_styles

    document = Document()

    # Base font
    normal_style = document.styles["Normal"]
    normal_style.font.name = "Courier"
    normal_style.font.size = Pt(12)

    # One-inch margins on every section
    one_inch = Inches(1.0)
    for sec in document.sections:
        sec.left_margin = one_inch
        sec.right_margin = one_inch
        sec.top_margin = one_inch
        sec.bottom_margin = one_inch

    # Start from an empty body
    doc_body = document.element.body
    paragraph_tag = f"{{{W_NS}}}p"
    for existing in list(doc_body.findall(paragraph_tag)):
        doc_body.remove(existing)

    # Optional cover page
    from . import docx_cover
    cover_flag = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower()
    if cover_flag in ("true", "1", "yes"):
        docx_cover.add_cover_page(
            document,
            title="SUMMARY",
            subtitle=None,
            metadata=None,
            include_logo=True,
        )

    # Summary content, rendered with markdown-aware styling
    summary = text.strip()
    if summary:
        docx_styles.parse_simple_md_to_paragraphs(document, summary)

    document.save(filename)
|
||||
@@ -0,0 +1,566 @@
|
||||
"""
|
||||
LocalAI Client Module
|
||||
---------------------
|
||||
|
||||
This module provides a client for communicating with a LocalAI server
|
||||
running vibevoice.cpp for transcription and speaker diarization.
|
||||
|
||||
It replaces the previous local Whisper + Pyannote pipeline by sending
|
||||
audio files to the /v1/audio/diarization endpoint and mapping the
|
||||
response into the same Transcript format used by the UI.
|
||||
|
||||
For long audio files, it can chunk the input to avoid GPU OOM errors.
|
||||
|
||||
Environment Variables:
|
||||
LOCALAI_API_URL: (required) Base URL of the LocalAI server
|
||||
(e.g., http://localhost:8080)
|
||||
LOCALAI_API_KEY: (optional) API key, if configured
|
||||
LOCALAI_MODEL: (optional) Model name to use (default: vibevoice-diarize)
|
||||
|
||||
Chunking / long audio (all optional):
|
||||
LOCALAI_CHUNK_DURATION: Max duration of each chunk in seconds
|
||||
(default: 180.0)
|
||||
LOCALAI_CHUNK_OVERLAP: Overlap between consecutive chunks in seconds
|
||||
(default: 2.0)
|
||||
LOCALAI_MAX_SINGLE_REQUEST_DURATION: If audio duration exceeds this, chunking
|
||||
is enabled automatically (default: 300.0)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from .audio import get_audio_duration, split_audio_into_chunks
|
||||
|
||||
logger = logging.getLogger("scraibe.localai_client")
|
||||
|
||||
|
||||
class LocalAIError(Exception):
    """Signals that the LocalAI API reported an error or sent back a
    response that could not be used."""
|
||||
|
||||
|
||||
class LocalAIClient:
|
||||
"""
|
||||
Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.
|
||||
|
||||
Responsibilities:
|
||||
- Read configuration from environment.
|
||||
- Upload audio file as multipart/form-data.
|
||||
- Parse diarization + transcription response (verbose_json).
|
||||
- Map response into the same structure expected by Scraibe's Transcript.
|
||||
- For long audio: chunk, transcribe each chunk, merge results.
|
||||
"""
|
||||
|
||||
# Default thresholds for chunking long audio to avoid GPU OOM.
|
||||
# These can be overridden via environment or at call time.
|
||||
DEFAULT_CHUNK_DURATION = 180.0 # seconds
|
||||
DEFAULT_CHUNK_OVERLAP = 2.0 # seconds
|
||||
|
||||
def __init__(
    self,
    api_url: Optional[str] = None,
    api_key: Optional[str] = None,
    model: Optional[str] = None,
    timeout: float = 3600.0,
):
    """
    Args:
        api_url: LocalAI server URL (e.g., http://localhost:8080).
            Falls back to LOCALAI_API_URL env var.
        api_key: API key, if required. Falls back to LOCALAI_API_KEY.
        model: Model name (e.g., vibevoice-diarize).
            Falls back to LOCALAI_MODEL or default.
        timeout: Request timeout in seconds.

    Raises:
        LocalAIError: If no server URL is available from the argument or
            the environment (including a blank/whitespace-only value).
    """
    # The trailing `or ""` keeps .strip() from being called on None when
    # neither the argument nor the env var is set; without it the missing
    # URL surfaced as AttributeError and the check below was unreachable.
    self.api_url = (api_url or os.getenv("LOCALAI_API_URL") or "").strip().rstrip("/")
    self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
    self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
    self.timeout = timeout

    if not self.api_url:
        raise LocalAIError(
            "LOCALAI_API_URL is not set. "
            "Provide the LocalAI server URL via environment or constructor."
        )

    logger.info(
        "Initializing LocalAIClient: url=%s model=%s",
        self.api_url,
        self.model,
    )

    self._client = httpx.Client(
        base_url=self.api_url,
        timeout=self.timeout,
        follow_redirects=True,
    )
|
||||
|
||||
@staticmethod
|
||||
def _env_float(var: str, default: float) -> float:
|
||||
"""
|
||||
Read a float from environment with a fallback default.
|
||||
"""
|
||||
val = (os.getenv(var) or "").strip()
|
||||
if val == "":
|
||||
return default
|
||||
try:
|
||||
return float(val)
|
||||
except ValueError:
|
||||
logger.warning(
|
||||
"Invalid value for %s: %s; using default %s", var, val, default
|
||||
)
|
||||
return default
|
||||
|
||||
def _effective_chunk_duration(self, provided: Optional[float]) -> float:
|
||||
"""
|
||||
Resolve chunk_duration using this precedence:
|
||||
1) provided argument
|
||||
2) LOCALAI_CHUNK_DURATION env
|
||||
3) class default
|
||||
"""
|
||||
if provided is not None:
|
||||
return provided
|
||||
return self._env_float("LOCALAI_CHUNK_DURATION", self.DEFAULT_CHUNK_DURATION)
|
||||
|
||||
def _effective_chunk_overlap(self, provided: Optional[float]) -> float:
|
||||
"""
|
||||
Resolve chunk_overlap:
|
||||
1) provided argument
|
||||
2) LOCALAI_CHUNK_OVERLAP env
|
||||
3) class default
|
||||
"""
|
||||
if provided is not None:
|
||||
return provided
|
||||
return self._env_float("LOCALAI_CHUNK_OVERLAP", self.DEFAULT_CHUNK_OVERLAP)
|
||||
|
||||
def _effective_max_single_request_duration(self, provided: Optional[float]) -> float:
|
||||
"""
|
||||
Resolve max_single_request_duration:
|
||||
1) provided argument
|
||||
2) LOCALAI_MAX_SINGLE_REQUEST_DURATION env
|
||||
3) default 300.0
|
||||
"""
|
||||
if provided is not None:
|
||||
return provided
|
||||
return self._env_float("LOCALAI_MAX_SINGLE_REQUEST_DURATION", 300.0)
|
||||
|
||||
def close(self):
    """Shut down the HTTP client held in ``self._client``."""
    http_client = self._client
    http_client.close()
|
||||
|
||||
def __del__(self):
|
||||
try:
|
||||
self._client.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def diarize_and_transcribe(
    self,
    audio_path: str,
    *,
    language: Optional[str] = None,
    num_speakers: Optional[int] = None,
    min_speakers: Optional[int] = None,
    max_speakers: Optional[int] = None,
    clustering_threshold: Optional[float] = None,
    min_duration_on: Optional[float] = None,
    min_duration_off: Optional[float] = None,
    response_format: Optional[str] = None,
    include_text: Optional[bool] = None,
    verbose: bool = False,
    return_raw: bool = False,
    use_chunking: Optional[bool] = None,
    chunk_duration: Optional[float] = None,
    chunk_overlap: Optional[float] = None,
    max_single_request_duration: Optional[float] = None,
    **_ignored,
) -> Dict[str, Any]:
    """
    Diarize and transcribe an audio file through LocalAI, dispatching to
    either the single-request path or the chunked path.

    Args:
        audio_path: Path to the audio file.
        language: Language hint, forwarded if set.
        num_speakers: Optional exact speaker count.
        min_speakers: Optional hint.
        max_speakers: Optional hint.
        clustering_threshold: Optional clustering threshold.
        min_duration_on: Optional min segment duration.
        min_duration_off: Optional min gap duration.
        response_format: "json", "verbose_json", or "rttm".
            Defaults to "verbose_json".
        include_text: Whether to request per-segment text.
            Defaults to True.
        verbose: If True, prints progress messages.
        return_raw: If True, also return the raw API response in 'raw_result'.
        use_chunking: Whether to chunk the audio. When None, the decision
            is made here: chunking is used if the audio duration can be
            determined and exceeds ``max_single_request_duration``.
        chunk_duration: Max duration per chunk in seconds.
            Falls back to LOCALAI_CHUNK_DURATION env, then 180.0.
        chunk_overlap: Overlap between chunks in seconds.
            Falls back to LOCALAI_CHUNK_OVERLAP env, then 2.0.
        max_single_request_duration: Duration threshold for automatic
            chunking. Falls back to LOCALAI_MAX_SINGLE_REQUEST_DURATION
            env, then 300.0.
        **_ignored: Extra keyword arguments are accepted and discarded.

    Returns:
        Whatever the selected path returns (a dict).
    """
    if verbose:
        print("Starting diarization and transcription via LocalAI.")

    logger.info("diarize_and_transcribe requested for: %s", audio_path)

    # Argument > environment > built-in default, for each chunking knob.
    chunk_duration = self._effective_chunk_duration(chunk_duration)
    chunk_overlap = self._effective_chunk_overlap(chunk_overlap)
    max_single = self._effective_max_single_request_duration(max_single_request_duration)

    if use_chunking is None:
        try:
            duration = get_audio_duration(audio_path)
        except RuntimeError:
            # Unknown duration: stay on the single-request path.
            duration = None

        use_chunking = (duration is not None and duration > max_single)
        logger.info(
            "Auto-chunking decision: duration=%s, threshold=%s, use_chunking=%s",
            duration,
            max_single,
            use_chunking,
        )

    # Keyword arguments common to both downstream paths.
    shared_kwargs = dict(
        audio_path=audio_path,
        language=language,
        num_speakers=num_speakers,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
        clustering_threshold=clustering_threshold,
        min_duration_on=min_duration_on,
        min_duration_off=min_duration_off,
        response_format=response_format,
        include_text=include_text,
        verbose=verbose,
        return_raw=return_raw,
    )

    if use_chunking:
        return self._diarize_and_transcribe_chunked(
            chunk_duration=chunk_duration,
            chunk_overlap=chunk_overlap,
            **shared_kwargs,
        )

    return self._diarize_and_transcribe_single(**shared_kwargs)
|
||||
|
||||
def _diarize_and_transcribe_single(
    self,
    audio_path: str,
    *,
    language: Optional[str] = None,
    num_speakers: Optional[int] = None,
    min_speakers: Optional[int] = None,
    max_speakers: Optional[int] = None,
    clustering_threshold: Optional[float] = None,
    min_duration_on: Optional[float] = None,
    min_duration_off: Optional[float] = None,
    response_format: Optional[str] = None,
    include_text: Optional[bool] = None,
    verbose: bool = False,
    return_raw: bool = False,
) -> Dict[str, Any]:
    """
    Internal: upload the whole file in one POST to /v1/audio/diarization.

    ``response_format`` defaults to "verbose_json" and ``include_text`` to
    True. Optional tuning parameters are only sent when they are not None.

    Returns:
        The result of ``self._parse_diarization_response`` on the decoded
        JSON body; with ``return_raw`` the decoded body is added under
        'raw_result'.

    Raises:
        LocalAIError: If the audio file does not exist, the server answers
            with status >= 400, or the body is not valid JSON.
    """
    if verbose:
        print("Starting diarization and transcription via LocalAI.")

    logger.info("diarize_and_transcribe requested for: %s", audio_path)

    if response_format is None:
        response_format = "verbose_json"
    if include_text is None:
        include_text = True

    # Form fields sent alongside the file; every value is a string.
    form_fields = {
        "model": self.model,
        "response_format": response_format,
        "include_text": str(include_text).lower(),
    }
    if language is not None:
        form_fields["language"] = language

    optional_params = (
        ("num_speakers", num_speakers),
        ("min_speakers", min_speakers),
        ("max_speakers", max_speakers),
        ("clustering_threshold", clustering_threshold),
        ("min_duration_on", min_duration_on),
        ("min_duration_off", min_duration_off),
    )
    for field_name, field_value in optional_params:
        if field_value is not None:
            form_fields[field_name] = str(field_value)

    logger.debug("LocalAI request params: %s", form_fields)

    if not os.path.exists(audio_path):
        raise LocalAIError(f"Audio file not found: {audio_path}")

    request_headers = (
        {"Authorization": f"Bearer {self.api_key}"} if self.api_key else {}
    )

    # The handle has to stay open while the request is sent, so the POST
    # lives inside the `with` block.
    with open(audio_path, "rb") as audio_file:
        upload = {
            "file": (os.path.basename(audio_path), audio_file, "application/octet-stream")
        }

        logger.info("Sending request to LocalAI: /v1/audio/diarization")
        resp = self._client.post(
            "/v1/audio/diarization",
            data=form_fields,
            files=upload,
            headers=request_headers,
        )

    logger.info("LocalAI response status: %d", resp.status_code)

    if resp.status_code >= 400:
        error_body = resp.text
        logger.error("LocalAI error response: %s", error_body)
        raise LocalAIError(
            f"LocalAI request failed with status {resp.status_code}: {error_body}"
        )

    try:
        raw_result = resp.json()
    except json.JSONDecodeError:
        logger.error("Failed to parse LocalAI response as JSON.")
        raise LocalAIError(
            "Failed to parse LocalAI response as JSON."
        )

    if verbose:
        print("Diarization and transcription finished. Starting post-processing.")

    parsed = self._parse_diarization_response(raw_result)
    if return_raw:
        parsed["raw_result"] = raw_result
    return parsed
|
||||
|
||||
def _diarize_and_transcribe_chunked(
    self,
    audio_path: str,
    *,
    language: Optional[str] = None,
    num_speakers: Optional[int] = None,
    min_speakers: Optional[int] = None,
    max_speakers: Optional[int] = None,
    clustering_threshold: Optional[float] = None,
    min_duration_on: Optional[float] = None,
    min_duration_off: Optional[float] = None,
    response_format: Optional[str] = None,
    include_text: Optional[bool] = None,
    verbose: bool = False,
    return_raw: bool = False,
    chunk_duration: float = DEFAULT_CHUNK_DURATION,
    chunk_overlap: float = DEFAULT_CHUNK_OVERLAP,
) -> Dict[str, Any]:
    """
    Internal: chunked diarization and transcription for long audio.

    - Splits audio into overlapping chunks.
    - Transcribes each chunk via /v1/audio/diarization.
    - Merges segments with adjusted timestamps.

    Args:
        audio_path: Path of the audio file to process.
        language, num_speakers, min_speakers, max_speakers,
        clustering_threshold, min_duration_on, min_duration_off,
        response_format, include_text: Forwarded unchanged to
            ``_diarize_and_transcribe_single`` for every chunk.
        verbose: Print progress messages to stdout.
        return_raw: Also collect the per-chunk raw responses.
        chunk_duration: Passed to the splitter as ``max_duration``.
        chunk_overlap: Passed to the splitter as ``overlap``.

    Returns:
        Dict with parallel lists ``segments`` ([start, end] on the global
        timeline), ``speakers`` and ``transcripts``, sorted by start time.
        When ``return_raw`` is set and raw results were collected, a
        ``raw_result`` entry of the form
        ``{"chunked": True, "chunks": [...]}`` is added. If the splitter
        yields exactly one chunk, the single-request result is returned as-is.

    Note:
        Segments that fall into the overlap between two chunks are not
        de-duplicated here. Speaker labels come from independent per-chunk
        requests; nothing in this method reconciles them across chunks.
    """
    if verbose:
        print("Audio is long; splitting into chunks to avoid GPU memory issues.")

    logger.info(
        "Chunked transcription: chunk_duration=%s, overlap=%s",
        chunk_duration,
        chunk_overlap,
    )

    chunks = split_audio_into_chunks(
        input_path=audio_path,
        max_duration=chunk_duration,
        overlap=chunk_overlap,
    )

    # Options shared by every request, single or per chunk.
    request_kwargs = dict(
        language=language,
        num_speakers=num_speakers,
        min_speakers=min_speakers,
        max_speakers=max_speakers,
        clustering_threshold=clustering_threshold,
        min_duration_on=min_duration_on,
        min_duration_off=min_duration_off,
        response_format=response_format,
        include_text=include_text,
        return_raw=return_raw,
    )

    all_segments: List[List[float]] = []
    all_speakers: List[str] = []
    all_transcripts: List[str] = []
    raw_results: List[Dict[str, Any]] = []
    temp_files = [c["path"] for c in chunks]

    try:
        if len(chunks) == 1:
            # No actual split needed; fall back to single-request path.
            # Kept inside the try so a chunk file written by the splitter is
            # still cleaned up by the finally clause below.
            return self._diarize_and_transcribe_single(
                audio_path=chunks[0]["path"],
                verbose=verbose,
                **request_kwargs,
            )

        for i, chunk_info in enumerate(chunks):
            chunk_path = chunk_info["path"]
            chunk_start = chunk_info["start"]

            if verbose:
                print(
                    f"Transcribing chunk {i+1}/{len(chunks)} "
                    f"(start={chunk_start:.1f}s)"
                )

            logger.info(
                "Transcribing chunk %d/%d, start=%.1f", i + 1, len(chunks), chunk_start
            )

            # Use single-request logic for each chunk
            chunk_result = self._diarize_and_transcribe_single(
                audio_path=chunk_path,
                verbose=False,
                **request_kwargs,
            )

            segs = chunk_result.get("segments", [])
            spks = chunk_result.get("speakers", [])
            txts = chunk_result.get("transcripts", [])
            raw = chunk_result.get("raw_result")

            # Adjust timestamps to global timeline. zip() stops at the
            # shortest list, so the three outputs stay aligned.
            for seg, sp, txt in zip(segs, spks, txts):
                all_segments.append(
                    [float(seg[0]) + chunk_start, float(seg[1]) + chunk_start]
                )
                all_speakers.append(sp)
                all_transcripts.append(txt)

            if return_raw and raw is not None:
                raw_results.append(raw)

    finally:
        # Clean up temporary chunk files; never delete the caller's input.
        for path in temp_files:
            if path and os.path.exists(path) and path != audio_path:
                try:
                    os.remove(path)
                except Exception as e:
                    logger.warning("Failed to remove chunk file %s: %s", path, e)

    # Sort segments by start time (stable, so ties keep chunk order)
    combined = sorted(
        zip(all_segments, all_speakers, all_transcripts),
        key=lambda item: item[0][0],
    )
    all_segments = [item[0] for item in combined]
    all_speakers = [item[1] for item in combined]
    all_transcripts = [item[2] for item in combined]

    if verbose:
        print(
            f"Chunked transcription complete. Total segments: {len(all_segments)}"
        )

    result = {
        "segments": all_segments,
        "speakers": all_speakers,
        "transcripts": all_transcripts,
    }

    if return_raw and raw_results:
        result["raw_result"] = {
            "chunked": True,
            "chunks": raw_results,
        }

    return result
|
||||
|
||||
def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert LocalAI verbose_json response into the internal format used by Scraibe:
|
||||
{
|
||||
"segments": [ [start, end], ... ],
|
||||
"speakers": [ "SPEAKER_00", ... ],
|
||||
"transcripts": [ "text for segment", ... ]
|
||||
}
|
||||
"""
|
||||
segments = result.get("segments", [])
|
||||
|
||||
if not segments:
|
||||
logger.warning("LocalAI returned no segments.")
|
||||
return {
|
||||
"segments": [],
|
||||
"speakers": [],
|
||||
"transcripts": [],
|
||||
}
|
||||
|
||||
out_segments = []
|
||||
out_speakers = []
|
||||
out_transcripts = []
|
||||
|
||||
for seg in segments:
|
||||
start = float(seg.get("start", 0.0))
|
||||
end = float(seg.get("end", 0.0))
|
||||
speaker = seg.get("speaker", "SPEAKER_00")
|
||||
text = seg.get("text", "").strip()
|
||||
|
||||
out_segments.append([start, end])
|
||||
out_speakers.append(speaker)
|
||||
out_transcripts.append(text)
|
||||
|
||||
logger.info(
|
||||
"Parsed %d segments from LocalAI.",
|
||||
len(out_segments),
|
||||
)
|
||||
|
||||
return {
|
||||
"segments": out_segments,
|
||||
"speakers": out_speakers,
|
||||
"transcripts": out_transcripts,
|
||||
}
|
||||
@@ -0,0 +1,205 @@
|
||||
"""
|
||||
MCP-style HTTP server for ScrAIbe.
|
||||
|
||||
- Exposes an OpenAPI-compliant endpoint for external LLMs to:
|
||||
- Upload audio
|
||||
- Receive transcript JSON (no summary)
|
||||
- WebUI remains always enabled; this is additive.
|
||||
|
||||
Configuration (env):
|
||||
- MCP_SERVER_ENABLED: "true"/"false" (default: false)
|
||||
- MCP_SERVER_HOST: bind address (default: 0.0.0.0)
|
||||
- MCP_SERVER_PORT: port (default: 8000)
|
||||
- MCP_USE_CELERY: "true"/"false" (default: true)
|
||||
- If true, uses Celery tasks; if false, runs synchronously.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from .autotranscript import Scraibe
|
||||
|
||||
logger = logging.getLogger("scraibe.mcp_server")
|
||||
|
||||
app = FastAPI(
|
||||
title="ScrAIbe MCP Transcription API",
|
||||
version="0.1.0",
|
||||
description=(
|
||||
"MCP-style HTTP API for ScrAIbe. "
|
||||
"Allows external LLMs to upload audio and receive transcript JSON."
|
||||
),
|
||||
)
|
||||
|
||||
# In-memory job store for MCP (simple; can be replaced with Redis later)
|
||||
_mcp_jobs: dict = {}
|
||||
|
||||
|
||||
def _job_id() -> str:
|
||||
return str(uuid.uuid4())
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: always answers with status "ok"."""
    payload = {"status": "ok"}
    return payload
|
||||
|
||||
|
||||
@app.post("/transcribe")
async def transcribe(
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    num_speakers: Optional[int] = Form(None),
):
    """
    Upload audio and start transcription.

    The upload is written to SCRAIBE_UPLOAD_DIR (default /tmp/scraibe_uploads)
    under a generated name. The job is then handed to the Celery task when
    MCP_USE_CELERY is truthy and the task module can be imported; otherwise it
    runs in a background thread of this process.

    Returns:
        {
            "job_id": "<id>",
            "status": "queued" | "processing" | "completed" | "error",
            "message": "..."
        }

    Use GET /transcribe/{job_id}/status and /json to retrieve results.

    Raises:
        HTTPException: status 500 when the upload cannot be saved.
    """
    use_celery = os.getenv("MCP_USE_CELERY", "true").strip().lower() in ("true", "1", "yes")

    # Save uploaded file temporarily
    try:
        from pathlib import Path

        upload_dir = Path(os.getenv("SCRAIBE_UPLOAD_DIR", "/tmp/scraibe_uploads"))
        upload_dir.mkdir(parents=True, exist_ok=True)

        # Only the suffix of the client-supplied name is reused; the stem is
        # generated here.
        ext = Path(file.filename or "file").suffix or ".wav"
        ts = time.strftime("%Y%m%d%H%M%S")
        tmp_name = f"mcp_upload_{ts}_{uuid.uuid4().hex[:8]}{ext}"
        file_path = upload_dir / tmp_name

        content = await file.read()
        file_path.write_bytes(content)
    except Exception as e:
        logger.error("Error saving MCP upload: %s", e)
        raise HTTPException(status_code=500, detail=f"Error saving file: {e}")

    def _discard_upload() -> None:
        """Best-effort removal of the saved upload; never raises."""
        try:
            file_path.unlink()
        except FileNotFoundError:
            pass
        except OSError as e:
            logger.warning("Failed to remove MCP upload %s: %s", file_path, e)

    job_id = _job_id()
    lang = language or None
    speakers = int(num_speakers) if num_speakers else None

    if use_celery:
        try:
            from .tasks import process_mcp_transcribe_task
        except ImportError:
            # Fallback: run synchronously
            use_celery = False

    if use_celery:
        try:
            process_mcp_transcribe_task.delay(
                audio_path=str(file_path),
                job_id=job_id,
                language=lang,
                num_speakers=speakers,
            )
        except Exception as e:
            logger.error("Error enqueuing MCP job: %s", e)
            # Nothing will ever process this upload; do not leave it on disk.
            _discard_upload()
            _mcp_jobs[job_id] = {
                "status": "error",
                "message": f"Error enqueuing job: {e}",
                "file_path": str(file_path),
            }
            return {
                "job_id": job_id,
                "status": "error",
                "message": _mcp_jobs[job_id]["message"],
            }

        # NOTE(review): _mcp_jobs is an in-process dict; nothing visible here
        # updates this record when the Celery task finishes -- confirm the
        # task reports completion through some other channel.
        _mcp_jobs[job_id] = {
            "status": "queued",
            "message": "Job queued for processing.",
            "file_path": str(file_path),
        }
        return {
            "job_id": job_id,
            "status": "queued",
            "message": _mcp_jobs[job_id]["message"],
        }

    # Synchronous path
    _mcp_jobs[job_id] = {
        "status": "processing",
        "message": "Transcription started (synchronous).",
        "file_path": str(file_path),
    }

    def _run_sync():
        try:
            scraibe = Scraibe(verbose=False)
            result = scraibe.transcribe(
                audio_file=str(file_path),
                language=lang,
                num_speakers=speakers,
                verbose=False,
                for_export=True,
            )
            transcript_text = result.get("transcript", "")
            segments = result.get("segments", [])
            _mcp_jobs[job_id]["status"] = "completed"
            _mcp_jobs[job_id]["transcript"] = transcript_text
            _mcp_jobs[job_id]["segments"] = segments
            _mcp_jobs[job_id]["message"] = "Transcription completed."
        except Exception as e:
            logger.error("MCP sync transcription error: %s", e)
            _mcp_jobs[job_id]["status"] = "error"
            _mcp_jobs[job_id]["message"] = f"Transcription error: {e}"
        finally:
            # The audio is no longer needed once the attempt has finished.
            _discard_upload()

    import threading

    # Daemon thread: the HTTP response is returned immediately and the
    # worker does not block interpreter shutdown.
    t = threading.Thread(target=_run_sync, daemon=True)
    t.start()

    return {
        "job_id": job_id,
        "status": "processing",
        "message": _mcp_jobs[job_id]["message"],
    }
|
||||
|
||||
|
||||
@app.get("/transcribe/{job_id}/status")
async def get_status(job_id: str):
    """
    Report the current state of a job from the in-memory job store.

    Raises:
        HTTPException: status 404 when the job id is unknown (or its record
            is empty).
    """
    record = _mcp_jobs.get(job_id)
    if record:
        return {
            "job_id": job_id,
            "status": record["status"],
            "message": record.get("message", ""),
        }
    raise HTTPException(status_code=404, detail="Job not found")
|
||||
|
||||
|
||||
@app.get("/transcribe/{job_id}/json")
async def get_json(job_id: str):
    """
    Return the transcript and segments of a completed job as JSON.

    Raises:
        HTTPException: status 404 when the job id is unknown, status 400 when
            the job exists but its status is not "completed".
    """
    record = _mcp_jobs.get(job_id)
    if not record:
        raise HTTPException(status_code=404, detail="Job not found")

    current = record["status"]
    if current != "completed":
        raise HTTPException(
            status_code=400,
            detail=f"Job not completed. Current status: {record['status']}",
        )

    body = {
        "job_id": job_id,
        "transcript": record.get("transcript", ""),
        "segments": record.get("segments", []),
    }
    return JSONResponse(content=body)
|
||||
+50
-29
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
import yaml
|
||||
from pyannote.audio.core.model import CACHE_DIR as PYANNOTE_CACHE_DIR
|
||||
import logging
|
||||
from argparse import Action
|
||||
from ast import literal_eval
|
||||
|
||||
@@ -9,42 +8,64 @@ CACHE_DIR = os.getenv(
|
||||
os.path.expanduser("~/.cache/torch/models"),
|
||||
)
|
||||
|
||||
if CACHE_DIR != PYANNOTE_CACHE_DIR:
|
||||
os.environ["PYANNOTE_CACHE"] = os.path.join(CACHE_DIR, "pyannote")
|
||||
|
||||
# Legacy paths kept for backward compatibility (ignored by LocalAI client)
|
||||
WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper")
|
||||
PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
|
||||
PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") \
|
||||
if os.path.exists(os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")) \
|
||||
else ('jaikinator/scraibe', 'pyannote/speaker-diarization-3.1')
|
||||
PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
|
||||
|
||||
|
||||
def config_diarization_yaml(file_path: str, path_to_segmentation: str = None) -> None:
|
||||
"""Configure diarization pipeline from a YAML file.
|
||||
|
||||
This function updates the YAML file to use the given segmentation model
|
||||
offline, and avoids manual file manipulation.
|
||||
def setup_logging(level: str = "INFO"):
|
||||
"""
|
||||
Configure root logger to write to stdout so Docker can capture logs.
|
||||
|
||||
Args:
|
||||
file_path (str): Path to the YAML file.
|
||||
path_to_segmentation (str, optional): Optional path to the segmentation model.
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the segmentation model file is not found.
|
||||
level: Logging level (DEBUG, INFO, WARNING, ERROR, CRITICAL).
|
||||
"""
|
||||
with open(file_path, "r") as stream:
|
||||
yml = yaml.safe_load(stream)
|
||||
numeric_level = getattr(logging, level.upper(), logging.INFO)
|
||||
if not isinstance(numeric_level, int):
|
||||
numeric_level = logging.INFO
|
||||
|
||||
segmentation_path = path_to_segmentation or os.path.join(
|
||||
PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
|
||||
yml["pipeline"]["params"]["segmentation"] = segmentation_path
|
||||
logging.basicConfig(
|
||||
level=numeric_level,
|
||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
datefmt="%Y-%m-%dT%H:%M:%S%z",
|
||||
force=True,
|
||||
)
|
||||
|
||||
if not os.path.exists(segmentation_path):
|
||||
raise FileNotFoundError(
|
||||
f"Segmentation model not found at {segmentation_path}")
|
||||
|
||||
with open(file_path, "w") as stream:
|
||||
yaml.dump(yml, stream)
|
||||
def set_threads(parse_threads=None, yaml_threads=None):
    """
    Configure number of threads.

    In LocalAI mode, this is mainly kept for backward compatibility.

    The command-line value takes precedence over the YAML value. When a value
    is chosen it is exported as OMP_NUM_THREADS and MKL_NUM_THREADS; when both
    arguments are None nothing is changed.

    Args:
        parse_threads: Thread count from the command line (--num-threads).
        yaml_threads: Thread count from the YAML configuration (num_threads).

    Raises:
        ValueError: If the selected value is not an int (bool is rejected
            too) or is smaller than 1.
    """

    def _validated(value, option_name):
        """Return value if it is a positive, non-bool int; raise ValueError otherwise."""
        # bool is a subclass of int; without this check True would be
        # exported as the string "True".
        if isinstance(value, bool) or not isinstance(value, int):
            raise ValueError(
                f"Type of {option_name} must be int, but the type is {type(value)}"
            )
        if value < 1:
            raise ValueError(
                f"Number of threads must be a positive integer, {value} was given"
            )
        return value

    chosen = None
    if parse_threads is not None:
        chosen = _validated(parse_threads, "--num-threads")
    elif yaml_threads is not None:
        chosen = _validated(yaml_threads, "num_threads")

    if chosen is not None:
        os.environ["OMP_NUM_THREADS"] = str(chosen)
        os.environ["MKL_NUM_THREADS"] = str(chosen)
|
||||
|
||||
|
||||
class ParseKwargs(Action):
|
||||
@@ -55,7 +76,7 @@ class ParseKwargs(Action):
|
||||
def __call__(self, parser, namespace, values, option_string=None):
|
||||
setattr(namespace, self.dest, dict())
|
||||
for value in values:
|
||||
key, value = value.split('=')
|
||||
key, value = value.split("=")
|
||||
try:
|
||||
value = literal_eval(value)
|
||||
except:
|
||||
|
||||
@@ -0,0 +1,294 @@
|
||||
"""
|
||||
Summarizer Module
|
||||
-----------------
|
||||
|
||||
Provides a client to summarize long transcripts via an LLM endpoint.
|
||||
|
||||
Behavior:
|
||||
- Chunks transcript into 10,240-character segments.
|
||||
- Summarizes each chunk.
|
||||
- Summarizes the summaries into a final, detailed summary.
|
||||
|
||||
Environment Variables:
|
||||
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
|
||||
- SUMMARIZER_API_KEY: (optional) API key, if required
|
||||
- SUMMARIZER_MODEL: (optional) Model name (e.g., llama-3.1-8b-instruct)
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger("scraibe.summarizer")
|
||||
|
||||
|
||||
class SummarizerError(Exception):
    """Signals that a call to the summarization API did not succeed."""
|
||||
|
||||
|
||||
class SummarizerClient:
    """
    HTTP client for an OpenAI-compatible chat completions endpoint.
    Used to summarize long transcripts in chunks.
    """

    CHUNK_SIZE = 10_240  # characters per chunk

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 3600.0,
    ):
        """
        Args:
            api_url: Base URL of the LLM API; falls back to SUMMARIZER_API_URL.
            api_key: Optional bearer token; falls back to SUMMARIZER_API_KEY.
            model: Model name; falls back to SUMMARIZER_MODEL, then to
                "llama-3.1-8b-instruct".
            timeout: Timeout handed to the httpx client.

        Raises:
            SummarizerError: If no API URL is configured.
        """
        # `or ""` keeps a missing URL from failing with AttributeError on
        # None.strip(); the explicit check below reports it properly.
        raw_url = api_url or os.getenv("SUMMARIZER_API_URL") or ""
        self.api_url = raw_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("SUMMARIZER_API_KEY") or None
        self.model = model or os.getenv("SUMMARIZER_MODEL") or "llama-3.1-8b-instruct"
        self.timeout = timeout

        if not self.api_url:
            raise SummarizerError(
                "SUMMARIZER_API_URL is not set. "
                "Provide the summarization LLM URL via environment or constructor."
            )

        logger.info(
            "Initializing SummarizerClient: url=%s model=%s",
            self.api_url,
            self.model,
        )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # Best effort: __init__ may have raised before _client was assigned.
        try:
            client = getattr(self, "_client", None)
            if client is not None:
                client.close()
        except Exception:
            pass

    def summarize_transcript(self, transcript: str) -> str:
        """
        Summarize a (possibly very long) transcript.

        Strategy:
        - Split transcript into chunks of CHUNK_SIZE characters.
        - Generate a detailed summary for each chunk.
        - Combine all chunk summaries and generate a final, concise but thorough summary.

        The final summary should make it clear:
        - What was discussed
        - Main issues
        - Outcomes / decisions
        - Next steps / action items

        Returns:
            The final summary text, or a fixed notice when the transcript is
            empty or whitespace-only.

        Raises:
            SummarizerError: If a summarization request fails.
        """
        if not transcript.strip():
            logger.warning("Empty transcript provided to summarize_transcript.")
            return "No transcript provided to summarize."

        logger.info(
            "Starting summarization for transcript length=%d chars",
            len(transcript),
        )

        # 1) Chunk the transcript
        chunks = self._chunk_text(transcript)
        logger.info("Split transcript into %d chunks.", len(chunks))

        # 2) Summarize each chunk
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            logger.info(
                "Summarizing chunk %d/%d (length=%d)",
                i + 1,
                len(chunks),
                len(chunk),
            )
            summary = self._summarize_chunk(chunk, i, len(chunks))
            chunk_summaries.append(summary)

        # 3) Combine and summarize summaries
        combined = "\n\n".join(chunk_summaries)
        logger.info(
            "Combining %d chunk summaries (total length=%d) for final summary.",
            len(chunk_summaries),
            len(combined),
        )
        final_summary = self._summarize_combined(combined)

        logger.info("Summarization completed.")
        return final_summary

    def _chunk_text(self, text: str) -> list[str]:
        """
        Split text into chunks of at most CHUNK_SIZE characters.

        Each chunk ends at the last newline inside its window, else at the
        last space, else at the window edge. Chunks are stripped and
        whitespace-only chunks are dropped.
        """
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.CHUNK_SIZE
            if end >= len(text):
                tail = text[start:].strip()
                if tail:
                    chunks.append(tail)
                break
            # Try to break at a reasonable boundary (newline or space).
            # Search from start + 1: text[start] is usually the separator the
            # previous chunk ended on, and re-finding it would skip the space
            # fallback and force a hard cut in the middle of a word.
            break_pos = text.rfind("\n", start + 1, end)
            if break_pos == -1:
                break_pos = text.rfind(" ", start + 1, end)
            if break_pos == -1:
                break_pos = end
            piece = text[start:break_pos].strip()
            if piece:
                chunks.append(piece)
            start = break_pos
        return chunks

    def _load_summary_prompt(self, role: str) -> str:
        """
        Load summary prompt for the given role: 'chunk' or 'combined'.

        Priority:
        1) SUMMARY_PROMPT_{ROLE} (env)
        2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
        3) Built-in default prompt
        """
        role_upper = role.upper()

        # 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
        env_key = f"SUMMARY_PROMPT_{role_upper}"
        env_prompt = (os.getenv(env_key) or "").strip()
        if env_prompt:
            return env_prompt

        # 2) File-based prompt with sections
        prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
        if prompt_file and os.path.exists(prompt_file):
            try:
                with open(prompt_file, "r", encoding="utf-8") as f:
                    content = f.read()
                # Simple section parser: [chunk], [combined].
                # A section runs until the next line starting with "[" or
                # the end of the file.
                import re
                pattern = re.compile(
                    r"\[" + role + r"\]\s*\n(.*?)(?=\n\[|$)",
                    re.DOTALL,
                )
                m = pattern.search(content)
                if m:
                    text = m.group(1).strip()
                    if text:
                        return text
            except Exception as e:
                logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)

        # 3) Default prompts
        if role == "chunk":
            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive a segment of a longer transcript. "
                "Provide a detailed, structured summary of this segment, focusing on: "
                "- Topics discussed\n"
                "- Key points and arguments\n"
                "- Decisions and agreements\n"
                "- Action items and responsibilities\n"
                "- Any risks, conflicts, or open issues\n\n"
                "Be concise but complete. Use bullet points where helpful. "
                "Do not add information that is not present in the transcript."
            )
        else:
            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive several intermediate summaries of a longer conversation. "
                "Produce a single, comprehensive summary that makes it clear: "
                "- The overall purpose and context of the discussion\n"
                "- The main issues and topics addressed\n"
                "- Key arguments and positions (briefly)\n"
                "- Decisions and outcomes\n"
                "- Action items, responsibilities, and next steps\n"
                "- Any unresolved issues or risks\n\n"
                "The summary should be detailed enough that a reader who was not present "
                "can understand what happened and what is expected going forward. "
                "Use clear, concise language and bullet points where appropriate. "
                "Use markdown formatting (headings, lists, bold) to structure the summary."
            )

    def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
        """Summarize one transcript chunk; index is zero-based."""
        system_prompt = self._load_summary_prompt("chunk")

        user_prompt = (
            f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
            f"{chunk}"
        )

        return self._chat_completion(system_prompt, user_prompt)

    def _summarize_combined(self, combined_summaries: str) -> str:
        """Produce the final summary from the joined chunk summaries."""
        system_prompt = self._load_summary_prompt("combined")

        user_prompt = (
            "Here are the intermediate summaries from different parts of the same conversation:\n\n"
            f"{combined_summaries}"
        )

        return self._chat_completion(system_prompt, user_prompt)

    def _chat_completion(self, system_prompt: str, user_prompt: str) -> str:
        """
        Call OpenAI-compatible /v1/chat/completions endpoint.

        Returns:
            The stripped content of the first choice's message.

        Raises:
            SummarizerError: On an HTTP status >= 400, a non-JSON body, or a
                body that lacks textual ``choices[0].message.content``.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.3,
        }

        headers = {
            "Content-Type": "application/json",
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        logger.info("Calling summarizer endpoint: /v1/chat/completions")

        resp = self._client.post(
            "/v1/chat/completions",
            json=payload,
            headers=headers,
        )

        logger.info("Summarizer response status: %d", resp.status_code)

        if resp.status_code >= 400:
            logger.error("Summarizer error response: %s", resp.text)
            raise SummarizerError(
                f"Summarizer API error {resp.status_code}: {resp.text}"
            )

        try:
            data = resp.json()
        except json.JSONDecodeError:
            logger.error("Failed to parse summarizer response as JSON.")
            raise SummarizerError(
                "Failed to parse summarizer response as JSON."
            )

        # Extract assistant message
        try:
            content = data["choices"][0]["message"]["content"]
            return content.strip()
        except (KeyError, IndexError, TypeError, AttributeError):
            # AttributeError: content is present but null / not a string.
            logger.error(
                "Unexpected summarizer response format: %s",
                json.dumps(data, indent=2),
            )
            raise SummarizerError(
                "Unexpected summarizer response format: "
                f"{json.dumps(data, indent=2)}"
            )
|
||||
@@ -0,0 +1,713 @@
|
||||
"""
|
||||
Celery tasks for async transcription, diarization, and email notifications.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
|
||||
from .celery_app import celery_app
|
||||
from .autotranscript import Scraibe
|
||||
from .summarizer import SummarizerClient, SummarizerError
|
||||
from .misc import setup_logging
|
||||
from .email_sender import send_email, EmailError, load_template
|
||||
from .email_sender import create_transcript_docx, create_summary_docx
|
||||
|
||||
logger = logging.getLogger("scraibe.tasks")
|
||||
|
||||
|
||||
def _local_part(email: str) -> str:
|
||||
"""
|
||||
Extract the part before '@' from an email, sanitized for filenames.
|
||||
"""
|
||||
local = (email or "").split("@")[0].strip()
|
||||
local = "".join(ch if ch.isalnum() or ch in ("-", "_", ".") else "_" for ch in local)
|
||||
return local or "user"
|
||||
|
||||
|
||||
def _date_tag() -> str:
|
||||
"""
|
||||
Date tag in DD-MON-YYYY format (e.g. 01-JAN-2025).
|
||||
"""
|
||||
return datetime.utcnow().strftime("%d-%b-%Y").upper()
|
||||
|
||||
|
||||
def _safe_filename(base: str, local: str, date_tag: str, ext: str) -> str:
|
||||
"""
|
||||
Create a temp file with the requested logical name.
|
||||
Uses mktemp for uniqueness but keeps the desired name pattern.
|
||||
"""
|
||||
name = f"{base}-{local}-{date_tag}{ext}"
|
||||
return tempfile.mktemp(prefix=name.replace(".", ""), suffix=ext)
|
||||
|
||||
|
||||
def _remove_file(path: str):
|
||||
"""
|
||||
Remove a file if it exists. Best-effort; logs but never raises.
|
||||
"""
|
||||
if not path:
|
||||
return
|
||||
try:
|
||||
if os.path.exists(path):
|
||||
os.remove(path)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to remove file %s: %s", path, e)
|
||||
|
||||
|
||||
def _get_subject(env_var: str, default: str) -> str:
|
||||
"""
|
||||
Safely read an email subject from an environment variable.
|
||||
Uses default if unset or blank. Logs the final value.
|
||||
"""
|
||||
value = (os.getenv(env_var) or "").strip()
|
||||
subject = value or default
|
||||
logger.info("Email subject [%s] = %s", env_var, subject)
|
||||
return subject
|
||||
|
||||
|
||||
def get_queue_position(task_id: str) -> int:
    """
    Estimate the job's position in the queue.

    Walks the tasks that workers report as reserved (received but not yet
    running), in the order the inspection returns them, and counts how many
    come before ``task_id``.

    Returns:
        - A positive int if we can estimate (1 = first in line).
        - 0 if we cannot reliably determine position.

    A task that is not among the reserved ones (already running, or not yet
    visible to the workers) is reported as position 1.
    """
    try:
        inspector = celery_app.control.inspect()
        # Mapping of worker name -> list of task dicts; None when no worker
        # replied.
        reserved = inspector.reserved() or {}

        ahead = 0
        # Iterate the per-worker task lists directly. Unpacking each list as
        # a (key, value) pair raised for any non-empty queue, which made this
        # function fall through to the "unknown" result.
        for tasks in reserved.values():
            for t in tasks or []:
                if t.get("id") == task_id:
                    return ahead + 1
                ahead += 1

        # If not found in reserved, it may already be active or not yet visible.
        # In that case, treat it as position 1.
        return 1
    except Exception:
        # If inspection fails, don't guess; caller should use a safe message.
        return 0
|
||||
|
||||
|
||||
def send_initial_email(to: str, queue_pos: int):
    """
    Send initial confirmation email with queue position.
    Subject is customizable via EMAIL_SUBJECT_UPLOAD.

    Args:
        to: Recipient address.
        queue_pos: Position in the queue; values <= 0 produce a generic
            "queued" wording instead of a number.

    Template and delivery failures (EmailError) are logged, not raised. If
    the HTML template cannot be rendered the mail is sent without HTML.
    """
    subject = _get_subject(
        "EMAIL_SUBJECT_UPLOAD",
        "ScrAIbe: Your transcription request has been received",
    )

    parts = [
        "Hello,\n\n",
        "We have received your audio file for transcription.\n",
    ]

    if queue_pos > 0:
        parts.append(f"Your request is currently number {queue_pos} in the queue.\n")
        # NOTE(review): _accent_color is not among the imports visible at the
        # top of this file -- presumably defined further down; confirm.
        queue_position_display = (
            f'<span style="color:{_accent_color()}; font-weight:bold;">{queue_pos}</span>'
        )
    else:
        parts.append("Your request has been queued for processing.\n")
        queue_position_display = "the queue"

    contact = os.getenv('EMAIL_CONTACT_ADDRESS', 'support@example.com')
    parts.extend(
        [
            "\n",
            "You will receive an email with your transcript (and summary, if requested) ",
            "once processing is complete.\n\n",
            "If you have any questions, contact us at ",
            f"{contact}.\n\n",
            "This is an automated message from ScrAIbe.\n",
        ]
    )
    body = "".join(parts)

    html = None
    try:
        html = load_template(
            "upload_notification_template.html",
            queue_position_text=queue_position_display,
        )
    except EmailError as e:
        logger.warning("Failed to render upload notification template: %s", e)

    try:
        send_email(to=to, subject=subject, body=body, html=html, attachments=[])
    except EmailError as e:
        logger.error("Failed to send initial email to %s: %s", to, e)
    else:
        logger.info("Initial confirmation email sent to %s", to)
|
||||
|
||||
|
||||
def send_success_email(
    to: str,
    transcript_text: str,
    summary_text: str,
    attachments: list,
    task_id: str,
):
    """
    Email the finished transcript (as attachments) to the requester.

    The subject comes from EMAIL_SUBJECT_SUCCESS (via ``_get_subject``) with a
    built-in fallback.

    Args:
        to: Recipient address.
        transcript_text: Accepted for interface compatibility; the transcript
            itself is not inlined in the message body.
        summary_text: When non-empty, inlined in the plain-text body under a
            "SUMMARY" heading.
        attachments: Passed through unchanged to ``send_email``.
        task_id: Job identifier printed in the body and in log lines.

    Email failures are logged, never raised.
    """
    subject = _get_subject(
        "EMAIL_SUBJECT_SUCCESS",
        "ScrAIbe: Your transcript is ready",
    )

    parts = [
        "Hello,\n\n",
        "Your transcription is ready.\n\n",
        "Please find the transcript and JSON files attached.\n",
    ]

    if summary_text:
        parts.extend(
            [
                "\n",
                "SUMMARY\n",
                "-------\n",
                f"{summary_text}\n",
            ]
        )

    job_line = "Job ID: " + str(task_id) + "\n\n"
    contact_address = os.getenv('EMAIL_CONTACT_ADDRESS', 'support@example.com')
    parts.extend(
        [
            "\n",
            job_line,
            "If you have any questions, contact us at ",
            f"{contact_address}.\n\n",
            "This is an automated message from ScrAIbe.\n",
        ]
    )
    body = "".join(parts)

    # The HTML part is optional; fall back to plain text if rendering fails.
    try:
        html = load_template("success_template.html")
    except EmailError as exc:
        html = None
        logger.warning("Failed to render success template: %s", exc)

    try:
        send_email(
            to=to,
            subject=subject,
            body=body,
            html=html,
            attachments=attachments,
        )
    except EmailError as exc:
        logger.error("Failed to send success email to %s for job %s: %s", to, task_id, exc)
    else:
        logger.info("Success email sent to %s for job %s with subject: %s", to, task_id, subject)
|
||||
|
||||
|
||||
def send_error_email(to: str, error_message: str, task_id: str):
    """
    Notify the requester that their transcription job failed.

    The subject comes from EMAIL_SUBJECT_ERROR (via ``_get_subject``) with a
    built-in fallback.

    Args:
        to: Recipient address.
        error_message: Failure details; shown in the plain-text body and
            passed to the HTML template as ``exception``.
        task_id: Job identifier printed in the body and in log lines.

    Email failures are logged, never raised.
    """
    subject = _get_subject(
        "EMAIL_SUBJECT_ERROR",
        "ScrAIbe: Error with your transcription request",
    )

    details_line = f"Details: {error_message}\n\n"
    job_line = "Job ID: " + str(task_id) + "\n\n"
    contact_address = os.getenv('EMAIL_CONTACT_ADDRESS', 'support@example.com')

    body = "".join(
        [
            "Hello,\n\n",
            "We encountered an error while processing your transcription request.\n\n",
            details_line,
            job_line,
            "Please contact your administrator if the problem persists.\n\n",
            "If you have any questions, contact us at ",
            f"{contact_address}.\n\n",
            "This is an automated message from ScrAIbe.\n",
        ]
    )

    # The HTML part is optional; fall back to plain text if rendering fails.
    try:
        html = load_template(
            "error_notification_template.html",
            exception=str(error_message),
        )
    except EmailError as exc:
        html = None
        logger.warning("Failed to render error template: %s", exc)

    try:
        send_email(to=to, subject=subject, body=body, html=html, attachments=[])
    except EmailError as exc:
        logger.error("Failed to send error email to %s for job %s: %s", to, task_id, exc)
    else:
        logger.info("Error email sent to %s for job %s", to, task_id)
|
||||
|
||||
|
||||
@celery_app.task(
    name="scraibe.tasks.process_transcription_task",
    bind=True,
    max_retries=1,
    # Per-task options are `time_limit` / `soft_time_limit`; the `task_`-prefixed
    # spellings are configuration-setting names and are ignored on the decorator.
    time_limit=14400,  # 4 hours
    soft_time_limit=13500,  # warn at 3h45m
)
def process_transcription_task(
    self,
    audio_path: str,
    task_type: str,
    language: str,
    num_speakers: int,
    email_to: str,
    email_cc: str,
    include_summary: bool,
    identify_speakers: bool = False,
):
    """
    Async task: transcribe audio, optionally summarize, then email results.

    Steps: send the confirmation email, run the transcription (with summary
    when ``task_type == "transcript_and_summarize"``), optionally relabel
    speakers, write the export files, email them, and finally remove the
    temporary files and the uploaded audio.

    Args:
        audio_path: Path of the uploaded audio file; removed when the task ends.
        task_type: ``"transcript_and_summarize"`` selects transcription plus
            summary; any other value selects transcription only.
        language: Language hint; an empty value is passed on as ``None``.
        num_speakers: Expected speaker count; a falsy value is passed on as ``None``.
        email_to: Recipient of the confirmation, result and error emails.
        email_cc: Accepted for interface compatibility; not used by this task.
        include_summary: Controls only whether the summary text is inlined in
            the success email body. Summary attachments are produced whenever
            a summary exists.
        identify_speakers: When true, ask the summarizer to map "SPEAKER N"
            labels to names/roles and apply that mapping to the transcript
            and the segments. Failures here are logged and ignored.

    Raises:
        Exception: Any processing error is re-raised after an error email has
            been sent to ``email_to``.
    """
    import re  # function-scope import: the module's import block is not visible here

    task_id = self.request.id

    log_level = os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)

    temp_files = []
    local = _local_part(email_to)
    date_tag = _date_tag()

    # Set once a specific error email has gone out, so the generic handler
    # below does not send the user a second email for the same failure.
    error_email_sent = False

    try:
        # 1) Queue position and initial email
        queue_pos = get_queue_position(task_id)
        send_initial_email(to=email_to, queue_pos=queue_pos)

        # 2) Initialize Scraibe
        try:
            scraibe = Scraibe(verbose=True)
        except Exception as e:
            send_error_email(
                to=email_to,
                error_message=f"Failed to initialize transcription service: {e}",
                task_id=task_id,
            )
            error_email_sent = True
            raise

        # 3) Transcription
        transcribe_kwargs = dict(
            audio_file=audio_path,
            language=language or None,
            num_speakers=int(num_speakers) if num_speakers else None,
            verbose=True,
            for_export=True,
        )
        if task_type == "transcript_and_summarize":
            result = scraibe.transcript_and_summarize(**transcribe_kwargs)
            summary_text = result.get("summary", "")
        else:
            result = scraibe.transcribe(**transcribe_kwargs)
            summary_text = ""

        transcript_text = result.get("transcript", "")
        segments = result.get("segments", [])
        raw_result = result.get("raw_result")

        # 3b) Optional speaker identification
        if identify_speakers:
            try:
                # Use the same summarizer client as transcript_and_summarize
                scraibe._ensure_summarizer()
                summarizer = scraibe._summarizer

                prompt = (
                    "Below is a transcript with speaker labels like 'SPEAKER 1', 'SPEAKER 2', etc. "
                    "Based on the context and how each speaker talks, identify each speaker as:\n"
                    "- Their real name, if it is clearly mentioned or strongly implied, OR\n"
                    "- A concise role/position (e.g., Judge, Doctor, Manager, Interviewer, Client, Witness), "
                    "if their identity is not clear.\n"
                    "Do not invent random personal names. "
                    "Do not add extra commentary. Output ONLY a mapping in this exact format, one per line:\n"
                    "SPEAKER 1: Name or Role\n"
                    "SPEAKER 2: Name or Role\n"
                    "SPEAKER 3: Name or Role\n"
                    "\n"
                    "Transcript:\n"
                    + transcript_text
                )

                response = summarizer._chat_completion(
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.3,
                    max_tokens=300,
                )
                reply = (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "")

                # Parse the "SPEAKER N: Name" lines into {"SPEAKER N": "NAME"}.
                speaker_map = {}
                for m in re.finditer(
                    r"SPEAKER\s+(\d+)\s*:\s*(.+)",
                    reply,
                    re.IGNORECASE,
                ):
                    spk = f"SPEAKER {m.group(1).strip()}"
                    name = m.group(2).strip().rstrip(".").upper()
                    if name:
                        speaker_map[spk] = name

                logger.info("Speaker identification mapping: %s", speaker_map)

                if speaker_map:
                    # Matches "[00:12] SPEAKER 1:" / "[01:02:03] SPEAKER 1:".
                    # Digits must be allowed in the label, otherwise
                    # "SPEAKER 1" can never match and nothing is relabelled.
                    label_re = re.compile(
                        r"(\[\d+:\d+(?::\d+)?\]\s*)([A-Z0-9\s]+?):\s*"
                    )

                    def _relabel(m):
                        # Collapse whitespace so the label has the same
                        # "SPEAKER N" shape as the keys of speaker_map.
                        label = re.sub(r"\s+", " ", m.group(2)).strip()
                        return m.group(1) + speaker_map.get(label, m.group(2)) + ": "

                    new_transcript = "\n".join(
                        label_re.sub(_relabel, line)
                        for line in transcript_text.splitlines()
                    )

                    # Also update segments for JSON export (copies, so the
                    # original segment dicts are left untouched).
                    new_segments = []
                    for seg in segments:
                        sp = (seg.get("speaker") or "").strip()
                        sp_norm = re.sub(
                            r"\s+",
                            " ",
                            re.sub(r"[^A-Z0-9\s]", "", sp.upper()),
                        ).strip()
                        seg = dict(seg)
                        seg["speaker"] = speaker_map.get(sp_norm, sp)
                        new_segments.append(seg)

                    # Commit both only after both succeeded, so a failure
                    # cannot leave transcript and segments out of sync.
                    transcript_text = new_transcript
                    segments = new_segments

            except Exception as e:
                logger.warning(
                    "Speaker identification failed; falling back to Speaker IDs: %s", e
                )

        # 4) Prepare files

        # Transcript .md
        md_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".md")
        with open(md_transcript_path, "w", encoding="utf-8") as f:
            f.write("# Transcript\n\n")
            f.write(transcript_text)
        temp_files.append(md_transcript_path)

        # Transcript .docx (standalone, no cover page)
        docx_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".docx")
        create_transcript_docx(
            transcript_text,
            docx_transcript_path,
        )
        temp_files.append(docx_transcript_path)

        # JSON as SOURCE
        json_data = {
            "task": task_type,
            "transcript": transcript_text,
            "segments": segments,
            "metadata": {
                "timestamp": datetime.utcnow().isoformat(),
                "job_id": task_id,
            },
        }
        if summary_text:
            json_data["summary"] = summary_text
        if raw_result is not None:
            json_data["raw_result"] = raw_result

        json_path = _safe_filename("SOURCE", local, date_tag, ".json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        temp_files.append(json_path)

        # 5) Build attachments list
        # Always: transcript MD, transcript DOCX, JSON
        attachments = [
            md_transcript_path,
            docx_transcript_path,
            json_path,
        ]

        # Summary files (if present)
        if summary_text:
            # Summary .md
            md_summary_path = _safe_filename("SUMMARY", local, date_tag, ".md")
            with open(md_summary_path, "w", encoding="utf-8") as f:
                f.write("# Summary\n\n")
                f.write(summary_text)
            temp_files.append(md_summary_path)

            # Summary .docx (standalone, no cover page)
            docx_summary_path = _safe_filename("SUMMARY", local, date_tag, ".docx")
            create_summary_docx(
                summary_text,
                docx_summary_path,
            )
            temp_files.append(docx_summary_path)

            attachments += [md_summary_path, docx_summary_path]

        # 6) Send success email
        send_success_email(
            to=email_to,
            transcript_text=transcript_text,
            summary_text=summary_text if include_summary else "",
            attachments=attachments,
            task_id=task_id,
        )

        logger.info("Job %s completed successfully.", task_id)

    except Exception as e:
        logger.error("Error processing job %s: %s", task_id, e, exc_info=True)
        if not error_email_sent:
            send_error_email(
                to=email_to,
                error_message=str(e),
                task_id=task_id,
            )
        raise
    finally:
        # 7) Cleanup
        for path in temp_files:
            _remove_file(path)
        if audio_path:
            _remove_file(audio_path)
        logger.info("Cleanup completed for job %s.", task_id)
|
||||
|
||||
|
||||
@celery_app.task(
    name="scraibe.tasks.process_mcp_transcribe_task",
    bind=True,
    max_retries=1,
    # Per-task options are `time_limit` / `soft_time_limit`; the `task_`-prefixed
    # spellings are configuration-setting names and are ignored on the decorator.
    time_limit=14400,  # 4 hours
    soft_time_limit=13500,  # 3h45m
)
def process_mcp_transcribe_task(
    self,
    audio_path: str,
    job_id: str,
    language: str,
    num_speakers: int,
):
    """
    Async task used by the MCP-style API.

    - Transcribes the audio file
    - Stores transcript + segments (or the error) in the ``_mcp_jobs`` store
    - Removes the audio file afterwards

    Args:
        audio_path: Path of the audio file; removed when the task ends.
        job_id: Key of this job's entry in ``_mcp_jobs``.
        language: Language hint; an empty value is passed on as ``None``.
        num_speakers: Expected speaker count; a falsy value is passed on as ``None``.

    Errors are recorded on the job entry (status ``"error"``) and logged;
    they are not re-raised.
    """
    # NOTE(review): _mcp_jobs is imported from the MCP server module; whether it
    # is actually shared between the API process and the worker - confirm.
    from .mcp_server import _mcp_jobs

    log_level = os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)

    # Initialize status (an entry that already exists is left as it is)
    _mcp_jobs.setdefault(
        job_id,
        {
            "status": "processing",
            "message": "Transcription started (async).",
            "file_path": audio_path,
        },
    )

    try:
        scraibe = Scraibe(verbose=True)
        result = scraibe.transcribe(
            audio_file=audio_path,
            language=language or None,
            num_speakers=int(num_speakers) if num_speakers else None,
            verbose=True,
            for_export=True,
        )

        transcript_text = result.get("transcript", "")
        segments = result.get("segments", [])

        _mcp_jobs[job_id]["status"] = "completed"
        _mcp_jobs[job_id]["transcript"] = transcript_text
        _mcp_jobs[job_id]["segments"] = segments
        _mcp_jobs[job_id]["message"] = "Transcription completed."

        logger.info("MCP job %s completed.", job_id)

    except Exception as e:
        logger.error("MCP job %s failed: %s", job_id, e, exc_info=True)
        _mcp_jobs[job_id]["status"] = "error"
        _mcp_jobs[job_id]["message"] = f"Transcription error: {e}"

    finally:
        _remove_file(audio_path)
        logger.info("MCP job %s cleanup completed.", job_id)
|
||||
|
||||
|
||||
@celery_app.task(
    name="scraibe.tasks.process_watch_file_task",
    bind=True,
    max_retries=1,
    # Per-task options are `time_limit` / `soft_time_limit`; the `task_`-prefixed
    # spellings are configuration-setting names and are ignored on the decorator.
    time_limit=14400,  # 4 hours
    soft_time_limit=13500,  # 3h45m
)
def process_watch_file_task(
    self,
    file_path: str,
):
    """
    Async task for watch-folder mode.

    - Transcribes + summarizes ``file_path``
    - Emails transcript, summary and JSON to WATCH_EMAIL_TO (falling back to
      EMAIL_DEFAULT_TO)
    - Deletes the source file afterwards unless WATCH_DELETE_ON_SUCCESS is
      set to something other than "true"/"1"/"yes"

    Args:
        file_path: Path of the audio file picked up from the watch folder.

    Raises:
        RuntimeError: If no destination email address is configured.
        Exception: Any processing error is re-raised after an error email
            has been sent.
    """
    task_id = self.request.id

    log_level = os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)

    email_to = os.getenv("WATCH_EMAIL_TO") or os.getenv("EMAIL_DEFAULT_TO")
    if not email_to:
        logger.error("No email address configured for watch-folder mode.")
        raise RuntimeError("WATCH_EMAIL_TO or EMAIL_DEFAULT_TO not set.")

    delete_on_success = os.getenv("WATCH_DELETE_ON_SUCCESS", "true").strip().lower() in ("true", "1", "yes")

    temp_files = []
    local = "watch"
    date_tag = _date_tag()

    try:
        scraibe = Scraibe(verbose=True)

        result = scraibe.transcript_and_summarize(
            audio_file=file_path,
            language=None,
            num_speakers=None,
            verbose=True,
            for_export=True,
        )

        transcript_text = result.get("transcript", "")
        summary_text = result.get("summary", "")
        segments = result.get("segments", [])
        raw_result = result.get("raw_result")

        # Transcript .md
        md_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".md")
        with open(md_transcript_path, "w", encoding="utf-8") as f:
            f.write("# Transcript\n\n")
            f.write(transcript_text)
        temp_files.append(md_transcript_path)

        # Transcript .docx
        docx_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".docx")
        create_transcript_docx(
            transcript_text,
            docx_transcript_path,
        )
        temp_files.append(docx_transcript_path)

        # Summary .md
        md_summary_path = _safe_filename("SUMMARY", local, date_tag, ".md")
        with open(md_summary_path, "w", encoding="utf-8") as f:
            f.write("# Summary\n\n")
            f.write(summary_text)
        temp_files.append(md_summary_path)

        # Summary .docx
        docx_summary_path = _safe_filename("SUMMARY", local, date_tag, ".docx")
        create_summary_docx(
            summary_text,
            docx_summary_path,
        )
        temp_files.append(docx_summary_path)

        # JSON as SOURCE
        json_data = {
            "task": "watch_transcript_and_summarize",
            "transcript": transcript_text,
            "summary": summary_text,
            "segments": segments,
            "metadata": {
                "timestamp": datetime.utcnow().isoformat(),
                "job_id": task_id,
                "source_file": file_path,
            },
        }
        if raw_result is not None:
            json_data["raw_result"] = raw_result

        json_path = _safe_filename("SOURCE", local, date_tag, ".json")
        with open(json_path, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        temp_files.append(json_path)

        # Attachments
        attachments = [
            md_transcript_path,
            docx_transcript_path,
            md_summary_path,
            docx_summary_path,
            json_path,
        ]

        # Send email
        send_success_email(
            to=email_to,
            transcript_text=transcript_text,
            summary_text=summary_text,
            attachments=attachments,
            task_id=task_id,
        )

        logger.info("Watch-folder job %s completed for %s.", task_id, file_path)

        # Delete source file if configured; a failed delete is only a warning
        # because the results have already been produced.
        if delete_on_success and os.path.exists(file_path):
            try:
                os.remove(file_path)
                logger.info("Deleted source file: %s", file_path)
            except Exception as e:
                logger.warning("Failed to delete source file %s: %s", file_path, e)

    except Exception as e:
        logger.error("Error processing watch file %s: %s", file_path, e, exc_info=True)
        send_error_email(
            to=email_to,
            error_message=str(e),
            task_id=task_id,
        )
        raise
    finally:
        # Cleanup temp files
        for path in temp_files:
            _remove_file(path)
        logger.info("Watch-folder job %s cleanup completed.", task_id)
|
||||
+59
-28
@@ -26,17 +26,17 @@ Usage:
|
||||
|
||||
from whisper import Whisper
|
||||
from whisper import load_model as whisper_load_model
|
||||
from whisperx.asr import WhisperModel
|
||||
from whisperx import load_model as whisperx_load_model
|
||||
from whisper.tokenizer import TO_LANGUAGE_CODE
|
||||
from faster_whisper import WhisperModel as FasterWhisperModel
|
||||
from faster_whisper.tokenizer import _LANGUAGE_CODES as FASTER_WHISPER_LANGUAGE_CODES
|
||||
from typing import TypeVar, Union, Optional
|
||||
from torch import Tensor, device
|
||||
from torch.cuda import is_available as cuda_is_available
|
||||
from numpy import ndarray
|
||||
from inspect import signature
|
||||
from abc import abstractmethod
|
||||
import warnings
|
||||
|
||||
from .misc import WHISPER_DEFAULT_PATH
|
||||
from .misc import WHISPER_DEFAULT_PATH, SCRAIBE_TORCH_DEVICE, SCRAIBE_NUM_THREADS
|
||||
whisper = TypeVar('whisper')
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ class Transcriber:
|
||||
model: str = "medium",
|
||||
whisper_type: str = 'whisper',
|
||||
download_root: str = WHISPER_DEFAULT_PATH,
|
||||
device: Optional[Union[str, device]] = None,
|
||||
device: Optional[Union[str, device]] = SCRAIBE_TORCH_DEVICE,
|
||||
in_memory: bool = False,
|
||||
*args, **kwargs
|
||||
) -> None:
|
||||
@@ -145,7 +145,7 @@ class Transcriber:
|
||||
- 'large-v3'
|
||||
- 'large'
|
||||
whisper_type (str):
|
||||
Type of whisper model to load. "whisper" or "whisperx".
|
||||
Type of whisper model to load. "whisper" or "faster-whisper".
|
||||
download_root (str, optional): Path to download the model.
|
||||
Defaults to WHISPER_DEFAULT_PATH.
|
||||
device (Optional[Union[str, torch.device]], optional):
|
||||
@@ -205,7 +205,7 @@ class WhisperTranscriber(Transcriber):
|
||||
def load_model(cls,
|
||||
model: str = "medium",
|
||||
download_root: str = WHISPER_DEFAULT_PATH,
|
||||
device: Optional[Union[str, device]] = None,
|
||||
device: Optional[Union[str, device]] = SCRAIBE_TORCH_DEVICE,
|
||||
in_memory: bool = False,
|
||||
*args, **kwargs
|
||||
) -> 'WhisperTranscriber':
|
||||
@@ -272,7 +272,7 @@ class WhisperTranscriber(Transcriber):
|
||||
return f"WhisperTranscriber(model_name={self.model_name}, model={self.model})"
|
||||
|
||||
|
||||
class WhisperXTranscriber(Transcriber):
|
||||
class FasterWhisperTranscriber(Transcriber):
|
||||
def __init__(self, model: whisper, model_name: str) -> None:
|
||||
super().__init__(model, model_name)
|
||||
|
||||
@@ -294,19 +294,19 @@ class WhisperXTranscriber(Transcriber):
|
||||
|
||||
if isinstance(audio, Tensor):
|
||||
audio = audio.cpu().numpy()
|
||||
result = self.model.transcribe(audio, *args, **kwargs)
|
||||
result, _ = self.model.transcribe(audio, *args, **kwargs)
|
||||
text = ""
|
||||
for seg in result['segments']:
|
||||
text += seg['text']
|
||||
for seg in result:
|
||||
text += seg.text
|
||||
return text
|
||||
|
||||
@classmethod
|
||||
def load_model(cls,
|
||||
model: str = "medium",
|
||||
download_root: str = WHISPER_DEFAULT_PATH,
|
||||
device: Optional[Union[str, device]] = None,
|
||||
device: Optional[Union[str, device]] = SCRAIBE_TORCH_DEVICE,
|
||||
*args, **kwargs
|
||||
) -> 'WhisperXTranscriber':
|
||||
) -> 'FasterWhisperModel':
|
||||
"""
|
||||
Load whisper model.
|
||||
|
||||
@@ -329,7 +329,7 @@ class WhisperXTranscriber(Transcriber):
|
||||
Defaults to WHISPER_DEFAULT_PATH.
|
||||
|
||||
device (Optional[Union[str, torch.device]], optional):
|
||||
Device to load model on. Defaults to None.
|
||||
Device to load model on. Defaults to SCRAIBE_TORCH_DEVICE.
|
||||
in_memory (bool, optional): Whether to load model in memory.
|
||||
Defaults to False.
|
||||
args: Additional arguments only to avoid errors.
|
||||
@@ -338,17 +338,18 @@ class WhisperXTranscriber(Transcriber):
|
||||
Returns:
|
||||
Transcriber: A Transcriber object initialized with the specified model.
|
||||
"""
|
||||
if device is None:
|
||||
device = "cuda" if cuda_is_available() else "cpu"
|
||||
|
||||
if not isinstance(device, str):
|
||||
device = str(device)
|
||||
|
||||
compute_type = kwargs.get('compute_type', 'float16')
|
||||
if device == 'cpu' and compute_type == 'float16':
|
||||
warnings.warn(f'Compute type {compute_type} not compatible with '
|
||||
f'device {device}! Changing compute type to int8.')
|
||||
compute_type = 'int8'
|
||||
_model = whisperx_load_model(model, download_root=download_root,
|
||||
device=device, compute_type=compute_type)
|
||||
_model = FasterWhisperModel(model, download_root=download_root,
|
||||
device=device, compute_type=compute_type,
|
||||
cpu_threads=SCRAIBE_NUM_THREADS)
|
||||
|
||||
return cls(_model, model_name=model)
|
||||
|
||||
@@ -361,7 +362,7 @@ class WhisperXTranscriber(Transcriber):
|
||||
dict: Keyword arguments for whisper model.
|
||||
"""
|
||||
# _possible_kwargs = WhisperModel.transcribe.__code__.co_varnames
|
||||
_possible_kwargs = signature(WhisperModel.transcribe).parameters.keys()
|
||||
_possible_kwargs = signature(FasterWhisperModel.transcribe).parameters.keys()
|
||||
|
||||
whisper_kwargs = {k: v for k,
|
||||
v in kwargs.items() if k in _possible_kwargs}
|
||||
@@ -370,21 +371,51 @@ class WhisperXTranscriber(Transcriber):
|
||||
whisper_kwargs["task"] = task
|
||||
|
||||
if (language := kwargs.get("language")):
|
||||
language = FasterWhisperTranscriber.convert_to_language_code(language)
|
||||
whisper_kwargs["language"] = language
|
||||
|
||||
return whisper_kwargs
|
||||
|
||||
@staticmethod
def convert_to_language_code(lang : str) -> str:
    """
    Convert a language code or language name to a language code.

    Matching ignores case and surrounding whitespace, so "en", "EN" and
    " English " are all accepted.

    Args:
        lang (str): language as code or language name

    Returns:
        language (str) code of language

    Raises:
        ValueError: If ``lang`` is neither a known language code nor a
            known language name.
    """
    # Normalize before any lookup: the code list and the name mapping are
    # looked up with lowercase keys, so "EN" must not be rejected.
    lang = lang.strip().lower()

    # Already a language code: return it directly.
    if lang in FASTER_WHISPER_LANGUAGE_CODES:
        return lang

    # A language name: translate it via the TO_LANGUAGE_CODE mapping.
    if lang in TO_LANGUAGE_CODE:
        return TO_LANGUAGE_CODE[lang]

    # If the language is not recognized, raise a ValueError with the available options
    available_codes = ', '.join(FASTER_WHISPER_LANGUAGE_CODES)
    raise ValueError(f"Language '{lang}' is not a valid language code or name. "
                     f"Available language codes are: {available_codes}.")
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"WhisperXTranscriber(model_name={self.model_name}, model={self.model})"
|
||||
return f"FasterWhisperTranscriber(model_name={self.model_name}, model={self.model})"
|
||||
|
||||
|
||||
|
||||
def load_transcriber(model: str = "medium",
|
||||
whisper_type: str = 'whisper',
|
||||
download_root: str = WHISPER_DEFAULT_PATH,
|
||||
device: Optional[Union[str, device]] = None,
|
||||
device: Optional[Union[str, device]] = SCRAIBE_TORCH_DEVICE,
|
||||
in_memory: bool = False,
|
||||
*args, **kwargs
|
||||
) -> Union[WhisperTranscriber, WhisperXTranscriber]:
|
||||
) -> Union[WhisperTranscriber, FasterWhisperTranscriber]:
|
||||
"""
|
||||
Load whisper model.
|
||||
|
||||
@@ -403,28 +434,28 @@ def load_transcriber(model: str = "medium",
|
||||
- 'large-v3'
|
||||
- 'large'
|
||||
whisper_type (str):
|
||||
Type of whisper model to load. "whisper" or "whisperx".
|
||||
Type of whisper model to load. "whisper" or "faster-whisper".
|
||||
download_root (str, optional): Path to download the model.
|
||||
Defaults to WHISPER_DEFAULT_PATH.
|
||||
device (Optional[Union[str, torch.device]], optional):
|
||||
Device to load model on. Defaults to None.
|
||||
Device to load model on. Defaults to SCRAIBE_TORCH_DEVICE.
|
||||
in_memory (bool, optional): Whether to load model in memory.
|
||||
Defaults to False.
|
||||
args: Additional arguments only to avoid errors.
|
||||
kwargs: Additional keyword arguments only to avoid errors.
|
||||
|
||||
Returns:
|
||||
Union[WhisperTranscriber, WhisperXTranscriber]:
|
||||
Union[WhisperTranscriber, FasterWhisperTranscriber]:
|
||||
One of the Whisper variants as Transcriber object initialized with the specified model.
|
||||
"""
|
||||
if whisper_type.lower() == 'whisper':
|
||||
_model = WhisperTranscriber.load_model(
|
||||
model, download_root, device, in_memory, *args, **kwargs)
|
||||
return _model
|
||||
elif whisper_type.lower() == 'whisperx':
|
||||
_model = WhisperXTranscriber.load_model(
|
||||
elif whisper_type.lower() == 'faster-whisper':
|
||||
_model = FasterWhisperTranscriber.load_model(
|
||||
model, download_root, device, *args, **kwargs)
|
||||
return _model
|
||||
else:
|
||||
raise ValueError(f'Model type not recognized, exptected "whisper" '
|
||||
f'or "whisperx", got {whisper_type}.')
|
||||
f'or "faster-whisper", got {whisper_type}.')
|
||||
|
||||
@@ -0,0 +1,100 @@
|
||||
"""
|
||||
Watch-folder mode for ScrAIbe.
|
||||
|
||||
Monitors a folder for audio files. For each file:
|
||||
- Transcribes + summarizes
|
||||
- Emails results
|
||||
- Deletes source file
|
||||
|
||||
Configuration (env):
|
||||
- WATCH_ENABLED: "true"/"false" (default: false)
|
||||
- WATCH_DIR: directory to watch (required if enabled)
|
||||
- WATCH_EMAIL_TO: destination email (required if enabled)
|
||||
- WATCH_POLL_INTERVAL: seconds between scans (default: 10)
|
||||
- WATCH_DELETE_ON_SUCCESS: "true"/"false" (default: true)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import logging
|
||||
import threading
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger("scraibe.watcher")
|
||||
|
||||
AUDIO_EXTENSIONS = {
|
||||
".wav",
|
||||
".mp3",
|
||||
".flac",
|
||||
".m4a",
|
||||
".ogg",
|
||||
".webm",
|
||||
".mp4",
|
||||
}
|
||||
|
||||
|
||||
def _is_audio(path: Path) -> bool:
|
||||
return path.is_file() and path.suffix.lower() in AUDIO_EXTENSIONS
|
||||
|
||||
|
||||
def _enqueue_file(file_path: Path) -> bool:
    """
    Enqueue a file for transcription + summarization via Celery.

    Returns:
        True if the task was handed to Celery, False if enqueueing failed
        (the failure is logged, not raised).
    """
    from .tasks import process_watch_file_task

    try:
        process_watch_file_task.delay(str(file_path))
    except Exception as e:
        logger.error("Failed to enqueue watch file %s: %s", file_path, e)
        return False
    return True


# Files already handed to Celery: path string -> (mtime_ns, size) at enqueue
# time. Without this, every poll would enqueue the same file again for as
# long as it stays in the watch folder (i.e. for the whole duration of the
# transcription, or forever when the job fails or deletion is disabled).
_ENQUEUED_FILES = {}


def _scan_directory(watch_dir: Path):
    """
    Scan directory and enqueue each audio file once.

    A file is enqueued again only if its modification time or size changed
    since it was last enqueued, or after it disappeared and came back.
    Files whose enqueueing failed are retried on the next scan.
    """
    if not watch_dir.is_dir():
        logger.warning("WATCH_DIR does not exist or is not a directory: %s", watch_dir)
        return

    present = set()
    for p in watch_dir.iterdir():
        if not _is_audio(p):
            continue

        key = str(p)
        present.add(key)

        try:
            st = p.stat()
        except OSError:
            # File vanished between listing and stat; nothing to enqueue.
            continue
        signature = (st.st_mtime_ns, st.st_size)

        if _ENQUEUED_FILES.get(key) == signature:
            continue  # already enqueued and unchanged since then

        logger.info("Found audio file in WATCH_DIR: %s", p)
        if _enqueue_file(p):
            _ENQUEUED_FILES[key] = signature

    # Forget files of this directory that are gone, so a new file that later
    # reuses the same name is picked up again.
    for key in list(_ENQUEUED_FILES):
        if key not in present and Path(key).parent == watch_dir:
            del _ENQUEUED_FILES[key]
|
||||
|
||||
|
||||
def start_watcher():
    """
    Start watch-folder loop in a background thread.

    Does nothing unless WATCH_ENABLED is "true"/"1"/"yes". When enabled,
    WATCH_DIR and WATCH_EMAIL_TO must be set; otherwise a warning is logged
    and the watcher stays off. WATCH_POLL_INTERVAL (seconds, default 10) must
    be a positive finite number; any other value falls back to the default.

    Returns:
        None. The polling thread is a daemon thread and is not returned.
    """
    default_interval = 10.0

    enabled = os.getenv("WATCH_ENABLED", "false").strip().lower() in ("true", "1", "yes")
    if not enabled:
        return

    watch_dir = os.getenv("WATCH_DIR")
    if not watch_dir:
        logger.warning("WATCH_ENABLED is true but WATCH_DIR is not set. Watcher disabled.")
        return

    email_to = os.getenv("WATCH_EMAIL_TO")
    if not email_to:
        logger.warning("WATCH_ENABLED is true but WATCH_EMAIL_TO is not set. Watcher disabled.")
        return

    # A malformed interval must not crash startup, zero would busy-loop, and a
    # negative value would make time.sleep() raise and kill the thread.
    raw_interval = os.getenv("WATCH_POLL_INTERVAL", "10")
    try:
        interval = float(raw_interval)
    except (TypeError, ValueError):
        interval = 0.0
    if not (0 < interval < float("inf")):
        logger.warning(
            "Invalid WATCH_POLL_INTERVAL %r; using %s seconds.",
            raw_interval,
            default_interval,
        )
        interval = default_interval

    try:
        watch_path = Path(watch_dir).expanduser().resolve()
        watch_path.mkdir(parents=True, exist_ok=True)
    except OSError as e:
        logger.warning("Cannot use WATCH_DIR %s: %s. Watcher disabled.", watch_dir, e)
        return

    logger.info("Starting watch-folder: dir=%s, email=%s, interval=%s", watch_dir, email_to, interval)

    def _loop():
        # Scan errors are logged and the loop keeps going.
        while True:
            try:
                _scan_directory(watch_path)
            except Exception as e:
                logger.error("Error scanning WATCH_DIR: %s", e)
            time.sleep(interval)

    t = threading.Thread(target=_loop, daemon=True)
    t.start()
|
||||
@@ -0,0 +1,338 @@
|
||||
"""
|
||||
ScrAIbe Web GUI (Gradio) - Async Mode
|
||||
-------------------------------------
|
||||
|
||||
Runs the Web GUI that:
|
||||
- Accepts audio uploads
|
||||
- Enqueues transcription jobs asynchronously via Celery
|
||||
- Backend worker:
|
||||
- Transcribes (with diarization)
|
||||
- Optionally summarizes
|
||||
- Emails the user:
|
||||
- Immediately: confirmation + queue position
|
||||
- On success: transcript + JSON (+ summary if requested)
|
||||
- On error: error details
|
||||
|
||||
This is the default entrypoint when running in Docker.
|
||||
"""
|
||||
|
||||
import os
|
||||
import logging
|
||||
import shutil
|
||||
from datetime import datetime
|
||||
|
||||
import gradio as gr
|
||||
|
||||
from .misc import setup_logging
|
||||
|
||||
logger = logging.getLogger("scraibe.webui")
|
||||
|
||||
|
||||
def load_config():
    """
    Load configuration from the YAML file named by SCRAIBE_CONFIG
    (default: /app/src/misc/config.yaml) if present.

    Primary runtime configuration is via environment variables.

    Returns:
        dict: The parsed mapping, or an empty dict when the file is missing,
        empty, unreadable, or does not contain a mapping at the top level.
    """
    config_path = os.getenv("SCRAIBE_CONFIG", "/app/src/misc/config.yaml")
    if not os.path.exists(config_path):
        return {}

    try:
        import yaml
        with open(config_path, "r", encoding="utf-8") as f:
            loaded = yaml.safe_load(f)
    except Exception as e:
        logger.warning("Failed to load config from %s: %s", config_path, e)
        return {}

    if not loaded:
        # Empty file (None) or an empty/falsy document.
        return {}

    # Callers use config.get(...); a list or scalar at the top level would
    # fail there with AttributeError, so reject it here instead.
    if not isinstance(loaded, dict):
        logger.warning(
            "Ignoring config from %s: expected a mapping at the top level, got %s",
            config_path,
            type(loaded).__name__,
        )
        return {}

    return loaded
|
||||
|
||||
|
||||
def load_html_template(path: str, **kwargs) -> str:
    """
    Load an HTML template and fill placeholders.

    Placeholders use ``str.format`` syntax (``{name}``) and are filled from
    ``kwargs``.

    Args:
        path: Path of the template file (read as UTF-8).
        **kwargs: Values for the placeholders.

    Returns:
        The rendered template; the raw template text if it cannot be
        formatted with ``kwargs``; or "" if the file does not exist.
    """
    if not os.path.exists(path):
        return ""
    with open(path, "r", encoding="utf-8") as f:
        template = f.read()
    try:
        return template.format(**kwargs)
    except (KeyError, IndexError, ValueError):
        # KeyError: a placeholder has no matching keyword.
        # IndexError / ValueError: the file contains literal braces such as
        # "{}" or a lone "}" (common in inline CSS/JS), which str.format
        # rejects. In all cases fall back to the unformatted template.
        return template
|
||||
|
||||
|
||||
def create_app():
    """
    Create and launch the Gradio Web GUI (async mode).

    Reads branding and layout settings from the optional YAML config and
    from environment variables, builds a single-column form (audio, task,
    speaker identification, e-mail addresses) and starts the Gradio server.
    Each submission is copied into the upload directory and handed to the
    ``process_transcription_task`` Celery task.

    Environment variables read here: ``LOG_LEVEL``, ``SCRAIBE_UPLOAD_DIR``,
    ``WEBUI_TITLE``, ``WEBUI_LOGO_URL``, ``EMAIL_ACCENT_COLOR``,
    ``SCRAIBE_VERSION`` (the earlier misspelling ``SCRABIE_VERSION`` is
    still honoured as a fallback) and ``GRADIO_SERVER_NAME``.
    """
    # Local import: only this function needs it, and the module-level
    # import block only brings in ``datetime``.
    from datetime import timezone

    # Logging
    log_level = os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)

    # Load config (branding, layout, etc.). ``or {}`` also covers keys that
    # are present but empty in the YAML (parsed as None).
    config = load_config()
    layout_cfg = config.get("layout") or {}
    launch_cfg = config.get("launch") or {}

    logger.info("Starting ScrAIbe Web GUI (async mode).")

    # Ensure upload directory exists
    upload_dir = os.getenv("SCRAIBE_UPLOAD_DIR", "/tmp/scraibe_uploads")
    os.makedirs(upload_dir, exist_ok=True)

    # Paths for assets
    header_path = layout_cfg.get("header", "/app/src/misc/header.html")
    footer_path = layout_cfg.get("footer", "/app/src/misc/footer.html")

    # Configurable title, logo URL, and accent color via environment
    webui_title = os.getenv("WEBUI_TITLE", "A.P.Strom Transcription")
    logo_url = os.getenv("WEBUI_LOGO_URL", "https://apstrom.ca")
    accent_color = os.getenv("EMAIL_ACCENT_COLOR", "#7C6DA0")

    # Prepare header HTML with logo URL and accent color
    header_html = ""
    if os.path.exists(header_path):
        header_html = load_html_template(
            header_path,
            webui_title=webui_title,
            header_logo_url=logo_url,
            header_logo_src=logo_url,
            accent_color=accent_color,
        )

    # Prepare footer HTML with accent color
    footer_html = ""
    if os.path.exists(footer_path):
        # Prefer the correctly spelled variable; keep the historical
        # misspelling as a fallback so existing deployments keep working.
        version = os.getenv("SCRAIBE_VERSION") or os.getenv(
            "SCRABIE_VERSION", "0.1.1.dev"
        )
        footer_html = load_html_template(
            footer_path,
            footer_scraibe_webui_version=version,
            accent_color=accent_color,
        )

    def _discard_upload(path):
        """Best-effort removal of a stored upload that will never be processed."""
        try:
            os.remove(path)
        except OSError as cleanup_err:
            logger.warning(
                "Could not remove orphaned upload %s: %s", path, cleanup_err
            )

    # Build Gradio interface
    with gr.Blocks(
        title=webui_title,
        css="""
        /* Responsive layout: stack columns on smaller screens */
        @media (max-width: 850px) {
            .gradio-container {
                max-width: 100% !important;
            }
            #main-row .gr-row {
                flex-direction: column !important;
            }
            #main-row .gr-col {
                width: 100% !important;
                max-width: 100% !important;
                flex: none !important;
            }
        }
        """,
    ) as app:

        # Header
        if header_html:
            gr.HTML(header_html)

        # Single-column layout: inputs followed by status/output
        with gr.Column():
            audio_input = gr.Audio(
                label="Upload or record audio",
                type="filepath",
            )

            task_choice = gr.Radio(
                choices=[
                    ("Transcribe", "transcribe"),
                    ("Transcribe & summarize", "transcript_and_summarize"),
                ],
                value="transcribe",
                label="Task",
                container=True,
            )

            identify_speakers = gr.Checkbox(
                label="Identify speakers (best effort using AI)",
                value=True,
                info="If enabled, AI will attempt to infer real names for speakers and replace Speaker 1/2/etc. in the transcript.",
            )

            email_to = gr.Textbox(
                label="Your email address (required)",
                placeholder="e.g. your.name@example.com",
            )

            email_cc = gr.Textbox(
                label="CC (optional, comma-separated)",
                placeholder="e.g. manager@example.com",
            )

            submit_btn = gr.Button("Submit for transcription", variant="primary")

            status_text = gr.Textbox(
                label="Status",
                lines=6,
                interactive=False,
            )

        # Footer
        if footer_html:
            gr.HTML(footer_html)

        # Events

        def on_task_change(value):
            # No special UI changes needed; both modes handled in backend
            return

        task_choice.change(
            fn=on_task_change,
            inputs=[task_choice],
            outputs=[],
        )

        def on_submit(
            audio,
            task,
            email_to_val,
            email_cc_val,
            identify_speakers_val,
        ):
            """Validate the form, store the upload and enqueue the job.

            Returns the text shown in the status box.
            """
            if not audio:
                return "Please upload or record audio."

            email_to_val = (email_to_val or "").strip()
            if not email_to_val:
                return "Please enter your email address."

            # Whitespace-only CC input must count as "no CC".
            email_cc_val = (email_cc_val or "").strip()

            # Copy uploaded file to a stable location
            try:
                ext = os.path.splitext(audio)[1] or ".wav"
                # Same timestamp text as the deprecated datetime.utcnow().
                ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S%f")
                new_name = f"upload_{ts}{ext}"
                dest_path = os.path.join(upload_dir, new_name)
                shutil.copy2(audio, dest_path)
            except Exception as e:
                logger.error("Error copying uploaded file: %s", e)
                return f"Error saving your file: {e}"

            # Import Celery task
            try:
                from .tasks import process_transcription_task
            except ImportError:
                # Nothing will ever pick this file up; do not leave it behind.
                _discard_upload(dest_path)
                return (
                    "Error: async processing is not available (Celery not configured)."
                )

            # Enqueue transcription job
            try:
                task_result = process_transcription_task.delay(
                    audio_path=dest_path,
                    task_type=task,
                    language=None,
                    num_speakers=None,
                    email_to=email_to_val,
                    email_cc=email_cc_val or None,
                    include_summary=(task == "transcript_and_summarize"),
                    identify_speakers=bool(identify_speakers_val),
                )
            except Exception as e:
                logger.error("Error enqueuing job: %s", e)
                # The user is told the submission failed and will retry with
                # a fresh copy, so drop the one stored for this attempt.
                _discard_upload(dest_path)
                return f"Error submitting your file: {e}"

            return (
                "Your audio file has been received and added to the queue.\n"
                "We have sent a confirmation email to you.\n"
                "You will receive another email with your transcript (and summary, if requested) "
                "once processing is complete.\n"
                f"Job ID: {task_result.id}"
            )

        submit_btn.click(
            fn=on_submit,
            inputs=[
                audio_input,
                task_choice,
                email_to,
                email_cc,
                identify_speakers,
            ],
            outputs=[status_text],
        )

    # Launch options with accent color applied via CSS
    server_name = launch_cfg.get("server_name") or os.getenv(
        "GRADIO_SERVER_NAME", "0.0.0.0"
    )
    server_port = launch_cfg.get("server_port") or 7860

    accent_css = f"""
    :root {{
        --primary-accent: {accent_color};
    }}
    button.primary,
    .primary,
    .gradio-button-primary,
    .gradio-container button.primary {{
        background-color: var(--primary-accent) !important;
        border-color: var(--primary-accent) !important;
    }}
    button.primary:hover,
    .primary:hover,
    .gradio-button-primary:hover {{
        background-color: var(--primary-accent) !important;
        opacity: 0.95;
    }}
    .radio-item.selected,
    .radio-item.selected label {{
        color: var(--primary-accent) !important;
    }}
    a,
    .gradio-container a {{
        color: var(--primary-accent) !important;
    }}
    body {{
        font-family: Arial, sans-serif;
    }}
    /* Increase main title font size */
    h1,
    .webui-title,
    .header-title {{
        font-size: 60px !important;
    }}
    /* Hide Gradio's "Use via API" link/button */
    #share-btn,
    a[href*="/api"],
    a[href*="#/api"],
    a[href*="#api"],
    .gradio-container a[href*="api"] {{
        display: none !important;
    }}
    /* Mobile-friendly adjustments */
    @media (max-width: 700px) {{
        .gradio-container {{
            padding: 0 4px !important;
        }}
        .gradio-container .gr-row {{
            flex-direction: column !important;
            gap: 8px !important;
        }}
        .gradio-container .gr-col {{
            width: 100% !important;
            max-width: 100% !important;
            flex: none !important;
        }}
        .gradio-container button.primary {{
            width: 100% !important;
            box-sizing: border-box;
        }}
    }}
    """

    # NOTE(review): CSS is passed both to gr.Blocks(css=...) and to
    # launch(css=...); which of the two is accepted depends on the installed
    # Gradio major version - confirm against the pinned release.
    app.launch(
        server_name=str(server_name),
        server_port=int(server_port),
        css=accent_css,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
create_app()
|
||||
@@ -0,0 +1,86 @@
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import pytest
|
||||
|
||||
from scraibe.audio import (
|
||||
get_audio_duration,
|
||||
split_audio_into_chunks,
|
||||
)
|
||||
|
||||
TEST_AUDIO_1 = "tests/audio_test_1.mp4"
|
||||
TEST_AUDIO_2 = "tests/audio_test_2.mp4"
|
||||
|
||||
|
||||
@pytest.fixture(params=[TEST_AUDIO_1, TEST_AUDIO_2])
def test_audio_path(request):
    """Parametrized fixture: hands each bundled test recording to the test in turn."""
    current_path = request.param
    return current_path
|
||||
|
||||
|
||||
def test_get_audio_duration(test_audio_path):
    """The duration reported for each bundled recording is a positive float."""
    duration = get_audio_duration(test_audio_path)

    assert isinstance(duration, float)
    assert duration > 0
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_no_split_short(test_audio_path):
    """A file shorter than ``max_duration`` comes back as one chunk: the file itself."""
    result = split_audio_into_chunks(
        input_path=test_audio_path,
        max_duration=600.0,  # larger than both test files
        overlap=2.0,
    )

    assert len(result) == 1

    only_chunk = result[0]
    assert only_chunk["path"] == test_audio_path
    assert only_chunk["start"] == 0.0

    # The single chunk must span the whole recording (50 ms tolerance).
    full_length = get_audio_duration(test_audio_path)
    assert abs(only_chunk["end"] - full_length) < 0.05
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_creates_chunks(tmp_path):
    """
    A small ``max_duration`` forces splitting into several overlapping chunks.

    Checks that every chunk file exists and is non-empty, and that
    consecutive chunks are ordered by start time and overlap.
    """
    # Use a small chunk duration to force splitting
    chunks = split_audio_into_chunks(
        input_path=TEST_AUDIO_1,
        max_duration=2.0,
        overlap=0.5,
    )

    try:
        assert len(chunks) > 1

        # Check that each chunk file exists and is non-empty
        for c in chunks:
            assert os.path.exists(c["path"])
            assert os.path.getsize(c["path"]) > 0

        # Check time ordering and overlap
        for prev, curr in zip(chunks, chunks[1:]):
            assert curr["start"] >= prev["start"]
            assert curr["start"] < prev["end"]  # overlap
    finally:
        # Cleanup runs even when an assertion above fails, and never removes
        # the bundled source recording (returned as-is when no split occurs).
        source = os.path.abspath(TEST_AUDIO_1)
        for c in chunks:
            chunk_file = c["path"]
            if os.path.abspath(chunk_file) != source and os.path.exists(chunk_file):
                os.remove(chunk_file)
|
||||
|
||||
|
||||
def test_split_audio_into_chunks_total_coverage(test_audio_path):
    """The chunks start at 0 and the last one reaches the end of the recording."""
    dur = get_audio_duration(test_audio_path)

    # Use small chunks to ensure coverage
    chunks = split_audio_into_chunks(
        input_path=test_audio_path,
        max_duration=2.0,
        overlap=0.5,
    )

    try:
        # First chunk starts at 0
        assert chunks[0]["start"] == 0.0

        # Last chunk end should cover the duration
        assert chunks[-1]["end"] >= dur - 0.05
    finally:
        # Cleanup runs even when an assertion fails. A recording shorter than
        # max_duration is returned as its own single chunk; that file is the
        # test fixture and must not be deleted.
        source = os.path.abspath(test_audio_path)
        for c in chunks:
            chunk_file = c["path"]
            if os.path.abspath(chunk_file) != source and os.path.exists(chunk_file):
                os.remove(chunk_file)
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
@pytest.fixture
|
||||
def create_scraibe_instance():
|
||||
if "HF_TOKEN" in os.environ:
|
||||
return Scraibe(use_auth_token=os.environ["HF_TOKEN"])
|
||||
return Scraibe(use_auth_token=os.environ["HF_TOKEN"], whisper_model= "tiny")
|
||||
else:
|
||||
return Scraibe()
|
||||
|
||||
@@ -19,19 +19,19 @@ def test_scraibe_init(create_scraibe_instance):
|
||||
|
||||
def test_scraibe_autotranscribe(create_scraibe_instance):
|
||||
model = create_scraibe_instance
|
||||
transcript = model.autotranscribe('test/audio_test_2.mp4')
|
||||
transcript = model.autotranscribe('tests/audio_test_2.mp4')
|
||||
assert isinstance(transcript, Transcript)
|
||||
|
||||
|
||||
def test_scraibe_diarization(create_scraibe_instance):
|
||||
model = create_scraibe_instance
|
||||
diarisation_result = model.diarization('test/audio_test_2.mp4')
|
||||
diarisation_result = model.diarization('tests/audio_test_2.mp4')
|
||||
assert isinstance(diarisation_result, dict)
|
||||
|
||||
|
||||
def test_scraibe_transcribe(create_scraibe_instance):
|
||||
model = create_scraibe_instance
|
||||
transcription_result = model.transcribe('test/audio_test_2.mp4')
|
||||
transcription_result = model.transcribe('tests/audio_test_2.mp4')
|
||||
assert isinstance(transcription_result, str)
|
||||
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
"""
|
||||
Local test for transcript/summary/combined .docx generation.
|
||||
Checks:
|
||||
- Line numbering only on transcript pages.
|
||||
- Page numbering (X of Y) in footer.
|
||||
- Cover pages present and centered.
|
||||
- Combined document structure.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from scraibe.email_sender import (
|
||||
create_transcript_docx,
|
||||
create_summary_docx,
|
||||
create_combined_docx,
|
||||
)
|
||||
|
||||
TRANSCRIPT_TEXT = """[00:00] Speaker 1: Good morning, everyone. Thank you for joining today's meeting.
|
||||
[00:12] Speaker 2: Good morning. I'm looking forward to discussing the new requirements.
|
||||
[00:25] Speaker 1: Let's start with the timeline. We need to finalize the scope by Friday.
|
||||
[00:38] Speaker 2: Agreed. I'll send a summary of the key points after this call.
|
||||
[00:45] Speaker 1: Perfect. If there are no other items, we can wrap up here."""
|
||||
|
||||
SUMMARY_TEXT = """# Meeting Overview
|
||||
## Key Discussion Points
|
||||
### Timeline and Scope
|
||||
#### Next Steps"""
|
||||
|
||||
COVER_DATE = "June 14, 2026"
|
||||
TRANSCRIPT_DESC = "Transcript of a project planning meeting discussing timelines and scope."
|
||||
SUMMARY_DESC = "Summary of a project planning meeting covering key decisions and next steps."
|
||||
|
||||
|
||||
def main(output_dir=None):
    """
    Generate the transcript, summary and combined .docx files for manual review.

    The files are meant to be opened in Word afterwards, so they are written
    to a directory that is NOT removed when this function returns.

    Args:
        output_dir: Directory to write the documents into (created if
            missing). When omitted, a fresh directory is created with
            ``tempfile.mkdtemp`` and left in place for inspection.
    """
    if output_dir is None:
        # mkdtemp (unlike TemporaryDirectory) leaves the directory on disk,
        # which is required for the manual verification requested below.
        output_dir = tempfile.mkdtemp(prefix="scraibe_docx_test_")
    else:
        os.makedirs(output_dir, exist_ok=True)
    print("Using output directory:", output_dir)

    # 1) Transcript-only
    transcript_path = os.path.join(output_dir, "TRANSCRIPT_TEST.docx")
    print("Creating transcript-only docx:", transcript_path)
    create_transcript_docx(
        text=TRANSCRIPT_TEXT,
        filename=transcript_path,
        include_cover=True,
        cover_date=COVER_DATE,
        cover_desc=TRANSCRIPT_DESC,
    )
    print("OK: transcript-only created.")

    # 2) Summary-only
    summary_path = os.path.join(output_dir, "SUMMARY_TEST.docx")
    print("Creating summary-only docx:", summary_path)
    create_summary_docx(
        text=SUMMARY_TEXT,
        filename=summary_path,
        include_cover=True,
        cover_date=COVER_DATE,
        cover_desc=SUMMARY_DESC,
    )
    print("OK: summary-only created.")

    # 3) Combined
    combined_path = os.path.join(output_dir, "COMBINED_TEST.docx")
    print("Creating combined docx:", combined_path)
    create_combined_docx(
        transcript_text=TRANSCRIPT_TEXT,
        summary_text=SUMMARY_TEXT,
        filename=combined_path,
        transcript_cover_date=COVER_DATE,
        transcript_cover_desc=TRANSCRIPT_DESC,
        summary_cover_date=COVER_DATE,
        summary_cover_desc=SUMMARY_DESC,
    )
    print("OK: combined created.")

    # Basic size sanity checks
    for path in [transcript_path, summary_path, combined_path]:
        size = os.path.getsize(path)
        print(f"File: {os.path.basename(path)} - size: {size} bytes")
        if size < 10000:
            print("WARNING: File seems unusually small:", path)

    print("\nAll .docx files generated successfully.")
    print("Files were written to:", output_dir)
    print("Please open them in Word to verify:")
    print("- Only transcript pages have line numbers.")
    print("- Footer shows 'X of Y' on all pages.")
    print("- Cover pages are centered and use the correct date format.")
    print("- Combined doc order: cover, page break, summary, page break, transcript.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,230 @@
|
||||
import os
|
||||
import json
|
||||
import tempfile
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from scraibe.localai_client import LocalAIClient, LocalAIError
|
||||
from scraibe.audio import get_audio_duration, split_audio_into_chunks
|
||||
|
||||
|
||||
TEST_AUDIO_1 = "tests/audio_test_1.mp4"
|
||||
|
||||
|
||||
def make_fake_segments(start=0.0, count=3):
    """
    Build ``count`` back-to-back two-second segments for a single speaker.

    Segment ``i`` starts at ``start + i * 2.0``, ends two seconds later, is
    attributed to ``SPEAKER_00`` and carries the text ``"Segment text i"``.
    """
    begins = [start + idx * 2.0 for idx in range(count)]
    return [
        {
            "start": begin,
            "end": begin + 2.0,
            "speaker": "SPEAKER_00",
            "text": f"Segment text {idx}",
        }
        for idx, begin in enumerate(begins)
    ]
|
||||
|
||||
|
||||
def fake_localai_response(segments):
    """
    Wrap ``segments`` in a response-shaped dict.

    The result holds the segment list itself under ``"segments"`` and the
    segment texts joined by single spaces under ``"text"``.
    """
    texts = [segment["text"] for segment in segments]
    response = {"segments": segments}
    response["text"] = " ".join(texts)
    return response
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """
    Provide a ``LocalAIClient`` built without running its real ``__init__``.

    The constructor is replaced by a no-op while the instance is created;
    the attributes set here are then assigned by hand, with the underlying
    ``_client`` replaced by a ``MagicMock``.
    """
    def _noop_init(self, **kw):
        return None

    with patch.object(LocalAIClient, "__init__", _noop_init):
        stub = LocalAIClient()

    stub.api_url = "http://localhost:8080"
    stub.model = "vibevoice-diarize"
    stub.api_key = None
    stub._client = MagicMock()
    return stub
|
||||
|
||||
|
||||
def test_parse_diarization_response(client):
    """Parsing keeps one entry per input segment, with times, speaker and text in order."""
    expected = make_fake_segments()
    parsed = client._parse_diarization_response(fake_localai_response(expected))

    for key in ("segments", "speakers", "transcripts"):
        assert key in parsed

    assert len(parsed["segments"]) == len(expected)

    for idx in range(len(expected)):
        source = expected[idx]
        span = parsed["segments"][idx]
        assert span[0] == source["start"]
        assert span[1] == source["end"]
        assert parsed["speakers"][idx] == source["speaker"]
        assert parsed["transcripts"][idx] == source["text"]
|
||||
|
||||
|
||||
def test_parse_diarization_empty(client):
    """A response without segments parses to three empty lists."""
    parsed = client._parse_diarization_response({"segments": []})

    assert parsed["segments"] == []
    assert parsed["speakers"] == []
    assert parsed["transcripts"] == []
|
||||
|
||||
|
||||
def test_diarize_and_transcribe_single_happy(client):
    """A 200 response with segments yields parsed segments plus the raw result."""
    canned_payload = fake_localai_response(make_fake_segments())

    with patch.object(client, "_client") as http_mock:
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = canned_payload
        http_mock.post.return_value = response

        outcome = client.diarize_and_transcribe(
            audio_path=TEST_AUDIO_1,
            verbose=False,
            return_raw=True,
        )

    assert "segments" in outcome
    assert "raw_result" in outcome
    assert len(outcome["segments"]) > 0
|
||||
|
||||
|
||||
def test_chunking_triggered_for_long_audio(client):
    """With a reported duration above the single-request limit, the chunked path is used."""
    empty_result = {
        "segments": [],
        "speakers": [],
        "transcripts": [],
    }

    # Simulate long audio by patching get_audio_duration
    with patch("scraibe.localai_client.get_audio_duration") as duration_mock:
        with patch.object(client, "_diarize_and_transcribe_chunked") as chunked_mock:
            duration_mock.return_value = 600.0  # 10 minutes
            chunked_mock.return_value = empty_result

            client.diarize_and_transcribe(
                audio_path=TEST_AUDIO_1,
                verbose=False,
                use_chunking=None,
                max_single_request_duration=300.0,
            )

            chunked_mock.assert_called_once()
|
||||
|
||||
|
||||
def test_chunking_not_triggered_for_short_audio(client):
    """With a reported duration below the limit, only the single-request path runs."""
    empty_result = {
        "segments": [],
        "speakers": [],
        "transcripts": [],
    }

    with patch("scraibe.localai_client.get_audio_duration") as duration_mock:
        with patch.object(client, "_diarize_and_transcribe_chunked") as chunked_mock:
            with patch.object(client, "_diarize_and_transcribe_single") as single_mock:
                duration_mock.return_value = 120.0
                single_mock.return_value = empty_result

                client.diarize_and_transcribe(
                    audio_path=TEST_AUDIO_1,
                    verbose=False,
                    use_chunking=None,
                    max_single_request_duration=300.0,
                )

                chunked_mock.assert_not_called()
                single_mock.assert_called_once()
|
||||
|
||||
|
||||
def test_chunked_transcription_adjusts_timestamps(client):
    """Chunk-local segment times are shifted by each chunk's start offset."""
    # Two fake chunks that reuse the same file on disk; in real usage each
    # chunk would be a different file.
    first_path = TEST_AUDIO_1
    second_path = TEST_AUDIO_1
    fake_chunks = [
        {"path": first_path, "start": 0.0, "end": 10.0},
        {"path": second_path, "start": 10.0, "end": 20.0},
    ]

    def per_chunk_result(audio_path, **kw):
        # Every chunk reports two segments in chunk-local time (starting at
        # 0.0 and 2.0), regardless of which chunk path was requested.
        local_segments = make_fake_segments(start=0.0, count=2)
        return client._parse_diarization_response(
            fake_localai_response(local_segments)
        )

    with patch("scraibe.localai_client.split_audio_into_chunks") as split_mock:
        with patch.object(client, "_diarize_and_transcribe_single") as single_mock:
            with patch("os.remove"):
                split_mock.return_value = fake_chunks
                single_mock.side_effect = per_chunk_result

                merged = client._diarize_and_transcribe_chunked(
                    audio_path=TEST_AUDIO_1,
                    verbose=False,
                    return_raw=False,
                    chunk_duration=10.0,
                    chunk_overlap=2.0,
                )

    merged_segments = merged["segments"]

    # Two segments per chunk, four in total
    assert len(merged_segments) == 4

    # First chunk starts at offset 0, so its times are unchanged
    assert merged_segments[0][0] == 0.0
    assert merged_segments[1][0] == 2.0

    # Second chunk starts at 10, so its times are shifted by 10
    assert merged_segments[2][0] == 10.0
    assert merged_segments[3][0] == 12.0
|
||||
|
||||
|
||||
@pytest.mark.integration
def test_integration_chunked_transcription_with_localai():
    """
    Integration test: run chunked transcription against a live LocalAI instance.

    Skips itself when LOCALAI_API_URL is not set or the bundled audio file is
    missing. It carries the ``integration`` marker, so it can be selected with:
        pytest -m integration
    NOTE(review): whether it is deselected in a default run depends on the
    project's pytest configuration, which is not visible here - confirm.
    """
    api_url = os.getenv("LOCALAI_API_URL")
    if not api_url:
        pytest.skip("LOCALAI_API_URL not set; skipping integration test")

    # Use one of the bundled test audio files
    audio_path = TEST_AUDIO_1
    if not os.path.exists(audio_path):
        pytest.skip(f"Test audio not found: {audio_path}")

    # Use environment-configured model or a sensible default
    model = os.getenv("LOCALAI_MODEL") or "vibevoice-cpp-asr"

    client = LocalAIClient(api_url=api_url, model=model)
    try:
        # Force chunking with a very small max_single_request_duration
        result = client.diarize_and_transcribe(
            audio_path=audio_path,
            verbose=True,
            return_raw=True,
            use_chunking=True,
            chunk_duration=3.0,
            chunk_overlap=0.5,
            max_single_request_duration=1.0,
        )

        assert "segments" in result
        segments = result["segments"]
        assert len(segments) > 0

        # Basic sanity: segment start times never go backwards
        for previous, current in zip(segments, segments[1:]):
            assert current[0] >= previous[0]

        # If raw_result indicates chunked, ensure structure is sensible
        raw = result.get("raw_result")
        if raw and raw.get("chunked"):
            assert "chunks" in raw
            assert len(raw["chunks"]) > 1
    finally:
        client.close()
|
||||
@@ -1,6 +1,6 @@
|
||||
import pytest
|
||||
from scraibe import (Transcriber, WhisperTranscriber,
|
||||
WhisperXTranscriber, load_transcriber)
|
||||
FasterWhisperTranscriber, load_transcriber)
|
||||
import torch
|
||||
|
||||
|
||||
@@ -31,33 +31,33 @@ def test_transcriber(mock_load_model, audio_file, expected_transcription):
|
||||
|
||||
@pytest.fixture
|
||||
def whisper_instance():
|
||||
return load_transcriber('medium', whisper_type='whisper')
|
||||
return load_transcriber('tiny', whisper_type='whisper')
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def whisperx_instance():
|
||||
return load_transcriber('medium', whisper_type='whisperx')
|
||||
def faster_whisper_instance():
|
||||
return load_transcriber('tiny', whisper_type='faster-whisper')
|
||||
|
||||
|
||||
def test_whisper_base_initialization(whisper_instance):
|
||||
assert isinstance(whisper_instance, Transcriber)
|
||||
|
||||
|
||||
def test_whisperx_base_initialization(whisperx_instance):
|
||||
assert isinstance(whisperx_instance, Transcriber)
|
||||
def test_faster_whisper_base_initialization(faster_whisper_instance):
|
||||
assert isinstance(faster_whisper_instance, Transcriber)
|
||||
|
||||
|
||||
def test_whisper_transcriber_initialization(whisper_instance):
|
||||
assert isinstance(whisper_instance, WhisperTranscriber)
|
||||
|
||||
|
||||
def test_whisperx_transcriber_initialization(whisperx_instance):
|
||||
assert isinstance(whisperx_instance, WhisperXTranscriber)
|
||||
def test_faster_whisper_transcriber_initialization(faster_whisper_instance):
|
||||
assert isinstance(faster_whisper_instance, FasterWhisperTranscriber)
|
||||
|
||||
|
||||
def test_wrong_transcriber_initialization():
|
||||
with pytest.raises(ValueError):
|
||||
load_transcriber('medium', whisper_type='wrong_whisper')
|
||||
load_transcriber('tiny', whisper_type='wrong_whisper')
|
||||
|
||||
|
||||
def test_get_whisper_kwargs():
|
||||
@@ -69,12 +69,12 @@ def test_get_whisper_kwargs():
|
||||
def test_whisper_transcribe(whisper_instance):
|
||||
model = whisper_instance
|
||||
# mocker.patch.object(transcriber_instance.model, 'transcribe', return_value={'Hello, World !'} )
|
||||
transcript = model.transcribe('test/audio_test_2.mp4')
|
||||
transcript = model.transcribe('tests/audio_test_2.mp4')
|
||||
assert isinstance(transcript, str)
|
||||
|
||||
|
||||
def test_whisperx_transcribe(whisperx_instance):
|
||||
model = whisperx_instance
|
||||
def test_faster_whisper_transcribe(faster_whisper_instance):
|
||||
model = faster_whisper_instance
|
||||
# mocker.patch.object(transcriber_instance.model, 'transcribe', return_value={'Hello, World !'} )
|
||||
transcript = model.transcribe('test/audio_test_2.mp4')
|
||||
transcript = model.transcribe('tests/audio_test_2.mp4')
|
||||
assert isinstance(transcript, str)
|
||||
Reference in New Issue
Block a user