290 lines
9.2 KiB
Python
290 lines
9.2 KiB
Python
"""
|
|
Command-Line Interface (CLI) for the Scraibe class,
|
|
allowing for user interaction to transcribe and diarize audio files.
|
|
The function includes arguments for specifying the audio files, model paths,
|
|
output formats, and other options necessary for transcription.
|
|
|
|
This version is adapted for LocalAI-based transcription and diarization.
|
|
"""
|
|
|
|
import os
|
|
import json
|
|
import logging
|
|
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
|
from .autotranscript import Scraibe
|
|
from .misc import set_threads, setup_logging
|
|
|
|
|
|
def cli(argv=None):
    """
    Run the Scraibe command-line interface.

    Parses the command-line options, builds a ``Scraibe`` instance from the
    LocalAI connection options and runs the selected task on every file given
    via ``-f/--audio-files``, writing one output file per audio file into the
    output directory (created if missing).

    Tasks:
        transcribe: writes ``<basename>.<output-format>`` containing the value
            returned by ``Scraibe.transcribe``.
        transcript_and_summarize: always writes ``<basename>.md`` with a
            "Transcript" section followed by a "Summary" section.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the regular
            command-line behaviour.
    """

    # Initialize logging (can be overridden via --log-level)
    setup_logging(level=os.getenv("LOG_LEVEL", "INFO"))

    logger = logging.getLogger("scraibe.cli")

    def str2bool(string):
        """Convert the literal strings "True"/"False" to a bool for argparse."""
        str2val = {"True": True, "False": False}
        if string in str2val:
            return str2val[string]
        # argparse reports a ValueError raised by a ``type=`` callable as a
        # regular usage error.
        raise ValueError(
            f"Expected one of {set(str2val.keys())}, got {string}"
        )

    def output_basename(audio_path):
        """Return the file name of *audio_path* without its final extension."""
        # os.path handles the platform's separators and keeps inner dots, so
        # "talk.part1.wav" and "talk.part2.wav" no longer map to the same name.
        return os.path.splitext(os.path.basename(audio_path))[0]

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "-f",
        "--audio-files",
        nargs="+",
        type=str,
        default=None,
        help="List of audio files to transcribe.",
    )

    # LocalAI connection (env vars preferred, but CLI overrides allowed)
    parser.add_argument(
        "--localai-api-url",
        type=str,
        default=None,
        help="LocalAI server URL (e.g., http://localhost:8080). "
        "Overrides LOCALAI_API_URL env var if provided.",
    )
    parser.add_argument(
        "--localai-api-key",
        type=str,
        default=None,
        help="LocalAI API key. Overrides LOCALAI_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--localai-model",
        type=str,
        default=None,
        help="Model name to use on LocalAI (e.g., vibevoice-diarize). "
        "Overrides LOCALAI_MODEL env var if provided.",
    )

    # Summarizer overrides (env vars are primary)
    parser.add_argument(
        "--summarizer-api-url",
        type=str,
        default=None,
        help="Summarization LLM API URL (e.g., http://localhost:8080). "
        "Overrides SUMMARIZER_API_URL env var if provided.",
    )
    parser.add_argument(
        "--summarizer-api-key",
        type=str,
        default=None,
        help="Summarization LLM API key. Overrides SUMMARIZER_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--summarizer-model",
        type=str,
        default=None,
        help="Model name for summarization. Overrides SUMMARIZER_MODEL env var if provided.",
    )

    # Kept for backward compatibility with UI / existing scripts; ignored by LocalAI client.
    parser.add_argument(
        "--whisper-type",
        type=str,
        default="whisper",
        choices=["whisper", "faster-whisper"],
        help="[Backward compatibility] Type of Whisper model. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--whisper-model-name",
        default="medium",
        help="[Backward compatibility] Whisper model name. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--whisper-model-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Whisper model directory. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--diarization-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Diarization model directory. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--hf-token",
        default=None,
        type=str,
        help="[Backward compatibility] HuggingFace token. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--inference-device",
        default="cpu",
        help="[Backward compatibility] Device for inference. Ignored when using LocalAI.",
    )

    parser.add_argument(
        "--num-threads",
        type=int,
        default=None,
        help="Number of threads used for CPU operations; overrides MKL_NUM_THREADS/OMP_NUM_THREADS.",
    )

    parser.add_argument(
        "--output-directory",
        "-o",
        type=str,
        default=".",
        help="Directory to save the transcription outputs.",
    )

    parser.add_argument(
        "--output-format",
        "-of",
        type=str,
        default="txt",
        choices=["txt", "json", "md", "html"],
        help="Format of the output file; defaults to txt.",
    )

    parser.add_argument(
        "--verbose-output",
        type=str2bool,
        default=True,
        help="Enable or disable progress and debug messages.",
    )

    parser.add_argument(
        "--task",
        type=str,
        default="transcribe",
        choices=[
            "transcribe",
            "transcript_and_summarize",
        ],
        help="Task to perform: 'transcribe' or 'transcript_and_summarize'.",
    )

    parser.add_argument(
        "--language",
        type=str,
        default=None,
        help="Language spoken in the audio. Specify None to perform language detection.",
    )

    parser.add_argument(
        "--num-speakers",
        type=int,
        default=None,
        help="Number of speakers in the audio.",
    )

    parser.add_argument(
        "--log-level",
        type=str,
        default=None,
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Override LOG_LEVEL env var for logging verbosity.",
    )

    # argv=None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)

    # Apply log-level override if provided
    log_level = args.log_level or os.getenv("LOG_LEVEL", "INFO")
    setup_logging(level=log_level)
    logger.info("CLI starting with log_level=%s", log_level)

    arg_dict = vars(args)

    # configure output
    out_folder = arg_dict.pop("output_directory")
    os.makedirs(out_folder, exist_ok=True)
    logger.info("Output directory: %s", out_folder)

    out_format = arg_dict.pop("output_format")
    task = arg_dict.pop("task")

    logger.info("Task: %s", task)
    logger.info("Output format: %s", out_format)

    set_threads(arg_dict.pop("num_threads"))

    # Every per-run option is read exactly once, up front. Popping inside the
    # per-file loop (or popping "verbose_output" twice) raises KeyError as soon
    # as the key has already been consumed.
    verbose = arg_dict.pop("verbose_output")
    language = arg_dict.pop("language")
    num_speakers = arg_dict.pop("num_speakers")
    summarizer_kwargs = {
        "summarizer_api_url": arg_dict.pop("summarizer_api_url"),
        "summarizer_api_key": arg_dict.pop("summarizer_api_key"),
        "summarizer_model": arg_dict.pop("summarizer_model"),
    }
    audio_files = arg_dict.pop("audio_files")
    # "whisper_model_directory" and "inference_device" stay in arg_dict: they
    # are accepted for backward compatibility but not forwarded anywhere.

    # Build kwargs for Scraibe (LocalAI-backed)
    class_kwargs = {
        "api_url": arg_dict.pop("localai_api_url"),
        "api_key": arg_dict.pop("localai_api_key"),
        "model": arg_dict.pop("localai_model"),
        # kept for backward compatibility, but ignored:
        "whisper_model": arg_dict.pop("whisper_model_name"),
        "whisper_type": arg_dict.pop("whisper_type"),
        "dia_model": arg_dict.pop("diarization_directory"),
        "use_auth_token": arg_dict.pop("hf_token"),
        "verbose": verbose,
    }

    logger.info("LocalAI API URL: %s", class_kwargs["api_url"] or os.getenv("LOCALAI_API_URL", "<not set>"))
    logger.info("LocalAI Model: %s", class_kwargs["model"] or os.getenv("LOCALAI_MODEL", "<not set>"))

    model = Scraibe(**class_kwargs)

    if not audio_files:
        # Previously this case ended silently; make the no-op visible.
        logger.warning("No audio files provided (-f/--audio-files); nothing to do.")
        return

    logger.info("Audio files: %s", audio_files)

    if task == "transcribe":
        for audio in audio_files:
            logger.info("Starting 'transcribe' for: %s", audio)
            out = model.transcribe(
                audio,
                language=language,
                verbose=verbose,
                num_speakers=num_speakers,
            )
            path = os.path.join(out_folder, f"{output_basename(audio)}.{out_format}")
            logger.info("Saving transcript to: %s", path)
            # NOTE(review): assumes Scraibe.transcribe returns a str; the value
            # is written as-is regardless of --output-format - confirm.
            with open(path, "w", encoding="utf-8") as f:
                f.write(out)
            logger.info("Transcript saved: %s", path)

    elif task == "transcript_and_summarize":
        for audio in audio_files:
            logger.info("Starting 'transcript_and_summarize' for: %s", audio)
            result = model.transcript_and_summarize(
                audio,
                language=language,
                verbose=verbose,
                num_speakers=num_speakers,
                **summarizer_kwargs,
            )

            transcript_text = result.get("transcript", "")
            summary_text = result.get("summary", "")

            # Always use .md for transcript_and_summarize
            md_path = os.path.join(out_folder, f"{output_basename(audio)}.md")
            logger.info("Saving transcript + summary to: %s", md_path)

            with open(md_path, "w", encoding="utf-8") as f:
                f.write("# Transcript\n\n")
                f.write(transcript_text)
                f.write("\n\n# Summary\n\n")
                f.write(summary_text)

            logger.info("Transcript + summary saved: %s", md_path)
|
|
|
|
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == "__main__":
    cli()
|