Fix: Improve LLM connectivity, add logging, increase timeout, update docs

This commit is contained in:
2026-06-14 03:44:55 +00:00
parent dbddfcd61d
commit b0f19810d4
3 changed files with 121 additions and 51 deletions
+3
View File
@@ -12,6 +12,9 @@ OPENAPI_URL=http://localhost:8080/v1
OPENAPI_API_KEY= OPENAPI_API_KEY=
MODEL_NAME=gpt-4o MODEL_NAME=gpt-4o
# LLM call timeout in seconds (increase for large documents)
LLM_TIMEOUT=120
# Summarization Configuration # Summarization Configuration
# Characters per chunk when splitting long text # Characters per chunk when splitting long text
CHUNK_SIZE=4000 CHUNK_SIZE=4000
+57
View File
@@ -27,6 +27,7 @@ cp .env.example .env
| OPENAPI_URL | http://localhost:8080/v1 | LLM API endpoint | | OPENAPI_URL | http://localhost:8080/v1 | LLM API endpoint |
| OPENAPI_API_KEY | (empty) | LLM API key | | OPENAPI_API_KEY | (empty) | LLM API key |
| MODEL_NAME | gpt-4o | LLM model to use | | MODEL_NAME | gpt-4o | LLM model to use |
| LLM_TIMEOUT | 120 | LLM call timeout in seconds |
| CHUNK_SIZE | 4000 | Characters per chunk | | CHUNK_SIZE | 4000 | Characters per chunk |
| OVERLAP | 200 | Characters of overlap between chunks | | OVERLAP | 200 | Characters of overlap between chunks |
| TARGET_INTERMEDIATE_SUMMARY_LENGTH | 150 | Words per chunk summary | | TARGET_INTERMEDIATE_SUMMARY_LENGTH | 150 | Words per chunk summary |
@@ -59,6 +60,40 @@ pip install -r requirements.txt
python mcp_summary_server.py python mcp_summary_server.py
``` ```
## Connecting to OpenWebUI
### In OpenWebUI Admin Settings
1. Go to **Admin Settings → External Tools**
2. Click **+ (Add Server)**
3. Set **Type** to **MCP (Streamable HTTP)**
4. Enter your **Server URL**
5. Set **Authentication**:
- **None** if `API_KEY` is not set on the MCP server
- **Bearer** if `API_KEY` is set (enter that same key as the token)
6. Save
### Docker Networking
If running both OpenWebUI and MCP Summary in Docker:
```bash
# Use host.docker.internal to reach the host machine
docker run -p 8080:8080 \
-e OPENAPI_URL=http://host.docker.internal:3000/v1 \
-e OPENAPI_API_KEY=your-key \
mcp-summary
```
If both containers are on the same Docker network, use the container name directly:
```bash
docker run --network mynetwork -p 8080:8080 \
-e OPENAPI_URL=http://openwebui-container:8080/v1 \
-e OPENAPI_API_KEY=your-key \
mcp-summary
```
## MCP Tool ## MCP Tool
### summarize_document ### summarize_document
@@ -78,3 +113,25 @@ Summarizes a document, automatically handling chunking for long text.
"chunks": 1 // number of chunks used "chunks": 1 // number of chunks used
} }
``` ```
## Troubleshooting
### "Failed to connect to MCP server"
1. **Check authentication**: Ensure you haven't selected `Bearer` without a key. Switch to `None` if no token is needed.
2. **Check network connectivity**: Ensure OpenWebUI can reach the MCP server URL.
3. **Check LLM connectivity**: Ensure the MCP server can reach the LLM at `OPENAPI_URL`.
4. **Check timeouts**: Increase `LLM_TIMEOUT` if summarization takes too long.
### Infinite loading screen
This may occur if you configured the server as OpenAPI instead of MCP. Fix by:
1. Opening Admin Settings → External Tools
2. Disabling/deleting the problematic connection
3. Re-adding with **Type** set to **MCP (Streamable HTTP)**
### Slow initialization
If the server takes longer than 10 seconds to initialize:
- Increase `MCP_INITIALIZE_TIMEOUT` in OpenWebUI (default: 10 seconds)
+59 -49
View File
@@ -25,9 +25,15 @@ import json
import os import os
import sys import sys
import re import re
import logging
from http.server import HTTPServer, BaseHTTPRequestHandler from http.server import HTTPServer, BaseHTTPRequestHandler
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
import requests import requests
from requests.exceptions import RequestException
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("mcp-summary")
# MCP Server Configuration # MCP Server Configuration
API_KEY = os.environ.get("API_KEY", "").strip() API_KEY = os.environ.get("API_KEY", "").strip()
@@ -39,11 +45,14 @@ OPENAPI_API_KEY = os.environ.get("OPENAPI_API_KEY", "")
MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o") MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")
# Summarization Configuration # Summarization Configuration
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "4000")) # Characters per chunk CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "4000"))
OVERLAP = int(os.environ.get("OVERLAP", "200")) # Characters of overlap between chunks OVERLAP = int(os.environ.get("OVERLAP", "200"))
TARGET_INTERMEDIATE_SUMMARY_LENGTH = int(os.environ.get("TARGET_INTERMEDIATE_SUMMARY_LENGTH", "150")) # Words TARGET_INTERMEDIATE_SUMMARY_LENGTH = int(os.environ.get("TARGET_INTERMEDIATE_SUMMARY_LENGTH", "150"))
MAX_DIRECT_SUMMARY_LENGTH = int(os.environ.get("MAX_DIRECT_SUMMARY_LENGTH", "100")) # Words for final summary MAX_DIRECT_SUMMARY_LENGTH = int(os.environ.get("MAX_DIRECT_SUMMARY_LENGTH", "100"))
MAX_DIRECT_TEXT_LENGTH = int(os.environ.get("MAX_DIRECT_TEXT_LENGTH", "8000")) # Characters before chunking MAX_DIRECT_TEXT_LENGTH = int(os.environ.get("MAX_DIRECT_TEXT_LENGTH", "8000"))
# LLM call timeout in seconds - increase for large documents
LLM_TIMEOUT = int(os.environ.get("LLM_TIMEOUT", "120"))
# Tool definitions # Tool definitions
TOOLS_LIST: Dict[str, Any] = { TOOLS_LIST: Dict[str, Any] = {
@@ -71,7 +80,7 @@ TOOLS_LIST: Dict[str, Any] = {
def call_llm(messages: List[Dict], temperature: float = 0.3) -> str: def call_llm(messages: List[Dict], temperature: float = 0.3) -> str:
"""Make an OpenAPI-compatible LLM call.""" """Make an OpenAPI-compatible LLM call with error handling."""
url = f"{OPENAPI_URL}/chat/completions" url = f"{OPENAPI_URL}/chat/completions"
headers = { headers = {
"Content-Type": "application/json", "Content-Type": "application/json",
@@ -86,12 +95,21 @@ def call_llm(messages: List[Dict], temperature: float = 0.3) -> str:
"top_p": 0.9 "top_p": 0.9
} }
response = requests.post(url, headers=headers, json=payload, timeout=60) try:
logger.info(f"Calling LLM at {OPENAPI_URL} with model {MODEL_NAME}")
response = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT)
response.raise_for_status() response.raise_for_status()
data = response.json() data = response.json()
return data["choices"][0]["message"]["content"] return data["choices"][0]["message"]["content"]
except RequestException as e:
logger.error(f"LLM request failed: {e}")
raise RuntimeError(f"Failed to connect to LLM at {OPENAPI_URL}: {str(e)}")
except Exception as e:
logger.error(f"LLM call failed: {e}")
raise RuntimeError(f"LLM call failed: {str(e)}")
def chunk_text(text: str) -> List[str]: def chunk_text(text: str) -> List[str]:
"""Split text into chunks with overlap for summarization.""" """Split text into chunks with overlap for summarization."""
@@ -102,30 +120,15 @@ def chunk_text(text: str) -> List[str]:
start = 0 start = 0
while start < len(text): while start < len(text):
# Find a good breaking point (after sentence or paragraph)
end = min(start + CHUNK_SIZE, len(text)) end = min(start + CHUNK_SIZE, len(text))
# Try to break at sentence boundary # Try to break at sentence/paragraph boundary
search_end = min(end, len(text))
break_point = -1
# Look for paragraph break first
for marker in ["\n\n", "\n"]:
pos = text.rfind(marker, start + CHUNK_SIZE // 2, search_end)
if pos > 0:
break_point = pos
break
# If no paragraph break, look for sentence break
if break_point == -1:
for marker in [".", "!", "?"]:
pos = text.rfind(marker, start + CHUNK_SIZE // 2, search_end)
if pos > 0:
break_point = pos
break
if break_point == -1:
break_point = end break_point = end
for marker in ["\n\n", "\n", ". ", "! ", "? "]:
pos = text.rfind(marker, start + CHUNK_SIZE // 2, end)
if pos > start:
break_point = pos
break
chunk = text[start:break_point] chunk = text[start:break_point]
if chunk.strip(): if chunk.strip():
@@ -135,25 +138,26 @@ def chunk_text(text: str) -> List[str]:
if start >= len(text): if start >= len(text):
break break
logger.info(f"Split text into {len(chunks)} chunks")
return chunks return chunks
def summarize_chunk(chunk: str, chunk_num: int, total_chunks: int) -> str: def summarize_chunk(chunk: str, chunk_num: int, total_chunks: int) -> str:
"""Summarize a single chunk of text.""" """Summarize a single chunk of text."""
system_prompt = f"""You are a precise legal assistant specializing in creating concise, accurate summaries. system_prompt = f"""You are a precise legal assistant creating concise, accurate summaries.
You are processing chunk {chunk_num} of {total_chunks} from a larger document. You are processing chunk {chunk_num} of {total_chunks} from a larger document.
Your task: Create a focused summary of this chunk that: Create a focused summary that:
- Captures the key points and important details - Captures key points and important details
- Is approximately {TARGET_INTERMEDIATE_SUMMARY_LENGTH} words - Is approximately {TARGET_INTERMEDIATE_SUMMARY_LENGTH} words
- Can be combined with summaries of other chunks to form a complete picture - Can be combined with other chunk summaries
- Uses clear, professional language - Uses clear, professional language
- Preserves important names, dates, and specific facts - Preserves names, dates, and specific facts
Format your response as plain text without bullet points or special formatting.""" Respond as plain text without bullet points."""
user_prompt = f"""Summarize the following text (chunk {chunk_num} of {total_chunks}): user_prompt = f"""Summarize this text (chunk {chunk_num} of {total_chunks}):
{text} {text}
@@ -164,6 +168,7 @@ Summary:"""
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
] ]
logger.info(f"Summarizing chunk {chunk_num}/{total_chunks}")
return call_llm(messages) return call_llm(messages)
@@ -173,17 +178,17 @@ def synthesize_summaries(chunk_summaries: List[str]) -> str:
system_prompt = """You are a precise legal assistant creating executive-level summaries. system_prompt = """You are a precise legal assistant creating executive-level summaries.
Your task: Synthesize the provided partial summaries into a single, cohesive summary that: Synthesize the provided partial summaries into a single, cohesive summary that:
- Is approximately 100 words - Is approximately 100 words
- Captures the complete picture of the document - Captures the complete document picture
- Is clear and professional - Is clear and professional
- Removes redundancy - Removes redundancy
- Maintains logical flow - Maintains logical flow
- Preserves all critical information - Preserves all critical information
Format your response as a single paragraph of plain text.""" Format as a single paragraph of plain text."""
user_prompt = f"""Synthesize the following partial summaries into one cohesive summary: user_prompt = f"""Synthesize these partial summaries into one cohesive summary:
{combined} {combined}
@@ -194,6 +199,7 @@ Final summary:"""
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
] ]
logger.info(f"Synthesizing {len(chunk_summaries)} chunk summaries")
return call_llm(messages) return call_llm(messages)
@@ -206,23 +212,23 @@ def summarize_document(text: str, max_length: int = MAX_DIRECT_SUMMARY_LENGTH) -
""" """
original_length = len(text) original_length = len(text)
# Strip whitespace and validate
text = text.strip() text = text.strip()
if not text: if not text:
raise ValueError("Empty text provided") raise ValueError("Empty text provided")
logger.info(f"Summarizing text of {original_length} characters")
# Direct summarization for shorter texts # Direct summarization for shorter texts
if len(text) <= MAX_DIRECT_TEXT_LENGTH: if len(text) <= MAX_DIRECT_TEXT_LENGTH:
system_prompt = f"""You are a precise legal assistant creating concise, accurate summaries. system_prompt = f"""You are a precise legal assistant creating concise, accurate summaries.
Your task: Create a summary that: Create a summary that:
- Is approximately {max_length} words - Is approximately {max_length} words
- Captures the key points and important details - Captures key points and important details
- Uses clear, professional language - Uses clear, professional language
- Preserves important names, dates, and specific facts - Preserves names, dates, and specific facts
- Is suitable for a legal professional
Format your response as plain text without bullet points or special formatting.""" Format as plain text without bullet points."""
user_prompt = f"""Summarize the following document: user_prompt = f"""Summarize the following document:
@@ -247,13 +253,11 @@ Summary:"""
# Chunked summarization for longer texts # Chunked summarization for longer texts
chunks = chunk_text(text) chunks = chunk_text(text)
# Summarize each chunk
chunk_summaries = [] chunk_summaries = []
for i, chunk in enumerate(chunks, 1): for i, chunk in enumerate(chunks, 1):
chunk_summary = summarize_chunk(chunk, i, len(chunks)) chunk_summary = summarize_chunk(chunk, i, len(chunks))
chunk_summaries.append(chunk_summary) chunk_summaries.append(chunk_summary)
# Synthesize into final summary
final_summary = synthesize_summaries(chunk_summaries) final_summary = synthesize_summaries(chunk_summaries)
return { return {
@@ -268,8 +272,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
"""HTTP handler for MCP summary server.""" """HTTP handler for MCP summary server."""
def log_message(self, format, *args): def log_message(self, format, *args):
# Quiet logs by default logger.info(format % args)
pass
def _send_json(self, status: int, payload: Any): def _send_json(self, status: int, payload: Any):
"""Send JSON response.""" """Send JSON response."""
@@ -304,6 +307,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
"service": "mcp-summary", "service": "mcp-summary",
"transport": "streamable-http", "transport": "streamable-http",
"model": MODEL_NAME, "model": MODEL_NAME,
"status": "running",
"docs": "Use POST / with MCP JSON-RPC (initialize, tools/list, tools/call)." "docs": "Use POST / with MCP JSON-RPC (initialize, tools/list, tools/call)."
}) })
return return
@@ -336,6 +340,8 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
params = req.get("params") or {} params = req.get("params") or {}
req_id = req.get("id") req_id = req.get("id")
logger.info(f"MCP request: method={method}, id={req_id}")
# MCP: initialize # MCP: initialize
if method == "initialize": if method == "initialize":
self._send_json(200, { self._send_json(200, {
@@ -380,6 +386,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
} }
}) })
except Exception as e: except Exception as e:
logger.error(f"Tool call failed: {e}")
self._send_json(200, { self._send_json(200, {
"jsonrpc": "2.0", "jsonrpc": "2.0",
"id": req_id, "id": req_id,
@@ -410,10 +417,13 @@ def main():
"""Start the MCP summary server.""" """Start the MCP summary server."""
server = HTTPServer(("0.0.0.0", PORT), MCPSummaryHandler) server = HTTPServer(("0.0.0.0", PORT), MCPSummaryHandler)
mode = "auth enabled (Bearer)" if API_KEY else "no auth (API_KEY not set)" mode = "auth enabled (Bearer)" if API_KEY else "no auth (API_KEY not set)"
print(f"MCP Summary Server listening on 0.0.0.0:{PORT} [{mode}]") print(f"MCP Summary Server listening on 0.0.0.0:{PORT} [{mode}]")
print(f" - Model: {MODEL_NAME}") print(f" - Model: {MODEL_NAME}")
print(f" - LLM URL: {OPENAPI_URL}")
print(f" - Chunk size: {CHUNK_SIZE} characters") print(f" - Chunk size: {CHUNK_SIZE} characters")
print(f" - Max direct text: {MAX_DIRECT_TEXT_LENGTH} characters") print(f" - Max direct text: {MAX_DIRECT_TEXT_LENGTH} characters")
print(f" - LLM timeout: {LLM_TIMEOUT} seconds")
try: try:
server.serve_forever() server.serve_forever()