From b0f19810d43150f569b50f6d9c1bff0328a59946 Mon Sep 17 00:00:00 2001
From: Admin <admin@optimex.systems>
Date: Sun, 14 Jun 2026 03:44:55 +0000
Subject: [PATCH] Fix: Improve LLM connectivity, add logging, increase timeout,
 update docs

---
 .env.example          |   3 ++
 README.md             |  57 +++++++++++++++++++++
 mcp_summary_server.py | 112 +++++++++++++++++++++++-------------------
 3 files changed, 121 insertions(+), 51 deletions(-)

diff --git a/.env.example b/.env.example
index d4cfbbe..f975e02 100644
--- a/.env.example
+++ b/.env.example
@@ -12,6 +12,9 @@ OPENAPI_URL=http://localhost:8080/v1
 OPENAPI_API_KEY=
 MODEL_NAME=gpt-4o
 
+# LLM Call Timeout in seconds (increase for large documents)
+LLM_TIMEOUT=120
+
 # Summarization Configuration
 # Characters per chunk when splitting long text
 CHUNK_SIZE=4000
diff --git a/README.md b/README.md
index ade9adf..e81fb22 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ cp .env.example .env
 | OPENAPI_URL | http://localhost:8080/v1 | LLM API endpoint |
 | OPENAPI_API_KEY | (empty) | LLM API key |
 | MODEL_NAME | gpt-4o | LLM model to use |
+| LLM_TIMEOUT | 120 | LLM call timeout in seconds |
 | CHUNK_SIZE | 4000 | Characters per chunk |
 | OVERLAP | 200 | Characters of overlap between chunks |
 | TARGET_INTERMEDIATE_SUMMARY_LENGTH | 150 | Words per chunk summary |
@@ -59,6 +60,40 @@ pip install -r requirements.txt
 python mcp_summary_server.py
 ```
 
+## Connecting to OpenWebUI
+
+### In OpenWebUI Admin Settings
+
+1. Go to **Admin Settings → External Tools**
+2. Click **+ (Add Server)**
+3. Set **Type** to **MCP (Streamable HTTP)**
+4. Enter your **Server URL**
+5. Set **Authentication**:
+   - **None** if no API key is configured
+   - **Bearer** if API_KEY is set (provide the key)
+6. Save
+
+### Docker Networking
+
+If both OpenWebUI and MCP Summary run in Docker but do not share a Docker network, reach OpenWebUI through its port published on the host:
+
+```bash
+# Use host.docker.internal to reach the host machine (on Linux, also pass --add-host=host.docker.internal:host-gateway)
+docker run -p 8080:8080 \
+  -e OPENAPI_URL=http://host.docker.internal:3000/v1 \
+  -e OPENAPI_API_KEY=your-key \
+  mcp-summary
+```
+
+If both containers are on the same Docker network, use the container name directly:
+
+```bash
+docker run --network mynetwork -p 8080:8080 \
+  -e OPENAPI_URL=http://openwebui-container:8080/v1 \
+  -e OPENAPI_API_KEY=your-key \
+  mcp-summary
+```
+
 ## MCP Tool
 
 ### summarize_document
@@ -78,3 +113,25 @@ Summarizes a document, automatically handling chunking for long text.
   "chunks": 1  // number of chunks used
 }
 ```
+
+## Troubleshooting
+
+### "Failed to connect to MCP server"
+
+1. **Check authentication**: Ensure you haven't selected `Bearer` without a key. Switch to `None` if no token is needed.
+2. **Check network connectivity**: Ensure OpenWebUI can reach the MCP server URL.
+3. **Check LLM connectivity**: Ensure the MCP server can reach the LLM at `OPENAPI_URL`.
+4. **Check timeouts**: Increase `LLM_TIMEOUT` if summarization takes too long.
+
+### Infinite loading screen
+
+This may occur if you configured the server as OpenAPI instead of MCP. Fix by:
+
+1. Opening Admin Settings → External Tools
+2. Disabling/deleting the problematic connection
+3. Re-adding with **Type** set to **MCP (Streamable HTTP)**
+
+### Slow initialization
+
+If the server takes longer than 10 seconds to initialize:
+- Increase `MCP_INITIALIZE_TIMEOUT` in OpenWebUI (default: 10 seconds)
diff --git a/mcp_summary_server.py b/mcp_summary_server.py
index 8d53383..29b1b9e 100644
--- a/mcp_summary_server.py
+++ b/mcp_summary_server.py
@@ -25,9 +25,15 @@ import json
 import os
 import sys
 import re
+import logging
 from http.server import HTTPServer, BaseHTTPRequestHandler
 from typing import Any, Dict, List, Optional
 import requests
+from requests.exceptions import RequestException
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger("mcp-summary")
 
 # MCP Server Configuration
 API_KEY = os.environ.get("API_KEY", "").strip()
@@ -39,11 +45,14 @@ OPENAPI_API_KEY = os.environ.get("OPENAPI_API_KEY", "")
 MODEL_NAME = os.environ.get("MODEL_NAME", "gpt-4o")
 
 # Summarization Configuration
-CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "4000"))  # Characters per chunk
-OVERLAP = int(os.environ.get("OVERLAP", "200"))  # Characters of overlap between chunks
-TARGET_INTERMEDIATE_SUMMARY_LENGTH = int(os.environ.get("TARGET_INTERMEDIATE_SUMMARY_LENGTH", "150"))  # Words
-MAX_DIRECT_SUMMARY_LENGTH = int(os.environ.get("MAX_DIRECT_SUMMARY_LENGTH", "100"))  # Words for final summary
-MAX_DIRECT_TEXT_LENGTH = int(os.environ.get("MAX_DIRECT_TEXT_LENGTH", "8000"))  # Characters before chunking
+CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "4000"))
+OVERLAP = int(os.environ.get("OVERLAP", "200"))
+TARGET_INTERMEDIATE_SUMMARY_LENGTH = int(os.environ.get("TARGET_INTERMEDIATE_SUMMARY_LENGTH", "150"))
+MAX_DIRECT_SUMMARY_LENGTH = int(os.environ.get("MAX_DIRECT_SUMMARY_LENGTH", "100"))
+MAX_DIRECT_TEXT_LENGTH = int(os.environ.get("MAX_DIRECT_TEXT_LENGTH", "8000"))
+
+# LLM call timeout in seconds - increase for large documents
+LLM_TIMEOUT = int(os.environ.get("LLM_TIMEOUT", "120"))
 
 # Tool definitions
 TOOLS_LIST: Dict[str, Any] = {
@@ -71,7 +80,7 @@ TOOLS_LIST: Dict[str, Any] = {
 
 
 def call_llm(messages: List[Dict], temperature: float = 0.3) -> str:
-    """Make an OpenAPI-compatible LLM call."""
+    """Make an OpenAPI-compatible LLM call with error handling."""
     url = f"{OPENAPI_URL}/chat/completions"
     headers = {
         "Content-Type": "application/json",
@@ -86,11 +95,20 @@ def call_llm(messages: List[Dict], temperature: float = 0.3) -> str:
         "top_p": 0.9
     }
     
-    response = requests.post(url, headers=headers, json=payload, timeout=60)
-    response.raise_for_status()
+    try:
+        logger.info(f"Calling LLM at {OPENAPI_URL} with model {MODEL_NAME}")
+        response = requests.post(url, headers=headers, json=payload, timeout=LLM_TIMEOUT)
+        response.raise_for_status()
+        
+        data = response.json()
+        return data["choices"][0]["message"]["content"]
     
-    data = response.json()
-    return data["choices"][0]["message"]["content"]
+    except RequestException as e:
+        logger.error(f"LLM request failed: {e}")
+        raise RuntimeError(f"Failed to connect to LLM at {OPENAPI_URL}: {str(e)}")
+    except Exception as e:
+        logger.error(f"LLM call failed: {e}")
+        raise RuntimeError(f"LLM call failed: {str(e)}")
 
 
 def chunk_text(text: str) -> List[str]:
@@ -102,31 +120,16 @@ def chunk_text(text: str) -> List[str]:
     start = 0
     
     while start < len(text):
-        # Find a good breaking point (after sentence or paragraph)
         end = min(start + CHUNK_SIZE, len(text))
         
-        # Try to break at sentence boundary
-        search_end = min(end, len(text))
-        break_point = -1
-        
-        # Look for paragraph break first
-        for marker in ["\n\n", "\n"]:
-            pos = text.rfind(marker, start + CHUNK_SIZE // 2, search_end)
-            if pos > 0:
+        # Try to break at sentence/paragraph boundary
+        break_point = end
+        for marker in ["\n\n", "\n", ". ", "! ", "? "]:
+            pos = text.rfind(marker, start + CHUNK_SIZE // 2, end)
+            if pos > start:
                 break_point = pos
                 break
         
-        # If no paragraph break, look for sentence break
-        if break_point == -1:
-            for marker in [".", "!", "?"]:
-                pos = text.rfind(marker, start + CHUNK_SIZE // 2, search_end)
-                if pos > 0:
-                    break_point = pos
-                    break
-        
-        if break_point == -1:
-            break_point = end
-        
         chunk = text[start:break_point]
         if chunk.strip():
             chunks.append(chunk)
@@ -135,25 +138,26 @@ def chunk_text(text: str) -> List[str]:
         if start >= len(text):
             break
     
+    logger.info(f"Split text into {len(chunks)} chunks")
     return chunks
 
 
 def summarize_chunk(chunk: str, chunk_num: int, total_chunks: int) -> str:
     """Summarize a single chunk of text."""
-    system_prompt = f"""You are a precise legal assistant specializing in creating concise, accurate summaries.
+    system_prompt = f"""You are a precise legal assistant creating concise, accurate summaries.
 
 You are processing chunk {chunk_num} of {total_chunks} from a larger document.
 
-Your task: Create a focused summary of this chunk that:
-- Captures the key points and important details
+Create a focused summary that:
+- Captures key points and important details
 - Is approximately {TARGET_INTERMEDIATE_SUMMARY_LENGTH} words
-- Can be combined with summaries of other chunks to form a complete picture
+- Can be combined with other chunk summaries
 - Uses clear, professional language
-- Preserves important names, dates, and specific facts
+- Preserves names, dates, and specific facts
 
-Format your response as plain text without bullet points or special formatting."""
+Respond as plain text without bullet points."""
     
-    user_prompt = f"""Summarize the following text (chunk {chunk_num} of {total_chunks}):
+    user_prompt = f"""Summarize this text (chunk {chunk_num} of {total_chunks}):
 
 {text}
 
@@ -164,6 +168,7 @@ Summary:"""
         {"role": "user", "content": user_prompt}
     ]
     
+    logger.info(f"Summarizing chunk {chunk_num}/{total_chunks}")
     return call_llm(messages)
 
 
@@ -173,17 +178,17 @@ def synthesize_summaries(chunk_summaries: List[str]) -> str:
     
     system_prompt = """You are a precise legal assistant creating executive-level summaries.
 
-Your task: Synthesize the provided partial summaries into a single, cohesive summary that:
+Synthesize the provided partial summaries into a single, cohesive summary that:
 - Is approximately 100 words
-- Captures the complete picture of the document
+- Captures the complete document picture
 - Is clear and professional
 - Removes redundancy
 - Maintains logical flow
 - Preserves all critical information
 
-Format your response as a single paragraph of plain text."""
+Format as a single paragraph of plain text."""
     
-    user_prompt = f"""Synthesize the following partial summaries into one cohesive summary:
+    user_prompt = f"""Synthesize these partial summaries into one cohesive summary:
 
 {combined}
 
@@ -194,6 +199,7 @@ Final summary:"""
         {"role": "user", "content": user_prompt}
     ]
     
+    logger.info(f"Synthesizing {len(chunk_summaries)} chunk summaries")
     return call_llm(messages)
 
 
@@ -206,23 +212,23 @@ def summarize_document(text: str, max_length: int = MAX_DIRECT_SUMMARY_LENGTH) -
     """
     original_length = len(text)
     
-    # Strip whitespace and validate
     text = text.strip()
     if not text:
         raise ValueError("Empty text provided")
     
+    logger.info(f"Summarizing text of {original_length} characters")
+    
     # Direct summarization for shorter texts
     if len(text) <= MAX_DIRECT_TEXT_LENGTH:
         system_prompt = f"""You are a precise legal assistant creating concise, accurate summaries.
 
-Your task: Create a summary that:
+Create a summary that:
 - Is approximately {max_length} words
-- Captures the key points and important details
+- Captures key points and important details
 - Uses clear, professional language
-- Preserves important names, dates, and specific facts
-- Is suitable for a legal professional
+- Preserves names, dates, and specific facts
 
-Format your response as plain text without bullet points or special formatting."""
+Format as plain text without bullet points."""
         
         user_prompt = f"""Summarize the following document:
 
@@ -247,13 +253,11 @@ Summary:"""
     # Chunked summarization for longer texts
     chunks = chunk_text(text)
     
-    # Summarize each chunk
     chunk_summaries = []
     for i, chunk in enumerate(chunks, 1):
         chunk_summary = summarize_chunk(chunk, i, len(chunks))
         chunk_summaries.append(chunk_summary)
     
-    # Synthesize into final summary
     final_summary = synthesize_summaries(chunk_summaries)
     
     return {
@@ -268,8 +272,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
     """HTTP handler for MCP summary server."""
     
     def log_message(self, format, *args):
-        # Quiet logs by default
-        pass
+        logger.info(format % args)
     
     def _send_json(self, status: int, payload: Any):
         """Send JSON response."""
@@ -304,6 +307,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
                 "service": "mcp-summary",
                 "transport": "streamable-http",
                 "model": MODEL_NAME,
+                "status": "running",
                 "docs": "Use POST / with MCP JSON-RPC (initialize, tools/list, tools/call)."
             })
             return
@@ -336,6 +340,8 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
         params = req.get("params") or {}
         req_id = req.get("id")
         
+        logger.info(f"MCP request: method={method}, id={req_id}")
+        
         # MCP: initialize
         if method == "initialize":
             self._send_json(200, {
@@ -380,6 +386,7 @@ class MCPSummaryHandler(BaseHTTPRequestHandler):
                     }
                 })
             except Exception as e:
+                logger.error(f"Tool call failed: {e}")
                 self._send_json(200, {
                     "jsonrpc": "2.0",
                     "id": req_id,
@@ -410,10 +417,13 @@ def main():
     """Start the MCP summary server."""
     server = HTTPServer(("0.0.0.0", PORT), MCPSummaryHandler)
     mode = "auth enabled (Bearer)" if API_KEY else "no auth (API_KEY not set)"
+    
     print(f"MCP Summary Server listening on 0.0.0.0:{PORT} [{mode}]")
     print(f"  - Model: {MODEL_NAME}")
+    print(f"  - LLM URL: {OPENAPI_URL}")
     print(f"  - Chunk size: {CHUNK_SIZE} characters")
     print(f"  - Max direct text: {MAX_DIRECT_TEXT_LENGTH} characters")
+    print(f"  - LLM timeout: {LLM_TIMEOUT} seconds")
     
     try:
         server.serve_forever()