""" Conversation compression - summarizes old turns to save tokens. Supports multiple strategies: extractive summarization or Ollama LLM. """ from typing import List, Dict import logging import tiktoken import httpx from sumy.parsers.plaintext import PlaintextParser from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer import asyncio logger = logging.getLogger(__name__) ENCODING = tiktoken.get_encoding("cl100k_base") def count_tokens(text: str) -> int: """Count tokens in text""" return len(ENCODING.encode(text)) def truncate_tool_output(output: str, max_tokens: int = 200) -> str: """Truncate tool outputs to save tokens""" tokens = ENCODING.encode(output) if len(tokens) <= max_tokens: return output truncated = ENCODING.decode(tokens[:max_tokens]) return f"{truncated}... [truncated, {len(tokens) - max_tokens} tokens omitted]" def extractive_summarize(text: str, sentences_count: int = 3) -> str: """ Simple extractive summarization using LSA algorithm. Picks the most important sentences from the text. No external API calls, fast and deterministic. """ try: parser = PlaintextParser.from_string(text, Tokenizer("english")) summarizer = LsaSummarizer() summary_sentences = summarizer(parser.document, sentences_count) return " ".join(str(sentence) for sentence in summary_sentences) except Exception as e: # Fallback: truncate to first few sentences sentences = text.split('. ')[:3] return '. '.join(sentences) + '.' async def ollama_summarize(text: str, model: str = "phi3:mini", url: str = "http://localhost:11434") -> str: """ Summarize using Ollama API. Requires Ollama running with the specified model pulled. """ try: async with httpx.AsyncClient(timeout=30.0) as client: response = await client.post( f"{url}/api/generate", json={ "model": model, "prompt": f"Summarize the following conversation in 2-3 sentences, focusing on key decisions and conclusions:\n\n{text}", "stream": False, "options": { "num_predict": 200 } } ) response.raise_for_status() result = response.json() return result.get("response", "").strip() except Exception as e: # Fallback to extractive on any error return extractive_summarize(text, sentences_count=3) async def compress_conversation( messages: List[Dict], max_tokens: int = 2000, keep_last_n: int = 3, strategy: str = "extractive", ollama_model: str = "phi3:mini", ollama_url: str = "http://localhost:11434" ) -> List[Dict]: """ Compress conversation history: - Keep last N exchanges in full - Summarize everything before using the configured strategy Args: messages: Full conversation history max_tokens: Target token budget keep_last_n: Number of recent exchanges to keep uncompressed strategy: "extractive", "ollama", or "none" ollama_model: Model to use if strategy is "ollama" ollama_url: Ollama API endpoint Returns compressed message list. """ if strategy == "none": return messages if len(messages) <= keep_last_n * 2: # *2 for user/assistant pairs return messages # Keep system message if present system_msg = None convo_messages = messages[:] if messages[0].get("role") == "system": system_msg = messages[0] convo_messages = messages[1:] # Split into old (to compress) and recent (keep full) recent = convo_messages[-keep_last_n * 2:] old = convo_messages[:-keep_last_n * 2] # Create text to summarize from old turns old_text = "\n".join([f"{m['role']}: {m['content']}" for m in old]) # Summarize using selected strategy summary = None if strategy == "ollama": try: summary = await ollama_summarize(old_text, ollama_model, ollama_url) except Exception as e: logger.warning(f"Ollama summarization failed: {e}, falling back to extractive") summary = extractive_summarize(old_text, sentences_count=3) else: # Extractive is synchronous but fast; run in thread pool to avoid blocking loop = asyncio.get_event_loop() summary = await loop.run_in_executor(None, lambda: extractive_summarize(old_text, 3)) # Build compressed messages compressed = [] if system_msg: compressed.append(system_msg) # Add summary as a user message with clear demarcation compressed.append({ "role": "user", "content": f"[CONVERSATION SUMMARY]\n{summary}\n[/CONVERSATION SUMMARY]\n\n---\n\nRecent conversation (most relevant):" }) compressed.extend(recent) # Verify we're under limit, if not, drop old more aggressively total_tokens = sum(count_tokens(m.get("content", "")) for m in compressed) if total_tokens > max_tokens and len(compressed) > 2: # Keep only system + last exchange if system_msg: compressed = [system_msg, recent[-2]] else: compressed = recent[-2:] return compressed