""" Prompt compression - summarizes conversation history to reduce tokens. Uses a small local model (no API calls) to compress old turns. """ from typing import List, Dict import tiktoken ENCODING = tiktoken.get_encoding("cl100k_base") def count_tokens(text: str) -> int: """Count tokens in text""" return len(ENCODING.encode(text)) def compress_conversation( messages: List[Dict], max_tokens: int = 2000, keep_last_n: int = 3 ) -> List[Dict]: """ Compress conversation history: - Keep last N exchanges in full - Summarize everything before into a single system message Returns compressed message list. """ if len(messages) <= keep_last_n * 2: # *2 for user/assistant pairs return messages # Keep system message if present system_msg = None convo_messages = messages[:] if messages[0].get("role") == "system": system_msg = messages[0] convo_messages = messages[1:] # Split into old (to compress) and recent (keep full) recent = convo_messages[-keep_last_n * 2:] old = convo_messages[:-keep_last_n * 2] # Summarize old conversation summary = _summarize_turns(old) # Build compressed messages compressed = [] if system_msg: compressed.append(system_msg) # Add summary as a user message with context compressed.append({ "role": "user", "content": f"[PREVIOUS CONVERSATION SUMMARY]\n{summary}\n[/PREVIOUS CONVERSATION SUMMARY]\n\n---\n\nConversation continues below:" }) compressed.extend(recent) # Verify we're under limit total_tokens = sum(count_tokens(m.get("content", "")) for m in compressed) if total_tokens > max_tokens: # Aggressive compression - keep only last exchange compressed = compressed[-2:] return compressed def _summarize_turns(messages: List[Dict]) -> str: """ Create a brief summary of conversation turns. In production, call a small local model here. For now, extract key decisions and topics. """ topics = [] decisions = [] for msg in messages: content = msg.get("content", "") # Extract topics from user messages if msg.get("role") == "user": # Simple keyword extraction (replace with LLM summary) if "docker" in content.lower(): topics.append("Docker configuration") if "server" in content.lower(): topics.append("Server setup") if "config" in content.lower(): topics.append("Configuration") # Extract decisions from assistant messages if msg.get("role") == "assistant": if "we decided" in content.lower() or "I'll use" in content.lower(): decisions.append(content[:200]) summary_parts = [] if topics: summary_parts.append(f"Topics discussed: {', '.join(set(topics))}") if decisions: summary_parts.append(f"Decisions made: {'; '.join(decisions[:3])}") return "\n".join(summary_parts) if summary_parts else "Previous conversation covered various topics." def truncate_tool_output(output: str, max_tokens: int = 200) -> str: """Truncate tool outputs to save tokens""" tokens = ENCODING.encode(output) if len(tokens) <= max_tokens: return output truncated = ENCODING.decode(tokens[:max_tokens]) return f"{truncated}... [truncated, {len(tokens) - max_tokens} tokens omitted]"