- semantic_cache.py: Semantic similarity matching for cache hits - rag.py: RAG-based context selection with local embeddings - compression.py: Conversation history summarization - New endpoints: /cache/semantic-lookup, /cache/semantic-store, /context/rag, /compress - Uses sentence-transformers (all-MiniLM-L6-v2) - no external API calls - No vector DB needed - cosine similarity on small datasets is fast enough - Expected savings: 50-70% token reduction
112 lines
3.5 KiB
Python
112 lines
3.5 KiB
Python
"""
|
|
Prompt compression - summarizes conversation history to reduce tokens.
|
|
Uses a small local model (no API calls) to compress old turns.
|
|
"""
|
|
|
|
from typing import List, Dict
|
|
import tiktoken
|
|
|
|
ENCODING = tiktoken.get_encoding("cl100k_base")
|
|
|
|
|
|
def count_tokens(text: str) -> int:
|
|
"""Count tokens in text"""
|
|
return len(ENCODING.encode(text))
|
|
|
|
|
|
def compress_conversation(
|
|
messages: List[Dict],
|
|
max_tokens: int = 2000,
|
|
keep_last_n: int = 3
|
|
) -> List[Dict]:
|
|
"""
|
|
Compress conversation history:
|
|
- Keep last N exchanges in full
|
|
- Summarize everything before into a single system message
|
|
|
|
Returns compressed message list.
|
|
"""
|
|
if len(messages) <= keep_last_n * 2: # *2 for user/assistant pairs
|
|
return messages
|
|
|
|
# Keep system message if present
|
|
system_msg = None
|
|
convo_messages = messages[:]
|
|
|
|
if messages[0].get("role") == "system":
|
|
system_msg = messages[0]
|
|
convo_messages = messages[1:]
|
|
|
|
# Split into old (to compress) and recent (keep full)
|
|
recent = convo_messages[-keep_last_n * 2:]
|
|
old = convo_messages[:-keep_last_n * 2]
|
|
|
|
# Summarize old conversation
|
|
summary = _summarize_turns(old)
|
|
|
|
# Build compressed messages
|
|
compressed = []
|
|
if system_msg:
|
|
compressed.append(system_msg)
|
|
|
|
# Add summary as a user message with context
|
|
compressed.append({
|
|
"role": "user",
|
|
"content": f"[PREVIOUS CONVERSATION SUMMARY]\n{summary}\n[/PREVIOUS CONVERSATION SUMMARY]\n\n---\n\nConversation continues below:"
|
|
})
|
|
|
|
compressed.extend(recent)
|
|
|
|
# Verify we're under limit
|
|
total_tokens = sum(count_tokens(m.get("content", "")) for m in compressed)
|
|
if total_tokens > max_tokens:
|
|
# Aggressive compression - keep only last exchange
|
|
compressed = compressed[-2:]
|
|
|
|
return compressed
|
|
|
|
|
|
def _summarize_turns(messages: List[Dict]) -> str:
|
|
"""
|
|
Create a brief summary of conversation turns.
|
|
In production, call a small local model here.
|
|
For now, extract key decisions and topics.
|
|
"""
|
|
topics = []
|
|
decisions = []
|
|
|
|
for msg in messages:
|
|
content = msg.get("content", "")
|
|
|
|
# Extract topics from user messages
|
|
if msg.get("role") == "user":
|
|
# Simple keyword extraction (replace with LLM summary)
|
|
if "docker" in content.lower():
|
|
topics.append("Docker configuration")
|
|
if "server" in content.lower():
|
|
topics.append("Server setup")
|
|
if "config" in content.lower():
|
|
topics.append("Configuration")
|
|
|
|
# Extract decisions from assistant messages
|
|
if msg.get("role") == "assistant":
|
|
if "we decided" in content.lower() or "I'll use" in content.lower():
|
|
decisions.append(content[:200])
|
|
|
|
summary_parts = []
|
|
if topics:
|
|
summary_parts.append(f"Topics discussed: {', '.join(set(topics))}")
|
|
if decisions:
|
|
summary_parts.append(f"Decisions made: {'; '.join(decisions[:3])}")
|
|
|
|
return "\n".join(summary_parts) if summary_parts else "Previous conversation covered various topics."
|
|
|
|
|
|
def truncate_tool_output(output: str, max_tokens: int = 200) -> str:
|
|
"""Truncate tool outputs to save tokens"""
|
|
tokens = ENCODING.encode(output)
|
|
if len(tokens) <= max_tokens:
|
|
return output
|
|
|
|
truncated = ENCODING.decode(tokens[:max_tokens])
|
|
return f"{truncated}... [truncated, {len(tokens) - max_tokens} tokens omitted]"
|