feat(memory): hybrid trivial-message classifier (heuristics + LLM batch)

Step 3 of memory system overhaul: smart junk detection.

Replaces the old 37-pattern frozenset (44% accuracy) with a 3-tier hybrid:

- TIER 1 - DEFINITELY_TRIVIAL (instant delete, no LLM): 50+ exact-match patterns, pure emoji, single char, punctuation-only
- TIER 2 - DEFINITELY_IMPORTANT (instant keep, no LLM): 8+ words, question with substance, first-person statements, numbers/dates, links, mentions
- TIER 3 - BORDERLINE (batch → LLM for economical classification): 2-7 word messages without clear markers
  - Compact prompt: ~150-200 tokens per 20-message batch
  - Safety default: KEEP on any parsing error

Real-time filtering (discord_bridge) uses conservative heuristics only:

- 1-char, pure reactions, single emoji, custom emoji-only
- 50+ single-word fillers
- Never deletes multi-word messages in real-time
- Philosophy: false negatives (junk stored) are preferable to false positives (data lost)

Consolidation gets the full hybrid pipeline with LLM for borderline cases, achieving much better accuracy than the old 44% while keeping token costs minimal (the LLM is only called during nightly consolidation, not during real-time chat).
2026-05-15 14:07:35 +03:00
parent cb4be35f13
commit 5a740c9334
2 changed files with 263 additions and 29 deletions
--- a/cat-plugins/discord_bridge/discord_bridge.py
+++ b/cat-plugins/discord_bridge/discord_bridge.py
@@ -64,24 +64,52 @@ def before_cat_stores_episodic_memory(doc, cat):
    """
    Filter and enrich memories before storage.
    
-    Phase 1: Minimal filtering
-    - Skip only obvious junk (1-2 char messages, pure reactions)
-    - Store everything else temporarily
-    - Mark as unconsolidated for nightly processing
+    Phase 2: Enhanced heuristic filtering (real-time only, no LLM calls)
+    - Skip obvious junk (empty or 1-char messages, pure reactions, single-word fillers, emoji-only)
+    - Conservative: when in doubt, KEEP. False negatives are better than lost data.
+    - Deeper classification happens during nightly consolidation.
    """
    message = doc.page_content.strip()
+    msg_lower = message.lower()
+    msg_len = len(msg_lower)
+    word_count = len(msg_lower.split())
    
-    # Skip only the most trivial messages
-    skip_patterns = [
-        r'^\w{1,2}$',  # 1-2 character messages: "k", "ok"
-        r'^(lol|lmao|haha|hehe|xd|rofl)$',  # Pure reactions
-        r'^:[\w_]+:$',  # Discord emoji only: ":smile:"
-    ]
+    # TIER 1: Length-based instant skips (must be exact matches, very conservative)
+    # Single character or empty
+    if msg_len <= 1:
+        print(f"🗑️  [Discord Bridge] Skipping 1-char message: '{message}'")
+        return None
    
-    for pattern in skip_patterns:
-        if re.match(pattern, message.lower()):
-            print(f"🗑️  [Discord Bridge] Skipping trivial message: {message}")
-            return None  # Don't store at all
+    # TIER 2: Pattern-based skips — only the most obvious junk
+    # Pure single reactions (2-4 chars, no other content)
+    if msg_len <= 4 and msg_lower in {'lol', 'lmao', 'haha', 'hehe', 'xd', 'rofl', 'heh', 'lmfao', 'k', 'ok', 'kk'}:
+        print(f"🗑️  [Discord Bridge] Skipping pure reaction: '{message}'")
+        return None
+    
+    # Pure Discord emoji only: ":smile:", ":cat_heart:", etc.
+    if re.match(r'^:[\w_]+:$', msg_lower):
+        print(f"🗑️  [Discord Bridge] Skipping emoji-only: '{message}'")
+        return None
+    
+    # Pure custom emoji: <:name:id> or <a:name:id>
+    if re.match(r'^<a?:[\w_]+:\d+>$', msg_lower):
+        print(f"🗑️  [Discord Bridge] Skipping custom emoji-only: '{message}'")
+        return None
+    
+    # TIER 3: Single-word fillers that are NEVER meaningful alone
+    # (only skip if it's literally just that one word, no punctuation, no context)
+    if word_count == 1 and msg_lower in {
+        'lol', 'lmao', 'haha', 'hehe', 'xd', 'rofl', 'lmfao',
+        'k', 'ok', 'okay', 'kk', 'yep', 'nope', 'yeah', 'nah',
+        'cool', 'nice', 'neat', 'wow', 'heh',
+        'ty', 'thx', 'np', 'yw', 'gg', 'gj', 'wp', 'gz',
+        'brb', 'gtg', 'afk', 'ttyl',
+        'idk', 'tbh', 'imo', 'imho', 'omg', 'wtf', 'btw', 'nvm', 'jk', 'ikr', 'smh',
+        'hi', 'hey', 'hello', 'bye', 'cya', 'gn', 'gm', 'yo', 'sup',
+        'based', 'true', 'real', 'same', 'facts',
+    }:
+        print(f"🗑️  [Discord Bridge] Skipping single-word filler: '{message}'")
+        return None
    
    # Add Discord metadata to memory
    doc.metadata['consolidated'] = False  # Needs nightly processing