feat(memory): hybrid trivial-message classifier (heuristics + LLM batch)
Step 3 of memory system overhaul: smart junk detection.

Replaces the old 37-pattern frozenset (44% accuracy) with a 3-tier hybrid:

TIER 1 - DEFINITELY_TRIVIAL (instant delete, no LLM):
  50+ exact-match patterns, pure emoji, single char, punctuation-only

TIER 2 - DEFINITELY_IMPORTANT (instant keep, no LLM):
  8+ words, questions with substance, first-person statements, numbers/dates, links, mentions

TIER 3 - BORDERLINE (batched → LLM for economical classification):
  2-7 word messages without clear markers
  Compact prompt: ~150-200 tokens per 20-message batch
  Safety default: KEEP on any parsing error

Real-time filtering (discord_bridge) uses conservative heuristics only:
- 1-char, pure reactions, single emoji, custom emoji-only
- 50+ single-word fillers
- Never deletes multi-word messages in real-time
- Philosophy: false negatives (junk stored) are preferable to false positives (data lost)

Consolidation gets the full hybrid pipeline, with the LLM handling borderline cases. This achieves much better accuracy than the old 44% while keeping token costs minimal (the LLM is only called during nightly consolidation, not in real-time chat).
This commit is contained in:
@@ -64,24 +64,52 @@ def before_cat_stores_episodic_memory(doc, cat):
|
||||
"""
|
||||
Filter and enrich memories before storage.
|
||||
|
||||
Phase 1: Minimal filtering
|
||||
- Skip only obvious junk (1-2 char messages, pure reactions)
|
||||
- Store everything else temporarily
|
||||
- Mark as unconsolidated for nightly processing
|
||||
Phase 2: Enhanced heuristic filtering (real-time only, no LLM calls)
|
||||
- Skip obvious junk (empty or 1-char messages, pure reactions, single-word fillers, emoji-only messages)
|
||||
- Conservative: when in doubt, KEEP. False negatives are better than lost data.
|
||||
- Deeper classification happens during nightly consolidation.
|
||||
"""
|
||||
message = doc.page_content.strip()
|
||||
msg_lower = message.lower()
|
||||
msg_len = len(msg_lower)
|
||||
word_count = len(msg_lower.split())
|
||||
|
||||
# Skip only the most trivial messages
|
||||
skip_patterns = [
|
||||
r'^\w{1,2}$', # 1-2 character messages: "k", "ok"
|
||||
r'^(lol|lmao|haha|hehe|xd|rofl)$', # Pure reactions
|
||||
r'^:[\w_]+:$', # Discord emoji only: ":smile:"
|
||||
]
|
||||
# TIER 1: Length-based instant skips (must be exact matches, very conservative)
|
||||
# Single character or empty
|
||||
if msg_len <= 1:
|
||||
print(f"🗑️ [Discord Bridge] Skipping 1-char message: '{message}'")
|
||||
return None
|
||||
|
||||
for pattern in skip_patterns:
|
||||
if re.match(pattern, message.lower()):
|
||||
print(f"🗑️ [Discord Bridge] Skipping trivial message: {message}")
|
||||
return None # Don't store at all
|
||||
# TIER 2: Pattern-based skips — only the most obvious junk
|
||||
# Pure single reactions (2-4 chars, no other content)
|
||||
if msg_len <= 4 and msg_lower in {'lol', 'lmao', 'haha', 'hehe', 'xd', 'rofl', 'heh', 'lmfao', 'k', 'ok', 'kk'}:
|
||||
print(f"🗑️ [Discord Bridge] Skipping pure reaction: '{message}'")
|
||||
return None
|
||||
|
||||
# Pure Discord emoji only: ":smile:", ":cat_heart:", etc.
|
||||
if re.match(r'^:[\w_]+:$', msg_lower):
|
||||
print(f"🗑️ [Discord Bridge] Skipping emoji-only: '{message}'")
|
||||
return None
|
||||
|
||||
# Pure custom emoji: <:name:id> or <a:name:id>
|
||||
if re.match(r'^<a?:[\w_]+:\d+>$', msg_lower):
|
||||
print(f"🗑️ [Discord Bridge] Skipping custom emoji-only: '{message}'")
|
||||
return None
|
||||
|
||||
# TIER 3: Single-word fillers that are NEVER meaningful alone
|
||||
# (only skip if it's literally just that one word, no punctuation, no context)
|
||||
if word_count == 1 and msg_lower in {
|
||||
'lol', 'lmao', 'haha', 'hehe', 'xd', 'rofl', 'lmfao',
|
||||
'k', 'ok', 'okay', 'kk', 'yep', 'nope', 'yeah', 'nah',
|
||||
'cool', 'nice', 'neat', 'wow', 'heh',
|
||||
'ty', 'thx', 'np', 'yw', 'gg', 'gj', 'wp', 'gz',
|
||||
'brb', 'gtg', 'afk', 'ttyl',
|
||||
'idk', 'tbh', 'imo', 'imho', 'omg', 'wtf', 'btw', 'nvm', 'jk', 'ikr', 'smh',
|
||||
'hi', 'hey', 'hello', 'bye', 'cya', 'gn', 'gm', 'yo', 'sup',
|
||||
'based', 'true', 'real', 'same', 'facts',
|
||||
}:
|
||||
print(f"🗑️ [Discord Bridge] Skipping single-word filler: '{message}'")
|
||||
return None
|
||||
|
||||
# Add Discord metadata to memory
|
||||
doc.metadata['consolidated'] = False # Needs nightly processing
|
||||
|
||||
Reference in New Issue
Block a user