From 486acb5c1402b9340e76f2ca8c5a7e7c80e38e6b Mon Sep 17 00:00:00 2001 From: koko210Serve Date: Wed, 3 Jun 2026 22:50:03 +0300 Subject: [PATCH] Fix reply-context speaker confusion with structured metadata pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, when a user replied to Miku's message via Discord's reply feature, Miku's quoted words were embedded directly into the user's message text using the format: [Replying to your message: "Miku's words"] User's response This caused two problems: 1. The LLM had to parse "your message" to determine the quoted text was MIKU's words — fragile and frequently misattributed 2. When stored in episodic memory as [User]: ..., Miku's quoted words were permanently mislabeled under the user's speaker prefix Now reply context flows through as structured metadata: - bot/bot.py captures the replied-to text WITHOUT embedding it in the prompt - cat_client.py passes it as discord_reply_context in the WebSocket payload - discord_bridge.py injects it as agent_input['reply_context'] — a CLEARLY LABELED note: [The user is replying to what you (Miku) said — ...] - miku_personality.py + evil_miku_personality.py render it via a {reply_context} placeholder in the prompt suffix, between memory context and conversation history This keeps Miku's words as a separate context note, never mixed into the user's HumanMessage. Episodic memory only stores the user's actual words. The fallback path (when Cat is unavailable) also uses a cleaner format with explicit speaker labels.
--- bot/bot.py | 15 +++++++++++---- bot/utils/cat_client.py | 7 +++++++ cat-plugins/discord_bridge/discord_bridge.py | 18 +++++++++++++++++- .../evil_miku_personality.py | 2 ++ .../miku_personality/miku_personality.py | 2 ++ 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/bot/bot.py b/bot/bot.py index 4964b04..f15e827 100644 --- a/bot/bot.py +++ b/bot/bot.py @@ -284,8 +284,12 @@ async def on_message(message): prompt = text # No cleanup — keep it raw user_id = str(message.author.id) + reply_context = None # Will be passed as structured metadata to Cat pipeline - # If user is replying to a specific message, add context marker + # If user is replying to a specific message, capture the context + # WITHOUT embedding it in the prompt text (that caused speaker confusion). + # Instead, it's passed as structured metadata — the Cat plugin injects it + # into the prompt as a clearly labeled context note, preserving speaker boundaries. if message.reference: try: replied_msg = await message.channel.fetch_message(message.reference.message_id) @@ -293,8 +297,7 @@ async def on_message(message): if replied_msg.author == globals.client.user: # Truncate the replied message to keep prompt manageable replied_content = replied_msg.content[:200] + "..." 
if len(replied_msg.content) > 200 else replied_msg.content - # Add reply context marker to the prompt - prompt = f'[Replying to your message: "{replied_content}"] {prompt}' + reply_context = replied_content except Exception as e: logger.error(f"Failed to fetch replied message for context: {e}") @@ -364,6 +367,7 @@ async def on_message(message): author_name=author_name, mood=current_mood, response_type=response_type, + reply_context=reply_context, ) if cat_result: response, cat_full_prompt = cat_result @@ -395,8 +399,11 @@ async def on_message(message): # Fallback to direct LLM query if Cat didn't respond if not response: + fallback_prompt = prompt + if reply_context: + fallback_prompt = f'[Context: you (Miku) said: {reply_context}]\n[User says:] {prompt}' response = await query_llama( - prompt, + fallback_prompt, user_id=str(message.author.id), guild_id=guild_id, response_type=response_type, diff --git a/bot/utils/cat_client.py b/bot/utils/cat_client.py index 3cdbcc6..35ae136 100644 --- a/bot/utils/cat_client.py +++ b/bot/utils/cat_client.py @@ -109,6 +109,7 @@ class CatAdapter: mood: Optional[str] = None, response_type: str = "dm_response", media_type: Optional[str] = None, + reply_context: Optional[str] = None, ) -> Optional[tuple]: """ Send a message through the Cat pipeline via WebSocket and get a response. @@ -162,6 +163,12 @@ class CatAdapter: # Pass media type so discord_bridge can add MEDIA NOTE to the prompt if media_type: payload["discord_media_type"] = media_type + # Pass the message the user is replying to (if any) as structured metadata. + # The discord_bridge plugin injects this into the prompt as a clearly-labeled + # context note — keeping Miku's words separate from the user's message text + # and preventing the speaker confusion that the old embed-in-prompt format caused. 
+ if reply_context: + payload["discord_reply_context"] = reply_context # Pass current Discord activity if it changed recently (30-min decay window) activity_label = get_current_activity_fresh() if activity_label: diff --git a/cat-plugins/discord_bridge/discord_bridge.py b/cat-plugins/discord_bridge/discord_bridge.py index 62b43a9..dd49f28 100644 --- a/cat-plugins/discord_bridge/discord_bridge.py +++ b/cat-plugins/discord_bridge/discord_bridge.py @@ -44,6 +44,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: evil_mode = user_message_json.get('discord_evil_mode', False) media_type = user_message_json.get('discord_media_type', None) activity = user_message_json.get('discord_activity', None) + reply_context = user_message_json.get('discord_reply_context', None) # Also check working memory for backward compatibility if not guild_id: @@ -57,6 +58,7 @@ def before_cat_reads_message(user_message_json: dict, cat) -> dict: cat.working_memory['evil_mode'] = evil_mode cat.working_memory['media_type'] = media_type cat.working_memory['activity'] = activity + cat.working_memory['reply_context'] = reply_context return user_message_json @@ -375,7 +377,21 @@ Please respond in a way that reflects this emotional tone.""" print(f" [Discord Bridge] Error building system prefix: {e}") system_prefix = cat.working_memory.get('full_system_prefix', '[system prefix not available]') - full_prompt = f"{system_prefix}\n\n# Context\n\n{episodic_mem}\n\n{declarative_mem}\n\n{tools_output}\n\n# Conversation until now:\nHuman: {user_input}" + # Build reply context note if the user is replying to Miku's message. + # This injects Miku's quoted words as a SEPARATE clearly-labeled context note + # (not embedded in the user's message text). Keeps speaker boundaries intact + # and prevents the LLM from misattributing Miku's words to the user. 
+ # Uses a colon+space delimiter (no nested quotes) to avoid formatting issues + # when the replied message itself contains double-quote characters. + reply_context = cat.working_memory.get('reply_context') + if reply_context: + reply_context_note = f'[The user is replying to what you (Miku) said — you said: {reply_context}]' + agent_input['reply_context'] = reply_context_note + else: + reply_context_note = '' + agent_input['reply_context'] = '' + + full_prompt = f"{system_prefix}\n\n# Context\n\n{episodic_mem}\n\n{declarative_mem}\n\n{tools_output}\n\n{reply_context_note}\n\n# Conversation until now:\nHuman: {user_input}" cat.working_memory['last_full_prompt'] = full_prompt return agent_input diff --git a/cat-plugins/evil_miku_personality/evil_miku_personality.py b/cat-plugins/evil_miku_personality/evil_miku_personality.py index f9d6336..8ad07dd 100644 --- a/cat-plugins/evil_miku_personality/evil_miku_personality.py +++ b/cat-plugins/evil_miku_personality/evil_miku_personality.py @@ -119,6 +119,8 @@ def agent_prompt_suffix(suffix, cat): {{tools_output}} +{{reply_context}} + [Current mood: {mood_name.upper()} — respond accordingly] # Conversation until now: diff --git a/cat-plugins/miku_personality/miku_personality.py b/cat-plugins/miku_personality/miku_personality.py index 4cc42de..2913bfd 100644 --- a/cat-plugins/miku_personality/miku_personality.py +++ b/cat-plugins/miku_personality/miku_personality.py @@ -91,6 +91,8 @@ def agent_prompt_suffix(suffix, cat): {tools_output} +{reply_context} + # Conversation until now: (Note: In the conversation below, "Human" = the person you're talking to, "AI" = you, Miku. Pay attention to who said what.)"""