refactor: extract media processing from bot.py into image_handling.py (Phase D Step 19)

- Create process_media_in_message() in utils/image_handling.py that handles all 4 media
  types: image attachments, video/GIF attachments, Tenor GIF embeds, and rich embeds
- Deduplicate the send→log→bipolar tail pattern (repeated 5x) into a _send_log_bipolar() helper
- Unify rich/article/image/video/link embed handling to use rephrase_as_miku() instead of
  inline Cat→LLM routing, fixing a mood-resolution bug (the inline Cat query always passed
  globals.DM_MOOD, even for server messages)
- Add 'rich_embed' media_type to rephrase_as_miku() prefix switch
- Remove 3 inline 'import base64' statements from bot.py (base64 is already imported at module level in image_handling.py)
- bot.py: 986 → 623 lines (-363)
- image_handling.py: 559 → 881 lines (+322)
- All 170 tests pass (21 config/state + 149 route split)
This commit is contained in:
2026-04-15 12:19:37 +03:00
parent 979217e7cc
commit fc4674bb13
2 changed files with 339 additions and 379 deletions

View File

@@ -19,15 +19,7 @@ from utils.scheduled import (
send_monday_video
)
from utils.image_handling import (
download_and_encode_image,
download_and_encode_media,
extract_video_frames,
analyze_image_with_qwen,
analyze_video_with_vision,
rephrase_as_miku,
extract_tenor_gif_url,
convert_gif_to_mp4,
extract_embed_content
process_media_in_message,
)
from utils.core import (
is_miku_addressed,
@@ -266,344 +258,11 @@ async def on_message(message):
)
return
# If message has an image, video, or GIF attachment
if message.attachments:
for attachment in message.attachments:
# Handle images
if any(attachment.filename.lower().endswith(ext) for ext in [".jpg", ".jpeg", ".png", ".webp"]):
base64_img = await download_and_encode_image(attachment.url)
if not base64_img:
await message.channel.send("I couldn't load the image, sorry!")
return
# Analyze image (objective description)
qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
if not qwen_description or not qwen_description.strip():
await message.channel.send("I couldn't see that image clearly, sorry! Try sending it again.")
return
# For DMs, pass None as guild_id to use DM mood
guild_id = message.guild.id if message.guild else None
miku_reply = await rephrase_as_miku(
qwen_description,
prompt,
guild_id=guild_id,
user_id=str(message.author.id),
author_name=message.author.display_name,
media_type="image"
)
if is_dm:
logger.info(f"💌 DM image response to {message.author.display_name} (using DM mood: {globals.DM_MOOD})")
else:
logger.info(f"💬 Server image response to {message.author.display_name} in {message.guild.name} (using server mood)")
response_message = await message.channel.send(miku_reply)
# Log the bot's DM response
if is_dm:
dm_logger.log_user_message(message.author, response_message, is_bot_message=True)
# For server messages, check if opposite persona should interject
if not is_dm and globals.BIPOLAR_MODE:
try:
from utils.persona_dialogue import check_for_interjection
current_persona = "evil" if globals.EVIL_MODE else "miku"
create_tracked_task(check_for_interjection(response_message, current_persona), task_name="interjection_check")
except Exception as e:
logger.error(f"Error checking for persona interjection: {e}")
return
# Handle videos and GIFs
elif any(attachment.filename.lower().endswith(ext) for ext in [".gif", ".mp4", ".webm", ".mov"]):
# Determine media type
is_gif = attachment.filename.lower().endswith('.gif')
media_type = "gif" if is_gif else "video"
logger.debug(f"🎬 Processing {media_type}: {attachment.filename}")
# Download the media
media_bytes_b64 = await download_and_encode_media(attachment.url)
if not media_bytes_b64:
await message.channel.send(f"I couldn't load the {media_type}, sorry!")
return
# Decode back to bytes for frame extraction
import base64
media_bytes = base64.b64decode(media_bytes_b64)
# If it's a GIF, convert to MP4 for better processing
if is_gif:
logger.debug(f"🔄 Converting GIF to MP4 for processing...")
mp4_bytes = await convert_gif_to_mp4(media_bytes)
if mp4_bytes:
media_bytes = mp4_bytes
logger.info(f"✅ GIF converted to MP4")
else:
logger.warning(f"GIF conversion failed, trying direct processing")
# Extract frames
frames = await extract_video_frames(media_bytes, num_frames=6)
if not frames:
await message.channel.send(f"I couldn't extract frames from that {media_type}, sorry!")
return
logger.debug(f"📹 Extracted {len(frames)} frames from {attachment.filename}")
# Analyze the video/GIF with appropriate media type
video_description = await analyze_video_with_vision(frames, media_type=media_type, user_prompt=prompt)
if not video_description or not video_description.strip():
await message.channel.send(f"I couldn't analyze that {media_type} clearly, sorry! Try sending it again.")
return
# For DMs, pass None as guild_id to use DM mood
guild_id = message.guild.id if message.guild else None
miku_reply = await rephrase_as_miku(
video_description,
prompt,
guild_id=guild_id,
user_id=str(message.author.id),
author_name=message.author.display_name,
media_type=media_type
)
if is_dm:
logger.info(f"💌 DM {media_type} response to {message.author.display_name} (using DM mood: {globals.DM_MOOD})")
else:
logger.info(f"💬 Server video response to {message.author.display_name} in {message.guild.name} (using server mood)")
response_message = await message.channel.send(miku_reply)
# Log the bot's DM response
if is_dm:
dm_logger.log_user_message(message.author, response_message, is_bot_message=True)
# For server messages, check if opposite persona should interject
if not is_dm and globals.BIPOLAR_MODE:
try:
from utils.persona_dialogue import check_for_interjection
current_persona = "evil" if globals.EVIL_MODE else "miku"
create_tracked_task(check_for_interjection(response_message, current_persona), task_name="interjection_check")
except Exception as e:
logger.error(f"Error checking for persona interjection: {e}")
return
# Check for embeds (articles, images, videos, GIFs, etc.)
if message.embeds:
for embed in message.embeds:
# Handle Tenor GIF embeds specially (Discord uses these for /gif command)
if embed.type == 'gifv' and embed.url and 'tenor.com' in embed.url:
logger.info(f"🎭 Processing Tenor GIF from embed: {embed.url}")
# Extract the actual GIF URL from Tenor
gif_url = await extract_tenor_gif_url(embed.url)
if not gif_url:
# Try using the embed's video or image URL as fallback
if hasattr(embed, 'video') and embed.video:
gif_url = embed.video.url
elif hasattr(embed, 'thumbnail') and embed.thumbnail:
gif_url = embed.thumbnail.url
if not gif_url:
logger.warning(f"Could not extract GIF URL from Tenor embed")
continue
# Download the GIF
media_bytes_b64 = await download_and_encode_media(gif_url)
if not media_bytes_b64:
await message.channel.send("I couldn't load that Tenor GIF, sorry!")
return
# Decode to bytes
import base64
media_bytes = base64.b64decode(media_bytes_b64)
# Convert GIF to MP4
logger.debug(f"Converting Tenor GIF to MP4 for processing...")
mp4_bytes = await convert_gif_to_mp4(media_bytes)
if not mp4_bytes:
logger.warning(f"GIF conversion failed, trying direct frame extraction")
mp4_bytes = media_bytes
else:
logger.debug(f"Tenor GIF converted to MP4")
# Extract frames
frames = await extract_video_frames(mp4_bytes, num_frames=6)
if not frames:
await message.channel.send("I couldn't extract frames from that GIF, sorry!")
return
logger.info(f"📹 Extracted {len(frames)} frames from Tenor GIF")
# Analyze the GIF with tenor_gif media type
video_description = await analyze_video_with_vision(frames, media_type="tenor_gif", user_prompt=prompt)
if not video_description or not video_description.strip():
await message.channel.send("I couldn't analyze that GIF clearly, sorry! Try sending it again.")
return
guild_id = message.guild.id if message.guild else None
miku_reply = await rephrase_as_miku(
video_description,
prompt,
guild_id=guild_id,
user_id=str(message.author.id),
author_name=message.author.display_name,
media_type="tenor_gif"
)
if is_dm:
logger.info(f"💌 DM Tenor GIF response to {message.author.display_name} (using DM mood: {globals.DM_MOOD})")
else:
logger.info(f"💬 Server Tenor GIF response to {message.author.display_name} in {message.guild.name} (using server mood)")
response_message = await message.channel.send(miku_reply)
# Log the bot's DM response
if is_dm:
dm_logger.log_user_message(message.author, response_message, is_bot_message=True)
# For server messages, check if opposite persona should interject
if not is_dm and globals.BIPOLAR_MODE:
try:
from utils.persona_dialogue import check_for_interjection
current_persona = "evil" if globals.EVIL_MODE else "miku"
create_tracked_task(check_for_interjection(response_message, current_persona), task_name="interjection_check")
except Exception as e:
logger.error(f"Error checking for persona interjection: {e}")
return
# Handle other types of embeds (rich, article, image, video, link)
elif embed.type in ['rich', 'article', 'image', 'video', 'link']:
logger.error(f"Processing {embed.type} embed")
# Extract content from embed
embed_content = await extract_embed_content(embed)
if not embed_content['has_content']:
logger.warning(f"Embed has no extractable content, skipping")
continue
# Build context string with embed text
embed_context_parts = []
if embed_content['text']:
embed_context_parts.append(f"[Embedded content: {embed_content['text'][:500]}{'...' if len(embed_content['text']) > 500 else ''}]")
# Process images from embed
if embed_content['images']:
for img_url in embed_content['images']:
logger.error(f"Processing image from embed: {img_url}")
try:
base64_img = await download_and_encode_image(img_url)
if base64_img:
logger.info(f"Image downloaded, analyzing with vision model...")
# Analyze image
qwen_description = await analyze_image_with_qwen(base64_img, user_prompt=prompt)
truncated = (qwen_description[:50] + "...") if len(qwen_description) > 50 else qwen_description
logger.error(f"Vision analysis result: {truncated}")
if qwen_description and qwen_description.strip():
embed_context_parts.append(f"[Embedded image shows: {qwen_description}]")
else:
logger.error(f"Failed to download image from embed")
except Exception as e:
logger.error(f"Error processing embedded image: {e}")
import traceback
traceback.print_exc()
# Process videos from embed
if embed_content['videos']:
for video_url in embed_content['videos']:
logger.info(f"🎬 Processing video from embed: {video_url}")
try:
media_bytes_b64 = await download_and_encode_media(video_url)
if media_bytes_b64:
import base64
media_bytes = base64.b64decode(media_bytes_b64)
frames = await extract_video_frames(media_bytes, num_frames=6)
if frames:
logger.info(f"📹 Extracted {len(frames)} frames, analyzing with vision model...")
video_description = await analyze_video_with_vision(frames, media_type="video", user_prompt=prompt)
logger.info(f"Video analysis result: {video_description[:100]}...")
if video_description and video_description.strip():
embed_context_parts.append(f"[Embedded video shows: {video_description}]")
else:
logger.error(f"Failed to extract frames from video")
else:
logger.error(f"Failed to download video from embed")
except Exception as e:
logger.error(f"Error processing embedded video: {e}")
import traceback
traceback.print_exc()
# Combine embed context with user prompt
if embed_context_parts:
full_context = '\n'.join(embed_context_parts)
enhanced_prompt = f"{full_context}\n\nUser message: {prompt}" if prompt else full_context
# Get Miku's response
guild_id = message.guild.id if message.guild else None
response_type = "dm_response" if is_dm else "server_response"
author_name = message.author.display_name
# Phase 3: Try Cat pipeline first for embed responses too
response = None
if globals.USE_CHESHIRE_CAT:
try:
from utils.cat_client import cat_adapter
cat_result = await cat_adapter.query(
text=enhanced_prompt,
user_id=str(message.author.id),
guild_id=str(guild_id) if guild_id else None,
author_name=author_name,
mood=globals.DM_MOOD,
response_type=response_type,
)
if cat_result:
response, cat_full_prompt = cat_result
logger.info(f"🐱 Cat embed response for {author_name}")
import datetime
globals.LAST_CAT_INTERACTION = {
"full_prompt": cat_full_prompt,
"response": response[:500] if response else "",
"user": author_name,
"mood": globals.DM_MOOD,
"timestamp": datetime.datetime.now().isoformat(),
}
except Exception as e:
logger.warning(f"🐱 Cat embed error, fallback: {e}")
response = None
if not response:
response = await query_llama(
enhanced_prompt,
user_id=str(message.author.id),
guild_id=guild_id,
response_type=response_type,
author_name=author_name
)
if is_dm:
logger.info(f"💌 DM embed response to {message.author.display_name} (using DM mood: {globals.DM_MOOD})")
else:
logger.info(f"💬 Server embed response to {message.author.display_name} in {message.guild.name}")
response_message = await message.channel.send(response)
# Log the bot's DM response
if is_dm:
dm_logger.log_user_message(message.author, response_message, is_bot_message=True)
# For server messages, check if opposite persona should interject
if not is_dm and globals.BIPOLAR_MODE:
try:
from utils.persona_dialogue import check_for_interjection
current_persona = "evil" if globals.EVIL_MODE else "miku"
create_tracked_task(check_for_interjection(response_message, current_persona), task_name="interjection_check")
except Exception as e:
logger.error(f"Error checking for persona interjection: {e}")
return
# Dispatch media processing (images, videos, GIFs, embeds)
# to utils/image_handling.process_media_in_message()
guild_id = message.guild.id if message.guild else None
if await process_media_in_message(message, prompt, is_dm, guild_id):
return
# Check if this is an image generation request
from utils.image_generation import detect_image_request, handle_image_generation_request
@@ -686,30 +345,8 @@ async def on_message(message):
author_name=author_name
)
if is_dm:
logger.info(f"💌 DM response to {message.author.display_name} (using DM mood: {globals.DM_MOOD})")
else:
logger.info(f"💬 Server response to {message.author.display_name} in {message.guild.name} (using server mood)")
response_message = await message.channel.send(response)
# Log the bot's DM response
if is_dm:
dm_logger.log_user_message(message.author, response_message, is_bot_message=True)
# For server messages, check if opposite persona should interject (persona dialogue system)
if not is_dm and globals.BIPOLAR_MODE:
logger.debug(f"Attempting to check for interjection (is_dm={is_dm}, BIPOLAR_MODE={globals.BIPOLAR_MODE})")
try:
from utils.persona_dialogue import check_for_interjection
current_persona = "evil" if globals.EVIL_MODE else "miku"
logger.debug(f"Creating interjection check task for persona: {current_persona}")
# Pass the bot's response message for analysis
create_tracked_task(check_for_interjection(response_message, current_persona), task_name="interjection_check")
except Exception as e:
logger.error(f"Error checking for persona interjection: {e}")
import traceback
traceback.print_exc()
from utils.image_handling import _send_log_bipolar
response_message = await _send_log_bipolar(message, response, is_dm)
# For server messages, do server-specific mood detection
if not is_dm and message.guild: