refactor: extract media processing from bot.py into image_handling.py (Phase D Step 19)

- Create process_media_in_message() in utils/image_handling.py that handles all 4 media
  types: image attachments, video/GIF attachments, Tenor GIF embeds, and rich embeds
- DRY the send→log→bipolar tail pattern (5x repeated) into _send_log_bipolar() helper
- Unify rich/article/link embed handling to use rephrase_as_miku() instead of inline
  Cat→LLM routing, fixing a mood-resolution bug (was using globals.DM_MOOD for servers)
- Add 'rich_embed' media_type to rephrase_as_miku() prefix switch
- Remove 3 inline 'import base64' from bot.py (already module-level in image_handling.py)
- bot.py: 986 → 623 lines (-363)
- image_handling.py: 559 → 881 lines (+322)
- All 170 tests pass (21 config/state + 149 route split)
This commit is contained in:
2026-04-15 12:19:37 +03:00
parent 979217e7cc
commit fc4674bb13
2 changed files with 339 additions and 379 deletions

View File

@@ -418,14 +418,13 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
# Format the user's message to include vision context with media type
# This will be saved to history automatically by query_llama
if media_type == "gif":
media_prefix = "Looking at a GIF"
elif media_type == "tenor_gif":
media_prefix = "Looking at a Tenor GIF"
elif media_type == "video":
media_prefix = "Looking at a video"
else: # image
media_prefix = "Looking at an image"
_MEDIA_PREFIXES = {
"gif": "Looking at a GIF",
"tenor_gif": "Looking at a Tenor GIF",
"video": "Looking at a video",
"rich_embed": "Looking at embedded content",
}
media_prefix = _MEDIA_PREFIXES.get(media_type, "Looking at an image")
if user_prompt:
# Include media type, vision description, and user's text
@@ -503,6 +502,330 @@ async def rephrase_as_miku(vision_output, user_prompt, guild_id=None, user_id=No
analyze_image_with_qwen = analyze_image_with_vision
# ---------------------------------------------------------------------------
# Shared tail helper — send response, log DM, check bipolar interjection
# ---------------------------------------------------------------------------
async def _send_log_bipolar(message, reply_text, is_dm, *, media_label=""):
    """
    Deliver *reply_text* and run the post-send bookkeeping.

    This is the tail shared by the media handlers (and, per the original
    note, the text-fallback path in bot.py).  In order, it:

    1. logs which kind of response (DM or server) is about to be sent,
    2. sends *reply_text* to ``message.channel``,
    3. for DMs, records the sent message through ``dm_logger``,
    4. for server messages while ``globals.BIPOLAR_MODE`` is truthy,
       schedules ``check_for_interjection`` as a tracked task.

    Args:
        message: The message being answered; its ``author``, ``guild`` and
            ``channel`` attributes are read.
        reply_text: Text passed to ``message.channel.send``.
        is_dm: Truthy when the conversation is a direct message.
        media_label: Optional word inserted into the log line
            (e.g. ``"image"``); omitted from the log when empty.

    Returns:
        Whatever ``message.channel.send`` returned (the sent message).
    """
    # Imported lazily, as in the original implementation.
    from utils.dm_logger import dm_logger
    from utils.task_tracker import create_tracked_task

    suffix = "" if not media_label else f" {media_label}"

    if not is_dm:
        where = "unknown" if not message.guild else message.guild.name
        logger.info(
            f"💬 Server{suffix} response to {message.author.display_name} "
            f"in {where} (using server mood)"
        )
    else:
        logger.info(
            f"💌 DM{suffix} response to {message.author.display_name} "
            f"(using DM mood: {globals.DM_MOOD})"
        )

    sent = await message.channel.send(reply_text)

    if is_dm:
        # DM path: record the bot's own reply in the DM ledger and stop here;
        # the interjection check below only applies to server messages.
        dm_logger.log_user_message(message.author, sent, is_bot_message=True)
        return sent

    if not globals.BIPOLAR_MODE:
        return sent

    # Server path with bipolar mode on: fire-and-forget interjection check.
    # Failures here are logged and never prevent returning the sent message.
    try:
        from utils.persona_dialogue import check_for_interjection

        persona = "miku" if not globals.EVIL_MODE else "evil"
        create_tracked_task(
            check_for_interjection(sent, persona),
            task_name="interjection_check",
        )
    except Exception as e:
        logger.error(f"Error checking for persona interjection: {e}")

    return sent
# ---------------------------------------------------------------------------
# High-level media dispatcher — called from bot.py on_message()
# ---------------------------------------------------------------------------
async def process_media_in_message(message, prompt, is_dm, guild_id) -> bool:
    """
    Inspect *message* for media and answer the first item that can be handled.

    Attachments are examined first (in order), then embeds (in order):

    1. image attachments (``.jpg``, ``.jpeg``, ``.png``, ``.webp``)
    2. video / GIF attachments (``.gif``, ``.mp4``, ``.webm``, ``.mov``)
    3. Tenor ``gifv`` embeds
    4. ``rich`` / ``article`` / ``image`` / ``video`` / ``link`` embeds

    Args:
        message: Incoming message; its ``attachments``, ``embeds``,
            ``channel`` and ``author`` attributes are read.
        prompt: The user's text, forwarded to the vision helpers and to
            ``rephrase_as_miku``.
        is_dm: Truthy for direct messages; forwarded to ``_send_log_bipolar``.
        guild_id: Forwarded to ``rephrase_as_miku``.

    Returns:
        ``True`` when the message was consumed here: either a reply was sent,
        or an apology was sent because a download / frame extraction /
        analysis step failed.  ``False`` when nothing was handled (no
        supported media, or embeds that yielded no usable content), so the
        caller can fall through to text-only handling.
    """
    image_exts = (".jpg", ".jpeg", ".png", ".webp")
    video_exts = (".gif", ".mp4", ".webm", ".mov")

    # ---- Attachments: the first supported one wins ------------------------
    for attachment in message.attachments or ():
        lower = attachment.filename.lower()
        if lower.endswith(image_exts):
            await _handle_image_attachment(message, attachment, prompt, is_dm, guild_id)
            return True
        if lower.endswith(video_exts):
            await _handle_video_attachment(message, attachment, prompt, is_dm, guild_id)
            return True

    # ---- Embeds: keep scanning until one actually produces a response -----
    for embed in message.embeds or ():
        if embed.type == "gifv" and embed.url and "tenor.com" in embed.url:
            handled = await _handle_tenor_embed(message, embed, prompt, is_dm, guild_id)
        elif embed.type in ("rich", "article", "image", "video", "link"):
            handled = await _handle_rich_embed(message, embed, prompt, is_dm, guild_id)
        else:
            continue
        if handled:
            return True

    return False


async def _rephrase_and_send(
    message, description, prompt, is_dm, guild_id, *, media_type, media_label
):
    """Rephrase *description* via ``rephrase_as_miku`` and send it with the shared tail."""
    miku_reply = await rephrase_as_miku(
        description, prompt,
        guild_id=guild_id,
        user_id=str(message.author.id),
        author_name=message.author.display_name,
        media_type=media_type,
    )
    await _send_log_bipolar(message, miku_reply, is_dm, media_label=media_label)


async def _handle_image_attachment(message, attachment, prompt, is_dm, guild_id):
    """Download, analyze and answer a single image attachment (or apologize)."""
    base64_img = await download_and_encode_image(attachment.url)
    if not base64_img:
        await message.channel.send("I couldn't load the image, sorry!")
        return

    description = await analyze_image_with_vision(base64_img, user_prompt=prompt)
    if not description or not description.strip():
        await message.channel.send(
            "I couldn't see that image clearly, sorry! Try sending it again."
        )
        return

    await _rephrase_and_send(
        message, description, prompt, is_dm, guild_id,
        media_type="image", media_label="image",
    )


async def _handle_video_attachment(message, attachment, prompt, is_dm, guild_id):
    """Download, frame-sample, analyze and answer a video/GIF attachment (or apologize)."""
    is_gif = attachment.filename.lower().endswith(".gif")
    media_type = "gif" if is_gif else "video"
    logger.debug(f"🎬 Processing {media_type}: {attachment.filename}")

    media_bytes_b64 = await download_and_encode_media(attachment.url)
    if not media_bytes_b64:
        await message.channel.send(f"I couldn't load the {media_type}, sorry!")
        return
    media_bytes = base64.b64decode(media_bytes_b64)

    if is_gif:
        logger.debug("🔄 Converting GIF to MP4 for processing...")
        mp4_bytes = await convert_gif_to_mp4(media_bytes)
        if mp4_bytes:
            media_bytes = mp4_bytes
            logger.info("✅ GIF converted to MP4")
        else:
            # Best effort: hand the raw GIF bytes to the frame extractor.
            logger.warning("GIF conversion failed, trying direct processing")

    frames = await extract_video_frames(media_bytes, num_frames=6)
    if not frames:
        await message.channel.send(
            f"I couldn't extract frames from that {media_type}, sorry!"
        )
        return
    logger.debug(f"📹 Extracted {len(frames)} frames from {attachment.filename}")

    description = await analyze_video_with_vision(
        frames, media_type=media_type, user_prompt=prompt,
    )
    if not description or not description.strip():
        await message.channel.send(
            f"I couldn't analyze that {media_type} clearly, sorry! "
            "Try sending it again."
        )
        return

    await _rephrase_and_send(
        message, description, prompt, is_dm, guild_id,
        media_type=media_type, media_label=media_type,
    )


async def _handle_tenor_embed(message, embed, prompt, is_dm, guild_id) -> bool:
    """
    Process a Tenor ``gifv`` embed.

    Returns ``False`` only when no GIF URL could be found (the caller then
    moves on to the next embed); ``True`` once a reply or an apology was sent.
    """
    logger.info(f"🎭 Processing Tenor GIF from embed: {embed.url}")

    gif_url = await extract_tenor_gif_url(embed.url)
    if not gif_url:
        # Fall back to whatever media URL the embed itself carries.
        if hasattr(embed, "video") and embed.video:
            gif_url = embed.video.url
        elif hasattr(embed, "thumbnail") and embed.thumbnail:
            gif_url = embed.thumbnail.url
    if not gif_url:
        logger.warning("Could not extract GIF URL from Tenor embed")
        return False

    media_bytes_b64 = await download_and_encode_media(gif_url)
    if not media_bytes_b64:
        await message.channel.send("I couldn't load that Tenor GIF, sorry!")
        return True
    media_bytes = base64.b64decode(media_bytes_b64)

    logger.debug("Converting Tenor GIF to MP4 for processing...")
    mp4_bytes = await convert_gif_to_mp4(media_bytes)
    if not mp4_bytes:
        logger.warning("GIF conversion failed, trying direct frame extraction")
        mp4_bytes = media_bytes
    else:
        logger.debug("Tenor GIF converted to MP4")

    frames = await extract_video_frames(mp4_bytes, num_frames=6)
    if not frames:
        await message.channel.send("I couldn't extract frames from that GIF, sorry!")
        return True
    logger.info(f"📹 Extracted {len(frames)} frames from Tenor GIF")

    description = await analyze_video_with_vision(
        frames, media_type="tenor_gif", user_prompt=prompt,
    )
    if not description or not description.strip():
        await message.channel.send(
            "I couldn't analyze that GIF clearly, sorry! "
            "Try sending it again."
        )
        return True

    await _rephrase_and_send(
        message, description, prompt, is_dm, guild_id,
        media_type="tenor_gif", media_label="Tenor GIF",
    )
    return True


async def _describe_embed_images(image_urls, prompt) -> list:
    """Return ``[Embedded image shows: ...]`` snippets for each analyzable image URL."""
    parts = []
    for img_url in image_urls:
        logger.info(f"Processing image from embed: {img_url}")
        # Best effort per image: one failure must not abort the whole embed.
        try:
            base64_img = await download_and_encode_image(img_url)
            if not base64_img:
                logger.error("Failed to download image from embed")
                continue
            logger.info("Image downloaded, analyzing with vision model...")
            description = await analyze_image_with_vision(
                base64_img, user_prompt=prompt,
            )
            if description and description.strip():
                parts.append(f"[Embedded image shows: {description}]")
        except Exception as e:
            logger.error(f"Error processing embedded image: {e}")
    return parts


async def _describe_embed_videos(video_urls, prompt) -> list:
    """Return ``[Embedded video shows: ...]`` snippets for each analyzable video URL."""
    parts = []
    for video_url in video_urls:
        logger.info(f"🎬 Processing video from embed: {video_url}")
        # Best effort per video: one failure must not abort the whole embed.
        try:
            media_bytes_b64 = await download_and_encode_media(video_url)
            if not media_bytes_b64:
                logger.error("Failed to download video from embed")
                continue
            media_bytes = base64.b64decode(media_bytes_b64)
            frames = await extract_video_frames(media_bytes, num_frames=6)
            if not frames:
                logger.error("Failed to extract frames from video")
                continue
            logger.info(
                f"📹 Extracted {len(frames)} frames, "
                "analyzing with vision model..."
            )
            description = await analyze_video_with_vision(
                frames, media_type="video", user_prompt=prompt,
            )
            if description and description.strip():
                parts.append(f"[Embedded video shows: {description}]")
        except Exception as e:
            logger.error(f"Error processing embedded video: {e}")
    return parts


async def _handle_rich_embed(message, embed, prompt, is_dm, guild_id) -> bool:
    """
    Process a rich / article / image / video / link embed.

    Collects the embed text (truncated to 500 characters) plus vision
    descriptions of its images and videos, then routes the combined
    description through ``rephrase_as_miku``.  Returns ``False`` when the
    embed yields nothing usable, ``True`` once a reply was sent.
    """
    logger.info(f"Processing {embed.type} embed")
    embed_content = await extract_embed_content(embed)
    if not embed_content["has_content"]:
        logger.warning("Embed has no extractable content, skipping")
        return False

    context_parts = []
    text = embed_content["text"]
    if text:
        truncated = text[:500]
        if len(text) > 500:
            truncated += "..."
        context_parts.append(f"[Embedded content: {truncated}]")

    context_parts += await _describe_embed_images(embed_content["images"], prompt)
    context_parts += await _describe_embed_videos(embed_content["videos"], prompt)
    if not context_parts:
        return False

    # One combined description goes through the same rephrase path as the
    # other media types.
    await _rephrase_and_send(
        message, "\n".join(context_parts), prompt, is_dm, guild_id,
        media_type="rich_embed", media_label="embed",
    )
    return True
async def extract_embed_content(embed):
"""
Extract text and media content from a Discord embed.