# utils/core.py
import asyncio
import aiohttp
import re
import globals
# Note: the langchain/FAISS imports below are only used by the disabled
# vectorstore code at the bottom of this file
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from utils.logger import get_logger
logger = get_logger('core')
# switch_model() removed - llama-swap handles model switching automatically
async def is_miku_addressed(message) -> bool:
    """Return True if the bot should treat `message` as addressed to Miku.

    A message counts as addressed when it is a DM, mentions the bot,
    replies to one of the bot's messages, or contains Miku's name in any
    supported script, optionally with an o- prefix and/or an honorific.
    """
    # DMs have no guild; always respond to every direct message
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild or guild.me in message from {message.author}")
        return False

    # If the message pings Miku, return True
    if message.guild.me in message.mentions:
        return True

    # If the message is a reply, check the referenced message's author
    if message.reference:
        try:
            referenced_msg = await message.channel.fetch_message(message.reference.message_id)
            if referenced_msg.author == message.guild.me:
                return True
        except Exception as e:
            logger.warning(f"Could not fetch referenced message: {e}")

    cleaned = message.content.strip().lower()

    # Base names for Miku in different scripts
    base_names = [
        'miku', 'мику', 'みく', 'ミク', '未来'
    ]

    # Japanese honorifics - all scripts combined for simpler matching
    honorifics_all_scripts = [
        # Latin
        'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
        'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou',
        # Hiragana
        'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん', 'へいか',
        'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの', 'せんせい', 'せんぱい', 'じょう',
        # Katakana
        'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ',
        'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ',
        # Cyrillic
        'чан', 'сан', 'кун', 'ньян', 'химе', 'тан', 'чин', 'хэйка',
        'дэнка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сэнсэй', 'сэнпай', 'жо'
    ]

    # Optional o- prefix in different scripts (Latin, Cyrillic, hiragana, katakana)
    o_prefixes = ['o-', 'о-', 'お', 'オ']
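    # The honorific lists are position-parallel across scripts
    # ('shi' / 'し' / 'シ' / 'си'). A few contents the rules below accept:
    # 'miku', 'miku-chan', 'mikusan', 'おみく', 'мику-сан', 'ミクちゃん'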
    # Strategy: check whether any base name appears as a standalone token,
    # then allow any honorific to optionally follow. Python 3's \w is
    # Unicode-aware, so it already covers Latin, Cyrillic, and kana.
    for base in base_names:
        base_lower = base.lower()

        # Check for just the base name
        if re.search(r'(?<!\w)' + re.escape(base_lower) + r'(?!\w)', cleaned):
            return True

        # Check with optional o- prefix (kana prefixes have no lowercase form)
        for prefix in o_prefixes:
            prefix_pattern = prefix.lower() if prefix not in ('お', 'オ') else prefix
            pattern = r'(?<!\w)' + re.escape(prefix_pattern) + r'\s*' + re.escape(base_lower) + r'(?!\w)'
            if re.search(pattern, cleaned):
                return True

        # Check base name followed by any honorific (no spacing requirement to catch mixed script)
        for honorific in honorifics_all_scripts:
            honorific_lower = honorific.lower()
            # Allow an optional dash, space, or no separator between name and honorific
            pattern = (r'(?<!\w)' + re.escape(base_lower) +
                       r'[-\s]*' + re.escape(honorific_lower) +
                       r'(?!\w)')
            if re.search(pattern, cleaned):
                return True

        # Check with o- prefix + base + honorific
        for prefix in o_prefixes:
            prefix_lower = prefix.lower() if prefix not in ('お', 'オ') else prefix
            for honorific in honorifics_all_scripts:
                honorific_lower = honorific.lower()
                pattern = (r'(?<!\w)' + re.escape(prefix_lower) +
                           r'[-\s]*' + re.escape(base_lower) +
                           r'[-\s]*' + re.escape(honorific_lower) +
                           r'(?!\w)')
                if re.search(pattern, cleaned):
                    return True

    return False
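
# Quick self-check of the matcher. _FakeGuild/_FakeMessage are hypothetical
# stand-ins for discord.py objects, used only to illustrate which contents
# trigger a response; run from the bot root so the top-level imports resolve.
if __name__ == "__main__":
    class _FakeGuild:
        me = object()  # stands in for the bot's Member object

    class _FakeMessage:
        guild = _FakeGuild()
        mentions: list = []
        reference = None

        def __init__(self, content: str):
            self.content = content

    async def _demo():
        # Expected: True, True, True, True, False
        for text in ("hey miku!", "mikuchan??", "о-мику-сан",
                     "ミクちゃん kawaii", "no match here"):
            print(f"{text!r} -> {await is_miku_addressed(_FakeMessage(text))}")

    asyncio.run(_demo())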
# Vectorstore functionality disabled - not needed with current structured context approach
# If you need embeddings in the future, you can use a different embedding provider
# For now, the bot uses structured prompts from context_manager.py
# def load_miku_knowledge():
#     with open("miku_lore.txt", "r", encoding="utf-8") as f:
#         text = f.read()
#
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=520,
#         chunk_overlap=50,
#         separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
#     )
#
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# def load_miku_lyrics():
#     with open("miku_lyrics.txt", "r", encoding="utf-8") as f:
#         lyrics_text = f.read()
#
#     text_splitter = CharacterTextSplitter(chunk_size=520, chunk_overlap=50)
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(lyrics_text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# miku_vectorstore = load_miku_knowledge()
# miku_lyrics_vectorstore = load_miku_lyrics()
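#
# One possible way to bring embeddings back, sketched under the assumption
# that the langchain-huggingface package is installed; the model name is
# illustrative, not a project requirement. It supplies the `embeddings`
# global the functions above expect:
#
#     from langchain_huggingface import HuggingFaceEmbeddings
#
#     embeddings = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2"
#     )
#     miku_vectorstore = load_miku_knowledge()
#     hits = miku_vectorstore.similarity_search("Who is Miku?", k=3)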