# utils/core.py
import asyncio
import aiohttp
import re
import globals
# Note: the langchain/FAISS imports below are only used by the disabled
# vectorstore code at the bottom of this file
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from utils.logger import get_logger
logger = get_logger('core')
# switch_model() removed - llama-swap handles model switching automatically
async def is_miku_addressed(message) -> bool:
    """Return True if the bot should treat `message` as addressed to Miku.

    A message counts as addressed when it is a DM, mentions the bot,
    replies to one of the bot's messages, or contains Miku's name in any
    supported script, optionally with an o- prefix and/or an honorific.
    """
    # DMs have no guild; always respond to every direct message
    if message.guild is None:
        return True

    # Safety check: ensure guild and guild.me exist
    if not message.guild or not message.guild.me:
        logger.warning(f"Invalid guild or guild.me in message from {message.author}")
        return False

    # If the message pings Miku, return True
    if message.guild.me in message.mentions:
        return True

    # If the message is a reply, check the referenced message's author
    if message.reference:
        try:
            referenced_msg = await message.channel.fetch_message(message.reference.message_id)
            if referenced_msg.author == message.guild.me:
                return True
        except Exception as e:
            logger.warning(f"Could not fetch referenced message: {e}")

    cleaned = message.content.strip().lower()

    # Base names for Miku in different scripts
    base_names = [
        'miku', 'мику', 'みく', 'ミク', '未来'
    ]

    # Japanese honorifics - all scripts combined for simpler matching
    honorifics_all_scripts = [
        # Latin
        'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
        'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou',
        # Hiragana
        'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん', 'へいか',
        'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの', 'せんせい', 'せんぱい', 'じょう',
        # Katakana
        'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ',
        'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ',
        # Cyrillic
        'чан', 'сан', 'кун', 'ньян', 'химе', 'тан', 'чин', 'хэйка',
        'дэнка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сэнсэй', 'сэнпай', 'жо'
    ]

    # Optional o- prefix in different scripts (Latin, Cyrillic, hiragana, katakana)
    o_prefixes = ['o-', 'о-', 'お', 'オ']
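    # The honorific lists are position-parallel across scripts
    # ('shi' / 'し' / 'シ' / 'си'). A few contents the rules below accept:
    # 'miku', 'miku-chan', 'mikusan', 'おみく', 'мику-сан', 'ミクちゃん'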
    # Strategy: check whether any base name appears as a standalone token,
    # then allow any honorific to optionally follow. Python 3's \w is
    # Unicode-aware, so it already covers Latin, Cyrillic, and kana.
    for base in base_names:
        base_lower = base.lower()

        # Check for just the base name
        if re.search(r'(?<!\w)' + re.escape(base_lower) + r'(?!\w)', cleaned):
            return True

        # Check with optional o- prefix (kana prefixes have no lowercase form)
        for prefix in o_prefixes:
            prefix_pattern = prefix.lower() if prefix not in ('お', 'オ') else prefix
            pattern = r'(?<!\w)' + re.escape(prefix_pattern) + r'\s*' + re.escape(base_lower) + r'(?!\w)'
            if re.search(pattern, cleaned):
                return True

        # Check base name followed by any honorific (no spacing requirement to catch mixed script)
        for honorific in honorifics_all_scripts:
            honorific_lower = honorific.lower()
            # Allow an optional dash, space, or no separator between name and honorific
            pattern = (r'(?<!\w)' + re.escape(base_lower) +
                       r'[-\s]*' + re.escape(honorific_lower) +
                       r'(?!\w)')
            if re.search(pattern, cleaned):
                return True

        # Check with o- prefix + base + honorific
        for prefix in o_prefixes:
            prefix_lower = prefix.lower() if prefix not in ('お', 'オ') else prefix
            for honorific in honorifics_all_scripts:
                honorific_lower = honorific.lower()
                pattern = (r'(?<!\w)' + re.escape(prefix_lower) +
                           r'[-\s]*' + re.escape(base_lower) +
                           r'[-\s]*' + re.escape(honorific_lower) +
                           r'(?!\w)')
                if re.search(pattern, cleaned):
                    return True

    return False
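
# Quick self-check of the matcher. _FakeGuild/_FakeMessage are hypothetical
# stand-ins for discord.py objects, used only to illustrate which contents
# trigger a response; run from the bot root so the top-level imports resolve.
if __name__ == "__main__":
    class _FakeGuild:
        me = object()  # stands in for the bot's Member object

    class _FakeMessage:
        guild = _FakeGuild()
        mentions: list = []
        reference = None

        def __init__(self, content: str):
            self.content = content

    async def _demo():
        # Expected: True, True, True, True, False
        for text in ("hey miku!", "mikuchan??", "о-мику-сан",
                     "ミクちゃん kawaii", "no match here"):
            print(f"{text!r} -> {await is_miku_addressed(_FakeMessage(text))}")

    asyncio.run(_demo())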
# Vectorstore functionality disabled - not needed with current structured context approach
# If you need embeddings in the future, you can use a different embedding provider
# For now, the bot uses structured prompts from context_manager.py
# def load_miku_knowledge():
#     with open("miku_lore.txt", "r", encoding="utf-8") as f:
#         text = f.read()
#
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=520,
#         chunk_overlap=50,
#         separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
#     )
#
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# def load_miku_lyrics():
#     with open("miku_lyrics.txt", "r", encoding="utf-8") as f:
#         lyrics_text = f.read()
#
#     text_splitter = CharacterTextSplitter(chunk_size=520, chunk_overlap=50)
#     docs = [Document(page_content=chunk) for chunk in text_splitter.split_text(lyrics_text)]
#
#     vectorstore = FAISS.from_documents(docs, embeddings)
#     return vectorstore
#
# miku_vectorstore = load_miku_knowledge()
# miku_lyrics_vectorstore = load_miku_lyrics()
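#
# One possible way to bring embeddings back, sketched under the assumption
# that the langchain-huggingface package is installed; the model name is
# illustrative, not a project requirement. It supplies the `embeddings`
# global the functions above expect:
#
#     from langchain_huggingface import HuggingFaceEmbeddings
#
#     embeddings = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2"
#     )
#     miku_vectorstore = load_miku_knowledge()
#     hits = miku_vectorstore.similarity_search("Who is Miku?", k=3)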