Miku can now be addressed in Cyrillic, kanji, and both kana scripts (hiragana and katakana), including with Japanese honorifics
This commit is contained in:
@@ -40,13 +40,72 @@ async def is_miku_addressed(message) -> bool:
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not fetch referenced message: {e}")
|
||||
|
||||
cleaned = message.content.strip()
|
||||
|
||||
return bool(re.search(
|
||||
r'(?<![\w\(])(?:[^\w\s]{0,2}\s*)?miku(?:\s*[^\w\s]{0,2})?(?=,|\s*,|[!\.?\s]*$)',
|
||||
cleaned,
|
||||
re.IGNORECASE
|
||||
))
|
||||
cleaned = message.content.strip().lower()
|
||||
|
||||
# Base names for Miku in different scripts
|
||||
base_names = [
|
||||
'miku', 'мику', 'みく', 'ミク', '未来'
|
||||
]
|
||||
|
||||
# Japanese honorifics - all scripts combined for simpler matching
|
||||
honorifics_all_scripts = [
|
||||
# Latin
|
||||
'chan', 'san', 'kun', 'nyan', 'hime', 'tan', 'chin', 'heika',
|
||||
'denka', 'kakka', 'shi', 'chama', 'kyun', 'dono', 'sensei', 'senpai', 'jou',
|
||||
# Hiragana
|
||||
'ちゃん', 'さん', 'くん', 'にゃん', 'ひめ', 'たん', 'ちん', 'へいか',
|
||||
'でんか', 'かっか', 'し', 'ちゃま', 'きゅん', 'どの', 'せんせい', 'せんぱい', 'じょう',
|
||||
# Katakana
|
||||
'チャン', 'サン', 'クン', 'ニャン', 'ヒメ', 'タン', 'チン', 'ヘイカ',
|
||||
'デンカ', 'カッカ', 'シ', 'チャマ', 'キュン', 'ドノ', 'センセイ', 'センパイ', 'ジョウ',
|
||||
# Cyrillic
|
||||
'чан', 'сан', 'кун', 'ньян', 'химе', 'тан', 'чин', 'хэйка',
|
||||
'дэнка', 'какка', 'си', 'чама', 'кюн', 'доно', 'сэнсэй', 'сэнпай', 'жо'
|
||||
]
|
||||
|
||||
# Optional o- prefix in different scripts
|
||||
o_prefixes = ['o-', 'о-', 'お', 'オ']
|
||||
|
||||
# Strategy: Just check if any base name appears (case insensitive for latin/cyrillic)
|
||||
# Then allow any honorific to optionally follow
|
||||
|
||||
for base in base_names:
|
||||
base_lower = base.lower()
|
||||
|
||||
# Check for just the base name
|
||||
if re.search(r'(?<![a-zа-яa-я\w])' + re.escape(base_lower) + r'(?![a-zа-яa-я\w])', cleaned):
|
||||
return True
|
||||
|
||||
# Check with optional o- prefix
|
||||
for prefix in o_prefixes:
|
||||
prefix_pattern = prefix.lower() if prefix != 'お' and prefix != 'オ' else prefix
|
||||
pattern = r'(?<![a-zа-яa-я\w])' + re.escape(prefix_pattern) + r'\s*' + re.escape(base_lower) + r'(?![a-zа-яa-я\w])'
|
||||
if re.search(pattern, cleaned):
|
||||
return True
|
||||
|
||||
# Check base name followed by any honorific (no spacing requirement to catch mixed script)
|
||||
for honorific in honorifics_all_scripts:
|
||||
honorific_lower = honorific.lower()
|
||||
# Allow optional dash, space, or no separator between name and honorific
|
||||
pattern = (r'(?<![a-zа-яa-я\w])' + re.escape(base_lower) +
|
||||
r'[-\s]*' + re.escape(honorific_lower) +
|
||||
r'(?![a-zа-яa-я\w])')
|
||||
if re.search(pattern, cleaned):
|
||||
return True
|
||||
|
||||
# Check with o- prefix + base + honorific
|
||||
for prefix in o_prefixes:
|
||||
prefix_lower = prefix.lower() if prefix != 'お' and prefix != 'オ' else prefix
|
||||
for honorific in honorifics_all_scripts:
|
||||
honorific_lower = honorific.lower()
|
||||
pattern = (r'(?<![a-zа-яa-я\w])' + re.escape(prefix_lower) +
|
||||
r'[-\s]*' + re.escape(base_lower) +
|
||||
r'[-\s]*' + re.escape(honorific_lower) +
|
||||
r'(?![a-zа-яa-я\w])')
|
||||
if re.search(pattern, cleaned):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
# Vectorstore functionality disabled - not needed with current structured context approach
|
||||
# If you need embeddings in the future, you can use a different embedding provider
|
||||
|
||||
Reference in New Issue
Block a user