bot/utils/twscrape_fix.py

# utils/twscrape_fix.py
"""
Monkey patch for twscrape to fix parsing of Twitter's JS bundle.

Fixes two known issues:
1. Issue #284: Malformed JSON with unquoted keys
   (old fix, kept for backward compatibility)
2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
   generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
   'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
   Fix from: https://github.com/vladkens/twscrape/pull/303

Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
"""

import json
import re
from typing import Iterator
from utils.logger import get_logger

logger = get_logger('core')


def script_url(k: str, v: str) -> str:
    """Build the abs.twimg.com client-web URL for one script chunk.

    Args:
        k: Chunk name (or chunk ID when no name is known).
        v: Version/hash component placed between the name and ``.js``.

    Returns:
        The URL ``.../responsive-web/client-web/<k>.<v>.js``.
    """
    url = f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
    return url


def _js_obj_to_dict(s: str) -> dict:
    """
    Parse a JavaScript object literal with unquoted numeric keys into a Python dict.
    Handles both plain integers (20113) and scientific notation (88e3 → 88000).
    
    From: https://github.com/vladkens/twscrape/pull/303
    """
    # Scientific notation first so the plain-int pass does not consume only the mantissa
    s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
    # Plain integer keys
    s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
    return json.loads('{' + s + '}')


def _scripts_debug_snippet(text: str, radius: int = 200) -> str:
    """Return the text within ``radius`` characters of the first "a.js" marker.

    The quoted JS string literal ``"a.js"`` is preferred because plain
    ``a.js`` can also match unrelated text earlier in the page. Returns an
    empty string when neither form occurs in *text*.
    """
    for marker in ('"a.js"', 'a.js'):
        pos = text.find(marker)
        if pos != -1:
            return text[max(0, pos - radius):pos + len(marker) + radius]
    return ""


def patched_get_scripts_list(text: str) -> Iterator[str]:
    """
    Yield script URLs extracted from Twitter's JS bundle text.

    The bundle contains a function that builds script URLs from one or two
    object literals:

    * two-map format:  ``""+(({names})[e]||e)+"."+({hashes})[e]+"a.js"``
    * old single map:  ``e=>e+"."+{hashes}[e]+"a.js"``

    Each known format is described by a set of literal split markers. The
    formats are tried newest first and the first one whose maps parse to a
    non-empty hash map wins.

    Args:
        text: Source text containing the chunk map.

    Yields:
        One URL per entry of the hash map, built with ``script_url``.

    Raises:
        Exception: If none of the known formats matches. A snippet of the
            text around ``"a.js"`` is logged first to help diagnose the new
            format. The error surfaces on first iteration, because this is a
            generator.
    """
    # Known formats, newest first.
    patterns = [
        # Pattern from PR #303 (April 2026):
        # u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
        {
            "name_split_start": '(({',
            "name_split_end": '})[e]||e)',
            "hash_split_start": '|e)+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Alternative: same but without the ||e fallback
        {
            "name_split_start": '""+(({',
            "name_split_end": '})[e]',
            "hash_split_start": ')+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Old format (pre-April 2026):
        # e=>e+"."+{...}[e]+"a.js"
        {
            "name_split_start": None,  # single map
            "name_split_end": None,
            "hash_split_start": 'e=>e+"."+',
            "hash_split_end": '[e]+"a.js"',
        },
    ]

    for pattern in patterns:
        # Keep only the parsing inside the try so that exceptions raised
        # while the consumer iterates are never mistaken for a format miss.
        try:
            hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
            if pattern["name_split_start"] is None:
                # Single-map old format: the markers leave the literal's own
                # braces in place, and _js_obj_to_dict adds a pair itself,
                # so strip them here to avoid a doubled "{{...}}".
                names = None
                hash_raw = hash_raw.strip()
                if hash_raw.startswith('{') and hash_raw.endswith('}'):
                    hash_raw = hash_raw[1:-1]
            else:
                # Two-map new format
                name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]
                names = _js_obj_to_dict(name_raw)
            hashes = _js_obj_to_dict(hash_raw)
        except (IndexError, KeyError, json.JSONDecodeError):
            continue

        if not hashes:
            # An empty map yields no URLs; treat it as a miss, not a success.
            continue

        logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
        for k, hash_val in hashes.items():
            # Mirror the JS "names[e] || e": a missing or empty name falls back to the ID.
            name = (names.get(k) or k) if names else k
            yield script_url(name, f"{hash_val}a")
        return

    # Every pattern failed: log the text around the URL construction so the
    # new format can be identified from the logs.
    snippet = _scripts_debug_snippet(text)
    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
    raise Exception(
        "Failed to parse scripts: unknown JS bundle format. "
        "Twitter may have changed their JS structure again. "
        "See: https://github.com/vladkens/twscrape/issues"
    )


def apply_twscrape_fix() -> bool:
    """Apply the monkey patch to twscrape.

    Replaces ``twscrape.xclid.get_scripts_list`` with
    ``patched_get_scripts_list``. Never raises: every failure is logged and
    reported through the return value.

    Returns:
        True if the patch was installed. False if twscrape could not be
        imported, no longer exposes ``xclid.get_scripts_list``, or anything
        else went wrong.
    """
    try:
        from twscrape import xclid

        # Setting a missing attribute would "succeed" without changing
        # twscrape's behavior, so verify the target exists first.
        if not hasattr(xclid, "get_scripts_list"):
            logger.error(
                "Failed to apply twscrape monkey patch: "
                "twscrape.xclid has no get_scripts_list (incompatible twscrape version?)"
            )
            return False

        xclid.get_scripts_list = patched_get_scripts_list
        logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
        return True
    except Exception as e:
        logger.error(f"Failed to apply twscrape monkey patch: {e}")
        return False
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00			`# utils/twscrape_fix.py`
			`"""`
fix(twitter): update twscrape monkey patch for JS bundle format change Twitter changed the JS bundle structure from the old single-map format (e=>e+"."+{...}[e]+"a.js") to a new two-map format (u.u=e=>""+(({name})[e]\|\|e)+"."+({hash})[e]+"a.js"), breaking x-client-transaction-id generation. This caused IndexError: list index out of range, which twscrape interpreted as an account timeout (15-min lockout), preventing Miku from fetching/sharing tweets. The fix adds: - A robust multi-pattern parser that tries known formats in order - The _js_obj_to_dict helper from PR #303 for handling unquoted numeric keys and scientific notation in JS object literals - Debug logging to capture the JS snippet when ALL patterns fail, making future breakage easier to diagnose References: - https://github.com/vladkens/twscrape/issues/302 - https://github.com/vladkens/twscrape/pull/303 2026-04-29 21:32:27 +03:00			`Monkey patch for twscrape to fix parsing of Twitter's JS bundle.`

			`Fixes two known issues:`
			`1. Issue #284: Malformed JSON with unquoted keys`
			`(old fix, kept for backward compatibility)`
			`2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id`
			`generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to`
			`'u.u=e=>""+(({...})[e]\|\|e)+"."+({...})[e]+"a.js"'`
			`Fix from: https://github.com/vladkens/twscrape/pull/303`

			`Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.`
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00			`"""`

			`import json`
			`import re`
fix(twitter): update twscrape monkey patch for JS bundle format change Twitter changed the JS bundle structure from the old single-map format (e=>e+"."+{...}[e]+"a.js") to a new two-map format (u.u=e=>""+(({name})[e]\|\|e)+"."+({hash})[e]+"a.js"), breaking x-client-transaction-id generation. This caused IndexError: list index out of range, which twscrape interpreted as an account timeout (15-min lockout), preventing Miku from fetching/sharing tweets. The fix adds: - A robust multi-pattern parser that tries known formats in order - The _js_obj_to_dict helper from PR #303 for handling unquoted numeric keys and scientific notation in JS object literals - Debug logging to capture the JS snippet when ALL patterns fail, making future breakage easier to diagnose References: - https://github.com/vladkens/twscrape/issues/302 - https://github.com/vladkens/twscrape/pull/303 2026-04-29 21:32:27 +03:00			`from typing import Iterator`
feat: Implement comprehensive non-hierarchical logging system - Created new logging infrastructure with per-component filtering - Added 6 log levels: DEBUG, INFO, API, WARNING, ERROR, CRITICAL - Implemented non-hierarchical level control (any combination can be enabled) - Migrated 917 print() statements across 31 files to structured logging - Created web UI (system.html) for runtime configuration with dark theme - Added global level controls to enable/disable levels across all components - Added timestamp format control (off/time/date/datetime options) - Implemented log rotation (10MB per file, 5 backups) - Added API endpoints for dynamic log configuration - Configured HTTP request logging with filtering via api.requests component - Intercepted APScheduler logs with proper formatting - Fixed persistence paths to use /app/memory for Docker volume compatibility - Fixed checkbox display bug in web UI (enabled_levels now properly shown) - Changed System Settings button to open in same tab instead of new window Components: bot, api, api.requests, autonomous, persona, vision, llm, conversation, mood, dm, scheduled, gpu, media, server, commands, sentiment, core, apscheduler All settings persist across container restarts via JSON config. 2026-01-10 20:46:19 +02:00			`from utils.logger import get_logger`

			`logger = get_logger('core')`
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00

			`def script_url(k: str, v: str):`
			`return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"`


fix(twitter): update twscrape monkey patch for JS bundle format change Twitter changed the JS bundle structure from the old single-map format (e=>e+"."+{...}[e]+"a.js") to a new two-map format (u.u=e=>""+(({name})[e]\|\|e)+"."+({hash})[e]+"a.js"), breaking x-client-transaction-id generation. This caused IndexError: list index out of range, which twscrape interpreted as an account timeout (15-min lockout), preventing Miku from fetching/sharing tweets. The fix adds: - A robust multi-pattern parser that tries known formats in order - The _js_obj_to_dict helper from PR #303 for handling unquoted numeric keys and scientific notation in JS object literals - Debug logging to capture the JS snippet when ALL patterns fail, making future breakage easier to diagnose References: - https://github.com/vladkens/twscrape/issues/302 - https://github.com/vladkens/twscrape/pull/303 2026-04-29 21:32:27 +03:00			`def _js_obj_to_dict(s: str) -> dict:`
			`"""`
			`Parse a JavaScript object literal with unquoted numeric keys into a Python dict.`
			`Handles both plain integers (20113) and scientific notation (88e3 → 88000).`
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00
fix(twitter): update twscrape monkey patch for JS bundle format change Twitter changed the JS bundle structure from the old single-map format (e=>e+"."+{...}[e]+"a.js") to a new two-map format (u.u=e=>""+(({name})[e]\|\|e)+"."+({hash})[e]+"a.js"), breaking x-client-transaction-id generation. This caused IndexError: list index out of range, which twscrape interpreted as an account timeout (15-min lockout), preventing Miku from fetching/sharing tweets. The fix adds: - A robust multi-pattern parser that tries known formats in order - The _js_obj_to_dict helper from PR #303 for handling unquoted numeric keys and scientific notation in JS object literals - Debug logging to capture the JS snippet when ALL patterns fail, making future breakage easier to diagnose References: - https://github.com/vladkens/twscrape/issues/302 - https://github.com/vladkens/twscrape/pull/303 2026-04-29 21:32:27 +03:00			`From: https://github.com/vladkens/twscrape/pull/303`
			`"""`
			`# Scientific notation first so the plain-int pass does not consume only the mantissa`
			`s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)`
			`# Plain integer keys`
			`s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)`
			`return json.loads('{' + s + '}')`


			`def patched_get_scripts_list(text: str) -> Iterator[str]:`
			`"""`
			`Fixed version that handles Twitter's changing JS bundle format.`

			`Uses a robust two-pass approach:`
			`1. Try to find the script map using generic regex patterns`
			`2. Fall back to known format-specific splits`

			`Twitter keeps changing the JS bundle structure. The key invariant is that`
			`there's always a JavaScript object literal mapping chunk IDs to hashes,`
			`somewhere in a function that constructs script URLs with ".a.js" suffix.`
			`"""`
			`# Strategy: Find the JS object that maps IDs to hash values.`
			`# The format is always some variation of:`
			`# ... => "" + ({...})[e] + "." + ({...})[e] + "a.js"`
			`# or:`
			`# ... => e + "." + ({...})[e] + "a.js"`
			`#`
			`# We use regex to find the LAST object literal before "a.js" that looks`
			`# like a hash map (integer keys, short hex-ish string values).`

			`# Approach 1: Known patterns (newest first)`
			`patterns = [`
			`# Pattern from PR #303 (April 2026):`
			`# u.u=e=>""+(({name_map})[e]\|\|e)+"."+({hash_map})[e]+"a.js"`
			`{`
			`"name_split_start": '(({',`
			`"name_split_end": '})[e]\|\|e)',`
			`"hash_split_start": '\|e)+"."+({',`
			`"hash_split_end": '})[e]+"a.js"',`
			`},`
			`# Alternative: same but without the \|\|e fallback`
			`{`
			`"name_split_start": '""+(({',`
			`"name_split_end": '})[e]',`
			`"hash_split_start": ')+"."+({',`
			`"hash_split_end": '})[e]+"a.js"',`
			`},`
			`# Old format (pre-April 2026):`
			`# e=>e+"."+{...}[e]+"a.js"`
			`{`
			`"name_split_start": None, # single map`
			`"name_split_end": None,`
			`"hash_split_start": 'e=>e+"."+',`
			`"hash_split_end": '[e]+"a.js"',`
			`},`
			`]`

			`for pattern in patterns:`
			`try:`
			`if pattern["name_split_start"] is None:`
			`# Single-map old format`
			`scripts = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]`
			`names = None`
			`hashes = _js_obj_to_dict(scripts)`
			`else:`
			`# Two-map new format`
			`name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]`
			`hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]`
			`names = _js_obj_to_dict(name_raw)`
			`hashes = _js_obj_to_dict(hash_raw)`

			`for k, hash_val in hashes.items():`
			`name = names.get(k, k) if names else k`
			`yield script_url(name, f"{hash_val}a")`
			`logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")`
			`return`
			`except (IndexError, KeyError, json.JSONDecodeError):`
			`continue`

			`# If ALL patterns failed, log a snippet of the text for debugging`
			`# Find any line near "a.js" to help diagnose`
			`snippet = ""`
			`for line in text.split('\n'):`
			`if 'a.js' in line and ('{' in line or '=>' in line):`
			`snippet = line.strip()[:300]`
			`break`
			`if not snippet:`
			`# Try to find any JSON-like object near script URL construction`
			`match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)`
			`if match:`
			`snippet = match.group(0)[:400]`

			`logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")`
			`raise Exception(`
			`"Failed to parse scripts: unknown JS bundle format. "`
			`"Twitter may have changed their JS structure again. "`
			`"See: https://github.com/vladkens/twscrape/issues"`
			`)`
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00

			`def apply_twscrape_fix():`
			`"""Apply the monkey patch to twscrape"""`
			`try:`
			`from twscrape import xclid`
			`xclid.get_scripts_list = patched_get_scripts_list`
fix(twitter): update twscrape monkey patch for JS bundle format change Twitter changed the JS bundle structure from the old single-map format (e=>e+"."+{...}[e]+"a.js") to a new two-map format (u.u=e=>""+(({name})[e]\|\|e)+"."+({hash})[e]+"a.js"), breaking x-client-transaction-id generation. This caused IndexError: list index out of range, which twscrape interpreted as an account timeout (15-min lockout), preventing Miku from fetching/sharing tweets. The fix adds: - A robust multi-pattern parser that tries known formats in order - The _js_obj_to_dict helper from PR #303 for handling unquoted numeric keys and scientific notation in JS object literals - Debug logging to capture the JS snippet when ALL patterns fail, making future breakage easier to diagnose References: - https://github.com/vladkens/twscrape/issues/302 - https://github.com/vladkens/twscrape/pull/303 2026-04-29 21:32:27 +03:00			`logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")`
Fix: Apply twscrape monkey patch to resolve 'Failed to parse scripts' error Twitter changed their JavaScript response format to include unquoted keys in JSON objects, which breaks twscrape's parser. This fix applies a monkey patch that uses regex to quote the unquoted keys before parsing. This resolves the issue preventing figurine notifications from being sent for the past several days. Reference: https://github.com/vladkens/twscrape/issues/284 2025-12-10 09:48:25 +02:00			`except Exception as e:`
feat: Implement comprehensive non-hierarchical logging system - Created new logging infrastructure with per-component filtering - Added 6 log levels: DEBUG, INFO, API, WARNING, ERROR, CRITICAL - Implemented non-hierarchical level control (any combination can be enabled) - Migrated 917 print() statements across 31 files to structured logging - Created web UI (system.html) for runtime configuration with dark theme - Added global level controls to enable/disable levels across all components - Added timestamp format control (off/time/date/datetime options) - Implemented log rotation (10MB per file, 5 backups) - Added API endpoints for dynamic log configuration - Configured HTTP request logging with filtering via api.requests component - Intercepted APScheduler logs with proper formatting - Fixed persistence paths to use /app/memory for Docker volume compatibility - Fixed checkbox display bug in web UI (enabled_levels now properly shown) - Changed System Settings button to open in same tab instead of new window Components: bot, api, api.requests, autonomous, persona, vision, llm, conversation, mood, dm, scheduled, gpu, media, server, commands, sentiment, core, apscheduler All settings persist across container restarts via JSON config. 2026-01-10 20:46:19 +02:00			`logger.error(f"Failed to apply twscrape monkey patch: {e}")`