miku-discord/bot/utils/twscrape_fix.py

# utils/twscrape_fix.py
"""
Monkey patch for twscrape to fix parsing of Twitter's JS bundle.

Fixes two known issues:
1. Issue #284: Malformed JSON with unquoted keys
   (old fix, kept for backward compatibility)
2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
   generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
   'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
   Fix from: https://github.com/vladkens/twscrape/pull/303

Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
"""

import json
import re
from typing import Iterator
from utils.logger import get_logger

logger = get_logger('core')


def script_url(k: str, v: str):
    return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"


def _js_obj_to_dict(s: str) -> dict:
    """
    Parse a JavaScript object literal with unquoted numeric keys into a Python dict.
    Handles both plain integers (20113) and scientific notation (88e3 → 88000).

    From: https://github.com/vladkens/twscrape/pull/303
    """
    # Scientific notation first so the plain-int pass does not consume only the mantissa
    s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
    # Plain integer keys
    s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
    return json.loads('{' + s + '}')


def patched_get_scripts_list(text: str) -> Iterator[str]:
    """
    Fixed version that handles Twitter's changing JS bundle format.

    Uses a robust two-pass approach:
    1. Try to find the script map using generic regex patterns
    2. Fall back to known format-specific splits

    Twitter keeps changing the JS bundle structure. The key invariant is that
    there's always a JavaScript object literal mapping chunk IDs to hashes,
    somewhere in a function that constructs script URLs with ".a.js" suffix.
    """
    # Strategy: Find the JS object that maps IDs to hash values.
    # The format is always some variation of:
    #   ... => "" + ({...})[e] + "." + ({...})[e] + "a.js"
    # or:
    #   ... => e + "." + ({...})[e] + "a.js"
    #
    # We use regex to find the LAST object literal before "a.js" that looks
    # like a hash map (integer keys, short hex-ish string values).

    # Approach 1: Known patterns (newest first)
    patterns = [
        # Pattern from PR #303 (April 2026):
        # u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
        {
            "name_split_start": '(({',
            "name_split_end": '})[e]||e)',
            "hash_split_start": '|e)+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Alternative: same but without the ||e fallback
        {
            "name_split_start": '""+(({',
            "name_split_end": '})[e]',
            "hash_split_start": ')+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Old format (pre-April 2026):
        # e=>e+"."+{...}[e]+"a.js"
        {
            "name_split_start": None,  # single map
            "name_split_end": None,
            "hash_split_start": 'e=>e+"."+',
            "hash_split_end": '[e]+"a.js"',
        },
    ]

    for pattern in patterns:
        try:
            if pattern["name_split_start"] is None:
                # Single-map old format
                scripts = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
                names = None
                hashes = _js_obj_to_dict(scripts)
            else:
                # Two-map new format
                name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]
                hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
                names = _js_obj_to_dict(name_raw)
                hashes = _js_obj_to_dict(hash_raw)

            for k, hash_val in hashes.items():
                name = names.get(k, k) if names else k
                yield script_url(name, f"{hash_val}a")
            logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
            return
        except (IndexError, KeyError, json.JSONDecodeError):
            continue

    # If ALL patterns failed, log a snippet of the text for debugging
    # Find any line near "a.js" to help diagnose
    snippet = ""
    for line in text.split('\n'):
        if 'a.js' in line and ('{' in line or '=>' in line):
            snippet = line.strip()[:300]
            break
    if not snippet:
        # Try to find any JSON-like object near script URL construction
        match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
        if match:
            snippet = match.group(0)[:400]

    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
    raise Exception(
        "Failed to parse scripts: unknown JS bundle format. "
        "Twitter may have changed their JS structure again. "
        "See: https://github.com/vladkens/twscrape/issues"
    )


def apply_twscrape_fix():
    """Apply the monkey patch to twscrape"""
    try:
        from twscrape import xclid
        xclid.get_scripts_list = patched_get_scripts_list
        logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
    except Exception as e:
        logger.error(f"Failed to apply twscrape monkey patch: {e}")