# utils/twscrape_fix.py
"""
Monkey patch for twscrape to fix parsing of Twitter's JS bundle.

Fixes two known issues:
1. Issue #284: Malformed JSON with unquoted keys (old fix, kept for backward
   compatibility).
2. Issue #302: Twitter changed the JS bundle format, breaking
   x-client-transaction-id generation. The old format
   'e=>e+"."+{...}[e]+"a.js"' changed to
   'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'.

Fix from: https://github.com/vladkens/twscrape/pull/303

Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
"""
import json
import re
from typing import Dict, Iterator, Optional, Tuple

from utils.logger import get_logger

logger = get_logger('core')

# An unquoted numeric object key: plain digits (20113) or scientific notation
# (88e3), immediately followed by optional whitespace and a colon.
_NUMERIC_KEY_RE = re.compile(r'\b(\d+(?:[eE]\d+)?)(?=\s*:)')

# Known bundle layouts, newest first. Each entry gives the literal text that
# surrounds the chunk-name map (optional) and the chunk-hash map.
_SCRIPT_MAP_PATTERNS = (
    # Pattern from PR #303 (April 2026):
    # u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
    {
        "name_split_start": '(({',
        "name_split_end": '})[e]||e)',
        "hash_split_start": '|e)+"."+({',
        "hash_split_end": '})[e]+"a.js"',
    },
    # Alternative: same but without the ||e fallback
    {
        "name_split_start": '""+(({',
        "name_split_end": '})[e]',
        "hash_split_start": ')+"."+({',
        "hash_split_end": '})[e]+"a.js"',
    },
    # Old format (pre-April 2026):
    # e=>e+"."+{...}[e]+"a.js"
    {
        "name_split_start": None,  # single map
        "name_split_end": None,
        "hash_split_start": 'e=>e+"."+',
        "hash_split_end": '[e]+"a.js"',
    },
)


def script_url(k: str, v: str) -> str:
    """Return the abs.twimg.com URL of the client-web chunk named *k* with hash part *v*."""
    return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"


def _quote_numeric_key(match: "re.Match") -> str:
    """Return the matched numeric key as a double-quoted JSON string key."""
    key = match.group(1)
    if 'e' in key.lower():
        # JS treats 88e3 as the number 88000, so the property name is "88000".
        key = str(int(float(key)))
    return '"' + key + '"'


def _js_obj_to_dict(s: str) -> dict:
    """
    Parse a JavaScript object literal with unquoted numeric keys into a dict.

    Handles both plain integers (20113) and scientific notation (88e3 -> 88000).
    *s* may be the bare body of the literal (``1:"a",2:"b"``) or the literal
    including its braces (``{1:"a",2:"b"}``).

    From: https://github.com/vladkens/twscrape/pull/303

    Raises:
        json.JSONDecodeError: if the text is not valid JSON after key quoting.
        OverflowError: if a scientific-notation key is too large for a float.
    """
    body = s.strip()
    # The old single-map format is captured *with* its braces; wrapping it in a
    # second pair would produce "{{...}}", which is never valid JSON.
    if body.startswith('{') and body.endswith('}'):
        body = body[1:-1]
    body = _NUMERIC_KEY_RE.sub(_quote_numeric_key, body)
    return json.loads('{' + body + '}')


def _between(text: str, start: str, end: str) -> str:
    """
    Return the text after the first *start* up to the next *start* or *end*.

    Raises:
        IndexError: if *start* does not occur in *text*.
    """
    return text.split(start, 2)[1].split(end, 1)[0]


def _extract_maps(text: str, pattern: Dict[str, Optional[str]]) -> Tuple[Optional[dict], dict]:
    """
    Extract ``(names, hashes)`` from *text* using one layout description.

    ``names`` is None for single-map layouts. Raises IndexError, KeyError,
    ValueError (including json.JSONDecodeError) or OverflowError when *text*
    does not follow the layout.
    """
    hashes = _js_obj_to_dict(
        _between(text, pattern["hash_split_start"], pattern["hash_split_end"])
    )
    if pattern["name_split_start"] is None:
        return None, hashes
    names = _js_obj_to_dict(
        _between(text, pattern["name_split_start"], pattern["name_split_end"])
    )
    return names, hashes


def _diagnostic_snippet(text: str) -> str:
    """Return a short excerpt of *text* near 'a.js' for the failure log, or ''."""
    for line in text.split('\n'):
        if 'a.js' in line and ('{' in line or '=>' in line):
            stripped = line.strip()
            if len(stripped) <= 300:
                return stripped
            # Minified bundles are often one enormous line; the first 300
            # characters would say nothing, so keep the part around the marker.
            window_start = max(0, stripped.find('a.js') - 200)
            return stripped[window_start:window_start + 300]
    # No such line: take whatever surrounds the first "a.js" anywhere in the text.
    match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
    return match.group(0)[:400] if match else ""


def patched_get_scripts_list(text: str) -> Iterator[str]:
    """
    Yield script URLs parsed from Twitter's JS bundle, tolerating format changes.

    Twitter keeps changing the bundle structure. The invariant relied on here
    is a JavaScript object literal mapping chunk IDs to hashes, used by a
    function that builds script URLs ending in "a.js":

        ... => "" + ({...})[e] + "." + ({...})[e] + "a.js"
    or:
        ... => e + "." + ({...})[e] + "a.js"

    The known layouts are tried newest first; the first one that parses wins.
    When a layout also carries a name map, a chunk's name is looked up there
    and falls back to the chunk ID, mirroring the ``||e`` in the bundle.

    Raises:
        Exception: if no known layout matches *text*. An excerpt near "a.js"
            is logged first to help diagnose the new format.
    """
    for pattern in _SCRIPT_MAP_PATTERNS:
        # Only the parsing is guarded: a layout that does not fit must fall
        # through to the next one, but nothing raised later should be masked.
        try:
            names, hashes = _extract_maps(text, pattern)
        except (IndexError, KeyError, ValueError, OverflowError):
            continue

        for chunk_id, hash_val in hashes.items():
            name = names.get(chunk_id, chunk_id) if names else chunk_id
            yield script_url(name, f"{hash_val}a")
        logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
        return

    # Every layout failed: log an excerpt so the new format can be diagnosed.
    snippet = _diagnostic_snippet(text)
    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
    raise Exception(
        "Failed to parse scripts: unknown JS bundle format. "
        "Twitter may have changed their JS structure again. "
        "See: https://github.com/vladkens/twscrape/issues"
    )


def apply_twscrape_fix() -> None:
    """
    Apply the monkey patch to twscrape.

    Replaces ``twscrape.xclid.get_scripts_list`` with
    ``patched_get_scripts_list``. Best-effort: any failure (for example
    twscrape not being importable) is logged and swallowed, not raised.
    """
    try:
        from twscrape import xclid
        xclid.get_scripts_list = patched_get_scripts_list
        logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
    except Exception as e:
        logger.error(f"Failed to apply twscrape monkey patch: {e}")