From 20891179eee237acb940828ccdefd323cdaba67f Mon Sep 17 00:00:00 2001
From: koko210Serve
Date: Wed, 29 Apr 2026 21:32:27 +0300
Subject: [PATCH] fix(twitter): update twscrape monkey patch for JS bundle format change

Twitter changed the JS bundle structure from the old single-map format
(e=>e+"."+{...}[e]+"a.js") to a new two-map format
(u.u=e=>""+(({name})[e]||e)+"."+({hash})[e]+"a.js"), breaking
x-client-transaction-id generation. This caused IndexError: list index
out of range, which twscrape interpreted as an account timeout (15-min
lockout), preventing Miku from fetching/sharing tweets.

The fix adds:
- A robust multi-pattern parser that tries known formats in order
- The _js_obj_to_dict helper from PR #303 for handling unquoted numeric
  keys and scientific notation in JS object literals, extended to accept
  brace-wrapped literals and unquoted identifier keys so the old
  single-map format and the issue #284 fix keep working
- Debug logging to capture the JS snippet when ALL patterns fail, making
  future breakage easier to diagnose

References:
- https://github.com/vladkens/twscrape/issues/302
- https://github.com/vladkens/twscrape/pull/303
---
 bot/utils/twscrape_fix.py | 141 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 122 insertions(+), 19 deletions(-)

diff --git a/bot/utils/twscrape_fix.py b/bot/utils/twscrape_fix.py
index 53dd57a..5e5ea0a 100644
--- a/bot/utils/twscrape_fix.py
+++ b/bot/utils/twscrape_fix.py
@@ -1,12 +1,21 @@
 # utils/twscrape_fix.py
 """
-Monkey patch for twscrape to fix "Failed to parse scripts" error.
-Twitter started returning malformed JSON with unquoted keys.
-See: https://github.com/vladkens/twscrape/issues/284
+Monkey patch for twscrape to fix parsing of Twitter's JS bundle.
+
+Fixes two known issues:
+1. Issue #284: Malformed JSON with unquoted keys
+   (old fix, kept for backward compatibility)
+2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
+   generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
+   'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
+   Fix from: https://github.com/vladkens/twscrape/pull/303
+
+Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
 """
 
 import json
 import re
+from typing import Iterator
 from utils.logger import get_logger
 
 logger = get_logger('core')
@@ -16,22 +25,116 @@ def script_url(k: str, v: str):
     return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
 
 
-def patched_get_scripts_list(text: str):
-    """Fixed version that handles unquoted keys in Twitter's JSON response"""
-    scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
+def _js_obj_to_dict(s: str) -> dict:
+    """
+    Parse a JavaScript object literal with unquoted keys into a Python dict.
 
-    try:
-        for k, v in json.loads(scripts).items():
-            yield script_url(k, f"{v}a")
-    except json.decoder.JSONDecodeError:
-        # Fix unquoted keys like: node_modules_pnpm_ws_8_18_0_node_modules_ws_browser_js
-        fixed_scripts = re.sub(
-            r'([,\{])(\s*)([\w]+_[\w_]+)(\s*):',
-            r'\1\2"\3"\4:',
-            scripts
-        )
-        for k, v in json.loads(fixed_scripts).items():
-            yield script_url(k, f"{v}a")
+    Accepts the literal with or without its surrounding braces and quotes bare
+    keys so the text becomes valid JSON: plain integers (20113), scientific
+    notation (88e3 → 88000) and identifiers (node_modules_..._js, issue #284).
+
+    Based on: https://github.com/vladkens/twscrape/pull/303
+
+    Raises json.JSONDecodeError if the result is still not valid JSON.
+    """
+    s = s.strip()
+    # The old single-map split keeps the braces; the two-map splits drop them
+    if s.startswith('{') and s.endswith('}'):
+        s = s[1:-1]
+    s = '{' + s + '}'
+    # Scientific notation first so the plain-int pass does not consume only the mantissa
+    s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
+    # Plain integer keys
+    s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
+    # Bare identifier keys; anchored after "{" or "," so only key positions match
+    s = re.sub(r'([{,]\s*)([A-Za-z_$][\w$]*)(?=\s*:)', r'\1"\2"', s)
+    return json.loads(s)
+
+
+def patched_get_scripts_list(text: str) -> Iterator[str]:
+    """
+    Fixed version that handles Twitter's changing JS bundle format.
+
+    Tries each known bundle layout in order (newest first) and yields the script
+    URLs from the first layout that parses. Layouts are located with plain string
+    splits around the JavaScript object literal(s) mapping chunk IDs to names and
+    hashes inside the function that builds the "{name}.{hash}a.js" file names.
+
+    Raises Exception if no known layout matches; a snippet near "a.js" is logged.
+    """
+    # The URL builder is always some variation of:
+    #   ... => "" + ({...})[e] + "." + ({...})[e] + "a.js"
+    # or:
+    #   ... => e + "." + ({...})[e] + "a.js"
+
+    # Known patterns (newest first)
+    patterns = [
+        # Pattern from PR #303 (April 2026):
+        # u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
+        {
+            "name_split_start": '(({',
+            "name_split_end": '})[e]||e)',
+            "hash_split_start": '|e)+"."+({',
+            "hash_split_end": '})[e]+"a.js"',
+        },
+        # Alternative: same but without the ||e fallback
+        {
+            "name_split_start": '""+(({',
+            "name_split_end": '})[e]',
+            "hash_split_start": ')+"."+({',
+            "hash_split_end": '})[e]+"a.js"',
+        },
+        # Old format (pre-April 2026):
+        # e=>e+"."+{...}[e]+"a.js"
+        {
+            "name_split_start": None,  # single map
+            "name_split_end": None,
+            "hash_split_start": 'e=>e+"."+',
+            "hash_split_end": '[e]+"a.js"',
+        },
+    ]
+
+    for pattern in patterns:
+        try:
+            if pattern["name_split_start"] is None:
+                # Single-map old format
+                scripts = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
+                names = None
+                hashes = _js_obj_to_dict(scripts)
+            else:
+                # Two-map new format
+                name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]
+                hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
+                names = _js_obj_to_dict(name_raw)
+                hashes = _js_obj_to_dict(hash_raw)
+
+            for k, hash_val in hashes.items():
+                name = names.get(k, k) if names else k
+                yield script_url(name, f"{hash_val}a")
+            logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
+            return
+        except (IndexError, KeyError, json.JSONDecodeError):
+            continue
+
+    # If ALL patterns failed, log a snippet of the text for debugging
+    # Find any line near "a.js" to help diagnose
+    snippet = ""
+    for line in text.split('\n'):
+        if 'a.js' in line and ('{' in line or '=>' in line):
+            snippet = line.strip()[:300]
+            break
+    if not snippet:
+        # Try to find any JSON-like object near script URL construction
+        match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
+        if match:
+            snippet = match.group(0)[:400]
+
+    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
+    raise Exception(
+        "Failed to parse scripts: unknown JS bundle format. "
+        "Twitter may have changed their JS structure again. "
+        "See: https://github.com/vladkens/twscrape/issues"
+    )
 
 
 def apply_twscrape_fix():
@@ -39,6 +142,6 @@ def apply_twscrape_fix():
     try:
         from twscrape import xclid
         xclid.get_scripts_list = patched_get_scripts_list
-        logger.info("Applied twscrape monkey patch for 'Failed to parse scripts' fix")
+        logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
     except Exception as e:
         logger.error(f"Failed to apply twscrape monkey patch: {e}")