fix(twitter): update twscrape monkey patch for JS bundle format change

Twitter changed the JS bundle structure from the old single-map format
(e=>e+"."+{...}[e]+"a.js") to a new two-map format
(u.u=e=>""+(({name})[e]||e)+"."+({hash})[e]+"a.js"), breaking
x-client-transaction-id generation.

This caused IndexError: list index out of range, which twscrape
interpreted as an account timeout (15-min lockout), preventing Miku
from fetching/sharing tweets.

The fix adds:
- A robust multi-pattern parser that tries known formats in order
- The _js_obj_to_dict helper from PR #303 for handling unquoted numeric
  keys and scientific notation in JS object literals
- Debug logging to capture the JS snippet when ALL patterns fail,
  making future breakage easier to diagnose

References:
- https://github.com/vladkens/twscrape/issues/302
- https://github.com/vladkens/twscrape/pull/303
This commit is contained in:
2026-04-29 21:32:27 +03:00
parent 694590a620
commit 20891179ee

View File

@@ -1,12 +1,21 @@
# utils/twscrape_fix.py # utils/twscrape_fix.py
""" """
Monkey patch for twscrape to fix "Failed to parse scripts" error. Monkey patch for twscrape to fix parsing of Twitter's JS bundle.
Twitter started returning malformed JSON with unquoted keys.
See: https://github.com/vladkens/twscrape/issues/284 Fixes two known issues:
1. Issue #284: Malformed JSON with unquoted keys
(old fix, kept for backward compatibility)
2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
Fix from: https://github.com/vladkens/twscrape/pull/303
Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
""" """
import json import json
import re import re
from typing import Iterator
from utils.logger import get_logger from utils.logger import get_logger
logger = get_logger('core') logger = get_logger('core')
@@ -16,22 +25,109 @@ def script_url(k: str, v: str):
return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js" return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
def patched_get_scripts_list(text: str): def _js_obj_to_dict(s: str) -> dict:
"""Fixed version that handles unquoted keys in Twitter's JSON response""" """
scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0] Parse a JavaScript object literal with unquoted numeric keys into a Python dict.
Handles both plain integers (20113) and scientific notation (88e3 → 88000).
From: https://github.com/vladkens/twscrape/pull/303
"""
# Scientific notation first so the plain-int pass does not consume only the mantissa
s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
# Plain integer keys
s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
return json.loads('{' + s + '}')
def patched_get_scripts_list(text: str) -> Iterator[str]:
    """
    Patched twscrape.xclid.get_scripts_list tolerant of Twitter's changing
    JS bundle format.

    Yields fully-qualified script URLs built via script_url(). Known bundle
    layouts are tried in order (newest first); the first layout that parses
    wins. If none match, a snippet of the bundle near 'a.js' is logged and an
    Exception is raised, so the breakage is diagnosable instead of surfacing
    as a bare IndexError (which twscrape treats as a 15-minute account
    timeout).

    Args:
        text: the raw JS bundle source fetched by twscrape.

    Raises:
        Exception: when no known pattern matches the bundle text. Because
            this is a generator, the raise happens on first iteration.
    """
    # Known layouts, newest first. Each entry describes how to slice the
    # optional name map (chunk-id -> chunk-name) and the hash map
    # (chunk-id -> content hash) out of the minified bundle.
    patterns = [
        # Pattern from PR #303 (April 2026):
        #   u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
        {
            "name_split_start": '(({',
            "name_split_end": '})[e]||e)',
            "hash_split_start": '|e)+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Same two-map layout but without the ||e fallback.
        {
            "name_split_start": '""+(({',
            "name_split_end": '})[e]',
            "hash_split_start": ')+"."+({',
            "hash_split_end": '})[e]+"a.js"',
        },
        # Old single-map format (pre-April 2026):
        #   e=>e+"."+{...}[e]+"a.js"
        {
            "name_split_start": None,  # single map, no separate name map
            "name_split_end": None,
            "hash_split_start": 'e=>e+"."+',
            "hash_split_end": '[e]+"a.js"',
        },
    ]

    for pattern in patterns:
        try:
            if pattern["name_split_start"] is None:
                # Single-map old format. NOTE: unlike the two-map markers
                # (which include '({' / '})'), this split captures the
                # surrounding '{...}' braces — strip them so _js_obj_to_dict
                # receives a bare object body.
                raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0].strip()
                if raw.startswith("{") and raw.endswith("}"):
                    raw = raw[1:-1]
                names = None
                hashes = _js_obj_to_dict(raw)
            else:
                # Two-map new format: name map first, hash map second.
                name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]
                hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
                names = _js_obj_to_dict(name_raw)
                hashes = _js_obj_to_dict(hash_raw)

            for k, hash_val in hashes.items():
                # Fall back to the chunk id itself when it has no name entry,
                # mirroring the JS '||e' fallback.
                name = names.get(k, k) if names else k
                yield script_url(name, f"{hash_val}a")

            logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
            return
        except (IndexError, KeyError, json.JSONDecodeError):
            # This layout did not match (split found nothing, or the captured
            # snippet was not a parseable object) — try the next one.
            continue

    # All patterns failed: capture a snippet near the URL-construction code
    # so the next format change can be diagnosed from the logs.
    snippet = ""
    for line in text.split('\n'):
        if 'a.js' in line and ('{' in line or '=>' in line):
            snippet = line.strip()[:300]
            break
    if not snippet:
        # No obvious line: grab any context surrounding 'a.js' instead.
        match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
        if match:
            snippet = match.group(0)[:400]
    logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
    raise Exception(
        "Failed to parse scripts: unknown JS bundle format. "
        "Twitter may have changed their JS structure again. "
        "See: https://github.com/vladkens/twscrape/issues"
    )
def apply_twscrape_fix(): def apply_twscrape_fix():
@@ -39,6 +135,6 @@ def apply_twscrape_fix():
try: try:
from twscrape import xclid from twscrape import xclid
xclid.get_scripts_list = patched_get_scripts_list xclid.get_scripts_list = patched_get_scripts_list
logger.info("Applied twscrape monkey patch for 'Failed to parse scripts' fix") logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
except Exception as e: except Exception as e:
logger.error(f"Failed to apply twscrape monkey patch: {e}") logger.error(f"Failed to apply twscrape monkey patch: {e}")