fix(twitter): update twscrape monkey patch for JS bundle format change
Twitter changed the JS bundle structure from the old single-map format
(e=>e+"."+{...}[e]+"a.js") to a new two-map format
(u.u=e=>""+(({name})[e]||e)+"."+({hash})[e]+"a.js"), breaking
x-client-transaction-id generation.
This caused IndexError: list index out of range, which twscrape
interpreted as an account timeout (15-min lockout), preventing Miku
from fetching/sharing tweets.
The fix adds:
- A robust multi-pattern parser that tries known formats in order
- The _js_obj_to_dict helper from PR #303 for handling unquoted numeric
keys and scientific notation in JS object literals
- Debug logging to capture the JS snippet when ALL patterns fail,
making future breakage easier to diagnose
References:
- https://github.com/vladkens/twscrape/issues/302
- https://github.com/vladkens/twscrape/pull/303
This commit is contained in:
@@ -1,12 +1,21 @@
|
|||||||
# utils/twscrape_fix.py
|
# utils/twscrape_fix.py
|
||||||
"""
|
"""
|
||||||
Monkey patch for twscrape to fix "Failed to parse scripts" error.
|
Monkey patch for twscrape to fix parsing of Twitter's JS bundle.
|
||||||
Twitter started returning malformed JSON with unquoted keys.
|
|
||||||
See: https://github.com/vladkens/twscrape/issues/284
|
Fixes two known issues:
|
||||||
|
1. Issue #284: Malformed JSON with unquoted keys
|
||||||
|
(old fix, kept for backward compatibility)
|
||||||
|
2. Issue #302: Twitter changed JS bundle format, breaking x-client-transaction-id
|
||||||
|
generation. The old format 'e=>e+"."+{...}[e]+"a.js"' changed to
|
||||||
|
'u.u=e=>""+(({...})[e]||e)+"."+({...})[e]+"a.js"'
|
||||||
|
Fix from: https://github.com/vladkens/twscrape/pull/303
|
||||||
|
|
||||||
|
Without this patch, twscrape raises IndexError and locks accounts for 15 minutes.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
from typing import Iterator
|
||||||
from utils.logger import get_logger
|
from utils.logger import get_logger
|
||||||
|
|
||||||
logger = get_logger('core')
|
logger = get_logger('core')
|
||||||
@@ -16,22 +25,109 @@ def script_url(k: str, v: str):
|
|||||||
return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
|
return f"https://abs.twimg.com/responsive-web/client-web/{k}.{v}.js"
|
||||||
|
|
||||||
|
|
||||||
def patched_get_scripts_list(text: str):
|
def _js_obj_to_dict(s: str) -> dict:
|
||||||
"""Fixed version that handles unquoted keys in Twitter's JSON response"""
|
"""
|
||||||
scripts = text.split('e=>e+"."+')[1].split('[e]+"a.js"')[0]
|
Parse a JavaScript object literal with unquoted numeric keys into a Python dict.
|
||||||
|
Handles both plain integers (20113) and scientific notation (88e3 → 88000).
|
||||||
|
|
||||||
|
From: https://github.com/vladkens/twscrape/pull/303
|
||||||
|
"""
|
||||||
|
# Scientific notation first so the plain-int pass does not consume only the mantissa
|
||||||
|
s = re.sub(r'\b(\d+e\d+)(?=\s*:)', lambda m: '"' + str(int(float(m.group(1)))) + '"', s)
|
||||||
|
# Plain integer keys
|
||||||
|
s = re.sub(r'\b(\d+)(?=\s*:)', r'"\1"', s)
|
||||||
|
return json.loads('{' + s + '}')
|
||||||
|
|
||||||
|
|
||||||
|
def patched_get_scripts_list(text: str) -> Iterator[str]:
|
||||||
|
"""
|
||||||
|
Fixed version that handles Twitter's changing JS bundle format.
|
||||||
|
|
||||||
|
Tries a list of known bundle formats in order (newest first):
|
||||||
|
1. Extract the name/hash object literals by splitting on each format's delimiters
|
||||||
|
2. If no known format matches, log a snippet near "a.js" and raise
|
||||||
|
|
||||||
|
Twitter keeps changing the JS bundle structure. The key invariant is that
|
||||||
|
there's always a JavaScript object literal mapping chunk IDs to hashes,
|
||||||
|
somewhere in a function that constructs script URLs with an "a.js" suffix.
|
||||||
|
"""
|
||||||
|
# Strategy: Find the JS object that maps IDs to hash values.
|
||||||
|
# The format is always some variation of:
|
||||||
|
# ... => "" + ({...})[e] + "." + ({...})[e] + "a.js"
|
||||||
|
# or:
|
||||||
|
# ... => e + "." + ({...})[e] + "a.js"
|
||||||
|
#
|
||||||
|
# We locate the object literal(s) immediately before "a.js" by splitting
|
||||||
|
# the bundle text on the fixed delimiters of each known format.
|
||||||
|
|
||||||
|
# Approach 1: Known patterns (newest first)
|
||||||
|
patterns = [
|
||||||
|
# Pattern from PR #303 (April 2026):
|
||||||
|
# u.u=e=>""+(({name_map})[e]||e)+"."+({hash_map})[e]+"a.js"
|
||||||
|
{
|
||||||
|
"name_split_start": '(({',
|
||||||
|
"name_split_end": '})[e]||e)',
|
||||||
|
"hash_split_start": '|e)+"."+({',
|
||||||
|
"hash_split_end": '})[e]+"a.js"',
|
||||||
|
},
|
||||||
|
# Alternative: same but without the ||e fallback
|
||||||
|
{
|
||||||
|
"name_split_start": '""+(({',
|
||||||
|
"name_split_end": '})[e]',
|
||||||
|
"hash_split_start": ')+"."+({',
|
||||||
|
"hash_split_end": '})[e]+"a.js"',
|
||||||
|
},
|
||||||
|
# Old format (pre-April 2026):
|
||||||
|
# e=>e+"."+{...}[e]+"a.js"
|
||||||
|
{
|
||||||
|
"name_split_start": None, # single map
|
||||||
|
"name_split_end": None,
|
||||||
|
"hash_split_start": 'e=>e+"."+',
|
||||||
|
"hash_split_end": '[e]+"a.js"',
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
for pattern in patterns:
|
||||||
try:
|
try:
|
||||||
for k, v in json.loads(scripts).items():
|
if pattern["name_split_start"] is None:
|
||||||
yield script_url(k, f"{v}a")
|
# Single-map old format
|
||||||
except json.decoder.JSONDecodeError:
|
scripts = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
|
||||||
# Fix unquoted keys like: node_modules_pnpm_ws_8_18_0_node_modules_ws_browser_js
|
names = None
|
||||||
fixed_scripts = re.sub(
|
hashes = _js_obj_to_dict(scripts)
|
||||||
r'([,\{])(\s*)([\w]+_[\w_]+)(\s*):',
|
else:
|
||||||
r'\1\2"\3"\4:',
|
# Two-map new format
|
||||||
scripts
|
name_raw = text.split(pattern["name_split_start"])[1].split(pattern["name_split_end"])[0]
|
||||||
|
hash_raw = text.split(pattern["hash_split_start"])[1].split(pattern["hash_split_end"])[0]
|
||||||
|
names = _js_obj_to_dict(name_raw)
|
||||||
|
hashes = _js_obj_to_dict(hash_raw)
|
||||||
|
|
||||||
|
for k, hash_val in hashes.items():
|
||||||
|
name = names.get(k, k) if names else k
|
||||||
|
yield script_url(name, f"{hash_val}a")
|
||||||
|
logger.info(f"Successfully parsed scripts using pattern: {pattern['hash_split_start'][:40]}...")
|
||||||
|
return
|
||||||
|
except (IndexError, KeyError, json.JSONDecodeError):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# If ALL patterns failed, log a snippet of the text for debugging
|
||||||
|
# Find any line near "a.js" to help diagnose
|
||||||
|
snippet = ""
|
||||||
|
for line in text.split('\n'):
|
||||||
|
if 'a.js' in line and ('{' in line or '=>' in line):
|
||||||
|
snippet = line.strip()[:300]
|
||||||
|
break
|
||||||
|
if not snippet:
|
||||||
|
# Try to find any JSON-like object near script URL construction
|
||||||
|
match = re.search(r'.{0,200}a\.js.{0,200}', text, re.DOTALL)
|
||||||
|
if match:
|
||||||
|
snippet = match.group(0)[:400]
|
||||||
|
|
||||||
|
logger.error(f"Failed to parse scripts. Text snippet near 'a.js': {snippet}")
|
||||||
|
raise Exception(
|
||||||
|
"Failed to parse scripts: unknown JS bundle format. "
|
||||||
|
"Twitter may have changed their JS structure again. "
|
||||||
|
"See: https://github.com/vladkens/twscrape/issues"
|
||||||
)
|
)
|
||||||
for k, v in json.loads(fixed_scripts).items():
|
|
||||||
yield script_url(k, f"{v}a")
|
|
||||||
|
|
||||||
|
|
||||||
def apply_twscrape_fix():
|
def apply_twscrape_fix():
|
||||||
@@ -39,6 +135,6 @@ def apply_twscrape_fix():
|
|||||||
try:
|
try:
|
||||||
from twscrape import xclid
|
from twscrape import xclid
|
||||||
xclid.get_scripts_list = patched_get_scripts_list
|
xclid.get_scripts_list = patched_get_scripts_list
|
||||||
logger.info("Applied twscrape monkey patch for 'Failed to parse scripts' fix")
|
logger.info("Applied twscrape monkey patch (JS bundle parsing fix for issues #284 + #302)")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to apply twscrape monkey patch: {e}")
|
logger.error(f"Failed to apply twscrape monkey patch: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user