# utils/twitter_fetcher.py
"""Search X/Twitter for recent Miku tweets and scrape their image URLs.

Uses twscrape (authenticated via an exported cookie file) to run the search,
then drives a headless Firefox via Playwright to visit each tweet page and
collect full-size ``pbs.twimg.com`` media URLs.
"""
import asyncio
import json
from pathlib import Path
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

from twscrape import API, gather, Account
from playwright.async_api import async_playwright

# Browser-exported cookie dump (JSON list of {name, value, ...} objects)
# expected to sit next to this module.
COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"

# Requests blocked during scraping purely to speed up page loads.
_BLOCKED_RESOURCE_TYPES = frozenset({"font", "stylesheet"})
_BLOCKED_URL_FRAGMENTS = ("analytics", "googletagmanager", "ads-twitter")


def _as_large(src: str) -> str:
    """Return *src* with its ``name`` query parameter forced to ``large``.

    Robust replacement for the old ``src.split("&name=")[0] + "&name=large"``
    hack, which silently produced a duplicate ``name`` parameter whenever
    ``name`` was the URL's *first* query parameter (``?name=small``) and
    depended on parameter order in general.
    """
    scheme, netloc, path, query, fragment = urlsplit(src)
    # Drop any existing 'name' value, keep everything else in order.
    params = [(k, v) for k, v in parse_qsl(query) if k != "name"]
    params.append(("name", "large"))
    return urlunsplit((scheme, netloc, path, urlencode(params), fragment))


async def _filter_route(route, request):
    """Abort requests for blocked resource types / tracking hosts; else pass."""
    blocked = request.resource_type in _BLOCKED_RESOURCE_TYPES or any(
        fragment in request.url for fragment in _BLOCKED_URL_FRAGMENTS
    )
    if blocked:
        await route.abort()
    else:
        await route.continue_()


async def extract_media_urls(page, tweet_url):
    """Visit *tweet_url* on *page* and return its de-duplicated media URLs.

    Parameters
    ----------
    page : playwright.async_api.Page
        An already-open Playwright page, reused across tweets.
    tweet_url : str
        Canonical URL of the tweet to scrape.

    Returns
    -------
    list[str]
        Unique media image URLs rewritten to the ``name=large`` variant.
        Empty on any navigation/scraping error — a single bad tweet page
        must not abort the whole run (best-effort by design).
    """
    print(f"๐Ÿ” Visiting tweet page: {tweet_url}")
    try:
        await page.goto(tweet_url, timeout=15000)
        # Brief settle time so lazily-loaded <img> elements appear in the DOM.
        await page.wait_for_timeout(1000)
        media_elements = await page.query_selector_all(
            "img[src*='pbs.twimg.com/media']"
        )
        urls = set()
        for element in media_elements:
            src = await element.get_attribute("src")
            if src:
                urls.add(_as_large(src))
        print(f"๐Ÿ–ผ๏ธ Found {len(urls)} media URLs on tweet: {tweet_url}")
        return list(urls)
    except Exception as e:
        # Deliberate broad catch: log and continue with the next tweet.
        print(f"โŒ Playwright error on {tweet_url}: {e}")
        return []


async def fetch_miku_tweets(limit=5):
    """Search for Miku tweets and return those with scrapeable media.

    Parameters
    ----------
    limit : int, optional
        Maximum number of tweets requested from the search (default 5).

    Returns
    -------
    list[dict]
        One dict per tweet that yielded at least one media URL, with keys
        ``username``, ``text``, ``url`` and ``media`` (list of image URLs).
    """
    # Flatten the exported cookie list into a single Cookie header string.
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
    cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)

    # Register a cookie-authenticated account with twscrape; the password /
    # email fields are placeholders and are not used for cookie auth.
    api = API()
    await api.pool.add_account(
        username="HSankyuu39",
        password="x",  # placeholder (won't be used)
        email="x",  # optional
        email_password="x",  # optional
        cookies=cookie_header,
    )
    await api.pool.login_all()

    print(f"๐Ÿ”Ž Searching for Miku tweets (limit={limit})...")
    # NOTE(review): 'has:images' and 'after:2025' look like API-v2 operators;
    # the web-search equivalents are 'filter:images' / 'since:'. Left as-is
    # pending confirmation against twscrape's search backend.
    query = 'Hatsune Miku OR ๅˆ้ŸณใƒŸใ‚ฏ has:images after:2025'
    tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))
    print(f"๐Ÿ“„ Found {len(tweets)} tweets, launching browser...")

    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        try:
            context = await browser.new_context()
            # Block fonts/stylesheets and tracker hosts to speed up loads.
            await context.route("**/*", _filter_route)
            page = await context.new_page()

            results = []
            for i, tweet in enumerate(tweets, 1):
                username = tweet.user.username
                tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
                print(f"๐Ÿงต Processing tweet {i}/{len(tweets)} from @{username}")
                media_urls = await extract_media_urls(page, tweet_url)
                if media_urls:
                    results.append({
                        "username": username,
                        "text": tweet.rawContent,
                        "url": tweet_url,
                        "media": media_urls,
                    })
        finally:
            # Ensure the browser dies even if context/page setup raised;
            # the original skipped close() on any such failure.
            await browser.close()

    print(f"โœ… Finished! Returning {len(results)} tweet(s) with media.")
    return results