# utils/twitter_fetcher.py
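"""Fetch recent Hatsune Miku tweets via twscrape (cookie-based auth, no
password login), then visit each tweet with Playwright to scrape its
image URLs."""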
import asyncio
import json
from pathlib import Path

from playwright.async_api import async_playwright
from twscrape import API, gather

COOKIE_PATH = Path(__file__).parent / "x.com.cookies.json"
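# COOKIE_PATH is assumed to be a browser-extension cookie export for x.com:
# a JSON list of objects with at least "name" and "value" keys (any extra
# keys are ignored below).
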
async def extract_media_urls(page, tweet_url):
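    """Open tweet_url in the given Playwright page and collect image URLs.

    Returns a list of pbs.twimg.com media URLs rewritten to their "large"
    variant, or an empty list if the page fails to load.
    """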
print(f"🔍 Visiting tweet page: {tweet_url}")
try:
await page.goto(tweet_url, timeout=15000)
await page.wait_for_timeout(1000)
media_elements = await page.query_selector_all("img[src*='pbs.twimg.com/media']")
urls = set()
for element in media_elements:
src = await element.get_attribute("src")
if src:
cleaned = src.split("&name=")[0] + "&name=large"
urls.add(cleaned)
print(f"🖼️ Found {len(urls)} media URLs on tweet: {tweet_url}")
return list(urls)
except Exception as e:
print(f"❌ Playwright error on {tweet_url}: {e}")
return []
async def fetch_miku_tweets(limit=5):
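    """Search X for recent Hatsune Miku tweets and scrape their images.

    Returns a list of dicts with "username", "text", "url", and "media"
    (the image URLs) for each tweet that had at least one image.
    """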
    # Load cookies exported from a logged-in x.com session.
    with open(COOKIE_PATH, "r", encoding="utf-8") as f:
        cookie_list = json.load(f)
    # Serialize to the "name=value; name=value" form of an HTTP Cookie header.
    cookie_header = "; ".join(f"{c['name']}={c['value']}" for c in cookie_list)

    # Register the account with twscrape; since cookies are supplied, the
    # password/email fields are placeholders and no credential login happens.
    api = API()
    await api.pool.add_account(
        username="HSankyuu39",
        password="x",  # placeholder (won't be used)
        email="x",  # optional
        email_password="x",  # optional
        cookies=cookie_header,
    )
    await api.pool.login_all()
print(f"🔎 Searching for Miku tweets (limit={limit})...")
query = 'Hatsune Miku OR 初音ミク has:images after:2025'
tweets = await gather(api.search(query, limit=limit, kv={"product": "Top"}))
print(f"📄 Found {len(tweets)} tweets, launching browser...")
    async with async_playwright() as p:
        browser = await p.firefox.launch(headless=True)
        context = await browser.new_context()

        # Skip fonts, stylesheets, and tracker requests to speed up page loads.
        # An async handler is used so abort/continue_ are properly awaited.
        async def block_noise(route, request):
            if request.resource_type in ("font", "stylesheet") or any(
                marker in request.url
                for marker in ("analytics", "googletagmanager", "ads-twitter")
            ):
                await route.abort()
            else:
                await route.continue_()

        await context.route("**/*", block_noise)
        page = await context.new_page()
        results = []
        for i, tweet in enumerate(tweets, 1):
            username = tweet.user.username
            tweet_url = f"https://twitter.com/{username}/status/{tweet.id}"
            print(f"🧵 Processing tweet {i}/{len(tweets)} from @{username}")
            media_urls = await extract_media_urls(page, tweet_url)
            if media_urls:
                results.append({
                    "username": username,
                    "text": tweet.rawContent,
                    "url": tweet_url,
                    "media": media_urls,
                })
        await browser.close()

    print(f"✅ Finished! Returning {len(results)} tweet(s) with media.")
    return results
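

if __name__ == "__main__":
    # Minimal smoke test, assuming x.com.cookies.json sits next to this file:
    # prints each fetched tweet's URL followed by its media links.
    async def _demo():
        for item in await fetch_miku_tweets(limit=3):
            print(item["url"], "->", ", ".join(item["media"]))

    asyncio.run(_demo())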