├── .gitattributes ├── LICENSE ├── manual-scrape.js ├── auto-scrape-batch.js ├── scrape-with-original-tweet.js └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Arjun Aditya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /manual-scrape.js: -------------------------------------------------------------------------------- 1 | (() => { 2 | const scraped = new Set(); 3 | const results = []; 4 | 5 | const extractTweets = () => { 6 | const articles = document.querySelectorAll("article"); 7 | 8 | articles.forEach((article) => { 9 | const textEl = article.querySelector('div[data-testid="tweetText"]'); 10 | const userEl = article.querySelector('div[dir="ltr"] > span'); 11 | 12 | const statGroup = article.querySelector('div[role="group"]'); 13 | if (!statGroup) return; 14 | 15 | let replies = null, reposts = null, likes = null, views = null; 16 | 17 | const statElements = statGroup.querySelectorAll('[aria-label]'); 18 | statElements.forEach((el) => { 19 | const label = el.getAttribute("aria-label")?.toLowerCase() || ""; 20 | const match = label.match(/([\d.,Kk]+)/); 21 | const value = match ? match[1].replace(/,/g, "") : null; 22 | 23 | if (label.includes("reply")) replies = value; 24 | else if (label.includes("repost")) reposts = value; 25 | else if (label.includes("like")) likes = value; 26 | else if (label.includes("view")) views = value; 27 | }); 28 | 29 | const text = textEl?.innerText?.trim(); 30 | const username = userEl?.innerText?.trim(); 31 | 32 | if (text && username) { 33 | const id = `${username}::${text}`; 34 | if (!scraped.has(id)) { 35 | scraped.add(id); 36 | results.push({ username, text, replies, reposts, likes, views }); 37 | console.log(`@${username} — 💬 ${replies} 🔁 ${reposts} ❤️ ${likes} 👁️ ${views}\n> ${text}`); 38 | } 39 | } 40 | }); 41 | }; 42 | 43 | extractTweets(); 44 | 45 | const observer = new MutationObserver(() => { 46 | extractTweets(); 47 | }); 48 | 49 | observer.observe(document.body, { childList: true, subtree: true }); 50 | 51 | console.log("Scraper is live... 
just keep scrolling!"); 52 | console.log("Use `downloadTweets()` to save as json."); 53 | 54 | window.downloadTweets = () => { 55 | const blob = new Blob([JSON.stringify(results, null, 2)], { type: "application/json" }); 56 | const url = URL.createObjectURL(blob); 57 | const a = document.createElement("a"); 58 | a.href = url; 59 | a.download = "tweets_with_stats.json"; 60 | a.click(); 61 | URL.revokeObjectURL(url); 62 | const message = `Downloaded ${results.length} tweets as tweets_with_stats.json`; 63 | console.log(message); 64 | return message; 65 | }; 66 | 67 | })(); 68 | -------------------------------------------------------------------------------- /auto-scrape-batch.js: -------------------------------------------------------------------------------- 1 | (() => { 2 | window.currentChunk = []; 3 | const scraped = new Set(); 4 | let chunk = 1; 5 | const CHUNK_SIZE = 100; 6 | 7 | const saveChunk = () => { 8 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" }); 9 | const a = document.createElement("a"); 10 | a.href = URL.createObjectURL(blob); 11 | a.download = `tweets_${chunk++}.json`; 12 | a.click(); 13 | URL.revokeObjectURL(a.href); 14 | console.log(`💾 Saved ${CHUNK_SIZE} tweets as tweets_${chunk - 1}.json`); 15 | window.currentChunk = []; // 🔥 delete them from memory! 16 | }; 17 | 18 | const extractTweets = () => { 19 | const articles = document.querySelectorAll("article"); 20 | articles.forEach((article) => { 21 | const textEl = article.querySelector('div[data-testid="tweetText"]'); 22 | const userEl = article.querySelector('div[dir="ltr"] > span'); 23 | const statGroup = article.querySelector('div[role="group"]'); 24 | if (!textEl || !userEl || !statGroup) return; 25 | 26 | let replies = null, reposts = null, likes = null, views = null; 27 | statGroup.querySelectorAll('[aria-label]').forEach((el) => { 28 | const label = el.getAttribute("aria-label")?.toLowerCase() || ""; 29 | const value = label.match(/([\d.,Kk]+)/)?.[1]?.replace(/,/g, "") || null; 30 | if (label.includes("reply")) replies = value; 31 | else if (label.includes("repost")) reposts = value; 32 | else if (label.includes("like")) likes = value; 33 | else if (label.includes("view")) views = value; 34 | }); 35 | 36 | const text = textEl?.innerText?.trim(); 37 | const username = userEl?.innerText?.trim(); 38 | const id = `${username}::${text}`; 39 | if (text && username && !scraped.has(id)) { 40 | window.currentChunk.push({ username, text, replies, reposts, likes, views }); 41 | scraped.add(id); 42 | console.log(`[${window.currentChunk.length}] @${username}: ${text}`); 43 | if (window.currentChunk.length >= CHUNK_SIZE) saveChunk(); 44 | } 45 | }); 46 | }; 47 | 48 | const observer = new MutationObserver(() => extractTweets()); 49 | observer.observe(document.body, { childList: true, subtree: true }); 50 | 51 | window.scrollInterval = setInterval(() => window.scrollBy(0, 1000), 1500); 52 | 53 | window.stopScroll = () => { 54 | clearInterval(window.scrollInterval); 55 | if (window.currentChunk.length > 0) { 56 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" }); 57 | const a = document.createElement("a"); 58 | a.href = URL.createObjectURL(blob); 59 | a.download = `tweets_final_${window.currentChunk.length}.json`; 60 | a.click(); 61 | URL.revokeObjectURL(a.href); 62 | console.log("🛑 Final partial chunk saved."); 63 | } else { 64 | console.log("🛑 Stopped. 
No tweets left to save."); 65 | } 66 | }; 67 | 68 | console.log("🚀 Scraper started. Will auto-save every 100 tweets and flush memory each time."); 69 | })(); 70 | -------------------------------------------------------------------------------- /scrape-with-original-tweet.js: -------------------------------------------------------------------------------- 1 | (() => { 2 | window.currentChunk = []; 3 | const scraped = new Set(); 4 | let chunk = 1; 5 | const CHUNK_SIZE = 100; 6 | 7 | const saveChunk = () => { 8 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" }); 9 | const a = document.createElement("a"); 10 | a.href = URL.createObjectURL(blob); 11 | a.download = `tweets_${chunk++}.json`; 12 | a.click(); 13 | URL.revokeObjectURL(a.href); 14 | console.log(`💾 Saved ${CHUNK_SIZE} tweets as tweets_${chunk - 1}.json`); 15 | window.currentChunk = []; // 🔥 delete them from memory! 16 | }; 17 | 18 | const extractTweetId = (article) => { 19 | // Method 1: Try to find a link with tweet ID pattern 20 | const tweetLink = article.querySelector('a[href*="/status/"]'); 21 | if (tweetLink) { 22 | const href = tweetLink.getAttribute('href'); 23 | const match = href.match(/\/status\/(\d+)/); 24 | if (match) return match[1]; 25 | } 26 | 27 | // Method 2: Try to find time element with datetime attribute 28 | const timeEl = article.querySelector('time'); 29 | if (timeEl) { 30 | const nearestLink = timeEl.closest('a') || timeEl.parentElement?.querySelector('a'); 31 | if (nearestLink) { 32 | const href = nearestLink.getAttribute('href'); 33 | const match = href?.match(/\/status\/(\d+)/); 34 | if (match) return match[1]; 35 | } 36 | } 37 | 38 | // Method 3: Search all links in the article for status pattern 39 | const allLinks = article.querySelectorAll('a[href]'); 40 | for (const link of allLinks) { 41 | const href = link.getAttribute('href'); 42 | const match = href?.match(/\/status\/(\d+)/); 43 | if (match) return match[1]; 44 | } 45 | 46 | return null; 47 | }; 48 | 49 | const extractUsername = (article) => { 50 | // Method 1: Try to extract from any link that contains a username pattern 51 | const links = article.querySelectorAll('a[href]'); 52 | for (const link of links) { 53 | const href = link.getAttribute('href'); 54 | // Look for pattern like /username or /username/status/... 
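(i.e. grab the first path segment of a relative href; reserved routes like status, search, and home are filtered out just below)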
55 | const match = href?.match(/^\/([^\/]+)(?:\/|$)/); 56 | if (match && match[1] && !match[1].includes('status') && !match[1].includes('search') && !match[1].includes('home')) { 57 | return match[1]; 58 | } 59 | } 60 | 61 | // Method 2: Look for elements that might contain @username 62 | const spanElements = article.querySelectorAll('span'); 63 | for (const span of spanElements) { 64 | const text = span.innerText?.trim(); 65 | if (text && text.startsWith('@')) { 66 | return text.substring(1); // Remove @ symbol 67 | } 68 | } 69 | 70 | // Method 3: Try to find username in data attributes or other patterns 71 | const userLinks = article.querySelectorAll('a[href*="/"]'); 72 | for (const link of userLinks) { 73 | const href = link.getAttribute('href'); 74 | if (href?.startsWith('/') && !href.includes('/status/') && !href.includes('/search') && !href.includes('/home')) { 75 | const username = href.substring(1).split('/')[0]; 76 | if (username && username.length > 0 && !username.includes('?')) { 77 | return username; 78 | } 79 | } 80 | } 81 | 82 | return null; 83 | }; 84 | 85 | const extractDisplayName = (article) => { 86 | // Try to find the display name (full name) 87 | const nameSelectors = [ 88 | 'div[dir="ltr"] > span', 89 | 'a[role="link"] span', 90 | 'div[data-testid="User-Name"] span' 91 | ]; 92 | 93 | for (const selector of nameSelectors) { 94 | const element = article.querySelector(selector); 95 | if (element && element.innerText?.trim()) { 96 | const text = element.innerText.trim(); 97 | // Make sure it's not a username (doesn't start with @) 98 | if (!text.startsWith('@')) { 99 | return text; 100 | } 101 | } 102 | } 103 | 104 | return null; 105 | }; 106 | 107 | const extractTweets = () => { 108 | const articles = document.querySelectorAll("article"); 109 | articles.forEach((article) => { 110 | const textEl = article.querySelector('div[data-testid="tweetText"]'); 111 | const statGroup = article.querySelector('div[role="group"]'); 112 | 113 | if (!textEl || !statGroup) return; 114 | 115 | // Extract engagement stats 116 | let replies = null, reposts = null, likes = null, views = null; 117 | statGroup.querySelectorAll('[aria-label]').forEach((el) => { 118 | const label = el.getAttribute("aria-label")?.toLowerCase() || ""; 119 | const value = label.match(/([\d.,Kk]+)/)?.[1]?.replace(/,/g, "") || null; 120 | if (label.includes("reply")) replies = value; 121 | else if (label.includes("repost")) reposts = value; 122 | else if (label.includes("like")) likes = value; 123 | else if (label.includes("view")) views = value; 124 | }); 125 | 126 | // Extract basic info 127 | const text = textEl?.innerText?.trim(); 128 | const username = extractUsername(article); 129 | const displayName = extractDisplayName(article); 130 | const tweetId = extractTweetId(article); 131 | 132 | // Create tweet URL if we have the ID and username 133 | let tweetUrl = null; 134 | if (tweetId && username) { 135 | tweetUrl = `https://x.com/${username}/status/${tweetId}`; 136 | } 137 | 138 | const id = `${username}::${text}`; 139 | 140 | if (text && username && !scraped.has(id)) { 141 | const tweetData = { 142 | username, 143 | displayName, 144 | text, 145 | replies, 146 | reposts, 147 | likes, 148 | views, 149 | tweetId, 150 | tweetUrl 151 | }; 152 | 153 | window.currentChunk.push(tweetData); 154 | scraped.add(id); 155 | console.log(`[${window.currentChunk.length}] @${username} (${displayName}): ${text}`); 156 | if (tweetUrl) console.log(` 🔗 ${tweetUrl}`); 157 | 158 | if (window.currentChunk.length >= CHUNK_SIZE) 
saveChunk();
159 | }
160 | });
161 | };
162 | 
163 | const observer = new MutationObserver(() => extractTweets());
164 | observer.observe(document.body, { childList: true, subtree: true });
165 | 
166 | window.scrollInterval = setInterval(() => window.scrollBy(0, 1000), 1500);
167 | 
168 | window.stopScroll = () => {
169 | clearInterval(window.scrollInterval);
170 | observer.disconnect(); // Stop observing when done
171 | 
172 | if (window.currentChunk.length > 0) {
173 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" });
174 | const a = document.createElement("a");
175 | a.href = URL.createObjectURL(blob);
176 | a.download = `tweets_final_${window.currentChunk.length}.json`;
177 | a.click();
178 | URL.revokeObjectURL(a.href);
179 | console.log("🛑 Final partial chunk saved.");
180 | } else {
181 | console.log("🛑 Stopped. No tweets left to save.");
182 | }
183 | };
184 | 
185 | console.log("🚀 Enhanced scraper started. Will auto-save every 100 tweets with tweet URLs!");
186 | console.log("📝 Each tweet now includes: username, displayName, text, engagement stats, tweetId, and tweetUrl");
187 | console.log("⏹️ Call window.stopScroll() to stop and save remaining tweets");
188 | })();
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Scrape Tweets while Scrolling
2 | 
3 | Scrape tweets, complete with engagement stats, straight from your browser console while you scroll.
4 | 
5 | 
6 | 
7 | 
8 | 1. Open Chrome
9 | 
10 | 
11 | 2. Go to X.com
12 | 
13 | 
14 | 3. Open your browser console and paste:
15 | 
16 | ```js
17 | (() => {
18 | const scraped = new Set();
19 | const results = [];
20 | 
21 | const extractTweets = () => {
22 | const articles = document.querySelectorAll("article");
23 | 
24 | articles.forEach((article) => {
25 | const textEl = article.querySelector('div[data-testid="tweetText"]');
26 | const userEl = article.querySelector('div[dir="ltr"] > span');
27 | 
28 | const statGroup = article.querySelector('div[role="group"]');
29 | if (!statGroup) return;
30 | 
31 | let replies = null, reposts = null, likes = null, views = null;
32 | 
33 | const statElements = statGroup.querySelectorAll('[aria-label]');
34 | statElements.forEach((el) => {
35 | const label = el.getAttribute("aria-label")?.toLowerCase() || "";
36 | const match = label.match(/([\d.,Kk]+)/);
37 | const value = match ? match[1].replace(/,/g, "") : null;
38 | 
39 | if (label.includes("reply")) replies = value;
40 | else if (label.includes("repost")) reposts = value;
41 | else if (label.includes("like")) likes = value;
42 | else if (label.includes("view")) views = value;
43 | });
44 | 
45 | const text = textEl?.innerText?.trim();
46 | const username = userEl?.innerText?.trim();
47 | 
48 | if (text && username) {
49 | const id = `${username}::${text}`;
50 | if (!scraped.has(id)) {
51 | scraped.add(id);
52 | results.push({ username, text, replies, reposts, likes, views });
53 | console.log(`@${username} — 💬 ${replies} 🔁 ${reposts} ❤️ ${likes} 👁️ ${views}\n> ${text}`);
54 | }
55 | }
56 | });
57 | };
58 | 
59 | extractTweets();
60 | 
61 | const observer = new MutationObserver(() => {
62 | extractTweets();
63 | });
64 | 
65 | observer.observe(document.body, { childList: true, subtree: true });
66 | 
67 | console.log("Scraper is live... 
just keep scrolling!"); 68 | console.log("Use `downloadTweets()` to save as json."); 69 | 70 | window.downloadTweets = () => { 71 | const blob = new Blob([JSON.stringify(results, null, 2)], { type: "application/json" }); 72 | const url = URL.createObjectURL(blob); 73 | const a = document.createElement("a"); 74 | a.href = url; 75 | a.download = "tweets_with_stats.json"; 76 | a.click(); 77 | URL.revokeObjectURL(url); 78 | const message = `Downloaded ${results.length} tweets as tweets_with_stats.json`; 79 | console.log(message); 80 | return message; 81 | }; 82 | 83 | })(); 84 | ``` 85 | 86 | Voila you're done 87 | 88 | download via thia 89 | ```js 90 | downloadTweets() 91 | ``` 92 | 93 | 94 | very random but this the graphql endpoint 95 | 96 |
97 | X Graphql Endpoint 98 | 99 | ```bash 100 | https://x.com/i/api/graphql/0uQE4rvNofAr4pboHOZWVA/UserTweets?variables={ 101 | "userId": "1654221044503408640", 102 | "count": 20, 103 | "includePromotedContent": true, 104 | "withQuickPromoteEligibilityTweetFields": true, 105 | "withVoice": true 106 | }&features={ 107 | "rweb_video_screen_enabled": false, 108 | "payments_enabled": false, 109 | "profile_label_improvements_pcf_label_in_post_enabled": true, 110 | "rweb_tipjar_consumption_enabled": true, 111 | "verified_phone_label_enabled": true, 112 | "creator_subscriptions_tweet_preview_api_enabled": true, 113 | "responsive_web_graphql_timeline_navigation_enabled": true, 114 | "responsive_web_graphql_skip_user_profile_image_extensions_enabled": false, 115 | "premium_content_api_read_enabled": false, 116 | "communities_web_enable_tweet_community_results_fetch": true, 117 | "c9s_tweet_anatomy_moderator_badge_enabled": true, 118 | "responsive_web_grok_analyze_button_fetch_trends_enabled": false, 119 | "responsive_web_grok_analyze_post_followups_enabled": true, 120 | "responsive_web_jetfuel_frame": true, 121 | "responsive_web_grok_share_attachment_enabled": true, 122 | "articles_preview_enabled": true, 123 | "responsive_web_edit_tweet_api_enabled": true, 124 | "graphql_is_translatable_rweb_tweet_is_translatable_enabled": true, 125 | "view_counts_everywhere_api_enabled": true, 126 | "longform_notetweets_consumption_enabled": true, 127 | "responsive_web_twitter_article_tweet_consumption_enabled": true, 128 | "tweet_awards_web_tipping_enabled": false, 129 | "responsive_web_grok_show_grok_translated_post": false, 130 | "responsive_web_grok_analysis_button_from_backend": true, 131 | "creator_subscriptions_quote_tweet_preview_enabled": false, 132 | "freedom_of_speech_not_reach_fetch_enabled": true, 133 | "standardized_nudges_misinfo": true, 134 | "tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled": true, 135 | "longform_notetweets_rich_text_read_enabled": true, 136 | "longform_notetweets_inline_media_enabled": true, 137 | "responsive_web_grok_image_annotation_enabled": true, 138 | "responsive_web_grok_community_note_auto_translation_is_enabled": false, 139 | "responsive_web_enhance_cards_enabled": false 140 | }&fieldToggles={ 141 | "withArticlePlainText": false 142 | } 143 | ``` 144 | 145 | ```bash 146 | curl 'https://x.com/i/api/graphql/0uQE4rvNofAr4pboHOZWVA/UserTweets?variables=...' \ 147 | -H 'authorization: Bearer AAAAAAAAAAAAAAAAANRegergerAAAAAnNwIzUejRCOuH5...' \ 148 | -H 'x-csrf-token: ' \ 149 | -H 'cookie: auth_token=...; ct0=...' \ 150 | -H 'x-twitter-auth-type: OAuth2Session' \ 151 | -H 'x-twitter-active-user: yes' 152 | ``` 153 | 154 |
155 | 156 | you can do whatever the heck u want wit this info and pls use your web console it's love 157 | 158 | alsooooo 159 | 160 | Most likely i'm banned from twitter for this basic thing or maybe just winning + freedom of speech is a joke and flawed with their own standards. 161 | 162 | But again use this for educational purposes only and don't misuse this but one of my main reason to build this is to replicate a persona of my fav twitter creators and write tweets like them :3 163 | 164 | Wait are you lazy? You need Auto Scroll 165 | 166 | ### Auto Scroll with Batch Scraping 167 | 168 | 1. first step to start scraping 169 | ```js 170 | (() => { 171 | window.currentChunk = []; 172 | const scraped = new Set(); 173 | let chunk = 1; 174 | const CHUNK_SIZE = 100; 175 | 176 | const saveChunk = () => { 177 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" }); 178 | const a = document.createElement("a"); 179 | a.href = URL.createObjectURL(blob); 180 | a.download = `tweets_${chunk++}.json`; 181 | a.click(); 182 | URL.revokeObjectURL(a.href); 183 | console.log(`💾 Saved ${CHUNK_SIZE} tweets as tweets_${chunk - 1}.json`); 184 | window.currentChunk = []; // 🔥 delete them from memory! 185 | }; 186 | 187 | const extractTweets = () => { 188 | const articles = document.querySelectorAll("article"); 189 | articles.forEach((article) => { 190 | const textEl = article.querySelector('div[data-testid="tweetText"]'); 191 | const userEl = article.querySelector('div[dir="ltr"] > span'); 192 | const statGroup = article.querySelector('div[role="group"]'); 193 | if (!textEl || !userEl || !statGroup) return; 194 | 195 | let replies = null, reposts = null, likes = null, views = null; 196 | statGroup.querySelectorAll('[aria-label]').forEach((el) => { 197 | const label = el.getAttribute("aria-label")?.toLowerCase() || ""; 198 | const value = label.match(/([\d.,Kk]+)/)?.[1]?.replace(/,/g, "") || null; 199 | if (label.includes("reply")) replies = value; 200 | else if (label.includes("repost")) reposts = value; 201 | else if (label.includes("like")) likes = value; 202 | else if (label.includes("view")) views = value; 203 | }); 204 | 205 | const text = textEl?.innerText?.trim(); 206 | const username = userEl?.innerText?.trim(); 207 | const id = `${username}::${text}`; 208 | if (text && username && !scraped.has(id)) { 209 | window.currentChunk.push({ username, text, replies, reposts, likes, views }); 210 | scraped.add(id); 211 | console.log(`[${window.currentChunk.length}] @${username}: ${text}`); 212 | if (window.currentChunk.length >= CHUNK_SIZE) saveChunk(); 213 | } 214 | }); 215 | }; 216 | 217 | const observer = new MutationObserver(() => extractTweets()); 218 | observer.observe(document.body, { childList: true, subtree: true }); 219 | 220 | window.scrollInterval = setInterval(() => window.scrollBy(0, 1000), 1500); 221 | 222 | window.stopScroll = () => { 223 | clearInterval(window.scrollInterval); 224 | if (window.currentChunk.length > 0) { 225 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" }); 226 | const a = document.createElement("a"); 227 | a.href = URL.createObjectURL(blob); 228 | a.download = `tweets_final_${window.currentChunk.length}.json`; 229 | a.click(); 230 | URL.revokeObjectURL(a.href); 231 | console.log("🛑 Final partial chunk saved."); 232 | } else { 233 | console.log("🛑 Stopped. No tweets left to save."); 234 | } 235 | }; 236 | 237 | console.log("🚀 Scraper started. 
Will auto-save every 100 tweets and flush memory each time.");
238 | })();
239 | ```
240 | 
241 | 2. Stop scroll
242 | ```js
243 | stopScroll();
244 | ```
245 | 
246 | This downloads any tweets still sitting in memory as a final chunk. (If the run produced several chunk files, see the merge sketch at the end of this README for combining them.)
247 | 
248 | 
249 | 
250 | 
251 | 3. Cleanup (reset everything) ~ optional
252 | 
253 | ```js
254 | delete window.currentChunk;
255 | delete window.scrollInterval;
256 | delete window.stopScroll;
257 | ```
258 | 
259 | 
260 | 
261 | 
262 | ## Want the original tweet URL and the username in the scraped data?
263 | > Try this:
264 | ```js
265 | 
266 | (() => {
267 | window.currentChunk = [];
268 | const scraped = new Set();
269 | let chunk = 1;
270 | const CHUNK_SIZE = 100;
271 | 
272 | const saveChunk = () => {
273 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" });
274 | const a = document.createElement("a");
275 | a.href = URL.createObjectURL(blob);
276 | a.download = `tweets_${chunk++}.json`;
277 | a.click();
278 | URL.revokeObjectURL(a.href);
279 | console.log(`💾 Saved ${CHUNK_SIZE} tweets as tweets_${chunk - 1}.json`);
280 | window.currentChunk = []; // 🔥 delete them from memory!
281 | };
282 | 
283 | const extractTweetId = (article) => {
284 | // Method 1: Try to find a link with tweet ID pattern
285 | const tweetLink = article.querySelector('a[href*="/status/"]');
286 | if (tweetLink) {
287 | const href = tweetLink.getAttribute('href');
288 | const match = href.match(/\/status\/(\d+)/);
289 | if (match) return match[1];
290 | }
291 | 
292 | // Method 2: Try to find time element with datetime attribute
293 | const timeEl = article.querySelector('time');
294 | if (timeEl) {
295 | const nearestLink = timeEl.closest('a') || timeEl.parentElement?.querySelector('a');
296 | if (nearestLink) {
297 | const href = nearestLink.getAttribute('href');
298 | const match = href?.match(/\/status\/(\d+)/);
299 | if (match) return match[1];
300 | }
301 | }
302 | 
303 | // Method 3: Search all links in the article for status pattern
304 | const allLinks = article.querySelectorAll('a[href]');
305 | for (const link of allLinks) {
306 | const href = link.getAttribute('href');
307 | const match = href?.match(/\/status\/(\d+)/);
308 | if (match) return match[1];
309 | }
310 | 
311 | return null;
312 | };
313 | 
314 | const extractUsername = (article) => {
315 | // Method 1: Try to extract from any link that contains a username pattern
316 | const links = article.querySelectorAll('a[href]');
317 | for (const link of links) {
318 | const href = link.getAttribute('href');
319 | // Look for pattern like /username or /username/status/... 
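(i.e. grab the first path segment of a relative href; reserved routes like status, search, and home are filtered out just below)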
320 | const match = href?.match(/^\/([^\/]+)(?:\/|$)/); 321 | if (match && match[1] && !match[1].includes('status') && !match[1].includes('search') && !match[1].includes('home')) { 322 | return match[1]; 323 | } 324 | } 325 | 326 | // Method 2: Look for elements that might contain @username 327 | const spanElements = article.querySelectorAll('span'); 328 | for (const span of spanElements) { 329 | const text = span.innerText?.trim(); 330 | if (text && text.startsWith('@')) { 331 | return text.substring(1); // Remove @ symbol 332 | } 333 | } 334 | 335 | // Method 3: Try to find username in data attributes or other patterns 336 | const userLinks = article.querySelectorAll('a[href*="/"]'); 337 | for (const link of userLinks) { 338 | const href = link.getAttribute('href'); 339 | if (href?.startsWith('/') && !href.includes('/status/') && !href.includes('/search') && !href.includes('/home')) { 340 | const username = href.substring(1).split('/')[0]; 341 | if (username && username.length > 0 && !username.includes('?')) { 342 | return username; 343 | } 344 | } 345 | } 346 | 347 | return null; 348 | }; 349 | 350 | const extractDisplayName = (article) => { 351 | // Try to find the display name (full name) 352 | const nameSelectors = [ 353 | 'div[dir="ltr"] > span', 354 | 'a[role="link"] span', 355 | 'div[data-testid="User-Name"] span' 356 | ]; 357 | 358 | for (const selector of nameSelectors) { 359 | const element = article.querySelector(selector); 360 | if (element && element.innerText?.trim()) { 361 | const text = element.innerText.trim(); 362 | // Make sure it's not a username (doesn't start with @) 363 | if (!text.startsWith('@')) { 364 | return text; 365 | } 366 | } 367 | } 368 | 369 | return null; 370 | }; 371 | 372 | const extractTweets = () => { 373 | const articles = document.querySelectorAll("article"); 374 | articles.forEach((article) => { 375 | const textEl = article.querySelector('div[data-testid="tweetText"]'); 376 | const statGroup = article.querySelector('div[role="group"]'); 377 | 378 | if (!textEl || !statGroup) return; 379 | 380 | // Extract engagement stats 381 | let replies = null, reposts = null, likes = null, views = null; 382 | statGroup.querySelectorAll('[aria-label]').forEach((el) => { 383 | const label = el.getAttribute("aria-label")?.toLowerCase() || ""; 384 | const value = label.match(/([\d.,Kk]+)/)?.[1]?.replace(/,/g, "") || null; 385 | if (label.includes("reply")) replies = value; 386 | else if (label.includes("repost")) reposts = value; 387 | else if (label.includes("like")) likes = value; 388 | else if (label.includes("view")) views = value; 389 | }); 390 | 391 | // Extract basic info 392 | const text = textEl?.innerText?.trim(); 393 | const username = extractUsername(article); 394 | const displayName = extractDisplayName(article); 395 | const tweetId = extractTweetId(article); 396 | 397 | // Create tweet URL if we have the ID and username 398 | let tweetUrl = null; 399 | if (tweetId && username) { 400 | tweetUrl = `https://x.com/${username}/status/${tweetId}`; 401 | } 402 | 403 | const id = `${username}::${text}`; 404 | 405 | if (text && username && !scraped.has(id)) { 406 | const tweetData = { 407 | username, 408 | displayName, 409 | text, 410 | replies, 411 | reposts, 412 | likes, 413 | views, 414 | tweetId, 415 | tweetUrl 416 | }; 417 | 418 | window.currentChunk.push(tweetData); 419 | scraped.add(id); 420 | console.log(`[${window.currentChunk.length}] @${username} (${displayName}): ${text}`); 421 | if (tweetUrl) console.log(` 🔗 ${tweetUrl}`); 422 | 423 | if 
(window.currentChunk.length >= CHUNK_SIZE) saveChunk();
424 | }
425 | });
426 | };
427 | 
428 | const observer = new MutationObserver(() => extractTweets());
429 | observer.observe(document.body, { childList: true, subtree: true });
430 | 
431 | window.scrollInterval = setInterval(() => window.scrollBy(0, 1000), 1500);
432 | 
433 | window.stopScroll = () => {
434 | clearInterval(window.scrollInterval);
435 | observer.disconnect(); // Stop observing when done
436 | 
437 | if (window.currentChunk.length > 0) {
438 | const blob = new Blob([JSON.stringify(window.currentChunk, null, 2)], { type: "application/json" });
439 | const a = document.createElement("a");
440 | a.href = URL.createObjectURL(blob);
441 | a.download = `tweets_final_${window.currentChunk.length}.json`;
442 | a.click();
443 | URL.revokeObjectURL(a.href);
444 | console.log("🛑 Final partial chunk saved.");
445 | } else {
446 | console.log("🛑 Stopped. No tweets left to save.");
447 | }
448 | };
449 | 
450 | console.log("🚀 Enhanced scraper started. Will auto-save every 100 tweets with tweet URLs!");
451 | console.log("📝 Each tweet now includes: username, displayName, text, engagement stats, tweetId, and tweetUrl");
452 | console.log("⏹️ Call window.stopScroll() to stop and save remaining tweets");
453 | })();
454 | ```
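455 | 
456 | ## Bonus: working with the scraped JSON
457 | 
458 | The engagement counts come out as display strings (`"1.2K"`, `"304"`, or `null`). Here's a minimal normalizing sketch; `parseCount` is just a name I picked, and it only understands `K`/`M` suffixes. (Heads-up: the scraper regexes above only capture digits and a `K`, so counts in the millions may already arrive truncated.)
459 | 
460 | ```js
461 | // Sketch: turn X display counts into plain numbers ("1.2K" -> 1200).
462 | const parseCount = (value) => {
463 |   if (value == null) return null;
464 |   const match = String(value).match(/^([\d.]+)([KkMm])?$/);
465 |   if (!match) return null;
466 |   const n = parseFloat(match[1]);
467 |   if (!match[2]) return n;
468 |   return Math.round(n * (match[2].toLowerCase() === "k" ? 1e3 : 1e6));
469 | };
470 | 
471 | parseCount("1.2K"); // 1200
472 | parseCount("304");  // 304
473 | ```
474 | 
475 | And if the batch scraper left you with a pile of `tweets_1.json`, `tweets_2.json`, ... plus a `tweets_final_*.json`, here's a rough Node sketch to merge and dedupe them. It assumes the chunk files sit together in one folder; `merge-tweets.js` is just a name I picked.
476 | 
477 | ```js
478 | // Sketch: node merge-tweets.js <folder-with-chunk-files>
479 | const fs = require("fs");
480 | const path = require("path");
481 | 
482 | const dir = process.argv[2] || ".";
483 | const files = fs
484 |   .readdirSync(dir)
485 |   .filter((f) => /^tweets_.*\.json$/.test(f) && f !== "tweets_merged.json");
486 | 
487 | const seen = new Set();
488 | const merged = [];
489 | for (const file of files) {
490 |   for (const tweet of JSON.parse(fs.readFileSync(path.join(dir, file), "utf8"))) {
491 |     const id = `${tweet.username}::${tweet.text}`; // same dedupe key as the scrapers
492 |     if (!seen.has(id)) {
493 |       seen.add(id);
494 |       merged.push(tweet);
495 |     }
496 |   }
497 | }
498 | 
499 | fs.writeFileSync(path.join(dir, "tweets_merged.json"), JSON.stringify(merged, null, 2));
500 | console.log(`Merged ${merged.length} unique tweets from ${files.length} files.`);
501 | ```
--------------------------------------------------------------------------------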