├── .gitattributes ├── .gitignore ├── README.md ├── scrapper.ipynb ├── scrapper.py ├── twitter-scraper-tut.ipynb └── twitter_scraper.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twitter-Scraper 2 | This is not a perfect scraper, so feel free to add improvements if you find any. 3 | 4 | IMPROVEMENTS: 5 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 6 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states (see the example below) 7 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session.
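For example, one of the hard-coded `sleep(5)` calls before grabbing the search box can be replaced with an explicit wait. This is only a minimal sketch, reusing the Edge driver setup and search-box XPath already present in `twitter_scraper.py`; the 10-second timeout is an arbitrary choice:

```python
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.get('https://twitter.com/search')

# Wait until the search box is actually present (up to 10 seconds)
# instead of sleeping for a fixed 5 seconds.
xpath_search = '//input[@aria-label="Search query"]'
search_input = WebDriverWait(driver, 10).until(
    expected_conditions.presence_of_element_located((By.XPATH, xpath_search))
)
search_input.send_keys('pysimplegui')
```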
8 | 9 | NOTES AND THINGS TO THINK ABOUT: 10 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 11 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 12 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 13 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 14 | sooner. 15 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 16 | many tweets are processed from each scroll. I've written more about this in the function docstring. 17 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 18 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 19 | time if specific conditions are not met. There are many other sections of this code that could be 20 | improved, I'm sure, by leveraging this class. 21 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 22 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 23 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 24 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 25 | etc... --> https://twitter.com/search-advanced? 26 | -------------------------------------------------------------------------------- /scrapper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import csv\n", 11 | "from getpass import getpass\n", 12 | "from time import sleep\n", 13 | "from selenium.webdriver.common.keys import Keys\n", 14 | "from selenium.common.exceptions import NoSuchElementException\n", 15 | "from msedge.selenium_tools import Edge, EdgeOptions " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "def get_tweet_data(card):\n", 25 | " \"\"\"Extract data from tweet card\"\"\"\n", 26 | " username = card.find_element_by_xpath('.//span').text\n", 27 | " try:\n", 28 | " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n", 29 | " except NoSuchElementException:\n", 30 | " return\n", 31 | " \n", 32 | " try:\n", 33 | " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n", 34 | " except NoSuchElementException:\n", 35 | " return\n", 36 | " \n", 37 | " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n", 38 | " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n", 39 | " text = comment + responding\n", 40 | " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n", 41 | " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n", 42 | " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n", 43 | "\n", 44 | " \n", 45 | " tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n", 46 | " return tweet " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55
| "search_term = input('search term: ')\n", 56 | "\n", 57 | "# create instance of web driver\n", 58 | "options = EdgeOptions()\n", 59 | "options.use_chromium = True\n", 60 | "driver = Edge(options=options)\n", 61 | "\n", 62 | "# navigate to login screen\n", 63 | "driver.get('https://twitter.com/search')\n", 64 | "driver.maximize_window()\n", 65 | "sleep(5)\n", 66 | "\n", 67 | "# find search input and search for term\n", 68 | "search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n", 69 | "search_input.send_keys(search_term)\n", 70 | "search_input.send_keys(Keys.RETURN)\n", 71 | "sleep(1)\n", 72 | "\n", 73 | "# navigate to historical 'latest' tab\n", 74 | "driver.find_element_by_link_text('Latest').click()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# get all tweets on the page\n", 84 | "data = []\n", 85 | "tweet_ids = set()\n", 86 | "last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 87 | "scrolling = True\n", 88 | "\n", 89 | "while scrolling:\n", 90 | " page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n", 91 | " for card in page_cards[-15:]:\n", 92 | " tweet = get_tweet_data(card)\n", 93 | " if tweet:\n", 94 | " tweet_id = ''.join(tweet)\n", 95 | " if tweet_id not in tweet_ids:\n", 96 | " tweet_ids.add(tweet_id)\n", 97 | " data.append(tweet)\n", 98 | " \n", 99 | " scroll_attempt = 0\n", 100 | " while True:\n", 101 | " # check scroll position\n", 102 | " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n", 103 | " sleep(2)\n", 104 | " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n", 105 | " if last_position == curr_position:\n", 106 | " scroll_attempt += 1\n", 107 | " \n", 108 | " # end of scroll region\n", 109 | " if scroll_attempt >= 3:\n", 110 | " scrolling = False\n", 111 | " break\n", 112 | " else:\n", 113 | " sleep(2) # attempt another scroll\n", 114 | " else:\n", 115 | " last_position = curr_position\n", 116 | " break\n", 117 | "\n", 118 | "# close the web driver\n", 119 | "driver.close()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Saving the tweet data" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n", 136 | " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Likes', 'Retweets']\n", 137 | " writer = csv.writer(f)\n", 138 | " writer.writerow(header)\n", 139 | " writer.writerows(data)" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "interpreter": { 145 | "hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138" 146 | }, 147 | "kernelspec": { 148 | "display_name": "Python 3.7.0 64-bit", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.0" 163 | }, 164 | "orig_nbformat": 4 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /scrapper.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | TITLE: 3 | A simple search-based twitter scraper 4 | LAST MODIFIED: 5 | 2022-01-14 6 | AUTHOR: 7 | Israel Dryer 8 | israel.dryer@gmail.com 9 | This is not a perfect scraper, so feel free to add improvements if you find any. 10 | IMPROVEMENTS: 11 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 12 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states 13 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session. 14 | NOTES AND THINGS TO THINK ABOUT: 15 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 16 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 17 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 18 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 19 | sooner. 20 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 21 | many tweets are processed from each scroll. I've written more about this in the function docstring. 22 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 23 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 24 | time if specific conditions are not met. There are many other sections of this code that could be 25 | improved, I'm sure, by leveraging this class. 26 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 27 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 28 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 29 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 30 | etc... --> https://twitter.com/search-advanced?
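EXAMPLE USAGE:
 - The values at the bottom of this file are equivalent to calling `main('pysimplegui', 'pysimplegui.csv', page_sort='Latest')`.
 - As a purely illustrative sketch of the advanced-search note above (this url is not taken from the project), a date-bounded
 'Latest' search built from that form ends up looking roughly like:
 https://twitter.com/search?q=pysimplegui%20since%3A2021-01-01%20until%3A2021-06-30&f=live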
31 | """ 32 | import csv 33 | from time import sleep 34 | from msedge.selenium_tools import Edge, EdgeOptions 35 | from selenium.webdriver.common.keys import Keys 36 | from selenium.webdriver.common.by import By 37 | from selenium.common import exceptions 38 | 39 | 40 | def create_webdriver_instance(): 41 | options = EdgeOptions() 42 | options.use_chromium = True 43 | driver = Edge(options=options) 44 | return driver 45 | 46 | 47 | def twitter_search(driver, search_term): 48 | url = 'https://twitter.com/search' 49 | driver.get(url) 50 | driver.maximize_window() 51 | sleep(5) 52 | 53 | search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]') 54 | search_input.send_keys(search_term) 55 | search_input.send_keys(Keys.RETURN) 56 | sleep(5) 57 | return True 58 | 59 | 60 | def change_page_sort(tab_name, driver): 61 | """Options for this program are `Latest` and `Top`""" 62 | tab = driver.find_element_by_link_text(tab_name) 63 | tab.click() 64 | xpath_tab_state = f'//a[contains(text(),\"{tab_name}\") and @aria-selected=\"true\"]' 65 | return xpath_tab_state 66 | 67 | 68 | def generate_tweet_id(tweet): 69 | return ''.join(tweet) 70 | 71 | 72 | def scroll_down_page(driver, last_position, num_seconds_to_load=0.5, scroll_attempt=0, max_attempts=5): 73 | """The function will try to scroll down the page and will check the current 74 | and last positions as an indicator. If the current and last positions are the same after `max_attempts` 75 | the assumption is that the end of the scroll region has been reached and the `end_of_scroll_region` 76 | flag will be returned as `True`""" 77 | end_of_scroll_region = False 78 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 79 | sleep(num_seconds_to_load) 80 | curr_position = driver.execute_script("return window.pageYOffset;") 81 | if curr_position == last_position: 82 | if scroll_attempt >= max_attempts:  # only give up after `max_attempts` stalled scrolls 83 | end_of_scroll_region = True 84 | else: 85 | return scroll_down_page(driver, last_position, num_seconds_to_load, scroll_attempt + 1, max_attempts) 86 | last_position = curr_position 87 | return last_position, end_of_scroll_region 88 | 89 | 90 | def save_tweet_data_to_csv(records, filepath, mode='a+'): 91 | header = ['User', 'Handle', 'PostDate', 'TweetText', 'ReplyCount', 'RetweetCount', 'LikeCount'] 92 | with open(filepath, mode=mode, newline='', encoding='utf-8') as f: 93 | writer = csv.writer(f) 94 | if mode == 'w': 95 | writer.writerow(header) 96 | if records: 97 | writer.writerow(records) 98 | 99 | 100 | def collect_all_tweets_from_current_view(driver, lookback_limit=25): 101 | """The page is continuously loaded, so as you scroll down the number of tweets returned by this function will 102 | continue to grow. To limit the risk of 're-processing' the same tweet over and over again, you can set the 103 | `lookback_limit` to only process the last `x` number of tweets extracted from the page in each iteration. 104 | You may need to play around with this number to get something that works for you.
I've set the default 105 | based on my computer settings and internet speed, etc...""" 106 | page_cards = driver.find_elements_by_xpath('//article[@data-testid="tweet"]') 107 | if len(page_cards) <= lookback_limit: 108 | return page_cards 109 | else: 110 | return page_cards[-lookback_limit:] 111 | 112 | 113 | def extract_data_from_current_tweet_card(card): 114 | try: 115 | user = card.find_element_by_xpath('.//span').text 116 | except exceptions.NoSuchElementException: 117 | user = "" 118 | except exceptions.StaleElementReferenceException: 119 | return 120 | try: 121 | handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text 122 | except exceptions.NoSuchElementException: 123 | handle = "" 124 | try: 125 | """ 126 | If there is no post date here, there it is usually sponsored content, or some 127 | other form of content where post dates do not apply. You can set a default value 128 | for the postdate on Exception if you which to keep this record. By default I am 129 | excluding these. 130 | """ 131 | postdate = card.find_element_by_xpath('.//time').get_attribute('datetime') 132 | except exceptions.NoSuchElementException: 133 | return 134 | try: 135 | _comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text 136 | except exceptions.NoSuchElementException: 137 | _comment = "" 138 | try: 139 | _responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text 140 | except exceptions.NoSuchElementException: 141 | _responding = "" 142 | tweet_text = _comment + _responding 143 | try: 144 | reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text 145 | except exceptions.NoSuchElementException: 146 | reply_count = "" 147 | try: 148 | retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text 149 | except exceptions.NoSuchElementException: 150 | retweet_count = "" 151 | try: 152 | like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text 153 | except exceptions.NoSuchElementException: 154 | like_count = "" 155 | 156 | tweet = (user, handle, postdate, tweet_text, reply_count, retweet_count, like_count) 157 | return tweet 158 | 159 | 160 | def main(search_term, filepath, page_sort='Latest'): 161 | save_tweet_data_to_csv(None, filepath, 'w') # create file for saving records 162 | last_position = None 163 | end_of_scroll_region = False 164 | unique_tweets = set() 165 | 166 | driver = create_webdriver_instance() 167 | twitter_search_page_term = twitter_search(driver, search_term) 168 | if not twitter_search_page_term: 169 | return 170 | 171 | change_page_sort(page_sort, driver) 172 | 173 | while not end_of_scroll_region: 174 | cards = collect_all_tweets_from_current_view(driver) 175 | for card in cards: 176 | try: 177 | tweet = extract_data_from_current_tweet_card(card) 178 | except exceptions.StaleElementReferenceException: 179 | continue 180 | if not tweet: 181 | continue 182 | tweet_id = generate_tweet_id(tweet) 183 | if tweet_id not in unique_tweets: 184 | unique_tweets.add(tweet_id) 185 | save_tweet_data_to_csv(tweet, filepath) 186 | last_position, end_of_scroll_region = scroll_down_page(driver, last_position) 187 | driver.quit() 188 | 189 | 190 | if __name__ == '__main__': 191 | path = 'pysimplegui.csv' 192 | term = 'pysimplegui' 193 | 194 | main(term, path) -------------------------------------------------------------------------------- /twitter-scraper-tut.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 
| "metadata": {}, 6 | "source": [ 7 | "# Twitter Scraper" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import re\n", 17 | "import csv\n", 18 | "from getpass import getpass\n", 19 | "from time import sleep\n", 20 | "from selenium.webdriver.common.keys import Keys\n", 21 | "from selenium.common.exceptions import NoSuchElementException\n", 22 | "from msedge.selenium_tools import Edge, EdgeOptions\n", 23 | "\n", 24 | "def get_tweet_data(card):\n", 25 | " \"\"\"Extract data from tweet card\"\"\"\n", 26 | " username = card.find_element_by_xpath('.//span').text\n", 27 | " try:\n", 28 | " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n", 29 | " except NoSuchElementException:\n", 30 | " return\n", 31 | " \n", 32 | " try:\n", 33 | " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n", 34 | " except NoSuchElementException:\n", 35 | " return\n", 36 | " \n", 37 | " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n", 38 | " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n", 39 | " text = comment + responding\n", 40 | " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n", 41 | " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n", 42 | " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n", 43 | " \n", 44 | " # get a string of all emojis contained in the tweet\n", 45 | " \"\"\"Emojis are stored as images... so I convert the filename, which is stored as unicode, into \n", 46 | " the emoji character.\"\"\"\n", 47 | " emoji_tags = card.find_elements_by_xpath('.//img[contains(@src, \"emoji\")]')\n", 48 | " emoji_list = []\n", 49 | " for tag in emoji_tags:\n", 50 | " filename = tag.get_attribute('src')\n", 51 | " try:\n", 52 | " emoji = chr(int(re.search(r'svg\\/([a-z0-9]+)\\.svg', filename).group(1), base=16))\n", 53 | " except AttributeError:\n", 54 | " continue\n", 55 | " if emoji:\n", 56 | " emoji_list.append(emoji)\n", 57 | " emojis = ' '.join(emoji_list)\n", 58 | " \n", 59 | " tweet = (username, handle, postdate, text, emojis, reply_cnt, retweet_cnt, like_cnt)\n", 60 | " return tweet " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "username: israel.dryer@gmail.com\n", 73 | "Password: ·············\n", 74 | "search term: #turkcell\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# application variables\n", 80 | "user = input('username: ')\n", 81 | "my_password = getpass('Password: ')\n", 82 | "search_term = input('search term: ')\n", 83 | "\n", 84 | "# create instance of web driver\n", 85 | "options = EdgeOptions()\n", 86 | "options.use_chromium = True\n", 87 | "driver = Edge(options=options)\n", 88 | "\n", 89 | "# navigate to login screen\n", 90 | "driver.get('https://www.twitter.com/login')\n", 91 | "driver.maximize_window()\n", 92 | "sleep(5)\n", 93 | "username = driver.find_element_by_xpath('//input[@name=\"text\"]')\n", 94 | "username.send_keys(user)\n", 95 | "username.send_keys(Keys.RETURN)\n", 96 | "sleep(3)\n", 97 | "\n", 98 | "password = driver.find_element_by_xpath('//input[@name=\"password\"]')\n", 99 | "password.send_keys(my_password)\n", 100 | "password.send_keys(Keys.RETURN)\n", 101 | "sleep(3)\n", 102 | "\n", 103 | "# find search input and search for term\n", 104 | 
"search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n", 105 | "search_input.send_keys(search_term)\n", 106 | "search_input.send_keys(Keys.RETURN)\n", 107 | "sleep(1)\n", 108 | "\n", 109 | "# navigate to historical 'latest' tab\n", 110 | "driver.find_element_by_link_text('Latest').click()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# get all tweets on the page\n", 120 | "data = []\n", 121 | "tweet_ids = set()\n", 122 | "last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 123 | "scrolling = True\n", 124 | "\n", 125 | "while scrolling:\n", 126 | " page_cards = driver.find_elements_by_xpath('//div[@data-testid=\"tweet\"]')\n", 127 | " for card in page_cards[-15:]:\n", 128 | " tweet = get_tweet_data(card)\n", 129 | " if tweet:\n", 130 | " tweet_id = ''.join(tweet)\n", 131 | " if tweet_id not in tweet_ids:\n", 132 | " tweet_ids.add(tweet_id)\n", 133 | " data.append(tweet)\n", 134 | " \n", 135 | " scroll_attempt = 0\n", 136 | " while True:\n", 137 | " # check scroll position\n", 138 | " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n", 139 | " sleep(2)\n", 140 | " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n", 141 | " if last_position == curr_position:\n", 142 | " scroll_attempt += 1\n", 143 | " \n", 144 | " # end of scroll region\n", 145 | " if scroll_attempt >= 3:\n", 146 | " scrolling = False\n", 147 | " break\n", 148 | " else:\n", 149 | " sleep(2) # attempt another scroll\n", 150 | " else:\n", 151 | " last_position = curr_position\n", 152 | " break\n", 153 | "\n", 154 | "# close the web driver\n", 155 | "driver.close()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Saving the tweet data" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n", 172 | " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Emojis', 'Comments', 'Likes', 'Retweets']\n", 173 | " writer = csv.writer(f)\n", 174 | " writer.writerow(header)\n", 175 | " writer.writerows(data)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.8.3" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 4 200 | } 201 | -------------------------------------------------------------------------------- /twitter_scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | TITLE: 3 | A simple search-based twitter scraper 4 | 5 | LAST MODIFIED: 6 | 2020-12-21 7 | 8 | AUTHOR: 9 | Israel Dryer 10 | israel.dryer@gmail.com 11 | 12 | This is not a perfect scraper, so feel free to add improvements if you find any. 13 | 14 | IMPROVEMENTS: 15 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 
16 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states 17 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session. 18 | 19 | NOTES AND THINGS TO THINK ABOUT: 20 | - Twitter will temporarily block you from logging in via the webdriver if you log in too many times in a single day. 21 | 22 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 23 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 24 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 25 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 26 | sooner. 27 | 28 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 29 | many tweets are processed from each scroll. I've written more about this in the function docstring. 30 | 31 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 32 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 33 | time if specific conditions are not met. There are many other sections of this code that could be 34 | improved, I'm sure, by leveraging this class. 35 | 36 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 37 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 38 | 39 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 40 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 41 | etc... --> https://twitter.com/search-advanced?
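EXAMPLE USAGE:
 - The placeholder values under `if __name__ == '__main__'` amount to calling
 main('email@gmail.com', 'password', 'pysimplegui', 'pysimplegui.csv', page_sort='Latest').
 - As a rough sketch of the `sqlite3` idea above (the table and column names are made up, not part of this project),
 `save_tweet_data_to_csv` could be swapped for something along these lines:
     import sqlite3
     def save_tweet_data_to_sqlite(records, filepath):
         # create the table on first use, then append one row per tweet tuple
         with sqlite3.connect(filepath) as conn:
             conn.execute('CREATE TABLE IF NOT EXISTS tweets (user TEXT, handle TEXT, postdate TEXT, '
                          'tweet_text TEXT, reply_count TEXT, retweet_count TEXT, like_count TEXT)')
             if records:
                 conn.execute('INSERT INTO tweets VALUES (?, ?, ?, ?, ?, ?, ?)', records)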
42 | """ 43 | import csv 44 | from time import sleep 45 | from msedge.selenium_tools import Edge, EdgeOptions 46 | from selenium.webdriver.common.keys import Keys 47 | from selenium.webdriver.common.by import By 48 | from selenium.webdriver.support.ui import WebDriverWait 49 | from selenium.webdriver.support import expected_conditions 50 | from selenium.common import exceptions 51 | 52 | 53 | def create_webdriver_instance(): 54 | options = EdgeOptions() 55 | options.use_chromium = True 56 | driver = Edge(options=options) 57 | return driver 58 | 59 | 60 | def login_to_twitter(username, password, driver): 61 | url = 'https://twitter.com/login' 62 | try: 63 | driver.get(url) 64 | xpath_username = '//input[@name="session[username_or_email]"]' 65 | WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.XPATH, xpath_username))) 66 | uid_input = driver.find_element_by_xpath(xpath_username) 67 | uid_input.send_keys(username) 68 | except exceptions.TimeoutException: 69 | print("Timeout while waiting for Login screen") 70 | return False 71 | 72 | pwd_input = driver.find_element_by_xpath('//input[@name="session[password]"]') 73 | pwd_input.send_keys(password) 74 | try: 75 | pwd_input.send_keys(Keys.RETURN) 76 | url = "https://twitter.com/home" 77 | WebDriverWait(driver, 10).until(expected_conditions.url_to_be(url)) 78 | except exceptions.TimeoutException: 79 | print("Timeout while waiting for home screen") 80 | return True 81 | 82 | 83 | def find_search_input_and_enter_criteria(search_term, driver): 84 | xpath_search = '//input[@aria-label="Search query"]' 85 | search_input = driver.find_element_by_xpath(xpath_search) 86 | search_input.send_keys(search_term) 87 | search_input.send_keys(Keys.RETURN) 88 | return True 89 | 90 | 91 | def change_page_sort(tab_name, driver): 92 | """Options for this program are `Latest` and `Top`""" 93 | tab = driver.find_element_by_link_text(tab_name) 94 | tab.click() 95 | xpath_tab_state = f'//a[contains(text(),\"{tab_name}\") and @aria-selected=\"true\"]' 96 | WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.XPATH, xpath_tab_state)))  # wait until the tab is actually selected 97 | 98 | def generate_tweet_id(tweet): 99 | return ''.join(tweet) 100 | 101 | 102 | def scroll_down_page(driver, last_position, num_seconds_to_load=0.5, scroll_attempt=0, max_attempts=5): 103 | """The function will try to scroll down the page and will check the current 104 | and last positions as an indicator.
If the current and last positions are the same after `max_attempts` 105 | the assumption is that the end of the scroll region has been reached and the `end_of_scroll_region` 106 | flag will be returned as `True`""" 107 | end_of_scroll_region = False 108 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 109 | sleep(num_seconds_to_load) 110 | curr_position = driver.execute_script("return window.pageYOffset;") 111 | if curr_position == last_position: 112 | if scroll_attempt >= max_attempts:  # only give up after `max_attempts` stalled scrolls 113 | end_of_scroll_region = True 114 | else: 115 | return scroll_down_page(driver, last_position, num_seconds_to_load, scroll_attempt + 1, max_attempts) 116 | last_position = curr_position 117 | return last_position, end_of_scroll_region 118 | 119 | 120 | def save_tweet_data_to_csv(records, filepath, mode='a+'): 121 | header = ['User', 'Handle', 'PostDate', 'TweetText', 'ReplyCount', 'RetweetCount', 'LikeCount'] 122 | with open(filepath, mode=mode, newline='', encoding='utf-8') as f: 123 | writer = csv.writer(f) 124 | if mode == 'w': 125 | writer.writerow(header) 126 | if records: 127 | writer.writerow(records) 128 | 129 | 130 | def collect_all_tweets_from_current_view(driver, lookback_limit=25): 131 | """The page is continuously loaded, so as you scroll down the number of tweets returned by this function will 132 | continue to grow. To limit the risk of 're-processing' the same tweet over and over again, you can set the 133 | `lookback_limit` to only process the last `x` number of tweets extracted from the page in each iteration. 134 | You may need to play around with this number to get something that works for you. I've set the default 135 | based on my computer settings and internet speed, etc...""" 136 | page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]') 137 | if len(page_cards) <= lookback_limit: 138 | return page_cards 139 | else: 140 | return page_cards[-lookback_limit:] 141 | 142 | 143 | def extract_data_from_current_tweet_card(card): 144 | try: 145 | user = card.find_element_by_xpath('.//span').text 146 | except exceptions.NoSuchElementException: 147 | user = "" 148 | except exceptions.StaleElementReferenceException: 149 | return 150 | try: 151 | handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text 152 | except exceptions.NoSuchElementException: 153 | handle = "" 154 | try: 155 | """ 156 | If there is no post date here, then it is usually sponsored content, or some 157 | other form of content where post dates do not apply. You can set a default value 158 | for the postdate on Exception if you wish to keep this record. By default I am 159 | excluding these.
160 | """ 161 | postdate = card.find_element_by_xpath('.//time').get_attribute('datetime') 162 | except exceptions.NoSuchElementException: 163 | return 164 | try: 165 | _comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text 166 | except exceptions.NoSuchElementException: 167 | _comment = "" 168 | try: 169 | _responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text 170 | except exceptions.NoSuchElementException: 171 | _responding = "" 172 | tweet_text = _comment + _responding 173 | try: 174 | reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text 175 | except exceptions.NoSuchElementException: 176 | reply_count = "" 177 | try: 178 | retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text 179 | except exceptions.NoSuchElementException: 180 | retweet_count = "" 181 | try: 182 | like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text 183 | except exceptions.NoSuchElementException: 184 | like_count = "" 185 | 186 | tweet = (user, handle, postdate, tweet_text, reply_count, retweet_count, like_count) 187 | return tweet 188 | 189 | 190 | def main(username, password, search_term, filepath, page_sort='Latest'): 191 | save_tweet_data_to_csv(None, filepath, 'w') # create file for saving records 192 | last_position = None 193 | end_of_scroll_region = False 194 | unique_tweets = set() 195 | 196 | driver = create_webdriver_instance() 197 | logged_in = login_to_twitter(username, password, driver) 198 | if not logged_in: 199 | return 200 | 201 | search_found = find_search_input_and_enter_criteria(search_term, driver) 202 | if not search_found: 203 | return 204 | 205 | change_page_sort(page_sort, driver) 206 | 207 | while not end_of_scroll_region: 208 | cards = collect_all_tweets_from_current_view(driver) 209 | for card in cards: 210 | try: 211 | tweet = extract_data_from_current_tweet_card(card) 212 | except exceptions.StaleElementReferenceException: 213 | continue 214 | if not tweet: 215 | continue 216 | tweet_id = generate_tweet_id(tweet) 217 | if tweet_id not in unique_tweets: 218 | unique_tweets.add(tweet_id) 219 | save_tweet_data_to_csv(tweet, filepath) 220 | last_position, end_of_scroll_region = scroll_down_page(driver, last_position) 221 | driver.quit() 222 | 223 | 224 | if __name__ == '__main__': 225 | usr = "email@gmail.com" 226 | pwd = "password" 227 | path = 'pysimplegui.csv' 228 | term = 'pysimplegui' 229 | 230 | main(usr, pwd, term, path) 231 | --------------------------------------------------------------------------------