├── .gitattributes ├── .gitignore ├── README.md ├── scrapper.ipynb ├── scrapper.py ├── twitter-scraper-tut.ipynb └── twitter_scraper.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twitter-Scraper 2 | This is not a perfect scraper, so feel free to add improvements if you find any. 3 | 4 | IMPROVEMENTS: 5 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 6 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states (see the example below) 7 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session.
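For example, one of the hard-coded `sleep(5)` calls before grabbing the search box can be replaced with an explicit wait. This is only a minimal sketch, reusing the Edge driver setup and search-box XPath already present in `twitter_scraper.py`; the 10-second timeout is an arbitrary choice:

```python
from msedge.selenium_tools import Edge, EdgeOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

options = EdgeOptions()
options.use_chromium = True
driver = Edge(options=options)
driver.get('https://twitter.com/search')

# Wait until the search box is actually present (up to 10 seconds)
# instead of sleeping for a fixed 5 seconds.
xpath_search = '//input[@aria-label="Search query"]'
search_input = WebDriverWait(driver, 10).until(
    expected_conditions.presence_of_element_located((By.XPATH, xpath_search))
)
search_input.send_keys('pysimplegui')
```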
8 | 9 | NOTES AND THINGS TO THINK ABOUT: 10 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 11 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 12 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 13 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 14 | sooner. 15 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 16 | many tweets are processed from each scroll. I've written more about this in the function docstring. 17 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 18 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 19 | time if specific conditions are not met. There are many other sections of this code that could be 20 | improved, I'm sure, by leveraging this class. 21 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 22 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 23 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 24 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 25 | etc... --> https://twitter.com/search-advanced? 26 | -------------------------------------------------------------------------------- /scrapper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import re\n", 10 | "import csv\n", 11 | "from getpass import getpass\n", 12 | "from time import sleep\n", 13 | "from selenium.webdriver.common.keys import Keys\n", 14 | "from selenium.common.exceptions import NoSuchElementException\n", 15 | "from msedge.selenium_tools import Edge, EdgeOptions " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "def get_tweet_data(card):\n", 25 | " \"\"\"Extract data from tweet card\"\"\"\n", 26 | " username = card.find_element_by_xpath('.//span').text\n", 27 | " try:\n", 28 | " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n", 29 | " except NoSuchElementException:\n", 30 | " return\n", 31 | " \n", 32 | " try:\n", 33 | " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n", 34 | " except NoSuchElementException:\n", 35 | " return\n", 36 | " \n", 37 | " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n", 38 | " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n", 39 | " text = comment + responding\n", 40 | " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n", 41 | " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n", 42 | " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n", 43 | "\n", 44 | " \n", 45 | " tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n", 46 | " return tweet " 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55
| "search_term = input('search term: ')\n", 56 | "\n", 57 | "# create instance of web driver\n", 58 | "options = EdgeOptions()\n", 59 | "options.use_chromium = True\n", 60 | "driver = Edge(options=options)\n", 61 | "\n", 62 | "# navigate to login screen\n", 63 | "driver.get('https://twitter.com/search')\n", 64 | "driver.maximize_window()\n", 65 | "sleep(5)\n", 66 | "\n", 67 | "# find search input and search for term\n", 68 | "search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n", 69 | "search_input.send_keys(search_term)\n", 70 | "search_input.send_keys(Keys.RETURN)\n", 71 | "sleep(1)\n", 72 | "\n", 73 | "# navigate to historical 'latest' tab\n", 74 | "driver.find_element_by_link_text('Latest').click()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# get all tweets on the page\n", 84 | "data = []\n", 85 | "tweet_ids = set()\n", 86 | "last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 87 | "scrolling = True\n", 88 | "\n", 89 | "while scrolling:\n", 90 | " page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n", 91 | " for card in page_cards[-15:]:\n", 92 | " tweet = get_tweet_data(card)\n", 93 | " if tweet:\n", 94 | " tweet_id = ''.join(tweet)\n", 95 | " if tweet_id not in tweet_ids:\n", 96 | " tweet_ids.add(tweet_id)\n", 97 | " data.append(tweet)\n", 98 | " \n", 99 | " scroll_attempt = 0\n", 100 | " while True:\n", 101 | " # check scroll position\n", 102 | " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n", 103 | " sleep(2)\n", 104 | " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n", 105 | " if last_position == curr_position:\n", 106 | " scroll_attempt += 1\n", 107 | " \n", 108 | " # end of scroll region\n", 109 | " if scroll_attempt >= 3:\n", 110 | " scrolling = False\n", 111 | " break\n", 112 | " else:\n", 113 | " sleep(2) # attempt another scroll\n", 114 | " else:\n", 115 | " last_position = curr_position\n", 116 | " break\n", 117 | "\n", 118 | "# close the web driver\n", 119 | "driver.close()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### Saving the tweet data" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n", 136 | " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Likes', 'Retweets']\n", 137 | " writer = csv.writer(f)\n", 138 | " writer.writerow(header)\n", 139 | " writer.writerows(data)" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "interpreter": { 145 | "hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138" 146 | }, 147 | "kernelspec": { 148 | "display_name": "Python 3.7.0 64-bit", 149 | "language": "python", 150 | "name": "python3" 151 | }, 152 | "language_info": { 153 | "codemirror_mode": { 154 | "name": "ipython", 155 | "version": 3 156 | }, 157 | "file_extension": ".py", 158 | "mimetype": "text/x-python", 159 | "name": "python", 160 | "nbconvert_exporter": "python", 161 | "pygments_lexer": "ipython3", 162 | "version": "3.7.0" 163 | }, 164 | "orig_nbformat": 4 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 2 168 | } 169 | -------------------------------------------------------------------------------- /scrapper.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | TITLE: 3 | A simple search-based twitter scraper 4 | LAST MODIFIED: 5 | 2022-01-14 6 | AUTHOR: 7 | Israel Dryer 8 | israel.dryer@gmail.com 9 | This is not a perfect scraper, so feel free to add improvements if you find any. 10 | IMPROVEMENTS: 11 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 12 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states 13 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session. 14 | NOTES AND THINGS TO THINK ABOUT: 15 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 16 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 17 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 18 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 19 | sooner. 20 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 21 | many tweets are processed from each scroll. I've written more about this in the function docstring. 22 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 23 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 24 | time if specific conditions are not met. There are many other sections of this code that could be 25 | improved, I'm sure, by leveraging this class. 26 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 27 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 28 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 29 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 30 | etc... --> https://twitter.com/search-advanced?
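EXAMPLE USAGE:
 - The values at the bottom of this file are equivalent to calling `main('pysimplegui', 'pysimplegui.csv', page_sort='Latest')`.
 - As a purely illustrative sketch of the advanced-search note above (this url is not taken from the project), a date-bounded
 'Latest' search built from that form ends up looking roughly like:
 https://twitter.com/search?q=pysimplegui%20since%3A2021-01-01%20until%3A2021-06-30&f=live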
31 | """ 32 | import csv 33 | from time import sleep 34 | from msedge.selenium_tools import Edge, EdgeOptions 35 | from selenium.webdriver.common.keys import Keys 36 | from selenium.webdriver.common.by import By 37 | from selenium.common import exceptions 38 | 39 | 40 | def create_webdriver_instance(): 41 | options = EdgeOptions() 42 | options.use_chromium = True 43 | driver = Edge(options=options) 44 | return driver 45 | 46 | 47 | def twitter_search(driver, search_term): 48 | url = 'https://twitter.com/search' 49 | driver.get(url) 50 | driver.maximize_window() 51 | sleep(5) 52 | 53 | search_input = driver.find_element_by_xpath('//input[@aria-label="Search query"]') 54 | search_input.send_keys(search_term) 55 | search_input.send_keys(Keys.RETURN) 56 | sleep(5) 57 | return True 58 | 59 | 60 | def change_page_sort(tab_name, driver): 61 | """Options for this program are `Latest` and `Top`""" 62 | tab = driver.find_element_by_link_text(tab_name) 63 | tab.click() 64 | xpath_tab_state = f'//a[contains(text(),\"{tab_name}\") and @aria-selected=\"true\"]' 65 | return xpath_tab_state 66 | 67 | 68 | def generate_tweet_id(tweet): 69 | return ''.join(tweet) 70 | 71 | 72 | def scroll_down_page(driver, last_position, num_seconds_to_load=0.5, scroll_attempt=0, max_attempts=5): 73 | """The function will try to scroll down the page and will check the current 74 | and last positions as an indicator. If the current and last positions are the same after `max_attempts` 75 | the assumption is that the end of the scroll region has been reached and the `end_of_scroll_region` 76 | flag will be returned as `True`""" 77 | end_of_scroll_region = False 78 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 79 | sleep(num_seconds_to_load) 80 | curr_position = driver.execute_script("return window.pageYOffset;") 81 | if curr_position == last_position: 82 | if scroll_attempt >= max_attempts:  # only give up after `max_attempts` stalled scrolls 83 | end_of_scroll_region = True 84 | else: 85 | return scroll_down_page(driver, last_position, num_seconds_to_load, scroll_attempt + 1, max_attempts) 86 | last_position = curr_position 87 | return last_position, end_of_scroll_region 88 | 89 | 90 | def save_tweet_data_to_csv(records, filepath, mode='a+'): 91 | header = ['User', 'Handle', 'PostDate', 'TweetText', 'ReplyCount', 'RetweetCount', 'LikeCount'] 92 | with open(filepath, mode=mode, newline='', encoding='utf-8') as f: 93 | writer = csv.writer(f) 94 | if mode == 'w': 95 | writer.writerow(header) 96 | if records: 97 | writer.writerow(records) 98 | 99 | 100 | def collect_all_tweets_from_current_view(driver, lookback_limit=25): 101 | """The page is continuously loaded, so as you scroll down the number of tweets returned by this function will 102 | continue to grow. To limit the risk of 're-processing' the same tweet over and over again, you can set the 103 | `lookback_limit` to only process the last `x` number of tweets extracted from the page in each iteration. 104 | You may need to play around with this number to get something that works for you.
I've set the default 105 | based on my computer settings and internet speed, etc...""" 106 | page_cards = driver.find_elements_by_xpath('//article[@data-testid="tweet"]') 107 | if len(page_cards) <= lookback_limit: 108 | return page_cards 109 | else: 110 | return page_cards[-lookback_limit:] 111 | 112 | 113 | def extract_data_from_current_tweet_card(card): 114 | try: 115 | user = card.find_element_by_xpath('.//span').text 116 | except exceptions.NoSuchElementException: 117 | user = "" 118 | except exceptions.StaleElementReferenceException: 119 | return 120 | try: 121 | handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text 122 | except exceptions.NoSuchElementException: 123 | handle = "" 124 | try: 125 | """ 126 | If there is no post date here, there it is usually sponsored content, or some 127 | other form of content where post dates do not apply. You can set a default value 128 | for the postdate on Exception if you which to keep this record. By default I am 129 | excluding these. 130 | """ 131 | postdate = card.find_element_by_xpath('.//time').get_attribute('datetime') 132 | except exceptions.NoSuchElementException: 133 | return 134 | try: 135 | _comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text 136 | except exceptions.NoSuchElementException: 137 | _comment = "" 138 | try: 139 | _responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text 140 | except exceptions.NoSuchElementException: 141 | _responding = "" 142 | tweet_text = _comment + _responding 143 | try: 144 | reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text 145 | except exceptions.NoSuchElementException: 146 | reply_count = "" 147 | try: 148 | retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text 149 | except exceptions.NoSuchElementException: 150 | retweet_count = "" 151 | try: 152 | like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text 153 | except exceptions.NoSuchElementException: 154 | like_count = "" 155 | 156 | tweet = (user, handle, postdate, tweet_text, reply_count, retweet_count, like_count) 157 | return tweet 158 | 159 | 160 | def main(search_term, filepath, page_sort='Latest'): 161 | save_tweet_data_to_csv(None, filepath, 'w') # create file for saving records 162 | last_position = None 163 | end_of_scroll_region = False 164 | unique_tweets = set() 165 | 166 | driver = create_webdriver_instance() 167 | twitter_search_page_term = twitter_search(driver, search_term) 168 | if not twitter_search_page_term: 169 | return 170 | 171 | change_page_sort(page_sort, driver) 172 | 173 | while not end_of_scroll_region: 174 | cards = collect_all_tweets_from_current_view(driver) 175 | for card in cards: 176 | try: 177 | tweet = extract_data_from_current_tweet_card(card) 178 | except exceptions.StaleElementReferenceException: 179 | continue 180 | if not tweet: 181 | continue 182 | tweet_id = generate_tweet_id(tweet) 183 | if tweet_id not in unique_tweets: 184 | unique_tweets.add(tweet_id) 185 | save_tweet_data_to_csv(tweet, filepath) 186 | last_position, end_of_scroll_region = scroll_down_page(driver, last_position) 187 | driver.quit() 188 | 189 | 190 | if __name__ == '__main__': 191 | path = 'pysimplegui.csv' 192 | term = 'pysimplegui' 193 | 194 | main(term, path) -------------------------------------------------------------------------------- /twitter-scraper-tut.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 
| "metadata": {}, 6 | "source": [ 7 | "# Twitter Scraper" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import re\n", 17 | "import csv\n", 18 | "from getpass import getpass\n", 19 | "from time import sleep\n", 20 | "from selenium.webdriver.common.keys import Keys\n", 21 | "from selenium.common.exceptions import NoSuchElementException\n", 22 | "from msedge.selenium_tools import Edge, EdgeOptions\n", 23 | "\n", 24 | "def get_tweet_data(card):\n", 25 | " \"\"\"Extract data from tweet card\"\"\"\n", 26 | " username = card.find_element_by_xpath('.//span').text\n", 27 | " try:\n", 28 | " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n", 29 | " except NoSuchElementException:\n", 30 | " return\n", 31 | " \n", 32 | " try:\n", 33 | " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n", 34 | " except NoSuchElementException:\n", 35 | " return\n", 36 | " \n", 37 | " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n", 38 | " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n", 39 | " text = comment + responding\n", 40 | " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n", 41 | " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n", 42 | " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n", 43 | " \n", 44 | " # get a string of all emojis contained in the tweet\n", 45 | " \"\"\"Emojis are stored as images... so I convert the filename, which is stored as unicode, into \n", 46 | " the emoji character.\"\"\"\n", 47 | " emoji_tags = card.find_elements_by_xpath('.//img[contains(@src, \"emoji\")]')\n", 48 | " emoji_list = []\n", 49 | " for tag in emoji_tags:\n", 50 | " filename = tag.get_attribute('src')\n", 51 | " try:\n", 52 | " emoji = chr(int(re.search(r'svg\\/([a-z0-9]+)\\.svg', filename).group(1), base=16))\n", 53 | " except AttributeError:\n", 54 | " continue\n", 55 | " if emoji:\n", 56 | " emoji_list.append(emoji)\n", 57 | " emojis = ' '.join(emoji_list)\n", 58 | " \n", 59 | " tweet = (username, handle, postdate, text, emojis, reply_cnt, retweet_cnt, like_cnt)\n", 60 | " return tweet " 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "username: israel.dryer@gmail.com\n", 73 | "Password: ·············\n", 74 | "search term: #turkcell\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# application variables\n", 80 | "user = input('username: ')\n", 81 | "my_password = getpass('Password: ')\n", 82 | "search_term = input('search term: ')\n", 83 | "\n", 84 | "# create instance of web driver\n", 85 | "options = EdgeOptions()\n", 86 | "options.use_chromium = True\n", 87 | "driver = Edge(options=options)\n", 88 | "\n", 89 | "# navigate to login screen\n", 90 | "driver.get('https://www.twitter.com/login')\n", 91 | "driver.maximize_window()\n", 92 | "sleep(5)\n", 93 | "username = driver.find_element_by_xpath('//input[@name=\"text\"]')\n", 94 | "username.send_keys(user)\n", 95 | "username.send_keys(Keys.RETURN)\n", 96 | "sleep(3)\n", 97 | "\n", 98 | "password = driver.find_element_by_xpath('//input[@name=\"password\"]')\n", 99 | "password.send_keys(my_password)\n", 100 | "password.send_keys(Keys.RETURN)\n", 101 | "sleep(3)\n", 102 | "\n", 103 | "# find search input and search for term\n", 104 | 
"search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n", 105 | "search_input.send_keys(search_term)\n", 106 | "search_input.send_keys(Keys.RETURN)\n", 107 | "sleep(1)\n", 108 | "\n", 109 | "# navigate to historical 'latest' tab\n", 110 | "driver.find_element_by_link_text('Latest').click()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# get all tweets on the page\n", 120 | "data = []\n", 121 | "tweet_ids = set()\n", 122 | "last_position = driver.execute_script(\"return window.pageYOffset;\")\n", 123 | "scrolling = True\n", 124 | "\n", 125 | "while scrolling:\n", 126 | " page_cards = driver.find_elements_by_xpath('//div[@data-testid=\"tweet\"]')\n", 127 | " for card in page_cards[-15:]:\n", 128 | " tweet = get_tweet_data(card)\n", 129 | " if tweet:\n", 130 | " tweet_id = ''.join(tweet)\n", 131 | " if tweet_id not in tweet_ids:\n", 132 | " tweet_ids.add(tweet_id)\n", 133 | " data.append(tweet)\n", 134 | " \n", 135 | " scroll_attempt = 0\n", 136 | " while True:\n", 137 | " # check scroll position\n", 138 | " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n", 139 | " sleep(2)\n", 140 | " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n", 141 | " if last_position == curr_position:\n", 142 | " scroll_attempt += 1\n", 143 | " \n", 144 | " # end of scroll region\n", 145 | " if scroll_attempt >= 3:\n", 146 | " scrolling = False\n", 147 | " break\n", 148 | " else:\n", 149 | " sleep(2) # attempt another scroll\n", 150 | " else:\n", 151 | " last_position = curr_position\n", 152 | " break\n", 153 | "\n", 154 | "# close the web driver\n", 155 | "driver.close()" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Saving the tweet data" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n", 172 | " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Emojis', 'Comments', 'Likes', 'Retweets']\n", 173 | " writer = csv.writer(f)\n", 174 | " writer.writerow(header)\n", 175 | " writer.writerows(data)" 176 | ] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.8.3" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 4 200 | } 201 | -------------------------------------------------------------------------------- /twitter_scraper.py: -------------------------------------------------------------------------------- 1 | """ 2 | TITLE: 3 | A simple search-based twitter scraper 4 | 5 | LAST MODIFIED: 6 | 2020-12-21 7 | 8 | AUTHOR: 9 | Israel Dryer 10 | israel.dryer@gmail.com 11 | 12 | This is not a perfect scraper, so feel free to add improvements if you find any. 13 | 14 | IMPROVEMENTS: 15 | - Improved error handling so that tweets are not rejected if certain fields are null, etc... 
16 | - Leveraged the `WebDriverWait` class to enable better detection of desired load states 17 | - Each record is saved while scraping instead of all at the end, minimizing data loss from a failed session. 18 | 19 | NOTES AND THINGS TO THINK ABOUT: 20 | - Twitter will temporarily block you from logging in via the webdriver if you log in too many times in a single day. 21 | 22 | - The `scroll_down_page` function has an argument, `num_seconds_to_load`, that represents the number of 23 | seconds the program will wait before attempting to scroll again. I'm currently making 5 attempts with 24 | a pause between them. You could also increase the number of max attempts and decrease `num_seconds_to_load`. 25 | This could speed up the scraping because you would be more likely to reach a successful scroll-down 26 | sooner. 27 | 28 | - The `collect_all_tweets_from_current_view` function has a `lookback_limit` argument that controls how 29 | many tweets are processed from each scroll. I've written more about this in the function docstring. 30 | 31 | - I've implemented `WebDriverWait` in several sections of this updated code. I think this is a much 32 | better solution than a hard-coded `sleep` call because it will only time out after a certain period of 33 | time if specific conditions are not met. There are many other sections of this code that could be 34 | improved, I'm sure, by leveraging this class. 35 | 36 | - Feel free to replace the `save_tweet_data_to_csv` function with any other `io` option you want, such 37 | as a database save via `pyodbc`, `sqlite3`, or whatever you want really. 38 | 39 | - I encourage you to explore the "Advanced Search" functionality. Try adding your criteria and see how the url 40 | is built. You can then leverage this to make your searches more customized... with date ranges, special keywords, 41 | etc... --> https://twitter.com/search-advanced?
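EXAMPLE USAGE:
 - The placeholder values under `if __name__ == '__main__'` amount to calling
 main('email@gmail.com', 'password', 'pysimplegui', 'pysimplegui.csv', page_sort='Latest').
 - As a rough sketch of the `sqlite3` idea above (the table and column names are made up, not part of this project),
 `save_tweet_data_to_csv` could be swapped for something along these lines:
     import sqlite3
     def save_tweet_data_to_sqlite(records, filepath):
         # create the table on first use, then append one row per tweet tuple
         with sqlite3.connect(filepath) as conn:
             conn.execute('CREATE TABLE IF NOT EXISTS tweets (user TEXT, handle TEXT, postdate TEXT, '
                          'tweet_text TEXT, reply_count TEXT, retweet_count TEXT, like_count TEXT)')
             if records:
                 conn.execute('INSERT INTO tweets VALUES (?, ?, ?, ?, ?, ?, ?)', records)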
42 | """ 43 | import csv 44 | from time import sleep 45 | from msedge.selenium_tools import Edge, EdgeOptions 46 | from selenium.webdriver.common.keys import Keys 47 | from selenium.webdriver.common.by import By 48 | from selenium.webdriver.support.ui import WebDriverWait 49 | from selenium.webdriver.support import expected_conditions 50 | from selenium.common import exceptions 51 | 52 | 53 | def create_webdriver_instance(): 54 | options = EdgeOptions() 55 | options.use_chromium = True 56 | driver = Edge(options=options) 57 | return driver 58 | 59 | 60 | def login_to_twitter(username, password, driver): 61 | url = 'https://twitter.com/login' 62 | try: 63 | driver.get(url) 64 | xpath_username = '//input[@name="session[username_or_email]"]' 65 | WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.XPATH, xpath_username))) 66 | uid_input = driver.find_element_by_xpath(xpath_username) 67 | uid_input.send_keys(username) 68 | except exceptions.TimeoutException: 69 | print("Timeout while waiting for Login screen") 70 | return False 71 | 72 | pwd_input = driver.find_element_by_xpath('//input[@name="session[password]"]') 73 | pwd_input.send_keys(password) 74 | try: 75 | pwd_input.send_keys(Keys.RETURN) 76 | url = "https://twitter.com/home" 77 | WebDriverWait(driver, 10).until(expected_conditions.url_to_be(url)) 78 | except exceptions.TimeoutException: 79 | print("Timeout while waiting for home screen") 80 | return True 81 | 82 | 83 | def find_search_input_and_enter_criteria(search_term, driver): 84 | xpath_search = '//input[@aria-label="Search query"]' 85 | search_input = driver.find_element_by_xpath(xpath_search) 86 | search_input.send_keys(search_term) 87 | search_input.send_keys(Keys.RETURN) 88 | return True 89 | 90 | 91 | def change_page_sort(tab_name, driver): 92 | """Options for this program are `Latest` and `Top`""" 93 | tab = driver.find_element_by_link_text(tab_name) 94 | tab.click() 95 | xpath_tab_state = f'//a[contains(text(),\"{tab_name}\") and @aria-selected=\"true\"]' 96 | WebDriverWait(driver, 10).until(expected_conditions.presence_of_element_located((By.XPATH, xpath_tab_state)))  # wait until the tab is actually selected 97 | 98 | def generate_tweet_id(tweet): 99 | return ''.join(tweet) 100 | 101 | 102 | def scroll_down_page(driver, last_position, num_seconds_to_load=0.5, scroll_attempt=0, max_attempts=5): 103 | """The function will try to scroll down the page and will check the current 104 | and last positions as an indicator.
If the current and last positions are the same after `max_attempts` 105 | the assumption is that the end of the scroll region has been reached and the `end_of_scroll_region` 106 | flag will be returned as `True`""" 107 | end_of_scroll_region = False 108 | driver.execute_script("window.scrollTo(0, document.body.scrollHeight);") 109 | sleep(num_seconds_to_load) 110 | curr_position = driver.execute_script("return window.pageYOffset;") 111 | if curr_position == last_position: 112 | if scroll_attempt >= max_attempts:  # only give up after `max_attempts` stalled scrolls 113 | end_of_scroll_region = True 114 | else: 115 | return scroll_down_page(driver, last_position, num_seconds_to_load, scroll_attempt + 1, max_attempts) 116 | last_position = curr_position 117 | return last_position, end_of_scroll_region 118 | 119 | 120 | def save_tweet_data_to_csv(records, filepath, mode='a+'): 121 | header = ['User', 'Handle', 'PostDate', 'TweetText', 'ReplyCount', 'RetweetCount', 'LikeCount'] 122 | with open(filepath, mode=mode, newline='', encoding='utf-8') as f: 123 | writer = csv.writer(f) 124 | if mode == 'w': 125 | writer.writerow(header) 126 | if records: 127 | writer.writerow(records) 128 | 129 | 130 | def collect_all_tweets_from_current_view(driver, lookback_limit=25): 131 | """The page is continuously loaded, so as you scroll down the number of tweets returned by this function will 132 | continue to grow. To limit the risk of 're-processing' the same tweet over and over again, you can set the 133 | `lookback_limit` to only process the last `x` number of tweets extracted from the page in each iteration. 134 | You may need to play around with this number to get something that works for you. I've set the default 135 | based on my computer settings and internet speed, etc...""" 136 | page_cards = driver.find_elements_by_xpath('//div[@data-testid="tweet"]') 137 | if len(page_cards) <= lookback_limit: 138 | return page_cards 139 | else: 140 | return page_cards[-lookback_limit:] 141 | 142 | 143 | def extract_data_from_current_tweet_card(card): 144 | try: 145 | user = card.find_element_by_xpath('.//span').text 146 | except exceptions.NoSuchElementException: 147 | user = "" 148 | except exceptions.StaleElementReferenceException: 149 | return 150 | try: 151 | handle = card.find_element_by_xpath('.//span[contains(text(), "@")]').text 152 | except exceptions.NoSuchElementException: 153 | handle = "" 154 | try: 155 | """ 156 | If there is no post date here, then it is usually sponsored content, or some 157 | other form of content where post dates do not apply. You can set a default value 158 | for the postdate on Exception if you wish to keep this record. By default I am 159 | excluding these.
160 | """ 161 | postdate = card.find_element_by_xpath('.//time').get_attribute('datetime') 162 | except exceptions.NoSuchElementException: 163 | return 164 | try: 165 | _comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text 166 | except exceptions.NoSuchElementException: 167 | _comment = "" 168 | try: 169 | _responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text 170 | except exceptions.NoSuchElementException: 171 | _responding = "" 172 | tweet_text = _comment + _responding 173 | try: 174 | reply_count = card.find_element_by_xpath('.//div[@data-testid="reply"]').text 175 | except exceptions.NoSuchElementException: 176 | reply_count = "" 177 | try: 178 | retweet_count = card.find_element_by_xpath('.//div[@data-testid="retweet"]').text 179 | except exceptions.NoSuchElementException: 180 | retweet_count = "" 181 | try: 182 | like_count = card.find_element_by_xpath('.//div[@data-testid="like"]').text 183 | except exceptions.NoSuchElementException: 184 | like_count = "" 185 | 186 | tweet = (user, handle, postdate, tweet_text, reply_count, retweet_count, like_count) 187 | return tweet 188 | 189 | 190 | def main(username, password, search_term, filepath, page_sort='Latest'): 191 | save_tweet_data_to_csv(None, filepath, 'w') # create file for saving records 192 | last_position = None 193 | end_of_scroll_region = False 194 | unique_tweets = set() 195 | 196 | driver = create_webdriver_instance() 197 | logged_in = login_to_twitter(username, password, driver) 198 | if not logged_in: 199 | return 200 | 201 | search_found = find_search_input_and_enter_criteria(search_term, driver) 202 | if not search_found: 203 | return 204 | 205 | change_page_sort(page_sort, driver) 206 | 207 | while not end_of_scroll_region: 208 | cards = collect_all_tweets_from_current_view(driver) 209 | for card in cards: 210 | try: 211 | tweet = extract_data_from_current_tweet_card(card) 212 | except exceptions.StaleElementReferenceException: 213 | continue 214 | if not tweet: 215 | continue 216 | tweet_id = generate_tweet_id(tweet) 217 | if tweet_id not in unique_tweets: 218 | unique_tweets.add(tweet_id) 219 | save_tweet_data_to_csv(tweet, filepath) 220 | last_position, end_of_scroll_region = scroll_down_page(driver, last_position) 221 | driver.quit() 222 | 223 | 224 | if __name__ == '__main__': 225 | usr = "email@gmail.com" 226 | pwd = "password" 227 | path = 'pysimplegui.csv' 228 | term = 'pysimplegui' 229 | 230 | main(usr, pwd, term, path) 231 | --------------------------------------------------------------------------------