├── README.md └── Webscraping.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # webscraping_ny_mta 2 | 3 | The Jupyter Notebook in this repo contains code to web scrape the NY MTA turnstile data. I automate the download of hundreds of data files. 4 | 5 | More detailed instructions on how to web scrape are given in my Medium post, located here: https://medium.com/@julia_kho/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460 6 | -------------------------------------------------------------------------------- /Webscraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Webscrape Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 8, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import requests\n", 17 | "import urllib.request\n", 18 | "import time\n", 19 | "from bs4 import BeautifulSoup" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "### Set the URL you want to webscrape from" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 9, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "url = 'http://web.mta.info/developers/turnstile.html'" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### Connect to the URL" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 10, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "response = requests.get(url)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 11, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": 11, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "response #200 means it went 
through" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Parse HTML and save to BeautifulSoup object" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 12, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "soup = BeautifulSoup(response.text, \"html.parser\")" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 9, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "\n", 99 | "\n", 101 | "\n", 102 | "\n", 103 | "\n", 104 | "mta.info | Turnstile Data\n", 105 | "\n", 106 | "\n", 107 | "\n", 108 | "\n", 109 | "\n", 110 | "\n", 111 | "\n", 112 | "\n", 113 | "\n", 114 | "\n", 115 | "\n", 116 | "\n", 117 | "\n", 118 | "\n", 119 | "\n", 130 | "\n", 133 | "\n", 139 | "\n", 140 | "\n", 141 | "\n", 142 | "
\n", 143 | "Skip to main content\n", 144 | "
\n", 145 | "
\n", 146 | "
\n", 147 | "
\n", 148 | "\"Go\n", 149 | "
\n", 150 | "
\n", 151 | "
\n", 158 | "
\n", 159 | "
\n", 160 | "\n", 161 | "\n", 162 | "\n", 163 | "\n", 164 | "\n", 165 | "\n", 166 | "
\n", 167 | "
\n", 168 | "
\n", 169 | "\n", 173 | "
\n", 174 | "\n", 207 | "
\n", 208 | "
\n", 209 | "
\n", 210 | "\n", 211 | " \n", 212 | "\n", 213 | "

Turnstile Data

\n", 214 | "
\n", 215 | "
\n", 216 | "

Key/Resources

\n", 217 | "

\n", 218 | "

\n", 227 | "

\n", 228 | "
\n", 229 | "

Data Files

\n", 230 | "
\n", 231 | "Saturday, September 22, 2018
Saturday, September 15, 2018
Saturday, September 08, 2018
Saturday, September 01, 2018
Saturday, August 25, 2018
Saturday, August 18, 2018
Saturday, August 11, 2018
Saturday, August 04, 2018
Saturday, July 28, 2018
Saturday, July 21, 2018
Saturday, July 14, 2018
Saturday, July 07, 2018
Saturday, June 30, 2018
Saturday, June 23, 2018
Saturday, June 16, 2018
Saturday, June 09, 2018
Saturday, June 02, 2018
Saturday, May 26, 2018
Saturday, May 19, 2018
Saturday, May 12, 2018
Saturday, May 05, 2018
Saturday, April 28, 2018
Saturday, April 21, 2018
Saturday, April 14, 2018
Saturday, April 07, 2018
Saturday, March 31, 2018
Saturday, March 24, 2018
Saturday, March 17, 2018
Saturday, March 10, 2018
Saturday, March 03, 2018
Saturday, February 24, 2018
Saturday, February 17, 2018
Saturday, February 10, 2018
Saturday, February 03, 2018
Saturday, January 27, 2018
Saturday, January 20, 2018
Saturday, January 13, 2018
Saturday, January 06, 2018
Saturday, December 30, 2017
Saturday, December 23, 2017
Saturday, December 16, 2017
Saturday, December 09, 2017
Saturday, December 02, 2017
Saturday, November 25, 2017
Saturday, November 18, 2017
Saturday, November 11, 2017
Saturday, November 04, 2017
Saturday, October 28, 2017
Saturday, October 21, 2017
Saturday, October 14, 2017
Saturday, October 07, 2017
Saturday, September 30, 2017
Saturday, September 23, 2017
Saturday, September 16, 2017
Saturday, September 09, 2017
Saturday, September 02, 2017
Saturday, August 26, 2017
Saturday, August 19, 2017
Saturday, August 12, 2017
Saturday, August 05, 2017
Saturday, July 29, 2017
Saturday, July 22, 2017
Saturday, July 15, 2017
Saturday, July 08, 2017
Saturday, July 01, 2017
Saturday, June 24, 2017
Saturday, June 17, 2017
Saturday, June 10, 2017
Saturday, June 03, 2017
Saturday, May 27, 2017
Saturday, May 20, 2017
Saturday, May 13, 2017
Saturday, May 06, 2017
Saturday, April 29, 2017
Saturday, April 22, 2017
Saturday, April 15, 2017
Saturday, April 08, 2017
Saturday, April 01, 2017
Saturday, March 25, 2017
Saturday, March 18, 2017
Saturday, March 11, 2017
Saturday, March 04, 2017
Saturday, February 25, 2017
Saturday, February 18, 2017
Saturday, February 11, 2017
Saturday, February 04, 2017
Saturday, January 28, 2017
Saturday, January 21, 2017
Saturday, January 14, 2017
Saturday, January 07, 2017
Saturday, December 31, 2016
Saturday, December 24, 2016
Saturday, December 17, 2016
Saturday, December 10, 2016
Saturday, December 03, 2016
Saturday, November 26, 2016
Saturday, November 19, 2016
Saturday, November 12, 2016
Saturday, November 05, 2016
Saturday, October 29, 2016
Saturday, October 22, 2016
Saturday, October 15, 2016
Saturday, October 08, 2016
Saturday, October 01, 2016
Saturday, September 24, 2016
Saturday, September 17, 2016
Saturday, September 10, 2016
Saturday, September 03, 2016
Saturday, August 27, 2016
Saturday, August 20, 2016
Saturday, August 13, 2016
Saturday, August 06, 2016
Saturday, July 30, 2016
Saturday, July 23, 2016
Saturday, July 16, 2016
Saturday, July 09, 2016
Saturday, July 02, 2016
Saturday, June 25, 2016
Saturday, June 18, 2016
Saturday, June 11, 2016
Saturday, June 04, 2016
Saturday, May 28, 2016
Saturday, May 21, 2016
Saturday, May 14, 2016
Saturday, May 07, 2016
Saturday, April 30, 2016
Saturday, April 23, 2016
Saturday, April 16, 2016
Saturday, April 09, 2016
Saturday, April 02, 2016
Saturday, March 26, 2016
Saturday, March 19, 2016
Saturday, March 12, 2016
Saturday, March 05, 2016
Saturday, February 27, 2016
Saturday, February 20, 2016
Saturday, February 13, 2016
Saturday, February 06, 2016
Saturday, January 30, 2016
Saturday, January 23, 2016
Saturday, January 16, 2016
Saturday, January 09, 2016
Saturday, January 02, 2016
Saturday, December 26, 2015
Saturday, December 19, 2015
Saturday, December 12, 2015
Saturday, December 05, 2015
Saturday, November 28, 2015
Saturday, November 21, 2015
Saturday, November 14, 2015
Saturday, November 07, 2015
Saturday, October 31, 2015
Saturday, October 24, 2015
Saturday, October 17, 2015
Saturday, October 10, 2015
Saturday, October 03, 2015
Saturday, September 26, 2015
Saturday, September 19, 2015
Saturday, September 12, 2015
Saturday, September 05, 2015
Saturday, August 29, 2015
Saturday, August 22, 2015
Saturday, August 15, 2015
Saturday, August 08, 2015
Saturday, August 01, 2015
Saturday, July 25, 2015
Saturday, July 18, 2015
Saturday, July 11, 2015
Saturday, July 04, 2015
Saturday, June 27, 2015
Saturday, June 20, 2015
Saturday, June 13, 2015
Saturday, June 06, 2015
Saturday, May 30, 2015
Saturday, May 23, 2015
Saturday, May 16, 2015
Saturday, May 09, 2015
Saturday, May 02, 2015
Saturday, April 25, 2015
Saturday, April 18, 2015
Saturday, April 11, 2015
Saturday, April 04, 2015
Saturday, March 28, 2015
Saturday, March 21, 2015
Saturday, March 14, 2015
Saturday, March 07, 2015
Saturday, February 28, 2015
Saturday, February 21, 2015
Saturday, February 14, 2015
Saturday, February 07, 2015
Saturday, January 31, 2015
Saturday, January 24, 2015
Saturday, January 17, 2015
Saturday, January 10, 2015
Saturday, January 03, 2015
Saturday, December 27, 2014
Saturday, December 20, 2014
Saturday, December 13, 2014
Saturday, December 06, 2014
Saturday, November 29, 2014
Saturday, November 22, 2014
Saturday, November 15, 2014
Saturday, November 08, 2014
Saturday, November 01, 2014
Saturday, October 25, 2014
Saturday, October 18, 2014
Saturday, October 11, 2014
Saturday, October 04, 2014
Saturday, September 27, 2014
Saturday, September 20, 2014
Saturday, September 13, 2014
Saturday, September 06, 2014
Saturday, August 30, 2014
Saturday, August 23, 2014
Saturday, August 16, 2014
Saturday, August 09, 2014
Saturday, August 02, 2014
Saturday, July 26, 2014
Saturday, July 19, 2014
Saturday, July 12, 2014
Saturday, July 05, 2014
Saturday, June 28, 2014
Saturday, June 21, 2014
Saturday, June 14, 2014
Saturday, June 07, 2014
Saturday, May 31, 2014
Saturday, May 24, 2014
Saturday, May 17, 2014
Saturday, May 10, 2014
Saturday, May 03, 2014
Saturday, April 26, 2014
Saturday, April 19, 2014
Saturday, April 12, 2014
Saturday, April 05, 2014
Saturday, March 29, 2014
Saturday, March 22, 2014
Saturday, March 15, 2014
Saturday, March 08, 2014
Saturday, March 01, 2014
Saturday, February 22, 2014
Saturday, February 15, 2014
Saturday, February 08, 2014
Saturday, February 01, 2014
Saturday, January 25, 2014
Saturday, January 18, 2014
Saturday, January 11, 2014
Saturday, January 04, 2014
Saturday, December 28, 2013
Saturday, December 21, 2013
Saturday, December 14, 2013
Saturday, December 07, 2013
Saturday, November 30, 2013
Saturday, November 23, 2013
Saturday, November 16, 2013
Saturday, November 09, 2013
Saturday, November 02, 2013
Saturday, October 26, 2013
Saturday, October 19, 2013
Saturday, October 12, 2013
Saturday, October 05, 2013
Saturday, September 28, 2013
Saturday, September 21, 2013
Saturday, September 14, 2013
Saturday, September 07, 2013
Saturday, August 31, 2013
Saturday, August 24, 2013
Saturday, August 17, 2013
Saturday, August 10, 2013
Saturday, August 03, 2013
Saturday, July 27, 2013
Saturday, July 20, 2013
Saturday, July 13, 2013
Saturday, July 06, 2013
Saturday, June 29, 2013
Saturday, June 22, 2013
Saturday, June 15, 2013
Saturday, June 08, 2013
Saturday, June 01, 2013
Saturday, May 25, 2013
Saturday, May 18, 2013
Saturday, May 11, 2013
Saturday, May 04, 2013
Saturday, April 27, 2013
Saturday, April 20, 2013
Saturday, April 13, 2013
Saturday, April 06, 2013
Saturday, March 30, 2013
Saturday, March 23, 2013
Saturday, March 16, 2013
Saturday, March 09, 2013
Saturday, March 02, 2013
Saturday, February 23, 2013
Saturday, February 16, 2013
Saturday, February 09, 2013
Saturday, February 02, 2013
Saturday, January 26, 2013
Saturday, January 19, 2013
Saturday, January 12, 2013
Saturday, January 05, 2013
Saturday, December 29, 2012
Saturday, December 22, 2012
Saturday, December 15, 2012
Saturday, December 08, 2012
Saturday, December 01, 2012
Saturday, November 24, 2012
Saturday, November 17, 2012
Saturday, November 10, 2012
Saturday, November 03, 2012
Saturday, October 27, 2012
Saturday, October 20, 2012
Saturday, October 13, 2012
Saturday, October 06, 2012
Saturday, September 29, 2012
Saturday, September 22, 2012
Saturday, September 15, 2012
Saturday, September 08, 2012
Saturday, September 01, 2012
Saturday, August 25, 2012
Saturday, August 18, 2012
Saturday, August 11, 2012
Saturday, August 04, 2012
Saturday, July 28, 2012
Saturday, July 21, 2012
Saturday, July 14, 2012
Saturday, July 07, 2012
Saturday, June 30, 2012
Saturday, June 23, 2012
Saturday, June 16, 2012
Saturday, June 09, 2012
Saturday, June 02, 2012
Saturday, May 26, 2012
Saturday, May 19, 2012
Saturday, May 12, 2012
Saturday, May 05, 2012
Saturday, April 28, 2012
Saturday, April 21, 2012
Saturday, April 14, 2012
Saturday, April 07, 2012
Saturday, March 31, 2012
Saturday, March 24, 2012
Saturday, March 17, 2012
Saturday, March 10, 2012
Saturday, March 03, 2012
Saturday, February 25, 2012
Saturday, February 18, 2012
Saturday, February 11, 2012
Saturday, February 04, 2012
Saturday, January 28, 2012
Saturday, January 21, 2012
Saturday, January 14, 2012
Saturday, January 07, 2012
Saturday, December 31, 2011
Saturday, December 24, 2011
Monday, December 19, 2011
Saturday, December 10, 2011
Saturday, December 03, 2011
Saturday, November 26, 2011
Saturday, November 19, 2011
Saturday, November 12, 2011
Saturday, November 05, 2011
Saturday, October 29, 2011
Saturday, October 22, 2011
Saturday, October 15, 2011
Saturday, October 08, 2011
Saturday, October 01, 2011
Saturday, September 24, 2011
Saturday, September 17, 2011
Saturday, September 10, 2011
Saturday, September 03, 2011
Saturday, August 27, 2011
Saturday, August 20, 2011
Saturday, August 13, 2011
Saturday, August 06, 2011
Saturday, July 30, 2011
Saturday, July 23, 2011
Saturday, July 16, 2011
Saturday, July 09, 2011
Saturday, July 02, 2011
Saturday, June 25, 2011
Saturday, June 18, 2011
Saturday, June 11, 2011
Saturday, June 04, 2011
Saturday, May 28, 2011
Saturday, May 21, 2011
Saturday, May 14, 2011
Saturday, May 07, 2011
Saturday, April 30, 2011
Saturday, April 23, 2011
Saturday, April 16, 2011
Saturday, April 09, 2011
Saturday, April 02, 2011
Saturday, March 26, 2011
Saturday, March 19, 2011
Saturday, March 12, 2011
Saturday, March 05, 2011
Saturday, February 26, 2011
Saturday, February 19, 2011
Saturday, February 12, 2011
Saturday, February 05, 2011
Saturday, January 29, 2011
Saturday, January 22, 2011
Saturday, January 15, 2011
Saturday, January 08, 2011
Saturday, January 01, 2011
Saturday, December 25, 2010
Saturday, December 18, 2010
Saturday, December 11, 2010
Saturday, December 04, 2010
Saturday, November 27, 2010
Saturday, November 20, 2010
Saturday, November 13, 2010
Saturday, November 06, 2010
Saturday, October 30, 2010
Saturday, October 23, 2010
Saturday, October 16, 2010
Saturday, October 09, 2010
Saturday, October 02, 2010
Saturday, September 25, 2010
Saturday, September 18, 2010
Saturday, September 11, 2010
Saturday, September 04, 2010
Saturday, August 28, 2010
Saturday, August 21, 2010
Saturday, August 14, 2010
Saturday, August 07, 2010
Saturday, July 31, 2010
Saturday, July 24, 2010
Saturday, July 17, 2010
Saturday, July 10, 2010
Saturday, July 03, 2010
Saturday, June 26, 2010
Saturday, June 19, 2010
Saturday, June 12, 2010
Saturday, June 05, 2010
Saturday, May 22, 2010
Saturday, May 15, 2010
Saturday, May 08, 2010
Wednesday, May 05, 2010
\n", 232 | "
\n", 233 | "\n", 234 | "
\n", 235 | "\n", 236 | "\n", 237 | "\n", 249 | "
\n", 250 | "
\n", 251 | "
\n", 252 | "
\n", 253 | "
\n", 254 | "
    \n", 255 | "
  • Google Translate\"\"
  • \n", 256 | "
\n", 257 | "
\n", 258 | "
\n", 259 | "
\n", 260 | "
\n", 261 | "
\n", 262 | "\n", 279 | "\n", 280 | "\n", 289 | "\n", 296 | "\n", 298 | "
# %%
# Display the parsed page. This is the cell whose stored output is the long
# page dump that precedes this point in the notebook.
soup

# %% [markdown]
# ### To locate all 'a' tags

# %%
soup.find_all('a')

# %% [markdown]
# ### Let's take a quick look at the very first data file, which is the 'a' tag at index 36

# %%
# Zero-based position of the newest data file among all <a> tags at the time
# the page was scraped. Used for this one-off inspection only; the bulk
# download further down does not depend on it.
FIRST_DATA_TAG_INDEX = 36
one_a_tag = soup.find_all('a')[FIRST_DATA_TAG_INDEX]

# %% [markdown]
# ### We want to extract the actual link

# %%
link = one_a_tag['href']
link

# %% [markdown]
# ### The full download URL is 'http://web.mta.info/developers/' + link

# %% [markdown]
# ## To download the whole data set, let's loop through every 'a' tag that points at a turnstile file

# %%
BASE_URL = 'http://web.mta.info/developers/'  # hrefs on the page are relative to this
PAUSE_SECONDS = 1                             # delay between downloads, to be polite to the server


def turnstile_links(anchors):
    """Return the hrefs in ``anchors`` that point at a turnstile data file.

    Parameters
    ----------
    anchors : iterable
        Objects exposing ``.get('href')`` (BeautifulSoup ``Tag`` objects, or
        plain dicts when testing).

    Returns
    -------
    list of str
        Hrefs that contain ``'/turnstile_'`` and end in ``'.txt'``, in page
        order, e.g. ``'data/nyct/turnstile/turnstile_180922.txt'``.

    Notes
    -----
    Selecting by href pattern replaces the earlier positional rule ("every
    anchor from the 36th onwards"). That rule counted from 1, so it started
    one anchor *before* the first data file found at index 36 above. It also
    had no upper bound, so any anchor after the file list was treated as a
    data file: one without an ``href`` raised ``KeyError``, and one with an
    unrelated ``href`` produced a wrong URL and a wrong local file name.
    """
    hrefs = (tag.get('href') for tag in anchors)
    return [
        href for href in hrefs
        if href and '/turnstile_' in href and href.endswith('.txt')
    ]


# %%
# Separate loop variable names keep `link` / `one_a_tag` from the inspection
# cells intact, so re-running those cells out of order still shows the same values.
for data_link in turnstile_links(soup.find_all('a')):
    download_url = BASE_URL + data_link
    # Save next to the notebook under the bare file name (turnstile_YYMMDD.txt).
    local_path = './' + data_link.rsplit('/', 1)[-1]
    urllib.request.urlretrieve(download_url, local_path)
    time.sleep(PAUSE_SECONDS)