├── .gitignore ├── LICENSE ├── Python syntax cheat sheet.ipynb ├── Python syntax cheat sheet.pdf ├── README.md ├── ire-board ├── IRE Board members - complete.ipynb ├── IRE Board members - working.ipynb ├── ire-board.html └── ire_board_scrape.py ├── md-warn-notices ├── Maryland WARN Notices - multiple pages.ipynb └── Maryland WARN Notices.ipynb ├── requirements.txt ├── sd-lobbyists ├── data │ └── .gitkeep └── download_lobbyist_data.py ├── tx-railroad-commission ├── dl_pages_details.py ├── dl_pages_results.py ├── main.py ├── pages-detail │ └── .gitkeep ├── pages-results │ └── .gitkeep ├── scrape_detail_pages.py └── tx-railroad-commission-data.csv └── us-senate-press-gallery ├── U.S. Senate Press Gallery - complete.ipynb └── U.S. Senate Press Gallery - working.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | # Created by https://www.toptal.com/developers/gitignore/api/osx 131 | # Edit at https://www.toptal.com/developers/gitignore?templates=osx 132 | 133 | ### OSX ### 134 | # General 135 | .DS_Store 136 | .AppleDouble 137 | .LSOverride 138 | 139 | # Icon must end with two \r 140 | Icon 141 | 142 | 143 | # Thumbnails 144 | ._* 145 | 146 | # Files that might appear in the root of a volume 147 | .DocumentRevisions-V100 148 | .fseventsd 149 | .Spotlight-V100 150 | .TemporaryItems 151 | .Trashes 152 | .VolumeIcon.icns 153 | .com.apple.timemachine.donotpresent 154 | 155 | # Directories potentially created on remote AFP share 156 | .AppleDB 157 | .AppleDesktop 158 | Network Trash Folder 159 | Temporary Items 160 | .apdisk 161 | 162 | # End of https://www.toptal.com/developers/gitignore/api/osx 163 | 164 | session-notes 165 | tx-railroad-commission/*/*.html 166 | sd-lobbyists/data/*.zip 167 | sd-lobbyists/data/*.csv 168 | ire-board/*.csv -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 IRE & NICAR 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Python syntax cheat sheet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python syntax cheat sheet\n", 8 | "\n", 9 | "This notebook demonstrates some basic syntax rules of the Python programming language.\n", 10 | "\n", 11 | "- [Basic data types](#Basic-data-types)\n", 12 | " - [Strings](#Strings)\n", 13 | " - [Numbers and math](#Numbers-and-math)\n", 14 | " - [Booleans](#Booleans)\n", 15 | "- [Variable assignment](#Variable-assignment)\n", 16 | "- [String methods](#String-methods)\n", 17 | "- [Comments](#Comments)\n", 18 | "- [The print() function](#The-print()-function)\n", 19 | "- [Collections of data](#Collections-of-data)\n", 20 | " - [Lists](#Lists)\n", 21 | " - [Dictionaries](#Dictionaries)\n", 22 | "- [`for` loops](#for-loops)\n", 23 | "- [`if` statements](#if-statements)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### Basic data types\n", 31 | "Just like Excel and other data processing software, Python recognizes a variety of data types, including three we'll focus on here:\n", 32 | "- Strings (text)\n", 33 | "- Numbers (integers, numbers with decimals and more)\n", 34 | "- Booleans (`True` and `False`).\n", 35 | "\n", 36 | "You can use the built-in [`type()`](https://docs.python.org/3/library/functions.html#type) function to check the data type of a value." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "#### Strings\n", 44 | "\n", 45 | "A string is a group of characters -- letters, numbers, whatever -- enclosed within single or double quotes (doesn't matter as long as they match). The code in these notebooks uses single quotes. (The Python style guide doesn't recommend one over the other: [\"Pick a rule and stick to it.\"](https://www.python.org/dev/peps/pep-0008/#string-quotes))\n", 46 | "\n", 47 | "If your string _contains_ apostrophes or quotes, you have two options: _Escape_ the offending character with a backslash `\\`:\n", 48 | "\n", 49 | "```python\n", 50 | "'Isn\\'t it nice here?'\n", 51 | "```\n", 52 | "\n", 53 | "... or change the surrounding punctuation:\n", 54 | "\n", 55 | "```python\n", 56 | "\"Isn't it nice here?\"\n", 57 | "```\n", 58 | "\n", 59 | "The style guide recommends the latter over the former.\n", 60 | "\n", 61 | "When you call the `type()` function on a string, Python will return `str`.\n", 62 | "\n", 63 | "Calling the [`str()` function](https://docs.python.org/3/library/stdtypes.html#str) on a value will return the string version of that value (see examples below)."
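A standalone sketch of the two ideas above -- escaping and `str()` coercion -- using made-up example strings:

```python
# escape the apostrophe with a backslash, or switch the surrounding
# quotes so the apostrophe doesn't end the string early
escaped = 'Isn\'t it nice here?'
double_quoted = "Isn't it nice here?"
print(escaped == double_quoted)  # True -- same string either way

# str() returns the string version of a value
print(str(45) + ' is a string now')
print(type(str(True)))  # <class 'str'>
```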
64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "'Investigative Reporters and Editors'" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "type('hello!')" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "45" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "type(45)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "str(45)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "type(str(45))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "str(True)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "If you \"add\" strings together with a plus sign `+`, it will concatenate them:" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "'IRE' + '/' + 'NICAR'" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "#### Numbers and math\n", 150 | "\n", 151 | "Python recognizes a variety of numeric data types. Two of the most common are integers (whole numbers) and floats (numbers with decimals).\n", 152 | "\n", 153 | "Calling `int()` on a piece of numeric data (even if it's being stored as a string) will attempt to coerce it to an integer; calling `float()` will try to convert it to a float." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "12" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "12.4" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "type(12)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "type(12.4)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "int(35.6)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "int('45')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "float(46)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "float('45')" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "You can do [basic math](https://www.digitalocean.com/community/tutorials/how-to-do-math-in-python-3-with-operators) in Python. 
You can also do [more advanced math](https://docs.python.org/3/library/math.html)." 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "4+2" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "10-9" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": {}, 257 | "outputs": [], 258 | "source": [ 259 | "5*10" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "1000/10" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# ** raises a number to the power of another number\n", 278 | "5**2" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "#### Booleans\n", 286 | "\n", 287 | "Just like in Excel, which has `TRUE` and `FALSE` data types, Python has boolean data types. They are `True` and `False` -- note that only the first letter is capitalized, and they are not sandwiched between quotes.\n", 288 | "\n", 289 | "Boolean values are typically returned when you're evaluating some sort of conditional statement -- comparing values, checking to see if a string is inside another string or if a value is in a list, etc.\n", 290 | "\n", 291 | "[Python's comparison operators](https://docs.python.org/3/reference/expressions.html#comparisons) include:\n", 292 | "\n", 293 | "- `>` greater than\n", 294 | "- `<` less than\n", 295 | "- `>=` greater than or equal to\n", 296 | "- `<=` less than or equal to\n", 297 | "- `==` equal to\n", 298 | "- `!=` not equal to" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "True" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [ 316 | "False" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "4 > 6" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "10 == 10" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "'crapulence' == 'Crapulence'" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "type(True)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "### Variable assignment\n", 360 | "\n", 361 | "The `=` sign assigns a value to a variable name that you choose. Later, you can retrieve that value by referencing its variable name. Variable names can be pretty much anything you want ([as long as you follow some basic rules](https://thehelloworldprogram.com/python/python-variable-assignment-statements-rules-conventions-naming/)).\n", 362 | "\n", 363 | "This can be a tricky concept at first! For more detail, [here's a pretty good explainer from Digital Ocean](https://www.digitalocean.com/community/tutorials/how-to-use-variables-in-python-3)." 
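A small sketch (with made-up scores) showing how comparison operators produce boolean values you can store in variables, tying the last two sections together:

```python
home_score = 6
away_score = 8

# a comparison evaluates to a boolean, which you can assign to a variable
home_team_won = home_score > away_score

print(home_team_won)        # False
print(type(home_team_won))  # <class 'bool'>

# string comparisons are case-sensitive
print('crapulence' == 'Crapulence')  # False
```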
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "my_name = 'Frank'" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "my_name" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "You can also _reassign_ a different value to a variable name, though it's usually better practice to create a new variable." 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "my_name = 'Susan'" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "my_name" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "A common thing to do is to \"save\" the results of an expression by assigning the result to a variable." 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "my_fav_number = 10 + 3" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "my_fav_number" 432 | ] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "metadata": {}, 437 | "source": [ 438 | "It's also common to refer to previously defined variables in an expression: " 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": null, 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "nfl_teams = 32\n", 448 | "mlb_teams = 30\n", 449 | "nba_teams = 30\n", 450 | "nhl_teams = 31\n", 451 | "\n", 452 | "number_of_pro_sports_teams = nfl_teams + mlb_teams + nba_teams + nhl_teams" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "number_of_pro_sports_teams" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "### String methods\n", 469 | "\n", 470 | "Let's go back to strings for a second. String objects have a number of useful [methods](https://docs.python.org/3/library/stdtypes.html#string-methods) -- let's use an example string to demonstrate a few common ones." 
471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "my_cool_string = ' Hello, friends!'" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "`upper()` converts the string to uppercase:" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "my_cool_string.upper()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "`lower()` converts to lowercase:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "my_cool_string.lower()" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "`replace()` will replace a piece of text with other text that you specify:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "my_cool_string.replace('friends', 'enemies')" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "`count()` will count the number of occurrences of a character or group of characters: " 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "my_cool_string.count('H')" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "Note that `count()` is case-sensitive. If your task is \"count all the e's,\" convert your original string to upper or lowercase first:" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "metadata": {}, 557 | "outputs": [], 558 | "source": [ 559 | "my_cool_string.upper().count('E')" 560 | ] 561 | }, 562 | { 563 | "cell_type": "markdown", 564 | "metadata": {}, 565 | "source": [ 566 | "[`split()`](https://docs.python.org/3/library/stdtypes.html#str.split) will split the string into a [_list_](#Lists) (more on these in a second) on a given delimiter (if you don't specify a delimiter, it'll default to splitting on a space):" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": null, 572 | "metadata": {}, 573 | "outputs": [], 574 | "source": [ 575 | "my_cool_string.split()" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "metadata": {}, 582 | "outputs": [], 583 | "source": [ 584 | "my_cool_string.split(',')" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "my_cool_string.split('Pitt')" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "`strip()` removes whitespace from either side of your string (but not internal whitespace):" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "my_cool_string.strip()" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "You can use a cool thing called \"method chaining\" to combine methods -- just tack 'em onto the end. 
Let's say we wanted to strip whitespace from our string _and_ make it uppercase:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": null, 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "my_cool_string.strip().upper()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "Notice, however, that our original string is unchanged:" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "my_cool_string" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": {}, 647 | "source": [ 648 | "Why? Because we haven't assigned the results of anything we've done to a variable. A common thing to do, especially when you're cleaning data, would be to assign the results to a new variable:" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "my_cool_string_clean = my_cool_string.strip().upper()" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": null, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "my_cool_string_clean" 667 | ] 668 | }, 669 | { 670 | "cell_type": "markdown", 671 | "metadata": {}, 672 | "source": [ 673 | "### Comments\n", 674 | "A line with a comment -- a note that you don't want Python to interpret -- starts with a `#` sign. These are notes to collaborators and to your future self about what's happening at this point in your script, and why.\n", 675 | "\n", 676 | "Typically you'd put this on the line right above the line of code you're commenting on:" 677 | ] 678 | }, 679 | { 680 | "cell_type": "code", 681 | "execution_count": null, 682 | "metadata": {}, 683 | "outputs": [], 684 | "source": [ 685 | "avg_settlement = 40827348.34328237\n", 686 | "\n", 687 | "# coercing this to an int because we don't need any decimal precision\n", 688 | "int(avg_settlement)" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "Multi-line comments are sandwiched between triple quotes (or triple apostrophes):\n", 696 | "\n", 697 | "`'''\n", 698 | "this\n", 699 | "is a long\n", 700 | "comment\n", 701 | "'''`\n", 702 | "\n", 703 | "or\n", 704 | "\n", 705 | "`\"\"\"\n", 706 | "this\n", 707 | "is a long\n", 708 | "comment\n", 709 | "\"\"\"`" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "### The `print()` function\n", 717 | "\n", 718 | "So far, we've just been running the notebook cells to get the last value returned by the code we write. Using the [`print()`](https://docs.python.org/3/library/functions.html#print) function is a way to print specific things in your script to the screen. This function is handy for debugging.\n", 719 | "\n", 720 | "To print multiple things on the same line, separate them with a comma." 
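A quick supplementary sketch -- with a made-up messy value -- showing method chaining and `print()` working together, the way you might clean up a scraped string:

```python
# a made-up messy value, like something copied out of a web page
raw_value = '   Hello, friends!   '

# chain string methods, then save the cleaned-up result to a new variable
clean_value = raw_value.strip().replace('friends', 'colleagues').upper()

print('before:', raw_value)
print('after:', clean_value)  # after: HELLO, COLLEAGUES!
```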
721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "metadata": {}, 727 | "outputs": [], 728 | "source": [ 729 | "print('Hello!')" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": null, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "print(my_name)" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "print('Hello,', my_name)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "## Collections of data\n", 755 | "\n", 756 | "Now we're going to talk about two ways you can use Python to group data into a collection: lists and dictionaries." 757 | ] 758 | }, 759 | { 760 | "cell_type": "markdown", 761 | "metadata": {}, 762 | "source": [ 763 | "### Lists\n", 764 | "\n", 765 | "A _list_ is a comma-separated list of items inside square brackets: `[]`.\n", 766 | "\n", 767 | "Here's a list of ingredients, each one a string, that together makes up a salsa recipe." 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "salsa_ingredients = ['tomato', 'onion', 'jalapeño', 'lime', 'cilantro']" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "To get an item out of a list, you'd refer to its numerical position in the list -- its _index_ (1, 2, 3, etc.) -- inside square brackets immediately following your reference to that list. In Python, as in many other programming languages, counting starts at 0. That means the first item in a list is item `0`." 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "metadata": {}, 790 | "outputs": [], 791 | "source": [ 792 | "salsa_ingredients[0]" 793 | ] 794 | }, 795 | { 796 | "cell_type": "code", 797 | "execution_count": null, 798 | "metadata": {}, 799 | "outputs": [], 800 | "source": [ 801 | "salsa_ingredients[1]" 802 | ] 803 | }, 804 | { 805 | "cell_type": "markdown", 806 | "metadata": {}, 807 | "source": [ 808 | "You can use _negative indexing_ to grab things from the right-hand side of the list -- and in fact, `[-1]` is a common idiom for getting \"the last item in a list\" when it's not clear how many items are in your list." 
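A tiny sketch of indexing with an invented list:

```python
# an invented list for illustration
reporters = ['Ida', 'Nellie', 'Upton', 'Rachel']

print(reporters[0])   # 'Ida' -- counting starts at 0
print(reporters[2])   # 'Upton'
print(reporters[-1])  # 'Rachel' -- the last item, however long the list gets
```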
809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": null, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [ 817 | "salsa_ingredients[-1]" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "If you wanted to get a slice of multiple items out of your list, you'd use colons (just like in Excel, kind of!).\n", 825 | "\n", 826 | "If you wanted to get the first three items, you'd do this:" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "salsa_ingredients[0:3]" 836 | ] 837 | }, 838 | { 839 | "cell_type": "markdown", 840 | "metadata": {}, 841 | "source": [ 842 | "You could also have left off the initial 0 -- when you leave out the first number, Python defaults to \"the first item in the list.\" In the same way, if you leave off the last number, Python defaults to \"the last item in the list.\"" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": {}, 849 | "outputs": [], 850 | "source": [ 851 | "salsa_ingredients[:3]" 852 | ] 853 | }, 854 | { 855 | "cell_type": "markdown", 856 | "metadata": {}, 857 | "source": [ 858 | "Note, too, that this slice is giving us items 0, 1 and 2. The `3` in our slice is the first item we _don't_ want. That can be kind of confusing at first. Let's try a few more:" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": null, 864 | "metadata": {}, 865 | "outputs": [], 866 | "source": [ 867 | "# everything in the list except the first item\n", 868 | "salsa_ingredients[1:]" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "# the second, third and fourth items\n", 878 | "salsa_ingredients[1:4]" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "# the last two items\n", 888 | "salsa_ingredients[-2:]" 889 | ] 890 | }, 891 | { 892 | "cell_type": "markdown", 893 | "metadata": {}, 894 | "source": [ 895 | "To see how many items are in a list, use the `len()` function:" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "len(salsa_ingredients)" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": {}, 910 | "source": [ 911 | "To add an item to a list, use the [`append()`](https://docs.python.org/3/tutorial/datastructures.html#more-on-lists) method:" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "salsa_ingredients" 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": {}, 927 | "outputs": [], 928 | "source": [ 929 | "salsa_ingredients.append('mayonnaise')" 930 | ] 931 | }, 932 | { 933 | "cell_type": "code", 934 | "execution_count": null, 935 | "metadata": {}, 936 | "outputs": [], 937 | "source": [ 938 | "salsa_ingredients" 939 | ] 940 | }, 941 | { 942 | "cell_type": "markdown", 943 | "metadata": {}, 944 | "source": [ 945 | "Haha _gross_. To remove an item from a list, use the `pop()` method. 
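One more slicing sketch, reusing the salsa list so it stands on its own:

```python
salsa_ingredients = ['tomato', 'onion', 'jalapeño', 'lime', 'cilantro']

# the start index is included, the end index is not
print(salsa_ingredients[1:3])   # ['onion', 'jalapeño']

# leaving off the start means "from the beginning";
# leaving off the end means "through the last item"
print(salsa_ingredients[:2])    # ['tomato', 'onion']
print(salsa_ingredients[2:])    # ['jalapeño', 'lime', 'cilantro']
```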
If you don't specify the index number of the item you want to pop out, it will default to \"the last item.\"" 946 | ] 947 | }, 948 | { 949 | "cell_type": "code", 950 | "execution_count": null, 951 | "metadata": {}, 952 | "outputs": [], 953 | "source": [ 954 | "salsa_ingredients.pop()" 955 | ] 956 | }, 957 | { 958 | "cell_type": "code", 959 | "execution_count": null, 960 | "metadata": { 961 | "scrolled": true 962 | }, 963 | "outputs": [], 964 | "source": [ 965 | "salsa_ingredients" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": {}, 971 | "source": [ 972 | "You can use the [`in` and `not in`](https://docs.python.org/3/reference/expressions.html#membership-test-operations) expressions to test membership in a list (will return a boolean):" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": null, 978 | "metadata": {}, 979 | "outputs": [], 980 | "source": [ 981 | "'lime' in salsa_ingredients" 982 | ] 983 | }, 984 | { 985 | "cell_type": "code", 986 | "execution_count": null, 987 | "metadata": {}, 988 | "outputs": [], 989 | "source": [ 990 | "'cilantro' not in salsa_ingredients" 991 | ] 992 | }, 993 | { 994 | "cell_type": "markdown", 995 | "metadata": {}, 996 | "source": [ 997 | "### Dictionaries\n", 998 | "\n", 999 | "A _dictionary_ is a comma-separated list of key/value pairs inside curly brackets: `{}`. Let's make an entire salsa recipe:" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "metadata": {}, 1006 | "outputs": [], 1007 | "source": [ 1008 | "salsa = {\n", 1009 | " 'ingredients': salsa_ingredients,\n", 1010 | " 'instructions': 'Chop up all the ingredients and cook them for awhile.',\n", 1011 | " 'oz_made': 12\n", 1012 | "}" 1013 | ] 1014 | }, 1015 | { 1016 | "cell_type": "markdown", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "To retrieve a value from a dictionary, you'd refer to the name of its key inside square brackets `[]` immediately after your reference to the dictionary:" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | "metadata": {}, 1026 | "outputs": [], 1027 | "source": [ 1028 | "salsa['oz_made']" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": null, 1034 | "metadata": {}, 1035 | "outputs": [], 1036 | "source": [ 1037 | "salsa['ingredients']" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "markdown", 1042 | "metadata": {}, 1043 | "source": [ 1044 | "To add a new key/value pair to a dictionary, assign a new key to the dictionary inside square brackets and set the value of that key with `=`:" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "metadata": {}, 1051 | "outputs": [], 1052 | "source": [ 1053 | "salsa['tastes_great'] = True" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": null, 1059 | "metadata": {}, 1060 | "outputs": [], 1061 | "source": [ 1062 | "salsa" 1063 | ] 1064 | }, 1065 | { 1066 | "cell_type": "markdown", 1067 | "metadata": {}, 1068 | "source": [ 1069 | "To delete a key/value pair out of a dictionary, use the `del` command and reference the key:" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "code", 1074 | "execution_count": null, 1075 | "metadata": {}, 1076 | "outputs": [], 1077 | "source": [ 1078 | "del salsa['tastes_great']" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "code", 1083 | "execution_count": null, 1084 | "metadata": {}, 1085 | "outputs": [], 1086 | "source": [ 1087 | "salsa" 1088 | 
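A compact, self-contained version of the dictionary operations described in this section (the recipe values are made up):

```python
recipe = {
    'ingredients': ['tomato', 'onion', 'lime'],
    'oz_made': 12
}

# look up a value by its key
print(recipe['oz_made'])  # 12

# add a new key/value pair ...
recipe['tastes_great'] = True

# ... and delete one
del recipe['tastes_great']

print(recipe)
```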
] 1089 | }, 1090 | { 1091 | "cell_type": "markdown", 1092 | "metadata": {}, 1093 | "source": [ 1094 | "### Indentation\n", 1095 | "\n", 1096 | "Whitespace matters in Python. Sometimes you'll need to indent bits of code to make things work. This can be confusing! `IndentationError`s are common even for experienced programmers. (FWIW, Jupyter will try to be helpful and insert the correct amount of \"significant whitespace\" for you.)\n", 1097 | "\n", 1098 | "You can use tabs or spaces, just don't mix them. [The Python style guide](https://www.python.org/dev/peps/pep-0008/) recommends indenting your code in groups of four spaces, so that's what we'll use." 1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "markdown", 1103 | "metadata": {}, 1104 | "source": [ 1105 | "### `for` loops\n", 1106 | "\n", 1107 | "You would use a `for` loop to iterate over a collection of things. The statement begins with the keyword `for` (lowercase), then a temporary `variable_name` of your choice to represent each item as you loop through the collection, then the Python keyword `in`, then the collection you're looping over (or its variable name), then a colon, then the indented block of code with instructions about what to do with each item in the collection.\n", 1108 | "\n", 1109 | "Let's say we have a list of numbers that we assign to the variable `list_of_numbers`." 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": null, 1115 | "metadata": {}, 1116 | "outputs": [], 1117 | "source": [ 1118 | "list_of_numbers = [1, 2, 3, 4, 5, 6]" 1119 | ] 1120 | }, 1121 | { 1122 | "cell_type": "markdown", 1123 | "metadata": {}, 1124 | "source": [ 1125 | "We could loop over the list and print out each number:" 1126 | ] 1127 | }, 1128 | { 1129 | "cell_type": "code", 1130 | "execution_count": null, 1131 | "metadata": {}, 1132 | "outputs": [], 1133 | "source": [ 1134 | "for number in list_of_numbers:\n", 1135 | " print(number)" 1136 | ] 1137 | }, 1138 | { 1139 | "cell_type": "markdown", 1140 | "metadata": {}, 1141 | "source": [ 1142 | "We could print out each number _times 6_:" 1143 | ] 1144 | }, 1145 | { 1146 | "cell_type": "code", 1147 | "execution_count": null, 1148 | "metadata": {}, 1149 | "outputs": [], 1150 | "source": [ 1151 | "for number in list_of_numbers:\n", 1152 | " print(number*6)" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "markdown", 1157 | "metadata": {}, 1158 | "source": [ 1159 | "... whatever you need to do in you loop. Note that the variable name `number` in our loop is totally arbitrary. This also would work:" 1160 | ] 1161 | }, 1162 | { 1163 | "cell_type": "code", 1164 | "execution_count": null, 1165 | "metadata": {}, 1166 | "outputs": [], 1167 | "source": [ 1168 | "for banana in list_of_numbers:\n", 1169 | " print(banana)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "It can be hard, at first, to figure out what's a \"Python word\" and what's a variable name that you get to define. This comes with practice." 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "markdown", 1181 | "metadata": {}, 1182 | "source": [ 1183 | "Strings are iterable, too. 
Let's loop over the letters in a sentence:" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "metadata": {}, 1190 | "outputs": [], 1191 | "source": [ 1192 | "sentence = 'Hello, IRE/NICAR!'\n", 1193 | "\n", 1194 | "for letter in sentence:\n", 1195 | " print(letter)" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "metadata": {}, 1201 | "source": [ 1202 | "To this point: Strings are iterable, like lists, so you can use the same kinds of methods:" 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "# get the first five characters\n", 1212 | "sentence[:5]" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": null, 1218 | "metadata": {}, 1219 | "outputs": [], 1220 | "source": [ 1221 | "# get the length of the sentence\n", 1222 | "len(sentence)" 1223 | ] 1224 | }, 1225 | { 1226 | "cell_type": "code", 1227 | "execution_count": null, 1228 | "metadata": {}, 1229 | "outputs": [], 1230 | "source": [ 1231 | "'Hello' in sentence" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": {}, 1237 | "source": [ 1238 | "You can iterate over dictionaries, too. (In Python 3.7 and later, dictionaries keep track of the order that items were added; in older versions, they don't.)\n", 1239 | "\n", 1240 | "When you're looping over a dictionary, the variable name in your `for` loop will refer to the keys. Let's loop over our `salsa` dictionary from up above to see what I mean." 1241 | ] 1242 | }, 1243 | { 1244 | "cell_type": "code", 1245 | "execution_count": null, 1246 | "metadata": {}, 1247 | "outputs": [], 1248 | "source": [ 1249 | "for key in salsa:\n", 1250 | " print(key)" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "markdown", 1255 | "metadata": {}, 1256 | "source": [ 1257 | "To get the _value_ of a dictionary item in a for loop, you'd need to use the key to retrieve it from the dictionary:" 1258 | ] 1259 | }, 1260 | { 1261 | "cell_type": "code", 1262 | "execution_count": null, 1263 | "metadata": {}, 1264 | "outputs": [], 1265 | "source": [ 1266 | "for key in salsa:\n", 1267 | " print(salsa[key])" 1268 | ] 1269 | }, 1270 | { 1271 | "cell_type": "markdown", 1272 | "metadata": {}, 1273 | "source": [ 1274 | "### `if` statements\n", 1275 | "Just like in Excel, you can use the \"if\" keyword to handle conditional logic.\n", 1276 | "\n", 1277 | "These statements begin with the keyword `if` (lowercase), then the condition to evaluate, then a colon, then a new line with a block of indented code to execute if the condition resolves to `True`." 1278 | ] 1279 | }, 1280 | { 1281 | "cell_type": "code", 1282 | "execution_count": null, 1283 | "metadata": {}, 1284 | "outputs": [], 1285 | "source": [ 1286 | "if 4 < 6:\n", 1287 | " print('4 is less than 6')" 1288 | ] 1289 | }, 1290 | { 1291 | "cell_type": "markdown", 1292 | "metadata": {}, 1293 | "source": [ 1294 | "You can also add an `else` statement (and a colon) with an indented block of code you want to run if the condition resolves to `False`."
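Conditions are especially useful inside loops -- a small sketch with made-up scores that combines a `for` loop, `if`/`else` and `append()` (the full `if`/`else` cells follow below):

```python
scores = [3, 10, 7, 10]
perfect_scores = []

for score in scores:
    if score == 10:
        # keep track of the perfect scores in a new list
        perfect_scores.append(score)
    else:
        print(score, 'is not a perfect score')

print(perfect_scores)  # [10, 10]
```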
1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "code", 1299 | "execution_count": null, 1300 | "metadata": {}, 1301 | "outputs": [], 1302 | "source": [ 1303 | "if 4 > 6:\n", 1304 | " print('4 is greater than 6?!')\n", 1305 | "else:\n", 1306 | " print('4 is not greater than 6.')" 1307 | ] 1308 | }, 1309 | { 1310 | "cell_type": "markdown", 1311 | "metadata": {}, 1312 | "source": [ 1313 | "If you need to, you can add multiple conditions with `elif`." 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": null, 1319 | "metadata": {}, 1320 | "outputs": [], 1321 | "source": [ 1322 | "HOME_SCORE = 6\n", 1323 | "AWAY_SCORE = 8\n", 1324 | "\n", 1325 | "if HOME_SCORE > AWAY_SCORE:\n", 1326 | " print('we won!')\n", 1327 | "elif HOME_SCORE == AWAY_SCORE:\n", 1328 | " print('we tied!')\n", 1329 | "else:\n", 1330 | " print('we lost!')" 1331 | ] 1332 | } 1333 | ], 1334 | "metadata": { 1335 | "kernelspec": { 1336 | "display_name": "Python 3 (ipykernel)", 1337 | "language": "python", 1338 | "name": "python3" 1339 | }, 1340 | "language_info": { 1341 | "codemirror_mode": { 1342 | "name": "ipython", 1343 | "version": 3 1344 | }, 1345 | "file_extension": ".py", 1346 | "mimetype": "text/x-python", 1347 | "name": "python", 1348 | "nbconvert_exporter": "python", 1349 | "pygments_lexer": "ipython3", 1350 | "version": "3.10.9" 1351 | } 1352 | }, 1353 | "nbformat": 4, 1354 | "nbformat_minor": 2 1355 | } 1356 | -------------------------------------------------------------------------------- /Python syntax cheat sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/Python syntax cheat sheet.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NICAR 2023: Web scraping with Python 2 | 3 | ### 🔗 [bit.ly/nicar23-scraping](https://bit.ly/nicar23-scraping) 4 | 5 | This repo contains materials for a half-day workshop at the NICAR 2023 data journalism conference in Nashville on using Python to scrape data from websites. 6 | 7 | The session is scheduled for Sunday, March 5, from 9 a.m. - 12:30 p.m. in room `Midtown 3` on Meeting Space Level 2. 8 | 9 | ### First step 10 | 11 | Open the Terminal application. Copy and paste this text into the Terminal and hit enter: 12 | 13 | ```bat 14 | cd Desktop/hands_on_classes/20230305-sunday-web-scraping-with-python--preregistered-attendees-only & .\env\Scripts\activate 15 | ``` 16 | 17 | ### Course outline 18 | - Do you really need to scrape this? 19 | - Process overview: 20 | - Fetch, parse, write data to file 21 | - Some best practices 22 | - Make sure you feel OK about whether your scraping project is (legally, ethically, etc.) allowable 23 | - Don't DDOS your target server 24 | - When feasible, save copies of pages locally, then scrape from those files 25 | - [Rotate user-agent strings](https://www.useragents.me/) and other headers if necessary to avoid bot detection 26 | - Using your favorite browser's inspection tools to deconstruct the target page(s) 27 | - See if the data is delivered to the page in a ready-to-use format, such as JSON ([example](https://sdlegislature.gov/Session/Archived)) 28 | - Is the HTML part of the actual page structure, or is it built on the fly when the page loads?
([example](https://rrctx.force.com/s/complaints)) 29 | - Can you open the URL directly in an incognito window and get to the same content, or does the page require a specific state to deliver the content (via search navigation, etc.)? ([example](https://rrctx.force.com/s/ietrs-complaint/a0ct0000000mOmhAAE/complaint0000000008)) 30 | - Are there [URL query parameters](https://en.wikipedia.org/wiki/Query_string) that you can tweak to get different results? ([example](https://www.worksafe.qld.gov.au/news-and-events/alerts)) 31 | - Choose tools that make the most sense for your target page(s) -- a few popular options: 32 | - [`requests`](https://requests.readthedocs.io/en/latest/) and [`BeautifulSoup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) 33 | - [`playwright`](https://playwright.dev/python) (optionally using `BeautifulSoup` for the HTML parsing) 34 | - [`scrapy`](https://scrapy.org/) for larger spidering/crawling tasks 35 | - Overview of our Python setup today 36 | - Activating the virtual environment 37 | - Jupyter notebooks 38 | - Running `.py` files from the command line 39 | - Our projects today: 40 | - [Maryland WARN notices](md-warn-notices) 41 | - [U.S. Senate press gallery](us-senate-press-gallery) 42 | - [IRE board members](ire-board) 43 | - [South Dakota lobbyist registration data](sd-lobbyists) 44 | - [Texas Railroad Commission complaints](tx-railroad-commission) 45 | 46 | ### Additional resources 47 | - Need to scrape on a timer? [Try GitHub Actions](https://palewi.re/docs/first-github-scraper) (Other options: Using your computer's scheduler tools, putting your script on a remote server with a [`crontab` configuration](https://en.wikipedia.org/wiki/Cron), [switching to Google Apps Script and setting up time-based triggers](https://developers.google.com/apps-script/guides/triggers), etc.)
48 | - [A neat technique for copying data to your clipboard while scraping a Flourish visualization](https://til.simonwillison.net/shot-scraper/scraping-flourish) 49 | - [Walkthrough: Class-based scraping](https://blog.apps.npr.org/2016/06/17/scraping-tips.html) 50 | 51 | 52 | ### Running this code at home 53 | - Install Python, if you haven't already ([here's our guide](https://docs.google.com/document/d/1cYmpfZEZ8r-09Q6Go917cKVcQk_d0P61gm0q8DAdIdg/edit)) 54 | - Clone or download this repo 55 | - `cd` into the repo directory and install the requirements, preferably into a virtual environment using your tooling of choice: `pip install -r requirements.txt` 56 | - `playwright install` 57 | - `jupyter notebook` to launch the notebook server 58 | -------------------------------------------------------------------------------- /ire-board/IRE Board members - complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4917bfba", 6 | "metadata": {}, 7 | "source": [ 8 | "# IRE Board members\n", 9 | "\n", 10 | "The goal: Scrape [this list of IRE board members](https://www.ire.org/about-ire/past-ire-board-members/) into a CSV.\n", 11 | "\n", 12 | "This project introduces a few new concepts:\n", 13 | "- Scraping data that's not part of a table\n", 14 | "- Specifying custom request headers to evade a bot detection rule on our server\n", 15 | "- Using string methods and default values when parsing out the data" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "bfd3d8c7", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "# stdlib library we'll use to write the CSV file\n", 26 | "import csv\n", 27 | "\n", 28 | "# installed library to handle the HTTP traffic\n", 29 | "import requests\n", 30 | "\n", 31 | "# installed library to parse the HTML\n", 32 | "from bs4 import BeautifulSoup" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "1acd7756", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "URL = 'https://www.ire.org/about-ire/past-ire-board-members/'" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "id": "accded42", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "# set up request headers\n", 53 | "# the IRE website rejects incoming requests with the\n", 54 | "# `requests` library's default user-agent, so we\n", 55 | "# need to pretend to be a browser -- we can do that by\n", 56 | "# setting the `User-Agent` value to mimic a value that\n", 57 | "# a browser would send, and add this to the headers\n", 58 | "# of the request before it's sent\n", 59 | "# read more: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent\n", 60 | "headers = {\n", 61 | " 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'\n", 62 | "}" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "03294e7e", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "# send a GET request to fetch the page using the headers we just created\n", 73 | "r = requests.get(\n", 74 | " 'https://www.ire.org/about-ire/past-ire-board-members/',\n", 75 | " headers=headers\n", 76 | ")\n", 77 | "\n", 78 | "# raise an error if the HTTP request returns an error code\n", 79 | "# HTTP codes: https://http.cat\n", 80 | "r.raise_for_status()" 81 | ] 82 | }, 83 | { 84 | 
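Tying this back to the README's "save copies of pages locally" advice: a minimal sketch of fetching the page once, saving the HTML to disk and pausing between requests. (The output filename and the two-second delay are arbitrary choices for illustration, not part of the lesson files.)

```python
import time

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'
}

r = requests.get(
    'https://www.ire.org/about-ire/past-ire-board-members/',
    headers=headers
)
r.raise_for_status()

# save a local copy so you can rerun your parsing code
# without hitting the server again
with open('ire-board-page.html', 'w') as outfile:
    outfile.write(r.text)

# if you go on to fetch more pages, pause between requests
# so you don't hammer the target server
time.sleep(2)
```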
"cell_type": "code", 85 | "execution_count": null, 86 | "id": "e5c65871", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# use the BeautifulSoup object to parse the response text\n", 91 | "# -- r.text -- with the default HTML parser\n", 92 | "# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use\n", 93 | "soup = BeautifulSoup(r.text, 'html.parser')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "400f25c3", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "print(soup)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "73db6014", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# search the HTML tree to find the div\n", 114 | "# with the `id` attribute of \"past-ire-board-members\"\n", 115 | "target_div = soup.find(\n", 116 | " 'div',\n", 117 | " {'id': 'past-ire-board-members'}\n", 118 | ")" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "df88000b", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "print(target_div)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "4ad3f74f", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "# within that div, find all the paragraph tags\n", 139 | "members = target_div.find_all('p')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "7b51b34b", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "members" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "cb711ee3", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# set up the CSV headers to write to file\n", 160 | "csv_headers = [\n", 161 | " 'name',\n", 162 | " 'terms',\n", 163 | " 'was_president',\n", 164 | " 'is_deceased'\n", 165 | "]" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "787cb02f", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# next, set up the file to write the CSV data into\n", 176 | "# https://docs.python.org/3/library/csv.html#csv.writer\n", 177 | "\n", 178 | "# open the CSV file in write ('w') mode, specifying newline='' to deal with\n", 179 | "# potential PC-only line ending problem\n", 180 | "with open('ire-board.csv', 'w', newline='') as outfile:\n", 181 | "\n", 182 | " # set up a csv.writer object tied to the file we just opened\n", 183 | " writer = csv.writer(outfile)\n", 184 | "\n", 185 | " # write the list of headers\n", 186 | " writer.writerow(csv_headers)\n", 187 | "\n", 188 | " # loop over the list of paragraphs we targeted above\n", 189 | " for member in members:\n", 190 | "\n", 191 | " # we don't want the entire Tag object, just the text\n", 192 | " text = member.text\n", 193 | "\n", 194 | " # set up some default values -- the member was not president\n", 195 | " was_president = False\n", 196 | "\n", 197 | " # and is not deceased\n", 198 | " is_deceased = False\n", 199 | "\n", 200 | " # IRE denotes past presidents with a leading asterisk\n", 201 | " # so check to see if the string startswith '*'\n", 202 | " # https://docs.python.org/3/library/stdtypes.html?highlight=startswith#str.startswith\n", 203 | " if text.startswith('*'):\n", 204 | "\n", 205 | " # if so, switch the value for the `was_president` variable to True\n", 206 | " was_president = True\n", 207 | "\n", 208 | " # check to 
see if \"(dec)\" is anywhere in the text, which\n", 209 | " # indicates this person is deceased\n", 210 | " # https://docs.python.org/3/reference/expressions.html#in\n", 211 | " if '(dec)' in text:\n", 212 | " is_deceased = True\n", 213 | "\n", 214 | " # next, start parsing out the pieces\n", 215 | " # separate the name from the terms by splitting on \"(\"\n", 216 | " text_split = text.split('(')\n", 217 | "\n", 218 | " # the name will be the first ([0]) item in the resulting list\n", 219 | " # while we're at it, strip off any leading asterisks\n", 220 | " # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.lstrip\n", 221 | " # and strip() off any leading or trailing whitespace\n", 222 | " # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.strip\n", 223 | " name = text_split[0].lstrip('*').strip()\n", 224 | "\n", 225 | " # the term(s) of service will be the second item ([1]) in that list\n", 226 | " # and the term text is always terminated with a closing parens\n", 227 | " # so splitting on that closing parens and taking the first ([0])\n", 228 | " # item in the list will give us the term(s)\n", 229 | " terms = text_split[1].split(')')[0]\n", 230 | "\n", 231 | " # put the collected data into a list\n", 232 | " data = [\n", 233 | " name,\n", 234 | " terms,\n", 235 | " was_president,\n", 236 | " is_deceased\n", 237 | " ]\n", 238 | "\n", 239 | " # and write this row of data into the CSV file\n", 240 | " writer.writerow(data)" 241 | ] 242 | } 243 | ], 244 | "metadata": { 245 | "kernelspec": { 246 | "display_name": "Python 3 (ipykernel)", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.10.9" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 5 265 | } 266 | -------------------------------------------------------------------------------- /ire-board/IRE Board members - working.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4917bfba", 6 | "metadata": {}, 7 | "source": [ 8 | "# IRE Board members\n", 9 | "\n", 10 | "The goal: Scrape [this list of IRE board members](https://www.ire.org/about-ire/past-ire-board-members/) into a CSV.\n", 11 | "\n", 12 | "This project introduces a few new concepts:\n", 13 | "- Scraping data that's not part of a table\n", 14 | "- Specifying custom request headers to evade a bot detection rule on our server\n", 15 | "- Using string methods and default values when parsing out the data\n", 16 | "\n", 17 | "[The completed version is here](IRE%20Board%20members%20-%20complete.ipynb).\n", 18 | "\n", 19 | "([See also this standalone version featuring a few more advanced techniques](/edit/ire-board/ire_board_scrape.py).)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "id": "bfd3d8c7", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "# stdlib library we'll use to write the CSV file\n", 30 | "import csv\n", 31 | "\n", 32 | "# installed library to handle the HTTP traffic\n", 33 | "import requests\n", 34 | "\n", 35 | "# installed library to parse the HTML\n", 36 | "from bs4 import BeautifulSoup" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | 
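To see the text-parsing steps from the completed notebook in isolation, here is a standalone sketch run on a single made-up entry -- the name and years are invented, but the format (leading asterisk for past presidents, terms in parentheses, "(dec)" for deceased members) is the one described above:

```python
text = '*Jane Example (1989-1993) (dec)'

was_president = text.startswith('*')   # past presidents get a leading asterisk
is_deceased = '(dec)' in text

# everything before the first '(' is the name; drop the asterisk and whitespace
text_split = text.split('(')
name = text_split[0].lstrip('*').strip()

# the terms sit between the first '(' and the next ')'
terms = text_split[1].split(')')[0]

print(name, '|', terms, '|', was_president, '|', is_deceased)
# Jane Example | 1989-1993 | True | True
```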
"id": "1acd7756", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "URL = 'https://www.ire.org/about-ire/past-ire-board-members/'" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "434e47d8", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# make the request\n", 57 | "\n", 58 | "# check for HTTP errors" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "accded42", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# set up request headers with a custom user-agent string\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "03294e7e", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# try the request again, with the new headers\n", 79 | "\n", 80 | "\n", 81 | "# and raise for errors\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "e5c65871", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# parse the HTML into soup\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "id": "7c3f3e35", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "73db6014", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "# search the HTML tree to find the div\n", 110 | "# with the `id` attribute of \"past-ire-board-members\"\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "df88000b", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "4ad3f74f", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# within that div, find all the paragraph tags\n" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "7b51b34b", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "id": "c0058f3a", 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# noodle around here to isolate the pieces of data for export" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "id": "6ec1c43b", 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "a8373c6c", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "7cfb1966", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "f35f134d", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "id": "cb711ee3", 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "# set up the CSV headers to write to file\n" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "787cb02f", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "# next, set up the file to write the CSV data into\n", 199 | "# https://docs.python.org/3/library/csv.html#csv.writer\n", 200 | "\n", 201 | "# open the CSV file in write ('w') mode, specifying newline='' to deal with\n", 202 | "# 
potential PC-only line ending problem\n", 203 | "\n", 204 | "\n", 205 | " # set up a csv.writer object tied to the file we just opened\n", 206 | "\n", 207 | "\n", 208 | " # write the list of headers\n", 209 | "\n", 210 | "\n", 211 | " # loop over the list of paragraphs we targeted above\n", 212 | "\n", 213 | "\n", 214 | " # we don't want the entire Tag object, just the text\n", 215 | "\n", 216 | "\n", 217 | " # set up some default values -- the member was not president\n", 218 | "\n", 219 | "\n", 220 | " # and is not deceased\n", 221 | "\n", 222 | "\n", 223 | " # IRE denotes past presidents with a leading asterisk\n", 224 | " # so check to see if the string startswith '*'\n", 225 | " # https://docs.python.org/3/library/stdtypes.html?highlight=startswith#str.startswith\n", 226 | "\n", 227 | "\n", 228 | " # if so, switch the value for the `was_president` variable to True\n", 229 | "\n", 230 | "\n", 231 | " # check to see if \"(dec)\" is anywhere in the text, which\n", 232 | " # indicates this person is deceased\n", 233 | " # https://docs.python.org/3/reference/expressions.html#in\n", 234 | "\n", 235 | "\n", 236 | " # next, start parsing out the pieces\n", 237 | " # separate the name from the terms by splitting on \"(\"\n", 238 | "\n", 239 | "\n", 240 | " # the name will be the first ([0]) item in the resulting list\n", 241 | " # while we're at it, strip off any leading asterisks\n", 242 | " # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.lstrip\n", 243 | " # and strip() off any leading or trailing whitespace\n", 244 | " # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.strip\n", 245 | "\n", 246 | "\n", 247 | " # the term(s) of service will be the second item ([1]) in that list\n", 248 | " # and the term text is always terminated with a closing parens\n", 249 | " # so splitting on that closing parens and taking the first ([0])\n", 250 | " # item in the list will give us the term(s)\n", 251 | "\n", 252 | "\n", 253 | " # put the collected data into a list\n", 254 | "\n", 255 | "\n", 256 | " # and write this row of data into the CSV file\n" 257 | ] 258 | } 259 | ], 260 | "metadata": { 261 | "kernelspec": { 262 | "display_name": "Python 3 (ipykernel)", 263 | "language": "python", 264 | "name": "python3" 265 | }, 266 | "language_info": { 267 | "codemirror_mode": { 268 | "name": "ipython", 269 | "version": 3 270 | }, 271 | "file_extension": ".py", 272 | "mimetype": "text/x-python", 273 | "name": "python", 274 | "nbconvert_exporter": "python", 275 | "pygments_lexer": "ipython3", 276 | "version": "3.10.9" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 5 281 | } 282 | -------------------------------------------------------------------------------- /ire-board/ire_board_scrape.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This version demonstrates a few more advanced techniques -- inline comments are mainly for stuff not covered in the basic script: 3 | - Separation of concerns: Writing a function to handle each task -- downloading the page and scraping the data -- and setting up the script to allow those functions to be imported into other scripts, if that need should ever arise 4 | - Doing a little more text processing to break the name into last/rest components, and to separate out terms of service, so now the atomic observation being written to file is a term of service, not a board member 5 | - Using csv.DictWriter instead of csv.writer 6 | - Demonstrating a few other useful Python 
techniques, such as list comprehensions, multiple assignment, star unpacking and custom list sorting 7 | ''' 8 | 9 | import os 10 | import csv 11 | 12 | import requests 13 | from bs4 import BeautifulSoup 14 | 15 | 16 | def download_page(url, html_file_out): 17 | 18 | if not os.path.exists(html_file_out): 19 | 20 | headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36' # noqa 22 | } 23 | 24 | r = requests.get( 25 | url, 26 | headers=headers 27 | ) 28 | 29 | r.raise_for_status() 30 | 31 | with open(html_file_out, 'w') as outfile: 32 | outfile.write(r.text) 33 | 34 | print(f'Downloaded {html_file_out}') 35 | 36 | return html_file_out 37 | 38 | 39 | def parse_data(html_file_in, csv_file_out): 40 | with open(html_file_in, 'r') as infile: 41 | html = infile.read() 42 | 43 | soup = BeautifulSoup( 44 | html, 45 | 'html.parser' 46 | ) 47 | 48 | target_div = soup.find( 49 | 'div', 50 | {'id': 'past-ire-board-members'} 51 | ) 52 | 53 | # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions 54 | members = [x.text.strip() for x in target_div.find_all('p')] 55 | 56 | csv_headers = [ 57 | 'name_last', 58 | 'name_rest', 59 | 'term_start', 60 | 'term_end', 61 | 'was_president', 62 | 'is_deceased' 63 | ] 64 | 65 | # start an empty list to hold records to write 66 | parsed_member_data = [] 67 | 68 | # loop over member text 69 | for member in members: 70 | 71 | was_president = False 72 | is_deceased = False 73 | 74 | if member.startswith('*'): 75 | was_president = True 76 | 77 | if '(dec)' in member: 78 | is_deceased = True 79 | 80 | # https://exercism.org/tracks/python/concepts/unpacking-and-multiple-assignment 81 | # https://docs.python.org/3/tutorial/controlflow.html?highlight=unpack#unpacking-argument-lists 82 | # here, the value attached to the `rest` var is ignored 83 | name, terms, *rest = member.split('(') 84 | 85 | name_clean = name.lstrip('*').strip() 86 | terms_clean = terms.split(')')[0] 87 | 88 | # split the name into last, rest 89 | name_split = name_clean.rsplit(' ', 1) 90 | 91 | # handle generational suffixes 92 | if name_split[-1] == 'Jr.': 93 | name_split = name_split[0].rsplit(' ', 1) 94 | name_split[0] += ' Jr.' 
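        # worked example with a made-up name: for 'John Smith Jr.', the first
        # rsplit above gives ['John Smith', 'Jr.'], the second gives
        # ['John', 'Smith'], and the `+= ' Jr.'` step turns that into
        # ['John Jr.', 'Smith'] -- so just below, rest='John Jr.', last='Smith'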
95 | 96 | rest, last = name_split 97 | 98 | # loop over the terms of service 99 | for term in terms_clean.split(','): 100 | term_start, term_end = term.strip().split('-') 101 | 102 | # create a dict by zipping together the headers with the list of data 103 | data = dict(zip(csv_headers, [ 104 | last, 105 | rest, 106 | term_start, 107 | term_end, 108 | was_president, 109 | is_deceased 110 | ])) 111 | 112 | # add the dict to the main list 113 | parsed_member_data.append(data) 114 | 115 | # sort member data by last name, then first name, then term start 116 | data_sorted = sorted( 117 | parsed_member_data, 118 | key=lambda x: ( 119 | x['name_last'], 120 | x['name_rest'], 121 | x['term_start'] 122 | ) 123 | ) 124 | 125 | # write to file, specifying the encoding and 126 | # dealing with a Windows-specific problem that 127 | # sometimes pops up when writing to file 128 | with open(csv_file_out, 'w', encoding='utf-8', newline='') as outfile: 129 | writer = csv.DictWriter( 130 | outfile, 131 | fieldnames=csv_headers 132 | ) 133 | writer.writeheader() 134 | writer.writerows(data_sorted) 135 | 136 | print(f'Wrote {csv_file_out}') 137 | 138 | 139 | # https://realpython.com/if-name-main-python/ 140 | if __name__ == '__main__': 141 | 142 | url = 'https://www.ire.org/about-ire/past-ire-board-members/' 143 | 144 | # https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals 145 | files_name = 'ire-board' 146 | filename_page = f'{files_name}.html' 147 | filename_csv = f'{files_name}-terms.csv' 148 | 149 | # call the functions 150 | download_page(url, filename_page) 151 | parse_data(filename_page, filename_csv) 152 | -------------------------------------------------------------------------------- /md-warn-notices/Maryland WARN Notices - multiple pages.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "8fa7c11c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Maryland WARN Notices - multiple pages\n", 9 | "\n", 10 | "Extra credit: Figure out how to target and extract WARN data for multiple years. 
The process:\n", 11 | "- Using `requests`, fetch the main page\n", 12 | "- Using `bs4`, target the list of links to pages with data for previous years\n", 13 | "- Using a `for` loop, iterate over each link\n", 14 | " - Fetch the page\n", 15 | " - Turn the contents into `soup`\n", 16 | " - Target the elements to extract\n", 17 | " - Add the parsed data to your list" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "id": "7b6c6fb2", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Python 3 (ipykernel)", 32 | "language": "python", 33 | "name": "python3" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.10.9" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 5 50 | } 51 | -------------------------------------------------------------------------------- /md-warn-notices/Maryland WARN Notices.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Maryland WARN Notices\n", 8 | "\n", 9 | "The goal: Scrape the main table on [the first page of Maryland's list of WARN letters](https://www.dllr.state.md.us/employment/warn.shtml) and, if time, write the data to a CSV.\n", 10 | "\n", 11 | "### Table of contents\n", 12 | "\n", 13 | "- [Using Jupyter notebooks](#Using-Jupyter-notebooks)\n", 14 | "- [What _is_ a web page, anyway?](#What-is-a-web-page,-anyway?)\n", 15 | "- [Inspect the source](#Inspect-the-source)\n", 16 | "- [Import libraries](#Import-libraries)\n", 17 | "- [Request the page](#Request-the-page)\n", 18 | "- [Turn your HTML into soup](#Turn-your-HTML-into-soup)\n", 19 | "- [Targeting and extracting data](#Targeting-and-extracting-data)\n", 20 | "- [Write the results to file](#Write-the-results-to-file)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Using Jupyter notebooks\n", 28 | "\n", 29 | "There are several ways to write and run Python code on your computer. One way -- the method we're using today -- is to use [Jupyter notebooks](https://jupyter.org/), which run in your browser and allow you to intersperse documentation with your code. They're handy for bundling your code with a human-readable explanation of what's happening at each step. Check out some examples from the [L.A. Times](https://github.com/datadesk/notebooks) and [BuzzFeed News](https://github.com/BuzzFeedNews/everything#data-and-analyses).\n", 30 | "\n", 31 | "**To add a new cell to your notebook**: Click the + button in the menu or press the `b` button on your keyboard.\n", 32 | "\n", 33 | "**To run a cell of code**: Select the cell and click the \"Run\" button in the menu, or you can press Shift+Enter.\n", 34 | "\n", 35 | "**One common gotcha**: The notebook doesn't \"know\" about code you've written until you've _run_ the cell containing it. For example, if you define a variable called `my_name` in one cell, and later, when you try to access that variable in another cell but get an error that says `NameError: name 'my_name' is not defined`, the most likely solution is to run (or re-run) the cell in which you defined `my_name`." 
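Circling back to the multiple-pages extra credit described just above: a rough sketch of that fetch-and-loop pattern might look like the following. The way the year links are filtered here is an assumption about the page's markup, not something verified against the live site.

```python
import requests
from bs4 import BeautifulSoup

BASE = 'https://www.dllr.state.md.us/employment/'

soup = BeautifulSoup(requests.get(BASE + 'warn.shtml').text, 'html.parser')

# hypothetical filter: keep links whose href looks like a prior-year WARN page
year_links = [a['href'] for a in soup.find_all('a') if 'warn' in a.get('href', '')]

all_rows = []

for href in year_links:
    year_soup = BeautifulSoup(requests.get(BASE + href).text, 'html.parser')
    table = year_soup.find('table')

    # some linked pages might not have a data table
    if not table:
        continue

    for row in table.find_all('tr')[1:]:
        all_rows.append([td.text.strip() for td in row.find_all('td')])
```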
36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "### What _is_ a web page, anyway?\n", 43 | "\n", 44 | "Generally, a web page consists of a bunch of specifically formatted text files stored on a computer (a _server_) that's probably sitting on a rack in a giant data center somewhere.\n", 45 | "\n", 46 | "Mostly you'll be dealing with `.html` (HyperText Markup Language) files that might include references to `.css` (Cascading Style Sheet) files, which determine how the page looks, and/or `.js` (JavaScript) files, which add interactivity, and other specially formatted text files.\n", 47 | "\n", 48 | "Today, we'll focus on the HTML, which gives structure to the page.\n", 49 | "\n", 50 | "Most HTML elements are represented by a pair of tags -- an opening tag and a closing tag.\n", 51 | "\n", 52 | "A table, for example, starts with `<table>` and ends with `
</table>`. The first tag tells the browser: \"Hey! I got a table here! Render it as a table.\" The closing tag (note the forward slash!) tells the browser: \"Hey! I'm all done with that table, thanks.\" Inside the table are nested more HTML tags representing rows (`<tr>`) and cells (`<td>`).\n", 53 | "\n", 54 | "HTML elements can have any number of attributes, such as classes --\n", 55 | "\n", 56 | "`<table class=\"cool-table\">`\n", 57 | "\n", 58 | "-- styles --\n", 59 | "\n", 60 | "`<table style=\"width: 95%;\">
`\n", 61 | "\n", 62 | "-- hyperlinks to other pages --\n", 63 | "\n", 64 | "`<a href=\"https://www.ire.org\">Click here to visit IRE's website</a>`\n", 65 | "\n", 66 | "-- and IDs --\n", 67 | "\n", 68 | "`<table id=\"cool-table\">
`\n", 69 | "\n", 70 | "-- that will be useful to know about when we're scraping." 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "### Inspect the source\n", 78 | "\n", 79 | "You can look at the HTML that makes up a web page by _inspecting the source_ in a web browser. We like Chrome and Firefox for this; today, we'll use Chrome.\n", 80 | "\n", 81 | "You can inspect specific elements on the page by right-clicking on the page and selecting \"Inspect\" or \"Inspect Element\" from the context menu that pops up. Hover over elements in the \"Elements\" tab to highlight them on the page.\n", 82 | "\n", 83 | "To examine all of the source code that makes up a page, you can \"view source.\" In Chrome, hit `Ctrl+U` on a PC or `⌘+Opt+U` on a Mac. (It's also in the menu bar: View > Developer > View Page Source.)\n", 84 | "\n", 85 | "You'll get a page showing you all of the HTML code that makes up that page. Ignore 99% of it and try to locate the element(s) that you want to target (use `Ctrl+F` on a PC and `⌘+F` to find).\n", 86 | "\n", 87 | "Open up a Chrome browser and inspect the table on the [the first page of Maryland's list of WARN letters](https://www.dllr.state.md.us/employment/warn.shtml). Find the table we want to scrape.\n", 88 | "\n", 89 | "Is it the only table on the page? If not, does it have any attributes that would allow you to target it?" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "### Import libraries\n", 97 | "\n", 98 | "Step one is to _import_ two third-party Python libraries that will help us scrape this page:\n", 99 | "- `requests` is the de facto standard for making HTTP requests, similar to what happens when you type a URL into a browser window and hit enter.\n", 100 | "- `bs4`, or BeautifulSoup, is a popular library for parsing HTML into a data structure that Python can work with.\n", 101 | "\n", 102 | "These libraries are installed separately from Python on a per-project basis ([read more about our recommendations for setting up Python projects here](https://docs.google.com/document/d/1cYmpfZEZ8r-09Q6Go917cKVcQk_d0P61gm0q8DAdIdg/edit#heading=h.od2v1nkge5t1)).\n", 103 | "\n", 104 | "Run this cell (you'll only have to do this once):" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "import requests\n", 114 | "import bs4" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Request the page\n", 122 | "\n", 123 | "Next, we'll use the `get()` method of the `requests` library (which we just imported) to grab the web page.\n", 124 | "\n", 125 | "While we're at it, we'll _assign_ all the stuff that comes back to a new variable using `=`.\n", 126 | "\n", 127 | "The variable name is arbitrary, but it's usually good to pick something that describes the value it's pointing to.\n", 128 | "\n", 129 | "Notice that the URL we're grabbing is wrapped in quotes, making it a _string_ that Python will interepret as text (as opposed to numbers, booleans, etc.). 
You can read up more on Python data types and variable assignment [here](Python%20syntax%20cheat%20sheet.ipynb).\n", 130 | "\n", 131 | "Run these two cells:" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "URL = 'http://www.dllr.state.md.us/employment/warn.shtml'" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "warn_page = requests.get(URL)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Nothing appears to have happened, which is (usually) a good sign.\n", 157 | "\n", 158 | "If you want to make sure that your request was successful, you can check the `status_code` attribute of the Python object that was returned:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "warn_page.status_code" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": {}, 173 | "source": [ 174 | "A `200` code means all is well. `404` means the page wasn't found, etc. ([Here's one of our favorite lists of HTTP status codes](https://http.cat/) ([or here, if you prefer dogs](https://httpstatusdogs.com/)).)\n", 175 | "\n", 176 | "The object being stored as the `warn_page` variable came back with a lot of potentially useful information we could access. Today, we're mostly interested in the `.text` attribute -- the HTML that makes up the web page, same as if we'd viewed the page source. Let's take a look:" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "warn_page.text" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "### ✍️ Try it yourself\n", 193 | "\n", 194 | "Use the code blocks below to experiment with requesting web pages and checking out the HTML that gets returned.\n", 195 | "\n", 196 | "Some ideas to get you started:\n", 197 | "- `'http://ire.org'`\n", 198 | "- `'https://web.archive.org/web/20031202214318/http://www.tdcj.state.tx.us:80/stat/finalmeals.htm'`\n", 199 | "- `'https://en.wikipedia.org/w/index.php?title=List_of_animal_names'`" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "### Turn your HTML into soup\n", 242 | "\n", 243 | "The HTML in the `.text` attribute of the request object is just a string -- a big ol' chunk of text.\n", 244 | "\n", 245 | "Before we start targeting and extracting pieces of data in the HTML, we need to turn that chunk of text into a data structure that Python can work with. 
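One more note on the response object before we start parsing: above, we checked `status_code` by eye. The standalone scripts elsewhere in this repo (for example, `ire-board/ire_board_scrape.py`) automate that check with `raise_for_status()`, which raises an exception on a 4xx/5xx response and otherwise does nothing. A minimal sketch:

```python
import requests

warn_page = requests.get('https://www.dllr.state.md.us/employment/warn.shtml')

# raises requests.exceptions.HTTPError if the server returned an error code;
# if the request succeeded, this is a no-op and the script keeps going
warn_page.raise_for_status()
```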
That's where the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (`bs4`) library comes in.\n", 246 | "\n", 247 | "We'll create a new instance of a `BeautifulSoup` object, which lives under the top-level `bs4` library that we imported earlier. We need to give it two things:\n", 248 | "- The HTML we'd like to parse -- `warn_page.text`\n", 249 | "- A string with the name of the type of parser to use -- `html.parser` is the default and usually fine, but [there are other options](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser)\n", 250 | "\n", 251 | "We'll save the parsed HTML as a new variable, `soup`." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "soup = bs4.BeautifulSoup(warn_page.text, 'html.parser')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "Nothing happened, which is good! You can take a look at what `soup` is, but it looks pretty much like `warn_page.text`:" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "soup" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "If you want to be sure, you can use the Python function `type()` to check what sort of object you're dealing with:" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# the `str` type means a string, or text\n", 293 | "type(warn_page.text)" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# the `bs4.BeautifulSoup` type means we successfully created the object\n", 303 | "type(soup)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "### ✍️ Try it yourself\n", 311 | "\n", 312 | "Use the code blocks below to experiment fetching HTML and turning it into soup (if you fetched some pages earlier and saved them as variables, that'd be a good start)." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "### Targeting and extracting data\n", 355 | "\n", 356 | "Now that we have BeautifulSoup object loaded up, we can go hunting for the specific HTML elements that contain the data we need. Our general strategy:\n", 357 | "1. Find the main table with the data we want to grab\n", 358 | "2. Get a list of rows (the `tr` element, which stands for \"table row\") in that table\n", 359 | "3. 
Use a Python `for loop` to go through each table row and find the data inside it (`td`, or \"table data\")\n", 360 | "\n", 361 | "To accomplish this, we'll use two `bs4` methods:\n", 362 | "- [`find()`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find), which returns the first element that matches whatever criteria you hand it\n", 363 | "- [`find_all()`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all), which returns a _list_ of elements that match the criteria. ([Here's how Python lists work](Python%20syntax%20cheat%20sheet.ipynb#Lists).)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": {}, 369 | "source": [ 370 | "#### Find the table\n", 371 | "\n", 372 | "To start with, we need to find the table. There are several ways to accomplish this, but because this is the only table on the page (view source and `Ctrl+F` to search for `\n", 687 | " # for row in rows[1:]:\n", 688 | " # cells = row.find_all('td')\n", 689 | " # etc. ...\n", 690 | " # but at the end, instead of `print(warn_date, naics_code, ...etc.)`\n", 691 | " # make it something like\n", 692 | " # data_out = [warn_date, naics_code, ...etc.]\n", 693 | " # `writer.writerow(data_out)`" 694 | ] 695 | }, 696 | { 697 | "cell_type": "markdown", 698 | "metadata": {}, 699 | "source": [ 700 | "If you look in the folder, you should see a new file: `warn-data.csv`. Hooray!\n", 701 | "\n", 702 | "🎉 🎉 🎉" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "### Extra credit problems\n", 710 | "\n", 711 | "1. **Remove internal whitespace:** Looking over the data, you probably noticed that some of the values have some unnecessary internal whitespace, which you could fix before you wrote each row to file. Python does not have a built-in string method to remove internal whitespace, unfortunately, but [Googling around](https://www.google.com/search?q=python+remove+internal+whitespace) will yield you a common strategy: Using the `split()` method to separate individual words in the string, then `join()`ing the resulting list on a single space. As an example:\n", 712 | "\n", 713 | "```python\n", 714 | "my_text = 'hello world how are you?'\n", 715 | "\n", 716 | "# split() will turn this into a list of words\n", 717 | "my_text_words = my_text.split()\n", 718 | "# ['hello', 'world', 'how', 'are', 'you?']\n", 719 | "\n", 720 | "# join on a single space\n", 721 | "my_text_clean = ' '.join(my_text_words)\n", 722 | "print(my_text_clean)\n", 723 | "# prints 'hello world how are you?'\n", 724 | "\n", 725 | "# or, as a one-liner\n", 726 | "my_text_clean = ' '.join(my_text.split())\n", 727 | "```\n", 728 | "\n", 729 | "2. **Fetch multiple years:** The table we scraped has WARN notices for the current year, but the agency also maintains pages with WARN notices for previous years -- there's a list of them in a section [toward the bottom of the page](https://www.dllr.state.md.us/employment/warn.shtml). See if you can figure out how to loop over multiple pages and scrape the contents of each into a single CSV -- [here's a notebook to work in](Maryland%20WARN%20Notices%20-%20multiple%20pages.ipynb).\n", 730 | "\n", 731 | "\n", 732 | "3. **Build a lookup table:** Each numeric code in the \"WIA Code\" column correspondes to a local area. See if you can figure out how to create a lookup dictionary that maps the numbers to their locations, then as you're looping over the data table, replace the numeric value in that column with the name of the local area instead. 
Here's a hint:\n", 733 | "\n", 734 | "```python\n", 735 | " lookup_dict = {\n", 736 | " '1': 'hello',\n", 737 | " '2': 'world'\n", 738 | " }\n", 739 | "\n", 740 | " print(lookup_dict.get('1'))\n", 741 | " # prints 'hello'\n", 742 | "\n", 743 | " print(lookup_dict.get('3'))\n", 744 | " # prints None\n", 745 | "\n", 746 | "```" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [] 755 | } 756 | ], 757 | "metadata": { 758 | "kernelspec": { 759 | "display_name": "Python 3 (ipykernel)", 760 | "language": "python", 761 | "name": "python3" 762 | }, 763 | "language_info": { 764 | "codemirror_mode": { 765 | "name": "ipython", 766 | "version": 3 767 | }, 768 | "file_extension": ".py", 769 | "mimetype": "text/x-python", 770 | "name": "python", 771 | "nbconvert_exporter": "python", 772 | "pygments_lexer": "ipython3", 773 | "version": "3.10.9" 774 | } 775 | }, 776 | "nbformat": 4, 777 | "nbformat_minor": 2 778 | } 779 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | anyio==3.6.2 2 | appnope==0.1.3 3 | argon2-cffi==21.3.0 4 | argon2-cffi-bindings==21.2.0 5 | arrow==1.2.3 6 | asttokens==2.2.1 7 | attrs==22.2.0 8 | backcall==0.2.0 9 | beautifulsoup4==4.11.1 10 | bleach==6.0.0 11 | bs4==0.0.1 12 | certifi==2022.9.24 13 | cffi==1.15.1 14 | charset-normalizer==2.1.1 15 | comm==0.1.2 16 | debugpy==1.6.6 17 | decorator==5.1.1 18 | defusedxml==0.7.1 19 | executing==1.2.0 20 | fastjsonschema==2.16.2 21 | fqdn==1.5.1 22 | greenlet==2.0.1 23 | idna==3.4 24 | ipykernel==6.21.0 25 | ipython==8.9.0 26 | ipython-genutils==0.2.0 27 | ipywidgets==8.0.4 28 | isoduration==20.11.0 29 | jedi==0.18.2 30 | Jinja2==3.1.2 31 | jsonpointer==2.3 32 | jsonschema==4.17.3 33 | jupyter==1.0.0 34 | jupyter-console==6.4.4 35 | jupyter-events==0.6.3 36 | jupyter_client==8.0.2 37 | jupyter_core==5.2.0 38 | jupyter_server==2.1.0 39 | jupyter_server_terminals==0.4.4 40 | jupyterlab-pygments==0.2.2 41 | jupyterlab-widgets==3.0.5 42 | MarkupSafe==2.1.2 43 | matplotlib-inline==0.1.6 44 | mistune==2.0.4 45 | nbclassic==0.5.1 46 | nbclient==0.7.2 47 | nbconvert==7.2.9 48 | nbformat==5.7.3 49 | nest-asyncio==1.5.6 50 | notebook==6.5.2 51 | notebook_shim==0.2.2 52 | numpy==1.24.1 53 | packaging==23.0 54 | pandas==1.5.3 55 | pandocfilters==1.5.0 56 | parso==0.8.3 57 | pexpect==4.8.0 58 | pickleshare==0.7.5 59 | platformdirs==2.6.2 60 | playwright==1.30.0 61 | prometheus-client==0.16.0 62 | prompt-toolkit==3.0.36 63 | psutil==5.9.4 64 | ptyprocess==0.7.0 65 | pure-eval==0.2.2 66 | pycparser==2.21 67 | pyee==9.0.4 68 | Pygments==2.14.0 69 | pyrsistent==0.19.3 70 | python-dateutil==2.8.2 71 | python-json-logger==2.0.4 72 | pytz==2022.7.1 73 | PyYAML==6.0 74 | pyzmq==25.0.0 75 | qtconsole==5.4.0 76 | QtPy==2.3.0 77 | requests==2.28.1 78 | rfc3339-validator==0.1.4 79 | rfc3986-validator==0.1.1 80 | Send2Trash==1.8.0 81 | six==1.16.0 82 | sniffio==1.3.0 83 | soupsieve==2.3.2.post1 84 | stack-data==0.6.2 85 | terminado==0.17.1 86 | tinycss2==1.2.1 87 | tornado==6.2 88 | traitlets==5.9.0 89 | typing_extensions==4.4.0 90 | uri-template==1.2.0 91 | urllib3==1.26.13 92 | wcwidth==0.2.6 93 | webcolors==1.12 94 | webencodings==0.5.1 95 | websocket-client==1.5.0 96 | widgetsnbextension==4.0.5 97 | -------------------------------------------------------------------------------- /sd-lobbyists/data/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/sd-lobbyists/data/.gitkeep -------------------------------------------------------------------------------- /sd-lobbyists/download_lobbyist_data.py: -------------------------------------------------------------------------------- 1 | ''' 2 | In this script, we'll visit the South Dakota lobbyist registration lookup tool -- https://sosenterprise.sd.gov/BusinessServices/Lobbyist/LobbyistSearch.aspx -- an asp.net site that tracks user state and doesn't have consistent download paths for the lobbyist data files, because the files are generated on the fly based on search inputs. Therefore, we'll use playwright, a browser automation testing tool that's also handy for scraping websites. 3 | 4 | The goal is to download each year's worth of lobbyist data for each type of lobbyist (private and public) and then assemble the results into a single data file. 5 | 6 | The private lobbyist search has an "Export data" button, which kicks out a .zip file containing one pipe-delimited text file, but the public lobbyist search does not, so public lobbyist data will need to be scraped from the results table that appears below the search box. 7 | ''' 8 | 9 | # we'll use this stdlib csv library to write out 10 | # CSV files of the public lobbyist tables 11 | import csv 12 | 13 | # used for checking to see if files already exist 14 | import os 15 | 16 | # used for pausing between requests 17 | import time 18 | 19 | # we'll use this to calculate the current year 20 | from datetime import date 21 | 22 | # we'll use pandas to handle the zip files, since 23 | # it can handle compressed files out the gate 24 | import pandas as pd 25 | 26 | # for getting wildcard references to local files 27 | import glob 28 | 29 | # playwright will run the browser 30 | from playwright.sync_api import sync_playwright 31 | 32 | 33 | # the URL where we want to start 34 | URL = 'https://sosenterprise.sd.gov/BusinessServices/Lobbyist/LobbyistSearch.aspx' 35 | 36 | # where the data files will land 37 | DIR_DATA = 'data' 38 | 39 | # the CSV file to write into 40 | CSV_FILEPATH = 'sd-lobbyists.csv' 41 | 42 | # grab the current year for comparison below 43 | THIS_YEAR = date.today().year 44 | 45 | # storing references to CSS selectors we'll use 46 | # more than once below 47 | LOCATOR_SELECT_YEAR = 'select#ctl00_MainContent_slctYears' 48 | LOCATOR_SELECT_TABLE_LEN = 'div#DataTables_Table_0_length select' 49 | LOCATOR_BUTTON_SEARCH = 'a#ctl00_MainContent_SearchButton' 50 | LOCATOR_BUTTON_EXPORT = 'a#ctl00_MainContent_ExportButton' 51 | LOCATOR_TABLE = 'table#DataTables_Table_0 tbody' 52 | LOCATOR_RADIO_PUBLIC = 'input#ctl00_MainContent_chkSearchByPublic' 53 | 54 | 55 | def download_data_private(page): 56 | 57 | # get a reference to the select menu that 58 | # allows you to switch years 59 | # https://playwright.dev/python/docs/api/class-page#page-locator 60 | select_year = page.locator(LOCATOR_SELECT_YEAR) 61 | 62 | # get a list of the options attached to this 63 | # select menu 64 | options = select_year.locator('option').all() 65 | 66 | # using a list comprehension with a conditional 67 | # `if` statement, get a list of values for these options, 68 | # but skip the option with "All" in the text 69 | # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions 70 | years = [x.get_attribute('value') for x in options if 'All' not in 
x.inner_text()] 71 | 72 | # loop over that list of values (years) 73 | # that we just isolated 74 | for year in years: 75 | 76 | # for each year, build a path to where 77 | # we want to download the file 78 | # using an f-string 79 | # https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals 80 | filename = f'{year}-private.zip' 81 | filepath = os.path.join(DIR_DATA, filename) 82 | 83 | # if file already exists, skip this one -- unless it's the current year 84 | if os.path.exists(filepath) and year != str(THIS_YEAR): 85 | continue 86 | 87 | # select the year from the select menu 88 | # https://playwright.dev/python/docs/api/class-locator#locator-select-option 89 | select_year.select_option(value=year) 90 | 91 | # set up the download 92 | # https://playwright.dev/python/docs/api/class-download 93 | with page.expect_download() as download_info: 94 | 95 | # click on the export button 96 | page.locator(LOCATOR_BUTTON_EXPORT).click() 97 | 98 | # wait for download to complete 99 | download = download_info.value 100 | 101 | # print a message letting us know what's happening 102 | print(f'Downloading {filepath}') 103 | 104 | # save the downloaded file to the path created above 105 | download.save_as(filepath) 106 | 107 | # make sure the page is done firing before 108 | # going to the next loop iteration 109 | page.wait_for_load_state('networkidle') 110 | 111 | # and throw in an explicit wait 112 | time.sleep(1) 113 | 114 | 115 | def download_data_public(page): 116 | # a list of CSV headers for the public lobbyist files 117 | # making sure to match the same header names 118 | # from the private lobbyist files 119 | HEADERS_PUBLIC_CSV = [ 120 | 'YEAR', 121 | 'LOBBYIST_LAST_NAME', 122 | 'LOBBYIST_FIRST_NAME', 123 | 'EMPLOYER' 124 | ] 125 | 126 | select = page.locator(LOCATOR_SELECT_YEAR) 127 | years = [x.inner_html() for x in select.locator('option').all() if 'All' not in x.inner_html()] # noqa 128 | 129 | for year in years: 130 | 131 | filename = f'{year}-public.csv' 132 | filepath = os.path.join(DIR_DATA, filename) 133 | 134 | if os.path.exists(filepath) and year != str(THIS_YEAR): 135 | continue 136 | 137 | select = page.locator(LOCATOR_SELECT_YEAR) 138 | select.select_option(year) 139 | page.locator(LOCATOR_BUTTON_SEARCH).click() 140 | page.wait_for_load_state('networkidle') 141 | 142 | select_len = page.locator(LOCATOR_SELECT_TABLE_LEN) 143 | select_len.select_option("1000") 144 | 145 | table = page.locator(LOCATOR_TABLE) 146 | rows = table.locator('tr').all() 147 | 148 | with open(filepath, 'w') as outfile: 149 | writer = csv.DictWriter(outfile, fieldnames=HEADERS_PUBLIC_CSV) 150 | writer.writeheader() 151 | 152 | for row in rows: 153 | cells = row.locator('td').all() 154 | year, name, dept = [x.inner_text() for x in cells] 155 | last, rest = [x.strip() for x in name.rsplit(',', 1)] 156 | data = [ 157 | year, 158 | last, 159 | rest, 160 | dept 161 | ] 162 | 163 | writer.writerow(dict(zip(HEADERS_PUBLIC_CSV, data))) 164 | 165 | print(f'Wrote {filepath}') 166 | 167 | time.sleep(1) 168 | 169 | 170 | def build_data_file(): 171 | 172 | # get a list of downloaded files 173 | files_private = glob.glob(f'{DIR_DATA}/*.zip') 174 | files_public = glob.glob(f'{DIR_DATA}/*.csv') 175 | 176 | # start a list to hold individual data frames 177 | data_frames = [] 178 | 179 | for file in files_private: 180 | df = pd.read_csv( 181 | file, 182 | compression='zip', 183 | delimiter='|' 184 | ) 185 | 186 | data_frames.append(df) 187 | 188 | for file in files_public: 189 | df = pd.read_csv(file) 
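        # note: these public CSVs were written with column names that match
        # the private exports (YEAR, LOBBYIST_LAST_NAME, LOBBYIST_FIRST_NAME,
        # EMPLOYER), so the pd.concat() below can line the frames up by column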
190 | data_frames.append(df) 191 | 192 | df = pd.concat(data_frames) 193 | 194 | df.sort_values(['YEAR', 'LOBBYIST_LAST_NAME', 'LOBBYIST_FIRST_NAME'], ascending=[False, True, True]).to_csv(CSV_FILEPATH, index=False) 195 | 196 | return CSV_FILEPATH 197 | 198 | 199 | if __name__ == '__main__': 200 | 201 | if not os.path.exists(DIR_DATA): 202 | os.makedirs(DIR_DATA) 203 | 204 | with sync_playwright() as p: 205 | browser = p.chromium.launch(headless=False) 206 | page = browser.new_page() 207 | page.goto(URL, wait_until='networkidle') 208 | 209 | download_data_private(page) 210 | 211 | page.locator(LOCATOR_RADIO_PUBLIC).check() 212 | page.wait_for_load_state('networkidle') 213 | 214 | download_data_public(page) 215 | 216 | build_data_file() 217 | -------------------------------------------------------------------------------- /tx-railroad-commission/dl_pages_details.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | import time 4 | 5 | from dl_pages_results import DIR_PAGES_RESULTS, BASE_URL 6 | 7 | from bs4 import BeautifulSoup 8 | from playwright.sync_api import sync_playwright, expect 9 | 10 | 11 | # make a reference to the directory where 12 | # the downloaded detail pages will land 13 | DIR_PAGES_DETAIL = 'pages-detail' 14 | 15 | 16 | def get_detail_page_links(): 17 | ''' 18 | A function to extract the detail page links 19 | from the results HTML files that we downloaded separately 20 | ''' 21 | 22 | # an empty list to hold the extracted links 23 | detail_page_links = [] 24 | 25 | # use the glob module to nab a list of all 26 | # the HTML files we want to parse 27 | # https://docs.python.org/3/library/glob.html 28 | filepaths_results = sorted(glob.glob(f'{DIR_PAGES_RESULTS}/*.html')) 29 | 30 | # loop over each results file 31 | for results_file in filepaths_results: 32 | 33 | # open it and read the HTML 34 | with open(results_file, 'r') as infile: 35 | html = infile.read() 36 | 37 | # turn it into soup 38 | soup = BeautifulSoup(html, 'html.parser') 39 | 40 | # find the table rows 41 | rows = soup.tbody.find_all('tr') 42 | 43 | # use a list comprehension to grab the links from each row 44 | # and prepend the base URL to ensure a fully 45 | # qualified URL to save a step later 46 | # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions 47 | links = [f"{BASE_URL}/s{x.find('a')['href']}" for x in rows] 48 | 49 | # add these links to the main list 50 | detail_page_links.extend(links) 51 | 52 | # return the list of links we just populated 53 | return detail_page_links 54 | 55 | 56 | def dl_pages_detail(): 57 | 58 | # call the function to get the links and store the results 59 | links = get_detail_page_links() 60 | 61 | # set up the playwright object 62 | # https://playwright.dev/python/docs/library#usage 63 | with sync_playwright() as p: 64 | 65 | # create a new Chromium browser, which 66 | # operates in headless mode by default 67 | browser = p.chromium.launch() 68 | 69 | # open a new page 70 | page = browser.new_page() 71 | 72 | # loop over the links we just grabbed 73 | for link in links: 74 | 75 | # use the unique ID in the URL as the filename 76 | filename = f"{link.split('/')[-1]}.html" 77 | 78 | # build the file path 79 | filepath = os.path.join( 80 | DIR_PAGES_DETAIL, 81 | filename 82 | ) 83 | 84 | # check to see if we already downloaded the page 85 | if not os.path.exists(filepath): 86 | 87 | # if not, navigate to the page 88 | # and wait for the assets to load 89 | page.goto( 90 | link, 91 | 
wait_until='networkidle' 92 | ) 93 | 94 | expect( 95 | page.get_by_title('Inspection Packages') 96 | ).to_be_visible() 97 | 98 | expect( 99 | page.get_by_text('Complaint Name') 100 | ).to_be_visible() 101 | 102 | time.sleep(2) 103 | 104 | 105 | if 'RRC SIGN IN' in page.locator('body').inner_text().upper(): 106 | print(f' PROBLEM downloading {link}') 107 | continue 108 | 109 | # target the content div and grab the HTML 110 | content = page.locator('html').inner_html() # noqa 111 | 112 | # if not, download it 113 | with open(filepath, 'w') as outfile: 114 | outfile.write(content) 115 | 116 | # and let us know what's up 117 | print(f'Downloaded {filepath}') 118 | 119 | # wait a tick before moving on to the next page 120 | time.sleep(0.5) 121 | 122 | # close the browser 123 | browser.close() 124 | 125 | 126 | if __name__ == '__main__': 127 | dl_pages_detail() 128 | -------------------------------------------------------------------------------- /tx-railroad-commission/dl_pages_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from playwright.sync_api import sync_playwright 5 | 6 | 7 | # create a variable pointing to the directory where 8 | # the cached pages should land 9 | DIR_PAGES_RESULTS = 'pages-results' 10 | 11 | # create a variable pointing to the base URL, which 12 | # we'll use in a couple places 13 | BASE_URL = 'https://rrctx.force.com' 14 | 15 | 16 | def download_pages_results(): 17 | 18 | # check to see if this directory exists 19 | if not os.path.exists(DIR_PAGES_RESULTS): 20 | 21 | # if not, create it 22 | os.makedirs(DIR_PAGES_RESULTS) 23 | 24 | # set up the playwright object 25 | # https://playwright.dev/python/docs/library#usage 26 | with sync_playwright() as p: 27 | 28 | # create a new Chromium browser, which 29 | # operates in headless mode by default 30 | browser = p.chromium.launch() 31 | 32 | # open a new page 33 | page = browser.new_page() 34 | 35 | # go to the initial complaints results page 36 | # and wait until all the assets are loaded 37 | # using an f-string to build the URL to navigate to 38 | # https://docs.python.org/3/tutorial/inputoutput.html#tut-f-strings 39 | page.goto( 40 | f'{BASE_URL}/s/complaints', 41 | wait_until='networkidle' 42 | ) 43 | 44 | # find the table and grab the HTML 45 | table = page.locator('table').inner_html() 46 | 47 | # get the page number we're on (1) 48 | # by finding the pagination element at the bottom of the page 49 | page_tracker = page.get_by_text(' | Page ').inner_text() 50 | 51 | # ... 
and then parsing out the page number with some splits 52 | page_num = page_tracker.split('Page')[1].split('of')[0].strip() 53 | 54 | # set up the filename -- using the .zfill() 55 | # string method to pad out the number 56 | # to three digits -- and the file path 57 | filename = f'{page_num.zfill(3)}.html' 58 | filepath = os.path.join( 59 | DIR_PAGES_RESULTS, 60 | filename 61 | ) 62 | 63 | # open the file and write 64 | # the table HTML captured above 65 | with open(filepath, 'w') as outfile: 66 | outfile.write(table) 67 | 68 | # let us know what's up 69 | print(f'Downloaded {filepath}') 70 | 71 | # next, create a process to iterate through 72 | # the other pages of the search results -- 73 | # a better move here would be to write a 74 | # recursive function, but a hacky while True / break 75 | # statement works too 76 | 77 | while True: 78 | # find the "Next" button 79 | next_button = page.locator('button', has_text='Next') 80 | 81 | # click it 82 | next_button.click() 83 | 84 | # wait for the next page to load 85 | page.wait_for_load_state('networkidle') 86 | 87 | # find the table and grab the HTML 88 | table = page.locator('table').inner_html() 89 | 90 | # get the page number we're on 91 | # by finding the pagination element at the bottom of the page 92 | page_tracker = page.get_by_text(' | Page ').inner_text() 93 | 94 | # and then parsing out the page number with some splits 95 | page_num = page_tracker.split('Page')[1].split('of')[0].strip() 96 | 97 | # set up the filename and path 98 | filename = f'{page_num.zfill(3)}.html' 99 | 100 | filepath = os.path.join( 101 | DIR_PAGES_RESULTS, 102 | filename 103 | ) 104 | 105 | # open the file and write into 106 | # it the table HTML captured above 107 | with open(filepath, 'w') as outfile: 108 | outfile.write(table) 109 | 110 | # let us know what's up 111 | print(f'Downloaded {filepath}') 112 | 113 | # see if this is the last page 114 | lpage = int(page_tracker.split('of')[-1]) 115 | 116 | # if the {x} in "Page {x} of {y}" number 117 | # is the same as {y}, we're done 118 | if int(page_num) == lpage: 119 | break 120 | 121 | # if not the last page, 122 | # wait half a second before moving on to the next page 123 | time.sleep(0.5) 124 | 125 | # shut down the browser 126 | browser.close() 127 | 128 | 129 | if __name__ == '__main__': 130 | download_pages_results() 131 | -------------------------------------------------------------------------------- /tx-railroad-commission/main.py: -------------------------------------------------------------------------------- 1 | from dl_pages_results import download_pages_results 2 | from dl_pages_details import dl_pages_detail 3 | from scrape_detail_pages import scrape_data 4 | 5 | if __name__ == '__main__': 6 | print('Downloading results pages ...') 7 | download_pages_results() 8 | print() 9 | 10 | print('Downloading detail pages ...') 11 | dl_pages_detail() 12 | print() 13 | 14 | print('Scraping data ...') 15 | file_details = scrape_data() 16 | print() 17 | 18 | print(f'Done! 
Wrote {file_details["record_count"]:,} records to {file_details["filepath"]}') 19 | -------------------------------------------------------------------------------- /tx-railroad-commission/pages-detail/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/tx-railroad-commission/pages-detail/.gitkeep -------------------------------------------------------------------------------- /tx-railroad-commission/pages-results/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/tx-railroad-commission/pages-results/.gitkeep -------------------------------------------------------------------------------- /tx-railroad-commission/scrape_detail_pages.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import csv 3 | import re 4 | import os 5 | from datetime import datetime 6 | 7 | from bs4 import BeautifulSoup 8 | 9 | from dl_pages_results import BASE_URL 10 | 11 | 12 | csv_filepath = 'tx-railroad-commission-data.csv' 13 | 14 | # set up headers for the CSV file 15 | csv_headers = [ 16 | 'complaint_id', 17 | 'complaint_url', 18 | 'capture_method', 19 | 'location', 20 | 'city', 21 | 'resolution_status', 22 | 'complaint_type', 23 | 'received_date', 24 | 'business_area', 25 | 'organization', 26 | 'unit_name', 27 | 'region', 28 | 'regulated_entity', 29 | 'p5_no', 30 | 'jurisdictional', 31 | 'regulated', 32 | 'complaint_description_type', 33 | 'complaint_description', 34 | 'resolution_description', 35 | 'complaint_comments', 36 | 'update_notes', 37 | 'close_date', 38 | 'explanation_type', 39 | 'explanation', 40 | 'referral_type', 41 | 'referred_to', 42 | 'inspection_packages_link', 43 | 'inspection_documents_link' 44 | ] 45 | 46 | # get a list of HTML files in the detail pages dir 47 | files = glob.glob('pages-detail/*.html') 48 | 49 | 50 | # set up a reusable function to scrape data from 51 | # a single HTML file 52 | def scrape_page(html_path): 53 | 54 | # open the file and read in the HTML 55 | with open(html_path, 'r') as infile: 56 | html = infile.read() 57 | 58 | # turn the HTML into a bs4 object 59 | soup = BeautifulSoup(html, 'html.parser') 60 | 61 | # and start locating the elements using various bs4 methods 62 | # https://www.crummy.com/software/BeautifulSoup/bs4/doc/ 63 | 64 | complaint_no = soup.find('span', text='Complaint Name').parent.next_sibling.text.split('-')[-1] 65 | 66 | url = f"{BASE_URL}/s/ietrs-complaint/{html_path.split('/')[-1].split('.html')[0]}/complaint{complaint_no}" 67 | 68 | capture_method = soup.find('span', text='Complaint Capture Method').parent.next_sibling.text.strip() 69 | 70 | location = soup.find('span', text='Complaint Location').parent.next_sibling.text.strip() 71 | 72 | resolution_status = soup.find('span', text='Complaint Resolution Status').parent.next_sibling.text.strip() 73 | 74 | complaint_type = soup.find('span', text=re.compile('Optional; If looking for a Railroad related option, this is not the correct jurisdiction')).parent.parent.next_sibling.text.strip() 75 | 76 | received_date = soup.find('span', text='Complaint Received Date').parent.next_sibling.text.strip() 77 | 78 | # additional integrity check for dates -- parse text as date 79 | received_date = datetime.strptime( 80 | received_date, 81 | '%m/%d/%Y' 82 | 
).date().isoformat() 83 | 84 | business_area = soup.find('span', text='Business Area').parent.next_sibling.text.strip() 85 | 86 | city = soup.find('span', text='City').parent.next_sibling.text.strip() 87 | 88 | organization = soup.find('span', text='Organization').parent.next_sibling.text.strip() 89 | 90 | unit_name = soup.find('span', text='Unit Name').parent.next_sibling.text.strip() 91 | 92 | region = soup.find('span', text='Region').parent.next_sibling.text.strip() 93 | 94 | regulated_entity = soup.find('span', text='Regulated Entity').parent.next_sibling.text.strip() 95 | 96 | p5_no = soup.find('span', text='P5 #').parent.next_sibling.text.strip() 97 | 98 | jurisdictional = soup.find('span', text='Jurisdictional').parent.next_sibling.find('img').get('alt').strip() 99 | 100 | regulated = soup.find('span', text='Regulated').parent.next_sibling.text.strip() 101 | 102 | complaint_description_type = soup.find('span', text='Complaint Description Type').parent.next_sibling.text.strip() 103 | 104 | complaint_description = soup.find('span', text='Complaint Description').parent.next_sibling.text.strip() 105 | 106 | complaint_description = ' '.join(complaint_description.split()) 107 | 108 | resolution_description = soup.find('span', text='Complaint Resolution Description').parent.next_sibling.text.strip() 109 | 110 | resolution_description = ' '.join(resolution_description.split()) 111 | 112 | complaint_comments = soup.find('span', text='Complaint Comments').parent.next_sibling.text.strip() 113 | 114 | complaint_comments = ' '.join(resolution_description.split()) 115 | 116 | update_notes = soup.find('span', text='Update Notes').parent.next_sibling.text.strip() 117 | 118 | update_notes = ' '.join(update_notes.split()) 119 | 120 | close_date = soup.find('span', text='Complaint Close Date').parent.next_sibling.text.strip() 121 | 122 | # not every complaint has a close date 123 | if close_date: 124 | close_date = datetime.strptime( 125 | close_date, 126 | '%m/%d/%Y' 127 | ).date().isoformat() 128 | 129 | explanation_type = soup.find('span', text='Complaint Explanation Type').parent.next_sibling.text.strip() 130 | 131 | explanation = soup.find('span', text='Complaint Explanation').parent.next_sibling.text.strip() 132 | 133 | explanation = ' '.join(explanation.split()) 134 | 135 | referral_type = soup.find('span', text='Complaint Referred Type').parent.next_sibling.text.strip() 136 | 137 | referred_to = soup.find('span', text='Complaint Referred To:').parent.next_sibling.text.strip() 138 | 139 | # joining a split string on a single space is 140 | # a way to remove all unnecessary whitespace 141 | referred_to = ' '.join(referred_to.split()) 142 | 143 | inspection_packages = soup.find('span', {'title': 'Inspection Packages'}) 144 | 145 | inspection_packages_link = f"{BASE_URL}{inspection_packages.parent.get('href')}" 146 | 147 | inspection_documents = soup.find('span', {'title': 'Inspection Documents'}) 148 | 149 | inspection_documents_link = f"{BASE_URL}{inspection_documents.parent.get('href')}" 150 | 151 | # assemble the data in a list, maintaining 152 | # the same order as the CSV headers 153 | data = [ 154 | complaint_no, 155 | url, 156 | capture_method, 157 | location, 158 | city, 159 | resolution_status, 160 | complaint_type, 161 | received_date, 162 | business_area, 163 | organization, 164 | unit_name, 165 | region, 166 | regulated_entity, 167 | p5_no, 168 | jurisdictional, 169 | regulated, 170 | complaint_description_type, 171 | complaint_description, 172 | resolution_description, 173 | 
complaint_comments, 174 | update_notes, 175 | close_date, 176 | explanation_type, 177 | explanation, 178 | referral_type, 179 | referred_to, 180 | inspection_packages_link, 181 | inspection_documents_link 182 | ] 183 | 184 | # return a dictionary representation of the data 185 | return dict(zip(csv_headers, data)) 186 | 187 | 188 | def scrape_data(): 189 | # set up an empty list to hold the data to write to file 190 | all_data = [] 191 | 192 | # loop over the list of files 193 | for file in files: 194 | 195 | # call the function to scrape this file 196 | # and assign to a variable the dictionary that the function returns 197 | data = scrape_page(file) 198 | 199 | # append the dictionary to the list we set up to 200 | # collect data from each page 201 | all_data.append(data) 202 | 203 | # open a file in write mode, specify the encoding and 204 | # set newlines='' to deal with windows-specific line breaks 205 | with open(csv_filepath, 'w', encoding='utf-8', newline='') as outfile: 206 | 207 | # set up the writer object 208 | writer = csv.DictWriter( 209 | outfile, 210 | fieldnames=csv_headers 211 | ) 212 | 213 | # write the headers 214 | writer.writeheader() 215 | 216 | # write the data 217 | writer.writerows(all_data) 218 | 219 | return { 220 | 'record_count': len(all_data), 221 | 'filepath': csv_filepath 222 | } 223 | 224 | 225 | if __name__ == '__main__': 226 | scrape_data() 227 | -------------------------------------------------------------------------------- /us-senate-press-gallery/U.S. Senate Press Gallery - complete.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "05412996", 6 | "metadata": {}, 7 | "source": [ 8 | "# U.S. Senate press gallery\n", 9 | "\n", 10 | "The goal: [Scrape the list of journalists accredited to cover the U.S. Senate](https://www.dailypress.senate.gov/membership/membership-lists/) into a CSV. A little spelunking in the source code will reveal a table ready for extraction." 
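As a quick sanity check before the code cells below, you might confirm the page really has only one table to target -- a small sketch using the same libraries the notebook imports:

```python
import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.dailypress.senate.gov/membership/membership-lists/').text
soup = BeautifulSoup(html, 'html.parser')

# if this prints 1, soup.find('table') will grab the right element
print(len(soup.find_all('table')))
```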
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "id": "4c67ce4d", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# import libs\n", 21 | "import csv\n", 22 | "\n", 23 | "import requests\n", 24 | "from bs4 import BeautifulSoup" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": null, 30 | "id": "99adeb32", 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# define the list of headers for the CSV\n", 35 | "headers = [\n", 36 | " 'first',\n", 37 | " 'last',\n", 38 | " 'affiliation'\n", 39 | "]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "a9744eb7", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# make the request\n", 50 | "req = requests.get('https://www.dailypress.senate.gov/membership/membership-lists/')" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "id": "8e710003", 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# turn the HTML into soup\n", 61 | "soup = BeautifulSoup(req.text, 'html.parser')\n", 62 | "\n", 63 | "# find the table\n", 64 | "table = soup.find('table')\n", 65 | "\n", 66 | "# grab a list of table rows (minus the header)\n", 67 | "rows = table.find_all('tr')[1:]" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "762a11be", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "print(rows)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "e65af642", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# open a CSV file to write data into\n", 88 | "with open('us-senate-press-gallery.csv', 'w', newline='') as outfile:\n", 89 | "\n", 90 | " # create a writer object\n", 91 | " writer = csv.writer(outfile)\n", 92 | "\n", 93 | " # write the list of headers to file\n", 94 | " writer.writerow(headers)\n", 95 | "\n", 96 | " # loop over the rows\n", 97 | " for row in rows:\n", 98 | "\n", 99 | " # find the cells in this row\n", 100 | " cells = row.find_all('td')\n", 101 | " \n", 102 | " # extract each piece of data from the list\n", 103 | " \n", 104 | " # first name is the first ([0]) list item\n", 105 | " first_name = cells[0].text.strip()\n", 106 | " \n", 107 | " # last name is second ([1])\n", 108 | " last_name = cells[1].text.strip()\n", 109 | "\n", 110 | " # affiliation is third ([2])\n", 111 | " affiliation = cells[2].text.strip()\n", 112 | "\n", 113 | " # write row to file\n", 114 | " writer.writerow([first_name, last_name, affiliation])" 115 | ] 116 | } 117 | ], 118 | "metadata": { 119 | "kernelspec": { 120 | "display_name": "Python 3 (ipykernel)", 121 | "language": "python", 122 | "name": "python3" 123 | }, 124 | "language_info": { 125 | "codemirror_mode": { 126 | "name": "ipython", 127 | "version": 3 128 | }, 129 | "file_extension": ".py", 130 | "mimetype": "text/x-python", 131 | "name": "python", 132 | "nbconvert_exporter": "python", 133 | "pygments_lexer": "ipython3", 134 | "version": "3.10.9" 135 | } 136 | }, 137 | "nbformat": 4, 138 | "nbformat_minor": 5 139 | } 140 | -------------------------------------------------------------------------------- /us-senate-press-gallery/U.S. Senate Press Gallery - working.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "05412996", 6 | "metadata": {}, 7 | "source": [ 8 | "# U.S. 
Senate press gallery\n", 9 | "\n", 10 | "The goal: [Scrape the list of journalists accredited to cover the U.S. Senate](https://www.dailypress.senate.gov/membership/membership-lists/) into a CSV. Some spelunking in the source code will show a table ready for extraction.\n", 11 | "\n", 12 | "[Here's the completed version](U.S.%20Senate%20Press%20Gallery%20-%20complete.ipynb)." 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "4c67ce4d", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "# import libs\n", 23 | "import csv\n", 24 | "\n", 25 | "import requests\n", 26 | "from bs4 import BeautifulSoup" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "id": "99adeb32", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# define the list of headers for the CSV\n", 37 | "headers = [\n", 38 | " 'first',\n", 39 | " 'last',\n", 40 | " 'affiliation'\n", 41 | "]" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 3, 47 | "id": "a9744eb7", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "# make the request\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 6, 57 | "id": "8e710003", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "# turn the HTML into soup\n", 62 | "\n", 63 | "\n", 64 | "# find the table\n", 65 | "\n", 66 | "\n", 67 | "# grab a list of table rows (minus the header)\n" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 9, 73 | "id": "e65af642", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "# open a CSV file to write data into\n", 78 | "\n", 79 | "\n", 80 | " # create a writer object\n", 81 | "\n", 82 | "\n", 83 | " # write the list of headers to file\n", 84 | "\n", 85 | "\n", 86 | " # loop over the rows\n", 87 | "\n", 88 | "\n", 89 | " # find the cells in this row\n", 90 | "\n", 91 | " \n", 92 | " # extract each piece of data from the list\n", 93 | " \n", 94 | " # first name is the first ([0]) list item\n", 95 | "\n", 96 | " \n", 97 | " # last name is second ([1])\n", 98 | "\n", 99 | "\n", 100 | " # affiliation is third ([2])\n", 101 | "\n", 102 | "\n", 103 | " # write row to file\n" 104 | ] 105 | } 106 | ], 107 | "metadata": { 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.10.9" 124 | } 125 | }, 126 | "nbformat": 4, 127 | "nbformat_minor": 5 128 | } 129 | --------------------------------------------------------------------------------
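A closing aside on the Senate notebooks: the complete version writes each row with `csv.writer` and a positional list. If you'd rather address columns by name, `csv.DictWriter` -- the approach used in `ire-board/ire_board_scrape.py` -- works just as well. A sketch that reuses the `headers` and `rows` variables from the complete notebook:

```python
import csv

with open('us-senate-press-gallery.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=headers)
    writer.writeheader()

    for row in rows:
        cells = row.find_all('td')
        writer.writerow({
            'first': cells[0].text.strip(),
            'last': cells[1].text.strip(),
            'affiliation': cells[2].text.strip()
        })
```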