├── .gitignore
├── LICENSE
├── Python syntax cheat sheet.ipynb
├── Python syntax cheat sheet.pdf
├── README.md
├── ire-board
    ├── IRE Board members - complete.ipynb
    ├── IRE Board members - working.ipynb
    ├── ire-board.html
    └── ire_board_scrape.py
├── md-warn-notices
    ├── Maryland WARN Notices - multiple pages.ipynb
    └── Maryland WARN Notices.ipynb
├── requirements.txt
├── sd-lobbyists
    ├── data
    │   └── .gitkeep
    └── download_lobbyist_data.py
├── tx-railroad-commission
    ├── dl_pages_details.py
    ├── dl_pages_results.py
    ├── main.py
    ├── pages-detail
    │   └── .gitkeep
    ├── pages-results
    │   └── .gitkeep
    ├── scrape_detail_pages.py
    └── tx-railroad-commission-data.csv
└── us-senate-press-gallery
    ├── U.S. Senate Press Gallery - complete.ipynb
    └── U.S. Senate Press Gallery - working.ipynb


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | # Created by https://www.toptal.com/developers/gitignore/api/osx
131 | # Edit at https://www.toptal.com/developers/gitignore?templates=osx
132 | 
133 | ### OSX ###
134 | # General
135 | .DS_Store
136 | .AppleDouble
137 | .LSOverride
138 | 
139 | # Icon must end with two \r
140 | Icon
141 | 
142 | 
143 | # Thumbnails
144 | ._*
145 | 
146 | # Files that might appear in the root of a volume
147 | .DocumentRevisions-V100
148 | .fseventsd
149 | .Spotlight-V100
150 | .TemporaryItems
151 | .Trashes
152 | .VolumeIcon.icns
153 | .com.apple.timemachine.donotpresent
154 | 
155 | # Directories potentially created on remote AFP share
156 | .AppleDB
157 | .AppleDesktop
158 | Network Trash Folder
159 | Temporary Items
160 | .apdisk
161 | 
162 | # End of https://www.toptal.com/developers/gitignore/api/osx
163 | 
164 | session-notes
165 | tx-railroad-commission/*/*.html
166 | sd-lobbyists/data/*.zip
167 | sd-lobbyists/data/*.csv
168 | ire-board/*.csv


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 IRE & NICAR
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Python syntax cheat sheet.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Python syntax cheat sheet\n",
   8 |     "\n",
   9 |     "This notebook demonstrates some basic syntax rules of the Python programming language.\n",
  10 |     "\n",
  11 |     "- [Basic data types](#Basic-data-types)\n",
  12 |     "    - [Strings](#Strings)\n",
  13 |     "    - [Numbers and math](#Numbers-and-math)\n",
  14 |     "    - [Booleans](#Booleans)\n",
  15 |     "- [Variable assignment](#Variable-assignment)\n",
  16 |     "- [String methods](#String-methods)\n",
  17 |     "- [Comments](#Comments)\n",
  18 |     "- [The print() function](#The-print()-function)\n",
  19 |     "- [Collections of data](#Collections-of-data)\n",
  20 |     "    - [Lists](#Lists)\n",
  21 |     "    - [Dictionaries](#Dictionaries)\n",
  22 |     "- [`for` loops](#for-loops)\n",
  23 |     "- [`if` statements](#if-statements)"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "markdown",
  28 |    "metadata": {},
  29 |    "source": [
  30 |     "### Basic data types\n",
  31 |     "Just like Excel and other data processing software, Python recognizes a variety of data types, including three we'll focus on here:\n",
  32 |     "- Strings (text)\n",
  33 |     "- Numbers (integers, numbers with decimals and more)\n",
  34 |     "- Booleans (`True` and `False`).\n",
  35 |     "\n",
  36 |     "You can use the built-in [`type()`](https://docs.python.org/3/library/functions.html#type) function to check the data type of a value."
  37 |    ]
  38 |   },
  39 |   {
  40 |    "cell_type": "markdown",
  41 |    "metadata": {},
  42 |    "source": [
  43 |     "#### Strings\n",
  44 |     "\n",
  45 |     "A string is a group of characters -- letters, numbers, whatever -- enclosed within single or double quotes (doesn't matter as long as they match). The code in these notebooks uses single quotes. (The Python style guide doesn't recommend one over the other: [\"Pick a rule and stick to it.\"](https://www.python.org/dev/peps/pep-0008/#string-quotes))\n",
  46 |     "\n",
  47 |     "If your string _contains_ apostrophes or quotes, you have two options: _Escape_ the offending character with a forward slash `\\`:\n",
  48 |     "\n",
  49 |     "```python\n",
  50 |     "'Isn\\'t it nice here?'\n",
  51 |     "```\n",
  52 |     "\n",
  53 |     "... or change the surrounding punctuation:\n",
  54 |     "\n",
  55 |     "```python\n",
  56 |     "\"Isn't it nice here?\"\n",
  57 |     "```\n",
  58 |     "\n",
  59 |     "The style guide recommends the latter over the former.\n",
  60 |     "\n",
  61 |     "When you call the `type()` function on a string, Python will return `str`.\n",
  62 |     "\n",
  63 |     "Calling the [`str()` function](https://docs.python.org/3/library/stdtypes.html#str) on a value will return the string version of that value (see examples below)."
  64 |    ]
  65 |   },
  66 |   {
  67 |    "cell_type": "code",
  68 |    "execution_count": null,
  69 |    "metadata": {},
  70 |    "outputs": [],
  71 |    "source": [
  72 |     "'Investigative Reporters and Editors'"
  73 |    ]
  74 |   },
  75 |   {
  76 |    "cell_type": "code",
  77 |    "execution_count": null,
  78 |    "metadata": {},
  79 |    "outputs": [],
  80 |    "source": [
  81 |     "type('hello!')"
  82 |    ]
  83 |   },
  84 |   {
  85 |    "cell_type": "code",
  86 |    "execution_count": null,
  87 |    "metadata": {},
  88 |    "outputs": [],
  89 |    "source": [
  90 |     "45"
  91 |    ]
  92 |   },
  93 |   {
  94 |    "cell_type": "code",
  95 |    "execution_count": null,
  96 |    "metadata": {},
  97 |    "outputs": [],
  98 |    "source": [
  99 |     "type(45)"
 100 |    ]
 101 |   },
 102 |   {
 103 |    "cell_type": "code",
 104 |    "execution_count": null,
 105 |    "metadata": {},
 106 |    "outputs": [],
 107 |    "source": [
 108 |     "str(45)"
 109 |    ]
 110 |   },
 111 |   {
 112 |    "cell_type": "code",
 113 |    "execution_count": null,
 114 |    "metadata": {},
 115 |    "outputs": [],
 116 |    "source": [
 117 |     "type(str(45))"
 118 |    ]
 119 |   },
 120 |   {
 121 |    "cell_type": "code",
 122 |    "execution_count": null,
 123 |    "metadata": {},
 124 |    "outputs": [],
 125 |    "source": [
 126 |     "str(True)"
 127 |    ]
 128 |   },
 129 |   {
 130 |    "cell_type": "markdown",
 131 |    "metadata": {},
 132 |    "source": [
 133 |     "If you \"add\" strings together with a plus sign `+`, it will concatenate them:"
 134 |    ]
 135 |   },
 136 |   {
 137 |    "cell_type": "code",
 138 |    "execution_count": null,
 139 |    "metadata": {},
 140 |    "outputs": [],
 141 |    "source": [
 142 |     "'IRE' + '/' + 'NICAR'"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "markdown",
 147 |    "metadata": {},
 148 |    "source": [
 149 |     "#### Numbers and math\n",
 150 |     "\n",
 151 |     "Python recognizes a variety of numeric data types. Two of the most common are integers (whole numbers) and floats (numbers with decimals).\n",
 152 |     "\n",
 153 |     "Calling `int()` on a piece of numeric data (even if it's being stored as a string) will attempt to coerce it to an integer; calling `float()` will try to convert it to a float."
 154 |    ]
 155 |   },
 156 |   {
 157 |    "cell_type": "code",
 158 |    "execution_count": null,
 159 |    "metadata": {},
 160 |    "outputs": [],
 161 |    "source": [
 162 |     "12"
 163 |    ]
 164 |   },
 165 |   {
 166 |    "cell_type": "code",
 167 |    "execution_count": null,
 168 |    "metadata": {},
 169 |    "outputs": [],
 170 |    "source": [
 171 |     "12.4"
 172 |    ]
 173 |   },
 174 |   {
 175 |    "cell_type": "code",
 176 |    "execution_count": null,
 177 |    "metadata": {},
 178 |    "outputs": [],
 179 |    "source": [
 180 |     "type(12)"
 181 |    ]
 182 |   },
 183 |   {
 184 |    "cell_type": "code",
 185 |    "execution_count": null,
 186 |    "metadata": {},
 187 |    "outputs": [],
 188 |    "source": [
 189 |     "type(12.4)"
 190 |    ]
 191 |   },
 192 |   {
 193 |    "cell_type": "code",
 194 |    "execution_count": null,
 195 |    "metadata": {},
 196 |    "outputs": [],
 197 |    "source": [
 198 |     "int(35.6)"
 199 |    ]
 200 |   },
 201 |   {
 202 |    "cell_type": "code",
 203 |    "execution_count": null,
 204 |    "metadata": {},
 205 |    "outputs": [],
 206 |    "source": [
 207 |     "int('45')"
 208 |    ]
 209 |   },
 210 |   {
 211 |    "cell_type": "code",
 212 |    "execution_count": null,
 213 |    "metadata": {},
 214 |    "outputs": [],
 215 |    "source": [
 216 |     "float(46)"
 217 |    ]
 218 |   },
 219 |   {
 220 |    "cell_type": "code",
 221 |    "execution_count": null,
 222 |    "metadata": {},
 223 |    "outputs": [],
 224 |    "source": [
 225 |     "float('45')"
 226 |    ]
 227 |   },
 228 |   {
 229 |    "cell_type": "markdown",
 230 |    "metadata": {},
 231 |    "source": [
 232 |     "You can do [basic math](https://www.digitalocean.com/community/tutorials/how-to-do-math-in-python-3-with-operators) in Python. You can also do [more advanced math](https://docs.python.org/3/library/math.html)."
 233 |    ]
 234 |   },
 235 |   {
 236 |    "cell_type": "code",
 237 |    "execution_count": null,
 238 |    "metadata": {},
 239 |    "outputs": [],
 240 |    "source": [
 241 |     "4+2"
 242 |    ]
 243 |   },
 244 |   {
 245 |    "cell_type": "code",
 246 |    "execution_count": null,
 247 |    "metadata": {},
 248 |    "outputs": [],
 249 |    "source": [
 250 |     "10-9"
 251 |    ]
 252 |   },
 253 |   {
 254 |    "cell_type": "code",
 255 |    "execution_count": null,
 256 |    "metadata": {},
 257 |    "outputs": [],
 258 |    "source": [
 259 |     "5*10"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "code",
 264 |    "execution_count": null,
 265 |    "metadata": {},
 266 |    "outputs": [],
 267 |    "source": [
 268 |     "1000/10"
 269 |    ]
 270 |   },
 271 |   {
 272 |    "cell_type": "code",
 273 |    "execution_count": null,
 274 |    "metadata": {},
 275 |    "outputs": [],
 276 |    "source": [
 277 |     "# ** raises a number to the power of another number\n",
 278 |     "5**2"
 279 |    ]
 280 |   },
 281 |   {
 282 |    "cell_type": "markdown",
 283 |    "metadata": {},
 284 |    "source": [
 285 |     "#### Booleans\n",
 286 |     "\n",
 287 |     "Just like in Excel, which has `TRUE` and `FALSE` data types, Python has boolean data types. They are `True` and `False` -- note that only the first letter is capitalized, and they are not sandwiched between quotes.\n",
 288 |     "\n",
 289 |     "Boolean values are typically returned when you're evaluating some sort of conditional statement -- comparing values, checking to see if a string is inside another string or if a value is in a list, etc.\n",
 290 |     "\n",
 291 |     "[Python's comparison operators](https://docs.python.org/3/reference/expressions.html#comparisons) include:\n",
 292 |     "\n",
 293 |     "- `>` greater than\n",
 294 |     "- `<` less than\n",
 295 |     "- `>=` greater than or equal to\n",
 296 |     "- `<=` less than or equal to\n",
 297 |     "- `==` equal to\n",
 298 |     "- `!=` not equal to"
 299 |    ]
 300 |   },
 301 |   {
 302 |    "cell_type": "code",
 303 |    "execution_count": null,
 304 |    "metadata": {},
 305 |    "outputs": [],
 306 |    "source": [
 307 |     "True"
 308 |    ]
 309 |   },
 310 |   {
 311 |    "cell_type": "code",
 312 |    "execution_count": null,
 313 |    "metadata": {},
 314 |    "outputs": [],
 315 |    "source": [
 316 |     "False"
 317 |    ]
 318 |   },
 319 |   {
 320 |    "cell_type": "code",
 321 |    "execution_count": null,
 322 |    "metadata": {},
 323 |    "outputs": [],
 324 |    "source": [
 325 |     "4 > 6"
 326 |    ]
 327 |   },
 328 |   {
 329 |    "cell_type": "code",
 330 |    "execution_count": null,
 331 |    "metadata": {},
 332 |    "outputs": [],
 333 |    "source": [
 334 |     "10 == 10"
 335 |    ]
 336 |   },
 337 |   {
 338 |    "cell_type": "code",
 339 |    "execution_count": null,
 340 |    "metadata": {},
 341 |    "outputs": [],
 342 |    "source": [
 343 |     "'crapulence' == 'Crapulence'"
 344 |    ]
 345 |   },
 346 |   {
 347 |    "cell_type": "code",
 348 |    "execution_count": null,
 349 |    "metadata": {},
 350 |    "outputs": [],
 351 |    "source": [
 352 |     "type(True)"
 353 |    ]
 354 |   },
 355 |   {
 356 |    "cell_type": "markdown",
 357 |    "metadata": {},
 358 |    "source": [
 359 |     "### Variable assignment\n",
 360 |     "\n",
 361 |     "The `=` sign assigns a value to a variable name that you choose. Later, you can retrieve that value by referencing its variable name. Variable names can be pretty much anything you want ([as long as you follow some basic rules](https://thehelloworldprogram.com/python/python-variable-assignment-statements-rules-conventions-naming/)).\n",
 362 |     "\n",
 363 |     "This can be a tricky concept at first! For more detail, [here's a pretty good explainer from Digital Ocean](https://www.digitalocean.com/community/tutorials/how-to-use-variables-in-python-3)."
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": null,
 369 |    "metadata": {},
 370 |    "outputs": [],
 371 |    "source": [
 372 |     "my_name = 'Frank'"
 373 |    ]
 374 |   },
 375 |   {
 376 |    "cell_type": "code",
 377 |    "execution_count": null,
 378 |    "metadata": {},
 379 |    "outputs": [],
 380 |    "source": [
 381 |     "my_name"
 382 |    ]
 383 |   },
 384 |   {
 385 |    "cell_type": "markdown",
 386 |    "metadata": {},
 387 |    "source": [
 388 |     "You can also _reassign_ a different value to a variable name, though it's usually better practice to create a new variable."
 389 |    ]
 390 |   },
 391 |   {
 392 |    "cell_type": "code",
 393 |    "execution_count": null,
 394 |    "metadata": {},
 395 |    "outputs": [],
 396 |    "source": [
 397 |     "my_name = 'Susan'"
 398 |    ]
 399 |   },
 400 |   {
 401 |    "cell_type": "code",
 402 |    "execution_count": null,
 403 |    "metadata": {},
 404 |    "outputs": [],
 405 |    "source": [
 406 |     "my_name"
 407 |    ]
 408 |   },
 409 |   {
 410 |    "cell_type": "markdown",
 411 |    "metadata": {},
 412 |    "source": [
 413 |     "A common thing to do is to \"save\" the results of an expression by assigning the result to a variable."
 414 |    ]
 415 |   },
 416 |   {
 417 |    "cell_type": "code",
 418 |    "execution_count": null,
 419 |    "metadata": {},
 420 |    "outputs": [],
 421 |    "source": [
 422 |     "my_fav_number = 10 + 3"
 423 |    ]
 424 |   },
 425 |   {
 426 |    "cell_type": "code",
 427 |    "execution_count": null,
 428 |    "metadata": {},
 429 |    "outputs": [],
 430 |    "source": [
 431 |     "my_fav_number"
 432 |    ]
 433 |   },
 434 |   {
 435 |    "cell_type": "markdown",
 436 |    "metadata": {},
 437 |    "source": [
 438 |     "It's also common to refer to previously defined variables in an expression: "
 439 |    ]
 440 |   },
 441 |   {
 442 |    "cell_type": "code",
 443 |    "execution_count": null,
 444 |    "metadata": {},
 445 |    "outputs": [],
 446 |    "source": [
 447 |     "nfl_teams = 32\n",
 448 |     "mlb_teams = 30\n",
 449 |     "nba_teams = 30\n",
 450 |     "nhl_teams = 31\n",
 451 |     "\n",
 452 |     "number_of_pro_sports_teams = nfl_teams + mlb_teams + nba_teams + nhl_teams"
 453 |    ]
 454 |   },
 455 |   {
 456 |    "cell_type": "code",
 457 |    "execution_count": null,
 458 |    "metadata": {},
 459 |    "outputs": [],
 460 |    "source": [
 461 |     "number_of_pro_sports_teams"
 462 |    ]
 463 |   },
 464 |   {
 465 |    "cell_type": "markdown",
 466 |    "metadata": {},
 467 |    "source": [
 468 |     "### String methods\n",
 469 |     "\n",
 470 |     "Let's go back to strings for a second. String objects have a number of useful [methods](https://docs.python.org/3/library/stdtypes.html#string-methods) -- let's use an example string to demonstrate a few common ones."
 471 |    ]
 472 |   },
 473 |   {
 474 |    "cell_type": "code",
 475 |    "execution_count": null,
 476 |    "metadata": {},
 477 |    "outputs": [],
 478 |    "source": [
 479 |     "my_cool_string = '    Hello, friends!'"
 480 |    ]
 481 |   },
 482 |   {
 483 |    "cell_type": "markdown",
 484 |    "metadata": {},
 485 |    "source": [
 486 |     "`upper()` converts the string to uppercase:"
 487 |    ]
 488 |   },
 489 |   {
 490 |    "cell_type": "code",
 491 |    "execution_count": null,
 492 |    "metadata": {},
 493 |    "outputs": [],
 494 |    "source": [
 495 |     "my_cool_string.upper()"
 496 |    ]
 497 |   },
 498 |   {
 499 |    "cell_type": "markdown",
 500 |    "metadata": {},
 501 |    "source": [
 502 |     "`lower()` converts to lowercase:"
 503 |    ]
 504 |   },
 505 |   {
 506 |    "cell_type": "code",
 507 |    "execution_count": null,
 508 |    "metadata": {},
 509 |    "outputs": [],
 510 |    "source": [
 511 |     "my_cool_string.lower()"
 512 |    ]
 513 |   },
 514 |   {
 515 |    "cell_type": "markdown",
 516 |    "metadata": {},
 517 |    "source": [
 518 |     "`replace()` will replace a piece of text with other text that you specify:"
 519 |    ]
 520 |   },
 521 |   {
 522 |    "cell_type": "code",
 523 |    "execution_count": null,
 524 |    "metadata": {},
 525 |    "outputs": [],
 526 |    "source": [
 527 |     "my_cool_string.replace('friends', 'enemies')"
 528 |    ]
 529 |   },
 530 |   {
 531 |    "cell_type": "markdown",
 532 |    "metadata": {},
 533 |    "source": [
 534 |     "`count()` will count the number of occurrences of a character or group of characters: "
 535 |    ]
 536 |   },
 537 |   {
 538 |    "cell_type": "code",
 539 |    "execution_count": null,
 540 |    "metadata": {},
 541 |    "outputs": [],
 542 |    "source": [
 543 |     "my_cool_string.count('H')"
 544 |    ]
 545 |   },
 546 |   {
 547 |    "cell_type": "markdown",
 548 |    "metadata": {},
 549 |    "source": [
 550 |     "Note that `count()` is case-sensitive. If your task is \"count all the e's,\" convert your original string to upper or lowercase first:"
 551 |    ]
 552 |   },
 553 |   {
 554 |    "cell_type": "code",
 555 |    "execution_count": null,
 556 |    "metadata": {},
 557 |    "outputs": [],
 558 |    "source": [
 559 |     "my_cool_string.upper().count('E')"
 560 |    ]
 561 |   },
 562 |   {
 563 |    "cell_type": "markdown",
 564 |    "metadata": {},
 565 |    "source": [
 566 |     "[`split()`](https://docs.python.org/3/library/stdtypes.html#str.split) will split the string into a [_list_](#Lists) (more on these in a second) on a given delimiter (if you don't specify a delimiter, it'll default to splitting on a space):"
 567 |    ]
 568 |   },
 569 |   {
 570 |    "cell_type": "code",
 571 |    "execution_count": null,
 572 |    "metadata": {},
 573 |    "outputs": [],
 574 |    "source": [
 575 |     "my_cool_string.split()"
 576 |    ]
 577 |   },
 578 |   {
 579 |    "cell_type": "code",
 580 |    "execution_count": null,
 581 |    "metadata": {},
 582 |    "outputs": [],
 583 |    "source": [
 584 |     "my_cool_string.split(',')"
 585 |    ]
 586 |   },
 587 |   {
 588 |    "cell_type": "code",
 589 |    "execution_count": null,
 590 |    "metadata": {},
 591 |    "outputs": [],
 592 |    "source": [
 593 |     "my_cool_string.split('Pitt')"
 594 |    ]
 595 |   },
 596 |   {
 597 |    "cell_type": "markdown",
 598 |    "metadata": {},
 599 |    "source": [
 600 |     "`strip()` removes whitespace from either side of your string (but not internal whitespace):"
 601 |    ]
 602 |   },
 603 |   {
 604 |    "cell_type": "code",
 605 |    "execution_count": null,
 606 |    "metadata": {},
 607 |    "outputs": [],
 608 |    "source": [
 609 |     "my_cool_string.strip()"
 610 |    ]
 611 |   },
 612 |   {
 613 |    "cell_type": "markdown",
 614 |    "metadata": {},
 615 |    "source": [
 616 |     "You can use a cool thing called \"method chaining\" to combine methods -- just tack 'em onto the end. Let's say we wanted to strip whitespace from our string _and_ make it uppercase:"
 617 |    ]
 618 |   },
 619 |   {
 620 |    "cell_type": "code",
 621 |    "execution_count": null,
 622 |    "metadata": {},
 623 |    "outputs": [],
 624 |    "source": [
 625 |     "my_cool_string.strip().upper()"
 626 |    ]
 627 |   },
 628 |   {
 629 |    "cell_type": "markdown",
 630 |    "metadata": {},
 631 |    "source": [
 632 |     "Notice, however, that our original string is unchanged:"
 633 |    ]
 634 |   },
 635 |   {
 636 |    "cell_type": "code",
 637 |    "execution_count": null,
 638 |    "metadata": {},
 639 |    "outputs": [],
 640 |    "source": [
 641 |     "my_cool_string"
 642 |    ]
 643 |   },
 644 |   {
 645 |    "cell_type": "markdown",
 646 |    "metadata": {},
 647 |    "source": [
 648 |     "Why? Because we haven't assigned the results of anything we've done to a variable. A common thing to do, especially when you're cleaning data, would be to assign the results to a new variable:"
 649 |    ]
 650 |   },
 651 |   {
 652 |    "cell_type": "code",
 653 |    "execution_count": null,
 654 |    "metadata": {},
 655 |    "outputs": [],
 656 |    "source": [
 657 |     "my_cool_string_clean = my_cool_string.strip().upper()"
 658 |    ]
 659 |   },
 660 |   {
 661 |    "cell_type": "code",
 662 |    "execution_count": null,
 663 |    "metadata": {},
 664 |    "outputs": [],
 665 |    "source": [
 666 |     "my_cool_string_clean"
 667 |    ]
 668 |   },
 669 |   {
 670 |    "cell_type": "markdown",
 671 |    "metadata": {},
 672 |    "source": [
 673 |     "### Comments\n",
 674 |     "A line with a comment -- a note that you don't want Python to interpret -- starts with a `#` sign. These are notes to collaborators and to your future self about what's happening at this point in your script, and why.\n",
 675 |     "\n",
 676 |     "Typically you'd put this on the line right above the line of code you're commenting on:"
 677 |    ]
 678 |   },
 679 |   {
 680 |    "cell_type": "code",
 681 |    "execution_count": null,
 682 |    "metadata": {},
 683 |    "outputs": [],
 684 |    "source": [
 685 |     "avg_settlement = 40827348.34328237\n",
 686 |     "\n",
 687 |     "# coercing this to an int because we don't need any decimal precision\n",
 688 |     "int(avg_settlement)"
 689 |    ]
 690 |   },
 691 |   {
 692 |    "cell_type": "markdown",
 693 |    "metadata": {},
 694 |    "source": [
 695 |     "Multi-line comments are sandwiched between triple quotes (or triple apostrophes):\n",
 696 |     "\n",
 697 |     "`'''\n",
 698 |     "this\n",
 699 |     "is a long\n",
 700 |     "comment\n",
 701 |     "'''`\n",
 702 |     "\n",
 703 |     "or\n",
 704 |     "\n",
 705 |     "`\"\"\"\n",
 706 |     "this\n",
 707 |     "is a long\n",
 708 |     "comment\n",
 709 |     "\"\"\"`"
 710 |    ]
 711 |   },
 712 |   {
 713 |    "cell_type": "markdown",
 714 |    "metadata": {},
 715 |    "source": [
 716 |     "### The `print()` function\n",
 717 |     "\n",
 718 |     "So far, we've just been running the notebook cells to get the last value returned by the code we write. Using the [`print()`](https://docs.python.org/3/library/functions.html#print) function is a way to print specific things in your script to the screen. This function is handy for debugging.\n",
 719 |     "\n",
 720 |     "To print multiple things on the same line, separate them with a comma."
 721 |    ]
 722 |   },
 723 |   {
 724 |    "cell_type": "code",
 725 |    "execution_count": null,
 726 |    "metadata": {},
 727 |    "outputs": [],
 728 |    "source": [
 729 |     "print('Hello!')"
 730 |    ]
 731 |   },
 732 |   {
 733 |    "cell_type": "code",
 734 |    "execution_count": null,
 735 |    "metadata": {},
 736 |    "outputs": [],
 737 |    "source": [
 738 |     "print(my_name)"
 739 |    ]
 740 |   },
 741 |   {
 742 |    "cell_type": "code",
 743 |    "execution_count": null,
 744 |    "metadata": {},
 745 |    "outputs": [],
 746 |    "source": [
 747 |     "print('Hello,', my_name)"
 748 |    ]
 749 |   },
 750 |   {
 751 |    "cell_type": "markdown",
 752 |    "metadata": {},
 753 |    "source": [
 754 |     "## Collections of data\n",
 755 |     "\n",
 756 |     "Now we're going to talk about two ways you can use Python to group data into a collection: lists and dictionaries."
 757 |    ]
 758 |   },
 759 |   {
 760 |    "cell_type": "markdown",
 761 |    "metadata": {},
 762 |    "source": [
 763 |     "### Lists\n",
 764 |     "\n",
 765 |     "A _list_ is a comma-separated list of items inside square brackets: `[]`.\n",
 766 |     "\n",
 767 |     "Here's a list of ingredients, each one a string, that together makes up a salsa recipe."
 768 |    ]
 769 |   },
 770 |   {
 771 |    "cell_type": "code",
 772 |    "execution_count": null,
 773 |    "metadata": {},
 774 |    "outputs": [],
 775 |    "source": [
 776 |     "salsa_ingredients = ['tomato', 'onion', 'jalapeño', 'lime', 'cilantro']"
 777 |    ]
 778 |   },
 779 |   {
 780 |    "cell_type": "markdown",
 781 |    "metadata": {},
 782 |    "source": [
 783 |     "To get an item out of a list, you'd refer to its numerical position in the list -- its _index_ (1, 2, 3, etc.) -- inside square brackets immediately following your reference to that list. In Python, as in many other programming languages, counting starts at 0. That means the first item in a list is item `0`."
 784 |    ]
 785 |   },
 786 |   {
 787 |    "cell_type": "code",
 788 |    "execution_count": null,
 789 |    "metadata": {},
 790 |    "outputs": [],
 791 |    "source": [
 792 |     "salsa_ingredients[0]"
 793 |    ]
 794 |   },
 795 |   {
 796 |    "cell_type": "code",
 797 |    "execution_count": null,
 798 |    "metadata": {},
 799 |    "outputs": [],
 800 |    "source": [
 801 |     "salsa_ingredients[1]"
 802 |    ]
 803 |   },
 804 |   {
 805 |    "cell_type": "markdown",
 806 |    "metadata": {},
 807 |    "source": [
 808 |     "You can use _negative indexing_ to grab things from the right-hand side of the list -- and in fact, `[-1]` is a common idiom for getting \"the last item in a list\" when it's not clear how many items are in your list."
 809 |    ]
 810 |   },
 811 |   {
 812 |    "cell_type": "code",
 813 |    "execution_count": null,
 814 |    "metadata": {},
 815 |    "outputs": [],
 816 |    "source": [
 817 |     "salsa_ingredients[-1]"
 818 |    ]
 819 |   },
 820 |   {
 821 |    "cell_type": "markdown",
 822 |    "metadata": {},
 823 |    "source": [
 824 |     "If you wanted to get a slice of multiple items out of your list, you'd use colons (just like in Excel, kind of!).\n",
 825 |     "\n",
 826 |     "If you wanted to get the first three items, you'd do this:"
 827 |    ]
 828 |   },
 829 |   {
 830 |    "cell_type": "code",
 831 |    "execution_count": null,
 832 |    "metadata": {},
 833 |    "outputs": [],
 834 |    "source": [
 835 |     "salsa_ingredients[0:3]"
 836 |    ]
 837 |   },
 838 |   {
 839 |    "cell_type": "markdown",
 840 |    "metadata": {},
 841 |    "source": [
 842 |     "You could also have left off the initial 0 -- when you leave out the first number, Python defaults to \"the first item in the list.\" In the same way, if you leave off the last number, Python defaults to \"the last item in the list.\""
 843 |    ]
 844 |   },
 845 |   {
 846 |    "cell_type": "code",
 847 |    "execution_count": null,
 848 |    "metadata": {},
 849 |    "outputs": [],
 850 |    "source": [
 851 |     "salsa_ingredients[:3]"
 852 |    ]
 853 |   },
 854 |   {
 855 |    "cell_type": "markdown",
 856 |    "metadata": {},
 857 |    "source": [
 858 |     "Note, too, that this slice is giving us items 0, 1 and 2. The `3` in our slice is the first item we _don't_ want. That can be kind of confusing at first. Let's try a few more:"
 859 |    ]
 860 |   },
 861 |   {
 862 |    "cell_type": "code",
 863 |    "execution_count": null,
 864 |    "metadata": {},
 865 |    "outputs": [],
 866 |    "source": [
 867 |     "# everything in the list except the first item\n",
 868 |     "salsa_ingredients[1:]"
 869 |    ]
 870 |   },
 871 |   {
 872 |    "cell_type": "code",
 873 |    "execution_count": null,
 874 |    "metadata": {},
 875 |    "outputs": [],
 876 |    "source": [
 877 |     "# the second, third and fourth items\n",
 878 |     "salsa_ingredients[1:4]"
 879 |    ]
 880 |   },
 881 |   {
 882 |    "cell_type": "code",
 883 |    "execution_count": null,
 884 |    "metadata": {},
 885 |    "outputs": [],
 886 |    "source": [
 887 |     "# the last two items\n",
 888 |     "salsa_ingredients[-2:]"
 889 |    ]
 890 |   },
 891 |   {
 892 |    "cell_type": "markdown",
 893 |    "metadata": {},
 894 |    "source": [
 895 |     "To see how many items are in a list, use the `len()` function:"
 896 |    ]
 897 |   },
 898 |   {
 899 |    "cell_type": "code",
 900 |    "execution_count": null,
 901 |    "metadata": {},
 902 |    "outputs": [],
 903 |    "source": [
 904 |     "len(salsa_ingredients)"
 905 |    ]
 906 |   },
 907 |   {
 908 |    "cell_type": "markdown",
 909 |    "metadata": {},
 910 |    "source": [
 911 |     "To add an item to a list, use the [`append()`](https://docs.python.org/3/tutorial/datastructures.html#more-on-lists) method:"
 912 |    ]
 913 |   },
 914 |   {
 915 |    "cell_type": "code",
 916 |    "execution_count": null,
 917 |    "metadata": {},
 918 |    "outputs": [],
 919 |    "source": [
 920 |     "salsa_ingredients"
 921 |    ]
 922 |   },
 923 |   {
 924 |    "cell_type": "code",
 925 |    "execution_count": null,
 926 |    "metadata": {},
 927 |    "outputs": [],
 928 |    "source": [
 929 |     "salsa_ingredients.append('mayonnaise')"
 930 |    ]
 931 |   },
 932 |   {
 933 |    "cell_type": "code",
 934 |    "execution_count": null,
 935 |    "metadata": {},
 936 |    "outputs": [],
 937 |    "source": [
 938 |     "salsa_ingredients"
 939 |    ]
 940 |   },
 941 |   {
 942 |    "cell_type": "markdown",
 943 |    "metadata": {},
 944 |    "source": [
 945 |     "Haha _gross_. To remove an item from a list, use the `pop()` method. If you don't specify the index number of the item you want to pop out, it will default to \"the last item.\""
 946 |    ]
 947 |   },
 948 |   {
 949 |    "cell_type": "code",
 950 |    "execution_count": null,
 951 |    "metadata": {},
 952 |    "outputs": [],
 953 |    "source": [
 954 |     "salsa_ingredients.pop()"
 955 |    ]
 956 |   },
 957 |   {
 958 |    "cell_type": "code",
 959 |    "execution_count": null,
 960 |    "metadata": {
 961 |     "scrolled": true
 962 |    },
 963 |    "outputs": [],
 964 |    "source": [
 965 |     "salsa_ingredients"
 966 |    ]
 967 |   },
 968 |   {
 969 |    "cell_type": "markdown",
 970 |    "metadata": {},
 971 |    "source": [
 972 |     "You can use the [`in` and `not in`](https://docs.python.org/3/reference/expressions.html#membership-test-operations) expressions to test membership in a list (will return a boolean):"
 973 |    ]
 974 |   },
 975 |   {
 976 |    "cell_type": "code",
 977 |    "execution_count": null,
 978 |    "metadata": {},
 979 |    "outputs": [],
 980 |    "source": [
 981 |     "'lime' in salsa_ingredients"
 982 |    ]
 983 |   },
 984 |   {
 985 |    "cell_type": "code",
 986 |    "execution_count": null,
 987 |    "metadata": {},
 988 |    "outputs": [],
 989 |    "source": [
 990 |     "'cilantro' not in salsa_ingredients"
 991 |    ]
 992 |   },
 993 |   {
 994 |    "cell_type": "markdown",
 995 |    "metadata": {},
 996 |    "source": [
 997 |     "### Dictionaries\n",
 998 |     "\n",
 999 |     "A _dictionary_ is a comma-separated list of key/value pairs inside curly brackets: `{}`. Let's make an entire salsa recipe:"
1000 |    ]
1001 |   },
1002 |   {
1003 |    "cell_type": "code",
1004 |    "execution_count": null,
1005 |    "metadata": {},
1006 |    "outputs": [],
1007 |    "source": [
1008 |     "salsa = {\n",
1009 |     "    'ingredients': salsa_ingredients,\n",
1010 |     "    'instructions': 'Chop up all the ingredients and cook them for awhile.',\n",
1011 |     "    'oz_made': 12\n",
1012 |     "}"
1013 |    ]
1014 |   },
1015 |   {
1016 |    "cell_type": "markdown",
1017 |    "metadata": {},
1018 |    "source": [
1019 |     "To retrieve a value from a dictionary, you'd refer to the name of its key inside square brackets `[]` immediately after your reference to the dictionary:"
1020 |    ]
1021 |   },
1022 |   {
1023 |    "cell_type": "code",
1024 |    "execution_count": null,
1025 |    "metadata": {},
1026 |    "outputs": [],
1027 |    "source": [
1028 |     "salsa['oz_made']"
1029 |    ]
1030 |   },
1031 |   {
1032 |    "cell_type": "code",
1033 |    "execution_count": null,
1034 |    "metadata": {},
1035 |    "outputs": [],
1036 |    "source": [
1037 |     "salsa['ingredients']"
1038 |    ]
1039 |   },
1040 |   {
1041 |    "cell_type": "markdown",
1042 |    "metadata": {},
1043 |    "source": [
1044 |     "To add a new key/value pair to a dictionary, assign a new key to the dictionary inside square brackets and set the value of that key with `=`:"
1045 |    ]
1046 |   },
1047 |   {
1048 |    "cell_type": "code",
1049 |    "execution_count": null,
1050 |    "metadata": {},
1051 |    "outputs": [],
1052 |    "source": [
1053 |     "salsa['tastes_great'] = True"
1054 |    ]
1055 |   },
1056 |   {
1057 |    "cell_type": "code",
1058 |    "execution_count": null,
1059 |    "metadata": {},
1060 |    "outputs": [],
1061 |    "source": [
1062 |     "salsa"
1063 |    ]
1064 |   },
1065 |   {
1066 |    "cell_type": "markdown",
1067 |    "metadata": {},
1068 |    "source": [
1069 |     "To delete a key/value pair out of a dictionary, use the `del` command and reference the key:"
1070 |    ]
1071 |   },
1072 |   {
1073 |    "cell_type": "code",
1074 |    "execution_count": null,
1075 |    "metadata": {},
1076 |    "outputs": [],
1077 |    "source": [
1078 |     "del salsa['tastes_great']"
1079 |    ]
1080 |   },
1081 |   {
1082 |    "cell_type": "code",
1083 |    "execution_count": null,
1084 |    "metadata": {},
1085 |    "outputs": [],
1086 |    "source": [
1087 |     "salsa"
1088 |    ]
1089 |   },
1090 |   {
1091 |    "cell_type": "markdown",
1092 |    "metadata": {},
1093 |    "source": [
1094 |     "### Indentation\n",
1095 |     "\n",
1096 |     "Whitespace matters in Python. Sometimes you'll need to indent bits of code to make things work. This can be confusing! `IndentationError`s are common even for experienced programmers. (FWIW, Jupyter will try to be helpful and insert the correct amount of \"significant whitespace\" for you.)\n",
1097 |     "\n",
1098 |     "You can use tabs or spaces, just don't mix them. [The Python style guide](https://www.python.org/dev/peps/pep-0008/) recommends indenting your code in groups of four spaces, so that's what we'll use."
1099 |    ]
1100 |   },
1101 |   {
1102 |    "cell_type": "markdown",
1103 |    "metadata": {},
1104 |    "source": [
1105 |     "### `for` loops\n",
1106 |     "\n",
1107 |     "You would use a `for` loop to iterate over a collection of things. The statement begins with the keyword `for` (lowercase), then a temporary `variable_name` of your choice to represent each item as you loop through the collection, then the Python keyword `in`, then the collection you're looping over (or its variable name), then a colon, then the indented block of code with instructions about what to do with each item in the collection.\n",
1108 |     "\n",
1109 |     "Let's say we have a list of numbers that we assign to the variable `list_of_numbers`."
1110 |    ]
1111 |   },
1112 |   {
1113 |    "cell_type": "code",
1114 |    "execution_count": null,
1115 |    "metadata": {},
1116 |    "outputs": [],
1117 |    "source": [
1118 |     "list_of_numbers = [1, 2, 3, 4, 5, 6]"
1119 |    ]
1120 |   },
1121 |   {
1122 |    "cell_type": "markdown",
1123 |    "metadata": {},
1124 |    "source": [
1125 |     "We could loop over the list and print out each number:"
1126 |    ]
1127 |   },
1128 |   {
1129 |    "cell_type": "code",
1130 |    "execution_count": null,
1131 |    "metadata": {},
1132 |    "outputs": [],
1133 |    "source": [
1134 |     "for number in list_of_numbers:\n",
1135 |     "    print(number)"
1136 |    ]
1137 |   },
1138 |   {
1139 |    "cell_type": "markdown",
1140 |    "metadata": {},
1141 |    "source": [
1142 |     "We could print out each number _times 6_:"
1143 |    ]
1144 |   },
1145 |   {
1146 |    "cell_type": "code",
1147 |    "execution_count": null,
1148 |    "metadata": {},
1149 |    "outputs": [],
1150 |    "source": [
1151 |     "for number in list_of_numbers:\n",
1152 |     "    print(number*6)"
1153 |    ]
1154 |   },
1155 |   {
1156 |    "cell_type": "markdown",
1157 |    "metadata": {},
1158 |    "source": [
1159 |     "... whatever you need to do in you loop. Note that the variable name `number` in our loop is totally arbitrary. This also would work:"
1160 |    ]
1161 |   },
1162 |   {
1163 |    "cell_type": "code",
1164 |    "execution_count": null,
1165 |    "metadata": {},
1166 |    "outputs": [],
1167 |    "source": [
1168 |     "for banana in list_of_numbers:\n",
1169 |     "    print(banana)"
1170 |    ]
1171 |   },
1172 |   {
1173 |    "cell_type": "markdown",
1174 |    "metadata": {},
1175 |    "source": [
1176 |     "It can be hard, at first, to figure out what's a \"Python word\" and what's a variable name that you get to define. This comes with practice."
1177 |    ]
1178 |   },
1179 |   {
1180 |    "cell_type": "markdown",
1181 |    "metadata": {},
1182 |    "source": [
1183 |     "Strings are iterable, too. Let's loop over the letters in a sentence:"
1184 |    ]
1185 |   },
1186 |   {
1187 |    "cell_type": "code",
1188 |    "execution_count": null,
1189 |    "metadata": {},
1190 |    "outputs": [],
1191 |    "source": [
1192 |     "sentence = 'Hello, IRE/NICAR!'\n",
1193 |     "\n",
1194 |     "for letter in sentence:\n",
1195 |     "    print(letter)"
1196 |    ]
1197 |   },
1198 |   {
1199 |    "cell_type": "markdown",
1200 |    "metadata": {},
1201 |    "source": [
1202 |     "To this point: Strings are iterable, like lists, so you can use the same kinds of methods:"
1203 |    ]
1204 |   },
1205 |   {
1206 |    "cell_type": "code",
1207 |    "execution_count": null,
1208 |    "metadata": {},
1209 |    "outputs": [],
1210 |    "source": [
1211 |     "# get the first five characters\n",
1212 |     "sentence[:5]"
1213 |    ]
1214 |   },
1215 |   {
1216 |    "cell_type": "code",
1217 |    "execution_count": null,
1218 |    "metadata": {},
1219 |    "outputs": [],
1220 |    "source": [
1221 |     "# get the length of the sentence\n",
1222 |     "len(sentence)"
1223 |    ]
1224 |   },
1225 |   {
1226 |    "cell_type": "code",
1227 |    "execution_count": null,
1228 |    "metadata": {},
1229 |    "outputs": [],
1230 |    "source": [
1231 |     "'Hello' in sentence"
1232 |    ]
1233 |   },
1234 |   {
1235 |    "cell_type": "markdown",
1236 |    "metadata": {},
1237 |    "source": [
1238 |     "You can iterate over dictionaries, too -- just remember that dictionaries _don't keep track of the order that items were added to it_.\n",
1239 |     "\n",
1240 |     "When you're looping over a dictionary, the variable name in your `for` loop will refer to the keys. Let's loop over our `salsa` dictionary from up above to see what I mean."
1241 |    ]
1242 |   },
1243 |   {
1244 |    "cell_type": "code",
1245 |    "execution_count": null,
1246 |    "metadata": {},
1247 |    "outputs": [],
1248 |    "source": [
1249 |     "for key in salsa:\n",
1250 |     "    print(key)"
1251 |    ]
1252 |   },
1253 |   {
1254 |    "cell_type": "markdown",
1255 |    "metadata": {},
1256 |    "source": [
1257 |     "To get the _value_ of a dictionary item in a for loop, you'd need to use the key to retrieve it from the dictionary:"
1258 |    ]
1259 |   },
1260 |   {
1261 |    "cell_type": "code",
1262 |    "execution_count": null,
1263 |    "metadata": {},
1264 |    "outputs": [],
1265 |    "source": [
1266 |     "for key in salsa:\n",
1267 |     "    print(salsa[key])"
1268 |    ]
1269 |   },
1270 |   {
1271 |    "cell_type": "markdown",
1272 |    "metadata": {},
1273 |    "source": [
1274 |     "### `if` statements\n",
1275 |     "Just like in Excel, you can use the \"if\" keyword to handle conditional logic.\n",
1276 |     "\n",
1277 |     "These statements begin with the keyword `if` (lowercase), then the condition to evaluate, then a colon, then a new line with a block of indented code to execute if the condition resolves to `True`."
1278 |    ]
1279 |   },
1280 |   {
1281 |    "cell_type": "code",
1282 |    "execution_count": null,
1283 |    "metadata": {},
1284 |    "outputs": [],
1285 |    "source": [
1286 |     "if 4 < 6:\n",
1287 |     "    print('4 is less than 6')"
1288 |    ]
1289 |   },
1290 |   {
1291 |    "cell_type": "markdown",
1292 |    "metadata": {},
1293 |    "source": [
1294 |     "You can also add an `else` statement (and a colon) with an indented block of code you want to run if the condition resolves to `False`."
1295 |    ]
1296 |   },
1297 |   {
1298 |    "cell_type": "code",
1299 |    "execution_count": null,
1300 |    "metadata": {},
1301 |    "outputs": [],
1302 |    "source": [
1303 |     "if 4 > 6:\n",
1304 |     "    print('4 is greater than 6?!')\n",
1305 |     "else:\n",
1306 |     "    print('4 is not greater than 6.')"
1307 |    ]
1308 |   },
1309 |   {
1310 |    "cell_type": "markdown",
1311 |    "metadata": {},
1312 |    "source": [
1313 |     "If you need to, you can add multiple conditions with `elif`."
1314 |    ]
1315 |   },
1316 |   {
1317 |    "cell_type": "code",
1318 |    "execution_count": null,
1319 |    "metadata": {},
1320 |    "outputs": [],
1321 |    "source": [
1322 |     "HOME_SCORE = 6\n",
1323 |     "AWAY_SCORE = 8\n",
1324 |     "\n",
1325 |     "if HOME_SCORE > AWAY_SCORE:\n",
1326 |     "    print('we won!')\n",
1327 |     "elif HOME_SCORE == AWAY_SCORE:\n",
1328 |     "    print('we tied!')\n",
1329 |     "else:\n",
1330 |     "    print('we lost!')"
1331 |    ]
1332 |   }
1333 |  ],
1334 |  "metadata": {
1335 |   "kernelspec": {
1336 |    "display_name": "Python 3 (ipykernel)",
1337 |    "language": "python",
1338 |    "name": "python3"
1339 |   },
1340 |   "language_info": {
1341 |    "codemirror_mode": {
1342 |     "name": "ipython",
1343 |     "version": 3
1344 |    },
1345 |    "file_extension": ".py",
1346 |    "mimetype": "text/x-python",
1347 |    "name": "python",
1348 |    "nbconvert_exporter": "python",
1349 |    "pygments_lexer": "ipython3",
1350 |    "version": "3.10.9"
1351 |   }
1352 |  },
1353 |  "nbformat": 4,
1354 |  "nbformat_minor": 2
1355 | }
1356 | 


--------------------------------------------------------------------------------
/Python syntax cheat sheet.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/Python syntax cheat sheet.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # NICAR 2023: Web scraping with Python
 2 | 
 3 | ### 🔗 [bit.ly/nicar23-scraping](https://bit.ly/nicar23-scraping)
 4 | 
 5 | This repo contains materials for a half-day workshop at the NICAR 2023 data journalism conference in Nashville on using Python to scrape data from websites.
 6 | 
 7 | The session is scheduled for Sunday, March 5, from 9 a.m. - 12:30 p.m. in room `Midtown 3` on Meeting Space Level 2.
 8 | 
 9 | ### First step
10 | 
11 | Open the Terminal application. Copy and paste this text into the Terminal and hit enter:
12 | 
13 | ```bat
14 | cd Desktop/hands_on_classes/20230305-sunday-web-scraping-with-python--preregistered-attendees-only & .\env\Scripts\activate
15 | ```
16 | 
17 | ### Course outline
18 | - Do you really need to scrape this?
19 | - Process overview:
20 |     - Fetch, parse, write data to file
21 |     - Some best practices
22 |         - Make sure you feel OK about whether your scraping project is (legally, ethically, etc.) allowable
23 |         - Don't DDOS your target server
24 |         - When feasible, save copies of pages locally, then scrape from those files
25 |         - [Rotate user-agent strings](https://www.useragents.me/) and other headers if necessary to avoid bot detection
26 | - Using your favorite brower's inspection tools to deconstruct the target page(s)
27 |     - See if the data is delivered to the page in a ready-to-use format, such as JSON ([example](https://sdlegislature.gov/Session/Archived))
28 |     - Is the HTML part of the actual page structure, or is it built on the fly when the page loads? ([example](https://rrctx.force.com/s/complaints))
29 |     - Can you open the URL directly in an incognito window and get to the same content, or does the page require a specific state to deliver the content (via search navigation, etc.)? ([example](https://rrctx.force.com/s/ietrs-complaint/a0ct0000000mOmhAAE/complaint0000000008))
30 |     - Are there [URL query parameters](https://en.wikipedia.org/wiki/Query_string) that you can tweak to get different results? ([example](https://www.worksafe.qld.gov.au/news-and-events/alerts))
31 | - Choose tools that the most sense for your target page(s) -- a few popular options:
32 |     - [`requests`](https://requests.readthedocs.io/en/latest/) and [`BeautifulSoup`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
33 |     - [`playwright`](https://playwright.dev/python) (optionally using `BeautifulSoup` for the HTML parsing)
34 |     - [`scrapy`](https://scrapy.org/) for larger spidering/crawling tasks
35 | - Overview of our Python setup today
36 |     - Activating the virtual environment
37 |     - Jupyter notebooks
38 |     - Running `.py` files from the command line
39 | - Our projects today:
40 |     - [Maryland WARN notices](md-warn-notices)
41 |     - [U.S. Senate press gallery](us-senate-press-gallery)
42 |     - [IRE board members](ire-board)
43 |     - [South Dakota lobbyist registration data](sd-lobbyists)
44 |     - [Texas Railroad Commission complaints](tx-railroad-commission)
45 | 
46 | ### Additional resources
47 | - Need to scrape on a timer? [Try GitHub Actions](https://palewi.re/docs/first-github-scraper) (Other options: Using your computer's scheduler tools, putting your script on a remote server with a [`crontab` configuration](https://en.wikipedia.org/wiki/Cron), [switching to Google Apps Script and setting up time-based triggers](https://developers.google.com/apps-script/guides/triggers), etc.)
48 | - [A neat technique for copying data to your clipboard while scraping a Flourish visualization](https://til.simonwillison.net/shot-scraper/scraping-flourish)
49 | - [Walkthrough: Class-based scraping](https://blog.apps.npr.org/2016/06/17/scraping-tips.html)
50 | 
51 | 
52 | ### Running this code at home
53 | - Install Python, if you haven't already ([here's our guide](https://docs.google.com/document/d/1cYmpfZEZ8r-09Q6Go917cKVcQk_d0P61gm0q8DAdIdg/edit))
54 | - Clone or download this repo
55 | - `cd` into the repo directory and install the requirements, preferably into a virtual environment using your tooling of choice: `pip install -r requirements.txt`
56 | - `playwright install`
57 | - `jupyter notebook` to launch the notebook server
58 | 


--------------------------------------------------------------------------------
/ire-board/IRE Board members - complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "4917bfba",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# IRE Board members\n",
  9 |     "\n",
 10 |     "The goal: Scrape [this list of IRE board members](https://www.ire.org/about-ire/past-ire-board-members/) into a CSV.\n",
 11 |     "\n",
 12 |     "This project introduces a few new concepts:\n",
 13 |     "- Scraping data that's not part of a table\n",
 14 |     "- Specifying custom request headers to evade a bot detection rule on our server\n",
 15 |     "- Using string methods and default values when parsing out the data"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "id": "bfd3d8c7",
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "# stdlib library we'll use to write the CSV file\n",
 26 |     "import csv\n",
 27 |     "\n",
 28 |     "# installed library to handle the HTTP traffic\n",
 29 |     "import requests\n",
 30 |     "\n",
 31 |     "# installed library to parse the HTML\n",
 32 |     "from bs4 import BeautifulSoup"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "id": "1acd7756",
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "URL = 'https://www.ire.org/about-ire/past-ire-board-members/'"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "id": "accded42",
 49 |    "metadata": {},
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "# set up request headers\n",
 53 |     "# the IRE website rejects incoming requests with the\n",
 54 |     "# `requests` library's default user-agent, so we\n",
 55 |     "# need to pretend to be a browser -- we can do that by\n",
 56 |     "# setting the `User-Agent` value to mimic a value that\n",
 57 |     "# a browser would send, and add this to the headers\n",
 58 |     "# of the request before it's sent\n",
 59 |     "# read more: https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent\n",
 60 |     "headers = {\n",
 61 |     "    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'\n",
 62 |     "}"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": null,
 68 |    "id": "03294e7e",
 69 |    "metadata": {},
 70 |    "outputs": [],
 71 |    "source": [
 72 |     "# send a GET request to fetch the page using the headers we just created\n",
 73 |     "r = requests.get(\n",
 74 |     "    'https://www.ire.org/about-ire/past-ire-board-members/',\n",
 75 |     "    headers=headers\n",
 76 |     ")\n",
 77 |     "\n",
 78 |     "# raise an error if the HTTP request returns an error code\n",
 79 |     "# HTTP codes: https://http.cat\n",
 80 |     "r.raise_for_status()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "id": "e5c65871",
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "# use the BeautifulSoup object to parse the response text\n",
 91 |     "# -- r.text -- with the default HTML parser\n",
 92 |     "# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use\n",
 93 |     "soup = BeautifulSoup(r.text, 'html.parser')"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "id": "400f25c3",
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "print(soup)"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "id": "73db6014",
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "# search the HTML tree to find the div\n",
114 |     "# with the `id` attribute of \"past-ire-board-members\"\n",
115 |     "target_div = soup.find(\n",
116 |     "    'div',\n",
117 |     "    {'id': 'past-ire-board-members'}\n",
118 |     ")"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "df88000b",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "print(target_div)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "id": "4ad3f74f",
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "# within that div, find all the paragraph tags\n",
139 |     "members = target_div.find_all('p')"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "id": "7b51b34b",
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "members"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "id": "cb711ee3",
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "# set up the CSV headers to write to file\n",
160 |     "csv_headers = [\n",
161 |     "    'name',\n",
162 |     "    'terms',\n",
163 |     "    'was_president',\n",
164 |     "    'is_deceased'\n",
165 |     "]"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "id": "787cb02f",
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "# next, set up the file to write the CSV data into\n",
176 |     "# https://docs.python.org/3/library/csv.html#csv.writer\n",
177 |     "\n",
178 |     "# open the CSV file in write ('w') mode, specifying newline='' to deal with\n",
179 |     "# potential PC-only line ending problem\n",
180 |     "with open('ire-board.csv', 'w', newline='') as outfile:\n",
181 |     "\n",
182 |     "    # set up a csv.writer object tied to the file we just opened\n",
183 |     "    writer = csv.writer(outfile)\n",
184 |     "\n",
185 |     "    # write the list of headers\n",
186 |     "    writer.writerow(csv_headers)\n",
187 |     "\n",
188 |     "    # loop over the list of paragraphs we targeted above\n",
189 |     "    for member in members:\n",
190 |     "\n",
191 |     "        # we don't want the entire Tag object, just the text\n",
192 |     "        text = member.text\n",
193 |     "\n",
194 |     "        # set up some default values -- the member was not president\n",
195 |     "        was_president = False\n",
196 |     "\n",
197 |     "        # and is not deceased\n",
198 |     "        is_deceased = False\n",
199 |     "\n",
200 |     "        # IRE denotes past presidents with a leading asterisk\n",
201 |     "        # so check to see if the string startswith '*'\n",
202 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=startswith#str.startswith\n",
203 |     "        if text.startswith('*'):\n",
204 |     "\n",
205 |     "            # if so, switch the value for the `was_president` variable to True\n",
206 |     "            was_president = True\n",
207 |     "\n",
208 |     "        # check to see if \"(dec)\" is anywhere in the text, which\n",
209 |     "        # indicates this person is deceased\n",
210 |     "        # https://docs.python.org/3/reference/expressions.html#in\n",
211 |     "        if '(dec)' in text:\n",
212 |     "            is_deceased = True\n",
213 |     "\n",
214 |     "        # next, start parsing out the pieces\n",
215 |     "        # separate the name from the terms by splitting on \"(\"\n",
216 |     "        text_split = text.split('(')\n",
217 |     "\n",
218 |     "        # the name will be the first ([0]) item in the resulting list\n",
219 |     "        # while we're at it, strip off any leading asterisks\n",
220 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.lstrip\n",
221 |     "        # and strip() off any leading or trailing whitespace\n",
222 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.strip\n",
223 |     "        name = text_split[0].lstrip('*').strip()\n",
224 |     "\n",
225 |     "        # the term(s) of service will be the second item ([1]) in that list\n",
226 |     "        # and the term text is always terminated with a closing parens\n",
227 |     "        # so splitting on that closing parens and taking the first ([0])\n",
228 |     "        # item in the list will give us the term(s)\n",
229 |     "        terms = text_split[1].split(')')[0]\n",
230 |     "\n",
231 |     "        # put the collected data into a list\n",
232 |     "        data = [\n",
233 |     "            name,\n",
234 |     "            terms,\n",
235 |     "            was_president,\n",
236 |     "            is_deceased\n",
237 |     "        ]\n",
238 |     "\n",
239 |     "        # and write this row of data into the CSV file\n",
240 |     "        writer.writerow(data)"
241 |    ]
242 |   }
243 |  ],
244 |  "metadata": {
245 |   "kernelspec": {
246 |    "display_name": "Python 3 (ipykernel)",
247 |    "language": "python",
248 |    "name": "python3"
249 |   },
250 |   "language_info": {
251 |    "codemirror_mode": {
252 |     "name": "ipython",
253 |     "version": 3
254 |    },
255 |    "file_extension": ".py",
256 |    "mimetype": "text/x-python",
257 |    "name": "python",
258 |    "nbconvert_exporter": "python",
259 |    "pygments_lexer": "ipython3",
260 |    "version": "3.10.9"
261 |   }
262 |  },
263 |  "nbformat": 4,
264 |  "nbformat_minor": 5
265 | }
266 | 


--------------------------------------------------------------------------------
/ire-board/IRE Board members - working.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "4917bfba",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# IRE Board members\n",
  9 |     "\n",
 10 |     "The goal: Scrape [this list of IRE board members](https://www.ire.org/about-ire/past-ire-board-members/) into a CSV.\n",
 11 |     "\n",
 12 |     "This project introduces a few new concepts:\n",
 13 |     "- Scraping data that's not part of a table\n",
 14 |     "- Specifying custom request headers to evade a bot detection rule on our server\n",
 15 |     "- Using string methods and default values when parsing out the data\n",
 16 |     "\n",
 17 |     "[The completed version is here](IRE%20Board%20members%20-%20complete.ipynb).\n",
 18 |     "\n",
 19 |     "([See also this standalone version featuring a few more advanced techniques](/edit/ire-board/ire_board_scrape.py).)"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "id": "bfd3d8c7",
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "# stdlib library we'll use to write the CSV file\n",
 30 |     "import csv\n",
 31 |     "\n",
 32 |     "# installed library to handle the HTTP traffic\n",
 33 |     "import requests\n",
 34 |     "\n",
 35 |     "# installed library to parse the HTML\n",
 36 |     "from bs4 import BeautifulSoup"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": null,
 42 |    "id": "1acd7756",
 43 |    "metadata": {},
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "URL = 'https://www.ire.org/about-ire/past-ire-board-members/'"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "id": "434e47d8",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "# make the request\n",
 57 |     "\n",
 58 |     "# check for HTTP errors"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": null,
 64 |    "id": "accded42",
 65 |    "metadata": {},
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "# set up request headers with a custom user-agent string\n"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "id": "03294e7e",
 75 |    "metadata": {},
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "# try the request again, with the new headers\n",
 79 |     "\n",
 80 |     "\n",
 81 |     "# and raise for errors\n"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "id": "e5c65871",
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "# parse the HTML into soup\n"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": null,
 97 |    "id": "7c3f3e35",
 98 |    "metadata": {},
 99 |    "outputs": [],
100 |    "source": []
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "id": "73db6014",
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "# search the HTML tree to find the div\n",
110 |     "# with the `id` attribute of \"past-ire-board-members\"\n"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "id": "df88000b",
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": []
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "id": "4ad3f74f",
125 |    "metadata": {},
126 |    "outputs": [],
127 |    "source": [
128 |     "# within that div, find all the paragraph tags\n"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "id": "7b51b34b",
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": []
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": null,
142 |    "id": "c0058f3a",
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "# noodle around here to isolate the pieces of data for export"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "code",
151 |    "execution_count": null,
152 |    "id": "6ec1c43b",
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": []
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": null,
160 |    "id": "a8373c6c",
161 |    "metadata": {},
162 |    "outputs": [],
163 |    "source": []
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "id": "7cfb1966",
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": []
172 |   },
173 |   {
174 |    "cell_type": "code",
175 |    "execution_count": null,
176 |    "id": "f35f134d",
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": []
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": null,
184 |    "id": "cb711ee3",
185 |    "metadata": {},
186 |    "outputs": [],
187 |    "source": [
188 |     "# set up the CSV headers to write to file\n"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": null,
194 |    "id": "787cb02f",
195 |    "metadata": {},
196 |    "outputs": [],
197 |    "source": [
198 |     "# next, set up the file to write the CSV data into\n",
199 |     "# https://docs.python.org/3/library/csv.html#csv.writer\n",
200 |     "\n",
201 |     "# open the CSV file in write ('w') mode, specifying newline='' to deal with\n",
202 |     "# potential PC-only line ending problem\n",
203 |     "\n",
204 |     "\n",
205 |     "    # set up a csv.writer object tied to the file we just opened\n",
206 |     "\n",
207 |     "\n",
208 |     "    # write the list of headers\n",
209 |     "\n",
210 |     "\n",
211 |     "    # loop over the list of paragraphs we targeted above\n",
212 |     "\n",
213 |     "\n",
214 |     "        # we don't want the entire Tag object, just the text\n",
215 |     "\n",
216 |     "\n",
217 |     "        # set up some default values -- the member was not president\n",
218 |     "\n",
219 |     "\n",
220 |     "        # and is not deceased\n",
221 |     "\n",
222 |     "\n",
223 |     "        # IRE denotes past presidents with a leading asterisk\n",
224 |     "        # so check to see if the string startswith '*'\n",
225 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=startswith#str.startswith\n",
226 |     "\n",
227 |     "\n",
228 |     "            # if so, switch the value for the `was_president` variable to True\n",
229 |     "\n",
230 |     "\n",
231 |     "        # check to see if \"(dec)\" is anywhere in the text, which\n",
232 |     "        # indicates this person is deceased\n",
233 |     "        # https://docs.python.org/3/reference/expressions.html#in\n",
234 |     "\n",
235 |     "\n",
236 |     "        # next, start parsing out the pieces\n",
237 |     "        # separate the name from the terms by splitting on \"(\"\n",
238 |     "\n",
239 |     "\n",
240 |     "        # the name will be the first ([0]) item in the resulting list\n",
241 |     "        # while we're at it, strip off any leading asterisks\n",
242 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.lstrip\n",
243 |     "        # and strip() off any leading or trailing whitespace\n",
244 |     "        # https://docs.python.org/3/library/stdtypes.html?highlight=lstrip#str.strip\n",
245 |     "\n",
246 |     "\n",
247 |     "        # the term(s) of service will be the second item ([1]) in that list\n",
248 |     "        # and the term text is always terminated with a closing parens\n",
249 |     "        # so splitting on that closing parens and taking the first ([0])\n",
250 |     "        # item in the list will give us the term(s)\n",
251 |     "\n",
252 |     "\n",
253 |     "        # put the collected data into a list\n",
254 |     "\n",
255 |     "\n",
256 |     "        # and write this row of data into the CSV file\n"
257 |    ]
258 |   }
259 |  ],
260 |  "metadata": {
261 |   "kernelspec": {
262 |    "display_name": "Python 3 (ipykernel)",
263 |    "language": "python",
264 |    "name": "python3"
265 |   },
266 |   "language_info": {
267 |    "codemirror_mode": {
268 |     "name": "ipython",
269 |     "version": 3
270 |    },
271 |    "file_extension": ".py",
272 |    "mimetype": "text/x-python",
273 |    "name": "python",
274 |    "nbconvert_exporter": "python",
275 |    "pygments_lexer": "ipython3",
276 |    "version": "3.10.9"
277 |   }
278 |  },
279 |  "nbformat": 4,
280 |  "nbformat_minor": 5
281 | }
282 | 


--------------------------------------------------------------------------------
/ire-board/ire_board_scrape.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This version demonstrates a few more advanced techniques -- inline comments are mainly for stuff not covered in the basic script:
  3 | - Separation of concerns: Writing a function to handle each task -- downloading the page and scraping the data -- and setting up the script to allow those functions to be imported into other scripts, if that need should ever arise
  4 | - Doing a little more text processing to break the name into last/rest components, and to separate out terms of service, so now the atomic observation being written to file is a term of service, not a board member
  5 | - Using csv.DictWriter instead of csv.writer
  6 | - Demonstrating a few other useful Python techniques, such as list comprehensions, multiple assignment, star unpacking and custom list sorting
  7 | '''
  8 | 
  9 | import os
 10 | import csv
 11 | 
 12 | import requests
 13 | from bs4 import BeautifulSoup
 14 | 
 15 | 
 16 | def download_page(url, html_file_out):
 17 | 
 18 |     if not os.path.exists(html_file_out):
 19 | 
 20 |         headers = {
 21 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'  # noqa
 22 |         }
 23 | 
 24 |         r = requests.get(
 25 |             url,
 26 |             headers=headers
 27 |         )
 28 | 
 29 |         r.raise_for_status()
 30 | 
 31 |         with open(html_file_out, 'w') as outfile:
 32 |             outfile.write(r.text)
 33 | 
 34 |         print(f'Downloaded {html_file_out}')
 35 | 
 36 |     return html_file_out
 37 | 
 38 | 
 39 | def parse_data(html_file_in, csv_file_out):
 40 |     with open(html_file_in, 'r') as infile:
 41 |         html = infile.read()
 42 | 
 43 |     soup = BeautifulSoup(
 44 |         html,
 45 |         'html.parser'
 46 |     )
 47 | 
 48 |     target_div = soup.find(
 49 |         'div',
 50 |         {'id': 'past-ire-board-members'}
 51 |     )
 52 | 
 53 |     # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
 54 |     members = [x.text.strip() for x in target_div.find_all('p')]
 55 | 
 56 |     csv_headers = [
 57 |         'name_last',
 58 |         'name_rest',
 59 |         'term_start',
 60 |         'term_end',
 61 |         'was_president',
 62 |         'is_deceased'
 63 |     ]
 64 | 
 65 |     # start an empty list to hold records to write
 66 |     parsed_member_data = []
 67 | 
 68 |     # loop over member text
 69 |     for member in members:
 70 | 
 71 |         was_president = False
 72 |         is_deceased = False
 73 | 
 74 |         if member.startswith('*'):
 75 |             was_president = True
 76 | 
 77 |         if '(dec)' in member:
 78 |             is_deceased = True
 79 | 
 80 |         # https://exercism.org/tracks/python/concepts/unpacking-and-multiple-assignment
 81 |         # https://docs.python.org/3/tutorial/controlflow.html?highlight=unpack#unpacking-argument-lists
 82 |         # here, the value attached to the `rest` var is ignored
 83 |         name, terms, *rest = member.split('(')
 84 | 
 85 |         name_clean = name.lstrip('*').strip()
 86 |         terms_clean = terms.split(')')[0]
 87 | 
 88 |         # split the name into last, rest
 89 |         name_split = name_clean.rsplit(' ', 1)
 90 | 
 91 |         # handle generational suffixes
 92 |         if name_split[-1] == 'Jr.':
 93 |             name_split = name_split[0].rsplit(' ', 1)
 94 |             name_split[0] += ' Jr.'
 95 |         
 96 |         rest, last = name_split
 97 | 
 98 |         # loop over the terms of service
 99 |         for term in terms_clean.split(','):
100 |             term_start, term_end = term.strip().split('-')
101 | 
102 |             # create a dict by zipping together the headers with the list of data
103 |             data = dict(zip(csv_headers, [
104 |                 last,
105 |                 rest,
106 |                 term_start,
107 |                 term_end,
108 |                 was_president,
109 |                 is_deceased
110 |             ]))
111 | 
112 |             # add the dict to the main list
113 |             parsed_member_data.append(data)
114 | 
115 |     # sort member data by last name, then first name, then term start
116 |     data_sorted = sorted(
117 |         parsed_member_data,
118 |         key=lambda x: (
119 |             x['name_last'],
120 |             x['name_rest'],
121 |             x['term_start']
122 |         )
123 |     )
124 | 
125 |     # write to file, specifying the encoding and
126 |     # dealing with a Windows-specific problem that
127 |     # sometimes pops up when writing to file
128 |     with open(csv_file_out, 'w', encoding='utf-8', newline='') as outfile:
129 |         writer = csv.DictWriter(
130 |             outfile,
131 |             fieldnames=csv_headers
132 |         )
133 |         writer.writeheader()
134 |         writer.writerows(data_sorted)
135 | 
136 |     print(f'Wrote {csv_file_out}')
137 | 
138 | 
139 | # https://realpython.com/if-name-main-python/
140 | if __name__ == '__main__':
141 | 
142 |     url = 'https://www.ire.org/about-ire/past-ire-board-members/'
143 | 
144 |     # https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals
145 |     files_name = 'ire-board'
146 |     filename_page = f'{files_name}.html'
147 |     filename_csv = f'{files_name}-terms.csv'
148 | 
149 |     # call the functions
150 |     download_page(url, filename_page)
151 |     parse_data(filename_page, filename_csv)
152 | 


--------------------------------------------------------------------------------
/md-warn-notices/Maryland WARN Notices - multiple pages.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "id": "8fa7c11c",
 6 |    "metadata": {},
 7 |    "source": [
 8 |     "# Maryland WARN Notices - multiple pages\n",
 9 |     "\n",
10 |     "Extra credit: Figure out how to target and extract WARN data for multiple years. The process:\n",
11 |     "- Using `requests`, fetch the main page\n",
12 |     "- Using `bs4`, target the list of links to pages with data for previous years\n",
13 |     "- Using a `for` loop, iterate over each link\n",
14 |     "    - Fetch the page\n",
15 |     "    - Turn the contents into `soup`\n",
16 |     "    - Target the elements to extract\n",
17 |     "    - Add the parsed data to your list"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "code",
22 |    "execution_count": null,
23 |    "id": "7b6c6fb2",
24 |    "metadata": {},
25 |    "outputs": [],
26 |    "source": []
27 |   }
28 |  ],
29 |  "metadata": {
30 |   "kernelspec": {
31 |    "display_name": "Python 3 (ipykernel)",
32 |    "language": "python",
33 |    "name": "python3"
34 |   },
35 |   "language_info": {
36 |    "codemirror_mode": {
37 |     "name": "ipython",
38 |     "version": 3
39 |    },
40 |    "file_extension": ".py",
41 |    "mimetype": "text/x-python",
42 |    "name": "python",
43 |    "nbconvert_exporter": "python",
44 |    "pygments_lexer": "ipython3",
45 |    "version": "3.10.9"
46 |   }
47 |  },
48 |  "nbformat": 4,
49 |  "nbformat_minor": 5
50 | }
51 | 


--------------------------------------------------------------------------------
/md-warn-notices/Maryland WARN Notices.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Maryland WARN Notices\n",
  8 |     "\n",
  9 |     "The goal: Scrape the main table on [the first page of Maryland's list of WARN letters](https://www.dllr.state.md.us/employment/warn.shtml) and, if time, write the data to a CSV.\n",
 10 |     "\n",
 11 |     "### Table of contents\n",
 12 |     "\n",
 13 |     "- [Using Jupyter notebooks](#Using-Jupyter-notebooks)\n",
 14 |     "- [What _is_ a web page, anyway?](#What-is-a-web-page,-anyway?)\n",
 15 |     "- [Inspect the source](#Inspect-the-source)\n",
 16 |     "- [Import libraries](#Import-libraries)\n",
 17 |     "- [Request the page](#Request-the-page)\n",
 18 |     "- [Turn your HTML into soup](#Turn-your-HTML-into-soup)\n",
 19 |     "- [Targeting and extracting data](#Targeting-and-extracting-data)\n",
 20 |     "- [Write the results to file](#Write-the-results-to-file)"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "### Using Jupyter notebooks\n",
 28 |     "\n",
 29 |     "There are several ways to write and run Python code on your computer. One way -- the method we're using today -- is to use [Jupyter notebooks](https://jupyter.org/), which run in your browser and allow you to intersperse documentation with your code. They're handy for bundling your code with a human-readable explanation of what's happening at each step. Check out some examples from the [L.A. Times](https://github.com/datadesk/notebooks) and [BuzzFeed News](https://github.com/BuzzFeedNews/everything#data-and-analyses).\n",
 30 |     "\n",
 31 |     "**To add a new cell to your notebook**: Click the + button in the menu or press the `b` button on your keyboard.\n",
 32 |     "\n",
 33 |     "**To run a cell of code**: Select the cell and click the \"Run\" button in the menu, or you can press Shift+Enter.\n",
 34 |     "\n",
 35 |     "**One common gotcha**: The notebook doesn't \"know\" about code you've written until you've _run_ the cell containing it. For example, if you define a variable called `my_name` in one cell, and later, when you try to access that variable in another cell but get an error that says `NameError: name 'my_name' is not defined`, the most likely solution is to run (or re-run) the cell in which you defined `my_name`."
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "markdown",
 40 |    "metadata": {},
 41 |    "source": [
 42 |     "### What _is_ a web page, anyway?\n",
 43 |     "\n",
 44 |     "Generally, a web page consists of a bunch of specifically formatted text files stored on a computer (a _server_) that's probably sitting on a rack in a giant data center somewhere.\n",
 45 |     "\n",
 46 |     "Mostly you'll be dealing with `.html` (HyperText Markup Language) files that might include references to `.css` (Cascading Style Sheet) files, which determine how the page looks, and/or `.js` (JavaScript) files, which add interactivity, and other specially formatted text files.\n",
 47 |     "\n",
 48 |     "Today, we'll focus on the HTML, which gives structure to the page.\n",
 49 |     "\n",
 50 |     "Most HTML elements are represented by a pair of tags -- an opening tag and a closing tag.\n",
 51 |     "\n",
 52 |     "A table, for example, starts with `<table>` and ends with `</table>`. The first tag tells the browser: \"Hey! I got a table here! Render it as a table.\" The closing tag (note the forward slash!) tells the browser: \"Hey! I'm all done with that table, thanks.\" Inside the table are nested more HTML tags representing rows (`<tr>`) and cells (`<td>`).\n",
 53 |     "\n",
 54 |     "HTML elements can have any number of attributes, such as classes --\n",
 55 |     "\n",
 56 |     "`<table class=\"cool-table\">`\n",
 57 |     "\n",
 58 |     "-- styles --\n",
 59 |     "\n",
 60 |     "`<table style=\"width:95%;\">`\n",
 61 |     "\n",
 62 |     "-- hyperlinks to other pages --\n",
 63 |     "\n",
 64 |     "`<a href=\"https://ire.org\">Click here to visit IRE's website</a>`\n",
 65 |     "\n",
 66 |     "-- and IDs --\n",
 67 |     "\n",
 68 |     "`<table id=\"cool-table\">`\n",
 69 |     "\n",
 70 |     "-- that will be useful to know about when we're scraping."
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "### Inspect the source\n",
 78 |     "\n",
 79 |     "You can look at the HTML that makes up a web page by _inspecting the source_ in a web browser. We like Chrome and Firefox for this; today, we'll use Chrome.\n",
 80 |     "\n",
 81 |     "You can inspect specific elements on the page by right-clicking on the page and selecting \"Inspect\" or \"Inspect Element\" from the context menu that pops up. Hover over elements in the \"Elements\" tab to highlight them on the page.\n",
 82 |     "\n",
 83 |     "To examine all of the source code that makes up a page, you can \"view source.\" In Chrome, hit `Ctrl+U` on a PC or `⌘+Opt+U` on a Mac. (It's also in the menu bar: View > Developer > View Page Source.)\n",
 84 |     "\n",
 85 |     "You'll get a page showing you all of the HTML code that makes up that page. Ignore 99% of it and try to locate the element(s) that you want to target (use `Ctrl+F` on a PC and `⌘+F` to find).\n",
 86 |     "\n",
 87 |     "Open up a Chrome browser and inspect the table on the [the first page of Maryland's list of WARN letters](https://www.dllr.state.md.us/employment/warn.shtml). Find the table we want to scrape.\n",
 88 |     "\n",
 89 |     "Is it the only table on the page? If not, does it have any attributes that would allow you to target it?"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "### Import libraries\n",
 97 |     "\n",
 98 |     "Step one is to _import_ two third-party Python libraries that will help us scrape this page:\n",
 99 |     "- `requests` is the de facto standard for making HTTP requests, similar to what happens when you type a URL into a browser window and hit enter.\n",
100 |     "- `bs4`, or BeautifulSoup, is a popular library for parsing HTML into a data structure that Python can work with.\n",
101 |     "\n",
102 |     "These libraries are installed separately from Python on a per-project basis ([read more about our recommendations for setting up Python projects here](https://docs.google.com/document/d/1cYmpfZEZ8r-09Q6Go917cKVcQk_d0P61gm0q8DAdIdg/edit#heading=h.od2v1nkge5t1)).\n",
103 |     "\n",
104 |     "Run this cell (you'll only have to do this once):"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": null,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "import requests\n",
114 |     "import bs4"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "### Request the page\n",
122 |     "\n",
123 |     "Next, we'll use the `get()` method of the `requests` library (which we just imported) to grab the web page.\n",
124 |     "\n",
125 |     "While we're at it, we'll _assign_ all the stuff that comes back to a new variable using `=`.\n",
126 |     "\n",
127 |     "The variable name is arbitrary, but it's usually good to pick something that describes the value it's pointing to.\n",
128 |     "\n",
129 |     "Notice that the URL we're grabbing is wrapped in quotes, making it a _string_ that Python will interepret as text (as opposed to numbers, booleans, etc.). You can read up more on Python data types and variable assignment [here](Python%20syntax%20cheat%20sheet.ipynb).\n",
130 |     "\n",
131 |     "Run these two cells:"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "URL = 'http://www.dllr.state.md.us/employment/warn.shtml'"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "warn_page = requests.get(URL)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "Nothing appears to have happened, which is (usually) a good sign.\n",
157 |     "\n",
158 |     "If you want to make sure that your request was successful, you can check the `status_code` attribute of the Python object that was returned:"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": null,
164 |    "metadata": {},
165 |    "outputs": [],
166 |    "source": [
167 |     "warn_page.status_code"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "A `200` code means all is well. `404` means the page wasn't found, etc. ([Here's one of our favorite lists of HTTP status codes](https://http.cat/) ([or here, if you prefer dogs](https://httpstatusdogs.com/)).)\n",
175 |     "\n",
176 |     "The object being stored as the `warn_page` variable came back with a lot of potentially useful information we could access. Today, we're mostly interested in the `.text` attribute -- the HTML that makes up the web page, same as if we'd viewed the page source. Let's take a look:"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": null,
182 |    "metadata": {},
183 |    "outputs": [],
184 |    "source": [
185 |     "warn_page.text"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "metadata": {},
191 |    "source": [
192 |     "### ✍️ Try it yourself\n",
193 |     "\n",
194 |     "Use the code blocks below to experiment with requesting web pages and checking out the HTML that gets returned.\n",
195 |     "\n",
196 |     "Some ideas to get you started:\n",
197 |     "- `'http://ire.org'`\n",
198 |     "- `'https://web.archive.org/web/20031202214318/http://www.tdcj.state.tx.us:80/stat/finalmeals.htm'`\n",
199 |     "- `'https://en.wikipedia.org/w/index.php?title=List_of_animal_names'`"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": []
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": []
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": []
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": []
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": []
236 |   },
237 |   {
238 |    "cell_type": "markdown",
239 |    "metadata": {},
240 |    "source": [
241 |     "### Turn your HTML into soup\n",
242 |     "\n",
243 |     "The HTML in the `.text` attribute of the request object is just a string -- a big ol' chunk of text.\n",
244 |     "\n",
245 |     "Before we start targeting and extracting pieces of data in the HTML, we need to turn that chunk of text into a data structure that Python can work with. That's where the [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) (`bs4`) library comes in.\n",
246 |     "\n",
247 |     "We'll create a new instance of a `BeautifulSoup` object, which lives under the top-level `bs4` library that we imported earlier. We need to give it two things:\n",
248 |     "- The HTML we'd like to parse -- `warn_page.text`\n",
249 |     "- A string with the name of the type of parser to use -- `html.parser` is the default and usually fine, but [there are other options](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser)\n",
250 |     "\n",
251 |     "We'll save the parsed HTML as a new variable, `soup`."
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "code",
256 |    "execution_count": null,
257 |    "metadata": {},
258 |    "outputs": [],
259 |    "source": [
260 |     "soup = bs4.BeautifulSoup(warn_page.text, 'html.parser')"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "markdown",
265 |    "metadata": {},
266 |    "source": [
267 |     "Nothing happened, which is good! You can take a look at what `soup` is, but it looks pretty much like `warn_page.text`:"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": null,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "soup"
277 |    ]
278 |   },
279 |   {
280 |    "cell_type": "markdown",
281 |    "metadata": {},
282 |    "source": [
283 |     "If you want to be sure, you can use the Python function `type()` to check what sort of object you're dealing with:"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "# the `str` type means a string, or text\n",
293 |     "type(warn_page.text)"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "# the `bs4.BeautifulSoup` type means we successfully created the object\n",
303 |     "type(soup)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "markdown",
308 |    "metadata": {},
309 |    "source": [
310 |     "### ✍️ Try it yourself\n",
311 |     "\n",
312 |     "Use the code blocks below to experiment fetching HTML and turning it into soup (if you fetched some pages earlier and saved them as variables, that'd be a good start)."
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {},
319 |    "outputs": [],
320 |    "source": []
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": null,
325 |    "metadata": {},
326 |    "outputs": [],
327 |    "source": []
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {},
333 |    "outputs": [],
334 |    "source": []
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": []
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": null,
346 |    "metadata": {},
347 |    "outputs": [],
348 |    "source": []
349 |   },
350 |   {
351 |    "cell_type": "markdown",
352 |    "metadata": {},
353 |    "source": [
354 |     "### Targeting and extracting data\n",
355 |     "\n",
356 |     "Now that we have BeautifulSoup object loaded up, we can go hunting for the specific HTML elements that contain the data we need. Our general strategy:\n",
357 |     "1. Find the main table with the data we want to grab\n",
358 |     "2. Get a list of rows (the `tr` element, which stands for \"table row\") in that table\n",
359 |     "3. Use a Python `for loop` to go through each table row and find the data inside it (`td`, or \"table data\")\n",
360 |     "\n",
361 |     "To accomplish this, we'll use two `bs4` methods:\n",
362 |     "- [`find()`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find), which returns the first element that matches whatever criteria you hand it\n",
363 |     "- [`find_all()`](https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find-all), which returns a _list_ of elements that match the criteria. ([Here's how Python lists work](Python%20syntax%20cheat%20sheet.ipynb#Lists).)"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "markdown",
368 |    "metadata": {},
369 |    "source": [
370 |     "#### Find the table\n",
371 |     "\n",
372 |     "To start with, we need to find the table. There are several ways to accomplish this, but because this is the only table on the page (view source and `Ctrl+F` to search for `<table` to confirm), we can simply say, \"Look through the `soup` object and find the table tag.\"\n",
373 |     "\n",
374 |     "Translated, the code is: `soup.find('table')`. While we're at it, save the results of that search to a new variable, `table`.\n",
375 |     "\n",
376 |     "Run these cells:"
377 |    ]
378 |   },
379 |   {
380 |    "cell_type": "code",
381 |    "execution_count": null,
382 |    "metadata": {},
383 |    "outputs": [],
384 |    "source": [
385 |     "table = soup.find('table')"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": null,
391 |    "metadata": {},
392 |    "outputs": [],
393 |    "source": [
394 |     "table"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "markdown",
399 |    "metadata": {},
400 |    "source": [
401 |     "#### Find the rows in the table\n",
402 |     "\n",
403 |     "Next, use the `find_all()` method to drill down and get a list of rows in the table:"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": null,
409 |    "metadata": {},
410 |    "outputs": [],
411 |    "source": [
412 |     "rows = table.find_all('tr')"
413 |    ]
414 |   },
415 |   {
416 |    "cell_type": "code",
417 |    "execution_count": null,
418 |    "metadata": {},
419 |    "outputs": [],
420 |    "source": [
421 |     "rows"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "markdown",
426 |    "metadata": {},
427 |    "source": [
428 |     "To see how many items are in this list -- in other words, how many rows are in the table -- you can use the `len()` function:"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": null,
434 |    "metadata": {},
435 |    "outputs": [],
436 |    "source": [
437 |     "len(rows)"
438 |    ]
439 |   },
440 |   {
441 |    "cell_type": "markdown",
442 |    "metadata": {},
443 |    "source": [
444 |     "#### Loop through the rows and extract the data\n",
445 |     "\n",
446 |     "Next, we can use a [`for` loop](Python%20syntax%20cheat%20sheet.ipynb#for-loops) to go through the list of rows and start grabbing data from each one.\n",
447 |     "\n",
448 |     "Quick refresher on _for loop_ syntax: Start with the word `for` (lowercase), then a variable name to stand in for each item in the list that you're looping over, then the word `in` (lowercase), then the name of the list holding the items (`rows`, in our case), then a colon, then an indented block of code describing what we're doing to each item in the list.\n",
449 |     "\n",
450 |     "Each piece of data in the row will be stored in a `td` tag, which stands for \"table data.\" So inside the loop -- in the indented block -- we'll use the `find_all()` method to get a list of every `td` tag inside the row. And from there, we can access the content inside each tag.\n",
451 |     "\n",
452 |     "Our goal is to end up with a _list_ of data for each row that we will eventually write out to a file. Typically you'd probably do the work of looping and inspecting the results, step by step, in one code cell. But to show the thinking of how you might approach this (and to practice the syntax), we'll start by just printing out each row and then build from there. (`print()` will print a blank line to help us see exactly what we're working with in each row.)"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": null,
458 |    "metadata": {},
459 |    "outputs": [],
460 |    "source": [
461 |     "for row in rows:\n",
462 |     "    print(row)\n",
463 |     "    print()"
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "markdown",
468 |    "metadata": {},
469 |    "source": [
470 |     "Notice that the first item that prints is the header row with the column labels. You are free to keep these headers if you want, but I typically skip that row and define my own list of column names.\n",
471 |     "\n",
472 |     "(Another thing to consider: On better-constructed web pages, the cells in the header row will be represented by `th` (\"table header\") tags, not `td` (\"table data\") tags. The next step in our `for` loop is, \"Find all of the `td` tags in this row,\" so that would be something you would need to deal with.)\n",
473 |     "\n",
474 |     "We can skip the first row by using _list slicing_: adding square brackets after the name of the list with some instructions about which items in the list we want to select.\n",
475 |     "\n",
476 |     "Here, the syntax would be: `rows[1:]`, which means, take everything in the `rows` list starting with the item in position 1 (the second item) to the end of the list. Like many programming languages, Python starts counting at 0, so the result will leave off the first item in the list -- i.e. the item in position 0, i.e. the headers."
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": null,
482 |    "metadata": {},
483 |    "outputs": [],
484 |    "source": [
485 |     "for row in rows[1:]:\n",
486 |     "    print(row)\n",
487 |     "    print()"
488 |    ]
489 |   },
490 |   {
491 |    "cell_type": "markdown",
492 |    "metadata": {},
493 |    "source": [
494 |     "Now we're cooking with gas. Let's start pulling out the data in each row. Start by using `find_all()` to grab a list of `td` tags:"
495 |    ]
496 |   },
497 |   {
498 |    "cell_type": "code",
499 |    "execution_count": null,
500 |    "metadata": {},
501 |    "outputs": [],
502 |    "source": [
503 |     "for row in rows[1:]:\n",
504 |     "    cells = row.find_all('td')\n",
505 |     "    print(cells)\n",
506 |     "    print()"
507 |    ]
508 |   },
509 |   {
510 |    "cell_type": "markdown",
511 |    "metadata": {},
512 |    "source": [
513 |     "Now we have, for each row, a _list_ of `td` tags. Next step is to look at the table and start grabbing specific values based on their position in the list and assigning them to human-readable variable names.\n",
514 |     "\n",
515 |     "Quick refresher on list syntax: To access a specific item in a list, use square brackets `[]` and the index number of the item you'd like to access. For instance, to get the first cell in the row -- the date that each WARN report was issued -- use `[0]`."
516 |    ]
517 |   },
518 |   {
519 |    "cell_type": "code",
520 |    "execution_count": null,
521 |    "metadata": {
522 |     "scrolled": true
523 |    },
524 |    "outputs": [],
525 |    "source": [
526 |     "for row in rows[1:]:\n",
527 |     "    cells = row.find_all('td')\n",
528 |     "    warn_date = cells[0]\n",
529 |     "    print(warn_date)\n",
530 |     "    print()"
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "markdown",
535 |    "metadata": {},
536 |    "source": [
537 |     "This is returning the entire `Tag` object -- we just want the contents inside it. You can access the `.text` attribute of the tag to get the text inside:"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "code",
542 |    "execution_count": null,
543 |    "metadata": {
544 |     "scrolled": true
545 |    },
546 |    "outputs": [],
547 |    "source": [
548 |     "for row in rows[1:]:\n",
549 |     "    cells = row.find_all('td')\n",
550 |     "    warn_date = cells[0].text    \n",
551 |     "    print(warn_date)"
552 |    ]
553 |   },
554 |   {
555 |    "cell_type": "markdown",
556 |    "metadata": {},
557 |    "source": [
558 |     "In the next cell (`[1]`), the `.text` attribute will give you the NAICS code. In the third cell (`[2]`) you'll get the name of the business. Etc.\n",
559 |     "\n",
560 |     "It's also generally good practice to trim off external whitespace for each value, and you can use the Python built-in string method `strip()` to accomplish this as you march across the row.\n",
561 |     "\n",
562 |     "Which gets us this far:"
563 |    ]
564 |   },
565 |   {
566 |    "cell_type": "code",
567 |    "execution_count": null,
568 |    "metadata": {},
569 |    "outputs": [],
570 |    "source": [
571 |     "for row in rows[1:]:\n",
572 |     "    cells = row.find_all('td')\n",
573 |     "    warn_date = cells[0].text.strip()\n",
574 |     "    naics_code = cells[1].text.strip()\n",
575 |     "    biz = cells[2].text.strip()\n",
576 |     "    print(warn_date, naics_code, biz)"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "markdown",
581 |    "metadata": {},
582 |    "source": [
583 |     "### ✍️ Try it yourself\n",
584 |     "\n",
585 |     "Now that you've gotten this far, see if you can isolate the other pieces of data in each row."
586 |    ]
587 |   },
588 |   {
589 |    "cell_type": "code",
590 |    "execution_count": null,
591 |    "metadata": {},
592 |    "outputs": [],
593 |    "source": [
594 |     "for row in rows[1:]:\n",
595 |     "    cells = row.find_all('td')\n",
596 |     "    warn_date = cells[0].text.strip()\n",
597 |     "    naics_code = cells[1].text.strip()\n",
598 |     "    biz = cells[2].text.strip()\n",
599 |     "    \n",
600 |     "    # address\n",
601 |     "    \n",
602 |     "    # wia_code\n",
603 |     "    \n",
604 |     "    # total_employees\n",
605 |     "    \n",
606 |     "    # effective_date\n",
607 |     "    \n",
608 |     "    # type_code\n",
609 |     "\n",
610 |     "    print(warn_date, naics_code, biz)"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "markdown",
615 |    "metadata": {},
616 |    "source": [
617 |     "### Write the results to file\n",
618 |     "\n",
619 |     "Now that we've targeted our lists of data for each row, we can use Python's built-in [`csv`](https://docs.python.org/3/library/csv.html) module to write each list to a CSV file.\n",
620 |     "\n",
621 |     "First, import the csv module. (Typically, the convention is to handle all of your import statements at the top of your script, but it's no big deal if you do it here.)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": null,
627 |    "metadata": {},
628 |    "outputs": [],
629 |    "source": [
630 |     "import csv"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "markdown",
635 |    "metadata": {},
636 |    "source": [
637 |     "Now define a list of headers to match the data (each column header will be a string) -- run this cell:"
638 |    ]
639 |   },
640 |   {
641 |    "cell_type": "code",
642 |    "execution_count": null,
643 |    "metadata": {},
644 |    "outputs": [],
645 |    "source": [
646 |     "HEADERS = [\n",
647 |     "    'warn_date',\n",
648 |     "    'naics_code',\n",
649 |     "    'biz',\n",
650 |     "    'address',\n",
651 |     "    'wia_code',\n",
652 |     "    'total_employees',\n",
653 |     "    'effective_date',\n",
654 |     "    'type_code'\n",
655 |     "]"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "markdown",
660 |    "metadata": {},
661 |    "source": [
662 |     "Now, using something called a `with` block, open a new CSV file to write to and write some code to do the following things:\n",
663 |     "- Create a `csv.writer` object\n",
664 |     "- Write out the list of headers using the `writerow()` method of the `csv.writer` object\n",
665 |     "- Drop in the `for` loop you just wrote and, instead of just printing the contents of each cell, create a list of items and use the `writerow()` method of the `csv.writer` object to write your list of data to file"
666 |    ]
667 |   },
668 |   {
669 |    "cell_type": "code",
670 |    "execution_count": null,
671 |    "metadata": {},
672 |    "outputs": [],
673 |    "source": [
674 |     "# create a file called 'warn-data.csv' in write ('w') mode\n",
675 |     "# specify that newlines are terminated by an empty string (this deals with a PC-specific problem)\n",
676 |     "# and use the `as` keyword to name the open file handler (the variable name `outfile` is arbitrary)\n",
677 |     "with open('warn-data.csv', 'w', newline='') as outfile:\n",
678 |     "    # go to the csv module we imported and make a new .writer object attached to the open file\n",
679 |     "    # and save it to a variable\n",
680 |     "    writer = csv.writer(outfile)\n",
681 |     "\n",
682 |     "    # write out the list of headers\n",
683 |     "    writer.writerow(HEADERS)\n",
684 |     "    \n",
685 |     "    # paste in the for loop you wrote earlier here -- watch the indentation!\n",
686 |     "    # it should be at this indentation level =>\n",
687 |     "    # for row in rows[1:]:\n",
688 |     "    #     cells = row.find_all('td')\n",
689 |     "    #     etc. ...\n",
690 |     "    # but at the end, instead of `print(warn_date, naics_code, ...etc.)`\n",
691 |     "    # make it something like\n",
692 |     "    # data_out = [warn_date, naics_code, ...etc.]\n",
693 |     "    # `writer.writerow(data_out)`"
694 |    ]
695 |   },
696 |   {
697 |    "cell_type": "markdown",
698 |    "metadata": {},
699 |    "source": [
700 |     "If you look in the folder, you should see a new file: `warn-data.csv`. Hooray!\n",
701 |     "\n",
702 |     "🎉 🎉 🎉"
703 |    ]
704 |   },
705 |   {
706 |    "cell_type": "markdown",
707 |    "metadata": {},
708 |    "source": [
709 |     "### Extra credit problems\n",
710 |     "\n",
711 |     "1. **Remove internal whitespace:** Looking over the data, you probably noticed that some of the values have some unnecessary internal whitespace, which you could fix before you wrote each row to file. Python does not have a built-in string method to remove internal whitespace, unfortunately, but [Googling around](https://www.google.com/search?q=python+remove+internal+whitespace) will yield you a common strategy: Using the `split()` method to separate individual words in the string, then `join()`ing the resulting list on a single space. As an example:\n",
712 |     "\n",
713 |     "```python\n",
714 |     "my_text = 'hello     world      how are      you?'\n",
715 |     "\n",
716 |     "# split() will turn this into a list of words\n",
717 |     "my_text_words = my_text.split()\n",
718 |     "# ['hello', 'world', 'how', 'are', 'you?']\n",
719 |     "\n",
720 |     "# join on a single space\n",
721 |     "my_text_clean = ' '.join(my_text_words)\n",
722 |     "print(my_text_clean)\n",
723 |     "# prints 'hello world how are you?'\n",
724 |     "\n",
725 |     "# or, as a one-liner\n",
726 |     "my_text_clean = ' '.join(my_text.split())\n",
727 |     "```\n",
728 |     "\n",
729 |     "2. **Fetch multiple years:** The table we scraped has WARN notices for the current year, but the agency also maintains pages with WARN notices for previous years -- there's a list of them in a section [toward the bottom of the page](https://www.dllr.state.md.us/employment/warn.shtml). See if you can figure out how to loop over multiple pages and scrape the contents of each into a single CSV -- [here's a notebook to work in](Maryland%20WARN%20Notices%20-%20multiple%20pages.ipynb).\n",
730 |     "\n",
731 |     "\n",
732 |     "3. **Build a lookup table:** Each numeric code in the \"WIA Code\" column correspondes to a local area. See if you can figure out how to create a lookup dictionary that maps the numbers to their locations, then as you're looping over the data table, replace the numeric value in that column with the name of the local area instead. Here's a hint:\n",
733 |     "\n",
734 |     "```python\n",
735 |     "    lookup_dict = {\n",
736 |     "        '1': 'hello',\n",
737 |     "        '2': 'world'\n",
738 |     "    }\n",
739 |     "\n",
740 |     "    print(lookup_dict.get('1'))\n",
741 |     "    # prints 'hello'\n",
742 |     "\n",
743 |     "    print(lookup_dict.get('3'))\n",
744 |     "    # prints None\n",
745 |     "\n",
746 |     "```"
747 |    ]
748 |   },
749 |   {
750 |    "cell_type": "code",
751 |    "execution_count": null,
752 |    "metadata": {},
753 |    "outputs": [],
754 |    "source": []
755 |   }
756 |  ],
757 |  "metadata": {
758 |   "kernelspec": {
759 |    "display_name": "Python 3 (ipykernel)",
760 |    "language": "python",
761 |    "name": "python3"
762 |   },
763 |   "language_info": {
764 |    "codemirror_mode": {
765 |     "name": "ipython",
766 |     "version": 3
767 |    },
768 |    "file_extension": ".py",
769 |    "mimetype": "text/x-python",
770 |    "name": "python",
771 |    "nbconvert_exporter": "python",
772 |    "pygments_lexer": "ipython3",
773 |    "version": "3.10.9"
774 |   }
775 |  },
776 |  "nbformat": 4,
777 |  "nbformat_minor": 2
778 | }
779 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | anyio==3.6.2
 2 | appnope==0.1.3
 3 | argon2-cffi==21.3.0
 4 | argon2-cffi-bindings==21.2.0
 5 | arrow==1.2.3
 6 | asttokens==2.2.1
 7 | attrs==22.2.0
 8 | backcall==0.2.0
 9 | beautifulsoup4==4.11.1
10 | bleach==6.0.0
11 | bs4==0.0.1
12 | certifi==2022.9.24
13 | cffi==1.15.1
14 | charset-normalizer==2.1.1
15 | comm==0.1.2
16 | debugpy==1.6.6
17 | decorator==5.1.1
18 | defusedxml==0.7.1
19 | executing==1.2.0
20 | fastjsonschema==2.16.2
21 | fqdn==1.5.1
22 | greenlet==2.0.1
23 | idna==3.4
24 | ipykernel==6.21.0
25 | ipython==8.9.0
26 | ipython-genutils==0.2.0
27 | ipywidgets==8.0.4
28 | isoduration==20.11.0
29 | jedi==0.18.2
30 | Jinja2==3.1.2
31 | jsonpointer==2.3
32 | jsonschema==4.17.3
33 | jupyter==1.0.0
34 | jupyter-console==6.4.4
35 | jupyter-events==0.6.3
36 | jupyter_client==8.0.2
37 | jupyter_core==5.2.0
38 | jupyter_server==2.1.0
39 | jupyter_server_terminals==0.4.4
40 | jupyterlab-pygments==0.2.2
41 | jupyterlab-widgets==3.0.5
42 | MarkupSafe==2.1.2
43 | matplotlib-inline==0.1.6
44 | mistune==2.0.4
45 | nbclassic==0.5.1
46 | nbclient==0.7.2
47 | nbconvert==7.2.9
48 | nbformat==5.7.3
49 | nest-asyncio==1.5.6
50 | notebook==6.5.2
51 | notebook_shim==0.2.2
52 | numpy==1.24.1
53 | packaging==23.0
54 | pandas==1.5.3
55 | pandocfilters==1.5.0
56 | parso==0.8.3
57 | pexpect==4.8.0
58 | pickleshare==0.7.5
59 | platformdirs==2.6.2
60 | playwright==1.30.0
61 | prometheus-client==0.16.0
62 | prompt-toolkit==3.0.36
63 | psutil==5.9.4
64 | ptyprocess==0.7.0
65 | pure-eval==0.2.2
66 | pycparser==2.21
67 | pyee==9.0.4
68 | Pygments==2.14.0
69 | pyrsistent==0.19.3
70 | python-dateutil==2.8.2
71 | python-json-logger==2.0.4
72 | pytz==2022.7.1
73 | PyYAML==6.0
74 | pyzmq==25.0.0
75 | qtconsole==5.4.0
76 | QtPy==2.3.0
77 | requests==2.28.1
78 | rfc3339-validator==0.1.4
79 | rfc3986-validator==0.1.1
80 | Send2Trash==1.8.0
81 | six==1.16.0
82 | sniffio==1.3.0
83 | soupsieve==2.3.2.post1
84 | stack-data==0.6.2
85 | terminado==0.17.1
86 | tinycss2==1.2.1
87 | tornado==6.2
88 | traitlets==5.9.0
89 | typing_extensions==4.4.0
90 | uri-template==1.2.0
91 | urllib3==1.26.13
92 | wcwidth==0.2.6
93 | webcolors==1.12
94 | webencodings==0.5.1
95 | websocket-client==1.5.0
96 | widgetsnbextension==4.0.5
97 | 


--------------------------------------------------------------------------------
/sd-lobbyists/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/sd-lobbyists/data/.gitkeep


--------------------------------------------------------------------------------
/sd-lobbyists/download_lobbyist_data.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | In this script, we'll visit the South Dakota lobbyist registration lookup tool -- https://sosenterprise.sd.gov/BusinessServices/Lobbyist/LobbyistSearch.aspx -- an asp.net site that tracks user state and doesn't have consistent download paths for the lobbyist data files, because the files are generated on the fly based on search inputs. Therefore, we'll use playwright, a browser automation testing tool that's also handy for scraping websites.
  3 | 
  4 | The goal is to download each year's worth of lobbyist data for each type of lobbyist (private and public) and then assemble the results into a single data file.
  5 | 
  6 | The private lobbyist search has an "Export data" button, which kicks out a .zip file containing one pipe-delimited text file, but the public lobbyist search does not, so public lobbyist data will need to be scraped from the results table that appears below the search box.
  7 | '''
  8 | 
  9 | # we'll use this stdlib csv library to write out
 10 | # CSV files of the public lobbyist tables
 11 | import csv
 12 | 
 13 | # used for checking to see if files already exist
 14 | import os
 15 | 
 16 | # used for pausing between requests
 17 | import time
 18 | 
 19 | # we'll use this to calculate the current year
 20 | from datetime import date
 21 | 
 22 | # we'll use pandas to handle the zip files, since
 23 | # it can handle compressed files out the gate
 24 | import pandas as pd
 25 | 
 26 | # for getting wildcard references to local files
 27 | import glob
 28 | 
 29 | # playwright will run the browser
 30 | from playwright.sync_api import sync_playwright
 31 | 
 32 | 
 33 | # the URL where we want to start
 34 | URL = 'https://sosenterprise.sd.gov/BusinessServices/Lobbyist/LobbyistSearch.aspx'
 35 | 
 36 | # where the data files will land
 37 | DIR_DATA = 'data'
 38 | 
 39 | # the CSV file to write into
 40 | CSV_FILEPATH = 'sd-lobbyists.csv'
 41 | 
 42 | # grab the current year for comparison below
 43 | THIS_YEAR = date.today().year
 44 | 
 45 | # storing references to CSS selectors we'll use
 46 | # more than once below
 47 | LOCATOR_SELECT_YEAR = 'select#ctl00_MainContent_slctYears'
 48 | LOCATOR_SELECT_TABLE_LEN = 'div#DataTables_Table_0_length select'
 49 | LOCATOR_BUTTON_SEARCH = 'a#ctl00_MainContent_SearchButton'
 50 | LOCATOR_BUTTON_EXPORT = 'a#ctl00_MainContent_ExportButton'
 51 | LOCATOR_TABLE = 'table#DataTables_Table_0 tbody'
 52 | LOCATOR_RADIO_PUBLIC = 'input#ctl00_MainContent_chkSearchByPublic'
 53 | 
 54 | 
 55 | def download_data_private(page):
 56 | 
 57 |     # get a reference to the select menu that
 58 |     # allows you to switch years
 59 |     # https://playwright.dev/python/docs/api/class-page#page-locator
 60 |     select_year = page.locator(LOCATOR_SELECT_YEAR)
 61 | 
 62 |     # get a list of the options attached to this
 63 |     # select menu
 64 |     options = select_year.locator('option').all()
 65 | 
 66 |     # using a list comprehension with a conditional
 67 |     # `if` statement, get a list of values for these options,
 68 |     # but skip the option with "All" in the text
 69 |     # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
 70 |     years = [x.get_attribute('value') for x in options if 'All' not in x.inner_text()]
 71 | 
 72 |     # loop over that list of values (years)
 73 |     # that we just isolated
 74 |     for year in years:
 75 | 
 76 |         # for each year, build a path to where
 77 |         # we want to download the file
 78 |         # using an f-string
 79 |         # https://docs.python.org/3/tutorial/inputoutput.html#formatted-string-literals
 80 |         filename = f'{year}-private.zip'
 81 |         filepath = os.path.join(DIR_DATA, filename)
 82 | 
 83 |         # if file already exists, skip this one -- unless it's the current year
 84 |         if os.path.exists(filepath) and year != str(THIS_YEAR):
 85 |             continue
 86 | 
 87 |         # select the year from the select menu
 88 |         # https://playwright.dev/python/docs/api/class-locator#locator-select-option
 89 |         select_year.select_option(value=year)
 90 | 
 91 |         # set up the download
 92 |         # https://playwright.dev/python/docs/api/class-download
 93 |         with page.expect_download() as download_info:
 94 | 
 95 |             # click on the export button
 96 |             page.locator(LOCATOR_BUTTON_EXPORT).click()
 97 | 
 98 |             # wait for download to complete
 99 |             download = download_info.value
100 | 
101 |             # print a message letting us know what's happening
102 |             print(f'Downloading {filepath}')
103 | 
104 |             # save the downloaded file to the path created above
105 |             download.save_as(filepath)
106 | 
107 |         # make sure the page is done firing before
108 |         # going to the next loop iteration
109 |         page.wait_for_load_state('networkidle')
110 | 
111 |         # and throw in an explicit wait
112 |         time.sleep(1)
113 |     
114 | 
115 | def download_data_public(page):
116 |     # a list of CSV headers for the public lobbyist files
117 |     # making sure to match the same header names
118 |     # from the private lobbyist files
119 |     HEADERS_PUBLIC_CSV = [
120 |         'YEAR',
121 |         'LOBBYIST_LAST_NAME',
122 |         'LOBBYIST_FIRST_NAME',
123 |         'EMPLOYER'
124 |     ]
125 | 
126 |     select = page.locator(LOCATOR_SELECT_YEAR)
127 |     years = [x.inner_html() for x in select.locator('option').all() if 'All' not in x.inner_html()]  # noqa
128 | 
129 |     for year in years:
130 | 
131 |         filename = f'{year}-public.csv'
132 |         filepath = os.path.join(DIR_DATA, filename)
133 | 
134 |         if os.path.exists(filepath) and year != str(THIS_YEAR):
135 |             continue
136 | 
137 |         select = page.locator(LOCATOR_SELECT_YEAR)
138 |         select.select_option(year)
139 |         page.locator(LOCATOR_BUTTON_SEARCH).click()
140 |         page.wait_for_load_state('networkidle')
141 |        
142 |         select_len = page.locator(LOCATOR_SELECT_TABLE_LEN)
143 |         select_len.select_option("1000")
144 | 
145 |         table = page.locator(LOCATOR_TABLE)
146 |         rows = table.locator('tr').all()
147 | 
148 |         with open(filepath, 'w') as outfile:
149 |             writer = csv.DictWriter(outfile, fieldnames=HEADERS_PUBLIC_CSV)
150 |             writer.writeheader()
151 | 
152 |             for row in rows:
153 |                 cells = row.locator('td').all()
154 |                 year, name, dept = [x.inner_text() for x in cells]
155 |                 last, rest = [x.strip() for x in name.rsplit(',', 1)]
156 |                 data = [
157 |                     year,
158 |                     last,
159 |                     rest,
160 |                     dept
161 |                 ]
162 | 
163 |                 writer.writerow(dict(zip(HEADERS_PUBLIC_CSV, data)))
164 | 
165 |         print(f'Wrote {filepath}')
166 | 
167 |         time.sleep(1)
168 | 
169 | 
170 | def build_data_file():
171 | 
172 |     # get a list of downloaded files
173 |     files_private = glob.glob(f'{DIR_DATA}/*.zip')
174 |     files_public = glob.glob(f'{DIR_DATA}/*.csv')
175 | 
176 |     # start a list to hold individual data frames
177 |     data_frames = []
178 | 
179 |     for file in files_private:
180 |         df = pd.read_csv(
181 |             file,
182 |             compression='zip',
183 |             delimiter='|'
184 |         )
185 | 
186 |         data_frames.append(df)
187 | 
188 |     for file in files_public:
189 |         df = pd.read_csv(file)
190 |         data_frames.append(df)
191 | 
192 |     df = pd.concat(data_frames)
193 | 
194 |     df.sort_values(['YEAR', 'LOBBYIST_LAST_NAME', 'LOBBYIST_FIRST_NAME'], ascending=[False, True, True]).to_csv(CSV_FILEPATH, index=False)
195 | 
196 |     return CSV_FILEPATH
197 | 
198 | 
199 | if __name__ == '__main__':
200 | 
201 |     if not os.path.exists(DIR_DATA):
202 |         os.makedirs(DIR_DATA)
203 | 
204 |     with sync_playwright() as p:
205 |         browser = p.chromium.launch(headless=False)
206 |         page = browser.new_page()
207 |         page.goto(URL, wait_until='networkidle')
208 | 
209 |         download_data_private(page)
210 | 
211 |         page.locator(LOCATOR_RADIO_PUBLIC).check()
212 |         page.wait_for_load_state('networkidle')
213 | 
214 |         download_data_public(page)
215 | 
216 |     build_data_file()
217 | 


--------------------------------------------------------------------------------
/tx-railroad-commission/dl_pages_details.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import glob
  3 | import time
  4 | 
  5 | from dl_pages_results import DIR_PAGES_RESULTS, BASE_URL
  6 | 
  7 | from bs4 import BeautifulSoup
  8 | from playwright.sync_api import sync_playwright, expect
  9 | 
 10 | 
 11 | # make a reference to the directory where
 12 | # the downloaded detail pages will land
 13 | DIR_PAGES_DETAIL = 'pages-detail'
 14 | 
 15 | 
 16 | def get_detail_page_links():
 17 |     '''
 18 |     A function to extract the detail page links
 19 |     from the results HTML files that we downloaded separately
 20 |     '''
 21 | 
 22 |     # an empty list to hold the extracted links
 23 |     detail_page_links = []
 24 | 
 25 |     # use the glob module to nab a list of all
 26 |     # the HTML files we want to parse
 27 |     # https://docs.python.org/3/library/glob.html
 28 |     filepaths_results = sorted(glob.glob(f'{DIR_PAGES_RESULTS}/*.html'))
 29 | 
 30 |     # loop over each results file
 31 |     for results_file in filepaths_results:
 32 | 
 33 |         # open it and read the HTML
 34 |         with open(results_file, 'r') as infile:
 35 |             html = infile.read()
 36 | 
 37 |         # turn it into soup
 38 |         soup = BeautifulSoup(html, 'html.parser')
 39 | 
 40 |         # find the table rows
 41 |         rows = soup.tbody.find_all('tr')
 42 | 
 43 |         # use a list comprehension to grab the links from each row
 44 |         # and prepend the base URL to ensure a fully
 45 |         # qualified URL to save a step later
 46 |         # https://docs.python.org/3/tutorial/datastructures.html#list-comprehensions
 47 |         links = [f"{BASE_URL}/s{x.find('a')['href']}" for x in rows]
 48 | 
 49 |         # add these links to the main list
 50 |         detail_page_links.extend(links)
 51 | 
 52 |     # return the list of links we just populated
 53 |     return detail_page_links
 54 | 
 55 | 
 56 | def dl_pages_detail():
 57 | 
 58 |     # call the function to get the links and store the results
 59 |     links = get_detail_page_links()
 60 | 
 61 |     # set up the playwright object
 62 |     # https://playwright.dev/python/docs/library#usage
 63 |     with sync_playwright() as p:
 64 | 
 65 |         # create a new Chromium browser, which
 66 |         # operates in headless mode by default
 67 |         browser = p.chromium.launch()
 68 | 
 69 |         # open a new page
 70 |         page = browser.new_page()
 71 | 
 72 |         # loop over the links we just grabbed
 73 |         for link in links:
 74 | 
 75 |             # use the unique ID in the URL as the filename
 76 |             filename = f"{link.split('/')[-1]}.html"
 77 | 
 78 |             # build the file path
 79 |             filepath = os.path.join(
 80 |                 DIR_PAGES_DETAIL,
 81 |                 filename
 82 |             )
 83 | 
 84 |             # check to see if we already downloaded the page
 85 |             if not os.path.exists(filepath):
 86 | 
 87 |                 # if not, navigate to the page
 88 |                 # and wait for the assets to load
 89 |                 page.goto(
 90 |                     link,
 91 |                     wait_until='networkidle'
 92 |                 )
 93 | 
 94 |                 expect(
 95 |                     page.get_by_title('Inspection Packages')
 96 |                 ).to_be_visible()
 97 | 
 98 |                 expect(
 99 |                     page.get_by_text('Complaint Name')
100 |                 ).to_be_visible()
101 | 
102 |                 time.sleep(2)
103 | 
104 | 
105 |                 if 'RRC SIGN IN' in page.locator('body').inner_text().upper():
106 |                     print(f'    PROBLEM downloading {link}')
107 |                     continue
108 | 
109 |                 # target the content div and grab the HTML
110 |                 content = page.locator('html').inner_html()  # noqa
111 | 
112 |                 # if not, download it
113 |                 with open(filepath, 'w') as outfile:
114 |                     outfile.write(content)
115 | 
116 |                 # and let us know what's up
117 |                 print(f'Downloaded {filepath}')
118 | 
119 |                 # wait a tick before moving on to the next page
120 |                 time.sleep(0.5)
121 | 
122 |         # close the browser
123 |         browser.close()
124 | 
125 | 
126 | if __name__ == '__main__':
127 |     dl_pages_detail()
128 | 


--------------------------------------------------------------------------------
/tx-railroad-commission/dl_pages_results.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | 
  4 | from playwright.sync_api import sync_playwright
  5 | 
  6 | 
  7 | # create a variable pointing to the directory where
  8 | # the cached pages should land
  9 | DIR_PAGES_RESULTS = 'pages-results'
 10 | 
 11 | # create a variable pointing to the base URL, which
 12 | # we'll use in a couple places
 13 | BASE_URL = 'https://rrctx.force.com'
 14 | 
 15 | 
 16 | def download_pages_results():
 17 | 
 18 |     # check to see if this directory exists
 19 |     if not os.path.exists(DIR_PAGES_RESULTS):
 20 | 
 21 |         # if not, create it
 22 |         os.makedirs(DIR_PAGES_RESULTS)
 23 | 
 24 |     # set up the playwright object
 25 |     # https://playwright.dev/python/docs/library#usage
 26 |     with sync_playwright() as p:
 27 | 
 28 |         # create a new Chromium browser, which
 29 |         # operates in headless mode by default
 30 |         browser = p.chromium.launch()
 31 | 
 32 |         # open a new page
 33 |         page = browser.new_page()
 34 | 
 35 |         # go to the initial complaints results page
 36 |         # and wait until all the assets are loaded
 37 |         # using an f-string to build the URL to navigate to
 38 |         # https://docs.python.org/3/tutorial/inputoutput.html#tut-f-strings
 39 |         page.goto(
 40 |             f'{BASE_URL}/s/complaints',
 41 |             wait_until='networkidle'
 42 |         )
 43 | 
 44 |         # find the table and grab the HTML
 45 |         table = page.locator('table').inner_html()
 46 | 
 47 |         # get the page number we're on (1)
 48 |         # by finding the pagination element at the bottom of the page
 49 |         page_tracker = page.get_by_text(' | Page ').inner_text()
 50 | 
 51 |         # ... and then parsing out the page number with some splits
 52 |         page_num = page_tracker.split('Page')[1].split('of')[0].strip()
 53 | 
 54 |         # set up the filename -- using the .zfill()
 55 |         # string method to pad out the number
 56 |         # to three digits -- and the file path
 57 |         filename = f'{page_num.zfill(3)}.html'
 58 |         filepath = os.path.join(
 59 |             DIR_PAGES_RESULTS,
 60 |             filename
 61 |         )
 62 | 
 63 |         # open the file and write
 64 |         # the table HTML captured above
 65 |         with open(filepath, 'w') as outfile:
 66 |             outfile.write(table)
 67 | 
 68 |         # let us know what's up
 69 |         print(f'Downloaded {filepath}')
 70 |         
 71 |         # next, create a process to iterate through
 72 |         # the other pages of the search results --
 73 |         # a better move here would be to write a 
 74 |         # recursive function, but a hacky while True / break
 75 |         # statement works too
 76 | 
 77 |         while True:
 78 |             # find the "Next" button
 79 |             next_button = page.locator('button', has_text='Next')
 80 | 
 81 |             # click it
 82 |             next_button.click()
 83 | 
 84 |             # wait for the next page to load
 85 |             page.wait_for_load_state('networkidle')
 86 | 
 87 |             # find the table and grab the HTML
 88 |             table = page.locator('table').inner_html()
 89 | 
 90 |             # get the page number we're on
 91 |             # by finding the pagination element at the bottom of the page
 92 |             page_tracker = page.get_by_text(' | Page ').inner_text()
 93 | 
 94 |             # and then parsing out the page number with some splits
 95 |             page_num = page_tracker.split('Page')[1].split('of')[0].strip()
 96 | 
 97 |             # set up the filename and path
 98 |             filename = f'{page_num.zfill(3)}.html'
 99 | 
100 |             filepath = os.path.join(
101 |                 DIR_PAGES_RESULTS,
102 |                 filename
103 |             )
104 | 
105 |             # open the file and write into
106 |             # it the table HTML captured above
107 |             with open(filepath, 'w') as outfile:
108 |                 outfile.write(table)
109 | 
110 |             # let us know what's up
111 |             print(f'Downloaded {filepath}')
112 | 
113 |             # see if this is the last page
114 |             lpage = int(page_tracker.split('of')[-1])
115 | 
116 |             # if the {x} in "Page {x} of {y}" number
117 |             # is the same as {y}, we're done
118 |             if int(page_num) == lpage:
119 |                 break
120 | 
121 |             # if not the last page,
122 |             # wait half a second before moving on to the next page
123 |             time.sleep(0.5)
124 | 
125 |         # shut down the browser
126 |         browser.close()
127 | 
128 | 
129 | if __name__ == '__main__':
130 |     download_pages_results()
131 | 


--------------------------------------------------------------------------------
/tx-railroad-commission/main.py:
--------------------------------------------------------------------------------
 1 | from dl_pages_results import download_pages_results
 2 | from dl_pages_details import dl_pages_detail
 3 | from scrape_detail_pages import scrape_data
 4 | 
 5 | if __name__ == '__main__':
 6 |     print('Downloading results pages ...')
 7 |     download_pages_results()
 8 |     print()
 9 | 
10 |     print('Downloading detail pages ...')
11 |     dl_pages_detail()
12 |     print()
13 | 
14 |     print('Scraping data ...')
15 |     file_details = scrape_data()
16 |     print()
17 | 
18 |     print(f'Done! Wrote {file_details["record_count"]:,} records to {file_details["filepath"]}')
19 | 


--------------------------------------------------------------------------------
/tx-railroad-commission/pages-detail/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/tx-railroad-commission/pages-detail/.gitkeep


--------------------------------------------------------------------------------
/tx-railroad-commission/pages-results/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cjwinchester/nicar23-python-scraping/06b9e729075e6c04c7f0c777d3d99c317332c95a/tx-railroad-commission/pages-results/.gitkeep


--------------------------------------------------------------------------------
/tx-railroad-commission/scrape_detail_pages.py:
--------------------------------------------------------------------------------
  1 | import glob
  2 | import csv
  3 | import re
  4 | import os
  5 | from datetime import datetime
  6 | 
  7 | from bs4 import BeautifulSoup
  8 | 
  9 | from dl_pages_results import BASE_URL
 10 | 
 11 | 
 12 | csv_filepath = 'tx-railroad-commission-data.csv'
 13 | 
 14 | # set up headers for the CSV file
 15 | csv_headers = [
 16 |     'complaint_id',
 17 |     'complaint_url',
 18 |     'capture_method',
 19 |     'location',
 20 |     'city',
 21 |     'resolution_status',
 22 |     'complaint_type',
 23 |     'received_date',
 24 |     'business_area',
 25 |     'organization',
 26 |     'unit_name',
 27 |     'region',
 28 |     'regulated_entity',
 29 |     'p5_no',
 30 |     'jurisdictional',
 31 |     'regulated',
 32 |     'complaint_description_type',
 33 |     'complaint_description',
 34 |     'resolution_description',
 35 |     'complaint_comments',
 36 |     'update_notes',
 37 |     'close_date',
 38 |     'explanation_type',
 39 |     'explanation',
 40 |     'referral_type',
 41 |     'referred_to',
 42 |     'inspection_packages_link',
 43 |     'inspection_documents_link'
 44 | ]
 45 | 
 46 | # get a list of HTML files in the detail pages dir
 47 | files = glob.glob('pages-detail/*.html')
 48 | 
 49 | 
 50 | # set up a reusable function to scrape data from
 51 | # a single HTML file
 52 | def scrape_page(html_path):
 53 | 
 54 |     # open the file and read in the HTML
 55 |     with open(html_path, 'r') as infile:
 56 |         html = infile.read()
 57 | 
 58 |     # turn the HTML into a bs4 object
 59 |     soup = BeautifulSoup(html, 'html.parser')
 60 | 
 61 |     # and start locating the elements using various bs4 methods
 62 |     # https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 63 | 
 64 |     complaint_no = soup.find('span', text='Complaint Name').parent.next_sibling.text.split('-')[-1]
 65 | 
 66 |     url = f"{BASE_URL}/s/ietrs-complaint/{html_path.split('/')[-1].split('.html')[0]}/complaint{complaint_no}"
 67 | 
 68 |     capture_method = soup.find('span', text='Complaint Capture Method').parent.next_sibling.text.strip()
 69 | 
 70 |     location = soup.find('span', text='Complaint Location').parent.next_sibling.text.strip()
 71 | 
 72 |     resolution_status = soup.find('span', text='Complaint Resolution Status').parent.next_sibling.text.strip()
 73 | 
 74 |     complaint_type = soup.find('span', text=re.compile('Optional; If looking for a Railroad related option, this is not the correct jurisdiction')).parent.parent.next_sibling.text.strip()
 75 | 
 76 |     received_date = soup.find('span', text='Complaint Received Date').parent.next_sibling.text.strip()
 77 | 
 78 |     # additional integrity check for dates -- parse text as date
 79 |     received_date = datetime.strptime(
 80 |         received_date,
 81 |         '%m/%d/%Y'
 82 |     ).date().isoformat()
 83 | 
 84 |     business_area = soup.find('span', text='Business Area').parent.next_sibling.text.strip()
 85 | 
 86 |     city = soup.find('span', text='City').parent.next_sibling.text.strip()
 87 | 
 88 |     organization = soup.find('span', text='Organization').parent.next_sibling.text.strip()
 89 | 
 90 |     unit_name = soup.find('span', text='Unit Name').parent.next_sibling.text.strip()
 91 | 
 92 |     region = soup.find('span', text='Region').parent.next_sibling.text.strip()
 93 | 
 94 |     regulated_entity = soup.find('span', text='Regulated Entity').parent.next_sibling.text.strip()
 95 | 
 96 |     p5_no = soup.find('span', text='P5 #').parent.next_sibling.text.strip()
 97 | 
 98 |     jurisdictional = soup.find('span', text='Jurisdictional').parent.next_sibling.find('img').get('alt').strip()
 99 | 
100 |     regulated = soup.find('span', text='Regulated').parent.next_sibling.text.strip()
101 | 
102 |     complaint_description_type = soup.find('span', text='Complaint Description Type').parent.next_sibling.text.strip()
103 | 
104 |     complaint_description = soup.find('span', text='Complaint Description').parent.next_sibling.text.strip()
105 | 
106 |     complaint_description = ' '.join(complaint_description.split())
107 | 
108 |     resolution_description = soup.find('span', text='Complaint Resolution Description').parent.next_sibling.text.strip()
109 | 
110 |     resolution_description = ' '.join(resolution_description.split())
111 | 
112 |     complaint_comments = soup.find('span', text='Complaint Comments').parent.next_sibling.text.strip()
113 | 
114 |     complaint_comments = ' '.join(resolution_description.split())
115 | 
116 |     update_notes = soup.find('span', text='Update Notes').parent.next_sibling.text.strip()
117 | 
118 |     update_notes = ' '.join(update_notes.split())
119 | 
120 |     close_date = soup.find('span', text='Complaint Close Date').parent.next_sibling.text.strip()
121 | 
122 |     # not every complaint has a close date
123 |     if close_date:
124 |         close_date = datetime.strptime(
125 |             close_date,
126 |             '%m/%d/%Y'
127 |         ).date().isoformat()
128 | 
129 |     explanation_type = soup.find('span', text='Complaint Explanation Type').parent.next_sibling.text.strip()
130 | 
131 |     explanation = soup.find('span', text='Complaint Explanation').parent.next_sibling.text.strip()
132 | 
133 |     explanation = ' '.join(explanation.split())
134 | 
135 |     referral_type = soup.find('span', text='Complaint Referred Type').parent.next_sibling.text.strip()
136 | 
137 |     referred_to = soup.find('span', text='Complaint Referred To:').parent.next_sibling.text.strip()
138 | 
139 |     # joining a split string on a single space is
140 |     # a way to remove all unnecessary whitespace
141 |     referred_to = ' '.join(referred_to.split())
142 | 
143 |     inspection_packages = soup.find('span', {'title': 'Inspection Packages'})
144 | 
145 |     inspection_packages_link = f"{BASE_URL}{inspection_packages.parent.get('href')}"
146 | 
147 |     inspection_documents = soup.find('span', {'title': 'Inspection Documents'})
148 | 
149 |     inspection_documents_link = f"{BASE_URL}{inspection_documents.parent.get('href')}"
150 | 
151 |     # assemble the data in a list, maintaining
152 |     # the same order as the CSV headers
153 |     data = [
154 |         complaint_no,
155 |         url,
156 |         capture_method,
157 |         location,
158 |         city,
159 |         resolution_status,
160 |         complaint_type,
161 |         received_date,
162 |         business_area,
163 |         organization,
164 |         unit_name,
165 |         region,
166 |         regulated_entity,
167 |         p5_no,
168 |         jurisdictional,
169 |         regulated,
170 |         complaint_description_type,
171 |         complaint_description,
172 |         resolution_description,
173 |         complaint_comments,
174 |         update_notes,
175 |         close_date,
176 |         explanation_type,
177 |         explanation,
178 |         referral_type,
179 |         referred_to,
180 |         inspection_packages_link,
181 |         inspection_documents_link
182 |     ]
183 | 
184 |     # return a dictionary representation of the data
185 |     return dict(zip(csv_headers, data))
186 | 
187 | 
188 | def scrape_data():
189 |     # set up an empty list to hold the data to write to file
190 |     all_data = []
191 | 
192 |     # loop over the list of files
193 |     for file in files:
194 | 
195 |         # call the function to scrape this file
196 |         # and assign to a variable the dictionary that the function returns
197 |         data = scrape_page(file)
198 | 
199 |         # append the dictionary to the list we set up to
200 |         # collect data from each page
201 |         all_data.append(data)
202 | 
203 |     # open a file in write mode, specify the encoding and
204 |     # set newlines='' to deal with windows-specific line breaks
205 |     with open(csv_filepath, 'w', encoding='utf-8', newline='') as outfile:
206 | 
207 |         # set up the writer object
208 |         writer = csv.DictWriter(
209 |             outfile,
210 |             fieldnames=csv_headers
211 |         )
212 | 
213 |         # write the headers
214 |         writer.writeheader()
215 | 
216 |         # write the data
217 |         writer.writerows(all_data)
218 | 
219 |     return {
220 |         'record_count': len(all_data),
221 |         'filepath': csv_filepath
222 |     }
223 | 
224 | 
225 | if __name__ == '__main__':
226 |     scrape_data()
227 |     


--------------------------------------------------------------------------------
/us-senate-press-gallery/U.S. Senate Press Gallery - complete.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "05412996",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# U.S. Senate press gallery\n",
  9 |     "\n",
 10 |     "The goal: [Scrape the list of journalists accredited to cover the U.S. Senate](https://www.dailypress.senate.gov/membership/membership-lists/) into a CSV. A little spelunking in the source code will reveal a table ready for extraction."
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "id": "4c67ce4d",
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "# import libs\n",
 21 |     "import csv\n",
 22 |     "\n",
 23 |     "import requests\n",
 24 |     "from bs4 import BeautifulSoup"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": null,
 30 |    "id": "99adeb32",
 31 |    "metadata": {},
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "# define the list of headers for the CSV\n",
 35 |     "headers = [\n",
 36 |     "    'first',\n",
 37 |     "    'last',\n",
 38 |     "    'affiliation'\n",
 39 |     "]"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "id": "a9744eb7",
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# make the request\n",
 50 |     "req = requests.get('https://www.dailypress.senate.gov/membership/membership-lists/')"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "id": "8e710003",
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# turn the HTML into soup\n",
 61 |     "soup = BeautifulSoup(req.text, 'html.parser')\n",
 62 |     "\n",
 63 |     "# find the table\n",
 64 |     "table = soup.find('table')\n",
 65 |     "\n",
 66 |     "# grab a list of table rows (minus the header)\n",
 67 |     "rows = table.find_all('tr')[1:]"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "id": "762a11be",
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "print(rows)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": null,
 83 |    "id": "e65af642",
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# open a CSV file to write data into\n",
 88 |     "with open('us-senate-press-gallery.csv', 'w', newline='') as outfile:\n",
 89 |     "\n",
 90 |     "    # create a writer object\n",
 91 |     "    writer = csv.writer(outfile)\n",
 92 |     "\n",
 93 |     "    # write the list of headers to file\n",
 94 |     "    writer.writerow(headers)\n",
 95 |     "\n",
 96 |     "    # loop over the rows\n",
 97 |     "    for row in rows:\n",
 98 |     "\n",
 99 |     "        # find the cells in this row\n",
100 |     "        cells = row.find_all('td')\n",
101 |     "        \n",
102 |     "        # extract each piece of data from the list\n",
103 |     "        \n",
104 |     "        # first name is the first ([0]) list item\n",
105 |     "        first_name = cells[0].text.strip()\n",
106 |     "        \n",
107 |     "        # last name is second ([1])\n",
108 |     "        last_name = cells[1].text.strip()\n",
109 |     "\n",
110 |     "        # affiliation is third ([2])\n",
111 |     "        affiliation = cells[2].text.strip()\n",
112 |     "\n",
113 |     "        # write row to file\n",
114 |     "        writer.writerow([first_name, last_name, affiliation])"
115 |    ]
116 |   }
117 |  ],
118 |  "metadata": {
119 |   "kernelspec": {
120 |    "display_name": "Python 3 (ipykernel)",
121 |    "language": "python",
122 |    "name": "python3"
123 |   },
124 |   "language_info": {
125 |    "codemirror_mode": {
126 |     "name": "ipython",
127 |     "version": 3
128 |    },
129 |    "file_extension": ".py",
130 |    "mimetype": "text/x-python",
131 |    "name": "python",
132 |    "nbconvert_exporter": "python",
133 |    "pygments_lexer": "ipython3",
134 |    "version": "3.10.9"
135 |   }
136 |  },
137 |  "nbformat": 4,
138 |  "nbformat_minor": 5
139 | }
140 | 


--------------------------------------------------------------------------------
/us-senate-press-gallery/U.S. Senate Press Gallery - working.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "id": "05412996",
  6 |    "metadata": {},
  7 |    "source": [
  8 |     "# U.S. Senate press gallery\n",
  9 |     "\n",
 10 |     "The goal: [Scrape the list of journalists accredited to cover the U.S. Senate](https://www.dailypress.senate.gov/membership/membership-lists/) into a CSV. Some spelunking in the source code will show a table ready for extraction.\n",
 11 |     "\n",
 12 |     "[Here's the completed version](U.S.%20Senate%20Press%20Gallery%20-%20complete.ipynb)."
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 1,
 18 |    "id": "4c67ce4d",
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "# import libs\n",
 23 |     "import csv\n",
 24 |     "\n",
 25 |     "import requests\n",
 26 |     "from bs4 import BeautifulSoup"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 2,
 32 |    "id": "99adeb32",
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# define the list of headers for the CSV\n",
 37 |     "headers = [\n",
 38 |     "    'first',\n",
 39 |     "    'last',\n",
 40 |     "    'affiliation'\n",
 41 |     "]"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 3,
 47 |    "id": "a9744eb7",
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "# make the request\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 6,
 57 |    "id": "8e710003",
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "# turn the HTML into soup\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "# find the table\n",
 65 |     "\n",
 66 |     "\n",
 67 |     "# grab a list of table rows (minus the header)\n"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 9,
 73 |    "id": "e65af642",
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "# open a CSV file to write data into\n",
 78 |     "\n",
 79 |     "\n",
 80 |     "    # create a writer object\n",
 81 |     "\n",
 82 |     "\n",
 83 |     "    # write the list of headers to file\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "    # loop over the rows\n",
 87 |     "\n",
 88 |     "\n",
 89 |     "        # find the cells in this row\n",
 90 |     "\n",
 91 |     "        \n",
 92 |     "        # extract each piece of data from the list\n",
 93 |     "        \n",
 94 |     "        # first name is the first ([0]) list item\n",
 95 |     "\n",
 96 |     "        \n",
 97 |     "        # last name is second ([1])\n",
 98 |     "\n",
 99 |     "\n",
100 |     "        # affiliation is third ([2])\n",
101 |     "\n",
102 |     "\n",
103 |     "        # write row to file\n"
104 |    ]
105 |   }
106 |  ],
107 |  "metadata": {
108 |   "kernelspec": {
109 |    "display_name": "Python 3 (ipykernel)",
110 |    "language": "python",
111 |    "name": "python3"
112 |   },
113 |   "language_info": {
114 |    "codemirror_mode": {
115 |     "name": "ipython",
116 |     "version": 3
117 |    },
118 |    "file_extension": ".py",
119 |    "mimetype": "text/x-python",
120 |    "name": "python",
121 |    "nbconvert_exporter": "python",
122 |    "pygments_lexer": "ipython3",
123 |    "version": "3.10.9"
124 |   }
125 |  },
126 |  "nbformat": 4,
127 |  "nbformat_minor": 5
128 | }
129 | 


--------------------------------------------------------------------------------