├── .gitignore ├── CNAME ├── README.md ├── _quarto.yml ├── apis.ipynb ├── assets ├── att-scraper-selenium.mp4 ├── blm-wlm.png ├── blocked-terms.png ├── browser-automation.gif ├── browser0_01_threestep.png ├── browser0_01_threestep1.png ├── browser0_02_soup.png ├── browser0_02_soup1.png ├── browser1_01_version.png ├── browser1_01_version1.png ├── browser1_02_tiktok.png ├── browser1_02_tiktok1.png ├── browser1_03_dismiss.png ├── browser1_03_dismiss1.png ├── browser1_04_inspect.png ├── browser1_04_inspect1.png ├── browser1_05_inspect_tiktok_a.png ├── browser1_05_inspect_tiktok_a1.png ├── browser1_05_inspect_tiktok_b.png ├── browser1_05_inspect_tiktok_b1.png ├── copy-curl.png ├── dev-console.mp4 ├── favicon │ ├── .DS_Store │ ├── android-chrome-192x192.png │ ├── android-chrome-512x512.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon.ico │ └── site.webmanifest ├── favicon_ │ ├── android-chrome-192x192.png │ ├── android-chrome-512x512.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon.ico │ └── site.webmanifest ├── favicon_alt │ ├── android-chrome-192x192.png │ ├── android-chrome-512x512.png │ ├── apple-touch-icon.png │ ├── favicon-16x16.png │ ├── favicon-32x32.png │ ├── favicon.ico │ └── site.webmanifest ├── filter-network.mp4 ├── filter-requests.mp4 ├── ida.gif ├── ida.mp4 ├── ida.png ├── ida.webm ├── insepect-element-3D.jpg ├── inspect-element-logo.jpg ├── inspect-element-simple.jpg ├── inspect-element-v1.png ├── inspect-element-v2.png ├── inspect-element-v3.png ├── inspect-element.jpg ├── inspect-panel.png ├── inspect.png ├── just-windows.jpg ├── logo-twitter.png ├── ring-gizmodo.png ├── ring-map.png ├── sap2019.png ├── spicy.png └── wsj_tiktok.mp4 ├── best-practices-data-collection.ipynb ├── browser_automation.ipynb ├── build-your-own-datasets.ipynb ├── checklist.ipynb ├── index.qmd ├── references.bib ├── references.qmd ├── selenium_wire.ipynb ├── start.ipynb ├── styles.css └── 
utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.quarto/ 2 | /.DS_Store 3 | *.html 4 | *.pdf 5 | *.ipynb_checkpoints/ 6 | -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | inspectelement.org 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inspect Element 2 | 3 | This repository is used to render a [Quarto](https://quarto.org/) website. 4 | 5 | For tutorials refer to the Jupyter Notebooks (ending with `.ipynb`). 6 | 7 | Apologies for having everything unorganized in the top-level directory, once I figure out Quarto a bit more, I'll reorganize 8 | 9 | ## For contributors 10 | 11 | This site uses GitHub Pages and Actions to publish the site. 12 | Do not push any changes or work on the `gh-pages` branch, as that is auto-populated using the `quarto publish gh-pages` command. 13 | 14 | Read about how this works [here](https://quarto.org/docs/publishing/github-pages.html). 
15 | 16 | To preview the website locally, run: 17 | ``` 18 | quarto preview 19 | ``` 20 | 21 | Test the speed of the page here: 22 | https://pagespeed.web.dev/report?url=https%3A%2F%2Finspectelement.org%2F&form_factor=desktop 23 | 24 | Make images small using tools such as https://tinypng.com/ -------------------------------------------------------------------------------- /_quarto.yml: -------------------------------------------------------------------------------- 1 | project: 2 | type: book 3 | output-dir: docs 4 | resources: assets/ 5 | 6 | execute: 7 | freeze: auto 8 | 9 | # organization 10 | book: 11 | title: "Inspect Element" 12 | subtitle: "The practitioner's guide to hypothesis-driven data investigations" 13 | search: true 14 | image: assets/inspect-element-logo.jpg 15 | favicon: assets/favicon/favicon.ico 16 | # cover-image: assets/ida.webm 17 | # cover-image-alt: | 18 | # A Google Knowlege Panel of Ida Tarbell stained different colors from an audit. 19 | repo-url: https://github.com/yinleon/inspect-element 20 | repo-actions: [edit] 21 | site-url: https://inspectelement.org 22 | twitter-card: 23 | image: assets/inspect-element-logo.jpg 24 | card-style: "summary_large_image" 25 | image-width: 1200 26 | image-height: 630 27 | image-alt: "The inspect element logo" 28 | open-graph: 29 | title: Inspect Element 30 | description: A guide to hypothesis-driven investigations 31 | locale: us_EN 32 | site-name: Inspect Element 33 | image: assets/inspect-element-logo.jpg 34 | image-width: 1200 35 | image-height: 630 36 | image-alt: "The inspect element logo" 37 | sidebar: 38 | # logo: assets/inspect-element-logo.jpg 39 | collapse-level: 3 40 | pinned: true 41 | chapters: 42 | - text: "Welcome" 43 | file: index.qmd 44 | - text: "Planning investigations" 45 | part: start.ipynb 46 | chapters: 47 | - text: "Checklist" 48 | file: checklist.ipynb 49 | 50 | - part: build-your-own-datasets.ipynb 51 | chapters: 52 | - text: "Undocumented APIs" 53 | file: apis.ipynb 54 | - 
text: "Browser automation" 55 | file: browser_automation.ipynb 56 | # - text: "Advanced usage" 57 | # file: selenium_wire.ipynb 58 | # - text: "Crowdsourcing" 59 | # file: crowdsourcing.ipynb 60 | - text: "Parsing with Xpath" 61 | file: xpath.ipynb 62 | - text: "Best practices" 63 | file: best-practices-data-collection.ipynb 64 | # - part: Designing experiments 65 | # chapters: 66 | # - text: "Classification" 67 | # file: classification.ipynb 68 | appendices: 69 | - references.qmd 70 | # - acknowledgements.qmd 71 | 72 | # style choices 73 | format: 74 | html: 75 | theme: 76 | - cosmo 77 | - custom.scss 78 | callout-appearance: simple 79 | number-sections: false 80 | 81 | # misc 82 | bibliography: references.bib 83 | -------------------------------------------------------------------------------- /apis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "---\n", 8 | "title: \"Finding Undocumented APIs\"\n", 9 | "pagetitle: \"Finding Undocumented APIs\"\n", 10 | "description-meta: \"Introduction, case studies, and exercises for finding and using hidden and undocumented APIs.\"\n", 11 | "description-title: \"Introduction, case studies, and exercises for finding and using hidden and undocumented APIs.\"\n", 12 | "author: \"Leon Yin\"\n", 13 | "author-meta: \"Leon Yin\"\n", 14 | "date: \"02-24-2023\"\n", 15 | "date-modified: \"06-15-2023\"\n", 16 | "bibliography: references.bib\n", 17 | "execute: \n", 18 | " enabled: false\n", 19 | "keywords: data collection, hidden api, undocumented api, web scraping, api\n", 20 | "twitter-card:\n", 21 | " title: Finding Undocumented APIs\n", 22 | " description: Introduction, case studies, and exercises for finding and using undocumented APIs hidden in plain sight.\n", 23 | " image: assets/inspect-element-logo.jpg\n", 24 | "open-graph:\n", 25 | " title: Finding Undocumented APIs\n", 26 | " description: 
Introduction, case studies, and exercises for finding and using undocumented APIs hidden in plain sight.\n", 27 | " locale: us_EN\n", 28 | " site-name: Inspect Element\n", 29 | " image: assets/inspect-element-logo.jpg\n", 30 | "href: apis\n", 31 | "---" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "data": { 41 | "text/markdown": [ 42 | "\n", 43 | "📖 Read online\n", 44 | "🖥️ Interactive version\n", 45 | "⚙️ GitHub\n", 46 | "🏛 Citation\n", 47 | "
\n" 48 | ], 49 | "text/plain": [ 50 | "" 51 | ] 52 | }, 53 | "metadata": {}, 54 | "output_type": "display_data" 55 | } 56 | ], 57 | "source": [ 58 | "#| echo: false\n", 59 | "from utils import build_buttons\n", 60 | "build_buttons(link= 'apis', \n", 61 | " github= 'https://github.com/yinleon/inspect-element/blob/main/apis.ipynb', \n", 62 | " citation= True)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Most APIs are undocumented and hidden in plain sight. \n", 70 | "\n", 71 | "Being able to find these APIs can provide a rich, reliable, and scalable method of building your own datasets.\n", 72 | "\n", 73 | "Learn how to find them in the wild, and how they've been used in past investigations.\n", 74 | "\n", 75 | "👉[Click here to jump to the tutorial](#tutorial)." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# Intro\n", 83 | "## What is an A-P-I?\n", 84 | "\n", 85 | "If you have tried to get a driver's license or a travel visa, you have experienced bureaucracy at its finest-- a series of lines, forms, credential-showing, and waiting.\n", 86 | "\n", 87 | "Application Program Interfaces, or APIs, are digitized bureaucracy. You make a request, and then wait in a queue to be served by a server. However, instead of leaving with a driver's license or a custom plate, what you’re waiting for is well-formatted data. As for making mistakes... well, you'll get an automated rejection and zero sympathy.\n", 88 | "\n", 89 | "Most APIs are undocumented and hidden in plain sight. There is no set terminology for these APIs, so for simplicity's sake we'll refer to them as \"undocumented APIs\".\n", 90 | "\n", 91 | "Some investigations are only possible after finding and reverse-engineering undocumented APIs. Our first case study illustrates this in detail, where my reporting partner Aaron Sankin and I discovered undocumented APIs out of necessity while reporting on YouTube." 
92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "## A key tool for investigations\n", 99 | "\n", 100 | "YouTube is the [largest video-hosting platform](https://www.statista.com/topics/2019/youtube/) in the world, and plays a major role in the creator economy thanks to the YouTube Partner Program that shares ad revenue with eligible creators.\n", 101 | "\n", 102 | "Advertising is central to YouTube, and major companies have boycotted the platform in the past in response to their ads appearing alongside extremist content. We wanted to better understand YouTube's advertising system, especially how they treated hateful conspiracy theories, which at the time, seemed to thrive on the platform.\n", 103 | " \n", 104 | "To start investigating this topic, we got acquainted with the Google Ads portal. Anyone can sign-up, and see all the tools that marketers can use to reach users across the Google adverse. YouTube has a special section of the ad portal, where marketers can target their ads based on user demographics and the content of videos.\n", 105 | "\n", 106 | "We investigated a specific targeting tool that allows ad-buyers to use keyword searches to find videos and channels to place their ads on. \n", 107 | "\n", 108 | "Read the [investigation](https://themarkup.org/google-the-giant/2021/04/08/google-youtube-hate-videos-ad-keywords-blocklist-failures) along with its accompanying [methodology](https://themarkup.org/google-the-giant/2021/04/08/how-we-discovered-googles-hate-blocklist-for-ad-placements-on-youtube).\n", 109 | "\n", 110 | "In an initial test, we found that searching for the racist \"[White genocide](https://www.adl.org/resources/glossary-term/white-genocide)\" conspiracy theory returned no videos, but by simply removing spaces, we were shown relevant results.\n", 111 | "\n", 112 | "
\n", 113 | "
Searching the Google Ads portal for YouTube videos related to a conspiracy theory, and circumventing blocking measures. Source: Google.com/The Markup
\n", 114 | "
\n", 115 | "\n", 116 | "This anecdotal test suggested that Google was using a keyword blocklist that hid results for certain search terms, but not others. We performed further tests and found that searching for swears and strings of gibberish also surfaced no results. \n", 117 | "\n", 118 | "We wanted to verify if Google was using a keyword blocklist, and test how consistent that blocklist was with YouTube's \"advertiser friendly\" [guidelines](https://support.google.com/youtube/answer/6162278?hl=en). Unfortunately, the portal did not make it possible to discern between a blocked keyword and one that may have been too obscure to return any results.\n", 119 | "\n", 120 | "Our colleague Surya Mattu suggested using the web browser's built-in `developer tools` to monitor [network requests](https://doc.arcgis.com/en/appstudio/extend-apps/apinetworkrequest.htm) while we made searches in the portal. This proved to be a breakthrough that allowed us to isolate the API-endpoint being called during this process, reverse-engineer it to return results for any given keyword, and analyze the API's response before its contents were displayed in the portal.\n", 121 | "\n", 122 | "By looking closely at the API responses, we were able to identify clear structural differences based on Google's verdict of the keyword. Blocked terms returned an empty JSON string `{}`, whereas obscure terms returned a JSON with labels but no results:\n", 123 | "\n", 124 | "```{“videos”: [], “channels\": []}```\n", 125 | "\n", 126 | "With the categorization scheme established, we could confirm search terms were being blocked (read about this in detail [here](https://themarkup.org/google-the-giant/2021/04/08/how-we-discovered-googles-hate-blocklist-for-ad-placements-on-youtube#data-collection)). 
Moreover, with the API at our service, we could test any set of keywords, so we tested well-known [hate terms](https://themarkup.org/google-the-giant/2021/04/08/how-we-discovered-googles-hate-blocklist-for-ad-placements-on-youtube#sourcing-hate-keywords) and phrases related to \"racial justice and representation\" that we asked [independent advocacy groups](https://themarkup.org/google-the-giant/2021/04/09/how-we-discovered-googles-social-justice-blocklist-for-youtube-ad-placements#sourcing-social-justice-keywords) to send us. \n", 127 | "\n", 128 | "After testing the two keyword lists, we saw a pattern of Google blocking racial justice terms (like \"Black power\"), while showing advertisers results for well-known hate terms (like \"White power\").\n", 129 | "\n", 130 | "
\n", 131 | "\n", 132 | "
\n", 133 | "\n", 134 | "This was my first time finding and using an undocumented API for an investigation. Doing so revealed essential information that was not visible to everyday users, and it allowed us to perform systematic tests and bring receipts." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Before we dive deeper into how to find undocumented APIs, it's important to note that some APIs are \"official\" and well-documented to the public. These APIs can also be used to great effect.\n", 142 | "\n", 143 | "## Documented APIs\n", 144 | "\n", 145 | "Many businesses sell their services using APIs.\n", 146 | "\n", 147 | "The benefit of documented APIs is self-explanatory, you know what you are going to get, and there are notes and examples to help developers use the tool as intended.\n", 148 | "\n", 149 | "Some documented APIs are also free to use, making them a great tool for teaching and research. \n", 150 | "One such API that journalists frequent is the [Census Data API](https://www.census.gov/data/developers/guidance/api-user-guide.html), which we use to retrieve statistical survey data from across the United States.\n", 151 | "Unfortunately, free APIs can often disappear or have their access severely limited-- as we've seen with Twitter (no longer free), YouTube (severely restricted), and Facebook (deprecated entirely)." 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "### How have Documented APIs been used?\n", 159 | "\n", 160 | "- [Gender Shades](https://gendershades.org/) was an audit of three commercially-available facial recognition APIs used to automate gender classification (from Microsoft, IBM, and Face++). The authors created a benchmark image dataset of faces hand-labeled by gender and skin tone, and tested each facial recognition model by sending the benchmark dataset through each respective model's API [@pmlr-v81-buolamwini18a]. 
The authors found that many models had high error rates for female and Black faces, with the worst performance on Black female faces.\n", 161 | "\n", 162 | "- Google's Perspective API was developed to filter out toxic comments for publishers such as [The New York Times](https://www.nytimes.com/2017/06/13/insider/have-a-comment-leave-a-comment.htm). Importantly, Perspective used \"[training data](https://en.wikipedia.org/wiki/Training,_validation,_and_test_data_sets)\" sourced from human-labeled [Wikipedia](https://en.wikipedia.org/wiki/Artificial_intelligence_in_Wikimedia_projects) edits. [An academic study](https://maartensap.com/pdfs/sap2019risk.pdf) found racially biased classifications of Tweets. For example, the use of certain identifiers for minority groups would flag a comment as \"toxic\" [@sap-etal-2019-risk]. Because Google had released the API publicly, researchers could access and audit this technology directly through the API.\n", 163 | "\n", 164 | "Now, let's get back to APIs that are undocumented and hidden." 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Undocumented APIs\n", 172 | "\n", 173 | "These are the unsung heroes making sure websites run, often times executing essential functions behind the scenes.\n", 174 | "Many of these functions are so mundane, you probably don't even realize that something is happening.\n", 175 | "\n", 176 | "If you spend time on social media platforms, you'll find that the good times keep rolling, and you'll never reach the end of the page. That is because \"infinite scroll\" is powered by an API that is called upon as you approach the bottom of the page to load more fun things to eat up your day.\n", 177 | "\n", 178 | "Sometimes engineers find these API endpoints and build open source software to access public data programmatically. 
See [Instaloader](https://instaloader.github.io/) (for Instagram) as an example.\n", 179 | "\n", 180 | "Learning how to find and use these publicly available APIs can help you build evidence and test hypotheses that are otherwise unreachable due to lack of access." 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": {}, 186 | "source": [ 187 | "# Case Studies\n", 188 | "\n", 189 | "## *How have undocumented APIs been used?*\n", 190 | "\n", 191 | "Journalists and researchers have used undocumented APIs to catalog Amazon Ring's sprawling surveillance network [@calacci-2022; @gizmodo-ring-2019], measure inequities in internet access using Facebook's advertising ecosystem [@garcia-2018], and parse complex government documents listing presidential appointees [@willis-plum].\n", 192 | "\n", 193 | "Using undocumented APIs has three key strengths:\n", 194 | "\n", 195 | "1. **Richness**: APIs often contain information that is not visible on web pages.
\n", 196 | "2. **Reliability**: APIs execute essential functions, so they don't change often. This can make for a reliable data source over time.
\n", 197 | "3. **Scalability**: You can collect more information in less time using this method compared to [headless browsers](https://en.wikipedia.org/wiki/Headless_browser), such as Selenium, Puppeteer, and Playwright (Not throwing shade-- these tools have their purpose).\n", 198 | "\n", 199 | "Next we will cover three case studies, each of which is intended to highlight one of these benefits." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Case study on richness: Google's blocklist for YouTube advertisers\n", 207 | "\n", 208 | "I'm not going to rehash this case study, since we led with it in the [introduction](#a-key-tool-for-investigations), but...\n", 209 | "\n", 210 | "Using undocumented APIs can reveal **rich** metadata. This includes hidden fields that are not displayed to everyday users of a website, as well as subtle changes to the structural in how data is returned. \n", 211 | "\n", 212 | "Using this metadata produces receipts you can follow by deciphering the meaning of these hidden fields, finding traces left by missing data, and identifying patterns that are otherwise hidden from the surface (front-end) world.\n", 213 | "\n", 214 | "Certainly this was the case with the YouTube investigation, and something that we'll brush on again in the hands-on tutorial at the end of this section." 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Case study on reliability: Amazon branded products\n", 222 | "\n", 223 | "If you have ever scraped HTML from a website, you've likely found yourself with a broken scraper. \n", 224 | "\n", 225 | "This occurs when class names, accessibility labels, text, or something else has changed and confused your scraper. 
In this sense, HTML scraping can be fragile and fickle, especially if you're collecting data over a prolonged period of time.\n", 226 | "\n", 227 | "A stark example is Facebook's timeline, where you'll find [elements](https://developer.mozilla.org/en-US/docs/Glossary/Element) of the page are arbitrarily named, oddly-nested, and ever-changing.\n", 228 | "\n", 229 | "Using undocumented APIs can often get you the same information with a higher success-rate. This is because these APIs interact with the same backend (fetching information before being rendered, named, and nestled neatly into a webpage), and are often essential to the operation of the website.\n", 230 | "\n", 231 | "In the investigation \"[Amazon's Advantage](https://themarkup.org/amazons-advantage/2021/10/14/how-we-analyzed-amazons-treatment-of-its-brands-in-search-results)\", Adrianne Jeffries and I found a **reliable** method of identifying Amazon brands and exclusive products. At the time, these products were not clearly labeled, most Americans we surveyed were unable to identify Amazon's top brands, and no source of truth existed. \n", 232 | "\n", 233 | "We developed an approach to identify these products as Amazon private label using a filter found in the user interface of Amazon's website. The \"Our brands\" filter did a lot of heavy lifting in our investigation, and we found that it was powered by an undocumented API that listed all the Amazon branded products for a given search.\n", 234 | "\n", 235 | "This method was key to our investigation, which required persistent data collection over a period of several months. To our surprise, the API continued to work after we went to Amazon for comments on our detailed methodology, after we published our investigation, and even after Amazon executives were accused of perjury by members of the U.S. 
Congress.\n", 236 | "\n", 237 | "Usually the party gets shut down once you call the parents, but in this case it didn't.\n", 238 | "\n", 239 | "Because the API continued to work, we used it in a browser extension ([_Amazon Brand Detector_](https://themarkup.org/amazons-advantage/2021/11/29/introducing-amazon-brand-detector)) that we (including Ritu Ghiya and Jeff Crouse) built to highlight Amazon brands for shoppers around the globe. About half a year later, Amazon added an orange disclaimer of \"Amazon brand\" to their branded products, but the API and extension still work at the time of writing, more than a year later.\n", 240 | "\n", 241 | "This case study emphasizes the reliability of using undocumented APIs, not only for collecting datasets, but for persistent accountability efforts." 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## Case study on scalability: collecting Internet Plans\n", 249 | "\n", 250 | "In the investigation, \"[Still Loading](https://themarkup.org/show-your-work/2022/10/19/how-we-uncovered-disparities-in-internet-deals)\" my reporting partner Aaron Sankin and I collected and analyzed over 1 million internet service plans across major cities in the United States. \n", 251 | "\n", 252 | "We learned a technique from a trio of researchers from Princeton, that used the lookup tools found on the internet service providers' websites to retrieve internet plans for a specific address [@princeton-2020].\n", 253 | "\n", 254 | "However, doing this using a browser (as a real person would) is incredibly slow. Even with 10 automated browsers (see below) with unique IP addresses, it would have taken months to collect a representative sample of a single major American city." 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "
\n", 262 | "
Automating checking for internet plans from AT&T using Selenium browser automation.
\n", 265 | "
\n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "Browser automation is bulky. Not only do you need to load every asset of a web page, there is also the compute resources necessary to spin up a browser. When you can get away without having to mock user interactions, or use rendered page elements, finding the underlying API(s) can be quicker and more eloquent.\n", 273 | "\n", 274 | "Initially, the workflow for getting an internet plan seemed too complex to pull off using an API-- there was user authentication that set a cookie, choosing an address from a list of suggestions, and adding an apartment number when prompted.\n", 275 | "\n", 276 | "However, we were able to keep track of cookies using a `session` (read about this advanced topic [here](https://requests.readthedocs.io/en/latest/user/advanced/#session-objects)), and speed things up by bundling the sequence of successive API calls into a function.\n", 277 | "\n", 278 | "Not only was this function easier to write, but it was able to be written and executed [asynchronously](https://en.wikipedia.org/wiki/Asynchrony_(computer_programming)). Meaning we could request internet plans from many addresses at the same time.\n", 279 | "\n", 280 | "This allowed us to collect AT&T internet plans for a representative sample of 21 cities in two days, rather than two years. \n", 281 | "\n", 282 | "Timely data collection is key. Solving this issue allowed us to be ambitious in the scope of our investigation, which [ultimately found](https://themarkup.org/still-loading/2022/10/19/dollars-to-megabits-you-may-be-paying-400-times-as-much-as-your-neighbor-for-internet-service) that Internet pricing disparities were common for lower-income, least-White, and historically redlined areas.\n", 283 | "\n", 284 | "When it comes to web scraping, undocumented APIs offer unmatched **scalability** to collect massive amounts of data. 
This is especially true when you orchestrate them with asynchronous and multi-threaded programming (another topic we plan to cover in a future section).\n", 285 | "\n", 286 | "Although the process of finding undocumented APIs is not too complicated (as you'll see in the tutorial), the chances of finding one that is helpful for your investigation or research are still quite low. Don't be deterred, that just makes finding a useful one more special." 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "# Tutorial" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "## *How to find and use undocumented APIs*\n", 301 | "\n", 302 | "In this exercise, you'll learn to sniff out undocumented APIs using the web browser’s `developer tools` (shortened to dev tools), figure out how they work, test different inputs, and analyze API responses.\n", 303 | "\n", 304 | "You can do most of this tutorial with zero coding, but it'll hold you back from using APIs to their fullest.\n", 305 | "\n", 306 | "::: {.callout-note}\n", 307 | "Note that if you're in a workshop setting: hitting the example API at the same time will get us all blocked from the website!\n", 308 | ":::" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "### 1. First open the developer console. \n", 316 | "\n", 317 | "See how on [Chrome](https://developer.chrome.com/docs/devtools/open/) or [Firefox](https://developer.mozilla.org/en-US/docs/Learn/Common_questions/What_are_browser_developer_tools) here. \n", 318 | "\n", 319 | "In this tutorial, we'll see how Amazon.com autocomplete search suggestions work.\n", 320 | "\n", 321 | "One way to get to the dev tools it to right-click and “Inspect” an element on the page. " 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "
\n", 329 | "\n", 330 | "
Example of inspecting an element on a page using a right-click
\n", 331 | "
" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "\n", 339 | "This will open the dev tools under the “Elements” tab, which is used to explore the source code of a page. \n", 340 | "\n", 341 | "Page source code is useful because it reveals clues that are otherwise unseen by regular users. Often times, clues are in accessibility features known as ARIA elements.\n", 342 | "\n", 343 | "However, this tutorial is not about source code... it's about API requests that populate what we see on the page, and the hidden fields that we don't see.\n", 344 | "\n", 345 | "Let's try this!\n", 346 | "\n", 347 | "With dev tools open, go to Amazon.com, select the search bar on the website, and start typing a query (such as \"spicy\")." 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "### 2. Click the “Network” tab.\n", 355 | "\n", 356 | "This section of the dev tools is used to monitor network requests.\n", 357 | "\n", 358 | "*Background*\n", 359 | "\n", 360 | "Everything on a page is retrieved from some outside source, likely a server. This includes things like images embedded on the page, JavaScript code running in the background, and all the bits of “content” that populate the page before us.\n", 361 | "\n", 362 | "Using the `Network` tab, we can find out how this information is requested from a server, and intercept the response before it is rendered on the page.\n", 363 | "\n", 364 | "These responses are information-rich, and contain fields that don’t end up in the source code *or* in the user interface that most people encounter when they visit a site.\n", 365 | "\n", 366 | "Further, we can reverse-engineer how this request is made, and use it to collect structured data at scale. This is the power of finding undocumented APIs.\n", 367 | "\n", 368 | "*Back to the console...*\n", 369 | "\n", 370 | "The `Network` tab can look pretty hectic at first. 
It has many uses, and a lot of information. We'll cover some of the basics." 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "### 3. Filter requests by fetch/XHR\n", 387 | "\n", 388 | "This will reveal only API calls made to servers. This includes internal servers that are hosted by the website we’re inspecting, as well as external servers. The latter often includes [third-party trackers](https://themarkup.org/blacklight) used in adtech, and verification services to authenticate user behavior." 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "You might see quite a few network requests that were loaded onto the page. Look at \"Domain\" and \"File\" to narrow down where requests were sent, and whether the names are telling of the purpose of the request. \n", 405 | "\n", 406 | "::: {.callout-tip}\n", 407 | "#### Pro tip:\n", 408 | "You can \"Filter URLs\" using different properties (see how to do this for [Chrome](https://developer.chrome.com/docs/devtools/network/reference/#filter-by-property) and [Firefox](https://firefox-source-docs.mozilla.org/devtools-user/network_monitor/request_list/index.html#filtering-requests)).\n", 409 | ":::\n", 410 | "\n", 411 | "In this example, notice that a request was sent to the \"Domain\" `completion.amazon.com`, using an API endpoint (in the \"File\" column) named `suggestions`. This is likely the API being called to populate autocompleted search suggestions on the Amazon marketplace. Reading \"File\" names can help determine each API's function.\n", 412 | "\n", 413 | "When clicking the network request, you'll see \"Headers\". 
Those are the [HTTP headers](https://developer.mozilla.org/en-US/docs/Glossary/Request_header) that were sent along with the network request. This is not useful for us _just yet_, instead we want to see what data gets transferred as a result of the API call.\n", 414 | "\n", 415 | "To do this, we'll look at the request's \"Response\" attributes." 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "### 4. Analyze the response\n", 423 | "This might seem intimidating at first, but let me _key_ you in on some tips. Responses are almost always JSON-formatted. JSON is made up of lists and [key-value](https://en.wikipedia.org/wiki/Name%E2%80%93value_pair) pairs. This means the information is stored like a dictionary, with words and their corresponding definitions.\n", 424 | "\n", 425 | "Looking at the JSON response, it looks like Amazon's `completion.amazon.com/suggestions` API returns a list of \"suggestions\". Each item in the list of suggestions has a \"value\", in the example above that \"value\" is `spicy ramen`. \n", 426 | "\n", 427 | "**Check your work**: confirm this interpretation is correct by cross-referencing the API response with what a user would see on the website.\n", 428 | "\n", 429 | "\n", 430 | "
\n", 431 | "\n", 432 | "
Amazon's suggestions for \"spicy\".
\n", 433 | "
\n", 434 | "\n", 435 | "Another check you can perform `CTRL+F` the JSON response for a unique string. This could be a string of text on the page (or something else) that serves as a unique tracer. Verifying its presence will help pinpoint the right API call." 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "Getting these steps down, is your one way ticket to spicy town, and you don't need to code at all.\n", 443 | "\n", 444 | "However, some rudimentary coding can help you figure out how to use the API for vast inputs to collect your own dataset." 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "### 5. Copy as cURL\n", 452 | "\n", 453 | "If you find an HTTP request that returns a response with useful information you can start to reverse-engineer it. To do that, we can isolate it by right-clicking the HTTP request and selecting “copy as cURL”. ([cURL](https://developer.ibm.com/articles/what-is-curl-command/) stands for client URL, and is a tool used to transfer data across networks.)\n", 454 | "\n", 455 | "" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "### 6. Curl to requests\n", 463 | "We can use a site like [curlconverter.com](https://curlconverter.com/) to convert the cURL we copied into a reusable API call. In this example, we use the default conversion to a Python `requests` script. 
You can do the same for any language and framework.\n", 464 | "\n", 465 | "Here is what the converted cURL looks like after being converted to a Python request:" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 1, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "import requests\n", 475 | "\n", 476 | "cookies = {\n", 477 | " 'aws-ubid-main': '836-8365128-6734270',\n", 478 | " 'session-id-time': '2082787201l',\n", 479 | " 'ubid-main': '135-7086948-2591317',\n", 480 | " 'aws-priv': 'eyJ2IjoxLCJldSI6MCwic3QiOjB9',\n", 481 | " 'aws-target-static-id': '1593060129944-225088',\n", 482 | " 'lc-main': 'en_US',\n", 483 | " 'x-main': 'Oz3Tb5n2p0ic7OhF3cU5dc9B4ZR2gFjhKEsP4zikHHD3Gk2O7NpSmuShBxLFrhpZ',\n", 484 | " 'at-main': 'Atza|IwEBILB5ARQ_IgTCiBLam_XE2pyT76jXTbAXHOm2AJomLPmDgoJUJIIlUmyFeh_gChLHCycKjNlys-5CqqMabKieAzqSf607ChJsNevw-V06e7VKgcWjvoMaZRWlGiZ-c5wSJ-e4QzIWzAxTS1EI6sRUaRZRv-a0ZpOJQ-sHHB99006ytcrHhubdrXYPJRqEP5Q-_30JtESMpAkASoOs4vETSFp5BDBJfSWWETeotpIVXwA4NoC8E59bZb_5wHTW9cRBSWYGi1XL7CRl2xGbJaO2Gv3unuhGMB1tiq9iwxodSPBBTw',\n", 485 | " 'sess-at-main': '\"PUq9PW1TbO9CTYhGMo7l1Dz+wedh40Ki8Z9rPC+1TSI=\"',\n", 486 | " 'sst-main': 'Sst1|PQHsbeSFCMSY0X0_WgvTo5NUCaZkG2J9RPqWWy0fCpyWopJXgu6_drU_LstOdJB2cDmaVCXwkNpsF5yNPrBDj3Wtx-TC-AaYZn6WUdp8vNRPb6iYqxPAjRDnfK3pCnHqt19I0GoG7Bd1wnOxkAvnH0992IUq14kH6Ojm0J8noVPwMez0lltD-jxBwtDQ_EZYUkZG741RDVEojfziawJY9iKc-cLCnKmhi-ca1PPJnsimPV4lXRtMAGFbf9nMkKq4CbpkaRMdVtlPr20vF9eqg_V_-LY_V7S44WlO-_t_bFBnK8Q',\n", 487 | " 'i18n-prefs': 'USD',\n", 488 | " 'session-token': 'ptze73uznXExrMCSV9AklvNOKa1ND9F0rlQH2ioSM26Vr6hSheH8O4v4P8Lg3zuv7oDM+HZ+8f2TlyoPXUmPShprMXdvEpAQieXUw7+83PZOJvkkg1jwP0NiG0ZqksIYOr3Zuwt3omMcfCKRReWKxl5rGaDEM6AISpwI5aMDDCnA7fWbVO/QQYNxUZMifc599EZ5Fg3uGjCAhBlb6I7UO8ewRbXJ1bo9',\n", 489 | " 'session-id': '139-9925917-2023535',\n", 490 | " 'aws-userInfo-signed': 
'eyJ0eXAiOiJKV1MiLCJrZXlSZWdpb24iOiJ1cy1lYXN0LTEiLCJhbGciOiJFUzM4NCIsImtpZCI6ImFhNDFkZjRjLTMxMzgtNGVkOC04YmU5LWYyMzUzYzNkOTEzYiJ9..LWFZOJMDcYdu6od6Nk8TmhAFMGA9O98O4tIOsVlR7w5vAS_JgVixL8j75u6jTgjfWkdddhKqa5kgsXDmGNbjhzLIsD48ch1BUodlzxqeQfn0r8onIwLbUIHEnk6X-AJE',\n", 491 | " 'skin': 'noskin',\n", 492 | "}\n", 493 | "\n", 494 | "headers = {\n", 495 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0',\n", 496 | " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", 497 | " 'Accept-Language': 'en-US,en;q=0.5',\n", 498 | " # 'Accept-Encoding': 'gzip, deflate, br',\n", 499 | " 'Origin': 'https://www.amazon.com',\n", 500 | " 'Connection': 'keep-alive',\n", 501 | " 'Referer': 'https://www.amazon.com/',\n", 502 | " \n", 503 | " 'Sec-Fetch-Dest': 'empty',\n", 504 | " 'Sec-Fetch-Mode': 'cors',\n", 505 | " 'Sec-Fetch-Site': 'same-site',\n", 506 | "}\n", 507 | "\n", 508 | "params = {\n", 509 | " 'limit': '11',\n", 510 | " 'prefix': 'spicy',\n", 511 | " 'suggestion-type': [\n", 512 | " 'WIDGET',\n", 513 | " 'KEYWORD',\n", 514 | " ],\n", 515 | " 'page-type': 'Gateway',\n", 516 | " 'alias': 'aps',\n", 517 | " 'site-variant': 'desktop',\n", 518 | " 'version': '3',\n", 519 | " 'event': 'onKeyPress',\n", 520 | " 'wc': '',\n", 521 | " 'lop': 'en_US',\n", 522 | " 'last-prefix': '\\0',\n", 523 | " 'avg-ks-time': '2486',\n", 524 | " 'fb': '1',\n", 525 | " 'session-id': '139-9925917-2023535',\n", 526 | " 'request-id': 'SVMTJXRDBQ9T8M7BRGNJ',\n", 527 | " 'mid': 'ATVPDKIKX0DER',\n", 528 | " 'plain-mid': '1',\n", 529 | " 'client-info': 'amazon-search-ui',\n", 530 | "}\n", 531 | "\n", 532 | "response = requests.get('https://completion.amazon.com/api/2017/suggestions', \n", 533 | " params=params, cookies=cookies, headers=headers)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "You can run this Python code, as-is, and it should work.\n", 541 | "\n", 542 | "Just a reminder that 
you can run this code interactively on [Google Colab](https://colab.research.google.com/) by [copying this tutorial](https://colab.research.google.com/github/yinleon/inspect-element/blob/main/apis.ipynb) (it's written as a Jupyter Notebook), or running a \"new notebook\" and pasting the code in.\n", 543 | "\n", 544 | "Press the little play button on the left to run the code. You should see something that looks similar to what you saw in the inspector." 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "# to see the response, run this cell:\n", 554 | "response.json()" 555 | ] 556 | }, 557 | { 558 | "cell_type": "markdown", 559 | "metadata": {}, 560 | "source": [ 561 | "### 7. Strip it down\n", 562 | "\n", 563 | "You might be overwhelmed with the parameters that go into this API request. Like the response output, the inputs are formatted like a JSON, too. Start removing these parameters one-by-one. \n", 564 | "\n", 565 | "Keep parameters for authentication, and also the input parameters that you can change for your own purposes. 
Notice that the example query of \"spicy\" stored in the `prefix` parameter.\n", 566 | "\n", 567 | "::: {.callout-tip}\n", 568 | "#### Pro tip:\n", 569 | "Parameter values can expire, so periodically test the request and each parameter to assure you only keep the shelf-stable parts.\n", 570 | ":::" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 2, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "headers = {\n", 580 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0',\n", 581 | " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", 582 | " 'Accept-Language': 'en-US,en;q=0.5',\n", 583 | "}\n", 584 | "\n", 585 | "params = {\n", 586 | " 'prefix': 'spicy',\n", 587 | " 'suggestion-type': [\n", 588 | " 'WIDGET',\n", 589 | " 'KEYWORD',\n", 590 | " ],\n", 591 | " 'alias': 'aps',\n", 592 | " 'plain-mid': '1',\n", 593 | "}\n", 594 | "\n", 595 | "response = requests.get('https://completion.amazon.com/api/2017/suggestions', params=params, headers=headers)" 596 | ] 597 | }, 598 | { 599 | "cell_type": "markdown", 600 | "metadata": {}, 601 | "source": [ 602 | "### 8. Recycle and reuse\n", 603 | "\n", 604 | "With the stripped down request, try to submit a few— let’s say 10 or 20, requests with new parameters set by you.\n", 605 | "\n", 606 | "For convenience, we can write the stripped down API call as a function that takes any `keyword` as input." 
607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 3, 612 | "metadata": {}, 613 | "outputs": [], 614 | "source": [ 615 | "import pandas as pd\n", 616 | "import time\n", 617 | "\n", 618 | "def search_suggestions(keyword):\n", 619 | " \"\"\"\n", 620 | " Get autocompleted search suggestions for a `keyword` search on Amazon.com.\n", 621 | " \"\"\"\n", 622 | " headers = {\n", 623 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:100.0) Gecko/20100101 Firefox/100.0',\n", 624 | " 'Accept': 'application/json, text/javascript, */*; q=0.01',\n", 625 | " 'Accept-Language': 'en-US,en;q=0.5',\n", 626 | " }\n", 627 | "\n", 628 | " params = {\n", 629 | " 'prefix': keyword,\n", 630 | " 'suggestion-type': [\n", 631 | " 'WIDGET',\n", 632 | " 'KEYWORD',\n", 633 | " ],\n", 634 | " 'alias': 'aps',\n", 635 | " 'plain-mid': '1',\n", 636 | " }\n", 637 | "\n", 638 | " response = requests.get('https://completion.amazon.com/api/2017/suggestions', \n", 639 | " params=params, headers=headers)\n", 640 | " return response.json()" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": {}, 646 | "source": [ 647 | "In this step the code gets refactored to make it repeatable and reusable, but it's a bit of a jump if you're not a coder. If that's you, still try to read the code -- you should be able to get a rough idea of what does what, and how it's similar to the code you had in the last step. " 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": {}, 653 | "source": [ 654 | "Here we can set new input parameters in `keyword`, and make the an API call using each keyword. Try changing some of the code (eg. the keywords) and rerunning it to check your understanding." 
655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 4, 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "# Here are our inputs (what searches we'll get autocompleted)\n", 664 | "keywords = [\n", 665 | " 'a', 'b', 'cookie', 'sock', 'zelda', '12'\n", 666 | "]\n", 667 | "\n", 668 | "# Here we'll go through each input, get the suggestions, and then add the `suggestions` to a list.\n", 669 | "data = []\n", 670 | "for keyword in keywords:\n", 671 | " suggestions = search_suggestions(keyword)\n", 672 | " suggestions['search_word'] = keyword # keep track of the seed keyword\n", 673 | " time.sleep(1) # best practice to put some time between API calls.\n", 674 | " data.extend(suggestions['suggestions'])" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "We saved the API responses in a list called `data`, and put them into a [Pandas](https://pandas.pydata.org/) DataFrame to analyze." 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 5, 687 | "metadata": {}, 688 | "outputs": [ 689 | { 690 | "data": { 691 | "text/html": [ 692 | "
\n", 693 | "\n", 706 | "\n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | "
suggTypetypevaluerefTagcandidateSourcesstrategyIdpriorghosthelpqueryUnderstandingFeatures
4KeywordSuggestionKEYWORDasmanex twisthaler 30 inhalernb_sb_ss_i_5_1localorganic0.0FalseFalse[{'source': 'QU_TOOL', 'annotations': []}]
13KeywordSuggestionKEYWORDbathroom organizernb_sb_ss_i_4_1localorganic0.0FalseFalse[{'source': 'QU_TOOL', 'annotations': []}]
19KeywordSuggestionKEYWORDbaby wipesnb_sb_ss_i_10_1localorganic0.0FalseFalse[{'source': 'QU_TOOL', 'annotations': []}]
12KeywordSuggestionKEYWORDbaby registry searchnb_sb_ss_i_3_1localorganic0.0FalseFalse[{'source': 'QU_TOOL', 'annotations': []}]
16KeywordSuggestionKEYWORDb013xkha4m b08xzrxczm b07xxphqzk b09rwjblc7nb_sb_ss_i_7_1localorganic0.0FalseFalse[{'source': 'QU_TOOL', 'annotations': []}]
\n", 790 | "
" 791 | ], 792 | "text/plain": [ 793 | " suggType type value \\\n", 794 | "4 KeywordSuggestion KEYWORD asmanex twisthaler 30 inhaler \n", 795 | "13 KeywordSuggestion KEYWORD bathroom organizer \n", 796 | "19 KeywordSuggestion KEYWORD baby wipes \n", 797 | "12 KeywordSuggestion KEYWORD baby registry search \n", 798 | "16 KeywordSuggestion KEYWORD b013xkha4m b08xzrxczm b07xxphqzk b09rwjblc7 \n", 799 | "\n", 800 | " refTag candidateSources strategyId prior ghost help \\\n", 801 | "4 nb_sb_ss_i_5_1 local organic 0.0 False False \n", 802 | "13 nb_sb_ss_i_4_1 local organic 0.0 False False \n", 803 | "19 nb_sb_ss_i_10_1 local organic 0.0 False False \n", 804 | "12 nb_sb_ss_i_3_1 local organic 0.0 False False \n", 805 | "16 nb_sb_ss_i_7_1 local organic 0.0 False False \n", 806 | "\n", 807 | " queryUnderstandingFeatures \n", 808 | "4 [{'source': 'QU_TOOL', 'annotations': []}] \n", 809 | "13 [{'source': 'QU_TOOL', 'annotations': []}] \n", 810 | "19 [{'source': 'QU_TOOL', 'annotations': []}] \n", 811 | "12 [{'source': 'QU_TOOL', 'annotations': []}] \n", 812 | "16 [{'source': 'QU_TOOL', 'annotations': []}] " 813 | ] 814 | }, 815 | "execution_count": 5, 816 | "metadata": {}, 817 | "output_type": "execute_result" 818 | } 819 | ], 820 | "source": [ 821 | "df = pd.DataFrame(data)\n", 822 | "\n", 823 | "# show 5 random auto suggestions\n", 824 | "df.sample(5, random_state=303)" 825 | ] 826 | }, 827 | { 828 | "cell_type": "markdown", 829 | "metadata": {}, 830 | "source": [ 831 | "If you look at the columns, you might be flooded with more questions:\n", 832 | "\n", 833 | "- Some terms may be `blackListed`, what does that mean and what words, if any, are `blackListed = True`?
\n", 834 | "- Are some searches paid for, and not `organic`?
\n", 835 | "- What is `ghost`?
\n", 836 | "\n", 837 | "This metadata is only visible from the API, and can lead to new story ideas and directions to pursue. \n", 838 | "\n", 839 | "Unfortunately, because this API is undocumented, asking these questions and figuring out what everything represents is difficult. Use your curiosity and look at many examples. The feature of the API is being able to make many queries at scale, which should help answer these questions. Reporting this out with sources is also essential in this process." 840 | ] 841 | }, 842 | { 843 | "cell_type": "markdown", 844 | "metadata": {}, 845 | "source": [ 846 | "## Do it yourself\n", 847 | "Find an API in the wild, isolate it, strip it down, reverse-engineer it and analyze some of its results.\n", 848 | "\n", 849 | "If a website has a search bar or a text box that queries a server or database, there's a good chance that you can find an API.\n", 850 | "\n", 851 | "Revisit the steps we outlined above, and apply them to a new website.\n", 852 | "If you aren't a coder, try to get steps 1-6 (I believe in you!).\n" 853 | ] 854 | }, 855 | { 856 | "cell_type": "code", 857 | "execution_count": null, 858 | "metadata": {}, 859 | "outputs": [], 860 | "source": [] 861 | }, 862 | { 863 | "cell_type": "markdown", 864 | "metadata": {}, 865 | "source": [ 866 | "If you are a coder, try some of the advanced usage below.\n", 867 | "\n", 868 | "### For advanced usage...\n", 869 | "- Handle errors for bad requests, rate limiting, and other issues that could arise.
\n", 870 | "- Restructure the API response to better analyze (called \"parsing\" the data).
 \n", 871 | "- You can use `session` instead of pure requests. This is helpful if cookies and authentication are involved. Read more about that [here](https://requests.readthedocs.io/en/latest/user/advanced/#session-objects).
 \n", 872 | "- You can make a request asynchronous to speed up data collection (without overloading the site's servers, of course).
\n", 873 | "- Implement steps 6-onwards in another programming language.
\n" 874 | ] 875 | }, 876 | { 877 | "cell_type": "markdown", 878 | "metadata": {}, 879 | "source": [ 880 | "## Predetermined prompts\n", 881 | "Don't know where to look? Here are some ideas:\n", 882 | "\n", 883 | "- YouTube recommendations.
\n", 884 | "- [Blacklight](https://themarkup.org/blacklight)'s API to find third-party trackers on a website.
\n", 885 | "- Amtrak's train statuses." 886 | ] 887 | }, 888 | { 889 | "cell_type": "markdown", 890 | "metadata": {}, 891 | "source": [ 892 | "## Homework assignment\n", 893 | "\n", 894 | "**Practice**: Keep trying to find APIs in the wild. Think about the websites you frequent, topics that interest you, or stories you're currently working on. You won't always find an API, and that's OK.\n", 895 | "\n", 896 | "**Scoping**: Determine how the API could be used to produce data to answer a reporting question or hypothesis. What will be your sample for a quick test, i.e. how many data points are enough to know if you have something? \n", 897 | "\n", 898 | "**Reporting**: Determine the meaning and significance of hidden fields that are returned.\n", 899 | "\n", 900 | "Ultimately APIs are a tool, and data is useless without a purpose. Hopefully this worksheet helps you in your time of need." 901 | ] 902 | }, 903 | { 904 | "cell_type": "markdown", 905 | "metadata": {}, 906 | "source": [ 907 | "# Related readings\n", 908 | "More tutorials on the same subject:\n", 909 | "\n", 910 | "- [\"Scraping XHR\"](https://scrapism.lav.io/scraping-xhr/) - Sam Lavigne
\n", 911 | "- [\"Web Scraping 201: finding the API\"](https://www.gregreda.com/2015/02/15/web-scraping-finding-the-api/) - Greg Reda
 \n", 912 | "- [\"How to use undocumented web APIs\"](https://jvns.ca/blog/2022/03/10/how-to-use-undocumented-web-apis/) - Julia Evans\n", 913 | "\n", 914 | "Topical and timeless:\n", 915 | "\n", 916 | "- [\"Computational research in the post-API age\"](https://dfreelon.org/publications/2018_Computational_research_in_the_postAPI_age.pdf) - Deen Freelon\n", 917 | "\n", 918 | "Notable investigations and audits using undocumented APIs:\n", 919 | "\n", 920 | "- [\"Ring’s Hidden Data Let Us Map Amazon's Sprawling Home Surveillance Network\"](https://gizmodo.com/ring-s-hidden-data-let-us-map-amazons-sprawling-home-su-1840312279) - Dell Cameron and Dhruv Mehrotra
 \n", 921 | "- \"[Porch piracy: are we overreacting to package thefts from doorsteps?](https://www.theguardian.com/us-news/2022/aug/25/porch-piracy-package-thefts-doorstep-delivery)\" - Lam Thuy Vo
\n", 922 | "- \"[The Cop in Your Neighbor's Doorbell](https://site.dcalacci.net/papers/ring-cscw-2021.pdf)\" - Dan Calacci et al.\n", 923 | "- \"[Analyzing gender inequality through large-scale Facebook advertising data](https://www.pnas.org/doi/full/10.1073/pnas.1717781115)\" - David Garcia et al.
\n", 924 | "- \"[Freeing the Plum Book](https://source.opennews.org/articles/freeing-plum-book/)\" - Derek Willis
\n", 925 | "\n", 926 | "Please reach out with more examples to add." 927 | ] 928 | }, 929 | { 930 | "cell_type": "markdown", 931 | "metadata": {}, 932 | "source": [ 933 | "## Artifacts\n", 934 | "Slides from workshops can be found here:\n", 935 | "\n", 936 | "[2023-02-24 @ Tow Center Columbia](https://docs.google.com/presentation/d/1e1QoSNXv2m90lhhyUMSzUlMxXJD_Ar5DtC43PcpTcWU)
\n", 937 | "[2023-03-04 @ NICAR](https://docs.google.com/presentation/d/1hWMqcBNfs9BbaVywMGJPf_BcR9PpVLlHPCGBsAzz-No/)
\n", 938 | "[2023-06-12 @ FAccT](https://docs.google.com/presentation/d/1-9rODLyxJawasNIn_rp9E_oHUOB1SUGCpAb8TbmyxKU/edit?usp=sharing)
\n", 939 | "[2023-06-14 @ Journocoders](https://paper.dropbox.com/doc/Journocoders-June-2023-1t7nrjYNWoiPhK0sx8rH0)
\n", 940 | "[2023-06-22 @ C+J DATAJ](https://docs.google.com/presentation/d/10_mWNwr_fsrX0r8e6xWhFruhdUZUbLEA9HqLI3HtEFg)" 941 | ] 942 | }, 943 | { 944 | "cell_type": "markdown", 945 | "metadata": {}, 946 | "source": [ 947 | "# Citation\n", 948 | "\n", 949 | "To cite this chapter, please use the following BibTex entry:\n", 950 | "\n", 951 | "
\n",
 952 |     "@incollection{inspect2023browser,\n",
 953 |     "  author    = {Yin, Leon},\n",
 954 |     "  title     = {Finding Undocumented APIs},\n",
 955 |     "  booktitle = {Inspect Element: The practitioner's guide to hypothesis-driven data investigations},\n",
 956 |     "  year      = {2023},\n",
 957 |     "  editor    = {Yin, Leon and Sapiezynski, Piotr},\n",
 958 |     "  note      = {\\url{https://inspectelement.org}}\n",
 959 |     "}\n",
 960 |     "
\n", 961 | "\n", 962 | "## Acknowledgements\n", 963 | "\n", 964 | "Thank you Max Harlow for suggestions in making the tutorial easier to understand, and a well-compressed thank you to Simon Fondrie-Teitler for helping optimize this page." 965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": null, 970 | "metadata": {}, 971 | "outputs": [], 972 | "source": [] 973 | } 974 | ], 975 | "metadata": { 976 | "kernelspec": { 977 | "display_name": "Python 3", 978 | "language": "python", 979 | "name": "python3" 980 | }, 981 | "language_info": { 982 | "codemirror_mode": { 983 | "name": "ipython", 984 | "version": 3 985 | }, 986 | "file_extension": ".py", 987 | "mimetype": "text/x-python", 988 | "name": "python", 989 | "nbconvert_exporter": "python", 990 | "pygments_lexer": "ipython3", 991 | "version": "3.7.3" 992 | }, 993 | "vscode": { 994 | "interpreter": { 995 | "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6" 996 | } 997 | } 998 | }, 999 | "nbformat": 4, 1000 | "nbformat_minor": 4 1001 | } 1002 | -------------------------------------------------------------------------------- /assets/att-scraper-selenium.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/att-scraper-selenium.mp4 -------------------------------------------------------------------------------- /assets/blm-wlm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/blm-wlm.png -------------------------------------------------------------------------------- /assets/blocked-terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/blocked-terms.png 
-------------------------------------------------------------------------------- /assets/browser-automation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser-automation.gif -------------------------------------------------------------------------------- /assets/browser0_01_threestep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser0_01_threestep.png -------------------------------------------------------------------------------- /assets/browser0_01_threestep1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser0_01_threestep1.png -------------------------------------------------------------------------------- /assets/browser0_02_soup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser0_02_soup.png -------------------------------------------------------------------------------- /assets/browser0_02_soup1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser0_02_soup1.png -------------------------------------------------------------------------------- /assets/browser1_01_version.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_01_version.png 
-------------------------------------------------------------------------------- /assets/browser1_01_version1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_01_version1.png -------------------------------------------------------------------------------- /assets/browser1_02_tiktok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_02_tiktok.png -------------------------------------------------------------------------------- /assets/browser1_02_tiktok1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_02_tiktok1.png -------------------------------------------------------------------------------- /assets/browser1_03_dismiss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_03_dismiss.png -------------------------------------------------------------------------------- /assets/browser1_03_dismiss1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_03_dismiss1.png -------------------------------------------------------------------------------- /assets/browser1_04_inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_04_inspect.png 
-------------------------------------------------------------------------------- /assets/browser1_04_inspect1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_04_inspect1.png -------------------------------------------------------------------------------- /assets/browser1_05_inspect_tiktok_a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_05_inspect_tiktok_a.png -------------------------------------------------------------------------------- /assets/browser1_05_inspect_tiktok_a1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_05_inspect_tiktok_a1.png -------------------------------------------------------------------------------- /assets/browser1_05_inspect_tiktok_b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_05_inspect_tiktok_b.png -------------------------------------------------------------------------------- /assets/browser1_05_inspect_tiktok_b1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/browser1_05_inspect_tiktok_b1.png -------------------------------------------------------------------------------- /assets/copy-curl.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/copy-curl.png -------------------------------------------------------------------------------- /assets/dev-console.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/dev-console.mp4 -------------------------------------------------------------------------------- /assets/favicon/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/.DS_Store -------------------------------------------------------------------------------- /assets/favicon/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/android-chrome-192x192.png -------------------------------------------------------------------------------- /assets/favicon/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/android-chrome-512x512.png -------------------------------------------------------------------------------- /assets/favicon/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/apple-touch-icon.png -------------------------------------------------------------------------------- /assets/favicon/favicon-16x16.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/favicon-16x16.png -------------------------------------------------------------------------------- /assets/favicon/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/favicon-32x32.png -------------------------------------------------------------------------------- /assets/favicon/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon/favicon.ico -------------------------------------------------------------------------------- /assets/favicon/site.webmanifest: -------------------------------------------------------------------------------- 1 | {"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"} -------------------------------------------------------------------------------- /assets/favicon_/android-chrome-192x192.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/android-chrome-192x192.png -------------------------------------------------------------------------------- /assets/favicon_/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/android-chrome-512x512.png 
-------------------------------------------------------------------------------- /assets/favicon_/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/apple-touch-icon.png -------------------------------------------------------------------------------- /assets/favicon_/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/favicon-16x16.png -------------------------------------------------------------------------------- /assets/favicon_/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/favicon-32x32.png -------------------------------------------------------------------------------- /assets/favicon_/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_/favicon.ico -------------------------------------------------------------------------------- /assets/favicon_/site.webmanifest: -------------------------------------------------------------------------------- 1 | {"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"} -------------------------------------------------------------------------------- /assets/favicon_alt/android-chrome-192x192.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/android-chrome-192x192.png -------------------------------------------------------------------------------- /assets/favicon_alt/android-chrome-512x512.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/android-chrome-512x512.png -------------------------------------------------------------------------------- /assets/favicon_alt/apple-touch-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/apple-touch-icon.png -------------------------------------------------------------------------------- /assets/favicon_alt/favicon-16x16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/favicon-16x16.png -------------------------------------------------------------------------------- /assets/favicon_alt/favicon-32x32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/favicon-32x32.png -------------------------------------------------------------------------------- /assets/favicon_alt/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/favicon_alt/favicon.ico -------------------------------------------------------------------------------- /assets/favicon_alt/site.webmanifest: 
-------------------------------------------------------------------------------- 1 | {"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"} -------------------------------------------------------------------------------- /assets/filter-network.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/filter-network.mp4 -------------------------------------------------------------------------------- /assets/filter-requests.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/filter-requests.mp4 -------------------------------------------------------------------------------- /assets/ida.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ida.gif -------------------------------------------------------------------------------- /assets/ida.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ida.mp4 -------------------------------------------------------------------------------- /assets/ida.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ida.png -------------------------------------------------------------------------------- /assets/ida.webm: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ida.webm -------------------------------------------------------------------------------- /assets/insepect-element-3D.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/insepect-element-3D.jpg -------------------------------------------------------------------------------- /assets/inspect-element-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element-logo.jpg -------------------------------------------------------------------------------- /assets/inspect-element-simple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element-simple.jpg -------------------------------------------------------------------------------- /assets/inspect-element-v1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element-v1.png -------------------------------------------------------------------------------- /assets/inspect-element-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element-v2.png -------------------------------------------------------------------------------- /assets/inspect-element-v3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element-v3.png -------------------------------------------------------------------------------- /assets/inspect-element.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-element.jpg -------------------------------------------------------------------------------- /assets/inspect-panel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect-panel.png -------------------------------------------------------------------------------- /assets/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/inspect.png -------------------------------------------------------------------------------- /assets/just-windows.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/just-windows.jpg -------------------------------------------------------------------------------- /assets/logo-twitter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/logo-twitter.png -------------------------------------------------------------------------------- /assets/ring-gizmodo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ring-gizmodo.png -------------------------------------------------------------------------------- /assets/ring-map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/ring-map.png -------------------------------------------------------------------------------- /assets/sap2019.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/sap2019.png -------------------------------------------------------------------------------- /assets/spicy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/spicy.png -------------------------------------------------------------------------------- /assets/wsj_tiktok.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yinleon/inspect-element/e14feb43394f7b8ff83d8a0c79f6b9424bd9335b/assets/wsj_tiktok.mp4 -------------------------------------------------------------------------------- /best-practices-data-collection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4ae1d029", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Best practices for data collection\"\n", 10 | "author: \"Leon Yin\"\n", 11 | "date-modified: \"02-14-2023\"\n", 12 | "href: data-collection-best-practices\n", 13 | "---" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "9b499b02", 19 | "metadata": {}, 20 | "source": [ 21 | "In the previous chapters, we covered 
techniques and methods of data collection.\n", 22 | "\n", 23 | "You can apply those techniques towards building a _data pipeline_, which can increase the reliability and scale of your data collection significantly. The professional pursuit of this task is data engineering, and it often involves APIs, cloud computing, and databases. \n", 24 | "\n", 25 | "Here are some helpful tips for building datasets." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "id": "5fd083b6", 31 | "metadata": {}, 32 | "source": [ 33 | "## Don't repeat work\n", 34 | "\n", 35 | "Before you collect data, check if you've already collected it. \n", 36 | "\n", 37 | "Create a programmatic naming structure for a \"target\"-- this could be a filename or a unique ID in a database, and check if it exists. \n", 38 | "\n", 39 | "If it already exists, move on.\n", 40 | "\n", 41 | "Below is a dummy example of a scraper for video metadata that checks if a file with the same `video_id` has already been saved." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "id": "e8fbe17f", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import os\n", 52 | "import time\n", 53 | "\n", 54 | "def collect_video_metadata(video_id):\n", 55 | " \"\"\"\n", 56 | " This is an example of a data collection function\n", 57 | " that checks if a video_id has already been collected.\n", 58 | " \"\"\"\n", 59 | " # consistently structure the target filename (fn_out)\n", 60 | " fn_out = f\"video_metadata_{video_id}.csv\"\n", 61 | " \n", 62 | " # check if the file exists, if it does: move on\n", 63 | " if os.path.exists(fn_out):\n", 64 | " print(\"already collected\")\n", 65 | " return\n", 66 | " \n", 67 | " # collect the data (not actually implemented)\n", 68 | " print(\"time to do some work!\")\n", 69 | " \n", 70 | " # save the file. 
Instead of real data, we'll save text that says, \"Collected\".\n", 71 | " with open(fn_out, 'w') as f:\n", 72 | " f.write(\"Collected\")\n", 73 | " return" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "7411a123", 79 | "metadata": {}, 80 | "source": [ 81 | "Let's try to collect some video metadata for a `video_id` of our choosing." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 2, 87 | "id": "434d88e4", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "video_id = \"schfiftyfive\"" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 3, 97 | "id": "393d97c4", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "#| echo: false\n", 102 | "def delete_file(video_id):\n", 103 | " fn_out = f\"video_metadata_{video_id}.csv\"\n", 104 | " if os.path.exists(fn_out):\n", 105 | " os.remove(fn_out)\n", 106 | "delete_file(video_id = video_id)" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "id": "a04e5394", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "time to do some work!\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "collect_video_metadata(video_id = video_id)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "4099c7c6", 130 | "metadata": {}, 131 | "source": [ 132 | "Let's try to run the same exact function with the same input:" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 5, 138 | "id": "8be15f97", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "already collected\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "collect_video_metadata(video_id = video_id)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "7f328030", 156 | "metadata": {}, 157 | "source": [ 158 | "The second time you call it, the function ends 
early.\n", 159 | "\n", 160 | "When collecting a large dataset, these steps are essential to make the best use of time." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "id": "d97d14a3", 166 | "metadata": {}, 167 | "source": [ 168 | "## Make a todo list\n", 169 | "\n", 170 | "In addition to not repeating yourself, keep tabs on what needs to be done. That could be a simple CSV file, or something more advanced like a queuing system such as AWS SQS. For queuing systems, you can clear tickets that have been finished, and re-do tickets that might have failed." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "ac1ae858", 176 | "metadata": {}, 177 | "source": [ 178 | "## Save receipts\n", 179 | "\n", 180 | "Save the output of every step, especially the earliest steps of collecting a JSON response from a server, or the HTML of a website. \n", 181 | "\n", 182 | "You can always re-write parsers that turn that \"raw\" data into something neat and actionable. \n", 183 | "\n", 184 | "Websites and API responses _can_ change, so web parsers can break easily. It is safer to just save the data straight from the source, and process it later.\n", 185 | "\n", 186 | "If you're collecting a web page through browser automation, save a screenshot. It's helpful to have reference material of what the web page looked like when you captured it.\n", 187 | "\n", 188 | "This is something we did at the Markup when we collected Facebook data from a national panel over several months, and again, when we collected Google search results.\n", 189 | "\n", 190 | "These receipts don't just play a role in the underlying analysis, they can be used as powerful exhibits in your investigation.\n", 191 | "\n", 192 | "
\n", 193 | "
" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "id": "39f4d48a", 201 | "metadata": {}, 202 | "source": [ 203 | "## Break up the work, and make it as small as possible\n", 204 | "\n", 205 | "Break scraping tasks into the smallest units of work. This makes scaling up easier, and it also prevents a single point of failure disrupting your entire workflow.\n", 206 | "\n", 207 | "Certain components of a scraper can be slower than others. By dividing the tasks, you can better identify bottlenecks to optimize the pipeline. Use to-do lists, and check for existing files to help communicate between tasks.\n", 208 | "\n", 209 | "Remember that big problems can be broken up into smaller problems. Being smart can help you get to the finish line faster and debug issues quicker." 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "id": "95ae852b", 215 | "metadata": {}, 216 | "source": [ 217 | "## Bigger isn't always better\n", 218 | "\n", 219 | "Be smart with how you use data, rather than depend on big numbers. Data isn't in-itself valuable.\n", 220 | "\n", 221 | "It's better to start off smaller, with a trial analysis (we often call it a quick-sniff in the newsroom) to make sure you have a testable hypothesis.\n", 222 | "\n", 223 | "This is always a step I use at my newsroom to plan longer data investigations, and see what kind of story we could write if we spent more time on the data collection and honing the methodology." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "857531fa", 229 | "metadata": {}, 230 | "source": [ 231 | "## Spotcheck everything\n", 232 | "\n", 233 | "Manually check your programmatically saved results with the live results. Small errors can be systematic errors if you don't catch them manually. 
Choose a reasonable sample size (such as N=100), to assure what you're analyzing is exactly what you think you are.\n", 234 | "\n", 235 | "This is something we did to [bullet-proof](https://themarkup.org/google-the-giant/2020/07/28/how-we-analyzed-google-search-results-web-assay-parsing-tool#appendix-3-error-analysis) almost every investigation, even if we didn't publish the results of that hand-check." 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "8304e329", 241 | "metadata": {}, 242 | "source": [ 243 | "## Conclusion\n", 244 | "\n", 245 | "These tips are not definitive. If you want to share tips, please make a suggestion via email or GitHub. " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "2e688668", 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.7.3" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 5 278 | } 279 | -------------------------------------------------------------------------------- /browser_automation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "raw", 5 | "id": "ad2fe7e4", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Browser Automation\"\n", 10 | "pagetitle: \"Browser Automation\"\n", 11 | "description-meta: \"Introduction, case studies, and exercises for automating browsers.\"\n", 12 | "description-title: \"Introduction, case studies, and exercises for automating browsers.\"\n", 13 | "author: \"Piotr Sapiezynski 
and Leon Yin\"\n", 14 | "author-meta: Piotr Sapiezynski and Leon Yin\"\n", 15 | "date: \"06-11-2023\"\n", 16 | "date-modified: \"06-17-2023\"\n", 17 | "execute: \n", 18 | " enabled: false\n", 19 | "keywords: data collection, web scraping, browser automation, algorithm audits, personalization\n", 20 | "twitter-card:\n", 21 | " title: Browser Automation\n", 22 | " description: Introduction, case studies, and exercises for automating browsers.\n", 23 | " image: assets/inspect-element-logo.jpg\n", 24 | "open-graph:\n", 25 | " title: Browser Automation\n", 26 | " description: Introduction, case studies, and exercises for automating browsers.\n", 27 | " locale: us_EN\n", 28 | " site-name: Inspect Element\n", 29 | " image: assets/inspect-element-logo.jpg\n", 30 | "href: browser_automation\n", 31 | "---" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 40, 37 | "id": "36120f3d", 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/markdown": [ 43 | "\n", 44 | "📖 Read online\n", 45 | "⚙️ GitHub\n", 46 | "🏛 Citation\n", 47 | "
\n" 48 | ], 49 | "text/plain": [ 50 | "" 51 | ] 52 | }, 53 | "metadata": {}, 54 | "output_type": "display_data" 55 | } 56 | ], 57 | "source": [ 58 | "#| echo: false\n", 59 | "from utils import build_buttons\n", 60 | "from importlib import reload\n", 61 | "import utils\n", 62 | "reload(utils)\n", 63 | "utils.build_buttons(link= 'browser_automation', \n", 64 | " github= 'https://github.com/yinleon/inspect-element/blob/main/browser_automation.ipynb',\n", 65 | " colab = False,\n", 66 | " citation= True)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "id": "6f8a5a96", 72 | "metadata": {}, 73 | "source": [ 74 | "Browser automation is a fundamental web scraping technique for building your own dataset.\n", 75 | "\n", 76 | "It is essential for investigating personalization, working with rendered elements, and waiting for scripts and code to execute on a web page.\n", 77 | "\n", 78 | "However, browser automation can be resource intensive and slow compared to other data collection approaches.\n", 79 | "\n", 80 | "👉[Click here to jump to the Selenium tutorial](#tutorial)." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "id": "19b88ce3", 86 | "metadata": {}, 87 | "source": [ 88 | "# Intro\n", 89 | "\n", 90 | "If you’ve tried to buy concert tickets to a popular act lately, you’ve probably watched in horror as the blue “available” seats evaporate before your eyes the instant tickets are released. Part of that may be pure ✨star power✨, but more than likely, bots were programmed to buy tickets to be resold at a premium.\n", 91 | "\n", 92 | "These bots are programmed to act like an eager fan: waiting in the queue, selecting a seat, and paying for the show. These tasks can all be executed using browser automation.\n", 93 | "\n", 94 | "**Browser automation** is used to programmatically interact with web applications. 
\n", 95 | "\n", 96 | "The most frequent use case for browser automation is to run tests on websites by simulating user behavior (mouse clicks, scrolling, and filling out forms). This is routine and invisible work that you wouldn’t remember, unlike seeing your dream of crowd surfing with your favorite musician disappear thanks to ticket-buying bots.\n", 97 | "\n", 98 | "But browser automation has another use, one which _may_ make your dreams come true: web scraping.\n", 99 | "\n", 100 | "Browser automation isn’t always the best solution for building a dataset, but it is necessary when you need to:\n", 101 | "\n", 102 | "1. **Analyze rendered HTML**: see what's on a website as a user would.\n", 103 | "2. **Simulate user behavior**: experiment with personalization and experience a website as a user would.\n", 104 | "3. **Trigger event execution**: retrieve responses to JavaScript or [network requests](/apis.html) following an action.\n", 105 | "\n", 106 | "These reasons are often interrelated. We will walk through case studies (below) that highlight at least one of these strengths, as well as why browser automation was a necessary choice.\n", 107 | "\n", 108 | "Some popular browser automation tools are [Puppeteer](https://pptr.dev/), [Playwright](https://playwright.dev/), and [Selenium](https://www.selenium.dev/documentation/webdriver/elements/). 
\n", 109 | "\n", 110 | "## Headless Browsing\n", 111 | "\n", 112 | "Browser automation can be executed in a \"headless\" state by some tools.\n", 113 | "\n", 114 | "This doesn't mean that the browser is a ghost or anything like that, it just means that the _user interface_ is not visible.\n", 115 | "\n", 116 | "One benefit of headless browsing is that it is less [resource intensive](/apis.html#case-study-on-scalability-collecting-internet-plans), however there is no visibility into what the browser is doing, making headless scrapers difficult to debug.\n", 117 | "\n", 118 | "Luckily, some browser automation tools (such as Selenium) allow you to [toggle headless browsing](https://www.selenium.dev/blog/2023/headless-is-going-away/) on and off. Other tools, such as Puppeteer only allow you to use headless browsing.\n", 119 | "\n", 120 | "If you’re new to browser automation, we suggest not using headless browsing off the bat. Instead try Selenium (or Playwright), which is exactly what we’ll do in the [tutorial](#tutorial) below." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "id": "f46d0841", 126 | "metadata": {}, 127 | "source": [ 128 | "
\n", 129 | "
Using Selenium to automate browsing TikTok's \"For You\" page for food videos.
\n", 130 | "
" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "id": "b35529f2", 136 | "metadata": {}, 137 | "source": [ 138 | "# Case Studies\n", 139 | "## Case Study 1: Google Search\n", 140 | "In the investigation “[Google the Giant](https://themarkup.org/google-the-giant/2020/07/28/google-search-results-prioritize-google-products-over-competitors),” The Markup wanted to measure how much of a Google Search page is “Google.” Aside from the daunting task of classifying what is \"Google,\" and what is \"not Google,\" the team of two investigative journalists-- Adrianne Jeffries and Leon Yin (a co-author of this section) needed to measure real estate on a web page.\n", 141 | "\n", 142 | "The team developed a [targeted staining technique](https://themarkup.org/google-the-giant/2020/07/28/how-we-analyzed-google-search-results-web-assay-parsing-tool) inspired by the life sciences, originally used to highlight the presence of chemicals, compounds, or cancers. \n", 143 | "\n", 144 | "
\n", 145 | "\"https://themarkup.org/google-the-giant/2020/07/28/how-we-analyzed-google-search-results-web-assay-parsing-tool#google-search-flow\"\n", 149 | "
\n", 150 | "Source: The Markup\n", 151 | "
\n", 152 | "
\n", 153 | "\n", 154 | "The reporters wrote over [68 web parsers](https://github.com/the-markup/investigation-google-search-audit/blob/master/utils/parsers.py) to identify elements on trending Google Search results as \"Google,\" or three other categories. Once an element was identified, they could find the [coordinates](https://developer.mozilla.org/en-US/docs/Web/SVG/Element/rect) of each element along with its corresponding bounding box. Using the categorization and bounding box, The Markup were able to measure how many pixels were allocated to Google properties, as well as where they were placed on a down the page for a mobile phone.\n", 155 | "\n", 156 | "
\n", 157 | "\"https://themarkup.org/google-the-giant/2020/07/28/how-we-analyzed-google-search-results-web-assay-parsing-tool#google-search-flow\"\n", 161 | "
\n", 162 | "Source: The Markup\n", 163 | "
\n", 164 | "
\n", 165 | "\n", 166 | "Browser automation tools' ability to collect and analyze **rendered HTML pages** can be essential. This is especially the case for search results, since most search results contain modules, carousels, and other non-standardized rows and columns that are more complex than lists.\n", 167 | "\n", 168 | "Rendered HTML can be used to analyze the allocation of real estate on a website, which can be a useful metric to gauge self-preferencing and [anti-competitive business practices](https://themarkup.org/amazons-advantage/2021/10/14/amazon-puts-its-own-brands-first-above-better-rated-products) relevant to [antitrust](https://themarkup.org/google-the-giant/2020/07/29/congressman-says-the-markup-investigation-proves-google-has-created-a-walled-garden). Take for example this case study, which was placed above the others because one of this section's co-authors happened to work on it." 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "id": "b9edba05", 174 | "metadata": {}, 175 | "source": [ 176 | "## Case Study 2: Deanonymizing Google's Ad Network\n", 177 | "\n", 178 | "Google ad sellers offer space on websites like virtual billboards, and are compensated by Google after an ad is shown. However, unlike physical ad sellers, almost all of the ~1.3 million ad sellers on Google are anonymous. To limit transparency further, multiple websites and apps can be monetized by the same seller, and it’s not clear which websites are part of Google’s ad network in the first place. \n", 179 | "\n", 180 | "As a result, [advertisers](https://checkmyads.org/branded/google-ads-has-become-a-massive-dark-money-operation/) and the public do not know who is making money from Google ads. Fortunately, watchdog groups, industry analysts, and reporters have developed methods to hold Google accountable for this oversight.\n", 181 | "\n", 182 | "The methods boil down to triggering a JavaScript function that sends a request to Google to show an ad on a loaded web page. 
In 2022, reporters from ProPublica used Playwright to [automate this process]
\n", 200 | "\n", 201 | "You can watch the WSJ's video showing how they mimic user behavior to study the effects of personalization:" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "id": "8b22990d", 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "source": [ 211 | "
\n", 212 | "
Source: WSJ
\n", 215 | "
\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "c96a049d", 221 | "metadata": {}, 222 | "source": [ 223 | "This investigation was possible only after **simulating user behavior** and triggering personalization from TikTok's \"For You\" recommendations." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "4bd4d5d5", 229 | "metadata": {}, 230 | "source": [ 231 | "# Tutorial\n", 232 | "In the hands-on tutorial we will attempt to study personalization on TikTok with a mock experiment. \n", 233 | "\n", 234 | "We’re going to teach you the basics of browser automation in Selenium, but the techniques we'll discuss could be used to study any other website using any other automation tool.\n", 235 | "\n", 236 | "We will try to replicate elements of the WSJ investigation and see if we can trigger a personalized \"For You\" page. Although the WSJ ran their investigation using an Android on a Raspberry Pi, we will try our luck with something you can run locally on a personal computer using browser automation.\n", 237 | "\n", 238 | "In this tutorial we'll use Selenium to watch TikTok videos where the description mentions keywords of our choosing, while skipping all others. 
Importantly, we’ll be watching videos with lighter topics than depression (the example chosen in the WSJ investigation).
If you already have Google Chrome installed, make sure it's the latest version by opening Chrome and pasting this address in the address bar:
\u001b[?25ldone\n", 291 | "\u001b[?25h Created wheel for chromedriver-binary-auto: filename=chromedriver_binary_auto-0.3.1-py3-none-any.whl size=8766235 sha256=f391019058992cc819aa8f6b7feb58bf2ef7477951fdfc91c4ae9a8079531d04\n", 292 | " Stored in directory: /Users/leon/Library/Caches/pip/wheels/59/5a/a1/80470a1f46c83d62550af8d40634bdb8bc5f034d910d80ff2b\n", 293 | "Successfully built chromedriver-binary-auto\n", 294 | "Installing collected packages: chromedriver-binary-auto\n", 295 | " Attempting uninstall: chromedriver-binary-auto\n", 296 | " Found existing installation: chromedriver-binary-auto 0.2.6\n", 297 | " Uninstalling chromedriver-binary-auto-0.2.6:\n", 298 | " Successfully uninstalled chromedriver-binary-auto-0.2.6\n", 299 | "Successfully installed chromedriver-binary-auto-0.3.1\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "!pip install chromedriver-binary-auto --upgrade" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "id": "bf52b6cc", 310 | "metadata": {}, 311 | "source": [ 312 | "Let's see if the installation worked correctly! 
The `chromedriver-binary-auto` package should have installed a driver that's suitable for your current Chrome version, and running the line of code above should have opened a new Chrome window.
To fix it, reinstall the Python package:" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 4, 347 | "id": "51cc50bb", 348 | "metadata": {}, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Collecting chromedriver-binary-auto\n", 355 | " Using cached chromedriver_binary_auto-0.2.6-py3-none-any.whl\n", 356 | "Installing collected packages: chromedriver-binary-auto\n", 357 | " Attempting uninstall: chromedriver-binary-auto\n", 358 | " Found existing installation: chromedriver-binary-auto 0.2.6\n", 359 | " Uninstalling chromedriver-binary-auto-0.2.6:\n", 360 | " Successfully uninstalled chromedriver-binary-auto-0.2.6\n", 361 | "Successfully installed chromedriver-binary-auto-0.2.6\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "!pip install --upgrade --force-reinstall chromedriver-binary-auto" 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "id": "9b14a1b0", 372 | "metadata": {}, 373 | "source": [ 374 | "If everything works fine and you have the window open, our setup is complete and you can now close the Chrome window:" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "edf3f8b3", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "driver.close()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "id": "a6adc24d", 390 | "metadata": {}, 391 | "source": [ 392 | "## Step 2: Hiding typical tells of an automated browser\n", 393 | "When you open Chrome with Selenium you'll notice that the window displays a warning about being an \"automated session\". \n", 394 | "Even though the warning is only displayed to you, the webdriver leaves behind other red flags that inform website administrators that you are using browser automation.\n", 395 | "\n", 396 | "The website admins will use these red flags to refuse service to your browser.\n", 397 | "\n", 398 | "Let's remove those." 
399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 8, 404 | "id": "7080780b", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "options = webdriver.ChromeOptions()\n", 409 | "options.add_argument(\"start-maximized\")\n", 410 | "\n", 411 | "# remove all signs of this being an automated browser\n", 412 | "options.add_argument('--disable-blink-features=AutomationControlled')\n", 413 | "options.add_experimental_option(\"excludeSwitches\", [\"enable-automation\"])\n", 414 | "options.add_experimental_option('useAutomationExtension', False)\n", 415 | "\n", 416 | "# open the browser with the new options\n", 417 | "driver = webdriver.Chrome(options=options)\n", 418 | "driver.get('https://tiktok.com/foryou')" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "id": "d29801fa", 424 | "metadata": {}, 425 | "source": [ 426 | "This should open a new window without those warnings and navigate to tiktok.com:\n", 427 | "\n", 428 | "![](assets/browser1_02_tiktok1.png \"tiktok main page\")\n", 429 | "\n", 430 | "\n" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "id": "0257856d", 436 | "metadata": {}, 437 | "source": [ 438 | "## Step 3: Finding elements on page and interacting with them\n", 439 | "\n", 440 | "We will perform our mock experiment without logging in (but we will also learn how to create multiple accounts and how to log in later).\n", 441 | "\n", 442 | "Instead of logging in, our first interaction will be dismissing this login window. Doing this programmatically has two steps:\n", 443 | "\n", 444 | "1. We need to identify that \\[X\\] button in the page source \n", 445 | "2. 
And then click it\n", 446 | "\n", 447 | "Let's inspect the button element:\n", 448 | "![](assets/browser1_03_dismiss1.png \"Inspecting the Dismiss button\")\n", 449 | "\n", 450 | "In my case, the particular element that the Developer Tools navigated to is just the graphic on the button, not the button itself, but you can still find the actual button by hovering your mouse over different elements in the source and seeing what elements on page are highlighted:\n", 451 | "\n", 452 | "![](assets/browser1_04_inspect1.png \"Inspecting the Dismiss button\")\n", 453 | "\n", 454 | "Our close button is a `
` element, whose `data-e2e` attribute is `\"modal-close-inner-button\"`. \n", 455 | "\n", 456 | "There are many ways to fish for the exact element you want, and [many of those methods](https://www.selenium.dev/documentation/webdriver/elements/locators/) are built into Selenium. One way to find it would be using a `CSS_SELECTOR`, like so:" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 4, 462 | "id": "4f4b5b2c", 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "data": { 467 | "text/plain": [ 468 | "" 469 | ] 470 | }, 471 | "execution_count": 4, 472 | "metadata": {}, 473 | "output_type": "execute_result" 474 | } 475 | ], 476 | "source": [ 477 | "from selenium.webdriver.common.by import By\n", 478 | "\n", 479 | "close_button = driver.find_element(By.CSS_SELECTOR, '[data-e2e=\"modal-close-inner-button\"]')\n", 480 | "close_button" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "id": "5baa78b5", 486 | "metadata": {}, 487 | "source": [ 488 | "If Selenium successfully finds an element, you'll get a `WebElement` object of the first match. However, if Selenium **does not** find the element-- for example because the element hasn't loaded yet, you will get an empty object in return. This will crash your script if you try to interact with the empty element. 
# this line will only execute whenever the element was found (or after 20 seconds if it wasn't)
548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "id": "094e00f3", 553 | "metadata": {}, 554 | "source": [ 555 | "## Step 4: Scrolling\n", 556 | "\n", 557 | "We now have a browser instance open and displaying the For You page. Let's scroll through the videos.\n", 558 | "\n", 559 | "If you are a *real person* who (for whatever reason) visits TikTok on their computer, you could press the down key the keyboard to see new videos. We will do that programmatically instead:" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 11, 565 | "id": "2a37f38a", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "from selenium.webdriver.common.action_chains import ActionChains\n", 570 | "from selenium.webdriver.common.keys import Keys\n", 571 | "\n", 572 | "actions = ActionChains(driver)\n", 573 | "actions.send_keys(Keys.ARROW_DOWN)\n", 574 | "actions.perform()" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "id": "866f0198", 580 | "metadata": {}, 581 | "source": [ 582 | "When you run the cell above you will see that your browser scrolls down to the next video. You just automated scrolling!" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "id": "19c5ad4f", 588 | "metadata": {}, 589 | "source": [ 590 | "## Step 5: Finding TikTok videos on the page\n", 591 | "\n", 592 | "Now that the site loaded and you can browse it, let's find all the TikTok videos that are displayed and extract the information (called metadata) from each of them.\n", 593 | "\n", 594 | "1. Right click on the white space around a TikTok video and choose \"Inspect\".\n", 595 | "![Inspect Element](assets/browser1_05_inspect_tiktok_a1.png)\n", 596 | "1. Hover your mouse over the surrounding `
` elements and observe the highlighted elements on the page to see which ones correspond to each TikTok video.\n", 597 | "![Inspect Element](assets/browser1_05_inspect_tiktok_b1.png)\n", 598 | "1. You will see that each video is in a separate `
` container but each of these containers has the same `data-e2e` attribute with the value of `recommend-list-item-container`.\n", 599 | "1. Similarly to how we found the close button, we can now use this to find all videos on page:" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 12, 605 | "id": "9026378c", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "videos = driver.find_elements(By.CSS_SELECTOR, '[data-e2e=\"recommend-list-item-container\"]')" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "26f81de7", 615 | "metadata": {}, 616 | "source": [ 617 | "When we searched for the \"dismiss\" button we used the `driver.find_element()` function because we were only interested in the first element that matched our CSS selector.\n", 618 | "\n", 619 | "Now we're trying to find all videos on page, so we use the `driver.find_elements()` function instead - it returns the complete list of elements that match the selector." 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 13, 625 | "id": "7c199654", 626 | "metadata": {}, 627 | "outputs": [ 628 | { 629 | "data": { 630 | "text/plain": [ 631 | "[,\n", 632 | " ,\n", 633 | " ,\n", 634 | " ,\n", 635 | " ,\n", 636 | " ,\n", 637 | " ,\n", 638 | " ,\n", 639 | " ]" 640 | ] 641 | }, 642 | "execution_count": 13, 643 | "metadata": {}, 644 | "output_type": "execute_result" 645 | } 646 | ], 647 | "source": [ 648 | "videos" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "id": "0bf6bc5e", 654 | "metadata": {}, 655 | "source": [ 656 | "## Step 6: Parsing TikTok metadata\n", 657 | "Now that we found all the TikTok videos on the page, let's extract the description from each - this is how we will decide whether to watch the video, or to skip it. The process of extracting a specific field from a webpage is \"parsing\".\n", 658 | "\n", 659 | "1. Pick any description, right click, \"Inspect\". \n", 660 | "1. Let's locate the `
and make note of its `data-e2e` attribute.
Notice that here, instead of `driver`, we're using a particular element which we called `video`: this way we can search for elements **within an element**, rather than on the whole page.\n", 698 | ":::" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "id": "623497ab", 704 | "metadata": {}, 705 | "source": [ 706 | "## Step 7: Finding the TikTok video that's currently playing\n", 707 | "We know how to scroll to the next video, and we know how to find all videos that are loaded.\n", 708 | "At this point we could either:\n", 709 | "\n", 710 | "1. Assume that at the beginning, the 0th video is playing, and then every time we press arrow down, the next video is being displayed
\n", 711 | "2. Assume that the arrow down does not always work and each time verify which video is actually playing\n", 712 | "\n", 713 | "The problem with the first approach is that even if scrolling fails just once, our experiment will be compromised (after it happens we will be watching and skipping different videos that our script tells us). This is why we will go with the second approach and verify which video is actually playing. Back to our favorite tool- inspect element!\n", 714 | "\n", 715 | "When you right click on the playing video, you will see that instead of our familiar UI we get a custom TikTok menu, so that won't work. Try right-clicking on the description of the video instead, then hovering over different elements in the inspector and expanding the one that highlights the video in the browser. Dig deep until you get to the `div` that only contains the video. \n", 716 | "\n", 717 | "Still in the inspector try looking at the video below. You will see that the `div` that contains the video is missing and there is no element with the tag name `video`. That's how we can find if the video is currently playing - its `div` will contain the `video` element that we can find by `TAG_NAME`:" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 15, 723 | "id": "3d653bf1", 724 | "metadata": {}, 725 | "outputs": [ 726 | { 727 | "name": "stdout", 728 | "output_type": "stream", 729 | "text": [ 730 | "playing \n", 731 | "not playing The last one 😂😂 #pet #cat #dog #cute #animals #funny #foryou #fyp\n", 732 | "not playing الرد على @hadeelalsamare #اكسبلور #fyp #fypシ\n", 733 | "not playing BEST MAGIC TRICKS REVEALED 😱😳 #magician #learnfromme #foru #popular\n", 734 | "not playing The most Useful Toy ever! 
The presentation of your results might be more compelling when it's accompanied by screenshots, rather than just data.
783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 25, 788 | "id": "e88ebe1b", 789 | "metadata": {}, 790 | "outputs": [], 791 | "source": [ 792 | "# save the source of the entire page\n", 793 | "page_html = driver.page_source\n", 794 | "with open('webpage.html', 'w') as output:\n", 795 | " output.write(page_html)" 796 | ] 797 | }, 798 | { 799 | "cell_type": "markdown", 800 | "id": "9f90af21", 801 | "metadata": {}, 802 | "source": [ 803 | "::: {.callout-tip}\n", 804 | "#### Pro tip: Keep these records to sanity check your results\n", 805 | "Taking a screenshot and saving the page source is a useful practice for checking your work. Use the two to cross-reference what was visible in the browser and whatever data you end up extracting during the parsing step.\n", 806 | ":::\n", 807 | "\n", 808 | "Let's close the browser for now, and kick this workflow up a notch." 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": 16, 814 | "id": "4673c388", 815 | "metadata": {}, 816 | "outputs": [], 817 | "source": [ 818 | "driver.close()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "id": "6decdc72", 824 | "metadata": {}, 825 | "source": [ 826 | "## Step 9: Putting it all together\n", 827 | "At this point, we can read the description of TikTok videos and navigate the \"For You\" page. \n", 828 | "\n", 829 | "That's most of the setup we need to try our mock experiment:
\n", 830 | "let's watch all TikTok videos that mention food in the description and skip videos that do not mention food.\n", 831 | "\n", 832 | "After one hundred videos, we will see whether we are served videos from FoodTok more frequently than other topics.\n", 833 | "\n", 834 | "::: {.callout-tip}\n", 835 | "#### Pro tip: Use functions!\n", 836 | "So far we wrote code to open the browser, close the dialog, and find videos as separate cells in the notebook. We _could_ copy that code over here to use it, but it will be much easier to understand and maintain the code if we write clean, well-documented functions with descriptive names.\n", 837 | ":::" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": 17, 843 | "id": "85f37900", 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "from selenium import webdriver\n", 848 | "from selenium.webdriver.common.by import By\n", 849 | "from selenium.webdriver.common.action_chains import ActionChains\n", 850 | "from selenium.webdriver.common.keys import Keys\n", 851 | "from selenium.webdriver.support.ui import WebDriverWait\n", 852 | "from selenium.webdriver.support import expected_conditions as EC\n", 853 | "\n", 854 | "import chromedriver_binary\n", 855 | "\n", 856 | "\n", 857 | "\n", 858 | "def open_browser():\n", 859 | " \"\"\"\n", 860 | " Opens a new automated browser window with all tell-tales of automated browser disabled\n", 861 | " \"\"\"\n", 862 | " options = webdriver.ChromeOptions()\n", 863 | " options.add_argument(\"start-maximized\")\n", 864 | "\n", 865 | " # remove all signs of this being an automated browser\n", 866 | " options.add_argument('--disable-blink-features=AutomationControlled')\n", 867 | " options.add_experimental_option(\"excludeSwitches\", [\"enable-automation\"])\n", 868 | " options.add_experimental_option('useAutomationExtension', False)\n", 869 | "\n", 870 | " # open the browser with the new options\n", 871 | " driver = 
webdriver.Chrome(options=options)\n", 872 | " return driver\n", 873 | "\n", 874 | "def close_login_dialog(driver):\n", 875 | " \"\"\"\n", 876 | " Waits for the login dialog to appear, then closes it\n", 877 | " \"\"\"\n", 878 | " \n", 879 | " # rather than trying to click a button that might have not loaded yet, we will \n", 880 | " # wait up to 20 seconds for it to actually appear first\n", 881 | " wait = WebDriverWait(driver, timeout = 20)\n", 882 | " wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '[data-e2e=\"modal-close-inner-button\"]')))\n", 883 | " \n", 884 | " close_button = driver.find_element(By.CSS_SELECTOR, '[data-e2e=\"modal-close-inner-button\"]')\n", 885 | " if close_button:\n", 886 | " close_button.click()\n", 887 | "\n", 888 | "def arrow_down(driver):\n", 889 | " \"\"\"\n", 890 | " Sends the ARROW_DOWN key to a webdriver instance.\n", 891 | " \"\"\"\n", 892 | " actions = ActionChains(driver)\n", 893 | " actions.send_keys(Keys.ARROW_DOWN)\n", 894 | " actions.perform()\n", 895 | " \n", 896 | "def find_videos(driver):\n", 897 | " \"\"\"\n", 898 | " Finds all tiktoks loaded in the browser\n", 899 | " \"\"\"\n", 900 | " videos = driver.find_elements(By.CSS_SELECTOR, '[data-e2e=\"recommend-list-item-container\"]')\n", 901 | " return videos\n", 902 | "\n", 903 | "def get_description(video):\n", 904 | " \"\"\"\n", 905 | " Extracts the video description along with any hashtags\n", 906 | " \"\"\"\n", 907 | " try:\n", 908 | " description = video.find_element(By.CSS_SELECTOR, '[data-e2e=\"video-desc\"]').text\n", 909 | " except:\n", 910 | " # if the description is missing, just get any text from the video\n", 911 | " description = video.text\n", 912 | " return description\n", 913 | "\n", 914 | "def get_current(videos):\n", 915 | " \"\"\"\n", 916 | " Given the list of videos it returns the one that's currently playing\n", 917 | " \"\"\"\n", 918 | " for video in videos:\n", 919 | " if video.find_elements(By.TAG_NAME, 'video'):\n", 920 | " # this 
    with open(filename, 'w') as output:
965 | ] 966 | }, 967 | { 968 | "cell_type": "code", 969 | "execution_count": 18, 970 | "id": "4e4e7c69", 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [ 974 | "import os\n", 975 | "\n", 976 | "os.makedirs('data/screenshots/', exist_ok=True)" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": 22, 982 | "id": "b72f60b8", 983 | "metadata": {}, 984 | "outputs": [ 985 | { 986 | "name": "stdout", 987 | "output_type": "stream", 988 | "text": [ 989 | "0 False ДО КОНЦА😂 а какой у тебя рост?\n", 990 | "1 False • Reprodução: (SBT/Programa Raul Gil) 🇧🇷\n", 991 | "#combateaosuicidio\n", 992 | "2 False #stitch #이어찍기 #추천 #fyp #viral #xyzbca #korean #おすすめ\n", 993 | "3 True Cuando hago papas de esta manera, todos me preguntan por la receta😋😱#viral #parati #recetas #cocina #recetasfaciles #papa #queso #jamon #food #saborestiktok\n", 994 | "4 False #ومش_هزود_في_الملام #explore\n", 995 | "#fypシ #foryoupage #fyp #viral\n", 996 | "#مش_هنظبط_الريتش_بقي🖤 #حزين\n", 997 | "#حالات_واتس_حزينه💔 #foryou\n", 998 | "5 False #PasiondeGavilanes #telenovelacolombiana\n", 999 | "6 False #accident a veces pasa de todo 👉 sigueme para PARTE 2.\n", 1000 | "7 False Zjedzcie se tez cos fajnego dzis #gotowaniezdominika\n", 1001 | "8 False كيف تكتب اسم يوسف بخط جميل♥️🌹-\n", 1002 | "-\n", 1003 | "-\n", 1004 | "-\n", 1005 | "9 False بنت الجنوب 🔥🤍🇹🇳#مطماطة_قابس_تونس #اكسبلور\n", 1006 | "10 False Game on\n", 1007 | "11 False Чи бачите різницю між фото? Чи бачите які кадри зроблені на дорогу , а які на дешеву камеру? ☺️ #фотограф #фотоапарат #обзор #фотографія\n", 1008 | "12 False #bendiciones #mideseo #TikTok #viral #\n", 1009 | "13 False The most Useful Toy ever! 
2 😂 #fun #play #fyp\n", 1010 | "14 False Replying to @user4034722293618\n", 1011 | "15 False jajeczniczka z kielbasiana\n", 1012 | "16 False كام مره بكيت ؟ 🥺💔🎧 #المصمم_sheko🎧 #الرتش_فى_زمه_الله💔 #حالات_واتس #شاشه_سوداء #مصمم_حالات_واتس_شاشه_سوداء #fypシ #foryou #fyp #viral #music #tiktok\n", 1013 | "17 False #movie #movieclip #fyp\n", 1014 | "18 False Я ПРОТИВ КУРЕНИЯ, А ВЫ?\n", 1015 | "19 False Uno de nuestros trends favoritos 😍🍭 @SHICKO 💊 @N E N A 🍓\n", 1016 | "20 False Esse final me quebrou…🥺💛\n", 1017 | "\n", 1018 | "🎥Filme: Extraordinário\n", 1019 | "\n", 1020 | "#disciplina #motivacional #trechosvisionarios #extraordinar\n", 1021 | "21 False Parece que o Vin Diesel curtiu “Vai Sentando” 😅\n", 1022 | "22 False Para mi mama una niña valiente ♥️💕🇺🇸#parati #parati #parati #parati #parati #fyp #fyp #viral #viral #viral #viral #viral #viral #vistas #vistas #vistas ##vistas #vistas #muyviral @TikTok\n", 1023 | "23 False #drawing #viralvideo🔥 #fypシ゚viral\n", 1024 | "24 False شو رأيكم كان فيها تكفي اللقمة اللي بتمها؟ 😐#hasanandhawraa #ramdan2023 #رمضان_يجمعنا #رمضان\n", 1025 | "25 False Brock is always there to save the day 🦈💪🏼 #wwe #wrestling #wrestlingmemes #brocklesnar #wweisfake #fakesituation #sharkattack #sharks #wwe2023 #nextgen #wwenetwork #smackdown #wwefan #bodybuilding #beach #holiday #pool #sea\n", 1026 | "26 False HONEY, I SEE YOU #foryou #omspxd #music #mashonda #fyp #lyrics #speed #spedup #🎧\n", 1027 | "27 False #fyp #mrbeast #foryou wow 😳😲\n", 1028 | "28 False \n", 1029 | "29 False Sometimes its better to save your breathe. #trauma #traumahealing #awakining #love #relationship #relatable #loveyourself #men #women #healing #problems #girltalk #therapy #couple #mom #fyp #fypシ #emotion\n", 1030 | "30 False I love my body 🥰💜.. 
Dc: @Dance God 🦅🇬🇭 #purplespeedy\n", 1031 | "31 False raye mi camioneta por ustedes jajajajajajaja\n", 1032 | "32 False \n", 1033 | "33 False My new car #catsoftiktok #fyp #fypシ\n", 1034 | "34 False \n", 1035 | "35 False Fiz um almoço insano! @Mateus ASMR\n", 1036 | "36 False #bajonesemocionales #🥀💔\n", 1037 | "37 False Don't mess with Cristiano 😤|| #cristianoronaldo #cr7 #mufc #intermilan #manutd #viral #ucl #tiktoktrending\n", 1038 | "38 False This Small Town Farmer Better Buckle Up! - End #dealornodeal #show #fyp #deal\n", 1039 | "39 False Genius, billionaire, playboy, philanthropist... and a great dancer🕺#downey #rdj #robertdowneyjr #ironman #tonystark #unrealdowneyjr #unrealrobertdowneyjr\n", 1040 | "40 False Celebre as suas vitórias, amiga! 😍 Fazer 1% todos os dias vai te levar a lugares que você nem imagina.\n", 1041 | "Eu treino com a @Queima Diária 🔥 desde novembro de 2022 e fico muito feliz com esses resultados. Quem vem nessa comigo? Clica no link da bio ou nos stories e experimente por 30 dias!\n", 1042 | "41 False اكتب شيء تؤجر عليه ✨🤍 #fyp #قران #عبدالرحمن_مسعد\n", 1043 | "42 False I ❤️ Michael Jordan 🏀 #mercuri_88 #funny #littlebrother #tiktok #mom #CapCut #basketball #nba #jordan\n", 1044 | "43 False Estavam com saudade? Nao me deixa sem graça nao caraaaa kkkkkk\n", 1045 | "44 False يعني ارسمها علشان افرحها ويحصل معايا كدة 🤦‍♂️ #علي_الراوي\n", 1046 | "45 False Ролик уже на канале💋\n", 1047 | "46 False What k-drama do you think this is? 
#kdrama #드라마 #seoul #theglory\n", 1048 | "47 False cat #cat #catsoftiktok #fun #foryou #fyp #viral #funny #😂😂😂 #🤣🤣🤣\n", 1049 | "48 False #korea #seoul #socialexperiment #fyp\n", 1050 | "49 False \n", 1051 | "50 False #fyp #foryou #طيران\n", 1052 | "51 False الماء والنار… 🥀💔 #lebrany #viral #foryou #explor\n", 1053 | "52 False #foryou #recovery #homecare #gloves\n", 1054 | "53 False Салат из одного ингредиента\n", 1055 | "54 False #blog #vacuna Hoy tocó hacer vacunar a Salchipapu contra la rabia 🥺🐶\n", 1056 | "55 False Song name: Jegi Jegi\n", 1057 | "Watch full song on youtube ( Barbud Music )\n", 1058 | "\n", 1059 | "#lailakhan #newsong #rejarahish #tiktokpakistan\n", 1060 | "56 False Putting automatic stickers on manual doors 😂 #rosscreations #prank\n", 1061 | "57 False Abril 11 parte 7 “Comida Turka”\n", 1062 | "58 True recipe: @ファビオ飯(イタリア料理人)🇮🇹Fabio #tiktokfood #asmr\n", 1063 | "59 False Metallic silver epoxy floor🔥 #fyp #epoxyresin #garagegoals #epoxypour #polyasparticfloors #polyaspartic #theepoxypros\n", 1064 | "60 False Enter the homepage to watch more wonderful videos#movieclips\n", 1065 | "61 False Respuesta a @RZㅤGOLOSAღ -😅 @Duhsein\n", 1066 | "62 False Почему «Титаник» до сих пор не подняли со дна океана? 
#титаник\n", 1067 | "63 False Funny homework!✨✨#asmr #home #goodthing #foryou\n", 1068 | "64 False 😂😂@도윤 #주전 #fyp\n", 1069 | "65 False #parati #fyp #foryou #foryoupage #viral #trump #trump2024 #biden #teamtrump #donaldtrump\n", 1070 | "66 False Não acreditei no resultado🥺🙌🏼\n", 1071 | "67 False Atât de vrednică sunt… 😂\n", 1072 | "M-am făcut de negreală pe obraz🤦🏻‍♀️😂 #soferițadecamion🚛😍 #AGLogistics #oriundeîneuropa #truckgirl\n", 1073 | "68 False Gatinho Resgatado na chuva 🙏🏻 #jesus #jesuscristo #deus #resgateanimal #resgate #gato #gatinho #cat #viraliza\n", 1074 | "69 False #pegar un video de\n", 1075 | "@Yohary Michell Rios #maestra #maestros #universidad #universidad #clases #clasesvirtuales #profesora #profesor #fyp #parati #fouryou #fouyoupage #escuela #escuelatiktok #viral #\n", 1076 | "70 False So cuteee😂\n", 1077 | "71 False بوظتلهم الدنيا 😂\n", 1078 | "72 False #pourtoi #foryou #cpl #bracelet #trend\n", 1079 | "73 False What’s one way He’s held you as you’ve stepped out in faith? 🌊 #UNITED #fyp #christiantiktok #worship #Oceans\n", 1080 | "74 False Antwort auf @🍇Wallah Krise🍇 I am going out tonight 💚 #bumpyride\n", 1081 | "75 False #ليلياناا_نحن_عنوان_الجمال👑😍 #viral #fipシ #foryou #foryoupage #جمال #مكياج #شنيون #عرايس #لف #ميش #اكسبلور #لايك #هشتاك #مشاهير_تيك_توك #تخصيل\n", 1082 | "76 False Full Episode 293 on YT & Spotify | ShxtsnGigs Podcast\n", 1083 | "77 False GAME DE RUA COM LARRIKA! #gamederua #viral #fy #fypシ #pravoce #foryoupage\n", 1084 | "78 False The smallest phone #CapCut #oppo #infinix #Motorola #zte #huawei #vivo #samsung\n", 1085 | "79 False \n", 1086 | "80 False Respect Moment in Football ❤️#footballeur #surprise #fan #respectmoment #respectinfootball #moment #respect #foryou #pourtoi #football\n", 1087 | "81 False I think I got it in my pants 😧 #learnfromkhaby #comic\n", 1088 | "82 False Respondendo a @hg_11236 ta aqui a reacao dela ❤️❤️❤️❤️ fofa demais! #fypシ #diadasmaes #surpresa\n", 1089 | "83 False Наступ на Белгород. 
Що роблять добровольці там #війна #грайворон #белгород #українськийтікток #андрійковаленко\n", 1090 | "84 False Have you ever eaten a cappuccino croissant? ☕️🥐\n", 1091 | ".\n", 1092 | ".\n", 1093 | ".\n", 1094 | "#pastry #pasticceria #italia #croissant\n", 1095 | "85 False #recetas #facil whatia en tierrra\n", 1096 | "86 False seyran inşallah gidersin feritinn bı kazimdan tokat yemediği kalmamisti#yalıcapkınıxferit #feritkorhan #seyrankorhan #mertramazandemir #afrasaraçoğlu #seyfer #yalıçapkını #keşfet #fypシ #foryoupage #foryou #viral\n", 1097 | "87 False \n", 1098 | "88 False La puissance de l’eau #pourtoi #meteo #inondation #eau #vigilance\n", 1099 | "89 False Olha a aranha\n", 1100 | "#alegriaquecontagia #comedia #viral #rireomelhorremedio #rireprosfortes #rirrenovaalma #gargalhada #fypシ #viralvideo #comediante #trolagem\n", 1101 | "90 False Se puede ser infiel por chat? VIDEO COMPLETO EN EL LINK DE MI PERFIL ✅ #juliosinfiltros #relaciones #pareja #relacionessanas #infidelidad #infieles #microinfidelidades\n", 1102 | "91 False Replying to @MC Codër\n", 1103 | "92 False #kamalaghalan❣\n", 1104 | "93 False Лобода про детей\n", 1105 | "94 False Відмічай друга😅#українськийтікток #футболкизпринтами #подарунокхлопцю #подарунокдругу\n", 1106 | "95 False Find your self worth.#real #loyalty #love #sad #sadquotes #relatable #betryal #foryou #scrolling #mindset #reality #xyzbca #fyp\n", 1107 | "96 False #київ #вибух #нло #метеорит #ракета #сяйво #спалах #сніданокз1плюс1\n", 1108 | "97 False اكثر مسلسل حبيتوها برمضان ؟#مهند_رفل #explore\n", 1109 | "98 False المنتج اللي قالب التيك توك .. 
أسفنجة التنضيف السحرية 🧐 #حركة_لاكسبلورر #fyp #gym #عبدالرحمن_وابتسام #trendingtiktok #challenge #fypシ\n", 1110 | "99 True Scotch Egg 😍🥚 #scotchegg #egg #easyrecipe #easyrecipes #caviar #eggs #asmrfood #bacon #cooktok #foodtok #recipesoftiktok #homecook #dinnerideas #eggrecipe #breakfastideas #fancy\n" 1111 | ] 1112 | } 1113 | ], 1114 | "source": [ 1115 | "import time\n", 1116 | "\n", 1117 | "# if the description has any one these words, we will watch the video\n", 1118 | "keywords = ['food', 'dish', 'cook', 'pizza', 'recipe', 'mukbang', 'dinner', 'foodie', 'restaurant']\n", 1119 | "\n", 1120 | "# this is where will we store decisions we take\n", 1121 | "decisions = []\n", 1122 | "\n", 1123 | "# open a browser, and go to TikTok's For You page.\n", 1124 | "driver = open_browser()\n", 1125 | "driver.get('https://tiktok.com/foryou')\n", 1126 | "close_login_dialog(driver)\n", 1127 | "\n", 1128 | "for tiktok_index in range(0, 100):\n", 1129 | " # get all videos\n", 1130 | " tiktoks = find_videos(driver)\n", 1131 | " \n", 1132 | " # the current tiktok is the one that's currently showing the video player\n", 1133 | " current_video = get_current(tiktoks)\n", 1134 | " \n", 1135 | " if current_video is None:\n", 1136 | " print('no more videos')\n", 1137 | " break\n", 1138 | " \n", 1139 | " # read the description of the video\n", 1140 | " description = get_description(current_video)\n", 1141 | " \n", 1142 | " # categorize the video as relevant to `keywords` or not.\n", 1143 | " contains_keyword = is_target_video(description, keywords)\n", 1144 | " decisions.append(contains_keyword )\n", 1145 | " \n", 1146 | " print(tiktok_index, contains_keyword, description)\n", 1147 | " \n", 1148 | " if contains_keyword:\n", 1149 | " # we have a video of interest, let's take a screenshot\n", 1150 | " ## here we declare the files we'll save. 
they're named according to their order.\n", 1151 | "    fn_screenshot = f\"data/screenshots/screenshot_{tiktok_index:05}.png\"\n", 1152 | "    fn_page_source = fn_screenshot.replace('.png', '.html')\n", 1153 | "    screenshot(current_video, fn_screenshot)\n", 1154 | "    save_source(driver, fn_page_source)\n", 1155 | "    # and now watch it for 30 seconds\n", 1156 | "    time.sleep(30)\n", 1157 | "    \n", 1158 | "    # move to the next video\n", 1159 | "    arrow_down(driver)\n", 1160 | "    time.sleep(2)\n", 1161 | "    \n", 1162 | "driver.close()" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "markdown", 1167 | "id": "02711c22", 1168 | "metadata": {}, 1169 | "source": [ 1170 | "::: {.callout-tip}\n", 1171 | "#### Pro tip: Be careful about keywords\n", 1172 | "For experiments that use `keywords`, the choices we make will directly shape our results. In the field, you can mitigate your own predisposition and biases by working with [domain experts to curate keyword lists](https://themarkup.org/google-the-giant/2021/04/09/how-we-discovered-googles-social-justice-blocklist-for-youtube-ad-placements#sourcing-social-justice-keywords).\n", 1173 | ":::" 1174 | ] 1175 | }, 1176 | { 1177 | "cell_type": "code", 1178 | "execution_count": 24, 1179 | "id": "d234ea90", 1180 | "metadata": {}, 1181 | "outputs": [ 1182 | { 1183 | "data": { 1184 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYwAAAEGCAYAAAB2EqL0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARG0lEQVR4nO3debBkZX3G8e8DBBhABEUNizgQURyJjmSKIFiWCiaiEtQQcI2SKC4YBLXiEqvERGORGBfKiBJFMIUjQlBwRUXixqLDIuAQRQERHGFcWERZhF/+OOdqz2Xu3HeY6Xub7u+n6tbtc/p09+/wNveZ855z3jdVhSRJs9lgvguQJN03GBiSpCYGhiSpiYEhSWpiYEiSmmw03wUMyzbbbFMLFy6c7zIk6T7lggsu+HlVPWh1z41tYCxcuJBly5bNdxmSdJ+S5MczPWeXlCSpiYEhSWpiYEiSmhgYkqQmBoYkqcmcXCWV5IHAWf3iHwN3ASv75T2q6o65qEOSdO/NSWBU1S+AxQBJjgJ+XVXvGtwmSYBU1d1zUZMkae3M630YSR4OnAFcBDwO2C/Jd6tqq/755wL7VtVLkzwEOBbYEbgbOLyqzpvLej9+/jWcfvF1v18+YPH2PP/Pd5zLEqQ18js6maa3+6LttuSt+z96vX/OKJzD2BV4T1UtAq5bw3bHAP9WVUuAg4APT98gyaFJliVZtnLlynu8wbo6/eLrWL7iZgCWr7h5lQaSRoHf0ck02O7DNAp3ev+oqlpuyd4XeGTXcwXA1kkWVNVvp1ZU1XHAcQBLliwZysxQi7bdkpNf/ngO/tC5w3h7aZ35HZ1MU+0+TKMQGLcOPL4byMDypgOPgyfIJWnejEKX1O/1J7x/lWSXJBsAzx54+ivAYVMLSRbPdX2SNMlGKjB6bwDOBM4Brh1Yfxiwd5JLkiwHXjYfxUnSpJrzLqmqOmrg8Q/pL7cdWHcycPJqXrcSOHDY9UmSVm8UjzAkSSPIwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNdloTU8mec6anq+q09ZvOZKkUbXGwAD2738/GNgL+Gq//GTgHMDAkKQJscbAqKpDAJJ8CVhUVSv65W2BE4ZenSRpZLSew3joVFj0rgd2HEI9kqQRNVuX1JSzkpwJLO2XDwa+MpySJEmjqCkwqurVSZ4NPLFfdVxVfWp4ZUmSRk3rEQbAhcAtVfWVJJsluV9V3TKswiRJo6XpHEaSlwGnAh/qV20PfHpYRUmSRk/rSe/DgL2BmwGq6gq6S20lSROiNTBur6o7phaSbATUcEqSJI2i1sD4WpI3AwuSPBU4BfjM8MqSJI2a1sB4I7ASuBR4OfB54C3DKkqSNHpaL6u9G/iv/keSNIGaAiPJ3sBRwMP61wSoqtp5eKVJkkZJ630YHwGOBC4A7hpeOZKkUdUaGDdV1ReGWokkaaTNNh/G7v3Ds5P8O91w5rdPPV9VFw6xNknSCJntCOM/pi0vGXhcwFPWbzmSpFE123wYT56rQiRJo611LKl/TbLVwPLWSd4+vLIkSaOm9ca
9/arqxqmFqvoV8PThlCRJGkWtgbFhkk2mFpIsADZZw/aSpDHTelntSXSz7n20Xz4E+NhwSpIkjaLWoUGOTvJdYN9+1b9U1ZnDK0uSNGpahwY5uqreAHxxNeskSROg9RzGU1ezbr/1WYgkabTNdqf3K4FXATsnuWTgqfsB3xpmYZKk0TJbl9THgS8A76SbE2PKLVX1y6FVJUkaObPd6X0TcBPwPIAkDwY2BbZIskVVXTP8EiVJo6D1Tu/9k1wBXAV8Dbia7shDkjQhWk96vx3YE/hBVe0E7AOcN7SqJEkjpzUw7qyqXwAbJNmgqs5m1ZFrJUljrvVO7xuTbAF8AzgpyQ3ArcMrS5I0atZ4hJHkiCR7AM8CfgMcQXfz3o+A/YdfniRpVMx2hLED8F5gV+BSunsvzgE+42W1kjRZZrus9vUASTamO2exF93Ag8clubGqFg2/REnSKGg9h7EA2BK4f//zU7ojDknShJhtaJDjgEcDtwDn03VHvbufQEmSNEFmu6x2R7qJkn4GXAdcC9y4xldIksbSbOcwnpYkdEcZewGvA3ZL8kvg3Kp66xzUKEkaAbOew6iqAi5LciPduFI3Ac8E9gAMDEmaELOdwzic7shiL+BOunMY5wDH40lvSZoosx1hLAROAY6sqhXDL0eSNKpmO4fx2rkqRJI02loHH5QkTTgDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMCRJTQwMSVITA0OS1MTAkCQ1MTAkSU0MDElSEwNDktTEwJAkNRlaYCS5K8nFAz8L17DtwiSXDasWSdK622iI7/3bqlo8xPeXJM2hYQbGPfRHGf8NbN6venVVnTNtm0cDHwU2pjsC+uuquiLJC4HD+/XnA6+qqruGUefbPvM9lv/05nusX77iZhZtu+Uqywd/6NxhlCDdK35HJ9P0dh+WYQbGgiQX94+vqqpnAzcAT62q25LsAiwFlkx73SuA91XVSUk2BjZM8ijgYGDvqrozyQeAFwAfG3xhkkOBQwF23HHH9b5Di7bdkgMWbw/w+9/SKPE7OpkG232YUlXDeePk11W1xbR19wfeDywG7gIeUVWb9Ucen62q3ZI8H/gnujA4rT+6eDXwZrrAAVgALK2qo2b6/CVLltSyZcvW815J0nhLckFVTf+HPDDHXVLAkcD1wGPpuptum75BVX08yfnAM4DPJ3k5EODEqnrTXBYrSfqDub6s9v7Aiqq6G3gRsOH0DZLsDFxZVccApwOPAc4CDkzy4H6bByR52NyVLUma68D4APDiJN8FdgVuXc02BwGX9ec/dgM+VlXLgbcAX0pyCfBlYNs5qlmSxBDPYcw3z2FI0tpb0zkM7/SWJDUxMCRJTQwMSVITA0OS1GRsT3onWQn8eB3eYhvg5+upnPuKSdxnmMz9dp8nx9ru98Oq6kGre2JsA2NdJVk205UC42oS9xkmc7/d58mxPvfbLilJUhMDQ5LUxMCY2XHzXcA8mMR9hsncb/d5cqy3/fYchiSpiUcYkqQmBoYkqYmBMU2SpyX5fpIfJnnjfNczDEkemuTsJMuTfC/Ja/r1D0jy5SRX9L+3nu9ahyHJhkkuSvLZfnmnJOf3bX5yP9Pj2EiyVZJTk/xfksuTPH4S2jrJkf33+7I
kS5NsOo5tneT4JDckuWxg3WrbN51j+v2/JMnua/NZBsaAJBsC/wnsBywCnpdk0fxWNRS/A15XVYuAPYHD+v18I3BWVe1CNwfJWAYm8Brg8oHlo4H3VNXDgV8Bfz8vVQ3P+4AvVtWudJOXXc6Yt3WS7YHDgSVVtRvd3DvPZTzb+gTgadPWzdS++wG79D+HAseuzQcZGKvaA/hhVV1ZVXcAnwAOmOea1ruqWlFVF/aPb6H7A7I93b6e2G92IvCs+alweJLsQDeb44f75QBPAU7tNxmr/e6nRX4i8BGAqrqjqm5kAtqabkbRBUk2AjYDVjCGbV1VXwd+OW31TO17AN0cQ1VV5wFbJWmeW8jAWNX2wE8Glq/t142tfj71xwHnAw+pqhX9Uz8DHjJPZQ3Te4F/BO7ulx8I3FhVv+uXx63NdwJWAh/tu+E+nGRzxrytq+o64F3ANXRBcRNwAePd1oNmat91+htnYEywJFsA/wMcUVU3Dz5X3fXWY3XNdZJnAjdU1QXzXcsc2gjYHTi2qh5HN8vlKt1PY9rWW9P9a3onYDtgc+7ZbTMR1mf7Ghirug546MDyDv26sZPkj+jC4qSqOq1fff3U4Wn/+4b5qm9I9gb+KsnVdN2NT6Hr39+q77aA8Wvza4Frq+r8fvlUugAZ97beF7iqqlZW1Z3AaXTtP85tPWim9l2nv3EGxqq+A+zSX0mxMd1JsjPmuab1ru+3/whweVW9e+CpM4AX949fDJw+17UNU1W9qap2qKqFdG371ap6AXA2cGC/2Vjtd1X9DPhJkkf2q/YBljPmbU3XFbVnks367/vUfo9tW08zU/ueAfxtf7XUnsBNA11Xs/JO72mSPJ2un3tD4Piqesc8l7TeJXkC8A3gUv7Ql/9muvMYnwR2pBsa/qCqmn4ybSwkeRLw+qp6ZpKd6Y44HgBcBLywqm6fz/rWpySL6U7ybwxcCRxC94/FsW7rJG8DDqa7KvAi4KV0/fVj1dZJlgJPohvG/HrgrcCnWU379uH5frruud8Ah1TVsubPMjAkSS3skpIkNTEwJElNDAxJUhMDQ5LUxMCQJDUxMDQR+tF5/3LauiOSHJtkuySnzvC6/02yZD18/glJrkuySb+8TX8D4TpL8qSpkXelYTIwNCmW0t2sN+i5wNKq+mlVHbia16xvdwF/Nwefs1b6UZqlWRkYmhSnAs+Ymv+gH3RxO+AbSRZOzSWQZEGST/TzRnwKWDD1Bkn+Ism5SS5Mcko/FhdJ9ukH9ru0n5tgkxlqeC9w5MDQFFPvu8oRQpL3J3lJ//jqJO9McnGSZUl2T3Jmkh8lecXA22yZ5HPp5nL5YJINZqn56iRHJ7kQ+Jt7/V9VE8XA0ETo72L+Nt18ANAdXXyy7nnn6iuB31TVo+jumP0z6LqQgLcA+1bV7sAy4LVJNqWbj+DgqvpTusH+XjlDGdcA3wRetJblX1NVi+nuzj+BbmiLPYG3DWyzB/APdPO4/AnwnJlqHnjNL6pq96r6xFrWowm10eybSGNjqlvq9P736ibPeSJwDEBVXZLkkn79nnR/jL/Vja7AxsC5wCPpBrn7Qb/dicBhdEcTq/PO/vM/txZ1T41ndimwRT+HyS1Jbk+yVf/ct6vqSvj9UBFPAG6boeYpJ69FDZKBoYlyOvCeflrKzdZymPMAX66q562yMnns2hRQVVckuRg4aGD171j1aH/TaS+bGuvo7oHHU8tT/w9PP1KqmWoecGtr3RLYJaUJUlW/phut9Hi6o43V+TrwfIAkuwGP6defB+yd5OH9c5sneQTwfWDh1Hq67qavzVLKO4DXDyz/GFiUZJP+iGGftdqxzh79KMsb0A2498011CzdKwaGJs1SunmtZwqMY4EtklwO/DPdLG1U1UrgJcDSvpvqXGDXqrqNbvTXU5JMjf77wTUVUFXfAy4cWP4J3ciil/W/L7oX+/UdulFILweuAj41U8334r0lwNFqJUmNPMKQJDUxMCRJTQwMSVI
TA0OS1MTAkCQ1MTAkSU0MDElSk/8HABSZHuiqMGwAAAAASUVORK5CYII=\n", 1185 | "text/plain": [ 1186 | "
" 1187 | ] 1188 | }, 1189 | "metadata": { 1190 | "needs_background": "light" 1191 | }, 1192 | "output_type": "display_data" 1193 | } 1194 | ], 1195 | "source": [ 1196 | "import matplotlib.pyplot as plt\n", 1197 | "plt.plot(decisions, ds='steps')\n", 1198 | "plt.xlabel('Video Number')\n", 1199 | "plt.ylabel('Watched')\n", 1200 | "plt.yticks([0, 1], ['False', 'True']);" 1201 | ] 1202 | }, 1203 | { 1204 | "cell_type": "markdown", 1205 | "id": "63f7077e", 1206 | "metadata": {}, 1207 | "source": [ 1208 | "The figure above shows when during our 100-videos-long session we were recommended a video about food (from `keywords`). The x-axis is chronological, the 1st video displayed is on the left, and the most recent video is on the right. The y-axis is \"yes\" or \"no,\" depending on if the video was related to food. " 1209 | ] 1210 | }, 1211 | { 1212 | "cell_type": "markdown", 1213 | "id": "b28a3649", 1214 | "metadata": {}, 1215 | "source": [ 1216 | "### Results\n", 1217 | "\n", 1218 | "You can look back to the `data/screenshots` folder we created to check whether the videos we watched appear to be food-related. \n", 1219 | "\n", 1220 | "If the feed was indeed increasingly filled with food videos, we would see more lines towards the right of the graph. At least here it does not appear to be the case. \n", 1221 | "\n", 1222 | "Does it mean that the WSJ investigation was wrong, or that TikTok stopped personalizing content? \n", 1223 | "\n", 1224 | "The answer is \"No,\" for several reasons: \n", 1225 | "\n", 1226 | "1. We only scrolled through 100 videos, this is likely too few to observe any effects. Try re-running with a higher number!
\n", 1227 | "2. When studying personalization you should use an account per profile and make sure you're logged in, rather than relying on a fresh browser. So, instead of closing the login dialog, try actually logging in! You know how to find and click buttons, and [this is how you put text in text fields](https://www.geeksforgeeks.org/send_keys-element-method-selenium-python/).
\n", 1228 | "3. When you're not logged in, you will be presented with content from all over the world, in all languages. If you filtered `keywords` in just one language, you will miss plenty of target content in other languages.
\n", 1229 | "4. You should always have a baseline to compare to. In this case, you should probably run two accounts at the same time - one that watches food videos and one that doesn't. Then you compare the prevalence of food videos between these two.
\n", 1230 | "5. The WSJ investigation was run on the mobile app rather than on a desktop browser. Perhaps TikTok's personalization works differently based on device or operating system." 1231 | ] 1232 | }, 1233 | { 1234 | "cell_type": "markdown", 1235 | "id": "5a227a0b", 1236 | "metadata": {}, 1237 | "source": [ 1238 | "## Advanced Usage\n", 1239 | "\n", 1240 | "Above we highlighted some ideas to make your investigation or study more robust, some are methodological choices, but others are technical.\n", 1241 | "\n", 1242 | "There are some advanced use-cases and tasks you can perform with browser automation that include\n", 1243 | "\n", 1244 | "- Authentication using the browser and storing cookies for later use.
\n", 1245 | "- Intercept background [API](/apis.html) calls and combine browser automation with API calls. See [`selenium-wire`](https://pypi.org/project/selenium-wire/) as an example.
\n", 1246 | "- Signing in with one or more email addresses.
\n", 1247 | "\n", 1248 | "We may cover some or all of these topics in subsequent tutorials, but you should feel free to experiment.\n", 1249 | "\n", 1250 | "Let us know what you're interested in learning more about!" 1251 | ] 1252 | }, 1253 | { 1254 | "cell_type": "markdown", 1255 | "id": "e5623624", 1256 | "metadata": {}, 1257 | "source": [ 1258 | "# Related Readings\n", 1259 | "\n", 1260 | "More tutorials on the same subject:\n", 1261 | "\n", 1262 | "- \"[Using real browsers](https://scrapism.lav.io/using-real-browsers/)\" - Sam Lavigne\n", 1263 | "\n", 1264 | "Notable investigations, audits, and tools using browser automation:\n", 1265 | "\n", 1266 | "- \"[Blacklight](https://themarkup.org/blacklight)\" - an investigative tool by Surya Mattu<br>
\n", 1267 | "- \"[TheirTube](https://www.their.tube/)\" - an art and advocacy project by Tomo Kihara
\n", 1268 | "- \"[Worlds Apart](https://www.nrk.no/osloogviken/xl/tiktok-doesn_t-show-the-war-in-ukraine-to-russian-users-1.15921522)\" - a TikTok investigation by Henrik Bøe and Christian Nicolai Bjørke
\n", 1269 | "- \"[WebSearcher](https://github.com/gitronald/WebSearcher)\" - A Python package by Ronald E. Robertson
\n", 1270 | "- \"[Googling for Abortion](https://journalqd.org/article/view/2752)\" - Yelena Mejova, Tatiana Gracyk, and Ronald E. Robertson
\n", 1271 | "- \"[webXray](https://webxray.org/)\" - A website forensics tool by Tim Libert<br>
\n", 1272 | "- \"[OpenWPM](https://github.com/itdelatrisu/OpenWPM)\" - A privacy-measurement tool\n", 1273 | "\n", 1274 | "Please reach out with more examples to add." 1275 | ] 1276 | }, 1277 | { 1278 | "cell_type": "markdown", 1279 | "id": "c29c689d", 1280 | "metadata": {}, 1281 | "source": [ 1282 | "# Citation\n", 1283 | "\n", 1284 | "To cite this chapter, please use the following BibTex entry:\n", 1285 | "\n", 1286 | "
\n",
1287 |     "@incollection{inspect2023browser,\n",
1288 |     "  author    = {Sapiezynski, Piotr and Yin, Leon},\n",
1289 |     "  title     = {Browser Automation},\n",
1290 |     "  booktitle = {Inspect Element: the practitioner's guide to hypothesis-driven data investigations},\n",
1291 |     "  year      = {2023},\n",
1292 |     "  editor    = {Yin, Leon and Sapiezynski, Piotr},\n",
1293 |     "  note      = {\\url{https://inspectelement.org}}\n",
1294 |     "}\n",
1295 |     "
\n", 1296 | "\n", 1297 | "## Acknowledgements\n", 1298 | "\n", 1299 | "Thank you to Ruth Talbot and John West for answering questions about their two respective investigations." 1300 | ] 1301 | }, 1302 | { 1303 | "cell_type": "code", 1304 | "execution_count": null, 1305 | "id": "a0611f22", 1306 | "metadata": {}, 1307 | "outputs": [], 1308 | "source": [] 1309 | } 1310 | ], 1311 | "metadata": { 1312 | "kernelspec": { 1313 | "display_name": "Python 3", 1314 | "language": "python", 1315 | "name": "python3" 1316 | }, 1317 | "language_info": { 1318 | "codemirror_mode": { 1319 | "name": "ipython", 1320 | "version": 3 1321 | }, 1322 | "file_extension": ".py", 1323 | "mimetype": "text/x-python", 1324 | "name": "python", 1325 | "nbconvert_exporter": "python", 1326 | "pygments_lexer": "ipython3", 1327 | "version": "3.7.3" 1328 | } 1329 | }, 1330 | "nbformat": 4, 1331 | "nbformat_minor": 5 1332 | } 1333 | -------------------------------------------------------------------------------- /build-your-own-datasets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "14dbb3e2", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Build your own datasets\"\n", 10 | "date-modified: \"06-26-2023\"\n", 11 | "href: data\n", 12 | "---" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "3ab5cb24", 18 | "metadata": {}, 19 | "source": [ 20 | "\"Finding stories in datasets\" is a misnomer in data journalism. Most open source datasets were created with a use case in mind. Seldom is that original use case compatible with a hypothesis you'll want to test, let alone a source that could lead any form of accountability.\n", 21 | "\n", 22 | "The same can be said for the academy. 
Analyzing existing datasets will not lead to drastically different conclusions, and can disproportionately inflate the importance of a topic, just because the data is readily available.\n", 23 | "\n", 24 | "Instead, you can build your own datasets by synthesizing publicly available data and records obtained from records requests.\n", 25 | "\n", 26 | "\n", 27 | "## Public data sources explained\n", 28 | "\n", 29 | "There's a difference between \"open data\" and \"publicly available data\":\n", 30 | "\n", 31 | "- **Open data** is typically already combined into a spreadsheet or database. Additionally, open data is usually documented, and easily available for the public to use. See for example, [climate data from NOAA](https://www.ncei.noaa.gov/cdo-web/), or the U.S. Census Bureau's [American Community Survey](https://www.census.gov/programs-surveys/acs/technical-documentation/code-lists.html).\n", 32 | "\n", 33 | "- **Publicly available data** lives on the open web, but has yet to be synthesized into a cohesive data set. It's up to you to collect these data points, responsibly. Search engines and many other technology companies (such as AI developers) depend on \"crawling\" these sources. \n", 34 | "\n", 35 | "At a minimum: only collect data with intention, do not overload websites' servers, and abstain from collecting personally identifiable information without user consent.\n", 36 | "\n", 37 | "## What to expect in this section?\n", 38 | "\n", 39 | "Publicly available data is a useful tool to audit and investigate technologies and their underlying business practices.\n", 40 | "\n", 41 | "The following sections will cover programmatic data collection and best-practices.\n", 42 | "\n", 43 | "We'll discuss data collection techniques such as:\n", 44 | "\n", 45 | "- [Finding undocumented APIs](/apis.html)
\n", 46 | "- [Browser automation](/browser_automation.html)
\n", 47 | "- App automation
\n", 48 | "- Parsing HTML and JSON\n", 49 | "\n", 50 | "Use these techniques to build datasets that allow you to test original hypotheses, design clear experiments, and understand the limitations that come along with the decisions you make." 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "6da08a7f", 56 | "metadata": {}, 57 | "source": [ 58 | "## Scraping is not a crime\n", 59 | "\n", 60 | "Although big tech giants and data brokers often [depend on web scraping](https://www.bloomberg.com/news/articles/2023-02-02/meta-was-scraping-sites-for-years-while-fighting-the-practice#xj4y7vzkg) for their business models, they seldom use that data in the public interest or release data that could be used to hold themselves accountable. \n", 61 | "\n", 62 | "This guide exists to teach you how to build evidence that leads to accountability. However, know that using data to investigate powerful entities is not without risks. \n", 63 | "\n", 64 | "If you're in the United States: know what violates the **Computer Fraud and Abuses Act (CFAA)**, which primarily prohibits unauthorized access to a computer network.\n", 65 | "\n", 66 | "Recent cases such as *[Van Buren v. United States](https://www.supremecourt.gov/opinions/20pdf/19-783_k53l.pdf)*, *[hiQ v Linkedin](https://www.fbm.com/publications/what-recent-rulings-in-hiq-v-linkedin-and-other-cases-say-about-the-legality-of-data-scraping/)*, and *[Sandvig v. Barr](https://www.aclu.org/documents/sandvig-v-barr-memorandum-opinion)* helped shape interpretations of CFAA for collecting public data with automated means, such as web scraping.\n", 67 | "\n", 68 | "Although the [legal landscape](https://www.eff.org/deeplinks/2022/04/scraping-public-websites-still-isnt-crime-court-appeals-declares) is changing to favor [web scraping in the public interest](https://themarkup.org/news/2020/12/03/why-web-scraping-is-vital-to-democracy), we still see governments and industry titans attempt to shut down accountability efforts. 
Take for example:\n", 69 | "\n", 70 | "- A journalist in Missouri was called a hacker by the governor and threatened prosecution for identifying a flaw that revealed social security numbers of school employees after [inspecting the page source](https://arstechnica.com/tech-policy/2021/10/viewing-website-html-code-is-not-illegal-or-hacking-prof-tells-missouri-gov/).\n", 71 | "- Academic researchers at NYU received a cease-and-desist notice [for crowdsourcing Political ads from Facebook](https://knightcolumbia.org/content/researchers-nyu-knight-institute-condemn-facebooks-effort-to-squelch-independent-research-about-misinformation).\n", 72 | "\n", 73 | "::: {.callout-note}\n", 74 | "Even if your activity does not fall within CFAA’s purview or violate any other law, online services can suspend your account(s) for breaking their terms of service. For that reason, be careful involving your personal/institutional accounts in web scraping, and volunteers' if you're crowdsourcing data.\n", 75 | ":::\n", 76 | "\n", 77 | "If you want more information on the topic, several of the field's top researchers explore the legal and ethical considerations in Section 4.1 of @Metaxa-book.\n", 78 | "\n", 79 | "**This is NOT legal advice.** Discuss your intentions and your plan to collect data with your editor and legal counsel (if you're a journalist), or your advisor and ethics board (if you're a researcher). \n", 80 | "\n", 81 | "Having institutional support is essential to make sure you are protected, and that you and your superiors are well-informed about the risks." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "62642cb1", 87 | "metadata": {}, 88 | "source": [ 89 | "\n", 90 | "### Aaron Swartz\n", 91 | "\n", 92 | "Contemporary legal interpretations of CFAA and web scraping can be traced back to the late activist and engineer Aaron Swartz.\n", 93 | "\n", 94 | "In 2008, Swartz was investigated by the F.B.I. 
for [scraping 2.7 million public court records](https://www.aaronswartzday.org/pacer-project-explained/) from PACER and sharing it with the public. Swartz redistributed information that is in the public domain, but hosted by a central entity that charges fees for accessing that public information.\n", 95 | "\n", 96 | "The bureau concluded that Swartz did not violate any laws, but three years later, Swartz was arrested and [federally indicted](https://www.documentcloud.org/documents/217117-united-states-of-america-v-aaron-swartz) for mass-downloading academic articles from JSTOR using a laptop stored in an MIT closet. Although neither JSTOR, MIT, nor state prosecutors chose to litigate, [federal prosecutors](https://en.wikipedia.org/wiki/Carmen_Ortiz#Prosecution_of_Aaron_Swartz) sought maximal penalties: Swartz faced $1 million in fees and 35 years in prison– charges that were deeply criticized by [lawyers](https://www.wbur.org/news/2013/01/16/gertner-criticizes-ortiz-swartz) and [experts](https://unhandled.com/2013/01/12/the-truth-about-aaron-swartzs-crime/). \n", 97 | "\n", 98 | "Swartz's prosecution and [untimely passing](https://boingboing.net/2013/01/12/rip-aaron-swartz.html) would have a chilling effect on web scraping in the academy for years to come. But attitudes are changing slowly, with journalists, researchers, and other public interest technologists receiving more legal and institutional protections to collect publicly available data.\n", 99 | "\n", 100 | "You can learn more about Aaron Swartz in the documentary “[The Internet’s Own Boy](https://archive.org/details/TheInternetsOwnBoyTheStoryOfAaronSwartz),“ directed by Brian Knappenberger, and on the website [AaronSwartzDay.org](https://www.aaronswartzday.org/about/)." 
101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "fbf68a47", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [] 110 | } 111 | ], 112 | "metadata": { 113 | "kernelspec": { 114 | "display_name": "Python 3", 115 | "language": "python", 116 | "name": "python3" 117 | }, 118 | "language_info": { 119 | "codemirror_mode": { 120 | "name": "ipython", 121 | "version": 3 122 | }, 123 | "file_extension": ".py", 124 | "mimetype": "text/x-python", 125 | "name": "python", 126 | "nbconvert_exporter": "python", 127 | "pygments_lexer": "ipython3", 128 | "version": "3.7.3" 129 | } 130 | }, 131 | "nbformat": 4, 132 | "nbformat_minor": 5 133 | } 134 | -------------------------------------------------------------------------------- /checklist.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "c3d5cc64", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Pitching Data Investigations\"\n", 10 | "pagetitle: \"Pitching Data Investigations\"\n", 11 | "description-meta: \"Experiment checklist\"\n", 12 | "description-title: \"Experiment checklist\"\n", 13 | "date: \"08-13-2023\"\n", 14 | "date-modified: \"01-04-2025\"\n", 15 | "bibliography: references.bib\n", 16 | "execute: \n", 17 | " enabled: false\n", 18 | "keywords: pitching stories, experiment planning\n", 19 | "twitter-card:\n", 20 | " title: \"Pitching Hypothesis-Driven Data Investigations\"\n", 21 | " description: \"Answer these questions to bullet-proof your story\"\n", 22 | " image: assets/inspect-element-logo.jpg\n", 23 | "open-graph:\n", 24 | " title: \"Pitching Hypothesis-Driven Data Investigations\"\n", 25 | " description: \"Answer these questions to bullet-proof your story\"\n", 26 | " locale: us_EN\n", 27 | " site-name: Inspect Element\n", 28 | " image: assets/inspect-element-logo.jpg\n", 29 | "format:\n", 30 | " html:\n", 31 | " toc: false\n", 32 | "href: 
checklist\n", 33 | "---" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "68af51e4", 39 | "metadata": {}, 40 | "source": [ 41 | "This document asks essential questions to plan data investigations and experiments. Revisit these questions throughout your reporting and research, and use them to communicate your intentions and limitations with your editor. It will help determine if a story is worth pursuing by giving an estimation of time, complexity, and impact.\n", 42 | "\n", 43 | "As a side benefit, these questions form the backbone of a methodology to get reviewed by experts, as well as the subject of your investigation.\n", 44 | "\n", 45 | "::: {.callout-note}\n", 46 | "Copy the checklist as text [below](#accountability-reporting-checklist), or as a public [Google Doc](https://docs.google.com/document/d/1LXXxMgRlssaQwzs_fmqwIpwySdqXMXpwLaHQ-N5Vdw8/edit?usp=sharing).\n", 47 | ":::\n", 48 | "\n", 49 | "\n", 50 | "## Accountability Reporting Checklist\n", 51 | "To direct reporting, highlight open factual questions to answer.\n", 52 | "\n", 53 | "**Hypothesis**: What is the reporting question?\n", 54 | "
\n", 55 | "_This is an investigative claim (1-2 sentences) that can be tested. _\n", 56 | "
\n", 57 | "\n", 58 | "**Wrongdoing**: Who is causing it? Who is harmed?\n", 59 | "
\n", 60 | "_Who are the key players?
\n", 61 | "What’s the scale (size of the market, # of people affected) and scope (local, national, international)?
\n", 62 | "Is it getting worse? \n", 63 | "_\n", 64 | "
\n", 65 | "\n", 66 | "**Accountability**: What standard will I use to assess harm?\n", 67 | "
\n", 68 | "_Note relevant laws (regulation) and corporate claims to check.
\n", 69 | "Is there hypocrisy, misuse of power, or legal gray areas?
\n", 70 | "Is the problem being addressed?_\n", 71 | "
\n", 72 | "\n", 73 | "**Lit Review and Anecdotes**: What evidence supports the hypothesis?\n", 74 | "
\n", 75 | "_Include the best research and reporting (with links) on the subject.
\n", 76 | "What will we do differently and what do we bring to the table?
\n", 77 | "Have you conducted preliminary interviews or found other leads?_\n", 78 | "
\n", 79 | "\n", 80 | "**Categorization**: Choose accurate and reproducible terminology.\n", 81 | "
\n", 82 | "_Seldom are key variables found neat and tidy within a spreadsheet column._
\n", 83 | "_How will you categorize terms (“hate speech” or “slow internet”)?_
\n", 84 | "_Are there experts to lean on?_\n", 85 | "
\n", 86 | "\n", 87 | "**Viability**: What is a quick experiment and reporting plan to test the hypothesis?\n", 88 | "
\n", 89 | "_Determine whether we have a feasible story early._\n", 90 | "
\n", 91 | "\n", 92 | "**Key ingredients**: List with specificity.\n", 93 | "
\n", 94 | "_Note the “getability” of each item._\n", 95 | "\n", 96 | "- **Humans** – what does the ideal interviewee look like?\n", 97 | "\n", 98 | "- **Documents** – Are there agencies to FOIA or other receipts we can find?\n", 99 | "\n", 100 | "- **Data** – how will you gather, sample, merge, clean, analyze the data?\n", 101 | "\n", 102 | "- **Observation** - ground-truthing, field reporting, etc.\n", 103 | "
\n", 104 | "\n", 105 | "**Visualize the article**: what are possible headlines, copy, or graphics?\n", 106 | "
\n", 107 | "_With reporting elements in hand, how will you present the key information to readers?_\n", 108 | "
\n", 109 | "\n", 110 | "**Expectations**: what are the [min and max](https://gijn.org/resource/introduction-investigative-journalism/) stories?\n", 111 | "
\n", 112 | "\n", 113 | "- **Minimum story**\n", 114 | "\n", 115 | "- **Maximum story**\n", 116 | "
\n", 117 | "\n", 118 | "**Bulletproofing**: What are obstacles and limitations? \n", 119 | "
\n", 120 | "_What problems do you foresee and how do we address them?
\n", 121 | "Are there outside experts or colleagues who will roleplay Reviewer #2?_\n", 122 | "
\n", 123 | "\n", 124 | "**List of questions to answer before proceeding**\n", 125 | "
\n", 126 | "...\n", 127 | "
" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "eefd3256", 133 | "metadata": {}, 134 | "source": [ 135 | "## Acknowledgements\n", 136 | "\n", 137 | "This checklist is adapted from a [checklist](https://docs.google.com/document/d/19tft98L90zUq4tn6TBXgaFacdbaI3Kr_/) used by my editors Julia Angwin and Evelyn Larrubia at The Markup. As of 2025, I have incorporated aspects of several other checklists used by peers across different newsrooms. Jeremy Singer-Vine provided feedback on a previous draft of the adapted list.\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "raw", 142 | "id": "91e903da", 143 | "metadata": {}, 144 | "source": [] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.7.3" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 5 168 | } 169 | -------------------------------------------------------------------------------- /index.qmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Inspect Element" 3 | href: intro 4 | format: 5 | html: 6 | toc: False 7 | --- 8 | 9 | 10 | 11 | This guide walks through in-depth case studies and hands-on tutorials to help you investigate opaque systems, systematically. 12 | 13 | As reporters and researchers, we are watchdogs of titans of industry, government institutions and the powerful. The answers to how these entities impact society are often scattered, intermingled, or sealed away. Luckily, with a little know-how, we can reveal evidence of wrongdoing that can lead to accountability. 
14 | 15 | With this guide you will learn how to build your own datasets by [finding undocumented APIs](/apis.html), [automating browsers](/browser_automation.html), and crowdsourcing. Most importantly, you'll build the intuition to develop a [hypothesis](https://unesdoc.unesco.org/in/documentViewer.xhtml?v=2.1.196&id=p::usmarcdef_0000193078&file=/in/rest/annotationSVC/DownloadWatermarkedAttachment/attach_import_ba5365b5-bd0c-4bb9-a008-3e0935174cbe%3F_%3D193078eng.pdf&locale=en&multi=true&ark=/ark:/48223/pf0000193078/PDF/193078eng.pdf#%5B%7B%22num%22%3A76%2C%22gen%22%3A0%7D%2C%7B%22name%22%3A%22XYZ%22%7D%2Cnull%2Cnull%2C0%5D), and bullet-proof methodologies to answer key questions for your reporting process and research. 16 | 17 | As practitioners, we've had to learn on the job, hit dead ends, defend our decisions, and seek external expertise across fields and industries. Here we distill these experiences and include tips for veterans and newcomers alike. 18 | 19 | The disciplines we'll draw from include: investigative journalism, data engineering, computer science, social science, and other branches of information science. 20 | 21 | Don't code? No problem: the guide emphasizes underlying principles and uses plain-language ("pseudocode") explanations to accompany any code. 22 | 23 | ### Who wrote this? 24 | 25 | Inspect Element is written by investigative data journalist [Leon Yin](https://leonyin.org) with contributions by [Piotr Sapiezynski](https://www.sapiezynski.com) and others [TK](https://en.wikipedia.org/wiki/To_come_(publishing)). 26 | 27 | Leon will frequently reference past investigations he's worked on in this guide. 28 | You can read those investigations plus new stories at [Bloomberg](https://www.bloomberg.com/authors/AWeGYifVgnI/leon-yin) and [The Markup](https://themarkup.org/people/leon-yin). 29 | 30 | This site was generated using the [Quarto](https://quarto.org/) open-source publishing system. 31 | 32 | #### Corrections, comments, suggestions? 
33 | 34 | Email: inspectelement@leonyin.org
35 | File an issue: on [GitHub](https://github.com/yinleon/inspect-element/issues/new) 36 | -------------------------------------------------------------------------------- /references.bib: -------------------------------------------------------------------------------- 1 | @InProceedings{pmlr-v81-buolamwini18a, 2 | title = {Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification}, 3 | author = {Buolamwini, Joy and Gebru, Timnit}, 4 | booktitle = {Proceedings of the 1st Conference on Fairness, Accountability and Transparency}, 5 | pages = {77--91}, 6 | year = {2018}, 7 | editor = {Friedler, Sorelle A. and Wilson, Christo}, 8 | volume = {81}, 9 | series = {Proceedings of Machine Learning Research}, 10 | month = {23--24 Feb}, 11 | publisher = {PMLR}, 12 | pdf = {http://proceedings.mlr.press/v81/buolamwini18a/buolamwini18a.pdf}, 13 | url = {https://proceedings.mlr.press/v81/buolamwini18a.html}, 14 | abstract = {Recent studies demonstrate that machine learning algorithms can discriminate based on classes like race and gender. In this work, we present an approach to evaluate bias present in automated facial analysis algorithms and datasets with respect to phenotypic subgroups. Using the dermatologist approved Fitzpatrick Skin Type classification system, we characterize the gender and skin type distribution of two facial analysis benchmarks, IJB-A and Adience. We find that these datasets are overwhelmingly composed of lighter-skinned subjects (79.6% for IJB-A and 86.2% for Adience) and introduce a new facial analysis dataset which is balanced by gender and skin type. We evaluate 3 commercial gender classification systems using our dataset and show that darker-skinned females are the most misclassified group (with error rates of up to 34.7%). The maximum error rate for lighter-skinned males is 0.8%. 
The substantial disparities in the accuracy of classifying darker females, lighter females, darker males, and lighter males in gender classification systems require urgent attention if commercial companies are to build genuinely fair, transparent and accountable facial analysis algorithms.} 15 | } 16 | @inproceedings{sap-etal-2019-risk, 17 | title = "The Risk of Racial Bias in Hate Speech Detection", 18 | author = "Sap, Maarten and 19 | Card, Dallas and 20 | Gabriel, Saadia and 21 | Choi, Yejin and 22 | Smith, Noah A.", 23 | booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics", 24 | month = jul, 25 | year = "2019", 26 | address = "Florence, Italy", 27 | publisher = "Association for Computational Linguistics", 28 | url = "https://aclanthology.org/P19-1163", 29 | doi = "10.18653/v1/P19-1163", 30 | pages = "1668--1678", 31 | abstract = "We investigate how annotators{'} insensitivity to differences in dialect can lead to racial bias in automatic hate speech detection models, potentially amplifying harm against minority populations. We first uncover unexpected correlations between surface markers of African American English (AAE) and ratings of toxicity in several widely-used hate speech datasets. Then, we show that models trained on these corpora acquire and propagate these biases, such that AAE tweets and tweets by self-identified African Americans are up to two times more likely to be labelled as offensive compared to others. 
Finally, we propose *dialect* and *race priming* as ways to reduce the racial bias in annotation, showing that when annotators are made explicitly aware of an AAE tweet{'}s dialect they are significantly less likely to label the tweet as offensive.", 32 | } 33 | @article{freelon-2018-api, 34 | author = "Deen Freelon", 35 | title = "Computational Research in the Post-API Age", 36 | journal = {Political Communication}, 37 | volume = {35}, 38 | number = {4}, 39 | pages = {665-668}, 40 | year = "2018", 41 | publisher = {Routledge}, 42 | doi = {10.1080/10584609.2018.1477506}, 43 | URL = { 44 | https://doi.org/10.1080/10584609.2018.1477506 45 | }, 46 | eprint = { 47 | https://doi.org/10.1080/10584609.2018.1477506 48 | } 49 | } 50 | @article{gizmodo-ring-2019, 51 | author = {Cameron, Dell and Mehrota, Dhruv}, 52 | date = {2019-12-08}, 53 | title = {Ring’s Hidden Data Let Us Map Amazon's Sprawling Home Surveillance Network}, 54 | journal = {Gizmodo}, 55 | url = {https://gizmodo.com/ring-s-hidden-data-let-us-map-amazons-sprawling-home-su-1840312279}, 56 | urldate = {2022-02-22} 57 | } 58 | @article{willis-plum, 59 | author = {Derek Willis}, 60 | date = {2013-04-11}, 61 | title = {Freeing the Plum Book}, 62 | journal = {Source}, 63 | url = {https://source.opennews.org/articles/freeing-plum-book/}, 64 | urldate = {2022-02-22} 65 | } 66 | @article{calacci-2022, 67 | author = {Calacci, Dan and Shen, Jeffrey J. and Pentland, Alex}, 68 | title = {The Cop In Your Neighbor's Doorbell: Amazon Ring and the Spread of Participatory Mass Surveillance}, 69 | year = {2022}, 70 | issue_date = {November 2022}, 71 | publisher = {Association for Computing Machinery}, 72 | address = {New York, NY, USA}, 73 | volume = {6}, 74 | number = {CSCW2}, 75 | url = {https://doi.org/10.1145/3555125}, 76 | doi = {10.1145/3555125}, 77 | journal = {Proc. ACM Hum.-Comput. 
Interact.}, 78 | month = {nov}, 79 | articleno = {400}, 80 | numpages = {47}, 81 | keywords = {platforms, law enforcement, surveillance, data & society} 82 | } 83 | @ARTICLE{garcia-2018, 84 | author = {{Garcia}, David and {Mitike Kassa}, Yonas and {Cuevas}, Angel and {Cebrian}, Manuel and {Moro}, Esteban and {Rahwan}, Iyad and {Cuevas}, Ruben}, 85 | title = "{Analyzing gender inequality through large-scale Facebook advertising data}", 86 | journal = {Proceedings of the National Academy of Science}, 87 | keywords = {Computer Science - Computers and Society}, 88 | year = 2018, 89 | month = jul, 90 | volume = {115}, 91 | number = {27}, 92 | pages = {6958-6963}, 93 | doi = {10.1073/pnas.1717781115}, 94 | archivePrefix = {arXiv}, 95 | eprint = {1710.03705}, 96 | primaryClass = {cs.CY}, 97 | adsurl = {https://ui.adsabs.harvard.edu/abs/2018PNAS..115.6958G}, 98 | adsnote = {Provided by the SAO/NASA Astrophysics Data System} 99 | } 100 | @inproceedings{princeton-2020, 101 | author = {Major, David and Teixeira, Ross and Mayer, Jonathan}, 102 | title = {No WAN's Land: Mapping U.S. Broadband Coverage with Millions of Address Queries to ISPs}, 103 | year = {2020}, 104 | isbn = {9781450381383}, 105 | publisher = {Association for Computing Machinery}, 106 | address = {New York, NY, USA}, 107 | url = {https://doi.org/10.1145/3419394.3423652}, 108 | doi = {10.1145/3419394.3423652}, 109 | abstract = {Accurate broadband coverage data is essential for public policy planning and government support programs. In the United States, the Federal Communications Commission is responsible for maintaining national broadband coverage data. Observers have panned the FCC's broadband maps for overstating availability, due to coarsegrained data collection and a low coverage threshold.We demonstrate a new approach to building broadband coverage maps: automated large-scale queries to the public availability checking tools offered by major internet service providers. 
We reverse engineer the coverage tools for nine major ISPs in the U.S., test over 19 million residential street addresses across nine states for service, and compare the results to the FCC's maps.Our results demonstrate that the FCC's coverage data significantly overstates the availability of each ISP's service, access to any broadband, connection speeds available to consumers, and competition in broadband markets. We also find that the FCC's data disproportionately overstates coverage in rural and minority communities. Our results highlight a promising direction for developing more accurate broadband maps and validating coverage reports.}, 110 | booktitle = {Proceedings of the ACM Internet Measurement Conference}, 111 | pages = {393–419}, 112 | numpages = {27}, 113 | location = {Virtual Event, USA}, 114 | series = {IMC '20} 115 | } 116 | @BOOK{Metaxa-book, 117 | author={Metaxa, Danaë and Park, Joon Sung and Robertson, Ronald E. and Karahalios, Karrie and Wilson, Christo and Hancock, Jeff and Sandvig, Christian}, 118 | booktitle={Auditing Algorithms: Understanding Algorithmic Systems from the Outside In}, 119 | year={2021}, 120 | volume={}, 121 | number={}, 122 | pages={}, 123 | doi={}} -------------------------------------------------------------------------------- /references.qmd: -------------------------------------------------------------------------------- 1 | # References {.unnumbered} 2 | 3 | ::: {#refs} 4 | ::: -------------------------------------------------------------------------------- /selenium_wire.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "164ee7fa", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Advanced Usage\"\n", 10 | "pagetitle: \"Selenium Wire\"\n", 11 | "description-meta: \"Introduction, case studies, and exercises for automating browsers.\"\n", 12 | "description-title: \"Introduction, case studies, and exercises 
for automating browsers.\"\n", 13 | "author: \"Piotr Sapiezynski and Leon Yin\"\n", 14 | "author-meta: \"Piotr Sapiezynski and Leon Yin\"\n", 15 | "date: \"06-11-2023\"\n", 16 | "date-modified: \"06-17-2023\"\n", 17 | "execute: \n", 18 | " enabled: false\n", 19 | "keywords: data collection, web scraping, browser automation, algorithm audits, personalization\n", 20 | "twitter-card:\n", 21 | " title: Browser Automation\n", 22 | " description: Introduction, case studies, and exercises for automating browsers.\n", 23 | " image: assets/inspect-element-logo.jpg\n", 24 | "open-graph:\n", 25 | " title: Browser Automation\n", 26 | " description: Introduction, case studies, and exercises for automating browsers.\n", 27 | " locale: us_EN\n", 28 | " site-name: Inspect Element\n", 29 | " image: assets/inspect-element-logo.jpg\n", 30 | "href: selenium_wire\n", 31 | "---" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "75bb9edb", 37 | "metadata": {}, 38 | "source": [ 39 | "This section will walk through advanced use cases you might run into when using browser automation.\n", 40 | "\n", 41 | "1. Intercepting network requests (API calls) while browsing\n", 42 | "2. " 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "id": "3cafe56c", 48 | "metadata": {}, 49 | "source": [ 50 | "### Requirements.txt\n", 51 | "\n", 52 | "Here are the Python packages we'll use to intercept traffic in Selenium.\n", 53 | "\n", 54 | "- `selenium-wire` is a package that offers the same functionality of `selenium`, with the added bonus of being able to intercept network traffic. (API requests).
\n", 55 | "- `brotlipy` is a package used to decode compressed responses from servers: aka when the response looks like random characters.
\n", 56 | "- `chromedriver-binary-auto` to help Selenium find the web driver for Chromium.\n", 57 | "\n", 58 | "We'll also upgrade a default library `requests`, because older versions of the package will not function properly." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "1b1650b9", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# !pip install selenium-wire requests chromedriver-binary-auto brotlipy" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 2, 74 | "id": "d0a1df65", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# !pip install requests --upgrade" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "411f9d7e", 84 | "metadata": {}, 85 | "source": [ 86 | "## Intercepting Network Requests in Selenium\n", 87 | "\n", 88 | "Selenium-Wire can be used anytime you would use Selenium. All we need to do is change the import from `selenium` to `seleniumwire`.
\n", 89 | "Notice we continue to use `chromedriver_binary` to make our lives easier." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 40, 95 | "id": "da075956", 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "from seleniumwire import webdriver\n", 100 | "import chromedriver_binary\n", 101 | "\n", 102 | "driver = webdriver.Chrome()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "id": "e6a1116d", 108 | "metadata": {}, 109 | "source": [ 110 | "Ideally, a blank window of Chrome should appear without any error messages. Some M1-series Macbooks run into issues downloading Selenium. Here's [a potential fix](https://stackoverflow.com/a/74651536/18264897) for that issue." 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "03127423", 116 | "metadata": {}, 117 | "source": [ 118 | "### Visiting a website and triggering requests\n", 119 | "\n", 120 | "To demonstrate how to intercept network requests in Selenium, we'll trigger DuckDuckGo's autocomplete in the browser and fetch the network request (undocumented API) running in the background communicating with their servers." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 41, 126 | "id": "81481446", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# open the duckduckgo website in our automated browser\n", 131 | "driver.get('https://duckduckgo.com')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "b908ccf4", 137 | "metadata": {}, 138 | "source": [ 139 | "Now, manually type \"why are\" in the search box to trigger the autocomplete function.
\n", 140 | "Bonus: do this programmatically.\n", 141 | "\n", 142 | "You'll notice this is nearly identical to our [finding undocumented APIs](/apis.html) tutorial.\n", 143 | "\n", 144 | "Rather than find the network requests in the `DevTools`, we can view them programmatically here, thanks for Selenium Wire's built-in `requests` attribute to web`driver`." 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 42, 150 | "id": "8b6e1db5", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "[Request(method='GET', url='https://duckduckgo.com/ac/?q=why+a&kl=wt-wt', headers=[('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('sec-fetch-site', 'same-origin'), ('sec-fetch-mode', 'cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b''),\n", 157 | " Request(method='GET', url='https://duckduckgo.com/ac/?q=why+are&kl=wt-wt', headers=[('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('sec-fetch-site', 'same-origin'), ('sec-fetch-mode', 'cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b''),\n", 158 | " Request(method='POST', 
url='https://improving.duckduckgo.com/t/page_home_searchbox_suggest?1476016&b=chrome&d=d&l=en-US&p=mac&atb=v398-5&pre_va=n&pre_atbva=r&atbi=true&i=false&ak=false&ax=false', headers=[('content-length', '0'), ('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('origin', 'https://duckduckgo.com'), ('sec-fetch-site', 'same-site'), ('sec-fetch-mode', 'no-cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b'')]" 159 | ] 160 | }, 161 | "execution_count": 42, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "driver.requests[-3:]" 168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "id": "29a7ea68", 173 | "metadata": {}, 174 | "source": [ 175 | "Above we list the three latest network requests, and find that the url `https://duckduckgo.com/ac/?q=why+are+&kl=wt-wt` seems like its the undocumented API for autocomplete.\n", 176 | "\n", 177 | "As you'll soon notice if you repeat this step: requests are being made all the time!" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 44, 183 | "id": "0afe2e2a", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# You can save a list of them like so:\n", 188 | "saved_requests = driver.requests" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "id": "c8da8119", 194 | "metadata": {}, 195 | "source": [ 196 | "You can filter the requests using a [list comprehension](https://www.w3schools.com/python/python_lists_comprehension.asp) (or any other way of sifting through a list)." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 38, 202 | "id": "4ee71a62", 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "look_for = 'duckduckgo.com/ac/'" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 45, 212 | "id": "fe3f1c20", 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "[Request(method='GET', url='https://duckduckgo.com/ac/?q=why&kl=wt-wt', headers=[('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('sec-fetch-site', 'same-origin'), ('sec-fetch-mode', 'cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b''),\n", 219 | " Request(method='GET', url='https://duckduckgo.com/ac/?q=why+a&kl=wt-wt', headers=[('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('sec-fetch-site', 'same-origin'), ('sec-fetch-mode', 'cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b''),\n", 220 | " Request(method='GET', url='https://duckduckgo.com/ac/?q=why+are&kl=wt-wt', headers=[('sec-ch-ua', '\"Google Chrome\";v=\"117\", \"Not;A=Brand\";v=\"8\", \"Chromium\";v=\"117\"'), ('sec-ch-ua-mobile', '?0'), ('user-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/117.0.0.0 Safari/537.36'), ('sec-ch-ua-platform', '\"macOS\"'), ('accept', '*/*'), ('sec-fetch-site', 'same-origin'), ('sec-fetch-mode', 'cors'), ('sec-fetch-dest', 'empty'), ('referer', 'https://duckduckgo.com/'), ('accept-encoding', 'gzip, deflate, br'), ('accept-language', 'en-US,en;q=0.9')], body=b'')]" 221 | ] 222 | }, 223 | "execution_count": 45, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "found_requests = [r for r in saved_requests if look_for in r.url]\n", 230 | "found_requests" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 48, 236 | "id": "1bec48c3", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "last_request = found_requests[-1]\n", 241 | "response = last_request.response" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "id": "16fe94d0", 247 | "metadata": {}, 248 | "source": [ 249 | "### Decoding compressed responses\n", 250 | "The responses are stored as parameters with each request, let's look at the response to the most recent request:" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 49, 256 | "id": "c58e75b7", 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/plain": [ 262 | "Response(status_code=200, reason='', headers=[('server', 'nginx'), ('date', 'Sun, 08 Oct 2023 19:00:13 GMT'), ('content-type', 'application/javascript; charset=UTF-8'), ('vary', 'Accept-Encoding'), ('strict-transport-security', 'max-age=31536000'), ('permissions-policy', 'interest-cohort=()'), ('content-security-policy', \"default-src 'none' ; connect-src https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; manifest-src https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; media-src https://duckduckgo.com https://*.duckduckgo.com 
https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; script-src blob: https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ 'unsafe-inline' 'unsafe-eval' ; font-src data: https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; img-src data: https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; style-src https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ 'unsafe-inline' ; object-src 'none' ; worker-src blob: ; child-src blob: https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; frame-src blob: https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; form-action https://duckduckgo.com https://*.duckduckgo.com https://duckduckgogg42xjoc72x3sjasowoarfbgcmvfimaftt6twagswzczad.onion/ ; frame-ancestors 'self' ; base-uri 'self' ; block-all-mixed-content ;\"), ('x-frame-options', 'SAMEORIGIN'), ('x-xss-protection', '1;mode=block'), ('x-content-type-options', 'nosniff'), ('referrer-policy', 'origin'), ('expect-ct', 'max-age=0'), ('expires', 'Sun, 08 Oct 2023 19:00:12 GMT'), ('cache-control', 'no-cache'), ('content-disposition', 'attachment; filename=\"ac.json\"'), ('x-duckduckgo-privacy-random-ac', 'KKaGpdmkLv5hsArQVxfyla6iy2x7phPdgzFF9y4OEmDQM22jXjpdi63UcafVemongnWfvd7t4M2UHSBfTcTO86FFoVWn5ohN9oS5jB55XQanjiumgc7doNlTKogtb2xuT5RRetED8JP77fOeDytjtWIziK7YreXbqqyctWE5ev63eplZk48OFVBLquPkW4uXD5Y6BNdVH6nVd30zD35NIeU1Lslr5RpgyT6ePASh2uJiH2TDWyqG4OrxMMp8zMmIa3zieb5yeMspNakS1HCgNbcoXfXLWaEYPCJghRLJtQAJXvmDPMLYrwMUbG5XkhCchd8S'), ('content-encoding', 'br')], 
body=b'\\x15B\\x01\\x00\\xc4\\xca\\xb9\\x94\\xdd\\x89\\xaeP\\xaf,_\\x93\\x0bPa\\xf3@np\\xc0\\x1e\\xf8\\xf2\\xb8\\x8d\\xe3\\x9c\\xe8d;\\x0e8\\xd8\\xa0\\x1b{h\\xb6!zn\\xf6\\xb5\\xe3\\xd7\\xacq\\x17\\xaa9\\x98g|Cv\\xc5.o\\xcc\\xd2\\t.\\xd0\\xd7\\xaa\\x9d\\xe9\\xa1\\xe6?\\xe1m\\r\\x01\\x85%A\\xc8\\xc0\\xfa\\x1b\\xd0\\x84\\x03cq\\xf2\\xae3\\xac\\x0c\\x82VvuX<\\xec\\xdd\\xde\\xdb\\xac;\\xe5A\\x08;\\x84x\\xbc\\xc3\\x8d\\x91[\\xeaa!n\\x86\\xe0\\xb4\\xbd\\x11\\xf3\\x11\\xf0\\xb9\\xd4\\xaa\\xbd\\x99\\xa5]\\x83]\\x88]\\x9d\\xf9\\x9f\\x01')" 263 | ] 264 | }, 265 | "execution_count": 49, 266 | "metadata": {}, 267 | "output_type": "execute_result" 268 | } 269 | ], 270 | "source": [ 271 | "response" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "id": "cedd41a0", 277 | "metadata": {}, 278 | "source": [ 279 | "We were expecting a list of suggestions but got this instead:" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 50, 285 | "id": "32ae7c86", 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "data": { 290 | "text/plain": [ 291 | "b'\\x15B\\x01\\x00\\xc4\\xca\\xb9\\x94\\xdd\\x89\\xaeP\\xaf,_\\x93\\x0bPa\\xf3@np\\xc0\\x1e\\xf8\\xf2\\xb8\\x8d\\xe3\\x9c\\xe8d;\\x0e8\\xd8\\xa0\\x1b{h\\xb6!zn\\xf6\\xb5\\xe3\\xd7\\xacq\\x17\\xaa9\\x98g|Cv\\xc5.o\\xcc\\xd2\\t.\\xd0\\xd7\\xaa\\x9d\\xe9\\xa1\\xe6?\\xe1m\\r\\x01\\x85%A\\xc8\\xc0\\xfa\\x1b\\xd0\\x84\\x03cq\\xf2\\xae3\\xac\\x0c\\x82VvuX<\\xec\\xdd\\xde\\xdb\\xac;\\xe5A\\x08;\\x84x\\xbc\\xc3\\x8d\\x91[\\xeaa!n\\x86\\xe0\\xb4\\xbd\\x11\\xf3\\x11\\xf0\\xb9\\xd4\\xaa\\xbd\\x99\\xa5]\\x83]\\x88]\\x9d\\xf9\\x9f\\x01'" 292 | ] 293 | }, 294 | "execution_count": 50, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "response.body" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "fe26d312", 306 | "metadata": {}, 307 | "source": [ 308 | "The response body looks like this because it's compressed. 
To actually read it, we will have to decompress it first.\n", 309 | "\n", 310 | "We'll use brotli to `decode` the compressed response." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 51, 316 | "id": "5acd6efd", 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/plain": [ 322 | "'[{\"phrase\":\"why are ray bans so expensive\"},{\"phrase\":\"why are eggs so expensive\"},{\"phrase\":\"why are flags at half mast today\"},{\"phrase\":\"why are cats scared of cucumbers\"},{\"phrase\":\"why are gas prices rising\"},{\"phrase\":\"why are flamingos pink\"},{\"phrase\":\"why are my feet swollen\"},{\"phrase\":\"why are firetrucks red\"}]'" 323 | ] 324 | }, 325 | "execution_count": 51, 326 | "metadata": {}, 327 | "output_type": "execute_result" 328 | } 329 | ], 330 | "source": [ 331 | "import brotli\n", 332 | "\n", 333 | "decompressed = brotli.decompress(response.body).decode('utf-8')\n", 334 | "decompressed" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "id": "59ba9383", 340 | "metadata": {}, 341 | "source": [ 342 | "That's much more like it! 
We can turn it into a python object using the `json` package:" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 52, 348 | "id": "11a92739", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "[{'phrase': 'why are ray bans so expensive'},\n", 355 | " {'phrase': 'why are eggs so expensive'},\n", 356 | " {'phrase': 'why are flags at half mast today'},\n", 357 | " {'phrase': 'why are cats scared of cucumbers'},\n", 358 | " {'phrase': 'why are gas prices rising'},\n", 359 | " {'phrase': 'why are flamingos pink'},\n", 360 | " {'phrase': 'why are my feet swollen'},\n", 361 | " {'phrase': 'why are firetrucks red'}]" 362 | ] 363 | }, 364 | "execution_count": 52, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "import json\n", 371 | "resp_json = json.loads(decompressed)\n", 372 | "resp_json" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "id": "4edee539", 378 | "metadata": {}, 379 | "source": [ 380 | "### Replaying requests\n", 381 | "\n", 382 | "We can also re-play any of the intercepted requests using the `requests` package. 
Let's stick to the most recent one:\n", 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": 100, 388 | "id": "0d6c2396", 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "data": { 393 | "text/plain": [ 394 | "[{'phrase': 'why are flags at half mast today'},\n", 395 | " {'phrase': 'why are cats afraid of cucumber'},\n", 396 | " {'phrase': 'why are eggs so expensive'},\n", 397 | " {'phrase': 'why are gas prices rising'},\n", 398 | " {'phrase': 'why are flamingos pink'},\n", 399 | " {'phrase': 'why are you interested in this position'},\n", 400 | " {'phrase': 'why are my balls so itchy'},\n", 401 | " {'phrase': 'why are you running'}]" 402 | ] 403 | }, 404 | "execution_count": 100, 405 | "metadata": {}, 406 | "output_type": "execute_result" 407 | } 408 | ], 409 | "source": [ 410 | "import requests\n", 411 | "\n", 412 | "request = driver.last_request\n", 413 | "response = requests.get(request)\n", 414 | "response.json()" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "id": "9c44e6db", 420 | "metadata": {}, 421 | "source": [ 422 | "We can also modify any of the parameters of the Request before sending it off. Let's change the query from \"why are\" to \"why is\". 
The query is expressed as a parameter `q` in the URL:\n", 423 | "\n", 424 | "[https://duckduckgo.com/ac/?q=why+are+&kl=wt-wt](https://duckduckgo.com/ac/?q=why+are+&kl=wt-wt)\n", 425 | "\n", 426 | "for our convenience we can modify the URL parameters by changing the `request.params` dictionary:" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 102, 432 | "id": "bb03eb5a", 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "{'q': 'why are', 'kl': 'wt-wt'}" 439 | ] 440 | }, 441 | "execution_count": 102, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "params = request.params\n", 448 | "params" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 103, 454 | "id": "43853f2f", 455 | "metadata": {}, 456 | "outputs": [ 457 | { 458 | "data": { 459 | "text/plain": [ 460 | "'https://duckduckgo.com/ac/?q=why+is&kl=wt-wt'" 461 | ] 462 | }, 463 | "execution_count": 103, 464 | "metadata": {}, 465 | "output_type": "execute_result" 466 | } 467 | ], 468 | "source": [ 469 | "params['q'] = 'why is'\n", 470 | "request.params = params\n", 471 | "request.url" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 104, 477 | "id": "de8b6e63", 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "[{'phrase': 'why is the sky blue'},\n", 484 | " {'phrase': 'why is my poop green'},\n", 485 | " {'phrase': 'why is the sun red'},\n", 486 | " {'phrase': 'why is the air quality bad today'},\n", 487 | " {'phrase': 'why is ronaldo benched'},\n", 488 | " {'phrase': 'why is it important'},\n", 489 | " {'phrase': 'why is roblox down'},\n", 490 | " {'phrase': 'why is gail off cbs'}]" 491 | ] 492 | }, 493 | "execution_count": 104, 494 | "metadata": {}, 495 | "output_type": "execute_result" 496 | } 497 | ], 498 | "source": [ 499 | "#request.url = request.url.replace('why+are','why+is')\n", 500 | 
"response = requests.get(request)\n", 501 | "response.json()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "id": "8ef60e27", 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [] 511 | } 512 | ], 513 | "metadata": { 514 | "kernelspec": { 515 | "display_name": "Python 3", 516 | "language": "python", 517 | "name": "python3" 518 | }, 519 | "language_info": { 520 | "codemirror_mode": { 521 | "name": "ipython", 522 | "version": 3 523 | }, 524 | "file_extension": ".py", 525 | "mimetype": "text/x-python", 526 | "name": "python", 527 | "nbconvert_exporter": "python", 528 | "pygments_lexer": "ipython3", 529 | "version": "3.7.3" 530 | } 531 | }, 532 | "nbformat": 4, 533 | "nbformat_minor": 5 534 | } 535 | -------------------------------------------------------------------------------- /start.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "44df20e3", 6 | "metadata": {}, 7 | "source": [ 8 | "---\n", 9 | "title: \"Planning Investigations\"\n", 10 | "pagetitle: \"Planning Investigations\"\n", 11 | "description-meta: \"Start here\"\n", 12 | "description-title: \"Start here\"\n", 13 | "author: \"Leon Yin\"\n", 14 | "author-meta: \"Leon Yin\"\n", 15 | "date: \"08-13-2023\"\n", 16 | "date-modified: \"01-04-2025\"\n", 17 | "bibliography: references.bib\n", 18 | "execute: \n", 19 | " enabled: false\n", 20 | "keywords: pitching stories, experiment planning\n", 21 | "twitter-card:\n", 22 | " title: \"Planning Data Investigations\"\n", 23 | " description: \"How to keep tabs on a complicated data experiment\"\n", 24 | " image: assets/inspect-element-logo.jpg\n", 25 | "open-graph:\n", 26 | " title: \"Planning Data Investigations\"\n", 27 | " description: \"How to keep tabs on a complicated data experiment\"\n", 28 | " locale: us_EN\n", 29 | " site-name: Inspect Element\n", 30 | " image: assets/inspect-element-logo.jpg\n", 31 | "href: 
start\n", 32 | "---" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "id": "794a553c", 38 | "metadata": {}, 39 | "source": [ 40 | "Investigations can get complicated and you might not know where to start.\n", 41 | "\n", 42 | "In this section we'll introduce you to a planning checklist that we use for our own investigations.\n", 43 | "\n", 44 | "The checklist will help you choose an accountability angle, identify tangible harms, and form a testable hypothesis. \n", 45 | "\n", 46 | "Most importantly, the checklist covers questions you'll need to answer in order to develop a defensible methodology and a bullet-proof story. \n", 47 | "\n", 48 | "Your readers and future self will thank you for being forthright about your investigation's vulnerabilities. Moreover, you'll get a sense of the experiment's feasibility before you invest too much time into it.\n", 49 | "\n", 50 | "Although grim, the checklist can help expedite the decision to kill a story. Killing a story is a difficult process-- there's even a podcast on the topic called _[Killed Stories](https://killedstories.com/)_, but it's better to catch fatal flaws and irreconcilable uncertainties early. Trust that doing this efficiently is a gift to yourself and your colleagues. \n", 51 | "\n", 52 | "The questions in the checklist cover fundamental topics we'll discuss throughout the practitioner's guide such as:\n", 53 | "\n", 54 | "1. [Data collection](/build-your-own-datasets.html)
\n", 55 | "2. Quick viability tests
\n", 56 | "3. Classification
\n", 57 | "4. Limitations
\n", 58 | "5. Communicating findings for a general audience
\n", 59 | "\n", 60 | "You won't need to fill out a checklist for every story, but it is **super helpful for projects with original data collection and/or analysis**.\n", 61 | "\n", 62 | "Lastly, view the checklist is a starting point. Add and edit questions with your team to assure you can publish your findings with certainty.\n", 63 | "\n", 64 | "Good luck." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "id": "5d58a843", 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.7.3" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 5 97 | } 98 | -------------------------------------------------------------------------------- /styles.css: -------------------------------------------------------------------------------- 1 | /* css styles */ -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from IPython.display import display, Markdown 2 | 3 | 4 | def build_buttons(link, github, colab = True, citation=False): 5 | """ 6 | Used to generate github, collab, and website-linked buttons 7 | """ 8 | if citation: 9 | citation_str = f'🏛 Citation' 10 | else: 11 | citation_str = '' 12 | 13 | if colab: 14 | colab_str = f'\n🖥️ Interactive version' 15 | else: 16 | colab_str = '' 17 | display(Markdown(f""" 18 | 📖 Read online{colab_str} 19 | ⚙️ GitHub 20 | {citation_str} 21 |
22 | """)) 23 | --------------------------------------------------------------------------------