├── exercises
├── collecting_data
│ ├── exercise.js
│ ├── problem.md
│ └── solution
│ │ └── solution.js
├── menu.json
├── outputting_csv
│ ├── exercise.js
│ ├── problem.md
│ └── solution
│ │ └── solution.js
├── parsing_html
│ ├── exercise.js
│ ├── problem.md
│ └── solution
│ │ └── solution.js
├── requesting_pages
│ ├── exercise.js
│ ├── problem.md
│ └── solution
│ │ └── solution.js
└── traversing_dom
│ ├── exercise.js
│ ├── problem.md
│ └── solution
│ └── solution.js
├── index.js
├── nutella.png
├── package.json
└── readme.md
/exercises/collecting_data/exercise.js:
--------------------------------------------------------------------------------
// Standard workshopper exercise wiring: verify the submission file exists,
// run the submission and the reference solution side by side, then pass
// the exercise when their stdout streams match.
var filecheck = require('workshopper-exercise/filecheck')
var execute = require('workshopper-exercise/execute')
var comparestdout = require('workshopper-exercise/comparestdout')

var exercise = require('workshopper-exercise')()

// fail fast when the learner's submission file is missing
exercise = filecheck(exercise)

// spawn() the solution and submission in parallel
exercise = execute(exercise)

// diff the two stdout streams to decide pass/fail
exercise = comparestdout(exercise)

module.exports = exercise
16 |
--------------------------------------------------------------------------------
/exercises/collecting_data/problem.md:
--------------------------------------------------------------------------------
1 | Now, we will go through all of the links on the page and create a data table (csv file) out of it.
2 |
3 | You might want to get a div of interest first, and then `find` items inside of it.
4 |
5 | For example, let's get the `a` tag and the `date` in this html:
6 |
```html
<div class="title">
  <a href="http://nodeschool.io">Workshoppers</a>
  <span class="date">Date: February 16, 2012</span>
</div>
```
15 |
16 | Get the div:
17 | ```js
18 | var div = $('div.title')
19 | ```
20 |
21 | Use `find` to search the div (more on `find` here: https://github.com/cheeriojs/cheerio#traversing)
22 |
23 | ```js
24 | var a = div.find('a')
25 | var date = div.find('.date')
26 | ```
27 |
28 | Now create the row:
29 | ```js
30 | var row = {
31 | href: a.attr('href'),
32 | date: date.text()
33 | }
34 |
35 | console.log(row)
36 | ```
37 |
38 | You'll need to use this technique in this exercise.
39 |
40 | # Exercise
41 |
42 | Let's get the top links on the "science" subreddit from February 16, 2012:
43 |
44 | Go to `http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/` and look at the page source by right-clicking a link and clicking `Inspect Element`
45 |
46 | Parse the html and create a row for each `a` link on the page. You'll need to use `map` and `console.log`.
47 |
Each row should include 3 fields: the visible `score`, the link's `href`, and `content` (text contents of the `a` tag).
49 |
50 | Example:
51 | ```
52 | {"score": "15", "href": "/web/20120216223019/http://www.bbc.co.uk/news/science-environment-17015559", "content": "\'New frontier\' of Antarctic lake exploration - What\'s behind the drive to explore Antarctica\'s lakes?bbc.co.ukDrJulianBashircommentsharecancel"}
53 | {"score": "3", "href": "/web/20120216223019/http://www.reddit.com/other-link-here", "content": "Some other link"}
54 | ```
55 |
56 |
57 |
58 |
59 |
60 |
61 |
--------------------------------------------------------------------------------
/exercises/collecting_data/solution/solution.js:
--------------------------------------------------------------------------------
var cheerio = require('cheerio')
var got = require('got')

// Archived snapshot of reddit.com/r/science from 2012-02-16.
var URL = 'http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/'

// Fetch the page and print one row object per reddit ".link" entry,
// containing its visible score, the link href and the link text.
got(URL, function (err, html) {
  // surface network failures instead of handing `undefined` to cheerio
  if (err) throw err
  // declare with `var`: the original leaked `$` as an implicit global
  var $ = cheerio.load(html)
  // `each` (not `map`): we only want the console.log side effect,
  // the return value is unused
  $('.link').each(function (i, el) {
    el = $(el)
    var score = el.find('.score.unvoted')
    var a = el.find('a')
    var row = {
      score: score.text(),
      href: a.attr('href'),
      content: a.text()
    }
    console.log(row)
  })
})
20 |
--------------------------------------------------------------------------------
/exercises/menu.json:
--------------------------------------------------------------------------------
1 | [
2 | "Requesting Pages",
3 | "Parsing HTML",
4 | "Traversing Dom",
5 | "Collecting Data",
6 | "Outputting CSV"
7 | ]
8 |
--------------------------------------------------------------------------------
/exercises/outputting_csv/exercise.js:
--------------------------------------------------------------------------------
// Standard workshopper exercise wiring: verify the submission file exists,
// run the submission and the reference solution side by side, then pass
// the exercise when their stdout streams match.
var filecheck = require('workshopper-exercise/filecheck')
var execute = require('workshopper-exercise/execute')
var comparestdout = require('workshopper-exercise/comparestdout')

var exercise = require('workshopper-exercise')()

// fail fast when the learner's submission file is missing
exercise = filecheck(exercise)

// spawn() the solution and submission in parallel
exercise = execute(exercise)

// diff the two stdout streams to decide pass/fail
exercise = comparestdout(exercise)

module.exports = exercise
16 |
--------------------------------------------------------------------------------
/exercises/outputting_csv/problem.md:
--------------------------------------------------------------------------------
1 | Say we wanted to put the table into a csv file. To output csv, we can use a format stream:
2 |
3 | ```js
4 | var writer = require('format-data')('csv')
5 | ```
6 | `writer` is now ready to accept rows to write.
7 |
8 | You can now `write` each row, like so:
9 |
10 | ```
11 | writer.write({ header1: value, header2: value })
12 | ```
13 |
14 | And then, you can say where the data should go. In this case, we're writing to `process.stdout`:
15 | ```
16 | writer.pipe(process.stdout)
17 | ```
18 |
19 | You could also write to a file, like this:
20 |
21 | ```
22 | var fs = require('fs')
23 | var file = fs.createWriteStream('output.csv')
24 | writer.pipe(file)
25 | ```
26 |
27 | For this exercise, use `process.stdout` for testing.
28 |
29 | # Exercise
30 |
31 | Transform your file from the last exercise so that instead of `console.log`, it uses a `writer.write`.
32 |
--------------------------------------------------------------------------------
/exercises/outputting_csv/solution/solution.js:
--------------------------------------------------------------------------------
var cheerio = require('cheerio')
var got = require('got')
var writer = require('format-data')('csv')

// Archived snapshot of reddit.com/r/science from 2012-02-16.
var URL = 'http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/'

// Wire the CSV stream to stdout before any rows are written so
// nothing can be emitted with no destination attached.
writer.pipe(process.stdout)

// Fetch the page and write one CSV row per reddit ".link" entry.
got(URL, function (err, html) {
  // surface network failures instead of handing `undefined` to cheerio
  if (err) throw err
  // declare with `var`: the original leaked `$` as an implicit global
  var $ = cheerio.load(html)
  $('.link').each(function (i, el) {
    el = $(el)
    var score = el.find('.score.unvoted')
    var a = el.find('a')
    var row = {
      score: score.text(),
      href: a.attr('href'),
      content: a.text()
    }
    writer.write(row)
  })
  // signal end-of-input so the CSV stream flushes and finishes;
  // the original never ended the writer
  writer.end()
})
--------------------------------------------------------------------------------
/exercises/parsing_html/exercise.js:
--------------------------------------------------------------------------------
// Standard workshopper exercise wiring: verify the submission file exists,
// run the submission and the reference solution side by side, then pass
// the exercise when their stdout streams match.
var filecheck = require('workshopper-exercise/filecheck')
var execute = require('workshopper-exercise/execute')
var comparestdout = require('workshopper-exercise/comparestdout')

var exercise = require('workshopper-exercise')()

// fail fast when the learner's submission file is missing
exercise = filecheck(exercise)

// spawn() the solution and submission in parallel
exercise = execute(exercise)

// diff the two stdout streams to decide pass/fail
exercise = comparestdout(exercise)

module.exports = exercise
--------------------------------------------------------------------------------
/exercises/parsing_html/problem.md:
--------------------------------------------------------------------------------
# Parsing HTML!
2 |
3 | `cheerio` is a library that allows you to use a jQuery-like syntax right here in your terminal. If you know anything about jQuery this one should be easy for you.
4 |
5 | # Overview
6 | Let's start with a simple example. Say we want to access the 'h1' tag in the following html:
7 |
```html
<html>
  <body>
    <h1>There is no cow level.</h1>
  </body>
</html>
```
15 |
16 | We can use `cheerio` to prepare the html like this:
17 |
```js
var cheerio = require('cheerio')
var html = '<html><body><h1>There is no cow level.</h1></body></html>'
var $ = cheerio.load(html)
```
23 |
24 | We can use the new `$` variable to query the html -- to get the `h1` tag, you can use `$('h1')`.
25 |
26 | There are a variety of functions you can use with the object you get (See `https://www.npmjs.com/package/cheerio`):
27 |
28 | Example:
29 |
* `$('h1').text()` will return `There is no cow level.`
* `$('body').html()` will return `<h1>There is no cow level.</h1>`
32 |
33 |
34 |
35 | # Exercise
36 |
37 | Go to the following link in your browser:
38 |
39 | `http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/`
40 |
41 | Take your file from the last tutorial. Use `got`, `cheerio`, and `console.log` to print out the **readable text** of the website using the `text()` function.
42 |
43 | **hint: all of the content is in the `body` tag**
44 |
--------------------------------------------------------------------------------
/exercises/parsing_html/solution/solution.js:
--------------------------------------------------------------------------------
var got = require('got')
var cheerio = require('cheerio')

// Archived snapshot of reddit.com/r/science from 2012-02-16.
var URL = 'http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/'

// Fetch the page and print the readable text of its <body>.
got(URL, function (err, html) {
  // surface network failures instead of handing `undefined` to cheerio
  if (err) throw err
  var $ = cheerio.load(html)
  var content = $('body')
  console.log(content.text())
})
--------------------------------------------------------------------------------
/exercises/requesting_pages/exercise.js:
--------------------------------------------------------------------------------
// Standard workshopper exercise wiring: verify the submission file exists,
// run the submission and the reference solution side by side, then pass
// the exercise when their stdout streams match.
var filecheck = require('workshopper-exercise/filecheck')
var execute = require('workshopper-exercise/execute')
var comparestdout = require('workshopper-exercise/comparestdout')

var exercise = require('workshopper-exercise')()

// fail fast when the learner's submission file is missing
exercise = filecheck(exercise)

// spawn() the solution and submission in parallel
exercise = execute(exercise)

// diff the two stdout streams to decide pass/fail
exercise = comparestdout(exercise)

module.exports = exercise
16 |
--------------------------------------------------------------------------------
/exercises/requesting_pages/problem.md:
--------------------------------------------------------------------------------
1 | # The project
2 |
3 | First, we need to know how to grab the contents of the webpage as html. Create a file called 'index.js'. You can use the `got` module to easily retrieve the contents of the webpage. You'll want to `require` the `got` module, like so.
4 |
5 | ```js
6 | var got = require('got')
7 | ```
8 |
`got` will go get the webpage and, when it's done, call your callback function with two arguments: `err`, an error object you can inspect (it is `null` on success), and `html`, the page contents.
10 |
11 | Here is an example that prints the html from webpage `http://nodeschool.io`.
12 |
13 | ```js
14 | var got = require('got')
15 |
16 | got('http://nodeschool.io', function (err, html) {
17 | console.log(html)
18 | })
19 | ```
20 |
21 | # Exercise
22 |
23 | Let's look at reddit.com's science subreddit in February, 2012.
24 |
25 | `http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/`
26 |
27 | Use `got`, and `console.log` to print out the contents of the page.
28 |
29 |
--------------------------------------------------------------------------------
/exercises/requesting_pages/solution/solution.js:
--------------------------------------------------------------------------------
var got = require('got')

// Archived snapshot of reddit.com/r/science from 2012-02-16.
var URL = 'http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/'

// Fetch the page and print the raw HTML to stdout.
got(URL, function (err, html) {
  // surface network failures instead of printing `undefined`
  if (err) throw err
  console.log(html)
})
--------------------------------------------------------------------------------
/exercises/traversing_dom/exercise.js:
--------------------------------------------------------------------------------
// Standard workshopper exercise wiring: verify the submission file exists,
// run the submission and the reference solution side by side, then pass
// the exercise when their stdout streams match.
var filecheck = require('workshopper-exercise/filecheck')
var execute = require('workshopper-exercise/execute')
var comparestdout = require('workshopper-exercise/comparestdout')

var exercise = require('workshopper-exercise')()

// fail fast when the learner's submission file is missing
exercise = filecheck(exercise)

// spawn() the solution and submission in parallel
exercise = execute(exercise)

// diff the two stdout streams to decide pass/fail
exercise = comparestdout(exercise)

module.exports = exercise
16 |
--------------------------------------------------------------------------------
/exercises/traversing_dom/problem.md:
--------------------------------------------------------------------------------
1 | # Loops
2 |
3 | Okay, now we want to get all of the links in the page. A link looks like this:
4 |
5 | ```html
<a href="http://nodeschool.io">Click me!</a>
7 | ```
8 |
If we want to select an `a` tag and get its contents, we can do
10 |
11 | ```js
12 | $('a').text()
13 | ```
14 |
15 | This might select multiple tags, though, so we might need to go through each of them if you want to do something with them. Here's one way to go through each item:
16 |
17 | ```js
18 | $('a').map(function (i, el) {
19 | // you can use either 'el' or 'this'
20 | $(this).text()
21 | })
22 | ```
23 |
24 | ## Exercise
25 |
26 | Get all of the `a` elements on the page and `console.log` the `href` attribute. You can grab their `href` attribute using `attr`.
27 |
28 | More info here: `https://github.com/cheeriojs/cheerio#attributes`
29 |
30 |
31 |
--------------------------------------------------------------------------------
/exercises/traversing_dom/solution/solution.js:
--------------------------------------------------------------------------------
var got = require('got')
var cheerio = require('cheerio')

// Archived snapshot of reddit.com/r/science from 2012-02-16.
var URL = 'http://web.archive.org/web/20120216223019/http://www.reddit.com/r/science/'

// Fetch the page and print the href attribute of every <a> element.
got(URL, function (err, html) {
  // surface network failures instead of handing `undefined` to cheerio
  if (err) throw err
  var $ = cheerio.load(html)
  // `each` (not `map`): only the console.log side effect is wanted
  $('a').each(function (i, el) {
    console.log($(el).attr('href'))
  })
})
12 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
#!/usr/bin/env node

// Entry point: boot the workshopper menu for the nutella-scrape tutorial.
var workshopper = require('workshopper')
var path = require('path')

// Resolve a path relative to this file, independent of the cwd.
function fpath (f) {
  return path.join(__dirname, f)
}

workshopper({
  name: 'nutella-scrape',
  title: 'Nutella Scraper',
  subtitle: 'Learn how to scrape webpages with Node.js',
  appDir: __dirname,
  menuItems: [],
  exerciseDir: fpath('./exercises/')
})
--------------------------------------------------------------------------------
/nutella.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/okdistribute/nutella-scrape/f66b938309ef246d89598f8fa4004bf5cafa3bc1/nutella.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "nutella-scrape",
3 | "version": "1.1.1",
4 | "description": "a nodeschool workshop to teach scraping",
5 | "main": "index.js",
6 | "scripts": {
7 | "test": "echo \"Error: no test specified\" && exit 1"
8 | },
9 | "repository": {
10 | "type": "git",
11 | "url": "https://github.com/karissa/nutella-scrape.git"
12 | },
13 | "bin": "./index.js",
14 | "keywords": [
15 | "nodeschool",
16 | "scraping",
17 | "tutorial",
18 | "school",
19 | "exercise",
20 | "help",
21 | "scrape"
22 | ],
23 | "author": "Karissa McKelvey (http://karissamck.com/)",
24 | "license": "BSD-2-Clause",
25 | "bugs": {
26 | "url": "https://github.com/karissa/nutella-scrape/issues"
27 | },
28 | "preferGlobal": true,
29 | "homepage": "https://github.com/karissa/nutella-scrape",
30 | "dependencies": {
31 | "cheerio": "^0.19.0",
32 | "format-data": "^2.1.2",
33 | "got": "^4.1.1",
34 | "workshopper": "^2.7.0",
35 | "workshopper-exercise": "^2.4.0"
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | # nutella-scrape
2 |
[![NPM](https://nodei.co/npm/nutella-scrape.png?downloads=true)](https://nodei.co/npm/nutella-scrape/)
4 |
![nutella](nutella.png)
6 |
7 | 1. Run `sudo npm install nutella-scrape -g`
8 | 2. Run `nutella-scrape`
9 | 3. ???
10 | 4. LEARN!!
11 |
12 | In this tutorial, we will work through how to scrape websites using Node.js for the primary purpose of using it in other programs -- in servers, frontends (yes, Node works in the browser!), or just writing a table to disk for analysis elsewhere.
13 |
14 | The DOM (Document Object Model) is an abstract concept describing how we can interact with HTML. JavaScript is GREAT for traversing HTML (i.e., the DOM) because it was made to work with HTML in the first place.
15 |
16 | ## TODO
17 |
18 | * parallel
19 | * spoofing
20 | * cookies/login walls
21 | * electron-microscope
--------------------------------------------------------------------------------