├── .gitignore ├── .npmignore ├── test.js ├── example.js ├── package.json ├── LICENSE.md ├── README.md └── index.js /.gitignore: -------------------------------------------------------------------------------- 1 | bower_components 2 | node_modules 3 | *.log 4 | .DS_Store 5 | bundle.js 6 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | bower_components 2 | node_modules 3 | *.log 4 | .DS_Store 5 | bundle.js 6 | example.js 7 | test 8 | test.js 9 | demo/ 10 | .npmignore 11 | LICENSE.md -------------------------------------------------------------------------------- /test.js: -------------------------------------------------------------------------------- 1 | var awwwardsStream = require('./'); 2 | var test = require('tape'); 3 | 4 | test('scrape Awwwards data', function (t) { 5 | t.plan(12); // 12 entries per page for now :) 6 | awwwardsStream({ 7 | pages: 1, 8 | type: 'sotd' 9 | }).on('data', site => { 10 | t.equal(typeof site.url, 'string'); 11 | }); 12 | }); 13 | -------------------------------------------------------------------------------- /example.js: -------------------------------------------------------------------------------- 1 | var awwwards = require('./'); 2 | var padLeft = require('pad-left'); 3 | 4 | var sites = []; 5 | 6 | awwwards({ 7 | type: 'sotd', // Site of the Day 8 | pages: 50, 9 | rate: 50 10 | }) 11 | .on('data', site => { 12 | sites.push(site); 13 | }) 14 | .on('end', frequencies); 15 | 16 | function frequencies () { 17 | console.log('Total sites: %d\n', sites.length); 18 | 19 | var freqs = sites 20 | .map(site => site.author.country) 21 | .reduce((dict, key) => { 22 | if (key in dict) dict[key]++; 23 | else dict[key] = 1; 24 | return dict; 25 | }, {}); 26 | 27 | var result = Object.keys(freqs).map(k => { 28 | return { key: k, frequency: freqs[k] }; 29 | }); 30 | 31 | result.sort((a, b) => b.frequency - 
a.frequency); 32 | 33 | var digits = String(result.length).length; 34 | result.slice(0, 10).forEach((d, i) => { 35 | var num = padLeft((1 + i), digits, ' '); 36 | console.log('%s. %s (%s)', num, d.key, d.frequency); 37 | }); 38 | } 39 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "awwwards-stream", 3 | "version": "1.0.0", 4 | "description": "scrape Awwwards data", 5 | "main": "index.js", 6 | "license": "MIT", 7 | "author": { 8 | "name": "Matt DesLauriers", 9 | "email": "dave.des@gmail.com", 10 | "url": "https://github.com/mattdesl" 11 | }, 12 | "dependencies": { 13 | "cheerio": "^0.20.0", 14 | "defined": "^1.0.0", 15 | "from2": "^2.1.1", 16 | "got": "^6.1.1", 17 | "moment": "^2.11.2", 18 | "url-join": "0.0.1" 19 | }, 20 | "devDependencies": { 21 | "faucet": "0.0.1", 22 | "pad-left": "^2.0.1", 23 | "tape": "^4.4.0" 24 | }, 25 | "scripts": { 26 | "test": "node test.js | faucet" 27 | }, 28 | "keywords": [ 29 | "scrape", 30 | "awwwards", 31 | "data", 32 | "url", 33 | "urls", 34 | "site", 35 | "sites" 36 | ], 37 | "repository": { 38 | "type": "git", 39 | "url": "git://github.com/Jam3/awwwards-stream.git" 40 | }, 41 | "homepage": "https://github.com/Jam3/awwwards-stream", 42 | "bugs": { 43 | "url": "https://github.com/Jam3/awwwards-stream/issues" 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Jam3 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of 
the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 18 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 20 | OR OTHER DEALINGS IN THE SOFTWARE. 21 | 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # awwwards-stream 2 | 3 | [![experimental](http://badges.github.io/stability-badges/dist/experimental.svg)](http://github.com/badges/stability-badges) 4 | 5 | Creates a readable stream of [Awwwards.com data](http://awwwards.com/) by scraping their HTML. 6 | 7 | > :warning: This is fragile and should only be used for offline experimentation / artistic purposes. It is not an official API and you should rate limit your requests to keep stress off the Awwwards servers. It may break at any point and should not be used in a live Node.js server. 
8 | 9 | ## Install 10 | 11 | ```sh 12 | npm i awwwards-stream --save 13 | ``` 14 | 15 | ## Example 16 | 17 | ```js 18 | var awwwards = require('awwwards-stream'); 19 | 20 | awwwards({ 21 | type: 'sotd', // Site of the Day 22 | pages: 2, 23 | startPage: 0 24 | }) 25 | .on('data', site => { 26 | console.log('%s: %s', site.title, site.url) 27 | }) 28 | .on('end', () => { 29 | console.log('Finished') 30 | }); 31 | ``` 32 | 33 | Results (from Feb 9, 2016) – 34 | 35 | ``` 36 | Centurion-Magazine: http://centurion-magazine.com 37 | Active Theory v3: http://activetheory.net 38 | A Short Journey: http://www.ashortjourney.com 39 | Tolia - The Longest Short Films: http://tolia.ge/ 40 | Make Me Pulse 2016 Wishes: http://2016.makemepulse.com 41 | Jérémie Battaglia: http://jeremiebattaglia.com/ 42 | Do You Still Believe?: http://www.doyoustillbelieve.com/ 43 | Red Collar: http://redcollar.digital 44 | Publicis90: http://www.publicis90.com 45 | A State of War: http://astateofwar.org.au 46 | Doudou Blues: http://www.doudoublues.com/ 47 | The Grey Tales: http://thegreytales.net 48 | Finished 49 | ``` 50 | 51 | See [./example.js](./example.js) for a more practical example, finding the countries that are publishing the last 600 Sites of the Day: 52 | 53 | ``` 54 | Total sites: 600 55 | 56 | 1. U.S.A. (131) 57 | 2. France (129) 58 | 3. United Kingdom (48) 59 | 4. Netherlands (48) 60 | 5. Denmark (24) 61 | 6. Canada (21) 62 | 7. Italy (21) 63 | 8. Germany (17) 64 | 9. Belgium (17) 65 | 10. Russia (12) 66 | 67 | ``` 68 | 69 | ## Usage 70 | 71 | [![NPM](https://nodei.co/npm/awwwards-stream.png)](https://www.npmjs.com/package/awwwards-stream) 72 | 73 | #### `stream = awwwardsStream(opts)` 74 | 75 | Returns a readable object stream that emits objects for each entry in the Awwwards data. 
Options: 76 | 77 | - `type` (String) can be one of: `sotd, sotm, nominees, mentions, trending` – defaults to `'sotd'` 78 | - `startPage` (Number) starting page, default 0 79 | - `pages` (Number) total number of pages to scrape, default `Infinity` 80 | - `rate` (Number) delay between successive requests, default 250 ms 81 | 82 | The objects have this form: 83 | 84 | ```js 85 | { 86 | title: String, // Title of site 87 | url: String, // URL of site 88 | thumbnail: String, // URL of thumbnail 89 | entry: String, // Site URL on Awwwards.com 90 | hearts: Number, // # of hearts 91 | author: { // Agency/User/Author 92 | name: String, // e.g. "Jam3" 93 | entry: String, // Author URL on Awwwards.com 94 | country: String // e.g. "Canada" 95 | }, 96 | rating: Number|undefined, // Rating/10 or undefined 97 | date: String, // a Date string for this entry 98 | developerAward: Boolean // If the site has this badge 99 | }; 100 | ``` 101 | 102 | ## License 103 | 104 | MIT, see [LICENSE.md](http://github.com/Jam3/awwwards-stream/blob/master/LICENSE.md) for details. 
// /index.js: entry point of awwwards-stream (the "main" file in package.json).
var got = require('got');
var format = require('util').format;
var defined = require('defined');
var cheerio = require('cheerio');
var urlJoin = require('url-join');
var moment = require('moment');
var from2 = require('from2');

var BASE_URL = 'http://www.awwwards.com/';

// Listing URL template: %s is the section path, %d the 1-based page number.
var entry = BASE_URL + '%s/?page=%d';

// Public `type` option -> path segment of the matching Awwwards listing.
var types = {
  sotd: 'awards-of-the-day',
  sotm: 'awards-of-the-month',
  nominees: 'nominees',
  mentions: 'honorable-mentions',
  trending: 'websites/trend'
};

module.exports = awwwardsStream;

/**
 * Create a readable object stream of Awwwards entries by scraping the
 * site's HTML listing pages, one page per request.
 *
 * @param {Object} [opt]
 * @param {String} [opt.type='sotd'] one of the keys of `types`
 *   (sotd, sotm, nominees, mentions, trending), matched case-insensitively
 * @param {Number} [opt.startPage=0] zero-based index of the first page
 * @param {Number} [opt.pages=Infinity] maximum number of pages to request
 * @param {Number} [opt.rate=250] delay in ms between successive requests
 * @returns {Object} from2 object stream emitting one object per entry
 * @throws {TypeError} when `opt.type` is not one of the known sections
 */
function awwwardsStream (opt) {
  opt = opt || {};

  var requestedType = opt.type || 'sotd';
  // Lower-case *before* validating so 'SOTD' is accepted, and only accept own
  // keys so inherited names ('toString', 'constructor', ...) are rejected
  // instead of producing a nonsense URL.
  var typeKey = String(requestedType).toLowerCase();
  if (!Object.prototype.hasOwnProperty.call(types, typeKey)) {
    throw new TypeError('invalid type ' + String(requestedType));
  }
  var type = types[typeKey];

  var startPage = defined(opt.startPage, 0);
  var page = startPage; // zero-based index of the next page to request
  var pages = defined(opt.pages, Infinity);
  var rate = defined(opt.rate, 250);
  var chunks = []; // entries of the current page not yet handed downstream
  var hasRequested = false; // true once the first HTTP request was scheduled

  return from2.obj(read);

  // from2 read callback: hand out one buffered entry, or fetch the next page.
  function read (size, next) {
    if (chunks.length > 0) {
      next(null, chunks.shift());
      return;
    }

    // Page budget exhausted: end right away, there is no request to throttle.
    if (page >= (startPage + pages)) {
      next(null, null);
      return;
    }

    // `rate` spaces out *successive* requests, so the first one is not delayed.
    var delay = hasRequested ? rate : 0;
    hasRequested = true;

    setTimeout(() => {
      readPage((err, data) => {
        if (err) return next(err);
        // No data, or a page without any entry, means there is nothing left.
        // Shifting from an empty array would hand `undefined` to the stream.
        if (!data || data.length === 0) return next(null, null);
        chunks = data.slice();
        next(null, chunks.shift());
      });
    }, delay);
  }

  // Request the current page and call `cb(err, entries)`; `entries` is null
  // when the site reports that there are no more results.
  function readPage (cb) {
    var url = format(entry, type, (1 + page)); // site pages are 1-based
    got(url)
      .then(resp => {
        page++;
        var $ = cheerio.load(resp.body);
        var noResults = $('div.no-results');
        return noResults.length ? null : parsePage($);
      })
      // Two-argument `then` keeps `cb` from ever being called twice: an
      // exception thrown while delivering data is not routed back into
      // `cb(err)` as if the request itself had failed.
      .then(items => cb(null, items), err => cb(err));
  }

  // Turn one loaded listing page into an array of plain entry objects.
  function parsePage ($) {
    var grid = $('ul.grid.list-item');
    var items = grid.find('li');
    return items.map((i, el) => {
      el = $(el);

      var thumb = el.find('figure.rollover.site');
      // The second link inside the thumbnail holds the site's own URL.
      var url = thumb.find('a').eq(1).attr('href');
      var thumbUrl = thumb.find('img').attr('src');
      var developerAward = Boolean(thumb.find('div.label.developer').length);

      var info = el.find('.info');
      var titleEl = info.find('h3 > a');
      var entryUrl = absoluteUrl(titleEl.attr('href'));

      // A missing or non-numeric counter is reported as 0 hearts.
      var hearts = parseInt(info.find('.add-like > .total').text(), 10) || 0;
      var rows = info.find('div.row');
      var authorEl = rows.eq(0).find('a');
      var strongs = rows.eq(0).find('strong');
      var country = strongs.eq(1).text().trim();
      var rating = parseFloat(strongs.eq(2).text().trim());
      var date = parseDate(rows.eq(1).text().trim());

      var obj = {
        title: titleEl.text().trim(),
        url: url,
        thumbnail: thumbUrl,
        entry: entryUrl,
        hearts: hearts,
        author: {
          name: authorEl.text(),
          entry: absoluteUrl(authorEl.attr('href')),
          country: country
        },
        date: date,
        developerAward: developerAward
      };

      // Some sections don't show ratings; leave `rating` unset in that case.
      if (isFinite(rating)) obj.rating = rating;
      return obj;
    }).get();
  }
}

// Resolve a site-relative href against BASE_URL; undefined when href is empty.
function absoluteUrl (url) {
  return url ? urlJoin(BASE_URL, url) : undefined;
}

// Parse a 'MMMM DD, YYYY' date label and return moment's default `format()`
// string for it.
function parseDate (date) {
  return moment(date, 'MMMM DD, YYYY').format();
}