├── index.js ├── .npmignore ├── ratatouille-demo.png ├── LICENSE ├── .gitignore ├── package.json ├── src ├── steps-parser.js ├── index.js ├── ingredients-parser.js └── phrases.js ├── README.md ├── STORY.md └── CODE_OF_CONDUCT.md /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib'); 2 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | STORY.md 2 | ratatouille-demo.png 3 | CODE_OF_CONDUCT.md 4 | -------------------------------------------------------------------------------- /ratatouille-demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/captainsafia/ratatouille/HEAD/ratatouille-demo.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Safia Abdalla 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (http://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # Typescript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | lib 60 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@captainsafia/ratatouille", 3 | "version": "1.3.0", 4 | "description": "A Node.js interface to Allrecipes.com", 5 | "main": "index.js", 6 | "scripts": { 7 | "prepublish": "npm run build", 8 | "build:lib": "babel src -d lib", 9 | "build:clean": "rimraf lib", 10 | "build": "npm run build:clean && npm run build:lib" 11 | }, 12 | "keywords": [ 13 | "allrecipes", 14 | "interface", 15 | "cooking" 16 | ], 17 | "author": "Safia Abdalla ", 18 | "repository": { 19 | "type": "git", 20 | "url": "https://github.com/captainsafia/ratatouille" 21 | }, 22 | "license": "MIT", 23 | "dependencies": { 24 | "cheerio": "^0.22.0", 25 | "moment": "^2.17.1", 26 | "pos": "^0.4.2", 27 | "sync-request": "^4.0.1" 28 | }, 29 | "publishConfig": { 30 | "access": "public" 31 | }, 32 | "devDependencies": { 33 | "babel-cli": "^6.23.0", 34 | "babel-preset-env": "^1.2.1", 35 | "rimraf": "^2.6.1" 36 | }, 37 | "babel": { 38 | "presets": [ 39 | [ 40 | "env", 41 | { 42 | "targets": { 43 | "node": "current" 44 | } 45 | } 46 | ] 47 | ] 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/steps-parser.js: -------------------------------------------------------------------------------- 1 | import pos from 'pos'; 2 | import { methods, tools } from './phrases'; 3 | 4 | class StepParser { 5 | constructor(text, ingredients) { 6 | this.text = this.parseText(text); 7 | this.ingredients = ingredients; 8 | } 9 | 10 | parseText(text) { 11 | return text.toLowerCase() 12 | .replace(/[.,\/#!$%\^&\*;:{}=\-_`~()]/g,""); 13 | } 14 | 15 | get method() { 16 | return methods.filter((method) => this.text.includes(method)); 17 | } 18 | 19 | get tools() { 20 | return tools.filter((tool) => this.text.includes(tool)); 21 | } 22 | 23 | get ingredientsUsed() { 24 | return this.ingredients.filter((ingredient) => this.text.includes(ingredient.name)); 25 | } 26 | } 27 | 28 | export default class StepsParser { 29 | constructor($, ingredients) { 30 | this.$ = $; 31 | this.ingredients = ingredients; 32 | } 33 | 34 | get steps() { 35 | return (this.$('.recipe-directions__list--item')).map((index, element) => { 36 | if (element.children.length > 0) { 37 | const text = element.children[0].data; 38 | const step = new StepParser(text, this.ingredients); 39 | return { 40 | method: step.method, 41 | tools: step.tools, 42 | ingredients: step.ingredientsUsed, 43 | raw: text, 44 | }; 45 | } 46 | }).toArray(); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ratatouille 2 | 3 | ratatouille is a Node.js scrapper for allrecipes.com. It allows you to 4 | extract basic information about a recipe, the ingredients required, and 5 | the steps involved. 6 | 7 | ### Installation 8 | 9 | ``` 10 | npm install @captainsafia/ratatouille 11 | ``` 12 | 13 | ### Usage 14 | 15 | ``` 16 | > import Ratatouille from '@captainsafia/ratatouille'; 17 | > const recipe = new Ratatouille('http://allrecipes.com/recipe/254910/kicked-up-mac-cheese'); 18 | > recipe.readyInTime 19 | '25 minutes' 20 | > recipe.prepTime 21 | '10 minutes' 22 | > recipe.cookTime 23 | '15 minutes' 24 | > recipe.calories 25 | '830' 26 | > recipe.ingredients 27 | [ { quantity: '14.5', 28 | unit: 'ounce', 29 | preparation: '', 30 | name: 'package macaroni and cheese mix' }, 31 | { quantity: '1/2', unit: 'cup', preparation: '', name: 'milk' }, 32 | { quantity: '1/2', 33 | unit: 'cup', 34 | preparation: 'divided', 35 | name: 'butter' }, 36 | { quantity: '15', 37 | unit: 'ounce', 38 | preparation: 'drained', 39 | name: 'can Libby\'s® Sweet Peas' }, 40 | { quantity: '15', 41 | unit: 'ounce', 42 | preparation: 'drained', 43 | name: 'can Libby\'s® Whole Kernel Sweet Corn' }, 44 | { quantity: '4', 45 | unit: 'ounce', 46 | preparation: 'drained', 47 | name: 'jar diced pimentos' }, 48 | { quantity: '1/2', 49 | unit: 'cups', 50 | preparation: '', 51 | name: 'panko bread crumbs' } ] 52 | ``` 53 | 54 | ![Ratatouille Demo](ratatouille-demo.png) 55 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | import request from 'sync-request'; 3 | import moment from 'moment'; 4 | 5 | import IngredientsParser from './ingredients-parser.js'; 6 | import StepsParser from './steps-parser.js'; 7 | 8 | export default class Ratatouille { 9 | constructor(url) { 10 | if (this.isValidURL(url)) { 11 | this.url = url; 12 | const html = request('GET', url).getBody('utf8'); 13 | this.$ = cheerio.load(html); 14 | } else { 15 | throw new Error(`URL ${url} is invalid!`); 16 | } 17 | } 18 | 19 | isValidURL(url) { 20 | return url.includes('allrecipes.com'); 21 | } 22 | 23 | get name() { 24 | return this.$('[itemProp="name"]').text(); 25 | } 26 | 27 | get servings() { 28 | const rawString = this.$('.adjustServings .subtext').text(); 29 | const numServings = rawString.match(/\d+/g) 30 | return numServings ? numServings[0] : 0; 31 | } 32 | 33 | get readyInTime() { 34 | const duration = this.$('[itemProp="totalTime"]').attr('datetime'); 35 | return moment.duration(duration).humanize(); 36 | } 37 | 38 | get prepTime() { 39 | const duration = this.$('[itemProp="prepTime"]').attr('datetime'); 40 | return moment.duration(duration).humanize(); 41 | } 42 | 43 | get cookTime() { 44 | const duration = this.$('[itemProp="cookTime"]').attr('datetime'); 45 | return moment.duration(duration).humanize(); 46 | } 47 | 48 | get calories() { 49 | return this.$('.calorie-count span:first-child').text(); 50 | } 51 | 52 | get ingredients() { 53 | const ingredientsParser = new IngredientsParser(this.$); 54 | return ingredientsParser.ingredients; 55 | } 56 | 57 | get steps() { 58 | const stepsParser = new StepsParser(this.$, this.ingredients); 59 | return stepsParser.steps; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/ingredients-parser.js: -------------------------------------------------------------------------------- 1 | import cheerio from 'cheerio'; 2 | import { units } from './phrases'; 3 | 4 | class IngredientParser { 5 | constructor(text) { 6 | this.strings = this.parseText(text); 7 | } 8 | 9 | parseText(text) { 10 | return text.replace(/[()]/g, '').split(' '); 11 | } 12 | 13 | get unit() { 14 | const unit = this.strings.filter((string) => { 15 | return units.indexOf(string) >= 0; 16 | })[0]; 17 | return unit; 18 | } 19 | 20 | get quantity() { 21 | const unitIndex = this.strings.indexOf(this.unit); 22 | if (!isNaN(this.strings[unitIndex - 2])) { 23 | return this.strings[unitIndex - 2] + ' ' + this.strings[unitIndex - 1]; 24 | } else { 25 | return this.strings[unitIndex - 1]; 26 | } 27 | } 28 | 29 | get preparation() { 30 | const delimiter = this.strings.filter((value) => { 31 | return value.endsWith(','); 32 | })[0]; 33 | const delimiterIndex = this.strings.indexOf(delimiter); 34 | if (delimiterIndex >= 0) { 35 | return this.strings[delimiterIndex + 1]; 36 | } else { 37 | return ''; 38 | } 39 | } 40 | 41 | get name() { 42 | const unitIndex = this.strings.indexOf(this.unit); 43 | const delimiter = this.strings.filter((value) => { 44 | return value.endsWith(','); 45 | })[0]; 46 | const delimiterIndex = this.strings.indexOf(delimiter); 47 | const end = delimiterIndex >= 0 ? delimiterIndex + 1: this.strings.length; 48 | const name = this.strings.slice(unitIndex + 1, end); 49 | return name.join(' ').replace(',', ''); 50 | } 51 | } 52 | 53 | export default class IngredientsParser { 54 | constructor($) { 55 | this.$ = $; 56 | } 57 | 58 | get ingredients() { 59 | return (this.$('span.recipe-ingred_txt[itemprop="ingredients"]')).map((index, element) => { 60 | const text = element.children[0].data; 61 | const ingredient = new IngredientParser(text); 62 | return { 63 | quantity: ingredient.quantity, 64 | unit: ingredient.unit, 65 | preparation: ingredient.preparation, 66 | name: ingredient.name, 67 | }; 68 | }).toArray(); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/phrases.js: -------------------------------------------------------------------------------- 1 | export const tools = [ 2 | 'spoon', 3 | 'cup', 4 | 'bowl', 5 | 'cutting board', 6 | 'knife', 7 | 'peeler', 8 | 'colander', 9 | 'strainer', 10 | 'grater', 11 | 'can opener', 12 | 'saucepan', 13 | 'frying pan', 14 | 'pan', 15 | 'baking dish', 16 | 'blender', 17 | 'spatula', 18 | 'tongs', 19 | 'garlic press', 20 | 'ladle', 21 | 'ricer', 22 | 'pot holder', 23 | 'rolling pin', 24 | 'scissors', 25 | 'whisk', 26 | 'skillet', 27 | 'wok', 28 | 'baking sheet', 29 | 'cookie sheets', 30 | 'casserole dish', 31 | 'pot', 32 | 'slow cooker' 33 | ]; 34 | 35 | export const units = [ 36 | 'cup', 37 | 'cups', 38 | 'ounce', 39 | 'teaspoon', 40 | 'teaspoons', 41 | 'tablespoon', 42 | 'tablespoons', 43 | 'cloves' 44 | ]; 45 | 46 | export const methods = [ 47 | 'bake', 48 | 'barbecue', 49 | 'baste', 50 | 'boil', 51 | 'braise', 52 | 'brown', 53 | 'caramelize', 54 | 'chop', 55 | 'cut', 56 | 'freeze', 57 | 'fry', 58 | 'glaze', 59 | 'grill', 60 | 'juice', 61 | 'marinate', 62 | 'microwave', 63 | 'mince', 64 | 'puree', 65 | 'refrigerage', 66 | 'roast', 67 | 'rotisserie', 68 | 'sautee', 69 | 'season', 70 | 'simmer', 71 | 'steam', 72 | 'stew', 73 | 'stir fry', 74 | 'tenderize', 75 | 'beat', 76 | 'bind', 77 | 'blackened', 78 | 'blanch', 79 | 'blend', 80 | 'braise', 81 | 'broil', 82 | 'brown', 83 | 'brush', 84 | 'butterfly', 85 | 'carmelize', 86 | 'chop', 87 | 'coat', 88 | 'combine', 89 | 'core', 90 | 'cream', 91 | 'crimp', 92 | 'crisp', 93 | 'cure', 94 | 'deep-fry', 95 | 'deep fry', 96 | 'deglaze', 97 | 'dice', 98 | 'dot', 99 | 'dredge', 100 | 'drizzle', 101 | 'dust', 102 | 'fold', 103 | 'fry', 104 | 'garnish', 105 | 'glaze', 106 | 'grate', 107 | 'grease', 108 | 'grill', 109 | 'grind', 110 | 'knead', 111 | 'marinate', 112 | 'mince', 113 | 'pan fry', 114 | 'pan-fry', 115 | 'poach', 116 | 'pressure cook', 117 | 'puree', 118 | 'reduce', 119 | 'roast', 120 | 'sautee', 121 | 'saute', 122 | 'sauté', 123 | 'scald', 124 | 'score', 125 | 'sear', 126 | 'season', 127 | 'set', 128 | 'shred', 129 | 'sift', 130 | 'simmer', 131 | 'skim', 132 | 'steam', 133 | 'steep', 134 | 'stewing', 135 | 'stir fry', 136 | 'stir-fry', 137 | 'thin', 138 | 'toss', 139 | 'unleavened', 140 | 'whip', 141 | 'whisk' 142 | ]; 143 | -------------------------------------------------------------------------------- /STORY.md: -------------------------------------------------------------------------------- 1 | Ratatouille is a very hacky Node.js scrapper for [AllRecipes.com](http://allrecipes.com/). 2 | When I say very hacky, I mean very hacky. 3 | 4 | Ratatouille is the first open source project I've released that spun directly 5 | off of a school project. I'm currently enrolled in a Natural Language Processing 6 | class with a final project that involves scraping the popular recipes website 7 | and applying some transformations to the recipes, for example transforming a 8 | non-vegeterian recipe to a vegeterian recipe. 9 | 10 | Web scraping has a special place in my heart. In my early days of tinkering with 11 | search engines and information retrieval systems, I built quite a few web scrapers. 12 | As it just so happens, that I cannot resist a good web scraping challenge. Scraping 13 | AllRecipes.com was particularly tricky, the website uses an Angular front-end and can be 14 | particularly messy to parse. Not to mention the requirements of the project demanded 15 | a high level of precision. It was necessary to extract information like the quantity, the 16 | unit of the quantity, the name, and the preparation method of each ingredient. It also 17 | required that each step in the recipe be parsed to extract the step involved, the ingredients 18 | involved, and the tools involved. You guessed right, natural language processing is a tough task. 19 | 20 | Extracting some of the information was quite easy. For example, extracting the 21 | preparation time and the cook time was trivial. It involved using `cheerio` to pull 22 | out attribute in a `time` element. It was only my second time seeing the `