├── .DS_Store ├── .gitignore ├── README.md ├── bin └── schemaParser ├── example └── main.js ├── lib └── schemaParser.js ├── package.json └── test └── main.js /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vespa/semantic-schema-parser/9602def0d34f6194c0b621cfba263c13594c45b5/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Semantic Schema Parser 2 | 3 | A Node.js module that extracts http://schema.org microdata from HTML and converts it into a JSON object. 4 | ## Install 5 | npm install semantic-schema-parser 6 | 7 | ## How to use it 8 | 9 | //set a list of URLs 10 | var schema = require("semantic-schema-parser"); 11 | var urls = [ 12 | "http://www.imdb.com/title/tt0096874/", 13 | "http://www.imdb.com/title/tt0087469/" 14 | ]; 15 | 16 | //Be happy 17 | 18 | schema.parseURLs(urls, 19 | // set a callback 20 | function(msg){ 21 | // msg is the parsed result object; serialize it to JSON 22 | msg = JSON.stringify(msg); 23 | // do something 24 | }); 25 | 26 | You can also pass the content you want to parse as a string using the parseContent function: 27 | 28 | var schema = require("semantic-schema-parser"); 29 | var myString = "...etc..." 30 | schema.parseContent(myString, function(msg){ 31 | // msg is the parsed result object; serialize it to JSON 32 | msg = JSON.stringify(msg); 33 | // do something 34 | }); 35 | 36 | ## Example 37 | Go to the example folder and run the command 38 | node main.js 39 | 40 | The example will create a file named result.json based on a list of URLs. That file contains a text example of the generated JSON object. 
41 | 42 | It will probably look something like this: 43 | 44 | { 45 | 46 | "http://www.ebay.com/itm/Doctor-Who-3D-TARDIS-Police-Box-Pewter-Tall-PENDANT-20-Long-Chain-Necklace-/331098975095?pt=LH_DefaultDomain_0&hash=item4d17096b77": { 47 | "title": " Doctor Who 3D Tardis Police Box Pewter Tall Pendant 20\" Long Chain Necklace | eBay ", 48 | "url": "http://www.ebay.com/itm/Doctor-Who-3D-TARDIS-Police-Box-Pewter-Tall-PENDANT-20-Long-Chain-Necklace-/331098975095?pt=LH_DefaultDomain_0&hash=item4d17096b77", 49 | "elems": [ 50 | { 51 | "product": [ 52 | { 53 | "image": { 54 | "itemprop": "image", 55 | "src": "http://i.ebayimg.com/00/s/MTMzNVgxNjAw/z/GJAAAOxy63FSxgzz/$_35.JPG" 56 | } 57 | }, 58 | { 59 | "image": { 60 | "itemprop": "image", 61 | "src": "http://i.ebayimg.com/00/s/MTMzNVgxNjAw/z/GJAAAOxy63FSxgzz/$_35.JPG" 62 | } 63 | }, 64 | { 65 | "price": { 66 | "itemprop": "price", 67 | "text": "US $2.99" 68 | } 69 | }, 70 | { 71 | "availability": { 72 | "itemprop": "availability", 73 | "content": "http://schema.org/InStock" 74 | } 75 | }, 76 | { 77 | "priceCurrency": { 78 | "itemprop": "priceCurrency", 79 | "content": "USD" 80 | } 81 | }, 82 | { 83 | "name": { 84 | "itemprop": "name", 85 | "text": "Doctor Who 3DTARDIS Police Box Pewter" 86 | } 87 | }, 88 | { 89 | "name": { 90 | "itemprop": "name", 91 | "text": " Tall PENDANT 20\" LongChain Necklace " 92 | } 93 | } 94 | ] 95 | }, 96 | { 97 | "offers": [ 98 | { 99 | "price": { 100 | "itemprop": "price", 101 | "text": "US $2.99" 102 | } 103 | }, 104 | { 105 | "availability": { 106 | "itemprop": "availability", 107 | "content": "http://schema.org/InStock" 108 | } 109 | }, 110 | { 111 | "priceCurrency": { 112 | "itemprop": "priceCurrency", 113 | "content": "USD" 114 | } 115 | } 116 | ] 117 | } 118 | ] 119 | } 120 | } 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /bin/schemaParser: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env node 2 | var path = require('path'); 3 | var fs = require('fs'); 4 | var lib = path.join(path.dirname(fs.realpathSync(__filename)), '../lib'); 5 | require(lib + '/main.js'); -------------------------------------------------------------------------------- /example/main.js: -------------------------------------------------------------------------------- 1 | (function() { 2 | //Declaring variables 3 | var fs, schema, filedata; 4 | 5 | //Requiring files 6 | fs = require('fs'); 7 | schema = require ('../lib/schemaParser'); 8 | 9 | 10 | var urls = [ 11 | // "http://www.imdb.com/title/tt0096874/", 12 | //"https://www.baby.com.br/produtos/tenis-xadrez-grafite-com-velcro-converse-all-star", 13 | //"http://www.walmart.com.br/produto/Telefonia/iPhone/Apple/413770-Apple-iPhone-4S-8GB-Preto-Desbloqueado", 14 | //"http://www.foodnetwork.com/recipes/ree-drummond/cajun-chicken-pasta-recipe.html", 15 | "https://www.iba.com.br/livro-digital-ebook/O-Andar-do-B%C3%AAbado-93b053a9949ea2616a8b7457507a874d", 16 | //"http://www.ebay.com/itm/Unique-Doctor-Who-3D-TARDIS-Police-Box-Pewter-Tall-PENDANT-Long-Chian-Necklace-/161350182697?pt=Fashion_Jewelry&hash=item2591386729" 17 | ]; 18 | 19 | 20 | //Reading files 21 | schema.parseURLs(urls, function(msg){ 22 | fs.writeFile("../result.json", JSON.stringify(msg), function(err){ 23 | if(err){ 24 | console.log("report: something gones wrong:" +err); 25 | }else{ 26 | console.log("report generated!"); 27 | } 28 | }); 29 | }); 30 | 31 | }).call(this) -------------------------------------------------------------------------------- /lib/schemaParser.js: -------------------------------------------------------------------------------- 1 | "use strict"; 2 | 3 | var request = require("request"), 4 | cheerio = require("cheerio"), 5 | $; 6 | 7 | 8 | var scrapper = { 9 | obj:[], 10 | 11 | findAllEntities: function(){ 12 | return $("[itemtype]").toArray().reverse(); 13 | }, 14 | setElementValues: function(item, props){ 15 | 
if($(props).attr("itemprop") || $(props).attr("itemtype")||$(props).attr("itemscope")){ 16 | var obj = { 17 | itemtype : $(props).attr("itemtype") || false, 18 | itemscope : $(props).attr("itemscope") || false, 19 | itemprop : $(props).attr("itemprop") || false, 20 | href : $(props).attr("href") || false, 21 | content : $(props).attr("content") || false, 22 | text: $(props).text().replace(/[\n\t]|\s{2,}/g, "") || false, 23 | src: $(props).attr("src") || false, 24 | // html: $.html(props).replace(/[\n\t]|\s{2,}/g, "") 25 | }; 26 | var temp = {}; 27 | for (var x in obj) { 28 | if(obj[x]){ 29 | temp[x] = obj[x]; 30 | } 31 | } 32 | 33 | var objName = $(props).attr("itemtype") || $(props).attr("itemprop"); 34 | item[objName] = temp; 35 | // item[objName].name = objName; 36 | } 37 | return item; 38 | }, 39 | countObj: function(obj){ 40 | var ct = 0; 41 | for(var x in obj){ 42 | ct++; 43 | } 44 | return ct; 45 | 46 | }, 47 | myObjects: [], 48 | generateElementsTree: function(obj){ 49 | var children = $(obj).children(), 50 | arr = [], 51 | c = children.length; 52 | while(c--){ 53 | var child = children[c]; 54 | var item = {}; 55 | if($(child).children().length!== 0){ 56 | var elem = this.generateElementsTree(child); 57 | item = this.setElementValues(elem, child); 58 | }else{ 59 | item = this.setElementValues(item, child); 60 | } 61 | if(Object.keys(item).length!=0 && item.length !=0){ 62 | if(item.constructor === Object){ 63 | this.myObjects.push(item); 64 | } 65 | arr.push(item); 66 | } 67 | } 68 | return arr; 69 | }, 70 | generateTree: function(obj){ 71 | var objName = this.getItemType(obj), 72 | tree = {}; 73 | if($(obj).attr("itemprop")){ 74 | objName = $(obj).attr("itemprop"); 75 | } 76 | 77 | tree[objName] = this.generateElementsTree(obj); 78 | tree[objName] = this.myObjects.reverse(); 79 | 80 | return tree; 81 | }, 82 | getItemType: function(obj){ 83 | var objName = $(obj).attr("itemtype"); 84 | if(typeof objName!== "undefined"){ 85 | return 
objName.substring((objName.lastIndexOf("/")+1), objName.length).toLowerCase(); 86 | } else{ 87 | return $(obj).attr("itemprop"); 88 | } 89 | }, 90 | result:{}, 91 | parser: function(url){ 92 | var arr = this.findAllEntities(), 93 | c = arr.length; 94 | url = url || "object"; 95 | this.result[url] = {}; 96 | this.result[url].title = $("title").text(); 97 | this.result[url].url = url; 98 | this.result[url]["elems"]=[]; 99 | while(c--){ 100 | this.myObjects = []; 101 | var tree = this.generateTree(arr[c]); 102 | this.result[url]["elems"].push(tree); 103 | } 104 | return this.result; 105 | }, 106 | }; 107 | 108 | var schemaParser = { 109 | obj:{}, 110 | parseContent: function(content, callback){ 111 | schemaParser.obj = {}; 112 | scrapper.result = {}; 113 | if(content.constructor === String){ 114 | $ = cheerio.load(content); 115 | } else{ 116 | $ = content; 117 | } 118 | var d = scrapper.parser(); 119 | if(callback){ 120 | var k =Object.keys(d)[0]; 121 | callback(d[k]); 122 | } 123 | }, 124 | parseURLs: function(list, callback){ 125 | var count = list.length; 126 | if(count > 0 ){ 127 | var url = list[0]; 128 | request(url, function(err, resp, body) { 129 | if (err) 130 | throw err; 131 | $ = cheerio.load(body); 132 | var d = scrapper.parser(url); 133 | schemaParser.obj = d; 134 | list.shift(); 135 | schemaParser.parseURLs(list, callback); 136 | //callback(d); 137 | }); 138 | }else{ 139 | callback(schemaParser.obj); 140 | } 141 | } 142 | }; 143 | exports.parseURLs = schemaParser.parseURLs; 144 | exports.parseContent = schemaParser.parseContent; 145 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "semantic-schema-parser", 3 | "version": "0.1.0", 4 | "description": "A NodeJS module to extract micro-data from html docs", 5 | "main": "./lib/schemaParser", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 
1" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "https://github.com/vespa/semantic-schema-parser.git" 12 | }, 13 | "keywords": [ 14 | "schema", 15 | "schema.org", 16 | "microdata", 17 | "micro-data", 18 | "microformats", 19 | "parser", 20 | "htmlParser" 21 | ], 22 | "author": "Danilo C. Vespa", 23 | "license": "MIT", 24 | "bugs": { 25 | "url": "https://github.com/vespa/semantic-schema-parser/issues" 26 | }, 27 | "homepage": "https://github.com/vespa/semantic-schema-parser", 28 | "dependencies": { 29 | "cheerio": "~0.13.1", 30 | "request": "~2.34.0" 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /test/main.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vespa/semantic-schema-parser/9602def0d34f6194c0b621cfba263c13594c45b5/test/main.js --------------------------------------------------------------------------------