├── .env ├── .gitignore ├── LICENSE ├── README.md ├── lib ├── aliexpress.js ├── amazon.js ├── config.js ├── imaging.js └── index.js ├── model ├── criteria.js ├── index.js ├── match.js └── product.js ├── package.json └── test └── sample.html /.env: -------------------------------------------------------------------------------- 1 | NODE_ENV=development 2 | 3 | FILE_DROP=/Users/tmillar/dev/repo/geek-scrape/ 4 | 5 | DB_USER=admin 6 | DB_PWD=password 7 | DB_HOST=localhost 8 | DB_PORT=27017 9 | DB_NAME=compare 10 | databaseConnectionTimeout=300000 11 | 12 | EMAIL_SERVICE=gmail 13 | EMAIL_USER= 14 | EMAIL_PWD= 15 | 16 | AWS_USER= 17 | AWS_KEY= 18 | AWS_SECRET= 19 | AWS_ASSOCIATE= -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env.production 2 | 3 | # Created by .ignore support plugin (hsz.mobi) 4 | ### Node template 5 | # Logs 6 | logs 7 | *.log 8 | 9 | # Images 10 | files 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | 17 | # Directory for instrumented libs generated by jscoverage/JSCover 18 | lib-cov 19 | 20 | # Coverage directory used by tools like istanbul 21 | coverage 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # node-waf configuration 27 | .lock-wscript 28 | 29 | # Compiled binary addons (http://nodejs.org/api/addons.html) 30 | build/Release 31 | 32 | # Dependency directory 33 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git- 34 | node_modules 35 | 36 | 37 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 38 | 39 | *.iml 40 | 41 | ## Directory-based project format: 42 | .idea/ 43 | # if you remove the above rule, at least ignore the following: 44 | 45 | # User-specific stuff: 46 | # .idea/workspace.xml 47 | # .idea/tasks.xml 48 | # .idea/dictionaries 49 | 
50 | # Sensitive or high-churn files: 51 | # .idea/dataSources.ids 52 | # .idea/dataSources.xml 53 | # .idea/sqlDataSources.xml 54 | # .idea/dynamic.xml 55 | # .idea/uiDesigner.xml 56 | 57 | # Gradle: 58 | # .idea/gradle.xml 59 | # .idea/libraries 60 | 61 | # Mongo Explorer plugin: 62 | # .idea/mongoSettings.xml 63 | 64 | ## File-based project format: 65 | *.ipr 66 | *.iws 67 | 68 | ## Plugin-specific files: 69 | 70 | # IntelliJ 71 | /out/ 72 | 73 | # mpeltonen/sbt-idea plugin 74 | .idea_modules/ 75 | 76 | # JIRA plugin 77 | atlassian-ide-plugin.xml 78 | 79 | # Crashlytics plugin (for Android Studio and IntelliJ) 80 | com_crashlytics_export_strings.xml 81 | crashlytics.properties 82 | crashlytics-build.properties -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 trentm 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenCV Product Comparer 2 | Want to be a killer Amazon seller? Well, so did I. This is a crude, rude, slow, and disk-heavy crawler to find you the unicorn product to sell on Amazon. It uses OpenCV and scrapes products off both Amazon & AliExpress in hopes of finding the products you can make lots of money selling. Good Luck! 3 | 4 | ### Purpose 5 | My wife wanted to sell some products on Amazon, so I thought I would spend the weekend building this to help. 6 | 7 | What is this? This tool will populate a MongoDB collection "categories" containing all the official categories used in Amazon's database. Then you define the category in which you want to find products that have a good profit margin. Using Amazon's Product API, it pages through products and downloads images of each product. At the same time, it scrapes images matching certain criteria off AliExpress. Then OpenCV will try to find the cheapest version of the Amazon product for sale on AliExpress. A "Match" document is created that shows the degree of similarity between products. 8 | 9 | Ideally you will find products on Amazon that are also for sale on AliExpress, likely at a price that makes it profitable to buy them in bulk and then sell on Amazon. 10 | 11 | ### TODO 12 | Lots, including forking processes to make it run concurrently. 13 | 14 | ### History 15 | I created this back in early 2015 and wanted to archive it before cleaning it off my MBP. 16 | 17 | ### How To Use 18 | - You will need MongoDB either local or hosted. 
/**
 * Reports a scraper failure.
 *
 * `err` is expected to look like `{ key: <status>, id: <product id> }`.
 * Only key 500 (page could not be parsed) is logged; key 404 (product
 * not found) is deliberately silent, and any other key is ignored.
 *
 * @param {Object} [err] failure descriptor; falsy values are ignored
 */
var handleError = function (err) {

    // Nothing to report without a descriptor carrying a truthy key.
    if (!err || !err.key) {
        return;
    }

    // 404 is intentionally quiet: missing product ids are the common case.
    if (err.key === 500) {
        console.log('Product failed to parse: ' + err.id);
    }
};
/**
 * Refreshes the locally cached image files of a product.
 *
 * First deletes every file listed in `entity.imageFiles`, then downloads
 * each URL in `entity.images` into `FILE_DROP/files/<guid>.jpg` and records
 * the new paths in `entity.imageFiles`. The entity is mutated but not saved.
 *
 * @param {Object}   obj product document exposing `images` and `imageFiles`
 * @param {Function} cb  called as `cb(err, entity)`; `err` is 'failed' when
 *                       any download or file write failed
 */
var fetchImages = function (obj, cb) {

    var entity = obj;

    async.series([

        // Step 1: remove the files left behind by the previous scrape.
        function (callback) {

            async.each(entity.imageFiles || [], function (file, cbRemove) {

                fs.unlink(file, function (err) {

                    // A file that is already gone is the desired end state,
                    // so it must not abort the whole refresh.
                    if (err && err.code === 'ENOENT') return cbRemove();
                    return cbRemove(err);

                });

            }, function (err) {

                return callback(err);

            });
        },

        // Step 2: download the current images.
        function (callback) {

            //clear array
            entity.imageFiles = [];

            async.each(entity.images || [], function (image, cbAdd) {

                var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
                var writeStream = fs.createWriteStream(file);
                var settled = false;

                // A stream emits 'close' after 'error', so the outcome is
                // reported exactly once; async rejects a second callback call.
                var finish = function (err) {

                    if (settled) return;
                    settled = true;

                    if (err) {
                        console.log('failed');
                        return cbAdd('failed');
                    }

                    entity.imageFiles.push(file);
                    return cbAdd();
                };

                writeStream.on('close', function () {
                    finish();
                });

                writeStream.on('error', finish);

                request(image)
                    .on('error', function (err) {
                        // Without this listener a DNS/socket failure is thrown as an
                        // uncaught 'error' event and the write stream never closes.
                        finish(err);
                        writeStream.destroy();
                    })
                    .pipe(writeStream);

            }, function (err) {

                return callback(err);

            });
        }

    ], function (err) {

        cb(err, entity);

    });
};
new AliExpressScraper().fetch(); 156 | 157 | }); 158 | 159 | }, 160 | 161 | run: function () { 162 | 163 | var Product = mongoose.models.Product; 164 | 165 | var mailOptions = { 166 | from: 'AE Parser ', 167 | to: 'geek.envy.amazon@gmail.com, trent.millar@gmail.com', 168 | subject: 'NEW PRODUCTS', 169 | html: 'New Products - ' + new Date().toDateString() + '

' 170 | }; 171 | 172 | var updatedProducts = []; 173 | var newProducts = []; 174 | 175 | /* 176 | http://m.aliexpress.com/item/2005197291.html 177 | http://m.aliexpress.com/item-desc/2005197291.html 178 | http://m.aliexpress.com/getSiteProductEvaluation.htm?productId=32266892657 179 | */ 180 | var params = { 181 | range: [1000357604 /*2005197290*/, 9999999999] 182 | }; 183 | 184 | async.series([ 185 | 186 | function (cb) { 187 | 188 | //get highest AE id 189 | Product.findOne({'source': 'AE'}).limit(1).sort('-externalId').exec(function (err, doc) { 190 | 191 | if (err) return cb(err); 192 | params.range[0] = Number(doc.externalId) + 1; 193 | cb(); 194 | 195 | }); 196 | 197 | }, 198 | 199 | function (cb) { 200 | 201 | async.doWhilst( 202 | function (callback) { 203 | var url = 'http://m.aliexpress.com/item/' + ++params.range[0] + '.html'; 204 | x(url, { 205 | 'page': 'body', 206 | 'title': 'p.ms-detail-subject' 207 | })(function (err, obj) { 208 | 209 | if (!obj || !obj.page) return callback(); 210 | 211 | var page = obj.page; 212 | var title = obj.title; 213 | 214 | if (page && page.length > 0) { 215 | 216 | var begin = 'var runParams ='; 217 | 218 | if (!page.match(/runParams/gi)) { 219 | handleError({ 220 | "key": 404, 221 | "id": params.range[0] 222 | }); 223 | return callback(); 224 | } 225 | 226 | try { 227 | var partial = page.substring(page.indexOf(begin) + begin.length); 228 | var javascript = partial.substring(0, partial.indexOf('};') + 1); 229 | 230 | //clean 231 | javascript = javascript.replace('// sku', ''); 232 | 233 | var json = eval('(' + javascript + ')'); 234 | 235 | json.title = title; 236 | json.hash = hashomatic.hash({ 237 | "displayPrice": json.displayPrice 238 | }, true, true); 239 | 240 | Product.findOne({"externalId": json.productId}, function (err, doc) { 241 | 242 | if (doc) { 243 | 244 | if (doc.hash !== json.hash) { 245 | 246 | updatedProducts.push({ 247 | 'message': doc.price + ' now ' + json.price, 248 | 'title': json.title, 249 | 
'link': url 250 | }); 251 | 252 | Product.buildAliExpressProduct(json, function (err, entity) { 253 | 254 | entity.previousProduct = doc; 255 | entity.save(function (err, saved) { 256 | 257 | Product.remove({_id: doc._id}, callback); 258 | 259 | }); 260 | 261 | }); 262 | 263 | } else { 264 | //exists 265 | return callback(); 266 | } 267 | 268 | } else { 269 | 270 | newProducts.push({ 271 | 'message': json.displayPrice, 272 | 'title': json.title, 273 | 'link': url 274 | }); 275 | 276 | Product.buildAliExpressProduct(json, function (err, entity) { 277 | 278 | fetchImages(entity, function(err, result){ 279 | 280 | result.isScraped = true; 281 | return result.save(callback); 282 | 283 | }); 284 | 285 | }); 286 | } 287 | 288 | }); 289 | } catch (e) { 290 | 291 | handleError({ 292 | "key": 500, 293 | "id": params.range[0] 294 | }); 295 | return callback(err); 296 | 297 | } 298 | 299 | } else { 300 | return callback(); 301 | } 302 | 303 | }); 304 | }, 305 | 306 | function () { 307 | 308 | return params.range[0] < params.range[1]; 309 | 310 | }, 311 | 312 | function (err) { 313 | 314 | mailOptions.html += 'Updated

New

'; 315 | 316 | var updates = '', news = ''; 317 | _.each(updatedProducts, function (product) { 318 | 319 | updates += '
  • ' + product.title + ' - ' + product.message + '
  • '; 320 | 321 | }); 322 | 323 | _.each(newProducts, function (product) { 324 | 325 | news += '
  • ' + product.title + ' - ' + product.message + '
  • '; 326 | 327 | }); 328 | 329 | mailOptions.html = mailOptions.html.replace('@@update@@', updates).replace('@@new@@', news); 330 | 331 | transporter.sendMail(mailOptions, function (error, info) { 332 | if (error) { 333 | console.log(error); 334 | } else { 335 | console.log('Message sent: ' + info.response); 336 | } 337 | 338 | return cb(); 339 | 340 | }); 341 | }); 342 | } 343 | 344 | ], function (err) { 345 | 346 | console.log('complete ' + err); 347 | 348 | }); 349 | 350 | } 351 | } 352 | } 353 | 354 | module.exports.aescraper = new AliExpressScraper(); -------------------------------------------------------------------------------- /lib/amazon.js: -------------------------------------------------------------------------------- 1 | require('dotenv').load(); 2 | 3 | var x = require('x-ray')(), 4 | async = require('async'), 5 | _ = require('underscore'), 6 | util = require('util'), 7 | OperationHelper = require('apac').OperationHelper, 8 | opHelper = new OperationHelper({ 9 | awsId: process.env.AWS_KEY, 10 | awsSecret: process.env.AWS_SECRET, 11 | assocId: process.env.AWS_ASSOCIATE, 12 | version: '2013-08-01' 13 | }), 14 | mongoose = require('mongoose'), 15 | request = require('request').defaults({encoding: null}), 16 | fs = require('fs'); 17 | 18 | 19 | var generateGuid = function(){ 20 | 21 | return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) { 22 | var r = Math.random()*16|0, v = c == 'x' ? 
/**
 * Refreshes the locally cached image files of a product.
 *
 * First deletes every file listed in `entity.imageFiles`, then downloads
 * each URL in `entity.images` into `FILE_DROP/files/<guid>.jpg` and records
 * the new paths in `entity.imageFiles`. The entity is mutated but not saved.
 *
 * @param {Object}   obj product document exposing `images` and `imageFiles`
 * @param {Function} cb  called as `cb(err, entity)`; `err` is 'failed' when
 *                       any download or file write failed
 */
var fetchImages = function (obj, cb) {

    var entity = obj;

    async.series([

        // Step 1: remove the files left behind by the previous scrape.
        function (callback) {

            async.each(entity.imageFiles || [], function (file, cbRemove) {

                fs.unlink(file, function (err) {

                    // A file that is already gone is the desired end state,
                    // so it must not abort the whole refresh.
                    if (err && err.code === 'ENOENT') return cbRemove();
                    return cbRemove(err);

                });

            }, function (err) {

                return callback(err);

            });
        },

        // Step 2: download the current images.
        function (callback) {

            //clear array
            entity.imageFiles = [];

            async.each(entity.images || [], function (image, cbAdd) {

                var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
                var writeStream = fs.createWriteStream(file);
                var settled = false;

                // A stream emits 'close' after 'error', so the outcome is
                // reported exactly once; async rejects a second callback call.
                var finish = function (err) {

                    if (settled) return;
                    settled = true;

                    if (err) {
                        console.log('failed');
                        return cbAdd('failed');
                    }

                    entity.imageFiles.push(file);
                    return cbAdd();
                };

                writeStream.on('close', function () {
                    finish();
                });

                writeStream.on('error', finish);

                request(image)
                    .on('error', function (err) {
                        // Without this listener a DNS/socket failure is thrown as an
                        // uncaught 'error' event and the write stream never closes.
                        finish(err);
                        writeStream.destroy();
                    })
                    .pipe(writeStream);

            }, function (err) {

                return callback(err);

            });
        }

    ], function (err) {

        cb(err, entity);

    });
};
/**
 * Turns one `.olpOffer` row of an Amazon offer-listing page into a seller record.
 *
 * @param {Object} offer cheerio selection wrapping a single offer row
 * @returns {Object|null} seller record, or null when the offer is not "new"
 */
var parseAmazonOffer = function (offer) {

    // Judge the condition of THIS offer only; testing the page-wide selection
    // accepts every offer as soon as one offer on the page is new.
    if (!/new/i.test(offer.find('.olpCondition').text())) {
        return null;
    }

    var shippingText = offer.find('p.olpShippingInfo span').text().trim();
    var deliveryText = offer.find('.olpDeliveryColumn ul li span').text();
    var ratingMatch = offer.find('.olpSellerColumn b').text().match(/\d+%/);
    var originMatch = deliveryText.match(/ships from\D+?\./i);
    var shippingMatch = shippingText.match(/\$[0-9]+\.[0-9]+/);
    var sellerLink = offer.find('.olpSellerColumn p.olpSellerName a').attr('href');

    // "Ships from City, Country." -> keep only the last comma-separated part.
    var location = 'UNKNOWN';
    if (originMatch) {
        location = originMatch[0].replace(/ships from /gmi, '').replace('.', '');
        if (location.indexOf(',') >= 0) {
            var parts = location.split(',');
            location = parts[parts.length - 1].trim();
        }
    }

    // Explicit price -> numeric string, "free" -> 0, anything else -> unknown.
    var shipping = null;
    if (shippingMatch) {
        shipping = shippingMatch[0].replace('$', '');
    } else if (/free/i.test(shippingText)) {
        shipping = 0;
    }

    // Relative seller links are made absolute; a missing link stays undefined.
    if (sellerLink && !/http:\/\/www.amazon/i.test(sellerLink)) {
        sellerLink = 'http://www.amazon.com' + sellerLink;
    }

    return {
        availability: offer.find('.olpAvailability').text().trim(),
        // NOTE(review): previously tested the not-yet-assigned `location`
        // variable and was therefore always false. The delivery column is
        // presumed to carry the "Expedited shipping available" bullet - confirm.
        isExpedited: /Expedited shipping available/i.test(deliveryText),
        price: offer.find('span.olpOfferPrice').text().trim().replace('$', ''),
        location: location,
        sellerStoreUrl: sellerLink,
        sellerRating: ratingMatch ? ratingMatch[0].replace('%', '') : 'NEW',
        shippingPrice: shipping
    };
};

/**
 * Scrapes the offer-listing page at `entity.offerUrl` and stores the parsed
 * "new" offers in `entity.sellers`. Best effort: a failed or non-HTML
 * response leaves the entity untouched and still calls `done()`.
 */
var scrapeAmazonSellers = function (entity, done) {

    var url = entity.offerUrl;
    var huntsman = require('huntsman');
    var spider = huntsman.spider();

    spider.extensions = [
        huntsman.extension( 'cheerio' ) // load cheerio extension
    ];

    spider.on(url, function (err, res) {

        var $ = (!err && res && res.extension) ? res.extension.cheerio : null;

        if (!$) {
            // Returning without calling back would stall the whole run.
            if (err) console.log('Err ' + err);
            return done();
        }

        entity.sellers = [];

        $('.olpOffer').each(function () {

            var seller = parseAmazonOffer($(this));
            if (seller) entity.sellers.push(seller);

        });

        return done();
    });

    spider.queue.add(url);
    spider.start();
};

/**
 * Builds a Product from one ItemSearch item and persists it. A product that
 * already exists with the same hash is skipped; one whose hash changed is
 * replaced, with the old document kept in `previousProduct`.
 */
var syncAmazonItem = function (Product, item, done) {

    var entity;
    var ignore = false;

    async.series([

        function (cbBuild) {

            Product.buildAmazonApiProduct(item, function (err, doc) {

                entity = doc;
                if (!err && !doc) return cbBuild('Product could not be built');
                return cbBuild(err);

            });
        },

        function (cbBuild) {

            Product.findOne({"externalId": entity.externalId}, function (err, existing) {

                if (err) return cbBuild(err);

                // The stored document used to overwrite `entity` before this
                // comparison, so a change could never be detected.
                // NOTE(review): assumes buildAmazonApiProduct populates `hash` - confirm.
                if (existing && existing.hash === entity.hash) {
                    ignore = true;
                    return cbBuild();
                }

                if (existing) entity.previousProduct = existing;

                fetchImages(entity, function (imageErr, result) {

                    // Missing images do not prevent storing the product itself.
                    if (imageErr) console.log('Err ' + imageErr);

                    result.save(function (err, saved) {

                        if (err) {
                            console.log('Err ' + err);
                            return cbBuild(err);
                        }

                        entity = saved;
                        if (!existing) return cbBuild();

                        Product.remove({_id: existing._id}, cbBuild);
                    });
                });
            });
        },

        function (cbBuild) {

            if (ignore || !entity.offerUrl) return cbBuild();
            scrapeAmazonSellers(entity, cbBuild);
        },

        function (cbBuild) {

            // An unchanged product must not be written again: `entity` is the
            // freshly built copy and saving it would duplicate the stored one.
            if (ignore) return cbBuild();
            entity.save(cbBuild);
        }

    ], function (err) {

        done(err);

    });
};

/**
 * Amazon Product API scraper. `run(finished)` walks every enabled Criteria
 * through its price range in small windows, stores the products it finds
 * and calls `finished(err)` once all criteria have been processed.
 */
var AmazonScraper = function () {

    return {

        run: function (finished) {

            require(__dirname + '/../model');
            var Product = mongoose.models.Product;
            var Criteria = mongoose.models.Criteria;
            var criterion = [];

            async.series([

                // Seed a sample criteria document when the collection is empty.
                function (cb) {

                    //if nothing then fake one - remove this
                    Criteria.findOne().exec(function (err, doc) {

                        if (err) return cb(err);
                        if (doc) return cb();

                        var criteria = new Criteria();
                        criteria.operation = "ItemSearch";
                        criteria.searchIndex = "HomeGarden";
                        criteria.keywords = "pillow bamboo";
                        criteria.responseGroup = "ItemAttributes,BrowseNodes,Offers,VariationOffers,Images";
                        criteria.condition = "New";
                        criteria.minPrice = "100";
                        criteria.maxPrice = "5000";
                        criteria.date = new Date();
                        criteria.disabled = false;
                        criteria.save(cb);

                    });
                },

                function (cb) {

                    Criteria.find({'disabled': false}).sort("-lastRunDate").exec(function (err, docs) {

                        criterion = docs;
                        return cb(err);

                    });
                },

                function (cb) {

                    async.eachLimit(criterion, 1,

                        function (criteria, cbMain) {

                            //const
                            var step = 10;

                            var req = criteria.amazonCriteria;
                            var maximumPrice = criteria.maxPrice;
                            var delta = step;
                            var go = true;
                            var elapsed = new Date().getTime();

                            req.MinimumPrice = criteria.minPrice;
                            req.MaximumPrice = req.MinimumPrice;
                            req.VariationPage = 1;
                            req.IncludeReviewsSummary = true;

                            async.doWhilst(
                                function (callback) {

                                    async.series([

                                        // Throttle: wait until more than one second has passed
                                        // since the previous API response (one timer, no polling).
                                        function (cb2) {

                                            var wait = Math.max(1, elapsed + 1001 - new Date().getTime());
                                            setTimeout(function () {
                                                cb2();
                                            }, wait);
                                        },

                                        function (cb2) {

                                            // Page 1 means the previous window is done: move on.
                                            if (req.VariationPage === 1) {
                                                req.MinimumPrice = req.MaximumPrice + 1;
                                                req.MaximumPrice += delta;
                                            }

                                            console.log('Running ' + req.MinimumPrice + ' - ' + req.MaximumPrice );

                                            if (req.MaximumPrice >= maximumPrice) go = false;

                                            // JSON round trip strips the helper function from the request.
                                            opHelper.execute(criteria.operation, JSON.parse(JSON.stringify(req)),
                                                function (error, results) {

                                                    elapsed = new Date().getTime();

                                                    if (error) return cb2(error);

                                                    var response = results && results.ItemSearchResponse;
                                                    var result = (response && response.Items && response.Items.length > 0)
                                                        ? response.Items[0]
                                                        : null;

                                                    if (!result) {

                                                        // The error list sits on the response itself; going
                                                        // through the missing `Items` threw a TypeError.
                                                        if (response && response.Error && response.Error.length > 0) {
                                                            return cb2(response.Error[0].Message);
                                                        }

                                                        return cb2('fd up');
                                                    }

                                                    var _req = result.Request && result.Request[0];

                                                    if (_req && _req.IsValid && _req.IsValid[0] === "False") {

                                                        var msg = '';
                                                        _.each(_.pluck(_req.Errors, 'Error'), function (errors) {

                                                            if (errors && errors.length > 0)
                                                                msg += '\n' + errors[0].Message;

                                                        });
                                                        return cb2(msg);
                                                    }

                                                    //check for 0 results
                                                    var total = Number((result.TotalResults || [0])[0]);

                                                    if (total === 0) {
                                                        // Empty window: widen the next one.
                                                        delta = 5 * step;
                                                        return cb2();
                                                    }
                                                    else if (total < 10) {
                                                        delta = step;
                                                    }
                                                    else if (total > 10) {

                                                        var pages = Math.ceil(total / 10);

                                                        if (pages > 10) {
                                                            if (delta > 1) {
                                                                // Too many pages for the API: retry the same
                                                                // window start with the narrowest width.
                                                                req.MaximumPrice = req.MinimumPrice - 1;
                                                                req.VariationPage = 1;
                                                                delta = 1;//Math.ceil(step / 5);
                                                                return cb2();
                                                            }
                                                        } else {
                                                            req.VariationPage = req.VariationPage < pages ? (req.VariationPage + 1) : 1;
                                                        }
                                                    }

                                                    if (!result.Item || result.Item.length === 0) return cb2();

                                                    async.each(result.Item, function (item, callback2) {

                                                        syncAmazonItem(Product, item, callback2);

                                                    }, function (err) {

                                                        return cb2(err);

                                                    });
                                                });
                                        }

                                    ], function (err) {

                                        return callback(err);

                                    });
                                },

                                function () {

                                    return go;

                                },

                                function (err) {

                                    cbMain(err);

                                });

                        }, function (err) {

                            return cb(err);

                        });
                }

            ], function (err) {

                return finished(err);

            });
        }
    };
};

module.exports.amznscraper = new AmazonScraper();
/**
 * Application configuration assembled from environment variables.
 * Exposes the sections `mail`, `db`, `keystore` (Redis) and `logging`.
 */
var config = {};

/**
 * Reads a boolean flag from an environment value.
 * Unset or empty values yield `fallback`; "false", "0", "no" and "off"
 * (case-insensitive) yield false; anything else yields true.
 */
var parseBooleanEnv = function (value, fallback) {

    if (value === undefined || value === null || String(value).trim() === '') {
        return fallback;
    }
    return !/^(false|0|no|off)$/i.test(String(value).trim());
};

/**
 * Normalises the MailChimp API URL: trimmed string, or null when unset.
 * NOTE(review): the original helper was referenced but never defined, which
 * made this module throw on load; confirm no further parsing was intended.
 */
var parseMailApiUrl = function (value) {

    var url = String(value || '').trim();
    return url.length > 0 ? url : null;
};

/* Mail */
config.mail = {
    apikey: process.env.mailchimp || null,
    listid: process.env.mailchimp_listid || null,
    mailchimp: '',
    apiurl: parseMailApiUrl(process.env.mailchimp_api_url),
    // `env || true` could never be switched off: the string "false" is truthy.
    mailchimp_doubleoptin: parseBooleanEnv(process.env.mailchimp_doubleoptin, true),
    mailchimp_allowupdates: parseBooleanEnv(process.env.mailchimp_allowupdates, true)
};

/* MongoDb */
config.db = {
    databaseUsername: process.env.databaseUsername || '',
    databasePassword: process.env.databasePassword || '',
    databaseHost: process.env.databaseHost || 'localhost',
    databaseName: process.env.databaseName || 'oasis',
    databasePort: process.env.databasePort || 27017
};

// Credentials are percent-encoded so characters such as '@' or ':' in a
// password cannot break the connection string.
var credentials = (config.db.databaseUsername && config.db.databasePassword)
    ? encodeURIComponent(config.db.databaseUsername) + ':' +
      encodeURIComponent(config.db.databasePassword) + '@'
    : '';

// Replica-set members share ONE scheme, credential block and database:
// mongodb://user:pwd@host1:port1,host2:port2/db
var hosts = [config.db.databaseHost + ':' + config.db.databasePort];

['2', '3'].forEach(function (suffix) {

    var host = process.env['databaseHost' + suffix];
    var port = process.env['databasePort' + suffix];

    if (host && port) hosts.push(host + ':' + port);
});

config.db.databaseMongoUri = 'mongodb://' + credentials + hosts.join(',') +
    '/' + config.db.databaseName;

config.db.databaseUri = config.db.databaseMongoUri;

config.db.databaseConnectionTimeout = Number(process.env.databaseConnectionTimeout || 30000);

/* Redis */
config.keystore = {
    databaseHost: process.env.redisHost || 'localhost',
    databasePort: process.env.redisPort || 6379,
    databaseAuth: process.env.redisPrimaryKey || process.env.redisSecondaryKey || null
};

/* Logging */
config.logging = {
    filename: process.env.LOGGER_FILENAME,
    level: process.env.LOGGER_LEVEL,
    azure_account: process.env.LOGGER_AZURE_ACCOUNT,
    azure_key: process.env.LOGGER_AZURE_KEY,
    azure_table: process.env.LOGGER_AZURE_TABLE
};

module.exports = config;
/**
 * Produces a random identifier in the version-4 UUID layout
 * (`xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx`).
 *
 * Every `x` becomes a random hex digit and the `y` becomes one of
 * 8, 9, a or b. Randomness comes from `Math.random`, so the value is
 * suitable for file names but not for anything security-sensitive.
 *
 * @returns {string} 36-character lower-case identifier
 */
var generateGuid = function () {

    var template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx';
    var guid = '';

    for (var i = 0; i < template.length; i++) {

        var symbol = template.charAt(i);

        // Dashes and the fixed version digit are copied through untouched.
        if (symbol !== 'x' && symbol !== 'y') {
            guid += symbol;
            continue;
        }

        var nibble = Math.random() * 16 | 0;

        // The variant position keeps its two low bits and forces the high bits to 10.
        if (symbol === 'y') {
            nibble = nibble & 0x3 | 0x8;
        }

        guid += nibble.toString(16);
    }

    return guid;
};
/**
 * Image comparer. `run(finished)` compares every image in
 * `FILE_DROP/files/` against every image after it with OpenCV, then stores
 * one Match document per pair, linked to the products owning the two files.
 * `finished(err)` is called when the pass is complete or OpenCV is missing.
 */
var Imaging = function () {

    // Escapes regular-expression metacharacters so a file name matches literally.
    var escapeRegExp = function (text) {
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    };

    return {

        run: function (finished) {

            require(__dirname + '/../model');
            var Product = mongoose.models.Product;
            var Match = mongoose.models.Match;

            var dir = process.env.FILE_DROP + 'files/';
            var files = [];
            var matches = [];

            // The require was commented out, leaving `cv` undefined so every
            // comparison threw; load it and report a missing install up front.
            var cv;
            try {
                cv = require('opencv');
            } catch (e) {
                return finished(e);
            }

            // Reads an image, routing synchronous throws into the callback.
            var readImage = function (path, cbRead) {
                try {
                    cv.readImage(path, cbRead);
                } catch (e) {
                    cbRead(e);
                }
            };

            // Compares two images, routing synchronous throws into the callback.
            var compareImages = function (srcImg, dstImg, cbCompare) {
                try {
                    cv.ImageSimilarity(srcImg, dstImg, cbCompare);
                } catch (e) {
                    cbCompare(e);
                }
            };

            // Finds the product owning an image. Stored paths are absolute while
            // `file` is a bare name from readdir, hence the anchored suffix match.
            var findProductByImage = function (file, cbFound) {
                var key = new RegExp(escapeRegExp(file) + '$');
                Product.findOne({imageFiles: {$in: [key]}}).exec(cbFound);
            };

            async.series([

                // Stage 1: list the image directory.
                function (cb) {

                    files = [];
                    matches = [];

                    fs.readdir(dir, function (err, results) {

                        // Without `return` the callback fired twice on failure.
                        if (err) return cb(err);

                        files = results;
                        return cb();
                    });
                },

                // Stage 2: take the first file as source and compare it with the rest.
                function (cb) {

                    var go = true;

                    async.doWhilst(
                        function (callback) {

                            if (files.length === 0) {
                                //get out of here
                                go = false;
                                return callback();
                            }

                            var sourceFile = files.shift();
                            var stamp = new Date();

                            readImage(dir + sourceFile, function (err, srcImg) {

                                if (err) return callback(err);

                                async.eachLimit(files, 10, function (destFile, callback2) {

                                    readImage(dir + destFile, function (err, dstImg) {

                                        if (err) return callback2(err);

                                        compareImages(srcImg, dstImg, function (err, dissimilarity) {

                                            if (err) return callback2(err);

                                            // One Match per pair: a single shared document was
                                            // previously mutated by all concurrent comparisons.
                                            var match = new Match();
                                            match.sourceFile = sourceFile;
                                            match.destFile = destFile;
                                            match.date = stamp;
                                            match.sendMatchNotification = false;
                                            match.dissimilarity = dissimilarity;

                                            matches.push(match);
                                            return callback2();
                                        });
                                    });

                                }, function (err) {

                                    return callback(err);

                                });
                            });
                        },

                        function () {

                            return go;

                        },

                        function (err) {

                            cb(err);

                        });
                },

                // Stage 3: link each match to its two products and persist it.
                function (cb) {

                    async.each(matches, function (match, cbMatch) {

                        async.series([

                            function (cbSave) {

                                findProductByImage(match.sourceFile, function (err, src) {

                                    if (err) return cbSave(err);
                                    if (!src) return cbSave('No match for ' + match.sourceFile);

                                    match.externalIdSource = src.externalId;
                                    match.source = src._id;
                                    match.srcSource = src.source;
                                    match.title = src.title;
                                    match.srcUrl = src.url;

                                    return cbSave();
                                });
                            },

                            function (cbSave) {

                                findProductByImage(match.destFile, function (err, dst) {

                                    if (err) return cbSave(err);
                                    if (!dst) return cbSave('No match for ' + match.destFile);

                                    match.externalIdDest = dst.externalId;
                                    match.dest = dst._id;
                                    match.destSource = dst.source;
                                    match.destUrl = dst.url;

                                    return cbSave();
                                });
                            },

                            function (cbSave) {

                                // NOTE(review): read as a property, presumably a schema
                                // virtual on Match - confirm it is not a method.
                                match.hash = match.generateHash;

                                Match.count({hash: match.hash}, function (err, c) {

                                    // `true` is a skip signal: this match is already stored.
                                    if (c > 0) return cbSave(true);

                                    return cbSave(err);
                                });
                            },

                            function (cbSave) {

                                match.save(cbSave);
                            }

                        ], function (err) {

                            // Skips (`true` / 'No match for ...') are expected; real
                            // failures are logged but never abort the other matches.
                            if (err && err !== true && typeof err !== 'string') {
                                console.log(err);
                            }

                            return cbMatch();
                        });

                    }, function (err) {

                        return cb(err);

                    });
                }

            ], function (err) {

                return finished(err);

            });
        }
    };
};

module.exports.imaging = new Imaging();
process.env.DB_HOST + ':' + process.env.DB_PORT + '/' + process.env.DB_NAME; 22 | mongoose.connect(connectionString); 23 | 24 | var Product = mongoose.models.Product; 25 | var Match = mongoose.models.Match; 26 | var Criteria = mongoose.models.Criteria; 27 | 28 | 29 | spider.extensions = [ 30 | huntsman.extension( 'recurse' ), // load recurse extension & follow anchor links 31 | huntsman.extension( 'cheerio' ) // load cheerio extension 32 | ]; 33 | 34 | // run AE scraper 35 | 36 | aeScraper.run(); 37 | aeScraper.fetch(); 38 | 39 | var runAmzn = function(){ 40 | amznScraper.run(function(){ 41 | runAmzn(); 42 | }); 43 | }; 44 | runAmzn(); 45 | 46 | /*var runImaging = function(){ 47 | imaging.run(function(){ 48 | runImaging(); 49 | }); 50 | }; 51 | runImaging();*/ 52 | 53 | return; 54 | 55 | var dst, src; 56 | 57 | //compare pillows 58 | async.series([ 59 | 60 | function(cb){ 61 | 62 | Product.findOne({'externalId': '1000201992'}).exec(function(err, entity){ 63 | 64 | dst = entity; 65 | cb(err); 66 | 67 | }); 68 | 69 | }, 70 | 71 | function(cb){ 72 | 73 | Product.findOne({'externalId': 'B00EINBSJ2'}).exec(function(err, entity){ 74 | 75 | src = entity; 76 | cb(err); 77 | 78 | }); 79 | 80 | }, 81 | 82 | function(cb){ 83 | 84 | Match.generateMatch(src, dst, function(err, certainty){ 85 | 86 | if(err) return cb(err); 87 | 88 | }); 89 | 90 | } 91 | 92 | ], function(err){ 93 | 94 | console.log(err); 95 | 96 | }); -------------------------------------------------------------------------------- /model/criteria.js: -------------------------------------------------------------------------------- 1 | var mongoose = require('mongoose'), 2 | hashomatic = require('hash-o-matic'), 3 | _ = require('underscore'), 4 | async = require('async'), 5 | Schema = mongoose.Schema; 6 | 7 | 8 | function Criteria(){ 9 | 10 | var criteriaSchema = new Schema({ 11 | operation: String, // ItemLookup REQUIRED 12 | searchIndex: String,// Books 13 | keywords: String, // harry potter 14 | responseGroup: 
/**
 * Virtual `amazonCriteria`.
 *
 * Builds a plain object keyed by Amazon parameter names (SearchIndex,
 * Keywords, ResponseGroup) from the non-empty fields of this document.
 * itemId, merchantId and condition are deliberately not mapped.
 *
 * The returned object also carries a `clean()` method that deletes every own
 * key that is not on the whitelist of parameter names, including `clean`
 * itself.
 */
criteriaSchema.virtual('amazonCriteria').get(function () {

    var doc = this;
    var params = {};

    // [schema field, Amazon parameter name], copied in this order
    var mapping = [
        ['searchIndex', 'SearchIndex'],
        ['keywords', 'Keywords'],
        ['responseGroup', 'ResponseGroup']
    ];

    _.each(mapping, function (pair) {

        var value = doc[pair[0]];

        if (value && value.length > 0) {
            params[pair[1]] = value;
        }

    });

    params.clean = function () {

        var target = this;
        var allowed = ['SearchIndex', 'Keywords', 'ResponseGroup', 'MinimumPrice', 'MaximumPrice', 'VariationPage'];

        Object.keys(target).forEach(function (name) {

            if (allowed.indexOf(name) < 0) {
                delete target[name];
            }

        });

    };

    return params;
});
var mongoose = require('mongoose'),
    hashomatic = require('hash-o-matic'),
    //cv = require('opencv'),
    async = require('async'),
    request = require('request').defaults({encoding: null}),
    fs = require('fs'),
    Schema = mongoose.Schema;


/**
 * Generates a random version-4 style GUID string, used for image file names.
 * Based on Math.random(), so it is not suitable for security purposes.
 *
 * @returns {String} e.g. "3f2b8c1e-9a4d-4c7b-8e21-5d6f7a8b9c0d"
 */
var generateGuid = function(){

    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function(c) {
        var r = Math.random()*16|0, v = c === 'x' ? r : (r&0x3|0x8);
        return v.toString(16);
    });
};

/**
 * Downloads every image URL into FILE_DROP/files/<guid>.jpg, records the files
 * that were written in product.imageFiles and saves the product when at least
 * one file was stored.
 *
 * Downloading is best effort: a failed request or write is logged and skipped,
 * it does not fail the whole batch.
 *
 * @param {Object}   product mongoose Product document (imageFiles is reset)
 * @param {String[]} urls    image URLs to download
 * @param {Function} done    node-style callback, called once
 */
var downloadImages = function(product, urls, done){

    //clear array
    product.imageFiles = [];

    async.each(urls, function(url, next){

        var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
        var writeStream = fs.createWriteStream(file);
        var settled = false;

        // A stream may emit 'error' and then 'close'; async.each requires the
        // iterator callback to be invoked exactly once.
        var settle = function(stored){
            if(settled) return;
            settled = true;
            if(stored) product.imageFiles.push(file);
            return next();
        };

        writeStream.on('close', function() {
            settle(true);
        });

        writeStream.on('error', function(err) {
            console.log('failed to write ' + file + ': ' + err);
            settle(false);
        });

        request(url)
            .on('error', function(err) {
                // Without this handler a network failure is an unhandled
                // 'error' event and the write stream never closes.
                console.log('failed to download ' + url + ': ' + err);
                settle(false);
                writeStream.end();
            })
            .pipe(writeStream);

    }, function(err){

        if(err) return done(err);

        if(product.imageFiles && product.imageFiles.length > 0){

            product.save(done);

        } else {

            return done();

        }

    });
};

function Match(){

    var matchSchema = new Schema({
        externalIdSource: String,
        externalIdDest: String,
        sourceFile: String,
        destFile: String,
        source: Schema.ObjectId,
        dest: Schema.ObjectId,
        destSource: String,
        srcSource: String,
        destUrl: String,
        srcUrl: String,
        title: String,
        hash: String,
        dissimilarity: Number,
        date: Date,
        sendMatchNotification: Boolean,
        sendNotificationDate: Date

    });

    /**
     * Virtual `generateHash`: hash over the identifying fields of the match.
     */
    matchSchema.virtual('generateHash').get(function(){

        var self = this;

        return hashomatic.hash({
            "1": self.externalIdSource,
            "2": self.externalIdDest,
            "3": self.sourceFile,
            "4": self.destFile,
            "5": self.destSource,
            "6": self.srcSource,
            "7": self.destUrl,
            "8": self.srcUrl,
            "9": self.title
        }, true, true);
    });

    /**
     * Builds and saves a Match for two products after downloading their images.
     *
     * The OpenCV comparison is currently disabled (the `cv` require above is
     * commented out), so `dissimilarity` keeps its initial value of 9999.
     *
     * @param {Object}   src  source Product document
     * @param {Object}   dst  destination Product document
     * @param {Function} func callback(err, certainty); called with no
     *                        arguments when src or dst is missing
     */
    matchSchema.statics.generateMatch = function(src, dst, func) {

        if(!src || !dst){
            return func();
        }

        var $this = new this();

        $this.externalIdSource = src.externalId;
        $this.externalIdDest = dst.externalId;
        $this.source = src._id;
        $this.dest = dst._id;
        $this.date = new Date();
        $this.dissimilarity = 9999;
        $this.sendMatchNotification = false;

        var srcImages = src.getImages;
        var dstImages = dst.getImages;

        $this.hash = hashomatic.hash({
            "1": src.externalId,
            "2": dst.externalId,
            "3": srcImages,
            "4": dstImages
        },true,true);

        var srcFiles = [], dstFiles = [];

        async.series([

            function(cb) {

                downloadImages(src, srcImages, cb);

            },

            function(cb) {

                downloadImages(dst, dstImages, cb);

            },

            function(cb){

                async.each(src.imageFiles, function(image, callback){

                    /*cv.readImage(image, function(err, img) {

                        srcFiles.push(img);
                        callback();

                    });*/

                    // OpenCV is disabled: complete the iteration, otherwise the
                    // series stalls as soon as one image has been downloaded.
                    return callback();

                },cb);

            },

            function(cb){

                async.each(dst.imageFiles, function(image, callback){

                    /*cv.readImage(image, function(err, img) {

                        dstFiles.push(img);
                        callback();

                    });*/

                    // See above: keep the series moving while OpenCV is disabled.
                    return callback();

                },cb);

            },

            function(cb){

                async.each(srcFiles, function(image, callback){

                    var srcImage = image;

                    async.each(dstFiles, function(image, callback2){

                        var dstImage = image;

                        /* cv.ImageSimilarity(srcImage, dstImage, function (err, dissimilarity) {
                            if (err) throw err;

                            if(dissimilarity < $this.dissimilarity) {
                                $this.dissimilarity = dissimilarity;
                            }

                            return callback2();
                        });*/

                        // See above: keep the series moving while OpenCV is disabled.
                        return callback2();

                    }, callback);

                }, function (err) {

                    if(err) return cb(err);

                    return $this.save(cb);

                });

            }

        ], function(err){

            if(err) return func(err);

            // Parenthesised: 'Match ' + x > 80 compared the concatenated
            // string with 80 and always logged a bare `false`.
            // NOTE(review): `certainty` is not a schema path and is never
            // assigned in this file, so it is presumably always undefined.
            console.log('Match ' + ($this.certainty > 80));

            return func(null, $this.certainty);

        });


    };

    return mongoose.model('Match', matchSchema);

}

module.exports = new Match();
var mongoose = require('mongoose'),
    hashomatic = require('hash-o-matic'),
    _ = require('underscore'),
    Schema = mongoose.Schema;

/**
 * Defines and registers the `Product` mongoose model, which stores products
 * built from AliExpress data, from scraped Amazon pages and from Amazon API
 * responses.
 */
function Product(){

    // Returns the first element of an array-like value, or undefined. The
    // Amazon API payload wraps every value in an array and omits whole
    // sections depending on the requested ResponseGroup.
    var first = function(list){
        return (list && list.length > 0) ? list[0] : undefined;
    };

    var freightSchema = new Schema({
        origin: String, //sendGoodsCountryFullName
        originISO: String, //sendGoodsCountry
        domesticFreight: String, //domesticFreight
        totalFreight: String, //totalFreight
        currencyCode: String, //currency
        discountPercentage: Number, //discount
        totalDiscount: String, //saveMoney
        shippingPeriod: String, //time
        actualPrice: String, //price
        discountType: String, //discountType
        commitDay: String, //commitDay
        shipperCode: String, //serviceName
        shipperName: String //company
    },{ _id: false });

    var propertySchema = new Schema({
        key: String,
        value: String
    },{ _id: false });

    var skuSchema = new Schema({
        externalId: String, //skuProducts@key
        isActivity: Boolean, //isActiviy
        price: String, //skuPrice
        inventoryCount: Number, //count
        images: [String], //skuPropertyImageSummPath
        properties: [propertySchema],
        /*type: String, //skuPropertyName
        value: String, //propertyValueName*/
        hash: String //hash
    },{ _id: false });

    var sellerSchema = new Schema({
        availability: String,
        isExpedited: Boolean,
        price: String,
        location: String,
        sellerStoreUrl: String,
        sellerRating: String,
        shippingPrice: String
    },{ _id: false });

    var productSchema = new Schema({
        externalId: String, //productId
        title: String, //title
        source: String, // AE
        images: [String], //imageUrls
        matrices: [String], //opencv matrices
        added: Date, // new Date()
        url: String, // generate
        items: [skuSchema],
        freight: [freightSchema],
        unit: String, //displayUnit
        price: String, //displayPrice[1]
        currencyCode: String, //displayPrice[0]
        isFreeShipping: Boolean, //isFreeShipping
        isItemOffline: Boolean, //isItemOffline
        targetCountry: String, //userCountryName
        targetCountryCode: String, //userCountryCode
        hash: String, //hash,
        newNotificationDate: Date,
        sendUpdatedNotification: Boolean,
        updatedNotificationDate: Date,
        previousProduct: Schema.Types.Mixed,
        imageFiles:[String],
        imageData: [String],
        inventoryCount: Number,

        sellers:[sellerSchema],
        offerUrl: String,
        isScraped: Boolean,
        relatedLinks: [String],
        keywords: [String],
        sellerCount: Number,
        salesRank: Number,
        group: String,
        category: String,
        handlingDays: Number,
        isFba: Boolean,
        dump: String,

        reviews: Number,
        brand: String,
        properties: [propertySchema],

        // Assigned by buildAmazonApiProduct; without a schema path these
        // values were not persisted.
        hasReviews: Boolean,
        merchant: String

    });

    /**
     * Virtual `getImages`: the product's own image URLs followed by the
     * non-empty image URLs of all of its SKU items.
     *
     * Returns a fresh plain array. The getter must not push into `this.images`,
     * otherwise every access would append the SKU images to the persisted field.
     */
    productSchema.virtual('getImages').get(function(){

        var images = [];

        if(Array.isArray(this.images)) {
            for(var i = 0; i < this.images.length; i++){
                images.push(this.images[i]);
            }
        }

        var items = this.items || [];

        for(var x = 0; x < items.length; x++){

            var itemImages = items[x].images || [];

            // was `item.images.lenght`, so this loop never ran
            for(var y = 0; y < itemImages.length; y++) {

                var image = itemImages[y];

                if(image && image.length > 0){

                    images.push(image);

                }

            }
        }

        return images;

    });

    /*
     Sample of the scraped object passed to buildAmazonProduct:

     uri: 'http://www.amazon.com/gp/product/B00Z9MOZ14/ref=s9_ri_bw_g23_i20/183-5518956-5356440?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-6&pf_rd_r=09797QFH4GCN7EQ8X6B9&pf_rd_t=101&pf_rd_p=2130231282&pf_rd_i=1055398',
     title: 'Peakeep Digital Alarm Clock Battery Operated with Dual Alarms and Snooze Function - Travel Alarm Clock and Home Alarm Clock - Optional Weekday Alarm Mode and Sensor Light',
     reviewCount: '19 customer reviews',
     brand: 'Peakeep',
     priceBlock: '$15.99',
     properties:
     [ { key: 'Brand Name', value: 'Peakeep' },
     { key: 'Model Number', value: 'MHP3112B' },
     { key: 'Product Dimensions',
     value: '5.2 x 1.8 x 3.1 inches ; 7 ounces' },
     { key: 'Shipping Weight',
     value: '7.2 ounces (View shipping rates and policies)' },
     { key: 'ASIN', value: 'B00Z9MOZ14' },
     { key: 'Item model number', value: 'MHP3112B' },
     { key: 'Date first available at Amazon.com',
     value: 'June 9, 2015' } ],
     images:
     [ 'http://ecx.images-amazon.com/images/I/51xAF2EnacL.jpg',
     'http://ecx.images-amazon.com/images/I/51CGEEwepEL.jpg',
     'http://ecx.images-amazon.com/images/I/51K317w6HKL.jpg',
     'http://ecx.images-amazon.com/images/I/41ad3l-jYQL.jpg',
     'http://ecx.images-amazon.com/images/I/41VTK2%2BWVSL.jpg',
     'http://ecx.images-amazon.com/images/I/41xKv1k6xnL.jpg' ] }
     */

    /**
     * Builds an unsaved Product from a scraped Amazon product page.
     *
     * @param {Object}   obj scraped page data (see the sample above)
     * @param {Function} cb  callback(err, product); called with no arguments
     *                       when obj is falsy, and with an error string when
     *                       no product id can be parsed from obj.uri
     */
    productSchema.statics.buildAmazonProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var $this = new this();

        $this.url = obj.uri;
        $this.title = obj.title;
        $this.brand = obj.brand;

        // The price block may be absent from the scraped data; leave price unset.
        if(obj.priceBlock){
            $this.price = obj.priceBlock.replace('$','');
        }

        if(obj.reviewCount){
            // Accepts "19 customer reviews" as well as "1,234 customer reviews";
            // the previous pattern required a comma and skipped counts below 1,000.
            var test = obj.reviewCount.match(/\d[\d,]*/);
            if(test){
                $this.reviews = Number(test[0].replace(/,/g, ''));
            }
        }

        var images = obj.images || [];
        for(var x = 0; x < images.length; x++){

            $this.images.push(images[x]);

        }

        var properties = obj.properties || [];
        for(var y = 0; y < properties.length; y++){

            $this.properties.push(properties[y]);

        }

        // A missing uri is reported through cb instead of throwing.
        var id = ($this.url || '').match(/\/product\/\w+/);

        if(!id){
            return cb('Unable to parse ID from url ' + $this.url);
        }

        $this.externalId = id[0].replace('/product/', '').replace(/\//gmi, '');
        $this.source = 'AMZN';
        $this.source += ($this.url.match(/\.com/)) ? '-US' : '-CA';
        $this.added = new Date();
        $this.sendUpdatedNotification = false;

        $this.hash = hashomatic.hash({
            "price": $this.price
        }, true, true);

        return cb(null, $this);
    };

    /**
     * Builds an unsaved Product from an AliExpress product object.
     *
     * @param {Object}   obj AliExpress product data
     * @param {Function} cb  callback(err, product); called with no arguments
     *                       when obj is falsy
     */
    productSchema.statics.buildAliExpressProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var $this = new this();

        $this.externalId = obj.productId;
        $this.title = obj.title;
        $this.source = 'AE';
        $this.images = obj.imageUrls;
        $this.added = new Date();
        $this.url = 'http://www.aliexpress.com/item/-/'+obj.productId+'.html';
        $this.unit = obj.displayUnit;
        $this.isFreeShipping = obj.isFreeShipping;
        $this.isItemOffline = obj.isItemOffline;
        $this.targetCountry = obj.userCountryName;
        $this.targetCountryCode = obj.userCountryCode;
        $this.sendUpdatedNotification = false;

        // displayPrice is split into "<currency> <price>"
        var split = (obj.displayPrice || '').split(" ");
        if(split.length > 1) {
            $this.price = split[1].replace('$','');
            $this.currencyCode = split[0];
        }

        $this.hash = hashomatic.hash({
            "displayPrice": obj.displayPrice
        }, true, true);

        var skuProducts = obj.skuProducts || [];
        var skuPropertyList = obj.skuPropertyList || [];

        for(var x = 0; x < skuProducts.length; x++){

            var item = skuProducts[x];
            var sku = item.skuPropIds;
            var detail = item[sku];

            if(!detail) continue;

            // For a combined id ("a,b") only the first property id is used
            if(sku.match(/,/)) sku = sku.split(",")[0];

            var meat = {
                externalId: sku,
                isActivity: detail.isActivity,
                price: detail.skuPrice,
                inventoryCount: detail.count,
                images: [],
                properties:[]
            };

            for(var y = 0; y < skuPropertyList.length; y++){
                var property = skuPropertyList[y];
                var values = property.skuPropertyValues || [];

                for(var z = 0; z < values.length; z++){
                    var value = values[z];

                    // Loose comparison on purpose: `sku` is a string here
                    if(value.propertyValueId == sku) {

                        if(value.skuPropertyImageSummPath){
                            meat.images.push(value.skuPropertyImageSummPath);
                        }
                        meat.properties.push({
                            key: property.skuPropertyName,
                            value: value.propertyValueName
                        });
                        break;
                    }
                }
            }

            $this.items.push(meat);

        }

        var freightItems = obj.freightItems || [];

        for(var f = 0; f < freightItems.length; f++) {

            var freight = freightItems[f];

            $this.freight.push({

                origin: freight.sendGoodsCountryFullName,
                originISO: freight.sendGoodsCountry,
                domesticFreight: freight.domesticFreight,
                totalFreight: freight.totalFreight,
                currencyCode: freight.currency,
                discountPercentage: freight.discount,
                totalDiscount: freight.saveMoney,
                shippingPeriod: freight.time,
                actualPrice: freight.price,
                discountType: freight.discountType,
                commitDay: freight.commitDay,
                shipperCode: freight.serviceName,
                shipperName: freight.company

            });

        }

        return cb(null, $this);
    };

    /**
     * Builds an unsaved Product from one item of an Amazon API response, in
     * which every value is wrapped in an array.
     *
     * @param {Object}   obj Amazon API item
     * @param {Function} cb  callback(err, product); called with no arguments
     *                       when obj is falsy, and with an error string when
     *                       ASIN, DetailPageURL or ItemAttributes is missing
     */
    productSchema.statics.buildAmazonApiProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var attr = first(obj.ItemAttributes);

        // These were dereferenced unconditionally and threw on a partial item.
        if(!first(obj.ASIN) || !first(obj.DetailPageURL) || !attr){
            return cb('Unable to build Amazon product: ASIN, DetailPageURL or ItemAttributes missing');
        }

        var $this = new this();

        $this.dump = JSON.stringify(obj,null,0);

        $this.source = 'AMZN';
        $this.isScraped = false;
        //$this.source += ($this.url.match(/\.com/)) ? '-US' : '-CA';
        $this.added = new Date();
        $this.sendUpdatedNotification = false;
        $this.externalId = obj.ASIN[0];
        $this.url = obj.DetailPageURL[0];

        if(obj.ImageSets && obj.ImageSets.length >0) {

            _.each(obj.ImageSets[0].ImageSet, function (imageset) {

                var medium = first(imageset.MediumImage);
                var url = medium && first(medium.URL);

                if(url) $this.images.push(url);

            });

        }

        if(obj.SimilarProducts && obj.SimilarProducts.length > 0) {
            var similar = obj.SimilarProducts[0].SimilarProduct;

            _.each(similar, function (compare) {

                $this.keywords.push(compare.Title[0]);
                $this.relatedLinks.push('http://www.amazon.com/gp/product/'+compare.ASIN[0]);

            });
        }

        $this.group = first(attr.ProductGroup);

        $this.category = first(attr.Binding || attr.ProductGroup);
        $this.brand = (attr.Brand || ['UNKNOWN'])[0];

        $this.properties.push({key: 'dump', value: JSON.stringify(attr)});

        if(attr.Color && attr.Color.length > 0)
            $this.properties.push({key:'color', value: attr.Color[0]});

        if(attr.EAN && attr.EAN.length > 0)
            $this.properties.push({key:'EAN', value: attr.EAN[0]});

        $this.title = first(attr.Title);

        _.each(attr.Feature, function(feature){

            var goods = String(feature);
            var colon = goods.indexOf(':');

            if(colon >= 0){

                // Split on the first colon only, so "Size: 10:30" keeps
                // " 10:30"; split(':') dropped everything after the second colon.
                $this.properties.push({key: goods.slice(0, colon), value: goods.slice(colon + 1)});

            } else {
                $this.keywords.push(goods);
            }

        });
        if(attr.Label && attr.Label.length > 0) $this.keywords.push(attr.Label[0]);
        if(attr.Brand && attr.Brand.length > 0) $this.keywords.push(attr.Brand[0]);
        if(attr.Manufacturer && attr.Manufacturer.length > 0) $this.keywords.push(attr.Manufacturer[0]);
        if(attr.Title && attr.Title.length > 0) $this.keywords.push(attr.Title[0]);

        if(obj.SalesRank && obj.SalesRank.length > 0)
            $this.salesRank = Number(obj.SalesRank[0]);

        var offersummary = first(obj.OfferSummary);

        if(offersummary) {

            var lowest = first(offersummary.LowestNewPrice);

            if(lowest) {
                // Amount is divided by 100 to obtain the stored price
                $this.price = lowest.Amount[0] / 100;
                $this.currencyCode = lowest.CurrencyCode[0];
            }

            $this.sellerCount = first(offersummary.TotalNew);
        }

        var reviews = first(obj.CustomerReviews);

        if(reviews) {
            $this.hasReviews = first(reviews.HasReviews) === "true";
        }

        var offers = first(obj.Offers);

        if(offers && offers.Offer && offers.Offer.length > 0) {

            var offer = first(offers.Offer[0].OfferListing);
            var merchant = first(offers.Offer[0].Merchant);

            if(merchant) {
                $this.merchant = first(merchant.Name);
            }

            var moreOffers = first(offers.MoreOffersUrl);
            var offerUrl = moreOffers ? String(moreOffers).match(/http:\/\/[a-z0-9_-].+?%/i) : null;

            if (offerUrl) {
                $this.offerUrl = offerUrl[0].replace('%', '');
            }

            if(offer) {

                var availability = first(offer.AvailabilityAttributes);

                if(availability) {
                    $this.handlingDays = Number(availability.MaximumHours) / 24;
                }

                // Loose comparison on purpose: the API delivers the flags as strings
                $this.isFba = first(offer.IsEligibleForPrime) == 1 || first(offer.IsEligibleForSuperSaverShipping) == 1;
            }
        }

        $this.hash = hashomatic.hash({
            "price": $this.price,
            "isFba": $this.isFba,
            "sellerCount": $this.sellerCount
        }, true, true);

        return cb(null, $this);
    };


    return mongoose.model('Product', productSchema);

}

module.exports = new Product();
Millar" 22 | }, 23 | "dependencies": { 24 | "apac": "^1.0.0", 25 | "async": "latest", 26 | "dotenv": "^1.2.0", 27 | "hash-o-matic": "^0.1.3", 28 | "huntsman": "^0.2.12", 29 | "mongoose": "^4.1.3", 30 | "nedb": "^1.1.3", 31 | "nodemailer": "^1.4.0", 32 | "opencv": "^3.2.0", 33 | "request": "^2.61.0", 34 | "underscore": "latest", 35 | "x-ray": "latest" 36 | }, 37 | "devDependencies": { 38 | "mocha": "~1.18.2", 39 | "should": "~3.3.1" 40 | } 41 | } 42 | --------------------------------------------------------------------------------