├── .env
├── .gitignore
├── LICENSE
├── README.md
├── lib
├── aliexpress.js
├── amazon.js
├── config.js
├── imaging.js
└── index.js
├── model
├── criteria.js
├── index.js
├── match.js
└── product.js
├── package.json
└── test
└── sample.html
/.env:
--------------------------------------------------------------------------------
1 | NODE_ENV=development
2 |
3 | FILE_DROP=/Users/tmillar/dev/repo/geek-scrape/
4 |
5 | DB_USER=admin
6 | DB_PWD=password
7 | DB_HOST=localhost
8 | DB_PORT=27017
9 | DB_NAME=compare
10 | databaseConnectionTimeout=300000
11 |
12 | EMAIL_SERVICE=gmail
13 | EMAIL_USER=
14 | EMAIL_PWD=
15 |
16 | AWS_USER=
17 | AWS_KEY=
18 | AWS_SECRET=
19 | AWS_ASSOCIATE=
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env.production
2 |
3 | # Created by .ignore support plugin (hsz.mobi)
4 | ### Node template
5 | # Logs
6 | logs
7 | *.log
8 |
9 | # Images
10 | files
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 |
17 | # Directory for instrumented libs generated by jscoverage/JSCover
18 | lib-cov
19 |
20 | # Coverage directory used by tools like istanbul
21 | coverage
22 |
23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
24 | .grunt
25 |
26 | # node-waf configuration
27 | .lock-wscript
28 |
29 | # Compiled binary addons (http://nodejs.org/api/addons.html)
30 | build/Release
31 |
32 | # Dependency directory
33 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git-
34 | node_modules
35 |
36 |
37 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
38 |
39 | *.iml
40 |
41 | ## Directory-based project format:
42 | .idea/
43 | # if you remove the above rule, at least ignore the following:
44 |
45 | # User-specific stuff:
46 | # .idea/workspace.xml
47 | # .idea/tasks.xml
48 | # .idea/dictionaries
49 |
50 | # Sensitive or high-churn files:
51 | # .idea/dataSources.ids
52 | # .idea/dataSources.xml
53 | # .idea/sqlDataSources.xml
54 | # .idea/dynamic.xml
55 | # .idea/uiDesigner.xml
56 |
57 | # Gradle:
58 | # .idea/gradle.xml
59 | # .idea/libraries
60 |
61 | # Mongo Explorer plugin:
62 | # .idea/mongoSettings.xml
63 |
64 | ## File-based project format:
65 | *.ipr
66 | *.iws
67 |
68 | ## Plugin-specific files:
69 |
70 | # IntelliJ
71 | /out/
72 |
73 | # mpeltonen/sbt-idea plugin
74 | .idea_modules/
75 |
76 | # JIRA plugin
77 | atlassian-ide-plugin.xml
78 |
79 | # Crashlytics plugin (for Android Studio and IntelliJ)
80 | com_crashlytics_export_strings.xml
81 | crashlytics.properties
82 | crashlytics-build.properties
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 trentm
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenCV Product Comparer
2 | Want to be a killer Amazon seller? Well, so did I. This is a crude, rude, slow, and disk-heavy crawler to find you the unicorn product to sell on Amazon. It uses OpenCV and scrapes products off both Amazon & AliExpress in the hope of finding products you can make lots of money selling. Good luck!
3 |
4 | ### Purpose
5 | My wife wanted to sell some products on Amazon, I thought I would spend the weekend building this to help.
6 |
7 | What is this? This tool populates a MongoDB collection, "categories", containing all the official categories used in Amazon's database. You then define the category in which you want to find products with a good profit margin. Using Amazon's Product API, it pages through products and downloads images of each product. At the same time, it scrapes images matching certain criteria off AliExpress. OpenCV then tries to find the cheapest version of the Amazon product for sale on AliExpress. A "Match" document is created that shows the degree of similarity between products.
8 |
9 | Ideally you will find products on Amazon that are also for sale on AliExpress, likely at a price that makes it profitable to buy them in bulk and then sell on Amazon.
10 |
11 | ### TODO
12 | Lots, including forking processes to make it run concurrently.
13 |
14 | ### History
15 | I created this back in early 2015 and wanted to archive it before cleaning it off my MBP.
16 |
17 | ### How To Use
18 | - You will need MongoDB either local or hosted.
19 | - Make sure you have a client & secret key for Amazon's Product API (I think the name may have changed, and I am not sure Amazon still exposes this service as expected).
20 | - If running locally, you will need OpenCV in your PATH. I tested on 2.4.11 with success and 3.0.0 with some success. Sorry, I haven't run this in years, so I am going off recollection.
21 | - Lastly, since OpenCV is not a service, the comparer will download images from both Amazon & AliExpress locally and compare them.
22 |
--------------------------------------------------------------------------------
/lib/aliexpress.js:
--------------------------------------------------------------------------------
1 | require('dotenv').load();
2 |
3 | var x = require('x-ray')(),
4 | hashomatic = require('hash-o-matic'),
5 | async = require('async'),
6 | _ = require('underscore'),
7 | nodemailer = require('nodemailer'),
8 | transporter = nodemailer.createTransport({
9 | service: process.env.EMAIL_SERVICE,
10 | auth: {
11 | user: process.env.EMAIL_USER,
12 | pass: process.env.EMAIL_PWD
13 | }
14 | }),
15 | mongoose = require('mongoose'),
16 | request = require('request').defaults({encoding: null}),
17 | fs = require('fs');
18 |
19 | require(__dirname + '/../model');
20 |
/**
 * Reports a scraper failure described by an error descriptor.
 *
 * @param {Object} err - descriptor carrying a numeric `key` and the `id` of
 *   the product being processed. Callers in this file use key 404 when a
 *   page has no product data and key 500 when the data could not be parsed.
 *   Falsy values are ignored.
 */
var handleError = function (err) {

    if (!err) {
        return;
    }

    switch (err.key) {

        case 404:
            // Missing products are expected while walking the id range; stay quiet.
            break;

        case 500:
            console.log('Product failed to parse: ' + err.id);
            break;

        default:
            // Anything without a recognised key used to vanish silently.
            console.log('Unhandled scraper error: ' + (err.message || err.key || err));
    }
};
42 |
/**
 * Builds a random identifier laid out like a version-4 UUID
 * (xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx).
 *
 * The digits come from Math.random(), so the value is only suitable for
 * things like unique file names, not for anything security sensitive.
 *
 * @returns {string} 36-character lowercase hexadecimal identifier
 */
var generateGuid = function () {

    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
        var r = Math.random() * 16 | 0;
        // 'x' takes any hex digit; 'y' is forced into the range 8..b.
        var v = c === 'x' ? r : (r & 0x3 | 0x8);
        return v.toString(16);
    });
};
50 |
/**
 * Replaces the locally stored images of a product with fresh downloads.
 *
 * Step 1 deletes every path listed in `entity.imageFiles`. Step 2 downloads
 * each URL in `entity.images` to `<FILE_DROP>files/<guid>.jpg` and records
 * the new paths in `entity.imageFiles`.
 *
 * @param {Object} obj - product entity exposing `images` and `imageFiles`
 * @param {Function} cb - called as cb(err, entity); the entity is passed
 *   back even when a step failed
 */
var fetchImages = function (obj, cb) {

    var entity = obj;

    // Deletes one stale image; a file that is already gone is not a failure.
    var removeFile = function (file, done) {
        fs.unlink(file, function (err) {
            if (err && err.code !== 'ENOENT') {
                return done(err);
            }
            return done();
        });
    };

    // Streams one remote image to disk and records its path on the entity.
    var downloadImage = function (image, done) {

        var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
        var writeStream = fs.createWriteStream(file);
        var settled = false;

        // A stream can emit 'error' and then 'close'; only the first outcome
        // may reach the async callback.
        var finish = function (err) {
            if (settled) {
                return;
            }
            settled = true;
            if (err) {
                console.log('failed');
                return done('failed');
            }
            entity.imageFiles.push(file);
            return done();
        };

        writeStream.on('close', function () {
            finish();
        });

        writeStream.on('error', function (e) {
            finish(e);
        });

        request(image)
            .on('error', function (e) {
                // Without this handler a failed download is an unhandled
                // 'error' event and the callback is never invoked.
                finish(e);
                writeStream.end();
            })
            .pipe(writeStream);
    };

    async.series([

        function (callback) {
            async.each(entity.imageFiles || [], removeFile, callback);
        },

        function (callback) {
            // Start from an empty list; downloadImage appends as files complete.
            entity.imageFiles = [];
            async.each(entity.images || [], downloadImage, callback);
        }

    ], function (err) {

        cb(err, entity);

    });
};
107 |
/**
 * Factory for the AliExpress scraper.
 *
 * Returns an object with two entry points:
 *  - fetch(): repeatedly downloads images for stored 'AE' products that are
 *    not yet flagged `isScraped`.
 *  - run(): walks AliExpress mobile item pages by increasing numeric id,
 *    stores new or changed products and mails a summary when the walk ends.
 *
 * Declared with `var` so that loading the module no longer depends on
 * creating an implicit global.
 */
var AliExpressScraper = function () {

    // Pause before polling again when a pass failed or found nothing to do,
    // so an idle worker does not hammer the database.
    var IDLE_DELAY_MS = 5000;

    // Escapes text taken from scraped pages before it is placed in the mail body.
    var escapeHtml = function (value) {
        return String(value)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;');
    };

    // Renders collected {title, message, link} records as HTML list items.
    var renderList = function (products) {
        return _.map(products, function (product) {
            return '<li><a href="' + escapeHtml(product.link) + '">' +
                escapeHtml(product.title) + ' - ' + escapeHtml(product.message) +
                '</a></li>';
        }).join('');
    };

    // Extracts the `runParams` object literal embedded in an item page.
    // Throws when the literal cannot be located or evaluated.
    var parseRunParams = function (page) {
        var begin = 'var runParams =';
        var partial = page.substring(page.indexOf(begin) + begin.length);
        var javascript = partial.substring(0, partial.indexOf('};') + 1);

        //clean
        javascript = javascript.replace('// sku', '');

        // SECURITY: this evaluates text downloaded from a remote site, so a
        // hostile or compromised page can run arbitrary code in this process.
        // Kept because the payload is a JavaScript literal rather than JSON;
        // it should be replaced by a real parser or a sandbox.
        return eval('(' + javascript + ')');
    };

    var scraper = {

        /**
         * Downloads images for up to 1000 unscraped 'AE' products, five at a
         * time, marks them scraped, then schedules the next pass.
         */
        fetch: function () {

            var Product = mongoose.models.Product;
            var entities = [];

            async.series([

                function (cb) {

                    Product.find({
                        'source': 'AE',
                        $or: [ { "isScraped": {"$exists": false} }, { "isScraped": false } ]})
                        .limit(1000).exec(function (err, docs) {

                            entities = docs || [];
                            return cb(err);

                        });

                },

                function (cb) {

                    async.eachLimit(entities, 5, function (entity, cbMain) {

                        fetchImages(entity, function (err, result) {

                            if (err) {
                                console.log('Image fetch failed for ' + entity.externalId + ': ' + err);
                            }

                            // Flagged even after a failed download so a broken
                            // image URL cannot keep the product in the queue forever.
                            result.isScraped = true;
                            result.save(cbMain);
                        });

                    }, cb);
                }

            ], function (err) {

                if (err) {
                    console.log('fetch pass failed: ' + err);
                }

                if (err || entities.length === 0) {
                    return setTimeout(scraper.fetch, IDLE_DELAY_MS);
                }

                scraper.fetch();

            });

        },

        /**
         * Crawls item pages starting after the highest stored 'AE' external id
         * and mails a summary of new and updated products when the crawl ends.
         */
        run: function () {

            var Product = mongoose.models.Product;

            var mailOptions = {
                // NOTE(review): the address part of the sender was missing from the
                // original literal; the authenticated mailbox is assumed - confirm.
                from: 'AE Parser <' + process.env.EMAIL_USER + '>',
                to: 'geek.envy.amazon@gmail.com, trent.millar@gmail.com',
                subject: 'NEW PRODUCTS',
                html: '<h1>New Products - ' + new Date().toDateString() + '</h1>'
            };

            var updatedProducts = [];
            var newProducts = [];

            /*
             http://m.aliexpress.com/item/2005197291.html
             http://m.aliexpress.com/item-desc/2005197291.html
             http://m.aliexpress.com/getSiteProductEvaluation.htm?productId=32266892657
             */
            var params = {
                range: [1000357604 /*2005197290*/, 9999999999]
            };

            async.series([

                function (cb) {

                    //get highest AE id
                    Product.findOne({'source': 'AE'}).limit(1).sort('-externalId').exec(function (err, doc) {

                        if (err) return cb(err);

                        // range[0] is incremented before every request, so storing the
                        // highest known id resumes the crawl at the id right after it.
                        var highest = doc ? Number(doc.externalId) : NaN;
                        if (!isNaN(highest)) {
                            params.range[0] = highest;
                        }
                        cb();

                    });

                },

                function (cb) {

                    async.doWhilst(
                        function (callback) {

                            var id = ++params.range[0];
                            var url = 'http://m.aliexpress.com/item/' + id + '.html';

                            x(url, {
                                'page': 'body',
                                'title': 'p.ms-detail-subject'
                            })(function (err, obj) {

                                if (!obj || !obj.page) return callback();

                                var page = obj.page;

                                if (!page.match(/runParams/gi)) {
                                    handleError({
                                        "key": 404,
                                        "id": id
                                    });
                                    return callback();
                                }

                                var json;
                                try {
                                    json = parseRunParams(page);
                                    json.title = obj.title;
                                    json.hash = hashomatic.hash({
                                        "displayPrice": json.displayPrice
                                    }, true, true);
                                } catch (e) {
                                    // A page that cannot be parsed is reported and skipped.
                                    handleError({
                                        "key": 500,
                                        "id": id
                                    });
                                    return callback();
                                }

                                Product.findOne({"externalId": json.productId}, function (err, doc) {

                                    if (err) return callback(err);

                                    //exists and unchanged
                                    if (doc && doc.hash === json.hash) return callback();

                                    Product.buildAliExpressProduct(json, function (err, entity) {

                                        if (err || !entity) {
                                            handleError({
                                                "key": 500,
                                                "id": id
                                            });
                                            return callback();
                                        }

                                        if (doc) {

                                            updatedProducts.push({
                                                'message': doc.price + ' now ' + json.price,
                                                'title': json.title,
                                                'link': url
                                            });

                                            entity.previousProduct = doc;
                                            return entity.save(function (err) {

                                                // Keep the old record when the replacement
                                                // could not be stored.
                                                if (err) return callback(err);
                                                Product.remove({_id: doc._id}, callback);

                                            });
                                        }

                                        newProducts.push({
                                            'message': json.displayPrice,
                                            'title': json.title,
                                            'link': url
                                        });

                                        fetchImages(entity, function (err, result) {

                                            if (err) {
                                                console.log('Image fetch failed for ' + id + ': ' + err);
                                            }

                                            result.isScraped = true;
                                            return result.save(callback);

                                        });

                                    });

                                });

                            });
                        },

                        function () {

                            return params.range[0] < params.range[1];

                        },

                        function (err) {

                            // The list markup is built directly; the placeholders the
                            // original replaced were no longer present in the template.
                            mailOptions.html +=
                                '<h2>Updated</h2><ul>' + renderList(updatedProducts) + '</ul>' +
                                '<h2>New</h2><ul>' + renderList(newProducts) + '</ul>';

                            transporter.sendMail(mailOptions, function (error, info) {
                                if (error) {
                                    console.log(error);
                                } else {
                                    console.log('Message sent: ' + info.response);
                                }

                                return cb(err);

                            });
                        });
                }

            ], function (err) {

                console.log('complete ' + err);

            });

        }
    };

    return scraper;
};
353 |
354 | module.exports.aescraper = new AliExpressScraper();
--------------------------------------------------------------------------------
/lib/amazon.js:
--------------------------------------------------------------------------------
1 | require('dotenv').load();
2 |
3 | var x = require('x-ray')(),
4 | async = require('async'),
5 | _ = require('underscore'),
6 | util = require('util'),
7 | OperationHelper = require('apac').OperationHelper,
8 | opHelper = new OperationHelper({
9 | awsId: process.env.AWS_KEY,
10 | awsSecret: process.env.AWS_SECRET,
11 | assocId: process.env.AWS_ASSOCIATE,
12 | version: '2013-08-01'
13 | }),
14 | mongoose = require('mongoose'),
15 | request = require('request').defaults({encoding: null}),
16 | fs = require('fs');
17 |
18 |
/**
 * Builds a random identifier laid out like a version-4 UUID
 * (xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx).
 *
 * The digits come from Math.random(), so the value is only suitable for
 * things like unique file names, not for anything security sensitive.
 *
 * @returns {string} 36-character lowercase hexadecimal identifier
 */
var generateGuid = function () {

    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
        var r = Math.random() * 16 | 0;
        // 'x' takes any hex digit; 'y' is forced into the range 8..b.
        var v = c === 'x' ? r : (r & 0x3 | 0x8);
        return v.toString(16);
    });
};
26 |
/**
 * Replaces the locally stored images of a product with fresh downloads.
 *
 * Step 1 deletes every path listed in `entity.imageFiles`. Step 2 downloads
 * each URL in `entity.images` to `<FILE_DROP>files/<guid>.jpg` and records
 * the new paths in `entity.imageFiles`.
 *
 * @param {Object} obj - product entity exposing `images` and `imageFiles`
 * @param {Function} cb - called as cb(err, entity); the entity is passed
 *   back even when a step failed
 */
var fetchImages = function (obj, cb) {

    var entity = obj;

    // Deletes one stale image; a file that is already gone is not a failure.
    var removeFile = function (file, done) {
        fs.unlink(file, function (err) {
            if (err && err.code !== 'ENOENT') {
                return done(err);
            }
            return done();
        });
    };

    // Streams one remote image to disk and records its path on the entity.
    var downloadImage = function (image, done) {

        var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
        var writeStream = fs.createWriteStream(file);
        var settled = false;

        // A stream can emit 'error' and then 'close'; only the first outcome
        // may reach the async callback.
        var finish = function (err) {
            if (settled) {
                return;
            }
            settled = true;
            if (err) {
                console.log('failed');
                return done('failed');
            }
            entity.imageFiles.push(file);
            return done();
        };

        writeStream.on('close', function () {
            finish();
        });

        writeStream.on('error', function (e) {
            finish(e);
        });

        request(image)
            .on('error', function (e) {
                // Without this handler a failed download is an unhandled
                // 'error' event and the callback is never invoked.
                finish(e);
                writeStream.end();
            })
            .pipe(writeStream);
    };

    async.series([

        function (callback) {
            async.each(entity.imageFiles || [], removeFile, callback);
        },

        function (callback) {
            // Start from an empty list; downloadImage appends as files complete.
            entity.imageFiles = [];
            async.each(entity.images || [], downloadImage, callback);
        }

    ], function (err) {

        cb(err, entity);

    });
};
83 |
/**
 * Factory for the Amazon scraper.
 *
 * The returned object exposes run(finished). For every enabled Criteria
 * document it queries the product API through `opHelper` in small price
 * windows, stores each returned item as a Product (downloading its images)
 * and scrapes the offer-listing page of new or changed products for sellers.
 *
 * Declared with `var` so that loading the module no longer depends on
 * creating an implicit global.
 */
var AmazonScraper = function () {

    // Default width of the price window moved across [minPrice, maxPrice].
    var PRICE_STEP = 10;

    // Minimum pause between the end of one API call and the start of the next.
    var MIN_REQUEST_INTERVAL_MS = 1000;

    // Turns one `.olpOffer` element (cheerio selection) into a seller record.
    // Returns null for offers whose condition text does not mention "new".
    var parseOffer = function (offer) {

        // The condition is read from this offer, not from every offer on the page.
        if (!/new/i.test(offer.find('.olpCondition').text())) {
            return null;
        }

        var delivery = offer.find('.olpDeliveryColumn ul li span').text();
        var shipping = offer.find('p.olpShippingInfo span').text().trim();
        var availability = offer.find('.olpAvailability').text().trim();
        var price = offer.find('span.olpOfferPrice').text().trim().replace('$', '');
        var location = 'UNKNOWN';
        var sellerRating = 'NEW';

        // The original tested a variable that was still undefined at this point.
        // NOTE(review): the delivery column is assumed to carry this notice - confirm.
        var isExpedited = /Expedited shipping available/i.test(delivery);

        var rating = offer.find('.olpSellerColumn b').text().match(/\d+%/);
        if (rating) {
            sellerRating = rating[0].replace('%', '');
        }

        var sellerLink = offer.find('.olpSellerColumn p.olpSellerName a').attr('href');
        if (sellerLink && !/^https?:\/\//i.test(sellerLink)) {
            sellerLink = 'http://www.amazon.com' + sellerLink;
        }

        var shipsFrom = delivery.match(/ships from\D+?\./i);
        if (shipsFrom) {
            location = shipsFrom[0].replace(/ships from /i, '').replace('.', '');

            if (location.indexOf(',') !== -1) {
                // Keep only the last comma-separated part.
                var parts = location.split(',');
                location = parts[parts.length - 1].trim();
            }
        }

        var shippingPrice = shipping.match(/\$[0-9]+\.[0-9]+/);
        if (shippingPrice) {
            shipping = shippingPrice[0].replace('$', '');
        } else if (/free/i.test(shipping)) {
            shipping = 0;
        } else {
            shipping = null;
        }

        return {
            availability: availability,
            isExpedited: isExpedited,
            price: price,
            location: location,
            sellerStoreUrl: sellerLink,
            sellerRating: sellerRating,
            shippingPrice: shipping
        };
    };

    // Loads entity.offerUrl and fills entity.sellers from the offers found there.
    var scrapeSellers = function (entity, done) {

        var huntsman = require('huntsman');
        var spider = huntsman.spider();
        var url = entity.offerUrl;

        spider.extensions = [
            huntsman.extension( 'cheerio' ) // load cheerio extension
        ];

        spider.on(url, function (err, res) {

            // A failed or non-HTML response must still complete the step;
            // returning without calling back stalled the whole pipeline.
            if (err || !res || !res.extension || !res.extension.cheerio) {
                return done();
            }

            var $ = res.extension.cheerio;

            entity.sellers = [];

            $('.olpOffer').each(function () {
                var seller = parseOffer($(this));
                if (seller) {
                    entity.sellers.push(seller);
                }
            });

            return done();

        });

        spider.queue.add(url);
        spider.start();
    };

    // Builds a Product from one API item and stores it when it is new or changed.
    var storeItem = function (Product, item, done) {

        var entity;
        var ignore = false;

        async.series([

            function (cbBuild) {

                Product.buildAmazonApiProduct(item, function (err, doc) {

                    entity = doc;
                    return cbBuild(err || (doc ? null : 'Unable to build product'));

                });
            },

            function (cbBuild) {

                Product.findOne({"externalId": entity.externalId}, function (err, doc) {

                    if (err) return cbBuild(err);

                    // Compare before touching `entity`: the original overwrote it
                    // with `doc` first, so a change could never be detected.
                    if (doc && doc.hash === entity.hash) {
                        entity = doc;
                        ignore = true;
                        return cbBuild();
                    }

                    if (doc) {
                        entity.previousProduct = doc;
                    }

                    fetchImages(entity, function (err, result) {

                        if (err) console.log('Err ' + err);

                        result.save(function (err, saved) {

                            if (err) {
                                console.log('Err ' + err);
                                return cbBuild(err);
                            }

                            entity = saved;

                            if (!doc) return cbBuild();

                            // The superseded record goes only after its replacement is stored.
                            Product.remove({_id: doc._id}, cbBuild);
                        });
                    });
                });

            },

            function (cbBuild) {

                if (ignore) return cbBuild();
                if (!entity.offerUrl) return cbBuild();

                scrapeSellers(entity, cbBuild);

            },

            function (cbBuild) {

                entity.save(cbBuild);
            }

        ], function (err) {

            done(err);
        });
    };

    // Walks the price range of one criteria document window by window.
    var runCriteria = function (Product, criteria, done) {

        var req = criteria.amazonCriteria;
        // Number() keeps the window arithmetic numeric even if prices are stored as text.
        var maximumPrice = Number(criteria.maxPrice);
        var delta = PRICE_STEP;
        var lastResponseAt = Date.now();

        req.MinimumPrice = Number(criteria.minPrice);
        req.MaximumPrice = req.MinimumPrice;
        // NOTE(review): paging is driven through VariationPage as in the original;
        // confirm this is the page parameter the configured operation honours.
        req.VariationPage = 1;
        req.IncludeReviewsSummary = true;

        async.doWhilst(
            function (callback) {

                async.series([

                    function (cb2) {

                        // One timer for the remaining wait instead of polling every millisecond.
                        var remaining = lastResponseAt + MIN_REQUEST_INTERVAL_MS - Date.now();
                        setTimeout(function () {
                            cb2();
                        }, Math.max(0, remaining) + 1);

                    },

                    function (cb2) {

                        // Page 1 means the previous window is finished: move to the next one.
                        if (req.VariationPage === 1) {
                            req.MinimumPrice = req.MaximumPrice + 1;
                            req.MaximumPrice += delta;
                        }

                        console.log('Running ' + req.MinimumPrice + ' - ' + req.MaximumPrice );

                        opHelper.execute(criteria.operation, JSON.parse(JSON.stringify(req)),
                            function (error, results) {

                                lastResponseAt = Date.now();

                                if (error) return cb2(error);

                                var response = results && results.ItemSearchResponse;
                                var result = (response && response.Items && response.Items.length > 0)
                                    ? response.Items[0]
                                    : null;

                                if (!result) {

                                    // The error list sits on the response itself; the original
                                    // dereferenced it through the missing Items property.
                                    if (response && response.Error && response.Error.length > 0) {
                                        return cb2(response.Error[0].Message);
                                    }

                                    return cb2('fd up');
                                }

                                var echoed = result.Request && result.Request[0];

                                if (echoed && echoed.IsValid && echoed.IsValid[0] === "False") {

                                    var msg = '';
                                    _.each(_.pluck(echoed.Errors, 'Error'), function (error) {

                                        if (error && error.length > 0)
                                            msg += '\n' + error[0].Message;

                                    });
                                    return cb2(msg);

                                }

                                //check for 0 results
                                var total = result.TotalResults ? Number(result.TotalResults[0]) : 0;

                                if (total === 0) {
                                    // Nothing in this window: take a bigger stride next time.
                                    delta = 5 * PRICE_STEP;
                                    return cb2();
                                }
                                else if (total < 10) {
                                    delta = PRICE_STEP;
                                }
                                else if (total > 10) {

                                    var pages = Math.ceil(total / 10);

                                    if (pages > 10) {
                                        if (delta > 1) {
                                            // Too many pages: rewind and retry with a window of width 1.
                                            req.MaximumPrice = req.MinimumPrice - 1;
                                            req.VariationPage = 1;
                                            delta = 1;
                                            return cb2();
                                        }
                                    } else {
                                        req.VariationPage = req.VariationPage < pages ? (req.VariationPage + 1) : 1;
                                    }
                                }

                                var items = result.Item || [];
                                if (items.length === 0) return cb2();

                                async.each(items, function (item, callback2) {

                                    storeItem(Product, item, callback2);

                                }, function (err) {

                                    return cb2(err);

                                });

                            });

                    }
                ], function (err) {

                    return callback(err);

                });

            },

            function () {

                // Evaluated after each request so that a rewound window or pending
                // pages keep the loop alive; the original decided before the
                // request and could stop with work outstanding.
                return req.VariationPage !== 1 || req.MaximumPrice < maximumPrice;

            },

            function (err) {

                done(err);

            });
    };

    return {

        /**
         * Runs every enabled criteria, one at a time.
         *
         * @param {Function} finished - called as finished(err) when all
         *   criteria are processed or the first error occurs
         */
        run: function (finished) {

            require(__dirname + '/../model');
            var Product = mongoose.models.Product;
            var Criteria = mongoose.models.Criteria;
            var criterion = [];

            async.series([

                function (cb) {

                    //if nothing then fake one - remove this
                    Criteria.findOne().exec(function (err, doc) {

                        if (err) return cb(err);
                        if (doc) return cb();

                        var criteria = new Criteria();
                        criteria.operation = "ItemSearch";
                        criteria.searchIndex = "HomeGarden";
                        criteria.keywords = "pillow bamboo";
                        criteria.responseGroup = "ItemAttributes,BrowseNodes,Offers,VariationOffers,Images";
                        criteria.condition = "New";
                        criteria.minPrice = "100";
                        criteria.maxPrice = "5000";
                        criteria.date = new Date();
                        criteria.disabled = false;
                        criteria.save(cb);

                    });

                },

                function (cb) {

                    Criteria.find({'disabled': false}).sort("-lastRunDate").exec(function (err, docs) {

                        criterion = docs || [];
                        return cb(err);

                    });

                },

                function (cb) {

                    async.eachLimit(criterion, 1, function (criteria, cbMain) {

                        runCriteria(Product, criteria, cbMain);

                    }, function (err) {

                        return cb(err);

                    });
                }

            ], function (err) {

                return finished(err);

            });

        }
    };
};
469 |
470 | module.exports.amznscraper = new AmazonScraper();
--------------------------------------------------------------------------------
/lib/config.js:
--------------------------------------------------------------------------------
/**
 * Application configuration assembled from environment variables.
 *
 * Sections: `mail` (MailChimp), `db` (MongoDB), `keystore` (Redis) and
 * `logging`. Unset variables fall back to the defaults written below.
 */
var config = {};

/**
 * Interprets an environment flag as a boolean.
 *
 * @param {string|undefined} value - raw environment value
 * @param {boolean} fallback - result when the variable is unset or empty
 * @returns {boolean} false for "false", "0", "no" or "off" (any case), true
 *   for every other non-empty value
 */
function parseBoolean(value, fallback) {
    if (value === undefined || value === null || value === '') {
        return fallback;
    }
    return !/^(false|0|no|off)$/i.test(String(value).trim());
}

/**
 * Returns the configured mail API URL, or null when none is set.
 *
 * NOTE(review): the original script called this function without defining
 * it, which throws as soon as the module is loaded. This minimal version
 * only trims the value - confirm whether more normalisation was intended.
 *
 * @param {string|undefined} url - raw environment value
 * @returns {string|null}
 */
function parseMailApiUrl(url) {
    if (!url) {
        return null;
    }
    return String(url).trim();
}

/**
 * Builds the MongoDB connection string.
 *
 * Extra replica-set members are added as a comma-separated host list inside
 * one URI (mongodb://user:pwd@h1:p1,h2:p2/db); the previous code appended a
 * second complete "mongodb://" URI per member, which is not a valid
 * connection string.
 *
 * @param {Object} db - databaseUsername, databasePassword, databaseHost,
 *   databasePort and databaseName
 * @param {Object} env - environment providing the optional databaseHost2,
 *   databasePort2, databaseHost3 and databasePort3
 * @returns {string}
 */
function buildMongoUri(db, env) {
    var credentials = (db.databaseUsername && db.databasePassword)
        ? db.databaseUsername + ':' + db.databasePassword + '@'
        : '';

    var hosts = [db.databaseHost + ':' + db.databasePort];

    if (env.databaseHost2 && env.databasePort2) {
        hosts.push(env.databaseHost2 + ':' + env.databasePort2);
    }

    if (env.databaseHost3 && env.databasePort3) {
        hosts.push(env.databaseHost3 + ':' + env.databasePort3);
    }

    return 'mongodb://' + credentials + hosts.join(',') + '/' + db.databaseName;
}

/* Mail */
config.mail = {
    apikey: process.env.mailchimp || null,
    listid: process.env.mailchimp_listid || null,
    mailchimp: '',
    apiurl: parseMailApiUrl(process.env.mailchimp_api_url),
    // `value || true` could never yield false, so these flags could not be disabled.
    mailchimp_doubleoptin: parseBoolean(process.env.mailchimp_doubleoptin, true),
    mailchimp_allowupdates: parseBoolean(process.env.mailchimp_allowupdates, true)
};

/* MongoDb */
config.db = {
    databaseUsername: process.env.databaseUsername || '',
    databasePassword: process.env.databasePassword || '',
    databaseHost: process.env.databaseHost || 'localhost',
    databaseName: process.env.databaseName || 'oasis',
    databasePort: process.env.databasePort || 27017
};

config.db.databaseMongoUri = buildMongoUri(config.db, process.env);

config.db.databaseUri = config.db.databaseMongoUri;

config.db.databaseConnectionTimeout = Number(process.env.databaseConnectionTimeout || 30000);

/* Redis */
config.keystore = {
    databaseHost: process.env.redisHost || 'localhost',
    databasePort: process.env.redisPort || 6379,
    databaseAuth: process.env.redisPrimaryKey || process.env.redisSecondaryKey || null
};

/* Logging */
config.logging = {
    filename: process.env.LOGGER_FILENAME,
    level: process.env.LOGGER_LEVEL,
    azure_account: process.env.LOGGER_AZURE_ACCOUNT,
    azure_key: process.env.LOGGER_AZURE_KEY,
    azure_table: process.env.LOGGER_AZURE_TABLE
};
84 |
85 | module.exports = config;
--------------------------------------------------------------------------------
/lib/imaging.js:
--------------------------------------------------------------------------------
1 | require('dotenv').load();
2 |
3 | var async = require('async'),
4 | _ = require('underscore'),
5 | util = require('util'),
6 | mongoose = require('mongoose'),
7 | hashomatic = require('hash-o-matic'),
8 | fs = require('fs');
9 |
10 |
/**
 * Builds a random identifier laid out like a version-4 UUID
 * (xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx).
 *
 * The digits come from Math.random(), so the value is only suitable for
 * things like unique file names, not for anything security sensitive.
 *
 * @returns {string} 36-character lowercase hexadecimal identifier
 */
var generateGuid = function () {

    return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, function (c) {
        var r = Math.random() * 16 | 0;
        // 'x' takes any hex digit; 'y' is forced into the range 8..b.
        var v = c === 'x' ? r : (r & 0x3 | 0x8);
        return v.toString(16);
    });
};
18 |
/**
 * Factory for the image comparer.
 *
 * The returned object exposes run(finished), which compares every image in
 * `<FILE_DROP>files/` with every image listed after it using OpenCV, links
 * each pair to the Products owning the two files and stores one Match
 * document per pair that has not been stored before.
 *
 * Declared with `var` so that loading the module no longer depends on
 * creating an implicit global.
 */
var Imaging = function () {

    // Escapes a file name so it can be embedded in a regular expression.
    var escapeRegExp = function (text) {
        return String(text).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    };

    return {

        /**
         * @param {Function} finished - called as finished(err) when the
         *   comparison is complete or cannot proceed
         */
        run: function (finished) {

            require(__dirname + '/../model');
            var Product = mongoose.models.Product;
            var Match = mongoose.models.Match;

            var dir = process.env.FILE_DROP + 'files/';
            var files = [];
            var matches = [];

            // The require was commented out, leaving `cv` undefined so that
            // every comparison failed; a missing module is now reported here.
            var cv;
            try {
                cv = require('opencv');
            } catch (e) {
                return finished(e);
            }

            // cv.readImage may throw synchronously; route that to the callback.
            var readImage = function (path, done) {
                try {
                    cv.readImage(path, done);
                } catch (e) {
                    done(e);
                }
            };

            // Same treatment for the similarity call.
            var compareImages = function (srcImg, dstImg, done) {
                try {
                    cv.ImageSimilarity(srcImg, dstImg, done);
                } catch (e) {
                    done(e);
                }
            };

            // Finds the Product whose imageFiles contains a path ending in `file`.
            // The original query used the regex literal /match.sourceFile/, which
            // matches that literal text rather than the file name.
            var findProductByImage = function (file, done) {
                var key = new RegExp(escapeRegExp(file) + '$', 'i');
                Product.findOne({imageFiles: {$in: [key]}}).exec(done);
            };

            async.series([

                function (cb) {

                    fs.readdir(dir, function (err, results) {

                        // `return` added: the callback used to fire twice on error.
                        if (err) return cb(err);

                        files = results;
                        return cb();
                    });

                },

                function (cb) {

                    var go = true;

                    async.doWhilst(
                        function (callback) {

                            if (files.length === 0) {
                                //get out of here
                                go = false;
                                return callback();
                            }

                            // Each file is compared with the files still in the list,
                            // so every unordered pair is visited once.
                            var sourceFile = files.shift();

                            readImage(dir + sourceFile, function (err, srcImg) {

                                if (err) return callback(err);

                                async.eachLimit(files, 10, function (destFile, callback2) {

                                    readImage(dir + destFile, function (err, dstImg) {

                                        if (err) return callback2(err);

                                        compareImages(srcImg, dstImg, function (err, dissimilarity) {

                                            if (err) return callback2(err);

                                            // One document per pair; the original reused a
                                            // single shared object for every pair.
                                            var match = new Match();
                                            match.sourceFile = sourceFile;
                                            match.destFile = destFile;
                                            match.date = new Date();
                                            match.sendMatchNotification = false;
                                            match.dissimilarity = dissimilarity;
                                            matches.push(match);

                                            return callback2();
                                        });
                                    });

                                }, function (err) {

                                    return callback(err);

                                });
                            });

                        },

                        function () {

                            return go;

                        },

                        function (err) {

                            cb(err);

                        });
                },

                function (cb) {

                    async.eachLimit(matches, 10, function (match, cbMatch) {

                        async.series([

                            function (cbSave) {

                                findProductByImage(match.sourceFile, function (err, src) {

                                    if (err) return cbSave(err);
                                    if (!src) return cbSave('No match for ' + match.sourceFile);

                                    match.externalIdSource = src.externalId;
                                    match.source = src._id;
                                    match.srcSource = src.source;
                                    match.title = src.title;
                                    match.srcUrl = src.url;

                                    return cbSave();
                                });
                            },

                            function (cbSave) {

                                findProductByImage(match.destFile, function (err, dst) {

                                    if (err) return cbSave(err);
                                    if (!dst) return cbSave('No match for ' + match.destFile);

                                    match.externalIdDest = dst.externalId;
                                    match.dest = dst._id;
                                    match.destSource = dst.source;
                                    match.destUrl = dst.url;

                                    return cbSave();
                                });
                            },

                            function (cbSave) {

                                // NOTE(review): the Match model is not visible here, so
                                // generateHash is accepted as either a getter or a method.
                                match.hash = (typeof match.generateHash === 'function')
                                    ? match.generateHash()
                                    : match.generateHash;

                                Match.count({hash: match.hash}, function (err, c) {

                                    if (err) return cbSave(err);

                                    // `true` is a sentinel: already stored, skip the save.
                                    if (c > 0) return cbSave(true);

                                    return cbSave();
                                });

                            },

                            function (cbSave) {

                                match.save(cbSave);

                            }

                        ], function (err) {

                            // One unresolved or duplicate pair must not stop the
                            // remaining ones; real problems are at least reported.
                            if (err && err !== true) {
                                console.log(err);
                            }

                            return cbMatch();

                        });

                    }, function (err) {

                        return cb(err);

                    });

                }

            ], function (err) {

                return finished(err);

            });

        }
    };
};
270 |
271 | module.exports.imaging = new Imaging();
--------------------------------------------------------------------------------
/lib/index.js:
--------------------------------------------------------------------------------
// Load the variables from .env into process.env before anything else reads them.
// NOTE: process.env must never be dumped to the log - it carries the database
// password and the AWS key/secret configured in .env.
require('dotenv').load();
4 |
5 | var x = require('x-ray')(),
6 | async = require('async'),
7 | _ = require('underscore'),
8 | mongoose = require('mongoose'),
9 | util = require('util'),
10 | huntsman = require('huntsman'),
11 | spider = huntsman.spider(),
12 | request = require('request').defaults({encoding: null}),
13 | fs = require('fs'),
14 | imaging = require(__dirname + '/imaging.js').imaging,
15 | aeScraper = require(__dirname + '/aliexpress.js').aescraper,
16 | amznScraper = require(__dirname + '/amazon.js').amznscraper;
17 |
// Register the mongoose models (Product, Match, Criteria) as a side effect.
require(__dirname + '/../model');

// Build the MongoDB URI from the environment. User name and password are
// percent-encoded so that reserved characters (@ : / ? # %) in a credential
// cannot corrupt the connection string.
var connectionString = 'mongodb://' +
    encodeURIComponent(process.env.DB_USER) + ':' +
    encodeURIComponent(process.env.DB_PWD) + '@' +
    process.env.DB_HOST + ':' + process.env.DB_PORT + '/' + process.env.DB_NAME;
mongoose.connect(connectionString);

// Handles to the models registered by ../model.
var Product = mongoose.models.Product;
var Match = mongoose.models.Match;
var Criteria = mongoose.models.Criteria;


spider.extensions = [
    huntsman.extension( 'recurse' ), // load recurse extension & follow anchor links
    huntsman.extension( 'cheerio' ) // load cheerio extension
];
33 |
// Kick off the AliExpress scraper.
aeScraper.run();
aeScraper.fetch();

// The Amazon scraper runs in an endless loop: every time a run reports
// completion through its callback, the next run is started.
function runAmzn(){
    amznScraper.run(function(){
        runAmzn();
    });
}
runAmzn();

// The equivalent endless loop for the imaging step is currently switched off:
/*var runImaging = function(){
    imaging.run(function(){
        runImaging();
    });
};
runImaging();*/
52 |
53 | return;
54 |
55 | var dst, src;
56 |
57 | //compare pillows
58 | async.series([
59 |
60 | function(cb){
61 |
62 | Product.findOne({'externalId': '1000201992'}).exec(function(err, entity){
63 |
64 | dst = entity;
65 | cb(err);
66 |
67 | });
68 |
69 | },
70 |
71 | function(cb){
72 |
73 | Product.findOne({'externalId': 'B00EINBSJ2'}).exec(function(err, entity){
74 |
75 | src = entity;
76 | cb(err);
77 |
78 | });
79 |
80 | },
81 |
82 | function(cb){
83 |
84 | Match.generateMatch(src, dst, function(err, certainty){
85 |
86 | if(err) return cb(err);
87 |
88 | });
89 |
90 | }
91 |
92 | ], function(err){
93 |
94 | console.log(err);
95 |
96 | });
--------------------------------------------------------------------------------
/model/criteria.js:
--------------------------------------------------------------------------------
1 | var mongoose = require('mongoose'),
2 | hashomatic = require('hash-o-matic'),
3 | _ = require('underscore'),
4 | async = require('async'),
5 | Schema = mongoose.Schema;
6 |
7 |
/**
 * Declares the search-criteria schema and registers it with mongoose.
 *
 * @returns {Function} the compiled `Criteria` mongoose model
 */
function Criteria(){

    var criteriaSchema = new Schema({
        operation: String,      // ItemLookup REQUIRED
        searchIndex: String,    // Books
        keywords: String,       // harry potter
        responseGroup: String,  // ItemAttributes,Offers [http://docs.aws.amazon.com/AWSECommerceService/latest/DG/CHAP_ResponseGroupsList.html]
        itemId: String,         // ASIN
        merchantId: String,     // Amazon (ONLY VALUE ALLOWED)
        condition: String,      // All, New, Used

        minPrice: Number,
        maxPrice: Number,

        date: Date,
        disabled: Boolean,
        lastRunDate: Date

    });

    /**
     * Virtual `amazonCriteria`: a plain object holding the non-empty search
     * fields under their Amazon parameter names, plus a `clean()` helper.
     */
    criteriaSchema.virtual('amazonCriteria').get(function(){

        var doc = this;
        var criteria = {};

        // document field -> request parameter; a field is copied only when
        // it is set and non-empty
        var mappings = [
            ['searchIndex', 'SearchIndex'],
            ['keywords', 'Keywords'],
            ['responseGroup', 'ResponseGroup']
        ];

        for(var i = 0; i < mappings.length; i++){

            var value = doc[mappings[i][0]];

            if(value && value.length > 0){
                criteria[mappings[i][1]] = value;
            }
        }

        // itemId (ItemId), merchantId (MerchantId) and condition (Condition)
        // are currently not forwarded.
        // Note, if condition then must search for item id

        /**
         * Deletes every own key of the criteria object that is not a
         * whitelisted request parameter (this includes `clean` itself).
         */
        criteria.clean = function(){

            var target = this;
            var allowed = ['SearchIndex', 'Keywords', 'ResponseGroup', 'MinimumPrice', 'MaximumPrice', 'VariationPage'];

            Object.keys(target).forEach(function(name){

                if(allowed.indexOf(name) === -1){
                    delete target[name];
                }

            });

        };

        return criteria;
    });

    return mongoose.model('Criteria', criteriaSchema);

}
69 |
70 | module.exports = new Criteria();
--------------------------------------------------------------------------------
/model/index.js:
--------------------------------------------------------------------------------
1 | require(__dirname + '/product');
2 | require(__dirname + '/match');
3 | require(__dirname + '/criteria');
--------------------------------------------------------------------------------
/model/match.js:
--------------------------------------------------------------------------------
1 | var mongoose = require('mongoose'),
2 | hashomatic = require('hash-o-matic'),
3 | //cv = require('opencv'),
4 | async = require('async'),
5 | request = require('request').defaults({encoding: null}),
6 | fs = require('fs'),
7 | Schema = mongoose.Schema;
8 |
9 |
/**
 * Produces a random identifier in the textual layout of a version-4 UUID
 * (8-4-4-4-12 lowercase hex digits). Based on Math.random, so it is not
 * suitable where cryptographic randomness is required.
 *
 * @returns {string} the generated identifier
 */
var generateGuid = function(){

    var template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx';

    return template.replace(/[xy]/g, function(placeholder){

        // one random hex digit (0..15) per placeholder
        var nibble = Math.floor(Math.random() * 16);

        // 'y' marks the variant digit: force its top two bits to binary 10
        if(placeholder !== 'x'){
            nibble = (nibble & 0x3) | 0x8;
        }

        return nibble.toString(16);
    });
};
17 |
/**
 * Declares the match schema (one comparison between a source and a
 * destination product) and registers it with mongoose.
 *
 * @returns {Function} the compiled `Match` mongoose model
 */
function Match(){

    var matchSchema = new Schema({
        externalIdSource: String,
        externalIdDest: String,
        sourceFile: String,
        destFile: String,
        source: Schema.ObjectId,
        dest: Schema.ObjectId,
        destSource: String,
        srcSource: String,
        destUrl: String,
        srcUrl: String,
        title: String,
        hash: String,
        dissimilarity: Number,
        date: Date,
        sendMatchNotification: Boolean,
        sendNotificationDate: Date

    });

    /**
     * Virtual `generateHash`: hash over the identifying fields of the match
     * (ids, files, sources, urls and title).
     */
    matchSchema.virtual('generateHash').get(function(){

        var self = this;

        return hashomatic.hash({
            "1": self.externalIdSource,
            "2": self.externalIdDest,
            "3": self.sourceFile,
            "4": self.destFile,
            "5": self.destSource,
            "6": self.srcSource,
            "7": self.destUrl,
            "8": self.srcUrl,
            "9": self.title
        }, true, true);
    });

    /**
     * Downloads every URL in `images` to FILE_DROP/files/<guid>.jpg, records
     * the files that were written in `product.imageFiles` and saves the
     * product when at least one file was stored.
     *
     * Downloads are best-effort: a failed image is logged and skipped, it
     * does not abort the whole batch.
     *
     * @param {Object} product - product document; its imageFiles are replaced
     * @param {string[]} images - image URLs to download
     * @param {Function} done - node-style callback
     */
    var downloadImages = function(product, images, done){

        // Always re-download. (A shortcut that skipped the download when
        // imageFiles.length already equals images.length is switched off.)
        product.imageFiles = [];

        async.each(images, function(image, callback){

            var file = process.env.FILE_DROP + 'files/' + generateGuid() + '.jpg';
            var settled = false;

            // Reports exactly once, whichever stream event fires first;
            // async throws when an iterator callback is invoked twice.
            var settle = function(stored){

                if(settled) return;
                settled = true;

                if(stored) product.imageFiles.push(file);

                return callback();
            };

            var writeStream = fs.createWriteStream(file);

            writeStream.on('close', function(){
                settle(true);
            });

            writeStream.on('error', function(){
                console.log('failed');
                settle(false);
            });

            var download = request(image);

            // Without this handler a network error is an unhandled 'error'
            // event, and the write stream would never close.
            download.on('error', function(){
                console.log('failed');
                settle(false);
                writeStream.end();
            });

            download.pipe(writeStream);

        }, function(err){

            if(err) return done(err);

            if(product.imageFiles && product.imageFiles.length > 0){
                return product.save(done);
            }

            return done();

        });
    };

    /**
     * Creates a Match for the given product pair, downloads the images of
     * both products and saves the match.
     *
     * The OpenCV comparison is currently disabled (see the commented-out
     * `cv` require at the top of this file), so `dissimilarity` keeps its
     * initial value of 9999.
     *
     * @param {Object} src - source product document
     * @param {Object} dst - destination product document
     * @param {Function} func - called as func(err, certainty); called with no
     *   arguments when either product is missing
     */
    matchSchema.statics.generateMatch = function(src, dst, func) {

        if(!src || !dst){
            return func();
        }

        var $this = new this();

        $this.externalIdSource = src.externalId;
        $this.externalIdDest = dst.externalId;
        $this.source = src._id;
        $this.dest = dst._id;
        $this.date = new Date();
        $this.dissimilarity = 9999;
        $this.sendMatchNotification = false;

        var srcImages = src.getImages;
        var dstImages = dst.getImages;

        $this.hash = hashomatic.hash({
            "1": src.externalId,
            "2": dst.externalId,
            "3": srcImages,
            "4": dstImages
        },true,true);

        // Decoded images; they stay empty while the OpenCV steps are disabled.
        var srcFiles = [], dstFiles = [];

        async.series([

            function(cb){
                downloadImages(src, srcImages, cb);
            },

            function(cb){
                downloadImages(dst, dstImages, cb);
            },

            function(cb){

                async.each(src.imageFiles, function(image, callback){

                    /*cv.readImage(image, function(err, img) {

                        srcFiles.push(img);
                        callback();

                    });*/

                    // The iterator must still report back, otherwise the
                    // series hangs as soon as there is one image file.
                    return callback();

                },cb);

            },

            function(cb){

                async.each(dst.imageFiles, function(image, callback){

                    /*cv.readImage(image, function(err, img) {

                        dstFiles.push(img);
                        callback();

                    });*/

                    return callback();

                },cb);

            },

            function(cb){

                async.each(srcFiles, function(srcImage, callback){

                    async.each(dstFiles, function(dstImage, callback2){

                        /* cv.ImageSimilarity(srcImage, dstImage, function (err, dissimilarity) {
                            if (err) throw err;

                            if(dissimilarity < $this.dissimilarity) {
                                $this.dissimilarity = dissimilarity;
                            }

                            return callback2();
                        });*/

                        return callback2();

                    }, callback);

                }, function (err) {

                    if(err) return cb(err);

                    return $this.save(cb);

                });

            }

        ], function(err){

            if(err) return func(err);

            // Parenthesised: without the parentheses the string
            // concatenation was evaluated before the comparison.
            // NOTE(review): `certainty` is not a schema field and is never
            // assigned in this file, so it looks like it is always undefined
            // here - confirm what callers expect.
            console.log('Match ' + ($this.certainty > 80));

            return func(null, $this.certainty);

        });


    };

    return mongoose.model('Match', matchSchema);

}
265 |
266 | module.exports = new Match();
--------------------------------------------------------------------------------
/model/product.js:
--------------------------------------------------------------------------------
1 | var mongoose = require('mongoose'),
2 | hashomatic = require('hash-o-matic'),
3 | _ = require('underscore'),
4 | Schema = mongoose.Schema;
5 |
/**
 * Declares the product schema together with its sub-schemas (freight,
 * property, sku, seller) and the static builders that turn scraped or API
 * payloads into Product documents, then registers the model with mongoose.
 *
 * @returns {Function} the compiled `Product` mongoose model
 */
function Product(){

    var freightSchema = new Schema({
        origin: String, //sendGoodsCountryFullName
        originISO: String, //sendGoodsCountry
        domesticFreight: String, //domesticFreight
        totalFreight: String, //totalFreight
        currencyCode: String, //currency
        discountPercentage: Number, //discount
        totalDiscount: String, //saveMoney
        shippingPeriod: String, //time
        actualPrice: String, //price
        discountType: String, //discountType
        commitDay: String, //commitDay
        shipperCode: String, //serviceName
        shipperName: String //company
    },{ _id: false });

    var propertySchema = new Schema({
        key: String,
        value: String
    },{ _id: false });

    var skuSchema = new Schema({
        externalId: String, //skuProducts@key
        isActivity: Boolean, //isActiviy
        price: String, //skuPrice
        inventoryCount: Number, //count
        images: [String], //skuPropertyImageSummPath
        properties: [propertySchema],
        /*type: String, //skuPropertyName
        value: String, //propertyValueName*/
        hash: String //hash
    },{ _id: false });

    var sellerSchema = new Schema({
        availability: String,
        isExpedited: Boolean,
        price: String,
        location: String,
        sellerStoreUrl: String,
        sellerRating: String,
        shippingPrice: String
    },{ _id: false });

    var productSchema = new Schema({
        externalId: String, //productId
        title: String, //title
        source: String, // AE
        images: [String], //imageUrls
        matrices: [String], //opencv matrices
        added: Date, // new Date()
        url: String, // generate
        items: [skuSchema],
        freight: [freightSchema],
        unit: String, //displayUnit
        price: String, //displayPrice[1]
        currencyCode: String, //displayPrice[0]
        isFreeShipping: Boolean, //isFreeShipping
        isItemOffline: Boolean, //isItemOffline
        targetCountry: String, //userCountryName
        targetCountryCode: String, //userCountryCode
        hash: String, //hash,
        newNotificationDate: Date,
        sendUpdatedNotification: Boolean,
        updatedNotificationDate: Date,
        previousProduct: Schema.Types.Mixed,
        imageFiles:[String],
        imageData: [String],
        inventoryCount: Number,

        sellers:[sellerSchema],
        offerUrl: String,
        isScraped: Boolean,
        relatedLinks: [String],
        keywords: [String],
        sellerCount: Number,
        salesRank: Number,
        group: String,
        category: String,
        handlingDays: Number,
        isFba: Boolean,
        dump: String,

        reviews: Number,
        brand: String,
        properties: [propertySchema]

    });

    /**
     * Virtual `getImages`: the product-level image URLs followed by every
     * non-empty image URL of its SKU items.
     *
     * A new array is returned so that collecting the SKU images never
     * appends to the stored `images` field.
     */
    productSchema.virtual('getImages').get(function(){

        var images = [];
        var own = this.images;
        var x, y;

        if( Object.prototype.toString.call( own ) === '[object Array]' ) {
            for(x = 0; x < own.length; x++){
                images.push(own[x]);
            }
        }

        var items = this.items || [];

        for(x = 0; x < items.length; x++){

            var skuImages = items[x].images || [];

            // `length` was misspelled here before, so this loop never ran
            for(y = 0; y < skuImages.length; y++) {

                var image = skuImages[y];

                if(image && image.length > 0){

                    images.push(image);

                }

            }
        }

        return images;

    });

    /*
     Sample payload handled by buildAmazonProduct:

     uri: 'http://www.amazon.com/gp/product/B00Z9MOZ14/ref=s9_ri_bw_g23_i20/183-5518956-5356440?pf_rd_m=ATVPDKIKX0DER&pf_rd_s=merchandised-search-6&pf_rd_r=09797QFH4GCN7EQ8X6B9&pf_rd_t=101&pf_rd_p=2130231282&pf_rd_i=1055398',
     title: 'Peakeep Digital Alarm Clock Battery Operated with Dual Alarms and Snooze Function - Travel Alarm Clock and Home Alarm Clock - Optional Weekday Alarm Mode and Sensor Light',
     reviewCount: '19 customer reviews',
     brand: 'Peakeep',
     priceBlock: '$15.99',
     properties:
     [ { key: 'Brand Name', value: 'Peakeep' },
     { key: 'Model Number', value: 'MHP3112B' },
     { key: 'Product Dimensions',
     value: '5.2 x 1.8 x 3.1 inches ; 7 ounces' },
     { key: 'Shipping Weight',
     value: '7.2 ounces (View shipping rates and policies)' },
     { key: 'ASIN', value: 'B00Z9MOZ14' },
     { key: 'Item model number', value: 'MHP3112B' },
     { key: 'Date first available at Amazon.com',
     value: 'June 9, 2015' } ],
     images:
     [ 'http://ecx.images-amazon.com/images/I/51xAF2EnacL.jpg',
     'http://ecx.images-amazon.com/images/I/51CGEEwepEL.jpg',
     'http://ecx.images-amazon.com/images/I/51K317w6HKL.jpg',
     'http://ecx.images-amazon.com/images/I/41ad3l-jYQL.jpg',
     'http://ecx.images-amazon.com/images/I/41VTK2%2BWVSL.jpg',
     'http://ecx.images-amazon.com/images/I/41xKv1k6xnL.jpg' ] }
     */

    /**
     * Builds an unsaved Product from a scraped Amazon product page (see the
     * sample payload above).
     *
     * @param {Object} obj - scraped page data; nothing is built when falsy
     * @param {Function} cb - cb(err, product); err is a string when no
     *   product id can be parsed from the url
     */
    productSchema.statics.buildAmazonProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var $this = new this();
        var x;

        $this.url = obj.uri;
        $this.title = obj.title;
        $this.brand = obj.brand;

        if(typeof obj.priceBlock === 'string'){
            $this.price = obj.priceBlock.replace('$','');
        }

        if(obj.reviewCount){
            // Accept plain counts ('19 customer reviews') as well as counts
            // with thousands separators ('1,234 customer reviews').
            var test = String(obj.reviewCount).match(/\d[\d,]*/);
            if(test){
                $this.reviews = Number(test[0].replace(/,/g, ''));
            }
        }

        var images = obj.images || [];
        for(x = 0; x < images.length; x++){

            $this.images.push(images[x]);

        }

        var properties = obj.properties || [];
        for(x = 0; x < properties.length; x++){

            $this.properties.push(properties[x]);

        }

        var id = ($this.url || '').match(/\/product\/\w+/);

        if(!id){
            return cb('Unable to parse ID from url ' + $this.url);
        }

        $this.externalId = id[0].replace('/product/', '').replace(/\//gmi, '');
        $this.source = 'AMZN';
        // any url containing ".com" is tagged US, everything else CA
        $this.source += ($this.url.match(/\.com/)) ? '-US' : '-CA';
        $this.added = new Date();
        $this.sendUpdatedNotification = false;

        // the hash covers the price only
        $this.hash = hashomatic.hash({
            "price": $this.price
        }, true, true);

        return cb(null, $this);
    };

    /**
     * Builds an unsaved Product from an AliExpress product payload,
     * including its SKU items and freight options.
     *
     * @param {Object} obj - AliExpress payload; nothing is built when falsy
     * @param {Function} cb - cb(err, product)
     */
    productSchema.statics.buildAliExpressProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var $this = new this();
        var x, y, z;

        $this.externalId = obj.productId;
        $this.title = obj.title;
        $this.source = 'AE';
        $this.images = obj.imageUrls;
        $this.added = new Date();
        $this.url = 'http://www.aliexpress.com/item/-/'+obj.productId+'.html';
        $this.unit = obj.displayUnit;
        $this.isFreeShipping = obj.isFreeShipping;
        $this.isItemOffline = obj.isItemOffline;
        $this.targetCountry = obj.userCountryName;
        $this.targetCountryCode = obj.userCountryCode;
        $this.sendUpdatedNotification = false;

        // displayPrice is split as "<currency> <amount>"
        var split = (obj.displayPrice || '').split(" ");
        if(split.length > 1) {
            $this.price = split[1].replace('$','');
            $this.currencyCode = split[0];
        }

        // the hash covers the displayed price only
        $this.hash = hashomatic.hash({
            "displayPrice": obj.displayPrice
        }, true, true);

        var skuProducts = obj.skuProducts || [];
        var skuPropertyList = obj.skuPropertyList || [];

        for(x = 0; x < skuProducts.length; x++){

            var skuItem = skuProducts[x];
            var sku = skuItem.skuPropIds;
            // the sku detail is stored on the item under its own id string
            var detail = skuItem[sku];

            if(!detail) continue;

            // for composite ids only the first property id is used
            if(sku.match(/,/)) sku = sku.split(",")[0];

            var meat = {
                externalId: sku,
                isActivity: detail.isActivity,
                price: detail.skuPrice,
                inventoryCount: detail.count,
                images: [],
                properties:[]
            };

            for(y = 0; y < skuPropertyList.length; y++){
                var property = skuPropertyList[y];
                var values = property.skuPropertyValues || [];

                for(z = 0; z < values.length; z++){
                    var value = values[z];

                    // loose comparison kept on purpose: the id may arrive as
                    // a number while sku is a string
                    if(value.propertyValueId == sku) {

                        if(value.skuPropertyImageSummPath){
                            meat.images.push(value.skuPropertyImageSummPath);
                        }
                        meat.properties.push({
                            key: property.skuPropertyName,
                            value: value.propertyValueName
                        });
                        break;
                    }
                }
            }

            $this.items.push(meat);

        }

        var freightItems = obj.freightItems || [];

        for(x = 0; x < freightItems.length; x++) {

            var freight = freightItems[x];

            $this.freight.push({

                origin: freight.sendGoodsCountryFullName,
                originISO: freight.sendGoodsCountry,
                domesticFreight: freight.domesticFreight,
                totalFreight: freight.totalFreight,
                currencyCode: freight.currency,
                discountPercentage: freight.discount,
                totalDiscount: freight.saveMoney,
                shippingPeriod: freight.time,
                actualPrice: freight.price,
                discountType: freight.discountType,
                commitDay: freight.commitDay,
                shipperCode: freight.serviceName,
                shipperName: freight.company

            });

        }

        return cb(null, $this);
    };

    /**
     * Builds an unsaved Product from one item of an Amazon Product
     * Advertising API response. Every value of the payload is read as the
     * first element of an array.
     *
     * @param {Object} obj - API item; nothing is built when falsy
     * @param {Function} cb - cb(err, product)
     */
    productSchema.statics.buildAmazonApiProduct = function (obj, cb) {

        if(!obj){
            return cb();
        }

        var $this = new this();

        // keep the raw payload for later inspection
        $this.dump = JSON.stringify(obj,null,0);

        $this.source = 'AMZN';
        $this.isScraped = false;
        //$this.source += ($this.url.match(/\.com/)) ? '-US' : '-CA';
        $this.added = new Date();
        $this.sendUpdatedNotification = false;
        $this.externalId = obj.ASIN[0];
        $this.url = obj.DetailPageURL[0];

        if(obj.ImageSets && obj.ImageSets.length >0) {

            _.each(obj.ImageSets[0].ImageSet, function (imageset) {

                $this.images.push(imageset.MediumImage[0].URL[0]);

            });

        }

        if(obj.SimilarProducts && obj.SimilarProducts.length > 0) {
            var similar = obj.SimilarProducts[0].SimilarProduct;

            _.each(similar, function (compare) {

                $this.keywords.push(compare.Title[0]);
                $this.relatedLinks.push('http://www.amazon.com/gp/product/'+compare.ASIN[0]);

            });
        }
        var attr = obj.ItemAttributes[0];

        $this.group = attr.ProductGroup[0];

        $this.category = (attr.Binding || attr.ProductGroup)[0];
        $this.brand = (attr.Brand || ['UNKNOWN'])[0];

        $this.properties.push({key: 'dump', value: JSON.stringify(attr)});

        if(attr.Color && attr.Color.length > 0)
            $this.properties.push({key:'color', value: attr.Color[0]});

        if(attr.EAN && attr.EAN.length > 0)
            $this.properties.push({key:'EAN', value: attr.EAN[0]});

        $this.title = attr.Title[0];

        _.each(attr.Feature, function(feature){

            var text = String(feature);
            var colon = text.indexOf(':');

            if(colon >= 0){

                // "key: value" features become properties; split at the
                // first colon only so values that contain ':' stay intact
                $this.properties.push({key: text.slice(0, colon), value: text.slice(colon + 1)});

            } else {
                $this.keywords.push(feature);
            }

        });
        if(attr.Label && attr.Label.length > 0) $this.keywords.push(attr.Label[0]);
        if(attr.Brand && attr.Brand.length > 0) $this.keywords.push(attr.Brand[0]);
        if(attr.Manufacturer && attr.Manufacturer.length > 0) $this.keywords.push(attr.Manufacturer[0]);
        $this.keywords.push(attr.Title[0]);

        if(obj.SalesRank && obj.SalesRank.length > 0)
            $this.salesRank = Number(obj.SalesRank[0]);

        var offersummary = obj.OfferSummary[0];

        if(offersummary.LowestNewPrice && offersummary.LowestNewPrice.length > 0) {
            // NOTE(review): Amount is divided by 100, presumably because it
            // is expressed in cents - confirm against the API response.
            $this.price = offersummary.LowestNewPrice[0].Amount[0] / 100;
            $this.currencyCode = offersummary.LowestNewPrice[0].CurrencyCode[0];
        }

        $this.sellerCount = offersummary.TotalNew[0];
        // NOTE(review): hasReviews (and merchant below) are not declared in
        // productSchema - confirm whether they are meant to be persisted.
        $this.hasReviews = obj.CustomerReviews[0].HasReviews[0] == "true";

        var offers = obj.Offers[0];

        if(offers.Offer && offers.Offer.length > 0) {

            var offer = offers.Offer[0].OfferListing[0];

            $this.merchant = offers.Offer[0].Merchant[0].Name[0];

            var moreOffers = offers.MoreOffersUrl && offers.MoreOffersUrl[0];
            var offerUrl = (typeof moreOffers === 'string')
                ? moreOffers.match(/http:\/\/[a-z0-9_-].+?%/i)
                : null;

            if (offerUrl) {
                // keep the url up to (not including) the first '%'
                $this.offerUrl = offerUrl[0].replace('%', '');
            }

            $this.handlingDays = Number(offer.AvailabilityAttributes[0].MaximumHours) / 24;

            $this.isFba = offer.IsEligibleForPrime[0] == 1 || offer.IsEligibleForSuperSaverShipping[0] == 1;
        }

        // the hash covers price, FBA eligibility and seller count
        $this.hash = hashomatic.hash({
            "price": $this.price,
            "isFba": $this.isFba,
            "sellerCount": $this.sellerCount
        }, true, true);

        return cb(null, $this);
    };


    return mongoose.model('Product', productSchema);

}
421 |
422 | module.exports = new Product();
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "geek-scrape",
3 | "version": "0.0.1",
4 | "description": "",
5 | "main": "lib/index.js",
6 | "directories": {
7 | "test": "test"
8 | },
9 | "scripts": {
10 | "test": "mocha test/*.js"
11 | },
12 | "repository": {
13 | "type": "git",
14 | "url": "git://bitbucket.org/trent_millar/geek-scrape.git"
15 | },
16 | "keywords": [
17 | ""
18 | ],
19 | "readmeFilename": "README.md",
20 | "author": {
21 | "name": "Trent Millar"
22 | },
23 | "dependencies": {
24 | "apac": "^1.0.0",
25 | "async": "latest",
26 | "dotenv": "^1.2.0",
27 | "hash-o-matic": "^0.1.3",
28 | "huntsman": "^0.2.12",
29 | "mongoose": "^4.1.3",
30 | "nedb": "^1.1.3",
31 | "nodemailer": "^1.4.0",
32 | "opencv": "^3.2.0",
33 | "request": "^2.61.0",
34 | "underscore": "latest",
35 | "x-ray": "latest"
36 | },
37 | "devDependencies": {
38 | "mocha": "~1.18.2",
39 | "should": "~3.3.1"
40 | }
41 | }
42 |
--------------------------------------------------------------------------------