├── .gitignore ├── Ebay Capstone Progress Journal.rtf ├── README.md ├── _config.yml ├── bh_photo_scraper ├── bh_photo_scraper │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── items.pyc │ ├── middlewares.py │ ├── pipelines.py │ ├── pipelines.pyc │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── digital_camera_spider.py │ │ └── digital_camera_spider.pyc └── scrapy.cfg ├── capstone-technical-report └── images │ ├── buyers_guide_example.png │ ├── classification_case_study.png │ ├── completed_items_v2.png │ ├── csm_start_price.png │ ├── cyber_shot.png │ ├── example_dataframe.png │ └── gently_used.png ├── data-analysis ├── CSM-start-price-exploration.ipynb ├── auctions-modeling-2.ipynb ├── auctions-modeling-classification.ipynb ├── auctions-modeling-regression.ipynb ├── auctions-modeling.ipynb ├── data-cleaning │ ├── extract-features-bh-photo.ipynb │ └── extract-features-from-ebay-category-specifics.ipynb ├── ebay-exploration.ipynb ├── feature-engineer-time-of-day.ipynb ├── feature-engineering-concurrent-similar-median-start-price │ └── feature-engineer-concurrent-median-start-price.ipynb ├── find-model-prices.ipynb └── utilities │ ├── clean_text.py │ └── plot_learning_curve.py ├── ebay-api-scraper ├── .ipynb_checkpoints │ ├── datetime test-checkpoint.ipynb │ ├── ebay-exploration-checkpoint.ipynb │ ├── ebay-scraper-checkpoint.ipynb │ └── scrapy-development-checkpoint.ipynb ├── common.py ├── common.pyc ├── create-ebay-tables.py ├── datetime test.ipynb ├── ebay-scraper.ipynb ├── ebay.yaml ├── ebay_scraper │ ├── ebay_scraper │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── ebay_spider.cpython-35.pyc │ │ │ ├── ebay_spider.py │ │ │ ├── ebay_spider.pyc │ │ │ └── ebay_spider_v2.py │ ├── ebay_spider_log.log │ └── scrapy.cfg ├── find-completed-listing.py ├── finding.py ├── parallel-requests.py ├── scrapy-development.ipynb ├── trading.py └── update-ebay-table.py ├── index.md ├── mongo-test └── mongo-test.py └── postgresql-test ├── postgres cheat sheet └── postgresql-test.py /.gitignore: -------------------------------------------------------------------------------- 1 | ebay-api-scraper/ebay_scraper/ebay_scraper/ebay_spider_log.log 2 | 3 | data-analysis/pickles 4 | data-analysis/feature-engineering-concurrent-similar-median-start-price/pickles 5 | data-analysis/pickles/auctions.p 6 | data-analysis/.ipynb_checkpoints/CSM-start-price-exploration-checkpoint.ipynb 7 | data-analysis/.ipynb_checkpoints/auctions-modeling-checkpoint.ipynb 8 | data-analysis/.ipynb_checkpoints/feature-engineer-concurrent-median-start-price-checkpoint.ipynb 9 | data-analysis/.ipynb_checkpoints/ipython-parallel-tutorial-checkpoint.ipynb 10 | data-analysis/feature-engineering-concurrent-similar-median-start-price/.ipynb_checkpoints/ 11 | data-analysis/feature-engineering-concurrent-similar-median-start-price/pickles/ 12 | data-analysis/pickles/df_classification_count_vec.p 13 | data-analysis/pickles/df_classification_tfidf.p 14 | data-analysis/pickles/df_regression_tfidf.p 15 | data-analysis/pickles/df_regression_tfidf_all.p 16 | data-analysis/utilities/clean_text.pyc 17 | ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/items.cpython-35.pyc 18 | 
ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider_v2.pyc 19 | -------------------------------------------------------------------------------- /Ebay Capstone Progress Journal.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fmodern\fcharset0 Courier;\f2\froman\fcharset0 Palatino-Roman; 3 | \f3\fmodern\fcharset0 Courier-Bold;\f4\fswiss\fcharset0 ArialMT;\f5\fnil\fcharset0 HelveticaNeue; 4 | } 5 | {\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red85\green142\blue40;\red0\green0\blue0; 6 | \red255\green83\blue8;\red133\green0\blue175;\red174\green0\blue240;\red255\green255\blue255;\red255\green39\blue18; 7 | \red63\green105\blue30;\red255\green255\blue51;\red179\green179\blue179;\red128\green128\blue128;\red255\green250\blue131; 8 | \red38\green38\blue38;\red255\green255\blue255;\red194\green229\blue166;\red192\green237\blue254;\red255\green252\blue171; 9 | \red255\green164\blue159;\red254\green187\blue100;\red194\green229\blue166;\red0\green0\blue0;\red255\green255\blue255; 10 | } 11 | {\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\csgenericrgb\c33333\c55686\c15686;\csgenericrgb\c0\c0\c0; 12 | \csgenericrgb\c100000\c32549\c3137;\csgenericrgb\c52157\c0\c68627;\csgenericrgb\c68235\c0\c94118;\csgenericrgb\c100000\c100000\c100000;\csgenericrgb\c100000\c15294\c7059; 13 | \csgenericrgb\c24706\c41176\c11765;\csgenericrgb\c100000\c100000\c20000;\csgray\c75407;\csgray\c57415;\csgenericrgb\c100000\c98039\c51373; 14 | \cssrgb\c20000\c20000\c20000;\cssrgb\c100000\c100000\c100000;\csgenericrgb\c76078\c89804\c65098;\csgenericrgb\c75294\c92941\c99608;\csgenericrgb\c100000\c98824\c67059; 15 | \csgenericrgb\c100000\c64314\c62353;\csgenericrgb\c99608\c73333\c39216;\csgenericrgb\c76078\c89804\c65098;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000; 16 | } 17 | \margl1440\margr1440\vieww19000\viewh9060\viewkind0 18 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 19 | 20 | \f0\fs24 \cf0 eBay Capstone Work Journal:\ 21 | I chose cameras because:\ 22 | \pard\pardeftab720\partightenfactor0 23 | \cf2 \expnd0\expndtw0\kerning0 24 | a) they can be evaluated by using well-structured data that lends itself well to machine learning techniques\ 25 | b) they are representable as a set of easily quantified parameters\ 26 | c) they represent a large market of used items\ 27 | d) their prices are predictable and relatively stable for short-term horizons. \ 28 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 29 | \cf0 \kerning1\expnd0\expndtw0 \ 30 | \ 31 | 3/21 - 3/27:\ 32 | - Experimented with MongoDB, didn\'92t like fact that data was stored in JSON, and the fact that some rows can have different columns.\ 33 | - Chose postgresql due to familiarity with SQL, flexible types, \ 34 | - setup postresql ebay database with completed_items table under user: nathan\ 35 | - Set up ebay API to get completed items and store data into postgres database. \ 36 | - set up multi-processed web scraper with scrapy and multiprocessing.Pool to update \'93condition\'94 fields in ebay table from scraped item condition data. Sometimes the item doesn\'92t have a condition, in which case, an empty string is put into the field. 
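A minimal sketch of that condition-update step, assuming psycopg2 plus requests/lxml rather than the actual Scrapy project; the table name, column names, and XPath here are illustrative only:

from multiprocessing import Pool

import psycopg2
import requests
from lxml import html

def scrape_condition(row):
    # fetch one listing page and pull the condition text; fall back to '' when missing
    item_id, url = row
    try:
        tree = html.fromstring(requests.get(url, timeout=10).content)
        condition = tree.xpath('//div[@itemprop="itemCondition"]//text()')
        return item_id, condition[0].strip() if condition else ''
    except requests.RequestException:
        return item_id, ''

if __name__ == '__main__':
    conn = psycopg2.connect(dbname='ebay', user='nathan')
    cur = conn.cursor()
    cur.execute('SELECT "itemId", "viewItemURL" FROM completed_items;')
    rows = cur.fetchall()

    pool = Pool(8)  # workers only do HTTP; the main process writes to Postgres
    for item_id, condition in pool.imap_unordered(scrape_condition, rows):
        cur.execute('UPDATE completed_items SET condition = %s WHERE "itemId" = %s;',
                    (condition, item_id))
    pool.close()
    pool.join()
    conn.commit()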
\ 37 | \ 38 | 3/28:\ 39 | Goal:\ 40 | in \ 41 | /Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_scraper \ 42 | rub scrapy\ 43 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 44 | 45 | \f1 \cf3 $ scrapy crawl ebay_crawl_spider -a url_start_index=92224\ 46 | \ 47 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 48 | \cf4 enter postgres database \cf3 \ 49 | $ psql - U nathan ebay 50 | \f0 \ 51 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 52 | \cf0 \ 53 | - Setup up Scrapy spider. \ 54 | - Following tutorial, when I ran scrapy crawl ebay, I got TypeError: \'91float\'92 object is not itterable. \ 55 | - Solved problem by updating scrapy using conda. Only problem is that I also updated scrapy using conda system-wide, which scrapy documentation told me not to do (I updated before reading that). \ 56 | - Creating pipeline.\ 57 | - created scrapy->postgres pipeline class for storing condition data into the ebay table \ 58 | - added pipeline to settings.py\ 59 | - Need to read itemId,URL from ebay table (x)\ 60 | - Need to set options for throttling (x)\ 61 | - Set postgres config in settings.py (x)\ 62 | - Test spider with hardcoded ebay urls\ 63 | - Worked but it\'92s slow (x)\ 64 | - Speed up scrapy crawl ebay (X)\ 65 | - Create log of errors from scraps (X)\ 66 | \ 67 | 3/29:\ 68 | - Pull postgres data into pandas data frame using sqlalchemy (x)\ 69 | - Deleted duplicates in completed_items table (x)\ 70 | 71 | \f1 \cf3 DELETE FROM tablename\ 72 | WHERE id IN \ 73 | (SELECT id FROM (SELECT id, ROW_NUMBER() OVER (partition BY column1, column2, column3 ORDER BY id) AS rnum FROM tablename) \ 74 | as t WHERE t.rnum > 1);\ 75 | \ 76 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 77 | \cf4 or specifically for deleting duplicate itemId :\cf3 \ 78 | \ 79 | DELETE FROM completed_items_v2 WHERE id IN (SELECT id FROM (SELECT ci."id", ROW_NUMBER() OVER (partition BY ci."itemId" ORDER BY ci."id") AS rnum FROM completed_items_v2 as ci) as t WHERE t.rnum > 1);\cf5 \ 80 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 81 | 82 | \f0 \cf0 \ 83 | \ 84 | - Light cleaning (x)\ 85 | - Light EDA (x)\ 86 | - Discovered that I had a typo in my find-completed-listings.py, which was causing the \'93categoryId\'94 search in my filter to not work.\ 87 | Therefore I was getting completed items from every ebay category (only filter I was using was the keyword \'91camera\'92). \ 88 | - apiy_request dictionary had \'91CategoryId\'94 which should have been \'93categoryId\'94. \ 89 | - Not sure what to do now. I can change the categoryId to the right category and then start my api requests from the max But then I will get a lot of duplicate items. \ 90 | - I think the best strategy is to just copy my old postgres table into a new table so I don\'92t lose my searches thus far. \ 91 | - Then filter that table by the categoryId in SQL. \ 92 | - Then begin filling that table with new data starting at a low price, using no keywords. 
\ 93 | \ 94 | - Vectorize title with CountVectorizer (x) \ 95 | - Use NLP features + model to predict sold status (x)\ 96 | - Baseline accuracy = 0.897\ 97 | - First pass model (random forest) accuracy = 0.913\ 98 | \ 99 | 3/30:\ 100 | - Create new table\ 101 | - Copy completed_items (x)\ 102 | - Filter: (x)\ 103 | - categoryId = 15230 # film cameras\ 104 | - categoryId = 31388 # digital cameras \ 105 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 106 | 107 | \f1 \cf0 \cf3 DELETE FROM completed_items_15230_31388 as ci WHERE ci."primaryCategory.categoryId"!=31388 AND ci."primaryCategory.categoryId"!=15230; 108 | \f0 \cf0 \ 109 | - OK that didn\'92t work. When we copy completed_items, a serial (id_seq table) was not created. Therefore, in our completed_items_15230_31388 table, the id column was not a serial, and was not being automatically generated when we inserted values. \ 110 | - I copied old table to new table. Then I created a new serial primary key column. Then I deleted the old id column. Then I deleted duplicate rows (rows with duplicate itemId values). \ 111 | \ 112 | \ 113 | - using categoryId = 15230, 31388 and no keywords, query starting at minPrice = 20, up until 2500 (x)\ 114 | - For some reason, I was able to query as much data using the ebay API with only one dev API key. Not sure why. \ 115 | \ 116 | \ 117 | Goals for 3/31 and weekend:\ 118 | - Use scrapy to do recursive scrap and get starting bid price.\ 119 | - Continue NLP exploration and modeling with different features and techniques.\ 120 | \ 121 | 3/31:\ 122 | - What happens to values returned or yielded from parse() and parse_start_prce() in spider?\ 123 | - Where to instantiate Item()? \ 124 | - How to update item class instance in spider within different callback functions? \ 125 | \ 126 | 4/1\ 127 | - Got nested scrape working, but now ebay is requesting captcha every time I want to scrape. \ 128 | - Using cactusVPN, I was able to start scraping again. \ 129 | - VPN disconnects if I send too many requests too fast. Using these settings, it seems to work:\ 130 | 131 | \f1 \cf6 AUTOTHROTTLE_START_DELAY = 0.5\ 132 | AUTOTHROTTLE_MAX_DELAY = 2\ 133 | AUTOTHROTTLE_TARGET_CONCURRENCY = 2\ 134 | CONCURRENT_REQUESTS = 2\ 135 | DOWNLOAD_DELAY = 0.9 136 | \f0 \cf0 \ 137 | - So I\'92m collecting itemId, conditiondescription, duration, and startPrice now. \ 138 | \ 139 | \ 140 | I was thinking about what factors we\'92re going to consider in the model. I think that, along with completed listings, a big factor would be the other listings that are active at the time the model does it\'92s calculation for a specific item. \ 141 | \ 142 | The reason we want to see which items were listed concurrently is because (I would think) you would want to weight the factors from the concurrent listings more than the previously completed listings. Like if you are selling a Nikon ec380, and there is already a Nikon ec380 listed with a startPrice of $300, and all factors being equal, you probably don\'92t want to list it for much more than $300. Even if historically those cameras sell with startPrice of $400, you probably don\'92t want to list it for $400. Therefore I want to weight the concurrent listings factors more than the completed listings. \ 143 | \ 144 | Is it possible to do something like this? Is it even smart to do something like this? 
\ 145 | \ 146 | Also, I don\'92t see anyone doing this in the white papers that have been released previously for ebay listings end price predictions. Could be a novel method. \ 147 | \ 148 | \ 149 | (You could figure out which items are concurrent by looking at the startTime and endTime columns. You could then create a new column called concurrentGroupNumber, that would be generated by incrementing through the first startTime to the last startTime in equally spaced chunks. In each chunk, you look at all the listings that are active in that chunk, and for each of those listings, assign the same groupNumber to them. That allows you to see concurrent listings.)\ 150 | \ 151 | Goals for 4/1 and 4/2:\ 152 | - Scrape more data. Current number of rows in 153 | \f1 \cf7 completed_items_15230_31388 is 46225. 154 | \f0 \cf0 \ 155 | - Collect completed_items auctions for starting at 3/30 at 11:10:00, until 4/2 12:00:00 (x)\ 156 | - collect buy it now auctions of prices from 20 to 2500 and store in table completed_items_15230_31388 (x) \ 157 | - 158 | \f1 \cf7 \{'name': 'ListingType', 'value':'AuctionWithBIN'\}, 159 | \f0 \cf0 \ 160 | - Collect fixedPrice auctions of prices from 20 to 2500 and store in table completed_items_15230_31388 \ 161 | - Don\'92t know if \'91FixedPrice\'92 listings were sold or if they just ended. \ 162 | - Okay this is a big problem. It seems like \'91AuctionWithBIN\'92 is like a deprecated feature because if you go on ebay.com now, you can only find auctions OR BuyItNow listings, but there don\'92t seem to be listings that are AuctionWithBIN. \ 163 | - So I might have to just scrape data for the Buy It Now listings. \ 164 | - Although the only problem I\'92m seeing here is that you can\'92t find Buy It Now listings from stores\'85 which is a problem because stores make up a lot of the business.\ 165 | \ 166 | - I experimented with changing the code of connections in:\ 167 | 168 | \f1 /Users/Naekid/anaconda3/envs/dsi/lib/python2.7/site-packages/ebaysdk/connection.py 169 | \f0 \ 170 | and then running the samples in:\ 171 | 172 | \f1 /Users/Naekid/anaconda3/envs/dsi/lib/python2.7/site-packages/ebaysdk-2.1.4-py2.7.egg/samples\ 173 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 174 | 175 | \f0 \cf0 For some reason, the samples in .egg use the code in the other ebaysdk/ directory. Who knows. \ 176 | \ 177 | I created a GitHub ticket on the python SDK page. I also created a ebay developers forum post. \ 178 | Here\'92s a sample URI HTTP GET with the correct sellingState:\ 179 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 180 | {\field{\*\fldinst{HYPERLINK "http://svcs.ebay.com/services/search/FindingService/v1?OPERATION-NAME=findCompletedItems&SERVICE-VERSION=1.7.0&SECURITY-APPNAME=nathanzo-ebaypric-PRD-cbed4d450-05d217d8&RESPONSE-DATA-FORMAT=XML&keywords=222461424089"}}{\fldrslt \cf0 http://svcs.ebay.com/services/search/FindingService/v1?OPERATION-NAME=findCompletedItems&SERVICE-VERSION=1.7.0&SECURITY-APPNAME=nathanzo-ebaypric-PRD-cbed4d450-05d217d8&RESPONSE-DATA-FORMAT=XML&keywords=222461424089}}\ 181 | s\ 182 | I\'92ve decided that the best course of action is to move on to sending my own HTTP GET requests to ebay using the structure outlined in their API documentation. \ 183 | \ 184 | 4/3:\ 185 | - Discovered bug that was causing incorrect sellingState to be returned. 
The problem was that the the 186 | \f1 X-EBAY-SOA-SERVICE-VERSION 187 | \f0 Header field in the ebaySDK was set to 1.0.0. When I hardcoded it to 1.13.0, that fixed the problem. Also, you should be able to change the version in the ebay.yaml file, but that didn\'92t work for me.\ 188 | \ 189 | - Wrote my own wrapper for Ebay API to send findCompletedItems requests to specific itemId, retreive sellingState, and update SQL table. \ 190 | \ 191 | - scrape ebay for condition information of FixedPrice items. \ 192 | \ 193 | - my-ebay-api-port/development.ipynb is for using HTTP GET ebay API to update sellingState for items that are already in my ebay database. \ 194 | - scrapy is used to get condition description for all items, and startPrice for items that had > 0 bid_count.\ 195 | - scrapy is not getting startPrice successfully. \ 196 | \ 197 | \ 198 | 4/4:\ 199 | - I\'92m thinking of starting a new database from scratch since the old one is super messy. \ 200 | - First objective is to get the startPrice.\ 201 | - Once I have the startPrice, then I can use find-completed-items on \'91Auction\'92,\'92AuctionWithBIN\'92,\'92FixedPrice\'92,\'92StoreInventory\'92\ 202 | \ 203 | - Making new spider: ebay_spider_v2\ 204 | - All I need to scrape is the conditionDesription, startPrice \ 205 | - Turns out the two different bid history pages are due to 2 different version of the page. For example:\ 206 | {\field{\*\fldinst{HYPERLINK "http://www.ebay.com/bfl/viewbids/291989472205?item=291989472205&rt=nc&_trksid=p2047675.l2565"}}{\fldrslt http://www.ebay.com/bfl/viewbids/291989472205?item=291989472205&rt=nc&_trksid=p2047675.l2565}}\ 207 | {\field{\*\fldinst{HYPERLINK "http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=291989472205&rt=nc&_trksid=p2047675.l2565"}}{\fldrslt http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=291989472205&rt=nc&_trksid=p2047675.l2565}}\ 208 | Slightly different - I don\'92t know how I\'92m getting these two. But it\'92s not a big deal. We can just grab the entire HTML of either page, and use this code to get the startPrice:\ 209 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 210 | 211 | \f1 \cf3 bid_history_items = response.xpath("//span/text()").extract()\ 212 | if bid_history_items:\ 213 | for i,text in enumerate(bid_history_items):\ 214 | if text == 'Starting Price':\ 215 | startPrice = bid_history_items[i+1]\ 216 | item['startPrice'] = float(startPrice.replace('$',''))\ 217 | \ 218 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 219 | 220 | \f0 \cf0 It seems to work. \ 221 | \ 222 | - I also found out that sometimes if a listing is an auction but ends with a BuyItNow, the findCompletedItems does not always show that it was a \'91AuctionWithBIN\'92 or that it ended with a BuyItNow. It\'92s also going to require two nested scrapes to get the start price. 
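A minimal sketch of how the two nested scrapes could hand a partially filled item between callbacks via request.meta; URLs, selectors, and field names are illustrative, not the actual ebay_spider_v2 code:

import scrapy

class BidHistorySpiderSketch(scrapy.Spider):
    name = 'bid_history_sketch'

    def start_requests(self):
        # in the real project the (itemId, URL) pairs come from the Postgres table
        for item_id, url in [('291989472205', 'http://www.ebay.com/itm/291989472205')]:
            yield scrapy.Request(url, callback=self.parse, meta={'itemId': item_id})

    def parse(self, response):
        # first scrape: the listing page itself
        item = {'itemId': response.meta['itemId']}
        item['conditionDescription'] = response.xpath(
            '//div[@itemprop="itemCondition"]//text()').extract_first(default='')
        # second, nested scrape: follow to the bid history page, carrying the item along
        bid_history_url = 'http://www.ebay.com/bfl/viewbids/%s' % item['itemId']
        yield scrapy.Request(bid_history_url, callback=self.parse_start_price,
                             meta={'item': item})

    def parse_start_price(self, response):
        item = response.meta['item']
        texts = response.xpath('//span/text()').extract()
        for i, text in enumerate(texts):
            if text == 'Starting Price':
                item['startPrice'] = float(texts[i + 1].replace('$', ''))
        yield item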
\ 223 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 224 | \cf8 \cb9 I will have to investigate this further, but looking into listings of type \'91AuctionWithBIN\'92\ 225 | \cf4 \cb1 And seeing if they are fucked up.\cf0 \ 226 | \ 227 | \ 228 | 4/5:\ 229 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 230 | 231 | \f2 \cf10 - I think that a unique contribution to this problem would be to utilize concurrent listings, as well as completed listings.\ 232 | 233 | \fs28 - 234 | \fs24 Calculate 235 | \i similar 236 | \i0 (via NLP), concurrent (defined below) listing mean/median starting price. This alone is a useful application of ML.\ 237 | - Calculate similar completed listing mean/median start price. \ 238 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 239 | 240 | \f0 \cf0 \ 241 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 242 | 243 | \f2 \cf11 \cb12 It might be a good idea to consider the price of similar listings 244 | \i at the time of posting 245 | \i0 a given listing, but that would require lots of extra scraping. \cf4 \cb1 \ 246 | \cf11 \cb13 So instead, we\'92ll just consider the 247 | \i start price 248 | \i0 of concurrent listings.\cf0 \cb1 \ 249 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 250 | 251 | \fs28 \cf10 \ 252 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 253 | 254 | \fs24 \cf0 \ 255 | FEATURE ENGINEERING for mean/median startPrice of Concurrent Listings, and mean/median of startPrice and endPrice of Completed Listings. \ 256 | \ 257 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 258 | 259 | \f1 \cf0 FIRST create 4 new columns in dataframe \ 260 | similarConcurrentListing.meanStartPrice\ 261 | similarConcurrentListing.medianStartPrice\ 262 | similarCompletedListing.meanStartPrice\ 263 | similarCompletedListing.medianStartPrice\ 264 | similarCompletedListing.meanEndPrice\ 265 | similarCompletedListing.meanEndPrice\ 266 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 267 | 268 | \f2 \cf0 \ 269 | We need to define concurrent listings. 270 | \f1 \ 271 | \ 272 | 273 | \f2 Filter for Concurrent and Completed Listings 274 | \f1 \ 275 | 1 Method - \ 276 | if listing2.starTime > listing1.startTime, then l2.startTime should be < H1 hours after l1.starTime \ 277 | If listing2.startTime < listing1.startTime, then l2.endTime should be > H2 hours after listing1.startTime\ 278 | We could just choose H1 and H2, but maybe EDA is a good way of finding this best time. 
We could also use H1 and H2 as hyper parameters.\ 279 | \ 280 | sort dataframe by startTime DESC \ 281 | for listing1 in listings:\ 282 | for listing2 in listing: # only consider concurrent and past listings (exclude future listings)\ 283 | if listing2.startTime < listing1.endTime and listing2.endTime > listing1.startTime:\ 284 | # listing1 and listing2 are concurrent listings\ 285 | add to dataframe with [\'91concurrent\'92]=1 \ 286 | \ 287 | elif listing2.endTime <= listing1.startTime:\ 288 | # listing2 is a past listing for listing1 \ 289 | add to data frame with [\'91concurrent\'92]=0\ 290 | \ 291 | \ 292 | 2 Method - \ 293 | Actually instead, let\'92s frame the problem so that we calculate the percentage of overlap between the time periods of l1 and l2. Define l2 to be a concurrent listing if it\'92s percentage overlap with l1 is > H, where H is a number between 0 and 1. H could be a hyper parameter to the modeling pipeline. \ 294 | \ 295 | sort dataframe by startTime DESC \ 296 | for listing1 in listings:\ 297 | for listing2 in listing: # only consider concurrent and past listings (exclude future listings)\ 298 | if listing2.startTime < listing1.endTime and listing2.endTime > listing1.startTime:\ 299 | # listing1 and listing2 are concurrent listings\ 300 | add to dataframe with [\'91concurrent\'92]=1 \ 301 | \ 302 | elif listing2.endTime <= listing1.startTime:\ 303 | # listing2 is a past listing for listing1 \ 304 | add to data frame with [\'91concurrent\'92]=0\ 305 | \ 306 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 307 | 308 | \f3\b \cf0 3 Method - THE BEST (most realistic) 309 | \f1\b0 \ 310 | sort dataframe by startTime DESC\ 311 | for listing1 in listings:\ 312 | create new dataframe for concurrent and completed listings (no future listings)\ 313 | add listing1 to new dataframe\ 314 | for listing2 in listings:\ 315 | if listing2.startTime < listing1.startTime and listing2.endTime > listing1.startTime:\ 316 | add listing2 to new dataframe with [\'91concurrent\'92]=1\ 317 | else:\ 318 | add listing2 to new dataframe with [\'91concurrent\'92]=0\ 319 | calculate median startPrice of listings with [\'91concurrent\'92]==1 320 | \f2 \ 321 | \ 322 | For EDA on that aspect, Vrushank suggested plotting percentage of concurrent time (x-axis) vs. number of listings within that time, although that wouldn\'92t tell you how much the hyper parameter affects the accuracy of the model. 323 | \f1 \ 324 | \ 325 | \ 326 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 327 | 328 | \f2 \cf0 VECTORIZE 329 | \f1 \ 330 | Vectorize new dataframe (BoW, TFIDF, shingles, sPacy) as vdf\ 331 | Calculate similarity (Jaccard, Cosine), between listing1 and all other listings in vdf\ 332 | Set a threshold for similairty so that only items OVER that threshold will be considered in the model, i.e. filter vdf so that we only are considering listings very similar to the original listing\ 333 | # Now you have a vectorized df of listings similar to listing1\ 334 | calculate mean/median of concurrent=1 listings and concurrent=0 listings, seperately\ 335 | You now have 2 new features for your listing, which you can add as new columns to the row for listing1 in the original dataframe\ 336 | \ 337 | \ 338 | \ 339 | 340 | \f2 Just had an idea. 
Essentially, instead of finding similar titles myself, I use ebay\'92s search to find similar titles using topics created by LDA to find the commonly searched keywords from the titles that I already have in my database. Then, for each LDA topic/keyword, you search bay\'92s current and completed listings. But then you still have to calculate similarity, because your alternative is to just take the top n searches from ebay\'92s results, and obviously some of those results will be very different to each other. But using ebay\'92s search results combined with my own similarity calculation might be more effective than JUST using my own similarity calculation. Let\'92s try using my own similarity calculation first because we will have to do that anyway. 341 | \f1 \ 342 | \ 343 | \ 344 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 345 | 346 | \f2 \cf4 \cb14 Problem: People can list items as auctions, and update those listings midway through to have BuyItNow available, but this won\'92t show up in a grab from the ebay API. \ 347 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 348 | \cf0 \cb1 From the documentation:\ 349 | \pard\pardeftab720\sl300\partightenfactor0 350 | 351 | \f4\i\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 352 | Auction\ 353 | Competitive-bid online auction format. Buyers engage in competitive bidding, 354 | \b although Buy It Now may be offered as long as no valid bids have been placed 355 | \b0 .\ 356 | 357 | \i0 \ 358 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 359 | 360 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 This is happening for certain listings, such as 332172404108. \ 361 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 362 | 363 | \b \cf0 It has a listingType of \'91Auction\'92, no \'91startPrice\'92, BuyItNowAvailable=\'91f\'92, and bidCount=1, then that final selling bid was a Buy It Now. 364 | \b0 \ 365 | This must have happened because the user updated their listing after it was already posted and online for some time. \ 366 | \ 367 | xpath for \'91see original listing\'92 for these pages:\ 368 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 369 | \cf10 //span[contains(@class, 'vi-inl-lnk')]//@href\ 370 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 371 | \cf0 \ 372 | There are two options here.\ 373 | 1. We could go and scrape for the start price, and then make two listings out of each of these listings: 1 would be an \'91Auction\'92 listing that did NOT sell, and had the original start price. And the other would be a \'91AuctionWithBIN\'92 that DID sell. \ 374 | 2. We just make the listing a \'91AuctionWithBIN\'92 that DID sell, with a start price equal to the sale price.\ 375 | \ 376 | Let\'92s see how many of these listings there are. if there are a lot, that gives more reason to do option 1, because if we don\'92t, we are losing out on significant information with the \'91Auction\'92 listings that did not sell. But it\'92s going to take a lot of rescraping to get those, and just adds a shit ton of complications.\ 377 | The number of listings (so far on 4/5) is 4,432. 
\ 378 | I mean it\'92s not a big deal to go with option 2, because we\'92re just kind of pretending those \'91Auction\'92 listings didn\'92t exist, which isn\'92t horrible.\ 379 | \ 380 | So to deal with these listings with option 2:\ 381 | # listingType of \'91Auction\'92, no \'91startPrice\'92, BuyItNowAvailable=\'91f\'92, and bidCount=1\ 382 | mask = (df['listingInfo.listingType']=='Auction')\\\ 383 | & (np.isnan(df['startprice'])) \\\ 384 | & (df['sellingStatus.bidCount']==1.0)\ 385 | dfm = df[mask]\ 386 | dfm[\'91listingInfo.listingType\'92] = \'91AuctionWithBIN\'92\ 387 | dfm[\'91listingInfo.listingType\'92] = \'91AuctionWithBIN\'92\ 388 | \ 389 | OKAY and this new information:\ 390 | \pard\pardeftab720\sl300\partightenfactor0 391 | 392 | \f4\i\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 393 | AuctionWithBIN\ 394 | Same as Auction format, but Buy It Now is enabled. AuctionWithBIN changes to Auction if a valid bid has been placed on the item. 395 | \i0 \ 396 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 397 | 398 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 Which means we have funky information BUT here is what I realized. Because ebay doesn\'92t update \'91Auction\'92 to \'91AuctionWithBIN\'92 for listings where the user updated the listing type midway through\'85\ 399 | \ 400 | \'85We need to create our own 401 | \b endListingType 402 | \b0 , that concerns the end state of the listing. \ 403 | IF an item did NOT SELL, then we care about the END state (that\'92s the only state there is). \ 404 | - if sellingState = \'91EndedWithoutSales\'92:\ 405 | - 406 | \b endListingType 407 | \b0 = listingInfo.listingType\ 408 | - 409 | \b startprice 410 | \b0 = sellingStatus.currentPrice.value\ 411 | IF an item SOLD, then we care how about the END state (the state of the item when it was sold)\ 412 | - else if sellingState = \'91EndedWithSales\'92:\ 413 | - If listingInfo.listingType was \'91Auction\'92:\ 414 | - 415 | \b if listingType= \'91Auction\'92, \'91startprice\'92=NaN, \'92bidCount\'92=1.0 (because user can only change to BIN w/ 0 bids), then that final selling bid was a Buy It Now. 416 | \b0 This also catches the case where an AuctionWithBIN \ 417 | - 418 | \b endListingType 419 | \b0 = \'91AuctionWithBIN\'92\ 420 | - 421 | \b startprice 422 | \b0 = sellingStatus.currentPrice.value\ 423 | - 424 | \b if listingType= \'91Auction\'92, \'91startprice\'92 425 | \fs32 != 426 | \fs24 NaN\ 427 | - endListingType 428 | \b0 = \'91Auction\'92\ 429 | - 430 | \b startprice 431 | \b0 = startprice\ 432 | - if listingInfo.listingType = \'91AuctionWithBIN\'92: # see note\ 433 | 434 | \b - endListingType 435 | \b0 = \'91AuctionWithBIN\'92\ 436 | - 437 | \b startprice 438 | \b0 = sellingStatus.currentPrice.value\ 439 | # I believe this is only true for items that stayed as \'91AuctionWithBIN\'92 the entire life of the listing, and if I\'92m right\'85\ 440 | # I\'92m looking for an \'91AuctionWithBIN\'92 that changed to an \'91Auction\'92 because someone put a bid, but then eventually it SOLD as BuyItNow.\ 441 | # I don\'92t that exists because (from documentation): 442 | \f4\i On most sites, 443 | \b the Buy It Now option is removed (and this value returns false) once a valid bid is made on the associated item 444 | \b0 (a valid bid could be a bid above the reserve price). buyItNowAvailable will return "false" if the listing type is anything but "AuctionWithBIN". Please ignore buyItNowAvailable for fixed-price listings. 
445 | \i0 \ 446 | 447 | \f2 \ 448 | - if listingInfo.listingType= \'91FixedPrice\'92 or listingInfo.listingType= \'92StoreInventory\'92: # \'91FixedPrice\'92 means PURE BuyItNow, and no Auction available. \ 449 | - 450 | \b endListingType 451 | \b0 = \'91FixedPrice\'92 or \'91StoreInventory\'92\ 452 | - start price = sellingStatus.currentPrice.value 453 | \f0 \ 454 | \ 455 | \ 456 | \ 457 | \ 458 | \ 459 | 460 | \f2 4/6:\ 461 | Created baseline model. Used the following features:\ 462 | [titles_df, conditions_df, auction_condition_dummies, start_price_series, sold]\ 463 | Baseline accuracy: 464 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 465 | 0.793\ 466 | 467 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 Predicted Accuracy: 468 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 469 | 0.871\ 470 | 471 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 472 | \ 473 | 4/7:\ 474 | \ 475 | Directions to move forward in:\ 476 | \ 477 | \ 478 | \ 479 | - Used regression model to predict end prices for auction listings (x)\ 480 | \ 481 | - What model is best to use for this kind of regression, where we have NLP involved? - SVM, \'85?\ 482 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 483 | 484 | \f0 \cf0 \ 485 | \ 486 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 487 | 488 | \f2 \cf0 4/9:\ 489 | -\cb17 490 | \b Does lowering the start price increase the probability of selling an item? 491 | \b0 \cb1 To explore this question, we need to compare the start price and sold_state of items that are similar. So we want to compare items that are as similar as possible, with the only difference being the start price. Then we can make a logistic regression model using just startPrice and check the coefficient to see if if there\'92s a significant correlation. So the problem is to find items that are similar to each other. How do we do this? 492 | \b Use NLP to vectorize titles, then calculate similarity between items (using title, condition as features), then take one set of similar items, and see if there is a relationship between startPrice and sold_state. \ 493 | 494 | \b0 Plot Sold (x-axis) vs. Average Start Price \ 495 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 496 | \cf0 \cb18 - Using TF-IDF and cosine similarity, we were able to make this plot using a particular listing (the \'91Auction\'92 listing having the highest number of listings similar to it - which has an index of 3080), and using a similarity threshold of 0.5, and found the mean startPrice for sold items to be about $45, and for unsold items $125. \cb1 \ 497 | Let\'92s move on to plotting a change startPrice vs Probability of selling. \ 498 | \ 499 | \cb17 - 500 | \b We want to test if changing the startPrice actually causes my model to increase the likelihood of selling 501 | \b0 .\cb1 So, using my current classifier, I\'92ll take a listing that is very common (has many listings similar to itself), and have the model repeatedly calculate probability of SOLD for a listing, as I increase it\'92s start Price from 0x to 2x. Make a plot of the results. 1\ 502 | \cb18 - I\'92m only able to plot 1 plot at a time, so I can\'92t get an aggregate view. Based on briefly looking at some of the items, most of them seem to follow the pattern whereby as you increase startPrice, the probability of selling decreases. 
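A sketch of that start-price sweep, assuming an already-fitted classifier clf, a feature-building helper, and a single listing row; all names are illustrative:

import numpy as np
import matplotlib.pyplot as plt

multipliers = np.linspace(0.0, 2.0, 41)
sell_probs = []
for m in multipliers:
    candidate = listing.copy()
    candidate['startPrice'] = listing['startPrice'] * m
    X = build_features(candidate)                  # same transform used for training
    sell_probs.append(clf.predict_proba(X)[0, 1])  # P(sold)

plt.plot(multipliers, sell_probs)
plt.xlabel('start price multiplier (0x to 2x)')
plt.ylabel('predicted probability of selling')
plt.show()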
\cb1 \ 503 | \ 504 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 505 | \cf4 \cb11 - Another exciting problem came up. \cf0 \cb1 \ 506 | - Discovered that there are very few auctions with an endPrice between ~120 to ~180, and I\'92m not sure why this is. \ 507 | - \cf4 We should collect more \'91Auction\'92 data with a 120 < endprice < 180 to make up for this shortage (x)\cf0 \ 508 | - use 509 | \f4\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 510 | MinPrice in item filter 511 | \f2 with 120 to 180 dollars. (x)\ 512 | - then delete duplicates (x) 513 | \fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 514 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 515 | \cf4 We have a huge dearth of items with endPrices around $120 to $180. I explored that and then then found that the create_end_listing_type_and_start_price was creating a lot of endListingType values of \'91Invalid URL\'92, which I was then filtering out, which was also filtering out a lot of endPrices in the range 120-180. I thought this meant that the URLs were actually 516 | \i \cf4 invalid 517 | \i0 \cf4 , but then I discovered that these URLs were actually valid (by using manual HTTP requests instead of the python port). So i looked into how I create the endListingType of \'91Invalid URL\'92 and found that the important condition is to have a startPrice of NaN, so basically a lot of items with endPrice in the range 120-180 had NaN startPrices\ 518 | \ 519 | So I had a lot of startPrice of NaN, which is weird because my scraper should have collected a startPrice if there is a valid URL. But my clue was that this weird behavior was only occurring for items with a specific range of endPrices, so that led me to believe that the most likely explanation was that I simply had accidentally skipped scraping some items when I set my url_start_index. So to attempt to scrape the right ones, I changed the scraper to query the database for items starting at the LEAST recent, and because I started scraping at $20 and moved up in price over time, I found out that the $120 items begin at around index 39500 in my database. So I started scraping at 39500, and we get to around $180 at index 54000, and scraping this many should take around 4 hours. \ 520 | \ 521 | Let\'92s see if this works. \ 522 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 523 | \cf4 \cb18 Yep, that was the problem. Just needed to scrape every itemId in our database. We\'92re still at a dearth for items with an endPrice > $650, so we need to keep scraping. \cf0 \cb1 \ 524 | \ 525 | \ 526 | 4/10:\ 527 | Goals:\ 528 | - Feature Engineer median startPrice of similar, completed listings\ 529 | - Make Pickles \ 530 | \ 531 | Journal:\ 532 | 1. Create a pickle out of cleaned auctions DataFrame.\ 533 | 2. Create a new notebook for feature engineering the median start price of concurrent listings\ 534 | - Create a pile of feature engineered auctions data frame\ 535 | - Import new auctions into old notebook \ 536 | \ 537 | 4/11:\ 538 | - Working on feature engineer, but the function is estimated to take around 2 hours. So I\'92ll run it during passover. \ 539 | - It ran in like 10 minutes. Weird. \ 540 | \ 541 | 4/12: \ 542 | - Add concurrent similar median start price feature and see if it improves model. \ 543 | - It did not. 
(x)\ 544 | - save EC2 Image (x)\ 545 | - Grid Search on EC2 instance for best classifier \ 546 | - Find median endPrice of similar, completed listings\ 547 | - profitability metric \ 548 | - Take a random sample of data\ 549 | - For each sample:\ 550 | - Calculate sold_probability and predicted_end_price for a range of start prices from 0 to 2x start price. \ 551 | - where sold_probability > 0.5, calculate optimal start price, and optimal end price. sold_state_pred = 1.\ 552 | - If there is no sold_probability > 0.5, then optimal start price = 0 and optimal end price = 0. sold_state_pred = 0.\ 553 | - if sold_state_pred == 1 and sold_state_true == 1:\ 554 | - Calculate profit_diff = end_price_pred - end_price_true\ 555 | - if sold_state_pred == 1 and sold_state_true == 0:\ 556 | - Calculate profit_diff = end_price_pred - end_price_true (which is 0)\ 557 | - if sold_state_pred == 0 and sold_state_true == 1:\ 558 | - Calculate profit_diff = end_price_pred (which is 0) - end_price_true\ 559 | average_profit_lift = np.mean(profit_diff)\ 560 | \ 561 | 4/13:\ 562 | - Created new EC2 instance (x)\ 563 | - Used default Linux AMI ->\ 564 | - Installed Anaconda with:\ 565 | - $ wget https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh\ 566 | - $ bash Anaconda2-4.3.1-Linux-x86_64.sh n\ 567 | - Closed SSH opened new SSH\ 568 | - Transfer files to instance \ 569 | - ssh -i "ebay-price-predictor-3.pem" -L 8000:localhost:8888 ec2-user@ec2-54-183-29-45.us-west-1.compute.amazonaws.com\ 570 | \ 571 | - (on ec2) Grid Search RandomForestClassifier (X)\ 572 | - run a process in background: nohup python classification-grid-search.py &\ 573 | - $ top -> k -> (PID_#) -> 15\ 574 | \ 575 | 576 | \b Best Estimator 577 | \b0 : RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\ 578 | max_depth=10, max_features='auto', max_leaf_nodes=None,\ 579 | min_impurity_split=1e-07, min_samples_leaf=5,\ 580 | min_samples_split=6, min_weight_fraction_leaf=0.0,\ 581 | n_estimators=500, n_jobs=-1, oob_score=False,\ 582 | random_state=None, verbose=0, warm_start=False)\ 583 | - 584 | \b Best Score 585 | \b0 (roc_auc) 0.6544\ 586 | - 587 | \b Best Parameters 588 | \b0 : \{'max_features': 'auto', 'min_samples_split': 6, 'n_estimators': 500, 'max_depth': 10, 'min_samples_leaf': 5\}\ 589 | \ 590 | - (on ec2) Tried ExtraTreesClassifier(n_estimators=25, boostrap=True), which took WAY longer, and didn\'92t improve results. \ 591 | \ 592 | - Explore distribution of end prices \ 593 | - It\'92s skewed to the left \ 594 | - Should transform endPrice (log(endPrice) = output) and predict that then transform back (endPrice = 10^output ? 
)\ 595 | \ 596 | - Create time of day feature (x)\ 597 | - did not really help \ 598 | \ 599 | Question to ask vrushank:\ 600 | - How to transform endPrice?\ 601 | - Explain time of day feature didn\'92t help\ 602 | - \ 603 | \ 604 | \ 605 | Presentation:\ 606 | \pard\pardeftab720\sl320\partightenfactor0 607 | 608 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 609 | Baseline Accuracy: 0.857667278492\ 610 | Baseline RFC accuracy: 0.89738027699\ 611 | \ 612 | baseline mean_absolute_error: 66.7116406965 \ 613 | predicted mean_absolute_error: 27.0460162035\ 614 | \ 615 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 616 | 617 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 618 | \pard\pardeftab720\sl320\partightenfactor0 619 | 620 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 621 | Optimal Predicted End Price:$618.714156395, Optimal Start Price:$590.0, Chance of Selling:0.87, Expected Profit:$538.281316064\ 622 | \pard\pardeftab720\sl320\partightenfactor0 623 | 624 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 625 | \pard\pardeftab720\sl320\partightenfactor0 626 | 627 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 628 | Average End Price: $262.72 629 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 630 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 631 | 632 | \f1\fs28 \cf0 Average Lift: $34.20\ 633 | % Average Lift: 13% Increased Profit on Average!\ 634 | on 20,000 Auction Listings: $\cf2 \cb16 \expnd0\expndtw0\kerning0 635 | 6,719,374.15 Net Increased Profit\ 636 | \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 637 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 638 | 639 | \f2\fs24 \cf0 \ 640 | \ 641 | \ 642 | Goals:\cf10 \ 643 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 644 | 645 | \b \cf0 Feature Engineering: 646 | \b0 \ 647 | - median startPrice of similar, completed listings (x)\ 648 | - Use preprocessing.normalize() on data and then create SCM start price feature again\ 649 | - Talk with Vrushank about investigation into SCM feature \ 650 | - do i need to normalize output end Prices since they are not normally distributed?\ 651 | - How to combine valida`tion metric and profitability metric? \ 652 | - median endPrice of similar, completed listings \ 653 | - time of day listing went on (x)\ 654 | \ 655 | 656 | \b Model Optimization (to combat over-fitting):\ 657 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 658 | 659 | \b0 \cf0 - plot a learning curve to figure out how much data I really need until training error & test error converge\ 660 | - Use this information to downsample (using pandas sample) when grid searching (to make models quicker to train) \ 661 | - grid search vectorizer settings\ 662 | - grid search models \ 663 | - grid search model parameters \ 664 | - optimize classification models for Recall \ 665 | - optimize regression models for mean_absolute_error \ 666 | \ 667 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 668 | 669 | \b \cf0 Profitability Optimization Metric: 670 | \b0 \ 671 | - Calculate the average expected price increase we were able to get for all items. 
\ 672 | - Use AWS/Dominoe to do this \ 673 | \ 674 | 675 | \b Identify Listing factors that contribute to changes in probability of selling & predicted end price: 676 | \b0 \ 677 | - Recommend replacements/deletion for words that decrease probability\ 678 | \ 679 | 680 | \b Using Images + NN To Predict Sale 681 | \b0 \ 682 | \ 683 | \ 684 | 4/14:\ 685 | - Extract Listing Features - Model, Lens, MegaPixels\ 686 | - we can use the getItem Call in the Trading API to get category specific information (like Model, MegaPixels)\ 687 | - We used getCategorySpecifics Call in the Trading API to get the \'93Item Specifics\'94 for the Digital Camera (#31388) category. They are:\ 688 | Type\ 689 | Brand\ 690 | MPN\ 691 | Series\ 692 | Model\ 693 | Megapixels\ 694 | Optical Zoom\ 695 | Features\ 696 | Color\ 697 | Bundled Items\ 698 | Connectivity\ 699 | Battery Type\ 700 | Manufacturer Warranty\ 701 | Screen Size\ 702 | Digital Zoom\ 703 | Country/Region of Manufacture\ 704 | \ 705 | 4/15:\ 706 | \ 707 | 3 ways to extract Brand, Model information:\ 708 | 1. Use ebay API. Problematic because we only get a certain number of calls per day.\ 709 | 2. Scrapy. Long development + takes many hours. \ 710 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 711 | \cf0 \cb17 3. Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 712 | \ 713 | \ 714 | I think the most important features for each listings are:\ 715 | Brand - \cb17 Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 716 | Model - \cb17 Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 717 | Lens Type - \cb17 Extract Lens Type from title using string matching (\'93Body\'94=0, \\d-\\d=1, \'93Len\'94=1, \'93mm\'94=1. \cb1 \ 718 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 719 | \cf0 Condition - Already have (x)\ 720 | Seller Feedback - Already have (x)\ 721 | Free Shipping - Already have (x)\ 722 | Bundled Items / Extas - \cb14 Will need to use scrapy or several days of ebay API. Not likely to happen. \cb1 \ 723 | Original Packaging - \cb19 Would need to use scrapy. Not likely to happen\cb1 \ 724 | \ 725 | Features to Engineer with these features:\ 726 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 727 | 728 | \b \cf0 - Median listing price of FixedPrice listings of same Brand,Model, Condition for Classification.\ 729 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 730 | \cf0 - Median end price of Auctions,FixedPrice listings of same Brand,Model, Condition for Regression. 731 | \b0 \ 732 | \ 733 | \ 734 | Get a list of brands/models with:\ 735 | select DISTINCT "Brand" from category_specifics;\ 736 | select DISTINCT "Model" from category_specifics;\ 737 | \ 738 | 4/16:\ 739 | Model Extraction worked really well!!!\ 740 | First I extract obvious models with regex (for example 741 | \f5\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 742 | dsc-wx300 743 | \f2 ). 744 | \fs24 Then i use TF-IDF vectorization + cosine similarity to match a filtered version of the title with the list of models pulled using ebay API. From a spot check, it\'92s working really well! 
More issues with the regex extraction than with TF-IDF actually, so we\'92re going to try to use TF-IDF the whole way. 745 | \b Actually just did a test - Regex is messing things up, so let\'92s just use TF-IDF + Cosine Similarity to extract model name on all listings. 746 | \b0 \ 747 | \ 748 | We also need to make more API calls with getItem so that we have a bigger camera Model list. Right now we only have like 1000 distinct models. Which, with my spot check, has been fine, but we can use some of my 5000 limit to potentially get more models. We could also use the API to get Bundled Items. However I think I want to focus on just auctions at this point. \ 749 | So I need to 750 | \b create a table called 751 | \f3 category_specifics_auctions 752 | \f2\b0 :\ 753 | 1. Copy itemId from completed_items_v2 where listingType != \'91FixedPrice\'92 and listingType != \'91StoreInventory\'92 into table called category_specifics_auctions\ 754 | 2. Update category_specifics_auctions using rows from category_specifics that have non-empty Brands.\ 755 | TOO TIME-INTENSIVE!\ 756 | We\'92re just going to use category_specifics to get a list of brands and models. \ 757 | We will need to use scrapy if we want to get Bundled Items.\ 758 | \ 759 | Lens extraction with regex went fine, but I realized that the 18-55mm text is only PART of the lens description. There 760 | \f1\fs28 is more to the lens than the 15-55mm 761 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 762 | \ 763 | Okay so Model Extraction did NOT work really well. I don\'92t know why I thought it worked well earlier. \ 764 | I still need to extract brand, model information from title. But I need a CLEAN LIST of models to compare with the listing title. So In order to get a clean list, I\'92ll scrape B&H Video for all their cameras, and store that information into a table in my database called b_h_camera_inventory\ 765 | Create table:\ 766 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 767 | 768 | \f1 \cf0 CREATE TABLE b_h_digital_camera_inventory (\ 769 | "id" SERIAL PRIMARY KEY,\ 770 | "Brand" TEXT,\ 771 | "Model" TEXT,\ 772 | "Retail Price" DECIMAL,\ 773 | "Body Only" BOOLEAN,\ 774 | "Kit" BOOLEAN,\ 775 | "Has Lens" BOOLEAN,\ 776 | "Lens" TEXT,\ 777 | "B&H Id" TEXT,\ 778 | "Title" TEXT\ 779 | ); 780 | \f2 \ 781 | \ 782 | # -*- coding: utf-8 -*-\ 783 | \ 784 | # Define here the models for your scraped items\ 785 | #\ 786 | # See documentation in:\ 787 | # http://doc.scrapy.org/en/latest/topics/items.html\ 788 | \ 789 | import scrapy\ 790 | \ 791 | \ 792 | class CameraRetailerScraperItem(scrapy.Item):\ 793 | # define the fields for your item here like:\ 794 | # name = scrapy.Field()\ 795 | brand = scrapy.Field(default='NULL')\ 796 | model = scrapy.Field(default='NULL')\ 797 | retailPrice = scrapy.Field(default='NULL')\ 798 | bodyOnly = scrapy.Field(default='NULL')\ 799 | kit = scrapy.Field(default='NULL')\ 800 | hasLens = scrapy.Field(default='NULL')\ 801 | lens = scrapy.Field(default='NULL')\ 802 | bhId = scrapy.Field(default='NULL')\ 803 | \ 804 | \ 805 | \ 806 | 4/17:\ 807 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 808 | 809 | \b\fs32 \cf0 Classification 810 | \b0\fs24 \ 811 | 812 | \b\fs28 pd.read_pickle('./pickles/df_classification_count_vec.p') 813 | \b0\fs24 \ 814 | \ 815 | RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1) \ 816 | 
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 817 | \cf0 \ul \ulc0 Not Cross Val\ulnone \ 818 | Baseline Accuracy: 0.847\ 819 | Model accuracy: 0.903\ 820 | \ul 3-Fold Cross Val:\ulnone \ 821 | \ 822 | Logistic Regression:\ 823 | \ul Not\ulnone \ul Cross Val:\ulnone \ 824 | Baseline Accuracy: 0.8475 \ 825 | Model accuracy: 0.8754\ 826 | \ul 3-Fold\ulnone \ul Cross Val:\ulnone \ 827 | \cb20 Accuracy: 0.781099028892\cb1 \ 828 | \ 829 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 830 | \cf0 Logistic Regression Interesting Important Features:\ 831 | 6th (u\'92fast', 1.87)\ 832 | 9th (u\'92box', 1.6658)\ 833 | 18th (u\'92gently used', 1.3718)\ 834 | 25th (u\'92good cosmetic', 1.0716)\ 835 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 836 | \cf0 \ 837 | \ 838 | Classification Ensemble (RF, LR, XG):\ 839 | \ul KFold Cross Val:\ulnone \ 840 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 841 | \cf0 [(0.94599636950383215, 0.89573459715639814), \ 842 | (0.94433519891090612, 0.88674868898749493), \ 843 | (0.94418393586446836, 0.89179104477611937)]\ 844 | \cb21 Overfitting.\cb1 \ 845 | Baseline Accuracy: 0.854 846 | \f1 \ 847 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 848 | 849 | \f2 \cf0 \cb22 Cross Validated Ensemble GMean Prediction Accuracy: 0.891\ 850 | Increase Accuracy due to model: 0.036\cb1 \ 851 | \ 852 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 853 | 854 | \b\fs32 \cf0 Regression\ 855 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 856 | 857 | \b0\fs24 \cf0 RandomForestRegressor\ 858 | \pard\pardeftab720\sl320\partightenfactor0 859 | 860 | \f1\fs28 \cf23 \cb24 \expnd0\expndtw0\kerning0 861 | \outl0\strokewidth0 \strokec23 Average Cross Validated RFR Score: -41.1394388889\ 862 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 863 | 864 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \outl0\strokewidth0 \ 865 | \ 866 | \ 867 | \ 868 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 869 | \cf0 \ 870 | \ 871 | \ 872 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 873 | \cf0 \ 874 | \ 875 | \ 876 | \ 877 | \ 878 | \ 879 | \ 880 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ebay Listing Optimizer With Machine Learning 2 | 3 | If you've ever bought something on Ebay, you know that it can be difficult to know if a particular listing is a good deal or not. And if you're selling, it can be hard to determine which options will draw bidders to your auction. What if there was a way to increase the likelihood that you would sell your listing on ebay, just by swapping a few keywords in your title? What if there was a way to filter listings for only the best deals? 
4 | 5 | To investigate these questions, I dug into actual ebay data and built a machine learning system to help sellers make more sales on ebay, and to alert shoppers to the best deals so they can make smarter buying decisions and save money. In order to do that, I had to create models that could predict whether an auction would sell (at least one person would bid), and, if a listing would sell, how much it would sell for. 6 | 7 | In this report, I'll discuss the decisions I made and show relevant code blocks and visualizations that describe my modeling process. 8 | 9 | # 1. Web Scraping, Data Cleaning, Data Piping 10 | Using the Ebay API and a related Python wrapper [https://github.com/timotheus/ebaysdk-python], I collected data for 100,000 completed listings in the "Digital Camera" category for the past 3 months, and stored the data in a table within a postgres database. I ended up using 20,000 "Auction" type listings within the original dataset. 11 | ![Sample of rows for a listing in Postgres database](./capstone-technical-report/images/completed_items_v2.png) 12 | 13 | The data included features like 14 | 15 | - Listing Title 16 | - Seller Feedback Score 17 | - Free Shipping Available 18 | - Listing End Price 19 | 20 | However, I suspected that start price and condition descriptions would be important features. Since they were not available through the API, I created a scraper with Scrapy to fetch URLs from my database, scrape the start price and condition, and store them back into my database. 21 | 22 | Now that I had the necessary data in my database, I could import it into Python. 23 | ![Subset of columns of my data](./capstone-technical-report/images/example_dataframe.png) 24 | 25 | 26 | # 2. Pre-Processing 27 | 28 | Pre-processing involved transforming the text features (the listing title and the condition description) into numerical data. I used TF-IDF vectorization for the listing titles, since I wanted unique words, like camera model names, to receive higher weights. For condition descriptions, I used a count vectorizer, because I noticed many of the same words used across many different descriptions, such as "Functional", "Like New", and "Scratched." I didn't want to down-weight these kinds of words. 29 | 30 | I also scaled all predictors to the same range so that I could compare model coefficients to each other directly. 31 | 32 | 33 | # 3. Modeling 34 | 35 | ### Classification 36 | 37 | The classification problem involved predicting whether or not a given auction listing would sell. 38 | 39 | As a baseline, I used a model which simply predicted the majority class (1) for each listing, which resulted in the following scores: 40 | 41 | | Model | Accuracy | Precision | Recall | F-1 | 42 | |---------------------------|----------|-----------|--------|-------| 43 | | Majority Class Classifier | 0.854 | 0.854 | 1.000 | 0.917 | 44 | 45 | I then ran a simple Random Forest (100 trees), which gave me the following results for 3-fold CV: 46 | 47 | | Model | Accuracy | Precision | Recall | F-1 | 48 | |---------------------------|----------|-----------|--------|-------| 49 | | Random Forest Classifier | 0.860 | 0.877 | 0.95 | 0.911 | 50 | 51 | I chose to optimize for accuracy, since I considered false positives to be just as important as false negatives for predicting the sale state of auction listings. Accuracy provides a simple metric for evaluating how many listings my models are classifying incorrectly. 52 | 53 | ### Regression 54 | 55 | The regression problem was to predict the end price *for listings that sold*. 
I was not interested in predicting the end price of items that did not sell, since that did not fit into my goal of helping buyers find over-valued or under-valued deals. If my model was trained on unsold data, then the regression results could not reliably be used to predict what price items will eventually *sell* at. 56 | 57 | As a baseline, I used the median price of all listings as my prediction for all listings. 58 | 59 | | Model | Median Absolute Error ($) | 60 | |---------------------------|----------| 61 | | Median End Price Prediction | $66.71 | 62 | 63 | I then ran a simple random forest regressor and got the following result: 64 | 65 | | Model | Median Absolute Error ($) | 66 | |---------------------------|----------| 67 | | Baseline Random Forest Regressor | $38.36 (-$28.35) (13.3%) | 68 | 69 | This first model dropped the median absolute error by $28.35, landing at an average error of 13.3% relative to the actual end price of each listing. 70 | 71 | Before moving on to ensembling various classifiers, I decided to attempt to create a feature in the hopes of increasing the accuracy of my model. 72 | 73 | # 4. Feature Engineering 74 | When I thought about what potential factors could contribute to a particular listing selling or not, I hypothesized that listings on ebay are affected by other similar listings. Specifically, I thought that the start price of auctions listed on ebay at the same time, or listed "concurrently", would affect their respective end prices, and I wanted to explore this route. I suspected that the current price of each listing at the time of listing might be more influential than the start price, but in the interest of time, I decided to focus on start price. 75 | 76 | I defined a listing to be concurrent with another listing if the second was posted before the first ended (without any restriction on how much the two listings needed to overlap), and performed the filtering in Python. 77 | 78 | In order to filter for "similar listings", I vectorized each listing's title using sklearn's `TfidfVectorizer` and then calculated a cosine similarity score for each listing. I took only the top 5 most similar items, or those items with a similarity score greater than 0.95, whichever provided more results. I chose 5 and 0.95 through spot-checking the results for a balance between number of results and accuracy in terms of observed similarity. 79 | 80 | The essence of the code is along the lines of the following sketch (variable names are illustrative): 81 | ```python 82 | concurrent_listings_df = auctions_subset[ 83 | auctions_subset['listingInfo.endTime'].apply(lambda sub_listing_et: listing_start_time < sub_listing_et) 84 | & (similarity_scores > min_sim_score)] 85 | ``` 86 | 87 | After I had the top 5 concurrent, similar listings, I took the median start price, and used that as a feature in my models. 88 | 89 | Unfortunately, the new feature did not improve either model: 90 | Accuracy score with feature: 0.825 (-2.9%) 91 | Median Absolute Error: $42.12 (+$3.76) 92 | 93 | When I plotted the median start price of concurrent, similar listings versus end price, I found this: 94 | ![Concurrent Median start price vs. End Price](./capstone-technical-report/images/csm_start_price.png) 95 | 96 | There is no correlation between the two, which suggests that people do not consider the *start price* of concurrent, similar listings when deciding to bid on items. However, my hunch is that people do consider the *current price* of concurrent, similar listings. 
Due to time constraints, I decided to move on instead of attempting to acquire the bidding history for each listing. 97 | 98 | # 5. Hyper-parameter Optimization 99 | 100 | To increase the accuracy of my modeling efforts, I decided to create an ensemble of classifiers for the classification task. I experimented with KNN, Logistic Regression, Gradient Boosted Trees, and Random Forest. Ultimately, I chose an ensemble of Gradient Boosted Trees, Random Forest, and Logistic Regression, averaging their respective probability estimates through a geometric mean and using a decision threshold of 0.5. 101 | 102 | Although the feature engineering was not as successful as I had hoped, I knew that I could still reduce over-fitting, and therefore reduce my test error, by grid-searching for optimal hyper-parameters. I knew that I was over-fitting because my training and test errors were significantly different. For the Random Forest classifier, my accuracy scores were 0.927 for training and 0.877 for testing. The difference of 5% told me that my model was not generalizing well enough, and that tuning hyper-parameters would potentially decrease the variance of my model. 103 | 104 | I used Amazon EC2 to run the grid search and model fitting on a more powerful computer than I had available locally. 105 | 106 | ### Classification 107 | For the XGBoost model, I grid searched through: 108 | ```python 109 | 'max_depth': [3,5,7,9] 110 | ``` 111 | and found the best depth to be 7. 112 | 113 | For logistic regression, adding an L2 (Ridge) regularization term with a weight of 0.8 provided the best results. 114 | 115 | And for Random Forest, I grid searched through: 116 | ```python 117 | 'min_samples_split':[2, 4, 6], 118 | 'min_samples_leaf':[1,3,5], 119 | 'max_depth':[4, 8] 120 | ``` 121 | but found the best parameters to be the defaults (with 500 trees). 122 | 123 | When I combined these three models into an ensemble, I was able to achieve the following scores: 124 | 125 | | Model | Accuracy | Precision | Recall | F-1 | 126 | |---------------------------|----------|-----------|--------|-------| 127 | | Ensemble (RF, LR, XGboost) | 0.891 (+3.7%) | 0.90 | 0.98 | 0.942 | 128 | 129 | An increase of 3.7% from baseline. Modest improvement! 130 | 131 | ### Regression 132 | 133 | For regression, I experimented with Linear Regression (including Lasso and Ridge regularization), Random Forest Regressor, and SKLearn's Gradient Boosted Regressor, and found the best model to be the Gradient Boosted Regressor. 134 | 135 | 136 | | Model | Median Absolute Error ($) | 137 | |---------------------------|----------| 138 | | Optimized Gradient Boosted Regressor | $25.82 (-$40.91), or 9.6% | 139 | 140 | At best, we were able to achieve an average error of 9.6% on end price predictions across the entire dataset. 141 | 142 | 143 | # 6. Application 144 | 145 | ### Seller Listing Optimizer (Classification) 146 | 147 | It's a neat ML exercise to try to predict whether an auction is going to sell or not, but how is it useful to sellers? One way is that, by looking at feature weights in the logistic regression classifier, we can determine which features increase or decrease the probability of sale of a listing. 148 | 149 | For instance, it is possible that certain words in the title or condition description may increase the probability of sale, and in fact, that is the case. 
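As a rough sketch of how these weights can be inspected, the snippet below assumes a fitted scikit-learn `LogisticRegression` (here called `log_reg`) and the fitted text vectorizer (`vectorizer`) used on the title and condition text; these names are placeholders rather than the exact objects from my notebooks:

```python
import numpy as np

# Illustrative only: `vectorizer` and `log_reg` stand in for the fitted
# CountVectorizer/TfidfVectorizer and LogisticRegression used in this project.
feature_names = np.array(vectorizer.get_feature_names())
coefs = log_reg.coef_.ravel()  # one coefficient per vectorized token

# The largest positive coefficients are the words most strongly associated
# with an auction selling (class 1).
top_positive = np.argsort(coefs)[::-1][:25]
for rank, idx in enumerate(top_positive, start=1):
    print('{:>2}. {:<20} {:+.4f}'.format(rank, feature_names[idx], coefs[idx]))
```

Because all predictors were scaled to the same range, these coefficients can be compared against one another directly, which is what makes the word-level comparisons below meaningful.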
150 | 151 | Looking at the feature weights in the logistic regression model, we can see that some words are associated with increases in the probability of sale: 152 | 'slightly used', 0.649 153 | 'gently used', 1.369 154 | 155 | 'Slightly used' and 'gently used' are two ways of saying the same thing, but one is associated with a much higher probability of sale. To test the effect of using 'gently used', we can look at the following case study: 156 | ![Case Study](./capstone-technical-report/images/classification_case_study.png) 157 | 158 | The condition description - *Camera is in good, working condition with minor cosmetic wear* - is what we want to focus on. What happens if we include the term "gently used" in the condition description? 159 | ![Gently used](./capstone-technical-report/images/gently_used.png) 160 | 161 | As you can see from the bar chart above, including the term "gently used" in the condition description causes the model to predict a 3% higher probability of sale for this particular item. 162 | 163 | Although it is only a modest increase, the example shows that features can be tweaked to increase a listing's probability of sale. 164 | 165 | ### Over-Valued and Under-Valued Item Alert (Regression) 166 | 167 | On the buyer's side of things, predicting the end price of an auction is useful because it gives us an idea of the value of items on Ebay. If we know the going price for an auction on ebay, then we can alert buyers to whether or not the item is under- or over-valued at the current bid price. If it's over-valued, the buyer can avoid that listing and instead wait for a listing that is under-valued. The buyer can also simply set a maximum bid ceiling equal to the predicted end price, and feel comfortable knowing that they won't be paying more than the market rate for that item. 168 | 169 | Let's take a look at an example: 170 | ![cyber shot](./capstone-technical-report/images/cyber_shot.png) 171 | 172 | This Cyber Shot digital camera sold for $369.00, but did the buyer get a good deal? 173 | 174 | Let's investigate. 175 | 176 | The actual end price for the camera was $369.00, but the *predicted* end price, according to our model, was $314.90. 177 | If we factor in our 9.6% average error, then in the worst case, where the error pushes the prediction upward, the predicted price could be as high as $314.90 + 9.6% = $345.13. 178 | 179 | In this scenario, the buyer actually overpaid by $369.00 - $345.13 = $23.87. 180 | 181 | If the buyer had waited to bid on another listing, they could have potentially saved $24! 182 | 183 | If we aggregate these findings to all sold listings in our database, we find that 24.4% of listings, or 3,959, were over-priced. The average amount over-paid on each listing is $27.85, which means that all buyers of digital cameras on ebay could have collectively saved $110,277. That's a nice chunk of change! 184 | 185 | Below is a mock-up of a web app that could use the models to alert buyers to the value of items they are browsing. 186 | 187 | ![Mock up of a web app that would use the research I've outlined in this report.](./capstone-technical-report/images/buyers_guide_example.png) 188 | 189 | Buyers would know which listings are OK to bid on, and which they should avoid in order to maximize their spending power. 190 | 191 | # 7. Conclusion 192 | 193 | Overall, I found this to be an incredibly helpful learning experience; however, I would want to reduce the regression error to below 5% before I would use the model myself. 
194 | 195 | One avenue I would be interested in exploring is using listing images as features in my model. One way of doing this would be to train a neural network using the greyscale image matrix as an input, with the sold state (1=sold, 0=unsold) as the target. My hypothesis is that higher quality images tend to sell more often than lower quality images, and if this were true, then the neural network would learn to distinguish low quality images from high quality ones. The network could then be used to classify each image, and that classification could be fed as an input into the classification and regression models. This is just one idea for how I might extend this project in the future. 196 | 197 | Thanks for reading! I hope you find this write-up useful in your own data science journey. 198 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/__init__.py -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/__init__.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BhPhotoDigitalCameraItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | brand = scrapy.Field() 15 | model = scrapy.Field() 16 | retail_price = scrapy.Field() 17 | body_only = scrapy.Field() 18 | kit = scrapy.Field() 19 | has_lens = scrapy.Field() 20 | lens = scrapy.Field() 21 | bh_id = scrapy.Field() 22 | title = scrapy.Field() -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/items.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BhPhotoScraperSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import psycopg2 9 | import logging 10 | 11 | 12 | class BhPhotoDigitalCameraPipeline(object): 13 | 14 | def __init__(self, postgres_host,postgres_user,postgres_db,postgres_table): 15 | self.postgres_host=postgres_host 16 | self.postgres_user=postgres_user 17 | self.postgres_db=postgres_db 18 | self.postgres_table=postgres_table 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls( 23 | postgres_host=crawler.settings.get('POSTGRES_HOST'), 24 | postgres_user=crawler.settings.get('POSTGRES_USER'), 25 | postgres_db=crawler.settings.get('POSTGRES_DB'), 26 | postgres_table=crawler.settings.get('POSTGRES_TABLE'), 27 | ) 28 | 29 | 30 | def open_spider(self, spider): 31 | self.conn = psycopg2.connect("dbname={} user={} host={}".format(self.postgres_db, \ 32 | self.postgres_user, \ 33 | self.postgres_host) \ 34 | ) 35 | self.cur = self.conn.cursor() 36 | 37 | 38 | def process_item(self, item, spider): 39 | '''store data into postgres database 40 | 41 | ''' 42 | for field in item.fields: 43 | item.setdefault(field, 'NULL') 44 | 45 | # SQL = ''' 46 | # UPDATE {table_name} 47 | # SET "Brand"='{brand}', 48 | # "Model"='{model}', 49 | # "Retail Price"={retail_price}, 50 | # "Body Only"={body_only}, 51 | # "Kit"={kit}, 52 | # "Has Lens"={has_lens}, 53 | # "Lens"='{lens}', 54 | # "B&H Id"='{bh_id}', 55 | # "Title"='{title}' 56 | # ; 57 | # '''.format( table_name=self.postgres_table, 58 | # brand=item['brand'], 59 | # model=item['model'], 60 | # retail_price=item['retail_price'], 61 | # body_only=item['body_only'], 62 
| # kit=item['kit'], 63 | # has_lens=item['has_lens'], 64 | # lens=item['lens'], 65 | # bh_id=item['bh_id'], 66 | # title=item['title'] 67 | # ) 68 | 69 | 70 | insert_statement = '''INSERT INTO {table_name} (%s) VALUES %s;'''.format(table_name=self.postgres_table) 71 | 72 | keys = ['Brand','Title'] 73 | keys = ['"{}"'.format(key) for key in keys] 74 | values = (item['brand'],item['title']) 75 | 76 | SQL = self.cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), values)) 77 | 78 | try: 79 | self.cur.execute(SQL) # execute SQL, and commit changes 80 | self.conn.commit() 81 | except: 82 | logging.debug('Error with executing SQL statement.\n SQL = {}'.format(SQL)) 83 | self.conn.rollback() 84 | 85 | 86 | return item 87 | 88 | 89 | def close_spider(self, spider): 90 | self.conn.close() 91 | self.cur.close() 92 | 93 | 94 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/pipelines.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bh_photo_scraper project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | import logging 13 | 14 | 15 | BOT_NAME = 'bh_photo_scraper' 16 | 17 | SPIDER_MODULES = ['bh_photo_scraper.spiders'] 18 | NEWSPIDER_MODULE = 'bh_photo_scraper.spiders' 19 | 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0" 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | CONCURRENT_REQUESTS = 1 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | DOWNLOAD_DELAY = 1 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'bh_photo_scraper.middlewares.BhPhotoScraperSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'bh_photo_scraper.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | #ITEM_PIPELINES = { 71 | # 'bh_photo_scraper.pipelines.BhPhotoScraperPipeline': 300, 72 | #} 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | ITEM_PIPELINES = { 77 | 'bh_photo_scraper.pipelines.BhPhotoDigitalCameraPipeline': 300, 78 | } 79 | # set up the pipeline settings for postgres 80 | POSTGRES_HOST = "localhost" 81 | POSTGRES_USER = "nathan" 82 | POSTGRES_DB = "ebay" 83 | POSTGRES_TABLE = "b_h_digital_camera_inventory" 84 | 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | AUTOTHROTTLE_START_DELAY = 1 91 | # The maximum download delay to be set in case of high latencies 92 | AUTOTHROTTLE_MAX_DELAY = 2 93 | # The average number of requests Scrapy should be sending in parallel to 94 | # each remote server 95 | AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | 107 | # LOGGING 108 | # LOG_FILE = '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_spider_log.log' 109 | # LOG_ENABLED = True 110 | # LOG_LEVEL = logging.ERROR 111 | 112 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/settings.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/spiders/__init__.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/bh_photo_scraper/bh_photo_scraper/') 3 | 4 | import scrapy 5 | from scrapy.spiders import CrawlSpider 6 | import psycopg2 7 | import logging 8 | 9 | from items import BhPhotoDigitalCameraItem 10 | from bs4 import BeautifulSoup 11 | 12 | class DigitalCameraSpider(scrapy.Spider): 13 | name = "digital_camera_spider" 14 | 15 | 16 | def start_requests(self): 17 | 18 | url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/N/4288586282' # Digital Cameras 19 | # url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/1/N/4288586282?via=js' 20 | 21 | # Get number of pages 22 | yield scrapy.Request(url=url, callback=self.get_num_pages) 23 | 24 | # yield scrapy.Request(url=url, callback=self.parse, meta={'num_pages':num_pages}) 25 | 26 | 27 | 28 | def parse(self, response): 29 | 30 | num_pages = response.meta['num_pages'] 31 | page_num = response.meta['page_num'] 32 | 33 | ids = response.xpath("//span[1]/span[@class='sku']/text()").extract() 34 | 35 | brands = response.xpath("//a[@class='c5']/span[1]/text()").extract() 36 | titles = response.xpath("//a[@class='c5']/span[2]/text()").extract() 37 | 38 | if len(brands) != len(titles): # an element in brands is a new-release title, remove it 39 | for i,brand in enumerate(brands): 40 | if len(brand.split()) > 1: 41 | brands.pop(i) 42 | 43 | 44 | 45 | # XPATH does not work entirely for prices, use beautifulsoup instead 46 | # soup = BeautifulSoup(response.body, 'lxml') 47 | # prices = [float(price.get_text().strip().strip('$').replace(',','')) \ 48 | # for price in soup.find_all('span','price')] 49 | 50 | 51 | for i in range(len(brands)): 52 | item = BhPhotoDigitalCameraItem() 53 | item['brand'] = brands[i].strip() 54 | item['title'] = titles[i].strip() 55 | 56 | # NOTE: Sometimes, the price field is not there, hopefully this only occurs when 57 | # the item is at the bottom of the page, otherwise the brands,titles,ids,prices 58 | # elements will be out of sync. 
59 | # item['bh_id'] = ids[i].strip() 60 | # try: 61 | # item['retail_price'] = prices[i] 62 | # except IndexError as e: 63 | # print e 64 | # item['retail_price'] = None 65 | 66 | 67 | yield item 68 | 69 | 70 | 71 | # when done processing items, move onto next page 72 | if page_num <= num_pages: 73 | logging.debug('Scraping page {}'.format(page_num)) 74 | next_url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/{}/N/4288586282?via=js'.format(page_num) 75 | yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num+1}) 76 | else: 77 | logging.debug('Should be done scraping..') 78 | # raise CloseSpider('Done Crawling.') 79 | yield 80 | 81 | 82 | def get_num_pages(self, response): 83 | logging.debug('Made it here!') 84 | num_pages = response.xpath("//p[@class='pageNuber']/text()").extract_first().strip().split()[-1] 85 | page_num = 1 # start at page 1 86 | yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num+1}) 87 | 88 | 89 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bh_photo_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bh_photo_scraper 12 | -------------------------------------------------------------------------------- /capstone-technical-report/images/buyers_guide_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/buyers_guide_example.png -------------------------------------------------------------------------------- /capstone-technical-report/images/classification_case_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/classification_case_study.png -------------------------------------------------------------------------------- /capstone-technical-report/images/completed_items_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/completed_items_v2.png -------------------------------------------------------------------------------- /capstone-technical-report/images/csm_start_price.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/csm_start_price.png -------------------------------------------------------------------------------- /capstone-technical-report/images/cyber_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/cyber_shot.png -------------------------------------------------------------------------------- /capstone-technical-report/images/example_dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/example_dataframe.png -------------------------------------------------------------------------------- /capstone-technical-report/images/gently_used.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/gently_used.png -------------------------------------------------------------------------------- /data-analysis/utilities/clean_text.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import nltk 4 | from nltk.corpus import stopwords 5 | from nltk.stem.porter import PorterStemmer 6 | from nltk.stem.lancaster import LancasterStemmer 7 | 8 | def clean_text(doc, remove_stop_words=True, remove_digits=False, remove_punc=True, stem=False): 9 | 10 | # 1. Remove any HTML markup 11 | text = BeautifulSoup(doc, 'lxml').get_text() 12 | 13 | # 2. Extract special negator like n't 14 | text = re.sub('n\'t', ' not', text) 15 | 16 | # 3. remove punctuation(except .-) 17 | if remove_punc: 18 | text = re.sub('[^a-zA-Z.\-\d]', ' ', text) 19 | 20 | if remove_digits: 21 | text = re.sub('[.\d]', ' ', text) 22 | 23 | # 4. Convert to lower case 24 | text = text.lower() 25 | 26 | # 5. Remove stop words 27 | if remove_stop_words: 28 | stops = set(stopwords.words("english")) 29 | text = [w for w in text.split(' ') if not w in stops] 30 | text = ' '.join(text) 31 | 32 | # 6. apply Porter Stemming 33 | # probably don't need this 34 | if stem: 35 | stemmer = PorterStemmer() 36 | stemmer = LancasterStemmer() 37 | text = [stemmer.stem(w) for w in text.split(' ')] 38 | text = ' '.join(text) 39 | 40 | # 7. Remove extra white space 41 | text = re.sub(' +',' ', text) 42 | 43 | return text -------------------------------------------------------------------------------- /data-analysis/utilities/plot_learning_curve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import learning_curve 4 | from sklearn.model_selection import ShuffleSplit 5 | 6 | 7 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, 8 | n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), 9 | scoring='accuracy'): 10 | """ 11 | Generate a simple plot of the test and training learning curve. 12 | 13 | Parameters 14 | ---------- 15 | estimator : object type that implements the "fit" and "predict" methods 16 | An object of that type which is cloned for each validation. 
17 | 18 | title : string 19 | Title for the chart. 20 | 21 | X : array-like, shape (n_samples, n_features) 22 | Training vector, where n_samples is the number of samples and 23 | n_features is the number of features. 24 | 25 | y : array-like, shape (n_samples) or (n_samples, n_features), optional 26 | Target relative to X for classification or regression; 27 | None for unsupervised learning. 28 | 29 | ylim : tuple, shape (ymin, ymax), optional 30 | Defines minimum and maximum yvalues plotted. 31 | 32 | cv : int, cross-validation generator or an iterable, optional 33 | Determines the cross-validation splitting strategy. 34 | Possible inputs for cv are: 35 | - None, to use the default 3-fold cross-validation, 36 | - integer, to specify the number of folds. 37 | - An object to be used as a cross-validation generator. 38 | - An iterable yielding train/test splits. 39 | 40 | For integer/None inputs, if ``y`` is binary or multiclass, 41 | :class:`StratifiedKFold` used. If the estimator is not a classifier 42 | or if ``y`` is neither binary nor multiclass, :class:`KFold` is used. 43 | 44 | Refer :ref:`User Guide ` for the various 45 | cross-validators that can be used here. 46 | 47 | n_jobs : integer, optional 48 | Number of jobs to run in parallel (default 1). 49 | """ 50 | plt.figure() 51 | plt.title(title) 52 | if ylim is not None: 53 | plt.ylim(*ylim) 54 | plt.xlabel("Training examples") 55 | plt.ylabel("Score") 56 | 57 | train_sizes, train_scores, test_scores = learning_curve( 58 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, 59 | scoring=scoring) 60 | 61 | train_scores_mean = np.mean(train_scores, axis=1) 62 | train_scores_std = np.std(train_scores, axis=1) 63 | test_scores_mean = np.mean(test_scores, axis=1) 64 | test_scores_std = np.std(test_scores, axis=1) 65 | 66 | plt.grid() 67 | 68 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 69 | train_scores_mean + train_scores_std, alpha=0.1, 70 | color="r") 71 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 72 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 73 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 74 | label="Training score") 75 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 76 | label="Cross-validation score") 77 | 78 | plt.legend(loc="best") 79 | 80 | return plt 81 | 82 | def example_learning_curve(): 83 | from sklearn.naive_bayes import GaussianNB 84 | from sklearn.svm import SVC 85 | from sklearn.datasets import load_digits 86 | 87 | digits = load_digits() 88 | X, y = digits.data, digits.target 89 | 90 | 91 | title = "Learning Curves (Naive Bayes)" 92 | # Cross validation with 100 iterations to get smoother mean test and train 93 | # score curves, each time with 20% data randomly selected as a validation set. 
94 | cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0) 95 | 96 | estimator = GaussianNB() 97 | plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) 98 | 99 | title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" 100 | # SVC is more expensive so we do a lower number of CV iterations: 101 | cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) 102 | estimator = SVC(gamma=0.001) 103 | plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) 104 | 105 | plt.show() 106 | -------------------------------------------------------------------------------- /ebay-api-scraper/.ipynb_checkpoints/datetime test-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "data": { 14 | "text/plain": [ 15 | "'2017-03-26T00:00:00'" 16 | ] 17 | }, 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "#get datetime in datetime in ISO-8601 format\n", 25 | "import datetime \n", 26 | "\n", 27 | "'2017'\n", 28 | "date = datetime.datetime(2017,3,26)\n", 29 | "\n", 30 | "date.isoformat()\n", 31 | "# datetime.datetime.now().isoformat()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": { 38 | "collapsed": false, 39 | "deletable": true, 40 | "editable": true 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "2017-03-26T00:00:000Z\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "time_string = '2017-03-26'\n", 53 | "print datetime.datetime.strptime(time_string, \"%Y-%m-%d\").isoformat() + '0Z'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 2", 69 | "language": "python", 70 | "name": "python2" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 2 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython2", 82 | "version": "2.7.13" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /ebay-api-scraper/.ipynb_checkpoints/scrapy-development-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 2", 16 | "language": "python", 17 | "name": "python2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 2 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.13" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /ebay-api-scraper/common.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | 9 | def dump(api, full=False): 10 | 11 | print("\n") 12 | 13 | if api.warnings(): 14 | print("Warnings" + api.warnings()) 15 | 16 | if api.response.content: 17 | print("Call Success: %s in length" % len(api.response.content)) 18 | 19 | print("Response code: %s" % api.response_code()) 20 | print("Response DOM1: %s" % api.response_dom()) # deprecated 21 | print("Response ETREE: %s" % api.response.dom()) 22 | 23 | if full: 24 | print(api.response.content) 25 | print(api.response.json()) 26 | print("Response Reply: %s" % api.response.reply) 27 | else: 28 | dictstr = "%s" % api.response.dict() 29 | # print("Response dictionary: %s..." % dictstr[:150]) 30 | print("Response dictionary: %s..." % dictstr[:]) 31 | replystr = "%s" % api.response.reply 32 | # print("Response Reply: %s" % replystr[:150]) 33 | print("Response Reply: %s" % replystr[:]) 34 | -------------------------------------------------------------------------------- /ebay-api-scraper/common.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/common.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/create-ebay-tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import sys 3 | 4 | numArgs = len(sys.argv) 5 | if numArgs < 1 or numArgs > 5: 6 | print 'ERROR: Not enough arguments. Please input "host",user",dbname","tablename" as arguments.' 
7 | sys.exit() 8 | 9 | (host, user, dbname, tablename) = tuple(sys.argv[1:]) 10 | 11 | # dbname='test-db1' 12 | # tablename='tablename' 13 | # user='nathan' 14 | # host='localhost' 15 | 16 | print (host, user, dbname, tablename) 17 | 18 | try: 19 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 20 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 21 | except: 22 | print "I am unable to connect to the database" 23 | 24 | cur = conn.cursor() 25 | 26 | SQL = ''' 27 | CREATE TABLE {tablename} ( 28 | "id" SERIAL PRIMARY KEY, 29 | "timestamp" TIMESTAMP WITH TIME ZONE, 30 | "itemId" BIGINT, 31 | "topRatedListing" BOOLEAN, 32 | "globalId" TEXT, 33 | "title" TEXT, 34 | "subtitle" TEXT, 35 | "country" TEXT, 36 | 37 | "primaryCategory.categoryId" INTEGER, 38 | "primaryCategory.categoryName" TEXT, 39 | "secondaryCategory.categoryId" TEXT, 40 | "secondaryCategory.categoryName" TEXT, 41 | "pictureURLLarge" TEXT, 42 | "galleryURL" TEXT, 43 | 44 | "sellerInfo.feedbackRatingStar" TEXT, 45 | "sellerInfo.feedbackScore" INTEGER, 46 | "sellerInfo.positiveFeedbackPercent" DECIMAL, 47 | "sellerInfo.sellerUserName" TEXT, 48 | "sellerInfo.topRatedSeller" BOOLEAN, 49 | "shippingInfo.expeditedShipping" BOOLEAN, 50 | "shippingInfo.shipToLocations" TEXT, 51 | "shippingInfo.shippingServiceCost.value" DECIMAL, 52 | "shippingInfo.oneDayShippingAvailable" BOOLEAN, 53 | "shippingInfo.handlingTime" SMALLINT, 54 | "shippingInfo.shippingType" TEXT, 55 | 56 | "autoPay" BOOLEAN, 57 | "location" TEXT, 58 | "postalCode" INTEGER, 59 | "returnsAccepted" BOOLEAN, 60 | "viewItemURL" TEXT, 61 | 62 | "sellingStatus.currentPrice.value" DECIMAL, 63 | "startprice" DECIMAL, 64 | "endPrice" DECIMAL, 65 | "sellingStatus.bidCount" SMALLINT, 66 | "sellingStatus.sellingState" TEXT, 67 | "paymentMethod" TEXT, 68 | 69 | "isMultiVariationListing" BOOLEAN, 70 | 71 | "condition" TEXT, 72 | "condition.conditionId" INTEGER, 73 | "condition.conditionDisplayName" TEXT, 74 | "listingInfo.listingType" TEXT, 75 | "listingInfo.gift" BOOLEAN, 76 | "listingInfo.bestOfferEnabled" BOOLEAN, 77 | "listingInfo.buyItNowAvailable" BOOLEAN, 78 | "listingInfo.buyItNowPrice.value" DECIMAL, 79 | "listingInfo.startTime" TIMESTAMP WITH TIME ZONE, 80 | "listingInfo.endTime" TIMESTAMP WITH TIME ZONE, 81 | "conditiondescription" TEXT 82 | ) 83 | '''.format(tablename=tablename) 84 | 85 | cur.execute(SQL) 86 | 87 | conn.commit() 88 | cur.close() 89 | conn.close() 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /ebay-api-scraper/datetime test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "data": { 14 | "text/plain": [ 15 | "'2017-03-26T00:00:00'" 16 | ] 17 | }, 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "#get datetime in datetime in ISO-8601 format\n", 25 | "import datetime \n", 26 | "\n", 27 | "'2017'\n", 28 | "date = datetime.datetime(2017,3,26)\n", 29 | "\n", 30 | "date.isoformat()\n", 31 | "# datetime.datetime.now().isoformat()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": { 38 | "collapsed": false, 39 | "deletable": true, 40 | "editable": true 41 | }, 42 
| "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "2017-03-26T00:00:000Z\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "time_string = '2017-03-26'\n", 53 | "print datetime.datetime.strptime(time_string, \"%Y-%m-%d\").isoformat() + '0Z'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 2", 69 | "language": "python", 70 | "name": "python2" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 2 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython2", 82 | "version": "2.7.13" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay.yaml: -------------------------------------------------------------------------------- 1 | # eBay SDK Defaults 2 | 3 | name: ebay_api_config 4 | 5 | 6 | # Trading API Sandbox - https://www.x.com/developers/ebay/products/trading-api 7 | api.sandbox.ebay.com: 8 | compatibility: 719 9 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 10 | # token: AgAAAA**AQAAAA**aAAAAA**yO/RWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkIahDJWBqQ+dj6x9nY+seQ**jaoDAA**AAMAAA**1nvC7MyyoPnKbCiKLRFPOw4kdtLxsj04ZhZz85P15OQgKol4c3uBlaqDvzucVMe427r9T93vTBZVhZFn0vpWP08kYe374UD/VXzwQPAiDFBBbrKKE88idiqhrVSYRl9xcBLGE4wpanWHYoDs3AQ8go17il6XAfFYry+iHJIMEDKo4ikwhIRp7AVmhqt4K0+U1RtJsxjxYxWnH0GLrWuwSFWpu/XnitRibO5OC/o0p9ryf2v7suR2lSUbQ2VSqBXDMJG1I5vcfT+118lzaU3mpVugIU6dG2+k2v+t/IswpTvg9pv/XD1U5usgwWJWQStZG/6kT358OqkhPwJF56HtK31vEDnsL9SQ+p9JY/pIcoQIz1u6XaYlZfersZvwAJLgsF44jzgu+Y8WUUnvCMH9v7tNZ/2zhY/rIuBWrK5GuobpZ2kiu5SYiBBuJeWiBysvsAGWuS9a+VzImG1b+s/jQJGyiuUmjs8aCj5KzDWjSTvbBzWpsHWQaVRCrVqiZiv+mJ7hRqcpXNXUiNIzKBVX9tfEiYnC6gykTl5VMqLBzM26eQt1R5fnyWItbXS5r0+pm1otJi+YdFxm8d+b4174YMmCIBdjv9ceJL7uS/ssNnnVeqk3FptiNFlW9tf+PYt9KfysCi6N7NtRRWH4/oQ6Zpj+4RSFc0XJsTBkey1FNcm4H4zSz7cqmnx4i695nA3HaMDqASKsNIUfzTMjN2BGoTT2xFmHef9VriHDGcwqgCOPSbuWeuuTjbTUr6u0jNFj 11 | 12 | appid: NathanZo-ebaypric-PRD-4090fc79c-86f1fb32 13 | token: AgAAAA**AQAAAA**aAAAAA**nnLyWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkYGoCZWBpwydj6x9nY+seQ**HrMDAA**AAMAAA**3T8/OrOs7miNFtluqR31OFpfd4Y+apVBX9Q00gftP2wHe+gdw1G1c6+cNnWi1v7LZFsTeHfE1tAAhJT5CcHiRr8k2EMLqxB3Hwqj8P27tOnJm9otP4/WvS680f9GR6KDNyVSTTCaGXqqTxSu2Nibb8nx5q9jTeq5DoLlOS24+SG8eyq6rc6nGOuqRFP4ki/bpH3EQMACiZEOQyN5Zuvt8ubk/ogJMKnscRNGnIxI1G+nfdteEaQQO8Lv/nk4nof9fA2S+65m11dCpFpxy/RO+zM3+8a7N49FLC3/j3fH3jxbv2RgDbnJtK3YJUC9Ypa9cc6PyGY8caGhA1G3FNELE9FP4+bCTqHj+SaoOpOAT41yobrLmTan4/YmC5FfVYjG9wfPlzuZkiuL0sV/eeRvTXUcbO6ImUBYeVXpYKwNUPZ30qGC/SDFst1UeXDDVtOY5cqEZmeQOs0V7hVHmIBGpsaNLx/2ItQ8WQQoPn8X8YjEYtGPP3UX8yjqVkao/nPOuIYnWkwl1SaYufsFCVkwxhKGRg1ZZcjAezy2kk9HXym8p7dpV0J367Y+K2qUByDIv6tkihXk6KWYXLcQgtHfJ2wkjpx4ItGusVIQ5Kp2p+LflzBnQ4VWgSn/sLlmf0cP1aWhUt8qvOd+sgxc4oAuD3tRyB3zyQO6Tw8Cp3WEg9/fsdv6csxM/dHkDjk30D859uE3R2HNwP7OPrOdb70h64NFvxGUjE4ib9lch5yR9K1pWjiyeb2dANGMLPeDsPKz 14 | 15 | 16 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 17 | appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 18 | 19 | certid: PRD-bed4d450833b-99a1-4224-9e6e-1e94 20 | devid: 0cc86638-a322-4390-8da7-76b3800fa795 21 | 22 | 23 | # Trading API - https://www.x.com/developers/ebay/products/trading-api 24 | api.ebay.com: 25 | 
version: 719 26 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 27 | # token: AgAAAA**AQAAAA**aAAAAA**yO/RWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkIahDJWBqQ+dj6x9nY+seQ**jaoDAA**AAMAAA**1nvC7MyyoPnKbCiKLRFPOw4kdtLxsj04ZhZz85P15OQgKol4c3uBlaqDvzucVMe427r9T93vTBZVhZFn0vpWP08kYe374UD/VXzwQPAiDFBBbrKKE88idiqhrVSYRl9xcBLGE4wpanWHYoDs3AQ8go17il6XAfFYry+iHJIMEDKo4ikwhIRp7AVmhqt4K0+U1RtJsxjxYxWnH0GLrWuwSFWpu/XnitRibO5OC/o0p9ryf2v7suR2lSUbQ2VSqBXDMJG1I5vcfT+118lzaU3mpVugIU6dG2+k2v+t/IswpTvg9pv/XD1U5usgwWJWQStZG/6kT358OqkhPwJF56HtK31vEDnsL9SQ+p9JY/pIcoQIz1u6XaYlZfersZvwAJLgsF44jzgu+Y8WUUnvCMH9v7tNZ/2zhY/rIuBWrK5GuobpZ2kiu5SYiBBuJeWiBysvsAGWuS9a+VzImG1b+s/jQJGyiuUmjs8aCj5KzDWjSTvbBzWpsHWQaVRCrVqiZiv+mJ7hRqcpXNXUiNIzKBVX9tfEiYnC6gykTl5VMqLBzM26eQt1R5fnyWItbXS5r0+pm1otJi+YdFxm8d+b4174YMmCIBdjv9ceJL7uS/ssNnnVeqk3FptiNFlW9tf+PYt9KfysCi6N7NtRRWH4/oQ6Zpj+4RSFc0XJsTBkey1FNcm4H4zSz7cqmnx4i695nA3HaMDqASKsNIUfzTMjN2BGoTT2xFmHef9VriHDGcwqgCOPSbuWeuuTjbTUr6u0jNFj 28 | 29 | appid: NathanZo-ebaypric-PRD-4090fc79c-86f1fb32 30 | token: AgAAAA**AQAAAA**aAAAAA**nnLyWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkYGoCZWBpwydj6x9nY+seQ**HrMDAA**AAMAAA**3T8/OrOs7miNFtluqR31OFpfd4Y+apVBX9Q00gftP2wHe+gdw1G1c6+cNnWi1v7LZFsTeHfE1tAAhJT5CcHiRr8k2EMLqxB3Hwqj8P27tOnJm9otP4/WvS680f9GR6KDNyVSTTCaGXqqTxSu2Nibb8nx5q9jTeq5DoLlOS24+SG8eyq6rc6nGOuqRFP4ki/bpH3EQMACiZEOQyN5Zuvt8ubk/ogJMKnscRNGnIxI1G+nfdteEaQQO8Lv/nk4nof9fA2S+65m11dCpFpxy/RO+zM3+8a7N49FLC3/j3fH3jxbv2RgDbnJtK3YJUC9Ypa9cc6PyGY8caGhA1G3FNELE9FP4+bCTqHj+SaoOpOAT41yobrLmTan4/YmC5FfVYjG9wfPlzuZkiuL0sV/eeRvTXUcbO6ImUBYeVXpYKwNUPZ30qGC/SDFst1UeXDDVtOY5cqEZmeQOs0V7hVHmIBGpsaNLx/2ItQ8WQQoPn8X8YjEYtGPP3UX8yjqVkao/nPOuIYnWkwl1SaYufsFCVkwxhKGRg1ZZcjAezy2kk9HXym8p7dpV0J367Y+K2qUByDIv6tkihXk6KWYXLcQgtHfJ2wkjpx4ItGusVIQ5Kp2p+LflzBnQ4VWgSn/sLlmf0cP1aWhUt8qvOd+sgxc4oAuD3tRyB3zyQO6Tw8Cp3WEg9/fsdv6csxM/dHkDjk30D859uE3R2HNwP7OPrOdb70h64NFvxGUjE4ib9lch5yR9K1pWjiyeb2dANGMLPeDsPKz 31 | 32 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 33 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 34 | 35 | certid: PRD-bed4d450833b-99a1-4224-9e6e-1e94 36 | devid: 0cc86638-a322-4390-8da7-76b3800fa795 37 | 38 | 39 | # Finding API - https://www.x.com/developers/ebay/products/finding-api 40 | svcs.ebay.com: 41 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 42 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 43 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 44 | 45 | version: 1.13.0 46 | 47 | # Shopping API - https://www.x.com/developers/ebay/products/shopping-api 48 | open.api.ebay.com: 49 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 50 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 51 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 52 | version: 671 53 | 54 | # Optional affiliate tracking 55 | # http://developer.ebay.com/DevZone/shopping/docs/Concepts/ShoppingAPI_FormatOverview.html#StandardURLParameters 56 | trackingid: ENTER_YOUR_TRACKINGID_HERE 57 | trackingpartnercode: ENTER_YOUR_PARTNERCODE_HERE 58 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.py -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayScraperItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | itemId = scrapy.Field(default='NULL') 16 | conditionDescription = scrapy.Field(default='NULL') 17 | startPrice = scrapy.Field(default='NULL') 18 | endPrice = scrapy.Field(default='NULL') 19 | duration = scrapy.Field(default='NULL') 20 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/items.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import psycopg2 9 | import logging 10 | 11 | 12 | class EbayPostgresPipeline(object): 13 | 14 | def __init__(self, postgres_host,postgres_user,postgres_db,postgres_table): 15 | self.postgres_host=postgres_host 16 | self.postgres_user=postgres_user 17 | self.postgres_db=postgres_db 18 | self.postgres_table=postgres_table 19 | 20 | ''' 21 | The settings attribute is set in the base Spider class after the spider is 22 | initialized. If you want to use the settings before the initialization 23 | (e.g., in your spider’s __init__() method), you’ll need to override the 24 | from_crawler() method. 
25 | ''' 26 | @classmethod 27 | def from_crawler(cls, crawler): 28 | return cls( 29 | postgres_host=crawler.settings.get('POSTGRES_HOST'), 30 | postgres_user=crawler.settings.get('POSTGRES_USER'), 31 | postgres_db=crawler.settings.get('POSTGRES_DB'), 32 | postgres_table=crawler.settings.get('POSTGRES_TABLE'), 33 | ) 34 | 35 | 36 | def open_spider(self, spider): 37 | self.conn = psycopg2.connect("dbname={} user={} host={}".format(self.postgres_db, \ 38 | self.postgres_user, \ 39 | self.postgres_host) \ 40 | ) 41 | self.cur = self.conn.cursor() 42 | 43 | 44 | def process_item(self, item, spider): 45 | '''store data into postgres database 46 | 47 | ''' 48 | 49 | 50 | 51 | SQL = ''' 52 | UPDATE ONLY {table_name} as ci 53 | SET conditiondescription='{condition}', 54 | startprice={start_price} 55 | WHERE ci."itemId"={item_id}; 56 | '''.format( table_name=self.postgres_table, 57 | condition=item['conditionDescription'], 58 | start_price=item['startPrice'], 59 | item_id=item['itemId'] 60 | ) 61 | 62 | 63 | try: 64 | self.cur.execute(SQL) # execute SQL, and commit changes 65 | self.conn.commit() 66 | except: 67 | logging.debug('Error with executing SQL statement.\n SQL = {}'.format(SQL)) 68 | self.conn.rollback() 69 | 70 | 71 | return item 72 | 73 | 74 | def close_spider(self, spider): 75 | self.conn.close() 76 | self.cur.close() 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/settings.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | 4 | # Scrapy settings for ebay_scraper project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. 
You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | import logging 13 | 14 | BOT_NAME = 'ebay_scraper' 15 | 16 | SPIDER_MODULES = ['ebay_scraper.spiders'] 17 | NEWSPIDER_MODULE = 'ebay_scraper.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | # USER_AGENT = 'ebay_scraper (+http://www.yourdomain.com)' 22 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0" 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | CONCURRENT_REQUESTS = 2 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | DOWNLOAD_DELAY = 1 # default = 0 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | # CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'ebay_scraper.middlewares.MyCustomSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'ebay_scraper.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | ITEM_PIPELINES = { 71 | 'ebay_scraper.pipelines.EbayPostgresPipeline': 300, 72 | } 73 | # set up the pipeline settings for postgres 74 | POSTGRES_HOST = "localhost" 75 | POSTGRES_USER = "nathan" 76 | POSTGRES_DB = "ebay" 77 | POSTGRES_TABLE = "completed_items_v2" 78 | # POSTGRES_TABLE = "completed_items" 79 | # POSTGRES_TABLE = "scrapy_test" 80 | 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 84 | AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | AUTOTHROTTLE_START_DELAY = 0.5 87 | # The maximum download delay to be set in case of high latencies 88 | AUTOTHROTTLE_MAX_DELAY = 2 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server. 
High value -> High speed 91 | AUTOTHROTTLE_TARGET_CONCURRENCY = 2 92 | # Enable showing throttling stats for every response received: 93 | # AUTOTHROTTLE_DEBUG = True 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 97 | #HTTPCACHE_ENABLED = True 98 | #HTTPCACHE_EXPIRATION_SECS = 0 99 | #HTTPCACHE_DIR = 'httpcache' 100 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 101 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 102 | 103 | # LOGGING 104 | # LOG_FILE = '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_spider_log.log' 105 | # LOG_ENABLED = True 106 | # LOG_LEVEL = logging.ERROR 107 | 108 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/settings.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/ebay_spider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/ebay_spider.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | # from items import EbayScraperItem 3 | import items 4 | import psycopg2 5 | import logging 6 | from scrapy.utils.log import configure_logging 7 | from pprint import pprint 8 | 9 | class EbaySpider(scrapy.Spider): 10 | name = "ebay" 11 | 12 | def __init__(self, url_start_index=0, *args, **kwargs): 13 | super(EbaySpider, self).__init__(*args, **kwargs) # don't know what this does, but saw it in 
documentation 14 | self.url_start_index = int(url_start_index) 15 | 16 | 17 | def start_requests(self): 18 | 19 | 20 | 21 | #--- Connect to ebay database, grab itemId, URL 22 | postgres_host = self.crawler.settings.get('POSTGRES_HOST') 23 | postgres_user = self.crawler.settings.get('POSTGRES_USER') 24 | postgres_db = self.crawler.settings.get('POSTGRES_DB') 25 | postgres_table = self.crawler.settings.get('POSTGRES_TABLE') 26 | 27 | conn = psycopg2.connect("dbname={} user={} host={}".format(postgres_db, postgres_user, postgres_host)) 28 | cur = conn.cursor() 29 | 30 | # Start scraping at item in database that is furthest back in time 31 | # That way, we can always pick up scraping where we left off, and even if we put 32 | # new data into table, we don't overwrite it in a new scrape. 33 | SQL = ''' 34 | SELECT ci."itemId", ci."viewItemURL", ci."listingInfo.listingType" 35 | FROM {tablename} as ci 36 | ORDER BY ci."timestamp" ASC; 37 | '''.format(tablename=postgres_table) 38 | cur.execute(SQL) 39 | urls = [(str(url), listingType) for itemId,url,listingType in cur.fetchall()] 40 | num_urls_total = len(urls) 41 | urls = urls[self.url_start_index:] # limit scraping to only the indeces we care about. we could do this in SQL, and we should make that change later 42 | 43 | 44 | # ---- HARDCODED FOR DEV/TESTING PURPOSES ---- # 45 | # urls = [ 46 | # "http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=192140341983&rt=nc&_trksid=p2047675.l2565" 47 | # 'http://www.ebay.com/itm/Nikon-D750-24-3-MP-Digital-SLR-Camera-Black-Body-Only-Used-/222447550032', 48 | # 'http://www.ebay.com/itm/Canon-EOS-5D-Mark-II-24-105mm-Lens-and-Camera-Bag-/272592893520', 49 | # 'http://www.ebay.com/itm/DJI-Inspire-1-V1-0-4K-X3-Camera-and-3-Axis-Gimbal-Drone-Quadcopter-Extras-/302257646034', 50 | # 'http://www.ebay.com/itm/Samsung-NX-NX1-28-2-MP-Digital-Camera-Black-Kit-w-50-200mm-OIS-Lens-/222445254405', 51 | # 'http://www.ebay.com/itm/Panasonic-AJ-HDC27F-2-3-HD-DVCPRO-Varicam-Video-Camera-Camcorder-w-Viewfinder-/142319141084', 52 | # 'http://www.ebay.com/itm/High-Speed-Pin-Registered-Super-8-Cartridge-Camera-Very-Rare-Logmar-Wilcam-/252816500866', 53 | # 'http://www.ebay.com/itm/Carl-Zeiss-Planar-T-80mm-f-2-AF-Lens-Contax-645-camera-/332163276401', 54 | # 'http://www.ebay.com/itm/DJI-Mavic-Pro-Folding-Drone-4K-Stabilized-Camera-Active-Track-Avoidance-GPS-/252821264198', 55 | # 'http://www.ebay.com/itm/Nikon-D40-6-1MP-Digital-SLR-Camera-Black-Kit-w-AF-S-DX-18-55mm-Lens-/262891375158' 56 | # ] 57 | # urls = [("http://www.ebay.com/itm/Canon-EOS-7D-18-0-MP-Digital-SLR-Camera-Black-Body-Only-/192140341983",'Auction')] 58 | 59 | # THIS CAN RETURN A GENERATOR or "LIST OF REQUESTS" 60 | # https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests 61 | for i,tup in enumerate(urls): 62 | logging.debug("scraping #{} out of {} urls.".format(i+self.url_start_index, num_urls_total)) 63 | url = tup[0] 64 | listingType = tup[1] 65 | yield scrapy.Request(url=url, callback=self.parse, meta={'listingType':listingType}) # after yielding the request, scrapy will go and download the url, and then call the callback function 66 | 67 | 68 | 69 | def parse(self, response): 70 | 71 | item = items.EbayScraperItem() 72 | 73 | listingType = response.meta['listingType'] 74 | 75 | # Item condition 76 | item_condition_xpath = "//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()" 77 | item['conditionDescription'] = str(response.xpath(item_condition_xpath) \ 78 | .extract_first(default='NULL') \ 79 | 
).decode('unicode_escape') \ 80 | .encode('ascii','ignore') \ 81 | .replace("\'","") 82 | 83 | # Item ID 84 | item_id_xpath = "//div[@id='descItemNumber']/text()" 85 | item['itemId'] = int(response.xpath(item_id_xpath).extract_first()) 86 | 87 | 88 | if listingType == 'Auction' or listingType == 'AuctionWithBIN': 89 | bid_count = int(response.xpath("//a[@id='vi-VR-bid-lnk']/span[1]/text()").extract_first()) 90 | bid_history_url = response.xpath("//a[@id='vi-VR-bid-lnk']/@href").extract_first() 91 | 92 | if bid_history_url != None: 93 | 94 | if bid_count > 0: # this prevents us from making an unecessary requests if there is no startPrice (because no bids) 95 | 96 | logging.debug('bid_history_url = {}'.format(bid_history_url)) 97 | logging.debug('bid_count = {}'.format(bid_count)) 98 | 99 | return scrapy.Request(url=bid_history_url, callback=self.parse_start_price, meta={'item':item}) 100 | 101 | else: # if the item had 0 bids 102 | item['startPrice'] = float(str(response.xpath("//span[@class='notranslate vi-VR-cvipPrice']/text()").extract_first()).split('$')[1].replace(',','')) 103 | item['duration'] = 'NULL' 104 | item['endPrice'] = 'NULL' 105 | return item # don't request a new url, just send item to pipeline.py 106 | 107 | else: # 'FixedPrice' or 'StoreInventory' 108 | item['endPrice'] = float(str(response.xpath("//span[@id='prcIsum']/text()").extract_first()).split('$')[1].replace(',','')) 109 | item['startPrice'] = 'NULL' 110 | item['duration'] = 'NULL' 111 | return item 112 | 113 | 114 | 115 | def parse_start_price(self, response): 116 | 117 | item = response.meta['item'] # grab item attribute from response 118 | 119 | 120 | # item end price - I don't think we need this, because the endPrice is given in findCOmpletedItems 121 | 122 | # end_price_xpath = "//div[2]/table/tbody/tr[2]/td/table/tbody/tr[2]/td/table/tbody/tr/td[@class='BHctBidVal']/text()" 123 | # item['endPrice'] = float(str(response.xpath(end_price_xpath).extract_first()).split('$')[1].replace(',','')) 124 | item['endPrice'] = 'NULL' 125 | 126 | # Item duration 127 | duration_xpath = "//span[@class='titleValueFont'][4]/text()" 128 | item['duration'] = str(response.xpath(duration_xpath).extract_first()) \ 129 | .decode('unicode_escape') \ 130 | .encode('ascii','ignore') \ 131 | .split('\r')[0] 132 | 133 | 134 | # Item start price - ebay has (at least) 2 different types of HTML pages for the startPrice info 135 | # try grabbing first xpath 136 | start_price_xpath = "//tr[@id='viznobrd']/td[@class='contentValueFont'][1]/text()" 137 | startPrice = response.xpath(start_price_xpath).extract_first(default='NULL') 138 | 139 | logging.debug("startPrice = {}".format(startPrice)) 140 | 141 | if startPrice != 'NULL': # the first x path worked 142 | startPrice = float(startPrice.split('$')[1].replace(',','')) 143 | item['startPrice'] = startPrice 144 | return item 145 | 146 | # Try grabbing the second xpath if the first xpath didn't work 147 | start_price_xpath = "//table[@id='w2-w3-w0-w0']" 148 | 149 | for item in response.xpath(start_price_xpath).extract(): 150 | logging.debug('item in response.xpath() SECOND PATH = {}'.format(item)) 151 | 152 | startPrice = response.xpath(start_price_xpath).extract_first(default='NULL') 153 | if startPrice != 'NULL': # if the 2nd xpath worked... 
154 | logging.debug('url = {}'.format(response.url)) 155 | logging.debug('SECOND XPATH => startPrice = {}'.format(startPrice)) 156 | startPrice = startPrice.split('$')[-1] # take the last entry in the table, which is something like: 80.0023 Mar 2017 at 1:23:58PM PDT 157 | startPrice = '.'.join([startPrice.split('.')[0], startPrice.split('.')[1][:2]]) # take numbers before decimal and concatenate with 2 digits after decimal 158 | startPrice = startPrice.replace(',','') 159 | item['startPrice'] = float(startPrice) 160 | return item 161 | 162 | 163 | logging.debug('response.url = {}'.format(response.url)) 164 | logging.debug('startPrice = {}'.format(startPrice)) 165 | logging.debug('itemId = {}'.format(item['itemId'])) 166 | 167 | # if the first 2 xpaths didn't work... DEBUG 168 | item['startPrice'] = 'NULL' 169 | return item 170 | 171 | 172 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider_v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from scrapy.linkextractors import LinkExtractor 4 | # import items 5 | from items import EbayScraperItem 6 | import psycopg2 7 | import logging 8 | from scrapy.utils.log import configure_logging 9 | from pprint import pprint 10 | 11 | class EbaySpider(CrawlSpider): 12 | name = "ebay_crawl_spider" 13 | 14 | custom_settings = { 15 | 'POSTGRES_TABLE':"completed_items_v2", 16 | 'AUTOTHROTTLE_TARGET_CONCURRENCY':1, 17 | 'DONWLOAD_DELAY':0.8, 18 | 'ITEM_PIPELINES': { 19 | 'ebay_scraper.pipelines.EbayPostgresPipeline': 300, 20 | } 21 | } 22 | 23 | 24 | def __init__(self, url_start_index=0, url_end_index=0, *args, **kwargs): 25 | super(EbaySpider, self).__init__(*args, **kwargs) # don't know what this does, but saw it in documentation 26 | self.url_start_index = int(url_start_index) 27 | self.url_end_index = int(url_end_index) 28 | 29 | 30 | def start_requests(self): 31 | 32 | #--- Connect to ebay database, grab itemId, URL 33 | postgres_host = self.crawler.settings.get('POSTGRES_HOST') 34 | postgres_user = self.crawler.settings.get('POSTGRES_USER') 35 | postgres_db = self.crawler.settings.get('POSTGRES_DB') 36 | postgres_table = self.crawler.settings.get('POSTGRES_TABLE') 37 | 38 | conn = psycopg2.connect("dbname={} user={} host={}".format(postgres_db, postgres_user, postgres_host)) 39 | cur = conn.cursor() 40 | 41 | # Start scraping at item in database that is furthest back in time 42 | # That way, we can always pick up scraping where we left off, and even if we put 43 | # new data into table, we don't overwrite it in a new scrape. 
44 | SQL = ''' 45 | SELECT ci."itemId", ci."viewItemURL" 46 | FROM {tablename} as ci 47 | ORDER BY ci."timestamp" ASC; 48 | '''.format(tablename=postgres_table) 49 | cur.execute(SQL) 50 | 51 | urls = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 52 | if self.url_end_index == 0: 53 | self.url_end_index = len(urls) 54 | num_urls_total = len(urls) 55 | urls = urls[self.url_start_index:self.url_end_index] # limit scraping to only the indeces we care about. we could do this in SQL, and we should make that change later 56 | 57 | for i,(itemId,url) in enumerate(urls): 58 | logging.debug("scraping #{} out of {} urls.".format(i+self.url_start_index, num_urls_total)) 59 | yield scrapy.Request(url=url, callback=self.parse, meta={'itemId':itemId, 'dont_redirect':True}) # after yielding the request, scrapy will go and download the url, and then call the callback function 60 | 61 | # ---- HARDCODED FOR DEV/TESTING PURPOSES ---- # 62 | # urls = ["http://www.ebay.com/itm/NIKON-D-DF-16-2-MP-DIGITAL-SLR-CAMERA-SILVER-KIT-AF-S-MICRO-60MM-LENS-/291997000430", 63 | # "http://www.ebay.com/itm/Canon-T3i-Body-And-Kit-/272591253524", 64 | # "http://www.ebay.com/itm/Fujifilm-FinePix-XP-XP70-16-4MP-Waterproof-Digital-Camera-5X-Optical-Zoom-/112242371882", 65 | # "http://www.ebay.com/itm/Canon-Minolta-and-Pentax-Cameras-2-Bags-9-Lenses-and-Filters-/292024262357" 66 | # ] 67 | # for url in urls: 68 | # yield scrapy.Request(url=url, callback=self.parse) # after yielding the request, scrapy will go and download the url, and then call the callback function 69 | 70 | 71 | 72 | def parse(self, response): 73 | 74 | item = EbayScraperItem() 75 | item['itemId'] = response.meta['itemId'] 76 | 77 | # Get condition 78 | item['conditionDescription'] = response.xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()")\ 79 | .extract_first(default='NULL')\ 80 | .encode('ascii','ignore')\ 81 | .replace('\'', '') 82 | 83 | 84 | # Scrape bid history URL in order to get startPrice 85 | bid_history_url = response.xpath("//a[@id='vi-VR-bid-lnk']/@href").extract_first() 86 | if bid_history_url != None: 87 | bid_count = int(response.xpath("//a[@id='vi-VR-bid-lnk']/span[1]/text()").extract_first()) 88 | if bid_count > 0: 89 | return scrapy.Request(url=bid_history_url, callback=self.parse_start_price, meta={'item':item}) 90 | else: 91 | item['startPrice'] = 'NULL' 92 | return item 93 | else: 94 | item['startPrice'] = 'NULL' 95 | return item 96 | 97 | 98 | def parse_start_price(self, response): 99 | 100 | item = response.meta['item'] 101 | 102 | # 1st xpath attempt 103 | startPrice = response.xpath("//tr[@id='viznobrd']/td[@class='contentValueFont'][1]/text()").extract_first() 104 | if startPrice != None: 105 | item['startPrice'] = float(startPrice.split('$')[1]) 106 | return item 107 | 108 | # 2nd xpath attempt 109 | # startPrice = response.xpath("//span/span/text()").extract()[-3] 110 | bid_history_items = response.xpath("//span/text()").extract() 111 | if bid_history_items: 112 | for i,text in enumerate(bid_history_items): 113 | if text == 'Starting Price': 114 | startPrice = bid_history_items[i+1] 115 | item['startPrice'] = float(startPrice.replace('$','')) 116 | return item 117 | 118 | 119 | 120 | logging.debug('1ST AND 2ND XPATH DID NOT WORK.\n itemId = {}\n'.format(item['itemId'])) 121 | item['startPrice'] = 'NULL' 122 | 123 | return item 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ebay_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ebay_scraper 12 | -------------------------------------------------------------------------------- /ebay-api-scraper/find-completed-listing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from optparse import OptionParser 4 | import psycopg2 5 | from psycopg2.extensions import AsIs 6 | from collections import OrderedDict 7 | import datetime 8 | import pprint 9 | 10 | numArgs = len(sys.argv) 11 | if numArgs < 1 or numArgs > 7: 12 | print 'ERROR: Not enough arguments. Please input "host",user",dbname","tablename",minPrice,maxPrice as arguments.' 13 | sys.exit() 14 | 15 | (host, user, dbname, TABLE_NAME, minPrice, maxPrice) = tuple(sys.argv[1:]) 16 | 17 | pagesToQuery = int(input('Enter number of pages to query:')) 18 | entriesPerPage = int(input('Enter number of entries per page to query:')) 19 | pageStart = int(input('Enter page number to start at:')) 20 | 21 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) 22 | 23 | 24 | import ebaysdk 25 | from ebaysdk.finding import Connection as finding 26 | from ebaysdk.exception import ConnectionError 27 | 28 | def init_options(): 29 | usage = "usage: %prog [options]" 30 | parser = OptionParser(usage=usage) 31 | 32 | parser.add_option("-d", "--debug",action="store_true", dest="debug", default=False,help="Enabled debugging [default: %default]") 33 | 34 | parser.add_option("-y", "--yaml",dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 35 | help="Specifies the name of the YAML defaults file. [default: %default]") 36 | 37 | parser.add_option("-a", "--appid",dest="appid", default=None,help="Specifies the eBay application id to use.") 38 | 39 | (opts, args) = parser.parse_args() 40 | 41 | return opts, args 42 | 43 | 44 | def run(opts, pagesToQuery=1, entriesPerPage=1, pageStart=1): 45 | 46 | # --- set up query parameters ; COULD NOT GET THIS TO HAVE ANY AFFECT 47 | endTimeFrom = '2017-01-12 00:00:00' 48 | endTimeTo = '2017-04-4 00:00:00' 49 | endTimeFrom = datetime.datetime.strptime(endTimeFrom, "%Y-%m-%d %H:%M:%S").isoformat() + '.000Z' 50 | endTimeTo = datetime.datetime.strptime(endTimeTo, "%Y-%m-%d %H:%M:%S").isoformat() + '.000Z' 51 | print 'endTimeFrom:',endTimeFrom 52 | print 'endTimeTo:',endTimeTo 53 | 54 | 55 | # ------ CONNECT TO POSTGRES DATABSE ----- # 56 | # dbname='ebay' 57 | # user='nathan' 58 | # host='localhost' 59 | # TABLE_NAME = 'completed_items_15230_31388' 60 | 61 | try: 62 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 63 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 64 | except: 65 | print "ERROR: Unable to connect to the database." 
66 | sys.exit("Check database connection settings and try again.") 67 | 68 | cur = conn.cursor() 69 | 70 | # ------------ QUERY EBAY ---------------- # 71 | try: 72 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 73 | 74 | for pageNum in range(pageStart, pageStart+pagesToQuery+1): 75 | 76 | api_request = { 77 | # 'keywords': 'camera', 78 | 'categoryId' : '31388', # 31388 : Digital cameras 79 | 'itemFilter': [ 80 | {'name': 'LocatedIn', 'value': 'US'}, 81 | {'name': 'Currency', 'value':'USD'}, 82 | 83 | # {'name': 'Condition', 'value': 'Used'}, 84 | {'name': 'MinPrice', 'value': minPrice}, 85 | {'name': 'MaxPrice', 'value': maxPrice}, 86 | 87 | # {'name': 'ListingType', 'value':'Auction'}, 88 | # {'name': 'ListingType', 'value':'AuctionWithBIN'}, 89 | # {'name': 'ListingType', 'value':'FixedPrice'}, 90 | # {'name': 'SoldItemsOnly', 'value':'true'}, 91 | 92 | {'name': 'HideDuplicateItems', 'value':'true'}, 93 | 94 | # {'name': 'SellerBusinessType', 'value' : 'Private'}, 95 | 96 | {'name': 'EndTimeFrom', 'value': endTimeFrom}, 97 | {'name': 'EndTimeTo', 'value': endTimeTo} 98 | ], 99 | 'outputSelector': [ 100 | 'PictureURLLarge', 101 | 'SellerInfo', 102 | 'UnitPriceInfo' 103 | ], 104 | 'paginationInput': { 105 | 'entriesPerPage': entriesPerPage, # max = 100 106 | 'pageNumber': pageNum # execute the call with subsequent values for this field 107 | }, 108 | 'sortOrder' : 'EndTimeSoonest' 109 | } 110 | 111 | response = api.execute('findCompletedItems', api_request) 112 | 113 | dic = response.dict() 114 | 115 | # if failure, print detail s 116 | if dic['ack'] != 'Success': 117 | print 'ack: ',dic['ack'] 118 | print 'error message: ',dic['errorMessage'] 119 | 120 | if pageNum == 1: 121 | # print dic 122 | totalPages = dic['paginationOutput']['totalPages'] 123 | totalEntries = dic['paginationOutput']['totalEntries'] 124 | # _count = dic['searchResult']['_count'] 125 | print 'Total Pages = {}'.format(totalPages) 126 | print 'Total Entries = {}'.format(totalEntries) 127 | 128 | 129 | # print "dic['searchResult']['item'][0]:{}".format(dic['searchResult']['item'][0]) 130 | # pprint.pprint(dic['searchResult']['item'][0]) 131 | 132 | # ------ STORE EBAY DATA IN DICTIONARY ------ # 133 | ebay_data_dict = OrderedDict() 134 | 135 | timestamp = dic['timestamp'] # Example : '2017-03-25T01:58:10.520Z' 136 | ebay_data_dict['timestamp'] = timestamp 137 | 138 | for entryNum in range(len(dic['searchResult']['item'])-1): 139 | for key1,val1 in dic['searchResult']['item'][entryNum].iteritems(): 140 | if type(val1) is dict: 141 | for key2,val2 in val1.iteritems(): 142 | if type(val2) is dict: 143 | for key3,val3 in val2.iteritems(): 144 | # print '{}.{}.{} : {}'.format(key1,key2,key3,val3) 145 | key = '.'.join([key1,key2,key3]) 146 | val = val3 147 | ebay_data_dict[key] = val 148 | else: 149 | # print '{}.{} : {}'.format(key1,key2,val2) 150 | key = '.'.join([key1,key2]) 151 | val = val2 152 | ebay_data_dict[key] = val 153 | else: 154 | # print '{} : {}\n'.format(key1, val1) 155 | key = key1 156 | val = val1 157 | ebay_data_dict[key] = val 158 | 159 | # remove entries we don't need 160 | bad_keys = [ \ 161 | "searchResult.item.attribute", \ 162 | "searchResult.item.attribute.value",\ 163 | "searchResult.item.attribute.name", \ 164 | "searchResult.item.discountPriceInfo.originalRetailPrice_currencyId", \ 165 | "searchResult.item._distance" 166 | "searchResult.item.galleryInfoContainer.galleryURL._gallerySize",\ 167 | 
"searchResult.item.listingInfo.convertedBuyItNowPrice._currencyId", \ 168 | "sellingStatus.convertedCurrentPrice.value", \ 169 | "sellingStatus.convertedCurrentPrice._currencyId", \ 170 | "sellingStatus.currentPrice._currencyId", \ 171 | "listingInfo.buyItNowPrice._currencyId", \ 172 | "listingInfo.convertedBuyItNowPrice._currencyId", \ 173 | "shippingInfo.shippingServiceCost._currencyId", \ 174 | "listingInfo.convertedBuyItNowPrice.value", \ 175 | "galleryPlusPictureURL", \ 176 | "storeInfo.storeURL", \ 177 | "storeInfo.storeName", \ 178 | "productId._type",\ 179 | "productId.value", 180 | "charityId",\ 181 | "discountPriceInfo.soldOnEbay", \ 182 | "discountPriceInfo.pricingTreatment", \ 183 | "discountPriceInfo.originalRetailPrice._currencyId", \ 184 | "discountPriceInfo.originalRetailPrice.value", \ 185 | "discountPriceInfo.soldOffEbay", \ 186 | "discountPriceInfo.minimumAdvertisedPriceExposure",\ 187 | ] 188 | for key in bad_keys: 189 | if key in ebay_data_dict.keys(): 190 | ebay_data_dict.pop(key) 191 | 192 | # ------ ENTER EBAY DATA INTO TABLE ----- # 193 | currentEntryNum = entryNum + ((pageNum-1) * entriesPerPage) 194 | totalEntriesNum = dic['paginationOutput']['totalEntries'] 195 | print "inserting item #{} out of {} into table {} in database {}".format(currentEntryNum,totalEntriesNum, TABLE_NAME, dbname) 196 | 197 | pprint.pprint(ebay_data_dict) 198 | 199 | keys = ['"{}"'.format(key) for key in ebay_data_dict.keys()] # surround key with quotes 200 | values = ebay_data_dict.values() # extract values 201 | insert_statement = 'INSERT INTO {} (%s) values %s'.format(TABLE_NAME) 202 | query = cur.mogrify(insert_statement, (AsIs(','.join(keys)), tuple(values))) 203 | cur.execute(query) 204 | conn.commit() 205 | 206 | 207 | # ------ CLOSE CONNECTION TO DATABSE ----- # 208 | cur.close() 209 | conn.close() 210 | 211 | 212 | except ConnectionError as e: 213 | print(e) 214 | print(e.response.dict()) 215 | 216 | 217 | #-------------------------------# 218 | #------------ MAIN -------------# 219 | #-------------------------------# 220 | 221 | if __name__ == "__main__": 222 | # print 'connecting to database...' 
223 | print("Finding samples for SDK version %s" % ebaysdk.get_version()) 224 | (opts, args) = init_options() 225 | run(opts, pagesToQuery=pagesToQuery, entriesPerPage=entriesPerPage, pageStart=pageStart) 226 | -------------------------------------------------------------------------------- /ebay-api-scraper/finding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | import os 9 | import sys 10 | from optparse import OptionParser 11 | from pprint import pprint 12 | 13 | # The line below will add this file's parent directory 14 | # to the search path for python modules 15 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) # /Users/Naekid/Desktop/capstone-DSI-5/ebaysdk-python/samples 16 | 17 | from common import dump # ebay SDK support file 18 | 19 | import ebaysdk 20 | from ebaysdk.finding import Connection as finding 21 | from ebaysdk.exception import ConnectionError 22 | 23 | 24 | def init_options(): 25 | usage = "usage: %prog [options]" 26 | parser = OptionParser(usage=usage) 27 | 28 | parser.add_option("-d", "--debug", 29 | action="store_true", dest="debug", default=False, 30 | help="Enabled debugging [default: %default]") 31 | parser.add_option("-y", "--yaml", 32 | dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 33 | help="Specifies the name of the YAML defaults file. [default: %default]") 34 | parser.add_option("-a", "--appid", 35 | dest="appid", default=None, 36 | help="Specifies the eBay application id to use.") 37 | 38 | (opts, args) = parser.parse_args() 39 | return opts, args 40 | 41 | 42 | 43 | def find_completed_item(opts): 44 | 45 | try: 46 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 47 | 48 | api_request = { 49 | 'keywords': 122431840128, 50 | } 51 | 52 | response = api.execute('findCompletedItems', api_request) 53 | 54 | dic = response.dict() 55 | 56 | pprint(dic) 57 | 58 | except ConnectionError as e: 59 | print(e) 60 | print(e.response.dict()) 61 | 62 | 63 | 64 | def run(opts): 65 | 66 | try: 67 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 68 | 69 | api_request = { 70 | 'keywords': 'camera', 71 | 'CategoryId' : '31388', 72 | 'itemFilter': [ 73 | {'name': 'Condition', 'value': 'Used'}, 74 | {'name': 'LocatedIn', 'value': 'US'}, 75 | {'name': 'MinPrice', 'value': '10'} 76 | ], 77 | 'paginationInput': { 78 | 'entriesPerPage': '1', 79 | 'pageNumber': '1' 80 | }, 81 | 'sortOrder': 'PricePlusShippingLowest', 82 | } 83 | 84 | response = api.execute('findCompletedItems', api_request) 85 | 86 | dic = response.dict() 87 | 88 | # print dic.keys() # ['ack', 'timestamp', 'version', 'searchResult', 'paginationOutput'] 89 | # print dic['searchResult'] # ['item', '_count'] 90 | # print dic['searchResult']['item'][0].keys() # ['itemId', 'topRatedListing', 'globalId', 'title', 'country', 'primaryCategory', 'autoPay', 'galleryURL', 'shippingInfo', 'location', 'postalCode', 'returnsAccepted', 'viewItemURL', 'sellingStatus', 'paymentMethod', 'isMultiVariationListing', 'condition', 'listingInfo'] 91 | 92 | for item in dic['searchResult']['item']: 93 | print 'listing title:\t\t', item['title'] 94 | print 'listing sale price($):\t', item['sellingStatus']['currentPrice']['value'] 95 | 96 | 97 | # dump(api) 98 | 99 | except ConnectionError as e: 100 | print(e) 101 
| print(e.response.dict()) 102 | 103 | 104 | def run_unicode(opts): 105 | 106 | try: 107 | api = finding(debug=opts.debug, appid=opts.appid, 108 | config_file=opts.yaml, warnings=True) 109 | 110 | api_request = { 111 | 'keywords': u'Kościół', 112 | } 113 | 114 | response = api.execute('findItemsAdvanced', api_request) 115 | for i in response.reply.searchResult.item: 116 | if i.title.find(u'ś') >= 0: 117 | print("Matched: %s" % i.title) 118 | break 119 | 120 | dump(api) 121 | 122 | except ConnectionError as e: 123 | print(e) 124 | print(e.response.dict()) 125 | 126 | 127 | def run2(opts): 128 | try: 129 | api = finding(debug=opts.debug, appid=opts.appid, 130 | config_file=opts.yaml) 131 | 132 | response = api.execute('findItemsByProduct', 133 | '530390311') 134 | 135 | dump(api) 136 | 137 | except ConnectionError as e: 138 | print(e) 139 | print(e.response.dict()) 140 | 141 | 142 | def run_motors(opts): 143 | api = finding(siteid='EBAY-MOTOR', debug=opts.debug, appid=opts.appid, config_file=opts.yaml, 144 | warnings=True) 145 | 146 | api.execute('findItemsAdvanced', { 147 | 'keywords': 'tesla', 148 | }) 149 | 150 | if api.error(): 151 | raise Exception(api.error()) 152 | 153 | if api.response_content(): 154 | print("Call Success: %s in length" % len(api.response_content())) 155 | 156 | print("Response code: %s" % api.response_code()) 157 | print("Response DOM: %s" % api.response_dom()) 158 | 159 | dictstr = "%s" % api.response_dict() 160 | print("Response dictionary: %s..." % dictstr[:250]) 161 | 162 | if __name__ == "__main__": 163 | print("Finding samples for SDK version %s" % ebaysdk.get_version()) 164 | (opts, args) = init_options() 165 | find_completed_item(opts) 166 | # run(opts) 167 | # run2(opts) 168 | # run_motors(opts) 169 | # run_unicode(opts) 170 | -------------------------------------------------------------------------------- /ebay-api-scraper/parallel-requests.py: -------------------------------------------------------------------------------- 1 | # Based on http://www.eamonnbell.com/blog/2015/10/05/the-right-way-to-use-requests-in-parallel-in-python/ 2 | 3 | import sys 4 | import requests 5 | from multiprocessing import Pool 6 | import psycopg2 7 | from scrapy.selector import Selector 8 | from scrapy.http import HtmlResponse 9 | import math 10 | 11 | start_index = input('Which index are you starting at?') 12 | 13 | def request_url(data): 14 | (itemId,url) = data 15 | print 'parsing itemId:{} from url {}'.format(itemId, url) 16 | HTML = requests.get(url).text 17 | try: 18 | condition = str(Selector(text=HTML).xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()").extract()[0]) 19 | condition = condition.replace("\'","") 20 | except: 21 | condition = 'NULL' 22 | 23 | return (itemId,condition) 24 | 25 | 26 | # connect to database 27 | dbname='ebay' 28 | user='nathan' 29 | host='localhost' 30 | TABLE_NAME = 'completed_items' 31 | 32 | try: 33 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 34 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 35 | except: 36 | print "ERROR: Unable to connect to the database." 
37 | sys.exit("Check database connection settings and try again.") 38 | 39 | cur = conn.cursor() 40 | 41 | 42 | 43 | # get itemId, url from table 44 | SQL = ''' 45 | SELECT ci."itemId", ci."viewItemURL" 46 | FROM completed_items as ci; 47 | ''' 48 | cur.execute(SQL) 49 | data = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 50 | 51 | data_len = int(math.ceil(len(data)/100.)) 52 | # print data[:5] 53 | print '# of entries in completed_items:',data_len 54 | print 'start_index:',start_index 55 | # sys.exit() 56 | 57 | #------- INSERT SCRAPY HERE 58 | 59 | 60 | # break up multi-threaded processing into chunks, so we don't load entire data set into memory 61 | for i in range(data_len): 62 | print i 63 | if i < start_index: 64 | continue 65 | else: 66 | try: 67 | temp_data = data[i*100:i*100+100] 68 | except: 69 | temp_data = data[i*100:] 70 | 71 | pool = Pool(processes=3) 72 | item_condition_list = pool.map(request_url, temp_data) 73 | 74 | # for every chunk of data, update postresql table 75 | for itemId,condition in item_condition_list: 76 | SQL = ''' 77 | UPDATE ONLY completed_items as ci 78 | SET conditiondescription = '{condition}' 79 | WHERE ci."itemId" = {itemId}; 80 | '''.format(condition=condition,itemId=itemId) 81 | cur.execute(SQL) 82 | conn.commit() 83 | 84 | 85 | cur.close() 86 | conn.close() 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | #----------------------- TEST -----------------------# 110 | # import requests 111 | # from multiprocessing import Pool 112 | 113 | 114 | # url_list = [ (222447550032, 115 | # 'http://www.ebay.com/itm/Nikon-D750-24-3-MP-Digital-SLR-Camera-Black-Body-Only-Used-/222447550032'), 116 | # (272592893520, 117 | # 'http://www.ebay.com/itm/Canon-EOS-5D-Mark-II-24-105mm-Lens-and-Camera-Bag-/272592893520'), 118 | # (302257646034, 119 | # 'http://www.ebay.com/itm/DJI-Inspire-1-V1-0-4K-X3-Camera-and-3-Axis-Gimbal-Drone-Quadcopter-Extras-/302257646034'), 120 | # (222445254405, 121 | # 'http://www.ebay.com/itm/Samsung-NX-NX1-28-2-MP-Digital-Camera-Black-Kit-w-50-200mm-OIS-Lens-/222445254405'), 122 | # (142319141084, 123 | # 'http://www.ebay.com/itm/Panasonic-AJ-HDC27F-2-3-HD-DVCPRO-Varicam-Video-Camera-Camcorder-w-Viewfinder-/142319141084'), 124 | # (252816500866, 125 | # 'http://www.ebay.com/itm/High-Speed-Pin-Registered-Super-8-Cartridge-Camera-Very-Rare-Logmar-Wilcam-/252816500866'), 126 | # (332163276401, 127 | # 'http://www.ebay.com/itm/Carl-Zeiss-Planar-T-80mm-f-2-AF-Lens-Contax-645-camera-/332163276401'), 128 | # (252821264198, 129 | # 'http://www.ebay.com/itm/DJI-Mavic-Pro-Folding-Drone-4K-Stabilized-Camera-Active-Track-Avoidance-GPS-/252821264198'), 130 | # ] 131 | 132 | # url_list = [url for itemId,url in url_list] 133 | 134 | # def internet_getter(url): 135 | # s = requests.Session() 136 | 137 | # print url 138 | # s.get(url).text 139 | 140 | 141 | # pool = Pool(processes=3) 142 | # pool_outputs = pool.map(internet_getter, 143 | # url_list) 144 | 145 | # pool.close() 146 | # pool.join() 147 | 148 | # print pool_outputs -------------------------------------------------------------------------------- /ebay-api-scraper/scrapy-development.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": 
"Python 2", 16 | "language": "python", 17 | "name": "python2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 2 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.13" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /ebay-api-scraper/trading.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | import os 9 | import sys 10 | import datetime 11 | from optparse import OptionParser 12 | import psycopg2 13 | import time 14 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) 15 | 16 | from common import dump 17 | 18 | import ebaysdk 19 | from ebaysdk.utils import getNodeText 20 | from ebaysdk.exception import ConnectionError 21 | from ebaysdk.trading import Connection as Trading 22 | 23 | from pprint import pprint 24 | 25 | 26 | host = 'localhost' 27 | user = 'nathan' 28 | dbname = 'ebay' 29 | tablename = 'category_specifics' 30 | 31 | 32 | numArgs = len(sys.argv) 33 | if numArgs != 2: 34 | print 'Incorrect number of input arguments' 35 | sys.exit() 36 | 37 | offset = sys.argv[1] 38 | 39 | 40 | def init_options(): 41 | 42 | usage = "usage: %prog [options]" 43 | parser = OptionParser(usage=usage) 44 | 45 | parser.add_option("-d", "--debug", 46 | action="store_true", dest="debug", default=False, 47 | help="Enabled debugging [default: %default]") 48 | parser.add_option("-y", "--yaml", 49 | dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 50 | help="Specifies the name of the YAML defaults file. [default: %default]") 51 | parser.add_option("-a", "--appid", 52 | dest="appid", default=None, 53 | help="Specifies the eBay application id to use.") 54 | parser.add_option("-p", "--devid", 55 | dest="devid", default=None, 56 | help="Specifies the eBay developer id to use.") 57 | parser.add_option("-c", "--certid", 58 | dest="certid", default=None, 59 | help="Specifies the eBay cert id to use.") 60 | 61 | (opts, args) = parser.parse_args() 62 | return opts, args 63 | 64 | 65 | 66 | def getItem(opts): 67 | 68 | try: 69 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 70 | certid=opts.certid, devid=opts.devid) 71 | 72 | try: 73 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 74 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 75 | except: 76 | print "ERROR: Unable to connect to the database." 
77 | sys.exit("Check database connection settings and try again.") 78 | 79 | cur = conn.cursor() 80 | 81 | #----------- GRAB ALL ITEM IDs IN TABLE ----------# 82 | query = '''SELECT ci."itemId" 83 | FROM category_specifics as ci 84 | ORDER BY ci."Model" ASC 85 | OFFSET {offset};'''.format(offset=offset) # 4745 86 | 87 | cur.execute(query) 88 | item_ids = cur.fetchall() 89 | 90 | for i,itemId in enumerate(item_ids[:]): 91 | 92 | itemId = itemId[0] # itemId is a tuple 93 | print 'Updating item #{} out of {}'.format(i+1, len(item_ids)) 94 | print 'calling getItem for itemID:{}'.format(itemId) 95 | 96 | api_request = { 97 | 'itemID':itemId, 98 | 'IncludeItemSpecifics':1, 99 | } 100 | 101 | try: 102 | response = api.execute('GetItem', api_request) 103 | except ConnectionError as e: 104 | print(e) 105 | print(e.response.dict()) 106 | continue 107 | 108 | dic = response.dict() 109 | 110 | # pprint(dic) # debug 111 | 112 | allowed_columns = ['Type','Brand','MPN','Series','Model',\ 113 | 'Megapixels','Optical Zoom','Features',\ 114 | 'Color','Bundled Items','Connectivity',\ 115 | 'Battery Type','Manufacturer Warranty',\ 116 | 'Screen Size','Digital Zoom'] 117 | newDict = {} 118 | try: 119 | if not isinstance(dic['Item']['ItemSpecifics']['NameValueList'], list): # if only one item 120 | # print dic['Item']['ItemSpecifics']['NameValueList'] 121 | name = dic['Item']['ItemSpecifics']['NameValueList']['Name'] 122 | if name in allowed_columns: 123 | value = dic['Item']['ItemSpecifics']['NameValueList']['Value'] 124 | newDict[name] = value 125 | 126 | else: 127 | for nameValueDict in dic['Item']['ItemSpecifics']['NameValueList']: 128 | # print nameValueDict 129 | name = nameValueDict['Name'] 130 | if name in allowed_columns: 131 | value = nameValueDict['Value'] 132 | if isinstance(value, list): 133 | value = ','.join(value) # join the lists into a string seperated by commas 134 | 135 | try: 136 | newDict[name] = value.decode('unicode_escape').encode('ascii','ignore') 137 | except: 138 | print 'Problem with value, could not decode for some reason.' 
139 | continue 140 | 141 | except KeyError as e: # no item specifics in response 142 | print 'No ItemSpecifics field in response.', e 143 | continue 144 | 145 | 146 | 147 | keys = ['"{}"'.format(key.decode('unicode_escape').encode('ascii','ignore')) for key in newDict.keys()] 148 | insert_statement = '''UPDATE {table_name} SET (%s) = %s WHERE "itemId"={item_id}; 149 | '''.format(table_name=tablename, 150 | item_id=itemId) 151 | query = cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), tuple(newDict.values()))) 152 | # print query 153 | 154 | try: 155 | cur.execute(query) # execute SQL, and commit changes 156 | conn.commit() 157 | except: 158 | print '\nError with executing SQL statement at item #{}, itemId={}.\n'.format(i, itemId) 159 | print query 160 | conn.rollback() 161 | 162 | # time.sleep(0.04) # throttle 163 | 164 | except ConnectionError as e: 165 | print(e) 166 | print(e.response.dict()) 167 | 168 | 169 | 170 | 171 | 172 | 173 | def getCategorySpecifics(opts): 174 | 175 | 176 | try: 177 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 178 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 179 | except: 180 | print "I am unable to connect to the database" 181 | sys.exit() 182 | 183 | cur = conn.cursor() 184 | 185 | 186 | try: 187 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 188 | certid=opts.certid, devid=opts.devid) 189 | 190 | api_request = { 191 | 'CategoryID':'31388' # Digital Cameras 192 | } 193 | 194 | response = api.execute('GetCategorySpecifics', api_request) 195 | 196 | dic = response.dict() 197 | 198 | for item in dic['Recommendations']['NameRecommendation']: 199 | SQL = 'ALTER TABLE {} ADD COLUMN "{}" TEXT;'.format(tablename, item['Name']) 200 | execute = raw_input('Execute "{}"? 
(y/n):'.format(SQL)) 201 | if execute == 'y': 202 | cur.execute(SQL) 203 | conn.commit() 204 | execute = 'n' 205 | 206 | 207 | except ConnectionError as e: 208 | print(e) 209 | print(e.response.dict()) 210 | 211 | cur.close() 212 | conn.close() 213 | 214 | 215 | def getAPIAccessRules(opts): 216 | 217 | 218 | try: 219 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 220 | certid=opts.certid, devid=opts.devid) 221 | 222 | 223 | response = api.execute('GetApiAccessRules') 224 | 225 | dic = response.dict() 226 | 227 | pprint(dic) 228 | 229 | 230 | except ConnectionError as e: 231 | print(e) 232 | print(e.response.dict()) 233 | 234 | 235 | 236 | def run(opts): 237 | 238 | try: 239 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 240 | certid=opts.certid, devid=opts.devid) 241 | 242 | api.execute('GetCharities', {'CharityID': 3897}) 243 | dump(api) 244 | print(api.response.reply.Charity.Name) 245 | 246 | except ConnectionError as e: 247 | print(e) 248 | print(e.response.dict()) 249 | 250 | 251 | def feedback(opts): 252 | try: 253 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 254 | certid=opts.certid, devid=opts.devid, warnings=False) 255 | 256 | api.execute('GetFeedback', {'UserID': 'tim0th3us'}) 257 | dump(api) 258 | 259 | if int(api.response.reply.FeedbackScore) > 50: 260 | print("Doing good!") 261 | else: 262 | print("Sell more, buy more..") 263 | 264 | except ConnectionError as e: 265 | print(e) 266 | print(e.response.dict()) 267 | 268 | 269 | def getTokenStatus(opts): 270 | 271 | try: 272 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 273 | certid=opts.certid, devid=opts.devid, warnings=False) 274 | 275 | api.execute('GetTokenStatus') 276 | dump(api) 277 | 278 | except ConnectionError as e: 279 | print(e) 280 | print(e.response.dict()) 281 | 282 | 283 | def verifyAddItem(opts): 284 | """http://www.utilities-online.info/xmltojson/#.UXli2it4avc 285 | """ 286 | 287 | try: 288 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 289 | certid=opts.certid, devid=opts.devid, warnings=False) 290 | 291 | myitem = { 292 | "Item": { 293 | "Title": "Harry Potter and the Philosopher's Stone", 294 | "Description": "This is the first book in the Harry Potter series. 
In excellent condition!", 295 | "PrimaryCategory": {"CategoryID": "377"}, 296 | "StartPrice": "1.0", 297 | "CategoryMappingAllowed": "true", 298 | "Country": "US", 299 | "ConditionID": "3000", 300 | "Currency": "USD", 301 | "DispatchTimeMax": "3", 302 | "ListingDuration": "Days_7", 303 | "ListingType": "Chinese", 304 | "PaymentMethods": "PayPal", 305 | "PayPalEmailAddress": "tkeefdddder@gmail.com", 306 | "PictureDetails": {"PictureURL": "http://i1.sandbox.ebayimg.com/03/i/00/30/07/20_1.JPG?set_id=8800005007"}, 307 | "PostalCode": "95125", 308 | "Quantity": "1", 309 | "ReturnPolicy": { 310 | "ReturnsAcceptedOption": "ReturnsAccepted", 311 | "RefundOption": "MoneyBack", 312 | "ReturnsWithinOption": "Days_30", 313 | "Description": "If you are not satisfied, return the book for refund.", 314 | "ShippingCostPaidByOption": "Buyer" 315 | }, 316 | "ShippingDetails": { 317 | "ShippingType": "Flat", 318 | "ShippingServiceOptions": { 319 | "ShippingServicePriority": "1", 320 | "ShippingService": "USPSMedia", 321 | "ShippingServiceCost": "2.50" 322 | } 323 | }, 324 | "Site": "US" 325 | } 326 | } 327 | 328 | api.execute('VerifyAddItem', myitem) 329 | dump(api) 330 | 331 | except ConnectionError as e: 332 | print(e) 333 | print(e.response.dict()) 334 | 335 | 336 | def verifyAddItemErrorCodes(opts): 337 | """http://www.utilities-online.info/xmltojson/#.UXli2it4avc 338 | """ 339 | 340 | try: 341 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 342 | certid=opts.certid, devid=opts.devid, warnings=False) 343 | 344 | myitem = { 345 | "Item": { 346 | "Title": "Harry Potter and the Philosopher's Stone", 347 | "Description": "This is the first book in the Harry Potter series. In excellent condition!", 348 | "PrimaryCategory": {"CategoryID": "377aaaaaa"}, 349 | "StartPrice": "1.0", 350 | "CategoryMappingAllowed": "true", 351 | "Country": "US", 352 | "ConditionID": "3000", 353 | "Currency": "USD", 354 | "DispatchTimeMax": "3", 355 | "ListingDuration": "Days_7", 356 | "ListingType": "Chinese", 357 | "PaymentMethods": "PayPal", 358 | "PayPalEmailAddress": "tkeefdddder@gmail.com", 359 | "PictureDetails": {"PictureURL": "http://i1.sandbox.ebayimg.com/03/i/00/30/07/20_1.JPG?set_id=8800005007"}, 360 | "PostalCode": "95125", 361 | "Quantity": "1", 362 | "ReturnPolicy": { 363 | "ReturnsAcceptedOption": "ReturnsAccepted", 364 | "RefundOption": "MoneyBack", 365 | "ReturnsWithinOption": "Days_30", 366 | "Description": "If you are not satisfied, return the book for refund.", 367 | "ShippingCostPaidByOption": "Buyer" 368 | }, 369 | "ShippingDetails": { 370 | "ShippingType": "Flat", 371 | "ShippingServiceOptions": { 372 | "ShippingServicePriority": "1", 373 | "ShippingService": "USPSMedia", 374 | "ShippingServiceCost": "2.50" 375 | } 376 | }, 377 | "Site": "US" 378 | } 379 | } 380 | 381 | api.execute('VerifyAddItem', myitem) 382 | 383 | except ConnectionError as e: 384 | # traverse the DOM to look for error codes 385 | for node in api.response.dom().findall('ErrorCode'): 386 | print("error code: %s" % node.text) 387 | 388 | # check for invalid data - error code 37 389 | if 37 in api.response_codes(): 390 | print("Invalid data in request") 391 | 392 | print(e) 393 | print(e.response.dict()) 394 | 395 | 396 | def uploadPicture(opts): 397 | 398 | try: 399 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 400 | certid=opts.certid, devid=opts.devid, warnings=True) 401 | 402 | pictureData = { 403 | "WarningLevel": "High", 404 | "ExternalPictureURL": 
"http://developer.ebay.com/DevZone/XML/docs/images/hp_book_image.jpg", 405 | "PictureName": "WorldLeaders" 406 | } 407 | 408 | api.execute('UploadSiteHostedPictures', pictureData) 409 | dump(api) 410 | 411 | except ConnectionError as e: 412 | print(e) 413 | print(e.response.dict()) 414 | 415 | 416 | def uploadPictureFromFilesystem(opts, filepath): 417 | 418 | try: 419 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 420 | certid=opts.certid, devid=opts.devid, warnings=True) 421 | 422 | # pass in an open file 423 | # the Requests module will close the file 424 | files = {'file': ('EbayImage', open(filepath, 'rb'))} 425 | 426 | pictureData = { 427 | "WarningLevel": "High", 428 | "PictureName": "WorldLeaders" 429 | } 430 | 431 | api.execute('UploadSiteHostedPictures', pictureData, files=files) 432 | dump(api) 433 | 434 | except ConnectionError as e: 435 | print(e) 436 | print(e.response.dict()) 437 | 438 | 439 | def memberMessages(opts): 440 | 441 | try: 442 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 443 | certid=opts.certid, devid=opts.devid, warnings=True) 444 | 445 | now = datetime.datetime.now() 446 | 447 | memberData = { 448 | "WarningLevel": "High", 449 | "MailMessageType": "All", 450 | # "MessageStatus": "Unanswered", 451 | "StartCreationTime": now - datetime.timedelta(days=60), 452 | "EndCreationTime": now, 453 | "Pagination": { 454 | "EntriesPerPage": "5", 455 | "PageNumber": "1" 456 | } 457 | } 458 | 459 | api.execute('GetMemberMessages', memberData) 460 | 461 | dump(api) 462 | 463 | if api.response.reply.has_key('MemberMessage'): 464 | messages = api.response.reply.MemberMessage.MemberMessageExchange 465 | 466 | if type(messages) != list: 467 | messages = [messages] 468 | 469 | for m in messages: 470 | print("%s: %s" % (m.CreationDate, m.Question.Subject[:50])) 471 | 472 | except ConnectionError as e: 473 | print(e) 474 | print(e.response.dict()) 475 | 476 | 477 | def getUser(opts): 478 | try: 479 | 480 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 481 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20, siteid='101') 482 | 483 | api.execute('GetUser', {'UserID': 'sallyma789'}) 484 | dump(api, full=False) 485 | 486 | except ConnectionError as e: 487 | print(e) 488 | print(e.response.dict()) 489 | 490 | 491 | def getOrders(opts): 492 | 493 | try: 494 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 495 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20) 496 | 497 | api.execute('GetOrders', {'NumberOfDays': 30}) 498 | dump(api, full=False) 499 | 500 | except ConnectionError as e: 501 | print(e) 502 | print(e.response.dict()) 503 | 504 | 505 | def categories(opts): 506 | 507 | try: 508 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 509 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20, siteid='101') 510 | 511 | callData = { 512 | 'DetailLevel': 'ReturnAll', 513 | 'CategorySiteID': 101, 514 | 'LevelLimit': 4, 515 | } 516 | 517 | api.execute('GetCategories', callData) 518 | dump(api, full=False) 519 | 520 | except ConnectionError as e: 521 | print(e) 522 | print(e.response.dict()) 523 | 524 | ''' 525 | api = trading(domain='api.sandbox.ebay.com') 526 | api.execute('GetCategories', { 527 | 'DetailLevel': 'ReturnAll', 528 | 'CategorySiteID': 101, 529 | 'LevelLimit': 4, 530 | }) 531 | ''' 532 | 533 | if __name__ == "__main__": 534 | (opts, args) = init_options() 535 | 536 | print("Trading API Samples for 
version %s" % ebaysdk.get_version()) 537 | 538 | """ 539 | run(opts) 540 | feedback(opts) 541 | verifyAddItem(opts) 542 | getTokenStatus(opts) 543 | verifyAddItemErrorCodes(opts) 544 | uploadPicture(opts) 545 | uploadPictureFromFilesystem(opts, ("%s/test_image.jpg" % os.path.dirname(__file__))) 546 | memberMessages(opts) 547 | categories(opts) 548 | """ 549 | # getUser(opts) 550 | # getOrders(opts) 551 | 552 | 553 | getItem(opts) 554 | # getCategorySpecifics(opts) 555 | # getAPIAccessRules(opts) 556 | -------------------------------------------------------------------------------- /ebay-api-scraper/update-ebay-table.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from scrapy.selector import Selector 3 | from scrapy.http import HtmlResponse 4 | import requests 5 | 6 | 7 | start_index = input('Which index are you starting at?') 8 | 9 | dbname='ebay' 10 | user='nathan' 11 | host='localhost' 12 | TABLE_NAME = 'completed_items' 13 | 14 | try: 15 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 16 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 17 | except: 18 | print "ERROR: Unable to connect to the database." 19 | sys.exit("Check database connection settings and try again.") 20 | 21 | cur = conn.cursor() 22 | 23 | SQL = ''' 24 | SELECT ci."itemId", ci."viewItemURL" 25 | FROM completed_items as ci; 26 | ''' 27 | cur.execute(SQL) 28 | data = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 29 | 30 | 31 | SQL = ''' 32 | SELECT count(*) 33 | FROM completed_items; 34 | ''' 35 | cur.execute(SQL) 36 | total_rows = int(cur.fetchall()[0][0]) 37 | 38 | 39 | for i,(itemId,url) in enumerate(data): 40 | if i < start_index: 41 | continue 42 | 43 | if (i+1) % 50 == 0: 44 | print "updating row #{} out of {}".format(i+1, total_rows) 45 | HTML = requests.get(url).text 46 | try: 47 | condition = str(Selector(text=HTML).xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()").extract()[0]) 48 | condition = condition.replace("\'","") 49 | except: 50 | condition = 'NULL' 51 | print i,condition 52 | SQL = ''' 53 | UPDATE ONLY completed_items as ci 54 | SET conditiondescription = '{condition}' 55 | WHERE ci."itemId" = {itemId}; 56 | '''.format(condition=condition,itemId=itemId) 57 | cur.execute(SQL) 58 | conn.commit() 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | # Ebay Listing Optimization With Machine Learning 2 | -------------------------------------------------------------------------------- /mongo-test/mongo-test.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import datetime 3 | 4 | client = MongoClient('localhost', 27017) 5 | 6 | # get a database 7 | db = client['posts-database'] 8 | 9 | # get a collection called posts 10 | posts = db['posts'] 11 | 12 | # sample data 13 | post = { 14 | "author": "Mike", 15 | "text": "My first blog post!", 16 | "tags": ["mongodb", "python", "pymongo"], 17 | "date": datetime.datetime.utcnow() 18 | } 19 | 20 | # insert a document into a collection 21 | # when a document is inserted, a special key, "_id" 22 | # is automatically added if the document doesn't contain the "_id" key. "_id" must be unique. 
23 | post_id = posts.insert_one(post).inserted_id 24 | print post_id 25 | 26 | # After inserting the first document, the posts collection has actually been created on the server. 27 | # verify by listing all the collections in the database 28 | print db.collection_names(include_system_collections=False) 29 | 30 | -------------------------------------------------------------------------------- /postgresql-test/postgres cheat sheet: -------------------------------------------------------------------------------- 1 | #------------- TERMINAL COMMANDS -------------# 2 | 3 | # Enter a postgres database 4 | $ psql [dbname] 5 | 6 | # Enter a database called ebay with username nathan 7 | $ psql -U nathan ebay 8 | 9 | # Enter postgres database with username postgres 10 | $ psql -U postgres postgres 11 | 12 | 13 | #------------------ POSTGRES -----------------# 14 | #--- for running within the postgres shell prompt ----# 15 | 16 | #--- List users (roles) 17 | postgres=# \du 18 | 19 | #--- Check current user 20 | postgres=# select current_user; 21 | 22 | #--- Check current database 23 | postgres=# select current_database(); 24 | 25 | #--- Print a list of databases 26 | postgres=# \l 27 | 28 | #--- Create a database 29 | postgres=# CREATE DATABASE dbname; 30 | 31 | #--- Print a list of tables 32 | postgres=# \d 33 | 34 | #--- Print a description of a table 35 | postgres=# \d tablename 36 | 37 | #--- Quit 38 | postgres=# \q 39 | 40 | 41 | #------------------ POSTGRES -----------------# 42 | #----------------- SQL COMMANDS --------------# 43 | 44 | #------- DATABASE MANAGEMENT -------# 45 | 46 | #--- Delete, Drop a database 47 | DROP DATABASE [IF EXISTS] name; 48 | 49 | #--- Create a user with permission to create databases 50 | CREATE USER nathan CREATEDB; 51 | 52 | #--------- TABLE MANAGEMENT --------# 53 | 54 | #--- Create a table 55 | CREATE TABLE tablename (col1 type1, col2 type2); 56 | 57 | #--- Copy a table 58 | SELECT * INTO newTable FROM oldTable; 59 | 60 | #--- Drop a table 61 | DROP TABLE tablename; 62 | 63 | #--- Delete all rows in a table 64 | TRUNCATE tablename; 65 | #--- Delete some rows in a table 66 | DELETE FROM 67 | completed_items_15230_31388 as ci 68 | WHERE 69 | ci."primaryCategory.categoryId"!=31388 70 | AND 71 | ci."primaryCategory.categoryId"!=15230; 72 | 73 | #--- View column names in a table 74 | SELECT column_name from information_schema.columns where table_name ='your_table'; 75 | 76 | #--- Add an auto-incrementing id column to a table 77 | ALTER TABLE completed_items ADD COLUMN id SERIAL; 78 | 79 | #--- Drop a column 80 | ALTER TABLE tablename DROP COLUMN colname; 81 | 82 | #--- Rename a column 83 | ALTER TABLE distributors RENAME COLUMN address TO city; 84 | 85 | #--- Insert data into a table 86 | INSERT INTO tablename (col1, col2) VALUES (val1, val2); 87 | INSERT INTO tablename 88 | SELECT * FROM table2name; 89 | 90 | #--- Delete rows with duplicate values in certain columns 91 | # A frequent question is how to delete rows that are duplicates over a set of columns, keeping only the one with the lowest ID. 92 | # This query does that for all rows of tablename having the same column1, column2, and column3.
93 | DELETE FROM tablename 94 | WHERE id IN (SELECT id 95 | FROM (SELECT id, 96 | ROW_NUMBER() OVER (PARTITION BY column1, column2, column3 ORDER BY id) AS rnum 97 | FROM tablename) t 98 | WHERE t.rnum > 1); 99 | 100 | #--- Add serial primary key to a table 101 | ALTER TABLE tableName ADD COLUMN id SERIAL PRIMARY KEY; 102 | 103 | #------- SELECTION ------# 104 | 105 | #--- Extract the day from a timestamp column as an integer (swap DAY for HOUR, etc. as needed) 106 | SELECT CAST(EXTRACT(DAY FROM colname) AS INT) FROM tablename; 107 | 108 | #--- Select a range of rows 109 | # Skip the first 5 rows and return the next 10 (rows 6 - 15) 110 | SELECT * from tablename LIMIT 10 OFFSET 5; 111 | 112 | #------------------- SETTINGS ---------------# 113 | #--- toggle between wide/narrow output (prints one column per line in the terminal) 114 | postgres=# \x on 115 | 116 | -------------------------------------------------------------------------------- /postgresql-test/postgresql-test.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import sys 3 | dbname='test-db1' 4 | user='nathan' 5 | host='localhost' 6 | 7 | try: 8 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 9 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 10 | except psycopg2.OperationalError: 11 | print "I am unable to connect to the database" 12 | sys.exit("Check database connection settings and try again.") 13 | cur = conn.cursor() 14 | 15 | cur.execute("SELECT version();") # simple query to confirm the connection works 16 | 17 | rows = cur.fetchall() 18 | 19 | if rows: 20 | for row in rows: 21 | print row[0] 22 | 23 | 24 | --------------------------------------------------------------------------------