├── .gitignore ├── Ebay Capstone Progress Journal.rtf ├── README.md ├── _config.yml ├── bh_photo_scraper ├── bh_photo_scraper │ ├── __init__.py │ ├── __init__.pyc │ ├── items.py │ ├── items.pyc │ ├── middlewares.py │ ├── pipelines.py │ ├── pipelines.pyc │ ├── settings.py │ ├── settings.pyc │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── digital_camera_spider.py │ │ └── digital_camera_spider.pyc └── scrapy.cfg ├── capstone-technical-report └── images │ ├── buyers_guide_example.png │ ├── classification_case_study.png │ ├── completed_items_v2.png │ ├── csm_start_price.png │ ├── cyber_shot.png │ ├── example_dataframe.png │ └── gently_used.png ├── data-analysis ├── CSM-start-price-exploration.ipynb ├── auctions-modeling-2.ipynb ├── auctions-modeling-classification.ipynb ├── auctions-modeling-regression.ipynb ├── auctions-modeling.ipynb ├── data-cleaning │ ├── extract-features-bh-photo.ipynb │ └── extract-features-from-ebay-category-specifics.ipynb ├── ebay-exploration.ipynb ├── feature-engineer-time-of-day.ipynb ├── feature-engineering-concurrent-similar-median-start-price │ └── feature-engineer-concurrent-median-start-price.ipynb ├── find-model-prices.ipynb └── utilities │ ├── clean_text.py │ └── plot_learning_curve.py ├── ebay-api-scraper ├── .ipynb_checkpoints │ ├── datetime test-checkpoint.ipynb │ ├── ebay-exploration-checkpoint.ipynb │ ├── ebay-scraper-checkpoint.ipynb │ └── scrapy-development-checkpoint.ipynb ├── common.py ├── common.pyc ├── create-ebay-tables.py ├── datetime test.ipynb ├── ebay-scraper.ipynb ├── ebay.yaml ├── ebay_scraper │ ├── ebay_scraper │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── settings.cpython-35.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ └── ebay_spider.cpython-35.pyc │ │ │ ├── ebay_spider.py │ │ │ ├── ebay_spider.pyc │ │ │ └── ebay_spider_v2.py │ ├── ebay_spider_log.log │ └── scrapy.cfg ├── find-completed-listing.py ├── finding.py ├── parallel-requests.py ├── scrapy-development.ipynb ├── trading.py └── update-ebay-table.py ├── index.md ├── mongo-test └── mongo-test.py └── postgresql-test ├── postgres cheat sheet └── postgresql-test.py /.gitignore: -------------------------------------------------------------------------------- 1 | ebay-api-scraper/ebay_scraper/ebay_scraper/ebay_spider_log.log 2 | 3 | data-analysis/pickles 4 | data-analysis/feature-engineering-concurrent-similar-median-start-price/pickles 5 | data-analysis/pickles/auctions.p 6 | data-analysis/.ipynb_checkpoints/CSM-start-price-exploration-checkpoint.ipynb 7 | data-analysis/.ipynb_checkpoints/auctions-modeling-checkpoint.ipynb 8 | data-analysis/.ipynb_checkpoints/feature-engineer-concurrent-median-start-price-checkpoint.ipynb 9 | data-analysis/.ipynb_checkpoints/ipython-parallel-tutorial-checkpoint.ipynb 10 | data-analysis/feature-engineering-concurrent-similar-median-start-price/.ipynb_checkpoints/ 11 | data-analysis/feature-engineering-concurrent-similar-median-start-price/pickles/ 12 | data-analysis/pickles/df_classification_count_vec.p 13 | data-analysis/pickles/df_classification_tfidf.p 14 | data-analysis/pickles/df_regression_tfidf.p 15 | data-analysis/pickles/df_regression_tfidf_all.p 16 | data-analysis/utilities/clean_text.pyc 17 | ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/items.cpython-35.pyc 18 | 
ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider_v2.pyc 19 | -------------------------------------------------------------------------------- /Ebay Capstone Progress Journal.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1504\cocoasubrtf810 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fmodern\fcharset0 Courier;\f2\froman\fcharset0 Palatino-Roman; 3 | \f3\fmodern\fcharset0 Courier-Bold;\f4\fswiss\fcharset0 ArialMT;\f5\fnil\fcharset0 HelveticaNeue; 4 | } 5 | {\colortbl;\red255\green255\blue255;\red0\green0\blue0;\red85\green142\blue40;\red0\green0\blue0; 6 | \red255\green83\blue8;\red133\green0\blue175;\red174\green0\blue240;\red255\green255\blue255;\red255\green39\blue18; 7 | \red63\green105\blue30;\red255\green255\blue51;\red179\green179\blue179;\red128\green128\blue128;\red255\green250\blue131; 8 | \red38\green38\blue38;\red255\green255\blue255;\red194\green229\blue166;\red192\green237\blue254;\red255\green252\blue171; 9 | \red255\green164\blue159;\red254\green187\blue100;\red194\green229\blue166;\red0\green0\blue0;\red255\green255\blue255; 10 | } 11 | {\*\expandedcolortbl;;\cssrgb\c0\c0\c0;\csgenericrgb\c33333\c55686\c15686;\csgenericrgb\c0\c0\c0; 12 | \csgenericrgb\c100000\c32549\c3137;\csgenericrgb\c52157\c0\c68627;\csgenericrgb\c68235\c0\c94118;\csgenericrgb\c100000\c100000\c100000;\csgenericrgb\c100000\c15294\c7059; 13 | \csgenericrgb\c24706\c41176\c11765;\csgenericrgb\c100000\c100000\c20000;\csgray\c75407;\csgray\c57415;\csgenericrgb\c100000\c98039\c51373; 14 | \cssrgb\c20000\c20000\c20000;\cssrgb\c100000\c100000\c100000;\csgenericrgb\c76078\c89804\c65098;\csgenericrgb\c75294\c92941\c99608;\csgenericrgb\c100000\c98824\c67059; 15 | \csgenericrgb\c100000\c64314\c62353;\csgenericrgb\c99608\c73333\c39216;\csgenericrgb\c76078\c89804\c65098;\cssrgb\c0\c0\c0;\cssrgb\c100000\c100000\c100000; 16 | } 17 | \margl1440\margr1440\vieww19000\viewh9060\viewkind0 18 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 19 | 20 | \f0\fs24 \cf0 eBay Capstone Work Journal:\ 21 | I chose cameras because:\ 22 | \pard\pardeftab720\partightenfactor0 23 | \cf2 \expnd0\expndtw0\kerning0 24 | a) they can be evaluated by using well-structured data that lends itself well to machine learning techniques\ 25 | b) they are representable as a set of easily quantified parameters\ 26 | c) they represent a large market of used items\ 27 | d) their prices are predictable and relatively stable for short-term horizons. \ 28 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 29 | \cf0 \kerning1\expnd0\expndtw0 \ 30 | \ 31 | 3/21 - 3/27:\ 32 | - Experimented with MongoDB, didn\'92t like fact that data was stored in JSON, and the fact that some rows can have different columns.\ 33 | - Chose postgresql due to familiarity with SQL, flexible types, \ 34 | - setup postresql ebay database with completed_items table under user: nathan\ 35 | - Set up ebay API to get completed items and store data into postgres database. \ 36 | - set up multi-processed web scraper with scrapy and multiprocessing.Pool to update \'93condition\'94 fields in ebay table from scraped item condition data. Sometimes the item doesn\'92t have a condition, in which case, an empty string is put into the field. 
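A minimal sketch of that condition-update step, assuming psycopg2 plus requests/lxml rather than the actual Scrapy project; the table name, column names, and XPath here are illustrative only:

from multiprocessing import Pool

import psycopg2
import requests
from lxml import html

def scrape_condition(row):
    # fetch one listing page and pull the condition text; fall back to '' when missing
    item_id, url = row
    try:
        tree = html.fromstring(requests.get(url, timeout=10).content)
        condition = tree.xpath('//div[@itemprop="itemCondition"]//text()')
        return item_id, condition[0].strip() if condition else ''
    except requests.RequestException:
        return item_id, ''

if __name__ == '__main__':
    conn = psycopg2.connect(dbname='ebay', user='nathan')
    cur = conn.cursor()
    cur.execute('SELECT "itemId", "viewItemURL" FROM completed_items;')
    rows = cur.fetchall()

    pool = Pool(8)  # workers only do HTTP; the main process writes to Postgres
    for item_id, condition in pool.imap_unordered(scrape_condition, rows):
        cur.execute('UPDATE completed_items SET condition = %s WHERE "itemId" = %s;',
                    (condition, item_id))
    pool.close()
    pool.join()
    conn.commit()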
\ 37 | \ 38 | 3/28:\ 39 | Goal:\ 40 | in \ 41 | /Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_scraper \ 42 | rub scrapy\ 43 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 44 | 45 | \f1 \cf3 $ scrapy crawl ebay_crawl_spider -a url_start_index=92224\ 46 | \ 47 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 48 | \cf4 enter postgres database \cf3 \ 49 | $ psql - U nathan ebay 50 | \f0 \ 51 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 52 | \cf0 \ 53 | - Setup up Scrapy spider. \ 54 | - Following tutorial, when I ran scrapy crawl ebay, I got TypeError: \'91float\'92 object is not itterable. \ 55 | - Solved problem by updating scrapy using conda. Only problem is that I also updated scrapy using conda system-wide, which scrapy documentation told me not to do (I updated before reading that). \ 56 | - Creating pipeline.\ 57 | - created scrapy->postgres pipeline class for storing condition data into the ebay table \ 58 | - added pipeline to settings.py\ 59 | - Need to read itemId,URL from ebay table (x)\ 60 | - Need to set options for throttling (x)\ 61 | - Set postgres config in settings.py (x)\ 62 | - Test spider with hardcoded ebay urls\ 63 | - Worked but it\'92s slow (x)\ 64 | - Speed up scrapy crawl ebay (X)\ 65 | - Create log of errors from scraps (X)\ 66 | \ 67 | 3/29:\ 68 | - Pull postgres data into pandas data frame using sqlalchemy (x)\ 69 | - Deleted duplicates in completed_items table (x)\ 70 | 71 | \f1 \cf3 DELETE FROM tablename\ 72 | WHERE id IN \ 73 | (SELECT id FROM (SELECT id, ROW_NUMBER() OVER (partition BY column1, column2, column3 ORDER BY id) AS rnum FROM tablename) \ 74 | as t WHERE t.rnum > 1);\ 75 | \ 76 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 77 | \cf4 or specifically for deleting duplicate itemId :\cf3 \ 78 | \ 79 | DELETE FROM completed_items_v2 WHERE id IN (SELECT id FROM (SELECT ci."id", ROW_NUMBER() OVER (partition BY ci."itemId" ORDER BY ci."id") AS rnum FROM completed_items_v2 as ci) as t WHERE t.rnum > 1);\cf5 \ 80 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 81 | 82 | \f0 \cf0 \ 83 | \ 84 | - Light cleaning (x)\ 85 | - Light EDA (x)\ 86 | - Discovered that I had a typo in my find-completed-listings.py, which was causing the \'93categoryId\'94 search in my filter to not work.\ 87 | Therefore I was getting completed items from every ebay category (only filter I was using was the keyword \'91camera\'92). \ 88 | - apiy_request dictionary had \'91CategoryId\'94 which should have been \'93categoryId\'94. \ 89 | - Not sure what to do now. I can change the categoryId to the right category and then start my api requests from the max But then I will get a lot of duplicate items. \ 90 | - I think the best strategy is to just copy my old postgres table into a new table so I don\'92t lose my searches thus far. \ 91 | - Then filter that table by the categoryId in SQL. \ 92 | - Then begin filling that table with new data starting at a low price, using no keywords. 
\ 93 | \ 94 | - Vectorize title with CountVectorizer (x) \ 95 | - Use NLP features + model to predict sold status (x)\ 96 | - Baseline accuracy = 0.897\ 97 | - First pass model (random forest) accuracy = 0.913\ 98 | \ 99 | 3/30:\ 100 | - Create new table\ 101 | - Copy completed_items (x)\ 102 | - Filter: (x)\ 103 | - categoryId = 15230 # film cameras\ 104 | - categoryId = 31388 # digital cameras \ 105 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 106 | 107 | \f1 \cf0 \cf3 DELETE FROM completed_items_15230_31388 as ci WHERE ci."primaryCategory.categoryId"!=31388 AND ci."primaryCategory.categoryId"!=15230; 108 | \f0 \cf0 \ 109 | - OK that didn\'92t work. When we copy completed_items, a serial (id_seq table) was not created. Therefore, in our completed_items_15230_31388 table, the id column was not a serial, and was not being automatically generated when we inserted values. \ 110 | - I copied old table to new table. Then I created a new serial primary key column. Then I deleted the old id column. Then I deleted duplicate rows (rows with duplicate itemId values). \ 111 | \ 112 | \ 113 | - using categoryId = 15230, 31388 and no keywords, query starting at minPrice = 20, up until 2500 (x)\ 114 | - For some reason, I was able to query as much data using the ebay API with only one dev API key. Not sure why. \ 115 | \ 116 | \ 117 | Goals for 3/31 and weekend:\ 118 | - Use scrapy to do recursive scrap and get starting bid price.\ 119 | - Continue NLP exploration and modeling with different features and techniques.\ 120 | \ 121 | 3/31:\ 122 | - What happens to values returned or yielded from parse() and parse_start_prce() in spider?\ 123 | - Where to instantiate Item()? \ 124 | - How to update item class instance in spider within different callback functions? \ 125 | \ 126 | 4/1\ 127 | - Got nested scrape working, but now ebay is requesting captcha every time I want to scrape. \ 128 | - Using cactusVPN, I was able to start scraping again. \ 129 | - VPN disconnects if I send too many requests too fast. Using these settings, it seems to work:\ 130 | 131 | \f1 \cf6 AUTOTHROTTLE_START_DELAY = 0.5\ 132 | AUTOTHROTTLE_MAX_DELAY = 2\ 133 | AUTOTHROTTLE_TARGET_CONCURRENCY = 2\ 134 | CONCURRENT_REQUESTS = 2\ 135 | DOWNLOAD_DELAY = 0.9 136 | \f0 \cf0 \ 137 | - So I\'92m collecting itemId, conditiondescription, duration, and startPrice now. \ 138 | \ 139 | \ 140 | I was thinking about what factors we\'92re going to consider in the model. I think that, along with completed listings, a big factor would be the other listings that are active at the time the model does it\'92s calculation for a specific item. \ 141 | \ 142 | The reason we want to see which items were listed concurrently is because (I would think) you would want to weight the factors from the concurrent listings more than the previously completed listings. Like if you are selling a Nikon ec380, and there is already a Nikon ec380 listed with a startPrice of $300, and all factors being equal, you probably don\'92t want to list it for much more than $300. Even if historically those cameras sell with startPrice of $400, you probably don\'92t want to list it for $400. Therefore I want to weight the concurrent listings factors more than the completed listings. \ 143 | \ 144 | Is it possible to do something like this? Is it even smart to do something like this? 
\ 145 | \ 146 | Also, I don\'92t see anyone doing this in the white papers that have been released previously for ebay listings end price predictions. Could be a novel method. \ 147 | \ 148 | \ 149 | (You could figure out which items are concurrent by looking at the startTime and endTime columns. You could then create a new column called concurrentGroupNumber, that would be generated by incrementing through the first startTime to the last startTime in equally spaced chunks. In each chunk, you look at all the listings that are active in that chunk, and for each of those listings, assign the same groupNumber to them. That allows you to see concurrent listings.)\ 150 | \ 151 | Goals for 4/1 and 4/2:\ 152 | - Scrape more data. Current number of rows in 153 | \f1 \cf7 completed_items_15230_31388 is 46225. 154 | \f0 \cf0 \ 155 | - Collect completed_items auctions for starting at 3/30 at 11:10:00, until 4/2 12:00:00 (x)\ 156 | - collect buy it now auctions of prices from 20 to 2500 and store in table completed_items_15230_31388 (x) \ 157 | - 158 | \f1 \cf7 \{'name': 'ListingType', 'value':'AuctionWithBIN'\}, 159 | \f0 \cf0 \ 160 | - Collect fixedPrice auctions of prices from 20 to 2500 and store in table completed_items_15230_31388 \ 161 | - Don\'92t know if \'91FixedPrice\'92 listings were sold or if they just ended. \ 162 | - Okay this is a big problem. It seems like \'91AuctionWithBIN\'92 is like a deprecated feature because if you go on ebay.com now, you can only find auctions OR BuyItNow listings, but there don\'92t seem to be listings that are AuctionWithBIN. \ 163 | - So I might have to just scrape data for the Buy It Now listings. \ 164 | - Although the only problem I\'92m seeing here is that you can\'92t find Buy It Now listings from stores\'85 which is a problem because stores make up a lot of the business.\ 165 | \ 166 | - I experimented with changing the code of connections in:\ 167 | 168 | \f1 /Users/Naekid/anaconda3/envs/dsi/lib/python2.7/site-packages/ebaysdk/connection.py 169 | \f0 \ 170 | and then running the samples in:\ 171 | 172 | \f1 /Users/Naekid/anaconda3/envs/dsi/lib/python2.7/site-packages/ebaysdk-2.1.4-py2.7.egg/samples\ 173 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 174 | 175 | \f0 \cf0 For some reason, the samples in .egg use the code in the other ebaysdk/ directory. Who knows. \ 176 | \ 177 | I created a GitHub ticket on the python SDK page. I also created a ebay developers forum post. \ 178 | Here\'92s a sample URI HTTP GET with the correct sellingState:\ 179 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 180 | {\field{\*\fldinst{HYPERLINK "http://svcs.ebay.com/services/search/FindingService/v1?OPERATION-NAME=findCompletedItems&SERVICE-VERSION=1.7.0&SECURITY-APPNAME=nathanzo-ebaypric-PRD-cbed4d450-05d217d8&RESPONSE-DATA-FORMAT=XML&keywords=222461424089"}}{\fldrslt \cf0 http://svcs.ebay.com/services/search/FindingService/v1?OPERATION-NAME=findCompletedItems&SERVICE-VERSION=1.7.0&SECURITY-APPNAME=nathanzo-ebaypric-PRD-cbed4d450-05d217d8&RESPONSE-DATA-FORMAT=XML&keywords=222461424089}}\ 181 | s\ 182 | I\'92ve decided that the best course of action is to move on to sending my own HTTP GET requests to ebay using the structure outlined in their API documentation. \ 183 | \ 184 | 4/3:\ 185 | - Discovered bug that was causing incorrect sellingState to be returned. 
The problem was that the the 186 | \f1 X-EBAY-SOA-SERVICE-VERSION 187 | \f0 Header field in the ebaySDK was set to 1.0.0. When I hardcoded it to 1.13.0, that fixed the problem. Also, you should be able to change the version in the ebay.yaml file, but that didn\'92t work for me.\ 188 | \ 189 | - Wrote my own wrapper for Ebay API to send findCompletedItems requests to specific itemId, retreive sellingState, and update SQL table. \ 190 | \ 191 | - scrape ebay for condition information of FixedPrice items. \ 192 | \ 193 | - my-ebay-api-port/development.ipynb is for using HTTP GET ebay API to update sellingState for items that are already in my ebay database. \ 194 | - scrapy is used to get condition description for all items, and startPrice for items that had > 0 bid_count.\ 195 | - scrapy is not getting startPrice successfully. \ 196 | \ 197 | \ 198 | 4/4:\ 199 | - I\'92m thinking of starting a new database from scratch since the old one is super messy. \ 200 | - First objective is to get the startPrice.\ 201 | - Once I have the startPrice, then I can use find-completed-items on \'91Auction\'92,\'92AuctionWithBIN\'92,\'92FixedPrice\'92,\'92StoreInventory\'92\ 202 | \ 203 | - Making new spider: ebay_spider_v2\ 204 | - All I need to scrape is the conditionDesription, startPrice \ 205 | - Turns out the two different bid history pages are due to 2 different version of the page. For example:\ 206 | {\field{\*\fldinst{HYPERLINK "http://www.ebay.com/bfl/viewbids/291989472205?item=291989472205&rt=nc&_trksid=p2047675.l2565"}}{\fldrslt http://www.ebay.com/bfl/viewbids/291989472205?item=291989472205&rt=nc&_trksid=p2047675.l2565}}\ 207 | {\field{\*\fldinst{HYPERLINK "http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=291989472205&rt=nc&_trksid=p2047675.l2565"}}{\fldrslt http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=291989472205&rt=nc&_trksid=p2047675.l2565}}\ 208 | Slightly different - I don\'92t know how I\'92m getting these two. But it\'92s not a big deal. We can just grab the entire HTML of either page, and use this code to get the startPrice:\ 209 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 210 | 211 | \f1 \cf3 bid_history_items = response.xpath("//span/text()").extract()\ 212 | if bid_history_items:\ 213 | for i,text in enumerate(bid_history_items):\ 214 | if text == 'Starting Price':\ 215 | startPrice = bid_history_items[i+1]\ 216 | item['startPrice'] = float(startPrice.replace('$',''))\ 217 | \ 218 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 219 | 220 | \f0 \cf0 It seems to work. \ 221 | \ 222 | - I also found out that sometimes if a listing is an auction but ends with a BuyItNow, the findCompletedItems does not always show that it was a \'91AuctionWithBIN\'92 or that it ended with a BuyItNow. It\'92s also going to require two nested scrapes to get the start price. 
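A minimal sketch of how the two nested scrapes could hand a partially filled item between callbacks via request.meta; URLs, selectors, and field names are illustrative, not the actual ebay_spider_v2 code:

import scrapy

class BidHistorySpiderSketch(scrapy.Spider):
    name = 'bid_history_sketch'

    def start_requests(self):
        # in the real project the (itemId, URL) pairs come from the Postgres table
        for item_id, url in [('291989472205', 'http://www.ebay.com/itm/291989472205')]:
            yield scrapy.Request(url, callback=self.parse, meta={'itemId': item_id})

    def parse(self, response):
        # first scrape: the listing page itself
        item = {'itemId': response.meta['itemId']}
        item['conditionDescription'] = response.xpath(
            '//div[@itemprop="itemCondition"]//text()').extract_first(default='')
        # second, nested scrape: follow to the bid history page, carrying the item along
        bid_history_url = 'http://www.ebay.com/bfl/viewbids/%s' % item['itemId']
        yield scrapy.Request(bid_history_url, callback=self.parse_start_price,
                             meta={'item': item})

    def parse_start_price(self, response):
        item = response.meta['item']
        texts = response.xpath('//span/text()').extract()
        for i, text in enumerate(texts):
            if text == 'Starting Price':
                item['startPrice'] = float(texts[i + 1].replace('$', ''))
        yield item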
\ 223 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 224 | \cf8 \cb9 I will have to investigate this further, but looking into listings of type \'91AuctionWithBIN\'92\ 225 | \cf4 \cb1 And seeing if they are fucked up.\cf0 \ 226 | \ 227 | \ 228 | 4/5:\ 229 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 230 | 231 | \f2 \cf10 - I think that a unique contribution to this problem would be to utilize concurrent listings, as well as completed listings.\ 232 | 233 | \fs28 - 234 | \fs24 Calculate 235 | \i similar 236 | \i0 (via NLP), concurrent (defined below) listing mean/median starting price. This alone is a useful application of ML.\ 237 | - Calculate similar completed listing mean/median start price. \ 238 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 239 | 240 | \f0 \cf0 \ 241 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 242 | 243 | \f2 \cf11 \cb12 It might be a good idea to consider the price of similar listings 244 | \i at the time of posting 245 | \i0 a given listing, but that would require lots of extra scraping. \cf4 \cb1 \ 246 | \cf11 \cb13 So instead, we\'92ll just consider the 247 | \i start price 248 | \i0 of concurrent listings.\cf0 \cb1 \ 249 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 250 | 251 | \fs28 \cf10 \ 252 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 253 | 254 | \fs24 \cf0 \ 255 | FEATURE ENGINEERING for mean/median startPrice of Concurrent Listings, and mean/median of startPrice and endPrice of Completed Listings. \ 256 | \ 257 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 258 | 259 | \f1 \cf0 FIRST create 4 new columns in dataframe \ 260 | similarConcurrentListing.meanStartPrice\ 261 | similarConcurrentListing.medianStartPrice\ 262 | similarCompletedListing.meanStartPrice\ 263 | similarCompletedListing.medianStartPrice\ 264 | similarCompletedListing.meanEndPrice\ 265 | similarCompletedListing.meanEndPrice\ 266 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 267 | 268 | \f2 \cf0 \ 269 | We need to define concurrent listings. 270 | \f1 \ 271 | \ 272 | 273 | \f2 Filter for Concurrent and Completed Listings 274 | \f1 \ 275 | 1 Method - \ 276 | if listing2.starTime > listing1.startTime, then l2.startTime should be < H1 hours after l1.starTime \ 277 | If listing2.startTime < listing1.startTime, then l2.endTime should be > H2 hours after listing1.startTime\ 278 | We could just choose H1 and H2, but maybe EDA is a good way of finding this best time. 
We could also use H1 and H2 as hyper parameters.\ 279 | \ 280 | sort dataframe by startTime DESC \ 281 | for listing1 in listings:\ 282 | for listing2 in listing: # only consider concurrent and past listings (exclude future listings)\ 283 | if listing2.startTime < listing1.endTime and listing2.endTime > listing1.startTime:\ 284 | # listing1 and listing2 are concurrent listings\ 285 | add to dataframe with [\'91concurrent\'92]=1 \ 286 | \ 287 | elif listing2.endTime <= listing1.startTime:\ 288 | # listing2 is a past listing for listing1 \ 289 | add to data frame with [\'91concurrent\'92]=0\ 290 | \ 291 | \ 292 | 2 Method - \ 293 | Actually instead, let\'92s frame the problem so that we calculate the percentage of overlap between the time periods of l1 and l2. Define l2 to be a concurrent listing if it\'92s percentage overlap with l1 is > H, where H is a number between 0 and 1. H could be a hyper parameter to the modeling pipeline. \ 294 | \ 295 | sort dataframe by startTime DESC \ 296 | for listing1 in listings:\ 297 | for listing2 in listing: # only consider concurrent and past listings (exclude future listings)\ 298 | if listing2.startTime < listing1.endTime and listing2.endTime > listing1.startTime:\ 299 | # listing1 and listing2 are concurrent listings\ 300 | add to dataframe with [\'91concurrent\'92]=1 \ 301 | \ 302 | elif listing2.endTime <= listing1.startTime:\ 303 | # listing2 is a past listing for listing1 \ 304 | add to data frame with [\'91concurrent\'92]=0\ 305 | \ 306 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 307 | 308 | \f3\b \cf0 3 Method - THE BEST (most realistic) 309 | \f1\b0 \ 310 | sort dataframe by startTime DESC\ 311 | for listing1 in listings:\ 312 | create new dataframe for concurrent and completed listings (no future listings)\ 313 | add listing1 to new dataframe\ 314 | for listing2 in listings:\ 315 | if listing2.startTime < listing1.startTime and listing2.endTime > listing1.startTime:\ 316 | add listing2 to new dataframe with [\'91concurrent\'92]=1\ 317 | else:\ 318 | add listing2 to new dataframe with [\'91concurrent\'92]=0\ 319 | calculate median startPrice of listings with [\'91concurrent\'92]==1 320 | \f2 \ 321 | \ 322 | For EDA on that aspect, Vrushank suggested plotting percentage of concurrent time (x-axis) vs. number of listings within that time, although that wouldn\'92t tell you how much the hyper parameter affects the accuracy of the model. 323 | \f1 \ 324 | \ 325 | \ 326 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 327 | 328 | \f2 \cf0 VECTORIZE 329 | \f1 \ 330 | Vectorize new dataframe (BoW, TFIDF, shingles, sPacy) as vdf\ 331 | Calculate similarity (Jaccard, Cosine), between listing1 and all other listings in vdf\ 332 | Set a threshold for similairty so that only items OVER that threshold will be considered in the model, i.e. filter vdf so that we only are considering listings very similar to the original listing\ 333 | # Now you have a vectorized df of listings similar to listing1\ 334 | calculate mean/median of concurrent=1 listings and concurrent=0 listings, seperately\ 335 | You now have 2 new features for your listing, which you can add as new columns to the row for listing1 in the original dataframe\ 336 | \ 337 | \ 338 | \ 339 | 340 | \f2 Just had an idea. 
Essentially, instead of finding similar titles myself, I use ebay\'92s search to find similar titles using topics created by LDA to find the commonly searched keywords from the titles that I already have in my database. Then, for each LDA topic/keyword, you search bay\'92s current and completed listings. But then you still have to calculate similarity, because your alternative is to just take the top n searches from ebay\'92s results, and obviously some of those results will be very different to each other. But using ebay\'92s search results combined with my own similarity calculation might be more effective than JUST using my own similarity calculation. Let\'92s try using my own similarity calculation first because we will have to do that anyway. 341 | \f1 \ 342 | \ 343 | \ 344 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 345 | 346 | \f2 \cf4 \cb14 Problem: People can list items as auctions, and update those listings midway through to have BuyItNow available, but this won\'92t show up in a grab from the ebay API. \ 347 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 348 | \cf0 \cb1 From the documentation:\ 349 | \pard\pardeftab720\sl300\partightenfactor0 350 | 351 | \f4\i\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 352 | Auction\ 353 | Competitive-bid online auction format. Buyers engage in competitive bidding, 354 | \b although Buy It Now may be offered as long as no valid bids have been placed 355 | \b0 .\ 356 | 357 | \i0 \ 358 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 359 | 360 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 This is happening for certain listings, such as 332172404108. \ 361 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 362 | 363 | \b \cf0 It has a listingType of \'91Auction\'92, no \'91startPrice\'92, BuyItNowAvailable=\'91f\'92, and bidCount=1, then that final selling bid was a Buy It Now. 364 | \b0 \ 365 | This must have happened because the user updated their listing after it was already posted and online for some time. \ 366 | \ 367 | xpath for \'91see original listing\'92 for these pages:\ 368 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 369 | \cf10 //span[contains(@class, 'vi-inl-lnk')]//@href\ 370 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 371 | \cf0 \ 372 | There are two options here.\ 373 | 1. We could go and scrape for the start price, and then make two listings out of each of these listings: 1 would be an \'91Auction\'92 listing that did NOT sell, and had the original start price. And the other would be a \'91AuctionWithBIN\'92 that DID sell. \ 374 | 2. We just make the listing a \'91AuctionWithBIN\'92 that DID sell, with a start price equal to the sale price.\ 375 | \ 376 | Let\'92s see how many of these listings there are. if there are a lot, that gives more reason to do option 1, because if we don\'92t, we are losing out on significant information with the \'91Auction\'92 listings that did not sell. But it\'92s going to take a lot of rescraping to get those, and just adds a shit ton of complications.\ 377 | The number of listings (so far on 4/5) is 4,432. 
\ 378 | I mean it\'92s not a big deal to go with option 2, because we\'92re just kind of pretending those \'91Auction\'92 listings didn\'92t exist, which isn\'92t horrible.\ 379 | \ 380 | So to deal with these listings with option 2:\ 381 | # listingType of \'91Auction\'92, no \'91startPrice\'92, BuyItNowAvailable=\'91f\'92, and bidCount=1\ 382 | mask = (df['listingInfo.listingType']=='Auction')\\\ 383 | & (np.isnan(df['startprice'])) \\\ 384 | & (df['sellingStatus.bidCount']==1.0)\ 385 | dfm = df[mask]\ 386 | dfm[\'91listingInfo.listingType\'92] = \'91AuctionWithBIN\'92\ 387 | dfm[\'91listingInfo.listingType\'92] = \'91AuctionWithBIN\'92\ 388 | \ 389 | OKAY and this new information:\ 390 | \pard\pardeftab720\sl300\partightenfactor0 391 | 392 | \f4\i\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 393 | AuctionWithBIN\ 394 | Same as Auction format, but Buy It Now is enabled. AuctionWithBIN changes to Auction if a valid bid has been placed on the item. 395 | \i0 \ 396 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 397 | 398 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 Which means we have funky information BUT here is what I realized. Because ebay doesn\'92t update \'91Auction\'92 to \'91AuctionWithBIN\'92 for listings where the user updated the listing type midway through\'85\ 399 | \ 400 | \'85We need to create our own 401 | \b endListingType 402 | \b0 , that concerns the end state of the listing. \ 403 | IF an item did NOT SELL, then we care about the END state (that\'92s the only state there is). \ 404 | - if sellingState = \'91EndedWithoutSales\'92:\ 405 | - 406 | \b endListingType 407 | \b0 = listingInfo.listingType\ 408 | - 409 | \b startprice 410 | \b0 = sellingStatus.currentPrice.value\ 411 | IF an item SOLD, then we care how about the END state (the state of the item when it was sold)\ 412 | - else if sellingState = \'91EndedWithSales\'92:\ 413 | - If listingInfo.listingType was \'91Auction\'92:\ 414 | - 415 | \b if listingType= \'91Auction\'92, \'91startprice\'92=NaN, \'92bidCount\'92=1.0 (because user can only change to BIN w/ 0 bids), then that final selling bid was a Buy It Now. 416 | \b0 This also catches the case where an AuctionWithBIN \ 417 | - 418 | \b endListingType 419 | \b0 = \'91AuctionWithBIN\'92\ 420 | - 421 | \b startprice 422 | \b0 = sellingStatus.currentPrice.value\ 423 | - 424 | \b if listingType= \'91Auction\'92, \'91startprice\'92 425 | \fs32 != 426 | \fs24 NaN\ 427 | - endListingType 428 | \b0 = \'91Auction\'92\ 429 | - 430 | \b startprice 431 | \b0 = startprice\ 432 | - if listingInfo.listingType = \'91AuctionWithBIN\'92: # see note\ 433 | 434 | \b - endListingType 435 | \b0 = \'91AuctionWithBIN\'92\ 436 | - 437 | \b startprice 438 | \b0 = sellingStatus.currentPrice.value\ 439 | # I believe this is only true for items that stayed as \'91AuctionWithBIN\'92 the entire life of the listing, and if I\'92m right\'85\ 440 | # I\'92m looking for an \'91AuctionWithBIN\'92 that changed to an \'91Auction\'92 because someone put a bid, but then eventually it SOLD as BuyItNow.\ 441 | # I don\'92t that exists because (from documentation): 442 | \f4\i On most sites, 443 | \b the Buy It Now option is removed (and this value returns false) once a valid bid is made on the associated item 444 | \b0 (a valid bid could be a bid above the reserve price). buyItNowAvailable will return "false" if the listing type is anything but "AuctionWithBIN". Please ignore buyItNowAvailable for fixed-price listings. 
445 | \i0 \ 446 | 447 | \f2 \ 448 | - if listingInfo.listingType= \'91FixedPrice\'92 or listingInfo.listingType= \'92StoreInventory\'92: # \'91FixedPrice\'92 means PURE BuyItNow, and no Auction available. \ 449 | - 450 | \b endListingType 451 | \b0 = \'91FixedPrice\'92 or \'91StoreInventory\'92\ 452 | - start price = sellingStatus.currentPrice.value 453 | \f0 \ 454 | \ 455 | \ 456 | \ 457 | \ 458 | \ 459 | 460 | \f2 4/6:\ 461 | Created baseline model. Used the following features:\ 462 | [titles_df, conditions_df, auction_condition_dummies, start_price_series, sold]\ 463 | Baseline accuracy: 464 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 465 | 0.793\ 466 | 467 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 Predicted Accuracy: 468 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 469 | 0.871\ 470 | 471 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 472 | \ 473 | 4/7:\ 474 | \ 475 | Directions to move forward in:\ 476 | \ 477 | \ 478 | \ 479 | - Used regression model to predict end prices for auction listings (x)\ 480 | \ 481 | - What model is best to use for this kind of regression, where we have NLP involved? - SVM, \'85?\ 482 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 483 | 484 | \f0 \cf0 \ 485 | \ 486 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 487 | 488 | \f2 \cf0 4/9:\ 489 | -\cb17 490 | \b Does lowering the start price increase the probability of selling an item? 491 | \b0 \cb1 To explore this question, we need to compare the start price and sold_state of items that are similar. So we want to compare items that are as similar as possible, with the only difference being the start price. Then we can make a logistic regression model using just startPrice and check the coefficient to see if if there\'92s a significant correlation. So the problem is to find items that are similar to each other. How do we do this? 492 | \b Use NLP to vectorize titles, then calculate similarity between items (using title, condition as features), then take one set of similar items, and see if there is a relationship between startPrice and sold_state. \ 493 | 494 | \b0 Plot Sold (x-axis) vs. Average Start Price \ 495 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 496 | \cf0 \cb18 - Using TF-IDF and cosine similarity, we were able to make this plot using a particular listing (the \'91Auction\'92 listing having the highest number of listings similar to it - which has an index of 3080), and using a similarity threshold of 0.5, and found the mean startPrice for sold items to be about $45, and for unsold items $125. \cb1 \ 497 | Let\'92s move on to plotting a change startPrice vs Probability of selling. \ 498 | \ 499 | \cb17 - 500 | \b We want to test if changing the startPrice actually causes my model to increase the likelihood of selling 501 | \b0 .\cb1 So, using my current classifier, I\'92ll take a listing that is very common (has many listings similar to itself), and have the model repeatedly calculate probability of SOLD for a listing, as I increase it\'92s start Price from 0x to 2x. Make a plot of the results. 1\ 502 | \cb18 - I\'92m only able to plot 1 plot at a time, so I can\'92t get an aggregate view. Based on briefly looking at some of the items, most of them seem to follow the pattern whereby as you increase startPrice, the probability of selling decreases. 
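A sketch of that start-price sweep, assuming an already-fitted classifier clf, a feature-building helper, and a single listing row; all names are illustrative:

import numpy as np
import matplotlib.pyplot as plt

multipliers = np.linspace(0.0, 2.0, 41)
sell_probs = []
for m in multipliers:
    candidate = listing.copy()
    candidate['startPrice'] = listing['startPrice'] * m
    X = build_features(candidate)                  # same transform used for training
    sell_probs.append(clf.predict_proba(X)[0, 1])  # P(sold)

plt.plot(multipliers, sell_probs)
plt.xlabel('start price multiplier (0x to 2x)')
plt.ylabel('predicted probability of selling')
plt.show()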
\cb1 \ 503 | \ 504 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 505 | \cf4 \cb11 - Another exciting problem came up. \cf0 \cb1 \ 506 | - Discovered that there are very few auctions with an endPrice between ~120 to ~180, and I\'92m not sure why this is. \ 507 | - \cf4 We should collect more \'91Auction\'92 data with a 120 < endprice < 180 to make up for this shortage (x)\cf0 \ 508 | - use 509 | \f4\fs26 \cf15 \cb16 \expnd0\expndtw0\kerning0 510 | MinPrice in item filter 511 | \f2 with 120 to 180 dollars. (x)\ 512 | - then delete duplicates (x) 513 | \fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 514 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 515 | \cf4 We have a huge dearth of items with endPrices around $120 to $180. I explored that and then then found that the create_end_listing_type_and_start_price was creating a lot of endListingType values of \'91Invalid URL\'92, which I was then filtering out, which was also filtering out a lot of endPrices in the range 120-180. I thought this meant that the URLs were actually 516 | \i \cf4 invalid 517 | \i0 \cf4 , but then I discovered that these URLs were actually valid (by using manual HTTP requests instead of the python port). So i looked into how I create the endListingType of \'91Invalid URL\'92 and found that the important condition is to have a startPrice of NaN, so basically a lot of items with endPrice in the range 120-180 had NaN startPrices\ 518 | \ 519 | So I had a lot of startPrice of NaN, which is weird because my scraper should have collected a startPrice if there is a valid URL. But my clue was that this weird behavior was only occurring for items with a specific range of endPrices, so that led me to believe that the most likely explanation was that I simply had accidentally skipped scraping some items when I set my url_start_index. So to attempt to scrape the right ones, I changed the scraper to query the database for items starting at the LEAST recent, and because I started scraping at $20 and moved up in price over time, I found out that the $120 items begin at around index 39500 in my database. So I started scraping at 39500, and we get to around $180 at index 54000, and scraping this many should take around 4 hours. \ 520 | \ 521 | Let\'92s see if this works. \ 522 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 523 | \cf4 \cb18 Yep, that was the problem. Just needed to scrape every itemId in our database. We\'92re still at a dearth for items with an endPrice > $650, so we need to keep scraping. \cf0 \cb1 \ 524 | \ 525 | \ 526 | 4/10:\ 527 | Goals:\ 528 | - Feature Engineer median startPrice of similar, completed listings\ 529 | - Make Pickles \ 530 | \ 531 | Journal:\ 532 | 1. Create a pickle out of cleaned auctions DataFrame.\ 533 | 2. Create a new notebook for feature engineering the median start price of concurrent listings\ 534 | - Create a pile of feature engineered auctions data frame\ 535 | - Import new auctions into old notebook \ 536 | \ 537 | 4/11:\ 538 | - Working on feature engineer, but the function is estimated to take around 2 hours. So I\'92ll run it during passover. \ 539 | - It ran in like 10 minutes. Weird. \ 540 | \ 541 | 4/12: \ 542 | - Add concurrent similar median start price feature and see if it improves model. \ 543 | - It did not. 
(x)\ 544 | - save EC2 Image (x)\ 545 | - Grid Search on EC2 instance for best classifier \ 546 | - Find median endPrice of similar, completed listings\ 547 | - profitability metric \ 548 | - Take a random sample of data\ 549 | - For each sample:\ 550 | - Calculate sold_probability and predicted_end_price for a range of start prices from 0 to 2x start price. \ 551 | - where sold_probability > 0.5, calculate optimal start price, and optimal end price. sold_state_pred = 1.\ 552 | - If there is no sold_probability > 0.5, then optimal start price = 0 and optimal end price = 0. sold_state_pred = 0.\ 553 | - if sold_state_pred == 1 and sold_state_true == 1:\ 554 | - Calculate profit_diff = end_price_pred - end_price_true\ 555 | - if sold_state_pred == 1 and sold_state_true == 0:\ 556 | - Calculate profit_diff = end_price_pred - end_price_true (which is 0)\ 557 | - if sold_state_pred == 0 and sold_state_true == 1:\ 558 | - Calculate profit_diff = end_price_pred (which is 0) - end_price_true\ 559 | average_profit_lift = np.mean(profit_diff)\ 560 | \ 561 | 4/13:\ 562 | - Created new EC2 instance (x)\ 563 | - Used default Linux AMI ->\ 564 | - Installed Anaconda with:\ 565 | - $ wget https://repo.continuum.io/archive/Anaconda2-4.3.1-Linux-x86_64.sh\ 566 | - $ bash Anaconda2-4.3.1-Linux-x86_64.sh n\ 567 | - Closed SSH opened new SSH\ 568 | - Transfer files to instance \ 569 | - ssh -i "ebay-price-predictor-3.pem" -L 8000:localhost:8888 ec2-user@ec2-54-183-29-45.us-west-1.compute.amazonaws.com\ 570 | \ 571 | - (on ec2) Grid Search RandomForestClassifier (X)\ 572 | - run a process in background: nohup python classification-grid-search.py &\ 573 | - $ top -> k -> (PID_#) -> 15\ 574 | \ 575 | 576 | \b Best Estimator 577 | \b0 : RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\ 578 | max_depth=10, max_features='auto', max_leaf_nodes=None,\ 579 | min_impurity_split=1e-07, min_samples_leaf=5,\ 580 | min_samples_split=6, min_weight_fraction_leaf=0.0,\ 581 | n_estimators=500, n_jobs=-1, oob_score=False,\ 582 | random_state=None, verbose=0, warm_start=False)\ 583 | - 584 | \b Best Score 585 | \b0 (roc_auc) 0.6544\ 586 | - 587 | \b Best Parameters 588 | \b0 : \{'max_features': 'auto', 'min_samples_split': 6, 'n_estimators': 500, 'max_depth': 10, 'min_samples_leaf': 5\}\ 589 | \ 590 | - (on ec2) Tried ExtraTreesClassifier(n_estimators=25, boostrap=True), which took WAY longer, and didn\'92t improve results. \ 591 | \ 592 | - Explore distribution of end prices \ 593 | - It\'92s skewed to the left \ 594 | - Should transform endPrice (log(endPrice) = output) and predict that then transform back (endPrice = 10^output ? 
)\ 595 | \ 596 | - Create time of day feature (x)\ 597 | - did not really help \ 598 | \ 599 | Question to ask vrushank:\ 600 | - How to transform endPrice?\ 601 | - Explain time of day feature didn\'92t help\ 602 | - \ 603 | \ 604 | \ 605 | Presentation:\ 606 | \pard\pardeftab720\sl320\partightenfactor0 607 | 608 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 609 | Baseline Accuracy: 0.857667278492\ 610 | Baseline RFC accuracy: 0.89738027699\ 611 | \ 612 | baseline mean_absolute_error: 66.7116406965 \ 613 | predicted mean_absolute_error: 27.0460162035\ 614 | \ 615 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 616 | 617 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 618 | \pard\pardeftab720\sl320\partightenfactor0 619 | 620 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 621 | Optimal Predicted End Price:$618.714156395, Optimal Start Price:$590.0, Chance of Selling:0.87, Expected Profit:$538.281316064\ 622 | \pard\pardeftab720\sl320\partightenfactor0 623 | 624 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 625 | \pard\pardeftab720\sl320\partightenfactor0 626 | 627 | \f1\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 628 | Average End Price: $262.72 629 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 630 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 631 | 632 | \f1\fs28 \cf0 Average Lift: $34.20\ 633 | % Average Lift: 13% Increased Profit on Average!\ 634 | on 20,000 Auction Listings: $\cf2 \cb16 \expnd0\expndtw0\kerning0 635 | 6,719,374.15 Net Increased Profit\ 636 | \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 637 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 638 | 639 | \f2\fs24 \cf0 \ 640 | \ 641 | \ 642 | Goals:\cf10 \ 643 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 644 | 645 | \b \cf0 Feature Engineering: 646 | \b0 \ 647 | - median startPrice of similar, completed listings (x)\ 648 | - Use preprocessing.normalize() on data and then create SCM start price feature again\ 649 | - Talk with Vrushank about investigation into SCM feature \ 650 | - do i need to normalize output end Prices since they are not normally distributed?\ 651 | - How to combine valida`tion metric and profitability metric? \ 652 | - median endPrice of similar, completed listings \ 653 | - time of day listing went on (x)\ 654 | \ 655 | 656 | \b Model Optimization (to combat over-fitting):\ 657 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 658 | 659 | \b0 \cf0 - plot a learning curve to figure out how much data I really need until training error & test error converge\ 660 | - Use this information to downsample (using pandas sample) when grid searching (to make models quicker to train) \ 661 | - grid search vectorizer settings\ 662 | - grid search models \ 663 | - grid search model parameters \ 664 | - optimize classification models for Recall \ 665 | - optimize regression models for mean_absolute_error \ 666 | \ 667 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 668 | 669 | \b \cf0 Profitability Optimization Metric: 670 | \b0 \ 671 | - Calculate the average expected price increase we were able to get for all items. 
\ 672 | - Use AWS/Dominoe to do this \ 673 | \ 674 | 675 | \b Identify Listing factors that contribute to changes in probability of selling & predicted end price: 676 | \b0 \ 677 | - Recommend replacements/deletion for words that decrease probability\ 678 | \ 679 | 680 | \b Using Images + NN To Predict Sale 681 | \b0 \ 682 | \ 683 | \ 684 | 4/14:\ 685 | - Extract Listing Features - Model, Lens, MegaPixels\ 686 | - we can use the getItem Call in the Trading API to get category specific information (like Model, MegaPixels)\ 687 | - We used getCategorySpecifics Call in the Trading API to get the \'93Item Specifics\'94 for the Digital Camera (#31388) category. They are:\ 688 | Type\ 689 | Brand\ 690 | MPN\ 691 | Series\ 692 | Model\ 693 | Megapixels\ 694 | Optical Zoom\ 695 | Features\ 696 | Color\ 697 | Bundled Items\ 698 | Connectivity\ 699 | Battery Type\ 700 | Manufacturer Warranty\ 701 | Screen Size\ 702 | Digital Zoom\ 703 | Country/Region of Manufacture\ 704 | \ 705 | 4/15:\ 706 | \ 707 | 3 ways to extract Brand, Model information:\ 708 | 1. Use ebay API. Problematic because we only get a certain number of calls per day.\ 709 | 2. Scrapy. Long development + takes many hours. \ 710 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 711 | \cf0 \cb17 3. Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 712 | \ 713 | \ 714 | I think the most important features for each listings are:\ 715 | Brand - \cb17 Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 716 | Model - \cb17 Extract Brand,Model from title using Brand,Model information already in category_specific table. Fastest. \cb1 \ 717 | Lens Type - \cb17 Extract Lens Type from title using string matching (\'93Body\'94=0, \\d-\\d=1, \'93Len\'94=1, \'93mm\'94=1. \cb1 \ 718 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 719 | \cf0 Condition - Already have (x)\ 720 | Seller Feedback - Already have (x)\ 721 | Free Shipping - Already have (x)\ 722 | Bundled Items / Extas - \cb14 Will need to use scrapy or several days of ebay API. Not likely to happen. \cb1 \ 723 | Original Packaging - \cb19 Would need to use scrapy. Not likely to happen\cb1 \ 724 | \ 725 | Features to Engineer with these features:\ 726 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 727 | 728 | \b \cf0 - Median listing price of FixedPrice listings of same Brand,Model, Condition for Classification.\ 729 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 730 | \cf0 - Median end price of Auctions,FixedPrice listings of same Brand,Model, Condition for Regression. 731 | \b0 \ 732 | \ 733 | \ 734 | Get a list of brands/models with:\ 735 | select DISTINCT "Brand" from category_specifics;\ 736 | select DISTINCT "Model" from category_specifics;\ 737 | \ 738 | 4/16:\ 739 | Model Extraction worked really well!!!\ 740 | First I extract obvious models with regex (for example 741 | \f5\fs28 \cf2 \cb16 \expnd0\expndtw0\kerning0 742 | dsc-wx300 743 | \f2 ). 744 | \fs24 Then i use TF-IDF vectorization + cosine similarity to match a filtered version of the title with the list of models pulled using ebay API. From a spot check, it\'92s working really well! 
More issues with the regex extraction than with TF-IDF actually, so we\'92re going to try to use TF-IDF the whole way. 745 | \b Actually just did a test - Regex is messing things up, so let\'92s just use TF-IDF + Cosine Similarity to extract model name on all listings. 746 | \b0 \ 747 | \ 748 | We also need to make more API calls with getItem so that we have a bigger camera Model list. Right now we only have like 1000 distinct models. Which, with my spot check, has been fine, but we can use some of my 5000 limit to potentially get more models. We could also use the API to get Bundled Items. However I think I want to focus on just auctions at this point. \ 749 | So I need to 750 | \b create a table called 751 | \f3 category_specifics_auctions 752 | \f2\b0 :\ 753 | 1. Copy itemId from completed_items_v2 where listingType != \'91FixedPrice\'92 and listingType != \'91StoreInventory\'92 into table called category_specifics_auctions\ 754 | 2. Update category_specifics_auctions using rows from category_specifics that have non-empty Brands.\ 755 | TOO TIME-INTENSIVE!\ 756 | We\'92re just going to use category_specifics to get a list of brands and models. \ 757 | We will need to use scrapy if we want to get Bundled Items.\ 758 | \ 759 | Lens extraction with regex went fine, but I realized that the 18-55mm text is only PART of the lens description. There 760 | \f1\fs28 is more to the lens than the 15-55mm 761 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \ 762 | \ 763 | Okay so Model Extraction did NOT work really well. I don\'92t know why I thought it worked well earlier. \ 764 | I still need to extract brand, model information from title. But I need a CLEAN LIST of models to compare with the listing title. So In order to get a clean list, I\'92ll scrape B&H Video for all their cameras, and store that information into a table in my database called b_h_camera_inventory\ 765 | Create table:\ 766 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 767 | 768 | \f1 \cf0 CREATE TABLE b_h_digital_camera_inventory (\ 769 | "id" SERIAL PRIMARY KEY,\ 770 | "Brand" TEXT,\ 771 | "Model" TEXT,\ 772 | "Retail Price" DECIMAL,\ 773 | "Body Only" BOOLEAN,\ 774 | "Kit" BOOLEAN,\ 775 | "Has Lens" BOOLEAN,\ 776 | "Lens" TEXT,\ 777 | "B&H Id" TEXT,\ 778 | "Title" TEXT\ 779 | ); 780 | \f2 \ 781 | \ 782 | # -*- coding: utf-8 -*-\ 783 | \ 784 | # Define here the models for your scraped items\ 785 | #\ 786 | # See documentation in:\ 787 | # http://doc.scrapy.org/en/latest/topics/items.html\ 788 | \ 789 | import scrapy\ 790 | \ 791 | \ 792 | class CameraRetailerScraperItem(scrapy.Item):\ 793 | # define the fields for your item here like:\ 794 | # name = scrapy.Field()\ 795 | brand = scrapy.Field(default='NULL')\ 796 | model = scrapy.Field(default='NULL')\ 797 | retailPrice = scrapy.Field(default='NULL')\ 798 | bodyOnly = scrapy.Field(default='NULL')\ 799 | kit = scrapy.Field(default='NULL')\ 800 | hasLens = scrapy.Field(default='NULL')\ 801 | lens = scrapy.Field(default='NULL')\ 802 | bhId = scrapy.Field(default='NULL')\ 803 | \ 804 | \ 805 | \ 806 | 4/17:\ 807 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 808 | 809 | \b\fs32 \cf0 Classification 810 | \b0\fs24 \ 811 | 812 | \b\fs28 pd.read_pickle('./pickles/df_classification_count_vec.p') 813 | \b0\fs24 \ 814 | \ 815 | RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1) \ 816 | 
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 817 | \cf0 \ul \ulc0 Not Cross Val\ulnone \ 818 | Baseline Accuracy: 0.847\ 819 | Model accuracy: 0.903\ 820 | \ul 3-Fold Cross Val:\ulnone \ 821 | \ 822 | Logistic Regression:\ 823 | \ul Not\ulnone \ul Cross Val:\ulnone \ 824 | Baseline Accuracy: 0.8475 \ 825 | Model accuracy: 0.8754\ 826 | \ul 3-Fold\ulnone \ul Cross Val:\ulnone \ 827 | \cb20 Accuracy: 0.781099028892\cb1 \ 828 | \ 829 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 830 | \cf0 Logistic Regression Interesting Important Features:\ 831 | 6th (u\'92fast', 1.87)\ 832 | 9th (u\'92box', 1.6658)\ 833 | 18th (u\'92gently used', 1.3718)\ 834 | 25th (u\'92good cosmetic', 1.0716)\ 835 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 836 | \cf0 \ 837 | \ 838 | Classification Ensemble (RF, LR, XG):\ 839 | \ul KFold Cross Val:\ulnone \ 840 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 841 | \cf0 [(0.94599636950383215, 0.89573459715639814), \ 842 | (0.94433519891090612, 0.88674868898749493), \ 843 | (0.94418393586446836, 0.89179104477611937)]\ 844 | \cb21 Overfitting.\cb1 \ 845 | Baseline Accuracy: 0.854 846 | \f1 \ 847 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 848 | 849 | \f2 \cf0 \cb22 Cross Validated Ensemble GMean Prediction Accuracy: 0.891\ 850 | Increase Accuracy due to model: 0.036\cb1 \ 851 | \ 852 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 853 | 854 | \b\fs32 \cf0 Regression\ 855 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 856 | 857 | \b0\fs24 \cf0 RandomForestRegressor\ 858 | \pard\pardeftab720\sl320\partightenfactor0 859 | 860 | \f1\fs28 \cf23 \cb24 \expnd0\expndtw0\kerning0 861 | \outl0\strokewidth0 \strokec23 Average Cross Validated RFR Score: -41.1394388889\ 862 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 863 | 864 | \f2\fs24 \cf0 \cb1 \kerning1\expnd0\expndtw0 \outl0\strokewidth0 \ 865 | \ 866 | \ 867 | \ 868 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 869 | \cf0 \ 870 | \ 871 | \ 872 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 873 | \cf0 \ 874 | \ 875 | \ 876 | \ 877 | \ 878 | \ 879 | \ 880 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ebay Listing Optimizer With Machine Learning 2 | 3 | If you've ever bought something on Ebay, you know that it can be difficult to know if a particular listing is a good deal or not. And if you're selling, it can be hard to determine which options will draw bidders to your auction. What if there was a way to increase the likelihood that you would sell your listing on ebay, just by swapping a few keywords in your title? What if there was a way to filter listings for only the best deals? 
4 | 5 | To investigate these questions, I dug into actual ebay data and built a machine learning system to help sellers make more sales on ebay, and to alert shoppers to the best deals so they can make smarter buying decisions and save money. In order to do that, I had to create models that could predict whether an auction would sell (at least one person would bid), and, if a listing would sell, how much it would sell for. 6 | 7 | In this report, I'll discuss the decisions I made and show relevant code blocks and visualizations that describe my modeling process. 8 | 9 | # 1. Web Scraping, Data Cleaning, Data Piping 10 | Using the Ebay API and a related Python wrapper [https://github.com/timotheus/ebaysdk-python], I collected data for 100,000 completed listings in the "Digital Camera" category for the past 3 months, and stored the data in a table within a postgres database. I ended up using 20,000 "Auction" type listings within the original dataset. 11 | ![Sample of rows for a listing in Postgres database](./capstone-technical-report/images/completed_items_v2.png) 12 | 13 | The data included features like 14 | 15 | - Listing Title 16 | - Seller Feedback Score 17 | - Free Shipping Available 18 | - Listing End Price 19 | 20 | However, I suspected that start price and condition descriptions would be important features. Since they were not available through the API, I created a scraper with Scrapy to fetch URLs from my database, scrape the start price and condition, and store them back into my database. 21 | 22 | Now that I had the necessary data in my database, I could import it into Python. 23 | ![Subset of columns of my data](./capstone-technical-report/images/example_dataframe.png) 24 | 25 | 26 | # 2. Pre-Processing 27 | 28 | Pre-processing involved transforming the text features (the listing title and the condition description) into numerical data. I used TF-IDF vectorization for the listing titles, since I wanted unique words, like camera model names, to receive higher weights. For condition descriptions, I used a count vectorizer, because I noticed many of the same words used across many different descriptions, such as "Functional", "Like New", and "Scratched." I didn't want to down-weight these kinds of words. 29 | 30 | I also scaled all predictors to the same range so that I could compare model coefficients to each other directly. 31 | 32 | 33 | # 3. Modeling 34 | 35 | ### Classification 36 | 37 | The classification problem involved predicting whether or not a given auction listing would sell. 38 | 39 | As a baseline, I used a model which simply predicted the majority class (1) for each listing, which resulted in the following scores: 40 | 41 | | Model | Accuracy | Precision | Recall | F-1 | 42 | |---------------------------|----------|-----------|--------|-------| 43 | | Majority Class Classifier | 0.854 | 0.854 | 1.000 | 0.917 | 44 | 45 | I then ran a simple Random Forest (100 trees), which gave me the following results for 3-fold CV: 46 | 47 | | Model | Accuracy | Precision | Recall | F-1 | 48 | |---------------------------|----------|-----------|--------|-------| 49 | | Random Forest Classifier | 0.860 | 0.877 | 0.95 | 0.911 | 50 | 51 | I chose to optimize for accuracy, since I considered false positives to be just as important as false negatives for predicting the sale state of auction listings. Accuracy provides a simple metric for evaluating how many listings my models are classifying incorrectly. 52 | 53 | ### Regression 54 | 55 | The regression problem was to predict the end price *for listings that sold*. 
I was not interested in predicting the end price of items that did not sell, since that did not fit into my goal of helping buyers find over-valued or under-valued deals. If my model was trained on unsold data, then the regression results could not reliably be used to predict what price items will eventually *sell* at. 56 | 57 | As a baseline, I used the median price of all listings as my prediction for all listings. 58 | 59 | | Model | Median Absolute Error ($) | 60 | |---------------------------|----------| 61 | | Median End Price Prediction | $66.71 | 62 | 63 | I then ran a simple random forest regressor and got the following result: 64 | 65 | | Model | Median Absolute Error ($) | 66 | |---------------------------|----------| 67 | | Baseline Random Forest Regressor | $38.36 (-$28.35) (13.3%) | 68 | 69 | This first model dropped the median absolute error by $28.35, landing at an average error of 13.3% relative to the actual end price of each listing. 70 | 71 | Before moving on to ensembling various classifiers, I decided to attempt to create a feature in the hopes of increasing the accuracy of my model. 72 | 73 | # 4. Feature Engineering 74 | When I thought about what potential factors could contribute to a particular listing selling or not, I hypothesized that listings on ebay are affected by other similar listings. Specifically, I thought that the start price of auctions listed on ebay at the same time, or listed "concurrently", would affect their respective end prices, and I wanted to explore this route. I suspected that the current price of each listing at the time of listing might be more influential than the start price, but in the interest of time, I decided to focus on start price. 75 | 76 | I defined a listing to be concurrent with another listing if the second was posted before the first ended (without any restriction on how much the two listings needed to overlap), and performed the filtering in Python. 77 | 78 | In order to filter for "similar listings", I vectorized each listing's title using sklearn's `TfidfVectorizer` and then calculated a cosine similarity score for each listing. I took only the top 5 most similar items, or those items with a similarity score greater than 0.95, whichever provided more results. I chose 5 and 0.95 through spot-checking the results for a balance between number of results and accuracy in terms of observed similarity. 79 | 80 | The essence of the code is along the lines of the following sketch (variable names are illustrative): 81 | ```python 82 | concurrent_listings_df = auctions_subset[ 83 | auctions_subset['listingInfo.endTime'].apply(lambda sub_listing_et: listing_start_time < sub_listing_et) 84 | & (similarity_scores > min_sim_score)] 85 | ``` 86 | 87 | After I had the top 5 concurrent, similar listings, I took the median start price, and used that as a feature in my models. 88 | 89 | Unfortunately, the new feature did not improve either model: 90 | Accuracy score with feature: 0.825 (-2.9%) 91 | Median Absolute Error: $42.12 (+$3.76) 92 | 93 | When I plotted the median start price of concurrent, similar listings versus end price, I found this: 94 | ![Concurrent Median start price vs. End Price](./capstone-technical-report/images/csm_start_price.png) 95 | 96 | There is no correlation between the two, which suggests that people do not consider the *start price* of concurrent, similar listings when deciding to bid on items. However, my hunch is that people do consider the *current price* of concurrent, similar listings. 
Due to time constraints, I decided to move on instead of attempting to acquire the bidding history for each listing. 97 | 98 | # 5. Hyper-parameter Optimization 99 | 100 | To increase the accuracy of my modeling efforts, I decided to create an ensemble of classifiers for the classification task. I experimented with KNN, Logistic Regression, Gradient Boosted Trees, and Random Forest. Ultimately, I chose an ensemble of Gradient Boosted Trees, Random Forest, and Logistic Regression, averaging their respective probability estimates through a geometric mean and using a decision threshold of 0.5. 101 | 102 | Although the feature engineering was not as successful as I had hoped, I knew that I could still reduce over-fitting, and therefore reduce my test error, by grid-searching for optimal hyper-parameters. I knew that I was over-fitting because my training and test errors were significantly different. For the Random Forest classifier, my accuracy scores were 0.927 for training and 0.877 for testing. The difference of 5% told me that my model was not generalizing well enough, and that tuning hyper-parameters would potentially decrease the variance of my model. 103 | 104 | I used Amazon EC2 to run the grid search and model fitting on a more powerful computer than I had available locally. 105 | 106 | ### Classification 107 | For the XGBoost model, I grid searched through: 108 | ```python 109 | 'max_depth': [3,5,7,9] 110 | ``` 111 | and found the best depth to be 7. 112 | 113 | For logistic regression, adding an L2 (Ridge) regularization term with a weight of 0.8 provided the best results. 114 | 115 | And for Random Forest, I grid searched through: 116 | ```python 117 | 'min_samples_split':[2, 4, 6], 118 | 'min_samples_leaf':[1,3,5], 119 | 'max_depth':[4, 8] 120 | ``` 121 | but found the best parameters to be the defaults (with 500 trees). 122 | 123 | When I combined these three models into an ensemble, I was able to achieve the following scores: 124 | 125 | | Model | Accuracy | Precision | Recall | F-1 | 126 | |---------------------------|----------|-----------|--------|-------| 127 | | Ensemble (RF, LR, XGboost) | 0.891 (+3.7%) | 0.90 | 0.98 | 0.942 | 128 | 129 | An increase of 3.7% from baseline. Modest improvement! 130 | 131 | ### Regression 132 | 133 | For regression, I experimented with Linear Regression (including Lasso and Ridge regularization), Random Forest Regressor, and SKLearn's Gradient Boosted Regressor, and found the best model to be the Gradient Boosted Regressor. 134 | 135 | 136 | | Model | Median Absolute Error ($) | 137 | |---------------------------|----------| 138 | | Optimized Gradient Boosted Regressor | $25.82 (-$40.91), or 9.6% | 139 | 140 | At best, we were able to achieve an average error of 9.6% on end price predictions across the entire dataset. 141 | 142 | 143 | # 6. Application 144 | 145 | ### Seller Listing Optimizer (Classification) 146 | 147 | It's a neat ML exercise to try to predict whether an auction is going to sell or not, but how is it useful to sellers? One way is that, by looking at feature weights in the logistic regression classifier, we can determine which features increase or decrease the probability of sale of a listing. 148 | 149 | For instance, it is possible that certain words in the title or condition description may increase the probability of sale, and in fact, that is the case. 
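As a rough sketch of how these weights can be inspected, the snippet below assumes a fitted scikit-learn `LogisticRegression` (here called `log_reg`) and the fitted text vectorizer (`vectorizer`) used on the title and condition text; these names are placeholders rather than the exact objects from my notebooks:

```python
import numpy as np

# Illustrative only: `vectorizer` and `log_reg` stand in for the fitted
# CountVectorizer/TfidfVectorizer and LogisticRegression used in this project.
feature_names = np.array(vectorizer.get_feature_names())
coefs = log_reg.coef_.ravel()  # one coefficient per vectorized token

# The largest positive coefficients are the words most strongly associated
# with an auction selling (class 1).
top_positive = np.argsort(coefs)[::-1][:25]
for rank, idx in enumerate(top_positive, start=1):
    print('{:>2}. {:<20} {:+.4f}'.format(rank, feature_names[idx], coefs[idx]))
```

Because all predictors were scaled to the same range, these coefficients can be compared against one another directly, which is what makes the word-level comparisons below meaningful.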
150 | 151 | Looking at the feature weights in the logistic regression model, we can see that some words are associated with increases in the probability of sale: 152 | 'slightly used', 0.649 153 | 'gently used', 1.369 154 | 155 | 'Slightly used' and 'gently used' are two ways of saying the same thing, but one is associated with a much higher probability of sale. To test the effect of using 'gently used', we can look at the following case study: 156 | ![Case Study](./capstone-technical-report/images/classification_case_study.png) 157 | 158 | The condition description - *Camera is in good, working condition with minor cosmetic wear* - is what we want to focus on. What happens if we include the term "gently used" in the condition description? 159 | ![Gently used](./capstone-technical-report/images/gently_used.png) 160 | 161 | As you can see from the bar chart above, including the term "gently used" in the condition description causes the model to predict a 3% higher probability of sale for this particular item. 162 | 163 | Although it is only a modest increase, the example shows that features can be tweaked to increase a listing's probability of sale. 164 | 165 | ### Over-Valued and Under-Valued Item Alert (Regression) 166 | 167 | On the buyer's side of things, predicting the end price of an auction is useful because it gives us an idea of the value of items on Ebay. If we know the going price for an auction on ebay, then we can alert buyers to whether or not the item is under- or over-valued at the current bid price. If it's over-valued, the buyer can avoid that listing and instead wait for a listing that is under-valued. The buyer can also simply set a maximum bid ceiling equal to the predicted end price, and feel comfortable knowing that they won't be paying more than the market rate for that item. 168 | 169 | Let's take a look at an example: 170 | ![cyber shot](./capstone-technical-report/images/cyber_shot.png) 171 | 172 | This Cyber Shot digital camera sold for $369.00, but did the buyer get a good deal? 173 | 174 | Let's investigate. 175 | 176 | The actual end price for the camera was $369.00, but the *predicted* end price, according to our model, was $314.90. 177 | If we factor in our 9.6% average error, then in the worst case, where the error pushes the prediction upward, the predicted price could be as high as $314.90 + 9.6% = $345.13. 178 | 179 | In this scenario, the buyer actually overpaid by $369.00 - $345.13 = $23.87. 180 | 181 | If the buyer had waited to bid on another listing, they could have potentially saved $24! 182 | 183 | If we aggregate these findings to all sold listings in our database, we find that 24.4% of listings, or 3,959, were over-priced. The average amount over-paid on each listing is $27.85, which means that all buyers of digital cameras on ebay could have collectively saved $110,277. That's a nice chunk of change! 184 | 185 | Below is a mock-up of a web app that could use the models to alert buyers to the value of items they are browsing. 186 | 187 | ![Mock up of a web app that would use the research I've outlined in this report.](./capstone-technical-report/images/buyers_guide_example.png) 188 | 189 | Buyers would know which listings are OK to bid on, and which they should avoid in order to maximize their spending power. 190 | 191 | # 7. Conclusion 192 | 193 | Overall, I found this to be an incredibly helpful learning experience; however, I would want to reduce the regression error to below 5% before I would use the model myself. 
194 | 195 | One avenue I would be interested in exploring is using listing images as features in my model. One way of doing this would be to train a neural network using the greyscale image matrix as an input, with the sold state (1=sold, 0=unsold) as the target. My hypothesis is that higher quality images tend to sell more often than lower quality images, and if this were true, then the neural network would learn to distinguish low quality images from high quality ones. The network could then be used to classify each image, and that classification could be fed as an input into the classification and regression models. This is just one idea for how I might extend this project in the future. 196 | 197 | Thanks for reading! I hope you find this write-up useful in your own data science journey. 198 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-time-machine -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/__init__.py -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/__init__.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BhPhotoDigitalCameraItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | brand = scrapy.Field() 15 | model = scrapy.Field() 16 | retail_price = scrapy.Field() 17 | body_only = scrapy.Field() 18 | kit = scrapy.Field() 19 | has_lens = scrapy.Field() 20 | lens = scrapy.Field() 21 | bh_id = scrapy.Field() 22 | title = scrapy.Field() -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/items.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BhPhotoScraperSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import psycopg2 9 | import logging 10 | 11 | 12 | class BhPhotoDigitalCameraPipeline(object): 13 | 14 | def __init__(self, postgres_host,postgres_user,postgres_db,postgres_table): 15 | self.postgres_host=postgres_host 16 | self.postgres_user=postgres_user 17 | self.postgres_db=postgres_db 18 | self.postgres_table=postgres_table 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls( 23 | postgres_host=crawler.settings.get('POSTGRES_HOST'), 24 | postgres_user=crawler.settings.get('POSTGRES_USER'), 25 | postgres_db=crawler.settings.get('POSTGRES_DB'), 26 | postgres_table=crawler.settings.get('POSTGRES_TABLE'), 27 | ) 28 | 29 | 30 | def open_spider(self, spider): 31 | self.conn = psycopg2.connect("dbname={} user={} host={}".format(self.postgres_db, \ 32 | self.postgres_user, \ 33 | self.postgres_host) \ 34 | ) 35 | self.cur = self.conn.cursor() 36 | 37 | 38 | def process_item(self, item, spider): 39 | '''store data into postgres database 40 | 41 | ''' 42 | for field in item.fields: 43 | item.setdefault(field, 'NULL') 44 | 45 | # SQL = ''' 46 | # UPDATE {table_name} 47 | # SET "Brand"='{brand}', 48 | # "Model"='{model}', 49 | # "Retail Price"={retail_price}, 50 | # "Body Only"={body_only}, 51 | # "Kit"={kit}, 52 | # "Has Lens"={has_lens}, 53 | # "Lens"='{lens}', 54 | # "B&H Id"='{bh_id}', 55 | # "Title"='{title}' 56 | # ; 57 | # '''.format( table_name=self.postgres_table, 58 | # brand=item['brand'], 59 | # model=item['model'], 60 | # retail_price=item['retail_price'], 61 | # body_only=item['body_only'], 62 
| # kit=item['kit'], 63 | # has_lens=item['has_lens'], 64 | # lens=item['lens'], 65 | # bh_id=item['bh_id'], 66 | # title=item['title'] 67 | # ) 68 | 69 | 70 | insert_statement = '''INSERT INTO {table_name} (%s) VALUES %s;'''.format(table_name=self.postgres_table) 71 | 72 | keys = ['Brand','Title'] 73 | keys = ['"{}"'.format(key) for key in keys] 74 | values = (item['brand'],item['title']) 75 | 76 | SQL = self.cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), values)) 77 | 78 | try: 79 | self.cur.execute(SQL) # execute SQL, and commit changes 80 | self.conn.commit() 81 | except: 82 | logging.debug('Error with executing SQL statement.\n SQL = {}'.format(SQL)) 83 | self.conn.rollback() 84 | 85 | 86 | return item 87 | 88 | 89 | def close_spider(self, spider): 90 | self.conn.close() 91 | self.cur.close() 92 | 93 | 94 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/pipelines.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for bh_photo_scraper project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | import logging 13 | 14 | 15 | BOT_NAME = 'bh_photo_scraper' 16 | 17 | SPIDER_MODULES = ['bh_photo_scraper.spiders'] 18 | NEWSPIDER_MODULE = 'bh_photo_scraper.spiders' 19 | 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0" 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | CONCURRENT_REQUESTS = 1 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | DOWNLOAD_DELAY = 1 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | #CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'bh_photo_scraper.middlewares.BhPhotoScraperSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'bh_photo_scraper.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | #ITEM_PIPELINES = { 71 | # 'bh_photo_scraper.pipelines.BhPhotoScraperPipeline': 300, 72 | #} 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | ITEM_PIPELINES = { 77 | 'bh_photo_scraper.pipelines.BhPhotoDigitalCameraPipeline': 300, 78 | } 79 | # set up the pipeline settings for postgres 80 | POSTGRES_HOST = "localhost" 81 | POSTGRES_USER = "nathan" 82 | POSTGRES_DB = "ebay" 83 | POSTGRES_TABLE = "b_h_digital_camera_inventory" 84 | 85 | 86 | # Enable and configure the AutoThrottle extension (disabled by default) 87 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 88 | AUTOTHROTTLE_ENABLED = True 89 | # The initial download delay 90 | AUTOTHROTTLE_START_DELAY = 1 91 | # The maximum download delay to be set in case of high latencies 92 | AUTOTHROTTLE_MAX_DELAY = 2 93 | # The average number of requests Scrapy should be sending in parallel to 94 | # each remote server 95 | AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 96 | # Enable showing throttling stats for every response received: 97 | #AUTOTHROTTLE_DEBUG = False 98 | 99 | # Enable and configure HTTP caching (disabled by default) 100 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 101 | #HTTPCACHE_ENABLED = True 102 | #HTTPCACHE_EXPIRATION_SECS = 0 103 | #HTTPCACHE_DIR = 'httpcache' 104 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 105 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 106 | 107 | # LOGGING 108 | # LOG_FILE = '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_spider_log.log' 109 | # LOG_ENABLED = True 110 | # LOG_LEVEL = logging.ERROR 111 | 112 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/settings.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/spiders/__init__.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/bh_photo_scraper/bh_photo_scraper/') 3 | 4 | import scrapy 5 | from scrapy.spiders import CrawlSpider 6 | import psycopg2 7 | import logging 8 | 9 | from items import BhPhotoDigitalCameraItem 10 | from bs4 import BeautifulSoup 11 | 12 | class DigitalCameraSpider(scrapy.Spider): 13 | name = "digital_camera_spider" 14 | 15 | 16 | def start_requests(self): 17 | 18 | url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/N/4288586282' # Digital Cameras 19 | # url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/1/N/4288586282?via=js' 20 | 21 | # Get number of pages 22 | yield scrapy.Request(url=url, callback=self.get_num_pages) 23 | 24 | # yield scrapy.Request(url=url, callback=self.parse, meta={'num_pages':num_pages}) 25 | 26 | 27 | 28 | def parse(self, response): 29 | 30 | num_pages = response.meta['num_pages'] 31 | page_num = response.meta['page_num'] 32 | 33 | ids = response.xpath("//span[1]/span[@class='sku']/text()").extract() 34 | 35 | brands = response.xpath("//a[@class='c5']/span[1]/text()").extract() 36 | titles = response.xpath("//a[@class='c5']/span[2]/text()").extract() 37 | 38 | if len(brands) != len(titles): # an element in brands is a new-release title, remove it 39 | for i,brand in enumerate(brands): 40 | if len(brand.split()) > 1: 41 | brands.pop(i) 42 | 43 | 44 | 45 | # XPATH does not work entirely for prices, use beautifulsoup instead 46 | # soup = BeautifulSoup(response.body, 'lxml') 47 | # prices = [float(price.get_text().strip().strip('$').replace(',','')) \ 48 | # for price in soup.find_all('span','price')] 49 | 50 | 51 | for i in range(len(brands)): 52 | item = BhPhotoDigitalCameraItem() 53 | item['brand'] = brands[i].strip() 54 | item['title'] = titles[i].strip() 55 | 56 | # NOTE: Sometimes, the price field is not there, hopefully this only occurs when 57 | # the item is at the bottom of the page, otherwise the brands,titles,ids,prices 58 | # elements will be out of sync. 
59 | # item['bh_id'] = ids[i].strip() 60 | # try: 61 | # item['retail_price'] = prices[i] 62 | # except IndexError as e: 63 | # print e 64 | # item['retail_price'] = None 65 | 66 | 67 | yield item 68 | 69 | 70 | 71 | # when done processing items, move onto next page 72 | if page_num <= num_pages: 73 | logging.debug('Scraping page {}'.format(page_num)) 74 | next_url = 'https://www.bhphotovideo.com/c/buy/Digital-Cameras/ci/9811/pn/{}/N/4288586282?via=js'.format(page_num) 75 | yield scrapy.Request(next_url, callback=self.parse, meta={'num_pages':num_pages,'page_num':page_num+1}) 76 | else: 77 | logging.debug('Should be done scraping..') 78 | # raise CloseSpider('Done Crawling.') 79 | yield 80 | 81 | 82 | def get_num_pages(self, response): 83 | logging.debug('Made it here!') 84 | num_pages = response.xpath("//p[@class='pageNuber']/text()").extract_first().strip().split()[-1] 85 | page_num = 1 # start at page 1 86 | yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True, meta={'num_pages':num_pages,'page_num':page_num+1}) 87 | 88 | 89 | -------------------------------------------------------------------------------- /bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/bh_photo_scraper/bh_photo_scraper/spiders/digital_camera_spider.pyc -------------------------------------------------------------------------------- /bh_photo_scraper/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = bh_photo_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = bh_photo_scraper 12 | -------------------------------------------------------------------------------- /capstone-technical-report/images/buyers_guide_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/buyers_guide_example.png -------------------------------------------------------------------------------- /capstone-technical-report/images/classification_case_study.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/classification_case_study.png -------------------------------------------------------------------------------- /capstone-technical-report/images/completed_items_v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/completed_items_v2.png -------------------------------------------------------------------------------- /capstone-technical-report/images/csm_start_price.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/csm_start_price.png -------------------------------------------------------------------------------- /capstone-technical-report/images/cyber_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/cyber_shot.png -------------------------------------------------------------------------------- /capstone-technical-report/images/example_dataframe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/example_dataframe.png -------------------------------------------------------------------------------- /capstone-technical-report/images/gently_used.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/capstone-technical-report/images/gently_used.png -------------------------------------------------------------------------------- /data-analysis/utilities/clean_text.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import re 3 | import nltk 4 | from nltk.corpus import stopwords 5 | from nltk.stem.porter import PorterStemmer 6 | from nltk.stem.lancaster import LancasterStemmer 7 | 8 | def clean_text(doc, remove_stop_words=True, remove_digits=False, remove_punc=True, stem=False): 9 | 10 | # 1. Remove any HTML markup 11 | text = BeautifulSoup(doc, 'lxml').get_text() 12 | 13 | # 2. Extract special negator like n't 14 | text = re.sub('n\'t', ' not', text) 15 | 16 | # 3. remove punctuation(except .-) 17 | if remove_punc: 18 | text = re.sub('[^a-zA-Z.\-\d]', ' ', text) 19 | 20 | if remove_digits: 21 | text = re.sub('[.\d]', ' ', text) 22 | 23 | # 4. Convert to lower case 24 | text = text.lower() 25 | 26 | # 5. Remove stop words 27 | if remove_stop_words: 28 | stops = set(stopwords.words("english")) 29 | text = [w for w in text.split(' ') if not w in stops] 30 | text = ' '.join(text) 31 | 32 | # 6. apply Porter Stemming 33 | # probably don't need this 34 | if stem: 35 | stemmer = PorterStemmer() 36 | stemmer = LancasterStemmer() 37 | text = [stemmer.stem(w) for w in text.split(' ')] 38 | text = ' '.join(text) 39 | 40 | # 7. Remove extra white space 41 | text = re.sub(' +',' ', text) 42 | 43 | return text -------------------------------------------------------------------------------- /data-analysis/utilities/plot_learning_curve.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | from sklearn.model_selection import learning_curve 4 | from sklearn.model_selection import ShuffleSplit 5 | 6 | 7 | def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, 8 | n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5), 9 | scoring='accuracy'): 10 | """ 11 | Generate a simple plot of the test and training learning curve. 12 | 13 | Parameters 14 | ---------- 15 | estimator : object type that implements the "fit" and "predict" methods 16 | An object of that type which is cloned for each validation. 
17 | 18 | title : string 19 | Title for the chart. 20 | 21 | X : array-like, shape (n_samples, n_features) 22 | Training vector, where n_samples is the number of samples and 23 | n_features is the number of features. 24 | 25 | y : array-like, shape (n_samples) or (n_samples, n_features), optional 26 | Target relative to X for classification or regression; 27 | None for unsupervised learning. 28 | 29 | ylim : tuple, shape (ymin, ymax), optional 30 | Defines minimum and maximum yvalues plotted. 31 | 32 | cv : int, cross-validation generator or an iterable, optional 33 | Determines the cross-validation splitting strategy. 34 | Possible inputs for cv are: 35 | - None, to use the default 3-fold cross-validation, 36 | - integer, to specify the number of folds. 37 | - An object to be used as a cross-validation generator. 38 | - An iterable yielding train/test splits. 39 | 40 | For integer/None inputs, if ``y`` is binary or multiclass, 41 | :class:`StratifiedKFold` used. If the estimator is not a classifier 42 | or if ``y`` is neither binary nor multiclass, :class:`KFold` is used. 43 | 44 | Refer :ref:`User Guide ` for the various 45 | cross-validators that can be used here. 46 | 47 | n_jobs : integer, optional 48 | Number of jobs to run in parallel (default 1). 49 | """ 50 | plt.figure() 51 | plt.title(title) 52 | if ylim is not None: 53 | plt.ylim(*ylim) 54 | plt.xlabel("Training examples") 55 | plt.ylabel("Score") 56 | 57 | train_sizes, train_scores, test_scores = learning_curve( 58 | estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, 59 | scoring=scoring) 60 | 61 | train_scores_mean = np.mean(train_scores, axis=1) 62 | train_scores_std = np.std(train_scores, axis=1) 63 | test_scores_mean = np.mean(test_scores, axis=1) 64 | test_scores_std = np.std(test_scores, axis=1) 65 | 66 | plt.grid() 67 | 68 | plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 69 | train_scores_mean + train_scores_std, alpha=0.1, 70 | color="r") 71 | plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 72 | test_scores_mean + test_scores_std, alpha=0.1, color="g") 73 | plt.plot(train_sizes, train_scores_mean, 'o-', color="r", 74 | label="Training score") 75 | plt.plot(train_sizes, test_scores_mean, 'o-', color="g", 76 | label="Cross-validation score") 77 | 78 | plt.legend(loc="best") 79 | 80 | return plt 81 | 82 | def example_learning_curve(): 83 | from sklearn.naive_bayes import GaussianNB 84 | from sklearn.svm import SVC 85 | from sklearn.datasets import load_digits 86 | 87 | digits = load_digits() 88 | X, y = digits.data, digits.target 89 | 90 | 91 | title = "Learning Curves (Naive Bayes)" 92 | # Cross validation with 100 iterations to get smoother mean test and train 93 | # score curves, each time with 20% data randomly selected as a validation set. 
94 | cv = ShuffleSplit(n_splits=100, test_size=0.2, random_state=0) 95 | 96 | estimator = GaussianNB() 97 | plot_learning_curve(estimator, title, X, y, ylim=(0.7, 1.01), cv=cv, n_jobs=4) 98 | 99 | title = "Learning Curves (SVM, RBF kernel, $\gamma=0.001$)" 100 | # SVC is more expensive so we do a lower number of CV iterations: 101 | cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0) 102 | estimator = SVC(gamma=0.001) 103 | plot_learning_curve(estimator, title, X, y, (0.7, 1.01), cv=cv, n_jobs=4) 104 | 105 | plt.show() 106 | -------------------------------------------------------------------------------- /ebay-api-scraper/.ipynb_checkpoints/datetime test-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "data": { 14 | "text/plain": [ 15 | "'2017-03-26T00:00:00'" 16 | ] 17 | }, 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "#get datetime in datetime in ISO-8601 format\n", 25 | "import datetime \n", 26 | "\n", 27 | "'2017'\n", 28 | "date = datetime.datetime(2017,3,26)\n", 29 | "\n", 30 | "date.isoformat()\n", 31 | "# datetime.datetime.now().isoformat()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": { 38 | "collapsed": false, 39 | "deletable": true, 40 | "editable": true 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "2017-03-26T00:00:000Z\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "time_string = '2017-03-26'\n", 53 | "print datetime.datetime.strptime(time_string, \"%Y-%m-%d\").isoformat() + '0Z'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 2", 69 | "language": "python", 70 | "name": "python2" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 2 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython2", 82 | "version": "2.7.13" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /ebay-api-scraper/.ipynb_checkpoints/scrapy-development-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 2", 16 | "language": "python", 17 | "name": "python2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 2 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.13" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /ebay-api-scraper/common.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | 9 | def dump(api, full=False): 10 | 11 | print("\n") 12 | 13 | if api.warnings(): 14 | print("Warnings" + api.warnings()) 15 | 16 | if api.response.content: 17 | print("Call Success: %s in length" % len(api.response.content)) 18 | 19 | print("Response code: %s" % api.response_code()) 20 | print("Response DOM1: %s" % api.response_dom()) # deprecated 21 | print("Response ETREE: %s" % api.response.dom()) 22 | 23 | if full: 24 | print(api.response.content) 25 | print(api.response.json()) 26 | print("Response Reply: %s" % api.response.reply) 27 | else: 28 | dictstr = "%s" % api.response.dict() 29 | # print("Response dictionary: %s..." % dictstr[:150]) 30 | print("Response dictionary: %s..." % dictstr[:]) 31 | replystr = "%s" % api.response.reply 32 | # print("Response Reply: %s" % replystr[:150]) 33 | print("Response Reply: %s" % replystr[:]) 34 | -------------------------------------------------------------------------------- /ebay-api-scraper/common.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/common.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/create-ebay-tables.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import sys 3 | 4 | numArgs = len(sys.argv) 5 | if numArgs < 1 or numArgs > 5: 6 | print 'ERROR: Not enough arguments. Please input "host",user",dbname","tablename" as arguments.' 
7 | sys.exit() 8 | 9 | (host, user, dbname, tablename) = tuple(sys.argv[1:]) 10 | 11 | # dbname='test-db1' 12 | # tablename='tablename' 13 | # user='nathan' 14 | # host='localhost' 15 | 16 | print (host, user, dbname, tablename) 17 | 18 | try: 19 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 20 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 21 | except: 22 | print "I am unable to connect to the database" 23 | 24 | cur = conn.cursor() 25 | 26 | SQL = ''' 27 | CREATE TABLE {tablename} ( 28 | "id" SERIAL PRIMARY KEY, 29 | "timestamp" TIMESTAMP WITH TIME ZONE, 30 | "itemId" BIGINT, 31 | "topRatedListing" BOOLEAN, 32 | "globalId" TEXT, 33 | "title" TEXT, 34 | "subtitle" TEXT, 35 | "country" TEXT, 36 | 37 | "primaryCategory.categoryId" INTEGER, 38 | "primaryCategory.categoryName" TEXT, 39 | "secondaryCategory.categoryId" TEXT, 40 | "secondaryCategory.categoryName" TEXT, 41 | "pictureURLLarge" TEXT, 42 | "galleryURL" TEXT, 43 | 44 | "sellerInfo.feedbackRatingStar" TEXT, 45 | "sellerInfo.feedbackScore" INTEGER, 46 | "sellerInfo.positiveFeedbackPercent" DECIMAL, 47 | "sellerInfo.sellerUserName" TEXT, 48 | "sellerInfo.topRatedSeller" BOOLEAN, 49 | "shippingInfo.expeditedShipping" BOOLEAN, 50 | "shippingInfo.shipToLocations" TEXT, 51 | "shippingInfo.shippingServiceCost.value" DECIMAL, 52 | "shippingInfo.oneDayShippingAvailable" BOOLEAN, 53 | "shippingInfo.handlingTime" SMALLINT, 54 | "shippingInfo.shippingType" TEXT, 55 | 56 | "autoPay" BOOLEAN, 57 | "location" TEXT, 58 | "postalCode" INTEGER, 59 | "returnsAccepted" BOOLEAN, 60 | "viewItemURL" TEXT, 61 | 62 | "sellingStatus.currentPrice.value" DECIMAL, 63 | "startprice" DECIMAL, 64 | "endPrice" DECIMAL, 65 | "sellingStatus.bidCount" SMALLINT, 66 | "sellingStatus.sellingState" TEXT, 67 | "paymentMethod" TEXT, 68 | 69 | "isMultiVariationListing" BOOLEAN, 70 | 71 | "condition" TEXT, 72 | "condition.conditionId" INTEGER, 73 | "condition.conditionDisplayName" TEXT, 74 | "listingInfo.listingType" TEXT, 75 | "listingInfo.gift" BOOLEAN, 76 | "listingInfo.bestOfferEnabled" BOOLEAN, 77 | "listingInfo.buyItNowAvailable" BOOLEAN, 78 | "listingInfo.buyItNowPrice.value" DECIMAL, 79 | "listingInfo.startTime" TIMESTAMP WITH TIME ZONE, 80 | "listingInfo.endTime" TIMESTAMP WITH TIME ZONE, 81 | "conditiondescription" TEXT 82 | ) 83 | '''.format(tablename=tablename) 84 | 85 | cur.execute(SQL) 86 | 87 | conn.commit() 88 | cur.close() 89 | conn.close() 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /ebay-api-scraper/datetime test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false, 8 | "deletable": true, 9 | "editable": true 10 | }, 11 | "outputs": [ 12 | { 13 | "data": { 14 | "text/plain": [ 15 | "'2017-03-26T00:00:00'" 16 | ] 17 | }, 18 | "execution_count": 1, 19 | "metadata": {}, 20 | "output_type": "execute_result" 21 | } 22 | ], 23 | "source": [ 24 | "#get datetime in datetime in ISO-8601 format\n", 25 | "import datetime \n", 26 | "\n", 27 | "'2017'\n", 28 | "date = datetime.datetime(2017,3,26)\n", 29 | "\n", 30 | "date.isoformat()\n", 31 | "# datetime.datetime.now().isoformat()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 5, 37 | "metadata": { 38 | "collapsed": false, 39 | "deletable": true, 40 | "editable": true 41 | }, 42 
| "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "2017-03-26T00:00:000Z\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "time_string = '2017-03-26'\n", 53 | "print datetime.datetime.strptime(time_string, \"%Y-%m-%d\").isoformat() + '0Z'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [] 64 | } 65 | ], 66 | "metadata": { 67 | "kernelspec": { 68 | "display_name": "Python 2", 69 | "language": "python", 70 | "name": "python2" 71 | }, 72 | "language_info": { 73 | "codemirror_mode": { 74 | "name": "ipython", 75 | "version": 2 76 | }, 77 | "file_extension": ".py", 78 | "mimetype": "text/x-python", 79 | "name": "python", 80 | "nbconvert_exporter": "python", 81 | "pygments_lexer": "ipython2", 82 | "version": "2.7.13" 83 | } 84 | }, 85 | "nbformat": 4, 86 | "nbformat_minor": 2 87 | } 88 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay.yaml: -------------------------------------------------------------------------------- 1 | # eBay SDK Defaults 2 | 3 | name: ebay_api_config 4 | 5 | 6 | # Trading API Sandbox - https://www.x.com/developers/ebay/products/trading-api 7 | api.sandbox.ebay.com: 8 | compatibility: 719 9 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 10 | # token: AgAAAA**AQAAAA**aAAAAA**yO/RWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkIahDJWBqQ+dj6x9nY+seQ**jaoDAA**AAMAAA**1nvC7MyyoPnKbCiKLRFPOw4kdtLxsj04ZhZz85P15OQgKol4c3uBlaqDvzucVMe427r9T93vTBZVhZFn0vpWP08kYe374UD/VXzwQPAiDFBBbrKKE88idiqhrVSYRl9xcBLGE4wpanWHYoDs3AQ8go17il6XAfFYry+iHJIMEDKo4ikwhIRp7AVmhqt4K0+U1RtJsxjxYxWnH0GLrWuwSFWpu/XnitRibO5OC/o0p9ryf2v7suR2lSUbQ2VSqBXDMJG1I5vcfT+118lzaU3mpVugIU6dG2+k2v+t/IswpTvg9pv/XD1U5usgwWJWQStZG/6kT358OqkhPwJF56HtK31vEDnsL9SQ+p9JY/pIcoQIz1u6XaYlZfersZvwAJLgsF44jzgu+Y8WUUnvCMH9v7tNZ/2zhY/rIuBWrK5GuobpZ2kiu5SYiBBuJeWiBysvsAGWuS9a+VzImG1b+s/jQJGyiuUmjs8aCj5KzDWjSTvbBzWpsHWQaVRCrVqiZiv+mJ7hRqcpXNXUiNIzKBVX9tfEiYnC6gykTl5VMqLBzM26eQt1R5fnyWItbXS5r0+pm1otJi+YdFxm8d+b4174YMmCIBdjv9ceJL7uS/ssNnnVeqk3FptiNFlW9tf+PYt9KfysCi6N7NtRRWH4/oQ6Zpj+4RSFc0XJsTBkey1FNcm4H4zSz7cqmnx4i695nA3HaMDqASKsNIUfzTMjN2BGoTT2xFmHef9VriHDGcwqgCOPSbuWeuuTjbTUr6u0jNFj 11 | 12 | appid: NathanZo-ebaypric-PRD-4090fc79c-86f1fb32 13 | token: AgAAAA**AQAAAA**aAAAAA**nnLyWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkYGoCZWBpwydj6x9nY+seQ**HrMDAA**AAMAAA**3T8/OrOs7miNFtluqR31OFpfd4Y+apVBX9Q00gftP2wHe+gdw1G1c6+cNnWi1v7LZFsTeHfE1tAAhJT5CcHiRr8k2EMLqxB3Hwqj8P27tOnJm9otP4/WvS680f9GR6KDNyVSTTCaGXqqTxSu2Nibb8nx5q9jTeq5DoLlOS24+SG8eyq6rc6nGOuqRFP4ki/bpH3EQMACiZEOQyN5Zuvt8ubk/ogJMKnscRNGnIxI1G+nfdteEaQQO8Lv/nk4nof9fA2S+65m11dCpFpxy/RO+zM3+8a7N49FLC3/j3fH3jxbv2RgDbnJtK3YJUC9Ypa9cc6PyGY8caGhA1G3FNELE9FP4+bCTqHj+SaoOpOAT41yobrLmTan4/YmC5FfVYjG9wfPlzuZkiuL0sV/eeRvTXUcbO6ImUBYeVXpYKwNUPZ30qGC/SDFst1UeXDDVtOY5cqEZmeQOs0V7hVHmIBGpsaNLx/2ItQ8WQQoPn8X8YjEYtGPP3UX8yjqVkao/nPOuIYnWkwl1SaYufsFCVkwxhKGRg1ZZcjAezy2kk9HXym8p7dpV0J367Y+K2qUByDIv6tkihXk6KWYXLcQgtHfJ2wkjpx4ItGusVIQ5Kp2p+LflzBnQ4VWgSn/sLlmf0cP1aWhUt8qvOd+sgxc4oAuD3tRyB3zyQO6Tw8Cp3WEg9/fsdv6csxM/dHkDjk30D859uE3R2HNwP7OPrOdb70h64NFvxGUjE4ib9lch5yR9K1pWjiyeb2dANGMLPeDsPKz 14 | 15 | 16 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 17 | appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 18 | 19 | certid: PRD-bed4d450833b-99a1-4224-9e6e-1e94 20 | devid: 0cc86638-a322-4390-8da7-76b3800fa795 21 | 22 | 23 | # Trading API - https://www.x.com/developers/ebay/products/trading-api 24 | api.ebay.com: 25 | 
version: 719 26 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 27 | # token: AgAAAA**AQAAAA**aAAAAA**yO/RWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkIahDJWBqQ+dj6x9nY+seQ**jaoDAA**AAMAAA**1nvC7MyyoPnKbCiKLRFPOw4kdtLxsj04ZhZz85P15OQgKol4c3uBlaqDvzucVMe427r9T93vTBZVhZFn0vpWP08kYe374UD/VXzwQPAiDFBBbrKKE88idiqhrVSYRl9xcBLGE4wpanWHYoDs3AQ8go17il6XAfFYry+iHJIMEDKo4ikwhIRp7AVmhqt4K0+U1RtJsxjxYxWnH0GLrWuwSFWpu/XnitRibO5OC/o0p9ryf2v7suR2lSUbQ2VSqBXDMJG1I5vcfT+118lzaU3mpVugIU6dG2+k2v+t/IswpTvg9pv/XD1U5usgwWJWQStZG/6kT358OqkhPwJF56HtK31vEDnsL9SQ+p9JY/pIcoQIz1u6XaYlZfersZvwAJLgsF44jzgu+Y8WUUnvCMH9v7tNZ/2zhY/rIuBWrK5GuobpZ2kiu5SYiBBuJeWiBysvsAGWuS9a+VzImG1b+s/jQJGyiuUmjs8aCj5KzDWjSTvbBzWpsHWQaVRCrVqiZiv+mJ7hRqcpXNXUiNIzKBVX9tfEiYnC6gykTl5VMqLBzM26eQt1R5fnyWItbXS5r0+pm1otJi+YdFxm8d+b4174YMmCIBdjv9ceJL7uS/ssNnnVeqk3FptiNFlW9tf+PYt9KfysCi6N7NtRRWH4/oQ6Zpj+4RSFc0XJsTBkey1FNcm4H4zSz7cqmnx4i695nA3HaMDqASKsNIUfzTMjN2BGoTT2xFmHef9VriHDGcwqgCOPSbuWeuuTjbTUr6u0jNFj 28 | 29 | appid: NathanZo-ebaypric-PRD-4090fc79c-86f1fb32 30 | token: AgAAAA**AQAAAA**aAAAAA**nnLyWA**nY+sHZ2PrBmdj6wVnY+sEZ2PrA2dj6ACkYGoCZWBpwydj6x9nY+seQ**HrMDAA**AAMAAA**3T8/OrOs7miNFtluqR31OFpfd4Y+apVBX9Q00gftP2wHe+gdw1G1c6+cNnWi1v7LZFsTeHfE1tAAhJT5CcHiRr8k2EMLqxB3Hwqj8P27tOnJm9otP4/WvS680f9GR6KDNyVSTTCaGXqqTxSu2Nibb8nx5q9jTeq5DoLlOS24+SG8eyq6rc6nGOuqRFP4ki/bpH3EQMACiZEOQyN5Zuvt8ubk/ogJMKnscRNGnIxI1G+nfdteEaQQO8Lv/nk4nof9fA2S+65m11dCpFpxy/RO+zM3+8a7N49FLC3/j3fH3jxbv2RgDbnJtK3YJUC9Ypa9cc6PyGY8caGhA1G3FNELE9FP4+bCTqHj+SaoOpOAT41yobrLmTan4/YmC5FfVYjG9wfPlzuZkiuL0sV/eeRvTXUcbO6ImUBYeVXpYKwNUPZ30qGC/SDFst1UeXDDVtOY5cqEZmeQOs0V7hVHmIBGpsaNLx/2ItQ8WQQoPn8X8YjEYtGPP3UX8yjqVkao/nPOuIYnWkwl1SaYufsFCVkwxhKGRg1ZZcjAezy2kk9HXym8p7dpV0J367Y+K2qUByDIv6tkihXk6KWYXLcQgtHfJ2wkjpx4ItGusVIQ5Kp2p+LflzBnQ4VWgSn/sLlmf0cP1aWhUt8qvOd+sgxc4oAuD3tRyB3zyQO6Tw8Cp3WEg9/fsdv6csxM/dHkDjk30D859uE3R2HNwP7OPrOdb70h64NFvxGUjE4ib9lch5yR9K1pWjiyeb2dANGMLPeDsPKz 31 | 32 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 33 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 34 | 35 | certid: PRD-bed4d450833b-99a1-4224-9e6e-1e94 36 | devid: 0cc86638-a322-4390-8da7-76b3800fa795 37 | 38 | 39 | # Finding API - https://www.x.com/developers/ebay/products/finding-api 40 | svcs.ebay.com: 41 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 42 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 43 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 44 | 45 | version: 1.13.0 46 | 47 | # Shopping API - https://www.x.com/developers/ebay/products/shopping-api 48 | open.api.ebay.com: 49 | # appid: nathanzo-ebaypric-PRD-cbed4d450-05d217d8 50 | # appid: YunusGen-HelpforN-PRD-b08f655c9-9bfb3bd0 51 | # appid: TaylorKi-taylorsh-PRD-f0902ebf9-31c2395b 52 | version: 671 53 | 54 | # Optional affiliate tracking 55 | # http://developer.ebay.com/DevZone/shopping/docs/Concepts/ShoppingAPI_FormatOverview.html#StandardURLParameters 56 | trackingid: ENTER_YOUR_TRACKINGID_HERE 57 | trackingpartnercode: ENTER_YOUR_PARTNERCODE_HERE 58 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.py -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__init__.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/settings.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/__pycache__/settings.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class EbayScraperItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | itemId = scrapy.Field(default='NULL') 16 | conditionDescription = scrapy.Field(default='NULL') 17 | startPrice = scrapy.Field(default='NULL') 18 | endPrice = scrapy.Field(default='NULL') 19 | duration = scrapy.Field(default='NULL') 20 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/items.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import psycopg2 9 | import logging 10 | 11 | 12 | class EbayPostgresPipeline(object): 13 | 14 | def __init__(self, postgres_host,postgres_user,postgres_db,postgres_table): 15 | self.postgres_host=postgres_host 16 | self.postgres_user=postgres_user 17 | self.postgres_db=postgres_db 18 | self.postgres_table=postgres_table 19 | 20 | ''' 21 | The settings attribute is set in the base Spider class after the spider is 22 | initialized. If you want to use the settings before the initialization 23 | (e.g., in your spider’s __init__() method), you’ll need to override the 24 | from_crawler() method. 
25 | ''' 26 | @classmethod 27 | def from_crawler(cls, crawler): 28 | return cls( 29 | postgres_host=crawler.settings.get('POSTGRES_HOST'), 30 | postgres_user=crawler.settings.get('POSTGRES_USER'), 31 | postgres_db=crawler.settings.get('POSTGRES_DB'), 32 | postgres_table=crawler.settings.get('POSTGRES_TABLE'), 33 | ) 34 | 35 | 36 | def open_spider(self, spider): 37 | self.conn = psycopg2.connect("dbname={} user={} host={}".format(self.postgres_db, \ 38 | self.postgres_user, \ 39 | self.postgres_host) \ 40 | ) 41 | self.cur = self.conn.cursor() 42 | 43 | 44 | def process_item(self, item, spider): 45 | '''store data into postgres database 46 | 47 | ''' 48 | 49 | 50 | 51 | SQL = ''' 52 | UPDATE ONLY {table_name} as ci 53 | SET conditiondescription='{condition}', 54 | startprice={start_price} 55 | WHERE ci."itemId"={item_id}; 56 | '''.format( table_name=self.postgres_table, 57 | condition=item['conditionDescription'], 58 | start_price=item['startPrice'], 59 | item_id=item['itemId'] 60 | ) 61 | 62 | 63 | try: 64 | self.cur.execute(SQL) # execute SQL, and commit changes 65 | self.conn.commit() 66 | except: 67 | logging.debug('Error with executing SQL statement.\n SQL = {}'.format(SQL)) 68 | self.conn.rollback() 69 | 70 | 71 | return item 72 | 73 | 74 | def close_spider(self, spider): 75 | self.conn.close() 76 | self.cur.close() 77 | 78 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/pipelines.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/settings.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: utf-8 -*- 3 | 4 | # Scrapy settings for ebay_scraper project 5 | # 6 | # For simplicity, this file contains only settings considered important or 7 | # commonly used. 
You can find more settings consulting the documentation: 8 | # 9 | # http://doc.scrapy.org/en/latest/topics/settings.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 11 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 12 | import logging 13 | 14 | BOT_NAME = 'ebay_scraper' 15 | 16 | SPIDER_MODULES = ['ebay_scraper.spiders'] 17 | NEWSPIDER_MODULE = 'ebay_scraper.spiders' 18 | 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | # USER_AGENT = 'ebay_scraper (+http://www.yourdomain.com)' 22 | USER_AGENT = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0" 23 | 24 | # Obey robots.txt rules 25 | ROBOTSTXT_OBEY = False 26 | 27 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 28 | CONCURRENT_REQUESTS = 2 29 | 30 | # Configure a delay for requests for the same website (default: 0) 31 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 32 | # See also autothrottle settings and docs 33 | DOWNLOAD_DELAY = 1 # default = 0 34 | # The download delay setting will honor only one of: 35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 36 | # CONCURRENT_REQUESTS_PER_IP = 16 37 | 38 | # Disable cookies (enabled by default) 39 | COOKIES_ENABLED = False 40 | 41 | # Disable Telnet Console (enabled by default) 42 | #TELNETCONSOLE_ENABLED = False 43 | 44 | # Override the default request headers: 45 | #DEFAULT_REQUEST_HEADERS = { 46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 47 | # 'Accept-Language': 'en', 48 | #} 49 | 50 | # Enable or disable spider middlewares 51 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 52 | #SPIDER_MIDDLEWARES = { 53 | # 'ebay_scraper.middlewares.MyCustomSpiderMiddleware': 543, 54 | #} 55 | 56 | # Enable or disable downloader middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 58 | #DOWNLOADER_MIDDLEWARES = { 59 | # 'ebay_scraper.middlewares.MyCustomDownloaderMiddleware': 543, 60 | #} 61 | 62 | # Enable or disable extensions 63 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 64 | #EXTENSIONS = { 65 | # 'scrapy.extensions.telnet.TelnetConsole': None, 66 | #} 67 | 68 | # Configure item pipelines 69 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 70 | ITEM_PIPELINES = { 71 | 'ebay_scraper.pipelines.EbayPostgresPipeline': 300, 72 | } 73 | # set up the pipeline settings for postgres 74 | POSTGRES_HOST = "localhost" 75 | POSTGRES_USER = "nathan" 76 | POSTGRES_DB = "ebay" 77 | POSTGRES_TABLE = "completed_items_v2" 78 | # POSTGRES_TABLE = "completed_items" 79 | # POSTGRES_TABLE = "scrapy_test" 80 | 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 84 | AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | AUTOTHROTTLE_START_DELAY = 0.5 87 | # The maximum download delay to be set in case of high latencies 88 | AUTOTHROTTLE_MAX_DELAY = 2 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server. 
High value -> High speed 91 | AUTOTHROTTLE_TARGET_CONCURRENCY = 2 92 | # Enable showing throttling stats for every response received: 93 | # AUTOTHROTTLE_DEBUG = True 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 97 | #HTTPCACHE_ENABLED = True 98 | #HTTPCACHE_EXPIRATION_SECS = 0 99 | #HTTPCACHE_DIR = 'httpcache' 100 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 101 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 102 | 103 | # LOGGING 104 | # LOG_FILE = '/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay_scraper/ebay_spider_log.log' 105 | # LOG_ENABLED = True 106 | # LOG_LEVEL = logging.ERROR 107 | 108 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/settings.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__init__.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/ebay_spider.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/__pycache__/ebay_spider.cpython-35.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | # from items import EbayScraperItem 3 | import items 4 | import psycopg2 5 | import logging 6 | from scrapy.utils.log import configure_logging 7 | from pprint import pprint 8 | 9 | class EbaySpider(scrapy.Spider): 10 | name = "ebay" 11 | 12 | def __init__(self, url_start_index=0, *args, **kwargs): 13 | super(EbaySpider, self).__init__(*args, **kwargs) # don't know what this does, but saw it in 
documentation 14 | self.url_start_index = int(url_start_index) 15 | 16 | 17 | def start_requests(self): 18 | 19 | 20 | 21 | #--- Connect to ebay database, grab itemId, URL 22 | postgres_host = self.crawler.settings.get('POSTGRES_HOST') 23 | postgres_user = self.crawler.settings.get('POSTGRES_USER') 24 | postgres_db = self.crawler.settings.get('POSTGRES_DB') 25 | postgres_table = self.crawler.settings.get('POSTGRES_TABLE') 26 | 27 | conn = psycopg2.connect("dbname={} user={} host={}".format(postgres_db, postgres_user, postgres_host)) 28 | cur = conn.cursor() 29 | 30 | # Start scraping at item in database that is furthest back in time 31 | # That way, we can always pick up scraping where we left off, and even if we put 32 | # new data into table, we don't overwrite it in a new scrape. 33 | SQL = ''' 34 | SELECT ci."itemId", ci."viewItemURL", ci."listingInfo.listingType" 35 | FROM {tablename} as ci 36 | ORDER BY ci."timestamp" ASC; 37 | '''.format(tablename=postgres_table) 38 | cur.execute(SQL) 39 | urls = [(str(url), listingType) for itemId,url,listingType in cur.fetchall()] 40 | num_urls_total = len(urls) 41 | urls = urls[self.url_start_index:] # limit scraping to only the indeces we care about. we could do this in SQL, and we should make that change later 42 | 43 | 44 | # ---- HARDCODED FOR DEV/TESTING PURPOSES ---- # 45 | # urls = [ 46 | # "http://offer.ebay.com/ws/eBayISAPI.dll?ViewBids&item=192140341983&rt=nc&_trksid=p2047675.l2565" 47 | # 'http://www.ebay.com/itm/Nikon-D750-24-3-MP-Digital-SLR-Camera-Black-Body-Only-Used-/222447550032', 48 | # 'http://www.ebay.com/itm/Canon-EOS-5D-Mark-II-24-105mm-Lens-and-Camera-Bag-/272592893520', 49 | # 'http://www.ebay.com/itm/DJI-Inspire-1-V1-0-4K-X3-Camera-and-3-Axis-Gimbal-Drone-Quadcopter-Extras-/302257646034', 50 | # 'http://www.ebay.com/itm/Samsung-NX-NX1-28-2-MP-Digital-Camera-Black-Kit-w-50-200mm-OIS-Lens-/222445254405', 51 | # 'http://www.ebay.com/itm/Panasonic-AJ-HDC27F-2-3-HD-DVCPRO-Varicam-Video-Camera-Camcorder-w-Viewfinder-/142319141084', 52 | # 'http://www.ebay.com/itm/High-Speed-Pin-Registered-Super-8-Cartridge-Camera-Very-Rare-Logmar-Wilcam-/252816500866', 53 | # 'http://www.ebay.com/itm/Carl-Zeiss-Planar-T-80mm-f-2-AF-Lens-Contax-645-camera-/332163276401', 54 | # 'http://www.ebay.com/itm/DJI-Mavic-Pro-Folding-Drone-4K-Stabilized-Camera-Active-Track-Avoidance-GPS-/252821264198', 55 | # 'http://www.ebay.com/itm/Nikon-D40-6-1MP-Digital-SLR-Camera-Black-Kit-w-AF-S-DX-18-55mm-Lens-/262891375158' 56 | # ] 57 | # urls = [("http://www.ebay.com/itm/Canon-EOS-7D-18-0-MP-Digital-SLR-Camera-Black-Body-Only-/192140341983",'Auction')] 58 | 59 | # THIS CAN RETURN A GENERATOR or "LIST OF REQUESTS" 60 | # https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests 61 | for i,tup in enumerate(urls): 62 | logging.debug("scraping #{} out of {} urls.".format(i+self.url_start_index, num_urls_total)) 63 | url = tup[0] 64 | listingType = tup[1] 65 | yield scrapy.Request(url=url, callback=self.parse, meta={'listingType':listingType}) # after yielding the request, scrapy will go and download the url, and then call the callback function 66 | 67 | 68 | 69 | def parse(self, response): 70 | 71 | item = items.EbayScraperItem() 72 | 73 | listingType = response.meta['listingType'] 74 | 75 | # Item condition 76 | item_condition_xpath = "//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()" 77 | item['conditionDescription'] = str(response.xpath(item_condition_xpath) \ 78 | .extract_first(default='NULL') \ 79 | 
).decode('unicode_escape') \ 80 | .encode('ascii','ignore') \ 81 | .replace("\'","") 82 | 83 | # Item ID 84 | item_id_xpath = "//div[@id='descItemNumber']/text()" 85 | item['itemId'] = int(response.xpath(item_id_xpath).extract_first()) 86 | 87 | 88 | if listingType == 'Auction' or listingType == 'AuctionWithBIN': 89 | bid_count = int(response.xpath("//a[@id='vi-VR-bid-lnk']/span[1]/text()").extract_first()) 90 | bid_history_url = response.xpath("//a[@id='vi-VR-bid-lnk']/@href").extract_first() 91 | 92 | if bid_history_url != None: 93 | 94 | if bid_count > 0: # this prevents us from making an unecessary requests if there is no startPrice (because no bids) 95 | 96 | logging.debug('bid_history_url = {}'.format(bid_history_url)) 97 | logging.debug('bid_count = {}'.format(bid_count)) 98 | 99 | return scrapy.Request(url=bid_history_url, callback=self.parse_start_price, meta={'item':item}) 100 | 101 | else: # if the item had 0 bids 102 | item['startPrice'] = float(str(response.xpath("//span[@class='notranslate vi-VR-cvipPrice']/text()").extract_first()).split('$')[1].replace(',','')) 103 | item['duration'] = 'NULL' 104 | item['endPrice'] = 'NULL' 105 | return item # don't request a new url, just send item to pipeline.py 106 | 107 | else: # 'FixedPrice' or 'StoreInventory' 108 | item['endPrice'] = float(str(response.xpath("//span[@id='prcIsum']/text()").extract_first()).split('$')[1].replace(',','')) 109 | item['startPrice'] = 'NULL' 110 | item['duration'] = 'NULL' 111 | return item 112 | 113 | 114 | 115 | def parse_start_price(self, response): 116 | 117 | item = response.meta['item'] # grab item attribute from response 118 | 119 | 120 | # item end price - I don't think we need this, because the endPrice is given in findCOmpletedItems 121 | 122 | # end_price_xpath = "//div[2]/table/tbody/tr[2]/td/table/tbody/tr[2]/td/table/tbody/tr/td[@class='BHctBidVal']/text()" 123 | # item['endPrice'] = float(str(response.xpath(end_price_xpath).extract_first()).split('$')[1].replace(',','')) 124 | item['endPrice'] = 'NULL' 125 | 126 | # Item duration 127 | duration_xpath = "//span[@class='titleValueFont'][4]/text()" 128 | item['duration'] = str(response.xpath(duration_xpath).extract_first()) \ 129 | .decode('unicode_escape') \ 130 | .encode('ascii','ignore') \ 131 | .split('\r')[0] 132 | 133 | 134 | # Item start price - ebay has (at least) 2 different types of HTML pages for the startPrice info 135 | # try grabbing first xpath 136 | start_price_xpath = "//tr[@id='viznobrd']/td[@class='contentValueFont'][1]/text()" 137 | startPrice = response.xpath(start_price_xpath).extract_first(default='NULL') 138 | 139 | logging.debug("startPrice = {}".format(startPrice)) 140 | 141 | if startPrice != 'NULL': # the first x path worked 142 | startPrice = float(startPrice.split('$')[1].replace(',','')) 143 | item['startPrice'] = startPrice 144 | return item 145 | 146 | # Try grabbing the second xpath if the first xpath didn't work 147 | start_price_xpath = "//table[@id='w2-w3-w0-w0']" 148 | 149 | for item in response.xpath(start_price_xpath).extract(): 150 | logging.debug('item in response.xpath() SECOND PATH = {}'.format(item)) 151 | 152 | startPrice = response.xpath(start_price_xpath).extract_first(default='NULL') 153 | if startPrice != 'NULL': # if the 2nd xpath worked... 
154 | logging.debug('url = {}'.format(response.url)) 155 | logging.debug('SECOND XPATH => startPrice = {}'.format(startPrice)) 156 | startPrice = startPrice.split('$')[-1] # take the last entry in the table, which is something like: 80.0023 Mar 2017 at 1:23:58PM PDT 157 | startPrice = '.'.join([startPrice.split('.')[0], startPrice.split('.')[1][:2]]) # take numbers before decimal and concatenate with 2 digits after decimal 158 | startPrice = startPrice.replace(',','') 159 | item['startPrice'] = float(startPrice) 160 | return item 161 | 162 | 163 | logging.debug('response.url = {}'.format(response.url)) 164 | logging.debug('startPrice = {}'.format(startPrice)) 165 | logging.debug('itemId = {}'.format(item['itemId'])) 166 | 167 | # if the first 2 xpaths didn't work... DEBUG 168 | item['startPrice'] = 'NULL' 169 | return item 170 | 171 | 172 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NathanZorndorf/ebay-price-predictor/9e2055fd7b1c96c6715d7dcbd2882da4403048c5/ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider.pyc -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/ebay_scraper/spiders/ebay_spider_v2.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy.spiders import CrawlSpider, Rule 3 | from scrapy.linkextractors import LinkExtractor 4 | # import items 5 | from items import EbayScraperItem 6 | import psycopg2 7 | import logging 8 | from scrapy.utils.log import configure_logging 9 | from pprint import pprint 10 | 11 | class EbaySpider(CrawlSpider): 12 | name = "ebay_crawl_spider" 13 | 14 | custom_settings = { 15 | 'POSTGRES_TABLE':"completed_items_v2", 16 | 'AUTOTHROTTLE_TARGET_CONCURRENCY':1, 17 | 'DONWLOAD_DELAY':0.8, 18 | 'ITEM_PIPELINES': { 19 | 'ebay_scraper.pipelines.EbayPostgresPipeline': 300, 20 | } 21 | } 22 | 23 | 24 | def __init__(self, url_start_index=0, url_end_index=0, *args, **kwargs): 25 | super(EbaySpider, self).__init__(*args, **kwargs) # don't know what this does, but saw it in documentation 26 | self.url_start_index = int(url_start_index) 27 | self.url_end_index = int(url_end_index) 28 | 29 | 30 | def start_requests(self): 31 | 32 | #--- Connect to ebay database, grab itemId, URL 33 | postgres_host = self.crawler.settings.get('POSTGRES_HOST') 34 | postgres_user = self.crawler.settings.get('POSTGRES_USER') 35 | postgres_db = self.crawler.settings.get('POSTGRES_DB') 36 | postgres_table = self.crawler.settings.get('POSTGRES_TABLE') 37 | 38 | conn = psycopg2.connect("dbname={} user={} host={}".format(postgres_db, postgres_user, postgres_host)) 39 | cur = conn.cursor() 40 | 41 | # Start scraping at item in database that is furthest back in time 42 | # That way, we can always pick up scraping where we left off, and even if we put 43 | # new data into table, we don't overwrite it in a new scrape. 
44 | SQL = ''' 45 | SELECT ci."itemId", ci."viewItemURL" 46 | FROM {tablename} as ci 47 | ORDER BY ci."timestamp" ASC; 48 | '''.format(tablename=postgres_table) 49 | cur.execute(SQL) 50 | 51 | urls = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 52 | if self.url_end_index == 0: 53 | self.url_end_index = len(urls) 54 | num_urls_total = len(urls) 55 | urls = urls[self.url_start_index:self.url_end_index] # limit scraping to only the indeces we care about. we could do this in SQL, and we should make that change later 56 | 57 | for i,(itemId,url) in enumerate(urls): 58 | logging.debug("scraping #{} out of {} urls.".format(i+self.url_start_index, num_urls_total)) 59 | yield scrapy.Request(url=url, callback=self.parse, meta={'itemId':itemId, 'dont_redirect':True}) # after yielding the request, scrapy will go and download the url, and then call the callback function 60 | 61 | # ---- HARDCODED FOR DEV/TESTING PURPOSES ---- # 62 | # urls = ["http://www.ebay.com/itm/NIKON-D-DF-16-2-MP-DIGITAL-SLR-CAMERA-SILVER-KIT-AF-S-MICRO-60MM-LENS-/291997000430", 63 | # "http://www.ebay.com/itm/Canon-T3i-Body-And-Kit-/272591253524", 64 | # "http://www.ebay.com/itm/Fujifilm-FinePix-XP-XP70-16-4MP-Waterproof-Digital-Camera-5X-Optical-Zoom-/112242371882", 65 | # "http://www.ebay.com/itm/Canon-Minolta-and-Pentax-Cameras-2-Bags-9-Lenses-and-Filters-/292024262357" 66 | # ] 67 | # for url in urls: 68 | # yield scrapy.Request(url=url, callback=self.parse) # after yielding the request, scrapy will go and download the url, and then call the callback function 69 | 70 | 71 | 72 | def parse(self, response): 73 | 74 | item = EbayScraperItem() 75 | item['itemId'] = response.meta['itemId'] 76 | 77 | # Get condition 78 | item['conditionDescription'] = response.xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()")\ 79 | .extract_first(default='NULL')\ 80 | .encode('ascii','ignore')\ 81 | .replace('\'', '') 82 | 83 | 84 | # Scrape bid history URL in order to get startPrice 85 | bid_history_url = response.xpath("//a[@id='vi-VR-bid-lnk']/@href").extract_first() 86 | if bid_history_url != None: 87 | bid_count = int(response.xpath("//a[@id='vi-VR-bid-lnk']/span[1]/text()").extract_first()) 88 | if bid_count > 0: 89 | return scrapy.Request(url=bid_history_url, callback=self.parse_start_price, meta={'item':item}) 90 | else: 91 | item['startPrice'] = 'NULL' 92 | return item 93 | else: 94 | item['startPrice'] = 'NULL' 95 | return item 96 | 97 | 98 | def parse_start_price(self, response): 99 | 100 | item = response.meta['item'] 101 | 102 | # 1st xpath attempt 103 | startPrice = response.xpath("//tr[@id='viznobrd']/td[@class='contentValueFont'][1]/text()").extract_first() 104 | if startPrice != None: 105 | item['startPrice'] = float(startPrice.split('$')[1]) 106 | return item 107 | 108 | # 2nd xpath attempt 109 | # startPrice = response.xpath("//span/span/text()").extract()[-3] 110 | bid_history_items = response.xpath("//span/text()").extract() 111 | if bid_history_items: 112 | for i,text in enumerate(bid_history_items): 113 | if text == 'Starting Price': 114 | startPrice = bid_history_items[i+1] 115 | item['startPrice'] = float(startPrice.replace('$','')) 116 | return item 117 | 118 | 119 | 120 | logging.debug('1ST AND 2ND XPATH DID NOT WORK.\n itemId = {}\n'.format(item['itemId'])) 121 | item['startPrice'] = 'NULL' 122 | 123 | return item 124 | 125 | 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /ebay-api-scraper/ebay_scraper/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = ebay_scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = ebay_scraper 12 | -------------------------------------------------------------------------------- /ebay-api-scraper/find-completed-listing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from optparse import OptionParser 4 | import psycopg2 5 | from psycopg2.extensions import AsIs 6 | from collections import OrderedDict 7 | import datetime 8 | import pprint 9 | 10 | numArgs = len(sys.argv) 11 | if numArgs < 1 or numArgs > 7: 12 | print 'ERROR: Not enough arguments. Please input "host",user",dbname","tablename",minPrice,maxPrice as arguments.' 13 | sys.exit() 14 | 15 | (host, user, dbname, TABLE_NAME, minPrice, maxPrice) = tuple(sys.argv[1:]) 16 | 17 | pagesToQuery = int(input('Enter number of pages to query:')) 18 | entriesPerPage = int(input('Enter number of entries per page to query:')) 19 | pageStart = int(input('Enter page number to start at:')) 20 | 21 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) 22 | 23 | 24 | import ebaysdk 25 | from ebaysdk.finding import Connection as finding 26 | from ebaysdk.exception import ConnectionError 27 | 28 | def init_options(): 29 | usage = "usage: %prog [options]" 30 | parser = OptionParser(usage=usage) 31 | 32 | parser.add_option("-d", "--debug",action="store_true", dest="debug", default=False,help="Enabled debugging [default: %default]") 33 | 34 | parser.add_option("-y", "--yaml",dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 35 | help="Specifies the name of the YAML defaults file. [default: %default]") 36 | 37 | parser.add_option("-a", "--appid",dest="appid", default=None,help="Specifies the eBay application id to use.") 38 | 39 | (opts, args) = parser.parse_args() 40 | 41 | return opts, args 42 | 43 | 44 | def run(opts, pagesToQuery=1, entriesPerPage=1, pageStart=1): 45 | 46 | # --- set up query parameters ; COULD NOT GET THIS TO HAVE ANY AFFECT 47 | endTimeFrom = '2017-01-12 00:00:00' 48 | endTimeTo = '2017-04-4 00:00:00' 49 | endTimeFrom = datetime.datetime.strptime(endTimeFrom, "%Y-%m-%d %H:%M:%S").isoformat() + '.000Z' 50 | endTimeTo = datetime.datetime.strptime(endTimeTo, "%Y-%m-%d %H:%M:%S").isoformat() + '.000Z' 51 | print 'endTimeFrom:',endTimeFrom 52 | print 'endTimeTo:',endTimeTo 53 | 54 | 55 | # ------ CONNECT TO POSTGRES DATABSE ----- # 56 | # dbname='ebay' 57 | # user='nathan' 58 | # host='localhost' 59 | # TABLE_NAME = 'completed_items_15230_31388' 60 | 61 | try: 62 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 63 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 64 | except: 65 | print "ERROR: Unable to connect to the database." 
66 | sys.exit("Check database connection settings and try again.") 67 | 68 | cur = conn.cursor() 69 | 70 | # ------------ QUERY EBAY ---------------- # 71 | try: 72 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 73 | 74 | for pageNum in range(pageStart, pageStart+pagesToQuery+1): 75 | 76 | api_request = { 77 | # 'keywords': 'camera', 78 | 'categoryId' : '31388', # 31388 : Digital cameras 79 | 'itemFilter': [ 80 | {'name': 'LocatedIn', 'value': 'US'}, 81 | {'name': 'Currency', 'value':'USD'}, 82 | 83 | # {'name': 'Condition', 'value': 'Used'}, 84 | {'name': 'MinPrice', 'value': minPrice}, 85 | {'name': 'MaxPrice', 'value': maxPrice}, 86 | 87 | # {'name': 'ListingType', 'value':'Auction'}, 88 | # {'name': 'ListingType', 'value':'AuctionWithBIN'}, 89 | # {'name': 'ListingType', 'value':'FixedPrice'}, 90 | # {'name': 'SoldItemsOnly', 'value':'true'}, 91 | 92 | {'name': 'HideDuplicateItems', 'value':'true'}, 93 | 94 | # {'name': 'SellerBusinessType', 'value' : 'Private'}, 95 | 96 | {'name': 'EndTimeFrom', 'value': endTimeFrom}, 97 | {'name': 'EndTimeTo', 'value': endTimeTo} 98 | ], 99 | 'outputSelector': [ 100 | 'PictureURLLarge', 101 | 'SellerInfo', 102 | 'UnitPriceInfo' 103 | ], 104 | 'paginationInput': { 105 | 'entriesPerPage': entriesPerPage, # max = 100 106 | 'pageNumber': pageNum # execute the call with subsequent values for this field 107 | }, 108 | 'sortOrder' : 'EndTimeSoonest' 109 | } 110 | 111 | response = api.execute('findCompletedItems', api_request) 112 | 113 | dic = response.dict() 114 | 115 | # if failure, print detail s 116 | if dic['ack'] != 'Success': 117 | print 'ack: ',dic['ack'] 118 | print 'error message: ',dic['errorMessage'] 119 | 120 | if pageNum == 1: 121 | # print dic 122 | totalPages = dic['paginationOutput']['totalPages'] 123 | totalEntries = dic['paginationOutput']['totalEntries'] 124 | # _count = dic['searchResult']['_count'] 125 | print 'Total Pages = {}'.format(totalPages) 126 | print 'Total Entries = {}'.format(totalEntries) 127 | 128 | 129 | # print "dic['searchResult']['item'][0]:{}".format(dic['searchResult']['item'][0]) 130 | # pprint.pprint(dic['searchResult']['item'][0]) 131 | 132 | # ------ STORE EBAY DATA IN DICTIONARY ------ # 133 | ebay_data_dict = OrderedDict() 134 | 135 | timestamp = dic['timestamp'] # Example : '2017-03-25T01:58:10.520Z' 136 | ebay_data_dict['timestamp'] = timestamp 137 | 138 | for entryNum in range(len(dic['searchResult']['item'])-1): 139 | for key1,val1 in dic['searchResult']['item'][entryNum].iteritems(): 140 | if type(val1) is dict: 141 | for key2,val2 in val1.iteritems(): 142 | if type(val2) is dict: 143 | for key3,val3 in val2.iteritems(): 144 | # print '{}.{}.{} : {}'.format(key1,key2,key3,val3) 145 | key = '.'.join([key1,key2,key3]) 146 | val = val3 147 | ebay_data_dict[key] = val 148 | else: 149 | # print '{}.{} : {}'.format(key1,key2,val2) 150 | key = '.'.join([key1,key2]) 151 | val = val2 152 | ebay_data_dict[key] = val 153 | else: 154 | # print '{} : {}\n'.format(key1, val1) 155 | key = key1 156 | val = val1 157 | ebay_data_dict[key] = val 158 | 159 | # remove entries we don't need 160 | bad_keys = [ \ 161 | "searchResult.item.attribute", \ 162 | "searchResult.item.attribute.value",\ 163 | "searchResult.item.attribute.name", \ 164 | "searchResult.item.discountPriceInfo.originalRetailPrice_currencyId", \ 165 | "searchResult.item._distance" 166 | "searchResult.item.galleryInfoContainer.galleryURL._gallerySize",\ 167 | 
"searchResult.item.listingInfo.convertedBuyItNowPrice._currencyId", \ 168 | "sellingStatus.convertedCurrentPrice.value", \ 169 | "sellingStatus.convertedCurrentPrice._currencyId", \ 170 | "sellingStatus.currentPrice._currencyId", \ 171 | "listingInfo.buyItNowPrice._currencyId", \ 172 | "listingInfo.convertedBuyItNowPrice._currencyId", \ 173 | "shippingInfo.shippingServiceCost._currencyId", \ 174 | "listingInfo.convertedBuyItNowPrice.value", \ 175 | "galleryPlusPictureURL", \ 176 | "storeInfo.storeURL", \ 177 | "storeInfo.storeName", \ 178 | "productId._type",\ 179 | "productId.value", 180 | "charityId",\ 181 | "discountPriceInfo.soldOnEbay", \ 182 | "discountPriceInfo.pricingTreatment", \ 183 | "discountPriceInfo.originalRetailPrice._currencyId", \ 184 | "discountPriceInfo.originalRetailPrice.value", \ 185 | "discountPriceInfo.soldOffEbay", \ 186 | "discountPriceInfo.minimumAdvertisedPriceExposure",\ 187 | ] 188 | for key in bad_keys: 189 | if key in ebay_data_dict.keys(): 190 | ebay_data_dict.pop(key) 191 | 192 | # ------ ENTER EBAY DATA INTO TABLE ----- # 193 | currentEntryNum = entryNum + ((pageNum-1) * entriesPerPage) 194 | totalEntriesNum = dic['paginationOutput']['totalEntries'] 195 | print "inserting item #{} out of {} into table {} in database {}".format(currentEntryNum,totalEntriesNum, TABLE_NAME, dbname) 196 | 197 | pprint.pprint(ebay_data_dict) 198 | 199 | keys = ['"{}"'.format(key) for key in ebay_data_dict.keys()] # surround key with quotes 200 | values = ebay_data_dict.values() # extract values 201 | insert_statement = 'INSERT INTO {} (%s) values %s'.format(TABLE_NAME) 202 | query = cur.mogrify(insert_statement, (AsIs(','.join(keys)), tuple(values))) 203 | cur.execute(query) 204 | conn.commit() 205 | 206 | 207 | # ------ CLOSE CONNECTION TO DATABSE ----- # 208 | cur.close() 209 | conn.close() 210 | 211 | 212 | except ConnectionError as e: 213 | print(e) 214 | print(e.response.dict()) 215 | 216 | 217 | #-------------------------------# 218 | #------------ MAIN -------------# 219 | #-------------------------------# 220 | 221 | if __name__ == "__main__": 222 | # print 'connecting to database...' 
223 | print("Finding samples for SDK version %s" % ebaysdk.get_version()) 224 | (opts, args) = init_options() 225 | run(opts, pagesToQuery=pagesToQuery, entriesPerPage=entriesPerPage, pageStart=pageStart) 226 | -------------------------------------------------------------------------------- /ebay-api-scraper/finding.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | import os 9 | import sys 10 | from optparse import OptionParser 11 | from pprint import pprint 12 | 13 | # The line below will add this file's parent directory 14 | # to the search path for python modules 15 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) # /Users/Naekid/Desktop/capstone-DSI-5/ebaysdk-python/samples 16 | 17 | from common import dump # ebay SDK support file 18 | 19 | import ebaysdk 20 | from ebaysdk.finding import Connection as finding 21 | from ebaysdk.exception import ConnectionError 22 | 23 | 24 | def init_options(): 25 | usage = "usage: %prog [options]" 26 | parser = OptionParser(usage=usage) 27 | 28 | parser.add_option("-d", "--debug", 29 | action="store_true", dest="debug", default=False, 30 | help="Enabled debugging [default: %default]") 31 | parser.add_option("-y", "--yaml", 32 | dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 33 | help="Specifies the name of the YAML defaults file. [default: %default]") 34 | parser.add_option("-a", "--appid", 35 | dest="appid", default=None, 36 | help="Specifies the eBay application id to use.") 37 | 38 | (opts, args) = parser.parse_args() 39 | return opts, args 40 | 41 | 42 | 43 | def find_completed_item(opts): 44 | 45 | try: 46 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 47 | 48 | api_request = { 49 | 'keywords': 122431840128, 50 | } 51 | 52 | response = api.execute('findCompletedItems', api_request) 53 | 54 | dic = response.dict() 55 | 56 | pprint(dic) 57 | 58 | except ConnectionError as e: 59 | print(e) 60 | print(e.response.dict()) 61 | 62 | 63 | 64 | def run(opts): 65 | 66 | try: 67 | api = finding(debug=opts.debug, appid=opts.appid,config_file=opts.yaml, warnings=True) 68 | 69 | api_request = { 70 | 'keywords': 'camera', 71 | 'CategoryId' : '31388', 72 | 'itemFilter': [ 73 | {'name': 'Condition', 'value': 'Used'}, 74 | {'name': 'LocatedIn', 'value': 'US'}, 75 | {'name': 'MinPrice', 'value': '10'} 76 | ], 77 | 'paginationInput': { 78 | 'entriesPerPage': '1', 79 | 'pageNumber': '1' 80 | }, 81 | 'sortOrder': 'PricePlusShippingLowest', 82 | } 83 | 84 | response = api.execute('findCompletedItems', api_request) 85 | 86 | dic = response.dict() 87 | 88 | # print dic.keys() # ['ack', 'timestamp', 'version', 'searchResult', 'paginationOutput'] 89 | # print dic['searchResult'] # ['item', '_count'] 90 | # print dic['searchResult']['item'][0].keys() # ['itemId', 'topRatedListing', 'globalId', 'title', 'country', 'primaryCategory', 'autoPay', 'galleryURL', 'shippingInfo', 'location', 'postalCode', 'returnsAccepted', 'viewItemURL', 'sellingStatus', 'paymentMethod', 'isMultiVariationListing', 'condition', 'listingInfo'] 91 | 92 | for item in dic['searchResult']['item']: 93 | print 'listing title:\t\t', item['title'] 94 | print 'listing sale price($):\t', item['sellingStatus']['currentPrice']['value'] 95 | 96 | 97 | # dump(api) 98 | 99 | except ConnectionError as e: 100 | print(e) 101 
| print(e.response.dict()) 102 | 103 | 104 | def run_unicode(opts): 105 | 106 | try: 107 | api = finding(debug=opts.debug, appid=opts.appid, 108 | config_file=opts.yaml, warnings=True) 109 | 110 | api_request = { 111 | 'keywords': u'Kościół', 112 | } 113 | 114 | response = api.execute('findItemsAdvanced', api_request) 115 | for i in response.reply.searchResult.item: 116 | if i.title.find(u'ś') >= 0: 117 | print("Matched: %s" % i.title) 118 | break 119 | 120 | dump(api) 121 | 122 | except ConnectionError as e: 123 | print(e) 124 | print(e.response.dict()) 125 | 126 | 127 | def run2(opts): 128 | try: 129 | api = finding(debug=opts.debug, appid=opts.appid, 130 | config_file=opts.yaml) 131 | 132 | response = api.execute('findItemsByProduct', 133 | '530390311') 134 | 135 | dump(api) 136 | 137 | except ConnectionError as e: 138 | print(e) 139 | print(e.response.dict()) 140 | 141 | 142 | def run_motors(opts): 143 | api = finding(siteid='EBAY-MOTOR', debug=opts.debug, appid=opts.appid, config_file=opts.yaml, 144 | warnings=True) 145 | 146 | api.execute('findItemsAdvanced', { 147 | 'keywords': 'tesla', 148 | }) 149 | 150 | if api.error(): 151 | raise Exception(api.error()) 152 | 153 | if api.response_content(): 154 | print("Call Success: %s in length" % len(api.response_content())) 155 | 156 | print("Response code: %s" % api.response_code()) 157 | print("Response DOM: %s" % api.response_dom()) 158 | 159 | dictstr = "%s" % api.response_dict() 160 | print("Response dictionary: %s..." % dictstr[:250]) 161 | 162 | if __name__ == "__main__": 163 | print("Finding samples for SDK version %s" % ebaysdk.get_version()) 164 | (opts, args) = init_options() 165 | find_completed_item(opts) 166 | # run(opts) 167 | # run2(opts) 168 | # run_motors(opts) 169 | # run_unicode(opts) 170 | -------------------------------------------------------------------------------- /ebay-api-scraper/parallel-requests.py: -------------------------------------------------------------------------------- 1 | # Based on http://www.eamonnbell.com/blog/2015/10/05/the-right-way-to-use-requests-in-parallel-in-python/ 2 | 3 | import sys 4 | import requests 5 | from multiprocessing import Pool 6 | import psycopg2 7 | from scrapy.selector import Selector 8 | from scrapy.http import HtmlResponse 9 | import math 10 | 11 | start_index = input('Which index are you starting at?') 12 | 13 | def request_url(data): 14 | (itemId,url) = data 15 | print 'parsing itemId:{} from url {}'.format(itemId, url) 16 | HTML = requests.get(url).text 17 | try: 18 | condition = str(Selector(text=HTML).xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()").extract()[0]) 19 | condition = condition.replace("\'","") 20 | except: 21 | condition = 'NULL' 22 | 23 | return (itemId,condition) 24 | 25 | 26 | # connect to database 27 | dbname='ebay' 28 | user='nathan' 29 | host='localhost' 30 | TABLE_NAME = 'completed_items' 31 | 32 | try: 33 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 34 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 35 | except: 36 | print "ERROR: Unable to connect to the database." 
37 | sys.exit("Check database connection settings and try again.") 38 | 39 | cur = conn.cursor() 40 | 41 | 42 | 43 | # get itemId, url from table 44 | SQL = ''' 45 | SELECT ci."itemId", ci."viewItemURL" 46 | FROM completed_items as ci; 47 | ''' 48 | cur.execute(SQL) 49 | data = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 50 | 51 | data_len = int(math.ceil(len(data)/100.)) 52 | # print data[:5] 53 | print '# of entries in completed_items:',data_len 54 | print 'start_index:',start_index 55 | # sys.exit() 56 | 57 | #------- INSERT SCRAPY HERE 58 | 59 | 60 | # break up multi-threaded processing into chunks, so we don't load entire data set into memory 61 | for i in range(data_len): 62 | print i 63 | if i < start_index: 64 | continue 65 | else: 66 | try: 67 | temp_data = data[i*100:i*100+100] 68 | except: 69 | temp_data = data[i*100:] 70 | 71 | pool = Pool(processes=3) 72 | item_condition_list = pool.map(request_url, temp_data) 73 | 74 | # for every chunk of data, update postresql table 75 | for itemId,condition in item_condition_list: 76 | SQL = ''' 77 | UPDATE ONLY completed_items as ci 78 | SET conditiondescription = '{condition}' 79 | WHERE ci."itemId" = {itemId}; 80 | '''.format(condition=condition,itemId=itemId) 81 | cur.execute(SQL) 82 | conn.commit() 83 | 84 | 85 | cur.close() 86 | conn.close() 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | #----------------------- TEST -----------------------# 110 | # import requests 111 | # from multiprocessing import Pool 112 | 113 | 114 | # url_list = [ (222447550032, 115 | # 'http://www.ebay.com/itm/Nikon-D750-24-3-MP-Digital-SLR-Camera-Black-Body-Only-Used-/222447550032'), 116 | # (272592893520, 117 | # 'http://www.ebay.com/itm/Canon-EOS-5D-Mark-II-24-105mm-Lens-and-Camera-Bag-/272592893520'), 118 | # (302257646034, 119 | # 'http://www.ebay.com/itm/DJI-Inspire-1-V1-0-4K-X3-Camera-and-3-Axis-Gimbal-Drone-Quadcopter-Extras-/302257646034'), 120 | # (222445254405, 121 | # 'http://www.ebay.com/itm/Samsung-NX-NX1-28-2-MP-Digital-Camera-Black-Kit-w-50-200mm-OIS-Lens-/222445254405'), 122 | # (142319141084, 123 | # 'http://www.ebay.com/itm/Panasonic-AJ-HDC27F-2-3-HD-DVCPRO-Varicam-Video-Camera-Camcorder-w-Viewfinder-/142319141084'), 124 | # (252816500866, 125 | # 'http://www.ebay.com/itm/High-Speed-Pin-Registered-Super-8-Cartridge-Camera-Very-Rare-Logmar-Wilcam-/252816500866'), 126 | # (332163276401, 127 | # 'http://www.ebay.com/itm/Carl-Zeiss-Planar-T-80mm-f-2-AF-Lens-Contax-645-camera-/332163276401'), 128 | # (252821264198, 129 | # 'http://www.ebay.com/itm/DJI-Mavic-Pro-Folding-Drone-4K-Stabilized-Camera-Active-Track-Avoidance-GPS-/252821264198'), 130 | # ] 131 | 132 | # url_list = [url for itemId,url in url_list] 133 | 134 | # def internet_getter(url): 135 | # s = requests.Session() 136 | 137 | # print url 138 | # s.get(url).text 139 | 140 | 141 | # pool = Pool(processes=3) 142 | # pool_outputs = pool.map(internet_getter, 143 | # url_list) 144 | 145 | # pool.close() 146 | # pool.join() 147 | 148 | # print pool_outputs -------------------------------------------------------------------------------- /ebay-api-scraper/scrapy-development.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": 
"Python 2", 16 | "language": "python", 17 | "name": "python2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 2 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.13" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /ebay-api-scraper/trading.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | © 2012-2013 eBay Software Foundation 4 | Authored by: Tim Keefer 5 | Licensed under CDDL 1.0 6 | ''' 7 | 8 | import os 9 | import sys 10 | import datetime 11 | from optparse import OptionParser 12 | import psycopg2 13 | import time 14 | sys.path.insert(0, '%s/../' % os.path.dirname(__file__)) 15 | 16 | from common import dump 17 | 18 | import ebaysdk 19 | from ebaysdk.utils import getNodeText 20 | from ebaysdk.exception import ConnectionError 21 | from ebaysdk.trading import Connection as Trading 22 | 23 | from pprint import pprint 24 | 25 | 26 | host = 'localhost' 27 | user = 'nathan' 28 | dbname = 'ebay' 29 | tablename = 'category_specifics' 30 | 31 | 32 | numArgs = len(sys.argv) 33 | if numArgs != 2: 34 | print 'Incorrect number of input arguments' 35 | sys.exit() 36 | 37 | offset = sys.argv[1] 38 | 39 | 40 | def init_options(): 41 | 42 | usage = "usage: %prog [options]" 43 | parser = OptionParser(usage=usage) 44 | 45 | parser.add_option("-d", "--debug", 46 | action="store_true", dest="debug", default=False, 47 | help="Enabled debugging [default: %default]") 48 | parser.add_option("-y", "--yaml", 49 | dest="yaml", default='/Users/Naekid/Desktop/capstone-DSI-5/ebay-price-predictor/ebay-api-scraper/ebay.yaml', 50 | help="Specifies the name of the YAML defaults file. [default: %default]") 51 | parser.add_option("-a", "--appid", 52 | dest="appid", default=None, 53 | help="Specifies the eBay application id to use.") 54 | parser.add_option("-p", "--devid", 55 | dest="devid", default=None, 56 | help="Specifies the eBay developer id to use.") 57 | parser.add_option("-c", "--certid", 58 | dest="certid", default=None, 59 | help="Specifies the eBay cert id to use.") 60 | 61 | (opts, args) = parser.parse_args() 62 | return opts, args 63 | 64 | 65 | 66 | def getItem(opts): 67 | 68 | try: 69 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 70 | certid=opts.certid, devid=opts.devid) 71 | 72 | try: 73 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 74 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 75 | except: 76 | print "ERROR: Unable to connect to the database." 
77 | sys.exit("Check database connection settings and try again.") 78 | 79 | cur = conn.cursor() 80 | 81 | #----------- GRAB ALL ITEM IDs IN TABLE ----------# 82 | query = '''SELECT ci."itemId" 83 | FROM category_specifics as ci 84 | ORDER BY ci."Model" ASC 85 | OFFSET {offset};'''.format(offset=offset) # 4745 86 | 87 | cur.execute(query) 88 | item_ids = cur.fetchall() 89 | 90 | for i,itemId in enumerate(item_ids[:]): 91 | 92 | itemId = itemId[0] # itemId is a tuple 93 | print 'Updating item #{} out of {}'.format(i+1, len(item_ids)) 94 | print 'calling getItem for itemID:{}'.format(itemId) 95 | 96 | api_request = { 97 | 'itemID':itemId, 98 | 'IncludeItemSpecifics':1, 99 | } 100 | 101 | try: 102 | response = api.execute('GetItem', api_request) 103 | except ConnectionError as e: 104 | print(e) 105 | print(e.response.dict()) 106 | continue 107 | 108 | dic = response.dict() 109 | 110 | # pprint(dic) # debug 111 | 112 | allowed_columns = ['Type','Brand','MPN','Series','Model',\ 113 | 'Megapixels','Optical Zoom','Features',\ 114 | 'Color','Bundled Items','Connectivity',\ 115 | 'Battery Type','Manufacturer Warranty',\ 116 | 'Screen Size','Digital Zoom'] 117 | newDict = {} 118 | try: 119 | if not isinstance(dic['Item']['ItemSpecifics']['NameValueList'], list): # if only one item 120 | # print dic['Item']['ItemSpecifics']['NameValueList'] 121 | name = dic['Item']['ItemSpecifics']['NameValueList']['Name'] 122 | if name in allowed_columns: 123 | value = dic['Item']['ItemSpecifics']['NameValueList']['Value'] 124 | newDict[name] = value 125 | 126 | else: 127 | for nameValueDict in dic['Item']['ItemSpecifics']['NameValueList']: 128 | # print nameValueDict 129 | name = nameValueDict['Name'] 130 | if name in allowed_columns: 131 | value = nameValueDict['Value'] 132 | if isinstance(value, list): 133 | value = ','.join(value) # join the lists into a string seperated by commas 134 | 135 | try: 136 | newDict[name] = value.decode('unicode_escape').encode('ascii','ignore') 137 | except: 138 | print 'Problem with value, could not decode for some reason.' 
139 | continue 140 | 141 | except KeyError as e: # no item specifics in response 142 | print 'No ItemSpecifics field in response.', e 143 | continue 144 | 145 | 146 | 147 | keys = ['"{}"'.format(key.decode('unicode_escape').encode('ascii','ignore')) for key in newDict.keys()] 148 | insert_statement = '''UPDATE {table_name} SET (%s) = %s WHERE "itemId"={item_id}; 149 | '''.format(table_name=tablename, 150 | item_id=itemId) 151 | query = cur.mogrify(insert_statement, (psycopg2.extensions.AsIs(','.join(keys)), tuple(newDict.values()))) 152 | # print query 153 | 154 | try: 155 | cur.execute(query) # execute SQL, and commit changes 156 | conn.commit() 157 | except: 158 | print '\nError with executing SQL statement at item #{}, itemId={}.\n'.format(i, itemId) 159 | print query 160 | conn.rollback() 161 | 162 | # time.sleep(0.04) # throttle 163 | 164 | except ConnectionError as e: 165 | print(e) 166 | print(e.response.dict()) 167 | 168 | 169 | 170 | 171 | 172 | 173 | def getCategorySpecifics(opts): 174 | 175 | 176 | try: 177 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 178 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 179 | except: 180 | print "I am unable to connect to the database" 181 | sys.exit() 182 | 183 | cur = conn.cursor() 184 | 185 | 186 | try: 187 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 188 | certid=opts.certid, devid=opts.devid) 189 | 190 | api_request = { 191 | 'CategoryID':'31388' # Digital Cameras 192 | } 193 | 194 | response = api.execute('GetCategorySpecifics', api_request) 195 | 196 | dic = response.dict() 197 | 198 | for item in dic['Recommendations']['NameRecommendation']: 199 | SQL = 'ALTER TABLE {} ADD COLUMN "{}" TEXT;'.format(tablename, item['Name']) 200 | execute = raw_input('Execute "{}"? 
(y/n):'.format(SQL)) 201 | if execute == 'y': 202 | cur.execute(SQL) 203 | conn.commit() 204 | execute = 'n' 205 | 206 | 207 | except ConnectionError as e: 208 | print(e) 209 | print(e.response.dict()) 210 | 211 | cur.close() 212 | conn.close() 213 | 214 | 215 | def getAPIAccessRules(opts): 216 | 217 | 218 | try: 219 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 220 | certid=opts.certid, devid=opts.devid) 221 | 222 | 223 | response = api.execute('GetApiAccessRules') 224 | 225 | dic = response.dict() 226 | 227 | pprint(dic) 228 | 229 | 230 | except ConnectionError as e: 231 | print(e) 232 | print(e.response.dict()) 233 | 234 | 235 | 236 | def run(opts): 237 | 238 | try: 239 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 240 | certid=opts.certid, devid=opts.devid) 241 | 242 | api.execute('GetCharities', {'CharityID': 3897}) 243 | dump(api) 244 | print(api.response.reply.Charity.Name) 245 | 246 | except ConnectionError as e: 247 | print(e) 248 | print(e.response.dict()) 249 | 250 | 251 | def feedback(opts): 252 | try: 253 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 254 | certid=opts.certid, devid=opts.devid, warnings=False) 255 | 256 | api.execute('GetFeedback', {'UserID': 'tim0th3us'}) 257 | dump(api) 258 | 259 | if int(api.response.reply.FeedbackScore) > 50: 260 | print("Doing good!") 261 | else: 262 | print("Sell more, buy more..") 263 | 264 | except ConnectionError as e: 265 | print(e) 266 | print(e.response.dict()) 267 | 268 | 269 | def getTokenStatus(opts): 270 | 271 | try: 272 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 273 | certid=opts.certid, devid=opts.devid, warnings=False) 274 | 275 | api.execute('GetTokenStatus') 276 | dump(api) 277 | 278 | except ConnectionError as e: 279 | print(e) 280 | print(e.response.dict()) 281 | 282 | 283 | def verifyAddItem(opts): 284 | """http://www.utilities-online.info/xmltojson/#.UXli2it4avc 285 | """ 286 | 287 | try: 288 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 289 | certid=opts.certid, devid=opts.devid, warnings=False) 290 | 291 | myitem = { 292 | "Item": { 293 | "Title": "Harry Potter and the Philosopher's Stone", 294 | "Description": "This is the first book in the Harry Potter series. 
In excellent condition!", 295 | "PrimaryCategory": {"CategoryID": "377"}, 296 | "StartPrice": "1.0", 297 | "CategoryMappingAllowed": "true", 298 | "Country": "US", 299 | "ConditionID": "3000", 300 | "Currency": "USD", 301 | "DispatchTimeMax": "3", 302 | "ListingDuration": "Days_7", 303 | "ListingType": "Chinese", 304 | "PaymentMethods": "PayPal", 305 | "PayPalEmailAddress": "tkeefdddder@gmail.com", 306 | "PictureDetails": {"PictureURL": "http://i1.sandbox.ebayimg.com/03/i/00/30/07/20_1.JPG?set_id=8800005007"}, 307 | "PostalCode": "95125", 308 | "Quantity": "1", 309 | "ReturnPolicy": { 310 | "ReturnsAcceptedOption": "ReturnsAccepted", 311 | "RefundOption": "MoneyBack", 312 | "ReturnsWithinOption": "Days_30", 313 | "Description": "If you are not satisfied, return the book for refund.", 314 | "ShippingCostPaidByOption": "Buyer" 315 | }, 316 | "ShippingDetails": { 317 | "ShippingType": "Flat", 318 | "ShippingServiceOptions": { 319 | "ShippingServicePriority": "1", 320 | "ShippingService": "USPSMedia", 321 | "ShippingServiceCost": "2.50" 322 | } 323 | }, 324 | "Site": "US" 325 | } 326 | } 327 | 328 | api.execute('VerifyAddItem', myitem) 329 | dump(api) 330 | 331 | except ConnectionError as e: 332 | print(e) 333 | print(e.response.dict()) 334 | 335 | 336 | def verifyAddItemErrorCodes(opts): 337 | """http://www.utilities-online.info/xmltojson/#.UXli2it4avc 338 | """ 339 | 340 | try: 341 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 342 | certid=opts.certid, devid=opts.devid, warnings=False) 343 | 344 | myitem = { 345 | "Item": { 346 | "Title": "Harry Potter and the Philosopher's Stone", 347 | "Description": "This is the first book in the Harry Potter series. In excellent condition!", 348 | "PrimaryCategory": {"CategoryID": "377aaaaaa"}, 349 | "StartPrice": "1.0", 350 | "CategoryMappingAllowed": "true", 351 | "Country": "US", 352 | "ConditionID": "3000", 353 | "Currency": "USD", 354 | "DispatchTimeMax": "3", 355 | "ListingDuration": "Days_7", 356 | "ListingType": "Chinese", 357 | "PaymentMethods": "PayPal", 358 | "PayPalEmailAddress": "tkeefdddder@gmail.com", 359 | "PictureDetails": {"PictureURL": "http://i1.sandbox.ebayimg.com/03/i/00/30/07/20_1.JPG?set_id=8800005007"}, 360 | "PostalCode": "95125", 361 | "Quantity": "1", 362 | "ReturnPolicy": { 363 | "ReturnsAcceptedOption": "ReturnsAccepted", 364 | "RefundOption": "MoneyBack", 365 | "ReturnsWithinOption": "Days_30", 366 | "Description": "If you are not satisfied, return the book for refund.", 367 | "ShippingCostPaidByOption": "Buyer" 368 | }, 369 | "ShippingDetails": { 370 | "ShippingType": "Flat", 371 | "ShippingServiceOptions": { 372 | "ShippingServicePriority": "1", 373 | "ShippingService": "USPSMedia", 374 | "ShippingServiceCost": "2.50" 375 | } 376 | }, 377 | "Site": "US" 378 | } 379 | } 380 | 381 | api.execute('VerifyAddItem', myitem) 382 | 383 | except ConnectionError as e: 384 | # traverse the DOM to look for error codes 385 | for node in api.response.dom().findall('ErrorCode'): 386 | print("error code: %s" % node.text) 387 | 388 | # check for invalid data - error code 37 389 | if 37 in api.response_codes(): 390 | print("Invalid data in request") 391 | 392 | print(e) 393 | print(e.response.dict()) 394 | 395 | 396 | def uploadPicture(opts): 397 | 398 | try: 399 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 400 | certid=opts.certid, devid=opts.devid, warnings=True) 401 | 402 | pictureData = { 403 | "WarningLevel": "High", 404 | "ExternalPictureURL": 
"http://developer.ebay.com/DevZone/XML/docs/images/hp_book_image.jpg", 405 | "PictureName": "WorldLeaders" 406 | } 407 | 408 | api.execute('UploadSiteHostedPictures', pictureData) 409 | dump(api) 410 | 411 | except ConnectionError as e: 412 | print(e) 413 | print(e.response.dict()) 414 | 415 | 416 | def uploadPictureFromFilesystem(opts, filepath): 417 | 418 | try: 419 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 420 | certid=opts.certid, devid=opts.devid, warnings=True) 421 | 422 | # pass in an open file 423 | # the Requests module will close the file 424 | files = {'file': ('EbayImage', open(filepath, 'rb'))} 425 | 426 | pictureData = { 427 | "WarningLevel": "High", 428 | "PictureName": "WorldLeaders" 429 | } 430 | 431 | api.execute('UploadSiteHostedPictures', pictureData, files=files) 432 | dump(api) 433 | 434 | except ConnectionError as e: 435 | print(e) 436 | print(e.response.dict()) 437 | 438 | 439 | def memberMessages(opts): 440 | 441 | try: 442 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 443 | certid=opts.certid, devid=opts.devid, warnings=True) 444 | 445 | now = datetime.datetime.now() 446 | 447 | memberData = { 448 | "WarningLevel": "High", 449 | "MailMessageType": "All", 450 | # "MessageStatus": "Unanswered", 451 | "StartCreationTime": now - datetime.timedelta(days=60), 452 | "EndCreationTime": now, 453 | "Pagination": { 454 | "EntriesPerPage": "5", 455 | "PageNumber": "1" 456 | } 457 | } 458 | 459 | api.execute('GetMemberMessages', memberData) 460 | 461 | dump(api) 462 | 463 | if api.response.reply.has_key('MemberMessage'): 464 | messages = api.response.reply.MemberMessage.MemberMessageExchange 465 | 466 | if type(messages) != list: 467 | messages = [messages] 468 | 469 | for m in messages: 470 | print("%s: %s" % (m.CreationDate, m.Question.Subject[:50])) 471 | 472 | except ConnectionError as e: 473 | print(e) 474 | print(e.response.dict()) 475 | 476 | 477 | def getUser(opts): 478 | try: 479 | 480 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 481 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20, siteid='101') 482 | 483 | api.execute('GetUser', {'UserID': 'sallyma789'}) 484 | dump(api, full=False) 485 | 486 | except ConnectionError as e: 487 | print(e) 488 | print(e.response.dict()) 489 | 490 | 491 | def getOrders(opts): 492 | 493 | try: 494 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 495 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20) 496 | 497 | api.execute('GetOrders', {'NumberOfDays': 30}) 498 | dump(api, full=False) 499 | 500 | except ConnectionError as e: 501 | print(e) 502 | print(e.response.dict()) 503 | 504 | 505 | def categories(opts): 506 | 507 | try: 508 | api = Trading(debug=opts.debug, config_file=opts.yaml, appid=opts.appid, 509 | certid=opts.certid, devid=opts.devid, warnings=True, timeout=20, siteid='101') 510 | 511 | callData = { 512 | 'DetailLevel': 'ReturnAll', 513 | 'CategorySiteID': 101, 514 | 'LevelLimit': 4, 515 | } 516 | 517 | api.execute('GetCategories', callData) 518 | dump(api, full=False) 519 | 520 | except ConnectionError as e: 521 | print(e) 522 | print(e.response.dict()) 523 | 524 | ''' 525 | api = trading(domain='api.sandbox.ebay.com') 526 | api.execute('GetCategories', { 527 | 'DetailLevel': 'ReturnAll', 528 | 'CategorySiteID': 101, 529 | 'LevelLimit': 4, 530 | }) 531 | ''' 532 | 533 | if __name__ == "__main__": 534 | (opts, args) = init_options() 535 | 536 | print("Trading API Samples for 
version %s" % ebaysdk.get_version()) 537 | 538 | """ 539 | run(opts) 540 | feedback(opts) 541 | verifyAddItem(opts) 542 | getTokenStatus(opts) 543 | verifyAddItemErrorCodes(opts) 544 | uploadPicture(opts) 545 | uploadPictureFromFilesystem(opts, ("%s/test_image.jpg" % os.path.dirname(__file__))) 546 | memberMessages(opts) 547 | categories(opts) 548 | """ 549 | # getUser(opts) 550 | # getOrders(opts) 551 | 552 | 553 | getItem(opts) 554 | # getCategorySpecifics(opts) 555 | # getAPIAccessRules(opts) 556 | -------------------------------------------------------------------------------- /ebay-api-scraper/update-ebay-table.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from scrapy.selector import Selector 3 | from scrapy.http import HtmlResponse 4 | import requests 5 | 6 | 7 | start_index = input('Which index are you starting at?') 8 | 9 | dbname='ebay' 10 | user='nathan' 11 | host='localhost' 12 | TABLE_NAME = 'completed_items' 13 | 14 | try: 15 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 16 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 17 | except: 18 | print "ERROR: Unable to connect to the database." 19 | sys.exit("Check database connection settings and try again.") 20 | 21 | cur = conn.cursor() 22 | 23 | SQL = ''' 24 | SELECT ci."itemId", ci."viewItemURL" 25 | FROM completed_items as ci; 26 | ''' 27 | cur.execute(SQL) 28 | data = [(int(itemId),str(url)) for itemId,url in cur.fetchall()] 29 | 30 | 31 | SQL = ''' 32 | SELECT count(*) 33 | FROM completed_items; 34 | ''' 35 | cur.execute(SQL) 36 | total_rows = int(cur.fetchall()[0][0]) 37 | 38 | 39 | for i,(itemId,url) in enumerate(data): 40 | if i < start_index: 41 | continue 42 | 43 | if (i+1) % 50 == 0: 44 | print "updating row #{} out of {}".format(i+1, total_rows) 45 | HTML = requests.get(url).text 46 | try: 47 | condition = str(Selector(text=HTML).xpath("//td[@class='sellerNotesContent']/span[@class='viSNotesCnt']/text()").extract()[0]) 48 | condition = condition.replace("\'","") 49 | except: 50 | condition = 'NULL' 51 | print i,condition 52 | SQL = ''' 53 | UPDATE ONLY completed_items as ci 54 | SET conditiondescription = '{condition}' 55 | WHERE ci."itemId" = {itemId}; 56 | '''.format(condition=condition,itemId=itemId) 57 | cur.execute(SQL) 58 | conn.commit() 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /index.md: -------------------------------------------------------------------------------- 1 | # Ebay Listing Optimization With Machine Learning 2 | -------------------------------------------------------------------------------- /mongo-test/mongo-test.py: -------------------------------------------------------------------------------- 1 | from pymongo import MongoClient 2 | import datetime 3 | 4 | client = MongoClient('localhost', 27017) 5 | 6 | # get a database 7 | db = client['posts-database'] 8 | 9 | # get a collection called posts 10 | posts = db['posts'] 11 | 12 | # sample data 13 | post = { 14 | "author": "Mike", 15 | "text": "My first blog post!", 16 | "tags": ["mongodb", "python", "pymongo"], 17 | "date": datetime.datetime.utcnow() 18 | } 19 | 20 | # insert a document into a collection 21 | # when a document is inserted, a special key, "_id" 22 | # is automatically added if the document doesn't contain the "_id" key. "_id" must be unique. 
23 | post_id = posts.insert_one(post).inserted_id 24 | print post_id 25 | 26 | # After inserting the first document, the posts collection has actually been created on the server. 27 | # verify by listing all the collections in the database 28 | print db.collection_names(include_system_collections=False) 29 | 30 | -------------------------------------------------------------------------------- /postgresql-test/postgres cheat sheet: -------------------------------------------------------------------------------- 1 | #------------- TERMINAL COMMANDS -------------# 2 | 3 | # Enter a postgres database 4 | $ psql [dbname] 5 | 6 | # Enter a database called ebay with username nathan 7 | $ psql -U nathan ebay 8 | 9 | # Enter postgres database with username postgres 10 | $ psql -U postgres postgres 11 | 12 | 13 | #------------------ POSTGRES -----------------# 14 | #--- for running within the postgres shell prompt ----# 15 | 16 | #--- List users (roles) 17 | postgres=# \du 18 | 19 | #--- Check current user 20 | postgres=# select current_user; 21 | 22 | #--- Check current database 23 | postgres=# select current_database(); 24 | 25 | #--- Print a list of databases 26 | postgres=# \l 27 | 28 | #--- Create a database 29 | postgres=# CREATE DATABASE dbname; 30 | 31 | #--- Print a list of tables 32 | postgres=# \d 33 | 34 | #--- Print a description of a table 35 | postgres=# \d tablename 36 | 37 | #--- Quit 38 | postgres=# \q 39 | 40 | 41 | #------------------ POSTGRES -----------------# 42 | #----------------- SQL COMMANDS --------------# 43 | 44 | #------- DATABASE MANAGEMENT -------# 45 | 46 | #--- Delete, Drop a database 47 | DROP DATABASE [IF EXISTS] name; 48 | 49 | #--- Create a user with permission to create databases 50 | CREATE USER nathan CREATEDB; 51 | 52 | #--------- TABLE MANAGEMENT --------# 53 | 54 | #--- Create a table 55 | CREATE TABLE tablename (col1 type1, col2 type2); 56 | 57 | #--- Copy a table 58 | SELECT * INTO newTable FROM oldTable; 59 | 60 | #--- Drop a table 61 | DROP TABLE tablename; 62 | 63 | #--- Delete all rows in a table 64 | TRUNCATE tablename; 65 | #--- Delete some rows in a table 66 | DELETE FROM 67 | completed_items_15230_31388 as ci 68 | WHERE 69 | ci."primaryCategory.categoryId"!=31388 70 | AND 71 | ci."primaryCategory.categoryId"!=15230; 72 | 73 | #--- View column names in a table 74 | SELECT column_name from information_schema.columns where table_name ='your_table'; 75 | 76 | #--- Add an auto-incrementing id column to a table 77 | ALTER TABLE completed_items ADD COLUMN id SERIAL; 78 | 79 | #--- Drop a column 80 | ALTER TABLE tablename DROP COLUMN colname; 81 | 82 | #--- Rename a column 83 | ALTER TABLE distributors RENAME COLUMN address TO city; 84 | 85 | #--- Insert data into a table 86 | INSERT INTO tablename (col1, col2) VALUES (val1, val2); 87 | INSERT INTO tablename 88 | SELECT * FROM table2name; 89 | 90 | #--- Delete rows with duplicate values in certain columns 91 | # A frequent question is how to delete rows that are duplicates over a set of columns, keeping only the one with the lowest ID. 92 | # This query does that for all rows of tablename having the same column1, column2, and column3.
93 | DELETE FROM tablename 94 | WHERE id IN (SELECT id 95 | FROM (SELECT id, 96 | ROW_NUMBER() OVER (PARTITION BY column1, column2, column3 ORDER BY id) AS rnum 97 | FROM tablename) t 98 | WHERE t.rnum > 1); 99 | 100 | #--- Add serial primary key to a table 101 | ALTER TABLE tableName ADD COLUMN id SERIAL PRIMARY KEY; 102 | 103 | #------- SELECTION ------# 104 | 105 | #--- Extract the day from a timestamp column as an integer (swap DAY for HOUR, etc. as needed) 106 | SELECT CAST(EXTRACT(DAY FROM colname) AS INT) FROM tablename; 107 | 108 | #--- Select a range of rows 109 | # Skip the first 5 rows and return the next 10 (rows 6 - 15) 110 | SELECT * from tablename LIMIT 10 OFFSET 5; 111 | 112 | #------------------- SETTINGS ---------------# 113 | #--- toggle between wide/narrow output (prints one column per line in the terminal) 114 | postgres=# \x on 115 | 116 | -------------------------------------------------------------------------------- /postgresql-test/postgresql-test.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | import sys 3 | dbname='test-db1' 4 | user='nathan' 5 | host='localhost' 6 | 7 | try: 8 | conn = psycopg2.connect("dbname={} user={} host={}".format(dbname, user, host)) 9 | print '\nConnected to {} with user:{} on host:{}\n'.format(dbname, user, host) 10 | except psycopg2.OperationalError: 11 | print "I am unable to connect to the database" 12 | sys.exit("Check database connection settings and try again.") 13 | cur = conn.cursor() 14 | 15 | cur.execute("SELECT version();") # simple query to confirm the connection works 16 | 17 | rows = cur.fetchall() 18 | 19 | if rows: 20 | for row in rows: 21 | print row[0] 22 | 23 | 24 | --------------------------------------------------------------------------------