├── craigslist
│   ├── __init__.py
│   ├── models.pyc
│   ├── __init__.pyc
│   ├── settings.pyc
│   ├── pipelines.pyc
│   ├── spiders
│   │   ├── __init__.pyc
│   │   ├── CraigSpyder.pyc
│   │   ├── __init__.py
│   │   └── CraigSpyder.py
│   ├── items.py
│   ├── settings.py
│   ├── models.py
│   └── pipelines.py
├── ZipCodeWashington.xlsx
├── README.md
├── ZipCodeSeattleOnly.csv
├── DataFormatting.py
└── CraigAnalysis.R

/craigslist/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/craigslist/models.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/models.pyc
--------------------------------------------------------------------------------
/ZipCodeWashington.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/ZipCodeWashington.xlsx
--------------------------------------------------------------------------------
/craigslist/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/__init__.pyc
--------------------------------------------------------------------------------
/craigslist/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/settings.pyc
--------------------------------------------------------------------------------
/craigslist/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/pipelines.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/spiders/__init__.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/CraigSpyder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/spiders/CraigSpyder.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
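As the package comment says, the project's spiders live in this directory. For orientation, a minimal sketch of the shape a spider here takes (the spider name and selector are illustrative placeholders, not part of this project; the real spider is CraigSpyder.py further down):

import scrapy

class ExampleSpider(scrapy.Spider):
    # "name" is what you pass to `scrapy crawl`
    name = "example"
    start_urls = ["http://seattle.craigslist.org/search/apa"]

    def parse(self, response):
        # yield one item (or a follow-up Request) per listing found
        for title in response.xpath("//p//a/text()").extract():
            yield {"title": title}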
/craigslist/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class CraigslistItem(Item):
    title = Field()
    link = Field()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Craigslist-Pricing-Project
==========================

1. Enter the directory where you want to store the scraper, then in CMD run:

       scrapy startproject craigslist

2. Change the base URL to the specific Craigslist apartment search link, e.g. http://sfbay.craigslist.org/search/apa for San Francisco.

3. To run the scraper, type in the project directory:

       scrapy crawl craig -o items.csv -t csv

For a more in-depth guide to customizing the scraper, see the blog post:

https://racketracer.wordpress.com/2015/01/29/practical-scraping-using-scrapy/

If you don't want to store the data in a PostgreSQL database, replace pipelines.py with an empty file.

--------------------------------------------------------------------------------
/ZipCodeSeattleOnly.csv:
--------------------------------------------------------------------------------
zipcode,latitude,longitude
98101,47.61067,-122.33438
98102,47.63287,-122.32253
98103,47.671346,-122.34166
98104,47.60252,-122.32855
98105,47.66377,-122.30118
98106,47.53282,-122.35443
98107,47.66747,-122.37468
98109,47.630648,-122.34675
98112,47.629653,-122.29752
98115,47.68382,-122.30122
98116,47.57487,-122.39392
98117,47.685919,-122.37838
98118,47.543348,-122.27496
98119,47.63877,-122.36694
98121,47.61487,-122.34578
98122,47.61157,-122.30406
98125,47.716648,-122.30308
98126,47.54687,-122.3748
98133,47.733852,-122.34461
98134,47.57867,-122.33441
98136,47.538887,-122.38803
98144,47.58577,-122.30081
98154,47.60632,-122.33357
98174,47.604718,-122.33523
98177,47.740886,-122.36978
98199,47.64767,-122.39758
--------------------------------------------------------------------------------
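The README suggests swapping in an empty pipelines.py to skip PostgreSQL; an alternative sketch, assuming the dict form of ITEM_PIPELINES that newer Scrapy releases expect, is to clear the setting in settings.py (shown next) instead:

# Hypothetical alternative to emptying pipelines.py: with no pipelines
# registered, items go straight to the feed exporter
# (scrapy crawl craig -o items.csv -t csv) and never touch the database.
ITEM_PIPELINES = {}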
/craigslist/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for craigslist project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'craigslist'

SPIDER_MODULES = ['craigslist.spiders']
NEWSPIDER_MODULE = 'craigslist.spiders'
# Dict form: the value is the pipeline's order (lower runs first)
ITEM_PIPELINES = {'craigslist.pipelines.CraigslistPipeline': 300}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'craigslist (+http://www.yourdomain.com)'

# Consumed by models.db_connect() via sqlalchemy.engine.url.URL
DATABASE = {
    'drivername': 'postgres',
    'host': 'localhost',
    'port': '5432',
    'username': 'postgres',
    'password': 'Project2015',
    'database': 'craigslist'
}
--------------------------------------------------------------------------------
/craigslist/models.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 07 18:55:53 2015

@author: Jay
"""

#models.py

from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, BigInteger
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import settings

DeclarativeBase = declarative_base()
# <--snip-->
class Apts(DeclarativeBase):
    __tablename__ = "seattle"

    craigId = Column('craigId', BigInteger, primary_key=True)
    title = Column('title', String)
    link = Column('link', String, nullable=True)
    price = Column('price', Integer, nullable=False)
    #area = Column('area', String, nullable=True)
    beds = Column('beds', Integer, nullable=False)
    size = Column('size', Integer, nullable=False)
    date = Column('date', String, nullable=False)
    numPic = Column('numPic', Integer, nullable=True)
    postDate = Column('postDate', DateTime, nullable=False)
    updateDate = Column('updateDate', DateTime, nullable=False)
    reposts = Column('reposts', Integer, nullable=False)
    contentLen = Column('contentLen', Integer, nullable=False)
    baths = Column('baths', Float, nullable=False)
    latitude = Column('latitude', Float(precision=8), nullable=False)
    longitude = Column('longitude', Float(precision=8), nullable=False)
    zipcode = Column('zipcode', String, nullable=False)


def create_deals_table(engine):
    """Create the table if it does not already exist."""
    DeclarativeBase.metadata.create_all(engine)


def db_connect():
    """Build an engine from the DATABASE dict in settings.py."""
    return create_engine(URL(**settings.DATABASE))
--------------------------------------------------------------------------------
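A usage sketch for models.py, assuming the Postgres instance from settings.py is reachable; every field value below is a made-up placeholder:

from datetime import datetime

from sqlalchemy.orm import sessionmaker

from models import Apts, db_connect, create_deals_table

engine = db_connect()              # engine built from settings.DATABASE
create_deals_table(engine)         # no-op if the "seattle" table exists
session = sessionmaker(bind=engine)()

# Placeholder row, inserted by hand rather than via the spider
row = Apts(craigId=1234567890, title="Sample 1BR", link="/apa/123.html",
           price=1500, beds=1, size=650, date="Mar 7", numPic=4,
           postDate=datetime(2015, 3, 7, 12, 0),
           updateDate=datetime(2015, 3, 7, 12, 0),
           reposts=0, contentLen=3, baths=1.0,
           latitude=47.61067, longitude=-122.33438, zipcode="98101")
session.add(row)
session.commit()
session.close()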
/DataFormatting.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 31 13:51:04 2014

@author: Jay
"""

import numpy as np
import pandas as pd

#TODO Check for duplicates
# Check for ID duplicates


def findZip(address):
    # Return the first 5-character token of a split address (the zip code)
    for s in address:
        if len(s) == 5:
            print(s)
            return s


def findDist(lat, lon, dataLat, dataLon):
    # L1 (Manhattan) distance in degrees
    return abs(lat - dataLat) + abs(lon - dataLon)


zips = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/ZipCodeSeattle.csv"
zipSeattle = pd.read_csv(zips)
data = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/craigSeattle.csv"
craig = pd.read_csv(data)
craig['zipcode'] = np.nan

# Tag each listing with the zip code whose centroid is closest
for i in range(len(craig.coord)):
    valMin = 10
    for j in range(len(zipSeattle.latitude)):
        temp = findDist(craig.latitude[i], craig.longitude[i],
                        zipSeattle.latitude[j], zipSeattle.longitude[j])
        if temp < valMin:
            valMin = temp
            craig.loc[i, 'zipcode'] = zipSeattle.zipcode[j]

# Drop implausible listings
craig = craig[craig.price < 15000]
craig = craig[craig.price > 100]
craig = craig[craig['size'] < 10000]   # note: craig.size is the DataFrame attribute, not the column
craig['baths'] = craig['baths'].astype(float)
craig = craig[craig.baths < 10]

#craig = craig[craig.beds > 0]
#craig = craig[craig['size'] > 0]
craig = craig[craig.zipcode > 0]
#df.loc[0] = pd.Series(item)
#Extra Clean

craig.to_csv("C:/Users/Jay/Dropbox/Coding Projects/craigslist/craig11_15Formatted.csv")

#Splicing
#sep = "["
#for i in range(0, len(neigh['NeighSubArea'])):
#    if sep in neigh['NeighSubArea'][i]:
#        neigh['NeighSubArea'][i] = neigh['NeighSubArea'][i].split(sep, 1)[0]

#if key returns value
#use value as new key
#recursive
#Seattle neighborhood 5-count

#Code for GeoPy Google API
#for i in range(0, len(craig.coord)-1):
#    location = geolocator.reverse(craig.coord[i]).address
#    if location is not None:
#        zipcode = findZip(location.split(", "))
#        craig.zipcode[i] = zipcode
#        time.sleep(1)
#    print(i)
--------------------------------------------------------------------------------
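DataFormatting.py above and pipelines.py below share the same nearest-centroid idea: tag a listing with the zip code whose centroid minimizes an L1 (Manhattan) distance in degrees; crude, but adequate at city scale. A worked example using two centroids from ZipCodeSeattleOnly.csv:

def find_dist(lat, lon, data_lat, data_lon):
    # L1 distance in degrees -- a rough proxy for "closest centroid"
    return abs(lat - data_lat) + abs(lon - data_lon)

# A hypothetical listing near Pike Place (47.609, -122.342):
print(find_dist(47.609, -122.342, 47.61067, -122.33438))  # 98101 -> ~0.009
print(find_dist(47.609, -122.342, 47.63287, -122.32253))  # 98102 -> ~0.043
# 98101 has the smaller distance, so the listing is tagged zipcode 98101.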
/craigslist/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

'''
DOCUMENTATION

'''
import csv

from sqlalchemy.orm import sessionmaker

from models import Apts, db_connect, create_deals_table


class CraigslistPipeline(object):

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table and loads the zip-code centroid lookup.
        """
        engine = db_connect()
        create_deals_table(engine)
        self.Session = sessionmaker(bind=engine)
        self.zipDict = self.createZip()

    def createZip(self):
        # Load centroids into {zipcode: {'latitude': ..., 'longitude': ...}}
        zips = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/ZipCodeSeattleOnly.csv"
        with open(zips) as csvfile:
            reader = csv.DictReader(csvfile)
            zipDict = {}
            for row in reader:
                zipDict[row['zipcode']] = {'latitude': row['latitude'],
                                           'longitude': row['longitude']}
        return zipDict

    def findZip(self, zipDict, item):
        # Return the zip code whose centroid is closest to the listing
        valMin = 10
        zip1 = ''
        for code in zipDict:
            temp = self.findDist(float(zipDict[code]['latitude']),
                                 float(zipDict[code]['longitude']),
                                 item['latitude'], item['longitude'])
            if temp < valMin:
                valMin = temp
                zip1 = code
        return zip1

    def findDist(self, lat, lon, dataLat, dataLon):
        # L1 (Manhattan) distance in degrees
        return abs(lat - dataLat) + abs(lon - dataLon)

    def process_item(self, item, spider):
        """Save deals in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        item['zipcode'] = self.findZip(self.zipDict, item)
        #TODO: change to below if after a week or two
        #if item['reposts'] == 1:
        old = session.query(Apts).filter(Apts.craigId == item['craigId']).first()
        try:
            if old is not None:
                # Seen before: bump the repost counter instead of re-inserting
                old.reposts += 1
            else:
                session.add(Apts(**item))
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

        return item
--------------------------------------------------------------------------------
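To get from the Postgres table to the CSV that CraigAnalysis.R below reads, one option (a sketch, assuming the filename used in the R script) is pandas' read_sql over the same engine:

import pandas as pd

from models import db_connect

engine = db_connect()
craig = pd.read_sql('SELECT * FROM seattle', engine)
craig.to_csv("craigTest1.csv", index=False)   # filename read by CraigAnalysis.R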
/CraigAnalysis.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(DAAG)   # for cv.lm

craig = read.csv("C:/Users/Jay/Dropbox/Coding Projects/craigslist/craigTest1.csv")
craig$zipcode = as.factor(craig$zipcode)
lm1 = lm(price ~ beds + size, data = craig)
lm2 = lm(price ~ beds + size + zipcode, data = craig)
lm3 = lm(price ~ beds + size + zipcode + numPic, data = craig)

#Test example prediction (lm1 only needs beds and size)
predict(lm1, newdata = data.frame(beds = 3, size = 1560), type='response')

craig$zipcode = factor(craig$zipcode)
table(craig$zipcode)
craiglm = lm(price ~ beds + size + numPic + baths + zipcode, data = craig)
control = lm(price ~ beds + size, data = craig)
craigLm1 = lm(price ~ beds + size + zipcode + numPic, data = craig1000)
craigLm2 = lm(price ~ beds + size + zipcode, data = craig1000)
craigLm3 = lm(price ~ beds + size + zipcode*numPic, data = craig1000)
craigLm4 = lm(price ~ zipcode + beds*size, data = craig1000)
#Price is 2700
predict(craiglm, newdata = data.frame(beds = 2, size = 1200, contentLen = 5, numPic = 7, baths = 2, zipcode = "98101"), type='response')
#Price is 4500
predict(craigLm1, newdata = data.frame(beds = 2, size = 1430, numPic = 11, zipcode = "98102"), type='response')
#Price is 1950
predict(craigLm1, newdata = data.frame(beds = 3, size = 1320, numPic = 15, zipcode = "98108"), type='response')

listNum = (tail(sort(table(craigClean$zipcode)), 40))
craigClean = craigClean[craigClean$zipcode %in% row.names(listNum),]

SeattleCode <- c("98177", "98133", "98125", "98117", "98103", "98115",
                 "98107", "98105", "98199", "98119", "98109", "98102",
                 "98112", "98121", "98101", "98122", "98104", "98134",
                 "98144", "98116", "98126", "98136", "98106", "98108",
                 "98118", "98154", "98164", "98174")

craigSeattle = craig[craig$zipcode %in% SeattleCode,]
craigSeattle$zipcode = factor(craigSeattle$zipcode)
table(craigSeattle$zipcode)
craigSeattlelm = lm(price ~ beds + size + numPic + baths + zipcode + contentLen, data = craigSeattle)
craigStand = lm(price ~ beds + size + baths + zipcode, data = craigSeattle)
summary(craigSeattlelm)
plot(density(resid(craigSeattlelm)))
qqnorm(resid(craigSeattlelm))

lm1CV = cv.lm(df = craigClean, form.lm = craigLm1, m = 2)

#Visualization

craigplot1 = craigSeattle[craigSeattle$price < 5500, ]
#Freedman-Diaconis bin width
bw <- diff(range(craigplot1$price)) / (2 * IQR(craigplot1$price) / length(craigplot1$price)^(1/3))
ggplot() + geom_histogram(aes(craigplot1$price), binwidth = bw)

# Aggregate beds together and calculate means
craigGrouping = aggregate(craigSeattle, list(craigSeattle$beds), mean)
craigGrouping = craigGrouping[craigGrouping$Group.1 < 5, ]
ggplot(data=craigGrouping, aes(x=Group.1, y=price, fill=Group.1)) +
  geom_bar(colour="black", stat="identity") +
  guides(fill=FALSE) + xlab("Number of Bedrooms") + ylab("Price") +
  ggtitle("Average Price Per Number of Bedrooms") +
  ylim(0, 5000)

#Aggregate beds together and calculate medians
craigGrouping = craigGrouping[,c("beds","price")]
craigGrouping = aggregate(craigGrouping, list(craigGrouping$beds), median)
ggplot(data=craigplot3, aes(x=beds, y=price, fill=city)) +
  geom_bar(stat="identity", position='dodge', colour="black") +
  scale_fill_manual(values=c("#999999", "#E69F00")) +
  geom_text(aes(label=price, y = price + 300))
--------------------------------------------------------------------------------
/craigslist/spiders/CraigSpyder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 17 15:02:19 2014

@author: Jay
"""

import scrapy
#import pandas as pd
#scrapy crawl craig -o items.csv -t csv


#Item class with listed fields to scrape
class CraigslistItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    #area = scrapy.Field()
    beds = scrapy.Field()
    size = scrapy.Field()
    craigId = scrapy.Field()
    numPic = scrapy.Field()
    postDate = scrapy.Field()
    updateDate = scrapy.Field()
    baths = scrapy.Field()
    latitude = scrapy.Field()
    longitude = scrapy.Field()
    contentLen = scrapy.Field()
    reposts = scrapy.Field()
    zipcode = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    #Base url for Seattle apartment rentals. Change if necessary
    base_url = "http://seattle.craigslist.org/search/see/apa?"
    start_urls = ["http://seattle.craigslist.org/search/see/apa?"]
    #Craigslist pages results 100 at a time (capped at 2400 total);
    #append the paginated urls (s=100, s=200, ...) to the start list
    for i in range(1, 5):
        start_urls.append(base_url + "s=" + str(i) + "00&")

#    def __init__(self):
#        global df
#        test = CraigslistItem()
#        test = self.initialize(test)
#        df = pd.DataFrame(columns = list(test.keys()), index=xrange(0,2400))

    def parse(self, response):
        #find all postings
        postings = response.xpath(".//p")
        #loop through the postings
        for i in range(len(postings)):
            item = CraigslistItem()
            #grab craigslist apartment listing ID
            item["craigId"] = int(''.join(postings[i].xpath("@data-pid").extract()))
            temp = postings[i].xpath("span[@class='txt']")
            info = temp.xpath("span[@class='pl']")
            #title of posting
            item["title"] = ''.join(info.xpath("a/text()").extract())
            #date of posting
            item["date"] = ''.join(info.xpath("time/text()").extract())
            #pre-processing for getting the price in the right format
            price = ''.join(temp.xpath("span")[2].xpath("span[@class='price']").xpath("text()").extract())
            #item["area"] = ''.join(temp.xpath("span")[2].xpath("span[@class='pnr']").xpath("small/text()").extract())
            item["price"] = price.replace("$", "")
            item["link"] = ''.join(info.xpath("a/@href").extract())
            follow = "http://seattle.craigslist.org" + item["link"]
            #Follow the posting link into the actual post
            request = scrapy.Request(follow, callback=self.parse_item_page)
            request.meta['item'] = item
            #self.df.loc[i] = pd.Series(item)
            yield request

    #Parsing method to grab items from inside the individual postings
    def parse_item_page(self, response):
        #import pdb; pdb.set_trace()
        item = response.meta["item"]
        maplocation = response.xpath("//div[contains(@id,'map')]")
        latitude = ''.join(maplocation.xpath('@data-latitude').extract())
        longitude = ''.join(maplocation.xpath('@data-longitude').extract())
        if latitude:
            item['latitude'] = float(latitude)
        if longitude:
            item['longitude'] = float(longitude)
        attr = response.xpath("//p[@class='attrgroup']")
        try:
            item["beds"] = int(attr.xpath("span/b/text()")[0].extract())
            bath = attr.xpath("span/b/text()")[1].extract()
            item["size"] = int(''.join(attr.xpath("span")[1].xpath("b/text()").extract()))
            item["baths"] = float(bath)
        except (IndexError, ValueError):
            pass
        item["contentLen"] = len(response.xpath("//section[@id='postingbody']").xpath("text()").extract())
        postinginfo = response.xpath("//p[@class = 'postinginfo reveal']").xpath("time/@datetime")
        times = postinginfo.extract()
        #first <time> is the posting date; the last one, the most recent update
        item["postDate"] = times[0] if times else ''
        item["updateDate"] = times[-1] if times else ''
        if item["updateDate"] != item["postDate"]:
            item["reposts"] = 1
        else:
            item["reposts"] = 0
        item["numPic"] = len(response.xpath("//div[@id='thumbs']").xpath("a"))
        return item
--------------------------------------------------------------------------------
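Finally, a hedged sketch of exercising parse_item_page offline with a canned response instead of a live crawl; the HTML fragment is made up to match the XPaths above:

from scrapy.http import HtmlResponse, Request

from craigslist.spiders.CraigSpyder import MySpider, CraigslistItem

# Minimal fake posting page: map div, attribute group, and body
body = b"""<div id="map" data-latitude="47.61" data-longitude="-122.33"></div>
<p class="attrgroup"><span><b>2</b>BR / <b>1</b>Ba</span> <span><b>800</b>ft</span></p>
<section id="postingbody">Nice place.</section>"""

request = Request("http://seattle.craigslist.org/apa/123.html")
request.meta["item"] = CraigslistItem(craigId=123, title="t", link="/apa/123.html",
                                      price="1500", date="Mar 7")
response = HtmlResponse(request.url, body=body, encoding="utf-8", request=request)

item = MySpider().parse_item_page(response)
print(item["latitude"], item["baths"], item["size"])   # 47.61 1.0 800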