├── craigslist
│   ├── __init__.py
│   ├── models.pyc
│   ├── __init__.pyc
│   ├── settings.pyc
│   ├── pipelines.pyc
│   ├── spiders
│   │   ├── __init__.pyc
│   │   ├── CraigSpyder.pyc
│   │   ├── __init__.py
│   │   └── CraigSpyder.py
│   ├── items.py
│   ├── settings.py
│   ├── models.py
│   └── pipelines.py
├── ZipCodeWashington.xlsx
├── README.md
├── ZipCodeSeattleOnly.csv
├── DataFormatting.py
└── CraigAnalysis.R

/craigslist/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/craigslist/models.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/models.pyc
--------------------------------------------------------------------------------
/ZipCodeWashington.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/ZipCodeWashington.xlsx
--------------------------------------------------------------------------------
/craigslist/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/__init__.pyc
--------------------------------------------------------------------------------
/craigslist/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/settings.pyc
--------------------------------------------------------------------------------
/craigslist/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/pipelines.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/spiders/__init__.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/CraigSpyder.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jayfeng1/Craigslist-Pricing-Project/HEAD/craigslist/spiders/CraigSpyder.pyc
--------------------------------------------------------------------------------
/craigslist/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
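As the package comment says, the project's spiders live in this directory. For orientation, a minimal sketch of the shape a spider here takes (the spider name and selector are illustrative placeholders, not part of this project; the real spider is CraigSpyder.py further down):

import scrapy

class ExampleSpider(scrapy.Spider):
    # "name" is what you pass to `scrapy crawl`
    name = "example"
    start_urls = ["http://seattle.craigslist.org/search/apa"]

    def parse(self, response):
        # yield one item (or a follow-up Request) per listing found
        for title in response.xpath("//p//a/text()").extract():
            yield {"title": title}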
/craigslist/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class CraigslistItem(Item):
    title = Field()
    link = Field()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Craigslist-Pricing-Project
==========================

1. Enter the directory where you want to store the scraper, then in CMD run:

       scrapy startproject craigslist

2. Change the base URL to the specific Craigslist apartment search link, e.g. http://sfbay.craigslist.org/search/apa for San Francisco.

3. To run the scraper, type in the project directory:

       scrapy crawl craig -o items.csv -t csv

For a more in-depth guide to customizing the scraper, see the blog post:

https://racketracer.wordpress.com/2015/01/29/practical-scraping-using-scrapy/

If you don't want to store the data in a PostgreSQL database, replace pipelines.py with an empty file.

--------------------------------------------------------------------------------
/ZipCodeSeattleOnly.csv:
--------------------------------------------------------------------------------
zipcode,latitude,longitude
98101,47.61067,-122.33438
98102,47.63287,-122.32253
98103,47.671346,-122.34166
98104,47.60252,-122.32855
98105,47.66377,-122.30118
98106,47.53282,-122.35443
98107,47.66747,-122.37468
98109,47.630648,-122.34675
98112,47.629653,-122.29752
98115,47.68382,-122.30122
98116,47.57487,-122.39392
98117,47.685919,-122.37838
98118,47.543348,-122.27496
98119,47.63877,-122.36694
98121,47.61487,-122.34578
98122,47.61157,-122.30406
98125,47.716648,-122.30308
98126,47.54687,-122.3748
98133,47.733852,-122.34461
98134,47.57867,-122.33441
98136,47.538887,-122.38803
98144,47.58577,-122.30081
98154,47.60632,-122.33357
98174,47.604718,-122.33523
98177,47.740886,-122.36978
98199,47.64767,-122.39758
--------------------------------------------------------------------------------
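The README suggests swapping in an empty pipelines.py to skip PostgreSQL; an alternative sketch, assuming the dict form of ITEM_PIPELINES that newer Scrapy releases expect, is to clear the setting in settings.py (shown next) instead:

# Hypothetical alternative to emptying pipelines.py: with no pipelines
# registered, items go straight to the feed exporter
# (scrapy crawl craig -o items.csv -t csv) and never touch the database.
ITEM_PIPELINES = {}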
/craigslist/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for craigslist project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'craigslist'

SPIDER_MODULES = ['craigslist.spiders']
NEWSPIDER_MODULE = 'craigslist.spiders'
# Dict form: the value is the pipeline's order (lower runs first)
ITEM_PIPELINES = {'craigslist.pipelines.CraigslistPipeline': 300}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'craigslist (+http://www.yourdomain.com)'

# Consumed by models.db_connect() via sqlalchemy.engine.url.URL
DATABASE = {
    'drivername': 'postgres',
    'host': 'localhost',
    'port': '5432',
    'username': 'postgres',
    'password': 'Project2015',
    'database': 'craigslist'
}
--------------------------------------------------------------------------------
/craigslist/models.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 07 18:55:53 2015

@author: Jay
"""

#models.py

from sqlalchemy import create_engine, Column, Integer, String, DateTime, Float, BigInteger
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import settings

DeclarativeBase = declarative_base()
# <--snip-->
class Apts(DeclarativeBase):
    __tablename__ = "seattle"

    craigId = Column('craigId', BigInteger, primary_key=True)
    title = Column('title', String)
    link = Column('link', String, nullable=True)
    price = Column('price', Integer, nullable=False)
    #area = Column('area', String, nullable=True)
    beds = Column('beds', Integer, nullable=False)
    size = Column('size', Integer, nullable=False)
    date = Column('date', String, nullable=False)
    numPic = Column('numPic', Integer, nullable=True)
    postDate = Column('postDate', DateTime, nullable=False)
    updateDate = Column('updateDate', DateTime, nullable=False)
    reposts = Column('reposts', Integer, nullable=False)
    contentLen = Column('contentLen', Integer, nullable=False)
    baths = Column('baths', Float, nullable=False)
    latitude = Column('latitude', Float(precision=8), nullable=False)
    longitude = Column('longitude', Float(precision=8), nullable=False)
    zipcode = Column('zipcode', String, nullable=False)


def create_deals_table(engine):
    """Create the table if it does not already exist."""
    DeclarativeBase.metadata.create_all(engine)


def db_connect():
    """Build an engine from the DATABASE dict in settings.py."""
    return create_engine(URL(**settings.DATABASE))
--------------------------------------------------------------------------------
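A usage sketch for models.py, assuming the Postgres instance from settings.py is reachable; every field value below is a made-up placeholder:

from datetime import datetime

from sqlalchemy.orm import sessionmaker

from models import Apts, db_connect, create_deals_table

engine = db_connect()              # engine built from settings.DATABASE
create_deals_table(engine)         # no-op if the "seattle" table exists
session = sessionmaker(bind=engine)()

# Placeholder row, inserted by hand rather than via the spider
row = Apts(craigId=1234567890, title="Sample 1BR", link="/apa/123.html",
           price=1500, beds=1, size=650, date="Mar 7", numPic=4,
           postDate=datetime(2015, 3, 7, 12, 0),
           updateDate=datetime(2015, 3, 7, 12, 0),
           reposts=0, contentLen=3, baths=1.0,
           latitude=47.61067, longitude=-122.33438, zipcode="98101")
session.add(row)
session.commit()
session.close()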
/DataFormatting.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 31 13:51:04 2014

@author: Jay
"""

import numpy as np
import pandas as pd

#TODO Check for duplicates
# Check for ID duplicates


def findZip(address):
    # Return the first 5-character token of a split address (the zip code)
    for s in address:
        if len(s) == 5:
            print(s)
            return s


def findDist(lat, lon, dataLat, dataLon):
    # L1 (Manhattan) distance in degrees
    return abs(lat - dataLat) + abs(lon - dataLon)


zips = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/ZipCodeSeattle.csv"
zipSeattle = pd.read_csv(zips)
data = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/craigSeattle.csv"
craig = pd.read_csv(data)
craig['zipcode'] = np.nan

# Tag each listing with the zip code whose centroid is closest
for i in range(len(craig.coord)):
    valMin = 10
    for j in range(len(zipSeattle.latitude)):
        temp = findDist(craig.latitude[i], craig.longitude[i],
                        zipSeattle.latitude[j], zipSeattle.longitude[j])
        if temp < valMin:
            valMin = temp
            craig.loc[i, 'zipcode'] = zipSeattle.zipcode[j]

# Drop implausible listings
craig = craig[craig.price < 15000]
craig = craig[craig.price > 100]
craig = craig[craig['size'] < 10000]   # note: craig.size is the DataFrame attribute, not the column
craig['baths'] = craig['baths'].astype(float)
craig = craig[craig.baths < 10]

#craig = craig[craig.beds > 0]
#craig = craig[craig['size'] > 0]
craig = craig[craig.zipcode > 0]
#df.loc[0] = pd.Series(item)
#Extra Clean

craig.to_csv("C:/Users/Jay/Dropbox/Coding Projects/craigslist/craig11_15Formatted.csv")

#Splicing
#sep = "["
#for i in range(0, len(neigh['NeighSubArea'])):
#    if sep in neigh['NeighSubArea'][i]:
#        neigh['NeighSubArea'][i] = neigh['NeighSubArea'][i].split(sep, 1)[0]

#if key returns value
#use value as new key
#recursive
#Seattle neighborhood 5-count

#Code for GeoPy Google API
#for i in range(0, len(craig.coord)-1):
#    location = geolocator.reverse(craig.coord[i]).address
#    if location is not None:
#        zipcode = findZip(location.split(", "))
#        craig.zipcode[i] = zipcode
#        time.sleep(1)
#    print(i)
--------------------------------------------------------------------------------
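DataFormatting.py above and pipelines.py below share the same nearest-centroid idea: tag a listing with the zip code whose centroid minimizes an L1 (Manhattan) distance in degrees; crude, but adequate at city scale. A worked example using two centroids from ZipCodeSeattleOnly.csv:

def find_dist(lat, lon, data_lat, data_lon):
    # L1 distance in degrees -- a rough proxy for "closest centroid"
    return abs(lat - data_lat) + abs(lon - data_lon)

# A hypothetical listing near Pike Place (47.609, -122.342):
print(find_dist(47.609, -122.342, 47.61067, -122.33438))  # 98101 -> ~0.009
print(find_dist(47.609, -122.342, 47.63287, -122.32253))  # 98102 -> ~0.043
# 98101 has the smaller distance, so the listing is tagged zipcode 98101.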
/craigslist/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

'''
DOCUMENTATION

'''
import csv

from sqlalchemy.orm import sessionmaker

from models import Apts, db_connect, create_deals_table


class CraigslistPipeline(object):

    def __init__(self):
        """
        Initializes database connection and sessionmaker.
        Creates deals table and loads the zip-code centroid lookup.
        """
        engine = db_connect()
        create_deals_table(engine)
        self.Session = sessionmaker(bind=engine)
        self.zipDict = self.createZip()

    def createZip(self):
        # Load centroids into {zipcode: {'latitude': ..., 'longitude': ...}}
        zips = "C:/Users/Jay/Dropbox/Coding Projects/craigslist/ZipCodeSeattleOnly.csv"
        with open(zips) as csvfile:
            reader = csv.DictReader(csvfile)
            zipDict = {}
            for row in reader:
                zipDict[row['zipcode']] = {'latitude': row['latitude'],
                                           'longitude': row['longitude']}
        return zipDict

    def findZip(self, zipDict, item):
        # Return the zip code whose centroid is closest to the listing
        valMin = 10
        zip1 = ''
        for code in zipDict:
            temp = self.findDist(float(zipDict[code]['latitude']),
                                 float(zipDict[code]['longitude']),
                                 item['latitude'], item['longitude'])
            if temp < valMin:
                valMin = temp
                zip1 = code
        return zip1

    def findDist(self, lat, lon, dataLat, dataLon):
        # L1 (Manhattan) distance in degrees
        return abs(lat - dataLat) + abs(lon - dataLon)

    def process_item(self, item, spider):
        """Save deals in the database.

        This method is called for every item pipeline component.
        """
        session = self.Session()
        item['zipcode'] = self.findZip(self.zipDict, item)
        #TODO: change to below if after a week or two
        #if item['reposts'] == 1:
        old = session.query(Apts).filter(Apts.craigId == item['craigId']).first()
        try:
            if old is not None:
                # Seen before: bump the repost counter instead of re-inserting
                old.reposts += 1
            else:
                session.add(Apts(**item))
            session.commit()
        except:
            session.rollback()
            raise
        finally:
            session.close()

        return item
--------------------------------------------------------------------------------
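To get from the Postgres table to the CSV that CraigAnalysis.R below reads, one option (a sketch, assuming the filename used in the R script) is pandas' read_sql over the same engine:

import pandas as pd

from models import db_connect

engine = db_connect()
craig = pd.read_sql('SELECT * FROM seattle', engine)
craig.to_csv("craigTest1.csv", index=False)   # filename read by CraigAnalysis.R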
/CraigAnalysis.R:
--------------------------------------------------------------------------------
library(ggplot2)
library(DAAG)   # for cv.lm

craig = read.csv("C:/Users/Jay/Dropbox/Coding Projects/craigslist/craigTest1.csv")
craig$zipcode = as.factor(craig$zipcode)
lm1 = lm(price ~ beds + size, data = craig)
lm2 = lm(price ~ beds + size + zipcode, data = craig)
lm3 = lm(price ~ beds + size + zipcode + numPic, data = craig)

#Test example prediction (lm1 only needs beds and size)
predict(lm1, newdata = data.frame(beds = 3, size = 1560), type='response')

craig$zipcode = factor(craig$zipcode)
table(craig$zipcode)
craiglm = lm(price ~ beds + size + numPic + baths + zipcode, data = craig)
control = lm(price ~ beds + size, data = craig)
craigLm1 = lm(price ~ beds + size + zipcode + numPic, data = craig1000)
craigLm2 = lm(price ~ beds + size + zipcode, data = craig1000)
craigLm3 = lm(price ~ beds + size + zipcode*numPic, data = craig1000)
craigLm4 = lm(price ~ zipcode + beds*size, data = craig1000)
#Price is 2700
predict(craiglm, newdata = data.frame(beds = 2, size = 1200, contentLen = 5, numPic = 7, baths = 2, zipcode = "98101"), type='response')
#Price is 4500
predict(craigLm1, newdata = data.frame(beds = 2, size = 1430, numPic = 11, zipcode = "98102"), type='response')
#Price is 1950
predict(craigLm1, newdata = data.frame(beds = 3, size = 1320, numPic = 15, zipcode = "98108"), type='response')

listNum = (tail(sort(table(craigClean$zipcode)), 40))
craigClean = craigClean[craigClean$zipcode %in% row.names(listNum),]

SeattleCode <- c("98177", "98133", "98125", "98117", "98103", "98115",
                 "98107", "98105", "98199", "98119", "98109", "98102",
                 "98112", "98121", "98101", "98122", "98104", "98134",
                 "98144", "98116", "98126", "98136", "98106", "98108",
                 "98118", "98154", "98164", "98174")

craigSeattle = craig[craig$zipcode %in% SeattleCode,]
craigSeattle$zipcode = factor(craigSeattle$zipcode)
table(craigSeattle$zipcode)
craigSeattlelm = lm(price ~ beds + size + numPic + baths + zipcode + contentLen, data = craigSeattle)
craigStand = lm(price ~ beds + size + baths + zipcode, data = craigSeattle)
summary(craigSeattlelm)
plot(density(resid(craigSeattlelm)))
qqnorm(resid(craigSeattlelm))

lm1CV = cv.lm(df = craigClean, form.lm = craigLm1, m = 2)

#Visualization

craigplot1 = craigSeattle[craigSeattle$price < 5500, ]
#Freedman-Diaconis bin width
bw <- diff(range(craigplot1$price)) / (2 * IQR(craigplot1$price) / length(craigplot1$price)^(1/3))
ggplot() + geom_histogram(aes(craigplot1$price), binwidth = bw)

# Aggregate beds together and calculate means
craigGrouping = aggregate(craigSeattle, list(craigSeattle$beds), mean)
craigGrouping = craigGrouping[craigGrouping$Group.1 < 5, ]
ggplot(data=craigGrouping, aes(x=Group.1, y=price, fill=Group.1)) +
  geom_bar(colour="black", stat="identity") +
  guides(fill=FALSE) + xlab("Number of Bedrooms") + ylab("Price") +
  ggtitle("Average Price Per Number of Bedrooms") +
  ylim(0, 5000)

#Aggregate beds together and calculate medians
craigGrouping = craigGrouping[,c("beds","price")]
craigGrouping = aggregate(craigGrouping, list(craigGrouping$beds), median)
ggplot(data=craigplot3, aes(x=beds, y=price, fill=city)) +
  geom_bar(stat="identity", position='dodge', colour="black") +
  scale_fill_manual(values=c("#999999", "#E69F00")) +
  geom_text(aes(label=price, y = price + 300))
--------------------------------------------------------------------------------
/craigslist/spiders/CraigSpyder.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 17 15:02:19 2014

@author: Jay
"""

import scrapy
#import pandas as pd
#scrapy crawl craig -o items.csv -t csv


#Item class with listed fields to scrape
class CraigslistItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    #area = scrapy.Field()
    beds = scrapy.Field()
    size = scrapy.Field()
    craigId = scrapy.Field()
    numPic = scrapy.Field()
    postDate = scrapy.Field()
    updateDate = scrapy.Field()
    baths = scrapy.Field()
    latitude = scrapy.Field()
    longitude = scrapy.Field()
    contentLen = scrapy.Field()
    reposts = scrapy.Field()
    zipcode = scrapy.Field()


class MySpider(scrapy.Spider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    #Base url for Seattle apartment rentals. Change if necessary
    base_url = "http://seattle.craigslist.org/search/see/apa?"
    start_urls = ["http://seattle.craigslist.org/search/see/apa?"]
    #Craigslist pages results 100 at a time (capped at 2400 total);
    #append the paginated urls (s=100, s=200, ...) to the start list
    for i in range(1, 5):
        start_urls.append(base_url + "s=" + str(i) + "00&")

#    def __init__(self):
#        global df
#        test = CraigslistItem()
#        test = self.initialize(test)
#        df = pd.DataFrame(columns = list(test.keys()), index=xrange(0,2400))

    def parse(self, response):
        #find all postings
        postings = response.xpath(".//p")
        #loop through the postings
        for i in range(len(postings)):
            item = CraigslistItem()
            #grab craigslist apartment listing ID
            item["craigId"] = int(''.join(postings[i].xpath("@data-pid").extract()))
            temp = postings[i].xpath("span[@class='txt']")
            info = temp.xpath("span[@class='pl']")
            #title of posting
            item["title"] = ''.join(info.xpath("a/text()").extract())
            #date of posting
            item["date"] = ''.join(info.xpath("time/text()").extract())
            #pre-processing for getting the price in the right format
            price = ''.join(temp.xpath("span")[2].xpath("span[@class='price']").xpath("text()").extract())
            #item["area"] = ''.join(temp.xpath("span")[2].xpath("span[@class='pnr']").xpath("small/text()").extract())
            item["price"] = price.replace("$", "")
            item["link"] = ''.join(info.xpath("a/@href").extract())
            follow = "http://seattle.craigslist.org" + item["link"]
            #Follow the posting link into the actual post
            request = scrapy.Request(follow, callback=self.parse_item_page)
            request.meta['item'] = item
            #self.df.loc[i] = pd.Series(item)
            yield request

    #Parsing method to grab items from inside the individual postings
    def parse_item_page(self, response):
        #import pdb; pdb.set_trace()
        item = response.meta["item"]
        maplocation = response.xpath("//div[contains(@id,'map')]")
        latitude = ''.join(maplocation.xpath('@data-latitude').extract())
        longitude = ''.join(maplocation.xpath('@data-longitude').extract())
        if latitude:
            item['latitude'] = float(latitude)
        if longitude:
            item['longitude'] = float(longitude)
        attr = response.xpath("//p[@class='attrgroup']")
        try:
            item["beds"] = int(attr.xpath("span/b/text()")[0].extract())
            bath = attr.xpath("span/b/text()")[1].extract()
            item["size"] = int(''.join(attr.xpath("span")[1].xpath("b/text()").extract()))
            item["baths"] = float(bath)
        except (IndexError, ValueError):
            pass
        item["contentLen"] = len(response.xpath("//section[@id='postingbody']").xpath("text()").extract())
        postinginfo = response.xpath("//p[@class = 'postinginfo reveal']").xpath("time/@datetime")
        times = postinginfo.extract()
        #first <time> is the posting date; the last one, the most recent update
        item["postDate"] = times[0] if times else ''
        item["updateDate"] = times[-1] if times else ''
        if item["updateDate"] != item["postDate"]:
            item["reposts"] = 1
        else:
            item["reposts"] = 0
        item["numPic"] = len(response.xpath("//div[@id='thumbs']").xpath("a"))
        return item
--------------------------------------------------------------------------------
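Finally, a hedged sketch of exercising parse_item_page offline with a canned response instead of a live crawl; the HTML fragment is made up to match the XPaths above:

from scrapy.http import HtmlResponse, Request

from craigslist.spiders.CraigSpyder import MySpider, CraigslistItem

# Minimal fake posting page: map div, attribute group, and body
body = b"""<div id="map" data-latitude="47.61" data-longitude="-122.33"></div>
<p class="attrgroup"><span><b>2</b>BR / <b>1</b>Ba</span> <span><b>800</b>ft</span></p>
<section id="postingbody">Nice place.</section>"""

request = Request("http://seattle.craigslist.org/apa/123.html")
request.meta["item"] = CraigslistItem(craigId=123, title="t", link="/apa/123.html",
                                      price="1500", date="Mar 7")
response = HtmlResponse(request.url, body=body, encoding="utf-8", request=request)

item = MySpider().parse_item_page(response)
print(item["latitude"], item["baths"], item["size"])   # 47.61 1.0 800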