└── spider.py


/spider.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import os
import sys
import requests
import datetime
import traceback
import ConfigParser
import time
import random

reload(sys)
sys.setdefaultencoding('utf-8')


class HousingStockSpider():
    BASE_URL = 'http://www.tjfdc.com.cn/pages/xwzw/clfbfqk.aspx?SelMnu=CLFBFQK&fid='
    AJAX_URL = 'http://www.tjfdc.com.cn/pages/xwzw/Data/clfbfqkHandler.ashx'
    DATA_PATH = os.path.join(os.path.dirname(__file__), 'data')
    CONFIG_FILE = os.path.join(os.path.dirname(__file__), 'config.ini')
    DATA_FILE = os.path.join(os.path.dirname(__file__), 'total_data.csv')
    default_headers = {
        'Host': 'www.tjfdc.com.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate',
        'Referer': 'http://www.tjfdc.com.cn/Pages/xwzw.aspx',
    }
    ajax_headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Host': 'www.tjfdc.com.cn',
        'Origin': 'http://www.tjfdc.com.cn',
        'Referer': 'http://www.tjfdc.com.cn/pages/xwzw/clfbfqk.aspx?SelMnu=CLFBFQK',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    default_query_dict = {
        'selmnu': 'CLFBFQK',
        'rows': 1,
        'dt': u'Wed May 03 2017 22:56:04 GMT 0800 (中国标准时间)',
        'gRow_': 20,
        'gPage_': 1,
        'txtDate': '2017-05-03',
    }
    default_config_dict = {
        'start_date': '1967-01-01',
    }

    def __init__(self):
        if not os.path.exists(HousingStockSpider.DATA_PATH):
            print u'正在创建数据文件夹...'
            os.mkdir(HousingStockSpider.DATA_PATH)
        try:
            resp = requests.get(HousingStockSpider.BASE_URL, headers=HousingStockSpider.default_headers)
            if resp.status_code == 200:
                print u'初始化成功'
        except Exception:
            traceback.print_exc()
            print u'初始化失败,请截图联系作者'

    def get_data_by_date(self, date_str):
        # Initialise the query parameters for this date
        gPage_ = 1
        gRow_ = 20
        current_dt = datetime.datetime.now()
        dt = current_dt.strftime(u'%a %b %d %Y %H:%M:%S GMT 0800 (中国标准时间)')
        txtDate = date_str
        HousingStockSpider.default_query_dict['gPage_'] = gPage_
        HousingStockSpider.default_query_dict['gRow_'] = gRow_
        HousingStockSpider.default_query_dict['dt'] = dt
        HousingStockSpider.default_query_dict['txtDate'] = txtDate
        rows_list = []
        try:
            # First request: ask for one row just to learn the total record count.
            resp = requests.get(url=HousingStockSpider.AJAX_URL, params=HousingStockSpider.default_query_dict,
                                headers=HousingStockSpider.ajax_headers)
            json_data = resp.json()
            # Response shape: {"total": N, "rows": [{"ID": "1", "cell": ["1", "32470-001978", "宝坻区宝鑫景苑东湖园1-2-10"]}, ...]}
            total = json_data['total']
            HousingStockSpider.default_query_dict['gRow_'] = total  # request everything in one page to speed things up
            # Second request: fetch all rows for the day
            resp = requests.get(url=HousingStockSpider.AJAX_URL, params=HousingStockSpider.default_query_dict,
                                headers=HousingStockSpider.ajax_headers)
            json_data = resp.json()
            rows = json_data['rows']
            for row in rows:
                items = list()
                items.append(date_str)
                items.append(row['cell'][1])
                items.append(row['cell'][2])
                rows_list.append(items)
            print u'成功抓取{}的数据,共{}条,成功{}条'.format(date_str, total, len(rows_list))
        except Exception:
            traceback.print_exc()
            print u'抓取失败,请截图联系作者'
        return rows_list

    def auto_fetch(self):
        # Regenerate the config file if it is missing
        if not os.path.exists(HousingStockSpider.CONFIG_FILE):
            print u'正在重新生成配置文件...'
            with open(HousingStockSpider.CONFIG_FILE, 'w') as f:
                f.write('[default]' + os.linesep)
                for key, value in HousingStockSpider.default_config_dict.iteritems():
                    f.write(('%s = %s' + os.linesep) % (key, value))
        # Read the crawl start date from the config file
        cf = ConfigParser.ConfigParser()
        cf.read(HousingStockSpider.CONFIG_FILE)
        try:
            start_date_str = cf.get('default', 'start_date')
        except Exception:
            print u'正在重新生成配置文件...'
            with open(HousingStockSpider.CONFIG_FILE, 'w') as f:
                f.write('[default]' + os.linesep)
                for key, value in HousingStockSpider.default_config_dict.iteritems():
                    f.write(('%s = %s' + os.linesep) % (key, value))
            cf = ConfigParser.ConfigParser()
            cf.read(HousingStockSpider.CONFIG_FILE)
            start_date_str = cf.get('default', 'start_date')
        # Current date (upper bound of the crawl)
        curr_date = datetime.datetime.now()
        # Parse the start date, falling back to the default on a malformed config
        try:
            start_date = datetime.datetime.strptime(start_date_str, '%Y-%m-%d')
        except Exception:
            print u'配置文件格式不对,应用默认配置'
            start_date = datetime.datetime.strptime(HousingStockSpider.default_config_dict['start_date'], '%Y-%m-%d')
        # Crawl one day at a time
        while start_date <= curr_date:
            date_str = start_date.strftime('%Y-%m-%d')
            data_list = self.get_data_by_date(date_str)
            if data_list:
                # Write a per-day CSV file
                with open(os.path.join(HousingStockSpider.DATA_PATH, '{}.csv'.format(date_str)), 'w') as f:
                    for data in data_list:
                        line = ','.join(data)
                        f.write(line + os.linesep)
                # Append to the aggregate CSV file
                with open(HousingStockSpider.DATA_FILE, 'a') as f:
                    for data in data_list:
                        line = ','.join(data)
                        f.write(line + os.linesep)
            # Random pause between days to avoid hammering the server
            sleep_time = random.randint(1, 5)
            print u'休眠{}s...'.format(sleep_time)
            time.sleep(sleep_time)
            # Advance to the next day
            start_date += datetime.timedelta(days=1)
            # Persist the progress so an interrupted run can resume
            cf.set('default', 'start_date', start_date.strftime('%Y-%m-%d'))
            with open(HousingStockSpider.CONFIG_FILE, 'w') as f:
                cf.write(f)


if __name__ == '__main__':
    spider = HousingStockSpider()
    spider.auto_fetch()
--------------------------------------------------------------------------------
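
Usage note: a minimal sketch, assuming the file above is saved as spider.py in a Python 2 environment with the requests package installed. It pulls a single day through get_data_by_date and prints the rows instead of running the full auto_fetch backfill; the one_day.py name is only illustrative.

    # -*- coding: utf-8 -*-
    # one_day.py -- illustrative sketch, not part of the original repository
    from spider import HousingStockSpider

    spider = HousingStockSpider()                    # warm-up GET against BASE_URL
    rows = spider.get_data_by_date('2017-05-03')     # each row is [date_str, cell[1], cell[2]]
    print 'fetched %d rows' % len(rows)
    for row in rows:
        print ','.join(row).encode('utf-8')          # same CSV layout that auto_fetch writes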