├── .gitignore ├── digits └── 0fcc0d83409c547d3a9d038cc7808fa3s.png ├── get_training_data.py ├── label ├── 0.png ├── 1.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png └── 9.png ├── main.py ├── rental.py └── ziroom.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | *.o 6 | 7 | # Packages 8 | #*.egg 9 | *.egg-info 10 | dist 11 | build 12 | eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | logs 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | 38 | # virtual env 39 | .venv 40 | 41 | # Editors 42 | *.swp 43 | .ropeproject 44 | 45 | dataviz/data/ 46 | dlib/shape_predictor_68_face_landmarks.dat 47 | -------------------------------------------------------------------------------- /digits/0fcc0d83409c547d3a9d038cc7808fa3s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/digits/0fcc0d83409c547d3a9d038cc7808fa3s.png -------------------------------------------------------------------------------- /get_training_data.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import glob 3 | import sys 4 | import time 5 | import os 6 | import json 7 | import re 8 | from lxml import etree 9 | import requests 10 | import numpy as np 11 | import cv2 12 | 13 | headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"} 14 | FOLDER = "digits" 15 | 16 | def fix_url(url): 17 | if re.match(r'//', url): 18 | url = 'http:{}'.format(url) 19 | return url 20 | 21 | def save_pic(url): 22 | print(url) 23 | r = requests.get(url=url, headers=headers, verify=False) 24 | filename = url.split("/")[-1] 25 | filepath = os.path.join(FOLDER, filename) 26 | with open(filepath, "wb") as f: 27 | f.write(r.content) 28 | 29 | def get_pic_url(url): 30 | r = requests.get(url, headers=headers, verify=False) 31 | content = r.content.decode("utf-8") 32 | matched = re.search(r'var ROOM_PRICE = (.*);', content) 33 | price_json = matched.group(1) 34 | data = json.loads(price_json) 35 | image_url = data["image"] 36 | if re.match(r'//', image_url): 37 | image_url = fix_url(image_url) 38 | return image_url 39 | 40 | def label_pic(filepath): 41 | im = cv2.imread(filepath) 42 | imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 43 | ret, thresh = cv2.threshold(imgray, 127, 255, 0) 44 | im2, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 45 | for contour in contours: 46 | [x, y, w, h] = cv2.boundingRect(contour) 47 | roi = imgray[y:y+h, x:x+w] 48 | roismall = cv2.resize(roi, (30, 30)) 49 | cv2.imshow("small", roismall) 50 | key = cv2.waitKey(0) 51 | if key == 27: 52 | sys.exit() 53 | 54 | digit = int(chr(key)) 55 | outname = "{}.png".format(digit) 56 | outpath = os.path.join("label", outname) 57 | cv2.imwrite(outpath, roismall) 58 | 59 | def label_data(): 60 | pics = os.listdir(FOLDER) 61 | for pic in pics: 62 | filename = pic.split(".")[0] 63 | patt = "label/{}_*".format(filename) 64 | saved_digits = glob.glob(patt) 65 | 66 | if len(saved_digits) == 10: 67 | print("{} done".format(patt)) 68 | continue 69 | filepath = os.path.join(FOLDER, pic) 70 | label_pic(filepath) 71 | 72 | def load_data(): 73 | pics = os.listdir("label") 74 | samples = np.empty((0, 900)) 75 | labels = [] 76 | for pic in pics: 77 | filepath = os.path.join("label", pic) 78 | label = int(pic.split(".")[0].split("_")[-1]) 79 | labels.append(label) 80 | im = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE) 81 | sample = im.reshape((1, 900)).astype(np.float32) 82 | samples = np.append(samples, sample, 0) 83 | labels = np.array(labels).reshape((-1, 1)).astype(np.float32) 84 | return [samples, labels] 85 | 86 | def recog_num(im): 87 | [samples, labels] = load_data() 88 | samples = samples.astype(np.float32) 89 | lables = labels.astype(np.float32) 90 | model = cv2.ml.KNearest_create() 91 | print(samples.dtype, labels.dtype) 92 | model.train(samples, cv2.ml.ROW_SAMPLE, labels) 93 | imgray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY) 94 | ret, thresh = cv2.threshold(imgray, 127, 255, 0) 95 | im2, contours, hierarchy = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) 96 | for contour in contours[::-1]: 97 | [x, y, w, h] = cv2.boundingRect(contour) 98 | roi = imgray[y:y+h, x:x+w] 99 | roismall = cv2.resize(roi, (30, 30)) 100 | sample = roismall.reshape((1, 900)).astype(np.float32) 101 | ret, results, neighbours, distances = model.findNearest(sample, k = 1) 102 | print(int(results[0,0])) 103 | 104 | 105 | if __name__ == "__main__": 106 | im = cv2.imread("e72ac241b410eac63a652dc1349521fds.png") 107 | recog_num(im) 108 | -------------------------------------------------------------------------------- /label/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/0.png -------------------------------------------------------------------------------- /label/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/1.png -------------------------------------------------------------------------------- /label/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/2.png -------------------------------------------------------------------------------- /label/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/3.png -------------------------------------------------------------------------------- /label/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/4.png -------------------------------------------------------------------------------- /label/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/5.png -------------------------------------------------------------------------------- /label/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/6.png -------------------------------------------------------------------------------- /label/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/7.png -------------------------------------------------------------------------------- /label/8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/8.png -------------------------------------------------------------------------------- /label/9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pythonml/house/dee19eee0a67d13dcfcfce564680912f0c878416/label/9.png -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 3 | 4 | import math 5 | import re 6 | from lxml import etree 7 | import time 8 | import json 9 | import pymongo 10 | import requests 11 | 12 | DB = "guangzhou" 13 | base_url = "https://gz.ke.com" 14 | 15 | def get_disctricts(): 16 | url = base_url + "/ershoufang/" 17 | r = requests.get(url, verify=False) 18 | content = r.content.decode("utf-8") 19 | root = etree.HTML(content) 20 | distr_nodes = root.xpath('.//div[@class="m-filter"]//div[@data-role="ershoufang"]/div/a') 21 | result = [] 22 | for node in distr_nodes: 23 | rel_url = node.attrib["href"] 24 | distr_url = "" 25 | if re.match(r'https://', rel_url): 26 | distr_url = rel_url 27 | else: 28 | distr_url = base_url + rel_url 29 | distr_name = node.text 30 | result.append([distr_name, distr_url]) 31 | return result 32 | 33 | def get_sub_districts(): 34 | districts = get_disctricts() 35 | result = [] 36 | client = pymongo.MongoClient() 37 | db = client[DB] 38 | for item in districts: 39 | distr_name = item[0] 40 | distr_url = item[1] 41 | r = requests.get(distr_url, verify=False) 42 | content = r.content.decode("utf-8") 43 | root = etree.HTML(content) 44 | subdistr_nodes = root.xpath('.//div[@class="m-filter"]//div[@data-role="ershoufang"]/div')[1].xpath('./a') 45 | for node in subdistr_nodes: 46 | sub_distr_name = node.text 47 | sub_distr_url = base_url + node.attrib["href"] 48 | db.sub_districts.insert_one({ 49 | "district": distr_name, 50 | "sub_district": sub_distr_name, 51 | "url": sub_distr_url, 52 | }) 53 | 54 | def get_item_num(entry_url): 55 | r = requests.get(entry_url, verify=False) 56 | content = r.content.decode("utf-8") 57 | root = etree.HTML(content) 58 | num_nodes = root.xpath('.//div[@class="content "]//h2[contains(@class, "total")]/span') 59 | if len(num_nodes) == 0: 60 | raise Exception("no total number for {}".format(entry_url)) 61 | num_str = num_nodes[0].text.strip() 62 | return int(num_str) 63 | 64 | def get_houses_by_sub_district(sub_distr_id, entry_url): 65 | url_patt = entry_url + "pg{}/" 66 | 67 | total_num = get_item_num(entry_url) 68 | last_page = math.ceil(total_num/30) 69 | i = 1 70 | client = pymongo.MongoClient() 71 | db = client[DB] 72 | for i in range(1, last_page+1, 1): 73 | url = url_patt.format(i) 74 | r = requests.get(url, verify=False) 75 | content = r.content.decode("utf-8") 76 | root = etree.HTML(content) 77 | content_node = root.find('.//div[@class="content "]') 78 | if content_node is None: 79 | print(url) 80 | r = requests.get(url, verify=False) 81 | content = r.content.decode("utf-8") 82 | root = etree.HTML(content) 83 | ul_node = root.find('.//div[@class="content "]') 84 | 85 | ul_node = root.find('.//ul[@class="sellListContent"]') 86 | div_info = ul_node.xpath('.//div[contains(@class, "info")]') 87 | for div_node in div_info: 88 | title_nodes = div_node.xpath('./div[@class="title"]/a[contains(@class, "maidian-detail")]') 89 | if len(title_nodes) == 0: 90 | print("title not found") 91 | continue 92 | title_node = title_nodes[0] 93 | title = title_node.text 94 | maidian = title_node.attrib["data-maidian"] 95 | url = title_node.attrib["href"] 96 | 97 | xiaoqu_nodes = div_node.xpath('./div[@class="address"]/div[@class="houseInfo"]/a') 98 | xiaoqu_name = "" 99 | house_info = "" 100 | if len(xiaoqu_nodes) > 0: 101 | xiaoqu_name = xiaoqu_nodes[0].text 102 | house_info = xiaoqu_nodes[0].tail 103 | 104 | pos_nodes = div_node.xpath('./div[@class="flood"]/div[@class="positionInfo"]/span') 105 | building_info = "" 106 | if len(pos_nodes) > 0: 107 | building_info = pos_nodes[0].tail 108 | matched = re.search(r'(.*)\s+-\s+$', building_info) 109 | if matched: 110 | building_info = matched.group(1) 111 | 112 | area_nodes = div_node.xpath('./div[@class="flood"]/div[@class="positionInfo"]/a') 113 | area = "" 114 | if len(area_nodes) > 0: 115 | area_node = area_nodes[0] 116 | area = area_node.text 117 | 118 | follow_nodes = div_node.xpath('./div[@class="followInfo"]/span') 119 | follow_info = "" 120 | if len(follow_nodes) > 0: 121 | follow_node = follow_nodes[0] 122 | follow_info = follow_node.tail 123 | 124 | subway_nodes = div_node.xpath('./div[@class="tag"]/span[@class="subway"]') 125 | subway_info = "" 126 | if len(subway_nodes) > 0: 127 | subway_node = subway_nodes[0] 128 | subway_info = subway_node.text 129 | 130 | tax_nodes = div_node.xpath('./div[@class="tag"]/span[@class="taxfree"]') 131 | tax_info = "" 132 | if len(tax_nodes) > 0: 133 | tax_node = tax_nodes[0] 134 | tax_info = tax_node.text 135 | 136 | price_nodes = div_node.xpath('./div[@class="priceInfo"]/div[@class="totalPrice"]/span') 137 | price_num = 0 138 | price_unit = "" 139 | if len(price_nodes) > 0: 140 | price_node = price_nodes[0] 141 | price_num = price_node.text 142 | price_unit = price_node.tail 143 | 144 | up_nodes = div_node.xpath('./div[@class="priceInfo"]/div[@class="unitPrice"]') 145 | unit_price = 0 146 | if len(up_nodes) > 0: 147 | up_node = up_nodes[0] 148 | unit_price = up_node.attrib["data-price"] 149 | 150 | item = { 151 | "item_id": maidian, 152 | "sub_distr_id": sub_distr_id, 153 | "title": title, 154 | "url": url, 155 | "house_info": house_info, 156 | "xiaoqu_name": xiaoqu_name, 157 | "building_info": building_info, 158 | "area": area, 159 | "follow_info": follow_info, 160 | "subway_info": subway_info, 161 | "tax_info": tax_info, 162 | "price_num": price_num, 163 | "price_unit": price_unit, 164 | "unit_price": unit_price, 165 | } 166 | db.house.insert_one(item) 167 | i += 1 168 | 169 | def get_all_houses(): 170 | client = pymongo.MongoClient() 171 | db = client[DB] 172 | sub_distr_rows = db.sub_districts.find() 173 | start = 1 174 | for sub_distr in sub_distr_rows: 175 | entry_url = sub_distr["url"] 176 | sub_distr_id = sub_distr["_id"] 177 | distr_name = sub_distr["district"] 178 | sub_distr_name = sub_distr["sub_district"] 179 | print(distr_name, sub_distr_name) 180 | #if distr_name == "福田区" and sub_distr_name == "银湖": 181 | # start = 1 182 | if start == 1: 183 | get_houses_by_sub_district(sub_distr_id, entry_url) 184 | 185 | def parse_house_info(house_info): 186 | items = house_info.split("|") 187 | house_type = "apartment" 188 | matched = re.search(r'别墅', items[1]) 189 | info_items = items[1:] 190 | if matched: 191 | info_items = items[2:] 192 | house_type = "house" 193 | 194 | if len(info_items) < 4: 195 | print(house_info) 196 | return {"house_type": "", 197 | "shi_num": -1, 198 | "ting_num": -1, 199 | "size": -1, 200 | "has_lift": -1, 201 | "direction": "", 202 | "decoration": "", 203 | } 204 | room_info = info_items[0] 205 | size_info = info_items[1] 206 | direc_info = info_items[2] 207 | decor_info = info_items[3] 208 | lift_info = "" 209 | if len(info_items) >= 5: 210 | lift_info = info_items[4] 211 | matched = re.search(r'(\d+)室(\d+)厅', room_info) 212 | shi_num = 0 213 | ting_num = 0 214 | if matched: 215 | shi_num = int(matched.group(1)) 216 | ting_num = int(matched.group(2)) 217 | 218 | matched = re.search(r'([.0-9]+)平米', size_info) 219 | size = 0.0 220 | if matched: 221 | size = float(matched.group(1)) 222 | 223 | has_lift = None 224 | if re.search(r'有电梯', lift_info): 225 | has_lift = True 226 | elif re.search(r'无电梯', lift_info): 227 | has_lift = False 228 | result = {"house_type": house_type, 229 | "shi_num": shi_num, 230 | "ting_num": ting_num, 231 | "size": size, 232 | "has_lift": has_lift, 233 | "direction": direc_info, 234 | "decoration": decor_info, 235 | } 236 | return result 237 | 238 | def update_house_info(): 239 | client = pymongo.MongoClient() 240 | db = client[DB] 241 | houses = db.house.find() 242 | for house in houses: 243 | object_id = house["_id"] 244 | price_num = float(house["price_num"]) 245 | unit_price = float(house["unit_price"]) 246 | building_info = house["building_info"] 247 | matched = re.search(r'(\d+)年', building_info) 248 | build_year = 0 249 | if matched: 250 | build_year = int(matched.group(1)) 251 | db.house.update({"_id": house["_id"]}, {"$set": {"price_num": price_num, "unit_price": unit_price, "build_year": build_year}}) 252 | info = parse_house_info(house["house_info"]) 253 | db.house.update({"_id": house["_id"]}, {"$set": info}) 254 | 255 | def stats(): 256 | client = pymongo.MongoClient() 257 | db = client[DB] 258 | 259 | print("=========== most expensive =============") 260 | houses = db.house.aggregate([ 261 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 262 | {"$unwind": "$sub_districts"}, 263 | {"$match": {"sub_districts": {"$ne": []}}}, 264 | {"$sort": {"price_num": -1}}, 265 | {"$limit": 15} 266 | ]) 267 | for house in houses: 268 | print("{},{},{},{},{}".format(house["sub_districts"]["district"], house["title"], house["xiaoqu_name"], house["price_num"], house["unit_price"])) 269 | 270 | print("=========== average house age =============") 271 | houses = db.house.aggregate([ 272 | {"$match": {"build_year": {"$gt": 0}}}, 273 | ]) 274 | total = 0 275 | count = 0 276 | for house in houses: 277 | total += house["build_year"] 278 | count += 1 279 | avg_build_year = total/count 280 | avg_age = 2018 - avg_build_year 281 | print(avg_age) 282 | 283 | print("=========== most expensive xiaoqu in each district =============") 284 | districts = db.sub_districts.aggregate([ 285 | {"$group": {"_id": "$district", "district_name": {"$first": "$district"}, "sub_districts": {"$push": "$_id"}}}, 286 | ]) 287 | for district in districts: 288 | district_name = district["district_name"] 289 | sub_districts = district["sub_districts"] 290 | xiaoqus = db.house.aggregate([ 291 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 292 | {"$unwind": "$sub_districts"}, 293 | {"$match": {"sub_districts.district": district_name}}, 294 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "sub_district": {"$first": "$sub_districts.sub_district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}}, 295 | {"$match": {"count": {"$gte": 3}}}, 296 | {"$sort": {"avg_price": -1}}, 297 | {"$limit": 1}, 298 | ]) 299 | for xiaoqu in xiaoqus: 300 | print("{},{},{},{}".format(xiaoqu["district_name"], xiaoqu["sub_district"], xiaoqu["xiaoqu_name"], xiaoqu["avg_price"], )) 301 | 302 | print("=========== cheapest xiaoqu in each district =============") 303 | districts = db.sub_districts.aggregate([ 304 | {"$group": {"_id": "$district", "district_name": {"$first": "$district"}, "sub_districts": {"$push": "$_id"}}}, 305 | ]) 306 | for district in districts: 307 | district_name = district["district_name"] 308 | sub_districts = district["sub_districts"] 309 | xiaoqus = db.house.aggregate([ 310 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 311 | {"$unwind": "$sub_districts"}, 312 | {"$match": {"sub_districts.district": district_name}}, 313 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}}, 314 | {"$match": {"count": {"$gte": 3}}}, 315 | {"$sort": {"avg_price": 1}}, 316 | {"$limit": 1}, 317 | ]) 318 | for xiaoqu in xiaoqus: 319 | print(xiaoqu["district_name"], xiaoqu["xiaoqu_name"], xiaoqu["avg_price"], xiaoqu["count"]) 320 | 321 | print("=========== average unit price =============") 322 | houses = db.house.find() 323 | total = 0 324 | count = 0 325 | for house in houses: 326 | total += house["unit_price"] 327 | count += 1 328 | avg_price = total/count 329 | print(avg_price) 330 | 331 | print("=========== average house price =============") 332 | houses = db.house.find() 333 | total = 0 334 | count = 0 335 | for house in houses: 336 | total += house["price_num"] 337 | count += 1 338 | avg_price = total/count 339 | print(avg_price) 340 | 341 | print("=========== apartment/house =============") 342 | houses = db.house.aggregate([ 343 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 344 | {"$unwind": "$sub_districts"}, 345 | {"$match": {"sub_districts": {"$ne": []}}}, 346 | {"$group": {"_id": "$house_type", "house_type": {"$first": "$house_type"}, "count": {"$sum": 1}}}, 347 | ]) 348 | for house in houses: 349 | print(house["house_type"], house["count"]) 350 | 351 | print("=========== biggest houses =============") 352 | houses = db.house.aggregate([ 353 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 354 | {"$unwind": "$sub_districts"}, 355 | {"$match": {"sub_districts": {"$ne": []}}}, 356 | {"$sort": {"size": -1}}, 357 | {"$limit": 15} 358 | ]) 359 | for house in houses: 360 | print("{},{},{},{},{},{}".format(house["title"], house["sub_districts"]["district"], house["sub_districts"]["sub_district"], house["size"], house["xiaoqu_name"], house["price_num"])) 361 | 362 | print("=========== most number of houses district name =============") 363 | houses = db.house.aggregate([ 364 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 365 | {"$unwind": "$sub_districts"}, 366 | {"$match": {"sub_districts": {"$ne": []}}}, 367 | {"$group": {"_id": "$sub_districts.district", "district_name": {"$first": "$sub_districts.district"}, "count": {"$sum": 1}}}, 368 | {"$sort": {"count": -1}}, 369 | ]) 370 | for house in houses: 371 | print(house["district_name"], house["count"]) 372 | 373 | print("=========== most number of houses xiaoqu name =============") 374 | houses = db.house.aggregate([ 375 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 376 | {"$unwind": "$sub_districts"}, 377 | {"$match": {"sub_districts": {"$ne": []}}}, 378 | {"$group": {"_id": "$xiaoqu_name", "xiaoqu_name": {"$first": "$xiaoqu_name"}, "count": {"$sum": 1}}}, 379 | {"$sort": {"count": -1}}, 380 | {"$limit": 10} 381 | ]) 382 | for house in houses: 383 | print(house["xiaoqu_name"], house["count"]) 384 | 385 | print("=========== most expensive xiaoqu name =============") 386 | houses = db.house.aggregate([ 387 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 388 | {"$unwind": "$sub_districts"}, 389 | {"$match": {"sub_districts": {"$ne": []}}}, 390 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "sub_district_name": {"$first": "$sub_districts.sub_district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 391 | {"$sort": {"avg_unit_price": -1}}, 392 | {"$limit": 15} 393 | ]) 394 | for house in houses: 395 | print("{},{},{},{}".format(house["district_name"], house["sub_district_name"], house["xiaoqu_name"], house["avg_unit_price"])) 396 | 397 | print("=========== most expensive sub district =============") 398 | houses = db.house.aggregate([ 399 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 400 | {"$unwind": "$sub_districts"}, 401 | {"$match": {"sub_districts": {"$ne": []}}}, 402 | {"$group": {"_id": "$sub_districts.sub_district", "district_name": {"$first": "$sub_districts.district"}, "sub_district_name": {"$first": "$sub_districts.sub_district"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 403 | {"$sort": {"avg_unit_price": -1}}, 404 | ]) 405 | for house in houses: 406 | print("{},{},{}".format(house["district_name"], house["sub_district_name"], house["avg_unit_price"])) 407 | 408 | print("=========== most expensive district =============") 409 | houses = db.house.aggregate([ 410 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 411 | {"$unwind": "$sub_districts"}, 412 | {"$match": {"sub_districts": {"$ne": []}}}, 413 | {"$group": {"_id": "$sub_districts.district", "district_name": {"$first": "$sub_districts.district"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 414 | {"$sort": {"avg_unit_price": -1}}, 415 | ]) 416 | for house in houses: 417 | print(house["district_name"], house["avg_unit_price"]) 418 | 419 | print("=========== most expensive unit price =============") 420 | houses = db.house.aggregate([ 421 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_district"}}, 422 | {"$unwind": "$sub_district"}, 423 | {"$match": {"sub_districts": {"$ne": []}}}, 424 | {"$sort": {"unit_price": -1}}, 425 | {"$limit": 10} 426 | ]) 427 | for house in houses: 428 | print(house["title"], house["url"], house["xiaoqu_name"], house["price_num"], house["unit_price"]) 429 | 430 | if __name__ == "__main__": 431 | stats() 432 | -------------------------------------------------------------------------------- /rental.py: -------------------------------------------------------------------------------- 1 | import urllib3 2 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 3 | 4 | import math 5 | import re 6 | from lxml import etree 7 | import time 8 | import json 9 | import pymongo 10 | import requests 11 | 12 | DB = "shzu" 13 | base_url = "https://sh.zu.ke.com" 14 | 15 | def get_disctricts(): 16 | url = base_url + "/zufang/" 17 | r = requests.get(url, verify=False) 18 | content = r.content.decode("utf-8") 19 | root = etree.HTML(content) 20 | distr_nodes = root.xpath('.//div[@class="filter"]//ul[@data-target="area"]/li[@class="filter__item--level2 "]/a') 21 | result = [] 22 | for node in distr_nodes: 23 | rel_url = node.attrib["href"] 24 | distr_url = "" 25 | if re.match(r'https://', rel_url): 26 | distr_url = rel_url 27 | else: 28 | distr_url = base_url + rel_url 29 | distr_name = node.text 30 | result.append([distr_name, distr_url]) 31 | return result 32 | 33 | def get_sub_districts(): 34 | districts = get_disctricts() 35 | result = [] 36 | client = pymongo.MongoClient() 37 | db = client[DB] 38 | for item in districts: 39 | distr_name = item[0] 40 | distr_url = item[1] 41 | r = requests.get(distr_url, verify=False) 42 | content = r.content.decode("utf-8") 43 | root = etree.HTML(content) 44 | subdistr_nodes = root.xpath('.//div[@class="filter"]//ul[@data-target="area"]/li[@class="filter__item--level3 "]/a') 45 | for node in subdistr_nodes: 46 | sub_distr_name = node.text 47 | sub_distr_url = base_url + node.attrib["href"] 48 | db.sub_districts.insert_one({ 49 | "district": distr_name, 50 | "sub_district": sub_distr_name, 51 | "url": sub_distr_url, 52 | }) 53 | 54 | def get_item_num(entry_url): 55 | r = requests.get(entry_url, verify=False) 56 | content = r.content.decode("utf-8") 57 | root = etree.HTML(content) 58 | num_nodes = root.xpath('.//div[@id="content"]//span[@class="content__title--hl"]') 59 | if len(num_nodes) == 0: 60 | raise Exception("no total number for {}".format(entry_url)) 61 | num_str = num_nodes[0].text.strip() 62 | return int(num_str) 63 | 64 | def get_houses_by_sub_district(sub_distr_id, entry_url): 65 | url_patt = entry_url + "pg{}/" 66 | 67 | total_num = get_item_num(entry_url) 68 | last_page = math.ceil(total_num/30) 69 | i = 1 70 | client = pymongo.MongoClient() 71 | db = client[DB] 72 | for i in range(1, last_page+1, 1): 73 | url = url_patt.format(i) 74 | r = requests.get(url, verify=False) 75 | content = r.content.decode("utf-8") 76 | root = etree.HTML(content) 77 | content_nodes = root.xpath('.//div[@class="content__list"]') 78 | if len(content_nodes) == 0: 79 | print(url) 80 | r = requests.get(url, verify=False) 81 | content = r.content.decode("utf-8") 82 | root = etree.HTML(content) 83 | 84 | div_nodes = root.xpath('.//div[@class="content__list--item"]') 85 | for div_node in div_nodes: 86 | title_nodes = div_node.xpath('.//p[contains(@class, content__list--item--title)]/a') 87 | if len(title_nodes) == 0: 88 | print("title not found") 89 | continue 90 | title_node = title_nodes[0] 91 | title = title_node.text.strip() 92 | url = title_node.attrib["href"] 93 | 94 | area_nodes = div_node.xpath('.//p[@class="content__list--item--des"]') 95 | area = "" 96 | if len(area_nodes) == 0: 97 | continue 98 | area_node = area_nodes[0] 99 | area = 0 100 | direction = "" 101 | room_type = "" 102 | for node in area_node: 103 | if node.tag == "a": 104 | continue 105 | text = node.tail.strip() 106 | matched = re.search(r'(\d+)㎡', text) 107 | if matched: 108 | area = matched.group(1) 109 | elif re.search(r'室', text): 110 | room_type = text 111 | else: 112 | direction = text 113 | 114 | price_nodes = div_node.xpath('.//span[@class="content__list--item-price"]/em') 115 | price_info = 0 116 | if len(price_nodes) > 0: 117 | price_node = price_nodes[0] 118 | price_info = price_node.text 119 | 120 | item = { 121 | "sub_distr_id": sub_distr_id, 122 | "title": title, 123 | "url": url, 124 | "area": area, 125 | "direction": direction, 126 | "room_type": room_type, 127 | "price_info": price_info, 128 | } 129 | db.house.insert_one(item) 130 | i += 1 131 | 132 | def get_all_houses(): 133 | client = pymongo.MongoClient() 134 | db = client[DB] 135 | sub_distr_rows = db.sub_districts.find() 136 | start = 1 137 | for sub_distr in sub_distr_rows: 138 | entry_url = sub_distr["url"] 139 | sub_distr_id = sub_distr["_id"] 140 | distr_name = sub_distr["district"] 141 | sub_distr_name = sub_distr["sub_district"] 142 | print(distr_name, sub_distr_name) 143 | #if distr_name == "福田区" and sub_distr_name == "银湖": 144 | # start = 1 145 | if start == 1: 146 | get_houses_by_sub_district(sub_distr_id, entry_url) 147 | 148 | def parse_house_info(house_info): 149 | items = house_info.split("|") 150 | house_type = "apartment" 151 | matched = re.search(r'别墅', items[1]) 152 | info_items = items[1:] 153 | if matched: 154 | info_items = items[2:] 155 | house_type = "house" 156 | 157 | if len(info_items) < 4: 158 | print(house_info) 159 | return {"house_type": "", 160 | "shi_num": -1, 161 | "ting_num": -1, 162 | "size": -1, 163 | "has_lift": -1, 164 | "direction": "", 165 | "decoration": "", 166 | } 167 | room_info = info_items[0] 168 | size_info = info_items[1] 169 | direc_info = info_items[2] 170 | decor_info = info_items[3] 171 | lift_info = "" 172 | if len(info_items) >= 5: 173 | lift_info = info_items[4] 174 | matched = re.search(r'(\d+)室(\d+)厅', room_info) 175 | shi_num = 0 176 | ting_num = 0 177 | if matched: 178 | shi_num = int(matched.group(1)) 179 | ting_num = int(matched.group(2)) 180 | 181 | matched = re.search(r'([.0-9]+)平米', size_info) 182 | size = 0.0 183 | if matched: 184 | size = float(matched.group(1)) 185 | 186 | has_lift = None 187 | if re.search(r'有电梯', lift_info): 188 | has_lift = True 189 | elif re.search(r'无电梯', lift_info): 190 | has_lift = False 191 | result = {"house_type": house_type, 192 | "shi_num": shi_num, 193 | "ting_num": ting_num, 194 | "size": size, 195 | "has_lift": has_lift, 196 | "direction": direc_info, 197 | "decoration": decor_info, 198 | } 199 | return result 200 | 201 | def update_house_info(): 202 | client = pymongo.MongoClient() 203 | db = client[DB] 204 | houses = db.house.find() 205 | for house in houses: 206 | object_id = house["_id"] 207 | price_num = float(house["price_num"]) 208 | unit_price = float(house["unit_price"]) 209 | building_info = house["building_info"] 210 | matched = re.search(r'(\d+)年', building_info) 211 | build_year = 0 212 | if matched: 213 | build_year = int(matched.group(1)) 214 | db.house.update({"_id": house["_id"]}, {"$set": {"price_num": price_num, "unit_price": unit_price, "build_year": build_year}}) 215 | info = parse_house_info(house["house_info"]) 216 | db.house.update({"_id": house["_id"]}, {"$set": info}) 217 | 218 | def stats(): 219 | client = pymongo.MongoClient() 220 | db = client[DB] 221 | 222 | print("=========== most expensive =============") 223 | houses = db.house.aggregate([ 224 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 225 | {"$unwind": "$sub_districts"}, 226 | {"$match": {"sub_districts": {"$ne": []}}}, 227 | {"$sort": {"price_num": -1}}, 228 | {"$limit": 15} 229 | ]) 230 | for house in houses: 231 | print("{},{},{},{},{}".format(house["sub_districts"]["district"], house["title"], house["xiaoqu_name"], house["price_num"], house["unit_price"])) 232 | 233 | print("=========== average house age =============") 234 | houses = db.house.aggregate([ 235 | {"$match": {"build_year": {"$gt": 0}}}, 236 | ]) 237 | total = 0 238 | count = 0 239 | for house in houses: 240 | total += house["build_year"] 241 | count += 1 242 | avg_build_year = total/count 243 | avg_age = 2018 - avg_build_year 244 | print(avg_age) 245 | 246 | print("=========== most expensive xiaoqu in each district =============") 247 | districts = db.sub_districts.aggregate([ 248 | {"$group": {"_id": "$district", "district_name": {"$first": "$district"}, "sub_districts": {"$push": "$_id"}}}, 249 | ]) 250 | for district in districts: 251 | district_name = district["district_name"] 252 | sub_districts = district["sub_districts"] 253 | xiaoqus = db.house.aggregate([ 254 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 255 | {"$unwind": "$sub_districts"}, 256 | {"$match": {"sub_districts.district": district_name}}, 257 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "sub_district": {"$first": "$sub_districts.sub_district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}}, 258 | {"$match": {"count": {"$gte": 3}}}, 259 | {"$sort": {"avg_price": -1}}, 260 | {"$limit": 1}, 261 | ]) 262 | for xiaoqu in xiaoqus: 263 | print("{},{},{},{}".format(xiaoqu["district_name"], xiaoqu["sub_district"], xiaoqu["xiaoqu_name"], xiaoqu["avg_price"], )) 264 | 265 | print("=========== cheapest xiaoqu in each district =============") 266 | districts = db.sub_districts.aggregate([ 267 | {"$group": {"_id": "$district", "district_name": {"$first": "$district"}, "sub_districts": {"$push": "$_id"}}}, 268 | ]) 269 | for district in districts: 270 | district_name = district["district_name"] 271 | sub_districts = district["sub_districts"] 272 | xiaoqus = db.house.aggregate([ 273 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 274 | {"$unwind": "$sub_districts"}, 275 | {"$match": {"sub_districts.district": district_name}}, 276 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}}, 277 | {"$match": {"count": {"$gte": 3}}}, 278 | {"$sort": {"avg_price": 1}}, 279 | {"$limit": 1}, 280 | ]) 281 | for xiaoqu in xiaoqus: 282 | print(xiaoqu["district_name"], xiaoqu["xiaoqu_name"], xiaoqu["avg_price"], xiaoqu["count"]) 283 | 284 | print("=========== average unit price =============") 285 | houses = db.house.find() 286 | total = 0 287 | count = 0 288 | for house in houses: 289 | total += house["unit_price"] 290 | count += 1 291 | avg_price = total/count 292 | print(avg_price) 293 | 294 | print("=========== average house price =============") 295 | houses = db.house.find() 296 | total = 0 297 | count = 0 298 | for house in houses: 299 | total += house["price_num"] 300 | count += 1 301 | avg_price = total/count 302 | print(avg_price) 303 | 304 | print("=========== apartment/house =============") 305 | houses = db.house.aggregate([ 306 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 307 | {"$unwind": "$sub_districts"}, 308 | {"$match": {"sub_districts": {"$ne": []}}}, 309 | {"$group": {"_id": "$house_type", "house_type": {"$first": "$house_type"}, "count": {"$sum": 1}}}, 310 | ]) 311 | for house in houses: 312 | print(house["house_type"], house["count"]) 313 | 314 | print("=========== biggest houses =============") 315 | houses = db.house.aggregate([ 316 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 317 | {"$unwind": "$sub_districts"}, 318 | {"$match": {"sub_districts": {"$ne": []}}}, 319 | {"$sort": {"size": -1}}, 320 | {"$limit": 20} 321 | ]) 322 | for house in houses: 323 | print("{},{},{},{},{}".format(house["sub_districts"]["district"], house["xiaoqu_name"], house["title"], house["size"], house["price_num"])) 324 | 325 | print("=========== most number of houses district name =============") 326 | houses = db.house.aggregate([ 327 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 328 | {"$unwind": "$sub_districts"}, 329 | {"$match": {"sub_districts": {"$ne": []}}}, 330 | {"$group": {"_id": "$sub_districts.district", "district_name": {"$first": "$sub_districts.district"}, "count": {"$sum": 1}}}, 331 | {"$sort": {"count": -1}}, 332 | ]) 333 | for house in houses: 334 | print(house["district_name"], house["count"]) 335 | 336 | print("=========== most number of houses xiaoqu name =============") 337 | houses = db.house.aggregate([ 338 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 339 | {"$unwind": "$sub_districts"}, 340 | {"$match": {"sub_districts": {"$ne": []}}}, 341 | {"$group": {"_id": "$xiaoqu_name", "xiaoqu_name": {"$first": "$xiaoqu_name"}, "count": {"$sum": 1}}}, 342 | {"$sort": {"count": -1}}, 343 | {"$limit": 10} 344 | ]) 345 | for house in houses: 346 | print(house["xiaoqu_name"], house["count"]) 347 | 348 | print("=========== most expensive xiaoqu name =============") 349 | houses = db.house.aggregate([ 350 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 351 | {"$unwind": "$sub_districts"}, 352 | {"$match": {"sub_districts": {"$ne": []}}}, 353 | {"$group": {"_id": "$xiaoqu_name", "district_name": {"$first": "$sub_districts.district"}, "sub_district_name": {"$first": "$sub_districts.sub_district"}, "xiaoqu_name": {"$first": "$xiaoqu_name"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 354 | {"$sort": {"avg_unit_price": -1}}, 355 | {"$limit": 15} 356 | ]) 357 | for house in houses: 358 | print("{},{},{},{}".format(house["district_name"], house["sub_district_name"], house["xiaoqu_name"], house["avg_unit_price"])) 359 | 360 | print("=========== most expensive sub district =============") 361 | houses = db.house.aggregate([ 362 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 363 | {"$unwind": "$sub_districts"}, 364 | {"$match": {"sub_districts": {"$ne": []}}}, 365 | {"$group": {"_id": "$sub_districts.sub_district", "district_name": {"$first": "$sub_districts.district"}, "sub_district_name": {"$first": "$sub_districts.sub_district"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 366 | {"$sort": {"avg_unit_price": -1}}, 367 | ]) 368 | for house in houses: 369 | print("{},{},{}".format(house["district_name"], house["sub_district_name"], house["avg_unit_price"])) 370 | 371 | print("=========== most expensive district =============") 372 | houses = db.house.aggregate([ 373 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_districts"}}, 374 | {"$unwind": "$sub_districts"}, 375 | {"$match": {"sub_districts": {"$ne": []}}}, 376 | {"$group": {"_id": "$sub_districts.district", "district_name": {"$first": "$sub_districts.district"}, "avg_unit_price": {"$avg": "$unit_price"}}}, 377 | {"$sort": {"avg_unit_price": -1}}, 378 | ]) 379 | for house in houses: 380 | print(house["district_name"], house["avg_unit_price"]) 381 | 382 | print("=========== most expensive unit price =============") 383 | houses = db.house.aggregate([ 384 | {"$lookup": {"from": "sub_districts", "localField": "sub_distr_id", "foreignField": "_id", "as": "sub_district"}}, 385 | {"$unwind": "$sub_district"}, 386 | {"$match": {"sub_districts": {"$ne": []}}}, 387 | {"$sort": {"unit_price": -1}}, 388 | {"$limit": 10} 389 | ]) 390 | for house in houses: 391 | print(house["title"], house["url"], house["xiaoqu_name"], house["price_num"], house["unit_price"]) 392 | 393 | if __name__ == "__main__": 394 | get_all_houses() 395 | -------------------------------------------------------------------------------- /ziroom.py: -------------------------------------------------------------------------------- 1 | import re 2 | from lxml import etree 3 | import time 4 | import json 5 | import pymongo 6 | import requests 7 | 8 | DB = "shziroom" 9 | base_url = "http://sh.ziroom.com" 10 | headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"} 11 | 12 | def fix_url(url): 13 | if re.match(r'//', url): 14 | url = 'http:{}'.format(url) 15 | return url 16 | 17 | def get_sub_districts(node): 18 | sub_nodes = node.xpath('.//div[@class="con"]/span/a') 19 | result = [] 20 | for sub_node in sub_nodes: 21 | sub_district = sub_node.text 22 | url = sub_node.attrib["href"] 23 | url = fix_url(url) 24 | if sub_district == "全部": 25 | continue 26 | 27 | result.append({"sub_district": sub_district, "url": url}) 28 | return result 29 | 30 | def get_disctricts(): 31 | url = base_url + "/z/nl/z1.html" 32 | r = requests.get(url, headers=headers, verify=False) 33 | content = r.content.decode("utf-8") 34 | root = etree.HTML(content) 35 | distr_nodes = root.xpath('.//dl[contains(@class, "zIndex6")]/dd/ul/li') 36 | client = pymongo.MongoClient() 37 | db = client[DB] 38 | for distr_node in distr_nodes: 39 | nodes = distr_node.xpath('.//span[@class="tag"]/a') 40 | if len(nodes) == 0: 41 | continue 42 | node = nodes[0] 43 | district = node.text 44 | url = node.attrib["href"] 45 | url = fix_url(url) 46 | sub_distrs = get_sub_districts(distr_node) 47 | for sub_distr in sub_distrs: 48 | item = {"district": district, 49 | "sub_district": sub_distr["sub_district"], 50 | "url": sub_distr["url"] 51 | } 52 | db.sub_districts.insert_one(item) 53 | 54 | def get_price(price_node): 55 | num_nodes = price_node.xpath('./span[@class="num"]') 56 | print(price_node.text) 57 | offset_map = { 58 | 1: 6, 59 | 30: 5, 60 | 6: 3, 61 | 90: 2, 62 | 120: 1, 63 | 3: 4, 64 | 7: 8, 65 | 210: 9, 66 | 5: 0, 67 | 270: 7, 68 | } 69 | 70 | price = 0 71 | for num_node in num_nodes: 72 | style = num_node.attrib["style"] 73 | matched = re.match(r'background-position:-(\d+)px', style) 74 | if not matched: 75 | raise Exception("error getting price") 76 | offset = matched.group(1) 77 | num = offset_map[offset] 78 | price = price*10 + num 79 | return price 80 | 81 | def get_houses_by_sub_district(sub_distr_id, entry_url): 82 | url_patt = entry_url + "?p={}" 83 | 84 | i = 1 85 | client = pymongo.MongoClient() 86 | db = client[DB] 87 | while True: 88 | url = url_patt.format(i) 89 | url = "http://sh.ziroom.com/z/nl/z1-d310112.html" 90 | r = requests.get(url, headers=headers, verify=False) 91 | content = r.content.decode("utf-8") 92 | print(content) 93 | return 94 | root = etree.HTML(content) 95 | house_nodes = root.xpath('.//ul[@id="houseList"]/li[@class="clearfix"]') 96 | if len(house_nodes) == 0: 97 | break 98 | for house_node in house_nodes: 99 | title_nodes = house_node.xpath('.//div[@class="txt"]/h3/a') 100 | if len(title_nodes) == 0: 101 | continue 102 | title = title_nodes[0].text 103 | area = 0 104 | floor_info = "" 105 | room_type = "" 106 | detail_nodes = house_node.xpath('.//div[@class="detail"]/p/span') 107 | for node in detail_nodes: 108 | print(etree.tostring(node)) 109 | text = node.text 110 | matched = re.search(r'(\d+) ㎡', text) 111 | if matched: 112 | area = matched.group(1) 113 | elif re.search(r'室', text): 114 | room_type = text 115 | price_nodes = house_node.xpath('.//div[@class="priceDetail"]/p[@class="price"]') 116 | if len(price_nodes) == 0: 117 | continue 118 | price = get_price(price_nodes[0]) 119 | print(price) 120 | i += 1 121 | 122 | def get_all_houses(): 123 | client = pymongo.MongoClient() 124 | db = client[DB] 125 | sub_distr_rows = db.sub_districts.find() 126 | for sub_distr in sub_distr_rows: 127 | entry_url = sub_distr["url"] 128 | sub_distr_id = sub_distr["_id"] 129 | distr_name = sub_distr["district"] 130 | sub_distr_name = sub_distr["sub_district"] 131 | print(distr_name, sub_distr_name) 132 | get_houses_by_sub_district(sub_distr_id, entry_url) 133 | break 134 | 135 | if __name__ == "__main__": 136 | get_all_houses() 137 | --------------------------------------------------------------------------------