# ├── 58search.py ├── README.md ├── datawash.py ├── pyspark.py └── search-wash.ipynb
#
# /58search.py:
# --------------------------------------------------------------------------------
# Scrape second-hand housing listings for Chongqing from 58.com (name, price,
# square, floors, rooms, subway, area) and append them to res.csv.
# @author smallsmart
# GitHub: github.com/smallsmartlc

import requests
from bs4 import BeautifulSoup
import csv
import os

RAW_CSV_PATH = 'res.csv'
RAW_FIELDNAMES = ['name', 'price', 'square', 'floors', 'rooms', 'subway', 'area']
REQUEST_HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36"}
# Subway line ids noted by the author: line 1 = 102, line 2 = 103, line 3 = 104,
# line 4 = 196, line 5 = 198, line 6 = 105, line 10 = 200, loop line = 199.
# NOTE(review): presumably these go into the URL to filter by line; the code
# below does not use them. Adjust the URL before every crawl.
LISTING_URL = 'https://cq.58.com/ershoufang/sub/pn2'  # "pnXX" selects page XX


def _parse_listing(item):
    """Return the raw text fields of one listing ``<li>`` element as a dict.

    Raises AttributeError or IndexError when the element does not have the
    expected structure (a missing tag or too few <span>/<a> children).
    """
    info_rows = item.find('div', class_='list-info').find_all('p', class_='baseinfo')
    base_spans = info_rows[0].find_all('span')
    location_row = info_rows[1]
    subway_spans = location_row.find_all('span')
    # The second <span> of the location row is only present for some listings;
    # an empty string is stored when it is missing.
    subway = subway_spans[1].get_text() if len(subway_spans) > 1 else ""
    return {
        'name': item.find('h2', class_='title').find('a').get_text().strip(),
        'price': item.find('div', class_='price').find('p', class_='unit').get_text().strip(),
        'square': base_spans[1].get_text().strip(),
        'floors': base_spans[3].get_text().strip(),
        'rooms': base_spans[0].get_text().strip(),
        'subway': subway,
        # Stripped because datawash.py uses this text as a dictionary key.
        'area': location_row.find_all('a')[1].get_text().strip(),
    }


# Fetch and parse before touching the CSV so that a failed request does not
# leave an empty file behind.
response = requests.get(url=LISTING_URL, headers=REQUEST_HEADERS, timeout=10)
response.raise_for_status()
# html.parser is the parser bundled with the standard library
soup = BeautifulSoup(response.text, 'html.parser')
listing_container = soup.find('ul', class_='house-list-wrap')
if listing_container is None:
    raise RuntimeError(
        "no <ul class='house-list-wrap'> found in the response from {}".format(LISTING_URL))

# datawash.py expects a header row, so write one whenever the file is new or
# empty (previously this had to be un-commented by hand on the first crawl).
needs_header = not os.path.exists(RAW_CSV_PATH) or os.path.getsize(RAW_CSV_PATH) == 0
# NOTE(review): the file is opened with the platform default encoding, as
# before, so that existing res.csv files stay readable.
with open(RAW_CSV_PATH, mode='a', newline='') as csv_file:
    raw_writer = csv.DictWriter(csv_file, fieldnames=RAW_FIELDNAMES)
    if needs_header:
        raw_writer.writeheader()

    for item in listing_container.find_all('li', class_='sendsoj'):
        try:
            listing = _parse_listing(item)
        except (AttributeError, IndexError) as exc:
            # One malformed listing must not abort the whole crawl.
            print('skipping malformed listing: {}'.format(exc))
            continue
        print(listing)  # show the scraped row
        raw_writer.writerow(listing)  # append the row to the csv


# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # 大数据期末作业@smallsmart
# 编写爬虫代码,从58同城爬取你所在市/区的二手房的数据,包括预测目标(因变量):每平米价格,特征变量(自变量):总面积,楼层,房间数,是否近地铁,位于哪个区,我们的目的是选用合适的算法训练模型,当有新的数据输入时,预测出每平米的价格。
#
#
# --------------------------------------------------------------------------------
# /datawash.py:
# --------------------------------------------------------------------------------
# Clean the scraped data: read res.csv, turn the text fields into numbers and
# write the result to res2.csv.
# @author smallsmart
# GitHub: github.com/smallsmartlc

import csv
import re

RAW_COLUMNS = ['name', 'price', 'square', 'floors', 'rooms', 'subway', 'area']
CLEAN_FIELDNAMES = ['price', 'square', 'floors', 'rooms', 'subway', 'area']
# Numeric feature value for every district name found in the data.
# '奉节县' uses the same code as the table in search-wash.ipynb.
DISTRICT_CODES = {
    '巴南区': 1, '北碚区': 2, '璧山区': 3, '大渡口区': 4, '大足区': 5,
    '垫江县': 6, '涪陵区': 7, '合川区': 8, '江北区': 9, '江津区': 10,
    '九龙坡区': 11, '开州区(开县)': 12, '梁平县': 13, '南岸区': 14, '南川区': 15,
    '荣昌区': 16, '沙坪坝区': 17, '铜梁区': 18, '潼南区': 19, '万州区': 20,
    '永川区': 21, '渝北区': 22, '渝中区': 23, '云阳县': 24, '长寿区': 25,
    '奉节县': 26,
}
_INTEGER_RE = re.compile(r'[0-9]+')
# Accepts both "74" and "74.0"; the previous pattern required a decimal point.
_DECIMAL_RE = re.compile(r'[0-9]+(?:\.[0-9]+)?')


def _clean_row(raw_row):
    """Convert one raw res.csv row into the numeric feature dict for res2.csv.

    Raises ValueError, with a message naming the offending field, when the row
    is too short, a number cannot be found, or the district is unknown.
    """
    if len(raw_row) < len(RAW_COLUMNS):
        raise ValueError('expected {} columns, got {}'.format(len(RAW_COLUMNS), len(raw_row)))
    price_text, square_text, floors_text, rooms_text, subway_text, area_text = raw_row[1:7]

    price_match = _INTEGER_RE.match(price_text)  # leading digits only
    if price_match is None:
        raise ValueError('no leading number in price {!r}'.format(price_text))
    square_match = _DECIMAL_RE.search(square_text)
    if square_match is None:
        raise ValueError('no number in square {!r}'.format(square_text))
    floors_match = _INTEGER_RE.search(floors_text)
    if floors_match is None:
        raise ValueError('no number in floors {!r}'.format(floors_text))
    room_counts = _INTEGER_RE.findall(rooms_text)
    if not room_counts:
        raise ValueError('no number in rooms {!r}'.format(rooms_text))

    district = area_text.strip()
    try:
        district_code = DISTRICT_CODES[district]
    except KeyError:
        raise ValueError(
            'unknown district {!r}; add it to DISTRICT_CODES'.format(district)) from None

    return {
        'price': price_match.group(),
        'square': square_match.group(),
        'floors': floors_match.group(),
        # Sum of the first three numbers in the rooms text (fewer if the text
        # contains fewer than three).
        'rooms': sum(int(count) for count in room_counts[:3]),
        # Flag is 1 when the subway text is longer than one character.
        'subway': 1 if len(subway_text) > 1 else 0,
        'area': district_code,
    }


with open('res.csv', 'r', newline='') as raw_file, \
        open('res2.csv', mode='w', newline='') as clean_file:
    clean_writer = csv.DictWriter(clean_file, fieldnames=CLEAN_FIELDNAMES)
    clean_writer.writeheader()  # write the csv header
    for line_no, raw_row in enumerate(csv.reader(raw_file), start=1):
        # Skip blank lines and header rows wherever they appear; res.csv is
        # opened in append mode by the scraper and may or may not start with one.
        if not raw_row or raw_row == RAW_COLUMNS:
            continue
        try:
            cleaned = _clean_row(raw_row)
        except ValueError as exc:
            print('skipping row {}: {}'.format(line_no, exc))
            continue
        clean_writer.writerow(cleaned)  # write the processed row
        print(cleaned)


# --------------------------------------------------------------------------------
# /pyspark.py:
# --------------------------------------------------------------------------------
# Code adapted from NEO_X on Jianshu: https://www.jianshu.com/p/06d157ba1a08

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('lin_reg').getOrCreate()

# 2 - read the data
from pyspark.ml.regression import LinearRegression  # linear regression estimator
# Upload the cleaned data to HDFS first, then adjust the path below.
df = spark.read.csv('hdfs://localhost:9000/input_spark/res2.csv', inferSchema=True, header=True)

# 3 - explore the data
print('-------------- 探索分析数据 -----------------')
print((df.count(), len(df.columns)))  # size of the data set: (rows, columns)
df.printSchema()  # column names and inferred types
df.describe().show(5, False)  # summary statistics: count, mean, stddev, min, max
from pyspark.sql.functions import corr
df.select(corr('square', 'price')).show()  # correlation between square and price (not a variance)

# 4 - transform the data into the layout the model expects
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

print('-------------- 数据转换 ------------------')
# The input column names must match the header of res2.csv.
# Packs the feature columns into a single vector column named 'features'.
vec_assmebler = VectorAssembler(inputCols=['square', 'floors', 'rooms', 'subway', 'area'], outputCol='features')
features_df = vec_assmebler.transform(df)

features_df.printSchema()  # schema after the transformation

model_df = features_df.select('features', 'price')  # the two columns used for the regression

# 5 - split into training data and test data
train_df, test_df = model_df.randomSplit([0.7, 0.3])  # 70 % training, 30 % test

print((train_df.count(), len(train_df.columns)))

print((test_df.count(), len(test_df.columns)))

# 6 - build the linear regression model

print('-------------- 构建线性回归模型 ------------------')

lin_Reg = LinearRegression(labelCol='price')  # labelCol is the column to predict, as opposed to the features column
46 | 47 | lr_model=lin_Reg.fit(train_df) # fit on the training data; fit() returns a fitted model, i.e. a LinearRegressionModel object 48 | 49 | print('{}{}'.format('方程截距:',lr_model.intercept)) # intercept: the intercept of the linear equation 50 | 51 | print('{}{}'.format('方程参数系数:',lr_model.coefficients)) # coefficients of the regression equation, one per input variable; the original comment names them var_1,var_2,var_3,var_4,var_5 52 | 53 | training_predictions=lr_model.evaluate(train_df) # evaluate the model on the training data 54 | 55 | print('{}{}'.format('误差差值平方:',training_predictions.meanSquaredError)) # mean squared error 56 | 57 | print('{}{}'.format('判定系数:',training_predictions.r2 )) # r2: coefficient of determination, used to judge how well the model predicts; larger means a better fit 58 | 59 | # 7-evaluate the already-built model lr_model on the test data 60 | test_results=lr_model.evaluate(test_df) 61 | 62 | print(test_results.r2) # goodness of fit on the test data 63 | print(test_results.meanSquaredError) # mean squared error on the test data -------------------------------------------------------------------------------- /search-wash.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 8, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "{'price': '14125', 'square': '63.01', 'floors': '25', 'rooms': 5, 'subway': 1, 'area': 14}\n", 13 | "{'price': '8487', 'square': '74.0', 'floors': '32', 'rooms': 5, 'subway': 0, 'area': 21}\n", 14 | "{'price': '15802', 'square': '63.92', 'floors': '33', 'rooms': 5, 'subway': 1, 'area': 17}\n", 15 | "{'price': '10318', 'square': '126.0', 'floors': '15', 'rooms': 8, 'subway': 1, 'area': 14}\n", 16 | "{'price': '8899', 'square': '118.0', 'floors': '6', 'rooms': 8, 'subway': 1, 'area': 2}\n", 17 | "{'price': '13379', 'square': '79.23', 'floors': '46', 'rooms': 5, 'subway': 1, 'area': 17}\n", 18 | "{'price': '4337', 'square': '101.0', 'floors': '8', 'rooms': 7, 'subway': 0, 'area': 6}\n", 19 | "{'price': '7541', 'square': '61.0', 'floors': '30', 'rooms': 5, 'subway': 0, 'area': 10}\n", 20 | "{'price': '5758', 'square': '99.0', 'floors': '32', 'rooms': 7, 'subway': 0, 'area': 
10}\n", 21 | "{'price': '15704', 'square': '146.46', 'floors': '25', 'rooms': 7, 'subway': 1, 'area': 14}\n", 22 | "{'price': '11643', 'square': '140.0', 'floors': '8', 'rooms': 8, 'subway': 1, 'area': 22}\n", 23 | "{'price': '3715', 'square': '126.0', 'floors': '7', 'rooms': 7, 'subway': 0, 'area': 6}\n", 24 | "{'price': '14339', 'square': '81.6', 'floors': '18', 'rooms': 5, 'subway': 1, 'area': 22}\n", 25 | "{'price': '14078', 'square': '103.0', 'floors': '19', 'rooms': 7, 'subway': 1, 'area': 22}\n", 26 | "{'price': '16875', 'square': '80.0', 'floors': '32', 'rooms': 6, 'subway': 1, 'area': 17}\n", 27 | "{'price': '9070', 'square': '86.0', 'floors': '32', 'rooms': 6, 'subway': 0, 'area': 4}\n", 28 | "{'price': '5718', 'square': '99.0', 'floors': '7', 'rooms': 7, 'subway': 0, 'area': 12}\n", 29 | "{'price': '14511', 'square': '41.35', 'floors': '20', 'rooms': 2, 'subway': 0, 'area': 17}\n", 30 | "{'price': '4262', 'square': '102.77', 'floors': '33', 'rooms': 7, 'subway': 0, 'area': 10}\n", 31 | "{'price': '8824', 'square': '102.0', 'floors': '18', 'rooms': 7, 'subway': 0, 'area': 20}\n", 32 | "{'price': '13734', 'square': '75.0', 'floors': '33', 'rooms': 6, 'subway': 0, 'area': 1}\n", 33 | "{'price': '12459', 'square': '88.29', 'floors': '33', 'rooms': 5, 'subway': 1, 'area': 17}\n", 34 | "{'price': '7897', 'square': '77.0', 'floors': '32', 'rooms': 4, 'subway': 0, 'area': 26}\n", 35 | "{'price': '9073', 'square': '137.0', 'floors': '18', 'rooms': 8, 'subway': 1, 'area': 14}\n", 36 | "{'price': '4900', 'square': '149.0', 'floors': '32', 'rooms': 7, 'subway': 0, 'area': 10}\n", 37 | "{'price': '12267', 'square': '75.0', 'floors': '33', 'rooms': 6, 'subway': 0, 'area': 1}\n", 38 | "{'price': '13248', 'square': '88.32', 'floors': '33', 'rooms': 7, 'subway': 0, 'area': 1}\n", 39 | "{'price': '14271', 'square': '77.08', 'floors': '26', 'rooms': 6, 'subway': 1, 'area': 22}\n", 40 | "{'price': '6563', 'square': '96.0', 'floors': '11', 'rooms': 6, 'subway': 0, 'area': 
10}\n", 41 | "{'price': '12238', 'square': '143.0', 'floors': '16', 'rooms': 8, 'subway': 1, 'area': 22}\n", 42 | "{'price': '5613', 'square': '98.0', 'floors': '25', 'rooms': 7, 'subway': 0, 'area': 10}\n", 43 | "{'price': '11500', 'square': '120.0', 'floors': '34', 'rooms': 7, 'subway': 1, 'area': 17}\n", 44 | "{'price': '10438', 'square': '40.24', 'floors': '19', 'rooms': 3, 'subway': 0, 'area': 1}\n", 45 | "{'price': '10371', 'square': '135.0', 'floors': '7', 'rooms': 7, 'subway': 0, 'area': 1}\n", 46 | "{'price': '12717', 'square': '81.0', 'floors': '12', 'rooms': 6, 'subway': 0, 'area': 17}\n", 47 | "{'price': '6463', 'square': '147.0', 'floors': '11', 'rooms': 8, 'subway': 0, 'area': 10}\n", 48 | "{'price': '14945', 'square': '360.0', 'floors': '3', 'rooms': 15, 'subway': 0, 'area': 22}\n", 49 | "{'price': '9280', 'square': '120.7', 'floors': '32', 'rooms': 7, 'subway': 1, 'area': 1}\n", 50 | "{'price': '5761', 'square': '92.0', 'floors': '32', 'rooms': 6, 'subway': 0, 'area': 10}\n", 51 | "{'price': '12372', 'square': '97.0', 'floors': '30', 'rooms': 7, 'subway': 1, 'area': 22}\n", 52 | "{'price': '6283', 'square': '78.0', 'floors': '32', 'rooms': 5, 'subway': 0, 'area': 10}\n", 53 | "{'price': '17986', 'square': '116.76', 'floors': '11', 'rooms': 8, 'subway': 1, 'area': 22}\n", 54 | "{'price': '13531', 'square': '98.0', 'floors': '29', 'rooms': 7, 'subway': 0, 'area': 11}\n", 55 | "{'price': '6429', 'square': '112.0', 'floors': '32', 'rooms': 7, 'subway': 0, 'area': 10}\n", 56 | "{'price': '11072', 'square': '140.0', 'floors': '6', 'rooms': 8, 'subway': 0, 'area': 12}\n", 57 | "{'price': '7318', 'square': '82.0', 'floors': '32', 'rooms': 5, 'subway': 0, 'area': 10}\n", 58 | "{'price': '12538', 'square': '134.0', 'floors': '32', 'rooms': 8, 'subway': 0, 'area': 12}\n", 59 | "{'price': '14243', 'square': '99.0', 'floors': '28', 'rooms': 6, 'subway': 0, 'area': 17}\n", 60 | "{'price': '7212', 'square': '104.0', 'floors': '30', 'rooms': 8, 'subway': 0, 'area': 
10}\n", 61 | "{'price': '12500', 'square': '136.0', 'floors': '13', 'rooms': 8, 'subway': 1, 'area': 22}\n", 62 | "{'price': '12281', 'square': '114.0', 'floors': '13', 'rooms': 7, 'subway': 0, 'area': 9}\n", 63 | "{'price': '9031', 'square': '90.8', 'floors': '32', 'rooms': 6, 'subway': 0, 'area': 4}\n", 64 | "{'price': '9973', 'square': '96.06', 'floors': '33', 'rooms': 7, 'subway': 1, 'area': 1}\n", 65 | "{'price': '10338', 'square': '89.0', 'floors': '8', 'rooms': 7, 'subway': 1, 'area': 1}\n", 66 | "{'price': '15843', 'square': '82.06', 'floors': '31', 'rooms': 6, 'subway': 1, 'area': 23}\n", 67 | "{'price': '10000', 'square': '38.0', 'floors': '31', 'rooms': 3, 'subway': 0, 'area': 1}\n", 68 | "{'price': '12451', 'square': '115.5', 'floors': '8', 'rooms': 8, 'subway': 0, 'area': 14}\n", 69 | "{'price': '16552', 'square': '58.0', 'floors': '12', 'rooms': 4, 'subway': 1, 'area': 23}\n", 70 | "{'price': '13115', 'square': '61.0', 'floors': '28', 'rooms': 4, 'subway': 0, 'area': 11}\n", 71 | "{'price': '9643', 'square': '112.0', 'floors': '24', 'rooms': 7, 'subway': 1, 'area': 1}\n", 72 | "{'price': '12089', 'square': '405.09', 'floors': '5', 'rooms': 16, 'subway': 1, 'area': 22}\n", 73 | "{'price': '4815', 'square': '135.0', 'floors': '24', 'rooms': 7, 'subway': 0, 'area': 21}\n", 74 | "{'price': '5772', 'square': '79.7', 'floors': '26', 'rooms': 5, 'subway': 0, 'area': 19}\n", 75 | "{'price': '9879', 'square': '164.0', 'floors': '2', 'rooms': 11, 'subway': 0, 'area': 1}\n", 76 | "{'price': '13338', 'square': '91.47', 'floors': '18', 'rooms': 7, 'subway': 0, 'area': 14}\n", 77 | "{'price': '5300', 'square': '117.0', 'floors': '7', 'rooms': 8, 'subway': 0, 'area': 19}\n", 78 | "{'price': '8607', 'square': '122.0', 'floors': '8', 'rooms': 8, 'subway': 0, 'area': 3}\n", 79 | "{'price': '5200', 'square': '105.0', 'floors': '17', 'rooms': 7, 'subway': 0, 'area': 5}\n", 80 | "{'price': '13871', 'square': '447.0', 'floors': '3', 'rooms': 11, 'subway': 0, 'area': 
22}\n", 81 | "{'price': '9660', 'square': '88.0', 'floors': '24', 'rooms': 6, 'subway': 1, 'area': 17}\n", 82 | "{'price': '10512', 'square': '72.3', 'floors': '34', 'rooms': 5, 'subway': 1, 'area': 4}\n" 83 | ] 84 | }, 85 | { 86 | "ename": "KeyError", 87 | "evalue": "'丰都县'", 88 | "output_type": "error", 89 | "traceback": [ 90 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 91 | "\u001b[1;31mKeyError\u001b[0m Traceback (most recent call last)", 92 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[0marea1\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'div'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mclass_\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'list-info'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'p'\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mclass_\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m'baseinfo'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfind_all\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'a'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_text\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m;\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 48\u001b[1;33m \u001b[0marea\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0marea1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 49\u001b[0m \u001b[0mjson\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m{\u001b[0m\u001b[1;34m'price'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mprice\u001b[0m 
\u001b[1;33m,\u001b[0m\u001b[1;34m'square'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0msquare\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'floors'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mfloors\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'rooms'\u001b[0m \u001b[1;33m:\u001b[0m \u001b[0mrooms\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m'subway'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0msubway\u001b[0m \u001b[1;33m,\u001b[0m \u001b[1;34m'area'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0marea\u001b[0m\u001b[1;33m}\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[0mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mjson\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;31m#打印结果\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 93 | "\u001b[1;31mKeyError\u001b[0m: '丰都县'" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "import requests\n", 99 | "from bs4 import BeautifulSoup\n", 100 | "import csv\n", 101 | "import os \n", 102 | "import re\n", 103 | "with open('Tiga.csv', mode='a',newline='') as csv_file:\n", 104 | " fieldnames = ['price', 'square','floors','rooms','subway', 'area']\n", 105 | " # 设置字段\n", 106 | " writer = csv.DictWriter(csv_file,fieldnames=fieldnames)\n", 107 | " #writer.writeheader(); #写入csv标题,第一次爬数据时使用\n", 108 | "\n", 109 | " headers = {\"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36\"}\n", 110 | " url = 'https://cq.58.com/ershoufang/sub/pn2'#pnXX 指第x页\n", 111 | " r = requests.get(url=url, headers=headers)\n", 112 | " html = r.text \n", 113 | " # html.parser解析器\n", 114 | " soup = BeautifulSoup(html, 'html.parser')\n", 115 | " con = soup.find('ul',class_='house-list-wrap')\n", 116 | " con_list = con.find_all('li' ,class_= 'sendsoj')\n", 117 | " con_list\n", 118 | " \n", 119 | " dict = {'奉节县':26,'巴南区':1,'北碚区': 2,'璧山区': 3,'大渡口区': 4,'大足区':5,'垫江县':6,'涪陵区':7,\n", 120 | " '合川区':8,'江北区':9,'江津区':10,'九龙坡区':11,'开州区(开县)':12,'梁平县':13,'南岸区':14,'南川区':15,\n", 121 | 
" '荣昌区':16,'沙坪坝区':17,'铜梁区':18,'潼南区':19,'万州区':20,'永川区':21,'渝北区':22,'渝中区':23,'云阳县':24,'长寿区':25}#设置字典,通过数据来设定特征值\n", 122 | " \n", 123 | " for i in con_list:\n", 124 | " name = i.find('h2',class_='title').find('a').get_text();\n", 125 | " price1 = i.find('div',class_='price').find('p',class_ = 'unit').get_text();\n", 126 | " price = re.compile('^[0-9]*').findall(price1)[0]\n", 127 | " list = i.find('div',class_='list-info').find('p',class_ = 'baseinfo').find_all('span');\n", 128 | " square2 = list[1].get_text();\n", 129 | " square = re.compile('[1-9]\\d*\\.\\d*|0\\.\\d*[1-9]\\d*').findall(square2)[0]\n", 130 | " floors1 = list[3].get_text();\n", 131 | " floors = re.compile('[0-9]+').findall(floors1)[0]\n", 132 | " rooms1 = list[0].get_text();\n", 133 | " rooms = int(re.compile('[0-9]+').findall(rooms1)[0]) + int(re.compile('[0-9]+').findall(rooms1)[1])+ int(re.compile('[0-9]+').findall(rooms1)[2])\n", 134 | " subwaybox = i.find('div',class_='list-info').find_all('p',class_ = 'baseinfo')[1].find_all('span');\n", 135 | " subway1 = \"\";\n", 136 | " if len(subwaybox)>1 :\n", 137 | " subway1 = subwaybox[1].get_text();\n", 138 | " else :\n", 139 | " subway1 = \"\";\n", 140 | " subway = 0;\n", 141 | " if len(subway1)>1:\n", 142 | " subway = 1;\n", 143 | " \n", 144 | " area1= i.find('div',class_='list-info').find_all('p',class_ = 'baseinfo')[1].find_all('a')[1].get_text();\n", 145 | " area = dict[area1]\n", 146 | " json = {'price':price ,'square':square,'floors':floors,'rooms' : rooms,'subway': subway , 'area': area}\n", 147 | " print(json)#打印结果\n", 148 | " writer.writerow(json)#将结果写入csv\n" 149 | ] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": 
"python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.7.3" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 2 173 | } 174 | --------------------------------------------------------------------------------