# -*- coding:utf-8 -*-
# author:Changing Xu
# file:WuhanCoronavirusDataCrawel-crawelFunc
"""Auxiliary helpers shared by the crawler scripts.

Contains small utilities for connectivity checks, CSV loading/saving,
date-list generation, random numeric ids, timestamp formatting and the
JSON -> CSV conversion of crawled news records.
"""

import codecs
import csv
import datetime
import json
import os
import platform
import random
import subprocess
import time

import pandas as pd

# Column set kept for every news record (shared by infoTemplate/jsonToCsv).
_INFO_TITLES = (
    'adoptType', 'createTime', 'dataInfoOperator', 'dataInfoState',
    'dataInfoTime', 'entryWay', 'id', 'infoSource', 'infoType', 'modifyTime',
    'provinceId', 'provinceName', 'pubDate', 'pubDateStr', 'sourceUrl',
    'summary', 'title',
)

# Placeholder written to the CSV for fields absent from a record.
_MISSING = 'null'


def judgeNetwork():
    """Check network connectivity by pinging www.baidu.com once.

    :return: True when the ping succeeds, otherwise False (a failure
        message is printed).
    """
    # The "send N packets" flag differs between Windows and POSIX ping.
    count_flag = '-n' if platform.system().lower() == 'windows' else '-c'
    try:
        completed = subprocess.run(
            ['ping', count_flag, '1', 'www.baidu.com'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=10,
        )
    except (OSError, subprocess.TimeoutExpired):
        # ping binary unavailable, or no answer within the timeout.
        completed = None
    if completed is not None and completed.returncode == 0:
        return True
    print('网络连接失败')
    return False


def loadCSV(path, header=0, encoding='gbk'):
    """Read a comma-separated file into a pandas DataFrame.

    :param path: path of the CSV file
    :param header: row number used as the header (passed to pandas)
    :param encoding: text encoding of the file; defaults to GBK
    :return: pandas.DataFrame
    """
    return pd.read_csv(path, header=header, sep=',', encoding=encoding)


def createAssistDate(datestart=None, dateend=None):
    """Build the list of consecutive days between two dates (inclusive).

    :param datestart: first day as 'YYYYMMDD'; defaults to '20200101'
    :param dateend: last day as 'YYYYMMDD'; defaults to today
    :return: list of 'YYYYMMDD' strings. When ``datestart`` is later than
        ``dateend`` the list contains only ``datestart``.
    """
    fmt = '%Y%m%d'
    if datestart is None:
        datestart = '20200101'
    if dateend is None:
        dateend = datetime.datetime.now().strftime(fmt)
    current = datetime.datetime.strptime(datestart, fmt)
    last = datetime.datetime.strptime(dateend, fmt)
    one_day = datetime.timedelta(days=1)
    date_list = [current.strftime(fmt)]
    while current < last:
        current += one_day
        date_list.append(current.strftime(fmt))
    return date_list


def getRandomNum(Length):
    """Return a random decimal string of ``Length`` digits starting with '1'.

    The remaining digits are drawn uniformly from 0-9.  For ``Length`` below
    1 the single character '1' is returned.

    :param Length: requested number of digits
    :return: str
    """
    return '1' + ''.join(random.choice('0123456789') for _ in range(Length - 1))


def timestampToLocaltime(timestamp):
    """Format a Unix timestamp as local time 'YYYY-MM-DD HH:MM:SS'.

    Only the first 10 characters of ``str(timestamp)`` are used, so
    millisecond timestamps (13 digits) are truncated to seconds.

    :param timestamp: int or str timestamp
    :return: formatted local-time string
    """
    seconds = int(str(timestamp)[:10])
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))


def save_info(results, csvPath, judge):
    """Write one record to a CSV file (best effort).

    :param results: dict mapping column name -> scalar value (one row)
    :param csvPath: destination CSV path; the parent directory is created
        when it does not exist
    :param judge: truthy -> (over)write the file including the header row;
        falsy -> append the row without a header
    :return: None. Any failure is printed instead of raised so that a long
        crawl is not aborted by a single bad row.
    """
    try:
        parent = os.path.dirname(csvPath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame = pd.DataFrame(results, index=[0])
        frame.to_csv(
            csvPath,
            sep=',',
            encoding="utf_8_sig",
            mode='w' if judge else 'a',
            header=bool(judge),
            index=False,
            columns=list(results.keys()),
        )
    except Exception as e:  # deliberate best-effort boundary
        print(e)


def infoTemplate():
    """Return an empty news record: every known column mapped to ''."""
    return dict.fromkeys(_INFO_TITLES, '')


def _local_time_or_missing(value):
    """Convert a timestamp field to local time, or the placeholder if absent."""
    if not value or value == _MISSING:
        return _MISSING
    return timestampToLocaltime(value)


def jsonToCsv(path):
    """Convert a crawled JSON file into a CSV next to it.

    The JSON must contain a ``data`` list of records.  The CSV is written to
    ``<parent of the JSON's directory>/csv/<json name>.csv``.  Every row has
    the same columns in the same order: the known news columns followed by
    ``createLocalTime`` and ``dataInfocLocalTime``.  Fields missing from a
    record are written as 'null'; unknown fields are dropped.

    :param path: path of the JSON file
    :return: None
    """
    filePath, fileName = os.path.split(path)
    csvPath = os.path.join(os.path.dirname(filePath), 'csv',
                           os.path.splitext(fileName)[0] + '.csv')
    with open(path, 'r', encoding='utf-8') as jsonData_f:
        jsonData = json.load(jsonData_f)

    first_row = True
    for info in jsonData['data']:
        # Fixed column order keeps appended rows aligned with the header.
        row = {title: info.get(title, _MISSING) for title in _INFO_TITLES}
        row['createLocalTime'] = _local_time_or_missing(row['createTime'])
        row['dataInfocLocalTime'] = _local_time_or_missing(row['dataInfoTime'])
        save_info(row, csvPath, first_row)
        first_row = False
    print(f'{path} >>> {csvPath} convert complete')
# -*- coding:utf-8 -*-
# author:Changing Xu
# file:ResidentialAreaBoundary (ResidentialAreaBoundaryFunc + ResidentialAreaBoundaryControl)
"""Residential-area boundary crawler.

Data source: Baidu Map API
(https://lbsyun.baidu.com/index.php?title=jspopular/guide/introduction).

Project layout:
    ResidentialAreaBoundaryControl.py   entry point
    ResidentialAreaBoundaryFunc.py      API helpers
    Auxiliary/crawelFunc.py             auxiliary functions
    data/czfjsj.csv                     sample residential-area data
    data/czfjsj_boundary.csv            sample crawl result

Inputs of the entry point:
    orgin_csv_path  CSV of residential areas (see data/czfjsj.csv; only the
                    area name and its location are required)
    output_path     output directory
    bd_ak           list of Baidu Map API keys
                    (http://lbsyun.baidu.com/apiconsole/key?application=key).
                    Add several keys for large datasets so the quota is not
                    exhausted; this version has no quota monitoring, load
                    balancing or resume-from-checkpoint support.
"""
import json
import os
from random import choice
from urllib.parse import urlencode

bd_ak = ['your keys']

# The geoconv v1 endpoint converts at most this many points per request.
_GEOCONV_MAX_COORDS = 100


def _http_get_json(url):
    """GET ``url`` and return the decoded JSON body, or None on failure.

    Uses a short-lived session with 3 retries per scheme and a 5 second
    timeout.  Transport errors and non-JSON bodies are reported with the
    same ``Error\\t<url>`` line the API helpers use, then None is returned.
    """
    # Imported here so the module can be imported without requests installed.
    import requests
    from requests.adapters import HTTPAdapter

    with requests.Session() as session:
        session.mount('http://', HTTPAdapter(max_retries=3))
        session.mount('https://', HTTPAdapter(max_retries=3))
        try:
            response = session.get(url, timeout=5, headers={"Connection": "close"})
            return json.loads(response.text)
        except (requests.RequestException, ValueError):
            print(f'Error\t{url}')
            return None


def get_residential_uid(residential_name, region, bmap_key, fetch_json=None):
    """Look up the Baidu Map uid of a residential area by name.

    :param residential_name: name of the residential area to search for
    :param region: region the search is limited to (``city_limit=true``)
    :param bmap_key: Baidu Map API key (ak)
    :param fetch_json: optional callable ``url -> parsed JSON (dict) or None``
        used instead of the built-in HTTP client (e.g. to add proxies)
    :return: uid of the first result whose name contains no '-', or None
        when the request fails, the status is non-zero or nothing matches
    """
    query = urlencode({
        'query': residential_name,
        'region': region,
        'output': 'json',
        'city_limit': 'true',
        'ak': bmap_key,
    })
    bmap_localserach_url = f'http://api.map.baidu.com/place/v2/search?{query}'

    data = (fetch_json or _http_get_json)(bmap_localserach_url)
    if not isinstance(data, dict) or data.get('status') != 0:
        return None
    try:
        for info in data.get('results') or []:
            # Names containing '-' are skipped by the original matching rule.
            if '-' not in info['name']:
                return info['uid']
    except (KeyError, TypeError):
        print(f'Error\t{bmap_localserach_url}')
    return None


def _parse_boundary_geo(geo):
    """Turn a Baidu ``geo`` string into 'x,y;x,y;...' (bd09mc coordinates).

    The outline is the part after the first '-' of the third '|'-separated
    field: a flat comma-separated list x1,y1,x2,y2,... ending with ';'.
    Returns None when no complete x,y pair is present.
    """
    outline = str(geo).split('|')[2].split('-', 1)[1]
    values = [value.strip(';') for value in outline.split(',')]
    values = [value for value in values if value]
    pairs = zip(values[0::2], values[1::2])
    return ';'.join(f'{x},{y}' for x, y in pairs) or None


def get_boundary_by_uid(uid, fetch_json=None):
    """Fetch the boundary of a Baidu Map place.

    :param uid: Baidu Map uid of the place
    :param fetch_json: optional callable ``url -> parsed JSON (dict) or None``
        used instead of the built-in HTTP client
    :return: None when the place has no geo information (or the request /
        parsing fails), otherwise the boundary points joined as
        'x,y;x,y;...'
    """
    bmap_boundary_url = f'http://map.baidu.com/?reqflag=pcmap&from=webmap&qt=ext&uid={uid}&ext_ver=new&l=18'

    data = (fetch_json or _http_get_json)(bmap_boundary_url)
    if not isinstance(data, dict):
        return None
    content = data.get('content')
    if not isinstance(content, dict) or 'geo' not in content:
        return None
    try:
        return _parse_boundary_geo(content['geo'])
    except (IndexError, TypeError):
        print(f'Error\t{bmap_boundary_url}')
        return None


def transform_coordinate_batch(coordinates, bmap_key, fetch_json=None,
                               batch_size=_GEOCONV_MAX_COORDS):
    """Convert coordinates through the Baidu geoconv API (from=6, to=5).

    Points are sent in chunks of ``batch_size`` so long boundaries do not
    exceed the per-request limit of the service.

    :param coordinates: points as 'x,y;x,y;...'
    :param bmap_key: Baidu Map API key (ak)
    :param fetch_json: optional callable ``url -> parsed JSON (dict) or None``
        used instead of the built-in HTTP client
    :param batch_size: maximum number of points per request
    :return: converted points as 'x,y;x,y;...', or None when there is
        nothing to convert or any request fails
    """
    fetch = fetch_json or _http_get_json
    points = [point for point in str(coordinates).split(';') if point]
    step = max(1, int(batch_size))

    converted = []
    for start in range(0, len(points), step):
        chunk = ';'.join(points[start:start + step])
        # Keep ',' and ';' literal so the coords parameter stays readable.
        query = urlencode({'coords': chunk, 'from': 6, 'to': 5, 'ak': bmap_key},
                          safe=',;')
        req_url = f'http://api.map.baidu.com/geoconv/v1/?{query}'
        data = fetch(req_url)
        if not isinstance(data, dict) or data.get('status') != 0:
            return None
        try:
            converted.extend(f"{res['x']},{res['y']}" for res in data['result'])
        except (KeyError, TypeError):
            print(f'Error\t{req_url}')
            return None
    return ';'.join(converted) or None


def crawel_residential_area_boundary(orgin_csv_path, output_path,
                                     region_prefix='江苏省常州市', ak_list=None):
    """Crawl the boundary of every residential area listed in a CSV.

    For each row the area uid is searched, its boundary fetched and
    converted, and one CSV row per boundary point is written to
    ``<output_path>/<input file name>_boundary.csv``.  Areas for which any
    step yields nothing are skipped.

    :param orgin_csv_path: input CSV; first column is the area name, second
        column its location (appended to ``region_prefix``)
    :param output_path: output directory
    :param region_prefix: text prepended to the location column to form the
        search region
    :param ak_list: Baidu Map API keys to pick from at random; defaults to
        the module-level ``bd_ak``
    :return: None
    """
    # Project-local helpers; imported lazily so this module has no
    # import-time dependency on the package layout.
    from Auxiliary.crawelFunc import loadCSV, save_info

    keys = ak_list if ak_list else bd_ak
    orgin_df = loadCSV(orgin_csv_path)
    total = orgin_df.shape[0]
    output_path = os.path.join(
        output_path,
        f"{os.path.basename(orgin_csv_path).split('.')[0]}_boundary.csv")

    judge = True  # True only for the very first row: writes the CSV header
    for index, residential_info in orgin_df.iterrows():
        residential_name = residential_info.iloc[0]
        residential_region = f'{region_prefix}{residential_info.iloc[1]}'

        residential_uid = get_residential_uid(residential_name, residential_region, choice(keys))
        if residential_uid is None:
            continue
        coord_bd09mc_list = get_boundary_by_uid(residential_uid)
        if coord_bd09mc_list is None:
            continue
        coord_bd09_list = transform_coordinate_batch(coord_bd09mc_list, choice(keys))
        if not coord_bd09_list:
            continue

        for idx, coord in enumerate(coord_bd09_list.split(';')):
            coord_x, _, coord_y = coord.partition(',')
            save_info({'residential_name': residential_name,
                       'residential_region': residential_region,
                       'coord_x': coord_x,
                       'coord_y': coord_y,
                       'point_id': idx
                       }, output_path, judge)
            judge = False
        print(f'{residential_name} Area Boundary Save Success\t {index}/{total}')


if __name__ == '__main__':
    crawel_residential_area_boundary('data/czfjsj.csv', 'data')
https://raw.githubusercontent.com/XuCQ/ResidentialAreaBoundary/652094df400de345de2794c1a1ebf405c66542bd/data/czfjsj.csv --------------------------------------------------------------------------------