├── Auxiliary
    └── crawelFunc.py
├── README.md
├── ResidentialAreaBoundaryControl.py
├── ResidentialAreaBoundaryFunc.py
└── data
    ├── czfjsj.csv
    └── czfjsj_boundary.csv


/Auxiliary/crawelFunc.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:utf-8 -*-
  2 | # author:Changing Xu
  3 | # file:WuhanCoronavirusDataCrawel-crawelFunc
  4 | # datetime:2020/1/27 17:17
  5 | # software: PyCharm
  6 | # 爬虫辅助func
  7 | 
  8 | import os
  9 | import csv
 10 | import json
 11 | import codecs
 12 | import pandas as pd
 13 | import time
 14 | import datetime
 15 | import random
 16 | 
 17 | 
 18 | # 判断网络连接状况
 19 | def judgeNetwork():
 20 |     try:
 21 |         os.popen("ping www.baidu.com -n 1").read()
 22 |         return True
 23 |     except(Exception):
 24 |         print('网络连接失败')
 25 |         return False
 26 | 
 27 | 
 28 | # pandas 读取csv
 29 | def loadCSV(path, header=0):
 30 |     return pd.read_csv(path, header=header, sep=',', encoding='gbk')
 31 | 
 32 | 
 33 | # 生成日期列表
 34 | def createAssistDate(datestart=None, dateend=None):
 35 |     # 创建日期辅助表
 36 |     if datestart is None:
 37 |         datestart = '20200101'
 38 |     if dateend is None:
 39 |         dateend = datetime.datetime.now().strftime('%Y%m%d')
 40 |     # 转为日期格式
 41 |     datestart = datetime.datetime.strptime(datestart, '%Y%m%d')
 42 |     dateend = datetime.datetime.strptime(dateend, '%Y%m%d')
 43 |     date_list = []
 44 |     date_list.append(datestart.strftime('%Y%m%d'))
 45 |     while datestart < dateend:
 46 |         # 日期叠加一天
 47 |         datestart += datetime.timedelta(days=+1)
 48 |         # 日期转字符串存入列表
 49 |         date_list.append(datestart.strftime('%Y%m%d'))
 50 |     return date_list
 51 | 
 52 | 
 53 | # 随机生成指定位数
 54 | def getRandomNum(Length):
 55 |     return '1'+''.join(str(random.sample(range(0,9),1)[0]) for _ in range(Length-1))
 56 | 
 57 | 
 58 | # 时间戳转换
 59 | def timestampToLocaltime(timestamp):
 60 |     time_local = time.localtime(int(str(timestamp)[:10]))
 61 |     return time.strftime("%Y-%m-%d %H:%M:%S", time_local)
 62 | 
 63 | 
 64 | def save_info(results, csvPath, judge):
 65 |     try:
 66 |         dataframe = pd.DataFrame(results, index=[judge])
 67 |         if (judge):
 68 |             dataframe.to_csv(csvPath, sep=',', encoding="utf_8_sig", mode='w', index=0, columns=list(results.keys()))
 69 |         else:
 70 |             dataframe.to_csv(csvPath, sep=',', encoding="utf_8_sig", mode='a', header=0, index=0, columns=list(results.keys()))
 71 | 
 72 |     except Exception as e:
 73 |         print(e)
 74 | 
 75 | 
 76 | def infoTemplate():
 77 |     infoTitle = ['adoptType', 'createTime', 'dataInfoOperator', 'dataInfoState', 'dataInfoTime', 'entryWay', 'id', 'infoSource', 'infoType', 'modifyTime',
 78 |                  'provinceId', 'provinceName', 'pubDate', 'pubDateStr', 'sourceUrl', 'summary', 'title']
 79 |     return dict(zip(infoTitle, ['' for _ in infoTitle]))
 80 | 
 81 | 
 82 | def jsonToCsv(path):
 83 |     filePath, fileName = os.path.split(path)
 84 |     csvPath = os.path.join(os.path.dirname(filePath), 'csv', os.path.splitext(fileName)[0] + '.csv')
 85 |     flag = True
 86 |     infoTitle = ['adoptType', 'createTime', 'dataInfoOperator', 'dataInfoState', 'dataInfoTime', 'entryWay', 'id', 'infoSource', 'infoType', 'modifyTime',
 87 |                  'provinceId', 'provinceName', 'pubDate', 'pubDateStr', 'sourceUrl', 'summary', 'title']
 88 |     with open(path, 'r', encoding='utf-8') as jsonData_f:
 89 |         jsonData = json.load(jsonData_f)
 90 |         for info in jsonData['data']:
 91 |             dictKeys = list(info.keys())
 92 |             infoTitle_tmp = infoTitle.copy()
 93 |             for titleName in infoTitle:
 94 |                 if titleName in dictKeys:
 95 |                     infoTitle_tmp.remove(titleName)
 96 |                     dictKeys.remove(titleName)
 97 |             for titleName in infoTitle_tmp: info[titleName] = 'null'
 98 |             for titleName in dictKeys: del info[titleName]
 99 |             if info['createTime']: info['createLocalTime'] = timestampToLocaltime(info['createTime'])
100 |             if info['dataInfoTime']: info['dataInfocLocalTime'] = timestampToLocaltime(info['dataInfoTime'])
101 |             save_info(info, csvPath, flag)
102 |             if flag: flag = False
103 |         jsonData_f.close()
104 |     print(f'{path} >>> {csvPath} convert complete')
105 | 
106 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 小区边界爬取
 2 | 
 3 | 1. 数据源：[百度地图api](https://lbsyun.baidu.com/index.php?title=jspopular/guide/introduction)
 4 | 
 5 | 2. 项目结构
 6 | ``` 
 7 |    RESIDENTIALAREABOUNDARY
 8 |    │  ResidentialAreaBoundaryControl.py 主函数
 9 |    │  ResidentialAreaBoundaryFunc.py 方法库
10 |    │  
11 |    ├─Auxiliary
12 |    │     crawelFunc.py 辅助函数
13 |    └─data
14 |          czfjsj.csv 样例-小区数据
15 |          czfjsj_boundary.csv 样例-小区边界爬取结果
16 |    ```
17 | 
18 | 3. 主函数需要参数：
19 | 
20 |    - orgin_csv_path：小区数据csv（参考 czfjsj.csv，仅需小区名称和位置即可）
21 |    - output_path：输出目录
22 |    - bd_ak：[百度地图ak列表](http://lbsyun.baidu.com/apiconsole/key?application=key)，数据量较大时建议多添加几个ak，防止配额用完；该版本尚未添加配额监测、负载均衡、断点续存等功能


--------------------------------------------------------------------------------
/ResidentialAreaBoundaryControl.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # author:Changing Xu
 3 | # file:ResidentialAreaBoundary-ResidentialAreaBoundaryControl
 4 | # datetime:2020/2/15 12:16
 5 | # software: PyCharm
 6 | import os
 7 | from random import choice
 8 | from Auxiliary.crawelFunc import loadCSV, save_info
 9 | from ResidentialAreaBoundaryFunc import *
10 | 
# Baidu Map API keys (ak); one is picked at random with choice() for each keyed API request.
bd_ak = ['your keys']
12 | 
13 | 
def crawel_residential_area_boundary(orgin_csv_path, output_path, region_prefix='江苏省常州市'):
    """Crawl boundary points for every residential area listed in a CSV file.

    For each input row the first column is used as the residential area name
    and the second column is appended to ``region_prefix`` to form the search
    region. The area's uid is looked up, its boundary is fetched and converted
    with ``transform_coordinate_batch``, and every boundary point is written
    as one row to ``<output_path>/<input file stem>_boundary.csv``.

    Rows for which the uid, the boundary, or the converted coordinates are
    unavailable are skipped.

    Args:
        orgin_csv_path: Path of the input CSV (read with ``loadCSV``).
        output_path: Directory that receives the ``*_boundary.csv`` file.
        region_prefix: Text placed before the second column's value to build
            the search region. Defaults to the previously hard-coded value.
    """
    judge = True  # True only until the first row is written (that row creates the file and header)
    orgin_df = loadCSV(orgin_csv_path)
    total = orgin_df.shape[0]
    csv_stem = os.path.basename(orgin_csv_path).split('.')[0]
    output_path = os.path.join(output_path, f"{csv_stem}_boundary.csv")

    for index, residential_info in orgin_df.iterrows():
        # Use .iloc for positional access; integer keys on a label-indexed Series are deprecated.
        residential_name = residential_info.iloc[0]
        residential_region = f'{region_prefix}{residential_info.iloc[1]}'

        residential_uid = get_residential_uid(residential_name, residential_region, choice(bd_ak))
        if residential_uid is None:
            continue
        coord_bd09mc_list = get_boundary_by_uid(residential_uid)
        if not coord_bd09mc_list:
            continue
        coord_bd09_list = transform_coordinate_batch(coord_bd09mc_list, choice(bd_ak))
        # An empty string would otherwise yield a single '' point and fail on the y coordinate.
        if not coord_bd09_list:
            continue

        for idx, coord in enumerate(coord_bd09_list.split(';')):
            coord_x, _, coord_y = coord.partition(',')
            save_info({'residential_name': residential_name,
                       'residential_region': residential_region,
                       'coord_x': coord_x,
                       'coord_y': coord_y,
                       'point_id': idx
                       }, output_path, judge)
            judge = False
        print(f'{residential_name}  Area Boundary Save Success\t {index}/{total}')
37 | 
38 | 
# Script entry point: crawl boundaries for the sample CSV and write the result into the data directory.
if __name__ == '__main__':
    crawel_residential_area_boundary('data/czfjsj.csv','data')
41 | 


--------------------------------------------------------------------------------
/ResidentialAreaBoundaryFunc.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | # author:Changing Xu
 3 | # file:ResidentialAreaBoundary-ResidentialAreaBoundaryFunc
 4 | # datetime:2020/2/15 14:31
 5 | # software: PyCharm
 6 | import requests
 7 | from requests.adapters import HTTPAdapter
 8 | import json
 9 | 
10 | 
def get_residential_uid(residential_name, region, bmap_key):
    """Look up the Baidu Map uid of a residential area via the place search API.

    Args:
        residential_name: Name of the residential area to search for.
        region: Region the search is limited to (``city_limit=true``).
        bmap_key: Baidu Map API key (ak).

    Returns:
        The ``uid`` of the first result whose name contains no '-', or None
        when the API status is not 0, there are no results, no result
        qualifies, or a result lacks the expected fields.
    """
    bmap_localserach_url = 'http://api.map.baidu.com/place/v2/search'
    # Pass values as params so characters such as '&' or '#' in a name are
    # URL-encoded instead of corrupting the query string.
    query_params = {
        'query': residential_name,
        'region': region,
        'output': 'json',
        'city_limit': 'true',
        'ak': bmap_key,
    }

    # The context manager closes the session (and its adapters) on exit.
    with requests.Session() as s:
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        response = s.get(bmap_localserach_url, params=query_params, timeout=5,
                         headers={"Connection": "close"})  # , proxies=proxies
        data = json.loads(response.text)

    if data.get('status') != 0:
        return None
    try:
        for info in data.get('results') or []:
            # Names containing '-' are skipped — presumably sub-entries of an
            # area rather than the area itself; TODO confirm.
            if '-' not in info['name']:
                return info['uid']
    except (KeyError, TypeError):
        print(f'Error\t{response.url}')
    return None
31 | 
32 | 
def get_boundary_by_uid(uid):
    """Fetch the boundary of a Baidu Map object by its uid.

    The response's ``content.geo`` string is split on '|'; the third segment
    is split on '-' and the part after the dash is read as a comma-separated
    list of alternating x and y values.

    Args:
        uid: Baidu Map uid of the target object.

    Returns:
        str | None: Boundary points as ``"x,y;x,y;..."``, or None when the
        response has no geo information, the geo string does not have the
        expected layout, or no complete coordinate pair is found.
    """
    bmap_boundary_url = f'http://map.baidu.com/?reqflag=pcmap&from=webmap&qt=ext&uid={uid}&ext_ver=new&l=18'

    # The context manager closes the session (and its adapters) on exit.
    with requests.Session() as s:
        # mount: register a transport adapter (here with 3 retries) for a URL prefix.
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))
        response = s.get(url=bmap_boundary_url, timeout=5, headers={"Connection": "close"})
        data = json.loads(response.text)

    content = data.get('content') if isinstance(data, dict) else None
    if not isinstance(content, dict) or 'geo' not in content:
        return None

    try:
        raw_values = str(content['geo']).split('|')[2].split('-')[1].split(',')
    except IndexError:
        print(f'Error\t{bmap_boundary_url}')
        return None

    values = [value.strip(';') for value in raw_values]
    # Even positions are x, odd positions are y; an unpaired trailing value is dropped.
    pairs = [f'{x},{y}' for x, y in zip(values[0::2], values[1::2])]
    if not pairs:
        return None
    return ';'.join(pairs)
67 | 
68 | 
def transform_coordinate_batch(coordinates, bmap_key, batch_size=100):
    """Convert coordinates with the Baidu geoconv API (``from=6`` to ``to=5``).

    Args:
        coordinates: Points as ``"x,y;x,y;..."``.
        bmap_key: Baidu Map API key (ak).
        batch_size: Maximum number of points sent per request. Inputs no
            longer than this are sent in a single request.
            NOTE(review): 100 is believed to be the per-request limit of
            geoconv v1 — confirm against the current API documentation.

    Returns:
        str | None: Converted points as ``"x,y;x,y;..."`` in input order, or
        None when the input is empty, any request returns a non-zero status,
        a result entry lacks x/y, or nothing was converted.
    """
    if not coordinates:
        return None
    batch_size = max(int(batch_size), 1)
    points = coordinates.split(';')
    converted = []

    # The context manager closes the session (and its adapters) on exit.
    with requests.Session() as s:
        s.mount('http://', HTTPAdapter(max_retries=3))
        s.mount('https://', HTTPAdapter(max_retries=3))

        for start in range(0, len(points), batch_size):
            chunk = ';'.join(points[start:start + batch_size])
            req_url = 'http://api.map.baidu.com/geoconv/v1/?coords=' + chunk + '&from=6&to=5&ak=' + bmap_key
            response = s.get(req_url, timeout=5, headers={"Connection": "close"})  # , proxies=proxies
            data = json.loads(response.text)
            # A failed batch makes the whole boundary unusable, so give up entirely.
            if data.get('status') != 0:
                return None
            try:
                converted.extend(f"{res['x']},{res['y']}" for res in data['result'])
            except (KeyError, TypeError):
                print(f'Error\t{req_url}')
                return None

    return ';'.join(converted) or None
94 | 
95 | 


--------------------------------------------------------------------------------
/data/czfjsj.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/XuCQ/ResidentialAreaBoundary/652094df400de345de2794c1a1ebf405c66542bd/data/czfjsj.csv


--------------------------------------------------------------------------------