├── .gitignore ├── 01应用宝App信息 ├── README.md ├── code │ ├── config.json │ └── main.py ├── images │ ├── output.png │ └── 应用宝爬虫演示.gif └── result │ └── tencent_games.xlsx ├── 02豌豆荚App信息 ├── README.md ├── code │ ├── config.json │ └── main.py ├── images │ ├── output.png │ └── 豌豆荚爬虫演示.gif └── result │ └── tencent_games.xlsx ├── 03安居客全国写字楼信息 ├── README.md ├── code │ ├── 01-城市信息获取.py │ └── 02-大中城市写字楼盘获取.py ├── data │ ├── 01_cities.csv │ ├── 02_effective_cities.csv │ └── 03_effective_cities_xzl.csv └── utils │ ├── __pycache__ │ ├── digital_decryption.cpython-37.pyc │ └── request.cpython-37.pyc │ ├── digital_decryption.py │ ├── lbs_transfrom.py │ └── request.py ├── LICENSE ├── README.md └── 工具箱 ├── 01安居客数字解密 ├── README.md ├── code │ └── keymap.py ├── images │ ├── 安居客加密数字样例.png │ ├── 安居客加密数字样例2.png │ └── 解密钥匙.png └── sample │ ├── keys.txt │ └── secret_price.txt └── 02腾讯坐标系转百度坐标系 ├── README.md ├── code └── tx2bd.py └── images ├── tmp.txt ├── 矫正经纬度.png ├── 经纬度差异.png └── 美团经纬度.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /01应用宝App信息/README.md: -------------------------------------------------------------------------------- 1 | # 应用宝APP信息爬取 2 | 3 | ## 适用场景 4 | 目标是根据一个或多个已知的**APP或关键字**查找出相同类型或关联的其他APP,在一定程度上作为APP扩量的初筛池。方法是利用应用宝的搜索引擎,输入搜索关键字,将返回列表的APP信息整合,返回搜索关键字、APP名称、PKG名称、下载量、简单描述。(注意:若多个搜索关键字都返回同一APP信息,则该APP仅作为第一个匹配的搜索关键字的结果,i.e. 返回结果中不存在重复APP) 5 | 6 | ## 使用教程 7 | 1. [点击这里下载][1]下载WebCrawler项目 8 | 9 | 2. 确保以下库均安装 10 | ```python 11 | import sys 12 | import json 13 | import pandas as pd 14 | import time 15 | from bs4 import BeautifulSoup 16 | from selenium import webdriver 17 | from selenium.webdriver.support.ui import WebDriverWait 18 | from selenium.webdriver.support import expected_conditions as EC 19 | ``` 20 | 21 | 3. 配置 *01应用宝App信息/code/config.json* 文件形如: 22 | ```javascript 23 | { 24 | "key_words":[ 25 | "王者荣耀","刺激战场" 26 | ], 27 | "output_tag":"tencent_games" 28 | } 29 | ``` 30 | 其中 *key_words* 为目标关键字,使用应用宝搜索引擎返回关联APP; *output_tag* 为指定输出excel文件名称,输出文件到 *01应用宝App信息/result/* 中,若有相同文件路径则会覆盖。 31 | 32 | 3. 执行 *01应用宝App信息/code/main.py* Python3脚本 33 | ```python 34 | python3 WebCrawler/01应用宝App信息/code/main.py 35 | ``` 36 | 4. 过程演示图片(若未显示,尝试使用代理后刷新页面) 37 | ![](images/应用宝爬虫演示.gif) 38 | 39 | 5. 
稍等 *key_words* 长度个弹窗后,检查 *01应用宝App信息/result/${output_tag}* 文件,格式为: 40 | ![](images/output.png) 41 | 42 | [1]:https://github.com/Colin-zh/WebCrawler/archive/main.zip 43 | -------------------------------------------------------------------------------- /01应用宝App信息/code/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "key_words":[ 3 | "王者荣耀","和平精英" 4 | ], 5 | "output_tag":"tencent_games" 6 | } 7 | -------------------------------------------------------------------------------- /01应用宝App信息/code/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- encoding:utf-8 -*- 3 | import sys 4 | import json 5 | import pandas as pd 6 | import time 7 | from bs4 import BeautifulSoup 8 | from selenium import webdriver 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | def chrome_init(): 13 | options = webdriver.ChromeOptions() 14 | # 不加载图片,加快访问速度 15 | options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) 16 | # 此步骤很重要,设置为开发者模式,防止被各大网站识别出来使用了Selenium 17 | options.add_experimental_option('excludeSwitches', ['enable-automation']) 18 | return webdriver.Chrome(options=options) 19 | 20 | def get_pkg_details_yyb(key_word,url): 21 | 22 | # 开启浏览器及对应链接 23 | driver = chrome_init() 24 | driver.get(url) 25 | 26 | # 设置滚动限定条件 27 | content = driver.page_source 28 | target = '加载更多' 29 | 30 | # 执行滚动至对应css组件消失,记录html文件 31 | # 设定滚动间隔为2s 32 | time_start = time.time() 33 | while target in content and time.time()-time_start <= 60: 34 | driver.find_element_by_class_name('load-more').click() 35 | content = driver.page_source 36 | time.sleep(2) 37 | driver.close() 38 | 39 | # 转化为bs4 40 | bf = BeautifulSoup(content,features="lxml") 41 | 42 | # 抓取html5中关键信息 43 | app_info = bf.find_all('a', class_ = 'appName', target='_blank') 44 | app_name = list(map(lambda 
x: x.text, app_info)) 45 | pkg_name = list(map(lambda x: x.attrs['href'].replace('../myapp/detail.htm?apkName=',''), app_info)) 46 | cnt = list(map(lambda x: x.text.replace('人下载','') ,bf.find_all('div', class_='down-line'))) 47 | desc = list(map(lambda x: x.text,bf.find_all('div', class_='recommend-hidden-box'))) 48 | key_word = [key_word] * len(app_name) 49 | 50 | # 返回app相关信息 51 | return pd.DataFrame(list(zip(key_word,app_name,pkg_name,cnt,desc)),columns = ['key_word','app_name','pkg_name','download_cnt','description']) 52 | 53 | if __name__ == '__main__': 54 | 55 | # 读取json配置 56 | with open('%s/config.json'%sys.path[0],'r',encoding='utf8')as fp: 57 | json_data = json.load(fp) 58 | key_words = json_data['key_words'] 59 | output_tag = json_data['output_tag'] 60 | 61 | # 创建空dataframe存储app信息 62 | df = pd.DataFrame(columns = ['key_word','app_name','pkg_name','download_cnt','description']) 63 | 64 | # 执行应用宝抓取 65 | # 根据关键字获取所有app的url 66 | urls = list(map(lambda key_word: 'https://sj.qq.com/myapp/search.htm?kw={}'.format(key_word), key_words)) 67 | dict_yyb = dict(zip(key_words,urls)) 68 | 69 | # dataframe添加相关信息 70 | for key_word,url in dict_yyb.items(): 71 | df = df.append(get_pkg_details_yyb(key_word,url), ignore_index=True) 72 | 73 | # 数据整合 74 | # 去重,按key_words顺序保留不重复值 75 | df.pkg_name = df.pkg_name.apply(lambda x : x.split('&info=')[0]) 76 | df = df.drop_duplicates(['pkg_name']) 77 | 78 | # 指定路径生成xlsx文件 79 | df.to_excel(sys.path[0] + '/../result/' + output_tag + '.xlsx') 80 | -------------------------------------------------------------------------------- /01应用宝App信息/images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/01应用宝App信息/images/output.png -------------------------------------------------------------------------------- /01应用宝App信息/images/应用宝爬虫演示.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/01应用宝App信息/images/应用宝爬虫演示.gif -------------------------------------------------------------------------------- /01应用宝App信息/result/tencent_games.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/01应用宝App信息/result/tencent_games.xlsx -------------------------------------------------------------------------------- /02豌豆荚App信息/README.md: -------------------------------------------------------------------------------- 1 | # 豌豆荚APP信息爬取 2 | 3 | ## 适用场景 4 | 目标是根据一个或多个已知的**APP或关键字**查找出相同类型或关联的其他APP,在一定程度上作为APP扩量的初筛池。方法是利用豌豆荚的搜索引擎,输入搜索关键字,将返回列表的APP信息整合,返回搜索关键字、APP名称、PKG名称、下载量、简单描述。(注意:若多个搜索关键字都返回同一APP信息,则该APP仅作为第一个匹配的搜索关键字的结果,i.e. 返回结果中不存在重复APP) 5 | 6 | ## 使用教程 7 | 1. [点击这里下载][1]下载WebCrawler项目 8 | 9 | 2. 确保以下库均安装 10 | ```python 11 | import sys 12 | import json 13 | import pandas as pd 14 | import random 15 | import requests 16 | import progressbar 17 | from bs4 import BeautifulSoup 18 | ``` 19 | 20 | 3. 配置 *02豌豆荚App信息/code/config.json* 文件形如: 21 | ```javascript 22 | { 23 | "key_words":[ 24 | "王者荣耀","刺激战场" 25 | ], 26 | "output_tag":"tencent_games" 27 | } 28 | ``` 29 | 其中 *key_words* 为目标关键字,使用豌豆荚搜索引擎返回关联APP; *output_tag* 为指定输出excel文件名称,输出文件到 *02豌豆荚App信息/result/* 中,若有相同文件路径则会覆盖。 30 | 31 | 3. 执行 *02豌豆荚App信息/code/main.py* Python3脚本 32 | ```python 33 | python3 WebCrawler/02豌豆荚App信息/code/main.py 34 | ``` 35 | 4. 过程演示图片(若未显示,尝试使用代理后刷新页面) 36 | ![](images/豌豆荚爬虫演示.gif) 37 | 38 | 5. 
稍等 *key_words* 长度个弹窗后,检查 *02豌豆荚App信息/result/${output_tag}*
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36' 36 | ] 37 | 38 | # referer列表(图片防盗链) 39 | #referer_list = [ 40 | # 'https://www.wandoujia.com/' 41 | #] 42 | 43 | # 初始化header 44 | header = {'User-Agent': random.choice(user_agent_list)} 45 | #header = {'User-Agent': random.choice(user_agent_list), 'Referer': random.choice(referer_list)} 46 | 47 | # 初始化分页索引 48 | pageIndex = 1 49 | 50 | # 初始化页面返回首页 51 | content = '-1' 52 | 53 | # 初始化返回列表,不可使用 [[]]*5 54 | res = [[],[],[],[],[]] 55 | 56 | # 返回为空信息 57 | #msg = '''{"state":{"code":2000000,"msg":"Ok","tips":""},"data":{"currPage":0,"totalPage":0,"content":""}}''' 58 | 59 | # 获取第一页信息同时获取总页数 60 | url = 'https://www.wandoujia.com/wdjweb/api/search/more?page={}&key={}'.format(pageIndex,key_word) 61 | html = requests.get(url = url, headers = header).text 62 | 63 | # 转化为bs4 64 | content = json.loads(html)['data']['content'] 65 | bf = BeautifulSoup(content,features="lxml") 66 | 67 | totalPage = json.loads(html)['data']['totalPage'] 68 | 69 | # 初始化progressbar 70 | # 定义进度条的显示样式 71 | widgets = [key_word + ' : ', progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()] 72 | 73 | # 创建进度条并开始运行 74 | pbar = progressbar.ProgressBar(maxval=totalPage, widgets=widgets).start() 75 | 76 | res = get_details_wdj(bf,key_word,res) 77 | if res: 78 | pbar.update(pageIndex) 79 | else: 80 | print('" ' + key_word + ' " ' + '爬取第%s'%pageIndex + '页时出错') 81 | 82 | # 循环获取剩余页数网页app信息 83 | while pageIndex < totalPage: 84 | 85 | # 更新url及html内容 86 | pageIndex += 1 87 | url = 'https://www.wandoujia.com/wdjweb/api/search/more?page={}&key={}'.format(pageIndex,key_word) 88 | html = requests.get(url = url, headers = header).text 89 | 90 | # 转化为bs4 91 | content = json.loads(html)['data']['content'] 92 | bf = BeautifulSoup(content,features="lxml") 93 | 94 | res = get_details_wdj(bf,key_word,res) 95 | if res: 96 | pbar.update(pageIndex) 97 | else: 98 | print('" ' + 
key_word + ' " ' + '爬取第%s'%pageIndex + '页时出错') 99 | 100 | # 结束进度条 101 | pbar.finish() 102 | 103 | return pd.DataFrame(zip(*res),columns = ['key_word','app_name','pkg_name','download_cnt','description']) 104 | 105 | if __name__ == '__main__': 106 | 107 | # 读取json配置 108 | with open('%s/config.json'%sys.path[0],'r',encoding='utf8')as fp: 109 | json_data = json.load(fp) 110 | key_words = json_data['key_words'] 111 | output_tag = json_data['output_tag'] 112 | 113 | # 创建空dataframe存储app信息 114 | df = pd.DataFrame(columns = ['key_word','app_name','pkg_name','download_cnt','description']) 115 | 116 | # 遍历获取各关键字对应app列表 117 | for key_word in key_words: 118 | tmp = get_info_wdj(key_word) 119 | # dataframe添加相关信息 120 | df = df.append(tmp, ignore_index = True) 121 | 122 | # 数据整合 123 | # 去重,按key_words顺序保留不重复值 124 | df = df.drop_duplicates(['pkg_name']) 125 | 126 | # 指定路径生成xlsx文件 127 | df.to_excel(sys.path[0] + '/../result/' + output_tag + '.xlsx') 128 | 129 | -------------------------------------------------------------------------------- /02豌豆荚App信息/images/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/02豌豆荚App信息/images/output.png -------------------------------------------------------------------------------- /02豌豆荚App信息/images/豌豆荚爬虫演示.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/02豌豆荚App信息/images/豌豆荚爬虫演示.gif -------------------------------------------------------------------------------- /02豌豆荚App信息/result/tencent_games.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/02豌豆荚App信息/result/tencent_games.xlsx 
-------------------------------------------------------------------------------- /03安居客全国写字楼信息/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /03安居客全国写字楼信息/code/01-城市信息获取.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append(sys.path[0]+'/..') 5 | import pandas as pd 6 | import progressbar 7 | import re 8 | from bs4 import BeautifulSoup 9 | from utils.request import get_request 10 | 11 | # 验证城市办公楼盘获取有效性 12 | def validation(city_dct): 13 | # 定义关键字 14 | msg = '办公楼盘' 15 | # 初始化有效城市字典 16 | effective_cities = {} 17 | # 初始化progressbar 18 | # 定义进度条的显示样式 19 | widgets = ['有效验证中 : ', 20 | progressbar.Percentage(), 21 | " ", 22 | progressbar.Bar(), 23 | " ", 24 | progressbar.ETA()] 25 | # 创建进度条并开始运行 26 | pbar = progressbar.ProgressBar(maxval=len(city_dct), 27 | widgets=widgets).start() 28 | i = 0 29 | for city,abbr in city_dct.items(): 30 | url = 'https://%s.anjuke.com/'%abbr 31 | content = get_request(url, anonymous=True) 32 | i += 1 33 | pbar.update(i) 34 | navigation_info = re.search(r'%s'%msg, content) 35 | if navigation_info: 36 | url = re.search(r'href=(.*?)>', navigation_info.group(0)).group(1) 37 | effective_cities[city] = {'abbr':abbr, 'url':url} 38 | # 结束进度条 39 | pbar.finish() 40 | return effective_cities 41 | 42 | if __name__ == '__main__': 43 | # 初始化城市链接页 44 | url = 'https://www.anjuke.com/sy-city.html' 45 | # 城市列表 46 | content = BeautifulSoup(get_request(url,True), features='lxml') 47 | cities = content.find('div', class_='letter_city').find_all('a') 48 | # 城市缩写清洗 49 | city_map = {} 50 | for city in cities: 51 | city_name = city.text.strip() 52 | city_abbr = city.attrs['href'].replace('https://','').replace('.anjuke.com','') 53 | city_map[city_name] = city_abbr 54 | # 安居客中会把小城市的缩写归类到附近大城市,例如:临安,富阳归为杭州 55 | # 
因此需要对city_map进行清洗,仅保留大城市的对应关系 56 | # Step1 统计具有重复缩写城市的名称集合 57 | # city_df = pd.DataFrame.from_dict(city_map, 58 | # orient='index', 59 | # columns = ['abbr']).reset_index().rename(columns = {'index':'city'}) # 字典转dataframe 60 | # tmp = city_df.groupby('abbr')['city'].agg(['count', list]).reset_index() # 统计abbr重复个数及对应城市列表 61 | # tmp.where(tmp['count'] != 1).dropna() # 仅观测count不为1的城市缩写 62 | 63 | # Step2 观测发现,一下城市冗余 64 | duplicated_cities = ['巴州', '农安', '阿坝州', '阿坝州', '大邑', '金堂', '铜梁', '丰都', 65 | '长寿', '巢湖', '涪陵', '南川', '永川', '綦江', '黔江', '万州', '江津', 66 | '合川', '宁乡', '普兰店', '肇源', '长乐', '连江', '平潭', '白沙县', 67 | '儋州市', '澄迈县', '定安', '琼中', '屯昌', '文昌市', '淳安', '富阳', 68 | '临安', '桐庐', '肥东', '肥西', '庐江', '长丰', '龙门', '平阴', 69 | '济阳', '商河', '宜良', '文安', '永登', '榆中', '汝阳', '当涂', 70 | '宾阳', '横县', '宁海', '新建', '即墨', '胶南', '晋安', '陵水', 71 | '保亭', '东方市', '上虞', '无极', '辛集', '元氏', '辽中', '新民', 72 | '乐亭', '滦县', '周至', '户县', '蓝田', '丰县', '睢宁', '江都', 73 | '中牟', '巩义'] 74 | # 安居客涵盖的所有城市 75 | city_dct = {key: city_map[key] for key in city_map.keys() if key not in duplicated_cities} 76 | # 可直接通过导航栏中的 "办公楼盘" 获取写字楼的城市 77 | effective_cities = validation(city_dct) 78 | # 上述字典转DataFrame 79 | city_df = pd.DataFrame.from_dict(city_dct, 80 | orient='index', 81 | columns=['abbr']).reset_index().rename(columns = {'index':'city'}) # 字典转dataframe 82 | effective_cities_df = pd.DataFrame.from_dict(effective_cities, 83 | orient='index').reset_index() # 字典转dataframe 84 | # 输出保存 85 | city_df.to_csv('%s/../data/01_cities.csv'%sys.path[0], encoding='utf-8', index=False) 86 | effective_cities_df.to_csv('%s/../data/02_effective_cities.csv'%sys.path[0], encoding='utf-8', index=False) 87 | -------------------------------------------------------------------------------- /03安居客全国写字楼信息/code/02-大中城市写字楼盘获取.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | sys.path.append(sys.path[0]+'/..') 5 | import pandas as pd 6 | import 
re 7 | from bs4 import BeautifulSoup 8 | from utils.request import get_request 9 | from utils.digital_decryption import get_keymap,keymap_replace 10 | from utils.lbs_transfrom import lbsTransform 11 | 12 | def getInfo(effective_cities_dct): 13 | # 初始化结果列表与字典长度 14 | res = [] 15 | cities_len = len(effective_cities_dct) 16 | # 循环遍历字典爬虫 17 | for index,row in effective_cities_dct.items(): 18 | # 初始化页数与记录城市数 19 | pageIndex = 1 20 | cityIndex = index + 1 21 | # 初始化城市及链接信息 22 | city = row['index'] 23 | abbr = row['abbr'] 24 | url = row['url'] 25 | # 初始化停止抓取信息 26 | msg = '暂无匹配的楼盘' 27 | content_text = '-1' 28 | while msg not in content_text: 29 | print("【INFO】:正在获取{}第{}页,进度为{}/{}".format(city,pageIndex,cityIndex,cities_len)) 30 | # 初始化url 31 | url = re.search(r'https://.*?/loupan/',url).group(0) + 'p%s/'%pageIndex 32 | # 获取网页信息,更新content_text 33 | html = get_request(url,True) 34 | content = BeautifulSoup(html, features='lxml') 35 | content_text = content.find('div', class_='main-body').text 36 | # 提前结束 37 | if msg in content_text: 38 | break 39 | # 获取密钥 40 | keys = get_keymap(html) 41 | # 写字楼列表 42 | xzl_list = content.find_all('div', class_='list-item') 43 | # 写字楼细节 44 | for xzl in xzl_list: 45 | xzl_name = xzl.find('p', class_='list-item-content-title').text 46 | xzl_url = xzl.find('a', class_='for-track').attrs['href'] 47 | print("【INFO】:正在获取{}第{}页,当前楼盘为{},URL为{}".format(city,pageIndex,xzl_name,xzl_url)) 48 | xzl_address = xzl.find('p', class_='list-item-content-address').text.replace(' ','').replace('"','').replace('·','|').replace('\n','') 49 | xzl_price_encrypted = xzl.find('div', class_='list-item-content-price').find('p').text 50 | if xzl_price_encrypted != "暂无参考价格": 51 | xzl_price = keymap_replace(xzl_price_encrypted,keys) 52 | print("【INFO】:正在获取{}第{}页,当前楼盘为{},URL为{}".format(city,pageIndex,xzl_name,xzl_url)) 53 | else: 54 | xzl_price = xzl_price_encrypted 55 | print("【WARN】:正在获取{}第{}页,当前楼盘为{},URL为{},该楼盘无价格信息".format(city,pageIndex,xzl_name,xzl_url)) 56 | xzl_html = 
get_request(xzl_url, anonymous=True) 57 | print("【INFO】:正在进入楼盘详情页,获取{}第{}页楼盘为{},URL为{},".format(city,pageIndex,xzl_name,xzl_url)) 58 | xzl_lbs = re.search(r'var map = (.*?);', xzl_html, flags=re.M|re.S) 59 | if xzl_lbs: 60 | xzl_lbs = xzl_lbs.group(1).replace('\n','').replace(' ','') 61 | else: 62 | xzl_lbs = '' 63 | res. append([city,abbr,url,xzl_name,xzl_address,xzl_price,xzl_url,xzl_lbs]) 64 | print("【DONE】:当前楼盘获取完毕,已完成{}第{}页楼盘为{},URL为{},".format(city,pageIndex,xzl_name,xzl_url)) 65 | #更新页数 66 | pageIndex += 1 67 | print("【DONE】:获取{}完毕,共{}页,进度为{}/{}".format(city,pageIndex,cityIndex,cities_len)) 68 | return res 69 | 70 | 71 | if __name__ == 'main': 72 | # 读取可直接通过导航栏中 "办公楼盘" 的城市数据 73 | effective_cities_df = pd.read_csv('%s/../data/02_effective_cities.csv'%sys.path[0]) 74 | # 转字典,便于操作 75 | effective_cities_dct = effective_cities_df.to_dict(orient='index') 76 | # 执行遍历爬虫 77 | res = getInfo(effective_cities_dct) 78 | # 列表转DataFrame 79 | df_res = pd.DataFrame(res,columns=['city','abbr','city_url','xzl_name','xzl_addr','xzl_price','xzl_url','xzl_lbs']) 80 | # 按城市,名字,地址去重 81 | df_res = df_res.drop_duplicates(['city','xzl_name','xzl_addr']) 82 | # 经纬度清洗 83 | df_res = lbsTransform(df_res, lbs = 'lng') 84 | df_res = lbsTransform(df_res, lbs = 'lat') 85 | # 输出保存 86 | df_res.to_csv('%s/../data/03_effective_cities_xzl.csv'%sys.path[0], encoding='utf-8', index=False) 87 | 88 | -------------------------------------------------------------------------------- /03安居客全国写字楼信息/data/01_cities.csv: -------------------------------------------------------------------------------- 1 | city,abbr 2 | 鞍山,anshan 3 | 安阳,anyang 4 | 安庆,anqing 5 | 安康,ankang 6 | 安顺,anshun 7 | 阿坝,aba 8 | 阿克苏,akesu 9 | 阿里,ali 10 | 阿拉尔,alaer 11 | 阿拉善盟,alashanmeng 12 | 澳门,aomen 13 | 安丘,anqiu 14 | 安宁,anning 15 | 安吉县,anjixian 16 | 安溪,anxi 17 | 林州,aylinzhou 18 | 安岳,anyuexian 19 | 阿勒泰,aletai 20 | 北京,beijing 21 | 保定,baoding 22 | 包头,baotou 23 | 滨州,binzhou 24 | 宝鸡,baoji 25 | 蚌埠,bengbu 26 | 本溪,benxi 27 | 北海,beihai 28 | 
巴音郭楞,bayinguoleng 29 | 巴中,bazhong 30 | 巴彦淖尔市,bayannaoer 31 | 亳州,bozhou 32 | 白银,baiyin 33 | 白城,baicheng 34 | 百色,baise 35 | 白山,baishan 36 | 博尔塔拉,boertala 37 | 毕节,bijie 38 | 保山,baoshan 39 | 霸州,bazh 40 | 北票,beipiao 41 | 北流,beiliu 42 | 博白,bobaixian 43 | 博罗,boluoxian 44 | 宝应县,baoyingxian 45 | 博兴,boxingxian 46 | 成都,chengdu 47 | 重庆,chongqing 48 | 长沙,cs 49 | 常州,cz 50 | 长春,cc 51 | 沧州,cangzhou 52 | 昌吉,changji 53 | 赤峰,chifeng 54 | 常德,changde 55 | 郴州,chenzhou 56 | 承德,chengde 57 | 长治,changzhi 58 | 池州,chizhou 59 | 滁州,chuzhou 60 | 朝阳,chaoyang 61 | 潮州,chaozhou 62 | 楚雄,chuxiong 63 | 巢湖,chaohu 64 | 昌都,changdu 65 | 长葛,changge 66 | 崇左,chongzuo 67 | 常熟,changshushi 68 | 赤壁,chibi 69 | 岑溪,cengxi 70 | 慈溪,cixi 71 | 崇州,chongzhou 72 | 慈利,cilixian 73 | 长岭,changlingxian 74 | 长兴,changxingxian 75 | 苍南县,cangnanxian 76 | 曹县,caoxian 77 | 长垣县,changyuanxian 78 | 昌乐,changle 79 | 沧县,cangxian 80 | 长宁,changning 81 | 磁县,cixian 82 | 茌平,chiping 83 | 大连,dalian 84 | 东莞,dg 85 | 德阳,deyang 86 | 大理,dali 87 | 德州,dezhou 88 | 东营,dongying 89 | 大庆,daqing 90 | 丹东,dandong 91 | 大同,datong 92 | 达州,dazhou 93 | 大丰,dafeng 94 | 德宏,dehong 95 | 定州,dingzhou 96 | 迪庆,diqing 97 | 定西,dingxi 98 | 大兴安岭,dxanling 99 | 东台,dongtai 100 | 邓州,dengzhou 101 | 东方,dongfang 102 | 儋州,danzhou 103 | 丹阳,danyang 104 | 灯塔,dengta 105 | 敦煌,dunhuang 106 | 大冶,daye 107 | 都匀,duyun 108 | 东阳,dongyang 109 | 都江堰,dujiangyan 110 | 东至,dongzhixian 111 | 德清,deqingxian 112 | 东海,donghaixian 113 | 单县,danxian 114 | 凤城,ddfengcheng 115 | 禹城,dzyucheng 116 | 大竹,dazhu 117 | 定边,dingbianxian 118 | 东明,dongmingxian 119 | 东平,dongpingxian 120 | 大悟,dawuxian 121 | 鄂尔多斯,eerduosi 122 | 恩施,enshi 123 | 鄂州,ezhou 124 | 恩平,enping 125 | 峨眉山,emeishan 126 | 佛山,foshan 127 | 福州,fz 128 | 阜阳,fuyang 129 | 抚顺,fushun 130 | 阜新,fuxin 131 | 抚州,fuzhoushi 132 | 防城港,fangchenggang 133 | 肥城市,feichengshi 134 | 丰城,fengchengshi 135 | 福清,fuqing 136 | 福安,fuan 137 | 福鼎,fuding 138 | 范县,fanxian 139 | 分宜,fenyixian 140 | 扶余,fuyushi 141 | 阜宁,funing 142 | 浮梁,fuliang 143 | 府谷,fuguxian 144 | 广州,guangzhou 145 | 贵阳,gy 146 | 
桂林,guilin 147 | 赣州,ganzhou 148 | 广安,guangan 149 | 贵港,guigang 150 | 广元,guangyuan 151 | 甘孜,ganzi 152 | 甘南,gannan 153 | 馆陶,guantao 154 | 果洛,guoluo 155 | 固原,guyuan 156 | 公主岭市,gongzhulingshi 157 | 高邮,gaoyou 158 | 高密市,gaomishi 159 | 格尔木,geermu 160 | 广汉,guanghan 161 | 桂平,guiping 162 | 高安,gaoanshi 163 | 高碑店,gaobeidian 164 | 固始,gushixian 165 | 桂阳,guiyangxian 166 | 高平,gaopingshi 167 | 广饶县,guangraoxian 168 | 灌云县,guanyunxian 169 | 灌南县,guannanxian 170 | 固安,guan 171 | 谷城,gucheng 172 | 高唐,gaotangxian 173 | 冠县,guanxian 174 | 改则,gaizexian 175 | 杭州,hangzhou 176 | 合肥,hf 177 | 哈尔滨,heb 178 | 海口,haikou 179 | 惠州,huizhou 180 | 邯郸,handan 181 | 呼和浩特,huhehaote 182 | 黄冈,huanggang 183 | 淮南,huainan 184 | 黄山,huangshan 185 | 鹤壁,hebi 186 | 衡阳,hengyang 187 | 湖州,huzhou 188 | 衡水,hengshui 189 | 汉中,hanzhong 190 | 淮安,huaian 191 | 黄石,huangshi 192 | 菏泽,heze 193 | 怀化,huaihua 194 | 淮北,huaibei 195 | 葫芦岛,huludao 196 | 河源,heyuan 197 | 红河,honghe 198 | 哈密,hami 199 | 鹤岗,hegang 200 | 呼伦贝尔,hulunbeier 201 | 海北,haibei 202 | 海东,haidong 203 | 海南,hainan 204 | 河池,hechi 205 | 黑河,heihe 206 | 和县,hexian 207 | 贺州,hezhou 208 | 海拉尔,hailaer 209 | 霍邱,huoqiu 210 | 和田,hetian 211 | 黄南,huangnan 212 | 海西,hexi 213 | 鹤山,heshan 214 | 海城,haicheng 215 | 黄骅,huanghua 216 | 河间,hejian 217 | 韩城,hancheng 218 | 汉川市,hanchuanshi 219 | 海门,haimen 220 | 海宁,haining 221 | 海阳,haiyang 222 | 淮滨,huaibinxian 223 | 海安,haianxian 224 | 惠东,huidongxian 225 | 海丰县,haifengxian 226 | 桓台县,huantaixian 227 | 常宁,hychangning 228 | 海盐,haiyan 229 | 永城,hnyongcheng 230 | 滑县,huaxian 231 | 衡东,hengdongxian 232 | 华容,huarongxian 233 | 济南,jinan 234 | 嘉兴,jx 235 | 吉林,jilin 236 | 江门,jiangmen 237 | 荆门,jingmen 238 | 锦州,jinzhou 239 | 景德镇,jingdezhen 240 | 吉安,jian 241 | 济宁,jining 242 | 金华,jinhua 243 | 揭阳,jieyang 244 | 晋中,jinzhong 245 | 九江,jiujiang 246 | 焦作,jiaozuo 247 | 晋城,jincheng 248 | 荆州,jingzhou 249 | 佳木斯,jiamusi 250 | 酒泉,jiuquan 251 | 鸡西,jixi 252 | 济源,jiyuan 253 | 金昌,jinchang 254 | 嘉峪关,jiayuguan 255 | 江阴,jiangyin 256 | 靖江,jingjiang 257 | 简阳市,jianyangshi 258 | 金坛,jintan 259 | 吉首,jishou 
260 | 景洪,jinghong 261 | 晋江,jinjiangshi 262 | 建瓯,jianou 263 | 胶州,jiaozhoux 264 | 句容,jurong 265 | 江油市,jiangyoushi 266 | 嘉鱼,jiayuxian 267 | 建湖,jianhuxian 268 | 嘉善,jiashanxian 269 | 莒县,juxian 270 | 昌邑,jlchangyi 271 | 桦甸,jlhuadian 272 | 京山,jmjingshan 273 | 进贤,jinxian 274 | 金湖,jinhu 275 | 钟祥,jmzhongxiang 276 | 孟州,jzmengzhou 277 | 靖边,jingbianxian 278 | 巨野,juyexian 279 | 鄄城,juanchengxian 280 | 姜堰,jiangyan 281 | 昆明,km 282 | 昆山,ks 283 | 开封,kaifeng 284 | 喀什,kashi 285 | 克拉玛依,kelamayi 286 | 垦利,kenli 287 | 克孜勒苏,lezilesu 288 | 库尔勒,kuerle 289 | 凯里,kaili 290 | 开平,kaiping 291 | 兰考,kflankao 292 | 兰州,lanzhou 293 | 廊坊,langfang 294 | 洛阳,luoyang 295 | 柳州,liuzhou 296 | 莱芜,laiwu 297 | 六安,luan 298 | 泸州,luzhou 299 | 丽江,lijiang 300 | 临沂,linyi 301 | 聊城,liaocheng 302 | 连云港,lianyungang 303 | 丽水,lishui 304 | 娄底,loudi 305 | 乐山,leshan 306 | 辽阳,liaoyang 307 | 拉萨,lasa 308 | 临汾,linfen 309 | 龙岩,longyan 310 | 漯河,luohe 311 | 凉山,liangshan 312 | 六盘水,liupanshui 313 | 辽源,liaoyuan 314 | 来宾,laibin 315 | 临沧,lingcang 316 | 临夏,linxia 317 | 临猗,linyishi 318 | 林芝,linzhi 319 | 陇南,longnan 320 | 吕梁,lvliang 321 | 临海市,linhaishi 322 | 龙海市,longhaishi 323 | 醴陵市,lilingshi 324 | 临清,linqing 325 | 龙口,longkou 326 | 莱阳,laiyang 327 | 耒阳,leiyang 328 | 溧阳,liyang 329 | 凌源,lingyuan 330 | 灵宝市,lingbaoshi 331 | 冷水江,lengshuijiang 332 | 涟源,lianyuan 333 | 陆丰,lufengshi 334 | 罗定,luoding 335 | 乐平市,lepingshi 336 | 莱州市,laizhoushi 337 | 莱西,laixi 338 | 梨树,lishuxian 339 | 利津,lijingxian 340 | 柳林,liulinxian 341 | 滦南,luannan 342 | 临朐,linju 343 | 宜阳,lyyiyang 344 | 乐陵,leling 345 | 澧县,lixian 346 | 梁山,liangshanxian 347 | 临邑,linyixian 348 | 鹿邑,luyixian 349 | 绵阳,mianyang 350 | 茂名,maoming 351 | 马鞍山,maanshan 352 | 牡丹江,mudanjiang 353 | 眉山,meishan 354 | 梅州,meizhou 355 | 明港,minggang 356 | 梅河口,meihekou 357 | 弥勒,mileshi 358 | 渑池,mianchixian 359 | 孟津,mengjin 360 | 南京,nanjing 361 | 宁波,nb 362 | 南昌,nc 363 | 南宁,nanning 364 | 南通,nantong 365 | 南充,nanchong 366 | 南阳,nanyang 367 | 宁德,ningde 368 | 内江,neijiang 369 | 南平,nanping 370 | 那曲,naqu 371 | 怒江,nujiang 372 | 南安,nananshi 
373 | 宁国,ningguo 374 | 南城,nanchengxian 375 | 南县,nanxian 376 | 南漳,nanzhangxian 377 | 宁津,ningjinxian 378 | 宁阳,ningyangxian 379 | 攀枝花,panzhihua 380 | 平顶山,pingdingsha 381 | 盘锦,panjin 382 | 萍乡,pingxiang 383 | 濮阳,puyang 384 | 莆田,putian 385 | 普洱,puer 386 | 平凉,pingliang 387 | 普宁,puning 388 | 邳州,pizhou 389 | 蓬莱市,penglaishi 390 | 平湖,pinghu 391 | 平度,pingdu 392 | 彭州,pengzhou 393 | 舞钢,pdswugang 394 | 平阳,pingyang 395 | 平邑,pingyi 396 | 磐石,panshishi 397 | 青岛,qd 398 | 秦皇岛,qinhuangdao 399 | 泉州,quanzhou 400 | 曲靖,qujing 401 | 齐齐哈尔,qiqihaer 402 | 衢州,quzhou 403 | 清远,qingyuan 404 | 钦州,qinzhou 405 | 庆阳,qingyang 406 | 黔东南,qiandongnan 407 | 潜江,qianjiang 408 | 清徐,qingxu 409 | 黔南,qiannan 410 | 七台河,qitaihe 411 | 黔西南,qianxinan 412 | 迁安市,qiananshi 413 | 青州市,qingzhoushi 414 | 清镇,qingzhen 415 | 琼海,qionghai 416 | 沁阳,qinyangshi 417 | 曲阜,qufu 418 | 启东,qidong 419 | 淇县,qixian 420 | 祁阳,qiyangxian 421 | 渠县,quxian 422 | 杞县,qixianqu 423 | 迁西,qianxi 424 | 栖霞,qixia 425 | 江山,qzjiangshan 426 | 齐河,qihexian 427 | 祁东,qidongxian 428 | 日照,rizhao 429 | 日喀则,rikeze 430 | 瑞安,ruian 431 | 汝州市,ruzhoushi 432 | 任丘市,renqiushi 433 | 瑞金,ruijin 434 | 乳山市,rushanshi 435 | 仁怀,renhuai 436 | 瑞丽,ruili 437 | 如皋,rugao 438 | 荣成市,rongchengshi 439 | 如东,rudongxian 440 | 仁寿,renshouxian 441 | 日土,rituxian 442 | 上海,shanghai 443 | 深圳,shenzhen 444 | 苏州,suzhou 445 | 石家庄,sjz 446 | 沈阳,sy 447 | 三亚,sanya 448 | 绍兴,shaoxing 449 | 汕头,shantou 450 | 十堰,shiyan 451 | 三门峡,sanmenxia 452 | 三明,sanming 453 | 韶关,shaoguan 454 | 商丘,shangqiu 455 | 宿迁,suqian 456 | 绥化,suihua 457 | 邵阳,shaoyang 458 | 遂宁,suining 459 | 上饶,shangrao 460 | 四平,siping 461 | 石河子,shihezi 462 | 顺德,shunde 463 | 宿州,suzhoushi 464 | 松原,songyuan 465 | 沭阳,shuyang 466 | 石嘴山,shizuishan 467 | 随州,suizhou 468 | 朔州,shuozhou 469 | 汕尾,shanwei 470 | 三沙,sansha 471 | 商洛,shangluo 472 | 山南,shannan 473 | 神农架,shennongjia 474 | 双鸭山,shuangyashan 475 | 石狮,shishi 476 | 三河市,sanheshi 477 | 寿光,shouguang 478 | 嵊州,shengzhou 479 | 四会,sihui 480 | 邵武,shaowu 481 | 松滋,songzi 482 | 上杭,shagnhangxian 483 | 睢县,suixian 484 | 
沙洋,shayangxian 485 | 邵东,shaodongxian 486 | 射洪,shehongxian 487 | 双峰,shuangfengxian 488 | 随县,suixia 489 | 邵阳县,shaoyangxian 490 | 泗阳县,siyangxian 491 | 泗洪县,sihongxian 492 | 安达,shanda 493 | 永安,smyongan 494 | 肇东,shzhaodong 495 | 广水,szguangshui 496 | 孝义,sxxiaoyi 497 | 商水,shangshui 498 | 射阳,sheyangxian 499 | 涉县,shexian 500 | 沈丘,shenqiuxian 501 | 神木,shenmuxian 502 | 天津,tianjin 503 | 太原,ty 504 | 泰州,taizhou 505 | 唐山,tangshan 506 | 泰安,taian 507 | 台州,taiz 508 | 铁岭,tieling 509 | 通辽,tongliao 510 | 铜陵,tongling 511 | 天水,tianshui 512 | 通化,tonghua 513 | 台山,taishan 514 | 铜川,tongchuan 515 | 吐鲁番,tulufan 516 | 天门,tianmen 517 | 图木舒克,tumushuke 518 | 桐城,tongcheng 519 | 铜仁,tongren 520 | 台湾,taiwan 521 | 太仓,taicang 522 | 泰兴,taixing 523 | 滕州市,tengzhoushi 524 | 桐乡,tongxiang 525 | 天长,tianchang 526 | 通许,tongxuxian 527 | 开原,tlkaiyuan 528 | 太康,taikangxian 529 | 郯城,tanchengxian 530 | 塔城,tuscaloosa 531 | 武汉,wuhan 532 | 无锡,wuxi 533 | 威海,weihai 534 | 潍坊,weifang 535 | 乌鲁木齐,wulumuqi 536 | 温州,wenzhou 537 | 芜湖,wuhu 538 | 梧州,wuzhou 539 | 渭南,weinan 540 | 乌海,wuhai 541 | 文山,wenshan 542 | 武威,wuwei 543 | 乌兰察布,wulanchabu 544 | 瓦房店,wafangdian 545 | 五家渠,wujiaqu 546 | 武夷山,wuyishan 547 | 吴忠,wuzhong 548 | 五指山,wuzhishan 549 | 温岭,wnelingshi 550 | 武安市,wuanshi 551 | 文昌,wenchang 552 | 乌兰浩特,wulanhaote 553 | 武穴,wuxue 554 | 万宁,wanning 555 | 尉氏,weishixian 556 | 无为,wuweixian 557 | 温县,wenxian 558 | 无棣,wudi 559 | 微山,weishanxian 560 | 汶上,wenshangxian 561 | 武义,wuyi 562 | 西安,xa 563 | 厦门,xm 564 | 徐州,xuzhou 565 | 湘潭,xiangtan 566 | 襄阳,xiangyang 567 | 新乡,xinxiang 568 | 信阳,xinyang 569 | 咸阳,xianyang 570 | 邢台,xingtai 571 | 孝感,xiaogan 572 | 西宁,xining 573 | 许昌,xuchang 574 | 忻州,xinzhou 575 | 宣城,xuancheng 576 | 咸宁,xianning 577 | 兴安盟,xinganmeng 578 | 新余,xinyu 579 | 西双版纳,bannan 580 | 香港,xianggang 581 | 湘西,xiangxi 582 | 仙桃,xiantao 583 | 锡林郭勒盟,xilinguole 584 | 新泰市,xintaishi 585 | 湘乡,xiangxiang 586 | 兴化,xinghuashi 587 | 兴义,xingyi 588 | 宣威,xuanwei 589 | 项城市,xiangchengshi 590 | 兴城,xingcheng 591 | 新沂,xinyishi 592 | 荥阳,xingyang 593 | 新密,xinmi 594 | 
浚县,xunxian 595 | 襄垣,xiangyuanxian 596 | 孝昌,xiaochangxian 597 | 宣汉,xuanhanxian 598 | 象山,xiangshanxian 599 | 沛县,xzpeixian 600 | 老河口,xylaohekou 601 | 新安,xinan 602 | 香河,xianghe 603 | 宜城,xyyicheng 604 | 沙河,xtshahe 605 | 安陆,xganlu 606 | 湘阴,xiangyin 607 | 新昌,xinchang 608 | 盱眙,xuyuxian 609 | 莘县,xinxian 610 | 响水,xiangshuixian 611 | 新野,xinyexian 612 | 烟台,yt 613 | 扬州,yangzhou 614 | 宜昌,yichang 615 | 银川,yinchuan 616 | 阳江,yangjiang 617 | 永州,yongzhou 618 | 玉林,yulinshi 619 | 盐城,yancheng 620 | 岳阳,yueyang 621 | 运城,yuncheng 622 | 宜春,yichun 623 | 营口,yingkou 624 | 榆林,yulin 625 | 宜宾,yibin 626 | 益阳,yiyang 627 | 义乌,yiwu 628 | 玉溪,yuxi 629 | 伊犁,yili 630 | 阳泉,yangquan 631 | 延安,yanan 632 | 鹰潭,yingtan 633 | 延边,yanbian 634 | 云浮,yufu 635 | 雅安,yaan 636 | 阳春,yangchun 637 | 鄢陵,yanling 638 | 伊春,yichunshi 639 | 玉树,yushu 640 | 乐清,yueqing 641 | 禹州,yuzhou 642 | 永新,yongxin 643 | 永康市,yongkangshi 644 | 宜都,yidou 645 | 仪征,yizheng 646 | 延吉,yanji 647 | 扬中,yangzhong 648 | 伊宁,yining 649 | 英德,yingde 650 | 余姚,yuyao 651 | 偃师市,yanshishi 652 | 宜兴,yixing 653 | 永兴,yongxingxian 654 | 云梦,yunmengxian 655 | 玉环县,yuhuanxian 656 | 当阳,ycdangyang 657 | 攸县,youxian 658 | 玉田,yutian 659 | 永春,yongchun 660 | 伊川,yichuan 661 | 沅江,yyruanjiang 662 | 阳谷,yangguxian 663 | 沂南,yinanxian 664 | 沂源,yiyuanxian 665 | 郓城,yunchengxian 666 | 余江,yujiangc 667 | 燕郊,yanjiao 668 | 郑州,zhengzhou 669 | 珠海,zh 670 | 中山,zs 671 | 镇江,zhenjiang 672 | 淄博,zibo 673 | 张家口,zhangjiakou 674 | 株洲,zhuzhou 675 | 漳州,zhangzhou 676 | 湛江,zhanjiang 677 | 肇庆,zhaoqing 678 | 枣庄,zaozhuang 679 | 舟山,zhoushan 680 | 遵义,zunyi 681 | 驻马店,zhumadian 682 | 自贡,zigong 683 | 资阳,ziyang 684 | 周口,zhoukou 685 | 章丘,zhangqiu 686 | 张家界,zhangjiajie 687 | 诸城,zhucheng 688 | 庄河,zhuanghe 689 | 正定,zhengding 690 | 张北,zhangbei 691 | 张掖,zhangye 692 | 昭通,zhaotong 693 | 中卫,weizhong 694 | 赵县,zhaoxian 695 | 邹城市,zouchengshi 696 | 遵化,zunhua 697 | 张家港,zhangjiagang 698 | 枝江,zhijiang 699 | 招远市,zhaoyuanshi 700 | 资兴,zixing 701 | 樟树,zhangshu 702 | 诸暨,zhuji 703 | 涿州市,zhuozhoushi 704 | 枣阳市,zaoyangshi 705 | 泽州,zezhouxian 706 
| 邹平县,zoupingxian 707 | 肇州,zhaozhou 708 | 漳浦,zhangpu 709 | -------------------------------------------------------------------------------- /03安居客全国写字楼信息/data/02_effective_cities.csv: -------------------------------------------------------------------------------- 1 | index,abbr,url 2 | 北京,beijing,https://bj.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 3 | 成都,chengdu,https://cd.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 4 | 重庆,chongqing,https://cq.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 5 | 长沙,cs,https://cs.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 6 | 长春,cc,https://cc.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 7 | 大连,dalian,https://dl.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 8 | 东莞,dg,https://dg.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 9 | 佛山,foshan,https://fs.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 10 | 福州,fz,https://fz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 11 | 广州,guangzhou,https://gz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 12 | 杭州,hangzhou,https://hz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 13 | 合肥,hf,https://hf.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 14 | 济南,jinan,https://jn.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 15 | 昆明,km,https://km.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 16 | 南京,nanjing,https://nj.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 17 | 宁波,nb,https://nb.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 18 | 南昌,nc,https://nc.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 19 | 青岛,qd,https://qd.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 20 | 上海,shanghai,https://sh.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 21 | 深圳,shenzhen,https://sz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 22 | 
苏州,suzhou,https://su.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 23 | 石家庄,sjz,https://sjz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 24 | 沈阳,sy,https://shen.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 25 | 天津,tianjin,https://tj.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 26 | 太原,ty,https://ty.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 27 | 武汉,wuhan,https://wh.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 28 | 无锡,wuxi,https://wx.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 29 | 西安,xa,https://xa.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 30 | 厦门,xm,https://xm.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 31 | 郑州,zhengzhou,https://zz.sydc.anjuke.com/shangban/loupan/?from=spxzl_index_float_bglp 32 | -------------------------------------------------------------------------------- /03安居客全国写字楼信息/utils/__pycache__/digital_decryption.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/03安居客全国写字楼信息/utils/__pycache__/digital_decryption.cpython-37.pyc -------------------------------------------------------------------------------- /03安居客全国写字楼信息/utils/__pycache__/request.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Colin-zh/WebCrawler/05beb8bb35c5c3207b29e3d1e25befa78a4df28c/03安居客全国写字楼信息/utils/__pycache__/request.cpython-37.pyc -------------------------------------------------------------------------------- /03安居客全国写字楼信息/utils/digital_decryption.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- encoding:utf-8 -*-import requests 3 | import base64 4 | import re 5 | from io import BytesIO 6 | from fontTools.ttLib import TTFont 7 | from bs4 
import BeautifulSoup as bs 8 | 9 | def get_keymap(html): 10 | """ 11 | 对base64加密的页面内容进行解密 12 | """ 13 | #获取密码 14 | keys_map = re.search(r' 2 | -------------------------------------------------------------------------------- /工具箱/01安居客数字解密/sample/secret_price.txt: -------------------------------------------------------------------------------- 1 | 麣.龤龥元/㎡/天 -------------------------------------------------------------------------------- /工具箱/02腾讯坐标系转百度坐标系/README.md: -------------------------------------------------------------------------------- 1 | # 腾讯坐标系转百度坐标系 2 | 3 | ## 背景说明 4 | 在获取美团商圈信息时,在获取商圈名称及消费水平时,经纬度的精确性同时也必不可少。公司内部使用以百度坐标系为基准的机制,因此需要进行坐标系转化。 5 | 6 | ## 使用样例 7 | 1. 确保以下库均已安装: 8 | 9 | ```python 10 | import math 11 | ``` 12 | 13 | 14 | 2. 查看网页源代码后部分JavaScript中包含了位置的信息 15 | ![](images/美团经纬度.png) 16 | 17 | 获取到结果: 18 | 19 | ```python 20 | { ”name”:”静安大悦城”,”address”:”静安区西藏北路166号南座S1-08”,”lng”:121.471912,”lat”:31.243718} 21 | ``` 22 | 23 | 3. 而将上述对应经纬度放入百度拾取坐标系系统,发现存在lbs飘飞: 24 | ![](images/经纬度差异.png) 25 | 26 | 4. 将经纬度进行转换 27 | 28 | ```python 29 | def tx2bd(lng,lat): 30 | import math 31 | x_pi = (3.14159265358979324 * 3000.0) / 180.0 32 | x,y = float(lng),float(lat) 33 | z = math.sqrt( x * x + y * y) + 0.0002 * math.sin(y * x_pi) 34 | theta = math.atan2(y, x) + 0.000003 * math.cos(x * x_pi) 35 | lng = z * math.cos(theta) + 0.0065 36 | lat = z * math.sin(theta) + 0.006 37 | return (lng, lat) 38 | ``` 39 | 40 | 获取百度坐标系对应位置为: 41 | 42 | ```python 43 | { ”lng”:121.47856720971033,”lat”:331.249719090314727} 44 | ``` 45 | 46 | 5. 
def tx2bd(lng,lat):
    """Convert a Tencent-system coordinate pair to the Baidu (BD-09) system.

    Args:
        lng: Longitude in the Tencent coordinate system (number or numeric string).
        lat: Latitude in the Tencent coordinate system (number or numeric string).

    Returns:
        A ``(lng, lat)`` tuple of floats expressed in the Baidu coordinate system.
    """
    import math
    # Scaled pi constant used by the standard GCJ-02 -> BD-09 offset formula.
    scaled_pi = (3.14159265358979324 * 3000.0) / 180.0
    gcj_lng = float(lng)
    gcj_lat = float(lat)
    # Treat the coordinate as a 2-D vector: perturb its length and its angle
    # with small periodic terms, per the published conversion formula.
    radius = math.sqrt(gcj_lng * gcj_lng + gcj_lat * gcj_lat) + 0.0002 * math.sin(gcj_lat * scaled_pi)
    angle = math.atan2(gcj_lat, gcj_lng) + 0.000003 * math.cos(gcj_lng * scaled_pi)
    # Back to cartesian form, then apply Baidu's fixed offsets.
    bd_lng = radius * math.cos(angle) + 0.0065
    bd_lat = radius * math.sin(angle) + 0.006
    return (bd_lng, bd_lat)