├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── media ├── 14f211bcdea0afc6a5866eb957e5c6f0.png ├── 17fb68a2b314cf24f63506bc1e4effa2.png ├── 1aac0c8a9e00655937fa39f3b13a811f.png ├── 1be9bdafe857653afb5c805ea37ea2d8.png ├── 1bffd1df3eaecdbad8c4d8086f40caef.png ├── 20333ad34432635a9264541ff6f78685.png ├── 25082d4d0e64ad0d3289b5f91c1b042b.png ├── 2d3af90024651152cd449e987d16dce0.png ├── 30e8121447d2b47f39f57a20f2a094d2.png ├── 31f9bd59a9948b99ce4a68cb33c62637.png ├── 3600212e5f46384a8a650c990c9b69cd.png ├── 3a4625cbb8d333691acd752dbac9f869.png ├── 3f4c9c379cd1198710d449f65a29b969.png ├── 41217f9c0ec10a9e218b497c746c8929.png ├── 432cf5efa4275deae4be1d4fd8239b3e.png ├── 48638b87c85b5b47880f3cda9bcde726.png ├── 48743e4e4d35f8e96e5b7ed47305c177.png ├── 4a89d0cdd8af0769bf85ee809c13fdd5.png ├── 51e0bf81079717f142fb1a5941639fc3.png ├── 5692637864b7007bd3c02642cf516ffe.png ├── 596bdc36ae6ec999e35b77ad430e9f46.png ├── 5aa0e402894a90180243e461e6a39f95.png ├── 72cf1c473e17b6b3593e158b1f5139f6.png ├── 83ebc8892cc133b6c2967b1680a949cb.png ├── 87ae2de4ca158afeaefc0d1533b29d5d.png ├── 9045be1b03797152030b08f6e28d7c85.png ├── 952ba4c527bdaf812956ccfc79b61659.png ├── 99f40069991fafefac2d54413eed2873.png ├── a32637beec319f1d6d95c6684cc2e4f4.png ├── a7672004ad36ba789c1af504b72779a7.png ├── b4aae91db851ff529bcee3509315760f.png ├── ba94d3c7763c48c1841d2461938fc64c.png ├── c7bba517b1bf88ab5b4164008c714adb.png ├── d008eede0b04bcf5ca7f5300bf6bdc18.png ├── d650c9f0c35cf2a96466e9cd85ff6335.png ├── e8fbbc9ebf9c99dc3228849a6c454f4e.png ├── f440aaab0d41900fdf1532b8524395da.png └── f4b058489a6ded3af88d05a514c320e9.png ├── 原始数据及清洗后的数据 ├── 原始数据 │ ├── ershoufang-origin-ansi.csv │ └── ershoufang-origin-utf8.csv └── 清洗后的数据 │ ├── ershoufang-clean-ansi-v1.1.csv │ └── ershoufang-clean-utf8-v1.1.csv ├── 数据分析程序 └── data_analysis │ ├── .spyproject │ ├── codestyle.ini │ ├── encoding.ini │ ├── vcs.ini │ └── workspace.ini │ ├── data_ana │ ├── business_attr.py │ ├── ciyun.py │ ├── gaode_map.py │ 
├── house_attr.py │ ├── map │ │ ├── star.txt │ │ ├── total.js │ │ ├── unitprice.js │ │ ├── unitprice1.js │ │ ├── xiaoyu201.js │ │ ├── 南京二手房单价热力图.html │ │ ├── 南京二手房总价小于200万的分布图.html │ │ ├── 南京二手房总价热力图.html │ │ ├── 原始代码.html │ │ └── 闪烁点.html │ ├── picture │ │ ├── 10南京二手房房屋用途水平柱状图.png │ │ ├── 11南京各区域二手房总价箱形图.png │ │ ├── 11南京各区域二手房总价箱形图2.png │ │ ├── 12南京各区域二手房单价箱形图.png │ │ ├── 13南京二手房建筑面积分布区间.png │ │ ├── 14南京二手房总价与建筑面积散点图.png │ │ ├── 15南京二手房单价与建筑面积散点图.png │ │ ├── 1南京各区域二手房平均单价.png │ │ ├── 2南京各区域二手房平均建筑面积.png │ │ ├── 3南京各区域平均建筑面积和单价.png │ │ ├── 4南京二手房各区域房源数量.png │ │ ├── 4南京二手房各区域房源数量2.png │ │ ├── 5南京二手房单价最高Top10.png │ │ ├── 5南京二手房单价最高Top20.png │ │ ├── 6南京二手房房屋户型占比情况.png │ │ ├── 7南京二手房房屋装修占比情况.png │ │ ├── 8南京二手房建筑类型占比情况.png │ │ ├── 9南京二手房房屋朝向分布情况.png │ │ ├── 南京二手房数据词云.png │ │ └── 南京二手房数据词云2.png │ ├── price_and_area.py │ └── test_ana.py │ ├── data_clean │ └── test_clean.py │ ├── data_cluster │ ├── __pycache__ │ │ └── kmeans.cpython-36.pyc │ ├── cluster_analysis.py │ ├── kmeans.py │ ├── result │ │ ├── map │ │ │ ├── cluster0.js │ │ │ ├── cluster1.js │ │ │ ├── cluster2.js │ │ │ ├── cluster3.js │ │ │ ├── cluster4.js │ │ │ ├── k-means聚类结果分组0 - dark.html │ │ │ ├── k-means聚类结果分组0.html │ │ │ ├── k-means聚类结果分组1 - dark.html │ │ │ ├── k-means聚类结果分组1.html │ │ │ ├── k-means聚类结果分组2 - dark.html │ │ │ ├── k-means聚类结果分组2.html │ │ │ ├── k-means聚类结果分组3-dark.html │ │ │ ├── k-means聚类结果分组3.html │ │ │ ├── k-means聚类结果分组4 - dark.html │ │ │ └── k-means聚类结果分组4.html │ │ ├── 不同k值下的平方误差和.png │ │ ├── 不同k值下的总和方差折线图1.png │ │ ├── 不同k值下的总和方差折线图2.png │ │ ├── 聚类结果-单价与建筑面积散点图.png │ │ ├── 聚类结果-单价与建筑面积散点图2.png │ │ ├── 聚类结果-总价价与建筑面积散点图.png │ │ └── 聚类结果-总价价与建筑面积散点图2.png │ └── run.py │ ├── data_file │ ├── baidu_map_total.js │ ├── baidu_map_unitprice.txt │ ├── ershoufang-clean-ansi-v1.1.csv │ ├── ershoufang-clean-utf8-v1.0.txt │ ├── ershoufang-clean-utf8-v1.1.csv │ ├── ershoufang-mini-ansi.csv │ ├── ershoufang-mini-utf8.csv │ ├── ershoufang-origin-ansi.csv │ ├── ershoufang-origin-utf8.csv │ ├── latlng - 副本.csv │ 
├── latlng.csv │ ├── star.txt │ ├── testv1.csv │ ├── testv2.csv │ └── xiaoyu201.js │ ├── pylot.py │ └── resources │ ├── HYQiHei-25J.ttf │ ├── house1.jpg │ ├── house2.jpg │ └── simhei.ttf ├── 数据爬虫程序 └── lianjia │ ├── .spyproject │ ├── codestyle.ini │ ├── encoding.ini │ ├── vcs.ini │ └── workspace.ini │ ├── __pycache__ │ ├── html_downloader.cpython-36.pyc │ ├── html_outputer.cpython-36.pyc │ ├── html_parser.cpython-36.pyc │ ├── log.cpython-36.pyc │ └── url_manager.cpython-36.pyc │ ├── html_downloader.py │ ├── html_outputer.py │ ├── html_parser.py │ ├── log.py │ ├── logs │ ├── 2018-03-22 log.txt │ ├── 2018-04-01 log.txt │ ├── 2018-04-02 log.txt │ └── header.txt │ ├── output │ ├── ershoufang - 20000.csv │ ├── ershoufang - 副本.csv │ ├── ershoufang-10000.csv │ └── ershoufang.csv │ ├── spider_main.py │ ├── test.py │ └── url_manager.py └── 结果分享PPT.pptx /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.html linguist-language=python 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于Python的南京二手房数据采集及可视化分析 2 | ## 1 内容简介 3 | 首先通过爬虫采集链家网上所有南京二手房的房源数据,并对采集到的数据进行清洗;然后,对清洗后的数据进行可视化分析,探索隐藏在大量数据背后的规律;最后,采用一个聚类算法对所有二手房数据进行聚类分析,并根据聚类分析的结果,将这些房源大致分类,以实现对所有数据的概括总结。通过上述分析,我们可以了解到目前市面上二手房各项基本特征及房源分布情况,帮助我们进行购房决策。 4 | 5 | ## 2 应用技术介绍 6 | 1)Python网络爬虫技术 7 | - Requests 8 | - BeautifulSoup 9 | 10 | 2)Python数据分析技术 11 | - Numpy 12 | - Matplotlib 13 | - Pandas 14 | 15 | 3)k-means聚类算法 16 | 17 | 4)高德地图开发者应用JS API 18 | 19 | 20 | 21 | ## 3 数据采集及数据清洗 22 | 23 | ### 3.1 数据采集 24 | 25 | 该部分通过网络爬虫程序抓取链家网上所有南京二手房的数据,收集原始数据,作为整个数据分析的基石。 26 | 27 | 3.1.1 链家网网站结构分析 28 | 29 | 链家网二手房主页界面如图1、图2,主页上面红色方框位置显示目前南京二手房在售房源的各区域位置名称,中间红色方框位置显示了房源的总数量,下面红色方框显示了二手房房源信息缩略图,该红色方框区域包含了二手房房源页面的URL地址标签。图2下面红色方框显示了二手房主页上房源的页数。 30 | 31 | 链家网二手房主页截图上半部分: 32 | 33 | ![](media/5aa0e402894a90180243e461e6a39f95.png) 34 | 35 | 图1 链家网二手房主页 36 | 37 | 链家网二手房主页截图下半部分: 38 | 39 | ![](media/e8fbbc9ebf9c99dc3228849a6c454f4e.png) 40 | 41 | 图2 链家网二手房主页 42 | 43 | 
二手房房源信息页面如图3、图4。我们需要采集的目标数据就在该页面,包括基本信息、房屋属性和交易属性三大类。各类信息包括的数据项如下: 44 | 45 | 1)基本信息:小区名称、所在区域、总价、单价。 46 | 47 | 2)房屋属性:房屋户型、所在楼层、建筑面积、户型结构、套内面积、建筑类型、房屋朝向、建筑结构、装修情况、梯户比例、配备电梯、产权年限。 48 | 49 | 3)交易属性:挂牌时间、交易权属、上次交易、房屋用途、房屋年限、产权所属、抵押信息、房本备件。 50 | 51 | ![](media/31f9bd59a9948b99ce4a68cb33c62637.png) 52 | 53 | 图3 二手房房源信息页面 54 | 55 | ![](media/83ebc8892cc133b6c2967b1680a949cb.png) 56 | 57 | 图4 二手房房源信息页面 58 | 59 | 3.1.3 网络爬虫程序关键问题说明 60 | 61 | 1)问题1:链家网二手房主页最多只显示100页的房源数据,所以在收集二手房房源信息页面URL地址时会收集不全,导致最后只能采集到部分数据。 62 | 63 | 解决措施:将所有南京二手房数据分区域地进行爬取,100页最多能够显示3000套房,该区域房源少于3000套时可以直接爬取,如果该区域房源超过3000套可以再分成更小的区域。 64 | 65 | 2)问题2:爬虫程序如果运行过快,会在采集到两、三千条数据时触发链家网的反爬虫机制,所有的请求会被重定向到链家的人机鉴定页面,从而会导致后面的爬取失败。 66 | 67 | 解决措施:①为程序中每次http请求构造header并且每次变换http请求header信息头中USER_AGENTS数据项的值,让请求信息看起来像是从不同浏览器发出的访问请求。②爬虫程序每处理完一次http请求和响应后,随机睡眠1-3秒,每请求2500次后,程序睡眠20分钟,控制程序的请求速度。 68 | 69 | ### 3.2 数据清洗 70 | 71 | 对于爬虫程序采集得到的数据并不能直接分析,需要先去掉一些“脏”数据,修正一些错误数据,统一所有数据字段的格式,将这些零散的数据规整成统一的结构化数据。 72 | 73 | 3.2.1 原始数据主要需要清洗的部分 74 | 75 | 主要需要清洗的数据部分如下: 76 | 77 | 1)将杂乱的记录的数据项对齐 78 | 79 | 2)清洗一些数据项格式 80 | 81 | 3)缺失值处理 82 | 83 | 3.2.3 数据清洗结果 84 | 85 | 数据清洗前原始数据如图8,清洗后的数据如图9,可以看出清洗后数据已经规整了许多。 86 | 87 | ![](media/48743e4e4d35f8e96e5b7ed47305c177.png) 88 | 89 | 图8 清洗前原始数据截图 90 | 91 | ![](media/4a89d0cdd8af0769bf85ee809c13fdd5.png) 92 | 93 | 图9 清洗后的数据截图 94 | 95 | ## 4 数据可视化分析 96 | 97 | 在数据清洗完成后,我们就可以开始对数据进行可视化分析。该阶段主要是对数据做一个探索性分析并将结果可视化呈现,帮助人们更好、更直观的认识数据,把隐藏在大量数据背后的信息集中和提炼出来。本文主要对二手房房源的总价、单价、面积、户型、地区等属性进行了分析。 98 | 99 | 数据可视化分析主要步骤如下:1)数据加载;2)数据转换;3)数据可视化呈现。 100 | 101 | ### 4.1 数据加载 102 | 103 | 数据分析和建模的大量工作都是用在数据准备上的,如:清理、加载、转换等。清洗完成后的数据仍然存储在文本文件(CSV格式)中,要对数据进行可视化分析,必须先要将数据按一定结果加载到内存中。我们使用Pandas提供的DataFrame对象来加载和处理我们清洗后的数据,Pandas同时提供将表格型数据读取为DataFrame对象的函数。数据加载处理过程中需要注意的主要问题如下: 104 | 105 | 1)数据项的行列索引的处理; 106 | 107 | 2)数据类型推断和数据转换; 108 | 109 | 3)缺失值的处理。 110 | 111 | 4.2 数据整体质量分析 112 | 113 | 4.2.1 数据基本情况 114 | 115 | 
数据加载后,数据基本情况如图10。从图中可以看到加载后的数据一共20527行、25列,占用内存3.9+MB。在数据类型上,一共有3列float64类型,2列int64类型,20列object类型。除了户型结构、套内面积、抵押信息三列数据项缺失值比较多之外,其他列数据项的缺失值都不多,所以数据整体的质量还不错。 116 | 117 | ![](media/3a4625cbb8d333691acd752dbac9f869.png) 118 | 119 | 图10 数据基本情况图 120 | 121 | 4.2.2 整体数据文件词云 122 | 123 | 从整体数据文件词云(见图11),我们可以得到在南京二手房房源信息中经常出现的高频词,如商品房、普通住宅、一梯两户、钢混结构、精装等。我们可以通过这些高频词,十分粗略地了解整个数据文件中的基本内容。 124 | 125 | ![](media/c7bba517b1bf88ab5b4164008c714adb.png) 126 | 127 | 图11 整体数据文件词云 128 | 129 | 4.2.3 南京各区域二手房房源数量折线图 130 | 131 | 南京各区域二手房房源数量折线图(见图13)横轴为南京各个行政区域名称,纵轴为房源数量(套)。从图中可以看出,江宁在售的房源数量最多,高达5000多套,占了总量的1/4。与之相反的是六合区,六合区在售的房源数量仅有1套,数量太少,其他各区的数量相差不多。所以我们后面关于六合区的分析会存在一定误差。 132 | 133 | ![](media/a7672004ad36ba789c1af504b72779a7.png) 134 | 135 | 图13 南京各区域二手房房源数量折线图 136 | 137 | 4.2.4 南京二手房房屋用途水平柱状图 138 | 139 | 南京二手房房屋用途水平柱状图(见图14)横轴为房源数量(套),纵轴为房屋用途类型。从图中我们可以看出,房屋用途类型有:普通住宅、别墅、商业办公、酒店式公寓、车库5种类型。其中我们主要关心的普通住宅类型的房源数量近20000套,占总量绝大部分。所以在本文中,我们没有剔除掉房屋用途为其他类型的记录,因为这些类型在所有房源样本中占比相当少,不会影响后面的分析结果,同时它们也属于二手房的范畴内。 140 | 141 | ![](media/87ae2de4ca158afeaefc0d1533b29d5d.png) 142 | 143 | 图14 南京二手房房屋用途水平柱状图 144 | 145 | 4.2.5 数据整体质量总结 146 | 147 | 通过前面的分析,我们可以看出该数据文件的整体质量还不错。虽然存在一些缺失值比较多的数据项,但我们比较关注的数据项缺失值不多。这些缺失值较多的都是一些次要的数据项,不影响我们的分析。在房屋用途类型上,数据文件中一共包括了5种类型的二手房房源信息,其中普通住宅类型占比98%以上,所以我们后面分析基本可以看成是针对普通住宅类型的二手房进行的分析,这也符合我们的期望。整个数据文件中唯一不足的是六合区域的二手房房源样本点太少,这使我们对六合区域的分析会存在一定的误差。 148 | 149 | ### 4.3 南京二手房基本信息可视化分析 150 | 151 | 二手房基本信息可视化分析主要是针对二手房的区域、总价、单价、建筑面积四个属性的分析。 152 | 153 | 4.3.1 南京各区域二手房平均单价柱状图 154 | 155 | 南京各区域二手房平均单价柱状图(见图15)横轴为南京各区域名称,纵轴为单价(元/平米)。从图中我们可以看到建邺区和鼓楼区二手房平均单价最高,近40000元/平米。建邺区是市中心城区,近几年发展势头很好,房价一路飙升,现在已经成了南京最贵的区域之一。鼓楼区作为南京市的核心地带,拥有众多商场和学区房,其均价一直居高不下。从整体上来看,南京市各个区域(除去存在误差的六合区)均价都已经超过了20000元/平米。这些可以体现出近几年南京市房价猛涨的结果。浦口区的房价虽然相比其他区域已经很低了,但相较于浦口前几年的房价,差不多是翻了一番。 156 | 157 | ![](media/3f4c9c379cd1198710d449f65a29b969.png) 158 | 159 | 图15 南京各区域二手房平均单价 160 | 161 | 4.3.2 南京各区域二手房单价和总价箱线图 162 | 163 | 
南京各区域二手房单价箱线图(见图16)横轴为南京各区域名称,纵轴为单价(元/平米)。二手房平均单价虽然是一个重要参考数据,但平均值不能有效地表示出数据整体上的分布情况,特别是数据中一些离散值的分布情况,这些信息的表现则需要借助箱线图。从图16中可以看出,建邺和鼓楼两个区域房源单价正常值分布都不是太集中,50%的单价分布在30000-50000的区间内,区间跨度比其他区都要大。虽然建邺区平均单价略高于鼓楼区,但鼓楼区的异常值特别多,单价超过50000的房源数不胜数,最高单价有达到100000的,单价上限远高于建邺区,而建邺区异常值相对较少。综合以上情况来看,鼓楼区应该是南京市单价最高的区域。与鼓楼区相邻的玄武区和秦淮区单价正常值分布较为集中,50%的数据都分布在30000-40000之间,但这两个区域的异常值也比较多,单价上限也非常高。这些区域单价如此多的异常值,跟这些区域集中的教育和医疗资源有着密不可分的关系。 164 | 165 | ![](media/3600212e5f46384a8a650c990c9b69cd.png) 166 | 167 | 图16 南京各区域二手房单价箱线图 168 | 169 | 南京各区域二手房总价箱线图(见图17和图18)横轴为南京各区域名称,纵轴为总价(万元)。图18对图17纵轴进行了缩放,更易于观察,其他方面没有区别。从总价这个维度来看,鼓楼、建邺这两个单价最高区域,总价非常的高,500万元的二手房已分布在正常值范围内了。南京其他各区域二手房价格大部分都集中在200-400万元之间,下四分位数十分靠近200万。江宁、栖霞虽然单价不高,但总价不低,尤其是近几年房价涨幅比较高的江宁,500万以上异常值都已经比较多了。浦口区总价数据分布最为集中,绝大部分数据都在200-300万区间内。 170 | 171 | ![](media/ba94d3c7763c48c1841d2461938fc64c.png) 172 | 173 | 图17 南京各区域二手房总价箱线图 174 | 175 | ![](media/72cf1c473e17b6b3593e158b1f5139f6.png) 176 | 177 | 图18 南京各区域二手房总价箱线图 178 | 179 | 4.3.3 南京二手房单价最高Top20 180 | 181 | 南京二手房单价最高Top20水平柱状图(见图19)横轴为单价(元/平米),纵轴为小区名字。从图中可以看出,单价前20的房源都已经超过9万,并且都集中在鼓楼区,这也印证了上面箱线图中鼓楼区如此多异常值的存在。 182 | 183 | ![](media/99f40069991fafefac2d54413eed2873.png) 184 | 185 | 图19 南京二手房单价最高Top20 186 | 187 | 4.3.4 南京二手房单价和总价热力图 188 | 189 | 南京二手房单价热力图(见图20)和南京二手房总价热力图(见图21)红色区域代表房源密集度高且房价高的区域。从图中可以看出鼓楼、玄武、秦淮、建邺上半部分是密集度最高的区域。这4个区域处于南京市正中心的位置,交通方便,医疗、教育等资源集中,这些因素一起造就了这些区域的高价格。 190 | 191 | ![](media/14f211bcdea0afc6a5866eb957e5c6f0.png) 192 | 193 | 图20 南京二手房单价热力图 194 | 195 | ![](media/2d3af90024651152cd449e987d16dce0.png) 196 | 197 | 图21 南京二手房总价热力图 198 | 199 | 4.3.5 南京二手房总价小于200万的分布图 200 | 201 | 南京二手房总价小于200万的房源一共有6000多套,分布图见图23。从图中我们可以看出,除了鼓楼区和建邺区比较少,其他区域低于200万的房子还是有的。 202 | 203 | ![](media/41217f9c0ec10a9e218b497c746c8929.png) 204 | 205 | 图23 南京二手房总价小于200万的分布图 206 | 207 | 4.3.6 南京二手房建筑面积分析 208 | 209 | 南京二手房建筑面积分布区间图(图24)横轴为房源数量(套),纵轴为分布区间(平米)。从图中可以看出在建筑面积50-100区间内房源数量最多,超过了10000套。其次是100-150区间与小于50的区间。 210 | 211 | ![](media/d650c9f0c35cf2a96466e9cd85ff6335.png) 212 | 213 | 图24 南京二手房建筑面积分布区间柱状图 
214 | 215 | 南京各区域平均建筑面积柱状图(图25)横轴为各区域名字,纵轴为建筑面积(平米)。从图中可以看出玄武、秦淮、鼓楼这几个单价比较高的老城区平均建筑面积最小,平均面积80平米左右。反而是江宁、浦口这两个单价最低的区域平均建筑面积最大,平均面积大小超过了100平米。 216 | 217 | ![](media/f4b058489a6ded3af88d05a514c320e9.png) 218 | 219 | 图25 南京各区域二手房平均建筑面积柱状图 220 | 221 | 4.3.7 南京二手房单价、总价与建筑面积散点图 222 | 223 | 南京二手房总价与建筑面积散点图(图26)横轴为建筑面积(平米),纵轴为总价(万元)。从图中可以看出,总价与建筑面积这两个变量符合正相关关系。数据点分布比较集中,大多数都在总价0-1500万元与建筑面积0-400平米这个区域内。 224 | 225 | ![](media/25082d4d0e64ad0d3289b5f91c1b042b.png) 226 | 227 | 图26 南京二手房总价与建筑面积散点图 228 | 229 | 南京二手房单价与建筑面积散点图(图27)横轴为建筑面积(平米),纵轴为单价(元/平米)。从图中可以看出建筑面积与单价并无明显关系,同样样本点分布也较为集中,离散值不多,但单价特别高的房源,建筑面积都不是太大,可能因为这些房源一般都位于市中心。 230 | 231 | ![](media/1be9bdafe857653afb5c805ea37ea2d8.png) 232 | 233 | 图27 南京二手房单价与建筑面积散点图 234 | 235 | ### 4.4 南京二手房房屋属性可视化分析 236 | 237 | 4.4.1 南京二手房房屋户型占比情况 238 | 239 | 从南京二手房房屋户型饼状图(图28)中可以看出,2室1厅与2室2厅作为标准配置,一共占比接近一半。其中3室2厅和3室1厅的房源也占比不少,其他房屋户型的房源占比就比较少了。 240 | 241 | ![](media/952ba4c527bdaf812956ccfc79b61659.png) 242 | 243 | 图28 南京二手房房屋户型饼状图 244 | 245 | 4.4.2 南京二手房房屋装修情况 246 | 247 | 从南京二手房房屋装修情况饼状图(图29)可以看出,近60%的房源的房屋装修情况都是其他,可能因为房源全部为二手房的缘故,大家都自主装修过。 248 | 249 | ![](media/1aac0c8a9e00655937fa39f3b13a811f.png) 250 | 251 | 图29 南京二手房装修情况饼状图 252 | 253 | 4.4.3 南京二手房房屋朝向分布情况 254 | 255 | 南京二手房房屋朝向柱状图(图30)横轴为房屋朝向,纵轴为房源数量(套)。从图中我们可以看出,只有少数几种朝向的房源比较多,其余的都非常少,明显属于长尾分布类型(严重偏态)。这也符合我们的认识,房屋朝向一半以上都是坐北朝南的。 256 | 257 | ![](media/f440aaab0d41900fdf1532b8524395da.png) 258 | 259 | 图30 南京二手房房屋朝向分布柱状图 260 | 261 | 4.4.4 南京二手房建筑类型占比情况 262 | 263 | 从南京二手房建筑类型饼状图(图31)中,我们可以看出房源的建筑类型65.6%都是板楼,现在房地产商喜欢开发的塔楼反而较少,这和南京二手房建筑时间都比较久远相符。 264 | 265 | ![](media/48638b87c85b5b47880f3cda9bcde726.png) 266 | 267 | 图31 南京二手房建筑类型饼状图 268 | 269 | ## 5 数据聚类分析 270 | 271 | 该阶段采用聚类算法中的k-means算法对所有二手房数据进行聚类分析,根据聚类的结果和经验,将这些房源大致分类,以达到对数据概括总结的目的。在聚类过程中,我们选择了面积、总价和单价这三个数值型变量作为样本点的聚类属性。 272 | 273 | ### 5.1 k-means算法原理 274 | 275 | 5.1.1 基本原理 276 | 277 | k-Means算法是一种使用最普遍的聚类算法,它是一种无监督学习算法,目的是将相似的对象归到同一个簇中。簇内的对象越相似,聚类的效果就越好。该算法不适合处理离散型属性,但对于连续型属性具有较好的聚类效果。 278 | 279 | 5.1.2 聚类效果判定标准 280 | 281 | 
使各个样本点与所在簇的质心的误差平方和达到最小,这是评价k-means算法最后聚类效果的评价标准。 282 | 283 | ![](media/432cf5efa4275deae4be1d4fd8239b3e.png) 284 | 285 | 5.1.3 算法实现步骤 286 | 287 | 1)选定k值。 288 | 289 | 2)创建k个点作为k个簇的起始质心。 290 | 291 | 3)分别计算剩下的元素到k个簇的质心的距离,将这些元素分别划归到距离最小的簇。 292 | 293 | 4)根据聚类结果,重新计算k个簇各自的新的质心,即取簇中全部元素各自维度下的算术平均值。 294 | 295 | 5)将全部元素按照新的质心重新聚类。 296 | 297 | 6)重复第4、5步,直到聚类结果不再变化。 298 | 299 | 7)最后,输出聚类结果。 300 | 301 | 5.1.4 算法缺点 302 | 303 | 虽然K-Means算法原理简单,但是有自身的缺陷: 304 | 305 | 1)聚类的簇数k值需在聚类前给出,但在很多时候k值的选定是十分难以估计的,很多情况我们聚类前并不清楚给出的数据集应当分成多少类才最恰当。 306 | 307 | 2)k-means需要人为地确定初始质心,不一样的初始质心可能会得出差别很大的聚类结果,无法保证k-means算法收敛于全局最优解。 308 | 309 | 3)对离群点敏感。 310 | 311 | 4)结果不稳定(受输入顺序影响)。 312 | 313 | 5)时间复杂度高O(nkt),其中n是对象总数,k是簇数,t是迭代次数。 314 | 315 | ### 5.2 算法实现关键问题说明 316 | 317 | 5.2.1 K值的选定说明 318 | 319 | 根据聚类原则:组内差距要小,组间差距要大。我们先算出不同k值下各个SSE(Sum of 320 | squared 321 | errors)值,然后绘制出折线图(图32)来比较,从中选定最优解。从图中,我们可以看出k值到达5以后,SSE变化趋于平缓,所以我们选定5作为k值。 322 | 323 | ![](media/1bffd1df3eaecdbad8c4d8086f40caef.png) 324 | 325 | 图32 不同k值下SSE值折线图 326 | 327 | 5.2.2 初始的K个质心选定说明 328 | 329 | 初始的k个质心选定是采用的随机法。从各列数值最大值和最小值中间按正态分布随机选取k个质心。5.2.3 330 | 关于离群点 331 | 332 | 离群点就是远离整体的,非常异常、非常特殊的数据点。因为k-means算法对离群点十分敏感,所以在聚类之前应该将这些“极大”、“极小”之类的离群数据都去掉,否则会对聚类的结果有影响。离群点的判定标准是根据前面数据可视化分析过程的散点图和箱线图进行判定。根据散点图和箱线图,需要去除离散值的范围如下: 333 | 334 | 1)单价:基本都在100000以内,没有特别的异常值。 335 | 336 | 2)总价:基本都集中在3000以内,这里我们需要去除3000外的异常值。 337 | 338 | 3)建筑面积:基本都集中在500以内,这里我们需要去除500外的异常值。 339 | 340 | 5.2.4 数据的标准化 341 | 342 | 因为总价的单位为万元,单价的单位为元/平米,建筑面积的单位为平米,所以数据点计算出欧几里德距离的单位是没有意义的。同时,总价都是3000以内的数,建筑面积都是500以内的数,但单价基本都是20000以上的数,在计算距离时单价起到的作用就比总价大,总价和单价的作用都远大于建筑面积,这样聚类出来的结果是有问题的。这样的情况下,我们需要将数据标准化,即将数据按比例缩放,使之都落入一个特定区间内。去除数据的单位限制,将其转化为无量纲的纯数值,便于不同单位或量级的指标能够进行计算和比较。 343 | 344 | 我们将单价、总价和面积都映射到500,因为面积本身就都在500以内,不需要特别处理。单价在计算距离时,需要先乘以映射比例0.005,总价需要乘以映射比例0.16。进行数据标准化前和进行数据标准化后的聚类效果对比如下:图32、图33是没有数据标准化前的聚类效果散点图;图34、图35是数据标准化后的聚类效果散点图。 345 | 346 | 数据标准化前的单价与建筑面积聚类效果散点图: 347 | 348 | ![](media/20333ad34432635a9264541ff6f78685.png) 349 | 350 | 图32 数据标准化前的单价与建筑面积散点图 351 | 352 | 
数据标准化前总价与建筑面积聚类效果散点图。 353 | 354 | ![](media/d008eede0b04bcf5ca7f5300bf6bdc18.png) 355 | 356 | 图33 数据标准化前总价与建筑面积散点图 357 | 358 | 数据标准化后单价与建筑面积聚类效果散点图。 359 | 360 | ![](media/17fb68a2b314cf24f63506bc1e4effa2.png) 361 | 362 | 图34 数据标准化后单价与建筑面积散点图 363 | 364 | 数据标准化后总价与建筑面积聚类效果散点图。 365 | 366 | ![](media/51e0bf81079717f142fb1a5941639fc3.png) 367 | 368 | 图35 数据标准化后总价与建筑面积散点图 369 | 370 | ### 5.3 聚类结果分析 371 | 372 | 聚类结果如下: 373 | 374 | 1)聚类结果统计信息如下: 375 | 376 | ![](media/b4aae91db851ff529bcee3509315760f.png) 377 | 378 | 2)聚类后的单价与建筑面积散点图和总价与建筑面积散点图见图34、图35。 379 | 380 | 3)聚类结果分组0、1、2、3、4的区域分布图分别如下:图36、图37、图38、图39、图40。 381 | 382 | 聚类结果分组0的区域分布图如下: 383 | 384 | ![](media/596bdc36ae6ec999e35b77ad430e9f46.png) 385 | 386 | 图36 聚类结果0区域分布图 387 | 388 | 聚类结果分组1的区域分布图如下: 389 | 390 | ![](media/30e8121447d2b47f39f57a20f2a094d2.png) 391 | 392 | 图37 聚类结果1区域分布图 393 | 394 | 聚类结果分组2的区域分布图如下: 395 | 396 | ![](media/9045be1b03797152030b08f6e28d7c85.png) 397 | 398 | 图38 聚类结果2区域分布图 399 | 400 | 聚类结果分组3的区域分布图如下: 401 | 402 | ![](media/5692637864b7007bd3c02642cf516ffe.png) 403 | 404 | 图39 聚类结果3区域分布图 405 | 406 | 聚类结果分组4的区域分布图如下: 407 | 408 | ![](media/a32637beec319f1d6d95c6684cc2e4f4.png) 409 | 410 | 图40 聚类结果4区域分布图 411 | 412 | 根据以上聚类结果和我们的经验分析,我们大致可以将这20000多套房源分为以下4类: 413 | 414 | a、大户型(面积大,总价高),属于第0类。平均面积都在200平以上,这种大户型的房源相对数量较少,主要分布在鼓楼、建邺、江宁、栖霞等地(具体可见各类的区域分布图)。 415 | 416 | b、地段型(单价高),属于第2、4类。这种房源围绕南京市中心位置集中分布,地理位置极好,交通方便,主要分布在鼓楼、玄武、建邺等地(具体可见各类的区域分布图)。 417 | 418 | c、大众蜗居型(面积小、价格相对较低、房源多),属于第3类。这类房源分布范围广,主要围绕在各地铁线两边。典型的区域有秦淮、鼓楼、江宁、玄武、浦口等地。 419 | 420 | d、高性价比型(面积相对大,单价低),属于第1类。典型的区域有栖霞、浦口、江宁等地。 421 | -------------------------------------------------------------------------------- /media/14f211bcdea0afc6a5866eb957e5c6f0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/14f211bcdea0afc6a5866eb957e5c6f0.png 
-------------------------------------------------------------------------------- /media/17fb68a2b314cf24f63506bc1e4effa2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/17fb68a2b314cf24f63506bc1e4effa2.png -------------------------------------------------------------------------------- /media/1aac0c8a9e00655937fa39f3b13a811f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/1aac0c8a9e00655937fa39f3b13a811f.png -------------------------------------------------------------------------------- /media/1be9bdafe857653afb5c805ea37ea2d8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/1be9bdafe857653afb5c805ea37ea2d8.png -------------------------------------------------------------------------------- /media/1bffd1df3eaecdbad8c4d8086f40caef.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/1bffd1df3eaecdbad8c4d8086f40caef.png -------------------------------------------------------------------------------- /media/20333ad34432635a9264541ff6f78685.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/20333ad34432635a9264541ff6f78685.png -------------------------------------------------------------------------------- /media/25082d4d0e64ad0d3289b5f91c1b042b.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/25082d4d0e64ad0d3289b5f91c1b042b.png -------------------------------------------------------------------------------- /media/2d3af90024651152cd449e987d16dce0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/2d3af90024651152cd449e987d16dce0.png -------------------------------------------------------------------------------- /media/30e8121447d2b47f39f57a20f2a094d2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/30e8121447d2b47f39f57a20f2a094d2.png -------------------------------------------------------------------------------- /media/31f9bd59a9948b99ce4a68cb33c62637.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/31f9bd59a9948b99ce4a68cb33c62637.png -------------------------------------------------------------------------------- /media/3600212e5f46384a8a650c990c9b69cd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/3600212e5f46384a8a650c990c9b69cd.png -------------------------------------------------------------------------------- /media/3a4625cbb8d333691acd752dbac9f869.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/3a4625cbb8d333691acd752dbac9f869.png 
-------------------------------------------------------------------------------- /media/3f4c9c379cd1198710d449f65a29b969.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/3f4c9c379cd1198710d449f65a29b969.png -------------------------------------------------------------------------------- /media/41217f9c0ec10a9e218b497c746c8929.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/41217f9c0ec10a9e218b497c746c8929.png -------------------------------------------------------------------------------- /media/432cf5efa4275deae4be1d4fd8239b3e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/432cf5efa4275deae4be1d4fd8239b3e.png -------------------------------------------------------------------------------- /media/48638b87c85b5b47880f3cda9bcde726.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/48638b87c85b5b47880f3cda9bcde726.png -------------------------------------------------------------------------------- /media/48743e4e4d35f8e96e5b7ed47305c177.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/48743e4e4d35f8e96e5b7ed47305c177.png -------------------------------------------------------------------------------- /media/4a89d0cdd8af0769bf85ee809c13fdd5.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/4a89d0cdd8af0769bf85ee809c13fdd5.png -------------------------------------------------------------------------------- /media/51e0bf81079717f142fb1a5941639fc3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/51e0bf81079717f142fb1a5941639fc3.png -------------------------------------------------------------------------------- /media/5692637864b7007bd3c02642cf516ffe.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/5692637864b7007bd3c02642cf516ffe.png -------------------------------------------------------------------------------- /media/596bdc36ae6ec999e35b77ad430e9f46.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/596bdc36ae6ec999e35b77ad430e9f46.png -------------------------------------------------------------------------------- /media/5aa0e402894a90180243e461e6a39f95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/5aa0e402894a90180243e461e6a39f95.png -------------------------------------------------------------------------------- /media/72cf1c473e17b6b3593e158b1f5139f6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/72cf1c473e17b6b3593e158b1f5139f6.png 
-------------------------------------------------------------------------------- /media/83ebc8892cc133b6c2967b1680a949cb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/83ebc8892cc133b6c2967b1680a949cb.png -------------------------------------------------------------------------------- /media/87ae2de4ca158afeaefc0d1533b29d5d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/87ae2de4ca158afeaefc0d1533b29d5d.png -------------------------------------------------------------------------------- /media/9045be1b03797152030b08f6e28d7c85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/9045be1b03797152030b08f6e28d7c85.png -------------------------------------------------------------------------------- /media/952ba4c527bdaf812956ccfc79b61659.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/952ba4c527bdaf812956ccfc79b61659.png -------------------------------------------------------------------------------- /media/99f40069991fafefac2d54413eed2873.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/99f40069991fafefac2d54413eed2873.png -------------------------------------------------------------------------------- /media/a32637beec319f1d6d95c6684cc2e4f4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/a32637beec319f1d6d95c6684cc2e4f4.png -------------------------------------------------------------------------------- /media/a7672004ad36ba789c1af504b72779a7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/a7672004ad36ba789c1af504b72779a7.png -------------------------------------------------------------------------------- /media/b4aae91db851ff529bcee3509315760f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/b4aae91db851ff529bcee3509315760f.png -------------------------------------------------------------------------------- /media/ba94d3c7763c48c1841d2461938fc64c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/ba94d3c7763c48c1841d2461938fc64c.png -------------------------------------------------------------------------------- /media/c7bba517b1bf88ab5b4164008c714adb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/c7bba517b1bf88ab5b4164008c714adb.png -------------------------------------------------------------------------------- /media/d008eede0b04bcf5ca7f5300bf6bdc18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/d008eede0b04bcf5ca7f5300bf6bdc18.png 
-------------------------------------------------------------------------------- /media/d650c9f0c35cf2a96466e9cd85ff6335.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/d650c9f0c35cf2a96466e9cd85ff6335.png -------------------------------------------------------------------------------- /media/e8fbbc9ebf9c99dc3228849a6c454f4e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/e8fbbc9ebf9c99dc3228849a6c454f4e.png -------------------------------------------------------------------------------- /media/f440aaab0d41900fdf1532b8524395da.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/f440aaab0d41900fdf1532b8524395da.png -------------------------------------------------------------------------------- /media/f4b058489a6ded3af88d05a514c320e9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/media/f4b058489a6ded3af88d05a514c320e9.png -------------------------------------------------------------------------------- /原始数据及清洗后的数据/原始数据/ershoufang-origin-ansi.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/原始数据及清洗后的数据/原始数据/ershoufang-origin-ansi.csv -------------------------------------------------------------------------------- /原始数据及清洗后的数据/清洗后的数据/ershoufang-clean-ansi-v1.1.csv: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/原始数据及清洗后的数据/清洗后的数据/ershoufang-clean-ansi-v1.1.csv -------------------------------------------------------------------------------- /数据分析程序/data_analysis/.spyproject/codestyle.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/.spyproject/encoding.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/.spyproject/vcs.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.1.0 7 | 8 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/.spyproject/workspace.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.1.0 9 | recent_files = ['C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\data_analysis\\data_ana\\price_and_area.py', 'C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\data_analysis\\data_ana\\gaode_map.py', 'C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\data_analysis\\data_ana\\house_attr.py', 'C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\data_analysis\\data_cluster\\kmeans.py'] 10 | 11 | -------------------------------------------------------------------------------- 
/数据分析程序/data_analysis/data_ana/business_attr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 30 15:22:06 2018 4 | Plot a horizontal bar chart of listing counts per value of the "fwyt" column. 5 | @author: zhangying 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | 12 | # Use the SimHei font so that Chinese labels render correctly 13 | plt.rcParams['font.sans-serif'] = ['SimHei'] 14 | # Keep the minus sign rendering correctly 15 | plt.rcParams['axes.unicode_minus'] = False 16 | 17 | """1、数据加载""" 18 | # File name of the data set to load (backslash-separated relative path) 19 | #filename = "data_file\\ershoufang-mini-utf8.csv" 20 | filename = "data_file\\ershoufang-clean-utf8-v1.1.csv" 21 | # Custom column names (the row index stays the pandas default) 22 | names = [ 23 | "id","communityName","areaName","total","unitPriceValue", 24 | "fwhx","szlc","jzmj","hxjg","tnmj", 25 | "jzlx","fwcx","jzjg","zxqk","thbl", 26 | "pbdt","cqnx","gpsj","jyqs","scjy", 27 | "fwyt","fwnx","cqss","dyxx","fbbj", 28 | ] 29 | # Markers that are treated as missing values when reading the CSV 30 | miss_value = ["null","暂无数据"] 31 | # Column dtypes are inferred automatically by pandas 32 | # Use the custom column names, skip the file's header row, and treat the markers above as NaN 33 | df = pd.read_csv(filename,skiprows=[0],names=names,na_values=miss_value) 34 | #print(df.info()) 35 | 36 | """2、数据运算""" 37 | """3、数据可视化呈现""" 38 | 39 | 40 | """南京二手房房屋用途占水平柱状图""" 41 | count_fwyt = df["fwyt"].value_counts(ascending=True) 42 | count_fwyt.name = "" 43 | 44 | fig = plt.figure(figsize=(12,7)) 45 | ax = fig.add_subplot(111) 46 | ax.set_xlabel("房源数量(套)",fontsize=14) 47 | ax.set_title("南京二手房房屋用途水平柱状图",fontsize=18) 48 | count_fwyt.plot(kind="barh",fontsize=12) -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/ciyun.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 27 21:33:42 2018 4 | Render a word cloud image from the text of the cleaned listing CSV. 5 | @author: ying.zhang01 6 | """ 7 | 8 | from wordcloud import WordCloud 9 | import jieba 10 | from scipy.misc import imread # NOTE(review): scipy.misc.imread was removed in SciPy 1.2 - confirm the SciPy version this runs on 11 | 12 | 13 | """南京二手房数据词云""" 14 | # Basic configuration 15 | filename = 
"data_file\\ershoufang-clean-utf8-v1.1.csv" 16 | backpicture = "resources\\house2.jpg" 17 | savepicture = "data_ana\\picture\\南京二手房数据词云2.png" 18 | fontpath = "resources\\simhei.ttf" 19 | stopwords = ["null","暂无","数据","上传","照片","房本"] 20 | 21 | # Read the whole data file as one text string 22 | comment_text = open(filename,encoding="utf-8").read() # NOTE(review): the file object is never closed explicitly 23 | # Load the background image that is used as the cloud mask 24 | color_mask = imread(backpicture) 25 | 26 | # Segment the text with jieba and drop the unwanted stop words 27 | ershoufang_words = jieba.cut(comment_text) 28 | ershoufang_words = [word for word in ershoufang_words if word not in stopwords] 29 | cut_text = " ".join(ershoufang_words) 30 | 31 | # Configure the word cloud 32 | cloud = WordCloud( 33 | # Font file; the original author notes that text is garbled when it is not set 34 | font_path=fontpath, 35 | # Background color 36 | background_color='white', 37 | # Shape mask of the cloud 38 | mask=color_mask, 39 | # Maximum number of words 40 | max_words=2000, 41 | # Largest font size 42 | max_font_size=60 43 | ) 44 | # Generate the word cloud 45 | word_cloud = cloud.generate(cut_text) 46 | # Save the image 47 | word_cloud.to_file(savepicture) -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/gaode_map.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue May 1 15:52:55 2018 4 | Geocode each community name, save the coordinates to latlng.csv, merge them with the listings and write "lng,lat" lines to star.txt. 5 | @author: zhangying 6 | """ 7 | 8 | import json 9 | from urllib.request import quote 10 | import requests 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | import pandas as pd 14 | 15 | 16 | def getlnglat(address): 17 | """ 18 | Return the (lat, lng) pair (latitude, longitude) of a Chinese address using the Baidu geocoder endpoint below; lat and lng stay 0.0 unless the response "status" is 0. NOTE(review): requests.get is called without a timeout. 19 | """ 20 | url_base = "http://api.map.baidu.com/geocoder/v2/" 21 | output = "json" 22 | ak = "" # browser-side API key (left empty here) 23 | address = quote(address) # the address is Chinese text, so percent-encode it with quote first to avoid garbling 24 | url = url_base + '?' 
+ 'address=' + address + '&output=' + output + '&ak=' + ak 25 | lat = 0.0 26 | lng = 0.0 27 | res = requests.get(url) 28 | temp = json.loads(res.text) 29 | if temp["status"] == 0: 30 | lat = temp['result']['location']['lat'] 31 | lng = temp['result']['location']['lng'] 32 | return lat,lng 33 | 34 | 35 | # Use the SimHei font so that Chinese labels render correctly 36 | plt.rcParams['font.sans-serif'] = ['SimHei'] 37 | # Keep the minus sign rendering correctly 38 | plt.rcParams['axes.unicode_minus'] = False 39 | 40 | """1、数据加载""" 41 | # File name of the data set to load (backslash-separated relative path) 42 | #filename = "data_file\\ershoufang-mini-utf8.csv" 43 | filename = "data_file\\ershoufang-clean-utf8-v1.1.csv" 44 | # Custom column names (the row index stays the pandas default) 45 | names = [ 46 | "id","communityName","areaName","total","unitPriceValue", 47 | "fwhx","szlc","jzmj","hxjg","tnmj", 48 | "jzlx","fwcx","jzjg","zxqk","thbl", 49 | "pbdt","cqnx","gpsj","jyqs","scjy", 50 | "fwyt","fwnx","cqss","dyxx","fbbj", 51 | ] 52 | # Markers that are treated as missing values when reading the CSV 53 | miss_value = ["null","暂无数据"] 54 | # Column dtypes are inferred automatically by pandas 55 | # Use the custom column names, skip the file's header row, and treat the markers above as NaN 56 | df = pd.read_csv(filename,skiprows=[0],names=names,na_values=miss_value) 57 | 58 | 59 | """2、生成经纬度信息""" 60 | idint = [] 61 | names = [] # NOTE(review): rebinds names, which held the CSV column list above 62 | lats = [] 63 | lngs = [] 64 | lat_lng_data = {"id":idint,"communityName":names,"lat":lats,"lng":lngs} 65 | 66 | #flag = 0 67 | for idi,name in zip(list(df["id"]),list(df["communityName"])): 68 | name = str(name) 69 | lat,lng = getlnglat("南京市"+name) 70 | if lat != 0 or lng !=0: 71 | idint.append(idi) 72 | names.append(name) 73 | lats.append(lat) 74 | lngs.append(lng) 75 | print(idi) 76 | 77 | frame_test = pd.DataFrame(lat_lng_data) 78 | frame_test.to_csv("data_file\\latlng.csv") 79 | 80 | 81 | """3、合并数据,并按格式输出数据""" 82 | # Merge the coordinates into the listing data. NOTE(review): the names list below assumes latlng.csv columns are ordered index, communityName, id, lat, lng - this depends on how pandas ordered the dict keys when the frame was built; verify 83 | df_latlng = pd.read_csv("data_file\\latlng.csv",skiprows=[0],names=["did","communityName","id","lat","lng"]) 84 | del df_latlng["did"] 85 | del df_latlng["communityName"] 86 | 87 | df_merge = pd.merge(df,df_latlng,on="id") 88 | 89 | # Listings with a total price below 2 million (total < 201; unit presumably 10k CNY - confirm) 90 | xiaoyu = df_merge[df_merge["total"]<201] 91 | xiaoyu2 = 
df_merge.loc[df_merge["total"]<201] 92 | xiaoyu2 = xiaoyu2.loc[xiaoyu2["jzmj"] < 50] 93 | 94 | 95 | """4、生成需要的格式文件""" 96 | out_map = "data_file\\star.txt" 97 | with open(out_map,"w") as file_out: 98 | for lng,lat,price in zip(list(xiaoyu["lng"]),list(xiaoyu["lat"]),list(xiaoyu["total"])): 99 | out = str(lng)+","+str(lat) 100 | #out='{\"lng\":'+str(lng)+',\"lat\":'+str(lat)+',\"count\":'+str(price)+'},' 101 | file_out.write(out) 102 | file_out.write("\n") 103 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/house_attr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 30 15:21:27 2018 4 | Plot pie and bar charts of house layout, decoration, building type, orientation and floor-area distribution. 5 | @author: zhangying 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | import pandas as pd 11 | 12 | # Use the SimHei font so that Chinese labels render correctly 13 | plt.rcParams['font.sans-serif'] = ['SimHei'] 14 | # Keep the minus sign rendering correctly 15 | plt.rcParams['axes.unicode_minus'] = False 16 | 17 | """1、数据加载""" 18 | # File name of the data set to load (backslash-separated relative path) 19 | #filename = "data_file\\ershoufang-mini-utf8.csv" 20 | filename = "data_file\\ershoufang-clean-utf8-v1.1.csv" 21 | # Custom column names (the row index stays the pandas default) 22 | names = [ 23 | "id","communityName","areaName","total","unitPriceValue", 24 | "fwhx","szlc","jzmj","hxjg","tnmj", 25 | "jzlx","fwcx","jzjg","zxqk","thbl", 26 | "pbdt","cqnx","gpsj","jyqs","scjy", 27 | "fwyt","fwnx","cqss","dyxx","fbbj", 28 | ] 29 | # Markers that are treated as missing values when reading the CSV 30 | miss_value = ["null","暂无数据"] 31 | # Column dtypes are inferred automatically by pandas 32 | # Use the custom column names, skip the file's header row, and treat the markers above as NaN 33 | df = pd.read_csv(filename,skiprows=[0],names=names,na_values=miss_value) 34 | #print(df.info()) 35 | 36 | """2、数据运算""" 37 | """3、数据可视化呈现""" 38 | 39 | 40 | """南京二手房房屋户型占比情况""" 41 | count_fwhx = df['fwhx'].value_counts()[:10] 42 | count_other_fwhx = pd.Series({"其他":df['fwhx'].value_counts()[10:].count()}) # NOTE(review): .count() is the number of remaining categories, not the number of listings in them - .sum() may be intended 43 | count_fwhx = count_fwhx.append(count_other_fwhx) # NOTE(review): Series.append was removed in pandas 2.0 - confirm the pandas version 44 | count_fwhx.index.name = "" 45 | count_fwhx.name = "" 46 | 47 | fig = 
plt.figure(figsize=(9,9)) 48 | ax = fig.add_subplot(111) 49 | ax.set_title("南京二手房房屋户型占比情况",fontsize=18) 50 | count_fwhx.plot(kind="pie",cmap=plt.cm.rainbow,autopct="%3.1f%%",fontsize=12) 51 | 52 | 53 | """南京二手房房屋装修占比情况""" 54 | count_zxqk = df["zxqk"].value_counts() 55 | count_zxqk.name = "" 56 | 57 | fig = plt.figure(figsize=(9,9)) 58 | ax = fig.add_subplot(111) 59 | ax.set_title("南京二手房装修占比情况",fontsize=18) 60 | count_zxqk.plot(kind="pie",cmap=plt.cm.rainbow,autopct="%3.1f%%",fontsize=12) 61 | 62 | 63 | """南京二手房建筑类型占比情况""" 64 | count_jzlx = df["jzlx"].value_counts() 65 | count_jzlx.name = "" 66 | 67 | fig = plt.figure(figsize=(9,9)) 68 | ax = fig.add_subplot(111) 69 | ax.set_title("南京二手房建筑类型占比情况",fontsize=18) 70 | count_jzlx.plot(kind="pie",cmap=plt.cm.rainbow,autopct="%3.1f%%",fontsize=12) 71 | 72 | 73 | """南京二手房房屋朝向分布情况""" 74 | count_fwcx = df["fwcx"].value_counts()[:15] 75 | count_other_fwcx = pd.Series({"其他":df['fwcx'].value_counts()[15:].count()}) 76 | count_fwcx = count_fwcx.append(count_other_fwcx) 77 | 78 | fig = plt.figure(figsize=(12,7)) 79 | ax = fig.add_subplot(111) 80 | ax.set_title("房源朝向分布情况",fontsize=18) 81 | count_fwcx.plot(kind="bar",fontsize=12) 82 | 83 | 84 | """南京二手房建筑面积分布区间""" 85 | area_level = [0, 50, 100, 150, 200, 250, 300, 500] 86 | label_level = ['小于50', '50-100', '100-150', '150-200', '200-250', '250-300', '300-350'] 87 | jzmj_cut = pd.cut(df["jzmj"], area_level, labels=label_level) 88 | jzmj_result = jzmj_cut.value_counts() 89 | #jzmj_result = jzmj_result.sort_values() 90 | # NOTE(review): the last bin edge pair above is 300-500 but its label is '300-350' - confirm the intended label; the "其他" bucket for fwcx uses .count() (number of categories) where .sum() may be intended 91 | fig = plt.figure(figsize=(12,7)) 92 | ax = fig.add_subplot(111) 93 | ax.set_ylabel("建筑面积(㎡)",fontsize=14) 94 | ax.set_title("南京二手房建筑面积分布区间",fontsize=18) 95 | jzmj_result.plot(kind="barh",fontsize=12) 96 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/map/南京二手房单价热力图.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 
12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 69 | 70 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/map/南京二手房总价小于200万的分布图.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/map/南京二手房总价热力图.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 68 | 69 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/map/原始代码.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 热力图功能示例 9 | 16 | 17 | 18 |
19 |
20 | 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/map/闪烁点.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 基本地图展示 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
16 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/10南京二手房房屋用途水平柱状图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/10南京二手房房屋用途水平柱状图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/11南京各区域二手房总价箱形图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/11南京各区域二手房总价箱形图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/11南京各区域二手房总价箱形图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/11南京各区域二手房总价箱形图2.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/12南京各区域二手房单价箱形图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/12南京各区域二手房单价箱形图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/13南京二手房建筑面积分布区间.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/13南京二手房建筑面积分布区间.png 
-------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/14南京二手房总价与建筑面积散点图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/14南京二手房总价与建筑面积散点图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/15南京二手房单价与建筑面积散点图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/15南京二手房单价与建筑面积散点图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/1南京各区域二手房平均单价.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/1南京各区域二手房平均单价.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/2南京各区域二手房平均建筑面积.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/2南京各区域二手房平均建筑面积.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/3南京各区域平均建筑面积和单价.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/3南京各区域平均建筑面积和单价.png 
-------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/4南京二手房各区域房源数量.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/4南京二手房各区域房源数量.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/4南京二手房各区域房源数量2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/4南京二手房各区域房源数量2.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/5南京二手房单价最高Top10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/5南京二手房单价最高Top10.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/5南京二手房单价最高Top20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/5南京二手房单价最高Top20.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/6南京二手房房屋户型占比情况.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/6南京二手房房屋户型占比情况.png -------------------------------------------------------------------------------- 
/数据分析程序/data_analysis/data_ana/picture/7南京二手房房屋装修占比情况.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/7南京二手房房屋装修占比情况.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/8南京二手房建筑类型占比情况.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/8南京二手房建筑类型占比情况.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/9南京二手房房屋朝向分布情况.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/9南京二手房房屋朝向分布情况.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/南京二手房数据词云.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/南京二手房数据词云.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/picture/南京二手房数据词云2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_ana/picture/南京二手房数据词云2.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_ana/price_and_area.py: 
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 30 15:20:45 2018

Price and floor-area charts for the Nanjing second-hand housing listings.

Loads the cleaned listing CSV and draws, per district: mean unit price,
unit-price and total-price box plots, mean floor area and listing counts,
followed by the Top-20 unit prices and two price-versus-area scatter plots.

@author: zhangying
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign readable with that font.
plt.rcParams['axes.unicode_minus'] = False


# ---- 1. Load data ----
filename = "data_file\\ershoufang-clean-utf8-v1.1.csv"
# Column names replace the header row of the file (which is skipped below).
names = [
    "id", "communityName", "areaName", "total", "unitPriceValue",
    "fwhx", "szlc", "jzmj", "hxjg", "tnmj",
    "jzlx", "fwcx", "jzjg", "zxqk", "thbl",
    "pbdt", "cqnx", "gpsj", "jyqs", "scjy",
    "fwyt", "fwnx", "cqss", "dyxx", "fbbj",
]
# Markers that are read as missing values.
miss_value = ["null", "暂无数据"]
df = pd.read_csv(filename, skiprows=[0], names=names, na_values=miss_value)


def _values_by_area(frame, value_col):
    """Return a frame with one column per district holding ``value_col``.

    Rows keep the listing index of ``frame``, so every column is NaN for the
    listings of other districts.  The frame is sized by the data itself
    instead of a fixed row count, so no listing is dropped.
    """
    grouped = frame[value_col].groupby(frame["areaName"])
    return pd.DataFrame({name: group for name, group in grouped})


def _new_axes(title, ylabel=None):
    """Create a 12x7 figure and return its single axes with title/y-label set."""
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)
    ax.set_title(title, fontsize=18)
    if ylabel is not None:
        ax.set_ylabel(ylabel, fontsize=14)
    return ax


# ---- 2/3. Aggregate and plot ----

# Mean unit price per district.
mean_unitprice = df["unitPriceValue"].groupby(df["areaName"]).mean()
mean_unitprice.index.name = ""

ax = _new_axes("南京各区域二手房平均单价", "单价(元/平米)")
mean_unitprice.plot(kind="bar", fontsize=12, ax=ax)


# Unit-price box plot per district.
# The y axis shows unit price, so it is labelled as such (not as total price).
ax = _new_axes("南京各区域二手房单价箱线图", "单价(元/平米)")
_values_by_area(df, "unitPriceValue").plot(
    kind="box", fontsize=12, sym='r+', grid=True, ax=ax,
    yticks=[20000, 30000, 40000, 50000, 100000],
)


# Total-price box plot per district.
ax = _new_axes("南京各区域二手房总价箱线图", "总价(万元)")
_values_by_area(df, "total").plot(
    kind="box", fontsize=12, sym='r+', grid=True, ax=ax,
    yticks=[0, 200, 500, 1000, 2000, 3000], ylim=[0, 2100],
)


# Mean floor area per district.
mean_jzmj = df["jzmj"].groupby(df["areaName"]).mean()
mean_jzmj.index.name = ""

ax = _new_axes("南京各区域二手房平均建筑面积", "建筑面积(㎡)")
mean_jzmj.plot(kind="bar", fontsize=12, ax=ax)


# Mean unit price and mean floor area stacked in one figure.
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.set_ylabel("单价(元/平米)", fontsize=14)
ax1.set_title("南京各区域二手房平均单价", fontsize=18)
ax2 = fig.add_subplot(2, 1, 2)
ax2.set_ylabel("建筑面积(㎡)", fontsize=14)
ax2.set_title("南京各区域二手房平均建筑面积", fontsize=18)
plt.subplots_adjust(hspace=0.4)

mean_unitprice.plot(kind="bar", ax=ax1, fontsize=12)
mean_jzmj.plot(kind="bar", ax=ax2, fontsize=12)


# Number of listings per district, drawn in ascending order.
count_area = df["id"].groupby(df["areaName"]).count()
count_area.index.name = ""

ax = _new_axes("南京各区域二手房房源数量", "房源数量(套)")
count_area.sort_values().plot(kind="line", fontsize=12, grid=True, marker="o", ax=ax)


# Top-20 unit prices, re-sorted ascending so the highest bar ends up on top
# of the horizontal bar chart.
unitprice_top = df.sort_values(by="unitPriceValue", ascending=False)[:20]
unitprice_top = unitprice_top.sort_values(by="unitPriceValue")
unitprice_top = unitprice_top.set_index("communityName")
unitprice_top.index.name = ""

ax = _new_axes("南京二手房单价最高Top20", "单价(元/平米)")
unitprice_top["unitPriceValue"].plot(kind="barh", fontsize=12, ax=ax)


# Scatter plots of price against floor area.  Axis labels are set after the
# plot call so they replace the column names pandas puts there.
area_ticks = [0, 50, 100, 150, 200, 250, 300, 400, 500, 600, 700]

ax = _new_axes("南京二手房总价与建筑面积散点图")
df.plot(x="jzmj", y="total", kind="scatter", fontsize=12, ax=ax, alpha=0.4,
        xticks=area_ticks, xlim=[0, 800])
ax.set_xlabel("建筑面积(㎡)", fontsize=14)
ax.set_ylabel("总价(万元)", fontsize=14)

ax = _new_axes("南京二手房单价与建筑面积散点图")
df.plot(x="jzmj", y="unitPriceValue", kind="scatter", grid=True, fontsize=12,
        ax=ax, alpha=0.4, xticks=area_ticks, xlim=[0, 800])
ax.set_xlabel("建筑面积(㎡)", fontsize=14)
ax.set_ylabel("单价(元/平米)", fontsize=14)
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 22:15:54 2018

Exploratory charts for the Nanjing second-hand housing listings: per-district
means and counts, the Top-10 unit prices, and the share of each layout,
decoration state, building type and orientation.

@author: zhangying
"""

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign readable with that font.
plt.rcParams['axes.unicode_minus'] = False


# ---- 1. Load data ----
filename = "data_file\\ershoufang-clean-utf8-v1.1.csv"
# Column names replace the header row of the file (which is skipped below).
names = [
    "id", "communityName", "areaName", "total", "unitPriceValue",
    "fwhx", "szlc", "jzmj", "hxjg", "tnmj",
    "jzlx", "fwcx", "jzjg", "zxqk", "thbl",
    "pbdt", "cqnx", "gpsj", "jyqs", "scjy",
    "fwyt", "fwnx", "cqss", "dyxx", "fbbj",
]
# Markers that are read as missing values.
miss_value = ["null", "暂无数据"]
df = pd.read_csv(filename, skiprows=[0], names=names, na_values=miss_value)


def _new_axes(title, ylabel=None, figsize=(12, 7)):
    """Create a figure and return its single axes with title/y-label set."""
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(111)
    ax.set_title(title)
    if ylabel is not None:
        ax.set_ylabel(ylabel)
    return ax


def _top_counts_with_other(series, top_n):
    """Count values of ``series``; keep the ``top_n`` most frequent, pool the rest.

    The pooled "其他" entry is the number of rows that fall outside the top
    ``top_n`` values (a sum of counts), so the result still adds up to the
    number of non-missing rows.
    """
    counts = series.value_counts()
    other = pd.Series({"其他": counts.iloc[top_n:].sum()})
    # pd.concat instead of Series.append, which pandas 2.0 removed.
    return pd.concat([counts.iloc[:top_n], other])


# ---- 2/3. Aggregate and plot ----

# Mean unit price per district.
mean_unitprice = df["unitPriceValue"].groupby(df["areaName"]).mean()
mean_unitprice.index.name = "各区域名称"

ax = _new_axes("南京各区域二手房平均单价", "单价(元/平米)")
mean_unitprice.plot(kind="bar", ax=ax)


# Mean floor area per district.
mean_jzmj = df["jzmj"].groupby(df["areaName"]).mean()
mean_jzmj.index.name = "各区域名称"

ax = _new_axes("南京各区域二手房平均建筑面积", "建筑面积(㎡)")
mean_jzmj.plot(kind="bar", ax=ax)


# Both means stacked in one figure; only the lower chart keeps an x label.
mean_unitprice.index.name = ""

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax1.set_ylabel("单价(元/平米)")
ax1.set_title("南京各区域二手房平均单价")
ax2 = fig.add_subplot(2, 1, 2)
ax2.set_ylabel("建筑面积(㎡)")
ax2.set_title("南京各区域二手房平均建筑面积")
plt.subplots_adjust(hspace=0.4)

mean_unitprice.plot(kind="bar", ax=ax1)
mean_jzmj.plot(kind="bar", ax=ax2)


# Number of listings per district.
count_area = df["id"].groupby(df["areaName"]).count()
count_area.index.name = "各区域名称"

ax = _new_axes("南京各区域二手房房源数量", "房源数量(套)")
count_area.plot(kind="bar", ax=ax)


# Top-10 unit prices, labelled by community name.
unitprice_top = df.sort_values(by="unitPriceValue", ascending=False)[:10]
unitprice_top = unitprice_top.set_index("communityName")
unitprice_top.index.name = ""

ax = _new_axes("南京二手房单价最高Top10", "单价(元/平米)")
unitprice_top["unitPriceValue"].plot(kind="bar", ax=ax)


# Share of each house layout (ten most common plus "other").
count_fwhx = _top_counts_with_other(df["fwhx"], 10)
count_fwhx.index.name = ""
count_fwhx.name = ""  # the Series name would otherwise become the y label

ax = _new_axes("南京二手房房屋户型占比情况", figsize=(8, 8))
count_fwhx.plot(kind="pie", cmap=plt.cm.rainbow, autopct="%3.1f%%", ax=ax)


# Share of each decoration state.
count_zxqk = df["zxqk"].value_counts()
count_zxqk.name = ""

ax = _new_axes("南京二手房装修占比情况", figsize=(8, 8))
count_zxqk.plot(kind="pie", cmap=plt.cm.rainbow, autopct="%3.1f%%", ax=ax)


# Share of each building type.
count_jzlx = df["jzlx"].value_counts()
count_jzlx.name = ""

ax = _new_axes("南京二手房建筑类型占比情况", figsize=(8, 8))
count_jzlx.plot(kind="pie", cmap=plt.cm.rainbow, autopct="%3.1f%%", ax=ax)


# Orientation distribution (fifteen most common plus "other").
count_fwcx = _top_counts_with_other(df["fwcx"], 15)

ax = _new_axes("房源朝向分布情况")
count_fwcx.plot(kind="bar", ax=ax)


# ---------------------------------------------------------------------------
# /数据分析程序/data_analysis/data_clean/test_clean.py
# ---------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 24 16:45:52 2018

Clean the raw listing CSV: read every record, realign and normalise its
fields, and write the records that could be converted.

@author: ying.zhang01
"""

import re
import csv


# A listing date such as 2018-03-31.
DATE_RE = re.compile(r"\d{4}-\d{1,2}-\d{1,2}")
# Field values that mean "no data" and therefore carry no unit to strip.
MISSING = ("null", "暂无数据")


def _realign(line):
    """Shift the fields of villa, commercial and garage records in place.

    These record types arrive with fields in different positions than
    ordinary listings; the moves below put them back under the shared header.
    Raises IndexError when the record is shorter than the positions touched.
    """
    if "别墅" in line:
        old = line[:]
        line[8] = "null"
        line[9] = old[8]
        line[10] = "null"
        line[11] = old[9]
        line[12] = old[10]
        line[13] = old[11]
        line[14] = "null"
        line[15] = "null"
        line[16] = old[13]
    if "商业办公类" in line:
        # Drop up to three stray fields until position 17 holds a date.
        for _ in range(3):
            if DATE_RE.match(line[17]):
                break
            del line[17]
    if "车库" in line:
        old = line[:]
        line[5] = "null"
        line[6] = old[5]
        line[7] = "null"
        line[11] = old[7]


def _normalise(line):
    """Convert the price and area fields of ``line`` in place.

    Field 3 becomes an integer string, field 4 loses everything from "元"
    onwards, and fields 7 and 9 lose everything from "㎡" onwards unless they
    are marked missing.  Raises ValueError when field 3 is not a number.
    """
    line[3] = str(int(float(line[3])))
    line[4] = line[4].split("元")[0]
    for idx in (7, 9):
        if line[idx] not in MISSING:
            line[idx] = line[idx].split("㎡")[0]


filename = "data_file\\ershoufang-mini-utf8.csv"
with open(filename, encoding="utf-8", newline="") as f:
    context = list(csv.reader(f))

with open("data_file\\ershoufang-mini-utf8.txt", "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    for line in context:
        # Strip surrounding whitespace and line breaks from every field.
        line = [x.strip() for x in line]
        if not line:
            continue  # blank row in the input
        if line[0] == "id":
            writer.writerow(line)  # header row is copied unchanged
            continue

        try:
            _realign(line)
            _normalise(line)
        except (ValueError, IndexError) as err:
            # Skip only this record, and say which one and why.
            print("数据项转换失败!该记录未写入", line[:1], repr(err))
            continue
        writer.writerow(line)
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 23 10:09:15 2016
K-means cluster
@author: liudiwei
"""

import numpy as np

# Weights applied to the first two feature components before measuring the
# Euclidean distance in KMeansClassifier.
_FEATURE_SCALE = (0.16, 0.005)


def _as_ndarray(value, name):
    """Return ``value`` as a plain ndarray, raising TypeError if impossible."""
    if isinstance(value, np.ndarray) and not isinstance(value, np.matrix):
        return value
    try:
        return np.asarray(value)
    except Exception as err:
        raise TypeError("numpy.ndarray required for " + name) from err


def _nearest_centroid_labels(model, X):
    """Label each row of ``X`` with the index of the closest centroid of ``model``.

    Rows are compared in full against ``model._centroids`` using
    ``model._calEDist``.  A row for which no distance compares below infinity
    keeps the label -1.
    """
    m = X.shape[0]
    preds = np.full((m,), -1.0)
    for i in range(m):
        minDist = np.inf
        for j in range(model._k):
            distJI = model._calEDist(model._centroids[j, :], X[i, :])
            if distJI < minDist:
                minDist = distJI
                preds[i] = j
    return preds


class KMeansClassifier():
    """K-means clustering over columns 1..3 of a sample matrix.

    ``fit`` skips column 0 and clusters on columns 1:4; ``_randCent`` derives
    the feature count as ``shape[1] - 3``.  The two agree for a six-column
    input (one leading column, three features, two trailing columns).
    """

    def __init__(self, k=3, initCent='random', max_iter=5000):
        """Store the cluster count, initialisation mode and iteration cap."""
        self._k = k
        self._initCent = initCent  # only 'random' creates centroids in fit
        self._max_iter = max_iter
        self._centroids = None
        # m x 2 matrix: column 0 is the cluster index of each sample,
        # column 1 the squared distance to that cluster's centroid.
        self._clusterAssment = None
        self._labels = None
        self._sse = None  # sum of squared errors

    def _calEDist(self, arrA, arrB):
        """Return the weighted Euclidean distance between two 1-D arrays.

        The first two components are scaled by ``_FEATURE_SCALE`` before the
        distance is taken.  Inputs are converted to float first, so integer
        arrays are not truncated, and they are never modified.
        """
        diff = np.asarray(arrA, dtype=float) - np.asarray(arrB, dtype=float)
        diff[0] *= _FEATURE_SCALE[0]
        diff[1] *= _FEATURE_SCALE[1]
        # np.sqrt instead of np.math.sqrt: np.math no longer exists in NumPy 2.
        return float(np.sqrt(np.sum(diff ** 2)))

    def _calMDist(self, arrA, arrB):
        """Return the Manhattan distance between two 1-D arrays."""
        return np.sum(np.abs(np.asarray(arrA) - np.asarray(arrB)))

    def _randCent(self, data_X, k):
        """Return a k x n matrix of random centroids.

        ``n`` is ``data_X.shape[1] - 3``; component ``j`` is drawn uniformly
        between the minimum and maximum of column ``j + 1`` of ``data_X``.
        """
        n = data_X.shape[1] - 3
        centroids = np.empty((k, n))
        for j in range(n):
            column = data_X[:, j + 1]
            minJ = column.min()
            rangeJ = column.max() - minJ
            centroids[:, j] = minJ + rangeJ * np.random.rand(k)
        return centroids

    def fit(self, data_X):
        """Cluster the rows of ``data_X`` and store the result on the instance.

        Sets ``_centroids``, ``_clusterAssment``, ``_labels`` and ``_sse``.
        Raises TypeError when ``data_X`` cannot be converted to an ndarray and
        ValueError when no centroids are available to start from.
        """
        data_X = _as_ndarray(data_X, "data_X")

        m = data_X.shape[0]
        self._clusterAssment = np.zeros((m, 2))
        # Start from "unassigned" so the first pass always counts as a change;
        # with labels starting at 0, samples nearest to centroid 0 would look
        # converged before any centroid had been updated.
        self._clusterAssment[:, 0] = -1

        if self._initCent == 'random':
            self._centroids = self._randCent(data_X, self._k)
        elif self._centroids is None:
            raise ValueError("no initial centroids for initCent=%r" % (self._initCent,))

        features = data_X[:, 1:4]
        for _ in range(self._max_iter):
            clusterChanged = False
            for i in range(m):
                minDist = np.inf
                minIndex = -1
                for j in range(self._k):
                    distJI = self._calEDist(self._centroids[j, :], features[i])
                    if distJI < minDist:
                        minDist = distJI
                        minIndex = j
                if self._clusterAssment[i, 0] != minIndex:
                    clusterChanged = True
                # Always record the current error so the SSE reflects the
                # centroids the labels were computed against.
                self._clusterAssment[i, :] = minIndex, minDist ** 2
            if not clusterChanged:
                break  # no label moved: converged
            for c in range(self._k):
                members = features[self._clusterAssment[:, 0] == c]
                # An empty cluster keeps its centroid instead of becoming NaN.
                if len(members):
                    self._centroids[c, :] = np.mean(members, axis=0)

        self._labels = self._clusterAssment[:, 0]
        self._sse = self._clusterAssment[:, 1].sum()

    def predict(self, X):
        """Return, per row of ``X``, the index of the nearest fitted centroid.

        Rows are compared in full against the centroids, so ``X`` must hold
        only the feature columns (unlike ``fit``, which skips column 0).
        """
        X = _as_ndarray(X, "X")
        return _nearest_centroid_labels(self, X)


class biKMeansClassifier():
    """Bisecting k-means: repeatedly split one cluster in two until k exist.

    NOTE(review): the initial errors here are measured over all columns of X,
    while the splits use KMeansClassifier, which clusters on columns 1:4 only.
    Confirm the expected input layout before relying on this class.
    """

    def __init__(self, k=3):
        """Store the target number of clusters."""
        self._k = k
        self._centroids = None
        self._clusterAssment = None
        self._labels = None
        self._sse = None

    def _calEDist(self, arrA, arrB):
        """Return the (unweighted) Euclidean distance between two 1-D arrays."""
        diff = np.asarray(arrA, dtype=float) - np.asarray(arrB, dtype=float)
        return float(np.sqrt(np.sum(diff ** 2)))

    def fit(self, X):
        """Cluster the rows of ``X`` by bisection and store the result.

        Sets ``_centroids``, ``_clusterAssment``, ``_labels`` and ``_sse``.
        """
        m = X.shape[0]
        self._clusterAssment = np.zeros((m, 2))
        centroid0 = np.mean(X, axis=0).tolist()
        centList = [centroid0]
        # Squared error of every sample against the single initial centroid.
        for j in range(m):
            self._clusterAssment[j, 1] = self._calEDist(np.asarray(centroid0), X[j, :]) ** 2

        while len(centList) < self._k:
            lowestSSE = np.inf
            # Try splitting each cluster; keep the split with the lowest total.
            for i in range(len(centList)):
                inCluster = self._clusterAssment[:, 0] == i
                ptsInCurrCluster = X[inCluster, :]
                clf = KMeansClassifier(k=2)
                clf.fit(ptsInCurrCluster)
                centroidMat, splitClustAss = clf._centroids, clf._clusterAssment
                sseSplit = splitClustAss[:, 1].sum()
                # Error of the samples that are NOT in the cluster being split.
                sseNotSplit = self._clusterAssment[~inCluster, 1].sum()
                if (sseSplit + sseNotSplit) < lowestSSE:
                    bestCentToSplit = i
                    bestNewCents = centroidMat
                    bestClustAss = splitClustAss.copy()
                    lowestSSE = sseSplit + sseNotSplit
            # One half keeps the index of the split cluster, the other half
            # gets the next free index, len(centList).
            bestClustAss[:, 0] = np.where(bestClustAss[:, 0] == 1,
                                          len(centList), bestCentToSplit)
            centList[bestCentToSplit] = bestNewCents[0, :].tolist()
            centList.append(bestNewCents[1, :].tolist())
            target = self._clusterAssment[:, 0] == bestCentToSplit
            self._clusterAssment[target, :] = bestClustAss

        self._labels = self._clusterAssment[:, 0]
        self._sse = self._clusterAssment[:, 1].sum()
        self._centroids = np.asarray(centList)

    def predict(self, X):
        """Return, per row of ``X``, the index of the nearest fitted centroid."""
        X = _as_ndarray(X, "X")
        return _nearest_centroid_labels(self, X)
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组0.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组1 - dark.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组1.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组2 - dark.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组2.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组3-dark.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组3.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 81 | 82 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组4 - dark.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 83 | 84 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/map/k-means聚类结果分组4.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 热力图 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 |
16 | 17 | 18 |
19 | 83 | 84 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/不同k值下的平方误差和.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_cluster/result/不同k值下的平方误差和.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/不同k值下的总和方差折线图1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_cluster/result/不同k值下的总和方差折线图1.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/不同k值下的总和方差折线图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_cluster/result/不同k值下的总和方差折线图2.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/聚类结果-单价与建筑面积散点图.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_cluster/result/聚类结果-单价与建筑面积散点图.png -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_cluster/result/聚类结果-单价与建筑面积散点图2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_cluster/result/聚类结果-单价与建筑面积散点图2.png 
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 23 11:29:37 2016
Run Kmeans classifier

Loads the cleaned listings with their coordinates, plots the SSE for several
values of k, clusters with the chosen k, prints per-cluster means, draws the
cluster scatter plots and writes one map data file per cluster.

@author: liudiwei
"""

import sys
sys.path.append("data_cluster")

import pandas as pd
import numpy as np
from kmeans import KMeansClassifier
import matplotlib.pyplot as plt


# Use a font that can render the Chinese labels.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Keep the minus sign readable with that font.
plt.rcParams['axes.unicode_minus'] = False


def loadDataset():
    """Load the listings joined with their coordinates as a float ndarray.

    Returns an array whose columns are, in order: id, total, unitPriceValue,
    jzmj, lat, lng.  Rows with missing values, with jzmj >= 500 or with
    total >= 3000 are dropped.
    """
    filename = "data_file\\ershoufang-clean-utf8-v1.1.csv"
    # Column names replace the header row of the file (which is skipped below).
    names = [
        "id", "communityName", "areaName", "total", "unitPriceValue",
        "fwhx", "szlc", "jzmj", "hxjg", "tnmj",
        "jzlx", "fwcx", "jzjg", "zxqk", "thbl",
        "pbdt", "cqnx", "gpsj", "jyqs", "scjy",
        "fwyt", "fwnx", "cqss", "dyxx", "fbbj",
    ]
    # Markers that are read as missing values.
    miss_value = ["null", "暂无数据"]
    df = pd.read_csv(filename, skiprows=[0], names=names, na_values=miss_value)

    # Attach latitude/longitude by listing id.
    df_latlng = pd.read_csv("data_file\\latlng.csv", skiprows=[0],
                            names=["did", "communityName", "id", "lat", "lng"])
    df_latlng = df_latlng[["id", "lat", "lng"]]
    df_merge = pd.merge(df, df_latlng, on="id")

    data_cluster = df_merge[["id", "total", "unitPriceValue", "jzmj", "lat", "lng"]]
    data_cluster = data_cluster.dropna()

    # Drop outliers.
    data_cluster = data_cluster.loc[data_cluster["jzmj"] < 500]
    data_cluster = data_cluster.loc[data_cluster["total"] < 3000]

    # The builtin float replaces np.float, which NumPy 1.24 removed.
    return np.array(data_cluster).astype(float)


# ---- 1. Load data ----
data_X = loadDataset()


# ---- 2. SSE for a range of k, to help choose k ----
k_values = [2, 3, 4, 5, 6, 7, 8, 9, 10]
# One measured SSE per k, so the list always lines up with k_values.
sse_values = []
for k in k_values:
    clf = KMeansClassifier(k)
    clf.fit(data_X)
    sse_values.append(int(clf._sse))

sse_df = pd.DataFrame({"sse": sse_values}, index=k_values)
sse_df.index.name = ""

fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(111)
ax.set_ylabel("SSE", fontsize=14)
ax.set_title("不同k值下的SSE(Sum of squared errors)平方误差和", fontsize=18)
sse_df.plot(kind="line", fontsize=12, grid=True, marker="o", ax=ax)


# ---- 3. Cluster with the chosen k and summarise ----
k = 5

clf = KMeansClassifier(k)
clf.fit(data_X)
cents = clf._centroids
labels = clf._labels
sse = clf._sse

data_result = []  # samples of each cluster (ndarray)
result_mean = []  # column means of each cluster, truncated to int
data_df = []      # samples of each cluster (DataFrame)
colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']

columns = ["id", "total", "unitprice", "jzmj", "lat", "lng"]
for i in range(k):
    data_i = data_X[labels == i]
    data_result.append(data_i)
    result_mean.append([int(v) for v in data_i.mean(axis=0)])
    data_df.append(pd.DataFrame(data_i, columns=columns))

print(" k-means算法统计结果")
print(" 分组 总价(万) 单价(元/平米) 建筑面积(平米) 总计")
for gr, means in enumerate(result_mean):
    print(" " + str(gr) + " " + str(means[1]) + " " + str(means[2]) + " "
          + str(means[3]) + "\t\t" + str(len(data_df[gr])))


# ---- 4/5. Scatter plots of the clusters: unit price and total price vs area ----
area_ticks = [0, 50, 100, 150, 200, 250, 300, 400, 500]
scatter_specs = (
    ("unitprice", "单价与建筑面积散点图", "单价(元/㎡)"),
    ("total", "总价与建筑面积散点图", "总价(万元)"),
)
for y_col, title, y_label in scatter_specs:
    fig = plt.figure(figsize=(12, 7))
    ax = fig.add_subplot(111)
    ax.set_title(title, fontsize=18)
    # One series per cluster, whatever k is.
    for idx, cluster_df in enumerate(data_df):
        cluster_df.plot(x="jzmj", y=y_col, kind="scatter", label=str(idx),
                        color=colors[idx % len(colors)], fontsize=12, ax=ax,
                        alpha=0.4, xticks=area_ticks, xlim=[0, 600])
    ax.set_xlabel("建筑面积(㎡)", fontsize=14)
    ax.set_ylabel(y_label, fontsize=14)


# ---- 6. Write one map data file per cluster ----
for count, data_map in enumerate(data_df):
    out_map = "data_cluster\\result\\map\\cluster" + str(count) + ".js"
    with open(out_map, "w") as file_out:
        for lng, lat, price in zip(data_map["lng"], data_map["lat"], data_map["total"]):
            out = '{"lng":' + str(lng) + ',"lat":' + str(lat) + ',"count":' + str(price) + '},'
            file_out.write(out)
            file_out.write("\n")
-------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/ershoufang-clean-ansi-v1.1.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_file/ershoufang-clean-ansi-v1.1.csv -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/ershoufang-mini-ansi.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_file/ershoufang-mini-ansi.csv -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/ershoufang-mini-utf8.csv: -------------------------------------------------------------------------------- 1 | id,小区名称,所在区域,总价,单价,房屋户型,所在楼层,建筑面积,户型结构,套内面积,建筑类型,房屋朝向,建筑结构,装修情况,梯户比例,配备电梯,产权年限,挂牌时间,交易权属,上次交易,房屋用途,房屋年限,产权所属,抵押信息,房本备件 2 | 1,东方兰园,栖霞,399,32178,3室2厅1厨2卫,低楼层 (共21层),124,暂无数据,暂无数据,板楼,东 南 北,钢混结构,毛坯,两梯四户,有,70年,2018-03-31,商品房,暂无数据,普通住宅,暂无数据,共有,有抵押 140万元 交通银行 客户偿还,未上传房本照片 3 | 2,金陵御沁园,鼓楼,680,34445,5室2厅1厨3卫,高楼层 (共19层),197.42,跃层,162.64,板塔结合,南 北,钢混结构,其他,两梯两户,有,70年,2017-12-21,商品房,2005-06-01,普通住宅,满五年,共有,暂无数据,已上传房本照片 4 | 3,北苑一村,玄武,152,29682,2室1厅1厨1卫,高楼层 (共6层),51.21,暂无数据,45.7,板楼,南 北,钢混结构,其他,一梯三户,无,70年,2017-09-09,房改房,2017-10-26,普通住宅,未满两年,共有,暂无数据,已上传房本照片 5 | 4,诚品城,栖霞,230,25556,2室2厅1厨1卫,高楼层 (共6层),90,暂无数据,暂无数据,板楼,南,钢混结构,精装,一梯两户,无,70年,2017-12-23,商品房,暂无数据,普通住宅,暂无数据,共有,有抵押 客户偿还,未上传房本照片 6 | 5,亚东观云国际,雨花台,315,39019,2室1厅1厨1卫,中楼层 (共28层),80.73,暂无数据,58.97,塔楼,南,钢混结构,精装,两梯三户,有,70年,2017-08-21,商品房,2012-05-21,普通住宅,满五年,共有,有抵押 40万元,已上传房本照片 7 | 19327,鼎业国际花园,浦口,280,31226,2室1厅1厨1卫,高楼层 (共18层),89.67,平层,72.44,塔楼,南,钢混结构,简装,两梯两户,有,70年,2017-12-30,商品房,2012-04-05,商业办公类,满五年,共有,有抵押 30万元,已上传房本照片 8 | 
19328,雅居乐花园,秦淮,760,42566,3室2厅1厨2卫,中楼层 (共25层),178.55,暂无数据,138.06,板塔结合,南 北,钢混结构,其他,两梯两户,有,70年,2017-12-10,商品房,2010-05-24,普通住宅,满五年,共有,暂无数据,已上传房本照片 9 | 19735,安如村,建邺,160,31342,2室1厅1厨1卫,低楼层 (共6层),51.05,平层,暂无数据,板楼,南 北,钢混结构,其他,一梯两户,无,70年,2017-11-30,房改房,2010-11-01,普通住宅,满五年,非共有,暂无数据,已上传房本照片 10 | 19736,诚品城,栖霞,7,5677,null,地下室 (共1层),null,null,null,null,南 北,null,null,null,null,null,2017-07-21,商品房,2012-07-05,车库,满五年,共有,暂无数据,已上传房本照片 11 | 19801,山水华门浅水湾苑,江宁,560,41177,3室2厅1厨2卫,低楼层 (共4层),136,null,暂无数据,null,南 北,钢混结构,其他,null,null,70年,2018-03-01,商品房,暂无数据,别墅,暂无数据,暂无数据,暂无数据,未上传房本照片 12 | -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/ershoufang-origin-ansi.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_file/ershoufang-origin-ansi.csv -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/latlng - 副本.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/data_file/latlng - 副本.csv -------------------------------------------------------------------------------- /数据分析程序/data_analysis/data_file/testv1.csv: -------------------------------------------------------------------------------- 1 | id,小区名称,所在区域,总价,单价,房屋户型,所在楼层,建筑面积,户型结构,套内面积,建筑类型,房屋朝向,建筑结构,装修情况,梯户比例,配备电梯,产权年限,挂牌时间,交易权属,上次交易,房屋用途,房屋年限,产权所属,抵押信息,房本备件 2 | 1,东方兰园,栖霞,399,32178元/平米,3室2厅1厨2卫,低楼层 (共21层),124㎡,暂无数据,暂无数据,板楼,东 南 北,钢混结构,毛坯,两梯四户,有,70年,2018-03-31,商品房,暂无数据,普通住宅,暂无数据,共有,有抵押 140万元 交通银行 客户偿还,未上传房本照片 3 | 2,金陵御沁园,鼓楼,680,34445元/平米,5室2厅1厨3卫,高楼层 (共19层),197.42㎡,跃层,162.64㎡,板塔结合,南 北,钢混结构,其他,两梯两户,有,70年,2017-12-21,商品房,2005-06-01,普通住宅,满五年,共有,暂无数据,已上传房本照片 4 | 3,北苑一村,玄武,152,29682元/平米,2室1厅1厨1卫,高楼层 
(共6层),51.21㎡,暂无数据,45.7㎡,板楼,南 北,钢混结构,其他,一梯三户,无,70年,2017-09-09,房改房,2017-10-26,普通住宅,未满两年,共有,暂无数据,已上传房本照片 5 | 4,诚品城,栖霞,230,25556元/平米,2室2厅1厨1卫,高楼层 (共6层),90㎡,暂无数据,暂无数据,板楼,南,钢混结构,精装,一梯两户,无,70年,2017-12-23,商品房,暂无数据,普通住宅,暂无数据,共有,有抵押 客户偿还,未上传房本照片 6 | 5,亚东观云国际,雨花台,315,39019元/平米,2室1厅1厨1卫,中楼层 (共28层),80.73㎡,暂无数据,58.97㎡,塔楼,南,钢混结构,精装,两梯三户,有,70年,2017-08-21,商品房,2012-05-21,普通住宅,满五年,共有,有抵押 40万元,已上传房本照片 7 | 16,秦淮绿洲东苑,江宁,580,30052元/平米,4室2厅1厨3卫,中楼层 (共4层),193㎡,暂无数据,南 北,钢混结构,其他,叠拼,70年,null,null,null,2017-03-02,商品房,2017-10-31,别墅,未满两年,非共有,暂无数据,已上传房本照片 8 | 17,后标营28号,秦淮,260,35379元/平米,2室2厅1厨1卫,低楼层 (共7层),73.49㎡,平层,63.64㎡,板塔结合,南 北,钢混结构,其他,一梯两户,无,70年,2018-03-24,商品房,2016-01-26,普通住宅,满两年,非共有,无抵押,已上传房本照片 9 | 18,中南世纪雅苑,栖霞,206,23860元/平米,3室2厅1厨1卫,中楼层 (共34层),86.34㎡,暂无数据,63.48㎡,塔楼,南 北,钢混结构,毛坯,一梯四户,暂无数据,70年,2017-12-18,商品房,2017-03-08,普通住宅,未满两年,共有,有抵押 业主自还,已上传房本照片 10 | 19,春江新城新河苑二期,雨花台,178,21724元/平米,3室1厅1厨1卫,中楼层 (共18层),81.94㎡,平层,77.15㎡,板楼,南 北,钢混结构,其他,两梯四户,有,70年,2018-03-26,房改房,2017-02-22,普通住宅,未满两年,共有,暂无数据,已上传房本照片 11 | 20,江雁山水雅苑,鼓楼,363,31071元/平米,3室2厅1厨2卫,低楼层 (共6层),116.83㎡,平层,105.9㎡,板楼,南 北,钢混结构,其他,一梯两户,无,70年,2018-01-17,商品房,2008-07-02,普通住宅,满五年,非共有,暂无数据,已上传房本照片 12 | 21,枫林湾,江宁,247,31081元/平米,2室2厅1厨1卫,高楼层 (共18层),79.47㎡,平层,64.54㎡,塔楼,南,钢混结构,其他,两梯四户,有,70年,2017-10-31,商品房,2015-11-17,普通住宅,满两年,共有,暂无数据,已上传房本照片 13 | 22,杨将军巷,玄武,270,52418元/平米,2室1厅1厨1卫,低楼层 (共6层),51.51㎡,平层,1㎡,板楼,南 北,砖混结构,其他,一梯三户,无,70年,2017-12-26,房改房,2013-06-25,普通住宅,满两年,非共有,暂无数据,已上传房本照片 14 | 23,怡景花园,鼓楼,558,76966元/平米,2室1厅1厨1卫,中楼层 (共26层),72.5㎡,平层,暂无数据,塔楼,南,砖混结构,其他,两梯六户,有,70年,2018-01-02,商品房,2018-01-06,普通住宅,未满两年,共有,暂无数据,已上传房本照片 15 | 24,清江花苑圆梦园,鼓楼,650,37879元/平米,4室2厅1厨2卫,高楼层 (共27层),171.6㎡,平层,138.35㎡,板楼,南 北,钢混结构,精装,两梯五户,有,70年,2018-03-22,商品房,2006-12-22,普通住宅,满五年,共有,无抵押,已上传房本照片 16 | 25,汇金新天地,江宁,145,25605元/平米,1室1厅1厨1卫,中楼层 (共18层),56.63㎡,平层,暂无数据,塔楼,南,钢混结构,其他,一梯二十二户,有,65年,2018-01-02,商品房,2017-01-03,普通住宅,未满两年,非共有,暂无数据,已上传房本照片 17 | 26,汇景家园景泰苑,秦淮,203,30043元/平米,2室1厅1厨1卫,中楼层 (共11层),67.57㎡,平层,54.76㎡,板塔结合,南,钢混结构,其他,一梯六户,暂无数据,70年,2017-12-20,商品房,2012-12-27,普通住宅,满五年,非共有,暂无数据,已上传房本照片 18 | 
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 15 22:47:58 2018

@author: zhangying

Scratch script that exercises basic matplotlib, pyecharts, pandas and numpy
calls. Each section is independent and shows its own figure or prints its own
result.
"""

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
# pandas is used by the DataFrame/groupby sections below; the original script
# never imported it, so those sections failed with NameError.
import pandas as pd


def f(t):
    """Return the damped cosine exp(-t) * cos(2*pi*t) for scalar or array t."""
    return np.exp(-t) * np.cos(2 * np.pi * t)


# --- Two stacked subplots: damped cosine on top, plain cosine below ---------
a = np.arange(0, 5, 0.02)
plt.subplot(211)
plt.plot(a, f(a))
plt.subplot(2, 1, 2)
plt.plot(a, np.cos(2 * np.pi * a), "r--")
plt.show()

# --- Chinese axis labels -----------------------------------------------------
# Set a CJK-capable default font; the x label overrides it per call.
matplotlib.rcParams["font.family"] = "STSong"
matplotlib.rcParams["font.size"] = 20

a = np.arange(0.0, 5.0, 0.02)

plt.xlabel("横轴:时间", fontproperties="SimHei", fontsize=20)
plt.ylabel("纵轴:振幅")
plt.plot(a, np.cos(2 * np.pi * a), "r--")
plt.show()

# --- Pie chart ---------------------------------------------------------------
labels = "forgs", "hogs", "dogs", "logs"
print(labels)
sizes = [15, 30, 45, 10]
explode = (0, 0.1, 0, 0)
plt.pie(sizes, explode=explode, labels=labels, autopct="%1.1f%%",
        shadow=False, startangle=90)
plt.axis("equal")
plt.show()

# --- Histogram ---------------------------------------------------------------
np.random.seed(0)
mu, sigma = 100, 20
a = np.random.normal(mu, sigma, size=100)

# `normed` was removed from Matplotlib; `density=True` is its replacement and
# normalises the histogram so that its area is 1.
plt.hist(a, 10, density=True, histtype="stepfilled", facecolor="b", alpha=0.75)
plt.show()

# --- pyecharts bar chart -----------------------------------------------------
# NOTE(review): `from pyecharts import Bar` and `bar.add(name, x, y)` look like
# the pre-1.0 pyecharts API -- confirm the installed version before running.
from pyecharts import Bar

bar = Bar("我的第一个图表", "这里是副标题")
bar.add("服装", ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"], [5, 20, 36, 10, 75, 90])
bar.print_echarts_options()
bar.render()

# --- pandas box plot ---------------------------------------------------------
df1 = pd.DataFrame(np.arange(12.).reshape(3, 4), columns=list("abcd"))
df1.plot(kind="box", fontsize=12)

# --- Adding a Series as a column, then grouping ------------------------------
data1 = {"key": ["a", "a", "b", "b"], "name": [1, 2, 3, 4]}
frame1 = pd.DataFrame(data1)
seri = pd.Series([1, 1, 1, 1])
seri.name = "name2"
frame1[seri.name] = seri

g = frame1["name"].groupby(frame1["key"])
g.size()

# Pivot the groups into columns: the first group creates the frame, every
# later group is attached as a new column named after its key.
flag = 1

for f1, f2 in g:
    if flag == 1:
        re = pd.DataFrame(list(f2), columns=[f1])
        print(f1)
        print(f2)
        flag = 2
        continue
    re[f1] = list(f2)
    print(f1)
    print(f2)


# --- numpy: squared Euclidean distance and nonzero() indexing ----------------
data1 = [0, 1, 2, 3]
data2 = [2, 3, 4, 5]
arr1 = np.array(data1)
arr2 = np.array(data2)
np.sum(np.power(arr2 - arr1, 2))

data3 = [[0, 1, 2, 3], [2, 3, 4, 5], [2, 3, 4, 5]]
arr3 = np.array(data3)
index_a = arr3[:, 1:3]
value = np.nonzero(index_a == 2)
https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/resources/HYQiHei-25J.ttf -------------------------------------------------------------------------------- /数据分析程序/data_analysis/resources/house1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/resources/house1.jpg -------------------------------------------------------------------------------- /数据分析程序/data_analysis/resources/house2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/resources/house2.jpg -------------------------------------------------------------------------------- /数据分析程序/data_analysis/resources/simhei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据分析程序/data_analysis/resources/simhei.ttf -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/.spyproject/codestyle.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/.spyproject/encoding.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/.spyproject/vcs.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | 
use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.1.0 7 | 8 | -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/.spyproject/workspace.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.1.0 9 | recent_files = ['C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\lianjia\\html_parser.py', 'C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\lianjia\\spider_main.py', 'C:\\Users\\zhangying\\Desktop\\temp\\毕业设计\\data_analysis\\lianjia\\html_downloader.py'] 10 | 11 | -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/__pycache__/html_downloader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/__pycache__/html_downloader.cpython-36.pyc -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/__pycache__/html_outputer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/__pycache__/html_outputer.cpython-36.pyc -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/__pycache__/html_parser.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/__pycache__/html_parser.cpython-36.pyc -------------------------------------------------------------------------------- 
class HtmlDownloader():
    """Page downloader: fetches a URL using a randomly chosen User-Agent."""

    # Pool of browser User-Agent strings; one is picked at random per request.
    # The first entry was missing its closing parenthesis in the original list.
    USER_AGENTS = (
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; InfoPath.2; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; 360SE) ",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0) ",
        "Mozilla/5.0 (Windows NT 5.1; zh-CN; rv:1.9.1.3) Gecko/20100101 Firefox/8.0",
        "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; TencentTraveler 4.0; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36",
    )

    def __init__(self):
        """Create the module logger and the per-instance User-Agent pool."""
        self.log = MyLog("html_downloader","logs")
        self.user_agent = list(self.USER_AGENTS)

    def _build_headers(self):
        """Return the request headers with a freshly randomised User-Agent."""
        return {
            "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            # "br" is deliberately not advertised: requests can only decode
            # brotli when an optional brotli package is installed, otherwise
            # r.text would be undecoded garbage.
            "Accept-Encoding":"gzip, deflate",
            "Accept-Language":"zh-CN,zh;q=0.9",
            "Connection":"keep-alive",
            "Cache-Control":"max-age=0",
            # No explicit "Host": requests derives it from the URL, which is
            # identical for nj.lianjia.com URLs and correct for any other host.
            "User-Agent":random.choice(self.user_agent),
        }

    def download(self, url, timeout=30):
        """Download a page and return its decoded text.

        Args:
            url: Address of the page to fetch; ``None`` is rejected.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the crawl forever.

        Returns:
            The response body as text, or ``None`` when ``url`` is ``None`` or
            the response status is not 200.

        Raises:
            requests.RequestException: On connection errors or timeouts; the
                caller is expected to catch it and back off.
        """
        if url is None:
            self.log.logger.error("页面下载:url为空!!!")
            return None

        r = requests.get(url, headers=self._build_headers(), timeout=timeout)

        if r.status_code != 200:
            self.log.logger.error("页面下载:响应错误:%d"% r.status_code)
            return None

        self.log.logger.info("页面下载:成功!")
        print("页面下载:成功!")
        return r.text
class HtmlParser():
    """HTML parsing module for second-hand-house listing and detail pages."""

    # Number of CSV columns reserved for the "base" and "transaction" blocks.
    BASE_FIELD_COUNT = 12
    TRANSACTION_FIELD_COUNT = 8

    def __init__(self):
        """Create the module logger."""
        self.log = MyLog("html_parser","logs")

    @staticmethod
    def _make_soup(html_cont):
        """Parse markup with html.parser.

        ``from_encoding`` is only meaningful for bytes; BeautifulSoup ignores
        it (with a warning) for already-decoded text, so it is passed only
        when the input is bytes.
        """
        if isinstance(html_cont, bytes):
            return BeautifulSoup(html_cont, "html.parser", from_encoding="utf-8")
        return BeautifulSoup(html_cont, "html.parser")

    @staticmethod
    def _find_path(node, *steps):
        """Apply successive ``find(name, attrs)`` calls, stopping at a miss.

        Args:
            node: Starting element (may be ``None``).
            *steps: ``(tag_name, attrs_or_None)`` pairs, applied in order.

        Returns:
            The element reached after the last step, or ``None`` as soon as
            any step finds nothing.
        """
        for name, attrs in steps:
            if node is None:
                return None
            node = node.find(name, attrs or {})
        return node

    def get_ershoufang_data(self, html_cont, id):
        """Extract one CSV row from a second-hand-house detail page.

        Args:
            html_cont: Page markup (text or bytes); ``None`` is rejected.
            id: Sequence number stored as the first column of the row.

        Returns:
            A list: id, community name, area name, total price, unit price,
            the base-information items (padded with ``"null"`` up to 12) and
            the transaction items (padded with ``"null"`` up to 8). ``None``
            when ``html_cont`` is ``None``.

        Raises:
            AttributeError: When the page has no ``introContent`` section;
                the caller treats that as an unparseable page.
        """
        if html_cont is None:
            self.log.logger.error("页面解析(detail):传入页面为空!")
            print("页面解析(detail):传入页面为空!")
            return

        bsObj = self._make_soup(html_cont)

        # The original chained .find().find() raised AttributeError when an
        # outer element was missing, so the "null" fallback was unreachable.
        header_fields = (
            ("communityName", self._find_path(
                bsObj, ("div", {"class":"communityName"}), ("a", None))),
            ("areaName", self._find_path(
                bsObj, ("div", {"class":"areaName"}),
                ("span", {"class":"info"}), ("a", None))),
            ("total", bsObj.find("span",{"class":"total"})),
            ("unitPriceValue", bsObj.find("span",{"class":"unitPriceValue"})),
        )

        ershoufang_data = [id]
        for label, tag in header_fields:
            if tag is not None:
                ershoufang_data.append(tag.get_text())
            else:
                # Names the tag that is actually missing (the unit-price
                # branch used to report "total").
                self.log.logger.error("页面解析(detail):找不到" + label + "标签!")
                ershoufang_data.append("null")

        # Base information: drop the <span> label of each item, keep its value.
        # NOTE(review): pages with more than 12 items still append all of
        # them, which shifts the later columns -- confirm the cleaning step
        # handles such rows.
        base_items = (bsObj.find("div",{"class":"introContent"})
                      .find("div",{"class":"base"})
                      .find("div",{"class":"content"}).ul.findAll("li"))
        for item in base_items:
            for label_span in item("span"):
                label_span.extract()
            ershoufang_data.append(item.get_text())
        ershoufang_data.extend(
            ["null"] * max(0, self.BASE_FIELD_COUNT - len(base_items)))

        # Transaction information: the value is taken from the second sibling
        # after the label <span> of each item.
        transaction_items = (bsObj.find("div",{"class":"introContent"})
                             .find("div",{"class":"transaction"})
                             .find("div",{"class":"content"}).ul.findAll("li"))
        for item in transaction_items:
            ershoufang_data.append(
                item.span.next_sibling.next_sibling.get_text())
        ershoufang_data.extend(
            ["null"] * max(0, self.TRANSACTION_FIELD_COUNT - len(transaction_items)))

        self.log.logger.info("2.3 页面解析(detail):页面解析成功!")
        print("2.3 页面解析(detail):页面解析成功!")
        return ershoufang_data

    def get_erhoufang_urls(self,html_cont):
        """Collect the detail-page links found on one listing page.

        Args:
            html_cont: Listing page markup; ``None`` is rejected.

        Returns:
            A set of href strings (empty when the list element is missing),
            or ``None`` when ``html_cont`` is ``None``.
        """
        if html_cont is None:
            self.log.logger.error("页面解析(page):pg页面为空!")
            print("页面解析(page):pg页面为空!")
            return

        ershoufang_urls = set()
        bsObj = self._make_soup(html_cont)

        sellListContent = bsObj.find("ul",{"class":"sellListContent"})
        if sellListContent is None:
            self.log.logger.error("页面解析(page):找不到sellListContent标签!")
            return ershoufang_urls

        # Only direct child *tags* are inspected: iterating .children also
        # yields text nodes, and indexing ["class"] on those (or on tags with
        # no class attribute) raised and discarded the whole page.
        for child in sellListContent.find_all(True, recursive=False):
            classes = child.get("class") or []
            if not classes or classes[0] != "clear":
                continue
            link = child.a
            if link is None or not link.get("href"):
                continue
            ershoufang_urls.add(link["href"])
            self.log.logger.info(link["href"])

        self.log.logger.info("1.3 PG页面解析:pg页面解析成功!")
        print("1.3 页面解析:pg页面解析成功!")
        return ershoufang_urls
-------------------------------------------------------------------------------- /数据爬虫程序/lianjia/logs/2018-04-01 log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/logs/2018-04-01 log.txt -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/logs/2018-04-02 log.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/logs/2018-04-02 log.txt -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/logs/header.txt: -------------------------------------------------------------------------------- 1 | Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8 2 | Accept-Encoding:gzip, deflate, br 3 | Accept-Language:zh-CN,zh;q=0.9 4 | Cache-Control:max-age=0 5 | Connection:keep-alive 6 | Cookie:mediav=%7B%22eid%22%3A%22202234%22%2C%22ep%22%3A%22%22%2C%22vid%22%3A%22Rp%2FK%60J(nC%3A9x%5BjrdJG4f%22%2C%22ctn%22%3A%22%22%7D; lianjia_uuid=fbeca14d-3e22-4236-9c9d-6906454cebfc; UM_distinctid=160c556503980c-0408507c0709e3-5b452a1d-149c48-160c556503a6ec; _smt_uid=5a4f30b8.3c5a2875; _ga=GA1.2.2018772658.1515139259; gr_user_id=2d9c2cbc-1863-40d1-93d4-7022767f2408; Hm_lvt_efa595b768cc9dc7d7f9823368e795f1=1521447859,1521700256; select_city=320100; all-lj=4e545f7bdde616c69ccf8a2d569e0bcd; Hm_lvt_9152f8221cb6243a53c83b956842be8a=1521687457,1521687565,1521700248,1522509750; _qzjc=1; CNZZDATA1253492138=816543875-1515138568-null%7C1522509106; _jzqc=1; _jzqy=1.1515139256.1522509751.1.jzqsr=baidu|jzqct=%E9%93%BE%E5%AE%B6%E7%BD%91.-; _jzqckmp=1; _gid=GA1.2.333382155.1522509763; Qs_lvt_200116=1521609881%2C1521632988%2C1521687456%2C1521687563%2C1522509782; 
CNZZDATA1255604082=1360458173-1515137064-null%7C1522510617; lianjia_ssid=2ab7d8a4-62b5-9067-67e6-705651dd1d18; _jzqa=1.4092991625161444400.1515139256.1522509751.1522513539.23; _jzqx=1.1515483145.1522513539.10.jzqsr=google%2Ecom|jzqct=/.jzqsr=nj%2Elianjia%2Ecom|jzqct=/ershoufang/; CNZZDATA1254525948=1765417420-1515134075-null%7C1522511992; CNZZDATA1255633284=143103637-1515134518-null%7C1522512583; Qs_pv_200116=542278243360891260%2C1591616573797491200%2C1847829454696899000%2C2776309514309280300%2C1495710598367565300; _qzja=1.1243235164.1515139256445.1522509750365.1522513538701.1522513744322.1522514117038.0.0.0.334.23; _qzjb=1.1522513538701.6.0.0.0; _qzjto=6.1.0; _jzqb=1.6.10.1522513539.1; Hm_lpvt_9152f8221cb6243a53c83b956842be8a=1522514119; _gat=1; _gat_past=1; _gat_global=1; _gat_new_global=1; _gat_dianpu_agent=1 7 | Host:nj.lianjia.com 8 | Upgrade-Insecure-Requests:1 9 | User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36 -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/output/ershoufang - 20000.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/output/ershoufang - 20000.csv -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/output/ershoufang - 副本.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangyinghahaha/data_analysis/b4a1ef214f4b944a7e86fb0173dc36f8c92d91a1/数据爬虫程序/lianjia/output/ershoufang - 副本.csv -------------------------------------------------------------------------------- /数据爬虫程序/lianjia/output/ershoufang-10000.csv: -------------------------------------------------------------------------------- 
class SpiderMain():
    """Main crawler module: gathers detail-page URLs, then scrapes each one."""

    # Listing sub-areas mapped to the number of result pages fetched for each.
    # For a quick trial run, shrink this to e.g. {"gulou": 1}.
    AREAS = {
        "gulou":100, "jianye":72, "qinhuai":100,
        "xuanwu":67, "yuhuatai":32, "qixia":62,
        "baijiahu":33, "chalukou1":26, "jiangningqita11":3,
        "dongshanzhen":29, "jiangningdaxuecheng":15, "jiulonghu":12,
        "jiangjundadao11":22, "kexueyuan":9, "qilinzhen":42,
        "tiexinqiao":9, "pukou":100, "liuhe":1,
    }

    # Back-off periods, in seconds.
    ERROR_BACKOFF = 60*30      # after a failed download
    LONG_PAUSE = 60*20         # between phases and after every batch
    BATCH_SIZE = 2500          # the record counter value that triggers a long pause

    def __init__(self):
        """Create the collaborating modules."""
        self.urls = UrlManager()
        self.log = MyLog("spider_main","logs")
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.outputer = HtmlOutputer()

    def craw(self,root_url):
        """Run the whole crawl.

        Args:
            root_url: Listing root, e.g. ``https://nj.lianjia.com/ershoufang/``;
                area name and page number are appended to it.
        """
        # 1. Collect the links of all detail pages into the URL manager.
        self._collect_detail_urls(root_url)
        time.sleep(self.LONG_PAUSE)
        # 2. Download, parse and store every detail page.
        self._crawl_detail_pages()

    def _collect_detail_urls(self, root_url):
        """Fetch every listing page of every area and queue its detail links."""
        for area, pg_sum in self.AREAS.items():
            for num in range(1, pg_sum+1):
                # 1.1 Build the page address, e.g. .../ershoufang/gulou/pg2/
                pg_url = root_url + area + "/pg" + str(num) + "/"
                self.log.logger.info("1.1 拼接页面地址:" + pg_url)
                print("1.1 拼接页面地址:" + pg_url)
                # 1.2 Download the listing page.
                try:
                    html_cont = self.downloader.download(pg_url)
                except Exception as e:
                    self.log.logger.error("1.2 下载页面出现异常:" + repr(e))
                    time.sleep(self.ERROR_BACKOFF)
                else:
                    # 1.3 Parse it and queue the detail links it contains.
                    try:
                        ershoufang_urls = self.parser.get_erhoufang_urls(html_cont)
                    except Exception as e:
                        self.log.logger.error("1.3 页面解析出现异常:" + repr(e))
                    else:
                        self.urls.add_new_urls(ershoufang_urls)
                # Pause a whole number of seconds in [0, 3] between requests.
                time.sleep(random.randint(0,3))

    def _store_detail(self, detail_html, record_id):
        """Parse one detail page and write its row; return True on success."""
        # 2.3 Parse the page.
        try:
            ershoufang_data = self.parser.get_ershoufang_data(detail_html, record_id)
        except Exception as e:
            self.log.logger.error("2.3 解析页面出现异常:" + repr(e))
            return False
        if ershoufang_data is None:
            # Nothing was parsed (for example the download returned None), so
            # no row is written and the record id must not advance.
            return False
        # 2.4 Write the row.
        try:
            self.outputer.collect_data(ershoufang_data)
        except Exception as e:
            self.log.logger.error("2.4 输出数据出现异常:" + repr(e))
            return False
        return True

    def _crawl_detail_pages(self):
        """Process queued detail URLs until the URL manager is empty."""
        record_id = 1
        since_pause = 1
        while self.urls.has_new_url():
            # 2.1 Take the next URL. The original handler logged `detail_url`,
            # which is unbound when this very call fails.
            try:
                detail_url = self.urls.get_new_url()
            except Exception as e:
                print("2.1 拼接地址出现异常")
                self.log.logger.error("2.1 拼接地址出现异常:" + repr(e))
                continue
            self.log.logger.info("2.1 二手房页面地址:" + detail_url)
            print("2.1 二手房页面地址:" + detail_url)

            # 2.2 Download the page.
            try:
                detail_html = self.downloader.download(detail_url)
            except Exception as e:
                self.log.logger.error("2.2 下载页面出现异常:" + repr(e))
                # get_new_url() already moved this URL into old_urls and
                # add_new_url() ignores known URLs, so it must be forgotten
                # first or the retry never happens.
                self.urls.old_urls.discard(detail_url)
                self.urls.add_new_url(detail_url)
                time.sleep(self.ERROR_BACKOFF)
                continue

            if self._store_detail(detail_html, record_id):
                print(record_id)
                record_id = record_id + 1
                since_pause = since_pause + 1

            # Pause a whole number of seconds in [0, 3] between requests.
            time.sleep(random.randint(0,3))
            if since_pause == self.BATCH_SIZE:
                since_pause = 1
                time.sleep(self.LONG_PAUSE)


if __name__ == "__main__":
    # Entry URL of the crawl.
    root_url = "https://nj.lianjia.com/ershoufang/"
    # Build the crawler and start it.
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)
class UrlManager():
    """URL bookkeeping: a pending set and an already-handed-out set."""

    def __init__(self):
        """Start with both sets empty."""
        self.new_urls = set()   # URLs waiting to be crawled
        self.old_urls = set()   # URLs already handed out

    def add_new_url(self, url):
        """Queue one URL unless it is None or already known."""
        already_known = url in self.new_urls or url in self.old_urls
        if url is None or already_known:
            return
        self.new_urls.add(url)

    def add_new_urls(self, urls):
        """Queue every URL of a sized collection; None or empty is a no-op."""
        if urls is not None and len(urls) != 0:
            for candidate in urls:
                self.add_new_url(candidate)

    def get_new_url(self):
        """Pop an arbitrary pending URL, mark it as handed out, and return it."""
        picked = self.new_urls.pop()
        self.old_urls.add(picked)
        return picked

    def has_new_url(self):
        """Return True while at least one URL is still pending."""
        return bool(self.new_urls)