├── 00_Python从零开始系列连载 ├── .ipynb_checkpoints │ ├── 01_Python的基本数据类型-checkpoint.ipynb │ ├── 02_Python的基本运算和表达式-checkpoint.ipynb │ ├── 03_Python程序的基本控制流程-checkpoint.ipynb │ ├── 04_Python特色数据类型 (列表)-checkpoint.ipynb │ ├── 05_Python特色数据类型(元组)-checkpoint.ipynb │ ├── 06_Python特色数据类型(字典)-checkpoint.ipynb │ ├── 07_Python特色数据类型(集合)-checkpoint.ipynb │ ├── 08_Python特色数据类型(函数)-checkpoint.ipynb │ ├── 09_Python文件操作-checkpoint.ipynb │ ├── 10_Python异常处理-checkpoint.ipynb │ ├── 11_Python的time模块简单使用-checkpoint.ipynb │ └── 12_Python的random模块简单使用-checkpoint.ipynb ├── 01_Python的基本数据类型.ipynb ├── 02_Python的基本运算和表达式.ipynb ├── 03_Python程序的基本控制流程.ipynb ├── 04_Python特色数据类型 (列表).ipynb ├── 05_Python特色数据类型(元组).ipynb ├── 06_Python特色数据类型(字典).ipynb ├── 07_Python特色数据类型(集合).ipynb ├── 08_Python特色数据类型(函数).ipynb ├── 09_Python文件操作.ipynb ├── 10_Python异常处理.ipynb ├── 11_Python的time模块简单使用.ipynb └── 12_Python的random模块简单使用.ipynb ├── 01_Python进阶系列连载 ├── .ipynb_checkpoints │ ├── 01_那些容易被忽略的问题-checkpoint.ipynb │ ├── 02_迭代器-checkpoint.ipynb │ └── 03_生成器-checkpoint.ipynb ├── 01_那些容易被忽略的问题.ipynb ├── 02_迭代器.ipynb └── 03_生成器.ipynb ├── 02_Python数据结构算法刷题 ├── .ipynb_checkpoints │ └── 01_(3n+1)猜想-checkpoint.ipynb └── 01_(3n+1)猜想.ipynb ├── 03_Python网络爬虫 ├── Python网络爬虫实战项目 │ ├── 01_python爬取电影天堂 │ │ ├── dytt.py │ │ └── 电影天堂.csv │ ├── 02_python爬取斗罗大陆小说 │ │ ├── dldl.py │ │ ├── 斗破苍穹小说.csv │ │ ├── 斗破苍穹小说.py │ │ └── 斗罗大陆小说.csv │ ├── 03_python爬取欧洲足球联赛数据 │ │ └── footballData.py │ ├── 04_python爬取豆瓣电影Top250 │ │ ├── douban_top250_movies.csv │ │ └── filmTop250.py │ ├── 05_python爬取股票数据 │ │ └── stockInfo.py │ ├── 06_python爬取人人贷网数据 │ │ └── peopleLoad.py │ ├── 07_python爬取创业邦创投库 │ │ ├── python爬取创业邦创投库.py │ │ └── resultsDatas.csv │ ├── 08_python抓取美团网百万商家信息 │ │ ├── meituan.csv │ │ └── python抓取美团网百万商家信息.py │ ├── 09_python爬取网易云音乐评论并把他们存入mysql数据库 │ │ └── python爬取网易云音乐评论并把他们存入mysql数据库.py │ ├── 10_python爬取“网上购物”类APP │ │ ├── apps.csv │ │ ├── python爬取网上购物类APP数据py │ │ └── 网上购物类APP数据分析并展示.py │ ├── 11_python爬取链家网房价信息 │ │ ├── Lianjia_Info_v1.py │ │ ├── Lianjia_Info_v2.py │ │ ├── Lianjia_Info_v3.py │ │ ├── Lianjia_Info_v4.py │ │ ├── Lianjia_Info_v4_analysis.py │ │ ├── lianjia.csv │ │ ├── lianjia_ershou_futian_100.xlsx │ │ └── lianjia_re_v4.csv │ ├── 12_python爬取并分析豆瓣中最新电影的影评(词云显示) │ │ ├── alice_mask.png │ │ ├── alice_mask1.png │ │ ├── python爬取并分析豆瓣中最新电影的影评.py │ │ ├── show_Chinese.png │ │ ├── stopwords.txt │ │ └── 豆瓣影评爬取入库.py │ ├── 13_python爬取豆瓣书籍信息 │ │ ├── books.csv │ │ └── python爬取豆瓣书籍信息.py │ ├── 14_python爬取今日头条信息并导入mongodb数据库 │ │ └── python爬取今日头条信息并导入mongodb数据库.py │ ├── 15_python使用selenium爬取百度招聘内容并存入mongodb数据库 │ │ └── python使用selenium爬取百度招聘内容并入mongodb数据库.py │ ├── 16_python爬取熊猫直播用户信息 │ │ └── python爬取熊猫直播用户信息.py │ ├── 17_scrapy爬取游天下南京短租房信息并存入mongodb数据库 │ │ └── youtxNanJin │ │ │ ├── README.txt │ │ │ ├── scrapy.cfg │ │ │ ├── youtxNanJin │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── youtxNanJin_spider.cpython-36.pyc │ │ │ │ └── youtxNanJin_spider.py │ │ │ ├── 游天下南京.csv │ │ │ └── 游天下南京.json │ ├── 18_scrapy爬取中国医学人才网信息并以json格式保存 │ │ └── chinadoctornet │ │ │ ├── README.txt │ │ │ ├── chinadoctornet │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ 
│ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── chinadoctornet_spider.cpython-36.pyc │ │ │ │ └── chinadoctornet_spider.py │ │ │ ├── scrapy.cfg │ │ │ ├── 中国医学人才网招聘最新招聘专栏.csv │ │ │ └── 中国医学人才网招聘最新招聘专栏.json │ ├── 19_scrapy框架爬取豆瓣电影top250信息 │ │ └── doubanmovie │ │ │ ├── README.txt │ │ │ ├── doubanmovie │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── doubanmovie_spider.cpython-36.pyc │ │ │ │ └── doubanmovie_spider.py │ │ │ ├── items.csv │ │ │ ├── items.json │ │ │ └── scrapy.cfg │ ├── 20_scrapy爬取织梦者网站信息并存入mongodb数据库 │ │ └── makedream │ │ │ ├── makedream │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── makedream_spider.cpython-36.pyc │ │ │ │ └── makedream_spider.py │ │ │ └── scrapy.cfg │ ├── 21_python爬取豆瓣电影前任3评论(词云显示) │ │ ├── ComentsAnaylst.py │ │ ├── ciyun.jpg │ │ ├── ciyun.png │ │ ├── douban.txt │ │ └── douban_qianren3.py │ ├── 22_python爬取Bilibili用户信息并导入mysql数据库 │ │ ├── bilibili_user.py │ │ ├── bilibili_user_info.sql │ │ └── user_agents.txt │ ├── 23_python爬取网易云音乐所有歌曲的评论数 │ │ ├── README.md │ │ ├── album_by_artist.py │ │ ├── artists.py │ │ ├── comments_by_music.py │ │ ├── music_by_album.py │ │ └── sql.py │ ├── 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 │ │ └── findtrip │ │ │ ├── ctrip_items.csv │ │ │ ├── findtrip │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── ctrip_spider.cpython-36.pyc │ │ │ │ ├── qua_spider.cpython-36.pyc │ │ │ │ └── washctrip.cpython-36.pyc │ │ │ │ ├── ctrip_spider.py │ │ │ │ ├── qua_spider.py │ │ │ │ └── washctrip.py │ │ │ ├── qua_items.csv │ │ │ ├── qua_items.json │ │ │ └── scrapy.cfg │ ├── 25_scrapy爬取前程无忧网站python相关的工作信息 │ │ └── pythonjobs │ │ │ ├── PythonJobs.csv │ │ │ ├── pythonjobs │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── job_spider.cpython-36.pyc │ │ │ │ └── job_spider.py │ │ │ └── scrapy.cfg │ ├── 26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 │ │ └── shuimujob │ │ │ ├── ghostdriver.log │ │ │ ├── scrapy.cfg │ │ │ └── shuimujob │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── platform.cpython-36.pyc │ │ │ └── 
settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── platform.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── shuimu_spider.cpython-36.pyc │ │ │ └── shuimu_spider.py │ ├── 27_scrapy爬取南京20000多套二手房信息 │ │ └── nj_house │ │ │ ├── house.csv │ │ │ ├── nj_house │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── lj_house.cpython-36.pyc │ │ │ │ └── lj_house.py │ │ │ └── scrapy.cfg │ └── 28_scrapy爬取链家北京二手房数据 │ │ └── LianJia │ │ ├── LianJia │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── lianjia.cpython-36.pyc │ │ │ └── lianjia.py │ │ ├── lianjia.csv │ │ └── scrapy.cfg └── Python网络爬虫相关函数库介绍 │ ├── .ipynb_checkpoints │ └── 01_requests学习笔记-checkpoint.ipynb │ └── 01_requests学习笔记.ipynb └── README.md /00_Python从零开始系列连载/.ipynb_checkpoints/09_Python文件操作-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "文件是存储在外部介质的数据集合,通常可以长久保存(前提是这个介质不易损坏)\n", 8 | "\n", 9 | "通俗点说,文件就是存放数据的地方\n", 10 | "\n", 11 | "**绝对路径与相对路径**\n", 12 | "\n", 13 | "通常,我们使用电脑的时候,例如编写了一段代码,我们要把这段代码保存,方便下次使用\n", 14 | "\n", 15 | "你可能会把这段代码保存在硬盘某个位置" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "如果在Python中,我们要打开这个文件操作,该怎么操作呢?\n", 23 | "\n", 24 | "所以要打开这个文件操作也需要3个步骤:\n", 25 | "\n", 26 | "1.找出文件存放的路径,打开文件\n", 27 | "\n", 28 | "2.对文件修改操作\n", 29 | "\n", 30 | "3.关闭文件\n", 31 | "\n", 32 | "说到找出文件的存放路径,我们就必须讲讲绝对路径和相对路径的概念\n", 33 | "\n", 34 | "**绝对路径**\n", 35 | "\n", 36 | "绝对路径指的是从最初的硬盘开始一直进入到文件位置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 未完待续!" 
44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.6.6" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 2 77 | } 78 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/.ipynb_checkpoints/11_Python的time模块简单使用-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "今天,我们在Python中演示一下time模块的常用方法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "time.sleep(10)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "0\n", 42 | "1\n", 43 | "2\n", 44 | "3\n", 45 | "4\n", 46 | "5\n", 47 | "6\n", 48 | "7\n", 49 | "8\n", 50 | "9\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "for i in range(0, 10):\n", 56 | " print(i)\n", 57 | " time.sleep(1)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "首先,我们导入time模块\n", 65 | "\n", 66 | "我们要讲的第一个方法就是sleep()方法\n", 67 | "\n", 68 | "sleep就是睡觉休眠的意思,意味着执行的时候,系统休眠等待一会,不做其他操作\n", 69 | "\n", 70 | "当你运行以上代码,会发现隔一段时间打印一个数字\n", 71 | "\n", 72 | "而sleep()的括号中给出休眠时间,单位是秒\n", 73 | "\n", 74 | "常用的time模块下的方法还有:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 未完待续!" 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/09_Python文件操作.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "文件是存储在外部介质的数据集合,通常可以长久保存(前提是这个介质不易损坏)\n", 8 | "\n", 9 | "通俗点说,文件就是存放数据的地方\n", 10 | "\n", 11 | "**绝对路径与相对路径**\n", 12 | "\n", 13 | "通常,我们使用电脑的时候,例如编写了一段代码,我们要把这段代码保存,方便下次使用\n", 14 | "\n", 15 | "你可能会把这段代码保存在硬盘某个位置" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "如果在Python中,我们要打开这个文件操作,该怎么操作呢?\n", 23 | "\n", 24 | "所以要打开这个文件操作也需要3个步骤:\n", 25 | "\n", 26 | "1.找出文件存放的路径,打开文件\n", 27 | "\n", 28 | "2.对文件修改操作\n", 29 | "\n", 30 | "3.关闭文件\n", 31 | "\n", 32 | "说到找出文件的存放路径,我们就必须讲讲绝对路径和相对路径的概念\n", 33 | "\n", 34 | "**绝对路径**\n", 35 | "\n", 36 | "绝对路径指的是从最初的硬盘开始一直进入到文件位置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 未完待续!" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.6.6" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 2 77 | } 78 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/11_Python的time模块简单使用.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "今天,我们在Python中演示一下time模块的常用方法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "time.sleep(10)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "0\n", 42 | "1\n", 43 | "2\n", 44 | "3\n", 45 | "4\n", 46 | "5\n", 47 | "6\n", 48 | "7\n", 49 | "8\n", 50 | "9\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "for i in range(0, 10):\n", 56 | " print(i)\n", 57 | " time.sleep(1)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": 
{}, 63 | "source": [ 64 | "首先,我们导入time模块\n", 65 | "\n", 66 | "我们要讲的第一个方法就是sleep()方法\n", 67 | "\n", 68 | "sleep就是睡觉休眠的意思,意味着执行的时候,系统休眠等待一会,不做其他操作\n", 69 | "\n", 70 | "当你运行以上代码,会发现隔一段时间打印一个数字\n", 71 | "\n", 72 | "而sleep()的括号中给出休眠时间,单位是秒\n", 73 | "\n", 74 | "常用的time模块下的方法还有:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 未完待续!" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /02_Python数据结构算法刷题/.ipynb_checkpoints/01_(3n+1)猜想-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### (3n+1)猜想\n", 8 | "\n", 9 | "又名卡拉兹(Callatz)猜想:\n", 10 | "\n", 11 | "对任何一个自然数n,如果它是偶数,那么把它砍掉一半;如果它是奇数,那么把(3n+1)砍掉一半。这样一直反复砍下去,最后一定在某一步得到n=1。卡拉兹在1950年的世界数学家大会上公布了这个猜想,传说当时耶鲁大学师生齐动员,拼命想证明这个貌似很傻很天真的命题,结果闹得学生们无心学业,一心只证(3n+1),以至于有人说这是一个阴谋,卡拉兹是在蓄意延缓美国数学界教学与科研的进展……\n", 12 | "\n", 13 | "我们今天的题目不是证明卡拉兹猜想,而是对给定的任一不超过1000的正整数n,简单地数一下,需要多少步(砍几下)才能得到n=1?\n", 14 | "\n", 15 | "输入格式:每个测试输入包含1个测试用例,即给出自然数n的值。\n", 16 | "\n", 17 | "输出格式:输出从n计算到1需要的步数。\n", 18 | "\n", 19 | "输入样例:\n", 20 | "3\n", 21 | "\n", 22 | "输出样例:\n", 23 | "5\n", 24 | "\n", 25 | "上代码:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "请输入0-1000内的整数: 99\n", 38 | "18\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "n = int(input('请输入0-1000内的整数: '))\n", 44 | "count = 0 \n", 45 | "while n != 1:\n", 46 | " if n%2 == 0:\n", 47 | " n = n/2\n", 48 | " else:\n", 49 | " n = (3*n+1)/2\n", 50 | " count += 1\n", 51 | "print (count)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "**注意点**:\n", 59 | "\n", 60 | "1. input得到的是字符型,需要强制转换为int\n", 61 | "2. 这里不适合用for循环,因为不知道何时循环结束\n", 62 | "3. 
记得初始化步数i=0,并且每次在循环里自增\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.6" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /02_Python数据结构算法刷题/01_(3n+1)猜想.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### (3n+1)猜想\n", 8 | "\n", 9 | "又名卡拉兹(Callatz)猜想:\n", 10 | "\n", 11 | "对任何一个自然数n,如果它是偶数,那么把它砍掉一半;如果它是奇数,那么把(3n+1)砍掉一半。这样一直反复砍下去,最后一定在某一步得到n=1。卡拉兹在1950年的世界数学家大会上公布了这个猜想,传说当时耶鲁大学师生齐动员,拼命想证明这个貌似很傻很天真的命题,结果闹得学生们无心学业,一心只证(3n+1),以至于有人说这是一个阴谋,卡拉兹是在蓄意延缓美国数学界教学与科研的进展……\n", 12 | "\n", 13 | "我们今天的题目不是证明卡拉兹猜想,而是对给定的任一不超过1000的正整数n,简单地数一下,需要多少步(砍几下)才能得到n=1?\n", 14 | "\n", 15 | "输入格式:每个测试输入包含1个测试用例,即给出自然数n的值。\n", 16 | "\n", 17 | "输出格式:输出从n计算到1需要的步数。\n", 18 | "\n", 19 | "输入样例:\n", 20 | "3\n", 21 | "\n", 22 | "输出样例:\n", 23 | "5\n", 24 | "\n", 25 | "上代码:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "请输入0-1000内的整数: 99\n", 38 | "18\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "n = int(input('请输入0-1000内的整数: '))\n", 44 | "count = 0 \n", 45 | "while n != 1:\n", 46 | " if n%2 == 0:\n", 47 | " n = n/2\n", 48 | " else:\n", 49 | " n = (3*n+1)/2\n", 50 | " count += 1\n", 51 | "print (count)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "**注意点**:\n", 59 | "\n", 60 | "1. input得到的是字符型,需要强制转换为int\n", 61 | "2. 这里不适合用for循环,因为不知道何时循环结束\n", 62 | "3. 
记得初始化步数i=0,并且每次在循环里自增\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.6" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/dytt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取电影天堂最新电影迅雷下载地址链接信息 3 | 所用模块:requests bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | 9 | url = 'https://www.dy2018.com/html/gndy/dyzz/index.html' 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 | } 14 | 15 | items_list = [] 16 | 17 | html = requests.get(url,headers=headers) 18 | html.encoding = 'gb2312' 19 | data = re.findall('.*?',html_1.text) 27 | #print(data_1[0]) 28 | list_1 = [i[1], url_1, data_1[0]] 29 | 30 | # list_1 = [url_1] 31 | 32 | items_list.append(list_1) 33 | #print (list_1) 34 | 35 | #print ('==========================================================================================================') 36 | 37 | for m in range(2, 298): 38 | url_2 = 'https://www.dy2018.com/html/gndy/dyzz/index_'+str(m)+'.html' 39 | print(url_2) 40 | html_2 = requests.get(url_2,headers=headers) 41 | html_2.encoding = 'gb2312' 42 | data_2 = re.findall('.*?',html_3.text) 50 | #print(data_3[0]) 51 | if len(data_3) < 1: 52 | continue 53 | list_2 = [n[1], url_3, data_3[0]] 54 | # list_2 = [url_3] 55 | 56 | 57 | items_list.append(list_2) 58 | #print (list_2) 59 | #print ('=====================================================================================================') 60 | 61 | df = pd.DataFrame(items_list, columns = ['电影名称','电影网址链接','电影迅雷下载链接']) 62 | 63 | df.to_csv('dytt.csv') -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/电影天堂.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/电影天堂.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/02_python爬取斗罗大陆小说/dldl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取斗罗大陆最新章节标题信息 3 | 所用模块:requests re bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | from bs4 import BeautifulSoup #分析网页 获取标签内容 9 | 10 | url = 'https://www.freexs.org/novel/0/896/index.html' 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 14 | } 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url,headers=headers) 19 | html.encoding = 'gb2312' 20 | 21 | data = re.findall('
(.*?)
' 68 | regx = re.compile(reg) 69 | ads = re.findall(regx, str(addresss)) 70 | # print(ads) 71 | # for adds in ads: 72 | # data = adds.split('|') 73 | # print(data) 74 | for itm_url, job_detail, ver_compny, ver_salary, ver_addres in zip(item_url, jobs, compy, salarys, ads): 75 | data = { 76 | 'itme_url': 'http://zhaopin.baidu.com'+itm_url.get('href'), 77 | 'job_detail': job_detail.string, 78 | 'ver_compny': str(ver_compny.string), 79 | 'ver_salary': ver_salary.string, 80 | 'ver_addres': str(ver_addres).split('|'), 81 | } 82 | print(data) 83 | # 插入数据库 84 | ver_job.insert_one(data) # 插入数据库失败 85 | f.write(str(data)) 86 | 87 | 88 | def get_page_source(page_num): 89 | time.sleep(2) 90 | driver.find_element_by_xpath('//*[@id="pagination"]/p/span/a[%s]' % page_num).click() 91 | # //*[@id="pagination"]/p/span/a[1] 为在第一页的按钮 92 | # //*[@id="pagination"]/p/span/a[2] 为第二页的按钮 93 | set_winscroll(driver) 94 | we_data = driver.page_source 95 | return we_data 96 | 97 | f = open('百度招聘前30页杭州.csv', 'a',encoding='utf-8') 98 | # 首页的数据 99 | def getBaiduHangZhouJob(we_data): 100 | parse_html(we_data) 101 | for i in range(1, 50): 102 | if i==1: 103 | we_data = get_page_source(1) 104 | parse_html(we_data) 105 | elif i<=5: 106 | we_data = get_page_source(str(2)) 107 | parse_html(we_data) 108 | else: 109 | we_data = get_page_source(str(3)) 110 | parse_html(we_data) 111 | f.close() 112 | 113 | 114 | if __name__ == '__main__': 115 | getBaiduHangZhouJob(we_data) 116 | # pool = Pool(processes=10) 117 | # pool.map_async(getBaiduHangZhouJob(we_data)) 118 | # pool.close() 119 | # f.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/16_python爬取熊猫直播用户信息/python爬取熊猫直播用户信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d".format(a=range(0,35),b=range(1501946526480,1501946526880)) 8 | 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0' 11 | , 12 | 'Cookie': '__guid=96554777.3243119502220345300.1500627276199.6702; smid=608e0bde-ffe2-4251-90ca-2938cabdc074; monitor_count=18' 13 | , 14 | } 15 | 16 | 17 | def getHtml(url): 18 | req = requests.get(url, headers=headers) 19 | print(req.text) 20 | return req.text 21 | 22 | 23 | def printInfos(data): 24 | jsondata = json.loads(data, "utf-8") 25 | # print(jsondata) 26 | itemsinfo = jsondata['data']['items'] 27 | items_list = [] 28 | for pinfo in itemsinfo: 29 | name = pinfo['name'] 30 | person_num = pinfo['person_num'] 31 | nickName = pinfo['userinfo']['nickName'] 32 | lelvel = pinfo['host_level_info'] 33 | lable = pinfo['label'] 34 | cname = pinfo['classification'] 35 | item_list = [name, person_num, nickName, lelvel, label, cname] 36 | items_list.append(item_list) 37 | df = pd.DataFrame(items_list, columns = ['name','person_num','nickName','host_level_info','label','classification']) 38 | df.to_csv('熊猫直播用户信息.csv') 39 | 40 | 41 | def mainStart(): 42 | for n in range(0, 3): 43 | pageindex = 1 + n 44 | pagetime = int(1501946526480 + n) 45 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d"%(pageindex,pagetime) 46 | data = getHtml(url) 47 | printInfos(data) 48 | 49 | mainStart() -------------------------------------------------------------------------------- 
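A note on python爬取熊猫直播用户信息.py above: inside printInfos() the script assigns `lable` and `lelvel` but builds each row from `label`, so it raises a NameError at runtime, and `df.to_csv()` is called once per page, overwriting the CSV on every call. Below is a minimal corrected sketch; it reuses the URL pattern, headers and response fields from the original script and assumes the (now defunct) Panda TV endpoint returned the same JSON layout, so it is illustrative only.

# Minimal corrected sketch — same endpoint, headers and fields as the original script.
import json

import requests
import pandas as pd

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
}


def fetch_page(pageno, ts):
    # Same query string as the original mainStart(): 120 records per page.
    url = ('https://www.panda.tv/live_lists?status=2&order=person_num'
           '&token=&pageno=%d&pagenum=120&_=%d' % (pageno, ts))
    return requests.get(url, headers=HEADERS, timeout=10).text


def parse_rows(raw):
    # One row per live room: the same six fields the original script collects.
    rows = []
    for info in json.loads(raw)['data']['items']:
        rows.append([
            info['name'],
            info['person_num'],
            info['userinfo']['nickName'],
            info['host_level_info'],
            info['label'],              # one consistent name instead of lable/label
            info['classification'],
        ])
    return rows


def main():
    all_rows = []
    for n in range(3):
        all_rows.extend(parse_rows(fetch_page(1 + n, 1501946526480 + n)))
    df = pd.DataFrame(all_rows, columns=[
        'name', 'person_num', 'nickName', 'host_level_info', 'label', 'classification'])
    df.to_csv('熊猫直播用户信息.csv', index=False)  # write once, after all pages are collected


if __name__ == '__main__':
    main()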
/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl youtx -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl youtx -o items.csv 时以csv格式保存下载数据 3 | 4 | 5 | Scrapy必须背下来的命令: 6 | 1 创建项目: scrapy startproject youtxNanJin 7 | startproject: 表示创建项目 8 | youtxNanJin: 表示创建的项目名 9 | 10 | 2 创建爬虫: scrapy genspider youtx "http://www.youtx.com" 11 | genspider: 表示生成一个爬虫(默认是scrapy.Spider类) 12 | youtx: 表示爬虫名(对应爬虫代码里的 name 参数) 13 | "http://www.youtx.com": 表示允许爬虫爬取的域范围 14 | 15 | 3 执行爬虫: scrapy crawl youtx 16 | crawl: 表示启动一个sc rapy爬虫 17 | youtx: 表示需要启动的爬虫名(对应爬虫代码里的 name 参数) -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = youtxNanJin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtxNanJin 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YoutxnanjinItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 房源名称 17 | homeName = scrapy.Field() 18 | # 房源链接 19 | homeLine = scrapy.Field() 20 | # 房租单价 21 | homeSinglePrice = scrapy.Field() 22 | # 房租地址 23 | homeAddress = scrapy.Field() 24 | # 房租近期信息 25 | homeDetai = scrapy.Field() 26 | # 满七天价格 27 | homeSeven = scrapy.Field() 28 | # 满30天价格 29 | homeThirth = scrapy.Field() 30 | 31 | # 房东 32 | homePerson = scrapy.Field() 33 | # 房东头像 34 | homePersonImg = scrapy.Field() 35 | # 房东头像链接 36 | homePersonLink = scrapy.Field() 37 | 38 | # 房子大图 39 | homePicBg = scrapy.Field() 40 | # 房子大图链接 41 | homePicLink = scrapy.Field() 42 | 43 | # 品牌店铺信息 44 | # homePinPai = scrapy.Field() 45 | # 明星房东 46 | # homeStarrPerson = scrapy.Field() 47 | 48 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YoutxnanjinSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | from scrapy.conf import settings 9 | import pymongo 10 | 11 | 12 | class YoutxnanjinPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class YouTXMongo(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | postItem = dict(item) 25 | self.post.insert(postItem) 26 | return item 27 | 28 | # 写入json文件 29 | class JsonWritePipline(object): 30 | def __init__(self): 31 | self.file = open('游天下南京.json','w',encoding='utf-8') 32 | 33 | def process_item(self,item,spider): 34 | line = json.dumps(dict(item),ensure_ascii=False)+"\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self,spider): 39 | self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtxNanJin project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtxNanJin' 13 | 14 | SPIDER_MODULES = ['youtxNanJin.spiders'] 15 | NEWSPIDER_MODULE = 'youtxNanJin.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "YouTianXia" # 库名 30 | MONGO_COLL = "house_nanjin" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300, 77 | 'youtxNanJin.pipelines.YouTXMongo': 300, 78 | 'youtxNanJin.pipelines.JsonWritePipline': 300, 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/youtxNanJin_spider.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import scrapy 3 | from youtxNanJin.items import YoutxnanjinItem 4 | 5 | class NanJinDefault(scrapy.Spider): 6 | name = 'youtx' 7 | allowed_domains = ['youtx.com'] 8 | start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)] 9 | def parse(self, response): 10 | # print(response.body) 11 | node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']") 12 | # print(node_list) 13 | for node in node_list: 14 | item = YoutxnanjinItem() 15 | homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract() 16 | homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract() 17 | print(homeName) 18 | print(homeLink) 19 | 20 | # 单日价格 21 | homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract() 22 | print(homeSinglePrice) 23 | 24 | # 获取房源地址 25 | homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract() 26 | # 房租信息 27 | homeDesc =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract() 28 | homeDesc2 =node.xpath("./div[@class='houseInfo 
clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/span[2]/text()").extract() 29 | print(homeAddress) 30 | print(homeDesc) 31 | print(homeDesc2) 32 | 33 | # 满30天的信息 34 | homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract() 35 | print(homeThrty) 36 | # 房东信息 37 | homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract() 38 | # 房东链接 39 | homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract() 40 | print(homePerson) 41 | print(homePersonLink) 42 | 43 | # 房源大图图片 44 | homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract() 45 | homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract() 46 | print(homeBigPic) 47 | print(homeBigPicLink) 48 | # 房东头像信息 49 | personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract() 50 | # 房东头像链接地址 51 | personPicLink = node.xpath("./div[@class='house-img']/a[2]/img/@href").extract() 52 | 53 | print(personPic) 54 | print(homePersonLink) 55 | item['homeName'] ="".join(homeName) 56 | item['homeLine'] ="".join(homeLink) 57 | item['homeSinglePrice'] ="".join(homeSinglePrice) 58 | item['homeAddress'] ="".join(homeAddress) 59 | item['homeDetai'] ="".join(homeDesc)+"".join(homeDesc2) 60 | # 这里的值暂时没有取出来 61 | item['homeSeven'] ="".join(homeThrty) 62 | item['homeThirth'] ="".join(homeThrty) 63 | 64 | item['homePerson'] ="".join(homePerson) 65 | item['homePersonImg'] ="".join(personPic) 66 | item['homePersonLink'] ="".join(homePersonLink) 67 | item['homePicBg'] ="".join(homeBigPic) 68 | item['homePicLink'] ="".join(homeBigPicLink) 69 | yield item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl docNet -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl docNet -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ChinadoctornetItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | # 爬取中国医学人才网的条目(共5个条目) 16 | # 医院名称 17 | hospitalName = scrapy.Field() 18 | # 医院规模 19 | hospitalSize = scrapy.Field() 20 | # 医院所在地 21 | hospitalAddress = scrapy.Field() 22 | # 医院科目 23 | hospitalDesc = scrapy.Field() 24 | # pass 25 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ChinadoctornetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import json 8 | 9 | class ChinadoctornetPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | # class JsonWriterPipeline(object): 15 | # def __init__(self): 16 | # self.file = open('中国医学人才网招聘最新招聘专栏2.json', 'w', encoding='utf-8') 17 | 18 | # def process_item(self, item, spider): 19 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | # self.file.write(line) 21 | # return item 22 | 23 | # def spider_closed(self, spider): 24 | # self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for chinadoctornet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'chinadoctornet' 13 | 14 | SPIDER_MODULES = ['chinadoctornet.spiders'] 15 | NEWSPIDER_MODULE = 'chinadoctornet.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'chinadoctornet (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'chinadoctornet.middlewares.ChinadoctornetSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'chinadoctornet.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | # ITEM_PIPELINES = { 68 | # # 'chinadoctornet.pipelines.ChinadoctornetPipeline': 300, 69 | # 'chinadoctornet.pipelines.JsonWritePipline': 300, 70 | # } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
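A note on the chinadoctornet project above: its README relies on Scrapy's feed exports (`scrapy crawl docNet -o items.json` / `-o items.csv`), which is why both the pipeline class in pipelines.py and the ITEM_PIPELINES entry in settings.py are commented out. If they are re-enabled, note that the commented class is named `JsonWriterPipeline` while the commented setting points at `chinadoctornet.pipelines.JsonWritePipline`; the two names must match. A small sketch of a consistent pair, using Scrapy's documented open_spider/close_spider hooks, might look like this:

# pipelines.py — sketch of the JSON-writing pipeline that is commented out above
import json


class JsonWriterPipeline(object):
    def open_spider(self, spider):
        # Opened once when the spider starts.
        self.file = open('中国医学人才网招聘最新招聘专栏.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()


# settings.py — the dotted path must use the same class name as pipelines.py
ITEM_PIPELINES = {
    'chinadoctornet.pipelines.JsonWriterPipeline': 300,
}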
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/chinadoctornet_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from chinadoctornet.items import ChinadoctornetItem 4 | 5 | 6 | class ChinaDocNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'docNet' 9 | # 爬取域名的范围 10 | allowed_domains = ['yixuezp.com'] 11 | # 爬虫第一个url地址 12 | start_urls = ['http://www.yixuezp.com/zhaopin?page={}'.format(n) for n in range(0, 464)] # 463 13 | 14 | def parse(self, response): 15 | # 医院name 16 | node_list = response.xpath("//div[@class='newsjob']/ul/li") 17 | items = [] 18 | for node in node_list: 19 | item = ChinadoctornetItem() 20 | hospitalName = node.xpath("./a/text()").extract() 21 | hospitalSize = node.xpath("./span[1]/text()").extract() 22 | hospitalAddress = node.xpath("./span[2]/text()").extract() 23 | hospitalDesc = node.xpath("./p/a/text()").extract() 24 | 25 | item['hospitalName'] = hospitalName 26 | item['hospitalSize'] = hospitalSize 27 | item['hospitalAddress'] = hospitalAddress 28 | item['hospitalDesc'] = hospitalDesc 29 | items.append(item) 30 | # return items # 如果直接return的话,一页数据只会返回一条数据 31 | yield item #用yield 的话,可以交给下载器,继续执行下一步操作。 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = chinadoctornet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = chinadoctornet 12 | 
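In chinadoctornet_spider.py above, parse() both appends every item to an `items` list and yields each item, so the list is built but never used. The inline comment suggests that returning directly would hand back only one record per page; in Scrapy a callback may return any iterable of items, so returning the complete list after the loop also delivers every item to the engine — the one-record symptom usually comes from a `return item` placed inside the loop, which ends the callback on the first node. Either of the following trimmed forms (alternative bodies for the spider's parse method, same XPaths as the original) is sufficient on its own:

# Form 1: stream items to the engine as they are parsed (what the spider above does).
def parse(self, response):
    for node in response.xpath("//div[@class='newsjob']/ul/li"):
        item = ChinadoctornetItem()
        item['hospitalName'] = node.xpath("./a/text()").extract()
        item['hospitalSize'] = node.xpath("./span[1]/text()").extract()
        item['hospitalAddress'] = node.xpath("./span[2]/text()").extract()
        item['hospitalDesc'] = node.xpath("./p/a/text()").extract()
        yield item

# Form 2: collect the page's items and return them as one iterable.
def parse(self, response):
    items = []
    for node in response.xpath("//div[@class='newsjob']/ul/li"):
        item = ChinadoctornetItem()
        item['hospitalName'] = node.xpath("./a/text()").extract()
        items.append(item)
    return items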
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl doubanMovie -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl doubanMovie -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 电影名字 17 | name = scrapy.Field() 18 | # 电影信息 19 | info = scrapy.Field() 20 | # 评分 21 | rating = scrapy.Field() 22 | # 评论人数 23 | num = 
scrapy.Field() 24 | # 经典语句 25 | quote = scrapy.Field() 26 | # 电影图片 27 | img_url = scrapy.Field() 28 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanmovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanmoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for doubanmovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'doubanmovie' 13 | 14 | SPIDER_MODULES = ['doubanmovie.spiders'] 15 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'doubanmovie.middlewares.DoubanmovieSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'doubanmovie.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'doubanmovie.pipelines.DoubanmoviePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
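Two notes on the doubanmovie project, whose settings end above and whose spider follows below. First, ITEM_PIPELINES is left commented out here, so persistence relies on the feed-export commands given in the project's README (scrapy crawl doubanMovie -o items.json, or -o items.csv for CSV). Second, the spider below reads the next-page link with .extract()[0]; if the last Top250 page carries no "next" link, that indexing raises IndexError before the if next_page check ever runs. A defensive variant, offered only as a sketch and not as the repository's code, is:

# sketch: pagination that tolerates a missing "next" link on the last page
next_page = selector.xpath('//span[@class="next"]/a/@href').extract_first()
if next_page:
    yield scrapy.Request('https://movie.douban.com/top250' + next_page, callback=self.parse)

(The spider also names its domain filter allowed_domain; Scrapy's attribute is allowed_domains, so as written the off-site filter is simply ignored.)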
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/doubanmovie_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from doubanmovie.items import DoubanmovieItem 3 | 4 | class Movie(scrapy.Spider): 5 | # 爬虫唯一标识符 6 | name = 'doubanMovie' 7 | # 爬取域名 8 | allowed_domain = ['movie.douban.com'] 9 | # 爬取页面地址 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | def parse(self, response): 13 | selector = scrapy.Selector(response) 14 | # 解析出各个电影 15 | movies = selector.xpath('//div[@class="item"]') 16 | # 存放电影信息 17 | item = DoubanmovieItem() 18 | 19 | for movie in movies: 20 | 21 | # 电影各种语言名字的列表 22 | titles = movie.xpath('.//span[@class="title"]/text()').extract() 23 | # 将中文名与英文名合成一个字符串 24 | name = '' 25 | for title in titles: 26 | name += title.strip() 27 | item['name'] = name 28 | 29 | # 电影信息列表 30 | infos = movie.xpath('.//div[@class="bd"]/p/text()').extract() 31 | # 电影信息合成一个字符串 32 | fullInfo = '' 33 | for info in infos: 34 | fullInfo += info.strip() 35 | item['info'] = fullInfo 36 | # 提取评分信息 37 | item['rating'] = movie.xpath('.//span[@class="rating_num"]/text()').extract()[0].strip() 38 | # 提取评价人数 39 | item['num'] = movie.xpath('.//div[@class="star"]/span[last()]/text()').extract()[0].strip()[:-3] 40 | # 提取经典语句,quote可能为空 41 | quote = movie.xpath('.//span[@class="inq"]/text()').extract() 42 | if quote: 43 | quote = quote[0].strip() 44 | item['quote'] = quote 45 | # 提取电影图片 46 | item['img_url'] = movie.xpath('.//img/@src').extract()[0] 47 | 48 | yield item 49 | 50 | next_page = selector.xpath('//span[@class="next"]/a/@href').extract()[0] 51 | url = 'https://movie.douban.com/top250' + next_page 52 | if next_page: 53 | yield scrapy.Request(url, callback=self.parse) 54 | 55 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MakedreamItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 文章标题 17 | articleTitle = scrapy.Field() 18 | # 
文章标题url 19 | articleUrl = scrapy.Field() 20 | # 文章描述 21 | articleDesc = scrapy.Field() 22 | # 文章发布时间 23 | articlePublic = scrapy.Field() 24 | # 文章类型 25 | articleType = scrapy.Field() 26 | # 文章标签 27 | articleTag = scrapy.Field() 28 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MakedreamSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | from scrapy.conf import settings 10 | 11 | class MakedreamPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | 16 | class DreamMongo(object): 17 | def __init__(self): 18 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 19 | self.db = self.client[settings['MONGO_DB']] 20 | self.post = self.db[settings['MONGO_COLL']] 21 | 22 | def process_item(self, item, spider): 23 | postItem = dict(item) 24 | self.post.insert(postItem) 25 | return item 26 | 27 | 28 | # 写入json文件类 29 | class JsonWritePipeline(object): 30 | def __init__(self): 31 | self.file = open('织梦网其他编程.json', 'w', encoding='utf-8') 32 | 33 | def process_item(self, item, spider): 34 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self, spider): 39 | self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for makedream project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'makedream' 13 | 14 | SPIDER_MODULES = ['makedream.spiders'] 15 | NEWSPIDER_MODULE = 'makedream.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'makedream (+http://www.yourdomain.com)' 20 | # 配置mongoDB 21 | MONGO_HOST = "127.0.0.1" # 主机IP 22 | MONGO_PORT = 27017 # 端口号 23 | MONGO_DB = "DreamDB" # 库名 24 | MONGO_COLL = "Dream_info" # collection 25 | 26 | 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'makedream.middlewares.MakedreamSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'makedream.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | # 'makedream.pipelines.MakedreamPipeline': 300, 76 | 'makedream.pipelines.JsonWritePipeline':300, 77 | 'makedream.pipelines.DreamMongo':300 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/makedream_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from makedream.items import MakedreamItem 4 | 5 | 6 | class DramingNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'dreaming' 9 | # 爬虫的域范围 10 | allowed_domains = ['zhimengzhe.com'] 11 | # 爬虫的第一个url 12 | start_urls = ['http://www.zhimengzhe.com/bianchengjiaocheng/qitabiancheng/index_{}.html'.format(n) for n in 13 | range(0, 1466)] 14 | 15 | # 爬取结果解析 16 | def parse(self, response): 17 | base_url = 'http://www.zhimengzhe.com' 18 | # print(response.body) 19 | node_list = response.xpath("//ul[@class='list-unstyled list-article']/li") 20 | for node in node_list: 21 | item = MakedreamItem() 22 | nextNode = node.xpath("./div[@class='pull-left ltxt w658']") 23 | print('*' * 30) 24 | title = nextNode.xpath('./h3/a/text()').extract() 25 | link = nextNode.xpath('./h3/a/@href').extract() 26 | desc = nextNode.xpath('./p/text()').extract() 27 | 28 | # 创建时间,类型,标签 29 | publicTime = nextNode.xpath("./div[@class='tagtime']/span[1]/text()").extract() 30 | publicType = nextNode.xpath("./div[@class='tagtime']/span[2]/a/text()").extract() 31 | publicTag = nextNode.xpath("./div[@class='tagtime']/span[3]/a/text()").extract() 32 | # node 33 | titleLink = base_url + ''.join(link) 34 | item['articleTitle'] = title 35 | # 文章标题url 36 | item['articleUrl'] = titleLink 37 | # 文章描述 38 | item['articleDesc'] = desc 39 | # 文章发布时间 40 | item['articlePublic'] = publicTime 41 | # 文章类型 42 | item['articleType'] = publicType 43 | # 文章标签 44 | item['articleTag'] = publicTag 45 | yield item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = makedream.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = makedream 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ComentsAnaylst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : commentsAnaylst.py(再见前任3的影评f词云) 4 | 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from wordcloud import WordCloud 8 | import jieba 9 | import numpy as np 10 | #读取txt格式的文本内容 11 | text_from_file_with_apath = open('douban.txt','rb').read() 12 | 13 | #使用jieba进行分词,并对分词的结果以空格隔开 14 | wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all = True) 15 | wl_space_split = " ".join(wordlist_after_jieba) 16 | 17 | #对分词后的文本生成词云 18 | # my_wordcloud = WordCloud().generate(wl_space_split) 19 | 20 | font = r'C:\Windows\Fonts\simfang.ttf' 21 | mask = np.array(Image.open('ciyun.jpg')) 22 | wc = WordCloud(mask=mask,max_words=3000,collocations=False, font_path=font, width=5800, height=2400, margin=10,background_color='black').generate(wl_space_split) 23 | default_colors = wc.to_array() 24 | plt.title("QR 3") 25 | plt.imshow(wc) 26 | plt.axis("off") 27 | plt.savefig("ciyun.png") 28 | plt.show() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/douban_qianren3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : test_douban_qianren3.py(再见前任3的影评) 4 | 5 | import csv 6 | import requests 7 | from lxml import etree 8 | import time 9 | 10 | 11 | url = 'https://movie.douban.com/subject/26662193/comments?start=0&limit=20&sort=new_score&status=P&percent_type=' 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 15 | 'Cookie': 'gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; bid=qh9RXgIGopg; viewed="26826540_24703171"; ap=1; ll="118172"; ct=y; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; 
_pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522129522%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DdnHqCRiT1HlhToCp0h1cpdyV8rB9f_OfOvJhjRPO3p1jrl764LGvi7gbYSdskDMh%26wd%3D%26eqid%3De15db1bb0000e3cd000000045ab9b6fe%22%5D; _pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.10.1522130672.1522120744.; _pk_ses.100001.4cf6=*'} 16 | 17 | 18 | def get_html(current_url): 19 | time.sleep(2) 20 | r = requests.get(current_url, headers=headers) 21 | r.raise_for_status() 22 | return etree.HTML(r.text) 23 | 24 | 25 | def parse_html(content,writer): 26 | links = content.xpath("//*[@class='comment-item']") 27 | for link in links: 28 | content = link.xpath("./div[@class='comment']/p/text()")[0].strip() 29 | author = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/a/text()")[0].strip() 30 | time = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/span[@class='comment-time ']/text()")[ 31 | 0].strip() 32 | is_useful = link.xpath("./div[@class='comment']/h3/span[@class='comment-vote']/span[@class='votes']/text()")[0] 33 | print('content:', content) 34 | print('time:', time) 35 | print('is_useful:', is_useful) 36 | # detail = (author, time, is_useful, content) 37 | detail = (is_useful,content) 38 | writer.writerow(detail) 39 | 40 | 41 | if __name__ == '__main__': 42 | with open('douban.txt', 'a+', encoding='utf-8', newline='') as csvf: 43 | writer = csv.writer(csvf) 44 | writer.writerow(('作者', '时间', '有用数', '内容')) 45 | for page in range(0, 260, 20): 46 | url = 'https://movie.douban.com/subject/26662193/comments?start={}&limit=20&sort=new_score&status=P&percent_type='.format( 47 | page) 48 | r = get_html(url) 49 | parse_html(r,writer) -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user_info.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: sunshine 10 | # Generation Time: 2018-04-26 13:33:32 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table bilibili_user_info 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `bilibili_user_info` ( 27 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 28 | `mid` varchar(11) DEFAULT NULL, 29 | `name` varchar(45) DEFAULT NULL, 30 | `sex` varchar(11) DEFAULT NULL, 31 | `face` varchar(200) DEFAULT NULL, 32 | `coins` int(11) DEFAULT NULL, 33 | `spacesta` int(11) DEFAULT NULL, 34 | `birthday` varchar(45) DEFAULT NULL, 35 | `place` varchar(45) DEFAULT NULL, 36 | `description` varchar(45) DEFAULT NULL, 37 | `article` int(11) DEFAULT NULL, 38 | `following` int(11) DEFAULT NULL, 39 | `fans` int(11) DEFAULT NULL, 40 | `playnum` int(30) DEFAULT NULL, 41 | `sign` varchar(300) DEFAULT NULL, 42 | 
`level` int(11) DEFAULT NULL, 43 | `exp` int(11) DEFAULT NULL, 44 | PRIMARY KEY (`id`) 45 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 46 | 47 | 48 | 49 | 50 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 51 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 52 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 53 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 54 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 55 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 56 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/README.md: -------------------------------------------------------------------------------- 1 | #### 这是一个爬取网易云音乐的所有的歌曲的评论数的爬虫。 2 | 3 | 以下为主要思路: 4 | 5 | - 1. 爬取所有的歌手信息([artists.py]); 6 | - 2. 根据上一步爬取到的歌手信息去爬取所有的专辑信息([album_by_artist.py]); 7 | - 3. 根据专辑信息爬取所有的歌曲信息([music_by_album.py]); 8 | - 4. 根据歌曲信息爬取其评论条数([comments_by_music.py]) 9 | - 5. 数据库相关的语句都存放于([sql.py])中。 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/album_by_artist.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Album(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_albums(self, artist_id): 27 | params = {'id': artist_id, 'limit': '200'} 28 | # 获取歌手个人主页 29 | r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | albums = body.find_all('a', attrs={'class': 'tit f-thide s-fc0'}) # 获取所有专辑 36 | 37 | for album in albums: 38 | albume_id = album['href'].replace('/album?id=', '') 39 | sql.insert_album(albume_id, artist_id) 40 | 
41 | 42 | if __name__ == '__main__': 43 | artists = sql.get_all_artist() 44 | my_album = Album() 45 | for i in artists: 46 | try: 47 | my_album.save_albums(i['ARTIST_ID']) 48 | # print(i) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(str(i) + ': ' + str(e)) 52 | time.sleep(5) 53 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/artists.py: -------------------------------------------------------------------------------- 1 | """ 2 | 获取所有的歌手信息 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from music_163 import sql 7 | 8 | headers = { 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 10 | 'Accept-Encoding': 'gzip, deflate, sdch', 11 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 12 | 'Cache-Control': 'no-cache', 13 | 'Connection': 'keep-alive', 14 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 15 | 'DNT': '1', 16 | 'Host': 'music.163.com', 17 | 'Pragma': 'no-cache', 18 | 'Referer': 'http://music.163.com/', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 21 | } 22 | 23 | 24 | def save_artist(group_id, initial): 25 | params = {'id': group_id, 'initial': initial} 26 | r = requests.get('http://music.163.com/discover/artist/cat', params=params) 27 | 28 | # 网页解析 29 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 30 | body = soup.body 31 | 32 | hot_artists = body.find_all('a', attrs={'class': 'msk'}) 33 | artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}) 34 | 35 | for artist in hot_artists: 36 | artist_id = artist['href'].replace('/artist?id=', '').strip() 37 | artist_name = artist['title'].replace('的音乐', '') 38 | try: 39 | sql.insert_artist(artist_id, artist_name) 40 | except Exception as e: 41 | # 打印错误日志 42 | print(e) 43 | 44 | for artist in artists: 45 | artist_id = artist['href'].replace('/artist?id=', '').strip() 46 | artist_name = artist['title'].replace('的音乐', '') 47 | try: 48 | sql.insert_artist(artist_id, artist_name) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(e) 52 | 53 | 54 | gg = 4003 55 | 56 | save_artist(gg, 0) 57 | for i in range(65, 
91): 58 | save_artist(gg, i) 59 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/music_by_album.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据专辑 ID 获取到所有的音乐 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Music(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_music(self, album_id): 27 | params = {'id': album_id} 28 | # 获取专辑对应的页面 29 | r = requests.get('http://music.163.com/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐 36 | 37 | for music in musics: 38 | music = music.find('a') 39 | music_id = music['href'].replace('/song?id=', '') 40 | music_name = music.getText() 41 | sql.insert_music(music_id, music_name, album_id) 42 | 43 | 44 | if __name__ == '__main__': 45 | albums = sql.get_all_album() 46 | my_music = Music() 47 | for i in albums: 48 | try: 49 | my_music.save_music(i['ALBUM_ID']) 50 | # print(i) 51 | except Exception as e: 52 | # 打印错误日志 53 | print(str(i) + ': ' + str(e)) 54 | time.sleep(5) 55 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | 一般 Python 用于连接 MySQL 的工具:pymysql 3 | """ 4 | import pymysql.cursors 5 | 6 | connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='***', db='sunshine',charset="utf8") 7 | 8 | 9 | # 保存评论 10 | def insert_comments(music_id, comments, detail, connection): 11 | with connection.cursor() as cursor: 12 | sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)" 13 | 
cursor.execute(sql, (music_id, comments, detail)) 14 | connection.commit() 15 | 16 | 17 | # 保存音乐 18 | def insert_music(music_id, music_name, album_id): 19 | with connection.cursor() as cursor: 20 | sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)" 21 | cursor.execute(sql, (music_id, music_name, album_id)) 22 | connection.commit() 23 | 24 | 25 | # 保存专辑 26 | def insert_album(album_id, artist_id): 27 | with connection.cursor() as cursor: 28 | sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)" 29 | cursor.execute(sql, (album_id, artist_id)) 30 | connection.commit() 31 | 32 | 33 | # 保存歌手 34 | def insert_artist(artist_id, artist_name): 35 | with connection.cursor() as cursor: 36 | sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME`) VALUES (%s, %s)" 37 | cursor.execute(sql, (artist_id, artist_name)) 38 | connection.commit() 39 | 40 | 41 | # 获取所有歌手的 ID 42 | def get_all_artist(): 43 | with connection.cursor() as cursor: 44 | sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID" 45 | cursor.execute(sql, ()) 46 | return cursor.fetchall() 47 | 48 | 49 | # 获取所有专辑的 ID 50 | def get_all_album(): 51 | with connection.cursor() as cursor: 52 | sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID" 53 | cursor.execute(sql, ()) 54 | return cursor.fetchall() 55 | 56 | 57 | # 获取所有音乐的 ID 58 | def get_all_music(): 59 | with connection.cursor() as cursor: 60 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID" 61 | cursor.execute(sql, ()) 62 | return cursor.fetchall() 63 | 64 | 65 | # 获取前一半音乐的 ID 66 | def get_before_music(): 67 | with connection.cursor() as cursor: 68 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000" 69 | cursor.execute(sql, ()) 70 | return cursor.fetchall() 71 | 72 | 73 | # 获取后一半音乐的 ID 74 | def get_after_music(): 75 | with connection.cursor() as cursor: 76 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429" 77 | cursor.execute(sql, ()) 78 | return cursor.fetchall() 79 | 80 | 81 | def dis_connect(): 82 | connection.close() 83 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FindtripItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | site = scrapy.Field() 16 | company = scrapy.Field() 17 | flight_time = scrapy.Field() 18 | airports = scrapy.Field() 19 | passtime = scrapy.Field() 20 | price = scrapy.Field() 21 | 22 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FindtripSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from findtrip.spiders.washctrip import wash 8 | import pymongo 9 | from scrapy.conf import settings 10 | from scrapy import log 11 | 12 | class FindtripPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class MongoDBPipeline(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | if item['site'] == 'Qua': 25 | if item['company']: 26 | item['company'] = wash(item['company']) 27 | if item['flight_time']: 28 | item['flight_time'] = wash(item['flight_time']) 29 | if item['airports']: 30 | item['airports'] = wash(item['airports']) 31 | if item['passtime']: 32 | item['passtime'] = wash(item['passtime']) 33 | if item['price']: 34 | item['price'] = wash(item['price']) 35 | for data in item: 36 | if not data: 37 | raise DropItem("Missing data!") 38 | self.collection.insert(dict(item)) 39 | log.msg("Question added to MongoDB database!", 40 | level=log.DEBUG, spider=spider) 41 | elif item['site'] == 'Ctrip': 42 | self.collection.insert(dict(item)) 43 | log.msg("Question added to MongoDB database!", 44 | level=log.DEBUG, spider=spider) 45 | 46 | return item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for findtrip project 4 | # 5 | # For simplicity, this file contains only settings 
considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'findtrip' 13 | 14 | SPIDER_MODULES = ['findtrip.spiders'] 15 | NEWSPIDER_MODULE = 'findtrip.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'findtrip (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "FindTrip" # 库名 30 | MONGO_COLL = "qua_findtrip" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'findtrip.middlewares.FindtripSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'findtrip.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'findtrip.pipelines.FindtripPipeline': 300, 77 | 'findtrip.pipelines.MongoDBPipeline': 300, 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | 
#HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/ctrip_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class CtripSpider(scrapy.Spider): 5 | name = 'ctrip' 6 | start_urls = [ 7 | "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-19" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | fligint_div = "//div[@id='J_flightlist2']/div" 13 | dataList = sel.xpath(fligint_div) 14 | 15 | for index,each in 
enumerate(dataList): 16 | flight_each = fligint_div+'['+str(index+1)+']' 17 | flight_tr = flight_each+"//tr[@class='J_header_row']" 18 | istrain = sel.xpath(flight_each + "//div[@class='train_flight_tit']") 19 | 20 | if istrain: 21 | print ("this data is train add") 22 | else: 23 | company = sel.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()").extract() 24 | 25 | flight_time_from = sel.xpath(flight_tr + "//td[@class='right']/div[1]//text()").extract() 26 | flight_time_to = sel.xpath(flight_tr + "//td[@class='left']/div[1]//text()").extract() 27 | flight_time = [flight_time_from,flight_time_to] 28 | 29 | airports_from = sel.xpath(flight_tr + "//td[@class='right']/div[2]//text()").extract() 30 | airports_to = sel.xpath(flight_tr + "//td[@class='left']/div[2]//text()").extract() 31 | airports = [airports_from,airports_to] 32 | 33 | price_middle = sel.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()").extract() 34 | price = sel.xpath(flight_tr + "[1]//td[@class='price ']/span//text()").extract() 35 | if price_middle: 36 | price = price_middle 37 | elif price: 38 | price = price 39 | else: 40 | price = '' 41 | 42 | item = FindtripItem() 43 | item['site'] = 'Ctrip' 44 | item['company'] = company 45 | item['flight_time'] = flight_time 46 | item['airports'] = airports 47 | item['price'] = price 48 | yield item 49 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/qua_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class QuaSpider(scrapy.Spider): 5 | name = "qua" 6 | start_urls = [ 7 | "http://www.qua.com/flights/PEK-XMN/2016-05-12?m=CNY&from=flight_home" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | dataList = sel.xpath("//div[@class='m-fly-item s-oneway']") 13 | 14 | for index,each in enumerate(dataList): 15 | flight_each = "//div[@id='list-box']/div["+str(index+1)+"]" 16 | detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']" 17 | f_route_div = "//div[@class='m-fl-info-bd']/div" 18 | 19 | airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract() 20 | company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract() 21 | flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract() 22 | passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract() 23 | price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract() 24 | 25 | item = FindtripItem() 26 | item['site'] = 'Qua' 27 | item['company'] = company 28 | item['flight_time'] = flight_time 29 | item['airports'] = airports 30 | item['passtime'] = passtime 31 | item['price'] = price 32 | yield item 33 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/washctrip.py: -------------------------------------------------------------------------------- 1 | def wash(dateList): 2 | dateList = map(lambda x : x.split(), dateList) 3 | cleanList = [] 4 | for each in dateList: 5 | if each: 6 | cleanList.append(each[0]) 7 | return cleanList 8 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = findtrip.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = findtrip 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class PythonjobsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #pass 15 | title = Field() 16 | city = Field() 17 | company = Field() 18 | location = Field() 19 | url = Field() 20 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PythonjobsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class PythonjobsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pythonjobs project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'pythonjobs' 13 | 14 | SPIDER_MODULES = ['pythonjobs.spiders'] 15 | NEWSPIDER_MODULE = 'pythonjobs.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'pythonjobs (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'pythonjobs.middlewares.PythonjobsSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'pythonjobs.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See 
http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'pythonjobs.pipelines.PythonjobsPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/job_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from pythonjobs.items import PythonjobsItem 4 | #from bs4 import BeautifulSoup 5 | 6 | class JobspiderSpider(scrapy.Spider): 7 | name = 'jobSpider' 8 | allowed_domains = ['search.51job.com','jobs.51job.com'] 9 | 10 | def start_requests(self): 11 | for i in range(1,20): # Set pages to crawl here. 
12 | url = "http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{0}.html".format(i) 13 | yield scrapy.Request(url) 14 | 15 | def parse(self, response): 16 | for sel in response.css("html body div.dw_wp div#resultList.dw_table div.el p.t1 span a"): 17 | url = sel.re('href="(.*?)"')[0] 18 | yield scrapy.Request(url,callback=self.parse_item) 19 | 20 | def parse_item(self, response): 21 | item = PythonjobsItem() 22 | item['title'] = response.xpath('//div[@class="cn"]/h1/@title').extract()[0] 23 | item['url'] = response.url 24 | item['city'] = response.xpath('//span[@class="lname"]/text()').extract()[0] 25 | item['company'] = response.xpath('//p[@class="cname"]/a/@title').extract()[0] 26 | item['location'] = response.xpath('//p[@class="fp"]/text()').extract()[1].rstrip() 27 | return item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = pythonjobs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pythonjobs 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shuimujob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shuimujob 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShuimujobItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | title = scrapy.Field() 16 | href = scrapy.Field() 17 | author = scrapy.Field() 18 | time = scrapy.Field() 19 | content = scrapy.Field() 20 | is_dev = scrapy.Field() 21 | is_alg = scrapy.Field() 22 | is_fin = scrapy.Field() 23 | base_url_index = scrapy.Field() 24 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShuimujobSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | from scrapy.exceptions import DropItem 10 | from scrapy import log 11 | 12 | class ShuimujobPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | class MongoDBPipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | 22 | def open_spider(self, spider): 23 | self.client = pymongo.MongoClient( 24 | settings['MONGODB_SERVER'], 25 | settings['MONGODB_PORT'] 26 | ) 27 | self.db = self.client[settings['MONGODB_DB']] 28 | self.collection = self.db[settings['MONGODB_COLLECTION']] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | valid = True 35 | for data in item: 36 | if not data : 37 | valid = False 38 | raise DropItem("Missing {0}!".format(data)) 39 | if item['title'] == '': 40 | valid = False 41 | raise DropItem("title is '' ") 42 | if item['content'] == '': 43 | valid = False 44 | raise DropItem("content is '' ") 45 | if valid: 46 | self.collection.insert(dict(item)) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/platform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def getPlatform(): 3 | platform='' 4 | if sys.platform.startswith('win'): 5 | platform = 'win' 6 | elif sys.platform.startswith('linux'): 7 | platform = 'linux' 8 | return platform -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shuimujob project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shuimujob' 13 | 14 | SPIDER_MODULES = ['shuimujob.spiders'] 15 | NEWSPIDER_MODULE = 'shuimujob.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shuimujob (+http://www.yourdomain.com)' 20 | 21 | 22 | MONGODB_SERVER = "localhost" 23 | MONGODB_PORT = 27017 24 | MONGODB_DB = "shuimujob" 25 | MONGODB_COLLECTION = "job_info" 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = False 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = False 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | #DEFAULT_REQUEST_HEADERS = { 49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | # 'Accept-Language': 'en', 51 | #} 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'shuimujob.middlewares.ShuimujobSpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'shuimujob.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | # 'shuimujob.pipelines.ShuimujobPipeline': 300, 75 | 'shuimujob.pipelines.MongoDBPipeline':300 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
98 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/shuimu_spider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import scrapy 3 | from shuimujob.items import ShuimujobItem 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from bs4 import BeautifulSoup 9 | from scrapy import signals 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from shuimujob.platform import getPlatform 12 | 13 | class SMSpider(scrapy.spiders.CrawlSpider): 14 | ''' 15 | #要建立一个 Spider,你可以为 scrapy.spider.BaseSpider 创建一个子类,并确定三个主要的、强制的属性: 16 | #name :爬虫的识别名,它必须是唯一的,在不同的爬虫中你必须定义不同的名字. 
17 | #start_urls :爬虫开始爬的一个 URL 列表。爬虫从这里开始抓取数据,所以,第一次下载的数据将会从这些 URLS 开始。其他子 URL 将会从这些起始 URL 中继承性生成。 18 | #parse() :爬虫的方法,调用时候传入从每一个 URL 传回的 Response 对象作为参数,response 将会是 parse 方法的唯一的一个参数, 19 | #这个方法负责解析返回的数据、匹配抓取的数据(解析为 item )并跟踪更多的 URL。 20 | ''' 21 | name="shuimujob" 22 | base_url = 'http://www.newsmth.net/nForum/board/Intern' 23 | start_urls = [base_url] 24 | start_urls.extend([base_url+'?p='+str(i) for i in range(2,4)]) 25 | # start_urls = ['http://www.newsmth.net/'] 26 | platform = getPlatform() 27 | 28 | def __init__(self): 29 | scrapy.spiders.Spider.__init__(self) 30 | if self.platform == 'linux': 31 | self.driver = webdriver.PhantomJS() 32 | elif self.platform == 'win': 33 | self.driver = webdriver.PhantomJS() 34 | self.driver.set_page_load_timeout(15) 35 | dispatcher.connect(self.spider_closed, signals.spider_closed) 36 | 37 | 38 | 39 | def spider_closed(self, spider): 40 | self.driver.quit() 41 | 42 | def parse(self,response): 43 | self.driver.get(response.url) 44 | 45 | element = WebDriverWait(self.driver,30).until(EC.presence_of_all_elements_located((By.TAG_NAME,'table'))) 46 | page_source = self.driver.page_source 47 | bs_obj = BeautifulSoup(page_source, "lxml") 48 | table = bs_obj.find('table',class_='board-list tiz') 49 | intern_messages = table.find_all('tr',class_=False) 50 | for message in intern_messages: 51 | title, href, time, author = '','','','' 52 | td_9 = message.find('td',class_='title_9') 53 | if td_9: 54 | title = td_9.a.get_text().encode('utf-8','ignore') 55 | href = td_9.a['href'] 56 | td_10 = message.find('td', class_='title_10') 57 | if td_10: 58 | time=td_10.get_text().encode('utf-8','ignore') 59 | td_12 = message.find('td', class_='title_12') 60 | if td_12: 61 | author = td_12.a.get_text().encode('utf-8','ignore') 62 | item = ShuimujobItem() 63 | item['title'] = title 64 | item['href'] = href 65 | item['time'] = time 66 | item['author'] = author 67 | item['base_url_index'] = 0 68 | root_url = 'http://www.newsmth.net' 69 | # content = scrapy.Request(root_url+href,self.parse_content) 70 | if href!='': 71 | content = self.parse_content(root_url+href) 72 | # print 'content:', content 73 | item['content'] = content 74 | yield item 75 | 76 | def parse_content(self,url): 77 | 78 | self.driver.get(url) 79 | element = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table'))) 80 | page_source = self.driver.page_source 81 | bs_obj = BeautifulSoup(page_source, "lxml") 82 | return bs_obj.find('td', class_='a-content').p.get_text().encode('utf-8','ignore') -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/house.csv: -------------------------------------------------------------------------------- 1 | house,house_area,house_room,total_price,unit_price 2 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 3 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 4 | 天坛新寓 ,75.16,3室1厅,243.0,32332 5 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 6 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 7 | house,house_area,house_room,total_price,unit_price 8 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 9 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 10 | 天坛新寓 ,75.16,3室1厅,243.0,32332 11 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 12 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 13 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NjHouseItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | house=scrapy.Field() 15 | total_price=scrapy.Field() 16 | unit_price=scrapy.Field() 17 | house_room=scrapy.Field() 18 | house_area=scrapy.Field() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NjHouseSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NjHousePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for nj_house project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'nj_house' 13 | 14 | SPIDER_MODULES = ['nj_house.spiders'] 15 | NEWSPIDER_MODULE = 'nj_house.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'nj_house (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'nj_house.middlewares.NjHouseSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'nj_house.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'nj_house.pipelines.NjHousePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/lj_house.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from nj_house.items import NjHouseItem 5 | 6 | class LjHouseSpider(scrapy.Spider): 7 | name = "lj_house" 8 | allowed_domains = ["nj.lianjia.com/ershoufang/"] 9 | start_urls = ['http://nj.lianjia.com/ershoufang//'] 10 | 11 | def parse(self, response): 12 | clears = response.css('.sellListContent li') 13 | item = NjHouseItem() 14 | for c in clears: 15 | house = c.css('.houseInfo a::text').extract_first() 16 | house_text = c.css('.houseInfo::text').extract_first() 17 | house_info_list = [e for e in re.split('\|', int(house_text)) if len(e) > 1] 18 | house_room = house_info_list[0].strip() 19 | house_area = ''.join(re.findall(r'[\d+\.]', house_info_list[1])) 20 | total_price = c.css('.totalPrice span::text').extract_first() 21 | unit_price = c.css('.unitPrice span::text').extract_first() 22 | unit_price = re.findall('\d+', unit_price)[0] 23 | 24 | item['house'] = house 25 | item['total_price'] = float(total_price) 26 | item['unit_price'] = int(unit_price) 27 | item['house_area'] = float(house_area) 28 | item['house_room'] = house_room 29 | yield item 30 | 31 | page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first() 32 | page_list = re.findall('\d+', page_info) 33 | next_page = 'pg' + str(int(page_list[1]) + 1) 34 | url = self.start_urls[0] + next_page 35 | if next_page: 36 | yield Request(url=url, callback=self.parse) 37 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = nj_house.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = nj_house 12 | 
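Two lines in the lj_house.py spider above will fail at runtime: re.split('\|', int(house_text)) casts the houseInfo text to int before splitting it (a TypeError on the first listing), and the pagination step yields a bare Request that is never imported. The following is a hypothetical corrected sketch, keeping the original selectors and item fields; it is not the project's own code.

import re
import scrapy
from nj_house.items import NjHouseItem

class LjHouseSpider(scrapy.Spider):
    name = 'lj_house'
    allowed_domains = ['nj.lianjia.com']                 # domain only; the original included a path here
    start_urls = ['http://nj.lianjia.com/ershoufang/']

    def parse(self, response):
        for c in response.css('.sellListContent li'):
            house_text = c.css('.houseInfo::text').extract_first() or ''
            # keep house_text as a string; the original wrapped it in int() before splitting
            parts = [e.strip() for e in re.split(r'\|', house_text) if len(e.strip()) > 1]
            if len(parts) < 2:
                continue
            item = NjHouseItem()
            item['house'] = c.css('.houseInfo a::text').extract_first()
            item['house_room'] = parts[0]
            item['house_area'] = float(''.join(re.findall(r'[\d.]', parts[1])) or 0)
            item['total_price'] = float(c.css('.totalPrice span::text').extract_first() or 0)
            unit_price_text = c.css('.unitPrice span::text').extract_first() or ''
            item['unit_price'] = int((re.findall(r'\d+', unit_price_text) or ['0'])[0])
            yield item

        page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
        if page_info:
            page_list = re.findall(r'\d+', page_info)
            next_page = 'pg' + str(int(page_list[1]) + 1)
            # reference scrapy.Request explicitly; the original yielded an un-imported Request name
            yield scrapy.Request(url=self.start_urls[0] + next_page, callback=self.parse)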
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LianjiaItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # 标签 小区 户型 面积 关注人数 观看人数 发布时间 价格 均价 详情链接 经纬度 城区 15 | title = scrapy.Field() 16 | community = scrapy.Field() 17 | model = scrapy.Field() 18 | area = scrapy.Field() 19 | focus_num = scrapy.Field() 20 | watch_num = scrapy.Field() 21 | time = scrapy.Field() 22 | price = scrapy.Field() 23 | average_price = scrapy.Field() 24 | link = scrapy.Field() 25 | Latitude = scrapy.Field() 26 | city = scrapy.Field() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LianjiaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | from scrapy.conf import settings 10 | from LianJia.items import LianjiaItem 11 | 12 | class LianjiaPipeline(object): 13 | def __init__(self): 14 | host = settings['MONGODB_HOST'] 15 | port = settings['MONGODB_PORT'] 16 | db_name = settings['MONGODB_DBNAME'] 17 | client = pymongo.MongoClient(host=host,port=port) 18 | tdb = client[db_name] 19 | self.post = tdb[settings['MONGODB_DOCNAME']] 20 | 21 | def process_item(self, item, spider): 22 | if isinstance(item,LianjiaItem): 23 | try: 24 | info = dict(item) 25 | if self.post.insert(info): 26 | print('bingo') 27 | except Exception: 28 | pass 29 | return item 30 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for LianJia project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'LianJia'
13 |
14 | SPIDER_MODULES = ['LianJia.spiders']
15 | NEWSPIDER_MODULE = 'LianJia.spiders'
16 |
17 |
18 | MONGODB_HOST = '127.0.0.1'
19 | MONGODB_PORT = 27017
20 | MONGODB_DBNAME = "lianjia"
21 | MONGODB_DOCNAME = "saveinfo_5"
22 |
23 | DOWNLOAD_DELAY = 10
24 |
25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
26 | #USER_AGENT = 'LianJia (+http://www.yourdomain.com)'
27 |
28 | # Obey robots.txt rules
29 | ROBOTSTXT_OBEY = False
30 |
31 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
32 | #CONCURRENT_REQUESTS = 32
33 |
34 | # Configure a delay for requests for the same website (default: 0)
35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
36 | # See also autothrottle settings and docs
37 | #DOWNLOAD_DELAY = 3
38 | # The download delay setting will honor only one of:
39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
40 | #CONCURRENT_REQUESTS_PER_IP = 16
41 |
42 | # Disable cookies (enabled by default)
43 | #COOKIES_ENABLED = False
44 |
45 | # Disable Telnet Console (enabled by default)
46 | #TELNETCONSOLE_ENABLED = False
47 |
48 | # Override the default request headers:
49 | #DEFAULT_REQUEST_HEADERS = {
50 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
51 | #   'Accept-Language': 'en',
52 | #}
53 |
54 | # Enable or disable spider middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
56 | #SPIDER_MIDDLEWARES = {
57 | #    'LianJia.middlewares.LianjiaSpiderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable downloader middlewares
61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
62 | #DOWNLOADER_MIDDLEWARES = {
63 | #    'LianJia.middlewares.MyCustomDownloaderMiddleware': 543,
64 | #}
65 |
66 | # Enable or disable extensions
67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
68 | #EXTENSIONS = {
69 | #    'scrapy.extensions.telnet.TelnetConsole': None,
70 | #}
71 |
72 | # Configure item pipelines
73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
74 | ITEM_PIPELINES = {
75 |     'LianJia.pipelines.LianjiaPipeline': 300,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
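Note: the spider source for this Scrapy project is not included in this dump; only its compiled form (spiders/__pycache__/lianjia.cpython-36.pyc, listed below) is present. The following is therefore a minimal, hypothetical sketch of how a spider could populate the LianjiaItem fields from items.py and hand them to LianjiaPipeline under the settings above. The spider name, domain, start URL, and every CSS selector are placeholder assumptions for illustration, not the repository's actual code.

# -*- coding: utf-8 -*-
# Hypothetical spider sketch -- NOT the repository's original lianjia spider.
# URL patterns and CSS selectors are placeholders and must be verified against
# the live Lianjia pages before use.
import scrapy

from LianJia.items import LianjiaItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'                      # assumed spider name
    allowed_domains = ['bj.lianjia.com']  # assumed domain for Beijing listings
    start_urls = ['https://bj.lianjia.com/ershoufang/pg1/']  # placeholder URL

    def parse(self, response):
        # Assume each listing sits in its own card element (selector is a guess).
        for card in response.css('ul.sellListContent li.clear'):
            item = LianjiaItem()
            item['title'] = card.css('div.title a::text').extract_first()
            item['link'] = card.css('div.title a::attr(href)').extract_first()
            item['community'] = card.css('div.positionInfo a::text').extract_first()
            item['price'] = card.css('div.totalPrice span::text').extract_first()
            item['average_price'] = card.css('div.unitPrice span::text').extract_first()
            # The remaining fields defined in items.py (model, area, focus_num,
            # watch_num, time, Latitude, city) would be filled in the same way
            # once the real page structure is known.
            yield item

        # Follow a hypothetical "next page" link, if one exists.
        next_page = response.css('a.next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Because ITEM_PIPELINES enables LianJia.pipelines.LianjiaPipeline and the MONGODB_* values point at a local server, running "scrapy crawl lianjia" with such a spider would insert each yielded item into the lianjia.saveinfo_5 collection, with requests spaced out by DOWNLOAD_DELAY = 10 seconds.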
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/lianjia.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/lianjia.cpython-36.pyc
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/lianjia.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/lianjia.csv
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = LianJia.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = LianJia
12 |
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫相关函数库介绍/.ipynb_checkpoints/01_requests学习笔记-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "# 引入Requests库\n",
12 |     "import requests"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": 3,
18 |    "metadata": {
19 |     "collapsed": true
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "# 发起GET请求\n",
24 |     "response = requests.get('https://www.baidu.com/')"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": 4,
30 |    "metadata": {},
31 |    "outputs": [
32 |     {
33 |      "name": "stdout",
34 |      "output_type": "stream",
35 |      "text": [
36 |       "