├── 00_Python从零开始系列连载 ├── .ipynb_checkpoints │ ├── 01_Python的基本数据类型-checkpoint.ipynb │ ├── 02_Python的基本运算和表达式-checkpoint.ipynb │ ├── 03_Python程序的基本控制流程-checkpoint.ipynb │ ├── 04_Python特色数据类型 (列表)-checkpoint.ipynb │ ├── 05_Python特色数据类型(元组)-checkpoint.ipynb │ ├── 06_Python特色数据类型(字典)-checkpoint.ipynb │ ├── 07_Python特色数据类型(集合)-checkpoint.ipynb │ ├── 08_Python特色数据类型(函数)-checkpoint.ipynb │ ├── 09_Python文件操作-checkpoint.ipynb │ ├── 10_Python异常处理-checkpoint.ipynb │ ├── 11_Python的time模块简单使用-checkpoint.ipynb │ └── 12_Python的random模块简单使用-checkpoint.ipynb ├── 01_Python的基本数据类型.ipynb ├── 02_Python的基本运算和表达式.ipynb ├── 03_Python程序的基本控制流程.ipynb ├── 04_Python特色数据类型 (列表).ipynb ├── 05_Python特色数据类型(元组).ipynb ├── 06_Python特色数据类型(字典).ipynb ├── 07_Python特色数据类型(集合).ipynb ├── 08_Python特色数据类型(函数).ipynb ├── 09_Python文件操作.ipynb ├── 10_Python异常处理.ipynb ├── 11_Python的time模块简单使用.ipynb └── 12_Python的random模块简单使用.ipynb ├── 01_Python进阶系列连载 ├── .ipynb_checkpoints │ ├── 01_那些容易被忽略的问题-checkpoint.ipynb │ ├── 02_迭代器-checkpoint.ipynb │ └── 03_生成器-checkpoint.ipynb ├── 01_那些容易被忽略的问题.ipynb ├── 02_迭代器.ipynb └── 03_生成器.ipynb ├── 02_Python数据结构算法刷题 ├── .ipynb_checkpoints │ └── 01_(3n+1)猜想-checkpoint.ipynb └── 01_(3n+1)猜想.ipynb ├── 03_Python网络爬虫 ├── Python网络爬虫实战项目 │ ├── 01_python爬取电影天堂 │ │ ├── dytt.py │ │ └── 电影天堂.csv │ ├── 02_python爬取斗罗大陆小说 │ │ ├── dldl.py │ │ ├── 斗破苍穹小说.csv │ │ ├── 斗破苍穹小说.py │ │ └── 斗罗大陆小说.csv │ ├── 03_python爬取欧洲足球联赛数据 │ │ └── footballData.py │ ├── 04_python爬取豆瓣电影Top250 │ │ ├── douban_top250_movies.csv │ │ └── filmTop250.py │ ├── 05_python爬取股票数据 │ │ └── stockInfo.py │ ├── 06_python爬取人人贷网数据 │ │ └── peopleLoad.py │ ├── 07_python爬取创业邦创投库 │ │ ├── python爬取创业邦创投库.py │ │ └── resultsDatas.csv │ ├── 08_python抓取美团网百万商家信息 │ │ ├── meituan.csv │ │ └── python抓取美团网百万商家信息.py │ ├── 09_python爬取网易云音乐评论并把他们存入mysql数据库 │ │ └── python爬取网易云音乐评论并把他们存入mysql数据库.py │ ├── 10_python爬取“网上购物”类APP │ │ ├── apps.csv │ │ ├── python爬取网上购物类APP数据py │ │ └── 网上购物类APP数据分析并展示.py │ ├── 11_python爬取链家网房价信息 │ │ ├── Lianjia_Info_v1.py │ │ ├── Lianjia_Info_v2.py │ │ ├── Lianjia_Info_v3.py │ │ ├── Lianjia_Info_v4.py │ │ ├── Lianjia_Info_v4_analysis.py │ │ ├── lianjia.csv │ │ ├── lianjia_ershou_futian_100.xlsx │ │ └── lianjia_re_v4.csv │ ├── 12_python爬取并分析豆瓣中最新电影的影评(词云显示) │ │ ├── alice_mask.png │ │ ├── alice_mask1.png │ │ ├── python爬取并分析豆瓣中最新电影的影评.py │ │ ├── show_Chinese.png │ │ ├── stopwords.txt │ │ └── 豆瓣影评爬取入库.py │ ├── 13_python爬取豆瓣书籍信息 │ │ ├── books.csv │ │ └── python爬取豆瓣书籍信息.py │ ├── 14_python爬取今日头条信息并导入mongodb数据库 │ │ └── python爬取今日头条信息并导入mongodb数据库.py │ ├── 15_python使用selenium爬取百度招聘内容并存入mongodb数据库 │ │ └── python使用selenium爬取百度招聘内容并入mongodb数据库.py │ ├── 16_python爬取熊猫直播用户信息 │ │ └── python爬取熊猫直播用户信息.py │ ├── 17_scrapy爬取游天下南京短租房信息并存入mongodb数据库 │ │ └── youtxNanJin │ │ │ ├── README.txt │ │ │ ├── scrapy.cfg │ │ │ ├── youtxNanJin │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── youtxNanJin_spider.cpython-36.pyc │ │ │ │ └── youtxNanJin_spider.py │ │ │ ├── 游天下南京.csv │ │ │ └── 游天下南京.json │ ├── 18_scrapy爬取中国医学人才网信息并以json格式保存 │ │ └── chinadoctornet │ │ │ ├── README.txt │ │ │ ├── chinadoctornet │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ 
│ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── chinadoctornet_spider.cpython-36.pyc │ │ │ │ └── chinadoctornet_spider.py │ │ │ ├── scrapy.cfg │ │ │ ├── 中国医学人才网招聘最新招聘专栏.csv │ │ │ └── 中国医学人才网招聘最新招聘专栏.json │ ├── 19_scrapy框架爬取豆瓣电影top250信息 │ │ └── doubanmovie │ │ │ ├── README.txt │ │ │ ├── doubanmovie │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── doubanmovie_spider.cpython-36.pyc │ │ │ │ └── doubanmovie_spider.py │ │ │ ├── items.csv │ │ │ ├── items.json │ │ │ └── scrapy.cfg │ ├── 20_scrapy爬取织梦者网站信息并存入mongodb数据库 │ │ └── makedream │ │ │ ├── makedream │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── makedream_spider.cpython-36.pyc │ │ │ │ └── makedream_spider.py │ │ │ └── scrapy.cfg │ ├── 21_python爬取豆瓣电影前任3评论(词云显示) │ │ ├── ComentsAnaylst.py │ │ ├── ciyun.jpg │ │ ├── ciyun.png │ │ ├── douban.txt │ │ └── douban_qianren3.py │ ├── 22_python爬取Bilibili用户信息并导入mysql数据库 │ │ ├── bilibili_user.py │ │ ├── bilibili_user_info.sql │ │ └── user_agents.txt │ ├── 23_python爬取网易云音乐所有歌曲的评论数 │ │ ├── README.md │ │ ├── album_by_artist.py │ │ ├── artists.py │ │ ├── comments_by_music.py │ │ ├── music_by_album.py │ │ └── sql.py │ ├── 24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库 │ │ └── findtrip │ │ │ ├── ctrip_items.csv │ │ │ ├── findtrip │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── ctrip_spider.cpython-36.pyc │ │ │ │ ├── qua_spider.cpython-36.pyc │ │ │ │ └── washctrip.cpython-36.pyc │ │ │ │ ├── ctrip_spider.py │ │ │ │ ├── qua_spider.py │ │ │ │ └── washctrip.py │ │ │ ├── qua_items.csv │ │ │ ├── qua_items.json │ │ │ └── scrapy.cfg │ ├── 25_scrapy爬取前程无忧网站python相关的工作信息 │ │ └── pythonjobs │ │ │ ├── PythonJobs.csv │ │ │ ├── pythonjobs │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ ├── pipelines.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── job_spider.cpython-36.pyc │ │ │ │ └── job_spider.py │ │ │ └── scrapy.cfg │ ├── 26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库 │ │ └── shuimujob │ │ │ ├── ghostdriver.log │ │ │ ├── scrapy.cfg │ │ │ └── shuimujob │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── platform.cpython-36.pyc │ │ │ └── 
settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── platform.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── shuimu_spider.cpython-36.pyc │ │ │ └── shuimu_spider.py │ ├── 27_scrapy爬取南京20000多套二手房信息 │ │ └── nj_house │ │ │ ├── house.csv │ │ │ ├── nj_house │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── items.cpython-36.pyc │ │ │ │ └── settings.cpython-36.pyc │ │ │ ├── items.py │ │ │ ├── middlewares.py │ │ │ ├── pipelines.py │ │ │ ├── settings.py │ │ │ └── spiders │ │ │ │ ├── __init__.py │ │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── lj_house.cpython-36.pyc │ │ │ │ └── lj_house.py │ │ │ └── scrapy.cfg │ └── 28_scrapy爬取链家北京二手房数据 │ │ └── LianJia │ │ ├── LianJia │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── lianjia.cpython-36.pyc │ │ │ └── lianjia.py │ │ ├── lianjia.csv │ │ └── scrapy.cfg └── Python网络爬虫相关函数库介绍 │ ├── .ipynb_checkpoints │ └── 01_requests学习笔记-checkpoint.ipynb │ └── 01_requests学习笔记.ipynb └── README.md /00_Python从零开始系列连载/.ipynb_checkpoints/09_Python文件操作-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "文件是存储在外部介质的数据集合,通常可以长久保存(前提是这个介质不易损坏)\n", 8 | "\n", 9 | "通俗点说,文件就是存放数据的地方\n", 10 | "\n", 11 | "**绝对路径与相对路径**\n", 12 | "\n", 13 | "通常,我们使用电脑的时候,例如编写了一段代码,我们要把这段代码保存,方便下次使用\n", 14 | "\n", 15 | "你可能会把这段代码保存在硬盘某个位置" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "如果在Python中,我们要打开这个文件操作,该怎么操作呢?\n", 23 | "\n", 24 | "所以要打开这个文件操作也需要3个步骤:\n", 25 | "\n", 26 | "1.找出文件存放的路径,打开文件\n", 27 | "\n", 28 | "2.对文件修改操作\n", 29 | "\n", 30 | "3.关闭文件\n", 31 | "\n", 32 | "说到找出文件的存放路径,我们就必须讲讲绝对路径和相对路径的概念\n", 33 | "\n", 34 | "**绝对路径**\n", 35 | "\n", 36 | "绝对路径指的是从最初的硬盘开始一直进入到文件位置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 未完待续!" 
44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.6.6" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 2 77 | } 78 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/.ipynb_checkpoints/11_Python的time模块简单使用-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "今天,我们在Python中演示一下time模块的常用方法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "time.sleep(10)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "0\n", 42 | "1\n", 43 | "2\n", 44 | "3\n", 45 | "4\n", 46 | "5\n", 47 | "6\n", 48 | "7\n", 49 | "8\n", 50 | "9\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "for i in range(0, 10):\n", 56 | " print(i)\n", 57 | " time.sleep(1)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "首先,我们导入time模块\n", 65 | "\n", 66 | "我们要讲的第一个方法就是sleep()方法\n", 67 | "\n", 68 | "sleep就是睡觉休眠的意思,意味着执行的时候,系统休眠等待一会,不做其他操作\n", 69 | "\n", 70 | "当你运行以上代码,会发现隔一段时间打印一个数字\n", 71 | "\n", 72 | "而sleep()的括号中给出休眠时间,单位是秒\n", 73 | "\n", 74 | "常用的time模块下的方法还有:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 未完待续!" 
82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/09_Python文件操作.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "文件是存储在外部介质的数据集合,通常可以长久保存(前提是这个介质不易损坏)\n", 8 | "\n", 9 | "通俗点说,文件就是存放数据的地方\n", 10 | "\n", 11 | "**绝对路径与相对路径**\n", 12 | "\n", 13 | "通常,我们使用电脑的时候,例如编写了一段代码,我们要把这段代码保存,方便下次使用\n", 14 | "\n", 15 | "你可能会把这段代码保存在硬盘某个位置" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "如果在Python中,我们要打开这个文件操作,该怎么操作呢?\n", 23 | "\n", 24 | "所以要打开这个文件操作也需要3个步骤:\n", 25 | "\n", 26 | "1.找出文件存放的路径,打开文件\n", 27 | "\n", 28 | "2.对文件修改操作\n", 29 | "\n", 30 | "3.关闭文件\n", 31 | "\n", 32 | "说到找出文件的存放路径,我们就必须讲讲绝对路径和相对路径的概念\n", 33 | "\n", 34 | "**绝对路径**\n", 35 | "\n", 36 | "绝对路径指的是从最初的硬盘开始一直进入到文件位置" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "## 未完待续!" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": true 51 | }, 52 | "outputs": [], 53 | "source": [] 54 | } 55 | ], 56 | "metadata": { 57 | "kernelspec": { 58 | "display_name": "Python 3", 59 | "language": "python", 60 | "name": "python3" 61 | }, 62 | "language_info": { 63 | "codemirror_mode": { 64 | "name": "ipython", 65 | "version": 3 66 | }, 67 | "file_extension": ".py", 68 | "mimetype": "text/x-python", 69 | "name": "python", 70 | "nbconvert_exporter": "python", 71 | "pygments_lexer": "ipython3", 72 | "version": "3.6.6" 73 | } 74 | }, 75 | "nbformat": 4, 76 | "nbformat_minor": 2 77 | } 78 | -------------------------------------------------------------------------------- /00_Python从零开始系列连载/11_Python的time模块简单使用.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "今天,我们在Python中演示一下time模块的常用方法" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "time.sleep(10)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "name": "stdout", 39 | "output_type": "stream", 40 | "text": [ 41 | "0\n", 42 | "1\n", 43 | "2\n", 44 | "3\n", 45 | "4\n", 46 | "5\n", 47 | "6\n", 48 | "7\n", 49 | "8\n", 50 | "9\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "for i in range(0, 10):\n", 56 | " print(i)\n", 57 | " time.sleep(1)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": 
{}, 63 | "source": [ 64 | "首先,我们导入time模块\n", 65 | "\n", 66 | "我们要讲的第一个方法就是sleep()方法\n", 67 | "\n", 68 | "sleep就是睡觉休眠的意思,意味着执行的时候,系统休眠等待一会,不做其他操作\n", 69 | "\n", 70 | "当你运行以上代码,会发现隔一段时间打印一个数字\n", 71 | "\n", 72 | "而sleep()的括号中给出休眠时间,单位是秒\n", 73 | "\n", 74 | "常用的time模块下的方法还有:" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## 未完待续!" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.6" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /02_Python数据结构算法刷题/.ipynb_checkpoints/01_(3n+1)猜想-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### (3n+1)猜想\n", 8 | "\n", 9 | "又名卡拉兹(Callatz)猜想:\n", 10 | "\n", 11 | "对任何一个自然数n,如果它是偶数,那么把它砍掉一半;如果它是奇数,那么把(3n+1)砍掉一半。这样一直反复砍下去,最后一定在某一步得到n=1。卡拉兹在1950年的世界数学家大会上公布了这个猜想,传说当时耶鲁大学师生齐动员,拼命想证明这个貌似很傻很天真的命题,结果闹得学生们无心学业,一心只证(3n+1),以至于有人说这是一个阴谋,卡拉兹是在蓄意延缓美国数学界教学与科研的进展……\n", 12 | "\n", 13 | "我们今天的题目不是证明卡拉兹猜想,而是对给定的任一不超过1000的正整数n,简单地数一下,需要多少步(砍几下)才能得到n=1?\n", 14 | "\n", 15 | "输入格式:每个测试输入包含1个测试用例,即给出自然数n的值。\n", 16 | "\n", 17 | "输出格式:输出从n计算到1需要的步数。\n", 18 | "\n", 19 | "输入样例:\n", 20 | "3\n", 21 | "\n", 22 | "输出样例:\n", 23 | "5\n", 24 | "\n", 25 | "上代码:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "请输入0-1000内的整数: 99\n", 38 | "18\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "n = int(input('请输入0-1000内的整数: '))\n", 44 | "count = 0 \n", 45 | "while n != 1:\n", 46 | " if n%2 == 0:\n", 47 | " n = n/2\n", 48 | " else:\n", 49 | " n = (3*n+1)/2\n", 50 | " count += 1\n", 51 | "print (count)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "**注意点**:\n", 59 | "\n", 60 | "1. input得到的是字符型,需要强制转换为int\n", 61 | "2. 这里不适合用for循环,因为不知道何时循环结束\n", 62 | "3. 
记得初始化步数i=0,并且每次在循环里自增\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.6" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /02_Python数据结构算法刷题/01_(3n+1)猜想.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### (3n+1)猜想\n", 8 | "\n", 9 | "又名卡拉兹(Callatz)猜想:\n", 10 | "\n", 11 | "对任何一个自然数n,如果它是偶数,那么把它砍掉一半;如果它是奇数,那么把(3n+1)砍掉一半。这样一直反复砍下去,最后一定在某一步得到n=1。卡拉兹在1950年的世界数学家大会上公布了这个猜想,传说当时耶鲁大学师生齐动员,拼命想证明这个貌似很傻很天真的命题,结果闹得学生们无心学业,一心只证(3n+1),以至于有人说这是一个阴谋,卡拉兹是在蓄意延缓美国数学界教学与科研的进展……\n", 12 | "\n", 13 | "我们今天的题目不是证明卡拉兹猜想,而是对给定的任一不超过1000的正整数n,简单地数一下,需要多少步(砍几下)才能得到n=1?\n", 14 | "\n", 15 | "输入格式:每个测试输入包含1个测试用例,即给出自然数n的值。\n", 16 | "\n", 17 | "输出格式:输出从n计算到1需要的步数。\n", 18 | "\n", 19 | "输入样例:\n", 20 | "3\n", 21 | "\n", 22 | "输出样例:\n", 23 | "5\n", 24 | "\n", 25 | "上代码:" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "请输入0-1000内的整数: 99\n", 38 | "18\n" 39 | ] 40 | } 41 | ], 42 | "source": [ 43 | "n = int(input('请输入0-1000内的整数: '))\n", 44 | "count = 0 \n", 45 | "while n != 1:\n", 46 | " if n%2 == 0:\n", 47 | " n = n/2\n", 48 | " else:\n", 49 | " n = (3*n+1)/2\n", 50 | " count += 1\n", 51 | "print (count)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "**注意点**:\n", 59 | "\n", 60 | "1. input得到的是字符型,需要强制转换为int\n", 61 | "2. 这里不适合用for循环,因为不知道何时循环结束\n", 62 | "3. 
记得初始化步数i=0,并且每次在循环里自增\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | "file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.6.6" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 2 96 | } 97 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/dytt.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取电影天堂最新电影迅雷下载地址链接信息 3 | 所用模块:requests bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | 9 | url = 'https://www.dy2018.com/html/gndy/dyzz/index.html' 10 | 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 13 | } 14 | 15 | items_list = [] 16 | 17 | html = requests.get(url,headers=headers) 18 | html.encoding = 'gb2312' 19 | data = re.findall('.*?',html_1.text) 27 | #print(data_1[0]) 28 | list_1 = [i[1], url_1, data_1[0]] 29 | 30 | # list_1 = [url_1] 31 | 32 | items_list.append(list_1) 33 | #print (list_1) 34 | 35 | #print ('==========================================================================================================') 36 | 37 | for m in range(2, 298): 38 | url_2 = 'https://www.dy2018.com/html/gndy/dyzz/index_'+str(m)+'.html' 39 | print(url_2) 40 | html_2 = requests.get(url_2,headers=headers) 41 | html_2.encoding = 'gb2312' 42 | data_2 = re.findall('.*?',html_3.text) 50 | #print(data_3[0]) 51 | if len(data_3) < 1: 52 | continue 53 | list_2 = [n[1], url_3, data_3[0]] 54 | # list_2 = [url_3] 55 | 56 | 57 | items_list.append(list_2) 58 | #print (list_2) 59 | #print ('=====================================================================================================') 60 | 61 | df = pd.DataFrame(items_list, columns = ['电影名称','电影网址链接','电影迅雷下载链接']) 62 | 63 | df.to_csv('dytt.csv') -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/电影天堂.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/01_python爬取电影天堂/电影天堂.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/02_python爬取斗罗大陆小说/dldl.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 今日主题:python抓取斗罗大陆最新章节标题信息 3 | 所用模块:requests re bs4 pandas数据分析 4 | ''' 5 | import requests 6 | import re 7 | import pandas as pd 8 | from bs4 import BeautifulSoup #分析网页 获取标签内容 9 | 10 | url = 'https://www.freexs.org/novel/0/896/index.html' 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 14 | } 15 | 16 | items_list = [] 17 | 18 | html = requests.get(url,headers=headers) 19 | html.encoding = 'gb2312' 20 | 21 | data = re.findall('
(.*?)
' 68 | regx = re.compile(reg) 69 | ads = re.findall(regx, str(addresss)) 70 | # print(ads) 71 | # for adds in ads: 72 | # data = adds.split('|') 73 | # print(data) 74 | for itm_url, job_detail, ver_compny, ver_salary, ver_addres in zip(item_url, jobs, compy, salarys, ads): 75 | data = { 76 | 'itme_url': 'http://zhaopin.baidu.com'+itm_url.get('href'), 77 | 'job_detail': job_detail.string, 78 | 'ver_compny': str(ver_compny.string), 79 | 'ver_salary': ver_salary.string, 80 | 'ver_addres': str(ver_addres).split('|'), 81 | } 82 | print(data) 83 | # 插入数据库 84 | ver_job.insert_one(data) # 插入数据库失败 85 | f.write(str(data)) 86 | 87 | 88 | def get_page_source(page_num): 89 | time.sleep(2) 90 | driver.find_element_by_xpath('//*[@id="pagination"]/p/span/a[%s]' % page_num).click() 91 | # //*[@id="pagination"]/p/span/a[1] 为在第一页的按钮 92 | # //*[@id="pagination"]/p/span/a[2] 为第二页的按钮 93 | set_winscroll(driver) 94 | we_data = driver.page_source 95 | return we_data 96 | 97 | f = open('百度招聘前30页杭州.csv', 'a',encoding='utf-8') 98 | # 首页的数据 99 | def getBaiduHangZhouJob(we_data): 100 | parse_html(we_data) 101 | for i in range(1, 50): 102 | if i==1: 103 | we_data = get_page_source(1) 104 | parse_html(we_data) 105 | elif i<=5: 106 | we_data = get_page_source(str(2)) 107 | parse_html(we_data) 108 | else: 109 | we_data = get_page_source(str(3)) 110 | parse_html(we_data) 111 | f.close() 112 | 113 | 114 | if __name__ == '__main__': 115 | getBaiduHangZhouJob(we_data) 116 | # pool = Pool(processes=10) 117 | # pool.map_async(getBaiduHangZhouJob(we_data)) 118 | # pool.close() 119 | # f.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/16_python爬取熊猫直播用户信息/python爬取熊猫直播用户信息.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import requests 4 | import json 5 | import pandas as pd 6 | 7 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d".format(a=range(0,35),b=range(1501946526480,1501946526880)) 8 | 9 | headers = { 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0' 11 | , 12 | 'Cookie': '__guid=96554777.3243119502220345300.1500627276199.6702; smid=608e0bde-ffe2-4251-90ca-2938cabdc074; monitor_count=18' 13 | , 14 | } 15 | 16 | 17 | def getHtml(url): 18 | req = requests.get(url, headers=headers) 19 | print(req.text) 20 | return req.text 21 | 22 | 23 | def printInfos(data): 24 | jsondata = json.loads(data, "utf-8") 25 | # print(jsondata) 26 | itemsinfo = jsondata['data']['items'] 27 | items_list = [] 28 | for pinfo in itemsinfo: 29 | name = pinfo['name'] 30 | person_num = pinfo['person_num'] 31 | nickName = pinfo['userinfo']['nickName'] 32 | lelvel = pinfo['host_level_info'] 33 | lable = pinfo['label'] 34 | cname = pinfo['classification'] 35 | item_list = [name, person_num, nickName, lelvel, label, cname] 36 | items_list.append(item_list) 37 | df = pd.DataFrame(items_list, columns = ['name','person_num','nickName','host_level_info','label','classification']) 38 | df.to_csv('熊猫直播用户信息.csv') 39 | 40 | 41 | def mainStart(): 42 | for n in range(0, 3): 43 | pageindex = 1 + n 44 | pagetime = int(1501946526480 + n) 45 | url = "https://www.panda.tv/live_lists?status=2&order=person_num&token=&pageno=%d&pagenum=120&_=%d"%(pageindex,pagetime) 46 | data = getHtml(url) 47 | printInfos(data) 48 | 49 | mainStart() -------------------------------------------------------------------------------- 
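A note on python爬取熊猫直播用户信息.py above: inside printInfos() the script assigns `lable` and `lelvel` but builds each row from `label`, so it raises a NameError at runtime, and `df.to_csv()` is called once per page, overwriting the CSV on every call. Below is a minimal corrected sketch; it reuses the URL pattern, headers and response fields from the original script and assumes the (now defunct) Panda TV endpoint returned the same JSON layout, so it is illustrative only.

# Minimal corrected sketch — same endpoint, headers and fields as the original script.
import json

import requests
import pandas as pd

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',
}


def fetch_page(pageno, ts):
    # Same query string as the original mainStart(): 120 records per page.
    url = ('https://www.panda.tv/live_lists?status=2&order=person_num'
           '&token=&pageno=%d&pagenum=120&_=%d' % (pageno, ts))
    return requests.get(url, headers=HEADERS, timeout=10).text


def parse_rows(raw):
    # One row per live room: the same six fields the original script collects.
    rows = []
    for info in json.loads(raw)['data']['items']:
        rows.append([
            info['name'],
            info['person_num'],
            info['userinfo']['nickName'],
            info['host_level_info'],
            info['label'],              # one consistent name instead of lable/label
            info['classification'],
        ])
    return rows


def main():
    all_rows = []
    for n in range(3):
        all_rows.extend(parse_rows(fetch_page(1 + n, 1501946526480 + n)))
    df = pd.DataFrame(all_rows, columns=[
        'name', 'person_num', 'nickName', 'host_level_info', 'label', 'classification'])
    df.to_csv('熊猫直播用户信息.csv', index=False)  # write once, after all pages are collected


if __name__ == '__main__':
    main()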
/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl youtx -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl youtx -o items.csv 时以csv格式保存下载数据 3 | 4 | 5 | Scrapy必须背下来的命令: 6 | 1 创建项目: scrapy startproject youtxNanJin 7 | startproject: 表示创建项目 8 | youtxNanJin: 表示创建的项目名 9 | 10 | 2 创建爬虫: scrapy genspider youtx "http://www.youtx.com" 11 | genspider: 表示生成一个爬虫(默认是scrapy.Spider类) 12 | youtx: 表示爬虫名(对应爬虫代码里的 name 参数) 13 | "http://www.youtx.com": 表示允许爬虫爬取的域范围 14 | 15 | 3 执行爬虫: scrapy crawl youtx 16 | crawl: 表示启动一个sc rapy爬虫 17 | youtx: 表示需要启动的爬虫名(对应爬虫代码里的 name 参数) -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = youtxNanJin.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = youtxNanJin 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YoutxnanjinItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 房源名称 17 | homeName = scrapy.Field() 18 | # 房源链接 19 | homeLine = scrapy.Field() 20 | # 房租单价 21 | homeSinglePrice = scrapy.Field() 22 | # 房租地址 23 | homeAddress = scrapy.Field() 24 | # 房租近期信息 25 | homeDetai = scrapy.Field() 26 | # 满七天价格 27 | homeSeven = scrapy.Field() 28 | # 满30天价格 29 | homeThirth = scrapy.Field() 30 | 31 | # 房东 32 | homePerson = scrapy.Field() 33 | # 房东头像 34 | homePersonImg = scrapy.Field() 35 | # 房东头像链接 36 | homePersonLink = scrapy.Field() 37 | 38 | # 房子大图 39 | homePicBg = scrapy.Field() 40 | # 房子大图链接 41 | homePicLink = scrapy.Field() 42 | 43 | # 品牌店铺信息 44 | # homePinPai = scrapy.Field() 45 | # 明星房东 46 | # homeStarrPerson = scrapy.Field() 47 | 48 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YoutxnanjinSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | from scrapy.conf import settings 9 | import pymongo 10 | 11 | 12 | class YoutxnanjinPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class YouTXMongo(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | postItem = dict(item) 25 | self.post.insert(postItem) 26 | return item 27 | 28 | # 写入json文件 29 | class JsonWritePipline(object): 30 | def __init__(self): 31 | self.file = open('游天下南京.json','w',encoding='utf-8') 32 | 33 | def process_item(self,item,spider): 34 | line = json.dumps(dict(item),ensure_ascii=False)+"\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self,spider): 39 | self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for youtxNanJin project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'youtxNanJin' 13 | 14 | SPIDER_MODULES = ['youtxNanJin.spiders'] 15 | NEWSPIDER_MODULE = 'youtxNanJin.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'youtxNanJin (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "YouTianXia" # 库名 30 | MONGO_COLL = "house_nanjin" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'youtxNanJin.middlewares.YoutxnanjinSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'youtxNanJin.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'youtxNanJin.pipelines.YoutxnanjinPipeline': 300, 77 | 'youtxNanJin.pipelines.YouTXMongo': 300, 78 | 'youtxNanJin.pipelines.JsonWritePipline': 300, 79 | } 80 | 81 | # Enable and configure the AutoThrottle extension (disabled by default) 82 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 83 | #AUTOTHROTTLE_ENABLED = True 84 | # The initial download delay 85 | #AUTOTHROTTLE_START_DELAY = 5 86 | # The maximum download delay to be set in case of high latencies 87 | #AUTOTHROTTLE_MAX_DELAY = 60 88 | # The average number of requests Scrapy should be sending in parallel to 89 | # each remote server 90 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 91 | # Enable showing throttling stats for every response received: 92 | #AUTOTHROTTLE_DEBUG = False 93 | 94 | # Enable and configure HTTP caching (disabled by default) 95 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 96 | #HTTPCACHE_ENABLED = True 97 | #HTTPCACHE_EXPIRATION_SECS = 0 98 | #HTTPCACHE_DIR = 'httpcache' 99 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 100 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 101 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/__pycache__/youtxNanJin_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/17_scrapy爬取游天下南京短租房信息并存入mongodb数据库/youtxNanJin/youtxNanJin/spiders/youtxNanJin_spider.py: -------------------------------------------------------------------------------- 1 | #encoding=utf8 2 | import scrapy 3 | from youtxNanJin.items import YoutxnanjinItem 4 | 5 | class NanJinDefault(scrapy.Spider): 6 | name = 'youtx' 7 | allowed_domains = ['youtx.com'] 8 | start_urls = ["http://www.youtx.com/nanjing/longrent1-page{}".format(n) for n in range(0,6)] 9 | def parse(self, response): 10 | # print(response.body) 11 | node_list = response.xpath("//div[@class='duanzu houseList']/ul/li[@class='clearfix']") 12 | # print(node_list) 13 | for node in node_list: 14 | item = YoutxnanjinItem() 15 | homeName = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/text()").extract() 16 | homeLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/h3/a/@href").extract() 17 | print(homeName) 18 | print(homeLink) 19 | 20 | # 单日价格 21 | homeSinglePrice = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/span/span[@class='housePrice']/text()").extract() 22 | print(homeSinglePrice) 23 | 24 | # 获取房源地址 25 | homeAddress = node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='clearfix mt5']/text()").extract() 26 | # 房租信息 27 | homeDesc =node.xpath("./div[@class='houseInfo clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/text()").extract() 28 | homeDesc2 =node.xpath("./div[@class='houseInfo 
clearfix']/div[@class='houseInfo-left mt2']/p[@class='mt5']/span[2]/text()").extract() 29 | print(homeAddress) 30 | print(homeDesc) 31 | print(homeDesc2) 32 | 33 | # 满30天的信息 34 | homeThrty = node.xpath("./div[@class='houseInfo clearfix']/div[@class='house-tit clearfix']/div[@class='house-price mt9']/div[@class='mix12_5']/div[@class='discount']/div[@class='discount-price']/span//text()").extract() 35 | print(homeThrty) 36 | # 房东信息 37 | homePerson = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/text()").extract() 38 | # 房东链接 39 | homePersonLink = node.xpath("./div[@class='houseInfo clearfix']/div[@class='agentInfo mt16']/p[1]/a/@href").extract() 40 | print(homePerson) 41 | print(homePersonLink) 42 | 43 | # 房源大图图片 44 | homeBigPic = node.xpath("./div[@class='house-img']/a[1]/img/@src").extract() 45 | homeBigPicLink = node.xpath("./div[@class='house-img']/a[1]/@href").extract() 46 | print(homeBigPic) 47 | print(homeBigPicLink) 48 | # 房东头像信息 49 | personPic = node.xpath("./div[@class='house-img']/a[2]/img/@src").extract() 50 | # 房东头像链接地址 51 | personPicLink = node.xpath("./div[@class='house-img']/a[2]/img/@href").extract() 52 | 53 | print(personPic) 54 | print(homePersonLink) 55 | item['homeName'] ="".join(homeName) 56 | item['homeLine'] ="".join(homeLink) 57 | item['homeSinglePrice'] ="".join(homeSinglePrice) 58 | item['homeAddress'] ="".join(homeAddress) 59 | item['homeDetai'] ="".join(homeDesc)+"".join(homeDesc2) 60 | # 这里的值暂时没有取出来 61 | item['homeSeven'] ="".join(homeThrty) 62 | item['homeThirth'] ="".join(homeThrty) 63 | 64 | item['homePerson'] ="".join(homePerson) 65 | item['homePersonImg'] ="".join(personPic) 66 | item['homePersonLink'] ="".join(homePersonLink) 67 | item['homePicBg'] ="".join(homeBigPic) 68 | item['homePicLink'] ="".join(homeBigPicLink) 69 | yield item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl docNet -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl docNet -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ChinadoctornetItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | 15 | # 爬取中国医学人才网的条目(共5个条目) 16 | # 医院名称 17 | hospitalName = scrapy.Field() 18 | # 医院规模 19 | hospitalSize = scrapy.Field() 20 | # 医院所在地 21 | hospitalAddress = scrapy.Field() 22 | # 医院科目 23 | hospitalDesc = scrapy.Field() 24 | # pass 25 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ChinadoctornetSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 
35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | # import json 8 | 9 | class ChinadoctornetPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | # class JsonWriterPipeline(object): 15 | # def __init__(self): 16 | # self.file = open('中国医学人才网招聘最新招聘专栏2.json', 'w', encoding='utf-8') 17 | 18 | # def process_item(self, item, spider): 19 | # line = json.dumps(dict(item), ensure_ascii=False) + "\n" 20 | # self.file.write(line) 21 | # return item 22 | 23 | # def spider_closed(self, spider): 24 | # self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for chinadoctornet project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'chinadoctornet' 13 | 14 | SPIDER_MODULES = ['chinadoctornet.spiders'] 15 | NEWSPIDER_MODULE = 'chinadoctornet.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'chinadoctornet (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'chinadoctornet.middlewares.ChinadoctornetSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'chinadoctornet.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | # ITEM_PIPELINES = { 68 | # # 'chinadoctornet.pipelines.ChinadoctornetPipeline': 300, 69 | # 'chinadoctornet.pipelines.JsonWritePipline': 300, 70 | # } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
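A note on the chinadoctornet project above: its README relies on Scrapy's feed exports (`scrapy crawl docNet -o items.json` / `-o items.csv`), which is why both the pipeline class in pipelines.py and the ITEM_PIPELINES entry in settings.py are commented out. If they are re-enabled, note that the commented class is named `JsonWriterPipeline` while the commented setting points at `chinadoctornet.pipelines.JsonWritePipline`; the two names must match. A small sketch of a consistent pair, using Scrapy's documented open_spider/close_spider hooks, might look like this:

# pipelines.py — sketch of the JSON-writing pipeline that is commented out above
import json


class JsonWriterPipeline(object):
    def open_spider(self, spider):
        # Opened once when the spider starts.
        self.file = open('中国医学人才网招聘最新招聘专栏.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()


# settings.py — the dotted path must use the same class name as pipelines.py
ITEM_PIPELINES = {
    'chinadoctornet.pipelines.JsonWriterPipeline': 300,
}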
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/__pycache__/chinadoctornet_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/chinadoctornet/spiders/chinadoctornet_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from chinadoctornet.items import ChinadoctornetItem 4 | 5 | 6 | class ChinaDocNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'docNet' 9 | # 爬取域名的范围 10 | allowed_domains = ['yixuezp.com'] 11 | # 爬虫第一个url地址 12 | start_urls = ['http://www.yixuezp.com/zhaopin?page={}'.format(n) for n in range(0, 464)] # 463 13 | 14 | def parse(self, response): 15 | # 医院name 16 | node_list = response.xpath("//div[@class='newsjob']/ul/li") 17 | items = [] 18 | for node in node_list: 19 | item = ChinadoctornetItem() 20 | hospitalName = node.xpath("./a/text()").extract() 21 | hospitalSize = node.xpath("./span[1]/text()").extract() 22 | hospitalAddress = node.xpath("./span[2]/text()").extract() 23 | hospitalDesc = node.xpath("./p/a/text()").extract() 24 | 25 | item['hospitalName'] = hospitalName 26 | item['hospitalSize'] = hospitalSize 27 | item['hospitalAddress'] = hospitalAddress 28 | item['hospitalDesc'] = hospitalDesc 29 | items.append(item) 30 | # return items # 如果直接return的话,一页数据只会返回一条数据 31 | yield item #用yield 的话,可以交给下载器,继续执行下一步操作。 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = chinadoctornet.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = chinadoctornet 12 | 
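In chinadoctornet_spider.py above, parse() both appends every item to an `items` list and yields each item, so the list is built but never used. The inline comment suggests that returning directly would hand back only one record per page; in Scrapy a callback may return any iterable of items, so returning the complete list after the loop also delivers every item to the engine — the one-record symptom usually comes from a `return item` placed inside the loop, which ends the callback on the first node. Either of the following trimmed forms (alternative bodies for the spider's parse method, same XPaths as the original) is sufficient on its own:

# Form 1: stream items to the engine as they are parsed (what the spider above does).
def parse(self, response):
    for node in response.xpath("//div[@class='newsjob']/ul/li"):
        item = ChinadoctornetItem()
        item['hospitalName'] = node.xpath("./a/text()").extract()
        item['hospitalSize'] = node.xpath("./span[1]/text()").extract()
        item['hospitalAddress'] = node.xpath("./span[2]/text()").extract()
        item['hospitalDesc'] = node.xpath("./p/a/text()").extract()
        yield item

# Form 2: collect the page's items and return them as one iterable.
def parse(self, response):
    items = []
    for node in response.xpath("//div[@class='newsjob']/ul/li"):
        item = ChinadoctornetItem()
        item['hospitalName'] = node.xpath("./a/text()").extract()
        items.append(item)
    return items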
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/18_scrapy爬取中国医学人才网信息并以json格式保存/chinadoctornet/中国医学人才网招聘最新招聘专栏.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/README.txt: -------------------------------------------------------------------------------- 1 | 输入:scrapy crawl doubanMovie -o items.json 时以json格式保存下载数据 2 | 输入:scrapy crawl doubanMovie -o items.csv 时以csv格式保存下载数据 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanmovieItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 电影名字 17 | name = scrapy.Field() 18 | # 电影信息 19 | info = scrapy.Field() 20 | # 评分 21 | rating = scrapy.Field() 22 | # 评论人数 23 | num = 
scrapy.Field() 24 | # 经典语句 25 | quote = scrapy.Field() 26 | # 电影图片 27 | img_url = scrapy.Field() 28 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanmovieSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanmoviePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for doubanmovie project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'doubanmovie' 13 | 14 | SPIDER_MODULES = ['doubanmovie.spiders'] 15 | NEWSPIDER_MODULE = 'doubanmovie.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'doubanmovie (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:45.0) Gecko/20100101 Firefox/45.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'doubanmovie.middlewares.DoubanmovieSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'doubanmovie.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | #ITEM_PIPELINES = { 69 | # 'doubanmovie.pipelines.DoubanmoviePipeline': 300, 70 | #} 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 
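Two notes on the doubanmovie project, whose settings end above and whose spider follows below. First, ITEM_PIPELINES is left commented out here, so persistence relies on the feed-export commands given in the project's README (scrapy crawl doubanMovie -o items.json, or -o items.csv for CSV). Second, the spider below reads the next-page link with .extract()[0]; if the last Top250 page carries no "next" link, that indexing raises IndexError before the if next_page check ever runs. A defensive variant, offered only as a sketch and not as the repository's code, is:

# sketch: pagination that tolerates a missing "next" link on the last page
next_page = selector.xpath('//span[@class="next"]/a/@href').extract_first()
if next_page:
    yield scrapy.Request('https://movie.douban.com/top250' + next_page, callback=self.parse)

(The spider also names its domain filter allowed_domain; Scrapy's attribute is allowed_domains, so as written the off-site filter is simply ignored.)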
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/__pycache__/doubanmovie_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/doubanmovie/spiders/doubanmovie_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from doubanmovie.items import DoubanmovieItem 3 | 4 | class Movie(scrapy.Spider): 5 | # 爬虫唯一标识符 6 | name = 'doubanMovie' 7 | # 爬取域名 8 | allowed_domain = ['movie.douban.com'] 9 | # 爬取页面地址 10 | start_urls = ['https://movie.douban.com/top250'] 11 | 12 | def parse(self, response): 13 | selector = scrapy.Selector(response) 14 | # 解析出各个电影 15 | movies = selector.xpath('//div[@class="item"]') 16 | # 存放电影信息 17 | item = DoubanmovieItem() 18 | 19 | for movie in movies: 20 | 21 | # 电影各种语言名字的列表 22 | titles = movie.xpath('.//span[@class="title"]/text()').extract() 23 | # 将中文名与英文名合成一个字符串 24 | name = '' 25 | for title in titles: 26 | name += title.strip() 27 | item['name'] = name 28 | 29 | # 电影信息列表 30 | infos = movie.xpath('.//div[@class="bd"]/p/text()').extract() 31 | # 电影信息合成一个字符串 32 | fullInfo = '' 33 | for info in infos: 34 | fullInfo += info.strip() 35 | item['info'] = fullInfo 36 | # 提取评分信息 37 | item['rating'] = movie.xpath('.//span[@class="rating_num"]/text()').extract()[0].strip() 38 | # 提取评价人数 39 | item['num'] = movie.xpath('.//div[@class="star"]/span[last()]/text()').extract()[0].strip()[:-3] 40 | # 提取经典语句,quote可能为空 41 | quote = movie.xpath('.//span[@class="inq"]/text()').extract() 42 | if quote: 43 | quote = quote[0].strip() 44 | item['quote'] = quote 45 | # 提取电影图片 46 | item['img_url'] = movie.xpath('.//img/@src').extract()[0] 47 | 48 | yield item 49 | 50 | next_page = selector.xpath('//span[@class="next"]/a/@href').extract()[0] 51 | url = 'https://movie.douban.com/top250' + next_page 52 | if next_page: 53 | yield scrapy.Request(url, callback=self.parse) 54 | 55 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/19_scrapy框架爬取豆瓣电影top250信息/doubanmovie/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = doubanmovie.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = doubanmovie 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MakedreamItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | 16 | # 文章标题 17 | articleTitle = scrapy.Field() 18 | # 
文章标题url 19 | articleUrl = scrapy.Field() 20 | # 文章描述 21 | articleDesc = scrapy.Field() 22 | # 文章发布时间 23 | articlePublic = scrapy.Field() 24 | # 文章类型 25 | articleType = scrapy.Field() 26 | # 文章标签 27 | articleTag = scrapy.Field() 28 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MakedreamSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import json 8 | import pymongo 9 | from scrapy.conf import settings 10 | 11 | class MakedreamPipeline(object): 12 | def process_item(self, item, spider): 13 | return item 14 | 15 | 16 | class DreamMongo(object): 17 | def __init__(self): 18 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 19 | self.db = self.client[settings['MONGO_DB']] 20 | self.post = self.db[settings['MONGO_COLL']] 21 | 22 | def process_item(self, item, spider): 23 | postItem = dict(item) 24 | self.post.insert(postItem) 25 | return item 26 | 27 | 28 | # 写入json文件类 29 | class JsonWritePipeline(object): 30 | def __init__(self): 31 | self.file = open('织梦网其他编程.json', 'w', encoding='utf-8') 32 | 33 | def process_item(self, item, spider): 34 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 35 | self.file.write(line) 36 | return item 37 | 38 | def spider_closed(self, spider): 39 | self.file.close() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for makedream project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'makedream' 13 | 14 | SPIDER_MODULES = ['makedream.spiders'] 15 | NEWSPIDER_MODULE = 'makedream.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'makedream (+http://www.yourdomain.com)' 20 | # 配置mongoDB 21 | MONGO_HOST = "127.0.0.1" # 主机IP 22 | MONGO_PORT = 27017 # 端口号 23 | MONGO_DB = "DreamDB" # 库名 24 | MONGO_COLL = "Dream_info" # collection 25 | 26 | 27 | 28 | # Obey robots.txt rules 29 | ROBOTSTXT_OBEY = False 30 | 31 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 32 | #CONCURRENT_REQUESTS = 32 33 | 34 | # Configure a delay for requests for the same website (default: 0) 35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 36 | # See also autothrottle settings and docs 37 | #DOWNLOAD_DELAY = 3 38 | # The download delay setting will honor only one of: 39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 40 | #CONCURRENT_REQUESTS_PER_IP = 16 41 | 42 | # Disable cookies (enabled by default) 43 | # COOKIES_ENABLED = False 44 | 45 | # Disable Telnet Console (enabled by default) 46 | #TELNETCONSOLE_ENABLED = False 47 | 48 | # Override the default request headers: 49 | #DEFAULT_REQUEST_HEADERS = { 50 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 51 | # 'Accept-Language': 'en', 52 | #} 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'makedream.middlewares.MakedreamSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'makedream.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | # 'makedream.pipelines.MakedreamPipeline': 300, 76 | 'makedream.pipelines.JsonWritePipeline':300, 77 | 'makedream.pipelines.DreamMongo':300 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 
99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/__pycache__/makedream_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/makedream/spiders/makedream_spider.py: -------------------------------------------------------------------------------- 1 | # encoding=utf8 2 | import scrapy 3 | from makedream.items import MakedreamItem 4 | 5 | 6 | class DramingNet(scrapy.Spider): 7 | # 启动爬虫的名称 8 | name = 'dreaming' 9 | # 爬虫的域范围 10 | allowed_domains = ['zhimengzhe.com'] 11 | # 爬虫的第一个url 12 | start_urls = ['http://www.zhimengzhe.com/bianchengjiaocheng/qitabiancheng/index_{}.html'.format(n) for n in 13 | range(0, 1466)] 14 | 15 | # 爬取结果解析 16 | def parse(self, response): 17 | base_url = 'http://www.zhimengzhe.com' 18 | # print(response.body) 19 | node_list = response.xpath("//ul[@class='list-unstyled list-article']/li") 20 | for node in node_list: 21 | item = MakedreamItem() 22 | nextNode = node.xpath("./div[@class='pull-left ltxt w658']") 23 | print('*' * 30) 24 | title = nextNode.xpath('./h3/a/text()').extract() 25 | link = nextNode.xpath('./h3/a/@href').extract() 26 | desc = nextNode.xpath('./p/text()').extract() 27 | 28 | # 创建时间,类型,标签 29 | publicTime = nextNode.xpath("./div[@class='tagtime']/span[1]/text()").extract() 30 | publicType = nextNode.xpath("./div[@class='tagtime']/span[2]/a/text()").extract() 31 | publicTag = nextNode.xpath("./div[@class='tagtime']/span[3]/a/text()").extract() 32 | # node 33 | titleLink = base_url + ''.join(link) 34 | item['articleTitle'] = title 35 | # 文章标题url 36 | item['articleUrl'] = titleLink 37 | # 文章描述 38 | item['articleDesc'] = desc 39 | # 文章发布时间 40 | item['articlePublic'] = publicTime 41 | # 文章类型 42 | item['articleType'] = publicType 43 | # 文章标签 44 | item['articleTag'] = publicTag 45 | yield item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/20_scrapy爬取织梦者网站信息并存入mongodb数据库/makedream/scrapy.cfg: 
-------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = makedream.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = makedream 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ComentsAnaylst.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : commentsAnaylst.py(再见前任3的影评f词云) 4 | 5 | import matplotlib.pyplot as plt 6 | from PIL import Image 7 | from wordcloud import WordCloud 8 | import jieba 9 | import numpy as np 10 | #读取txt格式的文本内容 11 | text_from_file_with_apath = open('douban.txt','rb').read() 12 | 13 | #使用jieba进行分词,并对分词的结果以空格隔开 14 | wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all = True) 15 | wl_space_split = " ".join(wordlist_after_jieba) 16 | 17 | #对分词后的文本生成词云 18 | # my_wordcloud = WordCloud().generate(wl_space_split) 19 | 20 | font = r'C:\Windows\Fonts\simfang.ttf' 21 | mask = np.array(Image.open('ciyun.jpg')) 22 | wc = WordCloud(mask=mask,max_words=3000,collocations=False, font_path=font, width=5800, height=2400, margin=10,background_color='black').generate(wl_space_split) 23 | default_colors = wc.to_array() 24 | plt.title("QR 3") 25 | plt.imshow(wc) 26 | plt.axis("off") 27 | plt.savefig("ciyun.png") 28 | plt.show() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.jpg -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/ciyun.png -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/21_python爬取豆瓣电影前任3评论(词云显示)/douban_qianren3.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # @Time : 2018/4/25 11:15 3 | # @File : test_douban_qianren3.py(再见前任3的影评) 4 | 5 | import csv 6 | import requests 7 | from lxml import etree 8 | import time 9 | 10 | 11 | url = 'https://movie.douban.com/subject/26662193/comments?start=0&limit=20&sort=new_score&status=P&percent_type=' 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36', 15 | 'Cookie': 'gr_user_id=ffdf2f63-ec37-49b5-99e8-0e0d28741172; bid=qh9RXgIGopg; viewed="26826540_24703171"; ap=1; ll="118172"; ct=y; _vwo_uuid_v2=8C5B24903B1D1D3886FE478B91C5DE97|7eac18658e7fecbbf3798b88cfcf6113; 
_pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1522129522%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DdnHqCRiT1HlhToCp0h1cpdyV8rB9f_OfOvJhjRPO3p1jrl764LGvi7gbYSdskDMh%26wd%3D%26eqid%3De15db1bb0000e3cd000000045ab9b6fe%22%5D; _pk_id.100001.4cf6=4e61f4192b9486a8.1485672092.10.1522130672.1522120744.; _pk_ses.100001.4cf6=*'} 16 | 17 | 18 | def get_html(current_url): 19 | time.sleep(2) 20 | r = requests.get(current_url, headers=headers) 21 | r.raise_for_status() 22 | return etree.HTML(r.text) 23 | 24 | 25 | def parse_html(content,writer): 26 | links = content.xpath("//*[@class='comment-item']") 27 | for link in links: 28 | content = link.xpath("./div[@class='comment']/p/text()")[0].strip() 29 | author = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/a/text()")[0].strip() 30 | time = link.xpath("./div[@class='comment']/h3/span[@class='comment-info']/span[@class='comment-time ']/text()")[ 31 | 0].strip() 32 | is_useful = link.xpath("./div[@class='comment']/h3/span[@class='comment-vote']/span[@class='votes']/text()")[0] 33 | print('content:', content) 34 | print('time:', time) 35 | print('is_useful:', is_useful) 36 | # detail = (author, time, is_useful, content) 37 | detail = (is_useful,content) 38 | writer.writerow(detail) 39 | 40 | 41 | if __name__ == '__main__': 42 | with open('douban.txt', 'a+', encoding='utf-8', newline='') as csvf: 43 | writer = csv.writer(csvf) 44 | writer.writerow(('作者', '时间', '有用数', '内容')) 45 | for page in range(0, 260, 20): 46 | url = 'https://movie.douban.com/subject/26662193/comments?start={}&limit=20&sort=new_score&status=P&percent_type='.format( 47 | page) 48 | r = get_html(url) 49 | parse_html(r,writer) -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/22_python爬取Bilibili用户信息并导入mysql数据库/bilibili_user_info.sql: -------------------------------------------------------------------------------- 1 | # ************************************************************ 2 | # Sequel Pro SQL dump 3 | # Version 4135 4 | # 5 | # http://www.sequelpro.com/ 6 | # http://code.google.com/p/sequel-pro/ 7 | # 8 | # Host: 127.0.0.1 (MySQL 5.1.63) 9 | # Database: sunshine 10 | # Generation Time: 2018-04-26 13:33:32 +0000 11 | # ************************************************************ 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 21 | 22 | 23 | # Dump of table bilibili_user_info 24 | # ------------------------------------------------------------ 25 | 26 | CREATE TABLE `bilibili_user_info` ( 27 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT, 28 | `mid` varchar(11) DEFAULT NULL, 29 | `name` varchar(45) DEFAULT NULL, 30 | `sex` varchar(11) DEFAULT NULL, 31 | `face` varchar(200) DEFAULT NULL, 32 | `coins` int(11) DEFAULT NULL, 33 | `spacesta` int(11) DEFAULT NULL, 34 | `birthday` varchar(45) DEFAULT NULL, 35 | `place` varchar(45) DEFAULT NULL, 36 | `description` varchar(45) DEFAULT NULL, 37 | `article` int(11) DEFAULT NULL, 38 | `following` int(11) DEFAULT NULL, 39 | `fans` int(11) DEFAULT NULL, 40 | `playnum` int(30) DEFAULT NULL, 41 | `sign` varchar(300) DEFAULT NULL, 42 | 
`level` int(11) DEFAULT NULL, 43 | `exp` int(11) DEFAULT NULL, 44 | PRIMARY KEY (`id`) 45 | ) ENGINE=MyISAM DEFAULT CHARSET=utf8; 46 | 47 | 48 | 49 | 50 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 51 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 52 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 53 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 54 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 55 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 56 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/README.md: -------------------------------------------------------------------------------- 1 | #### 这是一个爬取网易云音乐的所有的歌曲的评论数的爬虫。 2 | 3 | 以下为主要思路: 4 | 5 | - 1. 爬取所有的歌手信息([artists.py]); 6 | - 2. 根据上一步爬取到的歌手信息去爬取所有的专辑信息([album_by_artist.py]); 7 | - 3. 根据专辑信息爬取所有的歌曲信息([music_by_album.py]); 8 | - 4. 根据歌曲信息爬取其评论条数([comments_by_music.py]) 9 | - 5. 数据库相关的语句都存放于([sql.py])中。 -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/album_by_artist.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据上一步获取的歌手的 ID 来用于获取所有的专辑 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Album(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; _ga=GA1.2.1405085820.1476521280; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; JSESSIONID-WYYY=189f31767098c3bd9d03d9b968c065daf43cbd4c1596732e4dcb471beafe2bf0605b85e969f92600064a977e0b64a24f0af7894ca898b696bd58ad5f39c8fce821ec2f81f826ea967215de4d10469e9bd672e75d25f116a9d309d360582a79620b250625859bc039161c78ab125a1e9bf5d291f6d4e4da30574ccd6bbab70b710e3f358f%3A1476594130342; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476588849.1476592408.6; __utmb=94650624.11.10.1476592408; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_albums(self, artist_id): 27 | params = {'id': artist_id, 'limit': '200'} 28 | # 获取歌手个人主页 29 | r = requests.get('http://music.163.com/artist/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | albums = body.find_all('a', attrs={'class': 'tit f-thide s-fc0'}) # 获取所有专辑 36 | 37 | for album in albums: 38 | albume_id = album['href'].replace('/album?id=', '') 39 | sql.insert_album(albume_id, artist_id) 40 | 
41 | 42 | if __name__ == '__main__': 43 | artists = sql.get_all_artist() 44 | my_album = Album() 45 | for i in artists: 46 | try: 47 | my_album.save_albums(i['ARTIST_ID']) 48 | # print(i) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(str(i) + ': ' + str(e)) 52 | time.sleep(5) 53 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/artists.py: -------------------------------------------------------------------------------- 1 | """ 2 | 获取所有的歌手信息 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from music_163 import sql 7 | 8 | headers = { 9 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 10 | 'Accept-Encoding': 'gzip, deflate, sdch', 11 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 12 | 'Cache-Control': 'no-cache', 13 | 'Connection': 'keep-alive', 14 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; NTES_SESS=Fa2uk.YZsGoj59AgD6tRjTXGaJ8_1_4YvGfXUkS7C1NwtMe.tG1Vzr255TXM6yj2mKqTZzqFtoEKQrgewi9ZK60ylIqq5puaG6QIaNQ7EK5MTcRgHLOhqttDHfaI_vsBzB4bibfamzx1.fhlpqZh_FcnXUYQFw5F5KIBUmGJg7xdasvGf_EgfICWV; S_INFO=1476597594|1|0&80##|hourui93; NETEASE_AUTH_SOURCE=space; NETEASE_AUTH_USERNAME=hourui93; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=cbd082d2ce2cffbcd5c085d8bf565a95aee3173ddbbb00bfa270950f93f1d8bb4cb55a56a4049fa8c828373f630c78f4a43d6c3d252c4c44f44b098a9434a7d8fc110670a6e1e9af992c78092936b1e19351435ecff76a181993780035547fa5241a5afb96e8c665182d0d5b911663281967d675ff2658015887a94b3ee1575fa1956a5a%3A1476607977016; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476595468.1476606177.8; __utmb=94650624.20.10.1476606177; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 15 | 'DNT': '1', 16 | 'Host': 'music.163.com', 17 | 'Pragma': 'no-cache', 18 | 'Referer': 'http://music.163.com/', 19 | 'Upgrade-Insecure-Requests': '1', 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 21 | } 22 | 23 | 24 | def save_artist(group_id, initial): 25 | params = {'id': group_id, 'initial': initial} 26 | r = requests.get('http://music.163.com/discover/artist/cat', params=params) 27 | 28 | # 网页解析 29 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 30 | body = soup.body 31 | 32 | hot_artists = body.find_all('a', attrs={'class': 'msk'}) 33 | artists = body.find_all('a', attrs={'class': 'nm nm-icn f-thide s-fc0'}) 34 | 35 | for artist in hot_artists: 36 | artist_id = artist['href'].replace('/artist?id=', '').strip() 37 | artist_name = artist['title'].replace('的音乐', '') 38 | try: 39 | sql.insert_artist(artist_id, artist_name) 40 | except Exception as e: 41 | # 打印错误日志 42 | print(e) 43 | 44 | for artist in artists: 45 | artist_id = artist['href'].replace('/artist?id=', '').strip() 46 | artist_name = artist['title'].replace('的音乐', '') 47 | try: 48 | sql.insert_artist(artist_id, artist_name) 49 | except Exception as e: 50 | # 打印错误日志 51 | print(e) 52 | 53 | 54 | gg = 4003 55 | 56 | save_artist(gg, 0) 57 | for i in range(65, 
91): 58 | save_artist(gg, i) 59 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/music_by_album.py: -------------------------------------------------------------------------------- 1 | """ 2 | 根据专辑 ID 获取到所有的音乐 ID 3 | """ 4 | import requests 5 | from bs4 import BeautifulSoup 6 | import time 7 | from music_163 import sql 8 | 9 | 10 | class Music(object): 11 | headers = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 15 | 'Cache-Control': 'no-cache', 16 | 'Connection': 'keep-alive', 17 | 'Cookie': '_ntes_nnid=7eced19b27ffae35dad3f8f2bf5885cd,1476521011210; _ntes_nuid=7eced19b27ffae35dad3f8f2bf5885cd; usertrack=c+5+hlgB7TgnsAmACnXtAg==; Province=025; City=025; NTES_PASSPORT=6n9ihXhbWKPi8yAqG.i2kETSCRa.ug06Txh8EMrrRsliVQXFV_orx5HffqhQjuGHkNQrLOIRLLotGohL9s10wcYSPiQfI2wiPacKlJ3nYAXgM; P_INFO=hourui93@163.com|1476523293|1|study|11&12|jis&1476511733&mail163#jis&320100#10#0#0|151889&0|g37_client_check&mailsettings&mail163&study&blog|hourui93@163.com; _ga=GA1.2.1405085820.1476521280; JSESSIONID-WYYY=fb5288e1c5f667324f1636d020704cab2f27ee915622b114f89027cbf60c38be2af6b9cbef2223c1f2581e3502f11b86efd60891d6f61b6f783c0d55114f8269fa801df7352f5cc4c8259876e563a6bd0212b504a8997723a0593b21d5b3d9076d4fa38c098be68e3c5d36d342e4a8e40c1f73378cec0b5851bd8a628886edbdd23a7093%3A1476623819662; _iuqxldmzr_=25; __utma=94650624.1038096298.1476521011.1476610320.1476622020.10; __utmb=94650624.14.10.1476622020; __utmc=94650624; __utmz=94650624.1476521011.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)', 18 | 'DNT': '1', 19 | 'Host': 'music.163.com', 20 | 'Pragma': 'no-cache', 21 | 'Referer': 'http://music.163.com/', 22 | 'Upgrade-Insecure-Requests': '1', 23 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 24 | } 25 | 26 | def save_music(self, album_id): 27 | params = {'id': album_id} 28 | # 获取专辑对应的页面 29 | r = requests.get('http://music.163.com/album', headers=self.headers, params=params) 30 | 31 | # 网页解析 32 | soup = BeautifulSoup(r.content.decode(), 'html.parser') 33 | body = soup.body 34 | 35 | musics = body.find('ul', attrs={'class': 'f-hide'}).find_all('li') # 获取专辑的所有音乐 36 | 37 | for music in musics: 38 | music = music.find('a') 39 | music_id = music['href'].replace('/song?id=', '') 40 | music_name = music.getText() 41 | sql.insert_music(music_id, music_name, album_id) 42 | 43 | 44 | if __name__ == '__main__': 45 | albums = sql.get_all_album() 46 | my_music = Music() 47 | for i in albums: 48 | try: 49 | my_music.save_music(i['ALBUM_ID']) 50 | # print(i) 51 | except Exception as e: 52 | # 打印错误日志 53 | print(str(i) + ': ' + str(e)) 54 | time.sleep(5) 55 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/23_python爬取网易云音乐所有歌曲的评论数/sql.py: -------------------------------------------------------------------------------- 1 | """ 2 | 一般 Python 用于连接 MySQL 的工具:pymysql 3 | """ 4 | import pymysql.cursors 5 | 6 | connection = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='***', db='sunshine',charset="utf8") 7 | 8 | 9 | # 保存评论 10 | def insert_comments(music_id, comments, detail, connection): 11 | with connection.cursor() as cursor: 12 | sql = "INSERT INTO `comments` (`MUSIC_ID`, `COMMENTS`, `DETAILS`) VALUES (%s, %s, %s)" 13 | 
cursor.execute(sql, (music_id, comments, detail)) 14 | connection.commit() 15 | 16 | 17 | # 保存音乐 18 | def insert_music(music_id, music_name, album_id): 19 | with connection.cursor() as cursor: 20 | sql = "INSERT INTO `musics` (`MUSIC_ID`, `MUSIC_NAME`, `ALBUM_ID`) VALUES (%s, %s, %s)" 21 | cursor.execute(sql, (music_id, music_name, album_id)) 22 | connection.commit() 23 | 24 | 25 | # 保存专辑 26 | def insert_album(album_id, artist_id): 27 | with connection.cursor() as cursor: 28 | sql = "INSERT INTO `albums` (`ALBUM_ID`, `ARTIST_ID`) VALUES (%s, %s)" 29 | cursor.execute(sql, (album_id, artist_id)) 30 | connection.commit() 31 | 32 | 33 | # 保存歌手 34 | def insert_artist(artist_id, artist_name): 35 | with connection.cursor() as cursor: 36 | sql = "INSERT INTO `artists` (`ARTIST_ID`, `ARTIST_NAME`) VALUES (%s, %s)" 37 | cursor.execute(sql, (artist_id, artist_name)) 38 | connection.commit() 39 | 40 | 41 | # 获取所有歌手的 ID 42 | def get_all_artist(): 43 | with connection.cursor() as cursor: 44 | sql = "SELECT `ARTIST_ID` FROM `artists` ORDER BY ARTIST_ID" 45 | cursor.execute(sql, ()) 46 | return cursor.fetchall() 47 | 48 | 49 | # 获取所有专辑的 ID 50 | def get_all_album(): 51 | with connection.cursor() as cursor: 52 | sql = "SELECT `ALBUM_ID` FROM `albums` ORDER BY ALBUM_ID" 53 | cursor.execute(sql, ()) 54 | return cursor.fetchall() 55 | 56 | 57 | # 获取所有音乐的 ID 58 | def get_all_music(): 59 | with connection.cursor() as cursor: 60 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID" 61 | cursor.execute(sql, ()) 62 | return cursor.fetchall() 63 | 64 | 65 | # 获取前一半音乐的 ID 66 | def get_before_music(): 67 | with connection.cursor() as cursor: 68 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 0, 800000" 69 | cursor.execute(sql, ()) 70 | return cursor.fetchall() 71 | 72 | 73 | # 获取后一半音乐的 ID 74 | def get_after_music(): 75 | with connection.cursor() as cursor: 76 | sql = "SELECT `MUSIC_ID` FROM `musics` ORDER BY MUSIC_ID LIMIT 800000, 1197429" 77 | cursor.execute(sql, ()) 78 | return cursor.fetchall() 79 | 80 | 81 | def dis_connect(): 82 | connection.close() 83 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/ctrip_items.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FindtripItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | site = scrapy.Field() 16 | company = scrapy.Field() 17 | flight_time = scrapy.Field() 18 | airports = scrapy.Field() 19 | passtime = scrapy.Field() 20 | price = scrapy.Field() 21 | 22 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FindtripSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from findtrip.spiders.washctrip import wash 8 | import pymongo 9 | from scrapy.conf import settings 10 | from scrapy import log 11 | 12 | class FindtripPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | 17 | class MongoDBPipeline(object): 18 | def __init__(self): 19 | self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT']) 20 | self.db = self.client[settings['MONGO_DB']] 21 | self.post = self.db[settings['MONGO_COLL']] 22 | 23 | def process_item(self, item, spider): 24 | if item['site'] == 'Qua': 25 | if item['company']: 26 | item['company'] = wash(item['company']) 27 | if item['flight_time']: 28 | item['flight_time'] = wash(item['flight_time']) 29 | if item['airports']: 30 | item['airports'] = wash(item['airports']) 31 | if item['passtime']: 32 | item['passtime'] = wash(item['passtime']) 33 | if item['price']: 34 | item['price'] = wash(item['price']) 35 | for data in item: 36 | if not data: 37 | raise DropItem("Missing data!") 38 | self.collection.insert(dict(item)) 39 | log.msg("Question added to MongoDB database!", 40 | level=log.DEBUG, spider=spider) 41 | elif item['site'] == 'Ctrip': 42 | self.collection.insert(dict(item)) 43 | log.msg("Question added to MongoDB database!", 44 | level=log.DEBUG, spider=spider) 45 | 46 | return item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for findtrip project 4 | # 5 | # For simplicity, this file contains only settings 
considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'findtrip' 13 | 14 | SPIDER_MODULES = ['findtrip.spiders'] 15 | NEWSPIDER_MODULE = 'findtrip.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'findtrip (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36' 21 | 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # 配置mongoDB 27 | MONGO_HOST = "127.0.0.1" # 主机IP 28 | MONGO_PORT = 27017 # 端口号 29 | MONGO_DB = "FindTrip" # 库名 30 | MONGO_COLL = "qua_findtrip" # collection 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | #DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | #} 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'findtrip.middlewares.FindtripSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'findtrip.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | # 'findtrip.pipelines.FindtripPipeline': 300, 77 | 'findtrip.pipelines.MongoDBPipeline': 300, 78 | } 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | 
#HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/ctrip_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/qua_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/__pycache__/washctrip.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/ctrip_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class CtripSpider(scrapy.Spider): 5 | name = 'ctrip' 6 | start_urls = [ 7 | "http://flights.ctrip.com/booking/XMN-BJS-day-1.html?DDate1=2016-04-19" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | fligint_div = "//div[@id='J_flightlist2']/div" 13 | dataList = sel.xpath(fligint_div) 14 | 15 | for index,each in 
enumerate(dataList): 16 | flight_each = fligint_div+'['+str(index+1)+']' 17 | flight_tr = flight_each+"//tr[@class='J_header_row']" 18 | istrain = sel.xpath(flight_each + "//div[@class='train_flight_tit']") 19 | 20 | if istrain: 21 | print ("this data is train add") 22 | else: 23 | company = sel.xpath(flight_tr + "//div[@class='info-flight J_flight_no']//text()").extract() 24 | 25 | flight_time_from = sel.xpath(flight_tr + "//td[@class='right']/div[1]//text()").extract() 26 | flight_time_to = sel.xpath(flight_tr + "//td[@class='left']/div[1]//text()").extract() 27 | flight_time = [flight_time_from,flight_time_to] 28 | 29 | airports_from = sel.xpath(flight_tr + "//td[@class='right']/div[2]//text()").extract() 30 | airports_to = sel.xpath(flight_tr + "//td[@class='left']/div[2]//text()").extract() 31 | airports = [airports_from,airports_to] 32 | 33 | price_middle = sel.xpath(flight_tr + "[1]//td[@class='price middle ']/span//text()").extract() 34 | price = sel.xpath(flight_tr + "[1]//td[@class='price ']/span//text()").extract() 35 | if price_middle: 36 | price = price_middle 37 | elif price: 38 | price = price 39 | else: 40 | price = '' 41 | 42 | item = FindtripItem() 43 | item['site'] = 'Ctrip' 44 | item['company'] = company 45 | item['flight_time'] = flight_time 46 | item['airports'] = airports 47 | item['price'] = price 48 | yield item 49 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/qua_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from findtrip.items import FindtripItem 3 | 4 | class QuaSpider(scrapy.Spider): 5 | name = "qua" 6 | start_urls = [ 7 | "http://www.qua.com/flights/PEK-XMN/2016-05-12?m=CNY&from=flight_home" 8 | ] 9 | 10 | def parse(self, response): 11 | sel = scrapy.Selector(response) 12 | dataList = sel.xpath("//div[@class='m-fly-item s-oneway']") 13 | 14 | for index,each in enumerate(dataList): 15 | flight_each = "//div[@id='list-box']/div["+str(index+1)+"]" 16 | detail_span = "//div[@class='fl-detail-nav']/ul/li[1]/span[@class='nav-label']" 17 | f_route_div = "//div[@class='m-fl-info-bd']/div" 18 | 19 | airports = sel.xpath(flight_each + f_route_div + '/p[3]//text()').extract() 20 | company = sel.xpath(flight_each + f_route_div + '/p[1]//text()').extract() 21 | flight_time = sel.xpath(flight_each + f_route_div + '/p[2]//text()').extract() 22 | passtime = sel.xpath(flight_each + f_route_div + '/p[4]//text()').extract() 23 | price = sel.xpath(flight_each + "//div[@class='fl-price-box']//em//text()").extract() 24 | 25 | item = FindtripItem() 26 | item['site'] = 'Qua' 27 | item['company'] = company 28 | item['flight_time'] = flight_time 29 | item['airports'] = airports 30 | item['passtime'] = passtime 31 | item['price'] = price 32 | yield item 33 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/findtrip/spiders/washctrip.py: -------------------------------------------------------------------------------- 1 | def wash(dateList): 2 | dateList = map(lambda x : x.split(), dateList) 3 | cleanList = [] 4 | for each in dateList: 5 | if each: 6 | cleanList.append(each[0]) 7 | return cleanList 8 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.csv -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/qua_items.json -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/24.scrapy爬取国内两大机票网站(去哪儿+携程)并存入mongodb数据库/findtrip/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = findtrip.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = findtrip 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Field, Item 9 | 10 | 11 | class PythonjobsItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | #pass 15 | title = Field() 16 | city = Field() 17 | company = Field() 18 | location = Field() 19 | url = Field() 20 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class PythonjobsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class PythonjobsPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for pythonjobs project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'pythonjobs' 13 | 14 | SPIDER_MODULES = ['pythonjobs.spiders'] 15 | NEWSPIDER_MODULE = 'pythonjobs.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'pythonjobs (+http://www.yourdomain.com)' 20 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | #DOWNLOAD_DELAY = 3 32 | # The download delay setting will honor only one of: 33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | #CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | #TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | #DEFAULT_REQUEST_HEADERS = { 44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | # 'Accept-Language': 'en', 46 | #} 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | #SPIDER_MIDDLEWARES = { 51 | # 'pythonjobs.middlewares.PythonjobsSpiderMiddleware': 543, 52 | #} 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'pythonjobs.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See 
http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'pythonjobs.pipelines.PythonjobsPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | #HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/__pycache__/job_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/pythonjobs/spiders/job_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from pythonjobs.items import PythonjobsItem 4 | #from bs4 import BeautifulSoup 5 | 6 | class JobspiderSpider(scrapy.Spider): 7 | name = 'jobSpider' 8 | allowed_domains = ['search.51job.com','jobs.51job.com'] 9 | 10 | def start_requests(self): 11 | for i in range(1,20): # Set pages to crawl here. 
12 | url = "http://search.51job.com/list/000000,000000,0000,00,9,99,python,2,{0}.html".format(i) 13 | yield scrapy.Request(url) 14 | 15 | def parse(self, response): 16 | for sel in response.css("html body div.dw_wp div#resultList.dw_table div.el p.t1 span a"): 17 | url = sel.re('href="(.*?)"')[0] 18 | yield scrapy.Request(url,callback=self.parse_item) 19 | 20 | def parse_item(self, response): 21 | item = PythonjobsItem() 22 | item['title'] = response.xpath('//div[@class="cn"]/h1/@title').extract()[0] 23 | item['url'] = response.url 24 | item['city'] = response.xpath('//span[@class="lname"]/text()').extract()[0] 25 | item['company'] = response.xpath('//p[@class="cname"]/a/@title').extract()[0] 26 | item['location'] = response.xpath('//p[@class="fp"]/text()').extract()[1].rstrip() 27 | return item -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/25_scrapy爬取前程无忧网站python相关的工作信息/pythonjobs/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = pythonjobs.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = pythonjobs 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/ghostdriver.log -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shuimujob.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shuimujob 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- 
/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/platform.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShuimujobItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # pass 15 | title = scrapy.Field() 16 | href = scrapy.Field() 17 | author = scrapy.Field() 18 | time = scrapy.Field() 19 | content = scrapy.Field() 20 | is_dev = scrapy.Field() 21 | is_alg = scrapy.Field() 22 | is_fin = scrapy.Field() 23 | base_url_index = scrapy.Field() 24 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShuimujobSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | from scrapy.exceptions import DropItem 10 | from scrapy import log 11 | 12 | class ShuimujobPipeline(object): 13 | def process_item(self, item, spider): 14 | return item 15 | 16 | class MongoDBPipeline(object): 17 | 18 | def __init__(self): 19 | pass 20 | 21 | 22 | def open_spider(self, spider): 23 | self.client = pymongo.MongoClient( 24 | settings['MONGODB_SERVER'], 25 | settings['MONGODB_PORT'] 26 | ) 27 | self.db = self.client[settings['MONGODB_DB']] 28 | self.collection = self.db[settings['MONGODB_COLLECTION']] 29 | 30 | def close_spider(self, spider): 31 | self.client.close() 32 | 33 | def process_item(self, item, spider): 34 | valid = True 35 | for data in item: 36 | if not data : 37 | valid = False 38 | raise DropItem("Missing {0}!".format(data)) 39 | if item['title'] == '': 40 | valid = False 41 | raise DropItem("title is '' ") 42 | if item['content'] == '': 43 | valid = False 44 | raise DropItem("content is '' ") 45 | if valid: 46 | self.collection.insert(dict(item)) 47 | return item 48 | 49 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/platform.py: -------------------------------------------------------------------------------- 1 | import sys 2 | def getPlatform(): 3 | platform='' 4 | if sys.platform.startswith('win'): 5 | platform = 'win' 6 | elif sys.platform.startswith('linux'): 7 | platform = 'linux' 8 | return platform -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shuimujob project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shuimujob' 13 | 14 | SPIDER_MODULES = ['shuimujob.spiders'] 15 | NEWSPIDER_MODULE = 'shuimujob.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shuimujob (+http://www.yourdomain.com)' 20 | 21 | 22 | MONGODB_SERVER = "localhost" 23 | MONGODB_PORT = 27017 24 | MONGODB_DB = "shuimujob" 25 | MONGODB_COLLECTION = "job_info" 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = False 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = False 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | #DEFAULT_REQUEST_HEADERS = { 49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | # 'Accept-Language': 'en', 51 | #} 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'shuimujob.middlewares.ShuimujobSpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'shuimujob.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | # 'shuimujob.pipelines.ShuimujobPipeline': 300, 75 | 'shuimujob.pipelines.MongoDBPipeline':300 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
98 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/__pycache__/shuimu_spider.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/26_scrapy爬取水木社区和北大未名社区的实习信息并导入mongodb数据库/shuimujob/shuimujob/spiders/shuimu_spider.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import scrapy 3 | from shuimujob.items import ShuimujobItem 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | from selenium.webdriver.support import expected_conditions as EC 8 | from bs4 import BeautifulSoup 9 | from scrapy import signals 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from shuimujob.platform import getPlatform 12 | 13 | class SMSpider(scrapy.spiders.CrawlSpider): 14 | ''' 15 | #要建立一个 Spider,你可以为 scrapy.spider.BaseSpider 创建一个子类,并确定三个主要的、强制的属性: 16 | #name :爬虫的识别名,它必须是唯一的,在不同的爬虫中你必须定义不同的名字. 
17 | #start_urls :爬虫开始爬的一个 URL 列表。爬虫从这里开始抓取数据,所以,第一次下载的数据将会从这些 URLS 开始。其他子 URL 将会从这些起始 URL 中继承性生成。 18 | #parse() :爬虫的方法,调用时候传入从每一个 URL 传回的 Response 对象作为参数,response 将会是 parse 方法的唯一的一个参数, 19 | #这个方法负责解析返回的数据、匹配抓取的数据(解析为 item )并跟踪更多的 URL。 20 | ''' 21 | name="shuimujob" 22 | base_url = 'http://www.newsmth.net/nForum/board/Intern' 23 | start_urls = [base_url] 24 | start_urls.extend([base_url+'?p='+str(i) for i in range(2,4)]) 25 | # start_urls = ['http://www.newsmth.net/'] 26 | platform = getPlatform() 27 | 28 | def __init__(self): 29 | scrapy.spiders.Spider.__init__(self) 30 | if self.platform == 'linux': 31 | self.driver = webdriver.PhantomJS() 32 | elif self.platform == 'win': 33 | self.driver = webdriver.PhantomJS() 34 | self.driver.set_page_load_timeout(15) 35 | dispatcher.connect(self.spider_closed, signals.spider_closed) 36 | 37 | 38 | 39 | def spider_closed(self, spider): 40 | self.driver.quit() 41 | 42 | def parse(self,response): 43 | self.driver.get(response.url) 44 | 45 | element = WebDriverWait(self.driver,30).until(EC.presence_of_all_elements_located((By.TAG_NAME,'table'))) 46 | page_source = self.driver.page_source 47 | bs_obj = BeautifulSoup(page_source, "lxml") 48 | table = bs_obj.find('table',class_='board-list tiz') 49 | intern_messages = table.find_all('tr',class_=False) 50 | for message in intern_messages: 51 | title, href, time, author = '','','','' 52 | td_9 = message.find('td',class_='title_9') 53 | if td_9: 54 | title = td_9.a.get_text().encode('utf-8','ignore') 55 | href = td_9.a['href'] 56 | td_10 = message.find('td', class_='title_10') 57 | if td_10: 58 | time=td_10.get_text().encode('utf-8','ignore') 59 | td_12 = message.find('td', class_='title_12') 60 | if td_12: 61 | author = td_12.a.get_text().encode('utf-8','ignore') 62 | item = ShuimujobItem() 63 | item['title'] = title 64 | item['href'] = href 65 | item['time'] = time 66 | item['author'] = author 67 | item['base_url_index'] = 0 68 | root_url = 'http://www.newsmth.net' 69 | # content = scrapy.Request(root_url+href,self.parse_content) 70 | if href!='': 71 | content = self.parse_content(root_url+href) 72 | # print 'content:', content 73 | item['content'] = content 74 | yield item 75 | 76 | def parse_content(self,url): 77 | 78 | self.driver.get(url) 79 | element = WebDriverWait(self.driver, 30).until(EC.presence_of_all_elements_located((By.TAG_NAME, 'table'))) 80 | page_source = self.driver.page_source 81 | bs_obj = BeautifulSoup(page_source, "lxml") 82 | return bs_obj.find('td', class_='a-content').p.get_text().encode('utf-8','ignore') -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/house.csv: -------------------------------------------------------------------------------- 1 | house,house_area,house_room,total_price,unit_price 2 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 3 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 4 | 天坛新寓 ,75.16,3室1厅,243.0,32332 5 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 6 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 7 | house,house_area,house_room,total_price,unit_price 8 | 滨江奥城听雨苑 ,115.83,3室2厅,515.0,44462 9 | 虹苑新寓三村 ,63.8,2室2厅,196.0,30722 10 | 天坛新寓 ,75.16,3室1厅,243.0,32332 11 | 棉鞋营小区 ,69.74,3室1厅,220.0,31546 12 | 常府街10至16号 ,62.21,2室1厅,212.0,34079 13 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NjHouseItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | house=scrapy.Field() 15 | total_price=scrapy.Field() 16 | unit_price=scrapy.Field() 17 | house_room=scrapy.Field() 18 | house_area=scrapy.Field() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class NjHouseSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 
28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class NjHousePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for nj_house project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'nj_house' 13 | 14 | SPIDER_MODULES = ['nj_house.spiders'] 15 | NEWSPIDER_MODULE = 'nj_house.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'nj_house (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'nj_house.middlewares.NjHouseSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'nj_house.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'nj_house.pipelines.NjHousePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/__pycache__/lj_house.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/nj_house/spiders/lj_house.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import re 3 | import scrapy 4 | from nj_house.items import NjHouseItem 5 | 6 | class LjHouseSpider(scrapy.Spider): 7 | name = "lj_house" 8 | allowed_domains = ["nj.lianjia.com/ershoufang/"] 9 | start_urls = ['http://nj.lianjia.com/ershoufang//'] 10 | 11 | def parse(self, response): 12 | clears = response.css('.sellListContent li') 13 | item = NjHouseItem() 14 | for c in clears: 15 | house = c.css('.houseInfo a::text').extract_first() 16 | house_text = c.css('.houseInfo::text').extract_first() 17 | house_info_list = [e for e in re.split('\|', int(house_text)) if len(e) > 1] 18 | house_room = house_info_list[0].strip() 19 | house_area = ''.join(re.findall(r'[\d+\.]', house_info_list[1])) 20 | total_price = c.css('.totalPrice span::text').extract_first() 21 | unit_price = c.css('.unitPrice span::text').extract_first() 22 | unit_price = re.findall('\d+', unit_price)[0] 23 | 24 | item['house'] = house 25 | item['total_price'] = float(total_price) 26 | item['unit_price'] = int(unit_price) 27 | item['house_area'] = float(house_area) 28 | item['house_room'] = house_room 29 | yield item 30 | 31 | page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first() 32 | page_list = re.findall('\d+', page_info) 33 | next_page = 'pg' + str(int(page_list[1]) + 1) 34 | url = self.start_urls[0] + next_page 35 | if next_page: 36 | yield Request(url=url, callback=self.parse) 37 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/27_scrapy爬取南京20000多套二手房信息/nj_house/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = nj_house.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = nj_house 12 | 
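Two lines in the lj_house.py spider above will fail at runtime: re.split('\|', int(house_text)) casts the houseInfo text to int before splitting it (a TypeError on the first listing), and the pagination step yields a bare Request that is never imported. The following is a hypothetical corrected sketch, keeping the original selectors and item fields; it is not the project's own code.

import re
import scrapy
from nj_house.items import NjHouseItem

class LjHouseSpider(scrapy.Spider):
    name = 'lj_house'
    allowed_domains = ['nj.lianjia.com']                 # domain only; the original included a path here
    start_urls = ['http://nj.lianjia.com/ershoufang/']

    def parse(self, response):
        for c in response.css('.sellListContent li'):
            house_text = c.css('.houseInfo::text').extract_first() or ''
            # keep house_text as a string; the original wrapped it in int() before splitting
            parts = [e.strip() for e in re.split(r'\|', house_text) if len(e.strip()) > 1]
            if len(parts) < 2:
                continue
            item = NjHouseItem()
            item['house'] = c.css('.houseInfo a::text').extract_first()
            item['house_room'] = parts[0]
            item['house_area'] = float(''.join(re.findall(r'[\d.]', parts[1])) or 0)
            item['total_price'] = float(c.css('.totalPrice span::text').extract_first() or 0)
            unit_price_text = c.css('.unitPrice span::text').extract_first() or ''
            item['unit_price'] = int((re.findall(r'\d+', unit_price_text) or ['0'])[0])
            yield item

        page_info = response.css('div[class="page-box fr"]').css('div::attr(page-data)').extract_first()
        if page_info:
            page_list = re.findall(r'\d+', page_info)
            next_page = 'pg' + str(int(page_list[1]) + 1)
            # reference scrapy.Request explicitly; the original yielded an un-imported Request name
            yield scrapy.Request(url=self.start_urls[0] + next_page, callback=self.parse)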
-------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__init__.py -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class LianjiaItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | # 标签 小区 户型 面积 关注人数 观看人数 发布时间 价格 均价 详情链接 经纬度 城区 15 | title = scrapy.Field() 16 | community = scrapy.Field() 17 | model = scrapy.Field() 18 | area = scrapy.Field() 19 | focus_num = scrapy.Field() 20 | watch_num = scrapy.Field() 21 | time = scrapy.Field() 22 | price = scrapy.Field() 23 | average_price = scrapy.Field() 24 | link = scrapy.Field() 25 | Latitude = scrapy.Field() 26 | city = scrapy.Field() -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/middlewares.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LianjiaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | from scrapy.conf import settings 10 | from LianJia.items import LianjiaItem 11 | 12 | class LianjiaPipeline(object): 13 | def __init__(self): 14 | host = settings['MONGODB_HOST'] 15 | port = settings['MONGODB_PORT'] 16 | db_name = settings['MONGODB_DBNAME'] 17 | client = pymongo.MongoClient(host=host,port=port) 18 | tdb = client[db_name] 19 | self.post = tdb[settings['MONGODB_DOCNAME']] 20 | 21 | def process_item(self, item, spider): 22 | if isinstance(item,LianjiaItem): 23 | try: 24 | info = dict(item) 25 | if self.post.insert(info): 26 | print('bingo') 27 | except Exception: 28 | pass 29 | return item 30 | -------------------------------------------------------------------------------- /03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for LianJia project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'LianJia'
13 |
14 | SPIDER_MODULES = ['LianJia.spiders']
15 | NEWSPIDER_MODULE = 'LianJia.spiders'
16 |
17 |
18 | MONGODB_HOST = '127.0.0.1'
19 | MONGODB_PORT = 27017
20 | MONGODB_DBNAME = "lianjia"
21 | MONGODB_DOCNAME = "saveinfo_5"
22 |
23 | DOWNLOAD_DELAY = 10
24 |
25 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
26 | #USER_AGENT = 'LianJia (+http://www.yourdomain.com)'
27 |
28 | # Obey robots.txt rules
29 | ROBOTSTXT_OBEY = False
30 |
31 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
32 | #CONCURRENT_REQUESTS = 32
33 |
34 | # Configure a delay for requests for the same website (default: 0)
35 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
36 | # See also autothrottle settings and docs
37 | #DOWNLOAD_DELAY = 3
38 | # The download delay setting will honor only one of:
39 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
40 | #CONCURRENT_REQUESTS_PER_IP = 16
41 |
42 | # Disable cookies (enabled by default)
43 | #COOKIES_ENABLED = False
44 |
45 | # Disable Telnet Console (enabled by default)
46 | #TELNETCONSOLE_ENABLED = False
47 |
48 | # Override the default request headers:
49 | #DEFAULT_REQUEST_HEADERS = {
50 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
51 | #   'Accept-Language': 'en',
52 | #}
53 |
54 | # Enable or disable spider middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
56 | #SPIDER_MIDDLEWARES = {
57 | #    'LianJia.middlewares.LianjiaSpiderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable downloader middlewares
61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
62 | #DOWNLOADER_MIDDLEWARES = {
63 | #    'LianJia.middlewares.MyCustomDownloaderMiddleware': 543,
64 | #}
65 |
66 | # Enable or disable extensions
67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
68 | #EXTENSIONS = {
69 | #    'scrapy.extensions.telnet.TelnetConsole': None,
70 | #}
71 |
72 | # Configure item pipelines
73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
74 | ITEM_PIPELINES = {
75 |     'LianJia.pipelines.LianjiaPipeline': 300,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | #AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | #AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | #AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | #AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | #HTTPCACHE_ENABLED = True
94 | #HTTPCACHE_EXPIRATION_SECS = 0
95 | #HTTPCACHE_DIR = 'httpcache'
96 | #HTTPCACHE_IGNORE_HTTP_CODES = []
97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
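Note: the spider source for this Scrapy project is not included in this dump; only its compiled form (spiders/__pycache__/lianjia.cpython-36.pyc, listed below) is present. The following is therefore a minimal, hypothetical sketch of how a spider could populate the LianjiaItem fields from items.py and hand them to LianjiaPipeline under the settings above. The spider name, domain, start URL, and every CSS selector are placeholder assumptions for illustration, not the repository's actual code.

# -*- coding: utf-8 -*-
# Hypothetical spider sketch -- NOT the repository's original lianjia spider.
# URL patterns and CSS selectors are placeholders and must be verified against
# the live Lianjia pages before use.
import scrapy

from LianJia.items import LianjiaItem


class LianjiaSpider(scrapy.Spider):
    name = 'lianjia'                      # assumed spider name
    allowed_domains = ['bj.lianjia.com']  # assumed domain for Beijing listings
    start_urls = ['https://bj.lianjia.com/ershoufang/pg1/']  # placeholder URL

    def parse(self, response):
        # Assume each listing sits in its own card element (selector is a guess).
        for card in response.css('ul.sellListContent li.clear'):
            item = LianjiaItem()
            item['title'] = card.css('div.title a::text').extract_first()
            item['link'] = card.css('div.title a::attr(href)').extract_first()
            item['community'] = card.css('div.positionInfo a::text').extract_first()
            item['price'] = card.css('div.totalPrice span::text').extract_first()
            item['average_price'] = card.css('div.unitPrice span::text').extract_first()
            # The remaining fields defined in items.py (model, area, focus_num,
            # watch_num, time, Latitude, city) would be filled in the same way
            # once the real page structure is known.
            yield item

        # Follow a hypothetical "next page" link, if one exists.
        next_page = response.css('a.next::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

Because ITEM_PIPELINES enables LianJia.pipelines.LianjiaPipeline and the MONGODB_* values point at a local server, running "scrapy crawl lianjia" with such a spider would insert each yielded item into the lianjia.saveinfo_5 collection, with requests spaced out by DOWNLOAD_DELAY = 10 seconds.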
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/lianjia.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/LianJia/spiders/__pycache__/lianjia.cpython-36.pyc
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/lianjia.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiguang123/Python-Learning-Action-Code/45b27be36c73ddf5ca7e77de5c521ace8c5509c3/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/lianjia.csv
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫实战项目/28_scrapy爬取链家北京二手房数据/LianJia/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = LianJia.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = LianJia
12 |
--------------------------------------------------------------------------------
/03_Python网络爬虫/Python网络爬虫相关函数库介绍/.ipynb_checkpoints/01_requests学习笔记-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 2,
6 |    "metadata": {
7 |     "collapsed": true
8 |    },
9 |    "outputs": [],
10 |    "source": [
11 |     "# 引入Requests库\n",
12 |     "import requests"
13 |    ]
14 |   },
15 |   {
16 |    "cell_type": "code",
17 |    "execution_count": 3,
18 |    "metadata": {
19 |     "collapsed": true
20 |    },
21 |    "outputs": [],
22 |    "source": [
23 |     "# 发起GET请求\n",
24 |     "response = requests.get('https://www.baidu.com/')"
25 |    ]
26 |   },
27 |   {
28 |    "cell_type": "code",
29 |    "execution_count": 4,
30 |    "metadata": {},
31 |    "outputs": [
32 |     {
33 |      "name": "stdout",
34 |      "output_type": "stream",
35 |      "text": [
36 |       "