├── Python生成验证码文字变图片.py ├── Python的Web和数据分析学习图谱 ├── Django.png ├── flask.png └── 数据分析算法合集.png ├── README.md ├── lagouSpider.py ├── scrapy爬虫简单项目 ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── python实现有道词典.iml │ └── workspace.xml ├── 2.py ├── Qqnews │ ├── Qqnews │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── Qqnews_spider.py │ │ │ ├── __init__.py │ │ │ └── __pycache__ │ │ │ ├── Qqnews_spider.cpython-36.pyc │ │ │ └── __init__.cpython-36.pyc │ └── scrapy.cfg ├── Yustneirong │ ├── Yustneirong │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── __init__.pyc │ └── scrapy.cfg ├── dbtop250 │ ├── dbtop250 │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── dbtop250_spider.cpython-36.pyc │ │ │ └── dbtop250_spider.py │ └── scrapy.cfg ├── douban │ ├── douban │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── douban_spider.cpython-36.pyc │ │ │ └── douban_spider.py │ └── scrapy.cfg ├── ip地址查询工具.py ├── lagou-scrapy │ ├── .idea │ │ ├── dictionaries │ │ │ └── .xml │ │ ├── lagou.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── dump.rdb │ ├── geckodriver.log │ ├── lagou │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── items.cpython-36.pyc │ │ │ ├── pipelines.cpython-36.pyc │ │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── middlewares │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ └── useragent.cpython-36.pyc │ │ │ └── useragent.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── lagouspider.cpython-36.pyc │ │ │ └── lagouspider.py │ └── scrapy.cfg ├── python爬虫实现有道词典.py ├── taobaoclass │ ├── scrapy.cfg │ └── taobaoclass │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ └── settings.cpython-36.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ │ └── taobao_spider.py ├── tutorial │ ├── scrapy.cfg │ └── tutorial │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ ├── settings.pyc │ │ └── spiders │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── dmoz_spider.py │ │ ├── dmoz_spider.pyc │ │ ├── mydomain.py │ │ └── mydomain.pyc └── zaobao │ ├── scrapy.cfg │ └── zaobao │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ 
├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zaobao_spider.cpython-36.pyc │ └── zaobao_spider.py ├── zhihu.com ├── scrapy.cfg └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── zhihu ├── scrapy.cfg └── zhihuuser │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── items.cpython-36.pyc │ └── settings.cpython-36.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── zhihu.cpython-36.pyc │ └── zhihu.py ├── 可视化文件显示程序.zip ├── 基于python的turtle画出叮当猫.py ├── 基于python的turtle的桌面弹球.py ├── 基于python的turtle移动的小球.py ├── 抓取财富网股票信息.py ├── 爬取12306车票信息.py └── 爬取qq音乐歌曲 ├── audio2.txt └── 爬取扣扣音乐文件.py /Python生成验证码文字变图片.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pygame 3 | from pygame.locals import * 4 | 5 | 6 | 7 | pygame.init() 8 | text = u"1234" 9 | 10 | font = pygame.font.SysFont("Microsoft YaHei",64) 11 | ftext = font.render(text,True,(65,83,130),(255,255,255)) 12 | pygame.image.save(ftext,"D:/pythontab.jpg") -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/Django.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/Django.png -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/flask.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/flask.png -------------------------------------------------------------------------------- /Python的Web和数据分析学习图谱/数据分析算法合集.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/Python的Web和数据分析学习图谱/数据分析算法合集.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python 2 | python-爬虫-web-数据分析 3 | -------------------------------------------------------------------------------- /lagouSpider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | import urllib 5 | import requests 6 | from bs4 import BeautifulSoup 7 | import time 8 | import csv 9 | import codecs 10 | from selenium import webdriver 11 | 12 | 13 | headers = { 14 | 'Accept':'application/json, text/javascript, */*; q=0.01', 15 | 'Accept-Encoding':'gzip, deflate, br', 16 | 'Accept-Language':'zh-CN,zh;q=0.8', 17 | 'Connection':'keep-alive', 18 | 'Content-Length':'25', 19 | 'Content-Type':'application/x-www-form-urlencoded; 20 | 'Cookie':'填上cookie信息', 21 | 'Host':'www.lagou.com', 22 | 'Origin':'https://www.lagou.com', 23 | 'Referer':"h'ttps://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC", 24 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, 
like Gecko) Chrome/61.0.3163.100 Safari/537.36', 25 | 'X-Anit-Forge-Code':'0', 26 | 'X-Anit-Forge-Token':'None', 27 | 'X-Requested-With':'XMLHttpRequest' 28 | } 29 | 30 | #访问网页 获取所有的json数据 31 | def post(url,para,headers=None,proxy=None,timeOut=5,timeOutRetry=5): 32 | if not url or not para: 33 | print("PostError url or para not exit") 34 | print("11111111111111") 35 | return None 36 | try: 37 | if not headers: 38 | headers=headers 39 | response = requests.post(url,data=para,headers=headers) 40 | print(response.status_code) 41 | 42 | print(response.text) 43 | if response.status_code == 200 or response.status_code == 302: 44 | htmlCode = response.text 45 | # print('1111111111') 46 | else: 47 | print("2222222222222") 48 | htmlCode = None 49 | except Exception as e: 50 | if timeOutRetry > 0: 51 | htmlCode = post(url=url,para=para,timeOutRetry=(timeOutRetry-1)) 52 | print('3333333333333333333333333333') 53 | htmlCode = None 54 | return htmlCode 55 | 56 | # url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false&isSchoolJob=0' 57 | # url = 'https://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC' 58 | #对获取的json数据进行处理,获取自己需要的信息,获取每个职位数据页数, 59 | def getinfo(url,para): 60 | 61 | htmlCode = post(url,para=para,headers=headers) #获取到网页源码,一大堆的json数据 62 | if htmlCode == None: 63 | return False 64 | companies = json.loads(htmlCode).get('content').get('positionResult').get('result') 65 | totalCount = json.loads(htmlCode).get('content').get('positionResult').get('totalCount') 66 | pagesize = json.loads(htmlCode).get('content').get('pageSize') 67 | pages = 0 68 | if int(totalCount)%int(pagesize) == 0: 69 | pages = int(int(totalCount)/int(pagesize)) 70 | else: 71 | pages = int(int(totalCount) // int(pagesize)) + 1 72 | 73 | return pages,companies 74 | 75 | #写入文件中,不同的职位保存在不同的文件 76 | def writeCsv(filename,companies): 77 | info = {} 78 | csv_file = codecs.open(filename+'.csv', 'ab', 'utf-8', 'ignore') 79 | csv_writer = csv.writer(csv_file) 80 | for i in companies: 81 | info['公司名字'] = i['companyFullName'] #公司名字 82 | # print(info['公司名字']) 83 | info['公司城市'] = i['city'] #职位城市 84 | info['招聘职位'] = i['positionName'] #招聘职位 85 | info['发布时间'] = i['formatCreateTime'] #发布时间 86 | info['薪资待遇'] = i['salary'] #薪资待遇 87 | info['经验要求'] = i['workYear'] #经验要求 88 | info['公司大小'] = i['companySize'] #公司大小 89 | info['公司福利'] = i['positionAdvantage'] #公司福利 90 | info['公司地址'] = i['district'] #公司地址 91 | # print(info) 92 | csv_writer.writerow([i['companyFullName'],i['city'],i['positionName'],i['formatCreateTime'],i['salary'], 93 | i['workYear'],i['companySize'],i['positionAdvantage'],i['district']]) 94 | 95 | 96 | 97 | #获取所有的职位信息 98 | def occupation(): 99 | url = "https://www.lagou.com/" 100 | response = requests.get(url) 101 | soup = BeautifulSoup(response.text, 'html.parser') 102 | ds = soup.find_all("div", attrs=["_class", "menu_sub dn"]) 103 | 104 | occupation_list = [] 105 | for h in ds: 106 | for g in h.find_all('dd'): 107 | for l in g: 108 | if l.string != "\n": 109 | occupation_list.append(l.string) 110 | 111 | # print(occupation_list) 112 | # print(len(occupation_list)) 113 | return occupation_list 114 | 115 | #获取热门城市这些职位的信息 116 | if __name__ == '__main__': 117 | occu_list = occupation() 118 | city_list = ['北京','上海','深圳','广州','杭州','成都','南京','武汉','西安','厦门','长沙','苏州','天津'] 119 | for l in occu_list[:]: 120 | print(l) 121 | for j in city_list: 122 | url = 'https://www.lagou.com/jobs/positionAjax.json?' 
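# Note on the block below (editorial comment, not part of the original file): positionAjax.json
# is queried with POST form data in which 'kd' carries the job keyword, 'city' the city name and
# 'pn' the page number. getinfo() returns the total page count plus the first page of results;
# the inner loop then re-posts with an incremented 'pn', sleeping a random 0-5 s between requests,
# and appends every page to a per-keyword CSV via writeCsv().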
123 | para = {'px': 'default','city':j,'needAddtionalResult': 'false', 'isSchoolJob': 0, 'first': 'true', 'pn': '1', 124 | 'kd':l} 125 | pages,companies = getinfo(url,para) 126 | for i in range(pages): 127 | para['pn'] = str(i+1) 128 | time.sleep(random.random()*5) 129 | print('开始爬取第%s页'%str(i+1)) 130 | try: 131 | pages,companies = getinfo(url,para) 132 | except: 133 | continue 134 | # fina = writeCsv(companies) 135 | if companies == None: 136 | break 137 | writeCsv(l,companies) 138 | # csv_writer.writerow(fina) 139 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/python实现有道词典.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 38 | 39 | 40 | 67 | 68 | 69 | 70 | 71 | true 72 | DEFINITION_ORDER 73 | 74 | 75 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | Python 89 | 90 | 91 | 92 | 93 | PyUnresolvedReferencesInspection 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 156 | 157 | 174 | 175 | 192 | 193 | 210 | 211 | 228 | 229 | 248 | 249 | 250 | 251 | 252 | 265 | 266 | 279 | 280 | 297 | 298 | 310 | 311 | project 312 | 313 | 314 | 315 | 316 | 317 | 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 346 | 347 | 366 | 367 | 388 | 389 | 411 | 412 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 1508592887941 463 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 503 | 506 | 507 | 508 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 | 525 | 526 | 527 | 528 | 529 | 530 | 531 | 532 | 533 | 534 | 535 | 536 | 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 557 | 558 | 559 | 560 | 561 | 562 | 563 | 564 | 565 | 566 | 567 | 568 | 569 | 570 | 571 | 572 | 573 | 574 | 575 | 576 | 577 | 578 | 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | 602 | 603 | 604 | 605 | 606 | 607 | 608 | 609 | 610 | 611 | 612 | 613 | 614 | 615 | 616 | 617 | 618 | 619 | 620 | 621 | 622 | 623 | 624 | 625 | 626 | 627 | 628 | 629 | 630 | 631 | 632 | 633 | 634 | 635 | 636 | 637 | 638 | 639 | 640 | 641 | 642 | 643 | 644 | 645 | 646 | 647 | 648 | 
649 | 650 | 651 | 652 | 653 | 654 | 655 | 656 | 657 | 658 | 659 | 660 | 661 | 662 | 663 | 664 | 665 | 666 | 667 | 668 | 669 | 670 | 671 | 672 | 673 | 674 | 675 | 676 | 677 | 678 | 679 | 680 | 681 | 682 | 683 | 684 | 685 | 686 | 687 | 688 | 689 | 690 | 691 | 692 | 693 | 694 | 695 | 696 | 697 | 698 | 699 | 700 | 701 | 702 | 703 | 704 | 705 | 706 | 707 | 708 | 709 | 710 | 711 | 712 | 713 | 714 | 715 | 716 | 717 | 718 | 719 | 720 | 721 | 722 | 723 | 724 | 725 | 726 | 727 | 728 | 729 | 730 | 731 | 732 | 733 | 734 | 735 | 736 | 737 | 738 | 739 | 740 | 741 | 742 | 743 | 744 | 745 | 746 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/2.py: -------------------------------------------------------------------------------- 1 | str1 = "/**/jQuery110205057557444126394_1484574357057(" 2 | print(len(str1)) 3 | 4 | "https://ssl.captcha.qq.com/cap_union_new_getcapbysig?aid=522005705&asig=&captype=&protocol=https&clientype=2&disturblevel=&apptype=2&curenv=inner&ua=TW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV09XNjQpIEFwcGxlV2ViS2l0LzUzNy4zNiAoS0hUTUwsIGxpa2UgR2Vja28pIENocm9tZS82MS4wLjMxNjMuMTAwIFNhZmFyaS81MzcuMzY=&sess=1iW5KCYL2DiqaiJy8K76Am6iwXvqJkGKpwLsItwLEpjoWwy0G0R3y_t1YKNzrr-Ts5j2Knkgh2qfBoWdWYmHDY_tiQXBpB2vT7ttfysXWlz-JltnuOA33JN14umsk_q0oYq3ITlJNR02RDPd_JRNP0iQeNZe8JMMv3x8BD_Sqi-38jNGuIVSD-EZkLDrjztCENIt15GWQCs*&theme=&noBorder=noborder&fb=1&showtype=embed&uid=123456&cap_cd=Kz3KLjvqeqsYRc0aLobTgXc2UjrnVE-vhPOEpygni5x_9E6HTuxT9Q**&lang=2052&rnd=150167&rand=0.482505701756349&vsig=gvzE39T_XEWYaq6gx4EBY250WYUlIgIL_2ypn6K_iE-O-d3Iwb_2XEr5XegIpAikh4qANjt3pf3yvnESAd95nV1qemP9M1hu9800zE1wEvXls0T5ulqE4Die4uYOfQM_J&ischartype=1" -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/__pycache__/settings.cpython-36.pyc 
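A side note on lagouSpider.py earlier in this listing: its getinfo() derives the number of result pages from totalCount and pageSize by hand and unpacks the payload with repeated json.loads calls. A compact sketch of the same request and page-count calculation, with a hypothetical helper name and the cookie-filled headers dict from the script assumed to be supplied by the caller, could look like this:

import math
import requests

def fetch_positions(keyword, city, page, headers):
    """POST one page of the Lagou position listing and unpack the JSON payload."""
    url = 'https://www.lagou.com/jobs/positionAjax.json?'
    para = {'px': 'default', 'city': city, 'needAddtionalResult': 'false',
            'isSchoolJob': 0, 'first': 'true', 'pn': str(page), 'kd': keyword}
    response = requests.post(url, data=para, headers=headers, timeout=5)
    response.raise_for_status()
    content = response.json()['content']
    companies = content['positionResult']['result']
    pages = math.ceil(content['positionResult']['totalCount'] / content['pageSize'])
    return pages, companies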
-------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class QqnewsItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | title = scrapy.Field() 15 | date = scrapy.Field() 16 | author = scrapy.Field() 17 | content = scrapy.Field() 18 | 19 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class QqnewsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | from scrapy.conf import settings 9 | class QqnewsPipeline(object): 10 | 11 | def __init__(self): 12 | MONGODN_HOST = settings['MONGODB_HOST'] 13 | MONGODB_PORT = settings['MONGODB_PORT'] 14 | dbName = settings['MONGODB_DBNAME'] 15 | MONGODB_CNAME = settings['MONGODB_CNAME'] 16 | client = pymongo.MongoClient(host=MONGODN_HOST,port=MONGODB_PORT) 17 | tdb = client[dbName] 18 | self.post = tdb[MONGODB_CNAME] 19 | def process_item(self, item, spider): 20 | news = dict(item) 21 | self.post.insert(news) 22 | return item 23 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Qqnews project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Qqnews' 13 | 14 | SPIDER_MODULES = ['Qqnews.spiders'] 15 | NEWSPIDER_MODULE = 'Qqnews.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | 21 | 22 | MONGODB_HOST = '127.0.0.1' 23 | MONGODB_PORT = 27017 24 | MONGODB_DBNAME = 'QQnews' 25 | MONGODB_CNAME = 'military' 26 | 27 | # Obey robots.txt rules 28 | ROBOTSTXT_OBEY = True 29 | 30 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 31 | #CONCURRENT_REQUESTS = 32 32 | 33 | # Configure a delay for requests for the same website (default: 0) 34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 35 | # See also autothrottle settings and docs 36 | #DOWNLOAD_DELAY = 3 37 | # The download delay setting will honor only one of: 38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 39 | #CONCURRENT_REQUESTS_PER_IP = 16 40 | 41 | # Disable cookies (enabled by default) 42 | COOKIES_ENABLED = True 43 | 44 | # Disable Telnet Console (enabled by default) 45 | #TELNETCONSOLE_ENABLED = False 46 | 47 | # Override the default request headers: 48 | DEFAULT_REQUEST_HEADERS = { 49 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 50 | 'Accept-Language': 'en', 51 | 'RK=7SNngcUONh':'pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; uin=o0252943669; skey=@zCZ8lcmdT; 
qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; pgv_info=ssid=s4702440319; pgv_pvid=4169365884; o_cookie=252943669; pac_uid=1_252943669' 52 | } 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'Qqnews.middlewares.QqnewsSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'Qqnews.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | ITEM_PIPELINES = { 75 | 'Qqnews.pipelines.QqnewsPipeline': 300, 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/Qqnews_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from Qqnews.items import QqnewsItem 5 | 6 | 7 | class QqnewsSpiderSpider(scrapy.Spider): 8 | name = "Qqnews_spider" 9 | allowed_domains = ["qq.com"] 10 | start_urls = ['http://mil.qq.com/mil_index.htm'] 11 | 12 | def parse(self, response): 13 | for eveUrl in response.xpath('//a[@class="linkto"]/@href'): 14 | yield scrapy.Request(eveUrl.extract(),callback=self.parse_content) 15 | 16 | 17 | def parse_content(self,response): 18 | item = QqnewsItem() 19 | title = response.xpath('//div[@class="hd"]/h1/text()').extract() 20 | date1 = response.xpath('//span[@class="a_time"]/text()').extract() 21 | date2 = response.xpath('//div[@class="md"]/text()').extract() 22 | date3 = response.xpath('//div[@class="time"]/text()').extract() 23 | date = str(date1)+str(date2)+str(date3) 24 | author = response.xpath('//div[@class="content-article"]/p[1]/text()').extract() 25 | content = response.xpath('//div[@class="content-article"]/text()').extract() 26 | print(title,date,author,content) 27 | yield item 28 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy 
project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/Qqnews_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Qqnews/Qqnews/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Qqnews/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Qqnews.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Qqnews 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class YustneirongItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class YustneirongSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class YustneirongPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Yustneirong project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Yustneirong' 13 | 14 | SPIDER_MODULES = ['Yustneirong.spiders'] 15 | NEWSPIDER_MODULE = 'Yustneirong.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Yustneirong (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Yustneirong.middlewares.YustneirongSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Yustneirong.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'Yustneirong.pipelines.YustneirongPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/settings.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/Yustneirong/Yustneirong/spiders/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/Yustneirong/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Yustneirong.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Yustneirong 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Dbtop250Item(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | zuto = scrapy.Field() 16 | desc = scrapy.Field() 17 | ping = scrapy.Field() 18 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class Dbtop250SpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | from scrapy.conf import settings 10 | 11 | 12 | class Dbtop250Pipeline(object): 13 | 14 | def __init__(self): 15 | host = settings['MONGODB_HOST'] 16 | port = settings['MONGODB_PORT'] 17 | dbName = settings['MONGODB_DBNAME'] 18 | client = pymongo.MongoClient(host=host,port=port) 19 | tdb = client[dbName] 20 | self.post = tdb[settings['MONGODB_DOCNAME']] 21 | def process_item(self, item, spider): 22 | movie = dict(item) 23 | self.post.insert(movie) 24 | return item 25 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dbtop250 project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dbtop250' 13 | 14 | SPIDER_MODULES = ['dbtop250.spiders'] 15 | NEWSPIDER_MODULE = 'dbtop250.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 20 | 21 | MONGODB_HOST = 'localhost' 22 | MONGODB_PORT = 27017 23 | MONGODB_DBNAME = 'dbtop250' 24 | MONGODB_DOCNAME = 'top250' 25 | 26 | # Obey robots.txt rules 27 | ROBOTSTXT_OBEY = True 28 | 29 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 30 | #CONCURRENT_REQUESTS = 32 31 | 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | #DOWNLOAD_DELAY = 3 36 | # The download delay setting will honor only one of: 37 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = True 42 | 43 | # Disable Telnet Console (enabled by default) 44 | #TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | DEFAULT_REQUEST_HEADERS = { 48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | 'Accept-Language': 'en', 50 | 'Cookie':'RK=7SNngcUONh; pgv_pvi=5307348992; pgv_si=s9438232576; ptui_loginuin=123456; logout_page=; dm_login_weixin_rem=; qm_authimgs_id=0; qm_verifyimagesession=h0136aa52fede833d6b76fb7cfff4ac53f406c36fd8b73221c205f4c89e3aed6341ff952f57983a3247; ptisp=cnc; ptcz=369283ac413194e379e190071988967a9e0f56822f9e55c5fb3f7d01cf7f0fd9; pt2gguin=o0252943669; qqmusic_uin=; qqmusic_key=; qqmusic_fromtag=; 
pac_uid=1_252943669; dsp_cookiemapping0=1508662302062; dsp_cookiemapping2=1508662302064; ad_play_index=66; thyls_ad=440; dsp_cookiemapping1=1508662308938; pgv_info=ssid=s4702440319; ts_last=mil.qq.com/mil_index.htm; pgv_pvid=4169365884; o_cookie=252943669; ts_uid=2412234112' 51 | } 52 | 53 | # Enable or disable spider middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 55 | #SPIDER_MIDDLEWARES = { 56 | # 'dbtop250.middlewares.Dbtop250SpiderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable downloader middlewares 60 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 61 | #DOWNLOADER_MIDDLEWARES = { 62 | # 'dbtop250.middlewares.MyCustomDownloaderMiddleware': 543, 63 | #} 64 | 65 | # Enable or disable extensions 66 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 67 | #EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | #} 70 | 71 | # Configure item pipelines 72 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 73 | ITEM_PIPELINES = { 74 | 'dbtop250.pipelines.Dbtop250Pipeline': 300, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
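One observation on the dbtop250 project above: Dbtop250Pipeline pulls its MongoDB configuration from scrapy.conf.settings, an import that newer Scrapy releases dropped in favour of the crawler settings. A hedged variant of the same pipeline (not the repository's code) that reads the MONGODB_* keys defined in settings.py through from_crawler:

import pymongo

class Dbtop250MongoPipeline(object):
    """Stores each scraped item in MongoDB, mirroring the pipeline above."""

    def __init__(self, host, port, db_name, doc_name):
        client = pymongo.MongoClient(host=host, port=port)
        self.post = client[db_name][doc_name]

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s['MONGODB_HOST'], s['MONGODB_PORT'],
                   s['MONGODB_DBNAME'], s['MONGODB_DOCNAME'])

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))  # insert_one replaces the deprecated insert()
        return item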
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/dbtop250/dbtop250/spiders/__pycache__/dbtop250_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/dbtop250/spiders/dbtop250_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class Dbtop250SpiderSpider(scrapy.Spider): 6 | name = "dbtop250_spider" 7 | allowed_domains = ["douban.com"] 8 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 9 | conunt = 0 10 | def parse(self, response): 11 | self.conunt +=1 12 | for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'): 13 | full_url = eve.extract() 14 | yield scrapy.Request(full_url,callback=self.parse_movie) 15 | 16 | if self.conunt * 25 < 250: 17 | full_url = 'https://movie.douban.com/top250?start={}&filter='.format(str(self.conunt*25)) 18 | yield scrapy.Request(full_url,callback=self.parse) 19 | def parse_movie(self,response): 20 | from dbtop250.items import Dbtop250Item 21 | item = Dbtop250Item() 22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]/text()').extract() 23 | # item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract() 24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract() 25 | yield item 26 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/dbtop250/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dbtop250.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dbtop250 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | # auto = scrapy.Field() 16 | desc = scrapy.Field() 17 | ping = scrapy.Field() 18 | 19 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban' 13 | 14 | SPIDER_MODULES = ['douban.spiders'] 15 | NEWSPIDER_MODULE = 'douban.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'douban (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'Cookie':'bid=iJIjKbBsQZ4; gr_user_id=b3a58668-aa55-4aa3-a212-1d9ed21843e8; viewed="27116300_25862578"; ps=y; ll="108288"; push_noty_num=0; push_doumail_num=0; ap=1; _ga=GA1.2.462587836.1508291602; __yadk_uid=byGuKstnDBAymxz38q9BxYZnm6ibZZbe; _vwo_uuid_v2=F727776224927130F161043B6E8DCD6F|0d3a3a996a0fdc93c651e031901946bb; __utma=30149280.462587836.1508291602.1508501064.1508655486.6; __utmb=30149280.0.10.1508655486; __utmc=30149280; __utmz=30149280.1508501064.5.5.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.16325; __utma=223695111.462587836.1508291602.1508655486.1508655486.1; __utmb=223695111.0.10.1508655486; __utmc=223695111; __utmz=223695111.1508655486.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_id.100001.4cf6=288cdd8ec5b2cdaf.1508655486.1.1508656155.1508655486.; _pk_ses.100001.4cf6=*', 46 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See 
http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'douban.middlewares.DoubanSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'douban.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | #ITEM_PIPELINES = { 70 | # 'douban.pipelines.DoubanPipeline': 300, 71 | #} 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
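#
# --- Added usage note (illustrative, not part of the original project) -------
# The spider defined in this package is run with the standard Scrapy CLI from
# the project root (the directory containing scrapy.cfg); the output file name
# below is only an example:
#
#     scrapy crawl douban_spider -o top250.json
#
# New spiders are typically scaffolded the same way, e.g.:
#
#     scrapy genspider another_spider douban.com
# ------------------------------------------------------------------------------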
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/douban/douban/spiders/__pycache__/douban_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/douban/spiders/douban_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from douban.items import DoubanItem 4 | 5 | class DoubanSpiderSpider(scrapy.Spider): 6 | name = "douban_spider" 7 | allowed_domains = ["douban.com"] 8 | print("1111111111111111111111") 9 | start_urls = ['https://movie.douban.com/top250?start=0&filter='] 10 | 11 | def parse(self, response): 12 | 13 | for eve in response.xpath('//div[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href'): 14 | full_url = eve.extract() 15 | 16 | print(full_url) 17 | yield scrapy.Request(full_url,callback=self.parse_movie) 18 | 19 | def parse_movie(self,response): 20 | 21 | item = DoubanItem() 22 | item['name'] = response.xpath('//*[@id="content"]/h1/span[1]//text()').extract() 23 | item['auto'] = response.xpath('//*[@id="info"]/span/span[2]/a/text()').extract() 24 | item['ping'] = response.xpath('/html/body/div[3]/div[1]/div[3]/div[1]/div[1]/div[1]/div[2]/div[1]/div[2]/strong/text()').extract() 25 | print(item) 26 | yield item 27 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/douban/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/ip地址查询工具.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import json 3 | import requests 4 | 5 | 6 | url = 'https://sp0.baidu.com/8aQDcjqpAAV3otqbppnN2DJv/api.php?query=10.0.144.241&co=&resource_id=6006&t=1484574592369&ie=utf8&oe=gbk&cb=op_aladdin_callback&format=json&tn=baidu&cb=jQuery110205057557444126394_1484574357057&_=1484574357071' 7 | 8 | headers = { 9 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 10 | 'Accept-Encoding':'gzip, deflate, br', 11 | 'Accept-Language':'zh-CN,zh;q=0.8', 12 | 'Connection':'keep-alive', 13 | 'Cookie':'BAIDUID=4812092AE366ED4A55C6D8EA6713A635:FG=1; PSTM=1508161904; BIDUPSID=18C54752D18DC057B004465161A28981; 
BDUSS=9XM0M3bnJBYUpRZVBFRDRRWXdpVXdIa0d2WDRJUlVFaVlJcFVSMnVFOE5MUkJhSVFBQUFBJCQAAAAAAAAAAAEAAAC6uzCj0KHKqNfTczAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA2g6FkNoOhZQ2; MCITY=-%3A; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598', 14 | 'Host':'sp0.baidu.com', 15 | 'Upgrade-Insecure-Requests':'1', 16 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 17 | } 18 | ip = input("请输入你的IP地址:") 19 | params = { 20 | 'query':ip, 21 | 'co':'', 22 | 'resource_id':'6006', 23 | 't':'1484574592369', 24 | 'ie':'utf8', 25 | 'oe':'gbk', 26 | 'cb':'op_aladdin_callback', 27 | 'format':'json', 28 | 'tn':'baidu', 29 | 'cb':'jQuery110205057557444126394_1484574357057', 30 | '_':'1484574357071' 31 | } 32 | 33 | response = requests.get(url,params=params).text 34 | # print(response) 35 | response = json.loads(response[46:][:-2]) 36 | # print(response[46:][:-2]) 37 | print("location:"+response.get('data')[0].get("location")) 38 | print("titlecont:"+response.get('data')[0].get("titlecont")) 39 | print("origip:"+response.get('data')[0].get("origip")) 40 | print("origipquery:"+response.get('data')[0].get("origipquery")) 41 | print("showlamp:"+response.get('data')[0].get("showlamp")) 42 | print("showLikeShare:"+str(response.get('data')[0].get("showLikeShare"))) 43 | print("shareImage:"+str(response.get('data')[0].get("shareImage"))) 44 | print("ExtendedLocation:"+response.get('data')[0].get("ExtendedLocation")) 45 | print("QriginQuery:"+str(response.get('data')[0].get("QriginQuery"))) 46 | print("tplt:"+response.get('data')[0].get("tplt")) 47 | print("resourceid:"+str(response.get('data')[0].get("resourceid"))) 48 | print("fetchkey:"+response.get('data')[0].get("fetchkey")) 49 | print("appinfo:"+response.get('data')[0].get("appinfo")) 50 | print("role_id:"+str(response.get('data')[0].get("role_id"))) 51 | print("disp_type:"+str(response.get('data')[0].get("disp_type"))) 52 | 53 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/dictionaries/.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/lagou.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | AngularJS 14 | 15 | 16 | 17 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/dump.rdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/dump.rdb -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/geckodriver.log: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/geckodriver.log -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Field 10 | 11 | class LagouItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | companyFullName = Field() # 公司名字 15 | # print(info['公司名字']) 16 | city = Field() # 职位城市 17 | positionName = Field() # 招聘职位 18 | formatCreateTime = Field() # 发布时间 19 | salary = Field() # 薪资待遇 20 | workYear = Field() # 经验要求 21 | Jobdescriptions = Field() #职位描述 22 | companySize = Field() # 公司大小 23 | positionAdvantage = Field() # 公司福利 24 | district = Field() #公司地址 25 | companyhref = Field() #公司链接 -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class LagouSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__init__.py: -------------------------------------------------------------------------------- 1 | from .useragent import UserAgentMiddleware -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/__pycache__/useragent.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/middlewares/useragent.py: -------------------------------------------------------------------------------- 1 | import faker 2 | 3 | 4 | class UserAgentMiddleware(object): 5 | def __init__(self,settings): 6 | self.faker = faker.Faker() 7 | 8 | @classmethod 9 | def from_crawler(cls,crawler): 10 | return cls(crawler.settings) 11 | 12 | def process_request(self,request,spider): 13 | request.headers['User-Agent'] = self.faker.user_agent() 14 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to 
the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pymongo 8 | 9 | #存储到mongodb数据库 10 | class LagouPipeline(object): 11 | 12 | def __init__(self, mongo_uri, mongo_db): 13 | self.mongo_uri = mongo_uri 14 | self.mongo_db = mongo_db 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | return cls( 19 | mongo_uri = crawler.settings.get('MONGO_URI'), 20 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 21 | ) 22 | 23 | def open_spider(self,spider): 24 | self.client = pymongo.MongoClient() 25 | self.db = self.client['lagouzhiwei'] 26 | 27 | def close_spider(self,spider): 28 | self.client.close() 29 | 30 | def process_item(self, item, spider): 31 | self.db['zhiweitest2'].insert(dict(item)) 32 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for lagou project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'lagou' 13 | 14 | SPIDER_MODULES = ['lagou.spiders'] 15 | NEWSPIDER_MODULE = 'lagou.spiders' 16 | LOG_LEVEL= 'INFO' 17 | 18 | 19 | #数据库,我已经写死了,所以这里就不写值了,如果需要在这里调用的可以填写这两个值 20 | # MONGO_URI = '' 21 | # MONGO_DATABASE = '' 22 | 23 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 24 | # USER_AGENT = 'lagou (+http://www.yourdomain.com)' 25 | 26 | # Obey robots.txt rules 27 | ROBOTSTXT_OBEY = False 28 | 29 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 30 | CONCURRENT_REQUESTS = 32 31 | 32 | # Configure a delay for requests for the same website (default: 0) 33 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 34 | # See also autothrottle settings and docs 35 | DOWNLOAD_DELAY = 0 36 | # The download delay setting will honor only one of: 37 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 38 | #CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | COOKIES_ENABLED = False 42 | # COOKIE = " user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _gat=1; index_location_city=%E5%8C%97%E4%BA%AC; login=false; unick=""; _putrc=""; JSESSIONID=ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C; TG-TRACK-CODE=index_navigation; _gid=GA1.2.1376878689.1512383958; _ga=GA1.2.358203920.1509241265; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828514,1511828645,1512096311,1512383961; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512391953; LGSID=20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce; LGRID=20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce; SEARCH_ID=b1c5303a69754a66bc97d63dc0fec865" 43 | # Cookie = {'user_trace_token':'20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243',' LGUID':'20171016205145-c44d7b22-b270-11e7-991d-525400f775ce', 'showExpriedIndex':'1', 44 | # 'showExpriedCompanyHome':'1', 'showExpriedMyPublish':'1', 'hasDeliver':'2', 
'_gat':'1','index_location_city':'%E5%8C%97%E4%BA%AC','login':'false', 45 | # 'unick':"", '_putrc':"", 'JSESSIONID':'ABAAABAAADEAAFI9B5F7A7C0171C81B55991A6F507BB38C' ,'TG-TRACK-CODE':'index_navigation','_gid':'GA1.2.1376878689.1512383958','_ga':'GA1.2.358203920.1509241265','Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1511828514,1511828645,1512096311,1512383961','Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6':'1512391953','LGSID':'20171204201356-99b53ec2-d8ec-11e7-82ca-525400f775ce', 46 | # 'LGRID':'20171204205308-13e42a93-d8f2-11e7-8301-525400f775ce','SEARCH_ID':'b1c5303a69754a66bc97d63dc0fec865'} 47 | # Disable Telnet Console (enabled by default) 48 | TELNETCONSOLE_ENABLED = True 49 | 50 | # Override the default request headers: 51 | # DEFAULT_REQUEST_HEADERS = { 52 | # 'Accept':'application/json, text/javascript, */*; q=0.01', 53 | # 'Accept-Encoding':'gzip, deflate, br', 54 | # 'Accept-Language':'zh-CN,zh;q=0.8', 55 | # 'Connection':'keep-alive', 56 | # 'Content-Length':'25', 57 | # 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 58 | # 'Cookie':'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC', 59 | # 'Host':'www.lagou.com', 60 | # 'Origin':'https://www.lagou.com', 61 | # 'Cookie':'Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512613229,1512613260,1512625404,1512968182; index_location_city=%E5%8C%97%E4%BA%AC; _ga=GA1.2.2037062440.1512613233; user_trace_token=201712011102032-33c95bdc-daf5-11e7-8800-525400f775ce; LGUID=20171207102032-33c95ef6-daf5-11e7-8800-525400f775ce; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=4; JSESSIONID=ABAAABAAADEAAFI7D85FFAA76F7A088717F2BAF4B49DB5A; SEARCH_ID=e00f27cb11504a72a10b8ec58bd5f04f; _gat=1; LGSID=20171211125618-9f9c7c23-de2f-11e7-8e96-525400f775ce; PRE_UTM=; PRE_HOST=; PRE_SITE=; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_COCOS2D-X%3Fpx%3Ddefault%26city%3D%25E5%2585%25A8%25E5%259B%25BDstart.firefoxchina.cn; LGRID=20171211125650-b2cc7009-de2f-11e7-8e96-525400f775ce; _putrc=54D6D44AC87A2A52; _gid=GA1.2.834272328.1512968180; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1512968210', 62 | # 'Referer':"https://www.lagou.com", 63 | # 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 64 | # 'X-Anit-Forge-Code':'0', 65 | # 'X-Anit-Forge-Token':'None', 66 | # 'X-Requested-With':'XMLHttpRequest' 67 | # } 68 | 69 | 70 | # Enable or disable spider middlewares 71 | # See 
http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 72 | #SPIDER_MIDDLEWARES = { 73 | # 'lagou.middlewares.LagouSpiderMiddleware': 543, 74 | #} 75 | 76 | # Enable or disable downloader middlewares 77 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 78 | DOWNLOADER_MIDDLEWARES = { 79 | # 'lagou.middlewares.MyCustomDownloaderMiddleware': 543, 80 | 'lagou.middlewares.UserAgentMiddleware':500, 81 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware':None, 82 | 83 | } 84 | 85 | # Enable or disable extensions 86 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 87 | #EXTENSIONS = { 88 | # 'scrapy.extensions.telnet.TelnetConsole': None, 89 | #} 90 | 91 | # Configure item pipelines 92 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 93 | ITEM_PIPELINES = { 94 | 'lagou.pipelines.LagouPipeline': 300, 95 | } 96 | 97 | # Enable and configure the AutoThrottle extension (disabled by default) 98 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 99 | #AUTOTHROTTLE_ENABLED = True 100 | # The initial download delay 101 | #AUTOTHROTTLE_START_DELAY = 5 102 | # The maximum download delay to be set in case of high latencies 103 | #AUTOTHROTTLE_MAX_DELAY = 60 104 | # The average number of requests Scrapy should be sending in parallel to 105 | # each remote server 106 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 107 | # Enable showing throttling stats for every response received: 108 | #AUTOTHROTTLE_DEBUG = False 109 | 110 | # Enable and configure HTTP caching (disabled by default) 111 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 112 | #HTTPCACHE_ENABLED = True 113 | #HTTPCACHE_EXPIRATION_SECS = 0 114 | #HTTPCACHE_DIR = 'httpcache' 115 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 116 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 117 | 118 | 119 | #开启scrapy-redis分布式 120 | #修改调度器 121 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 122 | #开启去重 123 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
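#
# --- Added note (illustrative, not part of the original project) -------------
# settings.py above switches this project to scrapy-redis (custom SCHEDULER and
# RFPDupeFilter). scrapy-redis talks to Redis on localhost:6379 unless told
# otherwise; if Redis runs elsewhere, the connection is usually configured in
# settings.py as well, for example:
#
#     REDIS_URL = 'redis://127.0.0.1:6379'   # assumed address, adjust as needed
#     SCHEDULER_PERSIST = True               # keep the request queue between runs
# ------------------------------------------------------------------------------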
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/__pycache__/lagouspider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/lagou/spiders/lagouspider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from bs4 import BeautifulSoup 4 | import json 5 | from scrapy.conf import settings 6 | from ..items import LagouItem 7 | import requests 8 | 9 | occupation_list = [] 10 | 11 | class LagouspiderSpider(scrapy.Spider): 12 | name = "lagouspider" 13 | allowed_domains = ["lagou.com"] 14 | start_urls = ['https://www.lagou.com'] 15 | cookie = settings['COOKIE'] 16 | headers = { 17 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 18 | 'Accept-Encoding': 'gzip, deflate, br', 19 | 'Accept-Language': 'zh-CN,zh;q=0.8', 20 | 'Connection': 'keep-alive', 21 | 'Content-Length': '25', 22 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 23 | 'Cookie': 'user_trace_token=20171016205119-74af5f4b-4f58-445f-9516-f8c19f475243; LGUID=20171016205145-c44d7b22-b270-11e7-991d-525400f775ce; JSESSIONID=ABAAABAAAIAACBIB57C77C4ECB2518EE2D0B9135B108E91; _gat=1; PRE_UTM=; PRE_HOST=www.baidu.com; PRE_SITE=http%3A%2F%2Fwww.baidu.com%2Fs%3Fwd%3D%25E6%258B%2589%25E9%2592%25A9%26rsv_spt%3D1%26rsv_iqid%3D0xaa9b6e5d00013b57%26issp%3D1%26f%3D8%26rsv_bp%3D0%26rsv_idx%3D2%26ie%3Dutf-8%26rqlang%3D%26tn%3Dbaiduhome_pg%26ch%3D; PRE_LAND=https%3A%2F%2Fwww.lagou.com%2F; TG-TRACK-CODE=index_navigation; _putrc=54D6D44AC87A2A52; login=true; unick=%E6%9D%A8%E9%B9%8F%E5%8D%87; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=2; _ga=GA1.2.358203920.1509241265; _gid=GA1.2.1325253718.1511828511; LGSID=20171128082152-21c7cb9f-d3d2-11e7-ae6a-525400f775ce; LGRID=20171128082206-2a422523-d3d2-11e7-ae6a-525400f775ce; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1510323969,1511253705,1511253729,1511828514; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1511828527; SEARCH_ID=8267a0cd29464ff6b38f86b1d1e17b96; index_location_city=%E5%8C%97%E4%BA%AC', 24 | 'Host': 'www.lagou.com', 25 | 'Origin': 'https://www.lagou.com', 26 | 'Referer': "h'ttps://www.lagou.com/jobs/list_Python?px=default&city=%E5%8C%97%E4%BA%AC", 27 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 28 | 'X-Anit-Forge-Code': '0', 29 | 'X-Anit-Forge-Token': 'None', 30 | 'X-Requested-With': 'XMLHttpRequest' 31 | } 32 | 33 | 34 | def parse(self, response,pn=1): 35 | #获取所有职位 36 | for i in range(1,8): 37 | occos = response.xpath('//*[@id="sidebar"]/div/div[{}]/div/dl/dd/a/text()'.format(i)).extract() 38 | for occo in occos: 39 | # url = 
"https://www.lagou.com/jobs/list_{}?px=default&city=%E5%85%A8%E5%9B%BD#filterBox".format('java') 40 | # yield scrapy.Request(url,callback=self.parse_page) 41 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0' 42 | data = { 43 | 'first':'true', 44 | 'pn':pn, 45 | 'kd':'java' 46 | } 47 | #获取返回的json数据 48 | response = requests.post(occu_url, data=data, headers=self.headers) 49 | # positionIds = json.loads(response.text).get('content').get('positionResult').get('result') 50 | try: 51 | pageSize = json.loads(response.text).get('content').get('pageSize') 52 | totalCount = json.loads(response.text).get('content').get('positionResult').get('totalCount') 53 | except json.decoder.JSONDecodeError: 54 | continue 55 | #获取总页数 56 | if int(totalCount) % int(pageSize) == 0: 57 | pages = int(int(totalCount)/int(pageSize)) 58 | else: 59 | pages = int(int(totalCount)/int(pageSize)) + 1 60 | 61 | for page in range(int(pages)): 62 | pn = page + 1 63 | occu_url = 'https://www.lagou.com/jobs/positionAjax.json?px=default&needAddtionalResult=false&isSchoolJob=0' 64 | data = { 65 | 'first': 'true', 66 | 'pn': pn, 67 | 'kd': occo 68 | } 69 | 70 | response = requests.post(occu_url, data=data, headers=self.headers) 71 | 72 | try: 73 | if 'content' in json.loads(response.text).keys(): 74 | positionIds = json.loads(response.text).get('content').get('positionResult').get('result') 75 | 76 | 77 | for positionId in positionIds: 78 | # try: 79 | position = positionId.get('positionId') 80 | # except: 81 | # continue 82 | # print(positionId) 83 | item = LagouItem() 84 | # self.item = info 85 | item['companyFullName'] = positionId['companyFullName'] # 公司名字 86 | # print(info['公司名字']) 87 | item['city'] = positionId['city'] # 职位城市 88 | item['positionName'] = positionId['positionName'] # 招聘职位 89 | item['formatCreateTime'] = positionId['formatCreateTime'] # 发布时间 90 | item['salary'] = positionId['salary'] # 薪资待遇 91 | item['workYear'] = positionId['workYear'] # 经验要求 92 | item['companySize'] = positionId['companySize'] # 公司大小 93 | item['positionAdvantage'] = positionId['positionAdvantage'] # 公司福利 94 | item['district'] = positionId['district'] # 公司地址 95 | info_url = "https://www.lagou.com/jobs/{}.html".format(position) 96 | # item = LagouItem() 97 | # item['companyhref'] = info_url 98 | print(item) 99 | yield item 100 | # yield scrapy.Request(url=info_url, callback=self.parse_fina) 101 | except json.decoder.JSONDecodeError: 102 | continue 103 | except TimeoutError: 104 | continue 105 | # print(info_url) 106 | # yield item 107 | 108 | #获取详细页面的信息,这里试验了,很慢,只获取了页面链接,我就只获取了json数据里面的信息,大部门差不多了 109 | # def parse_fina(self,response): 110 | # item = LagouItem() 111 | # response = response.text 112 | # print(response.status) 113 | # if response.status == 200: 114 | # try: 115 | # item['companyFullName'] = response.xpath('//*[@id="job_company"]/dt/a/img/@alt').extract() # 公司名字 116 | 117 | # print(info['公司名字']) 118 | # item['city'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[2]/text()').extract() # 职位城市 119 | # item['positionName'] = response.xpath('/html/body/div[2]/div/div[1]/div/span/text()').extract() # 招聘职位 120 | # item['formatCreateTime'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[2]/text()').extract() # 发布时间 121 | # item['salary'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[1]/text()').extract() # 薪资待遇 122 | # item['workYear'] = response.xpath('/html/body/div[2]/div/div[1]/dd/p[1]/span[3]/text()').extract()[0] # 经验要求 123 | # 
item['Jobdescriptions'] = response.xpath('//*[@id="job_detail"]/dd[2]/div/p/text()').extract() # 职位描述 124 | # item['companySize'] = response.xpath('//*[@id="job_company"]/dd/ul/li[3]/text()').extract() # 公司大小 125 | # item['positionAdvantage'] = response.xpath('//*[@id="job_detail"]/dd[1]/p/text()').extract() # 公司福利 126 | # item['district'] = response.xpath('//*[@id="job_detail"]/dd[3]/div[1]/a/text()').extract() # 公司地址 127 | # item['companyhref'] = response.xpath('//*[@id="job_company"]/dd/ul/li[4]/a/@href').extract() # 公司链接 128 | # except IndexError: 129 | # pass 130 | # print(item) 131 | # yield item -------------------------------------------------------------------------------- /scrapy爬虫简单项目/lagou-scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = lagou.settings 8 | 9 | [deploy:demo] 10 | url = http://localhost:6800/ 11 | project = lagou 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/python爬虫实现有道词典.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import urllib.request 3 | import json 4 | import urllib.parse 5 | 6 | url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null' 7 | headers = { 8 | 'Cookie':'OUTFOX_SEARCH_USER_ID=-763428860@10.168.8.61; DICT_UGC=be3af0da19b5c5e6aa4e17bd8d90b28a|; JSESSIONID=abc31lbWsGNO67M3Fi-8v; OUTFOX_SEARCH_USER_ID_NCOO=1648534080.0892432; _ntes_nnid=bf4e54b134dc8a8b2f65cd59c8ba272e,1508592727589; ___rl__test__cookies=1508593353423', 9 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 10 | } 11 | 12 | dict1 = { 13 | '0':['zh-CHS','en'], 14 | '1':['en','zh-CHS'], 15 | '2':['zh-CHS','ja'], 16 | '3':['ja','zh-CHS'], 17 | '4':['zh-CHS','ko'], 18 | '5':['ko','zh-CHS'], 19 | '6':['zh-CHS','fr'], 20 | '7':['fr','zh-CHS'], 21 | '8':['zh-CHS','ru'], 22 | '9':['ru','zh-CHS'], 23 | '10':['zh-CHS','es'], 24 | '11':['es','zh-CHS'], 25 | '12':['zh-CHS','pt'], 26 | '13':['pt','zh-CHS'], 27 | } 28 | switch = input("请选择语言翻译:0:中文-》英语,1:英语-》中文,2:中文-》日语,3:日语-》中文,\n,4:中文-》韩语,5:韩语-》中文," 29 | "6:中文-》法语,7:法语-》中文,8:中文-》俄语,\n,9:俄语-》中文,10:中文-》西班牙语,\n,11:西班牙语-》中文,12:中文-》葡萄牙语," 30 | "13:葡萄牙语-》中文:") 31 | 32 | star = dict1[switch][0] 33 | end = dict1[switch][1] 34 | # print(star) 35 | # print(end) 36 | word = input("请输入你要翻译的语句:") 37 | data = { 38 | 'i':word, 39 | 'from':star, 40 | 'to':end, 41 | 'smartresult':'dict', 42 | 'client':'fanyideskweb', 43 | 'salt':'1508593351114', 44 | 'sign':'32cded672e5ba31d4f4929650a5ad22e', 45 | 'doctype':'json', 46 | 'version':'2.1', 47 | 'keyfrom':'fanyi.web', 48 | 'action':'FY_BY_CLICKBUTTION', 49 | 'typoResult':'true' 50 | } 51 | 52 | data = urllib.parse.urlencode(data).encode("utf-8") 53 | response = urllib.request.urlopen(url=url,data=data) 54 | datas = json.loads(response.read().decode("utf-8")) 55 | answer = datas.get('translateResult')[0][0]['tgt'] 56 | print(answer) -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about 
the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = taobaoclass.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = taobaoclass 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TaobaoclassItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | title = scrapy.Field() 17 | price = scrapy.Field() 18 | fukuan = scrapy.Field() 19 | dizhi = scrapy.Field() 20 | url = scrapy.Field() 21 | dianqu = scrapy.Field() 22 | 23 | class Iphone(scrapy.Item): 24 | image_urls = scrapy.Field() 25 | images = scrapy.Field() 26 | title = scrapy.Field() 27 | price = scrapy.Field() 28 | fukuan = scrapy.Field() 29 | dizhi = scrapy.Field() 30 | url = scrapy.Field() 31 | dianqu = scrapy.Field() 32 | 33 | class Samsung(scrapy.Item): 34 | image_urls = scrapy.Field() 35 | images = scrapy.Field() 36 | title = scrapy.Field() 37 | price = scrapy.Field() 38 | fukuan = scrapy.Field() 39 | dizhi = scrapy.Field() 40 | url = scrapy.Field() 41 | dianqu = scrapy.Field() 42 | 43 | 44 | 45 | class HuaWei(scrapy.Item): 46 | image_urls = scrapy.Field() 47 | images = scrapy.Field() 48 | title = scrapy.Field() 49 | price = scrapy.Field() 50 | fukuan = scrapy.Field() 51 | dizhi = scrapy.Field() 52 | url = scrapy.Field() 53 | dianqu = scrapy.Field() 54 | 55 | 56 | class Magic(scrapy.Item): 57 | image_urls = scrapy.Field() 58 | images = scrapy.Field() 59 | title = scrapy.Field() 60 | price = scrapy.Field() 61 | fukuan = scrapy.Field() 62 | dizhi = scrapy.Field() 63 | url = scrapy.Field() 64 | dianqu = scrapy.Field() 65 | 66 | 67 | 68 | class ShouJike(scrapy.Item): 69 | image_urls = scrapy.Field() 70 | images = scrapy.Field() 71 | title = scrapy.Field() 72 | price = scrapy.Field() 73 | fukuan = scrapy.Field() 74 | dizhi = scrapy.Field() 75 | url = scrapy.Field() 76 | dianqu = scrapy.Field() 77 | 
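The six item classes in taobaoclass/items.py declare the same eight fields by hand, so adding or renaming a field means touching every class. A possible refactor (a sketch only, not part of the original repository; the base-class name PhoneItem is made up here) is to declare the fields once and subclass per product category. Scrapy items inherit fields from their parent class, and the pipeline's isinstance() dispatch keeps working because each subclass is still a distinct type:

import scrapy


class PhoneItem(scrapy.Item):
    # fields shared by every product category (names match the original classes)
    image_urls = scrapy.Field()
    images = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    fukuan = scrapy.Field()   # sales / payment count (view_sales)
    dizhi = scrapy.Field()    # seller location (item_loc)
    url = scrapy.Field()
    dianqu = scrapy.Field()   # shop name (nick)


class Iphone(PhoneItem):
    pass


class Samsung(PhoneItem):
    pass


class HuaWei(PhoneItem):
    pass


class Magic(PhoneItem):
    pass


class ShouJike(PhoneItem):
    pass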
-------------------------------------------------------------------------------- /scrapy爬虫简单项目/taobaoclass/taobaoclass/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TaobaoclassSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pymongo
8 | from scrapy.conf import settings
9 | from taobaoclass.items import Iphone, Samsung, Magic, HuaWei, ShouJike
10 | 
11 | 
12 | class TaobaoclassPipeline(object):
13 |     def process_item(self, item, spider):
14 |         host = settings['MONGODB_HOST']
15 |         port = settings['MONGODB_PORT']
16 |         dbName = settings['MONGODB_DBNAME']
17 |         client = pymongo.MongoClient(host=host, port=port)
18 |         tdb = client[dbName]
19 | 
20 |         # route each item type to its own MongoDB collection
21 |         if isinstance(item, Iphone):
22 |             self.post = tdb[settings['MONGODB_DOCNAME_IP']]
23 |         elif isinstance(item, Samsung):
24 |             self.post = tdb[settings['MONGODB_DOCNAME_SAM']]
25 |         elif isinstance(item, HuaWei):
26 |             self.post = tdb[settings['MONGODB_DOCNAME_HW']]
27 |         elif isinstance(item, ShouJike):
28 |             self.post = tdb[settings['MONGODB_DOCNAME_SJK']]
29 |         elif isinstance(item, Magic):
30 |             self.post = tdb[settings['MONGODB_DOCNAME_MAG']]
31 |         taobao = dict(item)
32 |         self.post.insert_one(taobao)
33 | 
34 |         return item
35 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for taobaoclass project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'taobaoclass' 13 | 14 | SPIDER_MODULES = ['taobaoclass.spiders'] 15 | NEWSPIDER_MODULE = 'taobaoclass.spiders' 16 | 17 | MONGODB_HOST = '127.0.0.1' 18 | MONGODB_PORT = 27017 19 | MONGODB_DBNAME = 'taobao' 20 | MONGODB_DOCNAME_IP = 'ipad' 21 | MONGODB_DOCNAME_SAM = 'samsung' 22 | MONGODB_DOCNAME_HW = 'huawei' 23 | MONGODB_DOCNAME_MAG = 'magic' 24 | MONGODB_DOCNAME_SJK = 'shoujike' 25 | 26 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 27 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' 28 | 29 | # Obey robots.txt rules 30 | ROBOTSTXT_OBEY = False 31 | 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | #CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | #DOWNLOAD_DELAY = 3 39 | # The download delay setting will honor only one of: 40 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | #CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | COOKIES_ENABLED = True 45 | 46 | # Disable Telnet Console (enabled by default) 47 | #TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | DEFAULT_REQUEST_HEADERS = { 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Accept-Language': 'en', 53 | } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | #SPIDER_MIDDLEWARES = { 58 | # 'taobaoclass.middlewares.TaobaoclassSpiderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | #DOWNLOADER_MIDDLEWARES = { 64 | # 'taobaoclass.middlewares.MyCustomDownloaderMiddleware': 543, 65 | #} 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | #EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | #} 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'taobaoclass.pipelines.TaobaoclassPipeline': 300, 77 | 'scrapy.contrib.pipeline.images.ImagePipeline':1 78 | } 79 | IMAGES_STORE = 'pic/' 80 | IMAGES_URL_FIELD = 'image_urls' 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 84 | #AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | #AUTOTHROTTLE_START_DELAY = 5 87 | # The maximum download delay to be set in case of high latencies 88 | #AUTOTHROTTLE_MAX_DELAY = 60 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server 91 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 92 | # Enable showing throttling stats for every response received: 93 | #AUTOTHROTTLE_DEBUG = False 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See 
http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
97 | #HTTPCACHE_ENABLED = True
98 | #HTTPCACHE_EXPIRATION_SECS = 0
99 | #HTTPCACHE_DIR = 'httpcache'
100 | #HTTPCACHE_IGNORE_HTTP_CODES = []
101 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
102 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/taobaoclass/taobaoclass/spiders/taobao_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import json
4 | from taobaoclass import items
5 | from taobaoclass.items import Iphone, Samsung, Magic, HuaWei, ShouJike
6 | import urllib.parse
7 | 
8 | 
9 | class TaobaoSpiderSpider(scrapy.Spider):
10 |     name = "taobao_spider"
11 |     totalItem = ['magic', '华为mate9']
12 |     allowed_domains = ["taobao.com"]
13 |     start_urls = []
14 |     count = 0
15 |     total = 0
16 |     # build the paginated search-API start URLs for every keyword
17 |     while(count < 500):
18 |         for eveItem in totalItem:
19 |             count = count + 13
20 |             new_url = 'https://s.taobao.com/api?_ksTS=1488147288907_219&ajax=true&m=customized&q=' + urllib.parse.quote(eveItem) + '&imgfile=&js=1&stats_click=search_radio_all%3A1&ie=utf8&s=' + str(count) + '&bcoffset=-3'
21 |             start_urls.append(new_url)
22 |     print(start_urls)
23 | 
24 | 
25 |     def parse(self, response):
26 |         try:
27 |             # strip the JSONP wrapper before parsing the JSON payload
28 |             html = json.loads(response.body.decode().replace('}}})', '}}}').replace("jsonp220(", ''))
29 |             for eve in html['API.CustomizedApi']['itemlist']['auctions']:
30 |                 print("++++++++++++++++++++++++++++++++++++++++")
31 |                 if 'ipad' in str(response.url):
32 |                     item = Iphone()
33 |                     print("ipad")
34 |                 elif 'samsung' in str(response.url):
35 |                     item = Samsung()
36 |                     print("Samsung")
37 |                 elif 'mate9' in str(response.url):
38 |                     item = HuaWei()
39 |                     print('huawei')
40 |                 else:
41 |                     item = ShouJike()
42 |                     print('shoujike')
43 |                 img = []
44 |                 self.total = self.total + 1
45 |                 item['title'] = eve['raw_title']
46 |                 item['price'] = eve['view_price']
47 |                 item['fukuan'] = eve['view_sales']
48 |                 item['dizhi'] = eve['item_loc']
49 |                 item['url'] = 'https:' + eve['comment_url']
50 |                 item['dianqu'] = eve['nick']   # shop name (field is spelled dianqu in items.py)
51 |                 img.append(str('http://' + eve['pic_url']))
52 |                 item['image_urls'] = img
53 |                 yield item
54 |         except Exception as e:
55 |             print(e)
--------------------------------------------------------------------------------
/scrapy爬虫简单项目/tutorial/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = tutorial.settings
8 | 
9 | 
[deploy] 10 | #url = http://localhost:6800/ 11 | project = tutorial 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class TutorialItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | 16 | class DmozItem(scrapy.Item): 17 | title = scrapy.Field() 18 | link = scrapy.Field() 19 | desc = scrapy.Field() 20 | name = scrapy.Field() 21 | price = scrapy.Field() 22 | last_updated = scrapy.Field(serializer = str) 23 | class TestItem(scrapy.Item): 24 | id = scrapy.Field() 25 | name = scrapy.Field() 26 | description = scrapy.Field() 27 | 28 | 29 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class TutorialSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class TutorialPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for tutorial project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'tutorial' 13 | 14 | SPIDER_MODULES = ['tutorial.spiders'] 15 | NEWSPIDER_MODULE = 'tutorial.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'tutorial (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'tutorial.middlewares.TutorialSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'tutorial.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure 
item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'tutorial.pipelines.TutorialPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/settings.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/__init__.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | # from tutorial.tutorial.items import DmozItem 4 | 5 | 6 | class DmozSpider(scrapy.Spider): 7 | 8 | name = "dmoz" 9 | allowed_domains = ['dmoz.org'] 10 | start_urls = [ 11 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/", 12 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/" 13 | ] 14 | # def parse(self, response): 15 | # # filename = response.url.split("/")[-2] 16 | # # with open(filename,"wb") as f: 17 | # # f.write(response.body) 18 | # for sel in response.xpath('//ul/li'): 19 | # # title = sel.xpath('a/text()').extract() 20 | # # link = sel.xpath('a/@href').extract() 21 | # # desc = self.xpath('text()').extract() 22 | # # print(title,link,desc) 23 | # item = DmozItem() 24 | # item['title'] = sel.xpath('a/text()').extract() 25 | # item['scrlink'] = sel.xpath('a/@href').extract() 26 | # item['desc'] = sel.xpath('text()').extract() 27 | # yield item -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/dmoz_spider.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | 5 | class MydomainSpider(scrapy.Spider): 6 | name = 'mydomain' 7 | allowed_domains = ['mydomain.com'] 8 | start_urls = ['http://mydomain.com/'] 9 | 10 | def parse(self, response): 11 | pass 12 | class MySpider(scrapy.Spider): 13 | name = "example.com" 14 | allowed_domains = ['example.com'] 15 | start_urls = [ 16 | 'http://www.example.com/1.html', 17 | 'http://www.example.com/2.html', 18 | 'http://www.example.com/3.html', 19 | ] 20 | 21 | def parse(self, response): 22 | self.log('A response from %s just arrived!' 
% response.url) -------------------------------------------------------------------------------- /scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/tutorial/tutorial/spiders/mydomain.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zaobao.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zaobao 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__init__.py -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ZaobaoItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | name = scrapy.Field() 15 | url = scrapy.Field() 16 | data = scrapy.Field() 17 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZaobaoSpiderMiddleware(object): 12 | # Not all methods need to be defined. 
If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ZaobaoPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zaobao project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zaobao' 13 | 14 | SPIDER_MODULES = ['zaobao.spiders'] 15 | NEWSPIDER_MODULE = 'zaobao.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zaobao (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'zaobao.middlewares.ZaobaoSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'zaobao.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'zaobao.pipelines.ZaobaoPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/scrapy爬虫简单项目/zaobao/zaobao/spiders/__pycache__/zaobao_spider.cpython-36.pyc -------------------------------------------------------------------------------- /scrapy爬虫简单项目/zaobao/zaobao/spiders/zaobao_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | 4 | from zaobao.items import ZaobaoItem 5 | 6 | 7 | class ZaobaoSpiderSpider(scrapy.Spider): 8 | name = "zaobao_spider" 9 | allowed_domains = ["zaobao.com"] 10 | start_urls = ['http://zaobao.com/'] 11 | 12 | def parse(self, response): 13 | for eve in response.xpath('//*[@id="DressUp"]/div/div/div/div/a/@href'): 14 | full_url = response.urljoin(eve.extract()) 15 | yield scrapy.Request(full_url, callback=self.parse_news) 16 | 17 | def parse_news(self, response): 18 | item = ZaobaoItem() 19 | item['name'] = response.xpath('//*[@id="MainCourse"]/div/h1/text()').extract() 20 | item['url'] = response.xpath('//*[@id="MainCourse"]/div/div[2]').extract() 21 | print(item) 22 | yield item -------------------------------------------------------------------------------- /zhihu.com/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zhihuuser 12 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/items.cpython-36.pyc
-------------------------------------------------------------------------------- /zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Item,Field 10 | 11 | class ZhihuuserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | id = Field() 15 | name = Field() 16 | avatar_url = Field() 17 | user_type = Field() 18 | answer_count = Field() 19 | url = Field() 20 | url_token = Field() 21 | headline = Field() 22 | 23 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | class ZhihuuserPipeline(object): 10 | def __init__(self, mongo_uri, mongo_db): 11 | self.mongo_uri = mongo_uri 12 | self.mongo_db = mongo_db 13 | 14 | @classmethod 15 | def from_crawler(cls,crawler): 16 | return cls( 17 | mongo_uri = crawler.settings.get('MONGO_URI'), 18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 19 | ) 20 | 21 | def open_spider(self,spider): 22 | self.client = pymongo.MongoClient() 23 | self.db = self.client['zhihuuser'] 24 | 25 | def close_spider(self,spider): 26 | self.client.close() 27 | 28 | def process_item(self, item, spider): 29 | # collection_name = item.__class__.__name__ 30 | # self.db[collection_name].insert(dict(item)) 31 | # Deduplicate: update the record if it already exists, insert it otherwise 32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True) 33 | return item 34 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 |
#SPIDER_MIDDLEWARES = { 52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | #HTTPCACHE_ENABLED = True 89 | #HTTPCACHE_EXPIRATION_SECS = 0 90 | #HTTPCACHE_DIR = 'httpcache' 91 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | 95 | #修改调度器 96 | # SCHEDULER = "scrapy_redis.scheduler.Scheduler" 97 | # DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu.com/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu.com/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | import scrapy 5 | from scrapy import spider,Request 6 | from ..items import ZhihuuserItem 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = "zhihu" 10 | allowed_domains = ["zhihu.com"] 11 | start_urls = ['http://www.zhihu.com/'] 12 | 13 | start_user = 'excited-vczh' 14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 16 | 17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 19 | 20 | #粉丝列表 21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 23 | 24 | def start_requests(self): 25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user) 27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows) 28 | 29 | #用户个人信息 30 | def parse_user(self, response): 31 | result = json.loads(response.text) 32 | item = ZhihuuserItem() 33 | for field in item.fields: 34 | #如果定义的item是获取的键名之一,就赋值 35 | if field in result.keys(): 36 | item[field] = result.get(field) 37 | yield item 38 | 39 | yield Request(self.user_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_follows) 40 | yield Request(self.follows_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 41 | yield Request(self.followers_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 42 | 43 | #关注着信息 44 | def parse_follows(self,response): 45 | 46 | results = json.loads(response.text) 47 | 48 | #先判断data键名是否存在 49 | if 'data' in results.keys(): 50 | for result in 
results.get('data'): 51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user) 52 | 53 | # Get the next-page link and keep processing the following pages 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page,self.parse_follows) 57 | 58 | # Follower info 59 | def parse_followers(self, response): 60 | 61 | results = json.loads(response.text) 62 | 63 | # First check whether the 'data' key exists 64 | if 'data' in results.keys(): 65 | for result in results.get('data'): 66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 67 | callback=self.parse_user) 68 | 69 | # Get the next-page link and keep processing the following pages 70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 71 | next_page = results.get('paging').get('next') 72 | yield Request(next_page, self.parse_followers) -------------------------------------------------------------------------------- /zhihu/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zhihuuser.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | #project = zhihuuser 12 | # Deploy the project to a host for distributed crawling via scrapyd 13 | url = http://localhost:6800/addversion.json 14 | project = zhihuuser -------------------------------------------------------------------------------- /zhihu/zhihuuser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__init__.py -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Item,Field 10 | 11 | class ZhihuuserItem(Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | id = Field() 15 | name = Field() 16 | avatar_url = Field() 17 | user_type = Field() 18 | answer_count = Field() 19 | url
= Field() 20 | url_token = Field() 21 | headline = Field() 22 | 23 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZhihuuserSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import pymongo 9 | class ZhihuuserPipeline(object): 10 | def __init__(self, mongo_uri, mongo_db): 11 | self.mongo_uri = mongo_uri 12 | self.mongo_db = mongo_db 13 | 14 | @classmethod 15 | def from_crawler(cls,crawler): 16 | return cls( 17 | mongo_uri = crawler.settings.get('MONGO_URI'), 18 | mongo_db = crawler.settings.get('MONGO_DATABASE','items') 19 | ) 20 | 21 | def open_spider(self,spider): 22 | self.client = pymongo.MongoClient() 23 | self.db = self.client['zhihuuser'] 24 | 25 | def close_spider(self,spider): 26 | self.client.close() 27 | 28 | def process_item(self, item, spider): 29 | # collection_name = item.__class__.__name__ 30 | # self.db[collection_name].insert(dict(item)) 31 | # Deduplicate: update the record if it already exists, insert it otherwise 32 | self.db['user'].update({'url_token':item['url_token']},{'$set':item},True) 33 | return item 34 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zhihuuser project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zhihuuser' 13 | 14 | SPIDER_MODULES = ['zhihuuser.spiders'] 15 | NEWSPIDER_MODULE = 'zhihuuser.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'zhihuuser (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0', 46 | 'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 51 |
#SPIDER_MIDDLEWARES = { 52 | # 'zhihuuser.middlewares.ZhihuuserSpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'zhihuuser.middlewares.MyCustomDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'zhihuuser.pipelines.ZhihuuserPipeline': 300, 71 | #加入scrapy_redis中间件,分布式,不注释此行,每台机器爬取的item存储到各自的数据库,网络传输压力大 72 | #为了使每台机器爬取的item不存储到数据库,注释此行 73 | # 'scrapy_redis.pipelines.RedisPipeline':301 74 | } 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 78 | #AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | #AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | #AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | #AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | #HTTPCACHE_ENABLED = True 92 | #HTTPCACHE_EXPIRATION_SECS = 0 93 | #HTTPCACHE_DIR = 'httpcache' 94 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 97 | 98 | #分布式 99 | 100 | #修改调度器 101 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 102 | #开启去重 103 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 104 | 105 | #redis数据库连接信息 106 | REDIS_URL = 'redis://127.0.0.1:6379' 107 | 108 | #爬取完不清空请求队列和指纹 ,没什么用,一般默认False 109 | # SCHEDULER_PERSIST = True 110 | #在每次爬取的时候,都会把指纹和队列清空,相当于重新进行了爬取 111 | #SCHEDULER_FLUSH_ON_START = True -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/zhihu/zhihuuser/spiders/__pycache__/zhihu.cpython-36.pyc -------------------------------------------------------------------------------- /zhihu/zhihuuser/spiders/zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | 4 | import scrapy 5 | from scrapy import spider,Request 6 | from ..items import ZhihuuserItem 7 | 8 | class ZhihuSpider(scrapy.Spider): 9 | name = "zhihu" 10 | allowed_domains = ["zhihu.com"] 11 | start_urls = ['http://www.zhihu.com/'] 12 | 13 | start_user = 'excited-vczh' 14 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}' 15 | user_query = 'allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 16 | 17 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 19 | 20 | #粉丝列表 21 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}' 22 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics' 23 | 24 | def start_requests(self): 25 | # url = 'https://www.zhihu.com/api/v4/members/wang-qing-qing-70-81?include=allow_message,is_followed,is_following,is_org,is_blocking,employments,answer_count,follower_count,articles_count,gender,badge[?(type=best_answerer)].topics' 26 | yield Request(self.user_url.format(user = self.start_user, include = self.user_query), self.parse_user) 27 | yield Request(self.follows_url.format(user = self.start_user, include = self.follows_query,offset = 0, limit = 20), callback = self.parse_follows) 28 | 29 | #用户个人信息 30 | def parse_user(self, response): 31 | result = json.loads(response.text) 32 | item = ZhihuuserItem() 33 | for field in item.fields: 34 | #如果定义的item是获取的键名之一,就赋值 35 | if field in result.keys(): 36 | item[field] = result.get(field) 37 | yield item 38 | 39 | yield Request(self.user_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_follows) 40 | yield Request(self.follows_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 41 | yield Request(self.followers_url.format(user = result.get('url_token'), include=self.follows_query, limit=20, offset = 0),callback = self.parse_followers) 42 | 43 | #关注着信息 44 | def parse_follows(self,response): 45 | 46 | results = json.loads(response.text) 47 | 48 | #先判断data键名是否存在 49 | if 'data' in results.keys(): 50 | for result in results.get('data'): 
51 | yield Request(self.user_url.format(user = result.get('url_token'), include = self.user_query), callback = self.parse_user) 52 | 53 | #获取下一页链接,然后继续对下一页数据进行处理 54 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 55 | next_page = results.get('paging').get('next') 56 | yield Request(next_page,self.parse_follows) 57 | 58 | #粉丝信息 59 | def parse_followers(self, response): 60 | 61 | results = json.loads(response.text) 62 | 63 | # 先判断data键名是否存在 64 | if 'data' in results.keys(): 65 | for result in results.get('data'): 66 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query), 67 | callback=self.parse_user) 68 | 69 | # 获取下一页链接,然后继续对下一页数据进行处理 70 | if 'paging' in results.keys() and results.get('paging').get('is_end') == False: 71 | next_page = results.get('paging').get('next') 72 | yield Request(next_page, self.parse_followers) -------------------------------------------------------------------------------- /可视化文件显示程序.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YPSheng/python/4fa15657937bab6c5662bbd7b4a73dd81951b34f/可视化文件显示程序.zip -------------------------------------------------------------------------------- /基于python的turtle画出叮当猫.py: -------------------------------------------------------------------------------- 1 | import turtle 2 | 3 | turtle.speed(5) 4 | turtle.circle(50) 5 | turtle.begin_fill() #画头 6 | turtle.circle(85) 7 | turtle.fillcolor("blue") 8 | turtle.end_fill() 9 | 10 | # turtle.penup() 11 | # turtle.goto(0,20) 12 | # turtle.pendown() 13 | 14 | # turtle.begin_fill() 15 | # turtle.circle(35) 16 | # turtle.fillcolor("white") 17 | # turtle.end_fill() 18 | 19 | turtle.begin_fill() #画脸 20 | turtle.circle(60) 21 | turtle.fillcolor("white") 22 | turtle.end_fill() 23 | 24 | 25 | 26 | turtle.penup() 27 | turtle.goto(-20,95) #化左眼眶 28 | turtle.pendown() 29 | turtle.begin_fill() 30 | turtle.circle(19) 31 | turtle.fillcolor("white") 32 | turtle.end_fill() 33 | 34 | 35 | 36 | turtle.penup() #画右眼眶 37 | turtle.goto(20,95) 38 | turtle.pendown() 39 | turtle.begin_fill() 40 | turtle.circle(19) 41 | turtle.fillcolor("white") 42 | turtle.end_fill() 43 | 44 | turtle.penup() #化左眼珠 45 | turtle.goto(-8,111) 46 | turtle.pendown() 47 | turtle.begin_fill() 48 | turtle.fillcolor("black") 49 | turtle.circle(3) 50 | turtle.end_fill() 51 | 52 | 53 | turtle.penup() #画右眼珠 54 | turtle.goto(8,111) 55 | turtle.pendown() 56 | turtle.begin_fill() 57 | turtle.fillcolor("black") 58 | turtle.circle(3) 59 | turtle.end_fill() 60 | 61 | turtle.penup() #画鼻子 62 | turtle.goto(0,85) 63 | turtle.pendown() 64 | turtle.begin_fill() 65 | turtle.circle(10) 66 | turtle.fillcolor("red") 67 | turtle.end_fill() 68 | 69 | 70 | turtle.goto(0,30) #画竖线 71 | 72 | turtle.penup() #左边第一根胡子 73 | turtle.goto(-20,70) 74 | turtle.pendown() 75 | turtle.goto(-45,80) 76 | 77 | turtle.penup() #左边第二根胡子 78 | turtle.goto(-20,60) 79 | turtle.pendown() 80 | turtle.goto(-47,60) 81 | 82 | turtle.penup() #左边第三根胡子 83 | turtle.goto(-20,50) 84 | turtle.pendown() 85 | turtle.goto(-47,40) 86 | 87 | turtle.penup() #右边第三根胡子 88 | turtle.goto(20,50) 89 | turtle.pendown() 90 | turtle.goto(47,40) 91 | 92 | 93 | turtle.penup() #右边第二根胡子 94 | turtle.goto(20,60) 95 | turtle.pendown() 96 | turtle.goto(47,60) 97 | 98 | 99 | turtle.penup() #左边第一根胡子 100 | turtle.goto(20,70) 101 | turtle.pendown() 102 | turtle.goto(45,80) 103 | 104 | turtle.penup() #右边胳膊1 105 | turtle.goto(50,20) 106 | turtle.pendown() 107 | turtle.goto(100,-10) 108 | 109 | 
110 | turtle.penup() #右边胳膊2 111 | turtle.goto(50,-20) 112 | turtle.pendown() 113 | turtle.goto(80,-40) 114 | 115 | turtle.begin_fill() 116 | turtle.goto(100,-10) 117 | turtle.goto(50,20) 118 | turtle.goto(50,-20) 119 | turtle.goto(80,-40) 120 | turtle.fillcolor("yellow") 121 | turtle.end_fill() 122 | 123 | 124 | 125 | turtle.penup() #右手 126 | turtle.goto(100,-50) 127 | turtle.pendown() 128 | turtle.begin_fill() 129 | turtle.circle(20) 130 | turtle.fillcolor("blue") 131 | turtle.end_fill() 132 | 133 | 134 | 135 | turtle.penup() #左边胳膊1 136 | turtle.goto(-50,20) 137 | turtle.pendown() 138 | turtle.goto(-100,-10) 139 | 140 | 141 | turtle.penup() #左边胳膊2 142 | turtle.goto(-50,-20) 143 | turtle.pendown() 144 | turtle.goto(-80,-40) 145 | 146 | turtle.begin_fill() 147 | turtle.goto(-100,-10) 148 | turtle.goto(-50,20) 149 | turtle.goto(-50,-20) 150 | turtle.goto(-80,-40) 151 | turtle.fillcolor("yellow") 152 | turtle.end_fill() 153 | 154 | turtle.penup() #左手 155 | turtle.goto(-100,-53) 156 | turtle.pendown() 157 | turtle.begin_fill() 158 | turtle.circle(20) 159 | turtle.fillcolor("blue") 160 | turtle.end_fill() 161 | 162 | 163 | turtle.penup() #左手 164 | turtle.goto(-50,-20) 165 | turtle.pendown() 166 | turtle.goto(-50,-100) 167 | 168 | turtle.penup() #左手 169 | turtle.goto(50,-20) 170 | turtle.pendown() 171 | turtle.goto(50,-100) 172 | 173 | 174 | turtle.begin_fill() 175 | turtle.penup() 176 | turtle.goto(50,-120) 177 | turtle.pendown() 178 | turtle.circle(10) 179 | turtle.fillcolor("blue") 180 | turtle.end_fill() 181 | 182 | turtle.begin_fill() 183 | turtle.goto(20,-120) 184 | turtle.circle(10) 185 | turtle.fillcolor("blue") 186 | turtle.end_fill() 187 | 188 | 189 | turtle.penup() 190 | turtle.goto(50,-100) 191 | turtle.pendown() 192 | turtle.goto(20,-100) 193 | 194 | 195 | 196 | turtle.penup() 197 | turtle.goto(-50,-120) 198 | turtle.pendown() 199 | turtle.begin_fill() 200 | turtle.circle(10) 201 | turtle.goto(-20,-120) 202 | turtle.circle(10) 203 | turtle.fillcolor("blue") 204 | turtle.end_fill() 205 | 206 | turtle.penup() 207 | turtle.goto(-20,-100) 208 | turtle.pendown() 209 | turtle.goto(-50,-100) 210 | 211 | 212 | turtle.penup() 213 | turtle.goto(-20,-100) 214 | turtle.pendown() 215 | turtle.goto(-20,-85) 216 | 217 | turtle.goto(20,-85) 218 | turtle.goto(20,-100) 219 | 220 | turtle.penup() 221 | turtle.goto(-50,-20) 222 | turtle.pendown() 223 | 224 | turtle.begin_fill() 225 | turtle.goto(50,-20) 226 | turtle.goto(50,-85) 227 | turtle.goto(-50,-85) 228 | turtle.goto(-50,-20) 229 | turtle.fillcolor("blue") 230 | turtle.end_fill() 231 | 232 | 233 | turtle.penup() 234 | turtle.goto(0,-20) #铃铛 235 | turtle.pendown() 236 | turtle.begin_fill() 237 | turtle.circle(10) 238 | turtle.fillcolor("yellow") 239 | turtle.end_fill() 240 | 241 | 242 | 243 | turtle.penup() 244 | turtle.goto(-10,-10) 245 | turtle.pendown() 246 | turtle.goto(10,-10) 247 | 248 | 249 | turtle.penup() 250 | turtle.goto(-50,20) 251 | turtle.pendown() 252 | turtle.begin_fill() 253 | turtle.goto(50,20) 254 | turtle.goto(50,0) 255 | turtle.goto(-50,0) 256 | turtle.goto(-50,20) 257 | turtle.fillcolor("red") 258 | turtle.end_fill() 259 | 260 | 261 | turtle.penup() 262 | turtle.goto(50,0) 263 | turtle.pendown() 264 | turtle.begin_fill() 265 | turtle.circle(10) 266 | turtle.fillcolor("red") 267 | turtle.end_fill() 268 | 269 | 270 | turtle.penup() 271 | turtle.goto(-50,0) 272 | turtle.pendown() 273 | turtle.begin_fill() 274 | turtle.circle(10) 275 | turtle.fillcolor("red") 276 | turtle.end_fill() 277 | 278 | 279 | turtle.penup() #内裤 280 | 
turtle.goto(-50,-70) 281 | turtle.pendown() 282 | turtle.begin_fill() 283 | turtle.goto(50,-70) 284 | turtle.goto(50,-50) 285 | turtle.goto(-50,-50) 286 | turtle.goto(-50,-70) 287 | turtle.fillcolor("red") 288 | turtle.end_fill() 289 | 290 | turtle.penup() 291 | turtle.goto(-10,-70) 292 | turtle.pendown() 293 | turtle.begin_fill() 294 | turtle.goto(-10,-85) 295 | turtle.goto(10,-85) 296 | turtle.goto(10,-70) 297 | turtle.goto(-10,-70) 298 | turtle.fillcolor("red") 299 | turtle.end_fill() 300 | 301 | turtle.penup() 302 | turtle.goto(-100,200) 303 | turtle.pendown() 304 | s = "机器猫中的战斗猫" 305 | turtle.write(s,font = ("Arial",20,"normal")) 306 | 307 | 308 | turtle.done() -------------------------------------------------------------------------------- /基于python的turtle的桌面弹球.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | from random import randint 3 | 4 | def getRandomColor(): 5 | color = "#" 6 | for j in range(6): 7 | color += toHexChar(randint(0,15)) 8 | return color 9 | def toHexChar(hexValue): 10 | if 0 <= hexValue <= 9: 11 | return chr(hexValue + ord('0')) 12 | else: 13 | return chr(hexValue - 10 + ord("A")) 14 | class Ball: 15 | def __init__(self): 16 | self.x = 0 17 | self.y = 0 18 | self.dx = 2 19 | self.dy = 2 20 | self.radius = 3 21 | self.color = getRandomColor() 22 | 23 | class BounceBalls: 24 | def __init__(self): 25 | self.ballList = [] 26 | win = Tk() 27 | win.title("Bouncing Balls") 28 | 29 | self.width = 350 30 | self.height = 150 31 | self.canvas = Canvas(win,bg = "white",width = self.width,height = self.height) 32 | self.canvas.pack() 33 | 34 | 35 | frame = Frame(win) 36 | frame.pack() 37 | btStop = Button(frame,text = "Stop",command = self.stop) 38 | btStop.pack(side = LEFT) 39 | btResume = Button(frame,text = "Resume",command = self.resume) 40 | btResume.pack(side = LEFT) 41 | btAdd = Button(frame,text = "+",command = self.add) 42 | btAdd.pack(side = LEFT) 43 | btRemove = Button(frame,text = "-",command = self.remove) 44 | btRemove.pack(side = LEFT) 45 | 46 | 47 | self.sleepTime = 100 48 | self.isStopped = False 49 | self.animate() 50 | win.mainloop() 51 | def stop(self): 52 | self.isStopped = True 53 | def resume(self): 54 | self.isStopped = False 55 | self.animate() 56 | def add(self): 57 | self.ballList.append(Ball()) 58 | def remove(self): 59 | self.ballList.pop() 60 | def animate(self): 61 | while not self.isStopped: 62 | self.canvas.after(self.sleepTime) 63 | self.canvas.update() 64 | self.canvas.delete("ball") 65 | 66 | for ball in self.ballList: 67 | self.redisplayBall(ball) 68 | def redisplayBall(self,ball): 69 | if ball.x > self.width or ball.x < 0: 70 | ball.dx = -ball.dx 71 | if ball.y > self.height or ball.y < 0: 72 | ball.y = -ball.y 73 | ball.x += ball.dx 74 | ball.y += ball.dy 75 | self.canvas.create_oval(ball.x - ball.radius,ball.y - ball.radius,ball.x + ball.radius,ball.y + ball.radius,fill = ball.color,tags = "ball") 76 | BounceBalls() -------------------------------------------------------------------------------- /基于python的turtle移动的小球.py: -------------------------------------------------------------------------------- 1 | from tkinter import * 2 | 3 | class MovingBall: 4 | def __init__(self): 5 | win = Tk() 6 | win.title("Moving Ball") 7 | 8 | self.width = 250 9 | self.canvas = Canvas(win,width = self.width,height = 200,bg = 'white') 10 | self.canvas.pack() 11 | 12 | frame = Frame(win) 13 | frame.pack() 14 | btLeft = Button(frame,text = "Left",command = self.LeftMoving ) 15 | btLeft.pack() 
16 | btRight = Button(frame,text = "Right",command = self.RightMoving) 17 | btRight.pack() 18 | btUp = Button(frame,text = "Up",command = self.UpMoving) 19 | btUp.pack() 20 | btDown = Button(frame,text = "Down",command = self.DownMoving) 21 | btDown.pack() 22 | self.x = 0 23 | self.y = 0 24 | self.canvas.create_oval(self.x,self.y,self.x + 10,self.y + 10,fill = "black",tags = "oval") 25 | win.mainloop() 26 | 27 | def LeftMoving(self): 28 | self.canvas.delete("oval") 29 | if self.x > 10: 30 | self.x -= 10 31 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 32 | else: 33 | self.x = 250 34 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 35 | 36 | def RightMoving(self): 37 | self.canvas.delete("oval") 38 | if self.x < 250: 39 | self.x += 10 40 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 41 | else: 42 | self.x = 0 43 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 44 | def UpMoving(self): 45 | self.canvas.delete("oval") 46 | if self.y > 10: 47 | self.y -= 10 48 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 49 | else: 50 | self.y = 200 51 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 52 | 53 | def DownMoving(self): 54 | self.canvas.delete("oval") 55 | if self.y < 200: 56 | self.y += 10 57 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 58 | else: 59 | self.y = 0 60 | self.canvas.create_oval(self.x, self.y, self.x + 10, self.y + 10, fill="black", tags="oval") 61 | MovingBall() -------------------------------------------------------------------------------- /抓取财富网股票信息.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import urllib.request 3 | import re 4 | import random 5 | import time 6 | #抓取所需内容 7 | user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)', 8 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 9 | 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko', 10 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36', 11 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)', 12 | 'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1', 13 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3', 14 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12', 15 | 'Opera/9.27 (Windows NT 5.2; U; zh-cn)', 16 | 'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0', 17 | 'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)', 18 | 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6', 19 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)', 20 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)', 22 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 
-------------------------------------------------------------------------------- /抓取财富网股票信息.py: --------------------------------------------------------------------------------
import urllib
import urllib.request
import re
import random
import time

# Scrape the A-share ranking pages from quote.stockstar.com
user_agent = ["Mozilla/5.0 (Windows NT 10.0; WOW64)", 'Mozilla/5.0 (Windows NT 6.3; WOW64)',
              'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
              'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
              'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
              'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
              'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
              'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
              'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
              'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
              'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

stock_total = []   # stock_total: stock data from all pages; stock_page: stock data from one page
for page in range(1, 8):
    url = 'http://quote.stockstar.com/stock/ranklist_a_3_1_' + str(page) + '.html'
    # Pick a random User-Agent from the list for each request
    request = urllib.request.Request(url=url, headers={"User-Agent": random.choice(user_agent)})
    try:
        response = urllib.request.urlopen(request)
    except urllib.error.HTTPError as e:       # error handling; skip the page on failure
        print('page=', page, '', e.code)
        continue
    except urllib.error.URLError as e:
        print('page=', page, '', e.reason)
        continue
    content = response.read().decode('gbk')   # read the page content
    print('get page', page)                   # report the page that was fetched
    pattern = re.compile(r'<tbody[\s\S]*</tbody>')   # the table body that holds the quote rows
    body = re.findall(pattern, str(content))
    # for i in body:
    #     print(i)
    pattern = re.compile('>(.*?)<')            # collect the text between tags
    stock_page = re.findall(pattern, body[0])
    # print(stock_page)
    stock_total.extend(stock_page)
    # print(stock_total)
    time.sleep(random.randrange(1, 4))         # sleep a few random seconds between pages; adjust as needed

# Remove the empty strings left over from the tag stripping
stock_last = stock_total[:]    # stock_last holds the final cleaned stock data
for data in stock_total:
    if data == '':
        stock_last.remove('')
# print(stock_last)

# Print part of the result
print('代码', '\t', '简称', ' ', '\t', '最新价', '\t', '涨跌幅', '\t', '涨跌额', '\t', '5分钟涨幅')
# Step through the flat list six fields at a time; the step assumes each row
# contributes exactly six non-empty cells, so adjust it if the page layout differs
for i in range(0, len(stock_last) - 5, 6):
    print(format(stock_last[i], "6s"), '\t', format(stock_last[i + 1], "6s"), ' ', '\t',
          format(stock_last[i + 2], "6s"), ' ', '\t', format(stock_last[i + 3], "6s"), ' ', '\t',
          format(stock_last[i + 4], "6s"), ' ', '\t', format(stock_last[i + 5], "6s"))
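The extraction above hinges on two regular expressions, so a toy example may make their division of labour clearer. The HTML snippet below is invented for illustration and is far simpler than the real ranking page.

import re

sample = '<table><tbody><tr><td>600000</td><td>浦发银行</td></tr></tbody></table>'
body = re.findall(r'<tbody[\s\S]*</tbody>', sample)          # isolate the table body
cells = [c for c in re.findall('>(.*?)<', body[0]) if c]     # text between tags, empties dropped
print(cells)   # ['600000', '浦发银行']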
-------------------------------------------------------------------------------- /爬取12306车票信息.py: --------------------------------------------------------------------------------
# coding:utf-8
import ssl
import re
import json
import urllib.request

# 12306 serves HTTPS with certificates urllib may reject, so skip verification
ssl._create_default_https_context = ssl._create_unverified_context
# headers = {
#     'Cookie': 'JSESSIONID=95820ECC00B038495AC43E949F6D4A69; route=6f50b51faa11b987e576cdb301e545c4; BIGipServerotn=351273482.64545.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-20; _jc_save_wfdc_flag=dc',
#     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
# }

# Download the full station list (one long '|'-separated string)
def get_station():
    url = 'http://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9028'
    response = urllib.request.urlopen(url).read()
    # print(response)
    return response.decode("utf-8")

# Look up the telecodes of the departure and arrival stations;
# in station_name.js the telecode immediately follows each Chinese station name
def station(stationinfo, start, end):
    str2 = stationinfo[20:][:-2]
    str3 = str2.split('|')
    order1 = str3.index(start)
    order2 = str3.index(end)
    starstation = str3[int(order1) + 1]
    endstation = str3[int(order2) + 1]
    return starstation, endstation

# Query the trains running between the two stations on the given date
def getTrainInfo(start, end, date):
    train_date = date
    from_station = start
    to_station = end
    purpose_codes = 'ADULT'
    url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes={}'.format(
        train_date, from_station, to_station, purpose_codes)
    # print(url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Cookie': 'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050; route=495c805987d0f5c8c84b14f60212447d; BIGipServerotn=770703882.38945.0000; BIGipServerpool_passport=367854090.50215.0000; _jc_save_fromStation=%u5317%u4EAC%2CBJP; _jc_save_toStation=%u5929%u6D25%2CTJP; _jc_save_fromDate=2017-10-25; _jc_save_toDate=2017-10-21; _jc_save_wfdc_flag=dc'
    }
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request).read()
    return response.decode("utf-8")
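# Reading aid only (this hypothetical helper is not called anywhere in the script):
# each row in data.result is a '|'-separated string, and getTicketInfo() below picks
# its fields out by position. These are the indices the code relies on.
def describe_row(row):
    f = row.split('|')
    return {
        'train_no': f[2],          # internal train number used by the price query
        'train_code': f[3],        # the code printed as 本次列车
        'from_telecode': f[4],     # looked up in station_name.js for the Chinese name
        'to_telecode': f[7],
        'depart': f[8], 'arrive': f[9], 'duration': f[10],
        'from_station_no': f[16], 'to_station_no': f[17],
    }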
# Query the ticket prices and print the full information for each train
def getTicketInfo(getTrainInfos, train_date, stationinfo):
    # print(getTrainInfos)
    getTrainInfos = json.loads(getTrainInfos).get('data').get('result')

    for getTrainInfo in getTrainInfos:
        order3 = getTrainInfo.split('|')
        train_no = order3[2]
        seat_types = str(order3[-1:])[2:5]
        if len(seat_types) != 3:
            continue
        from_station_no = str(order3[1:][15])
        to_station_no = str(order3[1:][16])
        url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no={}&from_station_no={}&to_station_no={}&seat_types={}&train_date={}'.format(
            train_no, from_station_no, to_station_no, seat_types, train_date)
        # url = 'https://kyfw.12306.cn/otn/leftTicket/queryTicketPrice?train_no=26000K772632&from_station_no=10&to_station_no=11&seat_types=113&train_date=2017-10-25'
        # Note: this headers dict is built but never attached to the request below
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN, zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': 'kyfw.12306.cn',
            'If-Modified-Since': '0',
            'Referer': 'https://kyfw.12306.cn/otn/leftTicket/init',
            'X-Requested-With': 'XMLHttpRequest',
            'Cookie': 'JSESSIONID=B201655CD8BCF12D53ADF6CA6D2AA050;route=495c805987d0f5c8c84b14f60212447d;BIGipServerotn=770703882.38945.0000;BIGipServerpool_passport=367854090.50215.0000;_jc_save_fromStation=%u5317%u4EAC%2CBJP;_jc_save_toStation=%u5929%u6D25%2CTJP;_jc_save_fromDate=2017-10-25;_jc_save_toDate=2017-10-21;_jc_save_wfdc_flag=dc',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }
        response = urllib.request.urlopen(url).read()
        datas = response.decode("utf-8")
        com = re.compile('({.*?}})')
        datas = com.findall(datas)
        for data in datas:
            if len(data) > 30:
                data = json.loads(data)
                # print(data)
                prices = data.get('data')
                print("------------------本次列车-----------------------------------")
                print('本次列车', order3[3])

                # Reverse lookup: the Chinese station name sits right before its telecode
                str2 = stationinfo[20:][:-2]
                str3 = str2.split('|')
                order1 = str3.index(order3[4])
                order2 = str3.index(order3[7])
                starstation = str3[int(order1) - 1]
                endstation = str3[int(order2) - 1]

                print('出发站点', starstation)
                print('到达站点', endstation)
                print('出发时间', order3[8])
                print('到达时间', order3[9])
                print('历时时间', order3[10])
                # print(type(prices))
                for k in prices:
                    if k == 'A9':
                        print('商务座特等座', ":", prices[k])
                    elif k == 'M':
                        print("一等座", ":", prices[k])
                    elif k == 'O':
                        print("二等座", ":", prices[k])
                    elif k == 'WZ':
                        print("无座", ":", prices[k])
                    elif k == 'A4':
                        print("软卧", ":", prices[k])
                    elif k == 'F':
                        print("动卧", ":", prices[k])
                    elif k == 'A3':
                        print("硬卧", ":", prices[k])
                    elif k == 'A1':
                        print("硬座", ":", prices[k])
                    elif k == 'A6':
                        print("高级软卧", ":", prices[k])
                    elif k == 'OT':
                        print("其他", ":", prices[k])

if __name__ == "__main__":
    start = input('出发车站:')
    end = input('到达车站:')
    date = input("出发时间(如2017.10.25):")
    # Normalise the date format (2017.10.25 -> 2017-10-25)
    date = date.replace('.', '-')
    stationinfo = get_station()
    starstation, endstation = station(stationinfo, start, end)
    trainInfo = getTrainInfo(starstation, endstation, date)
    getTicketInfo(trainInfo, date, stationinfo)
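The long elif chain in getTicketInfo() is easier to extend as a lookup table. The sketch below is only a suggested refactor, with the key/label pairs copied verbatim from the script; print_prices() would take the dict that getTicketInfo() extracts from each price response.

SEAT_LABELS = {'A9': '商务座特等座', 'M': '一等座', 'O': '二等座', 'WZ': '无座',
               'A4': '软卧', 'F': '动卧', 'A3': '硬卧', 'A1': '硬座',
               'A6': '高级软卧', 'OT': '其他'}

def print_prices(prices):
    # Fall back to the raw key for seat types the table does not know about
    for k, v in prices.items():
        print(SEAT_LABELS.get(k, k), ':', v)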
-------------------------------------------------------------------------------- /爬取qq音乐歌曲/爬取扣扣音乐文件.py: --------------------------------------------------------------------------------
import json
import time
import random
import codecs
import requests

# Collect the media_mid and name of every song returned by the QQ Music search API
def songmid():
    mid = []
    name = []
    url = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?ct=24&qqmusic_ver=1298&new_json=1&remoteplace=sizer.yqq.song_next&searchid=148958880434449513&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=%E4%BA%94%E6%9C%88%E5%A4%A9&g_tk=1989554541&jsonpCallback=searchCallbacksong5150&loginUin=1093211972&hostUin=0&format=jsonp&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq&needNewCode=0"
    response = requests.get(url)
    # Strip the 23-character "searchCallbacksong5150(" JSONP wrapper and the trailing ")"
    result = json.loads(response.text[23:-1])['data']['song']
    if result['curnum'] and result['curpage']:
        for page in range(20):
            jsonpcallback = "searchCallbacksong" + str(random.randint(1000, 10000))
            if page == 0:
                remoteplace = "txt.yqq.song"
            else:
                remoteplace = "sizer.yqq.song_next"
            params = {
                'ct': "24",
                'qqmusic_ver': "1298",
                'new_json': "1",
                'remoteplace': remoteplace,
                'searchid': "148958880434449513",
                't': "0",
                'aggr': "1",
                'cr': "1",
                'catzhida': "1",
                'lossless': "0",
                'flag_qc': "0",
                'p': page + 1,
                'n': str(result['curnum']),
                # 'n': 20,
                'w': "五月天",            # requests URL-encodes the keyword itself
                'g_tk': "1989554541",
                'jsonpcallback': jsonpcallback,
                'loginuin': "1093211972",
                'hostuin': "0",
                'format': "jsonp",
                'incharset': "utf8",
                'outcharset': "utf-8",
                'notice': "0",
                'platform': "yqq",
                'neednewcode': "0",
                'cache-control': "no-cache",
            }
            # url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp?"
            url2 = "https://c.y.qq.com/soso/fcgi-bin/client_search_cp"
            response2 = requests.get(url2, params=params)
            # print(json.loads(response2.text[9:-1]))
            # Strip the 9-character JSONP wrapper and the trailing ")"
            for song in json.loads(response2.content[9:-1])['data']['song']['list']:
                if song['file']['media_mid']:
                    mid.append(song['file']['media_mid'])
                    name.append(song['name'])
    print(set(mid))
    print(len(set(mid)))
    return mid, name

url = []
file = codecs.open('audio2.txt', 'w')

# Resolve each media_mid into a downloadable stream URL and append it to audio2.txt
def resolve(songmids, name):
    for i in range(len(songmids)):
        filename = 'C400' + songmids[i] + '.m4a'
        # print(songmids[i])
        guid = int(random.random() * 2147483647) * int(time.time() * 1000) % 10000000000

        d = {
            'format': 'json',
            'cid': 205361747,
            'uin': 0,
            'songmid': songmids[i],
            'filename': filename,
            'guid': guid,
            'g_tk': 5381,
            'loginUin': 0,
            'hostUin': 0,
            'notice': '0',
            'platform': 'yqq',
            'needNewCode': '0',
        }
        # Note: this headers dict is not passed to requests.get below
        headers = {
            'User-Agent': "Mozilla / 5.0(WindowsNT10.0; …) Gecko / 20100101Firefox / 57.0"
        }
        r = requests.get('https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg', params=d)
        try:
            vkey = json.loads(r.text)['data']['items'][0]['vkey']
        except (KeyError, IndexError, ValueError):
            continue
        if vkey:
            audio_url = 'http://dl.stream.qqmusic.qq.com/%s?vkey=%s&guid=%s&uin=0&fromtag=66' % (filename, vkey, guid)
            time.sleep(random.random() * 1)
            url.append(audio_url)
            file.write(audio_url + '\n')

if __name__ == "__main__":
    songmids, name = songmid()
    resolve(songmids, name)
    file.close()
--------------------------------------------------------------------------------
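A closing note on the QQ Music script: its two JSONP responses are unwrapped with fixed slices ([23:-1] and [9:-1]). A more tolerant helper, sketched here on the assumption that the responses really have the form callbackName({...}), cuts at the parentheses instead of counting characters.

def strip_jsonp(text):
    # Keep whatever sits between the first "(" and the last ")"
    return text[text.index('(') + 1: text.rindex(')')]

print(strip_jsonp('searchCallbacksong5150({"code": 0})'))   # {"code": 0}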