├── PostHandle.py ├── README.md ├── __init__.py ├── __pycache__ ├── PostHandle.cpython-34.pyc ├── __init__.cpython-34.pyc ├── commonUtils.cpython-34.pyc ├── contentSettings.cpython-34.pyc ├── items.cpython-34.pyc ├── mysqlUtils.cpython-34.pyc ├── pipelines.cpython-34.pyc ├── settings.cpython-34.pyc ├── uploadUtils.cpython-34.pyc └── urlSettings.cpython-34.pyc ├── check_post.py ├── commonUtils.py ├── contentSettings.py ├── items.py ├── middlewares.py ├── mysqlUtils.py ├── pipelines.py ├── postSettings.py ├── settings.py ├── spiders ├── ContentSpider.py ├── UrlSpider.py ├── __init__.py └── __pycache__ │ ├── ContentSpider.cpython-34.pyc │ ├── UrlSpider.cpython-34.pyc │ └── __init__.cpython-34.pyc ├── uploadUtils.py └── urlSettings.py /PostHandle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DgSpider.mysqlUtils import dbhandle_get_content 4 | from DgSpider.mysqlUtils import dbhandle_update_status 5 | from DgSpider.uploadUtils import upload_post 6 | import json 7 | 8 | 9 | def post_handel(url): 10 | result = dbhandle_get_content(url) 11 | 12 | title = result[0] 13 | content = result[1] 14 | user_id = result[2] 15 | gid = result[3] 16 | cs = [] 17 | 18 | text_list = content.split('[dgimg]') 19 | for text_single in text_list: 20 | text_single_c = text_single.split('[/dgimg]') 21 | if len(text_single_c) == 1: 22 | cs_json = {"c": text_single_c[0], "i": '', "w": '', "h": ''} 23 | cs.append(cs_json) 24 | else: 25 | # tmp_img_upload_json = upload_img_result.pop() 26 | pic_flag = text_single_c[1] 27 | img_params = text_single_c[0].split(';') 28 | i = img_params[0] 29 | w = img_params[1] 30 | h = img_params[2] 31 | cs_json = {"c": pic_flag, "i": i, "w": w, "h": h} 32 | cs.append(cs_json) 33 | 34 | strcs = json.dumps(cs) 35 | json_data = {"apisign": "99ea3eda4b4554adag2c4a741d58baa60", 36 | "user_id": user_id, 37 | "gid": gid, 38 | "t": title, 39 | "cs": strcs} 40 | # 上传帖子 41 | result_uploadpost = upload_post(json_data) 42 | 43 | # 更新状态2,成功上传帖子 44 | result_updateresult = dbhandle_update_status(url, 2) 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 网页爬虫设计 2 | ==== 3 | 4 | 创建项目 5 | ------- 6 | 7 | - 进入指定文件夹,右击空白处>在此处打开命令行窗口 8 | - 创建项目 9 | ``` 10 | Scrapy startproject DgSpider 11 | ``` 12 | 13 | 主要代码文件说明 14 | ------- 15 | 16 | - 爬虫主类 :UrlSpider.py、ContentSpider.py 17 | *项目包含2个爬虫主类,分别用于爬取文章列表页所有文章的URL、文章详情页具体内容* 18 | - 内容处理类 :pipelines.py 19 | *处理内容* 20 | - 传输字段类 :items.py 21 | *暂存爬取的数据* 22 | - 设置文件 :settings.py 23 | *用于主要的参数配置* 24 | - 数据库操作:mysqlUtils.py 25 | *链接操作数据库* 26 | - 文本处理、上传文本:PostHandle.py 27 | *处理文本* 28 | 29 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__init__.py -------------------------------------------------------------------------------- /__pycache__/PostHandle.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/PostHandle.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/__init__.cpython-34.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/commonUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/commonUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/contentSettings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/contentSettings.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/items.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/items.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/mysqlUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/mysqlUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/pipelines.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/pipelines.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/settings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/settings.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/uploadUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/uploadUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/urlSettings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/urlSettings.cpython-34.pyc -------------------------------------------------------------------------------- /check_post.py: -------------------------------------------------------------------------------- 1 | import requests, re 2 | import http 3 | import urllib 4 | 5 | 6 | def checkPost(): 7 | # 调用上传帖子接口 8 | CREATE_POST_URL = "http://api.test.net/robot/handlePost" 9 | 10 | fields={'group_id': '30', 11 | 'type': 1, 12 | 'apisign':'99ea3esdg45549162c4a741d58baa60'} 13 | 14 | r = requests.post(CREATE_POST_URL, data=fields) 
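    # NOTE: api.test.net is an internal test host and the 'apisign' value above looks
    # like a sample signature; both presumably have to be replaced with real values
    # before this check can return a meaningful result. The POST sends the
    # group_id/type/apisign form fields and the JSON response is printed below.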
15 | 16 | print(r.json()) 17 | -------------------------------------------------------------------------------- /commonUtils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from hashlib import md5 3 | 4 | 5 | # 获取随机发帖ID 6 | def get_random_user(user_str): 7 | user_list = [] 8 | for user_id in str(user_str).split(','): 9 | user_list.append(user_id) 10 | userid_idx = random.randint(1, len(user_list)) 11 | user_chooesd = user_list[userid_idx-1] 12 | return user_chooesd 13 | 14 | 15 | # 获取MD5加密URL 16 | def get_linkmd5id(url): 17 | # url进行md5处理,为避免重复采集设计 18 | md5_url = md5(url.encode("utf8")).hexdigest() 19 | return md5_url 20 | -------------------------------------------------------------------------------- /contentSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DgSpider project 4 | 5 | # 图片储存 6 | IMAGES_STORE = 'D:\\pics\\jfss\\' 7 | 8 | # 爬取域名 9 | DOMAIN = 'nrsfh.com' 10 | 11 | # 图片域名前缀 12 | DOMAIN_HTTP = "http:" 13 | 14 | # 随机发帖用户 15 | CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ 16 | '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ 17 | '18441410,18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,' \ 18 | '18441420,18441421,18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,' \ 19 | '18441430,18441431,18441432,18441433,18441434,18441435,18441436,18441437,18441438,18441439,' \ 20 | '18441440,18441441,18441442,18441443,18441444,18441445,18441446,18441447,18441448,18441449,' \ 21 | '18441450,18441451,18441452,18441453,18441454,18441455,18441456,18441457,18441458,18441460,' \ 22 | '18441461,18441462,18441463,18441464,18441465,18441466,18441467,18441468,18441469,18441470,' \ 23 | '18441471,18441472,18441473,18441474,18441475,18441476,18441477,18441478,18441479,18441481,' \ 24 | '18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,18441490' 25 | 26 | # 爬虫名 27 | SPIDER_NAME = 'DgContentSpider' 28 | 29 | # 文章URL爬取规则XPATH 30 | POST_TITLE_XPATH = '//div[@class="title"]' 31 | POST_CONTENT_XPATH = '//div[@class="bodycss"]' 32 | 33 | -------------------------------------------------------------------------------- /items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # douguo Spider Item 5 | # @author zhangjianfei 6 | # @date 2017/04/07 7 | 8 | import scrapy 9 | 10 | 11 | class DgspiderUrlItem(scrapy.Item): 12 | url = scrapy.Field() 13 | 14 | 15 | class DgspiderPostItem(scrapy.Item): 16 | url = scrapy.Field() 17 | title = scrapy.Field() 18 | text = scrapy.Field() 19 | -------------------------------------------------------------------------------- /middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DgspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
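    # This middleware only takes effect if it is registered under SPIDER_MIDDLEWARES
    # in settings.py; in this project that entry is currently commented out, so the
    # hooks below are not invoked during a crawl.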
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mysqlUtils.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | import pymysql.cursors 3 | import os 4 | 5 | 6 | def dbhandle_online(): 7 | host = '192.168.1.235' 8 | user = 'root' 9 | passwd = 'douguo2015' 10 | charset = 'utf8' 11 | conn = pymysql.connect( 12 | host=host, 13 | user=user, 14 | passwd=passwd, 15 | charset=charset, 16 | use_unicode=False 17 | ) 18 | return conn 19 | 20 | 21 | def dbhandle_local(): 22 | host = '192.168.1.235' 23 | user = 'root' 24 | passwd = 'douguo2015' 25 | charset = 'utf8' 26 | conn = pymysql.connect( 27 | host=host, 28 | user=user, 29 | passwd=passwd, 30 | charset=charset, 31 | use_unicode=True 32 | # use_unicode=False 33 | ) 34 | return conn 35 | 36 | 37 | def dbhandle_geturl(gid): 38 | host = '192.168.1.235' 39 | user = 'root' 40 | passwd = 'douguo2015' 41 | charset = 'utf8' 42 | conn = pymysql.connect( 43 | host=host, 44 | user=user, 45 | passwd=passwd, 46 | charset=charset, 47 | use_unicode=False 48 | ) 49 | cursor = conn.cursor() 50 | sql = 'select url,spider_name,site,gid,module from dg_spider.dg_spider_post where status=0 and gid=%s limit 1' % gid 51 | try: 52 | cursor.execute(sql) 53 | result = cursor.fetchone() 54 | conn.commit() 55 | except Exception as e: 56 | print("***** exception") 57 | print(e) 58 | conn.rollback() 59 | 60 | if result is None: 61 | os._exit(0) 62 | else: 63 | url = result[0] 64 | spider_name = result[1] 65 | site = result[2] 66 | gid = result[3] 67 | module = result[4] 68 | return url.decode(), spider_name.decode(), site.decode(), gid.decode(), module.decode() 69 | 70 | 71 | def dbhandle_insert_content(url, title, content, user_id, has_img): 72 | host = '192.168.1.235' 73 | user = 'root' 74 | passwd = 'douguo2015' 75 | charset = 'utf8' 76 | conn = pymysql.connect( 77 | host=host, 78 | user=user, 79 | passwd=passwd, 80 | charset=charset, 81 | use_unicode=False 82 | ) 83 | cur = conn.cursor() 84 | 85 | # 如果标题或者内容为空,那么程序将退出,篇文章将会作废并将status设置为1,爬虫继续向下运行获得新的URl 86 | if 
content.strip() == '' or title.strip() == '': 87 | sql_fail = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' % ('1', url) 88 | try: 89 | cur.execute(sql_fail) 90 | result = cur.fetchone() 91 | conn.commit() 92 | except Exception as e: 93 | print(e) 94 | conn.rollback() 95 | os._exit(0) 96 | 97 | sql = 'update dg_spider.dg_spider_post set title="%s",content="%s",user_id="%s",has_img="%s" where url="%s" ' \ 98 | % (title, content, user_id, has_img, url) 99 | 100 | try: 101 | cur.execute(sql) 102 | result = cur.fetchone() 103 | conn.commit() 104 | except Exception as e: 105 | print(e) 106 | conn.rollback() 107 | return result 108 | 109 | 110 | def dbhandle_update_status(url, status): 111 | host = '192.168.1.235' 112 | user = 'root' 113 | passwd = 'douguo2015' 114 | charset = 'utf8' 115 | conn = pymysql.connect( 116 | host=host, 117 | user=user, 118 | passwd=passwd, 119 | charset=charset, 120 | use_unicode=False 121 | ) 122 | cur = conn.cursor() 123 | sql = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' \ 124 | % (status, url) 125 | try: 126 | cur.execute(sql) 127 | result = cur.fetchone() 128 | conn.commit() 129 | except Exception as e: 130 | print(e) 131 | conn.rollback() 132 | return result 133 | 134 | 135 | def dbhandle_get_content(url): 136 | host = '192.168.1.235' 137 | user = 'root' 138 | passwd = 'douguo2015' 139 | charset = 'utf8' 140 | conn = pymysql.connect( 141 | host=host, 142 | user=user, 143 | passwd=passwd, 144 | charset=charset, 145 | use_unicode=False 146 | ) 147 | cursor = conn.cursor() 148 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=1 and url="%s" limit 1' % url 149 | try: 150 | cursor.execute(sql) 151 | result = cursor.fetchone() 152 | conn.commit() 153 | except Exception as e: 154 | print("***** exception") 155 | print(e) 156 | conn.rollback() 157 | 158 | if result is None: 159 | os._exit(1) 160 | 161 | title = result[0] 162 | content = result[1] 163 | user_id = result[2] 164 | gid = result[3] 165 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 166 | 167 | 168 | # 获取爬虫初始化参数 169 | def dbhandle_get_spider_param(url): 170 | host = '192.168.1.235' 171 | user = 'root' 172 | passwd = 'douguo2015' 173 | charset = 'utf8' 174 | conn = pymysql.connect( 175 | host=host, 176 | user=user, 177 | passwd=passwd, 178 | charset=charset, 179 | use_unicode=False 180 | ) 181 | cursor = conn.cursor() 182 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=0 and url="%s" limit 1' % url 183 | result = '' 184 | try: 185 | cursor.execute(sql) 186 | result = cursor.fetchone() 187 | conn.commit() 188 | except Exception as e: 189 | print("***** exception") 190 | print(e) 191 | conn.rollback() 192 | title = result[0] 193 | content = result[1] 194 | user_id = result[2] 195 | gid = result[3] 196 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 197 | -------------------------------------------------------------------------------- /pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # If you have many piplelines, all should be init here 5 | # and use IF to judge them 6 | # 7 | # DOUGUO Spider pipelines 8 | # @author zhangjianfei 9 | # @date 2017/04/13 10 | 11 | import re 12 | import urllib.request 13 | from DgSpider import urlSettings 14 | from DgSpider import contentSettings 15 | from DgSpider.mysqlUtils import 
dbhandle_insert_content 16 | from DgSpider.uploadUtils import uploadImage 17 | from DgSpider.mysqlUtils import dbhandle_online 18 | from DgSpider.mysqlUtils import dbhandle_update_status 19 | from bs4 import BeautifulSoup 20 | from DgSpider.PostHandle import post_handel 21 | from DgSpider.commonUtils import get_random_user 22 | from DgSpider.commonUtils import get_linkmd5id 23 | 24 | 25 | class DgPipeline(object): 26 | # post构造reply 27 | cs = [] 28 | 29 | # 帖子title 30 | title = '' 31 | 32 | # 帖子文本 33 | text = '' 34 | 35 | # 当前爬取的url 36 | url = '' 37 | 38 | # 随机用户ID 39 | user_id = '' 40 | 41 | # 图片flag 42 | has_img = 0 43 | 44 | # get title flag 45 | get_title_flag = 0 46 | 47 | def __init__(self): 48 | DgPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER) 49 | 50 | # process the data 51 | def process_item(self, item, spider): 52 | self.get_title_flag += 1 53 | 54 | # pipeline for content 55 | if spider.name == contentSettings.SPIDER_NAME: 56 | 57 | # 获取当前网页url 58 | DgPipeline.url = item['url'] 59 | 60 | # 获取post title 61 | if len(item['title']) == 0: 62 | title_tmp = '' 63 | else: 64 | title_tmp = item['title'][0] 65 | 66 | # 替换标题中可能会引起 sql syntax 的符号 67 | # 对于分页的文章,只取得第一页的标题 68 | if self.get_title_flag == 1: 69 | 70 | # 使用beautifulSoup格什化标题 71 | soup_title = BeautifulSoup(title_tmp, "lxml") 72 | title = '' 73 | 74 | # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 75 | # 多行的空格、换行,使用stripped_strings获取文本 76 | for string in soup_title.stripped_strings: 77 | title += string 78 | 79 | title = title.replace("'", "”").replace('"', '“') 80 | DgPipeline.title = title 81 | 82 | # 获取正post内容 83 | if len(item['text']) == 0: 84 | text_temp = '' 85 | else: 86 | text_temp = item['text'][0] 87 | 88 | # 获取图片 89 | reg_img = re.compile(r'') 90 | imgs = reg_img.findall(text_temp) 91 | for img in imgs: 92 | DgPipeline.has_img = 1 93 | 94 | match_obj = re.search('.*src="(.*)".*', img, re.M | re.I) 95 | img_url_tmp = match_obj.group(1) 96 | 97 | # 去除所有Http:标签 98 | img_url_tmp = img_url_tmp.replace("http:", "") 99 | 100 | # 对于这种情况单独处理 101 | imgUrl_tmp_list = img_url_tmp.split('"') 102 | img_url_tmp = imgUrl_tmp_list[0] 103 | 104 | # 加入http 105 | imgUrl = 'http:' + img_url_tmp 106 | 107 | list_name = imgUrl.split('/') 108 | file_name = list_name[len(list_name)-1] 109 | 110 | # 获取图片本地存储路径 111 | file_path = contentSettings.IMAGES_STORE + file_name 112 | 113 | # 获取图片并上传至本地 114 | urllib.request.urlretrieve(imgUrl, file_path) 115 | upload_img_result_json = uploadImage(file_path, 'image/jpeg', DgPipeline.user_id) 116 | 117 | # 获取上传之后返回的服务器图片路径、宽、高 118 | img_u = upload_img_result_json['result']['image_url'] 119 | img_w = upload_img_result_json['result']['w'] 120 | img_h = upload_img_result_json['result']['h'] 121 | img_upload_flag = str(img_u)+';'+str(img_w)+';'+str(img_h) 122 | 123 | # 在图片前后插入字符标记 124 | text_temp = text_temp.replace(img, '[dgimg]' + img_upload_flag + '[/dgimg]') 125 | 126 | # 使用beautifulSoup格什化HTML 127 | soup = BeautifulSoup(text_temp, "lxml") 128 | text = '' 129 | 130 | # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 131 | # 多行的空格、换行 132 | for string in soup.stripped_strings: 133 | text += string + '\n' 134 | 135 | # 替换因为双引号为中文双引号,避免 mysql syntax 136 | DgPipeline.text = self.text + text.replace('"', '“') 137 | 138 | # 对于分页的文章,每一页之间加入换行 139 | # DgPipeline.text += (DgPipeline.text + '\n') 140 | 141 | # pipeline for url 142 | elif spider.name == urlSettings.SPIDER_NAME: 143 | db_object = dbhandle_online() 144 | cursor = db_object.cursor() 145 | 146 | for url 
in item['url']: 147 | linkmd5id = get_linkmd5id(url) 148 | spider_name = contentSettings.SPIDER_NAME 149 | site = urlSettings.DOMAIN 150 | gid = urlSettings.GROUP_ID 151 | module = urlSettings.MODULE 152 | status = '0' 153 | sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url="%s"' % linkmd5id 154 | sql = 'insert into dg_spider.dg_spider_post(md5_url, url, spider_name, site, gid, module, status) ' \ 155 | 'values("%s", "%s", "%s", "%s", "%s", "%s", "%s")' \ 156 | % (linkmd5id, url, spider_name, site, gid, module, status) 157 | 158 | try: 159 | # 判断url是否存在,如果不存在,则插入 160 | cursor.execute(sql_search) 161 | result_search = cursor.fetchone() 162 | if result_search is None or result_search[0].strip() == '': 163 | cursor.execute(sql) 164 | result = cursor.fetchone() 165 | db_object.commit() 166 | except Exception as e: 167 | print(">>> catch exception !") 168 | print(e) 169 | db_object.rollback() 170 | 171 | return item 172 | 173 | # spider开启时被调用 174 | def open_spider(self, spider): 175 | pass 176 | 177 | # sipder 关闭时被调用 178 | def close_spider(self, spider): 179 | if spider.name == contentSettings.SPIDER_NAME: 180 | # 数据入库:235 181 | url = DgPipeline.url 182 | title = DgPipeline.title 183 | content = DgPipeline.text 184 | user_id = DgPipeline.user_id 185 | dbhandle_insert_content(url, title, content, user_id, DgPipeline.has_img) 186 | 187 | # 处理文本、设置status、上传至dgCommunity.dg_post 188 | # 如果判断has_img为1,那么上传帖子 189 | if DgPipeline.has_img == 1: 190 | if title.strip() != '' and content.strip() != '': 191 | spider.logger.info('has_img=1,title and content is not null! Uploading post into db...') 192 | post_handel(url) 193 | else: 194 | spider.logger.info('has_img=1,but title or content is null! ready to exit...') 195 | pass 196 | else: 197 | spider.logger.info('has_img=0, changing status and ready to exit...') 198 | pass 199 | 200 | elif spider.name == urlSettings.SPIDER_NAME: 201 | pass 202 | 203 | -------------------------------------------------------------------------------- /postSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 图片储存 4 | IMAGES_STORE = 'D:\\pics\\pregnantMonther\\' 5 | 6 | # 图片域名前缀 7 | DOMAIN = "http:" 8 | 9 | # 发帖用户ID 10 | CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ 11 | '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ 12 | '18441410,'\ 13 | '18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,18441420,18441421,' \ 14 | '18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,18441430,18441431,18441432,' \ 15 | '18441433,18441434,18441435,18441436,18441437,18441438,18441439,18441440,18441441,18441442,18441443,' \ 16 | '18441444,18441445,18441446,18441447,18441448,18441449,18441450,18441451,18441452,18441453,18441454,' \ 17 | '18441455,18441456,18441457,18441458,18441460,18441461,18441462,18441463,18441464,18441465,18441466,' \ 18 | '18441467,18441468,18441469,18441470,18441471,18441472,18441473,18441474,18441475,18441476,18441477,' \ 19 | '18441478,18441479,18441481,18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,' \ 20 | '18441490' 21 | 22 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DgSpider project 4 | # 
5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DgSpider' 13 | 14 | SPIDER_MODULES = ['DgSpider.spiders'] 15 | NEWSPIDER_MODULE = 'DgSpider.spiders' 16 | 17 | # 注册PIPELINES 18 | ITEM_PIPELINES = { 19 | 'DgSpider.pipelines.DgPipeline': 1 20 | } 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | # USER_AGENT = 'DgSpider (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = True 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | #DOWNLOAD_DELAY = 3 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 53 | #SPIDER_MIDDLEWARES = { 54 | # 'DgSpider.middlewares.DgspiderSpiderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable downloader middlewares 58 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 59 | #DOWNLOADER_MIDDLEWARES = { 60 | # 'DgSpider.middlewares.MyCustomDownloaderMiddleware': 543, 61 | #} 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | #ITEM_PIPELINES = { 72 | # 'DgSpider.pipelines.DgspiderPipeline': 300, 73 | #} 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
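# Note: both spiders feed the single DgPipeline registered in ITEM_PIPELINES above;
# the pipeline branches on spider.name (DgUrlSpider vs. DgContentSpider) to decide
# whether it is storing candidate URLs or assembling article content.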
95 | -------------------------------------------------------------------------------- /spiders/ContentSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from DgSpider.mysqlUtils import dbhandle_geturl 5 | from DgSpider.items import DgspiderPostItem 6 | from scrapy.selector import Selector 7 | from scrapy.http import Request 8 | from DgSpider import contentSettings 9 | from DgSpider import urlSettings 10 | from DgSpider.mysqlUtils import dbhandle_update_status 11 | 12 | 13 | class DgContentSpider(scrapy.Spider): 14 | print('Spider DgContentSpider Staring...') 15 | 16 | result = dbhandle_geturl(urlSettings.GROUP_ID) 17 | 18 | url = result[0] 19 | spider_name = result[1] 20 | site = result[2] 21 | gid = result[3] 22 | module = result[4] 23 | 24 | name = 'DgContentSpider' 25 | 26 | # 设定爬取域名范围 27 | allowed_domains = [site] 28 | 29 | # 爬取地址 30 | start_urls = [url] 31 | 32 | start_urls_tmp = [] 33 | """构造分页序列,一般来说遵循规则 url.html,url_2.html,url_3.html,并且url.html也写为url_1.html""" 34 | for i in range(6, 1, -1): 35 | start_single = url[:-5] 36 | start_urls_tmp.append(start_single+"_"+str(i)+".html") 37 | 38 | # 更新状态 39 | """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" 40 | dbhandle_update_status(url, 1) 41 | 42 | # 爬取方法 43 | def parse(self, response): 44 | item = DgspiderPostItem() 45 | 46 | # sel : 页面源代码 47 | sel = Selector(response) 48 | 49 | item['url'] = DgContentSpider.url 50 | 51 | # 对于title,
<div class="title"><p>标题1</p></div>
,使用下列方法取得 52 | data_title_tmp = sel.xpath(contentSettings.POST_TITLE_XPATH) 53 | item['title'] = data_title_tmp.xpath('string(.)').extract() 54 | 55 | item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() 56 | 57 | yield item 58 | 59 | if self.start_urls_tmp: 60 | url = self.start_urls_tmp.pop() 61 | yield Request(url, callback=self.parse) 62 | -------------------------------------------------------------------------------- /spiders/UrlSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from DgSpider.items import DgspiderUrlItem 5 | from scrapy.selector import Selector 6 | from DgSpider import urlSettings 7 | 8 | 9 | class DgUrlSpider(scrapy.Spider): 10 | print('Spider DgUrlSpider Staring...') 11 | 12 | # 爬虫名 必须静态指定 13 | # name = urlSettings.SPIDER_NAME 14 | name = 'DgUrlSpider' 15 | 16 | # 设定域名 17 | allowed_domains = [urlSettings.DOMAIN] 18 | 19 | # 爬取地址 20 | url_list = [] 21 | """一般来说,列表页第一页不符合规则,单独append""" 22 | url_list.append(urlSettings.START_LIST_URL) 23 | loop = urlSettings.LIST_URL_RULER_LOOP 24 | for i in range(1, loop): 25 | url = urlSettings.LIST_URL_RULER_PREFIX + str(i) + urlSettings.LIST_URL_RULER_SUFFIX 26 | url_list.append(url) 27 | start_urls = url_list 28 | 29 | # 爬取方法 30 | def parse(self, response): 31 | 32 | # sel : 页面源代码 33 | sel = Selector(response) 34 | 35 | item_url = DgspiderUrlItem() 36 | url_item = [] 37 | 38 | # XPATH获取url 39 | url_list = sel.xpath(urlSettings.POST_URL_XPATH).extract() 40 | 41 | # 消除http前缀差异 42 | for url in url_list: 43 | url = url.replace('http:', '') 44 | url_item.append('http:' + url) 45 | 46 | # list去重 47 | url_item = list(set(url_item)) 48 | item_url['url'] = url_item 49 | 50 | yield item_url 51 | -------------------------------------------------------------------------------- /spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
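#
# Typical run order for this project (a sketch, not an official launcher script):
# crawl the list pages first so dg_spider.dg_spider_post is filled with article URLs
# at status=0, then run the content spider, which claims one pending URL per run:
#
#     scrapy crawl DgUrlSpider
#     scrapy crawl DgContentSpider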
5 | -------------------------------------------------------------------------------- /spiders/__pycache__/ContentSpider.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/ContentSpider.cpython-34.pyc -------------------------------------------------------------------------------- /spiders/__pycache__/UrlSpider.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/UrlSpider.cpython-34.pyc -------------------------------------------------------------------------------- /spiders/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /uploadUtils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests_toolbelt.multipart.encoder import MultipartEncoder 3 | 4 | 5 | def upload_post(json_data): 6 | # 上传帖子 ,参考:http://192.168.2.25:3000/api/interface/2016 7 | # create_post_url = "http://api.qa.douguo.net/robot/uploadimagespost" 8 | create_post_url = "http://api.douguo.net/robot/uploadimagespost" 9 | 10 | # 传帖子 11 | # dataJson = json.dumps({"user_id":"19013245","gid":30,"t":"2017-03-23","cs":[{"c":"啦啦啦","i":"","w":0,"h":0}, 12 | # {"c":"啦啦啦2222","i":"http://wwww.douguo.com/abc.jpg","w":0,"h":0}],"time":1235235234}) 13 | # jsonData = {"user_id":"19013245","gid":5,"t":"TEST","cs":'[{"c":"啊啊啊","i":"qqq","w":12,"h":10}, 14 | # {"c":"这个内容真不错","i":"http://wwww.baidu.com","w":10,"h":10}]',"time":61411313} 15 | 16 | # print(jsonData) 17 | req_post = requests.post(create_post_url, data=json_data) 18 | print(req_post.json()) 19 | # print(reqPost.text) 20 | 21 | 22 | def uploadImage(img_path, content_type, user_id): 23 | # 上传单个图片 , 参考:http://192.168.2.25:3000/api/interface/2015 24 | # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage" 25 | UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage" 26 | # 传图片 27 | 28 | m = MultipartEncoder( 29 | # fields={'user_id': '192323', 30 | # 'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')} 31 | fields={'user_id': user_id, 32 | 'apisign': '99ea3eda4b45549162c4a741d58baa60', 33 | 'image': ('filename', open(img_path, 'rb'), 'image/jpeg')} 34 | ) 35 | 36 | r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type}) 37 | print(r.json()) 38 | # print(r.text) 39 | return r.json() 40 | # return r.text -------------------------------------------------------------------------------- /urlSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 爬取域名 4 | DOMAIN = 'eastlady.cn' 5 | 6 | # 爬虫名 7 | SPIDER_NAME = 'DgUrlSpider' 8 | 9 | GROUP_ID = '33' 10 | 11 | MODULE = '999' 12 | 13 | # 文章列表页起始爬取URL 14 | START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html' 15 | 16 | # 文章列表循环规则 17 | LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/' 18 | LIST_URL_RULER_SUFFIX = '.html' 19 | LIST_URL_RULER_LOOP = 30 20 | 21 | # 文章URL爬取规则XPATH 22 | 
POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href' 23 | 24 | --------------------------------------------------------------------------------
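For reference, a minimal self-contained sketch (not part of the repository) of the intermediate text format that DgPipeline and PostHandle.py agree on: the pipeline replaces every downloaded image with a `[dgimg]url;width;height[/dgimg]` marker inside the stored content, and `post_handel()` later splits that content back into the `cs` list that is uploaded with the post. The sample content string and image URL below are invented for illustration only.

```python
import json

# Content as DgPipeline would store it: plain text with inline [dgimg]url;w;h[/dgimg] markers.
content = ("Text before the image."
           "[dgimg]http://img.example.com/a.jpg;640;480[/dgimg]"
           "Text after the image.")

cs = []
for part in content.split('[dgimg]'):
    piece = part.split('[/dgimg]')
    if len(piece) == 1:
        # Pure text segment: only the "c" field is filled.
        cs.append({"c": piece[0], "i": '', "w": '', "h": ''})
    else:
        # Image segment: piece[0] is "url;w;h", piece[1] is the text that follows the image.
        img_url, width, height = piece[0].split(';')
        cs.append({"c": piece[1], "i": img_url, "w": width, "h": height})

print(json.dumps(cs))
# [{"c": "Text before the image.", "i": "", "w": "", "h": ""},
#  {"c": "Text after the image.", "i": "http://img.example.com/a.jpg", "w": "640", "h": "480"}]
```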