├── PostHandle.py ├── README.md ├── __init__.py ├── __pycache__ ├── PostHandle.cpython-34.pyc ├── __init__.cpython-34.pyc ├── commonUtils.cpython-34.pyc ├── contentSettings.cpython-34.pyc ├── items.cpython-34.pyc ├── mysqlUtils.cpython-34.pyc ├── pipelines.cpython-34.pyc ├── settings.cpython-34.pyc ├── uploadUtils.cpython-34.pyc └── urlSettings.cpython-34.pyc ├── check_post.py ├── commonUtils.py ├── contentSettings.py ├── items.py ├── middlewares.py ├── mysqlUtils.py ├── pipelines.py ├── postSettings.py ├── settings.py ├── spiders ├── ContentSpider.py ├── UrlSpider.py ├── __init__.py └── __pycache__ │ ├── ContentSpider.cpython-34.pyc │ ├── UrlSpider.cpython-34.pyc │ └── __init__.cpython-34.pyc ├── uploadUtils.py └── urlSettings.py /PostHandle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from DgSpider.mysqlUtils import dbhandle_get_content 4 | from DgSpider.mysqlUtils import dbhandle_update_status 5 | from DgSpider.uploadUtils import upload_post 6 | import json 7 | 8 | 9 | def post_handel(url): 10 | result = dbhandle_get_content(url) 11 | 12 | title = result[0] 13 | content = result[1] 14 | user_id = result[2] 15 | gid = result[3] 16 | cs = [] 17 | 18 | text_list = content.split('[dgimg]') 19 | for text_single in text_list: 20 | text_single_c = text_single.split('[/dgimg]') 21 | if len(text_single_c) == 1: 22 | cs_json = {"c": text_single_c[0], "i": '', "w": '', "h": ''} 23 | cs.append(cs_json) 24 | else: 25 | # tmp_img_upload_json = upload_img_result.pop() 26 | pic_flag = text_single_c[1] 27 | img_params = text_single_c[0].split(';') 28 | i = img_params[0] 29 | w = img_params[1] 30 | h = img_params[2] 31 | cs_json = {"c": pic_flag, "i": i, "w": w, "h": h} 32 | cs.append(cs_json) 33 | 34 | strcs = json.dumps(cs) 35 | json_data = {"apisign": "99ea3eda4b4554adag2c4a741d58baa60", 36 | "user_id": user_id, 37 | "gid": gid, 38 | "t": title, 39 | "cs": strcs} 40 | # 上传帖子 41 | result_uploadpost = upload_post(json_data) 42 | 43 | # 更新状态2,成功上传帖子 44 | result_updateresult = dbhandle_update_status(url, 2) 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 网页爬虫设计 2 | ==== 3 | 4 | 创建项目 5 | ------- 6 | 7 | - 进入指定文件夹,右击空白处>在此处打开命令行窗口 8 | - 创建项目 9 | ``` 10 | Scrapy startproject DgSpider 11 | ``` 12 | 13 | 主要代码文件说明 14 | ------- 15 | 16 | - 爬虫主类 :UrlSpider.py、ContentSpider.py 17 | *项目包含2个爬虫主类,分别用于爬取文章列表页所有文章的URL、文章详情页具体内容* 18 | - 内容处理类 :pipelines.py 19 | *处理内容* 20 | - 传输字段类 :items.py 21 | *暂存爬取的数据* 22 | - 设置文件 :settings.py 23 | *用于主要的参数配置* 24 | - 数据库操作:mysqlUtils.py 25 | *链接操作数据库* 26 | - 文本处理、上传文本:PostHandle.py 27 | *处理文本* 28 | 29 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__init__.py -------------------------------------------------------------------------------- /__pycache__/PostHandle.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/PostHandle.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/__init__.cpython-34.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/commonUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/commonUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/contentSettings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/contentSettings.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/items.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/items.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/mysqlUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/mysqlUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/pipelines.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/pipelines.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/settings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/settings.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/uploadUtils.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/uploadUtils.cpython-34.pyc -------------------------------------------------------------------------------- /__pycache__/urlSettings.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/__pycache__/urlSettings.cpython-34.pyc -------------------------------------------------------------------------------- /check_post.py: -------------------------------------------------------------------------------- 1 | import requests, re 2 | import http 3 | import urllib 4 | 5 | 6 | def checkPost(): 7 | # 调用上传帖子接口 8 | CREATE_POST_URL = "http://api.test.net/robot/handlePost" 9 | 10 | fields={'group_id': '30', 11 | 'type': 1, 12 | 'apisign':'99ea3esdg45549162c4a741d58baa60'} 13 | 14 | r = requests.post(CREATE_POST_URL, data=fields) 
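    # NOTE: api.test.net is an internal test host and the 'apisign' value above looks
    # like a sample signature; both presumably have to be replaced with real values
    # before this check can return a meaningful result. The POST sends the
    # group_id/type/apisign form fields and the JSON response is printed below.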
15 | 16 | print(r.json()) 17 | -------------------------------------------------------------------------------- /commonUtils.py: -------------------------------------------------------------------------------- 1 | import random 2 | from hashlib import md5 3 | 4 | 5 | # 获取随机发帖ID 6 | def get_random_user(user_str): 7 | user_list = [] 8 | for user_id in str(user_str).split(','): 9 | user_list.append(user_id) 10 | userid_idx = random.randint(1, len(user_list)) 11 | user_chooesd = user_list[userid_idx-1] 12 | return user_chooesd 13 | 14 | 15 | # 获取MD5加密URL 16 | def get_linkmd5id(url): 17 | # url进行md5处理,为避免重复采集设计 18 | md5_url = md5(url.encode("utf8")).hexdigest() 19 | return md5_url 20 | -------------------------------------------------------------------------------- /contentSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DgSpider project 4 | 5 | # 图片储存 6 | IMAGES_STORE = 'D:\\pics\\jfss\\' 7 | 8 | # 爬取域名 9 | DOMAIN = 'nrsfh.com' 10 | 11 | # 图片域名前缀 12 | DOMAIN_HTTP = "http:" 13 | 14 | # 随机发帖用户 15 | CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ 16 | '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ 17 | '18441410,18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,' \ 18 | '18441420,18441421,18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,' \ 19 | '18441430,18441431,18441432,18441433,18441434,18441435,18441436,18441437,18441438,18441439,' \ 20 | '18441440,18441441,18441442,18441443,18441444,18441445,18441446,18441447,18441448,18441449,' \ 21 | '18441450,18441451,18441452,18441453,18441454,18441455,18441456,18441457,18441458,18441460,' \ 22 | '18441461,18441462,18441463,18441464,18441465,18441466,18441467,18441468,18441469,18441470,' \ 23 | '18441471,18441472,18441473,18441474,18441475,18441476,18441477,18441478,18441479,18441481,' \ 24 | '18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,18441490' 25 | 26 | # 爬虫名 27 | SPIDER_NAME = 'DgContentSpider' 28 | 29 | # 文章URL爬取规则XPATH 30 | POST_TITLE_XPATH = '//div[@class="title"]' 31 | POST_CONTENT_XPATH = '//div[@class="bodycss"]' 32 | 33 | -------------------------------------------------------------------------------- /items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # douguo Spider Item 5 | # @author zhangjianfei 6 | # @date 2017/04/07 7 | 8 | import scrapy 9 | 10 | 11 | class DgspiderUrlItem(scrapy.Item): 12 | url = scrapy.Field() 13 | 14 | 15 | class DgspiderPostItem(scrapy.Item): 16 | url = scrapy.Field() 17 | title = scrapy.Field() 18 | text = scrapy.Field() 19 | -------------------------------------------------------------------------------- /middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DgspiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
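    # This middleware only takes effect if it is registered under SPIDER_MIDDLEWARES
    # in settings.py; in this project that entry is currently commented out, so the
    # hooks below are not invoked during a crawl.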
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /mysqlUtils.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | import pymysql.cursors 3 | import os 4 | 5 | 6 | def dbhandle_online(): 7 | host = '192.168.1.235' 8 | user = 'root' 9 | passwd = 'douguo2015' 10 | charset = 'utf8' 11 | conn = pymysql.connect( 12 | host=host, 13 | user=user, 14 | passwd=passwd, 15 | charset=charset, 16 | use_unicode=False 17 | ) 18 | return conn 19 | 20 | 21 | def dbhandle_local(): 22 | host = '192.168.1.235' 23 | user = 'root' 24 | passwd = 'douguo2015' 25 | charset = 'utf8' 26 | conn = pymysql.connect( 27 | host=host, 28 | user=user, 29 | passwd=passwd, 30 | charset=charset, 31 | use_unicode=True 32 | # use_unicode=False 33 | ) 34 | return conn 35 | 36 | 37 | def dbhandle_geturl(gid): 38 | host = '192.168.1.235' 39 | user = 'root' 40 | passwd = 'douguo2015' 41 | charset = 'utf8' 42 | conn = pymysql.connect( 43 | host=host, 44 | user=user, 45 | passwd=passwd, 46 | charset=charset, 47 | use_unicode=False 48 | ) 49 | cursor = conn.cursor() 50 | sql = 'select url,spider_name,site,gid,module from dg_spider.dg_spider_post where status=0 and gid=%s limit 1' % gid 51 | try: 52 | cursor.execute(sql) 53 | result = cursor.fetchone() 54 | conn.commit() 55 | except Exception as e: 56 | print("***** exception") 57 | print(e) 58 | conn.rollback() 59 | 60 | if result is None: 61 | os._exit(0) 62 | else: 63 | url = result[0] 64 | spider_name = result[1] 65 | site = result[2] 66 | gid = result[3] 67 | module = result[4] 68 | return url.decode(), spider_name.decode(), site.decode(), gid.decode(), module.decode() 69 | 70 | 71 | def dbhandle_insert_content(url, title, content, user_id, has_img): 72 | host = '192.168.1.235' 73 | user = 'root' 74 | passwd = 'douguo2015' 75 | charset = 'utf8' 76 | conn = pymysql.connect( 77 | host=host, 78 | user=user, 79 | passwd=passwd, 80 | charset=charset, 81 | use_unicode=False 82 | ) 83 | cur = conn.cursor() 84 | 85 | # 如果标题或者内容为空,那么程序将退出,篇文章将会作废并将status设置为1,爬虫继续向下运行获得新的URl 86 | if 
content.strip() == '' or title.strip() == '': 87 | sql_fail = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' % ('1', url) 88 | try: 89 | cur.execute(sql_fail) 90 | result = cur.fetchone() 91 | conn.commit() 92 | except Exception as e: 93 | print(e) 94 | conn.rollback() 95 | os._exit(0) 96 | 97 | sql = 'update dg_spider.dg_spider_post set title="%s",content="%s",user_id="%s",has_img="%s" where url="%s" ' \ 98 | % (title, content, user_id, has_img, url) 99 | 100 | try: 101 | cur.execute(sql) 102 | result = cur.fetchone() 103 | conn.commit() 104 | except Exception as e: 105 | print(e) 106 | conn.rollback() 107 | return result 108 | 109 | 110 | def dbhandle_update_status(url, status): 111 | host = '192.168.1.235' 112 | user = 'root' 113 | passwd = 'douguo2015' 114 | charset = 'utf8' 115 | conn = pymysql.connect( 116 | host=host, 117 | user=user, 118 | passwd=passwd, 119 | charset=charset, 120 | use_unicode=False 121 | ) 122 | cur = conn.cursor() 123 | sql = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' \ 124 | % (status, url) 125 | try: 126 | cur.execute(sql) 127 | result = cur.fetchone() 128 | conn.commit() 129 | except Exception as e: 130 | print(e) 131 | conn.rollback() 132 | return result 133 | 134 | 135 | def dbhandle_get_content(url): 136 | host = '192.168.1.235' 137 | user = 'root' 138 | passwd = 'douguo2015' 139 | charset = 'utf8' 140 | conn = pymysql.connect( 141 | host=host, 142 | user=user, 143 | passwd=passwd, 144 | charset=charset, 145 | use_unicode=False 146 | ) 147 | cursor = conn.cursor() 148 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=1 and url="%s" limit 1' % url 149 | try: 150 | cursor.execute(sql) 151 | result = cursor.fetchone() 152 | conn.commit() 153 | except Exception as e: 154 | print("***** exception") 155 | print(e) 156 | conn.rollback() 157 | 158 | if result is None: 159 | os._exit(1) 160 | 161 | title = result[0] 162 | content = result[1] 163 | user_id = result[2] 164 | gid = result[3] 165 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 166 | 167 | 168 | # 获取爬虫初始化参数 169 | def dbhandle_get_spider_param(url): 170 | host = '192.168.1.235' 171 | user = 'root' 172 | passwd = 'douguo2015' 173 | charset = 'utf8' 174 | conn = pymysql.connect( 175 | host=host, 176 | user=user, 177 | passwd=passwd, 178 | charset=charset, 179 | use_unicode=False 180 | ) 181 | cursor = conn.cursor() 182 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=0 and url="%s" limit 1' % url 183 | result = '' 184 | try: 185 | cursor.execute(sql) 186 | result = cursor.fetchone() 187 | conn.commit() 188 | except Exception as e: 189 | print("***** exception") 190 | print(e) 191 | conn.rollback() 192 | title = result[0] 193 | content = result[1] 194 | user_id = result[2] 195 | gid = result[3] 196 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 197 | -------------------------------------------------------------------------------- /pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # If you have many piplelines, all should be init here 5 | # and use IF to judge them 6 | # 7 | # DOUGUO Spider pipelines 8 | # @author zhangjianfei 9 | # @date 2017/04/13 10 | 11 | import re 12 | import urllib.request 13 | from DgSpider import urlSettings 14 | from DgSpider import contentSettings 15 | from DgSpider.mysqlUtils import 
dbhandle_insert_content 16 | from DgSpider.uploadUtils import uploadImage 17 | from DgSpider.mysqlUtils import dbhandle_online 18 | from DgSpider.mysqlUtils import dbhandle_update_status 19 | from bs4 import BeautifulSoup 20 | from DgSpider.PostHandle import post_handel 21 | from DgSpider.commonUtils import get_random_user 22 | from DgSpider.commonUtils import get_linkmd5id 23 | 24 | 25 | class DgPipeline(object): 26 | # post构造reply 27 | cs = [] 28 | 29 | # 帖子title 30 | title = '' 31 | 32 | # 帖子文本 33 | text = '' 34 | 35 | # 当前爬取的url 36 | url = '' 37 | 38 | # 随机用户ID 39 | user_id = '' 40 | 41 | # 图片flag 42 | has_img = 0 43 | 44 | # get title flag 45 | get_title_flag = 0 46 | 47 | def __init__(self): 48 | DgPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER) 49 | 50 | # process the data 51 | def process_item(self, item, spider): 52 | self.get_title_flag += 1 53 | 54 | # pipeline for content 55 | if spider.name == contentSettings.SPIDER_NAME: 56 | 57 | # 获取当前网页url 58 | DgPipeline.url = item['url'] 59 | 60 | # 获取post title 61 | if len(item['title']) == 0: 62 | title_tmp = '' 63 | else: 64 | title_tmp = item['title'][0] 65 | 66 | # 替换标题中可能会引起 sql syntax 的符号 67 | # 对于分页的文章,只取得第一页的标题 68 | if self.get_title_flag == 1: 69 | 70 | # 使用beautifulSoup格什化标题 71 | soup_title = BeautifulSoup(title_tmp, "lxml") 72 | title = '' 73 | 74 | # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 75 | # 多行的空格、换行,使用stripped_strings获取文本 76 | for string in soup_title.stripped_strings: 77 | title += string 78 | 79 | title = title.replace("'", "”").replace('"', '“') 80 | DgPipeline.title = title 81 | 82 | # 获取正post内容 83 | if len(item['text']) == 0: 84 | text_temp = '' 85 | else: 86 | text_temp = item['text'][0] 87 | 88 | # 获取图片 89 | reg_img = re.compile(r'') 90 | imgs = reg_img.findall(text_temp) 91 | for img in imgs: 92 | DgPipeline.has_img = 1 93 | 94 | match_obj = re.search('.*src="(.*)".*', img, re.M | re.I) 95 | img_url_tmp = match_obj.group(1) 96 | 97 | # 去除所有Http:标签 98 | img_url_tmp = img_url_tmp.replace("http:", "") 99 | 100 | # 对于这种情况单独处理 101 | imgUrl_tmp_list = img_url_tmp.split('"') 102 | img_url_tmp = imgUrl_tmp_list[0] 103 | 104 | # 加入http 105 | imgUrl = 'http:' + img_url_tmp 106 | 107 | list_name = imgUrl.split('/') 108 | file_name = list_name[len(list_name)-1] 109 | 110 | # 获取图片本地存储路径 111 | file_path = contentSettings.IMAGES_STORE + file_name 112 | 113 | # 获取图片并上传至本地 114 | urllib.request.urlretrieve(imgUrl, file_path) 115 | upload_img_result_json = uploadImage(file_path, 'image/jpeg', DgPipeline.user_id) 116 | 117 | # 获取上传之后返回的服务器图片路径、宽、高 118 | img_u = upload_img_result_json['result']['image_url'] 119 | img_w = upload_img_result_json['result']['w'] 120 | img_h = upload_img_result_json['result']['h'] 121 | img_upload_flag = str(img_u)+';'+str(img_w)+';'+str(img_h) 122 | 123 | # 在图片前后插入字符标记 124 | text_temp = text_temp.replace(img, '[dgimg]' + img_upload_flag + '[/dgimg]') 125 | 126 | # 使用beautifulSoup格什化HTML 127 | soup = BeautifulSoup(text_temp, "lxml") 128 | text = '' 129 | 130 | # 对于bs之后的html树形结构,不使用.prettify(),对于bs, prettify后每一个标签自动换行,造成多个、 131 | # 多行的空格、换行 132 | for string in soup.stripped_strings: 133 | text += string + '\n' 134 | 135 | # 替换因为双引号为中文双引号,避免 mysql syntax 136 | DgPipeline.text = self.text + text.replace('"', '“') 137 | 138 | # 对于分页的文章,每一页之间加入换行 139 | # DgPipeline.text += (DgPipeline.text + '\n') 140 | 141 | # pipeline for url 142 | elif spider.name == urlSettings.SPIDER_NAME: 143 | db_object = dbhandle_online() 144 | cursor = db_object.cursor() 145 | 146 | for url 
in item['url']: 147 | linkmd5id = get_linkmd5id(url) 148 | spider_name = contentSettings.SPIDER_NAME 149 | site = urlSettings.DOMAIN 150 | gid = urlSettings.GROUP_ID 151 | module = urlSettings.MODULE 152 | status = '0' 153 | sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url="%s"' % linkmd5id 154 | sql = 'insert into dg_spider.dg_spider_post(md5_url, url, spider_name, site, gid, module, status) ' \ 155 | 'values("%s", "%s", "%s", "%s", "%s", "%s", "%s")' \ 156 | % (linkmd5id, url, spider_name, site, gid, module, status) 157 | 158 | try: 159 | # 判断url是否存在,如果不存在,则插入 160 | cursor.execute(sql_search) 161 | result_search = cursor.fetchone() 162 | if result_search is None or result_search[0].strip() == '': 163 | cursor.execute(sql) 164 | result = cursor.fetchone() 165 | db_object.commit() 166 | except Exception as e: 167 | print(">>> catch exception !") 168 | print(e) 169 | db_object.rollback() 170 | 171 | return item 172 | 173 | # spider开启时被调用 174 | def open_spider(self, spider): 175 | pass 176 | 177 | # sipder 关闭时被调用 178 | def close_spider(self, spider): 179 | if spider.name == contentSettings.SPIDER_NAME: 180 | # 数据入库:235 181 | url = DgPipeline.url 182 | title = DgPipeline.title 183 | content = DgPipeline.text 184 | user_id = DgPipeline.user_id 185 | dbhandle_insert_content(url, title, content, user_id, DgPipeline.has_img) 186 | 187 | # 处理文本、设置status、上传至dgCommunity.dg_post 188 | # 如果判断has_img为1,那么上传帖子 189 | if DgPipeline.has_img == 1: 190 | if title.strip() != '' and content.strip() != '': 191 | spider.logger.info('has_img=1,title and content is not null! Uploading post into db...') 192 | post_handel(url) 193 | else: 194 | spider.logger.info('has_img=1,but title or content is null! ready to exit...') 195 | pass 196 | else: 197 | spider.logger.info('has_img=0, changing status and ready to exit...') 198 | pass 199 | 200 | elif spider.name == urlSettings.SPIDER_NAME: 201 | pass 202 | 203 | -------------------------------------------------------------------------------- /postSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 图片储存 4 | IMAGES_STORE = 'D:\\pics\\pregnantMonther\\' 5 | 6 | # 图片域名前缀 7 | DOMAIN = "http:" 8 | 9 | # 发帖用户ID 10 | CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ 11 | '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ 12 | '18441410,'\ 13 | '18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,18441420,18441421,' \ 14 | '18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,18441430,18441431,18441432,' \ 15 | '18441433,18441434,18441435,18441436,18441437,18441438,18441439,18441440,18441441,18441442,18441443,' \ 16 | '18441444,18441445,18441446,18441447,18441448,18441449,18441450,18441451,18441452,18441453,18441454,' \ 17 | '18441455,18441456,18441457,18441458,18441460,18441461,18441462,18441463,18441464,18441465,18441466,' \ 18 | '18441467,18441468,18441469,18441470,18441471,18441472,18441473,18441474,18441475,18441476,18441477,' \ 19 | '18441478,18441479,18441481,18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,' \ 20 | '18441490' 21 | 22 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DgSpider project 4 | # 
5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'DgSpider' 13 | 14 | SPIDER_MODULES = ['DgSpider.spiders'] 15 | NEWSPIDER_MODULE = 'DgSpider.spiders' 16 | 17 | # 注册PIPELINES 18 | ITEM_PIPELINES = { 19 | 'DgSpider.pipelines.DgPipeline': 1 20 | } 21 | 22 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 23 | # USER_AGENT = 'DgSpider (+http://www.yourdomain.com)' 24 | 25 | # Obey robots.txt rules 26 | ROBOTSTXT_OBEY = True 27 | 28 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 29 | #CONCURRENT_REQUESTS = 32 30 | 31 | # Configure a delay for requests for the same website (default: 0) 32 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 33 | # See also autothrottle settings and docs 34 | #DOWNLOAD_DELAY = 3 35 | # The download delay setting will honor only one of: 36 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 37 | #CONCURRENT_REQUESTS_PER_IP = 16 38 | 39 | # Disable cookies (enabled by default) 40 | COOKIES_ENABLED = False 41 | 42 | # Disable Telnet Console (enabled by default) 43 | #TELNETCONSOLE_ENABLED = False 44 | 45 | # Override the default request headers: 46 | #DEFAULT_REQUEST_HEADERS = { 47 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 48 | # 'Accept-Language': 'en', 49 | #} 50 | 51 | # Enable or disable spider middlewares 52 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 53 | #SPIDER_MIDDLEWARES = { 54 | # 'DgSpider.middlewares.DgspiderSpiderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable downloader middlewares 58 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 59 | #DOWNLOADER_MIDDLEWARES = { 60 | # 'DgSpider.middlewares.MyCustomDownloaderMiddleware': 543, 61 | #} 62 | 63 | # Enable or disable extensions 64 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 65 | #EXTENSIONS = { 66 | # 'scrapy.extensions.telnet.TelnetConsole': None, 67 | #} 68 | 69 | # Configure item pipelines 70 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 71 | #ITEM_PIPELINES = { 72 | # 'DgSpider.pipelines.DgspiderPipeline': 300, 73 | #} 74 | 75 | # Enable and configure the AutoThrottle extension (disabled by default) 76 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 77 | #AUTOTHROTTLE_ENABLED = True 78 | # The initial download delay 79 | #AUTOTHROTTLE_START_DELAY = 5 80 | # The maximum download delay to be set in case of high latencies 81 | #AUTOTHROTTLE_MAX_DELAY = 60 82 | # The average number of requests Scrapy should be sending in parallel to 83 | # each remote server 84 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 85 | # Enable showing throttling stats for every response received: 86 | #AUTOTHROTTLE_DEBUG = False 87 | 88 | # Enable and configure HTTP caching (disabled by default) 89 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 90 | #HTTPCACHE_ENABLED = True 91 | #HTTPCACHE_EXPIRATION_SECS = 0 92 | #HTTPCACHE_DIR = 'httpcache' 93 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 94 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 
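# Note: both spiders feed the single DgPipeline registered in ITEM_PIPELINES above;
# the pipeline branches on spider.name (DgUrlSpider vs. DgContentSpider) to decide
# whether it is storing candidate URLs or assembling article content.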
95 | -------------------------------------------------------------------------------- /spiders/ContentSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from DgSpider.mysqlUtils import dbhandle_geturl 5 | from DgSpider.items import DgspiderPostItem 6 | from scrapy.selector import Selector 7 | from scrapy.http import Request 8 | from DgSpider import contentSettings 9 | from DgSpider import urlSettings 10 | from DgSpider.mysqlUtils import dbhandle_update_status 11 | 12 | 13 | class DgContentSpider(scrapy.Spider): 14 | print('Spider DgContentSpider Staring...') 15 | 16 | result = dbhandle_geturl(urlSettings.GROUP_ID) 17 | 18 | url = result[0] 19 | spider_name = result[1] 20 | site = result[2] 21 | gid = result[3] 22 | module = result[4] 23 | 24 | name = 'DgContentSpider' 25 | 26 | # 设定爬取域名范围 27 | allowed_domains = [site] 28 | 29 | # 爬取地址 30 | start_urls = [url] 31 | 32 | start_urls_tmp = [] 33 | """构造分页序列,一般来说遵循规则 url.html,url_2.html,url_3.html,并且url.html也写为url_1.html""" 34 | for i in range(6, 1, -1): 35 | start_single = url[:-5] 36 | start_urls_tmp.append(start_single+"_"+str(i)+".html") 37 | 38 | # 更新状态 39 | """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" 40 | dbhandle_update_status(url, 1) 41 | 42 | # 爬取方法 43 | def parse(self, response): 44 | item = DgspiderPostItem() 45 | 46 | # sel : 页面源代码 47 | sel = Selector(response) 48 | 49 | item['url'] = DgContentSpider.url 50 | 51 | # 对于title,
<div class="title"><p>标题1</p></div>
,使用下列方法取得 52 | data_title_tmp = sel.xpath(contentSettings.POST_TITLE_XPATH) 53 | item['title'] = data_title_tmp.xpath('string(.)').extract() 54 | 55 | item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() 56 | 57 | yield item 58 | 59 | if self.start_urls_tmp: 60 | url = self.start_urls_tmp.pop() 61 | yield Request(url, callback=self.parse) 62 | -------------------------------------------------------------------------------- /spiders/UrlSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from DgSpider.items import DgspiderUrlItem 5 | from scrapy.selector import Selector 6 | from DgSpider import urlSettings 7 | 8 | 9 | class DgUrlSpider(scrapy.Spider): 10 | print('Spider DgUrlSpider Staring...') 11 | 12 | # 爬虫名 必须静态指定 13 | # name = urlSettings.SPIDER_NAME 14 | name = 'DgUrlSpider' 15 | 16 | # 设定域名 17 | allowed_domains = [urlSettings.DOMAIN] 18 | 19 | # 爬取地址 20 | url_list = [] 21 | """一般来说,列表页第一页不符合规则,单独append""" 22 | url_list.append(urlSettings.START_LIST_URL) 23 | loop = urlSettings.LIST_URL_RULER_LOOP 24 | for i in range(1, loop): 25 | url = urlSettings.LIST_URL_RULER_PREFIX + str(i) + urlSettings.LIST_URL_RULER_SUFFIX 26 | url_list.append(url) 27 | start_urls = url_list 28 | 29 | # 爬取方法 30 | def parse(self, response): 31 | 32 | # sel : 页面源代码 33 | sel = Selector(response) 34 | 35 | item_url = DgspiderUrlItem() 36 | url_item = [] 37 | 38 | # XPATH获取url 39 | url_list = sel.xpath(urlSettings.POST_URL_XPATH).extract() 40 | 41 | # 消除http前缀差异 42 | for url in url_list: 43 | url = url.replace('http:', '') 44 | url_item.append('http:' + url) 45 | 46 | # list去重 47 | url_item = list(set(url_item)) 48 | item_url['url'] = url_item 49 | 50 | yield item_url 51 | -------------------------------------------------------------------------------- /spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
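#
# Typical run order for this project (a sketch, not an official launcher script):
# crawl the list pages first so dg_spider.dg_spider_post is filled with article URLs
# at status=0, then run the content spider, which claims one pending URL per run:
#
#     scrapy crawl DgUrlSpider
#     scrapy crawl DgContentSpider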
5 | -------------------------------------------------------------------------------- /spiders/__pycache__/ContentSpider.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/ContentSpider.cpython-34.pyc -------------------------------------------------------------------------------- /spiders/__pycache__/UrlSpider.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/UrlSpider.cpython-34.pyc -------------------------------------------------------------------------------- /spiders/__pycache__/__init__.cpython-34.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zjfGit/Scrapy-Spider-based-on-Python3/d14ea6bb3d766d8908645e47e1e4d5d7d170d9cb/spiders/__pycache__/__init__.cpython-34.pyc -------------------------------------------------------------------------------- /uploadUtils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests_toolbelt.multipart.encoder import MultipartEncoder 3 | 4 | 5 | def upload_post(json_data): 6 | # 上传帖子 ,参考:http://192.168.2.25:3000/api/interface/2016 7 | # create_post_url = "http://api.qa.douguo.net/robot/uploadimagespost" 8 | create_post_url = "http://api.douguo.net/robot/uploadimagespost" 9 | 10 | # 传帖子 11 | # dataJson = json.dumps({"user_id":"19013245","gid":30,"t":"2017-03-23","cs":[{"c":"啦啦啦","i":"","w":0,"h":0}, 12 | # {"c":"啦啦啦2222","i":"http://wwww.douguo.com/abc.jpg","w":0,"h":0}],"time":1235235234}) 13 | # jsonData = {"user_id":"19013245","gid":5,"t":"TEST","cs":'[{"c":"啊啊啊","i":"qqq","w":12,"h":10}, 14 | # {"c":"这个内容真不错","i":"http://wwww.baidu.com","w":10,"h":10}]',"time":61411313} 15 | 16 | # print(jsonData) 17 | req_post = requests.post(create_post_url, data=json_data) 18 | print(req_post.json()) 19 | # print(reqPost.text) 20 | 21 | 22 | def uploadImage(img_path, content_type, user_id): 23 | # 上传单个图片 , 参考:http://192.168.2.25:3000/api/interface/2015 24 | # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage" 25 | UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage" 26 | # 传图片 27 | 28 | m = MultipartEncoder( 29 | # fields={'user_id': '192323', 30 | # 'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')} 31 | fields={'user_id': user_id, 32 | 'apisign': '99ea3eda4b45549162c4a741d58baa60', 33 | 'image': ('filename', open(img_path, 'rb'), 'image/jpeg')} 34 | ) 35 | 36 | r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type}) 37 | print(r.json()) 38 | # print(r.text) 39 | return r.json() 40 | # return r.text -------------------------------------------------------------------------------- /urlSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 爬取域名 4 | DOMAIN = 'eastlady.cn' 5 | 6 | # 爬虫名 7 | SPIDER_NAME = 'DgUrlSpider' 8 | 9 | GROUP_ID = '33' 10 | 11 | MODULE = '999' 12 | 13 | # 文章列表页起始爬取URL 14 | START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html' 15 | 16 | # 文章列表循环规则 17 | LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/' 18 | LIST_URL_RULER_SUFFIX = '.html' 19 | LIST_URL_RULER_LOOP = 30 20 | 21 | # 文章URL爬取规则XPATH 22 | 
POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href' 23 | 24 | --------------------------------------------------------------------------------
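For reference, a minimal self-contained sketch (not part of the repository) of the intermediate text format that DgPipeline and PostHandle.py agree on: the pipeline replaces every downloaded image with a `[dgimg]url;width;height[/dgimg]` marker inside the stored content, and `post_handel()` later splits that content back into the `cs` list that is uploaded with the post. The sample content string and image URL below are invented for illustration only.

```python
import json

# Content as DgPipeline would store it: plain text with inline [dgimg]url;w;h[/dgimg] markers.
content = ("Text before the image."
           "[dgimg]http://img.example.com/a.jpg;640;480[/dgimg]"
           "Text after the image.")

cs = []
for part in content.split('[dgimg]'):
    piece = part.split('[/dgimg]')
    if len(piece) == 1:
        # Pure text segment: only the "c" field is filled.
        cs.append({"c": piece[0], "i": '', "w": '', "h": ''})
    else:
        # Image segment: piece[0] is "url;w;h", piece[1] is the text that follows the image.
        img_url, width, height = piece[0].split(';')
        cs.append({"c": piece[1], "i": img_url, "w": width, "h": height})

print(json.dumps(cs))
# [{"c": "Text before the image.", "i": "", "w": "", "h": ""},
#  {"c": "Text after the image.", "i": "http://img.example.com/a.jpg", "w": "640", "h": "480"}]
```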