├── README.md ├── SpiderKeeper.py ├── commands └── crawlall.py ├── commonUtils.py ├── ghostdriver.log ├── items.py ├── middlewares.py ├── middlewares └── middleware.py ├── mysqlUtils.py ├── notusedspiders ├── ContentSpider.py ├── ContentSpider_real.py ├── DgContentSpider_PhantomJS.py ├── DgUrlSpider_PhantomJS.py ├── PostHandle.py ├── UrlSpider.py ├── check_post.py ├── contentSettings.py ├── params.js ├── uploadUtils.py └── utils.py ├── pipelines.py ├── settings.py ├── setup.py ├── spiders ├── UrlSpider_JFSH.py ├── UrlSpider_MSZT.py ├── UrlSpider_SYDW.py ├── UrlSpider_YLBG.py ├── UrlSpider_YMYE.py └── __init__.py ├── test.py ├── urlSettings.py └── webBrowserPools ├── ghostdriver.log └── pool.py /README.md: -------------------------------------------------------------------------------- 1 | # 爬虫Windows环境搭建 2 | ## 安装需要的程序包 3 | - Python3.4.3 > https://pan.baidu.com/s/1pK8KDcv 4 | - pip9.0.1 > https://pan.baidu.com/s/1mhNdRN6 5 | - 编辑器pycharm > https://pan.baidu.com/s/1i4Nkdk5 6 | - pywin32 > http://pan.baidu.com/s/1pKZiZWZ 7 | - pyOpenSSL > http://pan.baidu.com/s/1hsgOQJq 8 | - windows_sdk > http://pan.baidu.com/s/1hrM6iRa 9 | - phantomjs > http://pan.baidu.com/s/1nvHm5AD 10 | 11 | ## 安装过程 12 | 13 | ### 安装基础环境 14 | 1. 安装Python安装包,一路Next 15 | 2. 将Python的安装目录添加到环境变量Path中 16 | 3. win + r 输入Cmd打开命令行窗口,输入Python 测试是否安装成功 17 | 18 | ### 安装pip 19 | > pip的作用相当于linux的yum,安装之后可以采用命令行的方式在线安装一些依赖包 20 | 1. 解压pip压缩包到某一目录(推荐与Python基础环境目录同级) 21 | 2. cmd窗口进入pip解压目录 22 | 3. 输入 python setup.py install 进行安装,安装过程中将会在Python目录的scripts目录下进行 23 | 4. 将pip的安装目录 C:\Python34\Scripts; 配置到环境变量path中 24 | 5. cmd命令行输入pip list 或者 pip --version 进行检验 25 | 26 | ### 安装Scrapy 27 | > Scrapy是一个比较成熟的爬虫框架,使用它可以进行网页内容的抓取,但是对于windows并不友好,我们需要一些类库去支持它 28 | 1. 安装pywin32: 一路next即可 29 | 2. 安装wheel:安装scrapy时需要一些whl文件的安装,whl文件的安装需要预先配置wheel文件。在cmd下使用pip安装 : pip install wheel 30 | 3. 安装PyOpenSSL:下载完成PyOpenSSL后,进入下载所在目录,执行安装:pip install pyOpenSSl (**注意,执行安装的wheel文件名一定要tab键自动弹出,不要手动敲入**) 31 | 4. 安装lxml: 直接使用pip在线安装 pip install lxml 32 | > ***在Windows的安装过程中,一定会出现 “error: Microsoft Visual C++ 10.0 is required (Unable to find vcvarsall.bat).”的问题,也就是无法找到相对应的编译包。一般的做法是下载VisualStudio来获得Complier,但是我们不这样做。*** 33 | 34 | > 下载windows-sdk后,执行安装操作,如果安装成功,那么这个问题就解决了。如果失败,那么需要先把安装失败过程中的2个编译包卸载。他们分别为:Microsoft Visual C++ 2010 x86 Redistributable、Microsoft Visual C++ 2010 x64 Redistributable(可以使用360或者腾讯管家来卸载) 35 | 36 | > 卸载完成之后,在安装确认过程中,不要勾选Visual C++ compiler,这样他第一次就能安装成功。安装成功之后,再次点击sdk进行安装,这时候又需要把Visual C++ compiler勾选上,再次执行安装。完成以上操作后,就不会出现Microsoft Visual C++ 10.0 is required的问题了。 37 | 38 | > 如果在安装过程中出现“failed building wheel for xxx”的问题,那么需要手动下载wheel包进行安装,所有的安装文件都可以在[http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/)里找到,找到需要的包并下载完成后执行pip install xxxx即可。 39 | 40 | 5. 
安装Scrapy:pip install Scrapy, 安装完成后可以再命令行窗口输入Scrapy进行验证。 41 | 42 | 43 | 44 | 45 | 46 | # 爬虫架构设计 47 | 为了更好的扩展性和爬虫工作的易于监控,爬虫项目分成3个子项目,分别是url提取、内容爬取、内容更新(包括更新线上内容和定时审核) 48 | 49 | 主要是采用 Python 编写的scrapy框架,scrapy是目前非常热门的一种爬虫框架,它把整个爬虫过程分为了多个独立的模块,并提供了多个基类可以供我们去自由扩展,让爬虫编写变得简单而有逻辑性。并且scrapy自带的多线程、异常处理、以及强大的自定义Settings也让整个数据抓取过程变得高效而稳定。 50 | scrapy-redis:一个三方的基于redis的分布式爬虫框架,配合scrapy使用,让爬虫具有了分布式爬取的功能。github地址: https://github.com/darkrho/scrapy-redis 51 | mongodb 、mysql 或其他数据库:针对不同类型数据可以根据具体需求来选择不同的数据库存储。结构化数据可以使用mysql节省空间,非结构化、文本等数据可以采用mongodb等非关系型数据提高访问速度。具体选择可以自行百度谷歌,有很多关于sql和nosql的对比文章。 52 | 53 | 其实对于已有的scrapy程序,对其扩展成分布式程序还是比较容易的。总的来说就是以下几步: 54 | 55 | * 找一台高性能服务器,用于redis队列的维护以及数据的存储。 56 | * 扩展scrapy程序,让其通过服务器的redis来获取start_urls,并改写pipeline里数据 存储部分,把存储地址改为服务器地址。 57 | * 在服务器上写一些生成url的脚本,并定期执行。 58 | 59 | # 1 url提取 60 | ## 1.1 分布式抓取的原理 61 | 采用scrapy-redis实现分布式,其实从原理上来说很简单,这里为描述方便,我们把自己的核心服务器称为master,而把用于跑爬虫程序的机器称为slave。 62 | 63 | 我们知道,采用scrapy框架抓取网页,我们需要首先给定它一些start_urls,爬虫首先访问start_urls里面的url,再根据我们的具体逻辑,对里面的元素、或者是其他的二级、三级页面进行抓取。而要实现分布式,我们只需要在这个starts_urls里面做文章就行了。 64 | 65 | 我们在master上搭建一个redis数据库(注意这个数据库只用作url的存储,不关心爬取的具体数据,不要和后面的mongodb或者mysql混淆),并对每一个需要爬取的网站类型,都开辟一个单独的列表字段。通过设置slave上scrapy-redis获取url的地址为master地址。这样的结果就是,尽管有多个slave,然而大家获取url的地方只有一个,那就是服务器master上的redis数据库。 66 | 67 | 并且,由于scrapy-redis自身的队列机制,slave获取的链接不会相互冲突。这样各个slave在完成抓取任务之后,再把获取的结果汇总到服务器上(这时的数据存储不再在是redis,而是mongodb或者 mysql等存放具体内容的数据库了) 68 | 69 | 这种方法的还有好处就是程序移植性强,只要处理好路径问题,把slave上的程序移植到另一台机器上运行,基本上就是复制粘贴的事情。 70 | 71 | ## 1.2 url的提取 72 | 首先明确一点,url是在master而不是slave上生成的。 73 | 74 | 对于每一个门类的urls(每一个门类对应redis下的一个字段,表示一个url的列表),我们可以单独写一个生成url的脚本。这个脚本要做的事很简单,就是按照我们需要的格式,构造除url并添加到redis里面。 75 | 76 | 对于slave,我们知道,scrapy可以通过Settings来让爬取结束之后不自动关闭,而是不断的去询问队列里有没有新的url,如果有新的url,那么继续获取url并进行爬取。利用这一特性,我们就可以采用控制url的生成的方法,来控制slave爬虫程序的爬取。 77 | 78 | ## 1.3 url的处理 79 | 1、判断URL指向网站的域名,如果指向外部网站,直接丢弃 80 | 2、URL去重,然后URL地址存入redis和数据库; 81 | 82 | # 2 内容爬取 83 | ## 2.1 定时爬取 84 | 有了上面的介绍,定时抓取的实现就变得简单了,我们只需要定时的去执行url生成的脚本即可。这里推荐linux下的crontab指令,能够非常方便的制定定时任务,具体的介绍大家可以自行查看文档。 85 | 86 | ## 2.2 87 | # 3 内容更新 88 | ## 3.1 表设计 89 | 帖子爬取表: 90 | id :自增主键 91 | md5_url :md5加密URL 92 | url :爬取目标URL 93 | title :爬取文章标题 94 | content :爬取文章内容(已处理) 95 | user_id :随机发帖的用户ID 96 | spider_name :爬虫名 97 | site :爬取域名 98 | gid :灌入帖子的ID 99 | module : 100 | status :状态 (1:已爬取;0:未爬取) 101 | use_time :爬取时间 102 | create_time :创建时间 103 | CREATE TABLE `NewTable` ( 104 | `id` bigint(20) NOT NULL AUTO_INCREMENT , 105 | `md5_url` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 106 | `url` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 107 | `title` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 108 | `content` mediumtext CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 109 | `user_id` varchar(30) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 110 | `spider_name` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 111 | `site` varchar(100) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 112 | `gid` varchar(10) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 113 | `module` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL , 114 | `status` tinyint(4) NOT NULL DEFAULT 0 , 115 | `use_time` datetime NOT NULL , 116 | `create_time` datetime NOT NULL , 117 | PRIMARY KEY (`id`) 118 | ) 119 | ENGINE=InnoDB 120 | DEFAULT CHARACTER SET=utf8 COLLATE=utf8_general_ci 121 | AUTO_INCREMENT=4120 122 | ROW_FORMAT=COMPACT; 123 | 124 | 125 | 126 | # 4 系统优化 127 | ## 4.1 防抓取方法 128 | * 
设置download_delay,这个方法基本上属于万能的,理论上只要你的delay足够长,网站服务器都没办法判断你是正常浏览还是爬虫。但它带来的副作用也是显然的:大量降低爬取效率。因此这个我们可能需要进行多次测试来得到一个合适的值。有时候download_delay可以设为一个范围随机值。 129 | * 随机生成User-agent:更改User-agent能够防止一些403或者400的错误,基本上属于每个爬虫都会写的。这里我们可以重写scrapy 里的middleware,让程序每次请求都随机获取一个User-agent,增大隐蔽性。具体实现可以参考 http://www.sharejs.com/codes/python/8310 130 | * 设置代理IP池:网上有很多免费或收费的代理池,可以借由他们作为中介来爬。一个问题是速度不能保证,第二个问题是,这些代理很多可能本来就没办法用。因此如果要用这个方法,比较靠谱的做法是先用程序筛选一些好用的代理,再在这些代理里面去随机、或者顺序访问。 131 | * 设置好header里面的domian和host,有些网站,比如雪球网会根据这两项来判断请求来源,因此也是要注意的地方。 132 | 133 | ## 4.2 程序化管理、web管理 134 | 上述方法虽然能够实现一套完整的流程,但在具体操作过程中还是比较麻烦,可能的话还可以架构web服务器,通过web端来实现url的添加、爬虫状态的监控等,能够减轻非常大的工作量。这些内容如果要展开实在太多,这里就只提一下。 135 | 136 | 137 | 138 | # 5 scrapy部署 139 | ## 5.1 安装python3.6 140 | 141 | ``` 142 | ``` 143 | 1、下载源代码 144 | wget https://www.python.org/ftp/python/3.6.1/Python-3.6.1.tgz 145 | 146 | 2、解压文件 147 | cp Python-3.6.1.tgz /usr/local/goldmine/ 148 | tar -xvf Python-3.6.1.tgz 149 | 150 | 3、编译 151 | ./configure --prefix=/usr/local 152 | 153 | 4、安装 154 | make && make altinstall 155 | 156 | 注意:这里使用的是make altinstall ,如果使用make install,会在系统中有两个版本的Python在/usr/bin/目录中,可能会导致问题。 157 | 4.1 报错---zipimport.ZipImportError: can't decompress data; zlib not available 158 | # http://www.zlib.net/zlib-1.2.11.tar 159 | ============================================= 160 | 使用root用户: 161 | 162 | wget http://www.zlib.net/zlib-1.2.11.tar 163 | tar -xvf zlib-1.2.11.tar.gz 164 | cd zlib-1.2.11 165 | ./configure 166 | make 167 | sudo make install 168 | ============================================= 169 | 安装完zlib,重新执行 Python-3.6.1中的 make && make altinstall 即可安装成功; 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | # 5.2 服务安装虚拟环境【root安装】 178 | 安装virtualenv可以搭建虚拟且独立的python环境,使每个项目环境和其他的项目独立开来,保持环境的干净,解决包冲突。 179 | 180 | ### 5.2.1 安装virtualenv 181 | /usr/local/bin/pip3.6 install virtualenv 182 | 183 | 结果报错了, 184 | =============== 185 | pip is configured with locations that require TLS/SSL, however the ssl module in Python is not available. 186 | Collecting virtualenv 187 | Could not fetch URL https://pypi.python.org/simple/virtualenv/: There was a problem confirming the ssl certificate: Can't connect to HTTPS URL because the SSL module is not available. 
- skipping 188 | =============== 189 | rpm -aq | grep openssl ,发现缺少 openssl-devel ; 190 | 【route add default gw 192.168.1.219】 191 | yum install openssl-devel -y 192 | 然后,重新编译python,见 5.1 ; 193 | ### 5.2.2 创建新的虚拟环境 194 | virtualenv -p /usr/local/bin/python3.6 python3.6-env 195 | 196 | ### 5.2.3 激活虚拟环境 197 | source python3.6-env/bin/active 198 | 199 | 5.2.3.1 虚拟环境中安装 python 200 | 201 | ### 5.2.4 退出虚拟环境 202 | deactive 203 | 204 | # 5.2 安装scrapy 205 | 206 | # 5.3 安装配置redis 207 | yum install redis 208 | # 5.4 209 | 210 | # 6 redis安装&配置 211 | ## 6.1 安装 212 | mac : sudo brew install redis 213 | /usr/local/bin/redis-server /usr/local/etc/redis.conf 214 | 215 | # 参考 216 | * 1.[基于Python,scrapy,redis的分布式爬虫实现框架](http://ju.outofmemory.cn/entry/206756) 217 | * 2.[小白进阶之Scrapy第三篇(基于Scrapy-Redis的分布式以及cookies池)](http://ju.outofmemory.cn/entry/299500) 218 | * 3.[CentOS中使用virtualenv搭建python3环境](http://www.jb51.net/article/67393.htm) 219 | * 4.[CentOS使用virtualenv搭建独立的Python环境](http://www.51ou.com/browse/linuxwt/60216.html) 220 | * 5.[python虚拟环境安装和配置](http://blog.csdn.net/pipisorry/article/details/39998317) 221 | -------------------------------------------------------------------------------- /SpiderKeeper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import time 4 | import threading 5 | from scrapy import cmdline 6 | 7 | # def ylbg(): 8 | # print(">> thread.staring ylbg ...") 9 | # cmdline.execute("scrapy crawl UrlSpider_YLBG".split()) 10 | # print(">> thread.ending ylbg ...") 11 | # 12 | # def sydw(): 13 | # print(">> thread.starting sydw ...") 14 | # cmdline.execute("scrapy crawl UrlSpider_SYDW".split()) 15 | # print(">> thread.ending sydw ...") 16 | # 17 | # threading._start_new_thread(ylbg()) 18 | # threading._start_new_thread(sydw()) 19 | 20 | # 配置 commands ,执行 scrapy list 下的所有spider 21 | cmdline.execute("scrapy crawlall".split()) 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /commands/crawlall.py: -------------------------------------------------------------------------------- 1 | from scrapy.commands import ScrapyCommand 2 | from scrapy.crawler import CrawlerRunner 3 | from scrapy.utils.conf import arglist_to_dict 4 | 5 | 6 | class Command(ScrapyCommand): 7 | 8 | requires_project = True 9 | 10 | def syntax(self): 11 | return '[options]' 12 | 13 | def short_desc(self): 14 | return 'Runs all of the spiders' 15 | 16 | def add_options(self, parser): 17 | ScrapyCommand.add_options(self, parser) 18 | parser.add_option("-a", dest="spargs", action="append", default=[], metavar="NAME=VALUE", 19 | help="set spider argument (may be repeated)") 20 | parser.add_option("-o", "--output", metavar="FILE", help="dump scraped items into FILE (use - for stdout)") 21 | parser.add_option("-t", "--output-format", metavar="FORMAT", help="format to use for dumping items with -o") 22 | 23 | def process_options(self, args, opts): 24 | ScrapyCommand.process_options(self, args, opts) 25 | # try: 26 | opts.spargs = arglist_to_dict(opts.spargs) 27 | # except ValueError: 28 | # raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False) 29 | 30 | def run(self, args, opts): 31 | # settings = get_project_settings() 32 | 33 | spider_loader = self.crawler_process.spider_loader 34 | for spidername in args or spider_loader.list(): 35 | print("*********cralall spidername************" + spidername) 36 | self.crawler_process.crawl(spidername, **opts.spargs) 37 | self.crawler_process.start() 38 | 
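Sections 1.2 and 2.1 of the README describe a master-side script that builds URLs and pushes them into a per-category Redis list for the scrapy-redis slaves to consume, but no such script ships with this repository (the spiders included here are still plain `scrapy.Spider` subclasses that take `start_urls` from `urlSettings`, not `RedisSpider`s). Below is a minimal sketch of that idea; the Redis host, the list key `UrlSpider_JFSS:start_urls`, and the URL template are illustrative assumptions, not values taken from the project.

```python
# feed_urls.py - hypothetical master-side feeder, not part of this repository.
# Pushes start URLs into a Redis list that a scrapy-redis slave would poll.
import redis

REDIS_HOST = '127.0.0.1'                              # master node running Redis (assumed)
REDIS_KEY = 'UrlSpider_JFSS:start_urls'               # one list per spider/category (assumed key name)
URL_TEMPLATE = 'http://www.example.com/list?page=%d'  # placeholder listing-page pattern


def feed(first_page=1, last_page=10):
    r = redis.StrictRedis(host=REDIS_HOST, port=6379, db=0)
    for page in range(first_page, last_page + 1):
        # build one listing URL per page and push it onto the queue
        r.lpush(REDIS_KEY, URL_TEMPLATE % page)
    print('pushed %d urls to %s' % (last_page - first_page + 1, REDIS_KEY))


if __name__ == '__main__':
    feed()
```

Run periodically from crontab on the master, this is the "generate URLs on a schedule" step of §2.1; the slaves keep running because scrapy-redis keeps polling the list instead of closing the spider when it goes idle.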
-------------------------------------------------------------------------------- /commonUtils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | import datetime 4 | from hashlib import md5 5 | 6 | 7 | # 获取随机发帖ID 8 | def get_random_user(user_str): 9 | user_list = [] 10 | for user_id in str(user_str).split(','): 11 | user_list.append(user_id) 12 | userid_idx = random.randint(1, len(user_list)) 13 | user_chooesd = user_list[userid_idx-1] 14 | return user_chooesd 15 | 16 | 17 | # 获取MD5加密URL 18 | def get_linkmd5id(url): 19 | # url进行md5处理,为避免重复采集设计 20 | md5_url = md5(url.encode("utf8")).hexdigest() 21 | return md5_url 22 | 23 | 24 | # get unix time stamp 25 | def get_time_stamp(): 26 | create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 27 | time_array = time.strptime(create_time, "%Y-%m-%d %H:%M:%S") 28 | time_stamp = int(time.mktime(time_array)) 29 | return time_stamp 30 | 31 | -------------------------------------------------------------------------------- /ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-06-28T00:22:35.372Z] GhostDriver - Main - running on port 9643 2 | [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":false,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","webSecurityEnabled":true} 3 | [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - page.customHeaders: - {} 4 | [INFO - 2017-06-28T00:22:38.400Z] Session [e424dd60-5b97-11e7-a0fa-fbfe1e4d560f] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","phantomjs.page.settings.loadImages":false} 5 | [INFO - 2017-06-28T00:22:38.400Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: e424dd60-5b97-11e7-a0fa-fbfe1e4d560f 6 | [ERROR - 2017-06-28T00:22:38.410Z] RouterReqHand - _handle.error - {"name":"Missing Command Parameter","message":"{\"headers\":{\"Accept\":\"application/json\",\"Accept-Encoding\":\"identity\",\"Connection\":\"close\",\"Content-Length\":\"73\",\"Content-Type\":\"application/json;charset=UTF-8\",\"Host\":\"127.0.0.1:9643\",\"User-Agent\":\"Python http auth\"},\"httpVersion\":\"1.1\",\"method\":\"POST\",\"post\":\"{\\\"sessionId\\\": \\\"e424dd60-5b97-11e7-a0fa-fbfe1e4d560f\\\", \\\"pageLoad\\\": 
180000}\",\"url\":\"/timeouts\",\"urlParsed\":{\"anchor\":\"\",\"query\":\"\",\"file\":\"timeouts\",\"directory\":\"/\",\"path\":\"/timeouts\",\"relative\":\"/timeouts\",\"port\":\"\",\"host\":\"\",\"password\":\"\",\"user\":\"\",\"userInfo\":\"\",\"authority\":\"\",\"protocol\":\"\",\"source\":\"/timeouts\",\"queryKey\":{},\"chunks\":[\"timeouts\"]},\"urlOriginal\":\"/session/e424dd60-5b97-11e7-a0fa-fbfe1e4d560f/timeouts\"}","line":546,"sourceURL":"phantomjs://code/session_request_handler.js","stack":"_postTimeout@phantomjs://code/session_request_handler.js:546:73\n_handle@phantomjs://code/session_request_handler.js:148:25\n_reroute@phantomjs://code/request_handler.js:61:20\n_handle@phantomjs://code/router_request_handler.js:78:46"} 7 | 8 | phantomjs://platform/console++.js:263 in error 9 | [INFO - 2017-06-28T00:27:35.412Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 10 | [INFO - 2017-06-28T00:32:35.411Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 11 | [INFO - 2017-06-28T00:37:35.416Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 12 | [INFO - 2017-06-28T00:42:35.418Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 13 | [INFO - 2017-06-28T00:47:35.418Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 14 | [INFO - 2017-06-28T00:52:35.423Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 15 | [INFO - 2017-06-28T00:57:35.423Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 16 | [INFO - 2017-06-28T01:02:35.427Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 17 | [INFO - 2017-06-28T01:07:35.431Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 18 | [INFO - 2017-06-28T01:12:35.470Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 19 | [INFO - 2017-06-28T01:17:35.469Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 20 | [INFO - 2017-06-28T01:22:35.469Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 21 | [INFO - 2017-06-28T01:27:35.477Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 22 | essSessions - Asynchronous Sessions clean-up phase starting NOW 23 | [INFO - 2017-06-28T01:29:06.882Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 24 | 2017-06-28T01:18:20.002Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 25 | [INFO - 2017-06-28T01:23:20.005Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 26 | [INFO - 2017-06-28T01:28:20.013Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 27 | 2017-06-28T01:18:06.690Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 28 | [INFO - 2017-06-28T01:23:06.726Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 29 | 
[INFO - 2017-06-28T01:28:06.738Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW 30 | -------------------------------------------------------------------------------- /items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | 9 | import scrapy 10 | 11 | 12 | class DgspiderUrlItem(scrapy.Item): 13 | url = scrapy.Field() 14 | 15 | 16 | class DgspiderPostItem(scrapy.Item): 17 | url = scrapy.Field() 18 | title = scrapy.Field() 19 | text = scrapy.Field() -------------------------------------------------------------------------------- /middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DgspiderphantomjsSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
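        # NOTE: the template methods above (process_spider_input, process_spider_output,
        # process_spider_exception) and this process_start_requests are all missing the
        # `self` parameter. That is harmless for now because this spider middleware is
        # never enabled (SPIDER_MIDDLEWARES stays commented out in settings.py), but
        # `self` must be added back before registering the class there.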
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /middlewares/middleware.py: -------------------------------------------------------------------------------- 1 | # douguo request middleware 2 | # for the page which loaded by js/ajax 3 | # ang changes should be recored here: 4 | # 5 | # @author zhangjianfei 6 | # @date 2017/05/04 7 | 8 | from selenium import webdriver 9 | from scrapy.http import HtmlResponse 10 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 11 | from DgSpiderPhantomJS import urlSettings 12 | import time 13 | import datetime 14 | import random 15 | import os 16 | import execjs 17 | import DgSpiderPhantomJS.settings as settings 18 | 19 | 20 | class JavaScriptMiddleware(object): 21 | 22 | def process_request(self, request, spider): 23 | 24 | print("LOGS: Spider name in middleware - " + spider.name) 25 | 26 | # 开启虚拟浏览器参数 27 | dcap = dict(DesiredCapabilities.PHANTOMJS) 28 | 29 | # 设置agents 30 | dcap["phantomjs.page.settings.userAgent"] = (random.choice(settings.USER_AGENTS)) 31 | 32 | # 禁止加载图片 33 | dcap["phantomjs.page.settings.loadImages"] = False 34 | 35 | driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe", desired_capabilities=dcap) 36 | 37 | # 由于phantomjs路径已经增添在path中,path可以不写 38 | # driver = webdriver.PhantomJS() 39 | 40 | # 利用firfox 41 | # driver = webdriver.Firefox(executable_path=r"D:\FireFoxBrowser\firefox.exe") 42 | 43 | # 利用chrome 44 | # chromedriver = "C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe" 45 | # os.environ["webdriver.chrome.driver"] = chromedriver 46 | # driver = webdriver.Chrome(chromedriver) 47 | 48 | # 模拟登陆 49 | # driver.find_element_by_class_name("input_id").send_keys("34563453") 50 | # driver.find_element_by_class_name("input_pwd").send_keys("zjf%#¥&") 51 | # driver.find_element_by_class_name("btn btn_lightgreen btn_login").click() 52 | # driver.implicitly_wait(15) 53 | # time.sleep(10) 54 | 55 | # 模拟用户下拉 56 | # js1 = 'return document.body.scrollHeight' 57 | # js2 = 'window.scrollTo(0, document.body.scrollHeight)' 58 | # js3 = "document.body.scrollTop=1000" 59 | # old_scroll_height = 0 60 | # while driver.execute_script(js1) > old_scroll_height: 61 | # old_scroll_height = driver.execute_script(js1) 62 | # driver.execute_script(js2) 63 | # time.sleep(3) 64 | 65 | # 设置20秒页面超时返回 66 | driver.set_page_load_timeout(180) 67 | # 设置20秒脚本超时时间 68 | driver.set_script_timeout(180) 69 | 70 | # get time stamp 71 | 72 | # get page screenshot 73 | # driver.save_screenshot("D:\p.jpg") 74 | 75 | # 模拟用户在同一个浏览器对象下刷新页面 76 | # the whole page source 77 | body = '' 78 | for i in range(50): 79 | print("SPider name: " + spider.name) 80 | # sleep in a random time for the ajax asynchronous request 81 | # time.sleep(random.randint(5, 6)) 82 | time.sleep(random.randint(300, 600)) 83 | 84 | print("LOGS: freshing page " + str(i) + "...") 85 | 86 | # get page request 87 | driver.get(request.url) 88 | 89 | # waiting for response 90 | driver.implicitly_wait(30) 91 | 92 | # get page resource 93 | body = body + driver.page_source 94 | 95 | return HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request) 96 | 97 | 98 | -------------------------------------------------------------------------------- /mysqlUtils.py: -------------------------------------------------------------------------------- 1 | import 
pymysql 2 | import pymysql.cursors 3 | import os 4 | 5 | 6 | def dbhandle_online(): 7 | host = '192.168.1.235' 8 | user = 'root' 9 | passwd = 'douguo2015' 10 | charset = 'utf8' 11 | conn = pymysql.connect( 12 | host=host, 13 | user=user, 14 | passwd=passwd, 15 | charset=charset, 16 | use_unicode=False 17 | ) 18 | return conn 19 | 20 | 21 | def dbhandle_local(): 22 | host = '192.168.1.235' 23 | user = 'root' 24 | passwd = 'douguo2015' 25 | charset = 'utf8' 26 | conn = pymysql.connect( 27 | host=host, 28 | user=user, 29 | passwd=passwd, 30 | charset=charset, 31 | use_unicode=True 32 | # use_unicode=False 33 | ) 34 | return conn 35 | 36 | 37 | def dbhandle_geturl(gid): 38 | host = '192.168.1.235' 39 | user = 'root' 40 | passwd = 'douguo2015' 41 | charset = 'utf8' 42 | conn = pymysql.connect( 43 | host=host, 44 | user=user, 45 | passwd=passwd, 46 | charset=charset, 47 | use_unicode=False 48 | ) 49 | cursor = conn.cursor() 50 | sql = 'select url,spider_name,site,gid,module from dg_spider.dg_spider_post where status=0 and gid=%s limit 1' % gid 51 | try: 52 | cursor.execute(sql) 53 | result = cursor.fetchone() 54 | conn.commit() 55 | except Exception as e: 56 | print("***** exception") 57 | print(e) 58 | conn.rollback() 59 | 60 | if result is None: 61 | os._exit(0) 62 | else: 63 | url = result[0] 64 | spider_name = result[1] 65 | site = result[2] 66 | gid = result[3] 67 | module = result[4] 68 | return url.decode(), spider_name.decode(), site.decode(), gid.decode(), module.decode() 69 | 70 | 71 | def dbhandle_insert_content(url, title, content, user_id, has_img): 72 | host = '192.168.1.235' 73 | user = 'root' 74 | passwd = 'douguo2015' 75 | charset = 'utf8' 76 | conn = pymysql.connect( 77 | host=host, 78 | user=user, 79 | passwd=passwd, 80 | charset=charset, 81 | use_unicode=False 82 | ) 83 | cur = conn.cursor() 84 | 85 | # 如果标题或者内容为空,那么程序将退出,篇文章将会作废并将status设置为1,爬虫继续向下运行获得新的URl 86 | if content.strip() == '' or title.strip() == '': 87 | sql_fail = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' % ('1', url) 88 | try: 89 | cur.execute(sql_fail) 90 | result = cur.fetchone() 91 | conn.commit() 92 | except Exception as e: 93 | print(e) 94 | conn.rollback() 95 | os._exit(0) 96 | 97 | sql = 'update dg_spider.dg_spider_post set title="%s",content="%s",user_id="%s",has_img="%s" where url="%s" ' \ 98 | % (title, content, user_id, has_img, url) 99 | 100 | try: 101 | cur.execute(sql) 102 | result = cur.fetchone() 103 | conn.commit() 104 | except Exception as e: 105 | print(e) 106 | conn.rollback() 107 | return result 108 | 109 | 110 | def dbhandle_update_status(url, status): 111 | host = '192.168.1.235' 112 | user = 'root' 113 | passwd = 'douguo2015' 114 | charset = 'utf8' 115 | conn = pymysql.connect( 116 | host=host, 117 | user=user, 118 | passwd=passwd, 119 | charset=charset, 120 | use_unicode=False 121 | ) 122 | cur = conn.cursor() 123 | sql = 'update dg_spider.dg_spider_post set status="%s" where url="%s" ' \ 124 | % (status, url) 125 | try: 126 | cur.execute(sql) 127 | result = cur.fetchone() 128 | conn.commit() 129 | except Exception as e: 130 | print(e) 131 | conn.rollback() 132 | return result 133 | 134 | 135 | def dbhandle_get_content(url): 136 | host = '192.168.1.235' 137 | user = 'root' 138 | passwd = 'douguo2015' 139 | charset = 'utf8' 140 | conn = pymysql.connect( 141 | host=host, 142 | user=user, 143 | passwd=passwd, 144 | charset=charset, 145 | use_unicode=False 146 | ) 147 | cursor = conn.cursor() 148 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post 
where status=1 and url="%s" limit 1' % url 149 | try: 150 | cursor.execute(sql) 151 | result = cursor.fetchone() 152 | conn.commit() 153 | except Exception as e: 154 | print("***** exception") 155 | print(e) 156 | conn.rollback() 157 | 158 | if result is None: 159 | os._exit(1) 160 | 161 | title = result[0] 162 | content = result[1] 163 | user_id = result[2] 164 | gid = result[3] 165 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 166 | 167 | 168 | # 获取爬虫初始化参数 169 | def dbhandle_get_spider_param(url): 170 | host = '192.168.1.235' 171 | user = 'root' 172 | passwd = 'douguo2015' 173 | charset = 'utf8' 174 | conn = pymysql.connect( 175 | host=host, 176 | user=user, 177 | passwd=passwd, 178 | charset=charset, 179 | use_unicode=False 180 | ) 181 | cursor = conn.cursor() 182 | sql = 'select title,content,user_id,gid from dg_spider.dg_spider_post where status=0 and url="%s" limit 1' % url 183 | result = '' 184 | try: 185 | cursor.execute(sql) 186 | result = cursor.fetchone() 187 | conn.commit() 188 | except Exception as e: 189 | print("***** exception") 190 | print(e) 191 | conn.rollback() 192 | title = result[0] 193 | content = result[1] 194 | user_id = result[2] 195 | gid = result[3] 196 | return title.decode(), content.decode(), user_id.decode(), gid.decode() 197 | -------------------------------------------------------------------------------- /notusedspiders/ContentSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy.selector import Selector 5 | 6 | from DgSpiderPhantomJS import urlSettings 7 | from DgSpiderPhantomJS.items import DgspiderPostItem 8 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl 9 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status 10 | from DgSpiderPhantomJS.notusedspiders import contentSettings 11 | 12 | 13 | class DgContentSpider(scrapy.Spider): 14 | print('>>> Spider DgContentPhantomJSSpider Staring ...') 15 | 16 | # get url from db 17 | result = dbhandle_geturl(urlSettings.GROUP_ID) 18 | url = result[0] 19 | spider_name = result[1] 20 | site = result[2] 21 | gid = result[3] 22 | module = result[4] 23 | 24 | # set spider name 25 | name = contentSettings.SPIDER_NAME 26 | # name = 'DgUrlSpiderPhantomJS' 27 | 28 | # set domains 29 | allowed_domains = [contentSettings.DOMAIN] 30 | 31 | # set scrapy url 32 | start_urls = [url] 33 | 34 | # change status 35 | """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" 36 | dbhandle_update_status(url, 1) 37 | 38 | # scrapy crawl 39 | def parse(self, response): 40 | 41 | # init the item 42 | item = DgspiderPostItem() 43 | 44 | # get the page source 45 | sel = Selector(response) 46 | 47 | print(sel) 48 | 49 | # get post title 50 | title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) 51 | item['title'] = title_date.xpath('string(.)').extract() 52 | 53 | # get post page source 54 | item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() 55 | 56 | # get url 57 | item['url'] = DgContentSpider.url 58 | 59 | yield item 60 | 61 | -------------------------------------------------------------------------------- /notusedspiders/ContentSpider_real.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy.selector import Selector 5 | 6 | from DgSpiderPhantomJS import urlSettings 7 | from DgSpiderPhantomJS.items import DgspiderPostItem 8 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl 9 | 
from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status 10 | from DgSpiderPhantomJS.notusedspiders import contentSettings 11 | 12 | 13 | class DgContentSpider(scrapy.Spider): 14 | print('LOGS: Spider DgContentPhantomSpider Staring ...') 15 | 16 | # get url from db 17 | result = dbhandle_geturl(urlSettings.GROUP_ID) 18 | url = result[0] 19 | spider_name = result[1] 20 | site = result[2] 21 | gid = result[3] 22 | module = result[4] 23 | 24 | # set spider name 25 | name = contentSettings.SPIDER_NAME 26 | # name = 'DgUrlSpiderPhantomJS' 27 | 28 | # set domains 29 | allowed_domains = [contentSettings.DOMAIN] 30 | 31 | # set scrapy url 32 | start_urls = [url] 33 | 34 | # change status 35 | """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" 36 | dbhandle_update_status(url, 1) 37 | 38 | # scrapy crawl 39 | def parse(self, response): 40 | 41 | # init the item 42 | item = DgspiderPostItem() 43 | 44 | # get the page source 45 | sel = Selector(response) 46 | 47 | print(sel) 48 | 49 | # get post title 50 | title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) 51 | item['title'] = title_date.xpath('string(.)').extract() 52 | 53 | # get post page source 54 | item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() 55 | 56 | # get url 57 | item['url'] = DgContentSpider.url 58 | 59 | yield item 60 | 61 | -------------------------------------------------------------------------------- /notusedspiders/DgContentSpider_PhantomJS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy.selector import Selector 5 | 6 | from DgSpiderPhantomJS import urlSettings 7 | from DgSpiderPhantomJS.items import DgspiderPostItem 8 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_geturl 9 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status 10 | from DgSpiderPhantomJS.notusedspiders import contentSettings 11 | 12 | 13 | class DgcontentspiderPhantomjsSpider(scrapy.Spider): 14 | print('>>> Spider DgContentPhantomJSSpider Staring ...') 15 | 16 | # get url from db 17 | result = dbhandle_geturl(urlSettings.GROUP_ID) 18 | url = result[0] 19 | spider_name = result[1] 20 | site = result[2] 21 | gid = result[3] 22 | module = result[4] 23 | 24 | # set spider name 25 | name = contentSettings.SPIDER_NAME 26 | # name = 'DgUrlSpiderPhantomJS' 27 | 28 | # set domains 29 | allowed_domains = [contentSettings.DOMAIN] 30 | 31 | # set scrapy url 32 | start_urls = [url] 33 | 34 | # change status 35 | """对于爬去网页,无论是否爬取成功都将设置status为1,避免死循环""" 36 | dbhandle_update_status(url, 1) 37 | 38 | # scrapy crawl 39 | def parse(self, response): 40 | 41 | # init the item 42 | item = DgspiderPostItem() 43 | 44 | # get the page source 45 | sel = Selector(response) 46 | 47 | print(sel) 48 | 49 | # get post title 50 | title_date = sel.xpath(contentSettings.POST_TITLE_XPATH) 51 | item['title'] = title_date.xpath('string(.)').extract() 52 | 53 | # get post page source 54 | item['text'] = sel.xpath(contentSettings.POST_CONTENT_XPATH).extract() 55 | 56 | # get url 57 | item['url'] = self.url 58 | 59 | yield item 60 | 61 | -------------------------------------------------------------------------------- /notusedspiders/DgUrlSpider_PhantomJS.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from DgSpiderPhantomJS.items import DgspiderUrlItem 5 | from scrapy.selector import Selector 6 | from DgSpiderPhantomJS import urlSettings 7 | 8 | 9 | class 
DgurlspiderPhantomjsSpider(scrapy.Spider): 10 | print('>>> Spider DgUrlPhantomJSSpider Staring ...') 11 | 12 | # set your spider name 13 | # name = urlSettings.SPIDER_NAME 14 | name = urlSettings.SPIDER_NAME 15 | 16 | # set your allowed domain 17 | allowed_domains = [urlSettings.DOMAIN] 18 | 19 | # set spider start url 20 | start_urls = [urlSettings.URL_START] 21 | 22 | # scrapy crawl 23 | def parse(self, response): 24 | 25 | # init the item 26 | item = DgspiderUrlItem() 27 | 28 | # get the page source 29 | sel = Selector(response) 30 | 31 | # page_source = self.page 32 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 33 | 34 | # if the url you got had some prefix, it will works, such as 'http://' 35 | url_item = [] 36 | for url in url_list: 37 | url = url.replace(urlSettings.URL_PREFIX, '') 38 | url_item.append(urlSettings.URL_PREFIX + url) 39 | 40 | # use set to del repeated urls 41 | url_item = list(set(url_item)) 42 | 43 | item['url'] = url_item 44 | 45 | yield item 46 | 47 | -------------------------------------------------------------------------------- /notusedspiders/PostHandle.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import json 4 | 5 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_get_content 6 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_update_status 7 | from DgSpiderPhantomJS.notusedspiders.uploadUtils import upload_post 8 | 9 | 10 | def post_handel(url): 11 | result = dbhandle_get_content(url) 12 | 13 | title = result[0] 14 | content = result[1] 15 | user_id = result[2] 16 | gid = result[3] 17 | cs = [] 18 | 19 | text_list = content.split('[dgimg]') 20 | for text_single in text_list: 21 | text_single_c = text_single.split('[/dgimg]') 22 | if len(text_single_c) == 1: 23 | cs_json = {"c": text_single_c[0], "i": '', "w": '', "h": ''} 24 | cs.append(cs_json) 25 | else: 26 | # tmp_img_upload_json = upload_img_result.pop() 27 | pic_flag = text_single_c[1] 28 | img_params = text_single_c[0].split(';') 29 | i = img_params[0] 30 | w = img_params[1] 31 | h = img_params[2] 32 | cs_json = {"c": pic_flag, "i": i, "w": w, "h": h} 33 | cs.append(cs_json) 34 | 35 | strcs = json.dumps(cs) 36 | json_data = {"apisign": "99ea3eda4b45549162c4a741d58baa60", 37 | "user_id": user_id, 38 | "gid": gid, 39 | "t": title, 40 | "cs": strcs} 41 | # 上传帖子 42 | result_uploadpost = upload_post(json_data) 43 | 44 | # 更新状态2,成功上传帖子 45 | result_updateresult = dbhandle_update_status(url, 2) 46 | # 47 | # if __name__ == '__main__': 48 | # post_handel('http://www.mama.cn/baby/art/20140523/773474.html') 49 | -------------------------------------------------------------------------------- /notusedspiders/UrlSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | from scrapy.selector import Selector 5 | 6 | from DgSpiderPhantomJS import urlSettings 7 | from DgSpiderPhantomJS.items import DgspiderUrlItem 8 | from DgSpiderPhantomJS.notusedspiders import contentSettings 9 | 10 | 11 | class DgUrlSpider(scrapy.Spider): 12 | 13 | print('LOGS: Spider DgUrlPhantomSpider Staring ...') 14 | 15 | # set your spider name 16 | name = contentSettings.SPIDER_NAME 17 | 18 | # set your allowed domain 19 | allowed_domains = [urlSettings.DOMAIN] 20 | 21 | # set spider start url 22 | start_urls = [urlSettings.URL_START_JFSS] 23 | 24 | # scrapy crawl 25 | def parse(self, response): 26 | 27 | # init the item 28 | item = DgspiderUrlItem() 29 | 
30 | # get the page source 31 | sel = Selector(response) 32 | 33 | # page_source = self.page 34 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 35 | 36 | # if the url you got had some prefix, it will works, such as 'http://' 37 | url_item = [] 38 | for url in url_list: 39 | url = url.replace(urlSettings.URL_PREFIX, '') 40 | url_item.append(urlSettings.URL_PREFIX + url) 41 | 42 | # use set to del repeated urls 43 | url_item = list(set(url_item)) 44 | 45 | item['url'] = url_item 46 | 47 | # transer item to pipeline 48 | yield item 49 | 50 | # for i in range(5): 51 | # yield Request(self.start_urls[0], callback=self.parse) 52 | -------------------------------------------------------------------------------- /notusedspiders/check_post.py: -------------------------------------------------------------------------------- 1 | import requests, re 2 | import http 3 | import urllib 4 | 5 | # 圈圈:孕妈育儿 4 6 | # 圈圈:减肥瘦身 33 7 | # 圈圈:情感生活 30 8 | 9 | 10 | def checkPost(): 11 | # CREATE_POST_URL = "http://api.qa.douguo.net/robot/handlePost" 12 | CREATE_POST_URL = "http://api.douguo.net/robot/handlePost" 13 | 14 | fields={'group_id': '35', 15 | 'type': 1, 16 | 'apisign':'99ea3eda4b45549162c4a741d58baa60'} 17 | 18 | r = requests.post(CREATE_POST_URL, data=fields) 19 | 20 | print(r.json()) 21 | 22 | 23 | if __name__ == '__main__': 24 | #for i in range(1,50): 25 | #checkPost() 26 | checkPost() 27 | # print(i), 28 | #print(testText('aaaa\001')) -------------------------------------------------------------------------------- /notusedspiders/contentSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for DgSpider project 4 | 5 | # 图片储存 6 | IMAGES_STORE = 'D:\\pics\\jfss\\' 7 | 8 | # 爬取域名 9 | DOMAIN = 'toutiao.com' 10 | 11 | # 图片域名前缀 12 | DOMAIN_HTTP = "http:" 13 | 14 | # 随机发帖用户 15 | CREATE_POST_USER = '37619,18441390,18441391,18441392,18441393,18441394,18441395,18441396,18441397,18441398,18441399,'\ 16 | '18441400,18441401,18441402,18441403,18441404, 18441405,18441406,18441407,18441408,18441409,' \ 17 | '18441410,18441411,18441412,18441413,18441414,18441415,18441416,18441417,18441418,18441419,' \ 18 | '18441420,18441421,18441422,18441423,18441424,18441425,18441426,18441427,18441428,18441429,' \ 19 | '18441430,18441431,18441432,18441433,18441434,18441435,18441436,18441437,18441438,18441439,' \ 20 | '18441440,18441441,18441442,18441443,18441444,18441445,18441446,18441447,18441448,18441449,' \ 21 | '18441450,18441451,18441452,18441453,18441454,18441455,18441456,18441457,18441458,18441460,' \ 22 | '18441461,18441462,18441463,18441464,18441465,18441466,18441467,18441468,18441469,18441470,' \ 23 | '18441471,18441472,18441473,18441474,18441475,18441476,18441477,18441478,18441479,18441481,' \ 24 | '18441482,18441483,18441484,18441485,18441486,18441487,18441488,18441489,18441490' 25 | 26 | # 爬虫名 27 | SPIDER_NAME = 'DgContentSpider_PhantomJS' 28 | 29 | # 文章URL爬取规则XPATH 30 | POST_TITLE_XPATH = '//h1[@class="article-title"]' 31 | POST_CONTENT_XPATH = '//div[@class="article-content"]' 32 | 33 | -------------------------------------------------------------------------------- /notusedspiders/params.js: -------------------------------------------------------------------------------- 1 | function getParam(){ 2 | var asas; 3 | var cpcp; 4 | var t = Math.floor((new Date).getTime() / 1e3) 5 | , e = t.toString(16).toUpperCase() 6 | , i = md5(t).toString().toUpperCase(); 7 | if (8 != e.length){ 8 | asas = 
"479BB4B7254C150"; 9 | cpcp = "7E0AC8874BB0985"; 10 | }else{ 11 | for (var n = i.slice(0, 5), o = i.slice(-5), a = "", s = 0; 5 > s; s++){ 12 | a += n[s] + e[s]; 13 | } 14 | for (var r = "", c = 0; 5 > c; c++){ 15 | r += e[c + 3] + o[c]; 16 | } 17 | asas = "A1" + a + e.slice(-3); 18 | cpcp= e.slice(0, 3) + r + "E1"; 19 | } 20 | return '{"as":"'+asas+'","cp":"'+cpcp+'"}'; 21 | } 22 | !function(e) { 23 | "use strict"; 24 | function t(e, t) { 25 | var n = (65535 & e) + (65535 & t) 26 | , r = (e >> 16) + (t >> 16) + (n >> 16); 27 | return r << 16 | 65535 & n 28 | } 29 | function n(e, t) { 30 | return e << t | e >>> 32 - t 31 | } 32 | function r(e, r, o, i, a, u) { 33 | return t(n(t(t(r, e), t(i, u)), a), o) 34 | } 35 | function o(e, t, n, o, i, a, u) { 36 | return r(t & n | ~t & o, e, t, i, a, u) 37 | } 38 | function i(e, t, n, o, i, a, u) { 39 | return r(t & o | n & ~o, e, t, i, a, u) 40 | } 41 | function a(e, t, n, o, i, a, u) { 42 | return r(t ^ n ^ o, e, t, i, a, u) 43 | } 44 | function u(e, t, n, o, i, a, u) { 45 | return r(n ^ (t | ~o), e, t, i, a, u) 46 | } 47 | function s(e, n) { 48 | e[n >> 5] |= 128 << n % 32, 49 | e[(n + 64 >>> 9 << 4) + 14] = n; 50 | var r, s, c, l, f, p = 1732584193, d = -271733879, h = -1732584194, m = 271733878; 51 | for (r = 0; r < e.length; r += 16) 52 | s = p, 53 | c = d, 54 | l = h, 55 | f = m, 56 | p = o(p, d, h, m, e[r], 7, -680876936), 57 | m = o(m, p, d, h, e[r + 1], 12, -389564586), 58 | h = o(h, m, p, d, e[r + 2], 17, 606105819), 59 | d = o(d, h, m, p, e[r + 3], 22, -1044525330), 60 | p = o(p, d, h, m, e[r + 4], 7, -176418897), 61 | m = o(m, p, d, h, e[r + 5], 12, 1200080426), 62 | h = o(h, m, p, d, e[r + 6], 17, -1473231341), 63 | d = o(d, h, m, p, e[r + 7], 22, -45705983), 64 | p = o(p, d, h, m, e[r + 8], 7, 1770035416), 65 | m = o(m, p, d, h, e[r + 9], 12, -1958414417), 66 | h = o(h, m, p, d, e[r + 10], 17, -42063), 67 | d = o(d, h, m, p, e[r + 11], 22, -1990404162), 68 | p = o(p, d, h, m, e[r + 12], 7, 1804603682), 69 | m = o(m, p, d, h, e[r + 13], 12, -40341101), 70 | h = o(h, m, p, d, e[r + 14], 17, -1502002290), 71 | d = o(d, h, m, p, e[r + 15], 22, 1236535329), 72 | p = i(p, d, h, m, e[r + 1], 5, -165796510), 73 | m = i(m, p, d, h, e[r + 6], 9, -1069501632), 74 | h = i(h, m, p, d, e[r + 11], 14, 643717713), 75 | d = i(d, h, m, p, e[r], 20, -373897302), 76 | p = i(p, d, h, m, e[r + 5], 5, -701558691), 77 | m = i(m, p, d, h, e[r + 10], 9, 38016083), 78 | h = i(h, m, p, d, e[r + 15], 14, -660478335), 79 | d = i(d, h, m, p, e[r + 4], 20, -405537848), 80 | p = i(p, d, h, m, e[r + 9], 5, 568446438), 81 | m = i(m, p, d, h, e[r + 14], 9, -1019803690), 82 | h = i(h, m, p, d, e[r + 3], 14, -187363961), 83 | d = i(d, h, m, p, e[r + 8], 20, 1163531501), 84 | p = i(p, d, h, m, e[r + 13], 5, -1444681467), 85 | m = i(m, p, d, h, e[r + 2], 9, -51403784), 86 | h = i(h, m, p, d, e[r + 7], 14, 1735328473), 87 | d = i(d, h, m, p, e[r + 12], 20, -1926607734), 88 | p = a(p, d, h, m, e[r + 5], 4, -378558), 89 | m = a(m, p, d, h, e[r + 8], 11, -2022574463), 90 | h = a(h, m, p, d, e[r + 11], 16, 1839030562), 91 | d = a(d, h, m, p, e[r + 14], 23, -35309556), 92 | p = a(p, d, h, m, e[r + 1], 4, -1530992060), 93 | m = a(m, p, d, h, e[r + 4], 11, 1272893353), 94 | h = a(h, m, p, d, e[r + 7], 16, -155497632), 95 | d = a(d, h, m, p, e[r + 10], 23, -1094730640), 96 | p = a(p, d, h, m, e[r + 13], 4, 681279174), 97 | m = a(m, p, d, h, e[r], 11, -358537222), 98 | h = a(h, m, p, d, e[r + 3], 16, -722521979), 99 | d = a(d, h, m, p, e[r + 6], 23, 76029189), 100 | p = a(p, d, h, m, 
e[r + 9], 4, -640364487), 101 | m = a(m, p, d, h, e[r + 12], 11, -421815835), 102 | h = a(h, m, p, d, e[r + 15], 16, 530742520), 103 | d = a(d, h, m, p, e[r + 2], 23, -995338651), 104 | p = u(p, d, h, m, e[r], 6, -198630844), 105 | m = u(m, p, d, h, e[r + 7], 10, 1126891415), 106 | h = u(h, m, p, d, e[r + 14], 15, -1416354905), 107 | d = u(d, h, m, p, e[r + 5], 21, -57434055), 108 | p = u(p, d, h, m, e[r + 12], 6, 1700485571), 109 | m = u(m, p, d, h, e[r + 3], 10, -1894986606), 110 | h = u(h, m, p, d, e[r + 10], 15, -1051523), 111 | d = u(d, h, m, p, e[r + 1], 21, -2054922799), 112 | p = u(p, d, h, m, e[r + 8], 6, 1873313359), 113 | m = u(m, p, d, h, e[r + 15], 10, -30611744), 114 | h = u(h, m, p, d, e[r + 6], 15, -1560198380), 115 | d = u(d, h, m, p, e[r + 13], 21, 1309151649), 116 | p = u(p, d, h, m, e[r + 4], 6, -145523070), 117 | m = u(m, p, d, h, e[r + 11], 10, -1120210379), 118 | h = u(h, m, p, d, e[r + 2], 15, 718787259), 119 | d = u(d, h, m, p, e[r + 9], 21, -343485551), 120 | p = t(p, s), 121 | d = t(d, c), 122 | h = t(h, l), 123 | m = t(m, f); 124 | return [p, d, h, m] 125 | } 126 | function c(e) { 127 | var t, n = ""; 128 | for (t = 0; t < 32 * e.length; t += 8) 129 | n += String.fromCharCode(e[t >> 5] >>> t % 32 & 255); 130 | return n 131 | } 132 | function l(e) { 133 | var t, n = []; 134 | for (n[(e.length >> 2) - 1] = void 0, 135 | t = 0; t < n.length; t += 1) 136 | n[t] = 0; 137 | for (t = 0; t < 8 * e.length; t += 8) 138 | n[t >> 5] |= (255 & e.charCodeAt(t / 8)) << t % 32; 139 | return n 140 | } 141 | function f(e) { 142 | return c(s(l(e), 8 * e.length)) 143 | } 144 | function p(e, t) { 145 | var n, r, o = l(e), i = [], a = []; 146 | for (i[15] = a[15] = void 0, 147 | o.length > 16 && (o = s(o, 8 * e.length)), 148 | n = 0; 16 > n; n += 1) 149 | i[n] = 909522486 ^ o[n], 150 | a[n] = 1549556828 ^ o[n]; 151 | return r = s(i.concat(l(t)), 512 + 8 * t.length), 152 | c(s(a.concat(r), 640)) 153 | } 154 | function d(e) { 155 | var t, n, r = "0123456789abcdef", o = ""; 156 | for (n = 0; n < e.length; n += 1) 157 | t = e.charCodeAt(n), 158 | o += r.charAt(t >>> 4 & 15) + r.charAt(15 & t); 159 | return o 160 | } 161 | function h(e) { 162 | return unescape(encodeURIComponent(e)) 163 | } 164 | function m(e) { 165 | return f(h(e)) 166 | } 167 | function g(e) { 168 | return d(m(e)) 169 | } 170 | function v(e, t) { 171 | return p(h(e), h(t)) 172 | } 173 | function y(e, t) { 174 | return d(v(e, t)) 175 | } 176 | function b(e, t, n) { 177 | return t ? n ? v(t, e) : y(t, e) : n ? m(e) : g(e) 178 | } 179 | "function" == typeof define && define.amd ? define("static/js/lib/md5", ["require"], function() { 180 | return b 181 | }) : "object" == typeof module && module.exports ? 
module.exports = b : e.md5 = b 182 | }(this) -------------------------------------------------------------------------------- /notusedspiders/uploadUtils.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests_toolbelt.multipart.encoder import MultipartEncoder 3 | 4 | 5 | def upload_post(json_data): 6 | # 上传帖子 ,参考:http://192.168.2.25:3000/api/interface/2016 7 | # create_post_url = "http://api.qa.douguo.net/robot/uploadimagespost" 8 | create_post_url = "http://api.douguo.net/robot/uploadimagespost" 9 | 10 | # 传帖子 11 | # dataJson = json.dumps({"user_id":"19013245","gid":30,"t":"2017-03-23","cs":[{"c":"啦啦啦","i":"","w":0,"h":0}, 12 | # {"c":"啦啦啦2222","i":"http://wwww.douguo.com/abc.jpg","w":0,"h":0}],"time":1235235234}) 13 | # jsonData = {"user_id":"19013245","gid":5,"t":"TEST","cs":'[{"c":"啊啊啊","i":"qqq","w":12,"h":10}, 14 | # {"c":"这个内容真不错","i":"http://wwww.baidu.com","w":10,"h":10}]',"time":61411313} 15 | 16 | # print(jsonData) 17 | req_post = requests.post(create_post_url, data=json_data) 18 | print(req_post.json()) 19 | # print(reqPost.text) 20 | 21 | 22 | def uploadImage(img_path, content_type, user_id): 23 | # 上传单个图片 , 参考:http://192.168.2.25:3000/api/interface/2015 24 | # UPLOAD_IMG_URL = "http://api.qa.douguo.net/robot/uploadpostimage" 25 | UPLOAD_IMG_URL = "http://api.douguo.net/robot/uploadpostimage" 26 | # 传图片 27 | 28 | m = MultipartEncoder( 29 | # fields={'user_id': '192323', 30 | # 'images': ('filename', open(imgPath, 'rb'), 'image/JPEG')} 31 | fields={'user_id': user_id, 32 | 'apisign': '99ea3eda4b45549162c4a741d58baa60', 33 | 'image': ('filename', open(img_path, 'rb'), 'image/jpeg')} 34 | ) 35 | 36 | r = requests.post(UPLOAD_IMG_URL, data=m, headers={'Content-Type': m.content_type}) 37 | print(r.json()) 38 | # print(r.text) 39 | return r.json() 40 | # return r.text -------------------------------------------------------------------------------- /notusedspiders/utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import datetime 3 | 4 | 5 | -------------------------------------------------------------------------------- /pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import datetime 9 | from DgSpiderPhantomJS import urlSettings 10 | from DgSpiderPhantomJS.mysqlUtils import dbhandle_online 11 | from DgSpiderPhantomJS.commonUtils import get_linkmd5id 12 | 13 | 14 | class DgspiderphantomjsPipeline(object): 15 | 16 | def __init__(self): 17 | pass 18 | 19 | # process the data 20 | def process_item(self, item, spider): 21 | 22 | # get mysql connettion 23 | db_object = dbhandle_online() 24 | cursor = db_object.cursor() 25 | 26 | print(">>>>> Spider name :") 27 | print(spider.name) 28 | 29 | for url in item['url']: 30 | linkmd5id = get_linkmd5id(url) 31 | 32 | if spider.name == urlSettings.SPIDER_JFSS: 33 | spider_name = urlSettings.SPIDER_JFSS 34 | gid = urlSettings.GROUP_ID_JFSS 35 | elif spider.name == urlSettings.SPIDER_MSZT: 36 | spider_name = urlSettings.SPIDER_MSZT 37 | gid = urlSettings.GROUP_ID_MSZT 38 | elif spider.name == urlSettings.SPIDER_SYDW: 39 | spider_name = urlSettings.SPIDER_SYDW 40 | gid = urlSettings.GROUP_ID_SYDW 41 | elif spider.name == urlSettings.SPIDER_YLBG: 42 | 
spider_name = urlSettings.SPIDER_YLBG 43 | gid = urlSettings.GROUP_ID_YLBG 44 | elif spider.name == urlSettings.SPIDER_YMYE: 45 | spider_name = urlSettings.SPIDER_YMYE 46 | gid = urlSettings.GROUP_ID_YMYE 47 | 48 | module = urlSettings.MODULE 49 | site = urlSettings.DOMAIN 50 | create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 51 | status = '0' 52 | sql_search = 'select md5_url from dg_spider.dg_spider_post where md5_url="%s"' % linkmd5id 53 | sql = 'insert into dg_spider.dg_spider_post(md5_url, url, spider_name, site, gid, module, status, ' \ 54 | 'create_time) ' \ 55 | 'values("%s", "%s", "%s", "%s", "%s", "%s", "%s", "%s")' \ 56 | % (linkmd5id, url, spider_name, site, gid, module, status, create_time) 57 | try: 58 | # if url is not existed, then insert 59 | cursor.execute(sql_search) 60 | result_search = cursor.fetchone() 61 | if result_search is None or result_search[0].strip() == '': 62 | cursor.execute(sql) 63 | result = cursor.fetchone() 64 | db_object.commit() 65 | except Exception as e: 66 | print("Waring!: catch exception !") 67 | print(e) 68 | db_object.rollback() 69 | 70 | return item 71 | 72 | # spider开启时被调用 73 | def open_spider(self, spider): 74 | pass 75 | 76 | # sipder 关闭时被调用 77 | def close_spider(self, spider): 78 | pass 79 | -------------------------------------------------------------------------------- /settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dg-spider-phantomJS project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dg-spider-phantomJS' 13 | 14 | SPIDER_MODULES = ['dg-spider-phantomJS.spiders'] 15 | NEWSPIDER_MODULE = 'dg-spider-phantomJS.spiders' 16 | 17 | # 注册PIPELINES 18 | ITEM_PIPELINES = { 19 | 'dg-spider-phantomJS.pipelines.DgspiderphantomjsPipeline': 544 20 | } 21 | 22 | DOWNLOADER_MIDDLEWARES = { 23 | 'dg-spider-phantomJS.middlewares.middleware.JavaScriptMiddleware': 543, # 键为中间件类的路径,值为中间件的顺序 24 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, # 禁止内置的中间件 25 | } 26 | 27 | USER_AGENTS = [ 28 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 29 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 31 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 32 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 33 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 34 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 35 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5" 36 | ] 37 | 38 | COMMANDS_MODULE = 'dg-spider-phantomJS.commands' 39 | # 40 | 
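Section 4.1 of the README recommends rotating the User-Agent on every request by overriding a Scrapy middleware. In this project the USER_AGENTS list above is only consumed inside JavaScriptMiddleware when it builds the PhantomJS capabilities; for plain (non-PhantomJS) requests a separate downloader middleware would be needed. A minimal sketch follows — the class and file name are hypothetical and do not exist in this repo:

```python
# randomua.py - hypothetical downloader middleware, not included in this repository.
import random


class RandomUserAgentMiddleware(object):
    """Pick a random User-Agent from the USER_AGENTS setting for every request."""

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # reuse the USER_AGENTS list already defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENTS'))

    def process_request(self, request, spider):
        if self.user_agents:
            request.headers['User-Agent'] = random.choice(self.user_agents)
        return None  # let the request continue through the middleware chain
```

It would be registered in DOWNLOADER_MIDDLEWARES alongside (or instead of) the PhantomJS middleware, with the built-in UserAgentMiddleware left disabled as it already is above.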
41 | 42 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 43 | #USER_AGENT = 'DgSpiderPhantomJS (+http://www.yourdomain.com)' 44 | 45 | # Obey robots.txt rules 46 | # ROBOTSTXT_OBEY = True 47 | 48 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 49 | #CONCURRENT_REQUESTS = 32 50 | 51 | # Configure a delay for requests for the same website (default: 0) 52 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 53 | # See also autothrottle settings and docs 54 | # 设置下载延迟 55 | # DOWNLOAD_DELAY = 3 56 | 57 | # The download delay setting will honor only one of: 58 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 59 | #CONCURRENT_REQUESTS_PER_IP = 16 60 | 61 | # Disable cookies (enabled by default) 62 | COOKIES_ENABLED = True 63 | 64 | # Disable Telnet Console (enabled by default) 65 | #TELNETCONSOLE_ENABLED = False 66 | 67 | # Override the default request headers: 68 | #DEFAULT_REQUEST_HEADERS = { 69 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 70 | # 'Accept-Language': 'en', 71 | #} 72 | 73 | # Enable or disable spider middlewares 74 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 75 | #SPIDER_MIDDLEWARES = { 76 | # 'dg-spider-phantomJS.middlewares.DgspiderphantomjsSpiderMiddleware': 543, 77 | #} 78 | 79 | # Enable or disable downloader middlewares 80 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 81 | #DOWNLOADER_MIDDLEWARES = { 82 | # 'dg-spider-phantomJS.middlewares.MyCustomDownloaderMiddleware': 543, 83 | #} 84 | 85 | # Enable or disable extensions 86 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 87 | #EXTENSIONS = { 88 | # 'scrapy.extensions.telnet.TelnetConsole': None, 89 | #} 90 | 91 | # Configure item pipelines 92 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 93 | #ITEM_PIPELINES = { 94 | # 'dg-spider-phantomJS.pipelines.DgspiderphantomjsPipeline': 300, 95 | #} 96 | 97 | # Enable and configure the AutoThrottle extension (disabled by default) 98 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 99 | #AUTOTHROTTLE_ENABLED = True 100 | # The initial download delay 101 | #AUTOTHROTTLE_START_DELAY = 5 102 | # The maximum download delay to be set in case of high latencies 103 | #AUTOTHROTTLE_MAX_DELAY = 60 104 | # The average number of requests Scrapy should be sending in parallel to 105 | # each remote server 106 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 107 | # Enable showing throttling stats for every response received: 108 | #AUTOTHROTTLE_DEBUG = False 109 | 110 | # Enable and configure HTTP caching (disabled by default) 111 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 112 | #HTTPCACHE_ENABLED = True 113 | #HTTPCACHE_EXPIRATION_SECS = 0 114 | #HTTPCACHE_DIR = 'httpcache' 115 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 116 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 117 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='scrapy-mymodule', 4 | entry_points={ 5 | 'scrapy.commands': [ 6 | 'crawlall=cnblogs.commands:crawlall', 7 | ], 8 | }, 9 | ) 10 | -------------------------------------------------------------------------------- /spiders/UrlSpider_JFSH.py: 
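A note on the two files above: settings.py registers the project as 'dg-spider-phantomJS.spiders' / 'dg-spider-phantomJS.commands', but hyphens are not legal in Python import paths, and every spider and middleware in this repo imports the package as DgSpiderPhantomJS; setup.py's entry point likewise points at 'cnblogs.commands', which looks like a leftover from the example it was copied from. Assuming the package directory really is DgSpiderPhantomJS (as the imports suggest), the working configuration would presumably look like this:

```python
# settings.py (relevant lines) - assuming the package is importable as DgSpiderPhantomJS
BOT_NAME = 'DgSpiderPhantomJS'
SPIDER_MODULES = ['DgSpiderPhantomJS.spiders']
NEWSPIDER_MODULE = 'DgSpiderPhantomJS.spiders'
COMMANDS_MODULE = 'DgSpiderPhantomJS.commands'

ITEM_PIPELINES = {
    'DgSpiderPhantomJS.pipelines.DgspiderphantomjsPipeline': 544,
}

DOWNLOADER_MIDDLEWARES = {
    'DgSpiderPhantomJS.middlewares.middleware.JavaScriptMiddleware': 543,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
}

# setup.py entry point for the custom crawlall command would then be:
# 'scrapy.commands': ['crawlall = DgSpiderPhantomJS.commands:crawlall'],
```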
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from DgSpiderPhantomJS.items import DgspiderUrlItem 4 | from scrapy.selector import Selector 5 | from DgSpiderPhantomJS import urlSettings 6 | 7 | 8 | class UrlspiderJfshSpider(scrapy.Spider): 9 | 10 | name = "UrlSpider_JFSS" 11 | 12 | # set your allowed domain 13 | allowed_domains = [urlSettings.DOMAIN] 14 | 15 | # set spider start url 16 | start_urls = [urlSettings.URL_START_JFSS] 17 | 18 | # scrapy parse callback 19 | def parse(self, response): 20 | print("LOGS: Starting spider JFSS ...") 21 | 22 | # init the item 23 | item = DgspiderUrlItem() 24 | 25 | # get the page source 26 | sel = Selector(response) 27 | 28 | # page_source = self.page 29 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 30 | 31 | # normalize the prefix (e.g. 'http://') on every extracted url 32 | url_item = [] 33 | for url in url_list: 34 | url = url.replace(urlSettings.URL_PREFIX, '') 35 | url_item.append(urlSettings.URL_PREFIX + url) 36 | 37 | # use a set to drop duplicate urls 38 | url_item = list(set(url_item)) 39 | 40 | item['url'] = url_item 41 | 42 | # hand the item over to the pipeline 43 | yield item 44 | -------------------------------------------------------------------------------- /spiders/UrlSpider_MSZT.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | 5 | from DgSpiderPhantomJS import urlSettings 6 | from DgSpiderPhantomJS.items import DgspiderUrlItem 7 | 8 | 9 | class UrlspiderMsztSpider(scrapy.Spider): 10 | 11 | name = "UrlSpider_MSZT" 12 | 13 | # set your allowed domain 14 | allowed_domains = [urlSettings.DOMAIN] 15 | 16 | # set spider start url 17 | start_urls = [urlSettings.URL_START_MSZT] 18 | 19 | # scrapy parse callback 20 | def parse(self, response): 21 | print("LOGS: Starting spider MSZT ...") 22 | 23 | # init the item 24 | item = DgspiderUrlItem() 25 | 26 | # get the page source 27 | sel = Selector(response) 28 | 29 | # page_source = self.page 30 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 31 | 32 | # normalize the prefix (e.g. 'http://') on every extracted url 33 | url_item = [] 34 | for url in url_list: 35 | url = url.replace(urlSettings.URL_PREFIX, '') 36 | url_item.append(urlSettings.URL_PREFIX + url) 37 | 38 | # use a set to drop duplicate urls 39 | url_item = list(set(url_item)) 40 | 41 | item['url'] = url_item 42 | 43 | # hand the item over to the pipeline 44 | yield item 45 | -------------------------------------------------------------------------------- /spiders/UrlSpider_SYDW.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | 5 | from DgSpiderPhantomJS import urlSettings 6 | from DgSpiderPhantomJS.items import DgspiderUrlItem 7 | 8 | 9 | class UrlspiderSydwSpider(scrapy.Spider): 10 | 11 | name = "UrlSpider_SYDW" 12 | 13 | # set your allowed domain 14 | allowed_domains = [urlSettings.DOMAIN] 15 | 16 | # set spider start url 17 | start_urls = [urlSettings.URL_START_SYDW] 18 | 19 | # scrapy parse callback 20 | def parse(self, response): 21 | print("LOGS: Starting spider SYDW ...") 22 | 23 | # init the item 24 | item = DgspiderUrlItem() 25 | 26 | # get the page source 27 | sel = Selector(response) 28 | 29 | # page_source = self.page 30 | url_list =
sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 31 | 32 | # normalize the prefix (e.g. 'http://') on every extracted url 33 | url_item = [] 34 | for url in url_list: 35 | url = url.replace(urlSettings.URL_PREFIX, '') 36 | url_item.append(urlSettings.URL_PREFIX + url) 37 | 38 | # use a set to drop duplicate urls 39 | url_item = list(set(url_item)) 40 | 41 | item['url'] = url_item 42 | 43 | # hand the item over to the pipeline 44 | yield item -------------------------------------------------------------------------------- /spiders/UrlSpider_YLBG.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | 5 | from DgSpiderPhantomJS import urlSettings 6 | from DgSpiderPhantomJS.items import DgspiderUrlItem 7 | 8 | 9 | class UrlspiderYlbgSpider(scrapy.Spider): 10 | 11 | name = "UrlSpider_YLBG" 12 | 13 | 14 | # set your allowed domain 15 | allowed_domains = [urlSettings.DOMAIN] 16 | 17 | # set spider start url 18 | start_urls = [urlSettings.URL_START_YLBG] 19 | 20 | # scrapy parse callback 21 | def parse(self, response): 22 | print("LOGS: Starting spider YLBG ...") 23 | 24 | # init the item 25 | item = DgspiderUrlItem() 26 | 27 | # get the page source 28 | sel = Selector(response) 29 | 30 | # page_source = self.page 31 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 32 | 33 | # normalize the prefix (e.g. 'http://') on every extracted url 34 | url_item = [] 35 | for url in url_list: 36 | url = url.replace(urlSettings.URL_PREFIX, '') 37 | url_item.append(urlSettings.URL_PREFIX + url) 38 | 39 | # use a set to drop duplicate urls 40 | url_item = list(set(url_item)) 41 | 42 | item['url'] = url_item 43 | 44 | # hand the item over to the pipeline 45 | yield item -------------------------------------------------------------------------------- /spiders/UrlSpider_YMYE.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.selector import Selector 4 | 5 | from DgSpiderPhantomJS import urlSettings 6 | from DgSpiderPhantomJS.items import DgspiderUrlItem 7 | 8 | 9 | class UrlspiderYmyeSpider(scrapy.Spider): 10 | 11 | name = "UrlSpider_YMYE" 12 | 13 | # set your allowed domain 14 | allowed_domains = [urlSettings.DOMAIN] 15 | 16 | # set spider start url 17 | start_urls = [urlSettings.URL_START_YMYE] 18 | 19 | # scrapy parse callback 20 | def parse(self, response): 21 | print("LOGS: Starting spider YMYE ...") 22 | 23 | # init the item 24 | item = DgspiderUrlItem() 25 | 26 | # get the page source 27 | sel = Selector(response) 28 | 29 | # page_source = self.page 30 | url_list = sel.xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract() 31 | 32 | # normalize the prefix (e.g. 'http://') on every extracted url 33 | url_item = [] 34 | for url in url_list: 35 | url = url.replace(urlSettings.URL_PREFIX, '') 36 | url_item.append(urlSettings.URL_PREFIX + url) 37 | 38 | # use a set to drop duplicate urls 39 | url_item = list(set(url_item)) 40 | 41 | item['url'] = url_item 42 | 43 | # hand the item over to the pipeline 44 | yield item 45 | 46 | # for i in range(5): 47 | # yield Request(self.start_urls[0], callback=self.parse) -------------------------------------------------------------------------------- /spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for
information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import sys, shelve, time, execjs 3 | # import PyV8 4 | 5 | # create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 6 | # print(create_time) 7 | 8 | 9 | def initDriverPool(): 10 | create_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 11 | time_array = time.strptime(create_time, "%Y-%m-%d %H:%M:%S") 12 | time_stamp = int(time.mktime(time_array)) 13 | 14 | print(time_stamp) 15 | 16 | def run_execjs():  # evaluate params.js with the execjs module (a function named execjs would shadow the import) 17 | js_str = open(r'D:\Scrapy\DgSpiderPhantomJS\DgSpiderPhantomJS\params.js').read() 18 | a = execjs.compile(js_str).call('getParam') 19 | # a = execjs.eval(js_str3) 20 | print(a) 21 | 22 | # def js(self): 23 | # ctxt = PyV8.JSContext() 24 | # ctxt.enter() 25 | # func = ctxt.eval('''(function(){return '###'})''') 26 | # print(func) 27 | 28 | if __name__ == '__main__': 29 | run_execjs() -------------------------------------------------------------------------------- /urlSettings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """Domain to crawl""" 4 | DOMAIN = 'toutiao.com' 5 | 6 | """Group (circle) IDs""" 7 | # weight loss / fitness 8 | GROUP_ID_JFSS = '33' 9 | # emotional life 10 | GROUP_ID_QQSH = '30' 11 | # nutrition experts 12 | GROUP_ID_YYZJ = '35' 13 | # pregnancy & parenting 14 | GROUP_ID_YMYE = '4' 15 | # late-night essays 16 | GROUP_ID_SYDW = '37' 17 | # food talk 18 | GROUP_ID_MSZT = '24' 19 | # entertainment gossip 20 | GROUP_ID_YLBG = '38' 21 | 22 | """Spider names""" 23 | SPIDER_JFSS = 'UrlSpider_JFSS' 24 | SPIDER_QQSH = 'UrlSpider_QQSH' 25 | SPIDER_YYZJ = 'UrlSpider_YYZJ' 26 | SPIDER_YMYE = 'UrlSpider_YMYE' 27 | SPIDER_SYDW = 'UrlSpider_SYDW' 28 | SPIDER_MSZT = 'UrlSpider_MSZT' 29 | SPIDER_YLBG = 'UrlSpider_YLBG' 30 | 31 | MODULE = '999' 32 | 33 | # URL prefix 34 | URL_PREFIX = 'http://www.toutiao.com' 35 | 36 | # crawl start pages 37 | URL_START_JFSS = 'http://www.toutiao.com/ch/news_regimen/' 38 | URL_START_YMYE = 'http://www.toutiao.com/ch/news_baby/' 39 | URL_START_SYDW = 'http://www.toutiao.com/ch/news_essay/' 40 | URL_START_MSZT = 'http://www.toutiao.com/ch/news_food/' 41 | URL_START_YLBG = 'http://www.toutiao.com/ch/news_entertainment/' 42 | 43 | """Static page crawl rules""" 44 | # # start URL of the article list pages 45 | # START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html' 46 | # 47 | # # article list pagination rule 48 | # LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/' 49 | # LIST_URL_RULER_SUFFIX = '.html' 50 | # LIST_URL_RULER_LOOP = 30 51 | # 52 | # # XPath rule for extracting article URLs 53 | # POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href' 54 | 55 | """Toutiao dynamic JS/Ajax crawl rules""" 56 | POST_URL_PHANTOMJS_XPATH = '//div[@class="title-box"]/a/@href' 57 | 58 | 59 | -------------------------------------------------------------------------------- /webBrowserPools/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-05-08T02:11:33.071Z] GhostDriver - Main - running on port 13763 2 | [INFO - 2017-05-08T02:11:36.561Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":false,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","webSecurityEnabled":true} 3 | [INFO -
2017-05-08T02:11:36.561Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - page.customHeaders: - {} 4 | [INFO - 2017-05-08T02:11:36.562Z] Session [aa201d90-3393-11e7-8f82-03c3e0612c46] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)","phantomjs.page.settings.loadImages":false} 5 | [INFO - 2017-05-08T02:11:36.562Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: aa201d90-3393-11e7-8f82-03c3e0612c46 6 | -------------------------------------------------------------------------------- /webBrowserPools/pool.py: -------------------------------------------------------------------------------- 1 | # douguo object pool 2 | # for pages that are loaded via js/ajax 3 | # any changes should be recorded here: 4 | # 5 | # @author zhangjianfei 6 | # @date 2017/05/08 7 | 8 | from selenium import webdriver 9 | from scrapy.http import HtmlResponse 10 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 11 | import time 12 | import random 13 | import os 14 | import DgSpiderPhantomJS.settings as settings 15 | import pickle 16 | 17 | 18 | def save_driver(): 19 | dcap = dict(DesiredCapabilities.PHANTOMJS) 20 | dcap["phantomjs.page.settings.userAgent"] = (random.choice(settings.USER_AGENTS)) 21 | dcap["phantomjs.page.settings.loadImages"] = False 22 | driver = webdriver.PhantomJS(executable_path=r"D:\phantomjs-2.1.1\bin\phantomjs.exe", desired_capabilities=dcap) 23 | fn = open(r'D:\driver.pkl', 'wb') # pickle needs a binary-mode file 24 | 25 | # with open(fn, 'w') as f: 26 | pickle.dump(driver, fn, 0) 27 | fn.close() 28 | 29 | 30 | def get_driver(): 31 | fn = r'D:\driver.pkl' 32 | with open(fn, 'rb') as f: 33 | driver = pickle.load(f) 34 | return driver 35 | 36 | 37 | if __name__ == '__main__': 38 | save_driver() 39 | --------------------------------------------------------------------------------
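
A note on webBrowserPools/pool.py: it persists a PhantomJS driver with pickle, but a live WebDriver wraps a browser subprocess and an open HTTP connection, so pickling it is generally unreliable even after the file-mode fixes above. A common alternative is to keep the drivers in an in-process pool. The sketch below is only an illustration under that assumption; the class name `DriverPool`, its method names, and the pool size are hypothetical, while the capability setup and PhantomJS path are reused from pool.py.

```python
# Hypothetical sketch of an in-process PhantomJS driver pool (not part of the project).
import random
from queue import Queue

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

import DgSpiderPhantomJS.settings as settings

PHANTOMJS_PATH = r"D:\phantomjs-2.1.1\bin\phantomjs.exe"  # same path as in pool.py


class DriverPool:
    def __init__(self, size=3):
        # pre-create a fixed number of PhantomJS drivers and park them in a queue
        self._pool = Queue()
        for _ in range(size):
            dcap = dict(DesiredCapabilities.PHANTOMJS)
            dcap["phantomjs.page.settings.userAgent"] = random.choice(settings.USER_AGENTS)
            dcap["phantomjs.page.settings.loadImages"] = False
            self._pool.put(webdriver.PhantomJS(executable_path=PHANTOMJS_PATH,
                                               desired_capabilities=dcap))

    def acquire(self):
        # blocks until a driver is free
        return self._pool.get()

    def release(self, driver):
        # hand the driver back for reuse
        self._pool.put(driver)

    def close(self):
        # quit every browser process when the pool is no longer needed
        while not self._pool.empty():
            self._pool.get().quit()
```

A middleware or script would call `acquire()` before `driver.get(url)` and `release()` afterwards, so the PhantomJS processes are reused instead of being serialized to disk.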
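The five URL spiders above (JFSS, MSZT, SYDW, YLBG, YMYE) differ only in their `name`, start URL, and log message, while the `parse` body is identical. A minimal sketch of factoring that shared logic into a base class is shown below; the name `BaseUrlSpider` is hypothetical and not part of the project, everything else comes from the spiders and `urlSettings` shown above.

```python
# Hypothetical sketch: one shared base class for the URL spiders.
import scrapy
from scrapy.selector import Selector

from DgSpiderPhantomJS import urlSettings
from DgSpiderPhantomJS.items import DgspiderUrlItem


class BaseUrlSpider(scrapy.Spider):
    """Shared parse logic; subclasses only set `name` and `start_urls`."""

    allowed_domains = [urlSettings.DOMAIN]

    def parse(self, response):
        item = DgspiderUrlItem()
        url_list = Selector(response).xpath(urlSettings.POST_URL_PHANTOMJS_XPATH).extract()
        # normalize the prefix and drop duplicates, as each spider does today
        urls = {urlSettings.URL_PREFIX + u.replace(urlSettings.URL_PREFIX, '') for u in url_list}
        item['url'] = list(urls)
        yield item


class UrlspiderMsztSpider(BaseUrlSpider):
    name = "UrlSpider_MSZT"
    start_urls = [urlSettings.URL_START_MSZT]
```

Adding a new channel then only requires a two-line subclass plus its entries in urlSettings.py.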