├── .gitignore ├── README.md ├── datadeal ├── company │ ├── __init__.py │ ├── adminx.py │ ├── apps.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ └── save_company.py │ ├── migrations │ │ ├── 0001_initial.py │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ └── views.py ├── datadeal │ ├── __init__.py │ ├── adminx.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ └── alarm.py │ ├── migrations │ │ ├── 0001_initial.py │ │ ├── 0002_variable_important.py │ │ ├── 0003_variable_all_text.py │ │ ├── 0004_auto_20170608_1126.py │ │ ├── 0005_remove_variable_important.py │ │ ├── 0006_auto_20170703_1334.py │ │ ├── 0007_auto_20170705_0835.py │ │ ├── 0008_auto_20170720_1032.py │ │ ├── 0009_auto_20170815_0855.py │ │ └── __init__.py │ ├── models.py │ ├── settings.py │ ├── static │ │ ├── css │ │ │ └── jquery.dataTables.min.css │ │ ├── images │ │ │ ├── Sorting icons.psd │ │ │ ├── favicon.ico │ │ │ ├── sort_asc.png │ │ │ ├── sort_asc_disabled.png │ │ │ ├── sort_both.png │ │ │ ├── sort_desc.png │ │ │ └── sort_desc_disabled.png │ │ └── js │ │ │ ├── iframe_common.js │ │ │ ├── iframe_detail.js │ │ │ ├── iframe_list.js │ │ │ ├── jquery-3.2.0.min.js │ │ │ └── jquery.dataTables.min.js │ ├── templates │ │ ├── detail_iframe.html │ │ ├── files_admin.html │ │ ├── images_admin.html │ │ ├── index.html │ │ └── list_iframe.html │ ├── urls.py │ ├── views.py │ └── wsgi.py ├── distribute │ ├── __init__.py │ ├── adminx.py │ ├── apps.py │ ├── management │ │ ├── __init__.py │ │ └── commands │ │ │ ├── __init__.py │ │ │ ├── get_spiders.py │ │ │ ├── mongo_test.py │ │ │ ├── register_node.py │ │ │ ├── start_spider.py │ │ │ └── upload_files.py │ ├── migrations │ │ ├── 0001_initial.py │ │ ├── 0002_auto_20170606_1335.py │ │ └── __init__.py │ ├── models.py │ ├── tests.py │ ├── urls.py │ └── views.py ├── ghostdriver.log └── manage.py ├── searchInfo ├── ghostdriver.log ├── scrapy.cfg └── searchInfo │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ ├── beijing.py │ ├── case.py │ ├── chengdu.py │ ├── common.py │ ├── gansu.py │ ├── hainan.py │ ├── qingdao.py │ ├── risk.py │ ├── sdein.py │ ├── sdqts.py │ ├── sfda.py │ ├── shandong.py │ ├── shanghai.py │ └── xxgk.py └── selenium ├── baidu_cookies.json ├── exponent_baidu.py ├── exponent_sina.py ├── exponent_sougou.py ├── ghostdriver.log ├── keywords.py ├── selenium_get_url.py ├── selenium_proxy.py └── tesseract_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | datadeal/datadeal/medias/ 3 | searchInfo/.scrapy/ 4 | selenium/images/ 5 | selenium/url_list.txt 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 各省份食品药品处罚案件爬虫程序 2 | Scrapy + Selenium + Django 3 | -------------------------------------------------------------------------------- /datadeal/company/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/__init__.py -------------------------------------------------------------------------------- /datadeal/company/adminx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import xadmin 3 | from .models import * 4 | 5 | class CompanyAdmin(object): 6 
| list_display = ['name','address','creditcode','registration','organization','kind','status','legalperson','start_at','capital','deadline','give_at','webpage','authority','scope'] 7 | search_fields = ['name','address','creditcode','registration','organization','kind','status','legalperson','capital','deadline','webpage','authority','scope'] 8 | list_filter = ['start_at','give_at'] 9 | xadmin.site.register(Company, CompanyAdmin) 10 | 11 | class ShareholdersAdmin(object): 12 | list_display = ['name','kind','subcribe_money','subcribe_date','real_money','real_date','company'] 13 | search_fields = ['name','kind','subcribe_money','subcribe_date','real_money','real_date'] 14 | list_filter = ['company'] 15 | xadmin.site.register(Shareholders, ShareholdersAdmin) 16 | 17 | class MemberAdmin(object): 18 | list_display = ['name','kind','company'] 19 | search_fields = ['name','kind'] 20 | list_filter = ['company'] 21 | xadmin.site.register(Member, MemberAdmin) -------------------------------------------------------------------------------- /datadeal/company/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class CompanyConfig(AppConfig): 5 | name = 'company' 6 | -------------------------------------------------------------------------------- /datadeal/company/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/management/__init__.py -------------------------------------------------------------------------------- /datadeal/company/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/management/commands/__init__.py -------------------------------------------------------------------------------- /datadeal/company/management/commands/save_company.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from company.models import * 4 | from selenium import webdriver 5 | from selenium.webdriver.common.by import By 6 | from selenium.webdriver.support import expected_conditions as EC 7 | from selenium.webdriver.support.wait import WebDriverWait 8 | from selenium.webdriver.common.action_chains import ActionChains 9 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 10 | import time 11 | import urllib 12 | import json 13 | 14 | class Command(BaseCommand): 15 | help = '保存公司信息' 16 | 17 | def handle(self, *args, **options): 18 | company_list = ['新泰市人民药业有限公司仁德人民医药商场','新泰市人民药业有限公司新兰人民医药商场','新泰市人民药业有限公司黄崖人民医药商场','新泰市泉沟安康大药店','新泰市百姓园大药房','新泰市泉沟镇保安堂大药店','新泰市泉沟镇子恒药店','新泰市泉沟镇老百姓大药房','新泰市泉沟平价大药店','新泰市泉沟镇泉民大药店','新泰市康宇大药店','韩庄众心百姓大药房','西张庄众心百姓大药房','芙蓉街众心百姓大药房','淄博新华大药店连锁有限公司桓台陈庄药店','淄博新华大药店连锁有限公司兴桓药店','淄博丰祺医药有限公司云涛药店','桓台县索镇瑞康药店','桓台县城区信康药店','桓台县东壁大药店','淄博丰祺医药零售有限公司侯庄药店','淄博丰祺医药零售有限公司姜坊药店','果里镇福生堂药店','果里镇广生堂药店','淄博市临淄昊虹工贸有限公司','青岛啤酒股份有限公司青岛啤酒三厂','青岛北苑英徽家具有限公司','青岛平泰电子有限公司','青岛司玛特瑞进电子有限公司','青岛黄金铅锌开发有限公司','青岛长荣化工有限公司','东明县迪奥化工有限公司','东明元创化工有限公司','东明宏昌化工有限公司','东明欧宝板业有限公司','山东优一化工有限公司','东明凌宇化工有限公司','东明佳润化工有限公司'] 19 | 20 | desired_capabilities = DesiredCapabilities.PHANTOMJS.copy() 21 | desired_capabilities["phantomjs.page.settings.userAgent"] = ( 22 | "Mozilla/5.0 
(Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" 23 | ) 24 | browser = webdriver.PhantomJS(desired_capabilities=desired_capabilities) 25 | browser.maximize_window() 26 | 27 | for company in company_list: 28 | keyword = urllib.quote(company) 29 | browser.get('http://www.xizhi.com/search?wd=%s&type=all' % keyword) 30 | try: 31 | a = browser.find_element_by_xpath('/html/body/div[5]/div[1]/ul/li/div/div[2]/h3/a') 32 | except: 33 | a = '' 34 | if a: 35 | browser.get(a.get_attribute("href")) 36 | name = browser.find_element_by_xpath('/html/body/div[5]/div[1]/div[2]/h2/a').text 37 | print name 38 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[1]/div') 39 | tds = div.find_elements_by_tag_name('td') 40 | aleady = Company.objects.filter(name=name) 41 | if not aleady.count(): 42 | obj = Company.objects.create(name=name,address=tds[25].text,creditcode=tds[1].text,registration=tds[3].text,organization=tds[5].text,kind=tds[7].text,status=tds[9].text,legalperson=tds[11].text,start_at=tds[13].text,capital=tds[15].text,deadline=tds[17].text,give_at=tds[19].text,webpage=tds[21].text,authority=tds[23].text,scope=tds[27].text) 43 | else: 44 | obj = '' 45 | 46 | if obj: 47 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[2]') 48 | trs = div.find_elements_by_tag_name('tr') 49 | if len(trs) > 1: 50 | for i,tr in enumerate(trs): 51 | if i > 0: 52 | tds = tr.find_elements_by_tag_name('td') 53 | if tds[2].text: 54 | subcribe = tds[2].text.split('/') 55 | if len(subcribe) > 1: 56 | subcribe_money = subcribe[0] 57 | subcribe_date = subcribe[1] 58 | else: 59 | subcribe_money = subcribe[0] 60 | subcribe_date = '' 61 | else: 62 | subcribe_money = '' 63 | subcribe_date = '' 64 | if tds[3].text: 65 | real = tds[3].text.split('/') 66 | if len(real) > 1: 67 | real_money = real[0] 68 | real_date = real[1] 69 | else: 70 | real_money = real[0] 71 | real_date = '' 72 | else: 73 | real_money = '' 74 | real_date = '' 75 | try: 76 | Shareholders.objects.create(name=tds[0].text,kind=tds[1].text,subcribe_money=subcribe_money,subcribe_date=subcribe_date,real_money=real_money,real_date=real_date,company=obj) 77 | except: 78 | Shareholders.objects.create(name=tds[0].text,kind=tds[1].text,subcribe_money=subcribe_money,real_money=real_money,company=obj) 79 | 80 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[3]') 81 | lis = div.find_elements_by_tag_name('li') 82 | if len(lis) > 0: 83 | for li in lis: 84 | key = li.find_element_by_class_name('lab').text.split(u':')[0] 85 | val = li.find_element_by_class_name('lab-in').text 86 | Member.objects.create(name=val,kind=key,company=obj) 87 | 88 | browser.quit() 89 | -------------------------------------------------------------------------------- /datadeal/company/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-07-27 17:04 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | initial = True 12 | 13 | dependencies = [ 14 | ] 15 | 16 | operations = [ 17 | migrations.CreateModel( 18 | name='Company', 19 | fields=[ 20 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 21 | ('name', models.CharField(max_length=50, verbose_name='\u4f01\u4e1a\u540d\u79f0')), 22 
| ('address', models.CharField(max_length=100, verbose_name='\u4f01\u4e1a\u5730\u5740')), 23 | ('creditcode', models.CharField(max_length=50, null=True, verbose_name='\u7edf\u4e00\u793e\u4f1a\u4fe1\u7528\u4ee3\u7801')), 24 | ('registration', models.CharField(max_length=50, verbose_name='\u6ce8\u518c\u53f7')), 25 | ('organization', models.CharField(max_length=50, null=True, verbose_name='\u7ec4\u7ec7\u673a\u6784\u4ee3\u7801')), 26 | ('kind', models.CharField(max_length=50, null=True, verbose_name='\u516c\u53f8\u7c7b\u578b')), 27 | ('status', models.CharField(max_length=50, null=True, verbose_name='\u7ecf\u8425\u72b6\u6001')), 28 | ('legalperson', models.CharField(max_length=50, verbose_name='\u6cd5\u5b9a\u4ee3\u8868\u4eba')), 29 | ('start_at', models.DateField(null=True, verbose_name='\u7ecf\u8425\u65e5\u671f')), 30 | ('capital', models.CharField(max_length=50, null=True, verbose_name='\u6ce8\u518c\u8d44\u672c')), 31 | ('deadline', models.CharField(max_length=50, null=True, verbose_name='\u8425\u4e1a\u671f\u9650')), 32 | ('give_at', models.DateField(null=True, verbose_name='\u53d1\u7167\u65e5\u671f')), 33 | ('webpage', models.CharField(max_length=50, null=True, verbose_name='\u7f51\u5740')), 34 | ('authority', models.CharField(max_length=50, null=True, verbose_name='\u767b\u8bb0\u673a\u5173')), 35 | ('scope', models.TextField(null=True, verbose_name='\u7ecf\u8425\u8303\u56f4')), 36 | ], 37 | options={ 38 | 'verbose_name': '\u516c\u53f8\u4fe1\u606f', 39 | 'verbose_name_plural': '\u516c\u53f8\u4fe1\u606f\u7ba1\u7406', 40 | }, 41 | ), 42 | migrations.CreateModel( 43 | name='Member', 44 | fields=[ 45 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 46 | ('name', models.CharField(max_length=50, verbose_name='\u540d\u5b57')), 47 | ('kind', models.CharField(max_length=50, verbose_name='\u8eab\u4efd')), 48 | ('company', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='company.Company', verbose_name='\u516c\u53f8')), 49 | ], 50 | options={ 51 | 'verbose_name': '\u6210\u5458\u4fe1\u606f', 52 | 'verbose_name_plural': '\u6210\u5458\u4fe1\u606f\u7ba1\u7406', 53 | }, 54 | ), 55 | migrations.CreateModel( 56 | name='Shareholders', 57 | fields=[ 58 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 59 | ('name', models.CharField(max_length=50, verbose_name='\u80a1\u4e1c')), 60 | ('kind', models.CharField(max_length=50, verbose_name='\u7c7b\u578b')), 61 | ('subcribe_money', models.CharField(max_length=50, null=True, verbose_name='\u8ba4\u7f34\u51fa\u8d44\u91d1\u989d')), 62 | ('subcribe_date', models.DateField(null=True, verbose_name='\u8ba4\u7f34\u51fa\u8d44\u65f6\u95f4')), 63 | ('real_money', models.CharField(max_length=50, null=True, verbose_name='\u5b9e\u7f34\u51fa\u8d44\u91d1\u989d')), 64 | ('real_date', models.DateField(null=True, verbose_name='\u5b9e\u7f34\u51fa\u8d44\u65f6\u95f4')), 65 | ('company', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='company.Company', verbose_name='\u516c\u53f8')), 66 | ], 67 | options={ 68 | 'verbose_name': '\u80a1\u4e1c\u4fe1\u606f', 69 | 'verbose_name_plural': '\u80a1\u4e1c\u4fe1\u606f\u7ba1\u7406', 70 | }, 71 | ), 72 | ] 73 | -------------------------------------------------------------------------------- /datadeal/company/migrations/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/migrations/__init__.py -------------------------------------------------------------------------------- /datadeal/company/models.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models 5 | 6 | # Create your models here. 7 | class Company(models.Model): 8 | name = models.CharField(u'企业名称',max_length=50) 9 | address = models.CharField(u'企业地址',max_length=100) 10 | creditcode = models.CharField(u'统一社会信用代码',max_length=50,null=True) 11 | registration = models.CharField(u'注册号',max_length=50) 12 | organization = models.CharField(u'组织机构代码',max_length=50,null=True) 13 | kind = models.CharField(u'公司类型',max_length=50,null=True) 14 | status = models.CharField(u'经营状态',max_length=50,null=True) 15 | legalperson = models.CharField(u'法定代表人',max_length=50) 16 | start_at = models.DateField(u'经营日期',null=True) 17 | capital = models.CharField(u'注册资本',max_length=50,null=True) 18 | deadline = models.CharField(u'营业期限',max_length=50,null=True) 19 | give_at = models.DateField(u'发照日期',null=True) 20 | webpage = models.CharField(u'网址',max_length=50,null=True) 21 | authority = models.CharField(u'登记机关',max_length=50,null=True) 22 | scope = models.TextField(u'经营范围',null=True) 23 | 24 | def __unicode__(self): 25 | return self.name 26 | 27 | class Meta: 28 | verbose_name = u'公司信息' 29 | verbose_name_plural = u'公司信息管理' 30 | 31 | 32 | class Shareholders(models.Model): 33 | name = models.CharField(u'股东',max_length=50) 34 | kind = models.CharField(u'类型',max_length=50) 35 | subcribe_money = models.CharField(u'认缴出资金额',max_length=50,null=True) 36 | subcribe_date = models.DateField(u'认缴出资时间',null=True) 37 | real_money = models.CharField(u'实缴出资金额',max_length=50,null=True) 38 | real_date = models.DateField(u'实缴出资时间',null=True) 39 | company = models.ForeignKey('Company',verbose_name=u'公司') 40 | 41 | def __unicode__(self): 42 | return self.name 43 | 44 | class Meta: 45 | verbose_name = u'股东信息' 46 | verbose_name_plural = u'股东信息管理' 47 | 48 | class Member(models.Model): 49 | name = models.CharField(u'名字',max_length=50) 50 | kind = models.CharField(u'身份',max_length=50) 51 | company = models.ForeignKey('Company',verbose_name=u'公司') 52 | 53 | def __unicode__(self): 54 | return self.name 55 | 56 | class Meta: 57 | verbose_name = u'成员信息' 58 | verbose_name_plural = u'成员信息管理' -------------------------------------------------------------------------------- /datadeal/company/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /datadeal/company/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 
4 | -------------------------------------------------------------------------------- /datadeal/datadeal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/__init__.py -------------------------------------------------------------------------------- /datadeal/datadeal/adminx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import xadmin 3 | from xadmin.views.base import CommAdminView 4 | from xadmin.plugins.themes import ThemePlugin 5 | from django.http import HttpResponseRedirect 6 | from datadeal.settings import BASE_DIR 7 | from pure_pagination import Paginator, EmptyPage, PageNotAnInteger 8 | from .models import * 9 | from distribute.models import * 10 | from company.models import * 11 | from .views import * 12 | import subprocess 13 | import datetime 14 | import json 15 | import time 16 | import os 17 | 18 | class BaseSetting(object): 19 | enable_themes = True 20 | use_bootswatch = True 21 | 22 | class AdminSettings(object): 23 | menu_style = "accordion" 24 | site_title = '爬虫管理系统' 25 | site_footer = '爬虫管理系统' 26 | 27 | def get_site_menu(self): 28 | return [ 29 | {'title': '爬虫管理','icon':'fa fa-bug', 'perm': self.get_model_perm(scrapySetting, 'change'), 'menus':( 30 | {'title': '爬虫生成配置', 'url': self.get_model_url(scrapySetting, 'changelist'), 31 | 'perm': self.get_model_perm(scrapySetting, 'changelist')}, 32 | {'title': '域名白名单', 'url': self.get_model_url(AllowDomains, 'changelist'), 33 | 'perm': self.get_model_perm(AllowDomains, 'changelist')}, 34 | {'title': '一级爬取地址', 'url': self.get_model_url(startUrls, 'changelist'), 35 | 'perm': self.get_model_perm(startUrls, 'changelist')}, 36 | {'title': '循环体列表', 'url': self.get_model_url(CycleObj, 'changelist'), 37 | 'perm': self.get_model_perm(CycleObj, 'changelist')}, 38 | {'title': '变量列表', 'url': self.get_model_url(Variable, 'changelist'), 39 | 'perm': self.get_model_perm(Variable, 'changelist')}, 40 | {'title': '爬虫列表', 'url': self.get_model_url(scrapyList, 'changelist'), 41 | 'perm': self.get_model_perm(scrapyList, 'changelist')}, 42 | )}, 43 | {'title': '节点管理','icon':'fa fa-chain', 'perm': self.get_model_perm(Node, 'change'), 'menus':( 44 | {'title': '节点管理', 'url': self.get_model_url(Node, 'changelist'), 45 | 'perm': self.get_model_perm(Node, 'changelist')}, 46 | {'title': '节点IP管理', 'url': self.get_model_url(NodeIp, 'changelist'), 47 | 'perm': self.get_model_perm(NodeIp, 'changelist')}, 48 | {'title': '任务管理', 'url': self.get_model_url(NodeTask, 'changelist'), 49 | 'perm': self.get_model_perm(NodeTask, 'changelist')}, 50 | )}, 51 | {'title': '数据管理','icon':'fa fa-bar-chart-o', 'perm': self.get_model_perm(SpiderData, 'change'), 'menus':( 52 | {'title': '数据信息', 'url': self.get_model_url(SpiderData, 'changelist'), 53 | 'perm': self.get_model_perm(SpiderData, 'changelist')}, 54 | {'title': '错误信息', 'url': self.get_model_url(ErrorData, 'changelist'), 55 | 'perm': self.get_model_perm(ErrorData, 'changelist')}, 56 | {'title': '预警信息', 'url': self.get_model_url(DataAlarm, 'changelist'), 57 | 'perm': self.get_model_perm(DataAlarm, 'changelist')}, 58 | {'title': '公司信息', 'url': self.get_model_url(Company, 'changelist'), 59 | 'perm': self.get_model_perm(Company, 'changelist')}, 60 | {'title': '股东信息', 'url': self.get_model_url(Shareholders, 'changelist'), 61 | 'perm': self.get_model_perm(Shareholders, 'changelist')}, 62 | {'title': '成员信息', 
'url': self.get_model_url(Member, 'changelist'), 63 | 'perm': self.get_model_perm(Member, 'changelist')}, 64 | )}, 65 | {'title': '文件管理','icon':'fa fa-file', 'perm': self.get_model_perm(SpiderData, 'change'), 'menus':( 66 | {'title': '图片管理', 'url':'/admin/images_admin/', 'perm': ''}, 67 | {'title': '文件管理', 'url':'/admin/files_admin/', 'perm': ''}, 68 | )}, 69 | ] 70 | 71 | xadmin.site.register(xadmin.views.BaseAdminView,BaseSetting) 72 | xadmin.site.register(xadmin.views.CommAdminView,AdminSettings) 73 | 74 | class scrapySettingAdmin(object): 75 | list_display = ['name', 'allow_domains','start_urls','cycleobj','variable','num','kind','create_at','modify_at'] 76 | search_fields = ['name','allow_domains'] 77 | list_filter = ['kind','create_at','modify_at'] 78 | style_fields = {'allow_domains': 'm2m_transfer','start_urls': 'm2m_transfer','cycleobj': 'm2m_transfer','variable': 'm2m_transfer'} 79 | actions = ['create_spider','create_tasks'] 80 | def create_spider(self, request, queryset): 81 | for q in queryset: 82 | if scrapyList.objects.filter(name=q.name).count() == 0: 83 | create_scrapy_file(q) 84 | self.message_user(u'%s 爬虫创建成功' % q.name) 85 | scrapyList.objects.create(name=q.name) 86 | else: 87 | self.message_user(u'%s 爬虫名已被使用' % q.name) 88 | create_spider.short_description = "创建爬虫" 89 | def create_tasks(self, request, queryset): 90 | from distribute.models import NodeTask 91 | for q in queryset: 92 | try: 93 | scrapy = scrapyList.objects.get(name=q.name) 94 | except: 95 | scrapy = '' 96 | if scrapy: 97 | urls = q.start_urls.all() 98 | total = urls.count() 99 | count,last = divmod(total,q.num) 100 | for n in range(0,count+1): 101 | start = n*q.num 102 | if n == count: 103 | if last > 0: 104 | end = total 105 | else: 106 | end = 'pass' 107 | else: 108 | end = (n+1)*q.num 109 | if not end == 'pass': 110 | name = q.name+'_'+str(n+1) 111 | already = NodeTask.objects.filter(name=name).count() 112 | if not already: 113 | obj = NodeTask.objects.create(name=name,scrapy=scrapy,priority=n+1) 114 | for i in urls[start:end]: 115 | obj.urls.add(i) 116 | self.message_user(u'%s 爬虫任务分发完毕' % q.name) 117 | else: 118 | self.message_user(u'请先创建%s爬虫' % q.name) 119 | create_tasks.short_description = "生成任务" 120 | xadmin.site.register(scrapySetting, scrapySettingAdmin) 121 | 122 | class AllowDomainsAdmin(object): 123 | list_display = ['name'] 124 | search_fields = ['name'] 125 | list_filter = [] 126 | xadmin.site.register(AllowDomains, AllowDomainsAdmin) 127 | 128 | class startUrlsAdmin(object): 129 | list_display = ['url'] 130 | search_fields = ['url'] 131 | list_filter = [] 132 | xadmin.site.register(startUrls, startUrlsAdmin) 133 | 134 | class CycleObjAdmin(object): 135 | list_display = ['name','xpath','variable'] 136 | search_fields = ['name','xpath'] 137 | list_filter = ['variable'] 138 | xadmin.site.register(CycleObj, CycleObjAdmin) 139 | 140 | class VariableAdmin(object): 141 | list_display = ['name','xpath','kind','all_text'] 142 | search_fields = ['name','xpath'] 143 | list_filter = ['kind','all_text'] 144 | xadmin.site.register(Variable, VariableAdmin) 145 | 146 | class scrapyListAdmin(object): 147 | list_display = ['name','priority','alarm_day','create_at','is_open'] 148 | search_fields = ['name'] 149 | list_filter = ['create_at','is_open'] 150 | list_editable = ['alarm_day','is_open'] 151 | actions = ['start_spider','download'] 152 | def start_spider(self, request, queryset): 153 | for q in queryset: 154 | self.message_user(u'%s 爬虫正在爬取数据... 
%s' % (q.name,datetime.datetime.now().strftime('%H:%M:%S'))) 155 | subprocess.call('cd ../searchInfo && scrapy crawl %s -o ../datadeal/datadeal/medias/%s_data.json' % (q.name,q.name), shell=True) 156 | self.message_user(u'%s 爬虫已经抓取完数据 %s' % (q.name,datetime.datetime.now().strftime('%H:%M:%S'))) 157 | start_spider.short_description = "运行爬虫" 158 | def download(self, request, queryset): 159 | for q in queryset: 160 | if os.path.exists(BASE_DIR+'/datadeal/medias/%s_data.json' % q.name): 161 | return HttpResponseRedirect('/medias/%s_data.json' % q.name) 162 | else: 163 | self.message_user(u'%s 数据不存在,请先运行爬虫' % q.name) 164 | download.short_description = "数据下载" 165 | xadmin.site.register(scrapyList, scrapyListAdmin) 166 | 167 | class SpiderDataAdmin(object): 168 | list_display = ['scrapyname','create_at','data_str','page_pdf'] 169 | search_fields = ['scrapyname','uid','data','url','file'] 170 | list_filter = ['scrapyname','create_at'] 171 | xadmin.site.register(SpiderData, SpiderDataAdmin) 172 | 173 | class ErrorDataAdmin(object): 174 | list_display = ['scrapyname','uid','create_at','url','content'] 175 | search_fields = ['scrapyname','uid','url','content'] 176 | list_filter = ['scrapyname','create_at'] 177 | xadmin.site.register(ErrorData, ErrorDataAdmin) 178 | 179 | class DataAlarmAdmin(object): 180 | list_display = ['scrapyname','is_alarm','remark','create_at'] 181 | search_fields = ['scrapyname','remark'] 182 | list_filter = ['scrapyname','is_alarm','create_at'] 183 | list_editable = ['is_alarm','remark'] 184 | xadmin.site.register(DataAlarm, DataAlarmAdmin) 185 | 186 | class ImagesAdminView(CommAdminView): 187 | 188 | def get(self, request, *args, **kwargs): 189 | images_dir = BASE_DIR+'/datadeal/medias/images' 190 | images = os.listdir(images_dir) 191 | img_list = [] 192 | for image in images: 193 | url = images_dir+'/'+image 194 | ctime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(os.path.getctime(url)+8*3600)) 195 | img_list.append({'url':image,'ctime':ctime}) 196 | img_list.sort(key=lambda x:x['ctime'],reverse=True) 197 | try: 198 | page = request.GET.get('page', 1) 199 | except PageNotAnInteger: 200 | page = 1 201 | p = Paginator(img_list,12, request=request) 202 | img_list = p.page(page) 203 | 204 | context = self.get_context() 205 | context.update({ 206 | 'p':p, 207 | 'img_list':img_list 208 | }) 209 | return self.template_response('images_admin.html',context) 210 | xadmin.site.register_view(r'^images_admin/$', ImagesAdminView, name='images_admin') 211 | 212 | class FilesAdminView(CommAdminView): 213 | 214 | def get(self, request, *args, **kwargs): 215 | dir_list = ['common','risk'] 216 | file_list = [] 217 | for d in dir_list: 218 | files_dir = BASE_DIR+'/datadeal/medias/'+d 219 | files = os.listdir(files_dir) 220 | for file in files: 221 | import locale 222 | file = file.decode(locale.getdefaultlocale()[1]) 223 | url = files_dir+'/'+file 224 | ctime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(os.path.getctime(url)+8*3600)) 225 | file_list.append([file,ctime,'下载删除']) 226 | 227 | context = self.get_context() 228 | context.update({ 229 | 'file_list':json.dumps(file_list) 230 | }) 231 | return self.template_response('files_admin.html',context) 232 | xadmin.site.register_view(r'^files_admin/$', FilesAdminView, name='files_admin') -------------------------------------------------------------------------------- /datadeal/datadeal/management/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/management/__init__.py -------------------------------------------------------------------------------- /datadeal/datadeal/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/management/commands/__init__.py -------------------------------------------------------------------------------- /datadeal/datadeal/management/commands/alarm.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from datadeal.models import scrapyList,DataAlarm,SpiderData 4 | import datetime 5 | 6 | class Command(BaseCommand): 7 | help = '生成爬虫预警' 8 | 9 | def handle(self, *args, **options): 10 | scrapy = scrapyList.objects.filter(is_open=True) 11 | for s in scrapy: 12 | try: 13 | data = SpiderData.objects.filter(scrapyname=s.name).order_by('-create_at')[0] 14 | except: 15 | data = '' 16 | if data: 17 | nodata_day = (datetime.datetime.now()-data.create_at).days 18 | if nodata_day > s.alarm_day: 19 | da = DataAlarm.objects.filter(is_alarm=True,scrapyname=s.name).order_by('-create_at') 20 | if da.count(): 21 | alreay_day = (datetime.datetime.now()-da[0].create_at).days 22 | if alreay_day > s.alarm_day: 23 | DataAlarm.objects.create(scrapyname=s.name,is_alarm=True,remark='') 24 | else: 25 | DataAlarm.objects.create(scrapyname=s.name,is_alarm=True,remark='') -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-05 09:36 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | initial = True 11 | 12 | dependencies = [ 13 | ] 14 | 15 | operations = [ 16 | migrations.CreateModel( 17 | name='AllowDomains', 18 | fields=[ 19 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 20 | ('name', models.CharField(max_length=500, verbose_name='\u540d\u79f0')), 21 | ], 22 | options={ 23 | 'verbose_name': '\u57df\u540d\u767d\u540d\u5355', 24 | 'verbose_name_plural': '\u57df\u540d\u767d\u540d\u5355\u7ba1\u7406', 25 | }, 26 | ), 27 | migrations.CreateModel( 28 | name='CycleObj', 29 | fields=[ 30 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 31 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7', max_length=50, verbose_name='\u5faa\u73af\u4f53\u540d\u79f0')), 32 | ('xpath', models.CharField(help_text=b'\xe4\xbd\xbf\xe7\x94\xa8xpath\xe8\xa7\x84\xe5\x88\x99\xef\xbc\x9a\nnodename \xe9\x80\x89\xe6\x8b\xa9\xe6\x89\x80\xe6\x9c\x89\xe7\x9b\xae\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe5\xad\x90\xe8\x8a\x82\n/ \xe4\xbb\x8e\xe6\xa0\xb9\xe8\x8a\x82\xe8\xbf\x9b\xe8\xa1\x8c\xe9\x80\x89\xe6\x8b\xa9\n// 
\xe9\x80\x89\xe6\x8b\xa9\xe6\x96\x87\xe6\xa1\xa3\xe4\xb8\xad\xe7\x9b\xb8\xe5\x90\xbb\xe5\x90\x88\xe7\x9a\x84\xe8\x8a\x82\xe8\x80\x8c\xe4\xb8\x8d\xe7\xae\xa1\xe5\x85\xb6\xe5\x9c\xa8\xe6\x96\x87\xe6\xa1\xa3\xe7\x9a\x84\xe4\xbd\x95\xe5\xa4\x84\n. \xe9\x80\x89\xe6\x8b\xa9\xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\n.. \xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe7\x88\xb6\xe8\x8a\x82\n@ \xe9\x80\x89\xe6\x8b\xa9\xe5\xb1\x9e\xe6\x80\xa7', max_length=200, verbose_name='\u67e5\u8be2\u89c4\u5219')), 33 | ], 34 | options={ 35 | 'verbose_name': '\u5faa\u73af\u4f53\u5217\u8868', 36 | 'verbose_name_plural': '\u5faa\u73af\u4f53\u5217\u8868\u7ba1\u7406', 37 | }, 38 | ), 39 | migrations.CreateModel( 40 | name='scrapyList', 41 | fields=[ 42 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 43 | ('name', models.CharField(max_length=500, verbose_name='\u540d\u79f0')), 44 | ('priority', models.IntegerField(default=0, verbose_name='\u9879\u76ee\u4f18\u5148\u7ea7')), 45 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 46 | ], 47 | options={ 48 | 'verbose_name': '\u722c\u866b\u5217\u8868', 49 | 'verbose_name_plural': '\u722c\u866b\u5217\u8868\u7ba1\u7406', 50 | }, 51 | ), 52 | migrations.CreateModel( 53 | name='scrapySetting', 54 | fields=[ 55 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 56 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7', max_length=20, verbose_name='\u540d\u79f0')), 57 | ('num', models.IntegerField(default=1, verbose_name='\u5355\u4e2a\u4efb\u52a1\u94fe\u63a5\u6570')), 58 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 59 | ('modify_at', models.DateTimeField(auto_now=True, verbose_name='\u4fee\u6539\u65f6\u95f4')), 60 | ('allow_domains', models.ManyToManyField(to='datadeal.AllowDomains', verbose_name='\u57df\u540d\u767d\u540d\u5355')), 61 | ('cycleobj', models.ManyToManyField(to='datadeal.CycleObj', verbose_name='\u5faa\u73af\u4f53')), 62 | ], 63 | options={ 64 | 'verbose_name': '\u722c\u866b\u914d\u7f6e', 65 | 'verbose_name_plural': '\u722c\u866b\u914d\u7f6e\u7ba1\u7406', 66 | }, 67 | ), 68 | migrations.CreateModel( 69 | name='startUrls', 70 | fields=[ 71 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 72 | ('url', models.URLField(max_length=500, verbose_name='\u540d\u79f0')), 73 | ], 74 | options={ 75 | 'verbose_name': '\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868', 76 | 'verbose_name_plural': '\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868\u7ba1\u7406', 77 | }, 78 | ), 79 | migrations.CreateModel( 80 | name='Variable', 81 | fields=[ 82 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 83 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7,\xe5\xbb\xba\xe8\xae\xae\xe7\x94\xa8\xe5\xaf\xb9\xe5\xba\x94\xe7\x9a\x84\xe5\xbe\xaa\xe7\x8e\xaf\xe4\xbd\x93\xe5\x81\x9a\xe5\x89\x8d\xe7\xbc\x80\xe5\x8a\xa0\xe4\xbb\xa5\xe5\x8c\xba\xe5\x88\x86', max_length=50, verbose_name='\u53d8\u91cf\u540d\u79f0')), 84 | ('xpath', models.CharField(help_text=b'\xe4\xbd\xbf\xe7\x94\xa8xpath\xe8\xa7\x84\xe5\x88\x99\xef\xbc\x9a\nnodename 
\xe9\x80\x89\xe6\x8b\xa9\xe6\x89\x80\xe6\x9c\x89\xe7\x9b\xae\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe5\xad\x90\xe8\x8a\x82\n/ \xe4\xbb\x8e\xe6\xa0\xb9\xe8\x8a\x82\xe8\xbf\x9b\xe8\xa1\x8c\xe9\x80\x89\xe6\x8b\xa9\n// \xe9\x80\x89\xe6\x8b\xa9\xe6\x96\x87\xe6\xa1\xa3\xe4\xb8\xad\xe7\x9b\xb8\xe5\x90\xbb\xe5\x90\x88\xe7\x9a\x84\xe8\x8a\x82\xe8\x80\x8c\xe4\xb8\x8d\xe7\xae\xa1\xe5\x85\xb6\xe5\x9c\xa8\xe6\x96\x87\xe6\xa1\xa3\xe7\x9a\x84\xe4\xbd\x95\xe5\xa4\x84\n. \xe9\x80\x89\xe6\x8b\xa9\xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\n.. \xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe7\x88\xb6\xe8\x8a\x82\n@ \xe9\x80\x89\xe6\x8b\xa9\xe5\xb1\x9e\xe6\x80\xa7', max_length=200, verbose_name='\u67e5\u8be2\u89c4\u5219')), 85 | ('kind', models.IntegerField(choices=[(1, '\u4e00\u7ea7\u53d8\u91cf'), (2, '\u4e8c\u7ea7\u53d8\u91cf'), (3, '\u4e8c\u7ea7\u94fe\u63a5')], default=1, verbose_name='\u7c7b\u578b')), 86 | ], 87 | options={ 88 | 'verbose_name': '\u53d8\u91cf\u5217\u8868', 89 | 'verbose_name_plural': '\u53d8\u91cf\u5217\u8868\u7ba1\u7406', 90 | }, 91 | ), 92 | migrations.AddField( 93 | model_name='scrapysetting', 94 | name='start_urls', 95 | field=models.ManyToManyField(to='datadeal.startUrls', verbose_name='\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868'), 96 | ), 97 | migrations.AddField( 98 | model_name='cycleobj', 99 | name='variable', 100 | field=models.ManyToManyField(to='datadeal.Variable', verbose_name='\u53d8\u91cf'), 101 | ), 102 | ] 103 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0002_variable_important.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-05 11:23 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0001_initial'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AddField( 16 | model_name='variable', 17 | name='important', 18 | field=models.BooleanField(default=False, help_text=b'\xe8\x8b\xa5\xe8\xaf\xa5\xe5\x8f\x98\xe9\x87\x8f\xe6\x89\xbe\xe4\xb8\x8d\xe5\x88\xb0\xe6\x95\xb0\xe6\x8d\xae\xef\xbc\x8c\xe5\x88\x99\xe4\xb8\x8d\xe8\xbf\x9b\xe8\xa1\x8c\xe4\xb8\x8b\xe4\xb8\x80\xe6\xad\xa5', verbose_name='\u5173\u952e\u53d8\u91cf'), 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0003_variable_all_text.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-05 13:50 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0002_variable_important'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AddField( 16 | model_name='variable', 17 | name='all_text', 18 | field=models.BooleanField(default=False, help_text=b'\xe5\xb0\x86\xe6\x8f\x90\xe5\x8f\x96\xe8\xaf\xa5\xe6\xa0\x87\xe7\xad\xbe\xe4\xb8\x8b\xe6\x96\x87\xe6\x9c\xac\xe5\x8f\x8a\xe6\x89\x80\xe6\x9c\x89\xe5\xad\x90\xe6\xa0\x87\xe7\xad\xbe\xe6\x96\x87\xe6\x9c\xac\xef\xbc\x8c', verbose_name='\u5168\u6587\u672c\u63d0\u53d6'), 19 | ), 20 | ] 21 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0004_auto_20170608_1126.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-08 11:26 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0003_variable_all_text'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AddField( 16 | model_name='scrapysetting', 17 | name='kind', 18 | field=models.IntegerField(choices=[(1, '\u5217\u8868\u53ca\u8be6\u60c5'), (2, '\u5217\u8868'), (3, '\u5355\u9875\u9762'), (4, '\u5176\u4ed6')], default=1, verbose_name='\u7c7b\u578b'), 19 | ), 20 | migrations.AddField( 21 | model_name='scrapysetting', 22 | name='variable', 23 | field=models.ManyToManyField(blank=True, to='datadeal.Variable', verbose_name='\u975e\u5faa\u73af\u53d8\u91cf'), 24 | ), 25 | migrations.AlterField( 26 | model_name='scrapysetting', 27 | name='cycleobj', 28 | field=models.ManyToManyField(blank=True, to='datadeal.CycleObj', verbose_name='\u5faa\u73af\u4f53'), 29 | ), 30 | migrations.AlterField( 31 | model_name='variable', 32 | name='all_text', 33 | field=models.BooleanField(default=False, help_text=b'\xe5\xb0\x86\xe6\x8f\x90\xe5\x8f\x96\xe8\xaf\xa5\xe6\xa0\x87\xe7\xad\xbe\xe4\xb8\x8b\xe6\x96\x87\xe6\x9c\xac\xe5\x8f\x8a\xe6\x89\x80\xe6\x9c\x89\xe5\xad\x90\xe6\xa0\x87\xe7\xad\xbe\xe6\x96\x87\xe6\x9c\xac,\xe5\xbc\x80\xe5\x90\xaf\xe5\x90\x8e\xe4\xb8\x8d\xe8\xa6\x81\xe5\x86\x99/text()', verbose_name='\u5b50\u6807\u7b7e\u6587\u672c\u63d0\u53d6'), 34 | ), 35 | migrations.AlterField( 36 | model_name='variable', 37 | name='important', 38 | field=models.BooleanField(default=False, help_text=b'\xe8\x8b\xa5\xe8\xaf\xa5\xe5\x8f\x98\xe9\x87\x8f\xe6\x89\xbe\xe4\xb8\x8d\xe5\x88\xb0\xe6\x95\xb0\xe6\x8d\xae\xef\xbc\x8c\xe5\x88\x99\xe4\xb8\x8d\xe4\xbc\x9a\xe5\x8a\xa0\xe5\x85\xa5\xe5\x88\xb0\xe8\xbf\x94\xe5\x9b\x9e\xe6\x95\xb0\xe6\x8d\xae\xe4\xb8\xad', verbose_name='\u5173\u952e\u53d8\u91cf'), 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0005_remove_variable_important.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-08 11:49 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0004_auto_20170608_1126'), 12 | ] 13 | 14 | operations = [ 15 | migrations.RemoveField( 16 | model_name='variable', 17 | name='important', 18 | ), 19 | ] 20 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0006_auto_20170703_1334.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-07-03 13:34 3 | from __future__ import unicode_literals 4 | from django.contrib.postgres.operations import HStoreExtension 5 | import django.contrib.postgres.fields.hstore 6 | from django.db import migrations, models 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ('datadeal', '0005_remove_variable_important'), 13 | ] 14 | 15 | operations = [ 16 | HStoreExtension(), 17 | migrations.CreateModel( 18 | name='SpiderData', 19 | fields=[ 20 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 21 | 
('scrapyname', models.CharField(max_length=50, verbose_name='\u540d\u79f0')), 22 | ('uid', models.CharField(max_length=50, verbose_name='uid')), 23 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 24 | ('data', django.contrib.postgres.fields.hstore.HStoreField()), 25 | ], 26 | options={ 27 | 'verbose_name': '\u6570\u636e\u4fe1\u606f', 28 | 'verbose_name_plural': '\u6570\u636e\u4fe1\u606f\u7ba1\u7406', 29 | }, 30 | ), 31 | migrations.AlterField( 32 | model_name='scrapylist', 33 | name='priority', 34 | field=models.IntegerField(default=10, help_text=b'\xe5\x80\xbc\xe8\xb6\x8a\xe5\xb0\x8f\xe8\xb6\x8a\xe4\xbc\x98\xe5\x85\x88', verbose_name='\u9879\u76ee\u4f18\u5148\u7ea7'), 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0007_auto_20170705_0835.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-07-05 08:35 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0006_auto_20170703_1334'), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='ErrorData', 17 | fields=[ 18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 19 | ('scrapyname', models.CharField(max_length=50, verbose_name='\u9879\u76ee\u6765\u6e90')), 20 | ('uid', models.CharField(max_length=50, verbose_name='\u8bbe\u5907\u6765\u6e90')), 21 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 22 | ('url', models.CharField(max_length=300, verbose_name='\u8bbf\u95ee\u5730\u5740')), 23 | ('content', models.CharField(max_length=300, verbose_name='\u9519\u8bef\u4fe1\u606f')), 24 | ], 25 | options={ 26 | 'verbose_name': '\u9519\u8bef\u4fe1\u606f', 27 | 'verbose_name_plural': '\u9519\u8bef\u4fe1\u606f\u7ba1\u7406', 28 | }, 29 | ), 30 | migrations.AlterField( 31 | model_name='spiderdata', 32 | name='scrapyname', 33 | field=models.CharField(max_length=50, verbose_name='\u9879\u76ee\u6765\u6e90'), 34 | ), 35 | migrations.AlterField( 36 | model_name='spiderdata', 37 | name='uid', 38 | field=models.CharField(max_length=50, verbose_name='\u8bbe\u5907\u6765\u6e90'), 39 | ), 40 | ] 41 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0008_auto_20170720_1032.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-07-20 10:32 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0007_auto_20170705_0835'), 12 | ] 13 | 14 | operations = [ 15 | migrations.AddField( 16 | model_name='spiderdata', 17 | name='file', 18 | field=models.CharField(blank=True, max_length=100, null=True, verbose_name='\u539f\u9875\u9762'), 19 | ), 20 | migrations.AddField( 21 | model_name='spiderdata', 22 | name='url', 23 | field=models.CharField(blank=True, max_length=300, null=True, verbose_name='\u8bbf\u95ee\u5730\u5740'), 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/0009_auto_20170815_0855.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-08-15 08:55 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | 7 | 8 | class Migration(migrations.Migration): 9 | 10 | dependencies = [ 11 | ('datadeal', '0008_auto_20170720_1032'), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='DataAlarm', 17 | fields=[ 18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 19 | ('scrapyname', models.CharField(max_length=50, verbose_name='\u722c\u866b\u540d')), 20 | ('is_alarm', models.BooleanField(default=True, verbose_name='\u662f\u5426\u9884\u8b66')), 21 | ('remark', models.TextField(blank=True, null=True, verbose_name='\u539f\u56e0\u5907\u6ce8')), 22 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 23 | ], 24 | options={ 25 | 'verbose_name': '\u722c\u866b\u9884\u8b66', 26 | 'verbose_name_plural': '\u722c\u866b\u9884\u8b66\u7ba1\u7406', 27 | }, 28 | ), 29 | migrations.AddField( 30 | model_name='scrapylist', 31 | name='alarm_day', 32 | field=models.IntegerField(default=30, help_text=b'\xe8\xb6\x85\xe8\xbf\x87\xe6\x97\xb6\xe9\x97\xb4\xe6\x97\xa0\xe6\x95\xb0\xe6\x8d\xae\xe5\x88\x99\xe7\x94\x9f\xe6\x88\x90\xe9\xa2\x84\xe8\xad\xa6', verbose_name='\u9884\u8b66\u5929\u6570'), 33 | ), 34 | migrations.AddField( 35 | model_name='scrapylist', 36 | name='is_open', 37 | field=models.BooleanField(default=True, verbose_name='\u662f\u5426\u542f\u7528'), 38 | ), 39 | migrations.AlterField( 40 | model_name='errordata', 41 | name='scrapyname', 42 | field=models.CharField(max_length=50, verbose_name='\u722c\u866b\u540d'), 43 | ), 44 | ] 45 | -------------------------------------------------------------------------------- /datadeal/datadeal/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/migrations/__init__.py -------------------------------------------------------------------------------- /datadeal/datadeal/models.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from django.db import models 3 | from datadeal.settings import BASE_DIR 4 | from django.contrib.postgres.fields import HStoreField 5 | 6 | class scrapySetting(models.Model): 7 | KIND_CHOICES = ((1, u'列表及详情'),(2, u'列表'),(3,u'单页面'),(4,u'其他')) 8 | name = models.CharField(u'名称',max_length=20,help_text='不要输入中文和特殊符号') 9 | allow_domains = models.ManyToManyField('AllowDomains',verbose_name=u'域名白名单') 10 | start_urls = models.ManyToManyField('startUrls',verbose_name=u'一级爬取地址列表') 11 | cycleobj = models.ManyToManyField('CycleObj',verbose_name=u'循环体',blank=True) 12 | variable = models.ManyToManyField('Variable',verbose_name=u'非循环变量',blank=True) 13 | num = models.IntegerField(u'单个任务链接数',default=1) 14 | kind = models.IntegerField(u'类型', choices=KIND_CHOICES,default=1) 15 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 16 | modify_at = models.DateTimeField(u'修改时间', auto_now=True) 17 | 18 | def __unicode__(self): 19 | return self.name 20 | 21 | class Meta: 22 | verbose_name = u'爬虫配置' 23 | verbose_name_plural = u'爬虫配置管理' 24 | 25 | class AllowDomains(models.Model): 26 | name = models.CharField(u'名称',max_length=500) 27 | 28 | def __unicode__(self): 29 | return self.name 30 | 31 | class 
Meta: 32 | verbose_name = u'域名白名单' 33 | verbose_name_plural = u'域名白名单管理' 34 | 35 | class startUrls(models.Model): 36 | url = models.URLField(u'名称',max_length=500) 37 | 38 | def __unicode__(self): 39 | return self.url 40 | 41 | class Meta: 42 | verbose_name = u'一级爬取地址列表' 43 | verbose_name_plural = u'一级爬取地址列表管理' 44 | 45 | class CycleObj(models.Model): 46 | name = models.CharField(u'循环体名称',max_length=50,help_text='不要输入中文和特殊符号') 47 | xpath = models.CharField(u'查询规则',max_length=200,help_text='使用xpath规则:\nnodename 选择所有目前节的子节\n/ 从根节进行选择\n// 选择文档中相吻合的节而不管其在文档的何处\n. 选择当前节\n.. 当前节的父节\n@ 选择属性') 48 | variable = models.ManyToManyField('Variable',verbose_name=u'变量') 49 | 50 | def __unicode__(self): 51 | return self.name 52 | 53 | class Meta: 54 | verbose_name = u'循环体列表' 55 | verbose_name_plural = u'循环体列表管理' 56 | 57 | class Variable(models.Model): 58 | KIND_CHOICES = ((1, u'一级变量'), (2, u'二级变量'),(3,u'二级链接')) 59 | name = models.CharField(u'变量名称',max_length=50,help_text='不要输入中文和特殊符号,建议用对应的循环体做前缀加以区分') 60 | xpath = models.CharField(u'查询规则',max_length=200,help_text='使用xpath规则:\nnodename 选择所有目前节的子节\n/ 从根节进行选择\n// 选择文档中相吻合的节而不管其在文档的何处\n. 选择当前节\n.. 当前节的父节\n@ 选择属性') 61 | kind = models.IntegerField(u'类型', choices=KIND_CHOICES,default=1) 62 | all_text = models.BooleanField(u'子标签文本提取',default=False,help_text='将提取该标签下文本及所有子标签文本,开启后不要写/text()') 63 | 64 | def __unicode__(self): 65 | return self.name 66 | 67 | class Meta: 68 | verbose_name = u'变量列表' 69 | verbose_name_plural = u'变量列表管理' 70 | 71 | class scrapyList(models.Model): 72 | name = models.CharField(u'名称',max_length=500) 73 | priority = models.IntegerField(u'项目优先级',default=10,help_text='值越小越优先') 74 | alarm_day = models.IntegerField(u'预警天数',default=30,help_text='超过时间无数据则生成预警') 75 | is_open = models.BooleanField(u'是否启用',default=True) 76 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 77 | 78 | def __unicode__(self): 79 | return self.name 80 | 81 | class Meta: 82 | verbose_name = u'爬虫列表' 83 | verbose_name_plural = u'爬虫列表管理' 84 | 85 | class SpiderData(models.Model): 86 | scrapyname = models.CharField(u'项目来源',max_length=50) 87 | uid = models.CharField(u'设备来源',max_length=50) 88 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 89 | data = HStoreField() 90 | url = models.CharField(u'访问地址',max_length=300,null=True,blank=True) 91 | file = models.CharField(u'原页面',max_length=100,null=True,blank=True) 92 | 93 | def __unicode__(self): 94 | return self.scrapyname 95 | 96 | def data_str(self): 97 | data_str = '' 98 | for key,val in self.data.items(): 99 | if not val: 100 | val = '' 101 | data_str += key+'=>'+val+'///' 102 | return data_str 103 | data_str.short_description = u'数据信息' 104 | 105 | def page_pdf(self): 106 | if self.file: 107 | url = '/medias/web/'+self.file 108 | return '%s' % (url,self.file) 109 | else: 110 | return '' 111 | page_pdf.allow_tags = True 112 | page_pdf.short_description = u'页面pdf' 113 | 114 | class Meta: 115 | verbose_name = u'数据信息' 116 | verbose_name_plural = u'数据信息管理' 117 | 118 | class ErrorData(models.Model): 119 | scrapyname = models.CharField(u'爬虫名',max_length=50) 120 | uid = models.CharField(u'设备来源',max_length=50) 121 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 122 | url = models.CharField(u'访问地址',max_length=300) 123 | content = models.CharField(u'错误信息',max_length=300) 124 | 125 | def __unicode__(self): 126 | return self.scrapyname 127 | 128 | class Meta: 129 | verbose_name = u'错误信息' 130 | verbose_name_plural = u'错误信息管理' 131 | 132 | 133 | class DataAlarm(models.Model): 134 | scrapyname = 
models.CharField(u'爬虫名',max_length=50) 135 | is_alarm = models.BooleanField(u'是否预警',default=True) 136 | remark = models.TextField(u'原因备注',null=True,blank=True) 137 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 138 | 139 | def __unicode__(self): 140 | return self.scrapyname 141 | 142 | class Meta: 143 | verbose_name = u'爬虫预警' 144 | verbose_name_plural = u'爬虫预警管理' -------------------------------------------------------------------------------- /datadeal/datadeal/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for datadeal project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.9. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.9/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | DB_USER = "postgres" 19 | DB_PASSWORD = 'bigdata123' 20 | DB_HOST = '10.20.1.50' 21 | DB_NAME = 'scrapydata' 22 | # Quick-start development settings - unsuitable for production 23 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/ 24 | 25 | # SECURITY WARNING: keep the secret key used in production secret! 26 | SECRET_KEY = '02kavqnmt!id6az-9*7i0q*r!ek-vx&f87u$=cqnky()ic!52(' 27 | 28 | # SECURITY WARNING: don't run with debug turned on in production! 29 | DEBUG = True 30 | 31 | ALLOWED_HOSTS = [] 32 | 33 | 34 | # Application definition 35 | 36 | INSTALLED_APPS = [ 37 | 'django.contrib.admin', 38 | 'django.contrib.auth', 39 | 'django.contrib.contenttypes', 40 | 'django.contrib.sessions', 41 | 'django.contrib.messages', 42 | 'django.contrib.staticfiles', 43 | 'django.contrib.postgres', 44 | 'crispy_forms', 45 | 'xadmin', 46 | 'datadeal', 47 | 'distribute', 48 | 'company', 49 | ] 50 | 51 | MIDDLEWARE_CLASSES = [ 52 | 'django.middleware.security.SecurityMiddleware', 53 | 'django.contrib.sessions.middleware.SessionMiddleware', 54 | 'django.middleware.common.CommonMiddleware', 55 | # 'django.middleware.csrf.CsrfViewMiddleware', 56 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 57 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware', 58 | 'django.contrib.messages.middleware.MessageMiddleware', 59 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 60 | ] 61 | 62 | ROOT_URLCONF = 'datadeal.urls' 63 | 64 | TEMPLATES = [ 65 | { 66 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 67 | 'DIRS': [], 68 | 'APP_DIRS': True, 69 | 'OPTIONS': { 70 | 'context_processors': [ 71 | 'django.template.context_processors.debug', 72 | 'django.template.context_processors.request', 73 | 'django.contrib.auth.context_processors.auth', 74 | 'django.contrib.messages.context_processors.messages', 75 | ], 76 | }, 77 | }, 78 | ] 79 | 80 | WSGI_APPLICATION = 'datadeal.wsgi.application' 81 | 82 | 83 | # Database 84 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases 85 | 86 | DATABASES = { 87 | 'default': { 88 | 'ENGINE': 'django.db.backends.postgresql_psycopg2', 89 | 'NAME': DB_NAME, 90 | 'USER': DB_USER, 91 | 'PASSWORD': DB_PASSWORD, 92 | 'HOST': DB_HOST, 93 | } 94 | } 95 | 96 | # Password validation 97 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators 98 | 99 | AUTH_PASSWORD_VALIDATORS = [ 100 | { 101 | 'NAME': 
'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 102 | }, 103 | { 104 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 105 | }, 106 | { 107 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 108 | }, 109 | { 110 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 111 | }, 112 | ] 113 | 114 | 115 | # Internationalization 116 | # https://docs.djangoproject.com/en/1.9/topics/i18n/ 117 | 118 | LANGUAGE_CODE = 'zh-Hans' 119 | 120 | TIME_ZONE = 'Asia/Shanghai' 121 | 122 | USE_I18N = True 123 | 124 | USE_L10N = True 125 | 126 | USE_TZ = False 127 | 128 | 129 | # Static files (CSS, JavaScript, Images) 130 | # https://docs.djangoproject.com/en/1.9/howto/static-files/ 131 | 132 | STATIC_URL = '/static/' 133 | MEDIA_URL = '/medias/' 134 | STATIC_ROOT = os.path.join(os.path.dirname(__file__), './static/').replace('\\', '/') 135 | MEDIA_ROOT = os.path.join(os.path.dirname(__file__), './medias/').replace('\\', '/') -------------------------------------------------------------------------------- /datadeal/datadeal/static/css/jquery.dataTables.min.css: -------------------------------------------------------------------------------- 1 | table.dataTable{width:100%;margin:0 auto;clear:both;border-collapse:separate;border-spacing:0}table.dataTable thead th,table.dataTable tfoot th{font-weight:bold}table.dataTable thead th,table.dataTable thead td{padding:10px 18px;border-bottom:1px solid #111}table.dataTable thead th:active,table.dataTable thead td:active{outline:none}table.dataTable tfoot th,table.dataTable tfoot td{padding:10px 18px 6px 18px;border-top:1px solid #111}table.dataTable thead .sorting,table.dataTable thead .sorting_asc,table.dataTable thead .sorting_desc,table.dataTable thead .sorting_asc_disabled,table.dataTable thead .sorting_desc_disabled{cursor:pointer;*cursor:hand}table.dataTable thead .sorting,table.dataTable thead .sorting_asc,table.dataTable thead .sorting_desc,table.dataTable thead .sorting_asc_disabled,table.dataTable thead .sorting_desc_disabled{background-repeat:no-repeat;background-position:center right}table.dataTable thead .sorting{background-image:url("../images/sort_both.png")}table.dataTable thead .sorting_asc{background-image:url("../images/sort_asc.png")}table.dataTable thead .sorting_desc{background-image:url("../images/sort_desc.png")}table.dataTable thead .sorting_asc_disabled{background-image:url("../images/sort_asc_disabled.png")}table.dataTable thead .sorting_desc_disabled{background-image:url("../images/sort_desc_disabled.png")}table.dataTable tbody tr{background-color:#ffffff}table.dataTable tbody tr.selected{background-color:#B0BED9}table.dataTable tbody th,table.dataTable tbody td{padding:8px 10px}table.dataTable.row-border tbody th,table.dataTable.row-border tbody td,table.dataTable.display tbody th,table.dataTable.display tbody td{border-top:1px solid #ddd}table.dataTable.row-border tbody tr:first-child th,table.dataTable.row-border tbody tr:first-child td,table.dataTable.display tbody tr:first-child th,table.dataTable.display tbody tr:first-child td{border-top:none}table.dataTable.cell-border tbody th,table.dataTable.cell-border tbody td{border-top:1px solid #ddd;border-right:1px solid #ddd}table.dataTable.cell-border tbody tr th:first-child,table.dataTable.cell-border tbody tr td:first-child{border-left:1px solid #ddd}table.dataTable.cell-border tbody tr:first-child th,table.dataTable.cell-border tbody tr:first-child 
td{border-top:none}table.dataTable.stripe tbody tr.odd,table.dataTable.display tbody tr.odd{background-color:#f9f9f9}table.dataTable.stripe tbody tr.odd.selected,table.dataTable.display tbody tr.odd.selected{background-color:#acbad4}table.dataTable.hover tbody tr:hover,table.dataTable.display tbody tr:hover{background-color:#f6f6f6}table.dataTable.hover tbody tr:hover.selected,table.dataTable.display tbody tr:hover.selected{background-color:#aab7d1}table.dataTable.order-column tbody tr>.sorting_1,table.dataTable.order-column tbody tr>.sorting_2,table.dataTable.order-column tbody tr>.sorting_3,table.dataTable.display tbody tr>.sorting_1,table.dataTable.display tbody tr>.sorting_2,table.dataTable.display tbody tr>.sorting_3{background-color:#fafafa}table.dataTable.order-column tbody tr.selected>.sorting_1,table.dataTable.order-column tbody tr.selected>.sorting_2,table.dataTable.order-column tbody tr.selected>.sorting_3,table.dataTable.display tbody tr.selected>.sorting_1,table.dataTable.display tbody tr.selected>.sorting_2,table.dataTable.display tbody tr.selected>.sorting_3{background-color:#acbad5}table.dataTable.display tbody tr.odd>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd>.sorting_1{background-color:#f1f1f1}table.dataTable.display tbody tr.odd>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd>.sorting_2{background-color:#f3f3f3}table.dataTable.display tbody tr.odd>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd>.sorting_3{background-color:whitesmoke}table.dataTable.display tbody tr.odd.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_1{background-color:#a6b4cd}table.dataTable.display tbody tr.odd.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_2{background-color:#a8b5cf}table.dataTable.display tbody tr.odd.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_3{background-color:#a9b7d1}table.dataTable.display tbody tr.even>.sorting_1,table.dataTable.order-column.stripe tbody tr.even>.sorting_1{background-color:#fafafa}table.dataTable.display tbody tr.even>.sorting_2,table.dataTable.order-column.stripe tbody tr.even>.sorting_2{background-color:#fcfcfc}table.dataTable.display tbody tr.even>.sorting_3,table.dataTable.order-column.stripe tbody tr.even>.sorting_3{background-color:#fefefe}table.dataTable.display tbody tr.even.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_1{background-color:#acbad5}table.dataTable.display tbody tr.even.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_2{background-color:#aebcd6}table.dataTable.display tbody tr.even.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_3{background-color:#afbdd8}table.dataTable.display tbody tr:hover>.sorting_1,table.dataTable.order-column.hover tbody tr:hover>.sorting_1{background-color:#eaeaea}table.dataTable.display tbody tr:hover>.sorting_2,table.dataTable.order-column.hover tbody tr:hover>.sorting_2{background-color:#ececec}table.dataTable.display tbody tr:hover>.sorting_3,table.dataTable.order-column.hover tbody tr:hover>.sorting_3{background-color:#efefef}table.dataTable.display tbody tr:hover.selected>.sorting_1,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_1{background-color:#a2aec7}table.dataTable.display tbody tr:hover.selected>.sorting_2,table.dataTable.order-column.hover tbody 
tr:hover.selected>.sorting_2{background-color:#a3b0c9}table.dataTable.display tbody tr:hover.selected>.sorting_3,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_3{background-color:#a5b2cb}table.dataTable.no-footer{border-bottom:1px solid #111}table.dataTable.nowrap th,table.dataTable.nowrap td{white-space:nowrap}table.dataTable.compact thead th,table.dataTable.compact thead td{padding:4px 17px 4px 4px}table.dataTable.compact tfoot th,table.dataTable.compact tfoot td{padding:4px}table.dataTable.compact tbody th,table.dataTable.compact tbody td{padding:4px}table.dataTable th.dt-left,table.dataTable td.dt-left{text-align:left}table.dataTable th.dt-center,table.dataTable td.dt-center,table.dataTable td.dataTables_empty{text-align:center}table.dataTable th.dt-right,table.dataTable td.dt-right{text-align:right}table.dataTable th.dt-justify,table.dataTable td.dt-justify{text-align:justify}table.dataTable th.dt-nowrap,table.dataTable td.dt-nowrap{white-space:nowrap}table.dataTable thead th.dt-head-left,table.dataTable thead td.dt-head-left,table.dataTable tfoot th.dt-head-left,table.dataTable tfoot td.dt-head-left{text-align:left}table.dataTable thead th.dt-head-center,table.dataTable thead td.dt-head-center,table.dataTable tfoot th.dt-head-center,table.dataTable tfoot td.dt-head-center{text-align:center}table.dataTable thead th.dt-head-right,table.dataTable thead td.dt-head-right,table.dataTable tfoot th.dt-head-right,table.dataTable tfoot td.dt-head-right{text-align:right}table.dataTable thead th.dt-head-justify,table.dataTable thead td.dt-head-justify,table.dataTable tfoot th.dt-head-justify,table.dataTable tfoot td.dt-head-justify{text-align:justify}table.dataTable thead th.dt-head-nowrap,table.dataTable thead td.dt-head-nowrap,table.dataTable tfoot th.dt-head-nowrap,table.dataTable tfoot td.dt-head-nowrap{white-space:nowrap}table.dataTable tbody th.dt-body-left,table.dataTable tbody td.dt-body-left{text-align:left}table.dataTable tbody th.dt-body-center,table.dataTable tbody td.dt-body-center{text-align:center}table.dataTable tbody th.dt-body-right,table.dataTable tbody td.dt-body-right{text-align:right}table.dataTable tbody th.dt-body-justify,table.dataTable tbody td.dt-body-justify{text-align:justify}table.dataTable tbody th.dt-body-nowrap,table.dataTable tbody td.dt-body-nowrap{white-space:nowrap}table.dataTable,table.dataTable th,table.dataTable td{-webkit-box-sizing:content-box;box-sizing:content-box}.dataTables_wrapper{position:relative;clear:both;*zoom:1;zoom:1}.dataTables_wrapper .dataTables_length{float:left}.dataTables_wrapper .dataTables_filter{float:right;text-align:right}.dataTables_wrapper .dataTables_filter input{margin-left:0.5em}.dataTables_wrapper .dataTables_info{clear:both;float:left;padding-top:0.755em}.dataTables_wrapper .dataTables_paginate{float:right;text-align:right;padding-top:0.25em}.dataTables_wrapper .dataTables_paginate .paginate_button{box-sizing:border-box;display:inline-block;min-width:1.5em;padding:0.5em 1em;margin-left:2px;text-align:center;text-decoration:none !important;cursor:pointer;*cursor:hand;color:#333 !important;border:1px solid transparent;border-radius:2px}.dataTables_wrapper .dataTables_paginate .paginate_button.current,.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover{color:#333 !important;border:1px solid #979797;background-color:white;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #fff), color-stop(100%, #dcdcdc));background:-webkit-linear-gradient(top, #fff 0%, #dcdcdc 
100%);background:-moz-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-ms-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-o-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:linear-gradient(to bottom, #fff 0%, #dcdcdc 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button.disabled,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:active{cursor:default;color:#666 !important;border:1px solid transparent;background:transparent;box-shadow:none}.dataTables_wrapper .dataTables_paginate .paginate_button:hover{color:white !important;border:1px solid #111;background-color:#585858;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #585858), color-stop(100%, #111));background:-webkit-linear-gradient(top, #585858 0%, #111 100%);background:-moz-linear-gradient(top, #585858 0%, #111 100%);background:-ms-linear-gradient(top, #585858 0%, #111 100%);background:-o-linear-gradient(top, #585858 0%, #111 100%);background:linear-gradient(to bottom, #585858 0%, #111 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button:active{outline:none;background-color:#2b2b2b;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #2b2b2b), color-stop(100%, #0c0c0c));background:-webkit-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-moz-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-ms-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-o-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:linear-gradient(to bottom, #2b2b2b 0%, #0c0c0c 100%);box-shadow:inset 0 0 3px #111}.dataTables_wrapper .dataTables_paginate .ellipsis{padding:0 1em}.dataTables_wrapper .dataTables_processing{position:absolute;top:50%;left:50%;width:100%;height:40px;margin-left:-50%;margin-top:-25px;padding-top:20px;text-align:center;font-size:1.2em;background-color:white;background:-webkit-gradient(linear, left top, right top, color-stop(0%, rgba(255,255,255,0)), color-stop(25%, rgba(255,255,255,0.9)), color-stop(75%, rgba(255,255,255,0.9)), color-stop(100%, rgba(255,255,255,0)));background:-webkit-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-moz-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-ms-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-o-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:linear-gradient(to right, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%)}.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter,.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_processing,.dataTables_wrapper .dataTables_paginate{color:#333}.dataTables_wrapper .dataTables_scroll{clear:both}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody{*margin-top:-1px;-webkit-overflow-scrolling:touch}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th,.dataTables_wrapper .dataTables_scroll 
div.dataTables_scrollBody>table>tbody>tr>td{vertical-align:middle}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>td>div.dataTables_sizing{height:0;overflow:hidden;margin:0 !important;padding:0 !important}.dataTables_wrapper.no-footer .dataTables_scrollBody{border-bottom:1px solid #111}.dataTables_wrapper.no-footer div.dataTables_scrollHead>table,.dataTables_wrapper.no-footer div.dataTables_scrollBody>table{border-bottom:none}.dataTables_wrapper:after{visibility:hidden;display:block;content:"";clear:both;height:0}@media screen and (max-width: 767px){.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_paginate{float:none;text-align:center}.dataTables_wrapper .dataTables_paginate{margin-top:0.5em}}@media screen and (max-width: 640px){.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter{float:none;text-align:center}.dataTables_wrapper .dataTables_filter{margin-top:0.5em}} 2 | -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/Sorting icons.psd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/Sorting icons.psd -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/favicon.ico -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/sort_asc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_asc.png -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/sort_asc_disabled.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_asc_disabled.png -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/sort_both.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_both.png -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/sort_desc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_desc.png -------------------------------------------------------------------------------- /datadeal/datadeal/static/images/sort_desc_disabled.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_desc_disabled.png -------------------------------------------------------------------------------- /datadeal/datadeal/static/js/iframe_common.js: -------------------------------------------------------------------------------- 1 | function get_parent(child,xpath){ 2 | var tag = child.get(0).tagName.toLocaleLowerCase(); 3 | var i = child.index(); 4 | if(child.parent().children().length == 1){ 5 | var tag_name = tag; 6 | }else{ 7 | var tag_name = tag+'['+(i+1)+']'; 8 | } 9 | if(child.parent().attr('id')){ 10 | var id = child.parent().attr('id'); 11 | var root_name = child.parent().get(0).tagName.toLocaleLowerCase(); 12 | xpath = '//'+root_name+'[@id=\''+id+'\']/'+tag_name+xpath; 13 | return xpath; 14 | }else{ 15 | child = child.parent(); 16 | xpath = '/'+tag_name+xpath; 17 | if(tag_name=='html'){ 18 | return xpath; 19 | }else{ 20 | return get_parent(child,xpath); 21 | } 22 | } 23 | } 24 | 25 | function get_xpath(that){ 26 | var xpath = ''; 27 | xpath = get_parent(that,xpath); 28 | return xpath; 29 | } -------------------------------------------------------------------------------- /datadeal/datadeal/static/js/iframe_detail.js: -------------------------------------------------------------------------------- 1 | var variable = []; 2 | var v_dict = {}; 3 | $('*').click(function(){ 4 | if($(this).children().length == 0){ 5 | var xpath = get_xpath($(this)); 6 | $(this).css('border','3px dotted #000'); 7 | if($.inArray(xpath, variable) == -1){ 8 | variable.push(xpath); 9 | v_dict[xpath] = $(this); 10 | parent.set_variable(variable,'detail_table'); 11 | } 12 | } 13 | }) 14 | 15 | function childrenup(xpath) { 16 | change_xpath = xpath.replace(/danyin/g,'\''); 17 | var that = v_dict[change_xpath]; 18 | var new_xpath = get_xpath(that.parent()); 19 | if($.inArray(xpath, variable) != -1){ 20 | variable.splice($.inArray(xpath,variable),1); 21 | that.css('border',''); 22 | that.parent().css('border','3px dotted #000'); 23 | if($.inArray(new_xpath, variable) == -1){ 24 | variable.push(new_xpath); 25 | v_dict[new_xpath] = that.parent(); 26 | parent.set_variable(variable,'detail_table'); 27 | } 28 | } 29 | 30 | } 31 | 32 | function del_v(xpath){ 33 | change_xpath = xpath.replace(/danyin/g,'\''); 34 | var that = v_dict[change_xpath]; 35 | if($.inArray(xpath, variable) != -1){ 36 | variable.splice($.inArray(xpath,variable),1); 37 | that.css('border',''); 38 | } 39 | } -------------------------------------------------------------------------------- /datadeal/datadeal/static/js/iframe_list.js: -------------------------------------------------------------------------------- 1 | function get_cycle(str1,str2){ 2 | var cut = 0; 3 | for(i in str1){ 4 | if(str1[i] == '['){ 5 | cut = i; 6 | } 7 | if(str1[i] != str2[i]){ 8 | break 9 | } 10 | } 11 | var cycle = str1.substr(0,cut); 12 | var variable = str2.substr(cut).split('/'); 13 | variable.splice(0,1); 14 | variable = variable.join('/'); 15 | return [cycle,variable] 16 | } 17 | 18 | var choice = []; 19 | var cycle = ''; 20 | var v_list = []; 21 | $('*').click(function(){ 22 | if($(this).children().length == 0){ 23 | var xpath = get_xpath($(this)); 24 | choice.push(xpath); 25 | if(choice.length > 1){ 26 | var array = get_cycle(choice[0],choice[choice.length-1]); 27 | cycle = array[0]; 28 | parent.set_cycle(cycle); 29 | var variable = array[1]; 30 | if(variable){ 
31 | if($.inArray(variable, v_list) == -1){ 32 | v_list.push(variable); 33 | parent.set_variable(v_list,'list_table'); 34 | } 35 | } 36 | } 37 | $(this).css('border','3px dotted #000'); 38 | } 39 | }) -------------------------------------------------------------------------------- /datadeal/datadeal/templates/detail_iframe.html: -------------------------------------------------------------------------------- 1 | 山东省环境保护厅--2017年山东省砖瓦行业环保专项行动排查情况 2 | 3 | 33 | 34 | 57 | 58 |
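The get_cycle() helper in iframe_list.js above is the heart of the list-page picker: from the first and the most recently clicked XPath it derives the repeating row XPath (the "cycle") and the per-row variable path that the generated spider extracts from each row. A minimal Python sketch of the same prefix-splitting logic, with made-up example XPaths (nothing below is part of the repository):

def get_cycle(xpath_a, xpath_b):
    # index of the last '[' seen before the two clicked XPaths diverge
    cut = 0
    for i, (a, b) in enumerate(zip(xpath_a, xpath_b)):
        if a == '[':
            cut = i
        if a != b:
            break
    cycle = xpath_a[:cut]                   # shared row XPath, e.g. //table[@id='list']/tr
    tail = xpath_b[cut:].split('/')[1:]     # drop the differing index segment, e.g. "[2]"
    return cycle, '/'.join(tail)            # per-row variable path, e.g. td[2]/a

# get_cycle("//table[@id='list']/tr[1]/td[1]/a",
#           "//table[@id='list']/tr[2]/td[2]/a")
# returns ("//table[@id='list']/tr", "td[2]/a")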
[Saved snapshot used by the detail-picker iframe: the page's markup, navigation menus and footer were lost when the repository was flattened; only the recoverable article text is kept below.]
2017年山东省砖瓦行业环保专项行动排查情况
发布日期:2017-03-29    来源:    作者:
为贯彻落实环境保护部办公厅《关于督促开展砖瓦行业环保专项行动的函》(环办环监函〔2017〕33号)要求,我厅组织各市开展了山东省2017年砖瓦行业环保专项行动。经各市环保局排查,全省共有砖瓦企业1970家(详见附表)。
附表:山东省砖瓦企业名单
附件:
281 | 294 | 295 | 296 | 298 | 299 | 300 | 301 | {% load staticfiles %} -------------------------------------------------------------------------------- /datadeal/datadeal/templates/files_admin.html: -------------------------------------------------------------------------------- 1 | {% extends 'xadmin/base_site.html' %} 2 | {% load i18n l10n %} 3 | {% load xadmin_tags %} 4 | {% load static %} 5 | {% load staticfiles %} 6 | {% block title %} 7 | 文件管理 8 | {% endblock %} 9 | {% block content-nav %}{% endblock %} 10 | 11 | {% block breadcrumbs %} 12 | 16 | {% endblock %} 17 | {% block content %} 18 | 19 | 20 | 选择文件夹: 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 |
文件名 创建时间 操作
文件名 创建时间 操作
38 | 39 | 64 | 118 | {% endblock %} -------------------------------------------------------------------------------- /datadeal/datadeal/templates/images_admin.html: -------------------------------------------------------------------------------- 1 | {% extends 'xadmin/base_site.html' %} 2 | {% load i18n l10n %} 3 | {% load xadmin_tags %} 4 | {% load static %} 5 | {% block title %} 6 | 图片管理 7 | {% endblock %} 8 | {% block content-nav %}{% endblock %} 9 | 10 | {% block breadcrumbs %} 11 | 15 | {% endblock %} 16 | {% block content %} 17 | 20 | 21 |
22 |
23 | {% for item in img_list.object_list %} 24 |
25 |
26 | 27 | 28 | 29 |
30 |

31 | 文件名:{{item.url}} 32 | 创建时间:{{item.ctime}} 33 |

34 |
35 | 36 | {% endfor %} 37 |
38 |
39 | 54 | 55 | 69 | {% endblock %} -------------------------------------------------------------------------------- /datadeal/datadeal/templates/index.html: -------------------------------------------------------------------------------- 1 | {% load staticfiles %} 2 | 3 | 4 | 5 | 6 | Index 7 | 8 | 19 | 20 | 21 |
22 | {% for k,val in kind %} 23 |
24 | {{val}} 25 |
26 | {% endfor %} 27 |
28 | 39 | 40 | 50 | 123 | 124 | -------------------------------------------------------------------------------- /datadeal/datadeal/urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from django.conf.urls.static import static 3 | from django.conf.urls import patterns, include, url 4 | from django.views.static import serve 5 | from datadeal import settings 6 | from .views import * 7 | import xadmin 8 | xadmin.autodiscover() 9 | 10 | urlpatterns = [ 11 | url(r'^admin/', include(xadmin.site.urls)), 12 | url(r'^$', IndexView.as_view() ,name='index'), 13 | url(r'^list/$', ListFrameView.as_view() ,name='list'), 14 | url(r'^detail/$', DetailFrameView.as_view() ,name='detail'), 15 | url(r'^back_html/$', AjaxBackHtmlView.as_view() ,name='back_html'), 16 | url(r'^upload_files/$', UploadFilesView.as_view() ,name='upload_files'), 17 | url(r'^zip_files/$', ZipFilesView.as_view() ,name='zip_files'), 18 | url(r'^del_file/$', DeleteFilesView.as_view() ,name='del_file'), 19 | url(r'^distribute/',include('distribute.urls')), 20 | url(r'^medias/(?P.*)$', serve, {'document_root':settings.MEDIA_ROOT}), 21 | url(r'^static/(?P.*)$', serve, {'document_root':settings.STATIC_ROOT}), 22 | ] -------------------------------------------------------------------------------- /datadeal/datadeal/views.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.views.generic import TemplateView,View 3 | from django.http import HttpResponse,HttpResponseRedirect 4 | from datadeal.settings import BASE_DIR 5 | from .models import scrapySetting 6 | import urllib,urllib2 7 | import bs4 8 | import os 9 | 10 | def get_nocycle_variables(nocycle_variable): 11 | v_dict = '' 12 | result = '' 13 | nvname_list = [] 14 | for nov in nocycle_variable: 15 | nvname_list.append(nov.name) 16 | v_dict += '\'%s\':%s,' % (nov.name,nov.name) 17 | if nov.all_text: 18 | all_text = '.xpath(\'string(.)\')' 19 | else: 20 | all_text = '' 21 | result += ' '*8+nov.name+' = response.xpath(\''+nov.xpath+'\')'+all_text+'.extract_first()\n' 22 | return v_dict,result,nvname_list 23 | 24 | 25 | def create_scrapy_file(q): 26 | """ 27 | 创建scrapy爬虫文件方法 28 | """ 29 | allow_domains = '[' 30 | for i,val in enumerate(q.allow_domains.all()): 31 | if i == len(q.allow_domains.all())-1: 32 | allow_domains += '"'+val.name+'"' 33 | else: 34 | allow_domains += '"'+val.name+'"'+ ',' 35 | allow_domains += ']' 36 | start_requests = ' def start_requests(self):\n'+' '*8+'results = getTasks(\''+q.name+'\')\n'+' '*8+'self.taks_urls = {}\n'+' '*8+'self.tasks = {}\n'+' '*8+'if isinstance(results,dict):\n'+' '*12+'print results[\'error\']\n'+' '*8+'else:\n'+' '*12+'for re in results:\n'+' '*16+'self.tasks[re[\'id\']] = {\'t_count\':len(re[\'urls\']),\'count\':0}\n'+' '*16+'for u in re[\'urls\']:\n'+' '*20+'self.taks_urls[u] = re[\'id\']\n'+' '*20+'yield self.make_requests_from_url(u)\n\n' 37 | after_parse = ' def after_parse(self,url):\n'+' '*8+'task_id = self.taks_urls[url]\n'+' '*8+'self.tasks[task_id][\'count\'] += 1\n'+' '*8+'if self.tasks[task_id][\'count\'] == self.tasks[task_id][\'t_count\']:\n'+' '*12+'afterTasks(task_id)\n\n' 38 | 39 | nocycle_variable = q.variable.filter(kind=1) 40 | v_dict,cycleobj,nvname_list = get_nocycle_variables(nocycle_variable) 41 | cycleobjs = q.cycleobj.all() 42 | if len(cycleobjs): 43 | next_url = '' #判断是否有子查询链接 44 | next_variable = [] 45 | v_list = [] 46 | total_v_list = [] 47 | total_v_list += nvname_list 48 | v_list 
+= nvname_list 49 | c_dict = '' 50 | for c in cycleobjs: 51 | variables = c.variable.all() 52 | variable = '' 53 | for v in variables: 54 | #包含子标签文本提取 55 | if v.all_text: 56 | v_str = '%s = i.xpath(\'%s\').xpath(\'string(.)\').extract_first()\n' % (v.name,v.xpath) 57 | else: 58 | v_str = '%s = i.xpath(\'%s\').extract_first()\n' % (v.name,v.xpath) 59 | if v.kind == 1: 60 | variable += ' '*12+v_str 61 | v_dict += '\'%s\':%s,' % (v.name,v.name) 62 | c_dict += '\'%s\':%s,' % (v.name,v.name) 63 | v_list.append(v.name) 64 | total_v_list.append(v.name) 65 | elif v.kind == 2: 66 | next_variable.append({'name':v.name,'xpath':v.xpath,'all_text':v.all_text}) 67 | if not v.name in total_v_list: 68 | total_v_list.append(v.name) 69 | elif v.kind == 3: 70 | next_url = v.name 71 | variable += ' '*12+v_str 72 | if next_url: 73 | cycleobj += ' '*8+c.name+' = response.xpath(\''+c.xpath+'\')\n'+' '*8+'for i in %s:\n%s' % (c.name,variable) 74 | cycleobj += ' '*12+next_url+' = set_url_head('+next_url+',response.url)\n'+' '*12+'if '+next_url+':\n'+' '*16+'yield scrapy.Request('+next_url+', callback=self.parse_item,meta={'+v_dict+'})\n' 75 | else: 76 | nvname_list.append(c.name+'_data') 77 | c_dict = c_dict[0:-1] 78 | cycleobj += ' '*8+c.name+' = response.xpath(\''+c.xpath+'\')\n'+' '*8+c.name+'_data = []\n'+' '*8+'for i in '+c.name+':\n'+variable+' '*12+c.name+'_data.append({'+c_dict+'})\n' 79 | cycleobj += ' '*8+'self.after_parse(response.url)\n' 80 | if next_url: 81 | cycleobj += '\n'+' '*4+'def parse_item(self, response):\n' 82 | for vl in v_list: 83 | cycleobj += ' '*8+vl+' = response.meta[\''+vl+'\']\n' 84 | for nv in next_variable: 85 | if nv['all_text']: 86 | cycleobj += ' '*8+'%s = response.xpath(\'%s\').xpath(\'string(.)\').extract_first()\n' % (nv['name'],nv['xpath']) 87 | else: 88 | cycleobj += ' '*8+'%s = response.xpath(\'%s\').extract_first()\n' % (nv['name'],nv['xpath']) 89 | data = '' 90 | for total in total_v_list: 91 | data += '\'%s\':%s,' % (total,total) 92 | data = data[0:-1] 93 | cycleobj += ' '*8+'sendData(\'%s\',{%s},response.url)' % (q.name,data) 94 | else: 95 | no_next = '' 96 | for n in nvname_list: 97 | no_next += '\'%s\':%s,' % (n,n) 98 | no_next = no_next[0:-1] 99 | cycleobj += ' '*8+'sendData(\'%s\',{%s},response.url)' % (q.name,no_next) 100 | else: # 单页面爬虫 101 | v_dict = v_dict[0:-1] 102 | cycleobj = nv_str+' '*8+'data = {'+v_dict+'}\n'+' '*8+'self.after_parse(response.url)\n'+' '*8+'sendData(\''+q.name+'\',data,response.url)' 103 | 104 | with open(BASE_DIR+'/../searchInfo/searchInfo/spiders/%s.py' % q.name,'w') as f: 105 | f.write('# -*- coding: utf-8 -*-\nimport scrapy\nfrom distribute.views import *\n\nclass '+q.name+'Spider(scrapy.Spider):\n name = "'+q.name+'"\n allowed_domains = '+allow_domains+'\n\n'+start_requests+after_parse+' def parse(self, response):\n'+cycleobj) 106 | 107 | class IndexView(TemplateView): 108 | template_name = 'index.html' 109 | 110 | def get_context_data(self, **kwargs): 111 | context = super(IndexView, self).get_context_data() 112 | 113 | context.update({ 114 | 'kind':scrapySetting.KIND_CHOICES 115 | }) 116 | return context 117 | 118 | class AjaxBackHtmlView(View): 119 | 120 | def post(self,request): 121 | url = request.POST.get('url','') 122 | frame = request.POST.get('frame','') 123 | proto, rest = urllib.splittype(url) 124 | domain, rest = urllib.splithost(rest) 125 | f = urllib2.urlopen(url) 126 | result = f.read() 127 | soup = bs4.BeautifulSoup(result,'lxml') 128 | body = soup.find('html') 129 | # css样式链接替换 130 | link = body.find_all('link') 131 | 
for l in link: 132 | if l['href'].startswith('/'): 133 | l['href'] = proto+'://'+domain+l['href'] 134 | elif l['href'].startswith('../'): 135 | last = l['href'].split('../')[-1] 136 | l['href'] = proto+'://'+domain+'/'+last 137 | # js链接替换 138 | script = body.find_all('script') 139 | for s in script: 140 | if s.has_key('src'): 141 | if s['src'].startswith('/'): 142 | s['src'] = proto+'://'+domain+s['src'] 143 | elif s['src'].startswith('../'): 144 | last = s['src'].split('../')[-1] 145 | s['src'] = proto+'://'+domain+'/'+last 146 | # img链接替换 147 | img = body.find_all('img') 148 | for g in img: 149 | if g.has_key('src'): 150 | if g['src'].startswith('/'): 151 | g['src'] = proto+'://'+domain+g['src'] 152 | elif g['src'].startswith('../'): 153 | last = g['src'].split('../')[-1] 154 | g['src'] = proto+'://'+domain+'/'+last 155 | # a标签链接禁止点击 156 | a = body.find_all('a') 157 | for i in a: 158 | href = i['href'] 159 | i['href'] = 'javascript:void(0);' 160 | i['href_bak'] = href 161 | 162 | if frame == 'list_iframe': 163 | result = 'list/' 164 | with open(BASE_DIR+'/datadeal/templates/'+frame+'.html','w') as f: 165 | f.write(str(body)) 166 | f.write('\n{% load staticfiles %}') 167 | elif frame == 'detail_iframe': 168 | result = 'detail/' 169 | with open(BASE_DIR+'/datadeal/templates/'+frame+'.html','w') as f: 170 | f.write(str(body)) 171 | f.write('\n{% load staticfiles %}') 172 | else: 173 | result = 'other' 174 | return HttpResponse(result) 175 | 176 | class ListFrameView(TemplateView): 177 | template_name = 'list_iframe.html' 178 | 179 | class DetailFrameView(TemplateView): 180 | template_name = 'detail_iframe.html' 181 | 182 | 183 | class UploadFilesView(View): 184 | def post(self,request): 185 | count = 0 186 | for name,file in request.FILES.items(): 187 | dir_path = BASE_DIR+'/datadeal/medias/'+name.split('/')[0] 188 | if not os.path.exists(dir_path): 189 | os.makedirs(dir_path) 190 | path = BASE_DIR+'/datadeal/medias/'+name 191 | if not os.path.exists(path): 192 | with open(path, 'wb') as f: 193 | f.write(file.read()) 194 | count += 1 195 | return HttpResponse(u'已上传%s项文件' % count) 196 | 197 | class ZipFilesView(View): 198 | def post(self,request): 199 | file_type = request.POST.get('type','') 200 | import zipfile 201 | zp_name = BASE_DIR+'/datadeal/medias/'+file_type+'.zip' 202 | file_list = [] 203 | if os.path.exists(zp_name): 204 | z_r = zipfile.ZipFile(zp_name, mode='r') 205 | for filename in z_r.namelist(): 206 | file_list.append(filename) 207 | z_r.close() 208 | 209 | file_dir = BASE_DIR+'/datadeal/medias/'+file_type 210 | files = os.listdir(file_dir) 211 | for f in files: 212 | if not f in file_list: 213 | zpfd = zipfile.ZipFile(zp_name, mode='a',compression=zipfile.ZIP_DEFLATED) 214 | zpfd.write(file_dir+'/'+f,f) 215 | zpfd.close() 216 | return HttpResponse('/medias/'+file_type+'.zip') 217 | 218 | class DeleteFilesView(View): 219 | def post(self,request): 220 | file_name = request.POST.get('file_name','') 221 | file_path = BASE_DIR+'/datadeal/medias/common/'+file_name 222 | status = False 223 | if os.path.exists(file_path): 224 | os.remove(file_path) 225 | status = True 226 | return HttpResponse(status) -------------------------------------------------------------------------------- /datadeal/datadeal/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for datadeal project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /datadeal/distribute/__init__.py: -------------------------------------------------------------------------------- 1 | default_app_config = "distribute.apps.DistributeConfig" -------------------------------------------------------------------------------- /datadeal/distribute/adminx.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import xadmin 3 | from .models import * 4 | 5 | class NodeAdmin(object): 6 | list_display = ['uid','status','ips','max_num'] 7 | search_fields = ['uid'] 8 | list_filter = ['status'] 9 | list_editable = ['status'] 10 | xadmin.site.register(Node, NodeAdmin) 11 | 12 | class NodeIpAdmin(object): 13 | list_display = ['ip','create_at'] 14 | search_fields = ['ip'] 15 | list_filter = ['create_at'] 16 | xadmin.site.register(NodeIp, NodeIpAdmin) 17 | 18 | class NodeTaskAdmin(object): 19 | list_display = ['name','scrapy','priority','urls','status','create_at','get_at','over_at','node','nodeip'] 20 | search_fields = ['name'] 21 | list_filter = ['scrapy','node','status','get_at','over_at','create_at'] 22 | xadmin.site.register(NodeTask, NodeTaskAdmin) -------------------------------------------------------------------------------- /datadeal/distribute/apps.py: -------------------------------------------------------------------------------- 1 | #_*_ encoding:utf-8 _*_ 2 | from django.apps import AppConfig 3 | 4 | class DistributeConfig(AppConfig): 5 | name = 'distribute' 6 | verbose_name = "节点管理" -------------------------------------------------------------------------------- /datadeal/distribute/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/management/__init__.py -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/management/commands/__init__.py -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/get_spiders.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from distribute.views import get_mac_address,HOST 4 | from datadeal.settings import BASE_DIR 5 | import urllib,urllib2 6 | import json 7 | 8 | class Command(BaseCommand): 9 | help = '同步主机spider文件' 10 | 11 | def handle(self, *args, **options): 12 | mac = get_mac_address() 13 | posturl = HOST+'/distribute/get_spiders/' 14 | data = {'uid':mac} 15 | data = urllib.urlencode(data) 16 | f = urllib2.urlopen(posturl,data) 17 | result = json.loads(f.read()) 18 | 19 | if result.has_key('error'): 20 | print result['error'] 21 | else: 22 | for key,val in result.items(): 23 | with 
open(BASE_DIR+'/../searchInfo/searchInfo/spiders/'+key,'w') as s_file: 24 | s_file.write(val) 25 | print u'同步完成' -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/mongo_test.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from datadeal.models import SpiderData 4 | 5 | class Command(BaseCommand): 6 | help = 'test' 7 | 8 | def handle(self, *args, **options): 9 | # print SpiderData.objects(__raw__={'data.处罚结果(种类) ': '罚款 '}) 10 | data = SpiderData.objects.filter(id=21475) 11 | # url_list = [] 12 | # with open('d://project/commonscrapy/selenium/url_list.txt','r') as file: 13 | # for line in file.readlines(): 14 | # if line.replace('\n',''): 15 | # url_list.append(line.replace('\n','')) 16 | 17 | # for url in url_list: 18 | # data = SpiderData.objects.filter(url=url) 19 | # for d in data: 20 | 21 | # print d.url 22 | # d.delete() 23 | for d in data: 24 | print d.data 25 | # for key,val in d.data.items(): 26 | # print key,val 27 | -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/register_node.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from distribute.views import get_mac_address,HOST 4 | import urllib,urllib2 5 | import json 6 | 7 | class Command(BaseCommand): 8 | help = '注册节点mac地址' 9 | 10 | def handle(self, *args, **options): 11 | mac = get_mac_address() 12 | posturl = HOST+'/distribute/create_node/' 13 | data = {'uid':mac} 14 | data = urllib.urlencode(data) 15 | f = urllib2.urlopen(posturl,data) 16 | result = f.read().decode('utf8') 17 | print result -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/start_spider.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from distribute.views import get_mac_address,HOST 4 | from datadeal.settings import BASE_DIR 5 | import urllib,urllib2 6 | import json 7 | import os 8 | 9 | class Command(BaseCommand): 10 | help = '开始爬取数据' 11 | 12 | def handle(self, *args, **options): 13 | posturl = HOST+'/distribute/get_spidername/' 14 | data = {} 15 | data = urllib.urlencode(data) 16 | f = urllib2.urlopen(posturl,data) 17 | result = f.read().decode('utf8') 18 | if result: 19 | os.system('cd %s/../searchInfo&&scrapy crawl %s' % (BASE_DIR,result)) 20 | else: 21 | print u'暂时没有可执行任务' -------------------------------------------------------------------------------- /datadeal/distribute/management/commands/upload_files.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.core.management.base import BaseCommand, CommandError 3 | from distribute.views import get_mac_address,HOST 4 | from datadeal.settings import BASE_DIR 5 | import requests 6 | import os 7 | 8 | class Command(BaseCommand): 9 | help = '上传本机medias下的下载文件至主机' 10 | 11 | def add_arguments(self, parser): 12 | parser.add_argument('args', metavar='spider_label', nargs='*', 13 | help='Specify the spider dir to upload.') 14 | 15 | def handle(self, *args, **options): 16 | dir_name = BASE_DIR+'/datadeal/medias' 17 | upload_files = {} 18 | files = 
os.listdir(dir_name) 19 | if len(args) > 0: 20 | dir_list = args 21 | else: 22 | dir_list = [] 23 | for f in files: 24 | if not '.' in f: 25 | dir_list.append(f) 26 | for d in dir_list: 27 | d_files = os.listdir(dir_name+'/'+d) 28 | for df in d_files: 29 | upload_files[d+'/'+df]=open(dir_name+'/'+d+'/'+df,'rb') 30 | url = HOST+'/upload_files/' 31 | response = requests.post(url,files=upload_files) 32 | print response.content.decode('utf8') -------------------------------------------------------------------------------- /datadeal/distribute/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-05 09:36 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | initial = True 12 | 13 | dependencies = [ 14 | ('datadeal', '0001_initial'), 15 | ] 16 | 17 | operations = [ 18 | migrations.CreateModel( 19 | name='Node', 20 | fields=[ 21 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 22 | ('uid', models.CharField(max_length=50, verbose_name='uid')), 23 | ('status', models.BooleanField(default=True, verbose_name='\u662f\u5426\u5f00\u542f')), 24 | ], 25 | options={ 26 | 'verbose_name': '\u8282\u70b9\u7ba1\u7406', 27 | 'verbose_name_plural': '\u8282\u70b9\u7ba1\u7406', 28 | }, 29 | ), 30 | migrations.CreateModel( 31 | name='NodeTask', 32 | fields=[ 33 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 34 | ('name', models.CharField(max_length=50, verbose_name='\u4efb\u52a1\u540d')), 35 | ('priority', models.IntegerField(default=0, verbose_name='\u4efb\u52a1\u4f18\u5148\u7ea7')), 36 | ('status', models.IntegerField(choices=[(1, '\u5f85\u91c7\u96c6'), (2, '\u5df2\u5b8c\u6210')], default=1, verbose_name='\u4efb\u52a1\u72b6\u6001')), 37 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 38 | ('get_at', models.DateTimeField(blank=True, null=True, verbose_name='\u4efb\u52a1\u9886\u53d6\u65f6\u95f4')), 39 | ('over_at', models.DateTimeField(blank=True, null=True, verbose_name='\u4efb\u52a1\u5b8c\u6210\u65f6\u95f4')), 40 | ('node', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='distribute.Node', verbose_name='\u6267\u884c\u8282\u70b9')), 41 | ('scrapy', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='datadeal.scrapyList', verbose_name='\u9879\u76ee')), 42 | ('urls', models.ManyToManyField(to='datadeal.startUrls', verbose_name='\u722c\u53d6\u94fe\u63a5')), 43 | ], 44 | options={ 45 | 'verbose_name': '\u4efb\u52a1\u7ba1\u7406', 46 | 'verbose_name_plural': '\u4efb\u52a1\u7ba1\u7406', 47 | }, 48 | ), 49 | ] 50 | -------------------------------------------------------------------------------- /datadeal/distribute/migrations/0002_auto_20170606_1335.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Generated by Django 1.9 on 2017-06-06 13:35 3 | from __future__ import unicode_literals 4 | 5 | from django.db import migrations, models 6 | import django.db.models.deletion 7 | 8 | 9 | class Migration(migrations.Migration): 10 | 11 | dependencies = [ 12 | ('distribute', '0001_initial'), 13 | ] 14 | 15 | operations = [ 16 | migrations.CreateModel( 17 | name='NodeIp', 18 | fields=[ 19 | ('id', 
models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 20 | ('ip', models.CharField(max_length=100, verbose_name='ip\u5730\u5740')), 21 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')), 22 | ], 23 | options={ 24 | 'verbose_name': '\u8282\u70b9IP\u7ba1\u7406', 25 | 'verbose_name_plural': '\u8282\u70b9IP\u7ba1\u7406', 26 | }, 27 | ), 28 | migrations.AddField( 29 | model_name='node', 30 | name='max_num', 31 | field=models.IntegerField(default=10, help_text='\u5355\u4f4d: \u6b21/\u5929(\u540c\u9879\u76ee\u540cip)', verbose_name='\u6700\u5927\u4efb\u52a1\u9891\u5ea6'), 32 | ), 33 | migrations.AddField( 34 | model_name='node', 35 | name='ips', 36 | field=models.ManyToManyField(to='distribute.NodeIp', verbose_name='\u5386\u53f2IP'), 37 | ), 38 | migrations.AddField( 39 | model_name='nodetask', 40 | name='nodeip', 41 | field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='distribute.NodeIp', verbose_name='\u6267\u884cIP'), 42 | ), 43 | ] 44 | -------------------------------------------------------------------------------- /datadeal/distribute/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/migrations/__init__.py -------------------------------------------------------------------------------- /datadeal/distribute/models.py: -------------------------------------------------------------------------------- 1 | #coding:utf8 2 | from __future__ import unicode_literals 3 | 4 | from django.db import models 5 | 6 | class Node(models.Model): 7 | uid = models.CharField(u'uid',max_length=50) 8 | status = models.BooleanField(u'是否开启',default=True) 9 | ips = models.ManyToManyField('NodeIp',verbose_name=u'历史IP',blank=True) 10 | max_num = models.IntegerField(u'最大任务频度',help_text='单位: 次/天(同项目同ip)',default=10) 11 | 12 | def __unicode__(self): 13 | return self.uid 14 | 15 | class Meta: 16 | verbose_name = u'节点管理' 17 | verbose_name_plural = u'节点管理' 18 | 19 | class NodeIp(models.Model): 20 | ip = models.CharField(u'ip地址',max_length=100) 21 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 22 | 23 | def __unicode__(self): 24 | return self.ip 25 | 26 | class Meta: 27 | verbose_name = u'节点IP管理' 28 | verbose_name_plural = u'节点IP管理' 29 | 30 | class NodeTask(models.Model): 31 | STATUS_CHOICES = ((1, u'待采集'),(2, u'已完成')) 32 | name = models.CharField(u'任务名',max_length=50) 33 | scrapy = models.ForeignKey('datadeal.scrapyList',verbose_name=u'项目') 34 | priority = models.IntegerField(u'任务优先级',default=10,help_text='值越小越优先') 35 | urls = models.ManyToManyField('datadeal.startUrls',verbose_name=u'爬取链接') 36 | status = models.IntegerField(u'任务状态', choices=STATUS_CHOICES,default=1) 37 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True) 38 | get_at = models.DateTimeField(u'任务领取时间',null=True,blank=True) 39 | over_at = models.DateTimeField(u'任务完成时间',null=True,blank=True) 40 | node = models.ForeignKey('Node',verbose_name=u'执行节点',blank=True,null=True) 41 | nodeip = models.ForeignKey('NodeIp',verbose_name=u'执行IP',blank=True,null=True) 42 | 43 | def __unicode__(self): 44 | return self.name 45 | 46 | class Meta: 47 | verbose_name = u'任务管理' 48 | verbose_name_plural = u'任务管理' -------------------------------------------------------------------------------- /datadeal/distribute/tests.py: 
-------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /datadeal/distribute/urls.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from django.conf.urls import patterns, include, url 3 | from .views import * 4 | 5 | urlpatterns = [ 6 | url(r'^create_node/$', CreateNode.as_view() ,name='create_node'), 7 | url(r'^get_spiders/$', getSpiders.as_view() ,name='get_spiders'), 8 | url(r'^handle_tasks/$', handleTasks.as_view() ,name='handle_tasks'), 9 | url(r'^over_tasks/$', overTasks.as_view() ,name='over_tasks'), 10 | url(r'^save_data/$', SaveData.as_view() ,name='save_data'), 11 | url(r'^get_spidername/$', GetSpiderName.as_view() ,name='get_spidername'), 12 | ] -------------------------------------------------------------------------------- /datadeal/distribute/views.py: -------------------------------------------------------------------------------- 1 | #!coding=utf-8 2 | from django.views.generic import TemplateView,View 3 | from django.http import HttpResponse,HttpResponseRedirect 4 | from datadeal.settings import BASE_DIR 5 | from .models import Node,NodeIp,NodeTask 6 | from datadeal.models import SpiderData,ErrorData 7 | import os 8 | import urllib,urllib2 9 | import json 10 | import uuid 11 | import datetime 12 | import pdfkit 13 | import hashlib 14 | import time 15 | 16 | # HOST = 'http://192.168.211.1:8000' 17 | HOST = 'http://10.20.1.52:8000' 18 | TASK_NUM = 1 19 | 20 | def get_mac_address(): 21 | ''' 22 | 获取本机mac地址 23 | ''' 24 | mac=uuid.UUID(int = uuid.getnode()).hex[-12:] 25 | return ":".join([mac[e:e+2] for e in range(0,11,2)]) 26 | 27 | def set_url_head(url,r_url): 28 | ''' 29 | 设置url前缀 30 | ''' 31 | if url: 32 | if url.startswith('http://') or url.startswith('https://'): 33 | new_url = url 34 | else: 35 | if r_url.endswith('.html'): 36 | last = r_url.split('/')[-1] 37 | r_url = r_url.split(last)[0] 38 | if r_url.startswith('http://'): 39 | if url.startswith('/'): 40 | new_url = 'http://'+r_url.split('http://')[1].split('/')[0]+url 41 | else: 42 | new_url = r_url+url 43 | elif r_url.startswith('https://'): 44 | if url.startswith('/'): 45 | new_url = 'https://'+r_url.split('https://')[1].split('/')[0]+url 46 | else: 47 | new_url = r_url+url 48 | else: 49 | new_url = url 50 | else: 51 | new_url = '' 52 | return new_url 53 | 54 | class CreateNode(View): 55 | name = '注册mac地址(主机)' 56 | 57 | def post(self,request): 58 | uid = request.POST.get('uid','') 59 | already = Node.objects.filter(uid=uid).count() 60 | if not already and uid: 61 | Node.objects.create(uid=uid) 62 | msg = u'注册成功' 63 | else: 64 | msg = u'该节点已注册' 65 | return HttpResponse(msg) 66 | 67 | class getSpiders(View): 68 | name = '获取spider文件(主机)' 69 | 70 | def post(self,request): 71 | uid = request.POST.get('uid','') 72 | already = Node.objects.filter(uid=uid).count() 73 | if already and uid: 74 | dir_name = BASE_DIR+'/../searchInfo/searchInfo/spiders' 75 | files = os.listdir(dir_name) 76 | new_files = [] 77 | for f in files: 78 | if not f.endswith('.pyc') and not f == '__init__.py': 79 | new_files.append(f) 80 | result = {} 81 | for i in new_files: 82 | f_name = dir_name+'/'+i 83 | with open(f_name,'r') as spider: 84 | text = spider.read() 85 | result[i] = text 86 | else: 87 | result = {'error':u'节点未注册'} 88 | return HttpResponse(json.dumps(result)) 89 | 90 | def getTasks(name): 91 | 
''' 92 | 获取任务(节点) 93 | ''' 94 | mac = get_mac_address() 95 | posturl = HOST+'/distribute/handle_tasks/' 96 | data = {'uid':mac,'num':TASK_NUM,'name':name} 97 | data = urllib.urlencode(data) 98 | f = urllib2.urlopen(posturl,data) 99 | result = json.loads(f.read()) 100 | return result 101 | 102 | class handleTasks(View): 103 | name = '分发任务(主机)' 104 | 105 | def post(self,request): 106 | uid = request.POST.get('uid','') 107 | num = int(request.POST.get('num',0)) 108 | name = request.POST.get('name','') 109 | try: 110 | node = Node.objects.get(uid=uid,status=True) 111 | except: 112 | node = '' 113 | if node: 114 | if request.META.has_key('HTTP_X_FORWARDED_FOR'): 115 | ip = request.META['HTTP_X_FORWARDED_FOR'] 116 | else: 117 | ip = request.META['REMOTE_ADDR'] 118 | try: 119 | nip = NodeIp.objects.get(ip=ip) 120 | except: 121 | nip = NodeIp.objects.create(ip=ip) 122 | if not nip in node.ips.all(): 123 | node.ips.add(nip) 124 | today = datetime.datetime.now().date() 125 | start = today.strftime('%Y-%m-%d 00:00') 126 | end = today.strftime('%Y-%m-%d 23:59') 127 | count = NodeTask.objects.filter(nodeip=nip,get_at__gte=start,get_at__lte=end,scrapy__name=name).count() 128 | if count < node.max_num: 129 | if count+num <= node.max_num: 130 | result = [] 131 | tasks = NodeTask.objects.filter(scrapy__name=name,status=1,node__uid__isnull=True).order_by('priority')[0:num] 132 | for t in tasks: 133 | task = {'id':'','urls':[]} 134 | task['id'] = t.id 135 | for i in t.urls.all(): 136 | task['urls'].append(i.url) 137 | result.append(task) 138 | t.get_at = datetime.datetime.now() 139 | t.node = node 140 | t.nodeip = nip 141 | t.save() 142 | else: 143 | msg = ip+' 单次获取任务个数超过频度限制,请减少单次获取任务个数' 144 | print(unicode(msg)) 145 | result = {'error':msg} 146 | else: 147 | msg = ip+' 超过今日该项目领取任务限制' 148 | print(unicode(msg)) 149 | result = {'error':msg} 150 | else: 151 | msg = uid+' 节点未注册或已关闭' 152 | print(unicode(msg)) 153 | result = {'error':msg} 154 | return HttpResponse(json.dumps(result)) 155 | 156 | def afterTasks(task_id): 157 | ''' 158 | 完成任务(节点) 159 | ''' 160 | posturl = HOST+'/distribute/over_tasks/' 161 | nowtime = datetime.datetime.now() 162 | data = {'task_id':task_id,'nowtime':nowtime} 163 | data = urllib.urlencode(data) 164 | f = urllib2.urlopen(posturl,data) 165 | result = f.read() 166 | return result 167 | 168 | class overTasks(View): 169 | name = '任务结束(主机)' 170 | 171 | def post(self,request): 172 | task_id = request.POST.get('task_id','') 173 | nowtime = request.POST.get('nowtime','') 174 | try: 175 | task = NodeTask.objects.get(id=task_id) 176 | task.over_at = nowtime 177 | task.status = 2 178 | task.save() 179 | except Exception, e: 180 | print unicode(e) 181 | return HttpResponse('over') 182 | 183 | def sendData(name,data,url,error=False): 184 | ''' 185 | 发送爬取信息(节点) 186 | ''' 187 | mac = get_mac_address() 188 | posturl = HOST+'/distribute/save_data/' 189 | data = {'uid':mac,'data':data,'name':name,'error':error,'url':url} 190 | data = urllib.urlencode(data) 191 | f = urllib2.urlopen(posturl,data) 192 | result = f.read() 193 | return result 194 | 195 | class SaveData(View): 196 | name = '保存数据(主机)' 197 | 198 | def post(self,request): 199 | uid = request.POST.get('uid','') 200 | name = request.POST.get('name','') 201 | data = request.POST.get('data','') 202 | if data: 203 | try: 204 | data = eval(data) 205 | except Exception, e: 206 | print unicode(e) 207 | print data 208 | else: 209 | data = {} 210 | url = request.POST.get('url','') 211 | error = request.POST.get('error','False') 212 | 213 | if error == 
'True': 214 | ErrorData.objects.create(uid=uid,scrapyname=name,url=data['url'],content=data['error']) 215 | msg = 'error' 216 | else: 217 | # 加入保存每条数据访问的页面url与pdf 218 | m = hashlib.md5() 219 | # m.update(url+str(time.time())) 220 | m.update(url) 221 | pdfname = m.hexdigest()+'.pdf' 222 | file_dir = BASE_DIR+'/datadeal/medias/web/' 223 | if not os.path.exists(file_dir): 224 | os.mkdir(file_dir) 225 | if os.path.exists(file_dir+pdfname): 226 | pass 227 | else: 228 | try: 229 | options = { 230 | 'page-size': 'B3', 231 | } 232 | pdfkit.from_url(url,file_dir+pdfname,options=options) 233 | except: 234 | pass 235 | 236 | SpiderData.objects.create(uid=uid,scrapyname=name,data=data,url=url,file=pdfname) 237 | msg = 'ok' 238 | return HttpResponse(msg) 239 | 240 | class GetSpiderName(View): 241 | name = '获取优先可爬取项目名(主机)' 242 | 243 | def post(self,request): 244 | tasks = NodeTask.objects.filter(get_at__isnull=True).order_by('scrapy__priority') 245 | print tasks 246 | if len(tasks) == 0: 247 | result = '' 248 | else: 249 | result = tasks[0].scrapy.name 250 | return HttpResponse(result) -------------------------------------------------------------------------------- /datadeal/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-07-27T09:43:39.930Z] GhostDriver - Main - running on port 62298 2 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36","webSecurityEnabled":true} 3 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.customHeaders: - {} 4 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"} 5 | [INFO - 2017-07-27T09:43:39.996Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 12004e70-72b0-11e7-aee6-f5ffd6c70928 6 | [ERROR - 2017-07-27T09:43:42.322Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property. 
--------------------------------------------------------------------------------
/datadeal/ghostdriver.log:
--------------------------------------------------------------------------------
1 | [INFO - 2017-07-27T09:43:39.930Z] GhostDriver - Main - running on port 62298
2 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36","webSecurityEnabled":true}
3 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.customHeaders: - {}
4 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
5 | [INFO - 2017-07-27T09:43:39.996Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 12004e70-72b0-11e7-aee6-f5ffd6c70928
6 | [ERROR - 2017-07-27T09:43:42.322Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property.
7 | 
8 | phantomjs://platform/console++.js:263 in error
9 | [ERROR - 2017-07-27T09:43:42.322Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - stack:
10 | defineProperty (http://dn-growing.qbox.me/vds.js:2)
11 | registerHistoryHandler (http://dn-growing.qbox.me/vds.js:2)
12 | domLoadedHandler (http://dn-growing.qbox.me/vds.js:2)
13 | (anonymous function) (http://dn-growing.qbox.me/vds.js:2)
14 | 
15 | phantomjs://platform/console++.js:263 in error
[... identical page.onError TypeError entries and vds.js stack traces repeated every one to two seconds, omitted ...]
26 | [INFO - 2017-07-27T09:47:49.368Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW
27 | [INFO - 2017-07-27T09:51:09.506Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW
215 | [ERROR - 2017-07-27T09:44:01.369Z] WebElementLocator - _handleLocateCommand - Element(s) NOT Found: GAVE UP. Search Stop Time: 1501148641320
258 | [ERROR - 2017-07-27T09:44:05.229Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property.
259 | 260 | phantomjs://platform/console++.js:263 in error 261 | [ERROR - 2017-07-27T09:44:05.229Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - stack: 262 | defineProperty (http://dn-growing.qbox.me/vds.js:2) 263 | registerHistoryHandler (http://dn-growing.qbox.me/vds.js:2) 264 | domLoadedHandler (http://dn-growing.qbox.me/vds.js:2) 265 | (anonymous function) (http://dn-growing.qbox.me/vds.js:2) 266 | 267 | phantomjs://platform/console++.js:263 in error 268 | [ERROR - 2017-07-27T09:44:06.542Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property. 269 | 270 | phantomjs://platform/console++.js:263 in error 271 | [ERROR - 2017-07-27T09:44:06.542Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - stack: 272 | defineProperty (http://dn-growing.qbox.me/vds.js:2) 273 | registerHistoryHandler (http://dn-growing.qbox.me/vds.js:2) 274 | domLoadedHandler (http://dn-growing.qbox.me/vds.js:2) 275 | (anonymous function) (http://dn-growing.qbox.me/vds.js:2) 276 | 277 | phantomjs://platform/console++.js:263 in error 278 | -------------------------------------------------------------------------------- /datadeal/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings") 7 | 8 | from django.core.management import execute_from_command_line 9 | 10 | execute_from_command_line(sys.argv) 11 | -------------------------------------------------------------------------------- /searchInfo/ghostdriver.log: -------------------------------------------------------------------------------- 1 | [INFO - 2017-08-18T00:33:31.697Z] GhostDriver - Main - running on port 50755 2 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true} 3 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - page.customHeaders: - {} 4 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}} 5 | [INFO - 2017-08-18T00:33:31.971Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: dcc570b0-83ac-11e7-97fe-0f519fac670c 6 | -------------------------------------------------------------------------------- /searchInfo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = 
searchInfo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = searchInfo 12 | -------------------------------------------------------------------------------- /searchInfo/searchInfo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/searchInfo/searchInfo/__init__.py -------------------------------------------------------------------------------- /searchInfo/searchInfo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class FileItem(scrapy.Item): 12 | file_urls = scrapy.Field() -------------------------------------------------------------------------------- /searchInfo/searchInfo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from distribute.views import sendData 10 | 11 | class SearchinfoSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self,response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # if response.url.startswith('http://'): 28 | # url = response.url.split('http://')[1].split('/')[0] 29 | # elif response.url.startswith('https://'): 30 | # url = response.url.split('https://')[1].split('/')[0] 31 | # else: 32 | # url = response.url 33 | # if url in spider.allowed_domains: 34 | # return None 35 | # else: 36 | # print 'error_________: url not in allow_domains!' 37 | # raise 38 | pass 39 | 40 | def process_spider_output(self,response, result, spider): 41 | # Called with the results returned from the Spider, after 42 | # it has processed the response. 43 | 44 | # Must return an iterable of Request, dict or Item objects. 45 | for i in result: 46 | yield i 47 | 48 | def process_spider_exception(self,response, exception, spider): 49 | # Called when a spider or process_spider_input() method 50 | # (from other spider middleware) raises an exception. 51 | 52 | # Should return either None or an iterable of Response, dict 53 | # or Item objects. 54 | sendData(spider.name,{'error':unicode(exception),'url':response.url},response.url,True) 55 | 56 | def process_start_requests(self,start_requests, spider): 57 | # Called with the start requests of the spider, and works 58 | # similarly to the process_spider_output() method, except 59 | # that it doesn’t have a response associated. 60 | 61 | # Must return only requests (not items). 
62 | for r in start_requests: 63 | yield r 64 | 65 | def spider_opened(self, spider): 66 | spider.logger.info('Spider opened: %s' % spider.name) 67 | -------------------------------------------------------------------------------- /searchInfo/searchInfo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from searchInfo import settings 3 | import requests 4 | import hashlib 5 | import os 6 | 7 | class FilesDownloadPipeline(object): 8 | def process_item(self, item, spider): 9 | 10 | if 'file_urls' in item: 11 | dir_path = '%s/%s' % (settings.FILES_STORE, spider.name) 12 | img_path = settings.IMAGES_STORE 13 | if not os.path.exists(dir_path): 14 | os.makedirs(dir_path) 15 | if not os.path.exists(img_path): 16 | os.makedirs(img_path) 17 | for file_url in item['file_urls']: 18 | file_name = file_url.split('/')[-1] 19 | back = file_name.split('.')[-1] 20 | m = hashlib.md5() 21 | m.update(file_name) 22 | file_name = m.hexdigest() 23 | if spider.name == 'shandong': 24 | file_name = file_name+'.png' 25 | file_path = '%s/%s' % (img_path, file_name) 26 | elif back == 'png' or back == 'jpg' or back == 'gif': 27 | file_name = file_name+'.'+back 28 | file_path = '%s/%s' % (img_path, file_name) 29 | else: 30 | file_name = spider.name+'_'+file_name+'.'+back 31 | file_path = '%s/%s' % (dir_path, file_name) 32 | if os.path.exists(file_path): 33 | continue 34 | with open(file_path, 'wb') as handle: 35 | response = requests.get(file_url, stream=True) 36 | for block in response.iter_content(1024): 37 | if not block: 38 | break 39 | handle.write(block) 40 | return item -------------------------------------------------------------------------------- /searchInfo/searchInfo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for searchInfo project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'searchInfo' 13 | 14 | SPIDER_MODULES = ['searchInfo.spiders'] 15 | NEWSPIDER_MODULE = 'searchInfo.spiders' 16 | 17 | FEED_EXPORT_ENCODING = 'utf-8' 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'searchInfo (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | # CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | 'Accept-Language': 'en', 45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36' 46 | } 47 | 48 | # Enable or disable spider middlewares 49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 50 | SPIDER_MIDDLEWARES = { 51 | 'searchInfo.middlewares.SearchinfoSpiderMiddleware': 543, 52 | } 53 | 54 | # Enable or disable downloader middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 56 | #DOWNLOADER_MIDDLEWARES = { 57 | # 'searchInfo.middlewares.MyCustomDownloaderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable extensions 61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 62 | #EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | #} 65 | 66 | # Configure item pipelines 67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'searchInfo.pipelines.FilesDownloadPipeline': 300, 70 | # 'scrapy_redis.pipelines.RedisPipeline': 300 71 | } 72 | FILES_STORE = '../datadeal/datadeal/medias/' 73 | IMAGES_STORE = '../datadeal/datadeal/medias/images' 74 | # Enable and configure the AutoThrottle extension (disabled by default) 75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 76 | #AUTOTHROTTLE_ENABLED = True 77 | # The initial download delay 78 | #AUTOTHROTTLE_START_DELAY = 5 79 | # The maximum download delay to be set in case of high latencies 80 | #AUTOTHROTTLE_MAX_DELAY = 60 81 | # The average number of requests Scrapy should be sending in parallel to 82 | # each remote server 83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 84 | # Enable showing throttling stats for every response received: 85 | #AUTOTHROTTLE_DEBUG = False 86 | 87 | # Enable and configure HTTP caching (disabled by default) 88 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 89 | # HTTPCACHE_ENABLED = True 90 | # HTTPCACHE_EXPIRATION_SECS = 3600 91 | # 
HTTPCACHE_DIR = 'httpcache'
92 | # HTTPCACHE_IGNORE_HTTP_CODES = []
93 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
94 | 
95 | 
96 | # REDIS_HOST = '127.0.0.1'
97 | # REDIS_PORT = 6379
98 | 
99 | import sys,os
100 | from django.core.wsgi import get_wsgi_application
101 | sys.path.append(os.path.join(os.path.split(os.path.dirname(__file__))[0],'../datadeal'))
102 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings")
103 | application = get_wsgi_application()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/beijing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 | 
8 | class BeiJingSpider(scrapy.Spider):
9 |     name = "beijing"
10 |     allowed_domains = ["www.bjda.gov.cn"]
11 |     start_urls = ['http://www.bjda.gov.cn/eportal/ui?pageId=331216&currentPage=1&filter_LIKE_TITLE=&filter_LIKE_XKZH=']
12 | 
13 |     def parse(self, response):
14 |         # for i in range(1,1472):
15 |         for i in range(1,10):
16 |             url = 'http://www.bjda.gov.cn/eportal/ui?pageId=331216&currentPage=%s&filter_LIKE_TITLE=&filter_LIKE_XKZH=' % i
17 |             yield scrapy.Request(url, callback=self.parse_item)
18 | 
19 | 
20 |     def parse_item(self, response):
21 |         urls = response.xpath('//*[@id="form"]/div[2]/table//a')
22 |         for url in urls:
23 |             text = url.xpath('string(.)').extract_first()
24 |             if text and text == u'查看':
25 |                 url = url.xpath('@href').extract_first()
26 |                 url = 'http://www.bjda.gov.cn'+url
27 |                 already = SpiderData.objects.filter(url=url)
28 |                 if already.count() == 0:
29 |                     yield scrapy.Request(url, callback=self.parse_detail)
30 |                 else:
31 |                     pass
32 |                     # print 'already crawled'
33 | 
34 |     def parse_detail(self,response):
35 |         trs = response.xpath('//*[@id="84f8b7f6cfc44b849b61b5c0ed21976a"]/div[2]/table//tr')
36 |         data = {}
37 |         for tr in trs:
38 |             key = tr.xpath('th/text()').extract_first().replace(':','')
39 |             val = tr.xpath('td/text()').extract_first()
40 |             data[key] = val
41 |         sendData('beijing',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/case.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 | 
8 | '''Spider for administrative penalty cases in Shandong Province'''
9 | class CaseSpider(scrapy.Spider):
10 |     name = "case"
11 |     allowed_domains = ["sdlf.shandongbusiness.gov.cn"]
12 |     start_urls = ['http://sdlf.shandongbusiness.gov.cn/newslist.shtml?method=listXzcf']
13 | 
14 |     def parse(self, response):
15 |         for i in range(1,6):
16 |             yield scrapy.FormRequest(
17 |                 url='http://sdlf.shandongbusiness.gov.cn/newslist.shtml',
18 |                 formdata={'pager.requestPage': str(i), 'method': 'listXzcf'},
19 |                 callback=self.after_post
20 |             )
21 | 
22 |     def after_post(self, response):
23 |         li = response.xpath('//ul[@class="rlistul"]/li')
24 |         for l in
li: 25 | date = l.xpath('span/text()').extract_first() 26 | title = l.xpath('a/text()').extract_first() 27 | url = 'http://sdlf.shandongbusiness.gov.cn'+l.xpath('a/@href').extract_first() 28 | yield scrapy.Request(url, callback=self.parse_item,meta={'date':date,'title':title}) 29 | 30 | def parse_item(self, response): 31 | date = response.meta['date'] 32 | title = response.meta['title'] 33 | data = {} 34 | tables = response.xpath('//table[@class="rtab2"]') 35 | for table in tables: 36 | trs = table.xpath('tr') 37 | for tr in trs: 38 | key = tr.xpath('th/text()').extract_first().split(u':')[0] 39 | value = tr.xpath('td/text()').extract_first() 40 | data[key] = value 41 | 42 | already = SpiderData.objects.filter(url=response.url) 43 | if already.count() == 0: 44 | sendData('case',data,response.url) 45 | else: 46 | pass 47 | # print 'already crawl' -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/chengdu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import urllib2 4 | import json 5 | from distribute.views import sendData 6 | from datadeal.models import SpiderData 7 | 8 | class ChengDuSpider(scrapy.Spider): 9 | name = "chengdu" 10 | allowed_domains = ["www.shfda.gov.cn"] 11 | start_urls = ['http://www.cdepb.gov.cn/cdepbws/Web/Template/GovDefaultList.aspx?cid=843'] 12 | 13 | def parse(self, response): 14 | # for i in range(1,37): 15 | for i in range(1,10): 16 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=%s&pagesize=20' % i 17 | yield scrapy.Request(url, callback=self.parse_item) 18 | 19 | 20 | def parse_item(self, response): 21 | urls = response.xpath('//*[@id="b1"]//a') 22 | for url in urls: 23 | text = url.xpath('string(.)').extract_first() 24 | if text and text == '详情': 25 | url = url.xpath('@href').extract_first() 26 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/'+url 27 | already = SpiderData.objects.filter(url=url) 28 | if already.count() == 0: 29 | yield scrapy.Request(url, callback=self.parse_detail) 30 | else: 31 | # print 'already crawled' 32 | pass 33 | 34 | def parse_detail(self,response): 35 | trs = response.xpath('//*[@id="main"]/div/div[2]/table//tr') 36 | data = {} 37 | for tr in trs: 38 | key = tr.xpath('td[1]/text()').extract_first() 39 | val = tr.xpath('td[2]/text()').extract_first() 40 | data[key] = val 41 | sendData('shanghai',data,response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/gansu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import urllib2 4 | import json 5 | from distribute.views import sendData 6 | from datadeal.models import SpiderData 7 | 8 | class GanSuSpider(scrapy.Spider): 9 | name = "gansu" 10 | allowed_domains = ["www.gsfda.gov.cn"] 11 | start_urls = ['http://www.gsfda.gov.cn:2180/xzlaw/xzlawActionWZ!list.do?queryBean.pn=1&queryBean.pageSize=100'] 12 | 13 | def parse(self, response): 14 | # for i in range(1,106): 15 | for i in range(1,5): 16 | url = 'http://www.gsfda.gov.cn:2180/xzlaw/xzlawActionWZ!list.do?queryBean.pn=%s&queryBean.pageSize=100' % i 17 | yield scrapy.Request(url, callback=self.parse_item) 18 | 19 | 20 | def parse_item(self, response): 21 | urls = response.xpath('//*[@id="list"]//a') 22 | for url in urls: 23 | text = url.xpath('string(.)').extract_first() 24 | if text and text 
== '[查看]': 25 | url = url.xpath('@href').extract_first() 26 | url = 'http://www.gsfda.gov.cn:2180/xzlaw/'+url 27 | already = SpiderData.objects.filter(url=url) 28 | if already.count() == 0: 29 | yield scrapy.Request(url, callback=self.parse_detail) 30 | else: 31 | # print 'already crawled' 32 | pass 33 | 34 | def parse_detail(self,response): 35 | trs = response.xpath('//*[@id="edit"]//tr') 36 | data = {} 37 | for i,tr in enumerate(trs): 38 | if i > 0: 39 | key = tr.xpath('th/text()').extract_first() 40 | val = val = tr.xpath('td/text()').extract_first() 41 | if key: 42 | key = key.replace(':','').replace(' ','') 43 | if not val: 44 | val = '' 45 | data[key] = val 46 | sendData('gansu',data,response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/hainan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import urllib2 4 | import json 5 | from distribute.views import sendData 6 | from datadeal.models import SpiderData 7 | import json 8 | 9 | class HaiNanSpider(scrapy.Spider): 10 | name = "hainan" 11 | allowed_domains = ["aj.hifda.gov.cn"] 12 | start_urls = ['http://aj.hifda.gov.cn/web/index.jsp'] 13 | 14 | def parse(self, response): 15 | # for i in range(0,47): 16 | for i in range(0,5): 17 | yield scrapy.FormRequest( 18 | url='http://aj.hifda.gov.cn/loseCredit/refreshList.json', 19 | formdata={ 20 | "cityName":"", 21 | "initialVal":"", 22 | "ispublish":"1", 23 | "listPageSize":"100", 24 | "queryContent":"", 25 | "queryOrder":"0", 26 | "searchOrderType":"0", 27 | "selectIndex":"1", 28 | "skip":"%s" % str(i*100), 29 | }, 30 | callback=self.parse_item 31 | ) 32 | 33 | def parse_item(self, response): 34 | result = json.loads(response.body) 35 | for r in result['resultData']: 36 | url = 'http://aj.hifda.gov.cn/web/showContent.jsp?id='+r['id'] 37 | data = {u'企业(商户)名称':r['companyname'],u'注册地址':r['companysite'],u'法定代表人姓名':r['companyman'],u'法定代表人身份证号':r['companymanid'],u'负责人姓名':r['responsible_man'],u'负责人身份证号':r['resp_man_id'],u'直接责任人':r['direct_person'],u'社会信用代码':r['idcode'],u'案件分类':r['toclassify'],u'案件名称':r['losecase'],u'行政处罚决定文书号':r['punish_writ_num'],u'主要违法事实':r['losedetail'],u'处罚依据和内容':r['punishway'],u'处罚机关':r['punishunit'],u'处罚时间':r['punishtime']} 38 | already = SpiderData.objects.filter(url=url) 39 | if already.count() == 0: 40 | sendData('hainan',data,url) 41 | else: 42 | pass 43 | # print 'already crawled' -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/qingdao.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from distribute.views import set_url_head 4 | from searchInfo.items import FileItem 5 | from scrapy.loader import ItemLoader 6 | 7 | class QingDaoSpider(scrapy.Spider): 8 | name = "qingdao" 9 | allowed_domains = ["sfda.qingdao.gov.cn"] 10 | start_urls = ['http://sfda.qingdao.gov.cn/n32205967/n32206400/index.html'] 11 | 12 | def parse(self, response): 13 | data = [] 14 | qingdao_div = response.xpath('//div[@id="listChangeDiv"]/ul/li') 15 | for i in qingdao_div: 16 | qingdao_url = i.xpath('a/@href').extract_first() 17 | qingdao_url = set_url_head(qingdao_url,response.url) 18 | if qingdao_url: 19 | yield scrapy.Request(qingdao_url, callback=self.parse_item) 20 | 21 | def parse_item(self, response): 22 | qingdao_detail = response.xpath('//div[@class="main_t"]//a') 23 | l = 
ItemLoader(item=FileItem(), response=response)
24 |         for i in qingdao_detail:
25 |             url = i.xpath('@href').extract_first()
26 |             url = set_url_head(url,response.url)
27 |             if url:
28 |                 l.add_value('file_urls',url)
29 |         return l.load_item()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/risk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | # from distribute.views import sendData
6 | # from datadeal.models import SpiderData
7 | from searchInfo.items import FileItem
8 | from scrapy.loader import ItemLoader
9 | 
10 | '''Spider for risk-inspection documents of the national food and drug administration'''
11 | class RiskSpider(scrapy.Spider):
12 |     name = "risk"
13 |     allowed_domains = ["www.sda.gov.cn"]
14 |     start_urls = ['http://www.sda.gov.cn/WS01/CL1667/index.html']
15 | 
16 |     def parse(self, response):
17 |         # Food
18 |         url = 'http://www.sda.gov.cn/WS01/CL1667/index.html'
19 |         yield scrapy.Request(url, callback=self.parse_item)
20 |         # for i in range(1,222):
21 |         #     url = 'http://www.sda.gov.cn/WS01/CL1667/index_%s.html' % i
22 |         #     yield scrapy.Request(url, callback=self.parse_item)
23 | 
24 |         # Drugs
25 |         url = 'http://www.sda.gov.cn/WS01/CL1429/'
26 |         yield scrapy.Request(url, callback=self.parse_item)
27 |         # for i in range(1,12):
28 |         #     url = 'http://www.sda.gov.cn/WS01/CL1429/index_%s.html' % i
29 |         #     yield scrapy.Request(url, callback=self.parse_item)
30 | 
31 |         # Cosmetics
32 |         url = 'http://www.sda.gov.cn/WS01/CL1866/'
33 |         yield scrapy.Request(url, callback=self.parse_item)
34 |         # for i in range(1,3):
35 |         #     url = 'http://www.sda.gov.cn/WS01/CL1866/index_%s.html' % i
36 |         #     yield scrapy.Request(url, callback=self.parse_item)
37 | 
38 |     def parse_item(self, response):
39 |         urls = response.xpath('/html/body/table[3]//tr/td[3]/table[2]//tr/td/table[1]//a')
40 |         for url in urls:
41 |             text = url.xpath('string(.)').extract_first()
42 |             if u'不合格' in text or u'抽检' in text:
43 |                 url = url.xpath('@href').extract_first().replace('..','')
44 |                 url = 'http://www.sda.gov.cn/WS01'+url
45 |                 yield scrapy.Request(url, callback=self.parse_detail)
46 | 
47 |     def parse_detail(self,response):
48 |         path = response.xpath('//a')
49 |         l = ItemLoader(item=FileItem(), response=response)
50 |         for i in path:
51 |             url = i.xpath('@href').extract_first()
52 |             if url:
53 |                 if url.endswith('.doc') or url.endswith('.xlsx') or url.endswith('.xls') or url.endswith('.docx') or url.endswith('.rar') or url.endswith('.pdf') or url.endswith('.zip'):
54 |                     url = 'http://www.sda.gov.cn'+url
55 |                     l.add_value('file_urls',url)
56 |         return l.load_item()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/sdein.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import *
4 | 
5 | class SdeinSpider(scrapy.Spider):
6 |     name = "sdein"
7 |     allowed_domains = ["www.sdein.gov.cn","zfc.sdein.gov.cn"]
8 | 
9 |     def start_requests(self):
10 |         results = getTasks('sdein')
11 |         self.taks_urls = {}
12 |         self.tasks = {}
13 |         if isinstance(results,dict):
14 |             print results['error']
15 |         else:
16 |             for re in results:
17 |                 self.tasks[re['id']] = {'t_count':len(re['urls']),'count':0}
18 |                 for u in re['urls']:
19 |                     self.taks_urls[u] = re['id']
20 |                     yield self.make_requests_from_url(u)
21 | 
22 |     def after_parse(self,url):
23 |         task_id = self.taks_urls[url]
24 |         self.tasks[task_id]['count'] += 1
25 |         if
self.tasks[task_id]['count'] == self.tasks[task_id]['t_count']: 26 | afterTasks(task_id) 27 | 28 | def parse(self, response): 29 | sdein_table = response.xpath('//table[@width="763"]/tr[3]/td/table[2]/tr') 30 | for i in sdein_table: 31 | sdein_title = i.xpath('td[2]/a/text()').extract_first() 32 | sdein_date = i.xpath('td[3]/text()').extract_first() 33 | sdein_url = i.xpath('td[2]/a/@href').extract_first() 34 | sdein_url = set_url_head(sdein_url,response.url) 35 | if sdein_url: 36 | yield scrapy.Request(sdein_url, callback=self.parse_item,meta={'sdein_title':sdein_title,'sdein_date':sdein_date,}) 37 | self.after_parse(response.url) 38 | 39 | def parse_item(self, response): 40 | sdein_title = response.meta['sdein_title'] 41 | sdein_date = response.meta['sdein_date'] 42 | sdein_content = response.xpath('//div[@class="TRS_Editor"]').xpath('string(.)').extract_first() 43 | sendData('sdein',{'sdein_title':sdein_title,'sdein_date':sdein_date,'sdein_content':sdein_content},response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/sdqts.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from distribute.views import * 4 | 5 | class SdqtsSpider(scrapy.Spider): 6 | name = "sdqts" 7 | allowed_domains = ["www.sdqts.gov.cn"] 8 | start_urls = ['http://www.sdqts.gov.cn/sdzj/380936/index.html'] 9 | 10 | def parse(self, response): 11 | sdqts_table = response.xpath('//*[@id="2d758f3ea2c041e399b5d84609a300f5"]/div[2]/div[2]/div[2]/table[2]/tbody/tr') 12 | for i in sdqts_table: 13 | sdqts_title = i.xpath('td/table/tbody/tr/td[1]/a/text()').extract_first() 14 | sdqts_date = i.xpath('td/table/tbody/tr/td[4]/text()').extract_first() 15 | sdqts_url = i.xpath('td/table/tbody/tr/td[1]/a/@href').extract_first() 16 | sdqts_url = set_url_head(sdqts_url,response.url) 17 | if sdqts_url: 18 | yield scrapy.Request(sdqts_url, callback=self.parse_item,meta={'sdqts_title':sdqts_title,'sdqts_date':sdqts_date,}) 19 | 20 | def parse_item(self, response): 21 | sdqts_title = response.meta['sdqts_title'] 22 | sdqts_date = response.meta['sdqts_date'] 23 | tr = response.xpath('//div[@class="gov_infoCatalog_detailsection"]//table//tr') 24 | data = {} 25 | if len(tr) == 2: 26 | td_title = tr[0].xpath('td') 27 | td_val = tr[1].xpath('td') 28 | for i in range(0,len(tr[1].xpath('td'))): 29 | data[td_title[i].xpath('string(.)').extract_first()] = td_val[i].xpath('string(.)').extract_first() 30 | if data: 31 | sendData('sdqts',data,response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/sfda.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import urllib 4 | import urllib2 5 | import bs4 6 | from distribute.views import sendData 7 | from datadeal.models import SpiderData 8 | 9 | class SfdaSpider(scrapy.Spider): 10 | name = "sfda" 11 | allowed_domains = ["app1.sfda.gov.cn"] 12 | start_urls = ['http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=114&tableName=TABLE114&title=%E5%9B%BD%E5%AE%B6%E9%A3%9F%E5%93%81%E5%AE%89%E5%85%A8%E7%9B%91%E7%9D%A3%E6%8A%BD%E6%A3%80%EF%BC%88%E4%B8%8D%E5%90%88%E6%A0%BC%E4%BA%A7%E5%93%81%EF%BC%89&bcId=143106776907834761101199700381'] 13 | 14 | def parse(self, response): 15 | # for i in range(1,238): 16 | for i in range(10,20): 17 | yield scrapy.FormRequest( 18 | 
url='http://app1.sfda.gov.cn/datasearch/face3/search.jsp', 19 | formdata={ 20 | "State":"1", 21 | "bcId":"143106776907834761101199700381", 22 | "curstart":str(i), 23 | "tableId":"114", 24 | "tableName":"TABLE114", 25 | "viewsubTitleName":"COLUMN1486", 26 | "viewtitleName":"COLUMN1490" 27 | }, 28 | callback=self.after_post 29 | ) 30 | 31 | def after_post(self, response): 32 | for a in response.xpath('//a'): 33 | aid = a.xpath('@href').extract_first().split('&Id=')[1].split('\'')[0] 34 | get_url = "http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=114&tableName=TABLE114&Id="+aid 35 | yield scrapy.Request(get_url, callback=self.parse_item) 36 | 37 | def parse_item(self, response): 38 | trs = response.xpath('//table/tr') 39 | data = {} 40 | for tr in trs: 41 | key = tr.xpath('td[1]/text()').extract_first() 42 | val = tr.xpath('td[2]/text()').extract_first() 43 | if key or val: 44 | data[key] = val 45 | try: 46 | already = SpiderData.objects.filter(scrapyname='sfda',data__contains={u"被抽样单位名称":data[u'被抽样单位名称'],u"生产日期/批号":data[u'生产日期/批号'],u"抽检项目":data[u'抽检项目']}).count() 47 | except: 48 | already = 1 49 | if not already: 50 | sendData('sfda',data,response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/shandong.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from distribute.views import set_url_head 4 | from searchInfo.items import FileItem 5 | from scrapy.loader import ItemLoader 6 | 7 | class shandongSpider(scrapy.Spider): 8 | name = "shandong" 9 | allowed_domains = ["www.creditsd.gov.cn"] 10 | start_urls = ['http://www.creditsd.gov.cn/creditsearch.punishmentList.phtml?id='] 11 | 12 | def parse(self, response): 13 | for i in range(1,11): 14 | url = 'http://www.creditsd.gov.cn/creditsearch.punishmentList.phtml?id=&keyword=&page=%s' % i 15 | yield scrapy.Request(url, callback=self.parse_list) 16 | 17 | def parse_list(self, response): 18 | div = response.xpath('/html/body/div/table[2]//tr') 19 | for i in div: 20 | url = i.xpath('td[1]/a/@href').extract_first() 21 | if url: 22 | url = 'http://www.creditsd.gov.cn'+url 23 | yield scrapy.Request(url, callback=self.parse_item) 24 | 25 | def parse_item(self, response): 26 | img_url = response.xpath('//*[@id="img"]/@src').extract_first() 27 | if img_url: 28 | img_url = 'http://www.creditsd.gov.cn'+img_url 29 | l = ItemLoader(item=FileItem(), response=response) 30 | l.add_value('file_urls',img_url) 31 | return l.load_item() -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/shanghai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import urllib2 4 | import json 5 | from distribute.views import sendData 6 | from datadeal.models import SpiderData 7 | 8 | class ShangHaiSpider(scrapy.Spider): 9 | name = "shanghai" 10 | allowed_domains = ["www.shfda.gov.cn"] 11 | start_urls = ['http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=1&pagesize=20'] 12 | 13 | def parse(self, response): 14 | # for i in range(1,815): 15 | for i in range(1,10): 16 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=%s&pagesize=20' % i 17 | yield scrapy.Request(url, callback=self.parse_item) 18 | 19 | 20 | def parse_item(self, response): 21 | urls = response.xpath('//*[@id="b1"]//a') 22 | for 
url in urls: 23 | text = url.xpath('string(.)').extract_first() 24 | if text and text == '详情': 25 | url = url.xpath('@href').extract_first() 26 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/'+url 27 | already = SpiderData.objects.filter(url=url) 28 | if already.count() == 0: 29 | yield scrapy.Request(url, callback=self.parse_detail) 30 | else: 31 | # print 'already crawled' 32 | pass 33 | 34 | def parse_detail(self,response): 35 | trs = response.xpath('//*[@id="main"]/div/div[2]/table//tr') 36 | data = {} 37 | for tr in trs: 38 | key = tr.xpath('td[1]/text()').extract_first() 39 | val = tr.xpath('td[2]/text()').extract_first() 40 | data[key] = val 41 | sendData('shanghai',data,response.url) -------------------------------------------------------------------------------- /searchInfo/searchInfo/spiders/xxgk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from distribute.views import * 4 | 5 | class XxgkSpider(scrapy.Spider): 6 | name = "xxgk" 7 | allowed_domains = ["xxgk.sdein.gov.cn"] 8 | 9 | def start_requests(self): 10 | results = getTasks('xxgk') 11 | self.taks_urls = {} 12 | self.tasks = {} 13 | if isinstance(results,dict): 14 | print results['error'] 15 | else: 16 | for re in results: 17 | self.tasks[re['id']] = {'t_count':len(re['urls']),'count':0} 18 | for u in re['urls']: 19 | self.taks_urls[u] = re['id'] 20 | yield self.make_requests_from_url(u) 21 | 22 | def after_parse(self,url): 23 | task_id = self.taks_urls[url] 24 | self.tasks[task_id]['count'] += 1 25 | if self.tasks[task_id]['count'] == self.tasks[task_id]['t_count']: 26 | afterTasks(task_id) 27 | 28 | def parse(self, response): 29 | xxgk_table = response.xpath('//table[@width="763"]/tr[4]/td/table/tr') 30 | for i in xxgk_table: 31 | sdein_title = i.xpath('td[2]/a/text()').extract_first() 32 | sdein_date = i.xpath('td[3]/text()').extract_first() 33 | sdein_url = i.xpath('td[2]/a/@href').extract_first() 34 | sdein_url = set_url_head(sdein_url,response.url) 35 | if sdein_url: 36 | yield scrapy.Request(sdein_url, callback=self.parse_item,meta={'sdein_title':sdein_title,'sdein_date':sdein_date,}) 37 | self.after_parse(response.url) 38 | 39 | def parse_item(self, response): 40 | sdein_title = response.meta['sdein_title'] 41 | sdein_date = response.meta['sdein_date'] 42 | xxgk_content = response.xpath('/html/body/table[2]/tr[6]/td/table/tr[2]/td/table[4]/tr[3]/td/table[1]').xpath('string(.)').extract_first() 43 | trs = response.xpath('/html/body/table[2]/tr[6]/td/table/tr[2]/td/table[2]/tr') 44 | data = {} 45 | for tr in trs: 46 | tds = tr.xpath('td') 47 | for num,td in enumerate(tds): 48 | if num % 2 == 0: 49 | data[td.xpath('string(.)').extract_first()] = tds[num+1].xpath('string(.)').extract_first() 50 | data['content'] = xxgk_content 51 | sendData('xxgk',data,response.url) -------------------------------------------------------------------------------- /selenium/baidu_cookies.json: -------------------------------------------------------------------------------- 1 | [{"domain": "index.baidu.com", "name": "bdshare_firstime", "expires": "\u5468\u65e5, 24 \u4e03\u6708 2022 02:52:30 GMT", "create_at": 1500864751.085, "value": "1500864750591", "expiry": 1658631150, "path": "/", "httponly": false, "secure": false}, {"domain": ".index.baidu.com", "name": "Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc", "value": "1500864750", "path": "/", "httponly": false, "secure": false}, {"domain": ".index.baidu.com", "name": 
"Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc", "expires": "\u5468\u4e8c, 24 \u4e03\u6708 2018 02:52:30 GMT", "value": "1500864214,1500864295,1500864672,1500864742", "expiry": 1532400750, "path": "/", "httponly": false, "secure": false}, {"domain": "index.baidu.com", "name": "CHKFORREG", "expires": "\u5468\u4e8c, 25 \u4e03\u6708 2017 02:52:29 GMT", "value": "54b8a6ea6d56d48e58c165c605b717e1", "expiry": 1500951149, "path": "/", "httponly": false, "secure": false}, {"domain": ".baidu.com", "name": "BDUSS", "expires": "\u5468\u4e94, 10 \u5341\u6708 2025 02:52:27 GMT", "value": "TNWV0UydlptYW91SksxMzJINjI4UDBhOXo4RUpPQ2hBN01lN1lnbVpvVHI3WnhaSVFBQUFBJCQAAAAAAAAAAAEAAAARP6U7stDA4c7e2~MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOtgdVnrYHVZL", "expiry": 1760064747, "path": "/", "httponly": true, "secure": false}, {"domain": ".index.baidu.com", "name": "FP_UID", "expires": "\u5468\u516d, 31 \u5341\u4e8c\u6708 2050 00:00:00 GMT", "value": "e692d0e17c4002bb8907b276ce03d6c8", "expiry": 2556057600, "path": "/", "httponly": false, "secure": false}, {"domain": "index.baidu.com", "name": "searchtips", "expires": "\u5468\u56db, 19 \u4e03\u6708 2018 02:52:22 GMT", "value": "1", "expiry": 1531968742, "path": "/", "httponly": false, "secure": false}, {"domain": ".baidu.com", "name": "BAIDUID", "expires": "\u5468\u4e8c, 24 \u4e03\u6708 2018 02:52:22 GMT", "value": "62945FA019DB5ABFC5EDB35994164E0A:FG=1", "expiry": 1532400742, "path": "/", "httponly": false, "secure": false}] -------------------------------------------------------------------------------- /selenium/exponent_baidu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | from keywords import get_keywords,save_keyword_index 8 | from PIL import Image 9 | import time 10 | import urllib 11 | import json 12 | import os 13 | import time 14 | 15 | def reset_cookies(browser,listCookies): 16 | browser.delete_all_cookies() 17 | for cookie in listCookies: 18 | browser.add_cookie({ 19 | 'domain': cookie['domain'] if cookie['domain'].startswith('.') else '.'+cookie['domain'], 20 | 'name': cookie['name'], 21 | 'value': cookie['value'], 22 | 'path': '/', 23 | 'expires': None 24 | }) 25 | 26 | def login(browser): 27 | name = browser.find_element_by_id("TANGRAM_12__userName") 28 | name.clear() 29 | name.send_keys("yourname") 30 | password = browser.find_element_by_id("TANGRAM_12__password") 31 | password.clear() 32 | password.send_keys("yourpassword") 33 | submit = browser.find_element_by_id('TANGRAM_12__submit').click() 34 | time.sleep(3) 35 | with open('baidu_cookies.json', 'w') as f: 36 | cookies = browser.get_cookies() 37 | cookies[0]['create_at'] = time.time() 38 | data = json.dumps(cookies) 39 | f.write(data) 40 | 41 | def move_fuc(action,browser,keyword,x,y,k): 42 | # 模拟移动鼠标截图 43 | trend = browser.find_element_by_id("trend") 44 | action.move_to_element_with_offset(trend,x,y).perform() 45 | time.sleep(10) 46 | browser.save_screenshot('images/screenshot.png') 47 | # 根据需要元素裁图 48 | viewbox = browser.find_element_by_id("viewbox") 49 | date = browser.find_element_by_xpath('//*[@id="viewbox"]/div[1]/div[1]').text.split(' ')[0] 50 | left = viewbox.location['x'] 51 | top = viewbox.location['y'] 52 | 
right = viewbox.location['x'] + viewbox.size['width'] 53 | bottom = viewbox.location['y'] + viewbox.size['height'] 54 | im = Image.open('images/screenshot.png') 55 | im = im.crop((left, top, right, bottom)) 56 | image_name = 'images/baidu_%s_%s.png' % (keyword,date) 57 | im.save(image_name) 58 | time.sleep(1) 59 | 60 | # 调用ocr识别图像 61 | os.system('./zfOcr '+image_name) 62 | time.sleep(3) 63 | dir_name = os.path.dirname(os.path.abspath(__file__))+'/' 64 | if os.path.exists(dir_name+image_name+'.txt'): 65 | with open(image_name+'.txt','r') as f: 66 | num = int(f.read()) 67 | data = {'keyword_id':k[0],'site':u'百度','keyword_type':k[2],'index_date':date,'index_value':num} 68 | save_keyword_index(data) 69 | else: 70 | print '%s.txt file not exist' % (image_name) 71 | # print date,num 72 | 73 | 74 | if __name__ == '__main__': 75 | browser = webdriver.PhantomJS() 76 | try: 77 | browser.maximize_window() 78 | keyword_list = get_keywords() 79 | for k in keyword_list: 80 | keyword = k[1].decode('utf8') 81 | keyword = urllib.quote(keyword.encode('cp936')) 82 | try: 83 | browser.get('http://index.baidu.com/?tpl=trend&word=%s' % keyword) 84 | with open('baidu_cookies.json', 'r') as f: 85 | listCookies = f.read() 86 | if listCookies: 87 | listCookies = json.loads(listCookies) 88 | create_at = listCookies[0]['create_at'] 89 | else: 90 | create_at = 0 91 | if create_at == 0 or time.time() - create_at > 3600*5: 92 | login(browser) 93 | else: 94 | reset_cookies(browser,listCookies) 95 | browser.get('http://index.baidu.com/?tpl=trend&word=%s' % keyword) 96 | time.sleep(5) 97 | try: 98 | trend = browser.find_element_by_id("trend") 99 | except: 100 | trend = '' 101 | if trend: 102 | action = ActionChains(browser) 103 | for i in range(0,30): 104 | x = 30 + 42*i 105 | if i == 29: 106 | x = 1230 107 | move_fuc(action,browser,keyword,x,150,k) 108 | else: 109 | print '%s not find' % keyword 110 | except Exception, e: 111 | print keyword,unicode(e) 112 | except Exception, e: 113 | print unicode(e) 114 | finally: 115 | browser.quit() -------------------------------------------------------------------------------- /selenium/exponent_sina.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from selenium import webdriver 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support import expected_conditions as EC 5 | from selenium.webdriver.support.wait import WebDriverWait 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | from keywords import get_keywords,save_keyword_index 8 | import urllib 9 | import time 10 | import sys 11 | 12 | def move_fuc(action,element,browser,x,y=200): 13 | action.move_to_element_with_offset(element,x,y).perform() 14 | time.sleep(1) 15 | div = browser.find_element_by_xpath('//*[@id="hotword_chart"]/div/div[2]') 16 | text = div.get_attribute('innerHTML') 17 | date = text.split('
    val = text.split('<br>')[1].split(u':')[1].replace(',','')
    return date,val


if __name__ == '__main__':
    browser = webdriver.PhantomJS()
    try:
        browser.maximize_window()
        keyword_list = get_keywords()
        for k in keyword_list:
            keyword = k[1].decode('utf8')
            keyword = urllib.quote(keyword.encode('utf8'))
            browser.get('http://data.weibo.com/index/hotword?wid=1020000010045&wname=%s' % keyword)
            try:
                canvas = browser.find_element_by_xpath('//*[@id="hotword_chart"]/div/canvas[7]')
            except:
                canvas = ''

            if canvas:
                action = ActionChains(browser)

                data = {}
                for i in range(0,33):
                    date,val = move_fuc(action,canvas,browser,35+i*26)
                    # print date,val
                    data = {'keyword_id':k[0],'site':u'新浪','keyword_type':k[2],'index_date':date,'index_value':val}
                    try:
                        save_keyword_index(data)
                    except Exception, e:
                        print keyword,date,val,unicode(e)
            else:
                print '%s not found' % keyword
    except Exception, e:
        print unicode(e)
    finally:
        browser.quit()
--------------------------------------------------------------------------------
/selenium/exponent_sougou.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from keywords import get_keywords,save_keyword_index
import re
import urllib
import time

if __name__ == '__main__':
    browser = webdriver.PhantomJS()
    try:
        keyword_list = get_keywords()
        for k in keyword_list:
            keyword = k[1].decode('utf8')
            keyword = urllib.quote(keyword.encode('utf8'))
            browser.get('http://zhishu.sogou.com/index/searchHeat?kwdNamesStr=%s&timePeriodType=MONTH&dataType=SEARCH_ALL&queryType=INPUT' % keyword)
            try:
                # The daily index points are embedded in the page as a JS literal
                r = re.findall(r'root.SG.data = {"pvList":\[([\s\S]*)],"infoList"', browser.page_source, re.M)
            except:
                r = ''
            if r:
                points = eval(r[0].split('],"infoList"')[0])
                for p in points:
                    date = str(p['date'])
                    date = date[0:4]+'-'+date[4:6]+'-'+date[6:8]
                    # print date,p['pv']
                    data = {'keyword_id':k[0],'site':u'搜狗','keyword_type':k[2],'index_date':date,'index_value':p['pv']}
                    try:
                        save_keyword_index(data)
                    except Exception, e:
                        print unicode(e),keyword,date,p['pv']
            else:
                print '%s not found' % keyword
    except Exception, e:
        print unicode(e)
    finally:
        browser.quit()
--------------------------------------------------------------------------------
/selenium/keywords.py:
--------------------------------------------------------------------------------
import psycopg2

conn = psycopg2.connect(database='SYH',user='postgres',password='bigdata123',host='10.20.1.50',port='5432')
cur = conn.cursor()

def get_keywords():
    cur.execute("SELECT * FROM biz_keyword;")
    rows = cur.fetchall()
    return rows

def save_keyword_index(data):
    # Parameterized queries let psycopg2 handle quoting instead of string formatting
    cur.execute("SELECT * FROM biz_keyword_index WHERE keyword_id=%s AND index_date=%s AND site=%s;",
                (data['keyword_id'],data['index_date'],data['site']))
    rows = cur.fetchall()
    if not len(rows):
        # print('save:',data['index_date'])
        cur.execute("INSERT INTO biz_keyword_index (keyword_id,site,keyword_type,index_date,index_value) VALUES (%s,%s,%s,%s,%s);",
                    (data['keyword_id'],data['site'],data['keyword_type'],data['index_date'],data['index_value']))
        conn.commit()
    return rows
--------------------------------------------------------------------------------
/selenium/selenium_get_url.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.action_chains import ActionChains
import time

def write_fuc(browser):
    # Collect every disclosure-form link on the current page and append it to url_list.txt
    table = browser.find_element_by_xpath('//*[@id="mainleft"]')
    tag_a = table.find_elements_by_tag_name('a')
    with open('url_list.txt','a+') as f:
        for a in tag_a:
            text = a.text
            if u'信息公开表' in text:
                print text
                f.write(a.get_attribute("href")+'\n')

if __name__ == '__main__':

    # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
    # desired_capabilities["phantomjs.page.settings.userAgent"] = (
    #     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
    # )
    browser = webdriver.PhantomJS()

    # browser.get('http://www.weihaifda.gov.cn/col/col14562/index.html')
    # browser.save_screenshot('screenshot.png')
    # write_fuc(browser)
    # browser.get('http://www.thsfda.gov.cn/xxgk/xzcfajxxgk/index_1.html')
    # write_fuc(browser)
    # for i in range(1,41):
    #     browser.get('http://ypjd.xjalt.gov.cn/qwssjgy.jsp?wbtreeid=1001&currentnum='+str(i)+'&newskeycode2=6KGM5pS%2F5aSE572a5qGI5Lu25L%2Bh5oGv5YWs5byA')
    #     write_fuc(browser)

    browser.get('http://www.huainan.gov.cn/public/column/4971284?type=4&catId=4977426&action=list')
    # browser.find_element_by_xpath('//*[@id="example"]/li[7]/div').click()
    # browser.switch_to.frame("conTarget")
    # write_fuc(browser)
    # time.sleep(1)
    # write_fuc(browser)

    count = 1
    while count <= 16:
        # try:
        #     next_page = browser.find_element_by_xpath('//*[@id="container"]/div/div/table//tr/td[3]/div[2]/form/table//tr[21]/td/table//tr/td/table//tr/td[2]/div/a[7]')
        try:
            next_page = browser.find_element_by_partial_link_text('下一页')
            # next_page = browser.find_element_by_id('NextPage1_Next')
        except:
            next_page = ''
        # if 'default_pgNextDisabled' in next_page.get_attribute('class'):
        if not next_page:
            print 'enter_over'
            write_fuc(browser)
            break
        else:
            print 'enter'
            write_fuc(browser)
            next_page.click()
            time.sleep(2)
            count += 1
    browser.quit()
--------------------------------------------------------------------------------
/selenium/selenium_proxy.py:
--------------------------------------------------------------------------------
#coding:utf8
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.common.proxy import ProxyType
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time

proxy_list = ['183.222.102.105','183.222.102.101','60.216.42.11','47.52.24.117']

proxy = Proxy(
    {
        'proxyType': ProxyType.MANUAL,
        'httpProxy': '47.52.24.117'
    }
)
desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
proxy.add_to_capabilities(desired_capabilities)
desired_capabilities["phantomjs.page.settings.userAgent"] = (
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
)

browser = webdriver.PhantomJS(desired_capabilities=desired_capabilities)

# browser.get('http://www.seabay.cn/cn/code/?search=pvg')
# print 'start_____'
# table = browser.find_element_by_xpath('//*[@id="infoiata"]')
# print table.get_attribute('innerHTML')
# browser.quit()


# browser.get('https://httpbin.org/get?show_env=1')  # check the request headers the server sees
browser.get('http://www.ip181.com/')  # check the proxy anonymity type
# browser.get('http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+3+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E8%A1%8C%E6%94%BF%E6%A1%88%E4%BB%B6')
# browser.get('http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=114&tableName=TABLE114&title=%E5%9B%BD%E5%AE%B6%E9%A3%9F%E5%93%81%E5%AE%89%E5%85%A8%E7%9B%91%E7%9D%A3%E6%8A%BD%E6%A3%80%EF%BC%88%E4%B8%8D%E5%90%88%E6%A0%BC%E4%BA%A7%E5%93%81%EF%BC%89&bcId=143106776907834761101199700381')
# print 'start_____'
try:
    # browser.get('http://www.luan.gov.cn/opennessTarget/?branch_id=5212bc2d682e09147c7c4aa8&branch_type=&column_code=70302&topic_id=&tag=&page=1')
    # time.sleep(3)
    browser.save_screenshot('screenshot1.png')
    # print browser.page_source
    # WebDriverWait(browser,30).until(EC.visibility_of_any_elements_located((By.CSS_SELECTOR,'.dataItem')))
    # resultlist = browser.find_element_by_id('list')
    # print resultlist.get_attribute('innerHTML')
    # time.sleep(10)
    # resultlist = browser.find_element_by_id('list')
    resultlist = browser.find_element_by_class_name('panel-body')
    print resultlist.get_attribute('innerHTML')
    # print browser.page_source
    # with open('wenshu.html','w') as ws:
    #     ws.write(resultlist.get_attribute('innerHTML'))
    # browser.save_screenshot('screenshot.png')
finally:
    browser.quit()
--------------------------------------------------------------------------------
/selenium/tesseract_test.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import pytesseract
from PIL import Image

if __name__ == '__main__':
    image = Image.open('images/test_2017-06-25.png')
    code = pytesseract.image_to_string(image, lang='eng')
    print code
--------------------------------------------------------------------------------