├── .gitignore
├── README.md
├── datadeal
│   ├── company
│   │   ├── __init__.py
│   │   ├── adminx.py
│   │   ├── apps.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       └── save_company.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   └── views.py
│   ├── datadeal
│   │   ├── __init__.py
│   │   ├── adminx.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       └── alarm.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_variable_important.py
│   │   │   ├── 0003_variable_all_text.py
│   │   │   ├── 0004_auto_20170608_1126.py
│   │   │   ├── 0005_remove_variable_important.py
│   │   │   ├── 0006_auto_20170703_1334.py
│   │   │   ├── 0007_auto_20170705_0835.py
│   │   │   ├── 0008_auto_20170720_1032.py
│   │   │   ├── 0009_auto_20170815_0855.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── settings.py
│   │   ├── static
│   │   │   ├── css
│   │   │   │   └── jquery.dataTables.min.css
│   │   │   ├── images
│   │   │   │   ├── Sorting icons.psd
│   │   │   │   ├── favicon.ico
│   │   │   │   ├── sort_asc.png
│   │   │   │   ├── sort_asc_disabled.png
│   │   │   │   ├── sort_both.png
│   │   │   │   ├── sort_desc.png
│   │   │   │   └── sort_desc_disabled.png
│   │   │   └── js
│   │   │       ├── iframe_common.js
│   │   │       ├── iframe_detail.js
│   │   │       ├── iframe_list.js
│   │   │       ├── jquery-3.2.0.min.js
│   │   │       └── jquery.dataTables.min.js
│   │   ├── templates
│   │   │   ├── detail_iframe.html
│   │   │   ├── files_admin.html
│   │   │   ├── images_admin.html
│   │   │   ├── index.html
│   │   │   └── list_iframe.html
│   │   ├── urls.py
│   │   ├── views.py
│   │   └── wsgi.py
│   ├── distribute
│   │   ├── __init__.py
│   │   ├── adminx.py
│   │   ├── apps.py
│   │   ├── management
│   │   │   ├── __init__.py
│   │   │   └── commands
│   │   │       ├── __init__.py
│   │   │       ├── get_spiders.py
│   │   │       ├── mongo_test.py
│   │   │       ├── register_node.py
│   │   │       ├── start_spider.py
│   │   │       └── upload_files.py
│   │   ├── migrations
│   │   │   ├── 0001_initial.py
│   │   │   ├── 0002_auto_20170606_1335.py
│   │   │   └── __init__.py
│   │   ├── models.py
│   │   ├── tests.py
│   │   ├── urls.py
│   │   └── views.py
│   ├── ghostdriver.log
│   └── manage.py
├── searchInfo
│   ├── ghostdriver.log
│   ├── scrapy.cfg
│   └── searchInfo
│       ├── __init__.py
│       ├── items.py
│       ├── middlewares.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           ├── beijing.py
│           ├── case.py
│           ├── chengdu.py
│           ├── common.py
│           ├── gansu.py
│           ├── hainan.py
│           ├── qingdao.py
│           ├── risk.py
│           ├── sdein.py
│           ├── sdqts.py
│           ├── sfda.py
│           ├── shandong.py
│           ├── shanghai.py
│           └── xxgk.py
└── selenium
    ├── baidu_cookies.json
    ├── exponent_baidu.py
    ├── exponent_sina.py
    ├── exponent_sougou.py
    ├── ghostdriver.log
    ├── keywords.py
    ├── selenium_get_url.py
    ├── selenium_proxy.py
    └── tesseract_test.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | datadeal/datadeal/medias/
3 | searchInfo/.scrapy/
4 | selenium/images/
5 | selenium/url_list.txt
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Crawlers for provincial food and drug administration penalty cases
2 | Scrapy + Selenium + Django
3 |
--------------------------------------------------------------------------------
/datadeal/company/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/__init__.py
--------------------------------------------------------------------------------
/datadeal/company/adminx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import xadmin
3 | from .models import *
4 |
5 | class CompanyAdmin(object):
6 | list_display = ['name','address','creditcode','registration','organization','kind','status','legalperson','start_at','capital','deadline','give_at','webpage','authority','scope']
7 | search_fields = ['name','address','creditcode','registration','organization','kind','status','legalperson','capital','deadline','webpage','authority','scope']
8 | list_filter = ['start_at','give_at']
9 | xadmin.site.register(Company, CompanyAdmin)
10 |
11 | class ShareholdersAdmin(object):
12 | list_display = ['name','kind','subcribe_money','subcribe_date','real_money','real_date','company']
13 | search_fields = ['name','kind','subcribe_money','subcribe_date','real_money','real_date']
14 | list_filter = ['company']
15 | xadmin.site.register(Shareholders, ShareholdersAdmin)
16 |
17 | class MemberAdmin(object):
18 | list_display = ['name','kind','company']
19 | search_fields = ['name','kind']
20 | list_filter = ['company']
21 | xadmin.site.register(Member, MemberAdmin)
--------------------------------------------------------------------------------
/datadeal/company/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 |
3 |
4 | class CompanyConfig(AppConfig):
5 | name = 'company'
6 |
--------------------------------------------------------------------------------
/datadeal/company/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/management/__init__.py
--------------------------------------------------------------------------------
/datadeal/company/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/management/commands/__init__.py
--------------------------------------------------------------------------------
/datadeal/company/management/commands/save_company.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from company.models import *
4 | from selenium import webdriver
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support import expected_conditions as EC
7 | from selenium.webdriver.support.wait import WebDriverWait
8 | from selenium.webdriver.common.action_chains import ActionChains
9 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
10 | import time
11 | import urllib
12 | import json
13 |
14 | class Command(BaseCommand):
15 | help = '保存公司信息'
16 |
17 | def handle(self, *args, **options):
18 | company_list = ['新泰市人民药业有限公司仁德人民医药商场','新泰市人民药业有限公司新兰人民医药商场','新泰市人民药业有限公司黄崖人民医药商场','新泰市泉沟安康大药店','新泰市百姓园大药房','新泰市泉沟镇保安堂大药店','新泰市泉沟镇子恒药店','新泰市泉沟镇老百姓大药房','新泰市泉沟平价大药店','新泰市泉沟镇泉民大药店','新泰市康宇大药店','韩庄众心百姓大药房','西张庄众心百姓大药房','芙蓉街众心百姓大药房','淄博新华大药店连锁有限公司桓台陈庄药店','淄博新华大药店连锁有限公司兴桓药店','淄博丰祺医药有限公司云涛药店','桓台县索镇瑞康药店','桓台县城区信康药店','桓台县东壁大药店','淄博丰祺医药零售有限公司侯庄药店','淄博丰祺医药零售有限公司姜坊药店','果里镇福生堂药店','果里镇广生堂药店','淄博市临淄昊虹工贸有限公司','青岛啤酒股份有限公司青岛啤酒三厂','青岛北苑英徽家具有限公司','青岛平泰电子有限公司','青岛司玛特瑞进电子有限公司','青岛黄金铅锌开发有限公司','青岛长荣化工有限公司','东明县迪奥化工有限公司','东明元创化工有限公司','东明宏昌化工有限公司','东明欧宝板业有限公司','山东优一化工有限公司','东明凌宇化工有限公司','东明佳润化工有限公司']
19 |
20 | desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
21 | desired_capabilities["phantomjs.page.settings.userAgent"] = (
22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
23 | )
24 | browser = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
25 | browser.maximize_window()
26 |
27 | for company in company_list:
28 | keyword = urllib.quote(company)
29 | browser.get('http://www.xizhi.com/search?wd=%s&type=all' % keyword)
30 | try:
31 | a = browser.find_element_by_xpath('/html/body/div[5]/div[1]/ul/li/div/div[2]/h3/a')
32 | except:
33 | a = ''
34 | if a:
35 | browser.get(a.get_attribute("href"))
36 | name = browser.find_element_by_xpath('/html/body/div[5]/div[1]/div[2]/h2/a').text
37 | print name
38 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[1]/div')
39 | tds = div.find_elements_by_tag_name('td')
40 | already = Company.objects.filter(name=name)
41 | if not already.count():
42 | obj = Company.objects.create(name=name,address=tds[25].text,creditcode=tds[1].text,registration=tds[3].text,organization=tds[5].text,kind=tds[7].text,status=tds[9].text,legalperson=tds[11].text,start_at=tds[13].text,capital=tds[15].text,deadline=tds[17].text,give_at=tds[19].text,webpage=tds[21].text,authority=tds[23].text,scope=tds[27].text)
43 | else:
44 | obj = ''
45 |
46 | if obj:
47 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[2]')
48 | trs = div.find_elements_by_tag_name('tr')
49 | if len(trs) > 1:
50 | for i,tr in enumerate(trs):
51 | if i > 0:
52 | tds = tr.find_elements_by_tag_name('td')
53 | if tds[2].text:
54 | subcribe = tds[2].text.split('/')
55 | if len(subcribe) > 1:
56 | subcribe_money = subcribe[0]
57 | subcribe_date = subcribe[1]
58 | else:
59 | subcribe_money = subcribe[0]
60 | subcribe_date = ''
61 | else:
62 | subcribe_money = ''
63 | subcribe_date = ''
64 | if tds[3].text:
65 | real = tds[3].text.split('/')
66 | if len(real) > 1:
67 | real_money = real[0]
68 | real_date = real[1]
69 | else:
70 | real_money = real[0]
71 | real_date = ''
72 | else:
73 | real_money = ''
74 | real_date = ''
75 | try:
76 | Shareholders.objects.create(name=tds[0].text,kind=tds[1].text,subcribe_money=subcribe_money,subcribe_date=subcribe_date,real_money=real_money,real_date=real_date,company=obj)
77 | except:
78 | Shareholders.objects.create(name=tds[0].text,kind=tds[1].text,subcribe_money=subcribe_money,real_money=real_money,company=obj)
79 |
80 | div = browser.find_element_by_xpath('//*[@id="details-content"]/div[1]/div[3]')
81 | lis = div.find_elements_by_tag_name('li')
82 | if len(lis) > 0:
83 | for li in lis:
84 | key = li.find_element_by_class_name('lab').text.split(u':')[0]
85 | val = li.find_element_by_class_name('lab-in').text
86 | Member.objects.create(name=val,kind=key,company=obj)
87 |
88 | browser.quit()
89 |
--------------------------------------------------------------------------------
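The save_company command above is run as a standard Django management command (python manage.py save_company from the datadeal project directory). It parses the subscribed and paid-in capital cells twice with the same "money/date" split logic; a small helper along these lines would remove the duplication. This is only a sketch, and split_money_date is a hypothetical name that does not exist in the repository:

def split_money_date(cell_text):
    """Split a 'money/date' table cell into its two parts.

    Returns ('', '') for an empty cell and (money, '') when no date part
    is present, mirroring the branches in save_company.py.
    """
    if not cell_text:
        return '', ''
    parts = cell_text.split('/')
    if len(parts) > 1:
        return parts[0], parts[1]
    return parts[0], ''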
/datadeal/company/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-07-27 17:04
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 | import django.db.models.deletion
7 |
8 |
9 | class Migration(migrations.Migration):
10 |
11 | initial = True
12 |
13 | dependencies = [
14 | ]
15 |
16 | operations = [
17 | migrations.CreateModel(
18 | name='Company',
19 | fields=[
20 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
21 | ('name', models.CharField(max_length=50, verbose_name='\u4f01\u4e1a\u540d\u79f0')),
22 | ('address', models.CharField(max_length=100, verbose_name='\u4f01\u4e1a\u5730\u5740')),
23 | ('creditcode', models.CharField(max_length=50, null=True, verbose_name='\u7edf\u4e00\u793e\u4f1a\u4fe1\u7528\u4ee3\u7801')),
24 | ('registration', models.CharField(max_length=50, verbose_name='\u6ce8\u518c\u53f7')),
25 | ('organization', models.CharField(max_length=50, null=True, verbose_name='\u7ec4\u7ec7\u673a\u6784\u4ee3\u7801')),
26 | ('kind', models.CharField(max_length=50, null=True, verbose_name='\u516c\u53f8\u7c7b\u578b')),
27 | ('status', models.CharField(max_length=50, null=True, verbose_name='\u7ecf\u8425\u72b6\u6001')),
28 | ('legalperson', models.CharField(max_length=50, verbose_name='\u6cd5\u5b9a\u4ee3\u8868\u4eba')),
29 | ('start_at', models.DateField(null=True, verbose_name='\u7ecf\u8425\u65e5\u671f')),
30 | ('capital', models.CharField(max_length=50, null=True, verbose_name='\u6ce8\u518c\u8d44\u672c')),
31 | ('deadline', models.CharField(max_length=50, null=True, verbose_name='\u8425\u4e1a\u671f\u9650')),
32 | ('give_at', models.DateField(null=True, verbose_name='\u53d1\u7167\u65e5\u671f')),
33 | ('webpage', models.CharField(max_length=50, null=True, verbose_name='\u7f51\u5740')),
34 | ('authority', models.CharField(max_length=50, null=True, verbose_name='\u767b\u8bb0\u673a\u5173')),
35 | ('scope', models.TextField(null=True, verbose_name='\u7ecf\u8425\u8303\u56f4')),
36 | ],
37 | options={
38 | 'verbose_name': '\u516c\u53f8\u4fe1\u606f',
39 | 'verbose_name_plural': '\u516c\u53f8\u4fe1\u606f\u7ba1\u7406',
40 | },
41 | ),
42 | migrations.CreateModel(
43 | name='Member',
44 | fields=[
45 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
46 | ('name', models.CharField(max_length=50, verbose_name='\u540d\u5b57')),
47 | ('kind', models.CharField(max_length=50, verbose_name='\u8eab\u4efd')),
48 | ('company', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='company.Company', verbose_name='\u516c\u53f8')),
49 | ],
50 | options={
51 | 'verbose_name': '\u6210\u5458\u4fe1\u606f',
52 | 'verbose_name_plural': '\u6210\u5458\u4fe1\u606f\u7ba1\u7406',
53 | },
54 | ),
55 | migrations.CreateModel(
56 | name='Shareholders',
57 | fields=[
58 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
59 | ('name', models.CharField(max_length=50, verbose_name='\u80a1\u4e1c')),
60 | ('kind', models.CharField(max_length=50, verbose_name='\u7c7b\u578b')),
61 | ('subcribe_money', models.CharField(max_length=50, null=True, verbose_name='\u8ba4\u7f34\u51fa\u8d44\u91d1\u989d')),
62 | ('subcribe_date', models.DateField(null=True, verbose_name='\u8ba4\u7f34\u51fa\u8d44\u65f6\u95f4')),
63 | ('real_money', models.CharField(max_length=50, null=True, verbose_name='\u5b9e\u7f34\u51fa\u8d44\u91d1\u989d')),
64 | ('real_date', models.DateField(null=True, verbose_name='\u5b9e\u7f34\u51fa\u8d44\u65f6\u95f4')),
65 | ('company', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='company.Company', verbose_name='\u516c\u53f8')),
66 | ],
67 | options={
68 | 'verbose_name': '\u80a1\u4e1c\u4fe1\u606f',
69 | 'verbose_name_plural': '\u80a1\u4e1c\u4fe1\u606f\u7ba1\u7406',
70 | },
71 | ),
72 | ]
73 |
--------------------------------------------------------------------------------
/datadeal/company/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/company/migrations/__init__.py
--------------------------------------------------------------------------------
/datadeal/company/models.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models
5 |
6 | # Create your models here.
7 | class Company(models.Model):
8 | name = models.CharField(u'企业名称',max_length=50)
9 | address = models.CharField(u'企业地址',max_length=100)
10 | creditcode = models.CharField(u'统一社会信用代码',max_length=50,null=True)
11 | registration = models.CharField(u'注册号',max_length=50)
12 | organization = models.CharField(u'组织机构代码',max_length=50,null=True)
13 | kind = models.CharField(u'公司类型',max_length=50,null=True)
14 | status = models.CharField(u'经营状态',max_length=50,null=True)
15 | legalperson = models.CharField(u'法定代表人',max_length=50)
16 | start_at = models.DateField(u'经营日期',null=True)
17 | capital = models.CharField(u'注册资本',max_length=50,null=True)
18 | deadline = models.CharField(u'营业期限',max_length=50,null=True)
19 | give_at = models.DateField(u'发照日期',null=True)
20 | webpage = models.CharField(u'网址',max_length=50,null=True)
21 | authority = models.CharField(u'登记机关',max_length=50,null=True)
22 | scope = models.TextField(u'经营范围',null=True)
23 |
24 | def __unicode__(self):
25 | return self.name
26 |
27 | class Meta:
28 | verbose_name = u'公司信息'
29 | verbose_name_plural = u'公司信息管理'
30 |
31 |
32 | class Shareholders(models.Model):
33 | name = models.CharField(u'股东',max_length=50)
34 | kind = models.CharField(u'类型',max_length=50)
35 | subcribe_money = models.CharField(u'认缴出资金额',max_length=50,null=True)
36 | subcribe_date = models.DateField(u'认缴出资时间',null=True)
37 | real_money = models.CharField(u'实缴出资金额',max_length=50,null=True)
38 | real_date = models.DateField(u'实缴出资时间',null=True)
39 | company = models.ForeignKey('Company',verbose_name=u'公司')
40 |
41 | def __unicode__(self):
42 | return self.name
43 |
44 | class Meta:
45 | verbose_name = u'股东信息'
46 | verbose_name_plural = u'股东信息管理'
47 |
48 | class Member(models.Model):
49 | name = models.CharField(u'名字',max_length=50)
50 | kind = models.CharField(u'身份',max_length=50)
51 | company = models.ForeignKey('Company',verbose_name=u'公司')
52 |
53 | def __unicode__(self):
54 | return self.name
55 |
56 | class Meta:
57 | verbose_name = u'成员信息'
58 | verbose_name_plural = u'成员信息管理'
--------------------------------------------------------------------------------
/datadeal/company/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/datadeal/company/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 |
3 | # Create your views here.
4 |
--------------------------------------------------------------------------------
/datadeal/datadeal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/__init__.py
--------------------------------------------------------------------------------
/datadeal/datadeal/adminx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import xadmin
3 | from xadmin.views.base import CommAdminView
4 | from xadmin.plugins.themes import ThemePlugin
5 | from django.http import HttpResponseRedirect
6 | from datadeal.settings import BASE_DIR
7 | from pure_pagination import Paginator, EmptyPage, PageNotAnInteger
8 | from .models import *
9 | from distribute.models import *
10 | from company.models import *
11 | from .views import *
12 | import subprocess
13 | import datetime
14 | import json
15 | import time
16 | import os
17 |
18 | class BaseSetting(object):
19 | enable_themes = True
20 | use_bootswatch = True
21 |
22 | class AdminSettings(object):
23 | menu_style = "accordion"
24 | site_title = '爬虫管理系统'
25 | site_footer = '爬虫管理系统'
26 |
27 | def get_site_menu(self):
28 | return [
29 | {'title': '爬虫管理','icon':'fa fa-bug', 'perm': self.get_model_perm(scrapySetting, 'change'), 'menus':(
30 | {'title': '爬虫生成配置', 'url': self.get_model_url(scrapySetting, 'changelist'),
31 | 'perm': self.get_model_perm(scrapySetting, 'changelist')},
32 | {'title': '域名白名单', 'url': self.get_model_url(AllowDomains, 'changelist'),
33 | 'perm': self.get_model_perm(AllowDomains, 'changelist')},
34 | {'title': '一级爬取地址', 'url': self.get_model_url(startUrls, 'changelist'),
35 | 'perm': self.get_model_perm(startUrls, 'changelist')},
36 | {'title': '循环体列表', 'url': self.get_model_url(CycleObj, 'changelist'),
37 | 'perm': self.get_model_perm(CycleObj, 'changelist')},
38 | {'title': '变量列表', 'url': self.get_model_url(Variable, 'changelist'),
39 | 'perm': self.get_model_perm(Variable, 'changelist')},
40 | {'title': '爬虫列表', 'url': self.get_model_url(scrapyList, 'changelist'),
41 | 'perm': self.get_model_perm(scrapyList, 'changelist')},
42 | )},
43 | {'title': '节点管理','icon':'fa fa-chain', 'perm': self.get_model_perm(Node, 'change'), 'menus':(
44 | {'title': '节点管理', 'url': self.get_model_url(Node, 'changelist'),
45 | 'perm': self.get_model_perm(Node, 'changelist')},
46 | {'title': '节点IP管理', 'url': self.get_model_url(NodeIp, 'changelist'),
47 | 'perm': self.get_model_perm(NodeIp, 'changelist')},
48 | {'title': '任务管理', 'url': self.get_model_url(NodeTask, 'changelist'),
49 | 'perm': self.get_model_perm(NodeTask, 'changelist')},
50 | )},
51 | {'title': '数据管理','icon':'fa fa-bar-chart-o', 'perm': self.get_model_perm(SpiderData, 'change'), 'menus':(
52 | {'title': '数据信息', 'url': self.get_model_url(SpiderData, 'changelist'),
53 | 'perm': self.get_model_perm(SpiderData, 'changelist')},
54 | {'title': '错误信息', 'url': self.get_model_url(ErrorData, 'changelist'),
55 | 'perm': self.get_model_perm(ErrorData, 'changelist')},
56 | {'title': '预警信息', 'url': self.get_model_url(DataAlarm, 'changelist'),
57 | 'perm': self.get_model_perm(DataAlarm, 'changelist')},
58 | {'title': '公司信息', 'url': self.get_model_url(Company, 'changelist'),
59 | 'perm': self.get_model_perm(Company, 'changelist')},
60 | {'title': '股东信息', 'url': self.get_model_url(Shareholders, 'changelist'),
61 | 'perm': self.get_model_perm(Shareholders, 'changelist')},
62 | {'title': '成员信息', 'url': self.get_model_url(Member, 'changelist'),
63 | 'perm': self.get_model_perm(Member, 'changelist')},
64 | )},
65 | {'title': '文件管理','icon':'fa fa-file', 'perm': self.get_model_perm(SpiderData, 'change'), 'menus':(
66 | {'title': '图片管理', 'url':'/admin/images_admin/', 'perm': ''},
67 | {'title': '文件管理', 'url':'/admin/files_admin/', 'perm': ''},
68 | )},
69 | ]
70 |
71 | xadmin.site.register(xadmin.views.BaseAdminView,BaseSetting)
72 | xadmin.site.register(xadmin.views.CommAdminView,AdminSettings)
73 |
74 | class scrapySettingAdmin(object):
75 | list_display = ['name', 'allow_domains','start_urls','cycleobj','variable','num','kind','create_at','modify_at']
76 | search_fields = ['name','allow_domains']
77 | list_filter = ['kind','create_at','modify_at']
78 | style_fields = {'allow_domains': 'm2m_transfer','start_urls': 'm2m_transfer','cycleobj': 'm2m_transfer','variable': 'm2m_transfer'}
79 | actions = ['create_spider','create_tasks']
80 | def create_spider(self, request, queryset):
81 | for q in queryset:
82 | if scrapyList.objects.filter(name=q.name).count() == 0:
83 | create_scrapy_file(q)
84 | self.message_user(u'%s 爬虫创建成功' % q.name)
85 | scrapyList.objects.create(name=q.name)
86 | else:
87 | self.message_user(u'%s 爬虫名已被使用' % q.name)
88 | create_spider.short_description = "创建爬虫"
89 | def create_tasks(self, request, queryset):
90 | from distribute.models import NodeTask
91 | for q in queryset:
92 | try:
93 | scrapy = scrapyList.objects.get(name=q.name)
94 | except:
95 | scrapy = ''
96 | if scrapy:
97 | urls = q.start_urls.all()
98 | total = urls.count()
99 | count,last = divmod(total,q.num)
100 | for n in range(0,count+1):
101 | start = n*q.num
102 | if n == count:
103 | if last > 0:
104 | end = total
105 | else:
106 | end = 'pass'
107 | else:
108 | end = (n+1)*q.num
109 | if not end == 'pass':
110 | name = q.name+'_'+str(n+1)
111 | already = NodeTask.objects.filter(name=name).count()
112 | if not already:
113 | obj = NodeTask.objects.create(name=name,scrapy=scrapy,priority=n+1)
114 | for i in urls[start:end]:
115 | obj.urls.add(i)
116 | self.message_user(u'%s 爬虫任务分发完毕' % q.name)
117 | else:
118 | self.message_user(u'请先创建%s爬虫' % q.name)
119 | create_tasks.short_description = "生成任务"
120 | xadmin.site.register(scrapySetting, scrapySettingAdmin)
121 |
122 | class AllowDomainsAdmin(object):
123 | list_display = ['name']
124 | search_fields = ['name']
125 | list_filter = []
126 | xadmin.site.register(AllowDomains, AllowDomainsAdmin)
127 |
128 | class startUrlsAdmin(object):
129 | list_display = ['url']
130 | search_fields = ['url']
131 | list_filter = []
132 | xadmin.site.register(startUrls, startUrlsAdmin)
133 |
134 | class CycleObjAdmin(object):
135 | list_display = ['name','xpath','variable']
136 | search_fields = ['name','xpath']
137 | list_filter = ['variable']
138 | xadmin.site.register(CycleObj, CycleObjAdmin)
139 |
140 | class VariableAdmin(object):
141 | list_display = ['name','xpath','kind','all_text']
142 | search_fields = ['name','xpath']
143 | list_filter = ['kind','all_text']
144 | xadmin.site.register(Variable, VariableAdmin)
145 |
146 | class scrapyListAdmin(object):
147 | list_display = ['name','priority','alarm_day','create_at','is_open']
148 | search_fields = ['name']
149 | list_filter = ['create_at','is_open']
150 | list_editable = ['alarm_day','is_open']
151 | actions = ['start_spider','download']
152 | def start_spider(self, request, queryset):
153 | for q in queryset:
154 | self.message_user(u'%s 爬虫正在爬取数据... %s' % (q.name,datetime.datetime.now().strftime('%H:%M:%S')))
155 | subprocess.call('cd ../searchInfo && scrapy crawl %s -o ../datadeal/datadeal/medias/%s_data.json' % (q.name,q.name), shell=True)
156 | self.message_user(u'%s 爬虫已经抓取完数据 %s' % (q.name,datetime.datetime.now().strftime('%H:%M:%S')))
157 | start_spider.short_description = "运行爬虫"
158 | def download(self, request, queryset):
159 | for q in queryset:
160 | if os.path.exists(BASE_DIR+'/datadeal/medias/%s_data.json' % q.name):
161 | return HttpResponseRedirect('/medias/%s_data.json' % q.name)
162 | else:
163 | self.message_user(u'%s 数据不存在,请先运行爬虫' % q.name)
164 | download.short_description = "数据下载"
165 | xadmin.site.register(scrapyList, scrapyListAdmin)
166 |
167 | class SpiderDataAdmin(object):
168 | list_display = ['scrapyname','create_at','data_str','page_pdf']
169 | search_fields = ['scrapyname','uid','data','url','file']
170 | list_filter = ['scrapyname','create_at']
171 | xadmin.site.register(SpiderData, SpiderDataAdmin)
172 |
173 | class ErrorDataAdmin(object):
174 | list_display = ['scrapyname','uid','create_at','url','content']
175 | search_fields = ['scrapyname','uid','url','content']
176 | list_filter = ['scrapyname','create_at']
177 | xadmin.site.register(ErrorData, ErrorDataAdmin)
178 |
179 | class DataAlarmAdmin(object):
180 | list_display = ['scrapyname','is_alarm','remark','create_at']
181 | search_fields = ['scrapyname','remark']
182 | list_filter = ['scrapyname','is_alarm','create_at']
183 | list_editable = ['is_alarm','remark']
184 | xadmin.site.register(DataAlarm, DataAlarmAdmin)
185 |
186 | class ImagesAdminView(CommAdminView):
187 |
188 | def get(self, request, *args, **kwargs):
189 | images_dir = BASE_DIR+'/datadeal/medias/images'
190 | images = os.listdir(images_dir)
191 | img_list = []
192 | for image in images:
193 | url = images_dir+'/'+image
194 | ctime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(os.path.getctime(url)+8*3600))
195 | img_list.append({'url':image,'ctime':ctime})
196 | img_list.sort(key=lambda x:x['ctime'],reverse=True)
197 | try:
198 | page = request.GET.get('page', 1)
199 | except PageNotAnInteger:
200 | page = 1
201 | p = Paginator(img_list,12, request=request)
202 | img_list = p.page(page)
203 |
204 | context = self.get_context()
205 | context.update({
206 | 'p':p,
207 | 'img_list':img_list
208 | })
209 | return self.template_response('images_admin.html',context)
210 | xadmin.site.register_view(r'^images_admin/$', ImagesAdminView, name='images_admin')
211 |
212 | class FilesAdminView(CommAdminView):
213 |
214 | def get(self, request, *args, **kwargs):
215 | dir_list = ['common','risk']
216 | file_list = []
217 | for d in dir_list:
218 | files_dir = BASE_DIR+'/datadeal/medias/'+d
219 | files = os.listdir(files_dir)
220 | for file in files:
221 | import locale
222 | file = file.decode(locale.getdefaultlocale()[1])
223 | url = files_dir+'/'+file
224 | ctime = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(os.path.getctime(url)+8*3600))
225 | file_list.append([file,ctime,'下载删除'])
226 |
227 | context = self.get_context()
228 | context.update({
229 | 'file_list':json.dumps(file_list)
230 | })
231 | return self.template_response('files_admin.html',context)
232 | xadmin.site.register_view(r'^files_admin/$', FilesAdminView, name='files_admin')
--------------------------------------------------------------------------------
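In create_tasks above, a setting's start_urls are split into NodeTask batches of at most num links using divmod and a range loop. The same batching can be expressed more directly; the chunks helper below is a sketch for illustration only and is not defined in the project:

def chunks(items, size):
    """Yield consecutive slices of at most `size` items."""
    for start in range(0, len(items), size):
        yield items[start:start + size]

# e.g. 20 start URLs with num = 6 yield batches of 6, 6, 6 and 2 links,
# matching the four NodeTask objects the action would create.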
/datadeal/datadeal/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/management/__init__.py
--------------------------------------------------------------------------------
/datadeal/datadeal/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/management/commands/__init__.py
--------------------------------------------------------------------------------
/datadeal/datadeal/management/commands/alarm.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from datadeal.models import scrapyList,DataAlarm,SpiderData
4 | import datetime
5 |
6 | class Command(BaseCommand):
7 | help = '生成爬虫预警'
8 |
9 | def handle(self, *args, **options):
10 | scrapy = scrapyList.objects.filter(is_open=True)
11 | for s in scrapy:
12 | try:
13 | data = SpiderData.objects.filter(scrapyname=s.name).order_by('-create_at')[0]
14 | except:
15 | data = ''
16 | if data:
17 | nodata_day = (datetime.datetime.now()-data.create_at).days
18 | if nodata_day > s.alarm_day:
19 | da = DataAlarm.objects.filter(is_alarm=True,scrapyname=s.name).order_by('-create_at')
20 | if da.count():
21 | already_day = (datetime.datetime.now()-da[0].create_at).days
22 | if already_day > s.alarm_day:
23 | DataAlarm.objects.create(scrapyname=s.name,is_alarm=True,remark='')
24 | else:
25 | DataAlarm.objects.create(scrapyname=s.name,is_alarm=True,remark='')
--------------------------------------------------------------------------------
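The alarm command creates a DataAlarm only when an enabled spider has produced no SpiderData for more than alarm_day days and no alarm has already been raised inside that same window. A minimal sketch of that rule, assuming naive datetimes as stored with USE_TZ = False (needs_alarm is a hypothetical helper, not part of the command):

import datetime

def needs_alarm(last_data_at, last_alarm_at, alarm_day):
    """True when the spider has been silent for more than alarm_day days
    and the most recent alarm (if any) is older than alarm_day days."""
    now = datetime.datetime.now()
    if (now - last_data_at).days <= alarm_day:
        return False
    return last_alarm_at is None or (now - last_alarm_at).days > alarm_day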
/datadeal/datadeal/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-05 09:36
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | initial = True
11 |
12 | dependencies = [
13 | ]
14 |
15 | operations = [
16 | migrations.CreateModel(
17 | name='AllowDomains',
18 | fields=[
19 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
20 | ('name', models.CharField(max_length=500, verbose_name='\u540d\u79f0')),
21 | ],
22 | options={
23 | 'verbose_name': '\u57df\u540d\u767d\u540d\u5355',
24 | 'verbose_name_plural': '\u57df\u540d\u767d\u540d\u5355\u7ba1\u7406',
25 | },
26 | ),
27 | migrations.CreateModel(
28 | name='CycleObj',
29 | fields=[
30 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
31 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7', max_length=50, verbose_name='\u5faa\u73af\u4f53\u540d\u79f0')),
32 | ('xpath', models.CharField(help_text=b'\xe4\xbd\xbf\xe7\x94\xa8xpath\xe8\xa7\x84\xe5\x88\x99\xef\xbc\x9a\nnodename \xe9\x80\x89\xe6\x8b\xa9\xe6\x89\x80\xe6\x9c\x89\xe7\x9b\xae\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe5\xad\x90\xe8\x8a\x82\n/ \xe4\xbb\x8e\xe6\xa0\xb9\xe8\x8a\x82\xe8\xbf\x9b\xe8\xa1\x8c\xe9\x80\x89\xe6\x8b\xa9\n// \xe9\x80\x89\xe6\x8b\xa9\xe6\x96\x87\xe6\xa1\xa3\xe4\xb8\xad\xe7\x9b\xb8\xe5\x90\xbb\xe5\x90\x88\xe7\x9a\x84\xe8\x8a\x82\xe8\x80\x8c\xe4\xb8\x8d\xe7\xae\xa1\xe5\x85\xb6\xe5\x9c\xa8\xe6\x96\x87\xe6\xa1\xa3\xe7\x9a\x84\xe4\xbd\x95\xe5\xa4\x84\n. \xe9\x80\x89\xe6\x8b\xa9\xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\n.. \xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe7\x88\xb6\xe8\x8a\x82\n@ \xe9\x80\x89\xe6\x8b\xa9\xe5\xb1\x9e\xe6\x80\xa7', max_length=200, verbose_name='\u67e5\u8be2\u89c4\u5219')),
33 | ],
34 | options={
35 | 'verbose_name': '\u5faa\u73af\u4f53\u5217\u8868',
36 | 'verbose_name_plural': '\u5faa\u73af\u4f53\u5217\u8868\u7ba1\u7406',
37 | },
38 | ),
39 | migrations.CreateModel(
40 | name='scrapyList',
41 | fields=[
42 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
43 | ('name', models.CharField(max_length=500, verbose_name='\u540d\u79f0')),
44 | ('priority', models.IntegerField(default=0, verbose_name='\u9879\u76ee\u4f18\u5148\u7ea7')),
45 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
46 | ],
47 | options={
48 | 'verbose_name': '\u722c\u866b\u5217\u8868',
49 | 'verbose_name_plural': '\u722c\u866b\u5217\u8868\u7ba1\u7406',
50 | },
51 | ),
52 | migrations.CreateModel(
53 | name='scrapySetting',
54 | fields=[
55 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
56 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7', max_length=20, verbose_name='\u540d\u79f0')),
57 | ('num', models.IntegerField(default=1, verbose_name='\u5355\u4e2a\u4efb\u52a1\u94fe\u63a5\u6570')),
58 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
59 | ('modify_at', models.DateTimeField(auto_now=True, verbose_name='\u4fee\u6539\u65f6\u95f4')),
60 | ('allow_domains', models.ManyToManyField(to='datadeal.AllowDomains', verbose_name='\u57df\u540d\u767d\u540d\u5355')),
61 | ('cycleobj', models.ManyToManyField(to='datadeal.CycleObj', verbose_name='\u5faa\u73af\u4f53')),
62 | ],
63 | options={
64 | 'verbose_name': '\u722c\u866b\u914d\u7f6e',
65 | 'verbose_name_plural': '\u722c\u866b\u914d\u7f6e\u7ba1\u7406',
66 | },
67 | ),
68 | migrations.CreateModel(
69 | name='startUrls',
70 | fields=[
71 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
72 | ('url', models.URLField(max_length=500, verbose_name='\u540d\u79f0')),
73 | ],
74 | options={
75 | 'verbose_name': '\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868',
76 | 'verbose_name_plural': '\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868\u7ba1\u7406',
77 | },
78 | ),
79 | migrations.CreateModel(
80 | name='Variable',
81 | fields=[
82 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
83 | ('name', models.CharField(help_text=b'\xe4\xb8\x8d\xe8\xa6\x81\xe8\xbe\x93\xe5\x85\xa5\xe4\xb8\xad\xe6\x96\x87\xe5\x92\x8c\xe7\x89\xb9\xe6\xae\x8a\xe7\xac\xa6\xe5\x8f\xb7,\xe5\xbb\xba\xe8\xae\xae\xe7\x94\xa8\xe5\xaf\xb9\xe5\xba\x94\xe7\x9a\x84\xe5\xbe\xaa\xe7\x8e\xaf\xe4\xbd\x93\xe5\x81\x9a\xe5\x89\x8d\xe7\xbc\x80\xe5\x8a\xa0\xe4\xbb\xa5\xe5\x8c\xba\xe5\x88\x86', max_length=50, verbose_name='\u53d8\u91cf\u540d\u79f0')),
84 | ('xpath', models.CharField(help_text=b'\xe4\xbd\xbf\xe7\x94\xa8xpath\xe8\xa7\x84\xe5\x88\x99\xef\xbc\x9a\nnodename \xe9\x80\x89\xe6\x8b\xa9\xe6\x89\x80\xe6\x9c\x89\xe7\x9b\xae\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe5\xad\x90\xe8\x8a\x82\n/ \xe4\xbb\x8e\xe6\xa0\xb9\xe8\x8a\x82\xe8\xbf\x9b\xe8\xa1\x8c\xe9\x80\x89\xe6\x8b\xa9\n// \xe9\x80\x89\xe6\x8b\xa9\xe6\x96\x87\xe6\xa1\xa3\xe4\xb8\xad\xe7\x9b\xb8\xe5\x90\xbb\xe5\x90\x88\xe7\x9a\x84\xe8\x8a\x82\xe8\x80\x8c\xe4\xb8\x8d\xe7\xae\xa1\xe5\x85\xb6\xe5\x9c\xa8\xe6\x96\x87\xe6\xa1\xa3\xe7\x9a\x84\xe4\xbd\x95\xe5\xa4\x84\n. \xe9\x80\x89\xe6\x8b\xa9\xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\n.. \xe5\xbd\x93\xe5\x89\x8d\xe8\x8a\x82\xe7\x9a\x84\xe7\x88\xb6\xe8\x8a\x82\n@ \xe9\x80\x89\xe6\x8b\xa9\xe5\xb1\x9e\xe6\x80\xa7', max_length=200, verbose_name='\u67e5\u8be2\u89c4\u5219')),
85 | ('kind', models.IntegerField(choices=[(1, '\u4e00\u7ea7\u53d8\u91cf'), (2, '\u4e8c\u7ea7\u53d8\u91cf'), (3, '\u4e8c\u7ea7\u94fe\u63a5')], default=1, verbose_name='\u7c7b\u578b')),
86 | ],
87 | options={
88 | 'verbose_name': '\u53d8\u91cf\u5217\u8868',
89 | 'verbose_name_plural': '\u53d8\u91cf\u5217\u8868\u7ba1\u7406',
90 | },
91 | ),
92 | migrations.AddField(
93 | model_name='scrapysetting',
94 | name='start_urls',
95 | field=models.ManyToManyField(to='datadeal.startUrls', verbose_name='\u4e00\u7ea7\u722c\u53d6\u5730\u5740\u5217\u8868'),
96 | ),
97 | migrations.AddField(
98 | model_name='cycleobj',
99 | name='variable',
100 | field=models.ManyToManyField(to='datadeal.Variable', verbose_name='\u53d8\u91cf'),
101 | ),
102 | ]
103 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0002_variable_important.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-05 11:23
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0001_initial'),
12 | ]
13 |
14 | operations = [
15 | migrations.AddField(
16 | model_name='variable',
17 | name='important',
18 | field=models.BooleanField(default=False, help_text=b'\xe8\x8b\xa5\xe8\xaf\xa5\xe5\x8f\x98\xe9\x87\x8f\xe6\x89\xbe\xe4\xb8\x8d\xe5\x88\xb0\xe6\x95\xb0\xe6\x8d\xae\xef\xbc\x8c\xe5\x88\x99\xe4\xb8\x8d\xe8\xbf\x9b\xe8\xa1\x8c\xe4\xb8\x8b\xe4\xb8\x80\xe6\xad\xa5', verbose_name='\u5173\u952e\u53d8\u91cf'),
19 | ),
20 | ]
21 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0003_variable_all_text.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-05 13:50
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0002_variable_important'),
12 | ]
13 |
14 | operations = [
15 | migrations.AddField(
16 | model_name='variable',
17 | name='all_text',
18 | field=models.BooleanField(default=False, help_text=b'\xe5\xb0\x86\xe6\x8f\x90\xe5\x8f\x96\xe8\xaf\xa5\xe6\xa0\x87\xe7\xad\xbe\xe4\xb8\x8b\xe6\x96\x87\xe6\x9c\xac\xe5\x8f\x8a\xe6\x89\x80\xe6\x9c\x89\xe5\xad\x90\xe6\xa0\x87\xe7\xad\xbe\xe6\x96\x87\xe6\x9c\xac\xef\xbc\x8c', verbose_name='\u5168\u6587\u672c\u63d0\u53d6'),
19 | ),
20 | ]
21 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0004_auto_20170608_1126.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-08 11:26
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0003_variable_all_text'),
12 | ]
13 |
14 | operations = [
15 | migrations.AddField(
16 | model_name='scrapysetting',
17 | name='kind',
18 | field=models.IntegerField(choices=[(1, '\u5217\u8868\u53ca\u8be6\u60c5'), (2, '\u5217\u8868'), (3, '\u5355\u9875\u9762'), (4, '\u5176\u4ed6')], default=1, verbose_name='\u7c7b\u578b'),
19 | ),
20 | migrations.AddField(
21 | model_name='scrapysetting',
22 | name='variable',
23 | field=models.ManyToManyField(blank=True, to='datadeal.Variable', verbose_name='\u975e\u5faa\u73af\u53d8\u91cf'),
24 | ),
25 | migrations.AlterField(
26 | model_name='scrapysetting',
27 | name='cycleobj',
28 | field=models.ManyToManyField(blank=True, to='datadeal.CycleObj', verbose_name='\u5faa\u73af\u4f53'),
29 | ),
30 | migrations.AlterField(
31 | model_name='variable',
32 | name='all_text',
33 | field=models.BooleanField(default=False, help_text=b'\xe5\xb0\x86\xe6\x8f\x90\xe5\x8f\x96\xe8\xaf\xa5\xe6\xa0\x87\xe7\xad\xbe\xe4\xb8\x8b\xe6\x96\x87\xe6\x9c\xac\xe5\x8f\x8a\xe6\x89\x80\xe6\x9c\x89\xe5\xad\x90\xe6\xa0\x87\xe7\xad\xbe\xe6\x96\x87\xe6\x9c\xac,\xe5\xbc\x80\xe5\x90\xaf\xe5\x90\x8e\xe4\xb8\x8d\xe8\xa6\x81\xe5\x86\x99/text()', verbose_name='\u5b50\u6807\u7b7e\u6587\u672c\u63d0\u53d6'),
34 | ),
35 | migrations.AlterField(
36 | model_name='variable',
37 | name='important',
38 | field=models.BooleanField(default=False, help_text=b'\xe8\x8b\xa5\xe8\xaf\xa5\xe5\x8f\x98\xe9\x87\x8f\xe6\x89\xbe\xe4\xb8\x8d\xe5\x88\xb0\xe6\x95\xb0\xe6\x8d\xae\xef\xbc\x8c\xe5\x88\x99\xe4\xb8\x8d\xe4\xbc\x9a\xe5\x8a\xa0\xe5\x85\xa5\xe5\x88\xb0\xe8\xbf\x94\xe5\x9b\x9e\xe6\x95\xb0\xe6\x8d\xae\xe4\xb8\xad', verbose_name='\u5173\u952e\u53d8\u91cf'),
39 | ),
40 | ]
41 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0005_remove_variable_important.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-08 11:49
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0004_auto_20170608_1126'),
12 | ]
13 |
14 | operations = [
15 | migrations.RemoveField(
16 | model_name='variable',
17 | name='important',
18 | ),
19 | ]
20 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0006_auto_20170703_1334.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-07-03 13:34
3 | from __future__ import unicode_literals
4 | from django.contrib.postgres.operations import HStoreExtension
5 | import django.contrib.postgres.fields.hstore
6 | from django.db import migrations, models
7 |
8 |
9 | class Migration(migrations.Migration):
10 |
11 | dependencies = [
12 | ('datadeal', '0005_remove_variable_important'),
13 | ]
14 |
15 | operations = [
16 | HStoreExtension(),
17 | migrations.CreateModel(
18 | name='SpiderData',
19 | fields=[
20 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
21 | ('scrapyname', models.CharField(max_length=50, verbose_name='\u540d\u79f0')),
22 | ('uid', models.CharField(max_length=50, verbose_name='uid')),
23 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
24 | ('data', django.contrib.postgres.fields.hstore.HStoreField()),
25 | ],
26 | options={
27 | 'verbose_name': '\u6570\u636e\u4fe1\u606f',
28 | 'verbose_name_plural': '\u6570\u636e\u4fe1\u606f\u7ba1\u7406',
29 | },
30 | ),
31 | migrations.AlterField(
32 | model_name='scrapylist',
33 | name='priority',
34 | field=models.IntegerField(default=10, help_text=b'\xe5\x80\xbc\xe8\xb6\x8a\xe5\xb0\x8f\xe8\xb6\x8a\xe4\xbc\x98\xe5\x85\x88', verbose_name='\u9879\u76ee\u4f18\u5148\u7ea7'),
35 | ),
36 | ]
37 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0007_auto_20170705_0835.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-07-05 08:35
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0006_auto_20170703_1334'),
12 | ]
13 |
14 | operations = [
15 | migrations.CreateModel(
16 | name='ErrorData',
17 | fields=[
18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
19 | ('scrapyname', models.CharField(max_length=50, verbose_name='\u9879\u76ee\u6765\u6e90')),
20 | ('uid', models.CharField(max_length=50, verbose_name='\u8bbe\u5907\u6765\u6e90')),
21 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
22 | ('url', models.CharField(max_length=300, verbose_name='\u8bbf\u95ee\u5730\u5740')),
23 | ('content', models.CharField(max_length=300, verbose_name='\u9519\u8bef\u4fe1\u606f')),
24 | ],
25 | options={
26 | 'verbose_name': '\u9519\u8bef\u4fe1\u606f',
27 | 'verbose_name_plural': '\u9519\u8bef\u4fe1\u606f\u7ba1\u7406',
28 | },
29 | ),
30 | migrations.AlterField(
31 | model_name='spiderdata',
32 | name='scrapyname',
33 | field=models.CharField(max_length=50, verbose_name='\u9879\u76ee\u6765\u6e90'),
34 | ),
35 | migrations.AlterField(
36 | model_name='spiderdata',
37 | name='uid',
38 | field=models.CharField(max_length=50, verbose_name='\u8bbe\u5907\u6765\u6e90'),
39 | ),
40 | ]
41 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0008_auto_20170720_1032.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-07-20 10:32
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0007_auto_20170705_0835'),
12 | ]
13 |
14 | operations = [
15 | migrations.AddField(
16 | model_name='spiderdata',
17 | name='file',
18 | field=models.CharField(blank=True, max_length=100, null=True, verbose_name='\u539f\u9875\u9762'),
19 | ),
20 | migrations.AddField(
21 | model_name='spiderdata',
22 | name='url',
23 | field=models.CharField(blank=True, max_length=300, null=True, verbose_name='\u8bbf\u95ee\u5730\u5740'),
24 | ),
25 | ]
26 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/0009_auto_20170815_0855.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-08-15 08:55
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 |
7 |
8 | class Migration(migrations.Migration):
9 |
10 | dependencies = [
11 | ('datadeal', '0008_auto_20170720_1032'),
12 | ]
13 |
14 | operations = [
15 | migrations.CreateModel(
16 | name='DataAlarm',
17 | fields=[
18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
19 | ('scrapyname', models.CharField(max_length=50, verbose_name='\u722c\u866b\u540d')),
20 | ('is_alarm', models.BooleanField(default=True, verbose_name='\u662f\u5426\u9884\u8b66')),
21 | ('remark', models.TextField(blank=True, null=True, verbose_name='\u539f\u56e0\u5907\u6ce8')),
22 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
23 | ],
24 | options={
25 | 'verbose_name': '\u722c\u866b\u9884\u8b66',
26 | 'verbose_name_plural': '\u722c\u866b\u9884\u8b66\u7ba1\u7406',
27 | },
28 | ),
29 | migrations.AddField(
30 | model_name='scrapylist',
31 | name='alarm_day',
32 | field=models.IntegerField(default=30, help_text=b'\xe8\xb6\x85\xe8\xbf\x87\xe6\x97\xb6\xe9\x97\xb4\xe6\x97\xa0\xe6\x95\xb0\xe6\x8d\xae\xe5\x88\x99\xe7\x94\x9f\xe6\x88\x90\xe9\xa2\x84\xe8\xad\xa6', verbose_name='\u9884\u8b66\u5929\u6570'),
33 | ),
34 | migrations.AddField(
35 | model_name='scrapylist',
36 | name='is_open',
37 | field=models.BooleanField(default=True, verbose_name='\u662f\u5426\u542f\u7528'),
38 | ),
39 | migrations.AlterField(
40 | model_name='errordata',
41 | name='scrapyname',
42 | field=models.CharField(max_length=50, verbose_name='\u722c\u866b\u540d'),
43 | ),
44 | ]
45 |
--------------------------------------------------------------------------------
/datadeal/datadeal/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/migrations/__init__.py
--------------------------------------------------------------------------------
/datadeal/datadeal/models.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from django.db import models
3 | from datadeal.settings import BASE_DIR
4 | from django.contrib.postgres.fields import HStoreField
5 |
6 | class scrapySetting(models.Model):
7 | KIND_CHOICES = ((1, u'列表及详情'),(2, u'列表'),(3,u'单页面'),(4,u'其他'))
8 | name = models.CharField(u'名称',max_length=20,help_text='不要输入中文和特殊符号')
9 | allow_domains = models.ManyToManyField('AllowDomains',verbose_name=u'域名白名单')
10 | start_urls = models.ManyToManyField('startUrls',verbose_name=u'一级爬取地址列表')
11 | cycleobj = models.ManyToManyField('CycleObj',verbose_name=u'循环体',blank=True)
12 | variable = models.ManyToManyField('Variable',verbose_name=u'非循环变量',blank=True)
13 | num = models.IntegerField(u'单个任务链接数',default=1)
14 | kind = models.IntegerField(u'类型', choices=KIND_CHOICES,default=1)
15 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
16 | modify_at = models.DateTimeField(u'修改时间', auto_now=True)
17 |
18 | def __unicode__(self):
19 | return self.name
20 |
21 | class Meta:
22 | verbose_name = u'爬虫配置'
23 | verbose_name_plural = u'爬虫配置管理'
24 |
25 | class AllowDomains(models.Model):
26 | name = models.CharField(u'名称',max_length=500)
27 |
28 | def __unicode__(self):
29 | return self.name
30 |
31 | class Meta:
32 | verbose_name = u'域名白名单'
33 | verbose_name_plural = u'域名白名单管理'
34 |
35 | class startUrls(models.Model):
36 | url = models.URLField(u'名称',max_length=500)
37 |
38 | def __unicode__(self):
39 | return self.url
40 |
41 | class Meta:
42 | verbose_name = u'一级爬取地址列表'
43 | verbose_name_plural = u'一级爬取地址列表管理'
44 |
45 | class CycleObj(models.Model):
46 | name = models.CharField(u'循环体名称',max_length=50,help_text='不要输入中文和特殊符号')
47 | xpath = models.CharField(u'查询规则',max_length=200,help_text='使用xpath规则:\nnodename 选择所有目前节的子节\n/ 从根节进行选择\n// 选择文档中相吻合的节而不管其在文档的何处\n. 选择当前节\n.. 当前节的父节\n@ 选择属性')
48 | variable = models.ManyToManyField('Variable',verbose_name=u'变量')
49 |
50 | def __unicode__(self):
51 | return self.name
52 |
53 | class Meta:
54 | verbose_name = u'循环体列表'
55 | verbose_name_plural = u'循环体列表管理'
56 |
57 | class Variable(models.Model):
58 | KIND_CHOICES = ((1, u'一级变量'), (2, u'二级变量'),(3,u'二级链接'))
59 | name = models.CharField(u'变量名称',max_length=50,help_text='不要输入中文和特殊符号,建议用对应的循环体做前缀加以区分')
60 | xpath = models.CharField(u'查询规则',max_length=200,help_text='使用xpath规则:\nnodename 选择所有目前节的子节\n/ 从根节进行选择\n// 选择文档中相吻合的节而不管其在文档的何处\n. 选择当前节\n.. 当前节的父节\n@ 选择属性')
61 | kind = models.IntegerField(u'类型', choices=KIND_CHOICES,default=1)
62 | all_text = models.BooleanField(u'子标签文本提取',default=False,help_text='将提取该标签下文本及所有子标签文本,开启后不要写/text()')
63 |
64 | def __unicode__(self):
65 | return self.name
66 |
67 | class Meta:
68 | verbose_name = u'变量列表'
69 | verbose_name_plural = u'变量列表管理'
70 |
71 | class scrapyList(models.Model):
72 | name = models.CharField(u'名称',max_length=500)
73 | priority = models.IntegerField(u'项目优先级',default=10,help_text='值越小越优先')
74 | alarm_day = models.IntegerField(u'预警天数',default=30,help_text='超过时间无数据则生成预警')
75 | is_open = models.BooleanField(u'是否启用',default=True)
76 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
77 |
78 | def __unicode__(self):
79 | return self.name
80 |
81 | class Meta:
82 | verbose_name = u'爬虫列表'
83 | verbose_name_plural = u'爬虫列表管理'
84 |
85 | class SpiderData(models.Model):
86 | scrapyname = models.CharField(u'项目来源',max_length=50)
87 | uid = models.CharField(u'设备来源',max_length=50)
88 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
89 | data = HStoreField()
90 | url = models.CharField(u'访问地址',max_length=300,null=True,blank=True)
91 | file = models.CharField(u'原页面',max_length=100,null=True,blank=True)
92 |
93 | def __unicode__(self):
94 | return self.scrapyname
95 |
96 | def data_str(self):
97 | data_str = ''
98 | for key,val in self.data.items():
99 | if not val:
100 | val = ''
101 | data_str += key+'=>'+val+'///'
102 | return data_str
103 | data_str.short_description = u'数据信息'
104 |
105 | def page_pdf(self):
106 | if self.file:
107 | url = '/medias/web/'+self.file
108 | return '<a href="%s">%s</a>' % (url,self.file)
109 | else:
110 | return ''
111 | page_pdf.allow_tags = True
112 | page_pdf.short_description = u'页面pdf'
113 |
114 | class Meta:
115 | verbose_name = u'数据信息'
116 | verbose_name_plural = u'数据信息管理'
117 |
118 | class ErrorData(models.Model):
119 | scrapyname = models.CharField(u'爬虫名',max_length=50)
120 | uid = models.CharField(u'设备来源',max_length=50)
121 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
122 | url = models.CharField(u'访问地址',max_length=300)
123 | content = models.CharField(u'错误信息',max_length=300)
124 |
125 | def __unicode__(self):
126 | return self.scrapyname
127 |
128 | class Meta:
129 | verbose_name = u'错误信息'
130 | verbose_name_plural = u'错误信息管理'
131 |
132 |
133 | class DataAlarm(models.Model):
134 | scrapyname = models.CharField(u'爬虫名',max_length=50)
135 | is_alarm = models.BooleanField(u'是否预警',default=True)
136 | remark = models.TextField(u'原因备注',null=True,blank=True)
137 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
138 |
139 | def __unicode__(self):
140 | return self.scrapyname
141 |
142 | class Meta:
143 | verbose_name = u'爬虫预警'
144 | verbose_name_plural = u'爬虫预警管理'
--------------------------------------------------------------------------------
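SpiderData.data is a PostgreSQL HStoreField, so individual scraped fields can be filtered by key with Django's hstore lookups. A short usage sketch; the key name 'title' is only an example, since the actual keys depend on the Variable names configured for each spider:

from datadeal.models import SpiderData

# rows whose hstore payload contains a 'title' key at all
with_title = SpiderData.objects.filter(data__has_key='title')

# rows whose 'title' key equals an exact value
exact = SpiderData.objects.filter(data__title='some exact value')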
/datadeal/datadeal/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for datadeal project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.9.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.9/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.9/ref/settings/
11 | """
12 |
13 | import os
14 |
15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 |
18 | DB_USER = "postgres"
19 | DB_PASSWORD = 'bigdata123'
20 | DB_HOST = '10.20.1.50'
21 | DB_NAME = 'scrapydata'
22 | # Quick-start development settings - unsuitable for production
23 | # See https://docs.djangoproject.com/en/1.9/howto/deployment/checklist/
24 |
25 | # SECURITY WARNING: keep the secret key used in production secret!
26 | SECRET_KEY = '02kavqnmt!id6az-9*7i0q*r!ek-vx&f87u$=cqnky()ic!52('
27 |
28 | # SECURITY WARNING: don't run with debug turned on in production!
29 | DEBUG = True
30 |
31 | ALLOWED_HOSTS = []
32 |
33 |
34 | # Application definition
35 |
36 | INSTALLED_APPS = [
37 | 'django.contrib.admin',
38 | 'django.contrib.auth',
39 | 'django.contrib.contenttypes',
40 | 'django.contrib.sessions',
41 | 'django.contrib.messages',
42 | 'django.contrib.staticfiles',
43 | 'django.contrib.postgres',
44 | 'crispy_forms',
45 | 'xadmin',
46 | 'datadeal',
47 | 'distribute',
48 | 'company',
49 | ]
50 |
51 | MIDDLEWARE_CLASSES = [
52 | 'django.middleware.security.SecurityMiddleware',
53 | 'django.contrib.sessions.middleware.SessionMiddleware',
54 | 'django.middleware.common.CommonMiddleware',
55 | # 'django.middleware.csrf.CsrfViewMiddleware',
56 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
57 | 'django.contrib.auth.middleware.SessionAuthenticationMiddleware',
58 | 'django.contrib.messages.middleware.MessageMiddleware',
59 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
60 | ]
61 |
62 | ROOT_URLCONF = 'datadeal.urls'
63 |
64 | TEMPLATES = [
65 | {
66 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
67 | 'DIRS': [],
68 | 'APP_DIRS': True,
69 | 'OPTIONS': {
70 | 'context_processors': [
71 | 'django.template.context_processors.debug',
72 | 'django.template.context_processors.request',
73 | 'django.contrib.auth.context_processors.auth',
74 | 'django.contrib.messages.context_processors.messages',
75 | ],
76 | },
77 | },
78 | ]
79 |
80 | WSGI_APPLICATION = 'datadeal.wsgi.application'
81 |
82 |
83 | # Database
84 | # https://docs.djangoproject.com/en/1.9/ref/settings/#databases
85 |
86 | DATABASES = {
87 | 'default': {
88 | 'ENGINE': 'django.db.backends.postgresql_psycopg2',
89 | 'NAME': DB_NAME,
90 | 'USER': DB_USER,
91 | 'PASSWORD': DB_PASSWORD,
92 | 'HOST': DB_HOST,
93 | }
94 | }
95 |
96 | # Password validation
97 | # https://docs.djangoproject.com/en/1.9/ref/settings/#auth-password-validators
98 |
99 | AUTH_PASSWORD_VALIDATORS = [
100 | {
101 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
102 | },
103 | {
104 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
105 | },
106 | {
107 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
108 | },
109 | {
110 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
111 | },
112 | ]
113 |
114 |
115 | # Internationalization
116 | # https://docs.djangoproject.com/en/1.9/topics/i18n/
117 |
118 | LANGUAGE_CODE = 'zh-Hans'
119 |
120 | TIME_ZONE = 'Asia/Shanghai'
121 |
122 | USE_I18N = True
123 |
124 | USE_L10N = True
125 |
126 | USE_TZ = False
127 |
128 |
129 | # Static files (CSS, JavaScript, Images)
130 | # https://docs.djangoproject.com/en/1.9/howto/static-files/
131 |
132 | STATIC_URL = '/static/'
133 | MEDIA_URL = '/medias/'
134 | STATIC_ROOT = os.path.join(os.path.dirname(__file__), './static/').replace('\\', '/')
135 | MEDIA_ROOT = os.path.join(os.path.dirname(__file__), './medias/').replace('\\', '/')
--------------------------------------------------------------------------------
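The database credentials above are committed in plain text and point at a fixed host. A common alternative, shown here only as a sketch and not as something the project does, is to read them from environment variables with the current values as fallbacks:

import os

DB_USER = os.environ.get('DB_USER', 'postgres')
DB_PASSWORD = os.environ.get('DB_PASSWORD', '')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_NAME = os.environ.get('DB_NAME', 'scrapydata')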
/datadeal/datadeal/static/css/jquery.dataTables.min.css:
--------------------------------------------------------------------------------
1 | table.dataTable{width:100%;margin:0 auto;clear:both;border-collapse:separate;border-spacing:0}table.dataTable thead th,table.dataTable tfoot th{font-weight:bold}table.dataTable thead th,table.dataTable thead td{padding:10px 18px;border-bottom:1px solid #111}table.dataTable thead th:active,table.dataTable thead td:active{outline:none}table.dataTable tfoot th,table.dataTable tfoot td{padding:10px 18px 6px 18px;border-top:1px solid #111}table.dataTable thead .sorting,table.dataTable thead .sorting_asc,table.dataTable thead .sorting_desc,table.dataTable thead .sorting_asc_disabled,table.dataTable thead .sorting_desc_disabled{cursor:pointer;*cursor:hand}table.dataTable thead .sorting,table.dataTable thead .sorting_asc,table.dataTable thead .sorting_desc,table.dataTable thead .sorting_asc_disabled,table.dataTable thead .sorting_desc_disabled{background-repeat:no-repeat;background-position:center right}table.dataTable thead .sorting{background-image:url("../images/sort_both.png")}table.dataTable thead .sorting_asc{background-image:url("../images/sort_asc.png")}table.dataTable thead .sorting_desc{background-image:url("../images/sort_desc.png")}table.dataTable thead .sorting_asc_disabled{background-image:url("../images/sort_asc_disabled.png")}table.dataTable thead .sorting_desc_disabled{background-image:url("../images/sort_desc_disabled.png")}table.dataTable tbody tr{background-color:#ffffff}table.dataTable tbody tr.selected{background-color:#B0BED9}table.dataTable tbody th,table.dataTable tbody td{padding:8px 10px}table.dataTable.row-border tbody th,table.dataTable.row-border tbody td,table.dataTable.display tbody th,table.dataTable.display tbody td{border-top:1px solid #ddd}table.dataTable.row-border tbody tr:first-child th,table.dataTable.row-border tbody tr:first-child td,table.dataTable.display tbody tr:first-child th,table.dataTable.display tbody tr:first-child td{border-top:none}table.dataTable.cell-border tbody th,table.dataTable.cell-border tbody td{border-top:1px solid #ddd;border-right:1px solid #ddd}table.dataTable.cell-border tbody tr th:first-child,table.dataTable.cell-border tbody tr td:first-child{border-left:1px solid #ddd}table.dataTable.cell-border tbody tr:first-child th,table.dataTable.cell-border tbody tr:first-child td{border-top:none}table.dataTable.stripe tbody tr.odd,table.dataTable.display tbody tr.odd{background-color:#f9f9f9}table.dataTable.stripe tbody tr.odd.selected,table.dataTable.display tbody tr.odd.selected{background-color:#acbad4}table.dataTable.hover tbody tr:hover,table.dataTable.display tbody tr:hover{background-color:#f6f6f6}table.dataTable.hover tbody tr:hover.selected,table.dataTable.display tbody tr:hover.selected{background-color:#aab7d1}table.dataTable.order-column tbody tr>.sorting_1,table.dataTable.order-column tbody tr>.sorting_2,table.dataTable.order-column tbody tr>.sorting_3,table.dataTable.display tbody tr>.sorting_1,table.dataTable.display tbody tr>.sorting_2,table.dataTable.display tbody tr>.sorting_3{background-color:#fafafa}table.dataTable.order-column tbody tr.selected>.sorting_1,table.dataTable.order-column tbody tr.selected>.sorting_2,table.dataTable.order-column tbody tr.selected>.sorting_3,table.dataTable.display tbody tr.selected>.sorting_1,table.dataTable.display tbody tr.selected>.sorting_2,table.dataTable.display tbody tr.selected>.sorting_3{background-color:#acbad5}table.dataTable.display tbody tr.odd>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd>.sorting_1{background-color:#f1f1f1}table.dataTable.display tbody 
tr.odd>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd>.sorting_2{background-color:#f3f3f3}table.dataTable.display tbody tr.odd>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd>.sorting_3{background-color:whitesmoke}table.dataTable.display tbody tr.odd.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_1{background-color:#a6b4cd}table.dataTable.display tbody tr.odd.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_2{background-color:#a8b5cf}table.dataTable.display tbody tr.odd.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.odd.selected>.sorting_3{background-color:#a9b7d1}table.dataTable.display tbody tr.even>.sorting_1,table.dataTable.order-column.stripe tbody tr.even>.sorting_1{background-color:#fafafa}table.dataTable.display tbody tr.even>.sorting_2,table.dataTable.order-column.stripe tbody tr.even>.sorting_2{background-color:#fcfcfc}table.dataTable.display tbody tr.even>.sorting_3,table.dataTable.order-column.stripe tbody tr.even>.sorting_3{background-color:#fefefe}table.dataTable.display tbody tr.even.selected>.sorting_1,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_1{background-color:#acbad5}table.dataTable.display tbody tr.even.selected>.sorting_2,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_2{background-color:#aebcd6}table.dataTable.display tbody tr.even.selected>.sorting_3,table.dataTable.order-column.stripe tbody tr.even.selected>.sorting_3{background-color:#afbdd8}table.dataTable.display tbody tr:hover>.sorting_1,table.dataTable.order-column.hover tbody tr:hover>.sorting_1{background-color:#eaeaea}table.dataTable.display tbody tr:hover>.sorting_2,table.dataTable.order-column.hover tbody tr:hover>.sorting_2{background-color:#ececec}table.dataTable.display tbody tr:hover>.sorting_3,table.dataTable.order-column.hover tbody tr:hover>.sorting_3{background-color:#efefef}table.dataTable.display tbody tr:hover.selected>.sorting_1,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_1{background-color:#a2aec7}table.dataTable.display tbody tr:hover.selected>.sorting_2,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_2{background-color:#a3b0c9}table.dataTable.display tbody tr:hover.selected>.sorting_3,table.dataTable.order-column.hover tbody tr:hover.selected>.sorting_3{background-color:#a5b2cb}table.dataTable.no-footer{border-bottom:1px solid #111}table.dataTable.nowrap th,table.dataTable.nowrap td{white-space:nowrap}table.dataTable.compact thead th,table.dataTable.compact thead td{padding:4px 17px 4px 4px}table.dataTable.compact tfoot th,table.dataTable.compact tfoot td{padding:4px}table.dataTable.compact tbody th,table.dataTable.compact tbody td{padding:4px}table.dataTable th.dt-left,table.dataTable td.dt-left{text-align:left}table.dataTable th.dt-center,table.dataTable td.dt-center,table.dataTable td.dataTables_empty{text-align:center}table.dataTable th.dt-right,table.dataTable td.dt-right{text-align:right}table.dataTable th.dt-justify,table.dataTable td.dt-justify{text-align:justify}table.dataTable th.dt-nowrap,table.dataTable td.dt-nowrap{white-space:nowrap}table.dataTable thead th.dt-head-left,table.dataTable thead td.dt-head-left,table.dataTable tfoot th.dt-head-left,table.dataTable tfoot td.dt-head-left{text-align:left}table.dataTable thead th.dt-head-center,table.dataTable thead td.dt-head-center,table.dataTable tfoot th.dt-head-center,table.dataTable tfoot 
td.dt-head-center{text-align:center}table.dataTable thead th.dt-head-right,table.dataTable thead td.dt-head-right,table.dataTable tfoot th.dt-head-right,table.dataTable tfoot td.dt-head-right{text-align:right}table.dataTable thead th.dt-head-justify,table.dataTable thead td.dt-head-justify,table.dataTable tfoot th.dt-head-justify,table.dataTable tfoot td.dt-head-justify{text-align:justify}table.dataTable thead th.dt-head-nowrap,table.dataTable thead td.dt-head-nowrap,table.dataTable tfoot th.dt-head-nowrap,table.dataTable tfoot td.dt-head-nowrap{white-space:nowrap}table.dataTable tbody th.dt-body-left,table.dataTable tbody td.dt-body-left{text-align:left}table.dataTable tbody th.dt-body-center,table.dataTable tbody td.dt-body-center{text-align:center}table.dataTable tbody th.dt-body-right,table.dataTable tbody td.dt-body-right{text-align:right}table.dataTable tbody th.dt-body-justify,table.dataTable tbody td.dt-body-justify{text-align:justify}table.dataTable tbody th.dt-body-nowrap,table.dataTable tbody td.dt-body-nowrap{white-space:nowrap}table.dataTable,table.dataTable th,table.dataTable td{-webkit-box-sizing:content-box;box-sizing:content-box}.dataTables_wrapper{position:relative;clear:both;*zoom:1;zoom:1}.dataTables_wrapper .dataTables_length{float:left}.dataTables_wrapper .dataTables_filter{float:right;text-align:right}.dataTables_wrapper .dataTables_filter input{margin-left:0.5em}.dataTables_wrapper .dataTables_info{clear:both;float:left;padding-top:0.755em}.dataTables_wrapper .dataTables_paginate{float:right;text-align:right;padding-top:0.25em}.dataTables_wrapper .dataTables_paginate .paginate_button{box-sizing:border-box;display:inline-block;min-width:1.5em;padding:0.5em 1em;margin-left:2px;text-align:center;text-decoration:none !important;cursor:pointer;*cursor:hand;color:#333 !important;border:1px solid transparent;border-radius:2px}.dataTables_wrapper .dataTables_paginate .paginate_button.current,.dataTables_wrapper .dataTables_paginate .paginate_button.current:hover{color:#333 !important;border:1px solid #979797;background-color:white;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #fff), color-stop(100%, #dcdcdc));background:-webkit-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-moz-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-ms-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:-o-linear-gradient(top, #fff 0%, #dcdcdc 100%);background:linear-gradient(to bottom, #fff 0%, #dcdcdc 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button.disabled,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:hover,.dataTables_wrapper .dataTables_paginate .paginate_button.disabled:active{cursor:default;color:#666 !important;border:1px solid transparent;background:transparent;box-shadow:none}.dataTables_wrapper .dataTables_paginate .paginate_button:hover{color:white !important;border:1px solid #111;background-color:#585858;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #585858), color-stop(100%, #111));background:-webkit-linear-gradient(top, #585858 0%, #111 100%);background:-moz-linear-gradient(top, #585858 0%, #111 100%);background:-ms-linear-gradient(top, #585858 0%, #111 100%);background:-o-linear-gradient(top, #585858 0%, #111 100%);background:linear-gradient(to bottom, #585858 0%, #111 100%)}.dataTables_wrapper .dataTables_paginate .paginate_button:active{outline:none;background-color:#2b2b2b;background:-webkit-gradient(linear, left top, left bottom, color-stop(0%, #2b2b2b), 
color-stop(100%, #0c0c0c));background:-webkit-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-moz-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-ms-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:-o-linear-gradient(top, #2b2b2b 0%, #0c0c0c 100%);background:linear-gradient(to bottom, #2b2b2b 0%, #0c0c0c 100%);box-shadow:inset 0 0 3px #111}.dataTables_wrapper .dataTables_paginate .ellipsis{padding:0 1em}.dataTables_wrapper .dataTables_processing{position:absolute;top:50%;left:50%;width:100%;height:40px;margin-left:-50%;margin-top:-25px;padding-top:20px;text-align:center;font-size:1.2em;background-color:white;background:-webkit-gradient(linear, left top, right top, color-stop(0%, rgba(255,255,255,0)), color-stop(25%, rgba(255,255,255,0.9)), color-stop(75%, rgba(255,255,255,0.9)), color-stop(100%, rgba(255,255,255,0)));background:-webkit-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-moz-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-ms-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:-o-linear-gradient(left, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%);background:linear-gradient(to right, rgba(255,255,255,0) 0%, rgba(255,255,255,0.9) 25%, rgba(255,255,255,0.9) 75%, rgba(255,255,255,0) 100%)}.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter,.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_processing,.dataTables_wrapper .dataTables_paginate{color:#333}.dataTables_wrapper .dataTables_scroll{clear:both}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody{*margin-top:-1px;-webkit-overflow-scrolling:touch}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>td{vertical-align:middle}.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>thead>tr>td>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>th>div.dataTables_sizing,.dataTables_wrapper .dataTables_scroll div.dataTables_scrollBody>table>tbody>tr>td>div.dataTables_sizing{height:0;overflow:hidden;margin:0 !important;padding:0 !important}.dataTables_wrapper.no-footer .dataTables_scrollBody{border-bottom:1px solid #111}.dataTables_wrapper.no-footer div.dataTables_scrollHead>table,.dataTables_wrapper.no-footer div.dataTables_scrollBody>table{border-bottom:none}.dataTables_wrapper:after{visibility:hidden;display:block;content:"";clear:both;height:0}@media screen and (max-width: 767px){.dataTables_wrapper .dataTables_info,.dataTables_wrapper .dataTables_paginate{float:none;text-align:center}.dataTables_wrapper .dataTables_paginate{margin-top:0.5em}}@media screen and (max-width: 640px){.dataTables_wrapper .dataTables_length,.dataTables_wrapper .dataTables_filter{float:none;text-align:center}.dataTables_wrapper .dataTables_filter{margin-top:0.5em}}
2 |
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/Sorting icons.psd:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/Sorting icons.psd
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/favicon.ico
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/sort_asc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_asc.png
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/sort_asc_disabled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_asc_disabled.png
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/sort_both.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_both.png
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/sort_desc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_desc.png
--------------------------------------------------------------------------------
/datadeal/datadeal/static/images/sort_desc_disabled.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/datadeal/static/images/sort_desc_disabled.png
--------------------------------------------------------------------------------
/datadeal/datadeal/static/js/iframe_common.js:
--------------------------------------------------------------------------------
1 | function get_parent(child,xpath){
2 | var tag = child.get(0).tagName.toLocaleLowerCase();
3 | var i = child.index();
4 | if(child.parent().children().length == 1){
5 | var tag_name = tag;
6 | }else{
7 | var tag_name = tag+'['+(i+1)+']';
8 | }
9 | if(child.parent().attr('id')){
10 | var id = child.parent().attr('id');
11 | var root_name = child.parent().get(0).tagName.toLocaleLowerCase();
12 | xpath = '//'+root_name+'[@id=\''+id+'\']/'+tag_name+xpath;
13 | return xpath;
14 | }else{
15 | child = child.parent();
16 | xpath = '/'+tag_name+xpath;
17 | if(tag_name=='html'){
18 | return xpath;
19 | }else{
20 | return get_parent(child,xpath);
21 | }
22 | }
23 | }
24 |
25 | function get_xpath(that){
26 | var xpath = '';
27 | xpath = get_parent(that,xpath);
28 | return xpath;
29 | }
--------------------------------------------------------------------------------
/datadeal/datadeal/static/js/iframe_detail.js:
--------------------------------------------------------------------------------
1 | var variable = [];
2 | var v_dict = {};
3 | $('*').click(function(){
4 | if($(this).children().length == 0){
5 | var xpath = get_xpath($(this));
6 | $(this).css('border','3px dotted #000');
7 | if($.inArray(xpath, variable) == -1){
8 | variable.push(xpath);
9 | v_dict[xpath] = $(this);
10 | parent.set_variable(variable,'detail_table');
11 | }
12 | }
13 | })
14 |
15 | function childrenup(xpath) {
16 | change_xpath = xpath.replace(/danyin/g,'\'');
17 | var that = v_dict[change_xpath];
18 | var new_xpath = get_xpath(that.parent());
19 | if($.inArray(xpath, variable) != -1){
20 | variable.splice($.inArray(xpath,variable),1);
21 | that.css('border','');
22 | that.parent().css('border','3px dotted #000');
23 | if($.inArray(new_xpath, variable) == -1){
24 | variable.push(new_xpath);
25 | v_dict[new_xpath] = that.parent();
26 | parent.set_variable(variable,'detail_table');
27 | }
28 | }
29 |
30 | }
31 |
32 | function del_v(xpath){
33 | change_xpath = xpath.replace(/danyin/g,'\'');
34 | var that = v_dict[change_xpath];
35 | if($.inArray(xpath, variable) != -1){
36 | variable.splice($.inArray(xpath,variable),1);
37 | that.css('border','');
38 | }
39 | }
--------------------------------------------------------------------------------
/datadeal/datadeal/static/js/iframe_list.js:
--------------------------------------------------------------------------------
1 | function get_cycle(str1,str2){
2 | var cut = 0;
3 | for(i in str1){
4 | if(str1[i] == '['){
5 | cut = i;
6 | }
7 | if(str1[i] != str2[i]){
8 | break
9 | }
10 | }
11 | var cycle = str1.substr(0,cut);
12 | var variable = str2.substr(cut).split('/');
13 | variable.splice(0,1);
14 | variable = variable.join('/');
15 | return [cycle,variable]
16 | }
17 |
18 | var choice = [];
19 | var cycle = '';
20 | var v_list = [];
21 | $('*').click(function(){
22 | if($(this).children().length == 0){
23 | var xpath = get_xpath($(this));
24 | choice.push(xpath);
25 | if(choice.length > 1){
26 | var array = get_cycle(choice[0],choice[choice.length-1]);
27 | cycle = array[0];
28 | parent.set_cycle(cycle);
29 | var variable = array[1];
30 | if(variable){
31 | if($.inArray(variable, v_list) == -1){
32 | v_list.push(variable);
33 | parent.set_variable(v_list,'list_table');
34 | }
35 | }
36 | }
37 | $(this).css('border','3px dotted #000');
38 | }
39 | })
--------------------------------------------------------------------------------
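
For clarity, here is a minimal Python sketch (not part of the repository) of what get_cycle() in iframe_list.js above computes: from the XPaths of two clicked list items it keeps the shared prefix up to the last '[' before they diverge as the repeating "cycle" XPath, and the tail of the second XPath as the per-row variable XPath. The example XPaths are invented.

def get_cycle(xpath1, xpath2):
    # remember the last '[' seen before the two XPaths diverge
    cut = 0
    for i, ch in enumerate(xpath1):
        if ch == '[':
            cut = i
        if i >= len(xpath2) or ch != xpath2[i]:
            break
    cycle = xpath1[:cut]                                # shared prefix up to the varying index
    variable = '/'.join(xpath2[cut:].split('/')[1:])    # relative path inside one row
    return cycle, variable

# e.g. get_cycle("//ul[@id='list']/li[1]/a", "//ul[@id='list']/li[3]/a")
# returns ("//ul[@id='list']/li", "a")
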
/datadeal/datadeal/templates/detail_iframe.html:
--------------------------------------------------------------------------------
[template lines 1-300: HTML markup stripped in this dump. The file is a saved snapshot of a page from sdein.gov.cn titled "山东省环境保护厅--2017年山东省砖瓦行业环保专项行动排查情况" (Shandong Environmental Protection Department: results of the 2017 brick-and-tile industry environmental-protection campaign, dated 2017-03-29), written into templates/ by AjaxBackHtmlView in datadeal/views.py for use in the detail-selection iframe. Only the page's visible text (site navigation, the notice body and the footer) survived tag stripping and is condensed here; the template ends with the tag below.]
301 | {% load staticfiles %}
--------------------------------------------------------------------------------
/datadeal/datadeal/templates/files_admin.html:
--------------------------------------------------------------------------------
1 | {% extends 'xadmin/base_site.html' %}
2 | {% load i18n l10n %}
3 | {% load xadmin_tags %}
4 | {% load static %}
5 | {% load staticfiles %}
6 | {% block title %}
7 | 文件管理
8 | {% endblock %}
9 | {% block content-nav %}{% endblock %}
10 |
11 | {% block breadcrumbs %}
12 |
16 | {% endblock %}
17 | {% block content %}
[template lines 18-117: HTML markup stripped in this dump. The content block renders a folder selector ("选择文件夹" / choose folder), a file table with columns 文件名 / 创建时间 / 操作 (file name / created at / actions), and a delete-confirmation modal ("确认要删除该文件吗?" / "Delete this file?"); the rest of the block (template lines 64-117) was dropped with the markup.]
118 | {% endblock %}
--------------------------------------------------------------------------------
/datadeal/datadeal/templates/images_admin.html:
--------------------------------------------------------------------------------
1 | {% extends 'xadmin/base_site.html' %}
2 | {% load i18n l10n %}
3 | {% load xadmin_tags %}
4 | {% load static %}
5 | {% block title %}
6 | 图片管理
7 | {% endblock %}
8 | {% block content-nav %}{% endblock %}
9 |
10 | {% block breadcrumbs %}
11 |
15 | {% endblock %}
16 | {% block content %}
[template lines 17-68: HTML markup stripped in this dump. The content block loops {% for item in img_list.object_list %} ... {% endfor %}, showing each image with 文件名:{{item.url}} (file name) and 创建时间:{{item.ctime}} (created at); the surrounding markup and inline script were dropped with the tags.]
69 | {% endblock %}
--------------------------------------------------------------------------------
/datadeal/datadeal/templates/index.html:
--------------------------------------------------------------------------------
1 | {% load staticfiles %}
[template lines 2-124: HTML markup stripped in this dump. The page (titled "Index") iterates scrapySetting.KIND_CHOICES, passed in by IndexView as `kind`, via {% for k,val in kind %} ... {{val}} ... {% endfor %}; the remaining markup and inline script were dropped with the tags.]
--------------------------------------------------------------------------------
/datadeal/datadeal/urls.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from django.conf.urls.static import static
3 | from django.conf.urls import patterns, include, url
4 | from django.views.static import serve
5 | from datadeal import settings
6 | from .views import *
7 | import xadmin
8 | xadmin.autodiscover()
9 |
10 | urlpatterns = [
11 | url(r'^admin/', include(xadmin.site.urls)),
12 | url(r'^$', IndexView.as_view() ,name='index'),
13 | url(r'^list/$', ListFrameView.as_view() ,name='list'),
14 | url(r'^detail/$', DetailFrameView.as_view() ,name='detail'),
15 | url(r'^back_html/$', AjaxBackHtmlView.as_view() ,name='back_html'),
16 | url(r'^upload_files/$', UploadFilesView.as_view() ,name='upload_files'),
17 | url(r'^zip_files/$', ZipFilesView.as_view() ,name='zip_files'),
18 | url(r'^del_file/$', DeleteFilesView.as_view() ,name='del_file'),
19 | url(r'^distribute/',include('distribute.urls')),
20 | url(r'^medias/(?P<path>.*)$', serve, {'document_root':settings.MEDIA_ROOT}),
21 | url(r'^static/(?P<path>.*)$', serve, {'document_root':settings.STATIC_ROOT}),
22 | ]
--------------------------------------------------------------------------------
/datadeal/datadeal/views.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.views.generic import TemplateView,View
3 | from django.http import HttpResponse,HttpResponseRedirect
4 | from datadeal.settings import BASE_DIR
5 | from .models import scrapySetting
6 | import urllib,urllib2
7 | import bs4
8 | import os
9 |
10 | def get_nocycle_variables(nocycle_variable):
11 | v_dict = ''
12 | result = ''
13 | nvname_list = []
14 | for nov in nocycle_variable:
15 | nvname_list.append(nov.name)
16 | v_dict += '\'%s\':%s,' % (nov.name,nov.name)
17 | if nov.all_text:
18 | all_text = '.xpath(\'string(.)\')'
19 | else:
20 | all_text = ''
21 | result += ' '*8+nov.name+' = response.xpath(\''+nov.xpath+'\')'+all_text+'.extract_first()\n'
22 | return v_dict,result,nvname_list
23 |
24 |
25 | def create_scrapy_file(q):
26 | """
27 | Generate a Scrapy spider module under searchInfo/searchInfo/spiders/ from the configured variables and cycle objects.
28 | """
29 | allow_domains = '['
30 | for i,val in enumerate(q.allow_domains.all()):
31 | if i == len(q.allow_domains.all())-1:
32 | allow_domains += '"'+val.name+'"'
33 | else:
34 | allow_domains += '"'+val.name+'"'+ ','
35 | allow_domains += ']'
36 | start_requests = ' def start_requests(self):\n'+' '*8+'results = getTasks(\''+q.name+'\')\n'+' '*8+'self.taks_urls = {}\n'+' '*8+'self.tasks = {}\n'+' '*8+'if isinstance(results,dict):\n'+' '*12+'print results[\'error\']\n'+' '*8+'else:\n'+' '*12+'for re in results:\n'+' '*16+'self.tasks[re[\'id\']] = {\'t_count\':len(re[\'urls\']),\'count\':0}\n'+' '*16+'for u in re[\'urls\']:\n'+' '*20+'self.taks_urls[u] = re[\'id\']\n'+' '*20+'yield self.make_requests_from_url(u)\n\n'
37 | after_parse = ' def after_parse(self,url):\n'+' '*8+'task_id = self.taks_urls[url]\n'+' '*8+'self.tasks[task_id][\'count\'] += 1\n'+' '*8+'if self.tasks[task_id][\'count\'] == self.tasks[task_id][\'t_count\']:\n'+' '*12+'afterTasks(task_id)\n\n'
38 |
39 | nocycle_variable = q.variable.filter(kind=1)
40 | v_dict,cycleobj,nvname_list = get_nocycle_variables(nocycle_variable)
41 | cycleobjs = q.cycleobj.all()
42 | if len(cycleobjs):
43 | next_url = '' # set when a follow-up (detail page) link variable is configured
44 | next_variable = []
45 | v_list = []
46 | total_v_list = []
47 | total_v_list += nvname_list
48 | v_list += nvname_list
49 | c_dict = ''
50 | for c in cycleobjs:
51 | variables = c.variable.all()
52 | variable = ''
53 | for v in variables:
54 | # all_text: also extract the text of child tags via string(.)
55 | if v.all_text:
56 | v_str = '%s = i.xpath(\'%s\').xpath(\'string(.)\').extract_first()\n' % (v.name,v.xpath)
57 | else:
58 | v_str = '%s = i.xpath(\'%s\').extract_first()\n' % (v.name,v.xpath)
59 | if v.kind == 1:
60 | variable += ' '*12+v_str
61 | v_dict += '\'%s\':%s,' % (v.name,v.name)
62 | c_dict += '\'%s\':%s,' % (v.name,v.name)
63 | v_list.append(v.name)
64 | total_v_list.append(v.name)
65 | elif v.kind == 2:
66 | next_variable.append({'name':v.name,'xpath':v.xpath,'all_text':v.all_text})
67 | if not v.name in total_v_list:
68 | total_v_list.append(v.name)
69 | elif v.kind == 3:
70 | next_url = v.name
71 | variable += ' '*12+v_str
72 | if next_url:
73 | cycleobj += ' '*8+c.name+' = response.xpath(\''+c.xpath+'\')\n'+' '*8+'for i in %s:\n%s' % (c.name,variable)
74 | cycleobj += ' '*12+next_url+' = set_url_head('+next_url+',response.url)\n'+' '*12+'if '+next_url+':\n'+' '*16+'yield scrapy.Request('+next_url+', callback=self.parse_item,meta={'+v_dict+'})\n'
75 | else:
76 | nvname_list.append(c.name+'_data')
77 | c_dict = c_dict[0:-1]
78 | cycleobj += ' '*8+c.name+' = response.xpath(\''+c.xpath+'\')\n'+' '*8+c.name+'_data = []\n'+' '*8+'for i in '+c.name+':\n'+variable+' '*12+c.name+'_data.append({'+c_dict+'})\n'
79 | cycleobj += ' '*8+'self.after_parse(response.url)\n'
80 | if next_url:
81 | cycleobj += '\n'+' '*4+'def parse_item(self, response):\n'
82 | for vl in v_list:
83 | cycleobj += ' '*8+vl+' = response.meta[\''+vl+'\']\n'
84 | for nv in next_variable:
85 | if nv['all_text']:
86 | cycleobj += ' '*8+'%s = response.xpath(\'%s\').xpath(\'string(.)\').extract_first()\n' % (nv['name'],nv['xpath'])
87 | else:
88 | cycleobj += ' '*8+'%s = response.xpath(\'%s\').extract_first()\n' % (nv['name'],nv['xpath'])
89 | data = ''
90 | for total in total_v_list:
91 | data += '\'%s\':%s,' % (total,total)
92 | data = data[0:-1]
93 | cycleobj += ' '*8+'sendData(\'%s\',{%s},response.url)' % (q.name,data)
94 | else:
95 | no_next = ''
96 | for n in nvname_list:
97 | no_next += '\'%s\':%s,' % (n,n)
98 | no_next = no_next[0:-1]
99 | cycleobj += ' '*8+'sendData(\'%s\',{%s},response.url)' % (q.name,no_next)
100 | else: # single-page spider (no cycle objects); cycleobj already holds the field-extraction lines
101 | v_dict = v_dict[0:-1]
102 | cycleobj = cycleobj+' '*8+'data = {'+v_dict+'}\n'+' '*8+'self.after_parse(response.url)\n'+' '*8+'sendData(\''+q.name+'\',data,response.url)'
103 |
104 | with open(BASE_DIR+'/../searchInfo/searchInfo/spiders/%s.py' % q.name,'w') as f:
105 | f.write('# -*- coding: utf-8 -*-\nimport scrapy\nfrom distribute.views import *\n\nclass '+q.name+'Spider(scrapy.Spider):\n name = "'+q.name+'"\n allowed_domains = '+allow_domains+'\n\n'+start_requests+after_parse+' def parse(self, response):\n'+cycleobj)
106 |
107 | class IndexView(TemplateView):
108 | template_name = 'index.html'
109 |
110 | def get_context_data(self, **kwargs):
111 | context = super(IndexView, self).get_context_data()
112 |
113 | context.update({
114 | 'kind':scrapySetting.KIND_CHOICES
115 | })
116 | return context
117 |
118 | class AjaxBackHtmlView(View):
119 |
120 | def post(self,request):
121 | url = request.POST.get('url','')
122 | frame = request.POST.get('frame','')
123 | proto, rest = urllib.splittype(url)
124 | domain, rest = urllib.splithost(rest)
125 | f = urllib2.urlopen(url)
126 | result = f.read()
127 | soup = bs4.BeautifulSoup(result,'lxml')
128 | body = soup.find('html')
129 | # rewrite relative stylesheet links to absolute URLs
130 | link = body.find_all('link')
131 | for l in link:
132 | if l['href'].startswith('/'):
133 | l['href'] = proto+'://'+domain+l['href']
134 | elif l['href'].startswith('../'):
135 | last = l['href'].split('../')[-1]
136 | l['href'] = proto+'://'+domain+'/'+last
137 | # rewrite relative script links to absolute URLs
138 | script = body.find_all('script')
139 | for s in script:
140 | if s.has_key('src'):
141 | if s['src'].startswith('/'):
142 | s['src'] = proto+'://'+domain+s['src']
143 | elif s['src'].startswith('../'):
144 | last = s['src'].split('../')[-1]
145 | s['src'] = proto+'://'+domain+'/'+last
146 | # rewrite relative image links to absolute URLs
147 | img = body.find_all('img')
148 | for g in img:
149 | if g.has_key('src'):
150 | if g['src'].startswith('/'):
151 | g['src'] = proto+'://'+domain+g['src']
152 | elif g['src'].startswith('../'):
153 | last = g['src'].split('../')[-1]
154 | g['src'] = proto+'://'+domain+'/'+last
155 | # disable anchor navigation, keeping the original href in href_bak
156 | a = body.find_all('a')
157 | for i in a:
158 | href = i['href']
159 | i['href'] = 'javascript:void(0);'
160 | i['href_bak'] = href
161 |
162 | if frame == 'list_iframe':
163 | result = 'list/'
164 | with open(BASE_DIR+'/datadeal/templates/'+frame+'.html','w') as f:
165 | f.write(str(body))
166 | f.write('\n{% load staticfiles %}')
167 | elif frame == 'detail_iframe':
168 | result = 'detail/'
169 | with open(BASE_DIR+'/datadeal/templates/'+frame+'.html','w') as f:
170 | f.write(str(body))
171 | f.write('\n{% load staticfiles %}')
172 | else:
173 | result = 'other'
174 | return HttpResponse(result)
175 |
176 | class ListFrameView(TemplateView):
177 | template_name = 'list_iframe.html'
178 |
179 | class DetailFrameView(TemplateView):
180 | template_name = 'detail_iframe.html'
181 |
182 |
183 | class UploadFilesView(View):
184 | def post(self,request):
185 | count = 0
186 | for name,file in request.FILES.items():
187 | dir_path = BASE_DIR+'/datadeal/medias/'+name.split('/')[0]
188 | if not os.path.exists(dir_path):
189 | os.makedirs(dir_path)
190 | path = BASE_DIR+'/datadeal/medias/'+name
191 | if not os.path.exists(path):
192 | with open(path, 'wb') as f:
193 | f.write(file.read())
194 | count += 1
195 | return HttpResponse(u'已上传%s项文件' % count)
196 |
197 | class ZipFilesView(View):
198 | def post(self,request):
199 | file_type = request.POST.get('type','')
200 | import zipfile
201 | zp_name = BASE_DIR+'/datadeal/medias/'+file_type+'.zip'
202 | file_list = []
203 | if os.path.exists(zp_name):
204 | z_r = zipfile.ZipFile(zp_name, mode='r')
205 | for filename in z_r.namelist():
206 | file_list.append(filename)
207 | z_r.close()
208 |
209 | file_dir = BASE_DIR+'/datadeal/medias/'+file_type
210 | files = os.listdir(file_dir)
211 | for f in files:
212 | if not f in file_list:
213 | zpfd = zipfile.ZipFile(zp_name, mode='a',compression=zipfile.ZIP_DEFLATED)
214 | zpfd.write(file_dir+'/'+f,f)
215 | zpfd.close()
216 | return HttpResponse('/medias/'+file_type+'.zip')
217 |
218 | class DeleteFilesView(View):
219 | def post(self,request):
220 | file_name = request.POST.get('file_name','')
221 | file_path = BASE_DIR+'/datadeal/medias/common/'+file_name
222 | status = False
223 | if os.path.exists(file_path):
224 | os.remove(file_path)
225 | status = True
226 | return HttpResponse(status)
--------------------------------------------------------------------------------
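
For reference, create_scrapy_file() in datadeal/views.py above stitches its string templates together and writes them to searchInfo/searchInfo/spiders/<name>.py. Reconstructed from those templates, a spider generated for a hypothetical project named example, with a single cycle object ("rows" containing one field "title") and no detail-page link, would come out roughly like this; the project name, domain and XPaths are invented:

# -*- coding: utf-8 -*-
import scrapy
from distribute.views import *

class exampleSpider(scrapy.Spider):
    name = "example"
    allowed_domains = ["example.gov.cn"]

    def start_requests(self):
        # ask the host for this node's task URLs before crawling
        results = getTasks('example')
        self.taks_urls = {}
        self.tasks = {}
        if isinstance(results, dict):
            print results['error']
        else:
            for re in results:
                self.tasks[re['id']] = {'t_count': len(re['urls']), 'count': 0}
                for u in re['urls']:
                    self.taks_urls[u] = re['id']
                    yield self.make_requests_from_url(u)

    def after_parse(self, url):
        # report the task as finished once all of its URLs have been parsed
        task_id = self.taks_urls[url]
        self.tasks[task_id]['count'] += 1
        if self.tasks[task_id]['count'] == self.tasks[task_id]['t_count']:
            afterTasks(task_id)

    def parse(self, response):
        rows = response.xpath('//table//tr')
        rows_data = []
        for i in rows:
            title = i.xpath('td[1]/a/text()').extract_first()
            rows_data.append({'title': title})
        self.after_parse(response.url)
        sendData('example', {'rows_data': rows_data}, response.url)
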
/datadeal/datadeal/wsgi.py:
--------------------------------------------------------------------------------
1 | """
2 | WSGI config for datadeal project.
3 |
4 | It exposes the WSGI callable as a module-level variable named ``application``.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.9/howto/deployment/wsgi/
8 | """
9 |
10 | import os
11 |
12 | from django.core.wsgi import get_wsgi_application
13 |
14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings")
15 |
16 | application = get_wsgi_application()
17 |
--------------------------------------------------------------------------------
/datadeal/distribute/__init__.py:
--------------------------------------------------------------------------------
1 | default_app_config = "distribute.apps.DistributeConfig"
--------------------------------------------------------------------------------
/datadeal/distribute/adminx.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import xadmin
3 | from .models import *
4 |
5 | class NodeAdmin(object):
6 | list_display = ['uid','status','ips','max_num']
7 | search_fields = ['uid']
8 | list_filter = ['status']
9 | list_editable = ['status']
10 | xadmin.site.register(Node, NodeAdmin)
11 |
12 | class NodeIpAdmin(object):
13 | list_display = ['ip','create_at']
14 | search_fields = ['ip']
15 | list_filter = ['create_at']
16 | xadmin.site.register(NodeIp, NodeIpAdmin)
17 |
18 | class NodeTaskAdmin(object):
19 | list_display = ['name','scrapy','priority','urls','status','create_at','get_at','over_at','node','nodeip']
20 | search_fields = ['name']
21 | list_filter = ['scrapy','node','status','get_at','over_at','create_at']
22 | xadmin.site.register(NodeTask, NodeTaskAdmin)
--------------------------------------------------------------------------------
/datadeal/distribute/apps.py:
--------------------------------------------------------------------------------
1 | #_*_ encoding:utf-8 _*_
2 | from django.apps import AppConfig
3 |
4 | class DistributeConfig(AppConfig):
5 | name = 'distribute'
6 | verbose_name = "节点管理"
--------------------------------------------------------------------------------
/datadeal/distribute/management/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/management/__init__.py
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/management/commands/__init__.py
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/get_spiders.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from distribute.views import get_mac_address,HOST
4 | from datadeal.settings import BASE_DIR
5 | import urllib,urllib2
6 | import json
7 |
8 | class Command(BaseCommand):
9 | help = '同步主机spider文件'
10 |
11 | def handle(self, *args, **options):
12 | mac = get_mac_address()
13 | posturl = HOST+'/distribute/get_spiders/'
14 | data = {'uid':mac}
15 | data = urllib.urlencode(data)
16 | f = urllib2.urlopen(posturl,data)
17 | result = json.loads(f.read())
18 |
19 | if result.has_key('error'):
20 | print result['error']
21 | else:
22 | for key,val in result.items():
23 | with open(BASE_DIR+'/../searchInfo/searchInfo/spiders/'+key,'w') as s_file:
24 | s_file.write(val)
25 | print u'同步完成'
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/mongo_test.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from datadeal.models import SpiderData
4 |
5 | class Command(BaseCommand):
6 | help = 'test'
7 |
8 | def handle(self, *args, **options):
9 | # print SpiderData.objects(__raw__={'data.处罚结果(种类) ': '罚款 '})
10 | data = SpiderData.objects.filter(id=21475)
11 | # url_list = []
12 | # with open('d://project/commonscrapy/selenium/url_list.txt','r') as file:
13 | # for line in file.readlines():
14 | # if line.replace('\n',''):
15 | # url_list.append(line.replace('\n',''))
16 |
17 | # for url in url_list:
18 | # data = SpiderData.objects.filter(url=url)
19 | # for d in data:
20 |
21 | # print d.url
22 | # d.delete()
23 | for d in data:
24 | print d.data
25 | # for key,val in d.data.items():
26 | # print key,val
27 |
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/register_node.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from distribute.views import get_mac_address,HOST
4 | import urllib,urllib2
5 | import json
6 |
7 | class Command(BaseCommand):
8 | help = '注册节点mac地址'
9 |
10 | def handle(self, *args, **options):
11 | mac = get_mac_address()
12 | posturl = HOST+'/distribute/create_node/'
13 | data = {'uid':mac}
14 | data = urllib.urlencode(data)
15 | f = urllib2.urlopen(posturl,data)
16 | result = f.read().decode('utf8')
17 | print result
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/start_spider.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from distribute.views import get_mac_address,HOST
4 | from datadeal.settings import BASE_DIR
5 | import urllib,urllib2
6 | import json
7 | import os
8 |
9 | class Command(BaseCommand):
10 | help = '开始爬取数据'
11 |
12 | def handle(self, *args, **options):
13 | posturl = HOST+'/distribute/get_spidername/'
14 | data = {}
15 | data = urllib.urlencode(data)
16 | f = urllib2.urlopen(posturl,data)
17 | result = f.read().decode('utf8')
18 | if result:
19 | os.system('cd %s/../searchInfo&&scrapy crawl %s' % (BASE_DIR,result))
20 | else:
21 | print u'暂时没有可执行任务'
--------------------------------------------------------------------------------
/datadeal/distribute/management/commands/upload_files.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.core.management.base import BaseCommand, CommandError
3 | from distribute.views import get_mac_address,HOST
4 | from datadeal.settings import BASE_DIR
5 | import requests
6 | import os
7 |
8 | class Command(BaseCommand):
9 | help = '上传本机medias下的下载文件至主机'
10 |
11 | def add_arguments(self, parser):
12 | parser.add_argument('args', metavar='spider_label', nargs='*',
13 | help='Specify the spider dir to upload.')
14 |
15 | def handle(self, *args, **options):
16 | dir_name = BASE_DIR+'/datadeal/medias'
17 | upload_files = {}
18 | files = os.listdir(dir_name)
19 | if len(args) > 0:
20 | dir_list = args
21 | else:
22 | dir_list = []
23 | for f in files:
24 | if not '.' in f:
25 | dir_list.append(f)
26 | for d in dir_list:
27 | d_files = os.listdir(dir_name+'/'+d)
28 | for df in d_files:
29 | upload_files[d+'/'+df]=open(dir_name+'/'+d+'/'+df,'rb')
30 | url = HOST+'/upload_files/'
31 | response = requests.post(url,files=upload_files)
32 | print response.content.decode('utf8')
--------------------------------------------------------------------------------
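
Taken together, the management commands above form the node-side workflow: register the node's MAC address with the host, pull the generated spider files, crawl whichever project the host hands out next, and push downloaded media back. A hedged sketch of driving that loop from a script with Django's call_command instead of invoking manage.py by hand; it assumes it runs on a node with the project importable:

import os
import django
from django.core.management import call_command

# point Django at the node's settings before loading any app code
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'datadeal.settings')
django.setup()

call_command('register_node')   # one-time: report this node's MAC to the host
call_command('get_spiders')     # sync generated spider files from the host
call_command('start_spider')    # ask the host for the next project and crawl it
call_command('upload_files')    # push files under datadeal/medias back to the host
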
/datadeal/distribute/migrations/0001_initial.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-05 09:36
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 | import django.db.models.deletion
7 |
8 |
9 | class Migration(migrations.Migration):
10 |
11 | initial = True
12 |
13 | dependencies = [
14 | ('datadeal', '0001_initial'),
15 | ]
16 |
17 | operations = [
18 | migrations.CreateModel(
19 | name='Node',
20 | fields=[
21 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
22 | ('uid', models.CharField(max_length=50, verbose_name='uid')),
23 | ('status', models.BooleanField(default=True, verbose_name='\u662f\u5426\u5f00\u542f')),
24 | ],
25 | options={
26 | 'verbose_name': '\u8282\u70b9\u7ba1\u7406',
27 | 'verbose_name_plural': '\u8282\u70b9\u7ba1\u7406',
28 | },
29 | ),
30 | migrations.CreateModel(
31 | name='NodeTask',
32 | fields=[
33 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
34 | ('name', models.CharField(max_length=50, verbose_name='\u4efb\u52a1\u540d')),
35 | ('priority', models.IntegerField(default=0, verbose_name='\u4efb\u52a1\u4f18\u5148\u7ea7')),
36 | ('status', models.IntegerField(choices=[(1, '\u5f85\u91c7\u96c6'), (2, '\u5df2\u5b8c\u6210')], default=1, verbose_name='\u4efb\u52a1\u72b6\u6001')),
37 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
38 | ('get_at', models.DateTimeField(blank=True, null=True, verbose_name='\u4efb\u52a1\u9886\u53d6\u65f6\u95f4')),
39 | ('over_at', models.DateTimeField(blank=True, null=True, verbose_name='\u4efb\u52a1\u5b8c\u6210\u65f6\u95f4')),
40 | ('node', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='distribute.Node', verbose_name='\u6267\u884c\u8282\u70b9')),
41 | ('scrapy', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='datadeal.scrapyList', verbose_name='\u9879\u76ee')),
42 | ('urls', models.ManyToManyField(to='datadeal.startUrls', verbose_name='\u722c\u53d6\u94fe\u63a5')),
43 | ],
44 | options={
45 | 'verbose_name': '\u4efb\u52a1\u7ba1\u7406',
46 | 'verbose_name_plural': '\u4efb\u52a1\u7ba1\u7406',
47 | },
48 | ),
49 | ]
50 |
--------------------------------------------------------------------------------
/datadeal/distribute/migrations/0002_auto_20170606_1335.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Generated by Django 1.9 on 2017-06-06 13:35
3 | from __future__ import unicode_literals
4 |
5 | from django.db import migrations, models
6 | import django.db.models.deletion
7 |
8 |
9 | class Migration(migrations.Migration):
10 |
11 | dependencies = [
12 | ('distribute', '0001_initial'),
13 | ]
14 |
15 | operations = [
16 | migrations.CreateModel(
17 | name='NodeIp',
18 | fields=[
19 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
20 | ('ip', models.CharField(max_length=100, verbose_name='ip\u5730\u5740')),
21 | ('create_at', models.DateTimeField(auto_now_add=True, verbose_name='\u521b\u5efa\u65f6\u95f4')),
22 | ],
23 | options={
24 | 'verbose_name': '\u8282\u70b9IP\u7ba1\u7406',
25 | 'verbose_name_plural': '\u8282\u70b9IP\u7ba1\u7406',
26 | },
27 | ),
28 | migrations.AddField(
29 | model_name='node',
30 | name='max_num',
31 | field=models.IntegerField(default=10, help_text='\u5355\u4f4d: \u6b21/\u5929(\u540c\u9879\u76ee\u540cip)', verbose_name='\u6700\u5927\u4efb\u52a1\u9891\u5ea6'),
32 | ),
33 | migrations.AddField(
34 | model_name='node',
35 | name='ips',
36 | field=models.ManyToManyField(to='distribute.NodeIp', verbose_name='\u5386\u53f2IP'),
37 | ),
38 | migrations.AddField(
39 | model_name='nodetask',
40 | name='nodeip',
41 | field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, to='distribute.NodeIp', verbose_name='\u6267\u884cIP'),
42 | ),
43 | ]
44 |
--------------------------------------------------------------------------------
/datadeal/distribute/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/datadeal/distribute/migrations/__init__.py
--------------------------------------------------------------------------------
/datadeal/distribute/models.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from __future__ import unicode_literals
3 |
4 | from django.db import models
5 |
6 | class Node(models.Model):
7 | uid = models.CharField(u'uid',max_length=50)
8 | status = models.BooleanField(u'是否开启',default=True)
9 | ips = models.ManyToManyField('NodeIp',verbose_name=u'历史IP',blank=True)
10 | max_num = models.IntegerField(u'最大任务频度',help_text='单位: 次/天(同项目同ip)',default=10)
11 |
12 | def __unicode__(self):
13 | return self.uid
14 |
15 | class Meta:
16 | verbose_name = u'节点管理'
17 | verbose_name_plural = u'节点管理'
18 |
19 | class NodeIp(models.Model):
20 | ip = models.CharField(u'ip地址',max_length=100)
21 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
22 |
23 | def __unicode__(self):
24 | return self.ip
25 |
26 | class Meta:
27 | verbose_name = u'节点IP管理'
28 | verbose_name_plural = u'节点IP管理'
29 |
30 | class NodeTask(models.Model):
31 | STATUS_CHOICES = ((1, u'待采集'),(2, u'已完成'))
32 | name = models.CharField(u'任务名',max_length=50)
33 | scrapy = models.ForeignKey('datadeal.scrapyList',verbose_name=u'项目')
34 | priority = models.IntegerField(u'任务优先级',default=10,help_text='值越小越优先')
35 | urls = models.ManyToManyField('datadeal.startUrls',verbose_name=u'爬取链接')
36 | status = models.IntegerField(u'任务状态', choices=STATUS_CHOICES,default=1)
37 | create_at = models.DateTimeField(u'创建时间', auto_now_add=True)
38 | get_at = models.DateTimeField(u'任务领取时间',null=True,blank=True)
39 | over_at = models.DateTimeField(u'任务完成时间',null=True,blank=True)
40 | node = models.ForeignKey('Node',verbose_name=u'执行节点',blank=True,null=True)
41 | nodeip = models.ForeignKey('NodeIp',verbose_name=u'执行IP',blank=True,null=True)
42 |
43 | def __unicode__(self):
44 | return self.name
45 |
46 | class Meta:
47 | verbose_name = u'任务管理'
48 | verbose_name_plural = u'任务管理'
--------------------------------------------------------------------------------
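
As a rough illustration of how the host seeds work for nodes, the sketch below creates a NodeTask tying a datadeal.scrapyList project to a batch of datadeal.startUrls rows (both models live in datadeal/models.py, not shown here). The lookup values are invented; the fields used (scrapyList.name, startUrls.url, and the NodeTask fields above) are the ones the surrounding code queries.

# Illustrative only: run on the host, e.g. inside `python manage.py shell`.
from distribute.models import NodeTask
from datadeal.models import scrapyList, startUrls

project = scrapyList.objects.get(name='example')                        # made-up project name
batch = startUrls.objects.filter(url__contains='example.gov.cn')[:10]   # made-up URL filter
task = NodeTask.objects.create(name='example batch', scrapy=project, priority=10)
task.urls.add(*batch)   # handleTasks will later hand these URLs to a node
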
/datadeal/distribute/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 |
3 | # Create your tests here.
4 |
--------------------------------------------------------------------------------
/datadeal/distribute/urls.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from django.conf.urls import patterns, include, url
3 | from .views import *
4 |
5 | urlpatterns = [
6 | url(r'^create_node/$', CreateNode.as_view() ,name='create_node'),
7 | url(r'^get_spiders/$', getSpiders.as_view() ,name='get_spiders'),
8 | url(r'^handle_tasks/$', handleTasks.as_view() ,name='handle_tasks'),
9 | url(r'^over_tasks/$', overTasks.as_view() ,name='over_tasks'),
10 | url(r'^save_data/$', SaveData.as_view() ,name='save_data'),
11 | url(r'^get_spidername/$', GetSpiderName.as_view() ,name='get_spidername'),
12 | ]
--------------------------------------------------------------------------------
/datadeal/distribute/views.py:
--------------------------------------------------------------------------------
1 | #!coding=utf-8
2 | from django.views.generic import TemplateView,View
3 | from django.http import HttpResponse,HttpResponseRedirect
4 | from datadeal.settings import BASE_DIR
5 | from .models import Node,NodeIp,NodeTask
6 | from datadeal.models import SpiderData,ErrorData
7 | import os
8 | import urllib,urllib2
9 | import json
10 | import uuid
11 | import datetime
12 | import pdfkit
13 | import hashlib
14 | import time
15 |
16 | # HOST = 'http://192.168.211.1:8000'
17 | HOST = 'http://10.20.1.52:8000'
18 | TASK_NUM = 1
19 |
20 | def get_mac_address():
21 | '''
22 | Return this machine's MAC address (used as the node uid).
23 | '''
24 | mac=uuid.UUID(int = uuid.getnode()).hex[-12:]
25 | return ":".join([mac[e:e+2] for e in range(0,11,2)])
26 |
27 | def set_url_head(url,r_url):
28 | '''
29 | Resolve a possibly relative URL against the referring page's URL.
30 | '''
31 | if url:
32 | if url.startswith('http://') or url.startswith('https://'):
33 | new_url = url
34 | else:
35 | if r_url.endswith('.html'):
36 | last = r_url.split('/')[-1]
37 | r_url = r_url.split(last)[0]
38 | if r_url.startswith('http://'):
39 | if url.startswith('/'):
40 | new_url = 'http://'+r_url.split('http://')[1].split('/')[0]+url
41 | else:
42 | new_url = r_url+url
43 | elif r_url.startswith('https://'):
44 | if url.startswith('/'):
45 | new_url = 'https://'+r_url.split('https://')[1].split('/')[0]+url
46 | else:
47 | new_url = r_url+url
48 | else:
49 | new_url = url
50 | else:
51 | new_url = ''
52 | return new_url
53 |
54 | class CreateNode(View):
55 | name = '注册mac地址(主机)'
56 |
57 | def post(self,request):
58 | uid = request.POST.get('uid','')
59 | already = Node.objects.filter(uid=uid).count()
60 | if not already and uid:
61 | Node.objects.create(uid=uid)
62 | msg = u'注册成功'
63 | else:
64 | msg = u'该节点已注册'
65 | return HttpResponse(msg)
66 |
67 | class getSpiders(View):
68 | name = '获取spider文件(主机)'
69 |
70 | def post(self,request):
71 | uid = request.POST.get('uid','')
72 | already = Node.objects.filter(uid=uid).count()
73 | if already and uid:
74 | dir_name = BASE_DIR+'/../searchInfo/searchInfo/spiders'
75 | files = os.listdir(dir_name)
76 | new_files = []
77 | for f in files:
78 | if not f.endswith('.pyc') and not f == '__init__.py':
79 | new_files.append(f)
80 | result = {}
81 | for i in new_files:
82 | f_name = dir_name+'/'+i
83 | with open(f_name,'r') as spider:
84 | text = spider.read()
85 | result[i] = text
86 | else:
87 | result = {'error':u'节点未注册'}
88 | return HttpResponse(json.dumps(result))
89 |
90 | def getTasks(name):
91 | '''
92 | Fetch up to TASK_NUM tasks for this node from the host (node side).
93 | '''
94 | mac = get_mac_address()
95 | posturl = HOST+'/distribute/handle_tasks/'
96 | data = {'uid':mac,'num':TASK_NUM,'name':name}
97 | data = urllib.urlencode(data)
98 | f = urllib2.urlopen(posturl,data)
99 | result = json.loads(f.read())
100 | return result
101 |
102 | class handleTasks(View):
103 | name = '分发任务(主机)'
104 |
105 | def post(self,request):
106 | uid = request.POST.get('uid','')
107 | num = int(request.POST.get('num',0))
108 | name = request.POST.get('name','')
109 | try:
110 | node = Node.objects.get(uid=uid,status=True)
111 | except:
112 | node = ''
113 | if node:
114 | if request.META.has_key('HTTP_X_FORWARDED_FOR'):
115 | ip = request.META['HTTP_X_FORWARDED_FOR']
116 | else:
117 | ip = request.META['REMOTE_ADDR']
118 | try:
119 | nip = NodeIp.objects.get(ip=ip)
120 | except:
121 | nip = NodeIp.objects.create(ip=ip)
122 | if not nip in node.ips.all():
123 | node.ips.add(nip)
124 | today = datetime.datetime.now().date()
125 | start = today.strftime('%Y-%m-%d 00:00')
126 | end = today.strftime('%Y-%m-%d 23:59')
127 | count = NodeTask.objects.filter(nodeip=nip,get_at__gte=start,get_at__lte=end,scrapy__name=name).count()
128 | if count < node.max_num:
129 | if count+num <= node.max_num:
130 | result = []
131 | tasks = NodeTask.objects.filter(scrapy__name=name,status=1,node__uid__isnull=True).order_by('priority')[0:num]
132 | for t in tasks:
133 | task = {'id':'','urls':[]}
134 | task['id'] = t.id
135 | for i in t.urls.all():
136 | task['urls'].append(i.url)
137 | result.append(task)
138 | t.get_at = datetime.datetime.now()
139 | t.node = node
140 | t.nodeip = nip
141 | t.save()
142 | else:
143 | msg = ip+' 单次获取任务个数超过频度限制,请减少单次获取任务个数'
144 | print(unicode(msg))
145 | result = {'error':msg}
146 | else:
147 | msg = ip+' 超过今日该项目领取任务限制'
148 | print(unicode(msg))
149 | result = {'error':msg}
150 | else:
151 | msg = uid+' 节点未注册或已关闭'
152 | print(unicode(msg))
153 | result = {'error':msg}
154 | return HttpResponse(json.dumps(result))
155 |
156 | def afterTasks(task_id):
157 | '''
158 | Report a finished task back to the host (node side).
159 | '''
160 | posturl = HOST+'/distribute/over_tasks/'
161 | nowtime = datetime.datetime.now()
162 | data = {'task_id':task_id,'nowtime':nowtime}
163 | data = urllib.urlencode(data)
164 | f = urllib2.urlopen(posturl,data)
165 | result = f.read()
166 | return result
167 |
168 | class overTasks(View):
169 | name = '任务结束(主机)'
170 |
171 | def post(self,request):
172 | task_id = request.POST.get('task_id','')
173 | nowtime = request.POST.get('nowtime','')
174 | try:
175 | task = NodeTask.objects.get(id=task_id)
176 | task.over_at = nowtime
177 | task.status = 2
178 | task.save()
179 | except Exception, e:
180 | print unicode(e)
181 | return HttpResponse('over')
182 |
183 | def sendData(name,data,url,error=False):
184 | '''
185 | Send scraped data (or an error report) to the host (node side).
186 | '''
187 | mac = get_mac_address()
188 | posturl = HOST+'/distribute/save_data/'
189 | data = {'uid':mac,'data':data,'name':name,'error':error,'url':url}
190 | data = urllib.urlencode(data)
191 | f = urllib2.urlopen(posturl,data)
192 | result = f.read()
193 | return result
194 |
195 | class SaveData(View):
196 | name = '保存数据(主机)'
197 |
198 | def post(self,request):
199 | uid = request.POST.get('uid','')
200 | name = request.POST.get('name','')
201 | data = request.POST.get('data','')
202 | if data:
203 | try:
204 | data = eval(data)
205 | except Exception, e:
206 | print unicode(e)
207 | print data
208 | else:
209 | data = {}
210 | url = request.POST.get('url','')
211 | error = request.POST.get('error','False')
212 |
213 | if error == 'True':
214 | ErrorData.objects.create(uid=uid,scrapyname=name,url=data['url'],content=data['error'])
215 | msg = 'error'
216 | else:
217 | # also store the source page URL and a PDF snapshot of it for each record
218 | m = hashlib.md5()
219 | # m.update(url+str(time.time()))
220 | m.update(url)
221 | pdfname = m.hexdigest()+'.pdf'
222 | file_dir = BASE_DIR+'/datadeal/medias/web/'
223 | if not os.path.exists(file_dir):
224 | os.mkdir(file_dir)
225 | if os.path.exists(file_dir+pdfname):
226 | pass
227 | else:
228 | try:
229 | options = {
230 | 'page-size': 'B3',
231 | }
232 | pdfkit.from_url(url,file_dir+pdfname,options=options)
233 | except:
234 | pass
235 |
236 | SpiderData.objects.create(uid=uid,scrapyname=name,data=data,url=url,file=pdfname)
237 | msg = 'ok'
238 | return HttpResponse(msg)
239 |
240 | class GetSpiderName(View):
241 | name = '获取优先可爬取项目名(主机)'
242 |
243 | def post(self,request):
244 | tasks = NodeTask.objects.filter(get_at__isnull=True).order_by('scrapy__priority')
245 | print tasks
246 | if len(tasks) == 0:
247 | result = ''
248 | else:
249 | result = tasks[0].scrapy.name
250 | return HttpResponse(result)
--------------------------------------------------------------------------------
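
A quick sanity check (not part of the repository) of how set_url_head() above resolves links before follow-up requests are yielded. The URLs are invented, and running it assumes a configured Django environment, since distribute.views imports the project's models and pdfkit:

from distribute.views import set_url_head

# absolute URLs pass through unchanged
assert set_url_head('http://other.com/x.html', 'http://a.com/list/index.html') == 'http://other.com/x.html'
# root-relative paths are joined to the scheme and host of the referring page
assert set_url_head('/news/1.html', 'http://a.com/list/index.html') == 'http://a.com/news/1.html'
# other relative paths are appended to the referring .html page's directory
assert set_url_head('detail.html', 'http://a.com/list/index.html') == 'http://a.com/list/detail.html'
# empty input stays empty
assert set_url_head('', 'http://a.com/list/index.html') == ''
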
/datadeal/ghostdriver.log:
--------------------------------------------------------------------------------
1 | [INFO - 2017-07-27T09:43:39.930Z] GhostDriver - Main - running on port 62298
2 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36","webSecurityEnabled":true}
3 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.customHeaders: - {}
4 | [INFO - 2017-07-27T09:43:39.996Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"},"phantomjs.page.settings.userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
5 | [INFO - 2017-07-27T09:43:39.996Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: 12004e70-72b0-11e7-aee6-f5ffd6c70928
6 | [ERROR - 2017-07-27T09:43:42.322Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property.
7 |
8 | phantomjs://platform/console++.js:263 in error
9 | [ERROR - 2017-07-27T09:43:42.322Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - stack:
10 | defineProperty (http://dn-growing.qbox.me/vds.js:2)
11 | registerHistoryHandler (http://dn-growing.qbox.me/vds.js:2)
12 | domLoadedHandler (http://dn-growing.qbox.me/vds.js:2)
13 | (anonymous function) (http://dn-growing.qbox.me/vds.js:2)
14 |
15 | phantomjs://platform/console++.js:263 in error
16 | [ERROR - 2017-07-27T09:43:44.348Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - msg: TypeError: Attempting to configurable attribute of unconfigurable property.
17 |
18 | phantomjs://platform/console++.js:263 in error
19 | [ERROR - 2017-07-27T09:43:44.348Z] Session [12004e70-72b0-11e7-aee6-f5ffd6c70928] - page.onError - stack:
20 | defineProperty (http://dn-growing.qbox.me/vds.js:2)
21 | registerHistoryHandler (http://dn-growing.qbox.me/vds.js:2)
22 | domLoadedHandler (http://dn-growing.qbox.me/vds.js:2)
23 | (anonymous function) (http://dn-growing.qbox.me/vds.js:2)
24 |
25 | phantomjs://platform/console++.js:263 in error
26 | [INFO - 2017-07-27T09:47:49.368Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW
27 | [INFO - 2017-07-27T09:51:09.506Z] SessionManagerReqHand - _cleanupWindowlessSessions - Asynchronous Sessions clean-up phase starting NOW
--------------------------------------------------------------------------------
/datadeal/manage.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import sys
4 |
5 | if __name__ == "__main__":
6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings")
7 |
8 | from django.core.management import execute_from_command_line
9 |
10 | execute_from_command_line(sys.argv)
11 |
--------------------------------------------------------------------------------
/searchInfo/ghostdriver.log:
--------------------------------------------------------------------------------
1 | [INFO - 2017-08-18T00:33:31.697Z] GhostDriver - Main - running on port 50755
2 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - page.settings - {"XSSAuditingEnabled":false,"javascriptCanCloseWindows":true,"javascriptCanOpenWindows":true,"javascriptEnabled":true,"loadImages":true,"localToRemoteUrlAccessEnabled":false,"userAgent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1","webSecurityEnabled":true}
3 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - page.customHeaders: - {}
4 | [INFO - 2017-08-18T00:33:31.971Z] Session [dcc570b0-83ac-11e7-97fe-0f519fac670c] - Session.negotiatedCapabilities - {"browserName":"phantomjs","version":"2.1.1","driverName":"ghostdriver","driverVersion":"1.2.0","platform":"windows-7-32bit","javascriptEnabled":true,"takesScreenshot":true,"handlesAlerts":false,"databaseEnabled":false,"locationContextEnabled":false,"applicationCacheEnabled":false,"browserConnectionEnabled":false,"cssSelectorsEnabled":true,"webStorageEnabled":false,"rotatable":false,"acceptSslCerts":false,"nativeEvents":true,"proxy":{"proxyType":"direct"}}
5 | [INFO - 2017-08-18T00:33:31.971Z] SessionManagerReqHand - _postNewSessionCommand - New Session Created: dcc570b0-83ac-11e7-97fe-0f519fac670c
6 |
--------------------------------------------------------------------------------
/searchInfo/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = searchInfo.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = searchInfo
12 |
--------------------------------------------------------------------------------
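The [settings] entry above is what lets Scrapy locate the spiders defined later in this directory. For reference, a minimal sketch of launching one of them programmatically from the searchInfo project root, equivalent to running `scrapy crawl beijing` there (the spider name is just an example taken from the spiders below):

    # Minimal sketch: run one spider from inside the searchInfo project.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())  # resolves searchInfo.settings via scrapy.cfg
    process.crawl('beijing')                          # any name declared by a spider's `name` attribute
    process.start()                                   # blocks until the crawl finishes
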
/searchInfo/searchInfo/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Sophosss/scrapy/e5e09cd01ca31930bcd37323091a857dcd9ca769/searchInfo/searchInfo/__init__.py
--------------------------------------------------------------------------------
/searchInfo/searchInfo/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class FileItem(scrapy.Item):
12 | file_urls = scrapy.Field()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 | from distribute.views import sendData
10 |
11 | class SearchinfoSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self,response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # if response.url.startswith('http://'):
28 | # url = response.url.split('http://')[1].split('/')[0]
29 | # elif response.url.startswith('https://'):
30 | # url = response.url.split('https://')[1].split('/')[0]
31 | # else:
32 | # url = response.url
33 | # if url in spider.allowed_domains:
34 | # return None
35 | # else:
36 | # print 'error_________: url not in allow_domains!'
37 | # raise
38 | pass
39 |
40 | def process_spider_output(self,response, result, spider):
41 | # Called with the results returned from the Spider, after
42 | # it has processed the response.
43 |
44 | # Must return an iterable of Request, dict or Item objects.
45 | for i in result:
46 | yield i
47 |
48 | def process_spider_exception(self,response, exception, spider):
49 | # Called when a spider or process_spider_input() method
50 | # (from other spider middleware) raises an exception.
51 |
52 | # Should return either None or an iterable of Response, dict
53 | # or Item objects.
54 | sendData(spider.name,{'error':unicode(exception),'url':response.url},response.url,True)
55 |
56 | def process_start_requests(self,start_requests, spider):
57 | # Called with the start requests of the spider, and works
58 | # similarly to the process_spider_output() method, except
59 | # that it doesn’t have a response associated.
60 |
61 | # Must return only requests (not items).
62 | for r in start_requests:
63 | yield r
64 |
65 | def spider_opened(self, spider):
66 | spider.logger.info('Spider opened: %s' % spider.name)
67 |
--------------------------------------------------------------------------------
/searchInfo/searchInfo/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from searchInfo import settings
3 | import requests
4 | import hashlib
5 | import os
6 |
7 | class FilesDownloadPipeline(object):
8 | def process_item(self, item, spider):
9 |
10 | if 'file_urls' in item:
11 | dir_path = '%s/%s' % (settings.FILES_STORE, spider.name)
12 | img_path = settings.IMAGES_STORE
13 | if not os.path.exists(dir_path):
14 | os.makedirs(dir_path)
15 | if not os.path.exists(img_path):
16 | os.makedirs(img_path)
17 | for file_url in item['file_urls']:
18 | file_name = file_url.split('/')[-1]
19 | back = file_name.split('.')[-1]
20 | m = hashlib.md5()
21 | m.update(file_name)
22 | file_name = m.hexdigest()
23 | if spider.name == 'shandong':
24 | file_name = file_name+'.png'
25 | file_path = '%s/%s' % (img_path, file_name)
26 | elif back == 'png' or back == 'jpg' or back == 'gif':
27 | file_name = file_name+'.'+back
28 | file_path = '%s/%s' % (img_path, file_name)
29 | else:
30 | file_name = spider.name+'_'+file_name+'.'+back
31 | file_path = '%s/%s' % (dir_path, file_name)
32 | if os.path.exists(file_path):
33 | continue
34 | with open(file_path, 'wb') as handle:
35 | response = requests.get(file_url, stream=True)
36 | for block in response.iter_content(1024):
37 | if not block:
38 | break
39 | handle.write(block)
40 | return item
--------------------------------------------------------------------------------
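FilesDownloadPipeline acts on any item that carries a file_urls list; the qingdao, risk and shandong spiders below feed it through an ItemLoader. A minimal sketch of that pattern (the URL list is a placeholder):

    # Minimal sketch: build a FileItem for FilesDownloadPipeline to download.
    from scrapy.loader import ItemLoader
    from searchInfo.items import FileItem

    def collect_files(response, urls):
        l = ItemLoader(item=FileItem(), response=response)
        for u in urls:
            l.add_value('file_urls', u)  # every URL is fetched into FILES_STORE / IMAGES_STORE
        return l.load_item()             # the returned item is what process_item() receives
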
/searchInfo/searchInfo/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for searchInfo project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'searchInfo'
13 |
14 | SPIDER_MODULES = ['searchInfo.spiders']
15 | NEWSPIDER_MODULE = 'searchInfo.spiders'
16 |
17 | FEED_EXPORT_ENCODING = 'utf-8'
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'searchInfo (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | # CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | DEFAULT_REQUEST_HEADERS = {
43 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | 'Accept-Language': 'en',
45 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
46 | }
47 |
48 | # Enable or disable spider middlewares
49 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
50 | SPIDER_MIDDLEWARES = {
51 | 'searchInfo.middlewares.SearchinfoSpiderMiddleware': 543,
52 | }
53 |
54 | # Enable or disable downloader middlewares
55 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
56 | #DOWNLOADER_MIDDLEWARES = {
57 | # 'searchInfo.middlewares.MyCustomDownloaderMiddleware': 543,
58 | #}
59 |
60 | # Enable or disable extensions
61 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
62 | #EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | #}
65 |
66 | # Configure item pipelines
67 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'searchInfo.pipelines.FilesDownloadPipeline': 300,
70 | # 'scrapy_redis.pipelines.RedisPipeline': 300
71 | }
72 | FILES_STORE = '../datadeal/datadeal/medias/'
73 | IMAGES_STORE = '../datadeal/datadeal/medias/images'
74 | # Enable and configure the AutoThrottle extension (disabled by default)
75 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
76 | #AUTOTHROTTLE_ENABLED = True
77 | # The initial download delay
78 | #AUTOTHROTTLE_START_DELAY = 5
79 | # The maximum download delay to be set in case of high latencies
80 | #AUTOTHROTTLE_MAX_DELAY = 60
81 | # The average number of requests Scrapy should be sending in parallel to
82 | # each remote server
83 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
84 | # Enable showing throttling stats for every response received:
85 | #AUTOTHROTTLE_DEBUG = False
86 |
87 | # Enable and configure HTTP caching (disabled by default)
88 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
89 | # HTTPCACHE_ENABLED = True
90 | # HTTPCACHE_EXPIRATION_SECS = 3600
91 | # HTTPCACHE_DIR = 'httpcache'
92 | # HTTPCACHE_IGNORE_HTTP_CODES = []
93 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
94 |
95 |
96 | # REDIS_HOST = '127.0.0.1'
97 | # REDIS_PORT = 6379
98 |
99 | import sys,os
100 | from django.core.wsgi import get_wsgi_application
101 | sys.path.append(os.path.join(os.path.split(os.path.dirname(__file__))[0],'../datadeal'))
102 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "datadeal.settings")
103 | application = get_wsgi_application()
--------------------------------------------------------------------------------
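The final lines of this settings module (the sys.path / DJANGO_SETTINGS_MODULE block) are the glue between the two halves of the repository: they put the datadeal Django project on the path and initialise it, so every spider process can use the Django ORM directly. A minimal sketch of the de-duplication pattern the spiders below rely on (it works only because Scrapy has already imported this settings module):

    # Usable inside any searchInfo spider: settings.py has already set
    # DJANGO_SETTINGS_MODULE and called get_wsgi_application().
    from datadeal.models import SpiderData

    def already_crawled(url):
        return SpiderData.objects.filter(url=url).count() > 0
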
/searchInfo/searchInfo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/beijing.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 |
8 | class BeiJingSpider(scrapy.Spider):
9 | name = "beijing"
10 | allowed_domains = ["www.bjda.gov.cn"]
11 | start_urls = ['http://www.bjda.gov.cn/eportal/ui?pageId=331216&currentPage=1&filter_LIKE_TITLE=&filter_LIKE_XKZH=']
12 |
13 | def parse(self, response):
14 | # for i in range(1,1472):
15 | for i in range(1,10):
16 | url = 'http://www.bjda.gov.cn/eportal/ui?pageId=331216&currentPage=%s&filter_LIKE_TITLE=&filter_LIKE_XKZH=' % i
17 | yield scrapy.Request(url, callback=self.parse_item)
18 |
19 |
20 | def parse_item(self, response):
21 | urls = response.xpath('//*[@id="form"]/div[2]/table//a')
22 | for url in urls:
23 | text = url.xpath('string(.)').extract_first()
24 | if text and text == '查看':
25 | url = url.xpath('@href').extract_first()
26 | url = 'http://www.bjda.gov.cn'+url
27 | already = SpiderData.objects.filter(url=url)
28 | if already.count() == 0:
29 | yield scrapy.Request(url, callback=self.parse_detail)
30 | else:
31 | pass
32 | # print 'already crawled'
33 |
34 | def parse_detail(self,response):
35 | trs = response.xpath('//*[@id="84f8b7f6cfc44b849b61b5c0ed21976a"]/div[2]/table//tr')
36 | data = {}
37 | for tr in trs:
38 | key = tr.xpath('th/text()').extract_first().replace(':','')
39 | val = tr.xpath('td/text()').extract_first()
40 | data[key] = val
41 | sendData('beijing',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/case.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 |
8 | '''Spider for Shandong province administrative punishment cases'''
9 | class CaseSpider(scrapy.Spider):
10 | name = "case"
11 | allowed_domains = ["sdlf.shandongbusiness.gov.cn"]
12 | start_urls = ['http://sdlf.shandongbusiness.gov.cn/newslist.shtml?method=listXzcf']
13 |
14 | def parse(self, response):
15 | for i in range(1,6):
16 | yield scrapy.FormRequest(
17 | url='http://sdlf.shandongbusiness.gov.cn/newslist.shtml',
18 | formdata={'pager.requestPage': str(i), 'method': 'listXzcf'},
19 | callback=self.after_post
20 | )
21 |
22 | def after_post(self, response):
23 | li = response.xpath('//ul[@class="rlistul"]/li')
24 | for l in li:
25 | date = l.xpath('span/text()').extract_first()
26 | title = l.xpath('a/text()').extract_first()
27 | url = 'http://sdlf.shandongbusiness.gov.cn'+l.xpath('a/@href').extract_first()
28 | yield scrapy.Request(url, callback=self.parse_item,meta={'date':date,'title':title})
29 |
30 | def parse_item(self, response):
31 | date = response.meta['date']
32 | title = response.meta['title']
33 | data = {}
34 | tables = response.xpath('//table[@class="rtab2"]')
35 | for table in tables:
36 | trs = table.xpath('tr')
37 | for tr in trs:
38 | key = tr.xpath('th/text()').extract_first().split(u':')[0]
39 | value = tr.xpath('td/text()').extract_first()
40 | data[key] = value
41 |
42 | already = SpiderData.objects.filter(url=response.url)
43 | if already.count() == 0:
44 | sendData('case',data,response.url)
45 | else:
46 | pass
47 | # print 'already crawl'
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/chengdu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 |
8 | class ChengDuSpider(scrapy.Spider):
9 | name = "chengdu"
10 | allowed_domains = ["www.shfda.gov.cn"]
11 | start_urls = ['http://www.cdepb.gov.cn/cdepbws/Web/Template/GovDefaultList.aspx?cid=843']
12 |
13 | def parse(self, response):
14 | # for i in range(1,37):
15 | for i in range(1,10):
16 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=%s&pagesize=20' % i
17 | yield scrapy.Request(url, callback=self.parse_item)
18 |
19 |
20 | def parse_item(self, response):
21 | urls = response.xpath('//*[@id="b1"]//a')
22 | for url in urls:
23 | text = url.xpath('string(.)').extract_first()
24 | if text and text == '详情':
25 | url = url.xpath('@href').extract_first()
26 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/'+url
27 | already = SpiderData.objects.filter(url=url)
28 | if already.count() == 0:
29 | yield scrapy.Request(url, callback=self.parse_detail)
30 | else:
31 | # print 'already crawled'
32 | pass
33 |
34 | def parse_detail(self,response):
35 | trs = response.xpath('//*[@id="main"]/div/div[2]/table//tr')
36 | data = {}
37 | for tr in trs:
38 | key = tr.xpath('td[1]/text()').extract_first()
39 | val = tr.xpath('td[2]/text()').extract_first()
40 | data[key] = val
41 | sendData('shanghai',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/gansu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 |
8 | class GanSuSpider(scrapy.Spider):
9 | name = "gansu"
10 | allowed_domains = ["www.gsfda.gov.cn"]
11 | start_urls = ['http://www.gsfda.gov.cn:2180/xzlaw/xzlawActionWZ!list.do?queryBean.pn=1&queryBean.pageSize=100']
12 |
13 | def parse(self, response):
14 | # for i in range(1,106):
15 | for i in range(1,5):
16 | url = 'http://www.gsfda.gov.cn:2180/xzlaw/xzlawActionWZ!list.do?queryBean.pn=%s&queryBean.pageSize=100' % i
17 | yield scrapy.Request(url, callback=self.parse_item)
18 |
19 |
20 | def parse_item(self, response):
21 | urls = response.xpath('//*[@id="list"]//a')
22 | for url in urls:
23 | text = url.xpath('string(.)').extract_first()
24 | if text and text == '[查看]':
25 | url = url.xpath('@href').extract_first()
26 | url = 'http://www.gsfda.gov.cn:2180/xzlaw/'+url
27 | already = SpiderData.objects.filter(url=url)
28 | if already.count() == 0:
29 | yield scrapy.Request(url, callback=self.parse_detail)
30 | else:
31 | # print 'already crawled'
32 | pass
33 |
34 | def parse_detail(self,response):
35 | trs = response.xpath('//*[@id="edit"]//tr')
36 | data = {}
37 | for i,tr in enumerate(trs):
38 | if i > 0:
39 | key = tr.xpath('th/text()').extract_first()
40 | val = tr.xpath('td/text()').extract_first()
41 | if key:
42 | key = key.replace(':','').replace(' ','')
43 | if not val:
44 | val = ''
45 | data[key] = val
46 | sendData('gansu',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/hainan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 | import json
8 |
9 | class HaiNanSpider(scrapy.Spider):
10 | name = "hainan"
11 | allowed_domains = ["aj.hifda.gov.cn"]
12 | start_urls = ['http://aj.hifda.gov.cn/web/index.jsp']
13 |
14 | def parse(self, response):
15 | # for i in range(0,47):
16 | for i in range(0,5):
17 | yield scrapy.FormRequest(
18 | url='http://aj.hifda.gov.cn/loseCredit/refreshList.json',
19 | formdata={
20 | "cityName":"",
21 | "initialVal":"",
22 | "ispublish":"1",
23 | "listPageSize":"100",
24 | "queryContent":"",
25 | "queryOrder":"0",
26 | "searchOrderType":"0",
27 | "selectIndex":"1",
28 | "skip":"%s" % str(i*100),
29 | },
30 | callback=self.parse_item
31 | )
32 |
33 | def parse_item(self, response):
34 | result = json.loads(response.body)
35 | for r in result['resultData']:
36 | url = 'http://aj.hifda.gov.cn/web/showContent.jsp?id='+r['id']
37 | data = {u'企业(商户)名称':r['companyname'],u'注册地址':r['companysite'],u'法定代表人姓名':r['companyman'],u'法定代表人身份证号':r['companymanid'],u'负责人姓名':r['responsible_man'],u'负责人身份证号':r['resp_man_id'],u'直接责任人':r['direct_person'],u'社会信用代码':r['idcode'],u'案件分类':r['toclassify'],u'案件名称':r['losecase'],u'行政处罚决定文书号':r['punish_writ_num'],u'主要违法事实':r['losedetail'],u'处罚依据和内容':r['punishway'],u'处罚机关':r['punishunit'],u'处罚时间':r['punishtime']}
38 | already = SpiderData.objects.filter(url=url)
39 | if already.count() == 0:
40 | sendData('hainan',data,url)
41 | else:
42 | pass
43 | # print 'already crawled'
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/qingdao.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import set_url_head
4 | from searchInfo.items import FileItem
5 | from scrapy.loader import ItemLoader
6 |
7 | class QingDaoSpider(scrapy.Spider):
8 | name = "qingdao"
9 | allowed_domains = ["sfda.qingdao.gov.cn"]
10 | start_urls = ['http://sfda.qingdao.gov.cn/n32205967/n32206400/index.html']
11 |
12 | def parse(self, response):
13 | data = []
14 | qingdao_div = response.xpath('//div[@id="listChangeDiv"]/ul/li')
15 | for i in qingdao_div:
16 | qingdao_url = i.xpath('a/@href').extract_first()
17 | qingdao_url = set_url_head(qingdao_url,response.url)
18 | if qingdao_url:
19 | yield scrapy.Request(qingdao_url, callback=self.parse_item)
20 |
21 | def parse_item(self, response):
22 | qingdao_detail = response.xpath('//div[@class="main_t"]//a')
23 | l = ItemLoader(item=FileItem(), response=response)
24 | for i in qingdao_detail:
25 | url = i.xpath('@href').extract_first()
26 | url = set_url_head(url,response.url)
27 | if url:
28 | l.add_value('file_urls',url)
29 | return l.load_item()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/risk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | # from distribute.views import sendData
6 | # from datadeal.models import SpiderData
7 | from searchInfo.items import FileItem
8 | from scrapy.loader import ItemLoader
9 |
10 | '''Spider for CFDA (national food and drug administration) risk-inspection documents'''
11 | class RiskSpider(scrapy.Spider):
12 | name = "risk"
13 | allowed_domains = ["www.sda.gov.cn"]
14 | start_urls = ['http://www.sda.gov.cn/WS01/CL1667/index.html']
15 |
16 | def parse(self, response):
17 | # Food
18 | url = 'http://www.sda.gov.cn/WS01/CL1667/index.html'
19 | yield scrapy.Request(url, callback=self.parse_item)
20 | # for i in range(1,222):
21 | # url = 'http://www.sda.gov.cn/WS01/CL1667/index_%s.html' % i
22 | # yield scrapy.Request(url, callback=self.parse_item)
23 |
24 | # Drugs
25 | url = 'http://www.sda.gov.cn/WS01/CL1429/'
26 | yield scrapy.Request(url, callback=self.parse_item)
27 | # for i in range(1,12):
28 | # url = 'http://www.sda.gov.cn/WS01/CL1429/index_%s.html' % i
29 | # yield scrapy.Request(url, callback=self.parse_item)
30 |
31 | # Cosmetics
32 | url = 'http://www.sda.gov.cn/WS01/CL1866/'
33 | yield scrapy.Request(url, callback=self.parse_item)
34 | # for i in range(1,3):
35 | # url = 'http://www.sda.gov.cn/WS01/CL1866/index_%s.html' % i
36 | # yield scrapy.Request(url, callback=self.parse_item)
37 |
38 | def parse_item(self, response):
39 | urls = response.xpath('/html/body/table[3]//tr/td[3]/table[2]//tr/td/table[1]//a')
40 | for url in urls:
41 | text = url.xpath('string(.)').extract_first()
42 | if '不合格' in text or '抽检' in text:
43 | url = url.xpath('@href').extract_first().replace('..','')
44 | url = 'http://www.sda.gov.cn/WS01'+url
45 | yield scrapy.Request(url, callback=self.parse_detail)
46 |
47 | def parse_detail(self,response):
48 | path = response.xpath('//a')
49 | l = ItemLoader(item=FileItem(), response=response)
50 | for i in path:
51 | url = i.xpath('@href').extract_first()
52 | if url:
53 | if url.endswith('.doc') or url.endswith('.xlsx') or url.endswith('.xls') or url.endswith('.docx') or url.endswith('.rar') or url.endswith('.pdf') or url.endswith('.zip'):
54 | url = 'http://www.sda.gov.cn'+url
55 | l.add_value('file_urls',url)
56 | return l.load_item()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/sdein.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import *
4 |
5 | class SdeinSpider(scrapy.Spider):
6 | name = "sdein"
7 | allowed_domains = ["www.sdein.gov.cn","zfc.sdein.gov.cn"]
8 |
9 | def start_requests(self):
10 | results = getTasks('sdein')
11 | self.taks_urls = {}
12 | self.tasks = {}
13 | if isinstance(results,dict):
14 | print results['error']
15 | else:
16 | for re in results:
17 | self.tasks[re['id']] = {'t_count':len(re['urls']),'count':0}
18 | for u in re['urls']:
19 | self.taks_urls[u] = re['id']
20 | yield self.make_requests_from_url(u)
21 |
22 | def after_parse(self,url):
23 | task_id = self.taks_urls[url]
24 | self.tasks[task_id]['count'] += 1
25 | if self.tasks[task_id]['count'] == self.tasks[task_id]['t_count']:
26 | afterTasks(task_id)
27 |
28 | def parse(self, response):
29 | sdein_table = response.xpath('//table[@width="763"]/tr[3]/td/table[2]/tr')
30 | for i in sdein_table:
31 | sdein_title = i.xpath('td[2]/a/text()').extract_first()
32 | sdein_date = i.xpath('td[3]/text()').extract_first()
33 | sdein_url = i.xpath('td[2]/a/@href').extract_first()
34 | sdein_url = set_url_head(sdein_url,response.url)
35 | if sdein_url:
36 | yield scrapy.Request(sdein_url, callback=self.parse_item,meta={'sdein_title':sdein_title,'sdein_date':sdein_date,})
37 | self.after_parse(response.url)
38 |
39 | def parse_item(self, response):
40 | sdein_title = response.meta['sdein_title']
41 | sdein_date = response.meta['sdein_date']
42 | sdein_content = response.xpath('//div[@class="TRS_Editor"]').xpath('string(.)').extract_first()
43 | sendData('sdein',{'sdein_title':sdein_title,'sdein_date':sdein_date,'sdein_content':sdein_content},response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/sdqts.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import *
4 |
5 | class SdqtsSpider(scrapy.Spider):
6 | name = "sdqts"
7 | allowed_domains = ["www.sdqts.gov.cn"]
8 | start_urls = ['http://www.sdqts.gov.cn/sdzj/380936/index.html']
9 |
10 | def parse(self, response):
11 | sdqts_table = response.xpath('//*[@id="2d758f3ea2c041e399b5d84609a300f5"]/div[2]/div[2]/div[2]/table[2]/tbody/tr')
12 | for i in sdqts_table:
13 | sdqts_title = i.xpath('td/table/tbody/tr/td[1]/a/text()').extract_first()
14 | sdqts_date = i.xpath('td/table/tbody/tr/td[4]/text()').extract_first()
15 | sdqts_url = i.xpath('td/table/tbody/tr/td[1]/a/@href').extract_first()
16 | sdqts_url = set_url_head(sdqts_url,response.url)
17 | if sdqts_url:
18 | yield scrapy.Request(sdqts_url, callback=self.parse_item,meta={'sdqts_title':sdqts_title,'sdqts_date':sdqts_date,})
19 |
20 | def parse_item(self, response):
21 | sdqts_title = response.meta['sdqts_title']
22 | sdqts_date = response.meta['sdqts_date']
23 | tr = response.xpath('//div[@class="gov_infoCatalog_detailsection"]//table//tr')
24 | data = {}
25 | if len(tr) == 2:
26 | td_title = tr[0].xpath('td')
27 | td_val = tr[1].xpath('td')
28 | for i in range(0,len(tr[1].xpath('td'))):
29 | data[td_title[i].xpath('string(.)').extract_first()] = td_val[i].xpath('string(.)').extract_first()
30 | if data:
31 | sendData('sdqts',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/sfda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib
4 | import urllib2
5 | import bs4
6 | from distribute.views import sendData
7 | from datadeal.models import SpiderData
8 |
9 | class SfdaSpider(scrapy.Spider):
10 | name = "sfda"
11 | allowed_domains = ["app1.sfda.gov.cn"]
12 | start_urls = ['http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=114&tableName=TABLE114&title=%E5%9B%BD%E5%AE%B6%E9%A3%9F%E5%93%81%E5%AE%89%E5%85%A8%E7%9B%91%E7%9D%A3%E6%8A%BD%E6%A3%80%EF%BC%88%E4%B8%8D%E5%90%88%E6%A0%BC%E4%BA%A7%E5%93%81%EF%BC%89&bcId=143106776907834761101199700381']
13 |
14 | def parse(self, response):
15 | # for i in range(1,238):
16 | for i in range(10,20):
17 | yield scrapy.FormRequest(
18 | url='http://app1.sfda.gov.cn/datasearch/face3/search.jsp',
19 | formdata={
20 | "State":"1",
21 | "bcId":"143106776907834761101199700381",
22 | "curstart":str(i),
23 | "tableId":"114",
24 | "tableName":"TABLE114",
25 | "viewsubTitleName":"COLUMN1486",
26 | "viewtitleName":"COLUMN1490"
27 | },
28 | callback=self.after_post
29 | )
30 |
31 | def after_post(self, response):
32 | for a in response.xpath('//a'):
33 | aid = a.xpath('@href').extract_first().split('&Id=')[1].split('\'')[0]
34 | get_url = "http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=114&tableName=TABLE114&Id="+aid
35 | yield scrapy.Request(get_url, callback=self.parse_item)
36 |
37 | def parse_item(self, response):
38 | trs = response.xpath('//table/tr')
39 | data = {}
40 | for tr in trs:
41 | key = tr.xpath('td[1]/text()').extract_first()
42 | val = tr.xpath('td[2]/text()').extract_first()
43 | if key or val:
44 | data[key] = val
45 | try:
46 | already = SpiderData.objects.filter(scrapyname='sfda',data__contains={u"被抽样单位名称":data[u'被抽样单位名称'],u"生产日期/批号":data[u'生产日期/批号'],u"抽检项目":data[u'抽检项目']}).count()
47 | except:
48 | already = 1
49 | if not already:
50 | sendData('sfda',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/shandong.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import set_url_head
4 | from searchInfo.items import FileItem
5 | from scrapy.loader import ItemLoader
6 |
7 | class shandongSpider(scrapy.Spider):
8 | name = "shandong"
9 | allowed_domains = ["www.creditsd.gov.cn"]
10 | start_urls = ['http://www.creditsd.gov.cn/creditsearch.punishmentList.phtml?id=']
11 |
12 | def parse(self, response):
13 | for i in range(1,11):
14 | url = 'http://www.creditsd.gov.cn/creditsearch.punishmentList.phtml?id=&keyword=&page=%s' % i
15 | yield scrapy.Request(url, callback=self.parse_list)
16 |
17 | def parse_list(self, response):
18 | div = response.xpath('/html/body/div/table[2]//tr')
19 | for i in div:
20 | url = i.xpath('td[1]/a/@href').extract_first()
21 | if url:
22 | url = 'http://www.creditsd.gov.cn'+url
23 | yield scrapy.Request(url, callback=self.parse_item)
24 |
25 | def parse_item(self, response):
26 | img_url = response.xpath('//*[@id="img"]/@src').extract_first()
27 | if img_url:
28 | img_url = 'http://www.creditsd.gov.cn'+img_url
29 | l = ItemLoader(item=FileItem(), response=response)
30 | l.add_value('file_urls',img_url)
31 | return l.load_item()
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/shanghai.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import urllib2
4 | import json
5 | from distribute.views import sendData
6 | from datadeal.models import SpiderData
7 |
8 | class ShangHaiSpider(scrapy.Spider):
9 | name = "shanghai"
10 | allowed_domains = ["www.shfda.gov.cn"]
11 | start_urls = ['http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=1&pagesize=20']
12 |
13 | def parse(self, response):
14 | # for i in range(1,815):
15 | for i in range(1,10):
16 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/xxgk2.aspx?pu=&qymc=&slrqstart=&slrqend=&pageindex=%s&pagesize=20' % i
17 | yield scrapy.Request(url, callback=self.parse_item)
18 |
19 |
20 | def parse_item(self, response):
21 | urls = response.xpath('//*[@id="b1"]//a')
22 | for url in urls:
23 | text = url.xpath('string(.)').extract_first()
24 | if text and text == '详情':
25 | url = url.xpath('@href').extract_first()
26 | url = 'http://www.shfda.gov.cn/XingZhengChuFa/'+url
27 | already = SpiderData.objects.filter(url=url)
28 | if already.count() == 0:
29 | yield scrapy.Request(url, callback=self.parse_detail)
30 | else:
31 | # print 'already crawled'
32 | pass
33 |
34 | def parse_detail(self,response):
35 | trs = response.xpath('//*[@id="main"]/div/div[2]/table//tr')
36 | data = {}
37 | for tr in trs:
38 | key = tr.xpath('td[1]/text()').extract_first()
39 | val = tr.xpath('td[2]/text()').extract_first()
40 | data[key] = val
41 | sendData('shanghai',data,response.url)
--------------------------------------------------------------------------------
/searchInfo/searchInfo/spiders/xxgk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from distribute.views import *
4 |
5 | class XxgkSpider(scrapy.Spider):
6 | name = "xxgk"
7 | allowed_domains = ["xxgk.sdein.gov.cn"]
8 |
9 | def start_requests(self):
10 | results = getTasks('xxgk')
11 | self.taks_urls = {}
12 | self.tasks = {}
13 | if isinstance(results,dict):
14 | print results['error']
15 | else:
16 | for re in results:
17 | self.tasks[re['id']] = {'t_count':len(re['urls']),'count':0}
18 | for u in re['urls']:
19 | self.taks_urls[u] = re['id']
20 | yield self.make_requests_from_url(u)
21 |
22 | def after_parse(self,url):
23 | task_id = self.taks_urls[url]
24 | self.tasks[task_id]['count'] += 1
25 | if self.tasks[task_id]['count'] == self.tasks[task_id]['t_count']:
26 | afterTasks(task_id)
27 |
28 | def parse(self, response):
29 | xxgk_table = response.xpath('//table[@width="763"]/tr[4]/td/table/tr')
30 | for i in xxgk_table:
31 | sdein_title = i.xpath('td[2]/a/text()').extract_first()
32 | sdein_date = i.xpath('td[3]/text()').extract_first()
33 | sdein_url = i.xpath('td[2]/a/@href').extract_first()
34 | sdein_url = set_url_head(sdein_url,response.url)
35 | if sdein_url:
36 | yield scrapy.Request(sdein_url, callback=self.parse_item,meta={'sdein_title':sdein_title,'sdein_date':sdein_date,})
37 | self.after_parse(response.url)
38 |
39 | def parse_item(self, response):
40 | sdein_title = response.meta['sdein_title']
41 | sdein_date = response.meta['sdein_date']
42 | xxgk_content = response.xpath('/html/body/table[2]/tr[6]/td/table/tr[2]/td/table[4]/tr[3]/td/table[1]').xpath('string(.)').extract_first()
43 | trs = response.xpath('/html/body/table[2]/tr[6]/td/table/tr[2]/td/table[2]/tr')
44 | data = {}
45 | for tr in trs:
46 | tds = tr.xpath('td')
47 | for num,td in enumerate(tds):
48 | if num % 2 == 0:
49 | data[td.xpath('string(.)').extract_first()] = tds[num+1].xpath('string(.)').extract_first()
50 | data['content'] = xxgk_content
51 | sendData('xxgk',data,response.url)
--------------------------------------------------------------------------------
/selenium/baidu_cookies.json:
--------------------------------------------------------------------------------
1 | [{"domain": "index.baidu.com", "name": "bdshare_firstime", "expires": "\u5468\u65e5, 24 \u4e03\u6708 2022 02:52:30 GMT", "create_at": 1500864751.085, "value": "1500864750591", "expiry": 1658631150, "path": "/", "httponly": false, "secure": false}, {"domain": ".index.baidu.com", "name": "Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc", "value": "1500864750", "path": "/", "httponly": false, "secure": false}, {"domain": ".index.baidu.com", "name": "Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc", "expires": "\u5468\u4e8c, 24 \u4e03\u6708 2018 02:52:30 GMT", "value": "1500864214,1500864295,1500864672,1500864742", "expiry": 1532400750, "path": "/", "httponly": false, "secure": false}, {"domain": "index.baidu.com", "name": "CHKFORREG", "expires": "\u5468\u4e8c, 25 \u4e03\u6708 2017 02:52:29 GMT", "value": "54b8a6ea6d56d48e58c165c605b717e1", "expiry": 1500951149, "path": "/", "httponly": false, "secure": false}, {"domain": ".baidu.com", "name": "BDUSS", "expires": "\u5468\u4e94, 10 \u5341\u6708 2025 02:52:27 GMT", "value": "TNWV0UydlptYW91SksxMzJINjI4UDBhOXo4RUpPQ2hBN01lN1lnbVpvVHI3WnhaSVFBQUFBJCQAAAAAAAAAAAEAAAARP6U7stDA4c7e2~MAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOtgdVnrYHVZL", "expiry": 1760064747, "path": "/", "httponly": true, "secure": false}, {"domain": ".index.baidu.com", "name": "FP_UID", "expires": "\u5468\u516d, 31 \u5341\u4e8c\u6708 2050 00:00:00 GMT", "value": "e692d0e17c4002bb8907b276ce03d6c8", "expiry": 2556057600, "path": "/", "httponly": false, "secure": false}, {"domain": "index.baidu.com", "name": "searchtips", "expires": "\u5468\u56db, 19 \u4e03\u6708 2018 02:52:22 GMT", "value": "1", "expiry": 1531968742, "path": "/", "httponly": false, "secure": false}, {"domain": ".baidu.com", "name": "BAIDUID", "expires": "\u5468\u4e8c, 24 \u4e03\u6708 2018 02:52:22 GMT", "value": "62945FA019DB5ABFC5EDB35994164E0A:FG=1", "expiry": 1532400742, "path": "/", "httponly": false, "secure": false}]
--------------------------------------------------------------------------------
/selenium/exponent_baidu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | from selenium.webdriver.common.action_chains import ActionChains
7 | from keywords import get_keywords,save_keyword_index
8 | from PIL import Image
9 | import time
10 | import urllib
11 | import json
12 | import os
13 | import time
14 |
15 | def reset_cookies(browser,listCookies):
16 | browser.delete_all_cookies()
17 | for cookie in listCookies:
18 | browser.add_cookie({
19 | 'domain': cookie['domain'] if cookie['domain'].startswith('.') else '.'+cookie['domain'],
20 | 'name': cookie['name'],
21 | 'value': cookie['value'],
22 | 'path': '/',
23 | 'expires': None
24 | })
25 |
26 | def login(browser):
27 | name = browser.find_element_by_id("TANGRAM_12__userName")
28 | name.clear()
29 | name.send_keys("yourname")
30 | password = browser.find_element_by_id("TANGRAM_12__password")
31 | password.clear()
32 | password.send_keys("yourpassword")
33 | browser.find_element_by_id('TANGRAM_12__submit').click()
34 | time.sleep(3)
35 | with open('baidu_cookies.json', 'w') as f:
36 | cookies = browser.get_cookies()
37 | cookies[0]['create_at'] = time.time()
38 | data = json.dumps(cookies)
39 | f.write(data)
40 |
41 | def move_fuc(action,browser,keyword,x,y,k):
42 | # Simulate a mouse move over the chart and take a screenshot
43 | trend = browser.find_element_by_id("trend")
44 | action.move_to_element_with_offset(trend,x,y).perform()
45 | time.sleep(10)
46 | browser.save_screenshot('images/screenshot.png')
47 | # Crop the screenshot to the target element
48 | viewbox = browser.find_element_by_id("viewbox")
49 | date = browser.find_element_by_xpath('//*[@id="viewbox"]/div[1]/div[1]').text.split(' ')[0]
50 | left = viewbox.location['x']
51 | top = viewbox.location['y']
52 | right = viewbox.location['x'] + viewbox.size['width']
53 | bottom = viewbox.location['y'] + viewbox.size['height']
54 | im = Image.open('images/screenshot.png')
55 | im = im.crop((left, top, right, bottom))
56 | image_name = 'images/baidu_%s_%s.png' % (keyword,date)
57 | im.save(image_name)
58 | time.sleep(1)
59 |
60 | # Call the OCR binary to read the cropped image
61 | os.system('./zfOcr '+image_name)
62 | time.sleep(3)
63 | dir_name = os.path.dirname(os.path.abspath(__file__))+'/'
64 | if os.path.exists(dir_name+image_name+'.txt'):
65 | with open(image_name+'.txt','r') as f:
66 | num = int(f.read())
67 | data = {'keyword_id':k[0],'site':u'百度','keyword_type':k[2],'index_date':date,'index_value':num}
68 | save_keyword_index(data)
69 | else:
70 | print '%s.txt file does not exist' % (image_name)
71 | # print date,num
72 |
73 |
74 | if __name__ == '__main__':
75 | browser = webdriver.PhantomJS()
76 | try:
77 | browser.maximize_window()
78 | keyword_list = get_keywords()
79 | for k in keyword_list:
80 | keyword = k[1].decode('utf8')
81 | keyword = urllib.quote(keyword.encode('cp936'))
82 | try:
83 | browser.get('http://index.baidu.com/?tpl=trend&word=%s' % keyword)
84 | with open('baidu_cookies.json', 'r') as f:
85 | listCookies = f.read()
86 | if listCookies:
87 | listCookies = json.loads(listCookies)
88 | create_at = listCookies[0]['create_at']
89 | else:
90 | create_at = 0
91 | if create_at == 0 or time.time() - create_at > 3600*5:
92 | login(browser)
93 | else:
94 | reset_cookies(browser,listCookies)
95 | browser.get('http://index.baidu.com/?tpl=trend&word=%s' % keyword)
96 | time.sleep(5)
97 | try:
98 | trend = browser.find_element_by_id("trend")
99 | except:
100 | trend = ''
101 | if trend:
102 | action = ActionChains(browser)
103 | for i in range(0,30):
104 | x = 30 + 42*i
105 | if i == 29:
106 | x = 1230
107 | move_fuc(action,browser,keyword,x,150,k)
108 | else:
109 | print '%s not found' % keyword
110 | except Exception, e:
111 | print keyword,unicode(e)
112 | except Exception, e:
113 | print unicode(e)
114 | finally:
115 | browser.quit()
--------------------------------------------------------------------------------
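exponent_baidu.py imports WebDriverWait and expected_conditions but never uses them, relying on fixed time.sleep calls instead. A minimal sketch of an explicit wait for the same #trend chart element, assuming the URL pattern and element id used above:

    # Minimal sketch: wait up to 30s for the #trend chart instead of sleeping a fixed 5s.
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.wait import WebDriverWait

    browser = webdriver.PhantomJS()
    try:
        browser.get('http://index.baidu.com/?tpl=trend&word=test')  # same URL pattern as above
        trend = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.ID, 'trend')))
        print trend.size
    finally:
        browser.quit()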
/selenium/exponent_sina.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | from selenium.webdriver.common.action_chains import ActionChains
7 | from keywords import get_keywords,save_keyword_index
8 | import urllib
9 | import time
10 | import sys
11 |
12 | def move_fuc(action,element,browser,x,y=200):
13 | action.move_to_element_with_offset(element,x,y).perform()
14 | time.sleep(1)
15 | div = browser.find_element_by_xpath('//*[@id="hotword_chart"]/div/div[2]')
16 | text = div.get_attribute('innerHTML')
17 | date = text.split('\n')[0].split(u'：')[1]
18 | val = text.split('\n')[1].split(u'：')[1].replace(',','')
19 | return date,val
20 |
21 |
22 | if __name__ == '__main__':
23 | browser = webdriver.PhantomJS()
24 | try:
25 | browser.maximize_window()
26 | keyword_list = get_keywords()
27 | for k in keyword_list:
28 | keyword = k[1].decode('utf8')
29 | keyword = urllib.quote(keyword.encode('utf8'))
30 | browser.get('http://data.weibo.com/index/hotword?wid=1020000010045&wname=%s' % keyword)
31 | try:
32 | canvas = browser.find_element_by_xpath('//*[@id="hotword_chart"]/div/canvas[7]')
33 | except:
34 | canvas = ''
35 |
36 | if canvas:
37 | action = ActionChains(browser)
38 |
39 | data = {}
40 | for i in range(0,33):
41 | date,val = move_fuc(action,canvas,browser,35+i*26)
42 | # print date,val
43 | data = {'keyword_id':k[0],'site':u'新浪','keyword_type':k[2],'index_date':date,'index_value':val}
44 | try:
45 | save_keyword_index(data)
46 | except Exception, e:
47 | print keyword,date,val,unicode(e)
48 | else:
49 | print '%s not found' % keyword
50 | except Exception, e:
51 | print unicode(e)
52 | finally:
53 | browser.quit()
54 |
--------------------------------------------------------------------------------
/selenium/exponent_sougou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | from keywords import get_keywords,save_keyword_index
7 | import re
8 | import urllib
9 | import time
10 |
11 | if __name__ == '__main__':
12 | browser = webdriver.PhantomJS()
13 | try:
14 | keyword_list = get_keywords()
15 | for k in keyword_list:
16 | keyword = k[1].decode('utf8')
17 | keyword = urllib.quote(keyword.encode('utf8'))
18 | browser.get('http://zhishu.sogou.com/index/searchHeat?kwdNamesStr=%s&timePeriodType=MONTH&dataType=SEARCH_ALL&queryType=INPUT' % keyword)
19 | try:
20 | r = re.findall(r'root.SG.data = {"pvList":\[([\s\S]*)],"infoList"', browser.page_source, re.M)
21 | except:
22 | r = ''
23 | if r:
24 | points = eval(r[0].split('],"infoList"')[0])
25 | for p in points:
26 | date = str(p['date'])
27 | date = date[0:4]+'-'+date[4:6]+'-'+date[6:8]
28 | # print date,p['pv']
29 | data = {'keyword_id':k[0],'site':u'搜狗','keyword_type':k[2],'index_date':date,'index_value':p['pv']}
30 | try:
31 | save_keyword_index(data)
32 | except Exception, e:
33 | print unicode(e),keyword,date,p['pv']
34 | else:
35 | print '%s not found' % keyword
36 | except Exception, e:
37 | print unicode(e)
38 | finally:
39 | browser.quit()
--------------------------------------------------------------------------------
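exponent_sougou.py parses the captured pvList fragment with eval. A minimal sketch of the same parsing with json.loads, assuming the text captured between 'pvList":[' and '],"infoList"' is valid JSON:

    # Minimal sketch: parse the regex capture without eval.
    import json

    def parse_pv_list(fragment):
        # fragment is the text captured between 'pvList":[' and '],"infoList"'
        points = json.loads('[%s]' % fragment)
        for p in points:
            date = str(p['date'])
            yield date[0:4] + '-' + date[4:6] + '-' + date[6:8], p['pv']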
/selenium/keywords.py:
--------------------------------------------------------------------------------
1 | import psycopg2
2 |
3 | conn = psycopg2.connect(database='SYH',user='postgres',password='bigdata123',host='10.20.1.50',port='5432')
4 | cur = conn.cursor()
5 |
6 | def get_keywords():
7 | cur.execute("SELECT * FROM biz_keyword;")
8 | rows = cur.fetchall()
9 | return rows
10 |
11 | def save_keyword_index(data):
12 | cur.execute('SELECT * FROM biz_keyword_index where keyword_id=%s and index_date=\'%s\' and site=\'%s\';' % (data['keyword_id'],data['index_date'],data['site']))
13 | rows = cur.fetchall()
14 | if not len(rows):
15 | # print('save:',data['index_date'])
16 | cur.execute("INSERT INTO biz_keyword_index (keyword_id,site,keyword_type,index_date,index_value) values (%s,\'%s\',%s,\'%s\',%s);" % (data['keyword_id'],data['site'],data['keyword_type'],data['index_date'],data['index_value']))
17 | conn.commit()
18 | return rows
--------------------------------------------------------------------------------
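keywords.py builds both SQL statements with % string interpolation, which breaks on values containing quotes and is open to injection. A minimal sketch of the same lookup/insert using psycopg2 parameter binding, assuming the same table and columns:

    # Minimal sketch: same lookup/insert as save_keyword_index, with bound parameters.
    def save_keyword_index_safe(cur, conn, data):
        cur.execute(
            "SELECT 1 FROM biz_keyword_index "
            "WHERE keyword_id=%s AND index_date=%s AND site=%s;",
            (data['keyword_id'], data['index_date'], data['site']))
        if cur.fetchone() is None:
            cur.execute(
                "INSERT INTO biz_keyword_index "
                "(keyword_id, site, keyword_type, index_date, index_value) "
                "VALUES (%s, %s, %s, %s, %s);",
                (data['keyword_id'], data['site'], data['keyword_type'],
                 data['index_date'], data['index_value']))
            conn.commit()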
/selenium/selenium_get_url.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support import expected_conditions as EC
5 | from selenium.webdriver.support.wait import WebDriverWait
6 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
7 | from selenium.webdriver.common.action_chains import ActionChains
8 | import time
9 |
10 | def write_fuc(browser):
11 | table = browser.find_element_by_xpath('//*[@id="mainleft"]')
12 | tag_a = table.find_elements_by_tag_name('a')
13 | with open('url_list.txt','a+') as f:
14 | for a in tag_a:
15 | text = a.text
16 | if u'信息公开表' in text:
17 | print text
18 | f.write(a.get_attribute("href")+'\n')
19 |
20 | if __name__ == '__main__':
21 |
22 | # desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
23 | # desired_capabilities["phantomjs.page.settings.userAgent"] = (
24 | # "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
25 | # )
26 | browser = webdriver.PhantomJS()
27 |
28 | # browser.get('http://www.weihaifda.gov.cn/col/col14562/index.html')
29 | # browser.save_screenshot('screenshot.png')
30 | # write_fuc(browser)
31 | # browser.get('http://www.thsfda.gov.cn/xxgk/xzcfajxxgk/index_1.html')
32 | # write_fuc(browser)
33 | # for i in range(1,41):
34 | # browser.get('http://ypjd.xjalt.gov.cn/qwssjgy.jsp?wbtreeid=1001&currentnum='+str(i)+'&newskeycode2=6KGM5pS%2F5aSE572a5qGI5Lu25L%2Bh5oGv5YWs5byA')
35 | # write_fuc(browser)
36 |
37 | browser.get('http://www.huainan.gov.cn/public/column/4971284?type=4&catId=4977426&action=list')
38 | # browser.find_element_by_xpath('//*[@id="example"]/li[7]/div').click()
39 | # browser.switch_to.frame("conTarget")
40 | # write_fuc(browser)
41 | # time.sleep(1)
42 | # write_fuc(browser)
43 |
44 | count = 1
45 | while count <= 16:
46 | # try:
47 | # next_page = browser.find_element_by_xpath('//*[@id="container"]/div/div/table//tr/td[3]/div[2]/form/table//tr[21]/td/table//tr/td/table//tr/td[2]/div/a[7]')
48 | try:
49 | next_page = browser.find_element_by_partial_link_text('下一页')
50 | # next_page = browser.find_element_by_id('NextPage1_Next')
51 | except:
52 | next_page = ''
53 | # if 'default_pgNextDisabled' in next_page.get_attribute('class'):
54 | if not next_page:
55 | print 'enter_over'
56 | write_fuc(browser)
57 | break
58 | else:
59 | print 'enter'
60 | write_fuc(browser)
61 | next_page.click()
62 | time.sleep(2)
63 | count += 1
64 | browser.quit()
65 |
--------------------------------------------------------------------------------
/selenium/selenium_proxy.py:
--------------------------------------------------------------------------------
1 | #coding:utf8
2 | from selenium import webdriver
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.common.proxy import Proxy
5 | from selenium.webdriver.common.proxy import ProxyType
6 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
7 | from selenium.webdriver.support import expected_conditions as EC
8 | from selenium.webdriver.support.wait import WebDriverWait
9 | import time
10 |
11 | proxy_list = ['183.222.102.105','183.222.102.101','60.216.42.11','47.52.24.117']
12 |
13 | proxy = Proxy(
14 | {
15 | 'proxyType': ProxyType.MANUAL,
16 | 'httpProxy': '47.52.24.117'
17 | }
18 | )
19 | desired_capabilities = DesiredCapabilities.PHANTOMJS.copy()
20 | proxy.add_to_capabilities(desired_capabilities)
21 | desired_capabilities["phantomjs.page.settings.userAgent"] = (
22 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
23 | )
24 |
25 | browser = webdriver.PhantomJS(desired_capabilities=desired_capabilities)
26 |
27 | # browser.get('http://www.seabay.cn/cn/code/?search=pvg')
28 | # print 'start_____'
29 | # table = browser.find_element_by_xpath('//*[@id="infoiata"]')
30 | # print table.get_attribute('innerHTML')
31 | # browser.quit()
32 |
33 |
34 | # browser.get('https://httpbin.org/get?show_env=1') # check request headers
35 | browser.get('http://www.ip181.com/') # check proxy type
36 | # browser.get('http://wenshu.court.gov.cn/List/List?sorttype=1&conditions=searchWord+3+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E8%A1%8C%E6%94%BF%E6%A1%88%E4%BB%B6')
37 | # browser.get('http://app1.sfda.gov.cn/datasearch/face3/base.jsp?tableId=114&tableName=TABLE114&title=%E5%9B%BD%E5%AE%B6%E9%A3%9F%E5%93%81%E5%AE%89%E5%85%A8%E7%9B%91%E7%9D%A3%E6%8A%BD%E6%A3%80%EF%BC%88%E4%B8%8D%E5%90%88%E6%A0%BC%E4%BA%A7%E5%93%81%EF%BC%89&bcId=143106776907834761101199700381')
38 | # print 'start_____'
39 | try:
40 | # browser.get('http://www.luan.gov.cn/opennessTarget/?branch_id=5212bc2d682e09147c7c4aa8&branch_type=&column_code=70302&topic_id=&tag=&page=1')
41 | # time.sleep(3)
42 | browser.save_screenshot('screenshot1.png')
43 | # print browser.page_source
44 | # WebDriverWait(browser,30).until(EC.visibility_of_any_elements_located((By.CSS_SELECTOR,'.dataItem')))
45 | # resultlist = browser.find_element_by_id('list')
46 | # print resultlist.get_attribute('innerHTML')
47 | # time.sleep(10)
48 | # resultlist = browser.find_element_by_id('list')
49 | resultlist = browser.find_element_by_class_name('panel-body')
50 | print resultlist.get_attribute('innerHTML')
51 | # print browser.page_source
52 | # with open('wenshu.html','w') as ws:
53 | # ws.write(resultlist.get_attribute('innerHTML'))
54 | # browser.save_screenshot('screenshot.png')
55 | finally:
56 | browser.quit()
--------------------------------------------------------------------------------
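selenium_proxy.py defines proxy_list but wires only a single hard-coded address into the capabilities. A minimal sketch of cycling through the whole list with the same Proxy/DesiredCapabilities setup:

    #coding:utf8
    # Minimal sketch: try each proxy in turn against the same proxy-check page.
    from selenium import webdriver
    from selenium.webdriver.common.proxy import Proxy, ProxyType
    from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

    proxy_list = ['183.222.102.105', '183.222.102.101', '60.216.42.11', '47.52.24.117']

    for address in proxy_list:
        proxy = Proxy({'proxyType': ProxyType.MANUAL, 'httpProxy': address})
        capabilities = DesiredCapabilities.PHANTOMJS.copy()
        proxy.add_to_capabilities(capabilities)
        browser = webdriver.PhantomJS(desired_capabilities=capabilities)
        try:
            browser.get('http://www.ip181.com/')
            print address, len(browser.page_source)
        finally:
            browser.quit()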
/selenium/tesseract_test.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pytesseract
3 | from PIL import Image
4 |
5 | if __name__ == '__main__':
6 | image = Image.open('images/test_2017-06-25.png')
7 | code = pytesseract.image_to_string(image, lang='eng')
8 | print code
--------------------------------------------------------------------------------
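tesseract_test.py runs pytesseract with its defaults. Since the cropped index screenshots hold a single line of digits, a minimal sketch restricting recognition accordingly (the --psm flag and character whitelist are standard Tesseract options; their exact spelling depends on the installed Tesseract version):

    # -*- coding: utf-8 -*-
    # Minimal sketch: single-line, digits-only recognition for the cropped screenshots.
    import pytesseract
    from PIL import Image

    image = Image.open('images/test_2017-06-25.png')
    code = pytesseract.image_to_string(
        image, lang='eng',
        config='--psm 7 -c tessedit_char_whitelist=0123456789')
    print code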