├── .gitignore ├── Jpider ├── __init__.py ├── __init__.pyc ├── settings.py ├── settings.pyc ├── urls.py └── wsgi.py ├── README.md ├── conf └── logging.conf ├── funny ├── 1111.py ├── __init__.py ├── add#.py ├── alert.conf ├── ambari_yinxin_popo_alert.py ├── exception_t.py ├── execjs_test │ ├── __init__.py │ └── exec_js.py ├── funny_comment ├── funny_comment# ├── info.conf ├── readconf.py └── wx.py ├── log └── jpider.log ├── manage.py ├── myutil ├── __init__.py ├── email2.py └── myemail.py ├── output ├── __init__.py ├── all-data-2017_04_27.xls ├── all-data.xls ├── baidu.py └── dazhongdianping.py ├── spiders ├── Sina_spider1 │ ├── Begin.py │ ├── Sina_spider1 │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── conf.py │ │ ├── conf │ │ │ └── weibo.yaml │ │ ├── constant.py │ │ ├── cookies.py │ │ ├── cookies.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── middleware.py │ │ ├── middleware.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── spiders.py │ │ │ └── spiders.pyc │ │ ├── user_agents.py │ │ ├── user_agents.pyc │ │ ├── yumdama.py │ │ └── yumdama.pyc │ ├── aa.png │ └── scrapy.cfg ├── __init__.py ├── __init__.pyc ├── admin.py ├── apps.py ├── apps.pyc ├── baidurank │ ├── 123.html │ ├── baidurank │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── rank.py │ ├── hello.html │ └── scrapy.cfg ├── bilibili │ ├── __init__.py │ └── bilibili_spider.py ├── dazongdianping │ ├── dazongdianping.log │ ├── dazongdianping │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── dazhong_no.py │ │ │ ├── dazong.py │ │ │ ├── dazong_repair.py │ │ │ └── features.txt │ └── scrapy.cfg ├── dist_weibo │ ├── __init__.py │ ├── conf │ │ ├── account.conf │ │ └── logging.conf │ ├── dao │ │ ├── __init__.py │ │ ├── redis_cookies.py │ │ └── sqlalchemy_session.py │ ├── headers.py │ ├── js │ │ └── ssologin.js │ ├── kill_celery.txt │ ├── logger.py │ ├── login │ │ ├── __init__.py │ │ └── login.py │ ├── model │ │ ├── __init__.py │ │ └── models.py │ ├── notebook │ │ └── Request.md │ ├── sql │ │ └── database.sql │ ├── tasks │ │ ├── __init__.py │ │ ├── home_page.py │ │ ├── login.py │ │ ├── mobile_login.py │ │ ├── user.py │ │ └── workers.py │ └── workers.py ├── dist_weibo_spider │ ├── __init__.py │ ├── conf │ │ └── account.conf │ ├── dao │ │ ├── __init__.py │ │ └── redis_cookies.py │ ├── headers.py │ ├── js │ │ └── ssologin.js │ ├── login │ │ ├── __init__.py │ │ └── login.py │ ├── notebook │ │ └── Request.md │ ├── tasks │ │ ├── __init__.py │ │ ├── home_page.py │ │ └── workers.py │ └── workers.py ├── distributed │ ├── README.md │ ├── __init__.py │ ├── celeryt │ │ ├── __init__.py │ │ ├── celerybeat-schedule │ │ ├── tasks.py │ │ └── test.py │ ├── redist │ │ ├── __init__.py │ │ └── test_redis.py │ ├── task_dispatcher.py │ ├── tasks.py │ └── workers.py ├── logger.py ├── models.py ├── mzi │ ├── mzi │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── baikerank.py │ │ │ └── meizi.py │ └── scrapy.cfg ├── onepiece │ ├── aaa │ ├── onepiece │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ └── one_piece.py │ └── scrapy.cfg ├── pangxieyg │ ├── README.MD │ ├── README.txt │ ├── __init__.py │ ├── alert-templates.xml │ 
├── build │ │ └── pangxie2 │ │ │ ├── out00-Analysis.toc │ │ │ ├── out00-EXE.toc │ │ │ ├── out00-PKG.pkg │ │ │ ├── out00-PKG.toc │ │ │ ├── out00-PYZ.pyz │ │ │ ├── out00-PYZ.toc │ │ │ ├── pangxie2.exe.manifest │ │ │ └── warnpangxie2.txt │ ├── conf │ │ ├── README.MD │ │ ├── info.conf │ │ └── logging.conf │ ├── dist.zip │ ├── dist │ │ ├── conf │ │ │ ├── README.MD │ │ │ ├── info.conf │ │ │ └── logging.conf │ │ └── pangxie2.exe │ ├── i.spec │ ├── ico.ico │ ├── mp3player.py │ ├── notify.wav │ ├── pangxie2.py │ ├── pangxie2.spec │ ├── pangxieyg.py │ ├── record │ │ ├── 104808_2017-06-02.txt │ │ ├── 104808_2017-06-06.txt │ │ ├── 107494_2017-06-02.txt │ │ ├── 107494_2017-06-06.txt │ │ ├── 109121_2017-06-02.txt │ │ ├── 109121_2017-06-06.txt │ │ ├── 111592_2017-06-02.txt │ │ ├── 111592_2017-06-06.txt │ │ ├── 121557_2017-06-02.txt │ │ └── 121557_2017-06-06.txt │ ├── sound.py │ ├── user_agent.py │ └── winui.py ├── rank │ ├── __init__.py │ └── baike_rank.py ├── tests.py ├── user_agent.py ├── views.py ├── wechat_sport │ ├── __init__.py │ ├── get_steps.py │ └── wechat_login.py ├── weibo │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── user_agent.cpython-35.pyc │ ├── conf │ │ ├── README.md │ │ └── account.conf │ ├── constants.py │ ├── dao.py │ ├── multhread.py │ ├── weibo2.py │ ├── weibo_conf.py │ └── weibo_http.py └── zju │ ├── scrapy.cfg │ └── zju │ ├── items.py │ ├── middlewares.py │ ├── myemail.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ └── ZjuSpider.py │ └── wechat.py └── usage ├── README.md ├── __init__.py ├── __pycache__ └── __init__.cpython-35.pyc ├── celery_u ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── celery.cpython-35.pyc │ └── tasks.cpython-35.pyc ├── add.py ├── celery3.pdf ├── celery_.py ├── celeryconfig.py └── tasks.py ├── kafka_u ├── __init__.py ├── consumer.py └── producer.py ├── proj └── tasks.py ├── redis_u ├── __init__.py └── redis_u.py └── requests_u ├── __init__.py └── req_usage.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .idea/ 3 | Jpider/__pycache__/ 4 | spiders/__pycache__/ 5 | spiders/migrations/ 6 | templates/ 7 | *.log 8 | */*.conf 9 | -------------------------------------------------------------------------------- /Jpider/__init__.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | pymysql.install_as_MySQLdb() 3 | -------------------------------------------------------------------------------- /Jpider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/Jpider/__init__.pyc -------------------------------------------------------------------------------- /Jpider/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for Jpider project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.10.3. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.10/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '$v@3$86$0^8cney1sk%ln+aa+zf!x@=6bhs$u1&ka(*!i%vub$' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'spiders.apps.SpidersConfig' 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'Jpider.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 59 | , 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'Jpider.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/1.10/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.mysql', 81 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | 'NAME': 'spider', 83 | 'USER': 'root', 84 | 'HOST': '127.0.0.1', 85 | 'PASSWORD': '1111', 86 | 'PORT': 3306, 87 | # 在处理包含emoji微博的时候,可能会出错 88 | # 想去看下字符编码 89 | # use spider; 90 | # show variables like 'character_set_database'; 91 | # 修改字段字符编码 92 | # alter table spiders_weibo modify text longtext charset utf8mb4 collate utf8mb4_unicode_ci; 93 | 'OPTIONS': {'charset': 'utf8mb4'}, 94 | } 95 | } 96 | 97 | 98 | # Password validation 99 | # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators 100 | 101 | AUTH_PASSWORD_VALIDATORS = [ 102 | { 103 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 104 | }, 105 | { 106 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 107 | }, 108 | { 109 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 110 | }, 111 | { 112 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 113 | }, 114 | ] 115 | 116 | 117 | # Internationalization 118 | # https://docs.djangoproject.com/en/1.10/topics/i18n/ 119 | 120 | LANGUAGE_CODE = 'en-us' 121 | 122 | TIME_ZONE = 'UTC' 123 | 124 | USE_I18N = True 125 | 126 | USE_L10N = True 127 | 128 | USE_TZ = True 129 | 130 | 131 | # Static files (CSS, JavaScript, Images) 132 | # https://docs.djangoproject.com/en/1.10/howto/static-files/ 133 | 134 | STATIC_URL = '/static/' 135 | -------------------------------------------------------------------------------- /Jpider/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/Jpider/settings.pyc -------------------------------------------------------------------------------- /Jpider/urls.py: -------------------------------------------------------------------------------- 1 | """Jpider URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.10/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from django.contrib import admin 18 | 19 | urlpatterns = [ 20 | url(r'^admin/', admin.site.urls), 21 | ] 22 | -------------------------------------------------------------------------------- /Jpider/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for Jpider project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Jpider.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jpider 2 | 新浪微博、拉钩网、大众点评各种爬虫 3 | 4 | 项目模块比较凌乱 5 | 6 | 所有爬虫都在 spiders包下 7 | 8 | - baidurank 9 | > 百度百科的明星排行 10 | - dazongdianping 11 | > 大众点评店铺信息 12 | - dist_weibo_spider 13 | > pc端微博分布式爬虫,还未完成.... 
14 | - mzi 15 | > 妹子网 16 | - pangxieyg 17 | > 小众电商抢购爬虫 18 | - wechat_sport 19 | > 微信运动步数获取,目前只能获取自己的步数 20 | - weibo 21 | > 移动端微博爬虫,多线程 22 | 23 | 所有爬虫的ORM用的是django自带的ORM 24 | 25 | - usage 26 | > 一些库的使用示例 27 | 28 | -------------------------------------------------------------------------------- /conf/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, simpleLogger 3 | 4 | [handlers] 5 | keys=consoleHandler,fileHandler,rotatingFileHandler 6 | 7 | [formatters] 8 | keys=simpleFmt 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=rotatingFileHandler 13 | 14 | [logger_simpleLogger] 15 | level=DEBUG 16 | handlers=consoleHandler,rotatingFileHandler 17 | qualname=simpleLogger 18 | propagate=0 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=DEBUG 23 | formatter=simpleFmt 24 | args=(sys.stdout,) 25 | 26 | [handler_fileHandler] 27 | class=FileHandler 28 | level=DEBUG 29 | formatter=simpleFmt 30 | args=('../../log/jpider.log','a') 31 | 32 | [handler_rotatingFileHandler] 33 | class=handlers.RotatingFileHandler 34 | level=DEBUG 35 | formatter=simpleFmt 36 | args=('../../log/jpider.log','a',50*1024*1024, 10) 37 | 38 | [formatter_simpleFmt] 39 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 40 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /funny/1111.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | 4 | def is_perfect(input_str): 5 | found = False 6 | for index in range(len(input_str)-3): 7 | if not found: 8 | if input_str[index] == input_str[index+1]: 9 | if input_str[index+2] == input_str[index+3]: 10 | found = True 11 | else: 12 | return False 13 | else: 14 | if input_str[index] == input_str[index+1]: 15 | return False 16 | else: 17 | return found 18 | 19 | s = input() 20 | all_perm = set() 21 | for i in itertools.permutations(s, len(s)): 22 | print(i) 23 | all_perm.add(''.join(list(i))) 24 | count = 0 25 | for i in all_perm: 26 | if is_perfect(i): 27 | print(i) 28 | count += 1 29 | print(count) 30 | -------------------------------------------------------------------------------- /funny/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/funny/__init__.py -------------------------------------------------------------------------------- /funny/add#.py: -------------------------------------------------------------------------------- 1 | def try_expect_finally(): 2 | try: 3 | 1/0 4 | except Exception: 5 | print 'e' 6 | finally: 7 | print 'finally' 8 | try_expect_finally() 9 | -------------------------------------------------------------------------------- /funny/alert.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | type=15 3 | product=test 4 | account=xxxxxx@corp.netease.com 5 | mobile=134xxxxx520 6 | subject=alert send by python 7 | -------------------------------------------------------------------------------- /funny/ambari_yinxin_popo_alert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __doc__ = """ 3 | ambari alert by yixin & popo 4 | """ 5 | import sys 6 | import requests 7 | import ConfigParser 8 | 9 | netease_alert_url = 'https://xxxx/omnew/alert/sendMultiAlert' 10 | conf_path = 
'./alert.conf' 11 | 12 | 13 | def send_alert(data): 14 | if isinstance(data, dict): 15 | requests.post(url=netease_alert_url, json=data) 16 | 17 | 18 | def main(): 19 | cf = ConfigParser.ConfigParser() 20 | cf.read(conf_path) 21 | data = {} 22 | type = int(cf.get('main', 'type')) 23 | product = cf.get('main', 'product') 24 | data['product'] = product 25 | data['type'] = type 26 | # get message from ambari alert 27 | definitionName = sys.argv[1] 28 | definitionLabel = sys.argv[2] 29 | serviceName = sys.argv[3] 30 | alertState = sys.argv[4] 31 | alertText = sys.argv[5] 32 | alert_msg = "definitionName:%s\ndefinitionLabel:%s\nserviceName:%s\nalertState:%s\nalertText:%s\n" % \ 33 | (definitionName, definitionLabel, serviceName, alertState, alertText) 34 | 35 | if type >> 3 == 1: # send by yixin 36 | type &= 7 37 | mobile = cf.get('main', 'mobile') 38 | data['mobile'] = mobile 39 | data['yixinMsg'] = alert_msg 40 | 41 | if type >> 2 == 1: # duanxin 42 | type &= 3 43 | mobile = cf.get('main', 'mobile') 44 | data['mobile'] = mobile 45 | data['mobileMsg'] = alert_msg 46 | 47 | if type >> 1 == 1: # email 48 | type &= 1 49 | account = cf.get('main', 'account') 50 | data['account'] = account 51 | data['emailMsg'] = alert_msg 52 | data['subject'] = cf.get('main', 'subject') 53 | 54 | if type == 1: # popo 55 | data['account'] = cf.get('main', 'account') 56 | data['popoMsg'] = alert_msg 57 | 58 | send_alert(data=data) 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /funny/exception_t.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | def f(): 4 | try: 5 | 1/2 6 | return 7 | except Exception as e: 8 | print traceback.format_exc() 9 | finally: 10 | print 'finally' 11 | 12 | f() -------------------------------------------------------------------------------- /funny/execjs_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/funny/execjs_test/__init__.py -------------------------------------------------------------------------------- /funny/execjs_test/exec_js.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | 3 | print(execjs.eval("'1 2 3'.split()")) 4 | ctx = execjs.compile(""" 5 | function add(x, y){ 6 | return x + y; 7 | } 8 | """) 9 | print(ctx.call('add', 1, 2)) 10 | -------------------------------------------------------------------------------- /funny/funny_comment: -------------------------------------------------------------------------------- 1 | 2 | ,_-=(!7(7/zs_. 3 | .=' ' .`/,/!(=)Zm. 4 | .._,,._.. ,-`- `,\ ` -` -`\\7//WW. 5 | ,v=~/.-,-\- -!|V-s.)iT-|s|\-.' `///mK%. 6 | v!`i!-.e]-g`bT/i(/[=.Z/m)K(YNYi.. /-]i44M. 7 | v`/,`|v]-DvLcfZ/eV/iDLN\D/ZK@%8W[Z.. `/d!Z8m 8 | //,c\(2(X/NYNY8]ZZ/bZd\()/\7WY%WKKW) -'|(][%4. 9 | ,\\i\c(e)WX@WKKZKDKWMZ8(b5/ZK8]Z7%ffVM, -.Y!bNMi 10 | /-iit5N)KWG%%8%%%%W8%ZWM(8YZvD)XN(@. [ \]!/GXW[ 11 | / ))G8\NMN%W%%%%%%%%%%8KK@WZKYK*ZG5KMi,- vi[NZGM[ 12 | i\!(44Y8K%8%%%**~YZYZ@%%%%%4KWZ/PKN)ZDZ7 c=//WZK%! 13 | ,\v\YtMZW8W%%f`,`.t/bNZZK%%W%%ZXb*K(K5DZ -c\\/KM48 14 | -|c5PbM4DDW%f v./c\[tMY8W%PMW%D@KW)Gbf -/(=ZZKM8[ 15 | 2(N8YXWK85@K -'c|K4/KKK%@ V%@@WD8e~ .//ct)8ZK%8` 16 | =)b%]Nd)@KM[ !'\cG!iWYK%%| !M@KZf -c\))ZDKW%` 17 | YYKWZGNM4/Pb '-VscP4]b@W% 'Mf` -L\///KM(%W! 18 | !KKW4ZK/W7)Z. 
'/cttbY)DKW% -` .',\v)K(5KW%%f 19 | 'W)KWKZZg)Z2/,!/L(-DYYb54% ,,`, -\-/v(((KK5WW%f 20 | \M4NDDKZZ(e!/\7vNTtZd)8\Mi!\-,-/i-v((tKNGN%W%% 21 | 'M8M88(Zd))///((|D\tDY\\KK-`/-i(=)KtNNN@W%%%@%[ 22 | !8%@KW5KKN4///s(\Pd!ROBY8/=2(/4ZdzKD%K%%%M8@%% 23 | '%%%W%dGNtPK(c\/2\[Z(ttNYZ2NZW8W8K%%%%YKM%M%%. 24 | *%%W%GW5@/%!e]_tZdY()v)ZXMZW%W%%%*5Y]K%ZK%8[ 25 | '*%%%%8%8WK\)[/ZmZ/Zi]!/M%%%%@f\ \Y/NNMK%%! 26 | 'VM%%%%W%WN5Z/Gt5/b)((cV@f` - |cZbMKW%%| 27 | 'V*M%%%WZ/ZG\t5((+)L\'-,,/ -)X(NWW%% 28 | `~`MZ/DZGNZG5(((\, ,t\\Z)KW%@ 29 | 'M8K%8GN8\5(5///]i!v\K)85W%%f 30 | YWWKKKKWZ8G54X/GGMeK@WM8%@ 31 | !M8%8%48WG@KWYbW%WWW%%%@ 32 | VM%WKWK%8K%%8WWWW%%%@` 33 | ~*%%%%%%W%%%%%%%@~ 34 | ~*MM%%%%%%@f` 35 | ''''' 36 | 37 | -------------------------------------------------------------------------------- /funny/funny_comment#: -------------------------------------------------------------------------------- 1 | # 2 | # ,_-=(!7(7/zs_. 3 | # .=' ' .`/,/!(=)Zm. 4 | # .._,,._.. ,-`- `,\ ` -` -`\\7//WW. 5 | # ,v=~/.-,-\- -!|V-s.)iT-|s|\-.' `///mK%. 6 | # v!`i!-.e]-g`bT/i(/[=.Z/m)K(YNYi.. /-]i44M. 7 | # v`/,`|v]-DvLcfZ/eV/iDLN\D/ZK@%8W[Z.. `/d!Z8m 8 | # //,c\(2(X/NYNY8]ZZ/bZd\()/\7WY%WKKW) -'|(][%4. 9 | # ,\\i\c(e)WX@WKKZKDKWMZ8(b5/ZK8]Z7%ffVM, -.Y!bNMi 10 | # /-iit5N)KWG%%8%%%%W8%ZWM(8YZvD)XN(@. [ \]!/GXW[ 11 | # / ))G8\NMN%W%%%%%%%%%%8KK@WZKYK*ZG5KMi,- vi[NZGM[ 12 | # i\!(44Y8K%8%%%**~YZYZ@%%%%%4KWZ/PKN)ZDZ7 c=//WZK%! 13 | # ,\v\YtMZW8W%%f`,`.t/bNZZK%%W%%ZXb*K(K5DZ -c\\/KM48 14 | # -|c5PbM4DDW%f v./c\[tMY8W%PMW%D@KW)Gbf -/(=ZZKM8[ 15 | # 2(N8YXWK85@K -'c|K4/KKK%@ V%@@WD8e~ .//ct)8ZK%8` 16 | # =)b%]Nd)@KM[ !'\cG!iWYK%%| !M@KZf -c\))ZDKW%` 17 | # YYKWZGNM4/Pb '-VscP4]b@W% 'Mf` -L\///KM(%W! 18 | # !KKW4ZK/W7)Z. '/cttbY)DKW% -` .',\v)K(5KW%%f 19 | # 'W)KWKZZg)Z2/,!/L(-DYYb54% ,,`, -\-/v(((KK5WW%f 20 | # \M4NDDKZZ(e!/\7vNTtZd)8\Mi!\-,-/i-v((tKNGN%W%% 21 | # 'M8M88(Zd))///((|D\tDY\\KK-`/-i(=)KtNNN@W%%%@%[ 22 | # !8%@KW5KKN4///s(\Pd!ROBY8/=2(/4ZdzKD%K%%%M8@%% 23 | # '%%%W%dGNtPK(c\/2\[Z(ttNYZ2NZW8W8K%%%%YKM%M%%. 24 | # *%%W%GW5@/%!e]_tZdY()v)ZXMZW%W%%%*5Y]K%ZK%8[ 25 | # '*%%%%8%8WK\)[/ZmZ/Zi]!/M%%%%@f\ \Y/NNMK%%! 
26 | # 'VM%%%%W%WN5Z/Gt5/b)((cV@f` - |cZbMKW%%| 27 | # 'V*M%%%WZ/ZG\t5((+)L\'-,,/ -)X(NWW%% 28 | # `~`MZ/DZGNZG5(((\, ,t\\Z)KW%@ 29 | # 'M8K%8GN8\5(5///]i!v\K)85W%%f 30 | # YWWKKKKWZ8G54X/GGMeK@WM8%@ 31 | # !M8%8%48WG@KWYbW%WWW%%%@ 32 | # VM%WKWK%8K%%8WWWW%%%@` 33 | # ~*%%%%%%W%%%%%%%@~ 34 | # ~*MM%%%%%%@f` 35 | # ''''' 36 | # 37 | # -------------------------------------------------------------------------------- /funny/info.conf: -------------------------------------------------------------------------------- 1 | [user] 2 | username=13486178520 3 | password=vs7452014 4 | goods=104808 130459 5 | 6 | -------------------------------------------------------------------------------- /funny/readconf.py: -------------------------------------------------------------------------------- 1 | import ConfigParser 2 | 3 | 4 | cf = ConfigParser.ConfigParser() 5 | cf.read(r'.\info.conf') 6 | sections = cf.sections() 7 | username = cf.get('user', 'username') 8 | password = cf.get('user', 'password') 9 | goods = '|'.join(cf.get('user', 'goods').split(' ')) 10 | print username, password, goods 11 | -------------------------------------------------------------------------------- /funny/wx.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from wxpy import * 3 | from wechat_sender import * 4 | 5 | bot = Bot() 6 | my_friend = bot.friends().search('jopper')[0] 7 | my_friend.send('hello') 8 | group = bot.groups().search('Team of single dogs')[0] 9 | group.send('send from python, for test\n zhujiajunup@163.com') 10 | 11 | -------------------------------------------------------------------------------- /log/jpider.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/log/jpider.log -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Jpider.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 
20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /myutil/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/myutil/__init__.py -------------------------------------------------------------------------------- /myutil/email2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import smtplib 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.text import MIMEText 6 | 7 | SERVER = 'smtp.163.com' 8 | FROM = 'jjzhu_ncu@163.com' 9 | TO = ['jjzhu_zju@163.com'] 10 | 11 | SUBJECT = u'测试UTF8编码' 12 | TEXT = u'ABCDEFG一二三四五六七' 13 | 14 | msg = MIMEMultipart('alternative') 15 | # 注意包含了非ASCII字符,需要使用unicode 16 | msg['Subject'] = SUBJECT 17 | msg['From'] = FROM 18 | msg['To'] = ', '.join(TO) 19 | part = MIMEText(TEXT, 'plain', 'utf-8') 20 | msg.attach(part) 21 | 22 | server = smtplib.SMTP(SERVER, port=25) 23 | server.login(FROM, 'jvs7452014') 24 | server.sendmail(FROM, TO, msg.as_string().encode('ascii')) 25 | server.quit() 26 | -------------------------------------------------------------------------------- /myutil/myemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import email.mime.multipart 3 | import email.mime.text 4 | 5 | 6 | class Email(object): 7 | content_from = None 8 | content_to = None 9 | content_subject = None 10 | content_msg = None 11 | content_pwd = None 12 | 13 | def send_163(self): 14 | assert self.content_from is not None 15 | assert self.content_to is not None 16 | assert self.content_pwd is not None 17 | msg = email.mime.multipart.MIMEMultipart() 18 | msg['from'] = self.content_from 19 | msg['to'] = self.content_to 20 | msg['subject'] = self.content_subject 21 | txt = email.mime.text.MIMEText(self.content_msg, 'plain', 'utf-8') 22 | msg.attach(txt) 23 | smtp = smtplib.SMTP(host='smtp.163.com', port=25) 24 | 25 | smtp.login(self.content_from, self.content_pwd) 26 | smtp.sendmail(self.content_from, self.content_to, str(msg)) 27 | smtp.quit() 28 | 29 | 30 | def send_email(subject, msg): 31 | e = Email() 32 | e.content_from = 'jjzhu_ncu@163.com' 33 | e.content_to = '767543579@qq.com' 34 | e.content_pwd = 'xxxx' 35 | e.content_subject = 'hello world' 36 | e.content_msg = 'hello word' 37 | e.send_163() 38 | 39 | if __name__ == '__main__': 40 | send_email('', '') 41 | -------------------------------------------------------------------------------- /output/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('/../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /output/all-data-2017_04_27.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/output/all-data-2017_04_27.xls -------------------------------------------------------------------------------- /output/all-data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/output/all-data.xls 
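The three-line preamble in output/__init__.py and output/baidu.py is the standalone Django-ORM bootstrap the whole repository relies on (the README notes every spider stores data through Django's built-in ORM): put the repository root on sys.path, point DJANGO_SETTINGS_MODULE at Jpider.settings, then call django.setup() before importing any model. Below is a minimal sketch of that pattern that resolves the root from __file__ instead of a relative path; note the relative '../Jpider' used in baidu.py resolves correctly only when the checkout is named Jpider and the script is launched from the repository root, so the __file__-based root here is an assumption, not the repository's own code.

# Sketch: standalone Django ORM bootstrap for a script living in output/.
# Assumes the layout shown above: <repo root>/Jpider/settings.py and <repo root>/spiders/models.py.
import os
import sys

import django

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # output/.. -> repo root
sys.path.append(REPO_ROOT)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Jpider.settings')
django.setup()  # configure installed apps before any model import

from spiders.models import BaiKeRank  # must come after django.setup()

for row in BaiKeRank.objects.all().order_by('rank'):
    print(row)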
-------------------------------------------------------------------------------- /output/baidu.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | import django.db.models 5 | sys.path.append('../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() 8 | 9 | from spiders.models import BaiKeRank 10 | 11 | ranks = BaiKeRank.objects.all().order_by('rank') 12 | for r in ranks: 13 | print(r) 14 | 15 | -------------------------------------------------------------------------------- /output/dazhongdianping.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | import django 5 | import django.db.models 6 | sys.path.append('../Jpider') 7 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 8 | django.setup() 9 | 10 | from spiders.models import ShopInfo, ReviewDedail, ShopId 11 | 12 | import xlwt 13 | 14 | 15 | # 'http://www.dianping.com/search/category/2/10/g110', # 北京火锅 16 | # 'http://www.dianping.com/search/category/2/10/g107', # 北京台湾菜 17 | # 'http://www.dianping.com/search/category/2/10/g112', # 北京小吃快餐 18 | # 'http://www.dianping.com/search/category/2/10/g250', # 北京创意菜 19 | # 'http://www.dianping.com/search/category/2/10/g116', # 北京西餐 20 | # 'http://www.dianping.com/search/category/2/10/g113', # 北京日本菜 21 | # 'http://www.dianping.com/search/category/2/10/g103', # 北京粤菜 22 | # 'http://www.dianping.com/search/category/2/10/g115', # 北京东南亚菜 23 | # 'http://www.dianping.com/search/category/2/10/g102', # 北京川菜 24 | # 'http://www.dianping.com/search/category/1/10/g113', # 上海日本菜??? 25 | # 'http://www.dianping.com/search/category/1/10/g110', # 上海火锅 26 | # 'http://www.dianping.com/search/category/1/10/g107', # 上海台湾菜 27 | # 'http://www.dianping.com/search/category/1/10/g103', # 上海粤菜 28 | # 'http://www.dianping.com/search/category/1/10/g102', # 上海川菜 29 | # 'http://www.dianping.com/search/category/1/10/g112', # 上海小吃快餐 30 | # 'http://www.dianping.com/search/category/1/10/g115', # 上海东南亚菜 31 | # 'http://www.dianping.com/search/category/1/10/g116', # 上海西餐 32 | 33 | category_dict = {'g110':'火锅', 'g107':'台湾菜', 'g112':'小吃快餐', 'g250': '创意菜', 34 | 'g116': '西餐', 'g113': '日本菜', 'g103': '粤菜', 'g115': '东南亚菜', 'g102': '川菜'} 35 | 36 | rank_star_dict = { 37 | '五星商户': 5, 38 | '准五星商户':4.5, 39 | '四星商户': 4, 40 | '准四星商户': 3.5, 41 | '三星商户': 3, 42 | '准三星商户': 2.5, 43 | '二星商户': 2, 44 | '准二星商户': 1.5, 45 | '一星商户': 1, 46 | '准一星商户': 0.5, 47 | '该商户暂无星级': 0, 48 | '': '无' 49 | } 50 | 51 | 52 | workbook = xlwt.Workbook() 53 | sheet = workbook.add_sheet('dazongdianping',cell_overwrite_ok=True) 54 | title = ['餐厅id','城市', '餐厅名称', '餐厅地点', '餐厅地址', '餐厅类别', '人均价格', '是否参加营销活动', '营业时间', '点评数量', 55 | '总体评分', '口味评分', '环境评分', '服务评分', '五星', '四星', '三星', '二星', '一星', '第一条评论时间'] 56 | for i in range(len(title)): 57 | sheet.write(0, i, title[i] ) 58 | 59 | shops = ShopInfo.objects.all() 60 | 61 | result_dic = {} 62 | 63 | for j in range(1, len(shops)+1): 64 | shop = shops[j-1] 65 | info_list = [] 66 | info_list.append(str(shop.shop_id)) # id 67 | print(shop.shop_id) 68 | try: 69 | url = ShopId.objects.get(pk=shop.shop_id).from_url 70 | except ShopId.DoesNotExist: 71 | continue 72 | if url is None: 73 | continue 74 | city_no = url.split('/')[-3] 75 | city = '北京' if city_no == '2' else '上海' 76 | info_list.append(city) 77 | category = category_dict[url.split('/')[-1][:4]] 78 | info_list.append(shop.shop_name) 79 | info_list.append(shop.place if shop.place is not None else '') 80 
| info_list.append(shop.address if shop.address is not None else '') 81 | info_list.append(category) 82 | avg_price = shop.avg_price.split(':')[1] 83 | if len(avg_price) != 1: 84 | avg_price = avg_price[:-1] 85 | 86 | info_list.append(avg_price ) 87 | features = shop.feature2.split(';') 88 | print(features) 89 | f_l = [] 90 | for f in features: 91 | if f == 'huo': 92 | print('活动') 93 | f_l.append('活动') 94 | elif f == 'ka': 95 | print('会员卡') 96 | f_l.append('会员卡') 97 | else: 98 | f_l.append(f) 99 | info_list.append(';'.join(f_l)) 100 | f_l.clear() 101 | info_list.append(shop.open_time.replace('\t', ' ').replace('\r','').replace('\n', ';') if shop.open_time is not None else '') 102 | info_list.append(shop.review_count[:-3]) 103 | info_list.append(rank_star_dict[shop.rank_star]) 104 | info_list.append(shop.taste.split(':')[1]) 105 | info_list.append(shop.env.split(':')[1]) 106 | info_list.append(shop.service.split(':')[1]) 107 | 108 | review = ReviewDedail.objects.get(pk=shop.shop_id) 109 | info_list.append(review.star_5) 110 | info_list.append(review.star_4) 111 | info_list.append(review.star_3) 112 | info_list.append(review.star_2) 113 | info_list.append(review.star_1) 114 | if review.first_review_time is not None: 115 | f_r_t = review.first_review_time.split('\xa0')[0] 116 | if len(f_r_t) == 5: 117 | f_r_t = '2017-'+f_r_t 118 | else: 119 | f_r_t = '20'+f_r_t 120 | info_list.append(f_r_t) 121 | else: 122 | info_list.append('') 123 | for i in range(len(info_list)): 124 | if info_list[i] is None: 125 | info_list[i] = ' ' 126 | # 'http://www.dianping.com/search/category/2/10/g110', # 北京火锅 127 | # 'http://www.dianping.com/search/category/2/10/g107', # 北京台湾菜 128 | # 'http://www.dianping.com/search/category/2/10/g112', # 北京小吃快餐 129 | # 'http://www.dianping.com/search/category/2/10/g250', # 北京创意菜 130 | # 'http://www.dianping.com/search/category/2/10/g116', # 北京西餐 131 | # 'http://www.dianping.com/search/category/2/10/g113', # 北京日本菜 132 | # 'http://www.dianping.com/search/category/2/10/g103', # 北京粤菜 133 | # 'http://www.dianping.com/search/category/2/10/g115', # 北京东南亚菜 134 | # 'http://www.dianping.com/search/category/2/10/g102', # 北京川菜 135 | # 'http://www.dianping.com/search/category/1/10/g113', # 上海日本菜??? 
136 | # 'http://www.dianping.com/search/category/1/10/g110', # 上海火锅 137 | # 'http://www.dianping.com/search/category/1/10/g107', # 上海台湾菜 138 | # 'http://www.dianping.com/search/category/1/10/g103', # 上海粤菜 139 | # 'http://www.dianping.com/search/category/1/10/g102', # 上海川菜 140 | # 'http://www.dianping.com/search/category/1/10/g112', # 上海小吃快餐 141 | # 'http://www.dianping.com/search/category/1/10/g115', # 上海东南亚菜 142 | # 'http://www.dianping.com/search/category/1/10/g116', # 上海西餐 143 | li = result_dic.get(city+'_'+category, []) 144 | li.append(info_list.copy()) 145 | result_dic[city+'_'+category] = li 146 | # file = open('/Users/didi/crawler/output/%s_%s.txt' % (city, category), 'a') 147 | # 148 | # file.write('\t'.join([str(i) for i in info_list])+'\n') 149 | # file.close() 150 | # print(info_list) 151 | info_list.clear() 152 | 153 | book = xlwt.Workbook() 154 | for city_cate, infos in result_dic.items(): 155 | sheet = book.add_sheet(city_cate) 156 | for i in range(len(title)): 157 | sheet.write(0, i, title[i]) 158 | for i in range(1, len(infos)): 159 | for j in range(len(infos[i])): 160 | sheet.write(i, j, infos[i][j]) 161 | import datetime 162 | 163 | book.save('./all-data-'+ datetime.datetime.now().strftime('%Y_%m_%d')+'.xls') -------------------------------------------------------------------------------- /spiders/Sina_spider1/Begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute("scrapy crawl sinaSpider".split()) 4 | import requests 5 | requests.post() 6 | # import yaml 7 | # f = open('./Sina_spider1/conf/weibo.yaml') 8 | # print yaml.load(f) -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/__init__.py -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/__init__.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/conf.py -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/conf/weibo.yaml: -------------------------------------------------------------------------------- 1 | accounts: 2 | - user: 'jjzhu_zju@163.com' 3 | password: '***' 4 | users: 5 | - 2210643391 6 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/constant.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | REPOST = 0 # 转发 3 | ORIGINAL = 1 # 原创 4 | LIKE = 2 # 点赞 5 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/cookies.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 
3 | import base64 4 | import requests 5 | import sys 6 | import time 7 | import json 8 | from selenium import webdriver 9 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 10 | from selenium.common.exceptions import NoSuchElementException 11 | import logging 12 | from settings import PROPERTIES 13 | from yumdama import identify 14 | import traceback 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 18 | # 0 代表从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie 19 | # 1 代表从https://weibo.cn/login/获取Cookie 20 | COOKIE_GETWAY = 0 21 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 22 | dcap["phantomjs.page.settings.userAgent"] = ( 23 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 24 | ) 25 | logger = logging.getLogger(__name__) 26 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 27 | 28 | 29 | def getCookie(account, password): 30 | if COOKIE_GETWAY == 0: 31 | return get_cookie_from_login_sina_com_cn(account, password) 32 | elif COOKIE_GETWAY == 1: 33 | return get_cookie_from_weibo_cn(account, password) 34 | else: 35 | logger.error("COOKIE_GETWAY Error!") 36 | 37 | 38 | def get_cookie_from_login_sina_com_cn(account, password): 39 | """ 获取一个账号的Cookie """ 40 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 41 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 42 | postData = { 43 | "entry": "sso", 44 | "gateway": "1", 45 | "from": "null", 46 | "savestate": "30", 47 | "useticket": "0", 48 | "pagerefer": "", 49 | "vsnf": "1", 50 | "su": username, 51 | "service": "sso", 52 | "sp": password, 53 | "sr": "1440*900", 54 | "encoding": "UTF-8", 55 | "cdult": "3", 56 | "domain": "sina.com.cn", 57 | "prelt": "0", 58 | "returntype": "TEXT", 59 | } 60 | session = requests.Session() 61 | r = session.post(loginURL, data=postData) 62 | jsonStr = r.content.decode("gbk") 63 | info = json.loads(jsonStr) 64 | if info["retcode"] == "0": 65 | logger.warning("Get Cookie Success!( Account:%s )" % account) 66 | cookie = session.cookies.get_dict() 67 | print cookie 68 | return json.dumps(cookie) 69 | else: 70 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 71 | return "" 72 | 73 | 74 | def get_cookie_from_weibo_cn(account, password): 75 | """ 获取一个账号的Cookie """ 76 | try: 77 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 78 | browser.get("https://weibo.cn/login/") 79 | time.sleep(1) 80 | 81 | failure = 0 82 | while "微博" in browser.title and failure < 5: 83 | failure += 1 84 | browser.save_screenshot("aa.png") 85 | print browser.w3c 86 | username = browser.find_element_by_id("loginName") 87 | username.clear() 88 | username.send_keys(account) 89 | 90 | psd = browser.find_element_by_xpath('//input[@type="password"]') 91 | psd.clear() 92 | psd.send_keys(password) 93 | try: 94 | code = browser.find_element_by_name("loginVCode") 95 | code.clear() 96 | if IDENTIFY == 1: 97 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 98 | else: 99 | from PIL import Image 100 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 101 | x = img.location["x"] 102 | y = img.location["y"] 103 | im = Image.open("aa.png") 104 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 105 | code_txt = identify() # 验证码打码平台识别 106 | code.send_keys(code_txt) 
107 | except NoSuchElementException, e: 108 | print e 109 | pass 110 | 111 | commit = browser.find_element_by_id("loginAction") 112 | commit.click() 113 | time.sleep(3) 114 | # print browser.title 115 | # print browser.page_source 116 | # if "手机新浪网" not in browser.title: 117 | # time.sleep(4) 118 | # if '未激活微博' in browser.page_source: 119 | # print '账号未开通微博' 120 | # return {} 121 | 122 | cookie = {} 123 | browser.get("https://weibo.cn") 124 | # if "我的首页" in browser.title: 125 | for elem in browser.get_cookies(): 126 | cookie[elem["name"]] = elem["value"] 127 | logger.info("Get Cookie Success!( Account:%s )" % account) 128 | return json.dumps(cookie) 129 | except Exception, e: 130 | logger.warning("Failed %s!" % account) 131 | traceback.print_exc() 132 | return "" 133 | finally: 134 | try: 135 | browser.quit() 136 | except Exception, e: 137 | pass 138 | 139 | 140 | def getCookies(weibo): 141 | """ 获取Cookies """ 142 | cookies = [] 143 | for elem in weibo: 144 | account = elem['user'] 145 | password = elem['password'] 146 | print account, password 147 | cookie = getCookie(account, password) 148 | if cookie is not None and cookie != '': 149 | print '-' * 10 150 | print cookie 151 | print '-' * 10 152 | if isinstance(cookie, str): 153 | cookies.append(eval(cookie)) 154 | elif isinstance(cookie, dict): 155 | cookies.append(cookie) 156 | else: 157 | raise "unsupported type[%s] of cookie[%s]" % (type(cookie), cookie) 158 | 159 | return cookies 160 | cookies = getCookies(PROPERTIES['accounts']) 161 | logger.warning("Get Cookies Finish!( Num:%d)" % len(cookies)) 162 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/cookies.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/cookies.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/items.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | from scrapy import Item, Field 4 | 5 | 6 | class InformationItem(Item): 7 | """ 个人信息 """ 8 | _id = Field() # 用户ID 9 | NickName = Field() # 昵称 10 | Gender = Field() # 性别 11 | Province = Field() # 所在省 12 | City = Field() # 所在城市 13 | Signature = Field() # 个性签名 14 | Birthday = Field() # 生日 15 | Num_Tweets = Field() # 微博数 16 | Num_Follows = Field() # 关注数 17 | Num_Fans = Field() # 粉丝数 18 | Sex_Orientation = Field() # 性取向 19 | Marriage = Field() # 婚姻状况 20 | URL = Field() # 首页链接 21 | 22 | 23 | class FlagItem(Item): 24 | weibo_id = Field() 25 | 26 | 27 | class CommentItem(Item): 28 | weibo_id = Field() 29 | id = Field() # 评论id 30 | user = Field() # 评论用户 31 | content = Field() # 评论内容 32 | source = Field() # 评论来源 33 | time = Field() # 评论发表时间 34 | 35 | def __str__(self): 36 | return self['user'] + "\t"+self['content']+"...\t"+self['time'] 37 | 38 | 39 | class TweetsItem(Item): 40 | """ 微博信息 """ 41 | _id = Field() # 用户ID-微博ID 42 | ID = Field() # 用户ID 43 | Content = Field() # 微博内容 44 | PubTime = Field() # 发表时间 45 | Coordinates = Field() # 定位坐标 46 | Tools = Field() # 发表工具/平台 47 | Like = Field() # 点赞数 48 | Comment = Field() # 评论数 49 | Transfer = Field() # 转载数 50 | Type = Field() # 类型 转发|原创|点赞 51 | 52 | def __str__(self): 53 | return '--------------------------------------------------------------------------\n' \ 54 | '|\t用户\t|\t\t微博\t\t|\t来源\t|\t发布时间\t|\t微博id\t|\n' \ 55 | 
'------------------------------------------------------------------------------\n' \ 56 | '|%s\t|\t%s\t|\t%s\t|\t%s\t|\t%s\t|\n' \ 57 | '------------------------------------------------------------------------------\n'\ 58 | % (self["ID"], self["Content"][:20], self["Tools"] if 'Tools' in self else '', self['PubTime'], self['_id']) 59 | 60 | 61 | 62 | class FollowsItem(Item): 63 | """ 关注人列表 """ 64 | _id = Field() # 用户ID 65 | follows = Field() # 关注 66 | 67 | 68 | class FansItem(Item): 69 | """ 粉丝列表 """ 70 | _id = Field() # 用户ID 71 | fans = Field() # 粉丝 72 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/items.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/middleware.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import random 3 | from cookies import cookies 4 | from user_agents import agents 5 | import logging 6 | import re 7 | 8 | class UserAgentMiddleware(object): 9 | """ 换User-Agent """ 10 | 11 | def process_request(self, request, spider): 12 | agent = random.choice(agents) 13 | request.headers["User-Agent"] = agent 14 | 15 | 16 | class RefererMiddleware(object): 17 | page_pattern = re.compile('https://weibo.cn/(.*?)\?page=(\d+)') 18 | 19 | def process_request(self, request, spider): 20 | 21 | if 'Referer' in request.headers: 22 | page_result = self.page_pattern.findall(request.url) 23 | if len(page_result) == 1: 24 | curr_page = int(page_result[0][1]) - 1 25 | request.headers['Referer'] = 'https://weibo.cn/%s?page=%d' % (page_result[0][0], curr_page) 26 | print request.url 27 | print request.headers 28 | print request.cookies 29 | print request.headers['Referer'] if 'Referer' in request.headers else '' 30 | 31 | 32 | class CookiesMiddleware(object): 33 | """ 换Cookie """ 34 | 35 | def process_request(self, request, spider): 36 | cookie = random.choice(cookies) 37 | logging.info("use cookie %s" % cookie) 38 | request.cookies = cookie 39 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/middleware.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/middleware.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/pipelines.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import os 3 | from items import CommentItem, TweetsItem, FlagItem, FansItem 4 | 5 | 6 | class FilePipeline(object): 7 | FILE_CACHE = {} 8 | 9 | def process_item(self, item, spider): 10 | 11 | if isinstance(item, CommentItem): 12 | path = './' + item['weibo_id'] 13 | if not os.path.exists(path): 14 | os.makedirs(path) 15 | f = open(path + '/' + item['weibo_id'] + '.txt', 'a') 16 | self.FILE_CACHE[item['weibo_id']] = f 17 | f = self.FILE_CACHE[item['weibo_id']] 18 | f.write('%s\t%s\t%s\t%s\n' % ( 19 | item['user'], item['content'], item['source'] if 'source' in item else '', item['time'])) 20 | if isinstance(item, TweetsItem): 21 | path = './' + item['ID'] 22 
| if not os.path.exists(path): 23 | os.makedirs(path) 24 | if item['ID'] not in self.FILE_CACHE: 25 | f = open(path + '/' + item['ID'] + '.txt', 'a') 26 | self.FILE_CACHE[item['ID']] = f 27 | f = self.FILE_CACHE[item['ID']] 28 | f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item['_id'], item['Content'], item['PubTime'], item['Tools'] if 'Tools' in item else '', 29 | item['Comment'], item['Like'], item['Transfer'])) 30 | f.flush() 31 | if isinstance(item, FlagItem): 32 | f = self.FILE_CACHE[item['weibo_id']] 33 | f.close() 34 | del self.FILE_CACHE[item['weibo_id']] 35 | 36 | # class MongoDBPipeline(object): 37 | # def __init__(self): 38 | # clinet = pymongo.MongoClient("localhost", 27017) 39 | # db = clinet["Sina"] 40 | # self.Information = db["Information"] 41 | # self.Tweets = db["Tweets"] 42 | # self.Follows = db["Follows"] 43 | # self.Fans = db["Fans"] 44 | # 45 | # def process_item(self, item, spider): 46 | # """ 判断item的类型,并作相应的处理,再入数据库 """ 47 | # if isinstance(item, InformationItem): 48 | # try: 49 | # self.Information.insert(dict(item)) 50 | # except Exception: 51 | # pass 52 | # elif isinstance(item, TweetsItem): 53 | # try: 54 | # self.Tweets.insert(dict(item)) 55 | # except Exception: 56 | # pass 57 | # elif isinstance(item, FollowsItem): 58 | # followsItems = dict(item) 59 | # follows = followsItems.pop("follows") 60 | # for i in range(len(follows)): 61 | # followsItems[str(i + 1)] = follows[i] 62 | # try: 63 | # self.Follows.insert(followsItems) 64 | # except Exception: 65 | # pass 66 | # elif isinstance(item, FansItem): 67 | # fansItems = dict(item) 68 | # fans = fansItems.pop("fans") 69 | # for i in range(len(fans)): 70 | # fansItems[str(i + 1)] = fans[i] 71 | # try: 72 | # self.Fans.insert(fansItems) 73 | # except Exception: 74 | # pass 75 | # return item 76 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/pipelines.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/settings.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import yaml 3 | import os 4 | import scrapy.core.scraper 5 | print os.path.split(os.path.realpath(__file__))[0] 6 | PROPERTIES = yaml.load(open(os.path.split(os.path.realpath(__file__))[0] + '/conf/weibo.yaml')) 7 | 8 | BOT_NAME = 'Sina_spider1' 9 | 10 | SPIDER_MODULES = ['Sina_spider1.spiders'] 11 | NEWSPIDER_MODULE = 'Sina_spider1.spiders' 12 | # HTTPCACHE_ENABLED = False 13 | DOWNLOADER_MIDDLEWARES = { 14 | "Sina_spider1.middleware.UserAgentMiddleware": 401, 15 | "Sina_spider1.middleware.CookiesMiddleware": 402, 16 | 'Sina_spider1.middleware.RefererMiddleware': 403, 17 | } 18 | 19 | ITEM_PIPELINES = { 20 | 'Sina_spider1.pipelines.FilePipeline': 300, 21 | } 22 | 23 | DOWNLOAD_DELAY = 1 # 间隔时间 24 | # CONCURRENT_ITEMS = 1000 25 | # CONCURRENT_REQUESTS = 100 26 | # REDIRECT_ENABLED = False 27 | # CONCURRENT_REQUESTS_PER_DOMAIN = 100 28 | # CONCURRENT_REQUESTS_PER_IP = 0 29 | # CONCURRENT_REQUESTS_PER_SPIDER=100 30 | # DNSCACHE_ENABLED = True 31 | # LOG_LEVEL = 'INFO' # 日志级别 32 | # CONCURRENT_REQUESTS = 70 33 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/settings.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/spiders/__init__.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/spiders.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/spiders/spiders.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/user_agents.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/user_agents.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/yumdama.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | ###################################################################### 5 | 6 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 7 | # 所有函数请查询 http://www.yundama.com/apidoc 8 | 9 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 10 | # 2. http://www.yundama.com/developer/myapp 添加新软件 11 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 12 | 13 | # 用户名 14 | username = '' 15 | 16 | # 密码 17 | password = '' 18 | 19 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appid = 1 21 | 22 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 
23 | appkey = '22cc5376925e9387a23cf797cb9ba745' 24 | 25 | # 图片文件 26 | filename = 'ab.png' 27 | 28 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 29 | codetype = 1004 30 | 31 | # 超时时间,秒 32 | timeout = 60 33 | 34 | 35 | ###################################################################### 36 | 37 | class YDMHttp: 38 | apiurl = 'http://api.yundama.net:5678/api.php' 39 | 40 | username = '' 41 | password = '' 42 | appid = '' 43 | appkey = '' 44 | 45 | def __init__(self, username, password, appid, appkey): 46 | self.username = username 47 | self.password = password 48 | self.appid = str(appid) 49 | self.appkey = appkey 50 | 51 | def request(self, fields, files=[]): 52 | try: 53 | response = post_url(self.apiurl, fields, files) 54 | response = json.loads(response) 55 | except Exception as e: 56 | response = None 57 | return response 58 | 59 | def balance(self): 60 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 61 | 'appkey': self.appkey} 62 | response = self.request(data) 63 | if (response): 64 | if (response['ret'] and response['ret'] < 0): 65 | return response['ret'] 66 | else: 67 | return response['balance'] 68 | else: 69 | return -9001 70 | 71 | def login(self): 72 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 73 | 'appkey': self.appkey} 74 | response = self.request(data) 75 | if (response): 76 | if (response['ret'] and response['ret'] < 0): 77 | return response['ret'] 78 | else: 79 | return response['uid'] 80 | else: 81 | return -9001 82 | 83 | def upload(self, filename, codetype, timeout): 84 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 85 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 86 | file = {'file': filename} 87 | response = self.request(data, file) 88 | if (response): 89 | if (response['ret'] and response['ret'] < 0): 90 | return response['ret'] 91 | else: 92 | return response['cid'] 93 | else: 94 | return -9001 95 | 96 | def result(self, cid): 97 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 98 | 'appkey': self.appkey, 'cid': str(cid)} 99 | response = self.request(data) 100 | return response and response['text'] or '' 101 | 102 | def decode(self, filename, codetype, timeout): 103 | cid = self.upload(filename, codetype, timeout) 104 | if (cid > 0): 105 | for i in range(0, timeout): 106 | result = self.result(cid) 107 | if (result != ''): 108 | return cid, result 109 | else: 110 | time.sleep(1) 111 | return -3003, '' 112 | else: 113 | return cid, '' 114 | 115 | 116 | ###################################################################### 117 | 118 | def post_url(url, fields, files=[]): 119 | urlparts = urlparse.urlsplit(url) 120 | return post_multipart(urlparts[1], urlparts[2], fields, files) 121 | 122 | 123 | def post_multipart(host, selector, fields, files): 124 | content_type, body = encode_multipart_formdata(fields, files) 125 | h = httplib.HTTP(host) 126 | h.putrequest('POST', selector) 127 | h.putheader('Host', host) 128 | h.putheader('Content-Type', content_type) 129 | h.putheader('Content-Length', str(len(body))) 130 | h.endheaders() 131 | h.send(body) 132 | errcode, errmsg, headers = h.getreply() 133 | return h.file.read() 134 | 135 | 136 | def encode_multipart_formdata(fields, files=[]): 137 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 138 | CRLF = '\r\n' 139 | 
L = [] 140 | for field in fields: 141 | key = field 142 | value = fields[key] 143 | L.append('--' + BOUNDARY) 144 | L.append('Content-Disposition: form-data; name="%s"' % key) 145 | L.append('') 146 | L.append(value) 147 | for field in files: 148 | key = field 149 | filepath = files[key] 150 | L.append('--' + BOUNDARY) 151 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 152 | L.append('Content-Type: %s' % get_content_type(filepath)) 153 | L.append('') 154 | L.append(open(filepath, 'rb').read()) 155 | L.append('--' + BOUNDARY + '--') 156 | L.append('') 157 | body = CRLF.join(L) 158 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 159 | return content_type, body 160 | 161 | 162 | def get_content_type(filename): 163 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 164 | 165 | 166 | ###################################################################### 167 | 168 | 169 | def identify(): 170 | if (username == 'username'): 171 | print '请设置好相关参数再测试' 172 | else: 173 | # 初始化 174 | yundama = YDMHttp(username, password, appid, appkey) 175 | 176 | # 登陆云打码 177 | uid = yundama.login() 178 | # print 'uid: %s' % uid 179 | 180 | # 查询余额 181 | balance = yundama.balance() 182 | # print 'balance: %s' % balance 183 | 184 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 185 | cid, result = yundama.decode(filename, codetype, timeout) 186 | # print 'cid: %s, result: %s' % (cid, result) 187 | return result 188 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/yumdama.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/yumdama.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/aa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/aa.png -------------------------------------------------------------------------------- /spiders/Sina_spider1/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Sina_spider1.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Sina_spider1 12 | -------------------------------------------------------------------------------- /spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/__init__.py -------------------------------------------------------------------------------- /spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/__init__.pyc -------------------------------------------------------------------------------- /spiders/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
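Note: spiders/admin.py above is still the bare `startapp` stub. Purely as a hedged sketch (not part of the repo), the models that the items and spiders elsewhere in this dump import from spiders/models.py could be registered so scraped rows are browsable in the Django admin; the model names below are taken from those imports and nothing else is assumed about spiders/models.py:

```python
# Sketch only: register the spider models with the Django admin.
# BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail and ShopId are the names
# imported from spiders.models elsewhere in this repo; adjust to what that
# file actually defines.
from django.contrib import admin
from spiders.models import BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail, ShopId

for model in (BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail, ShopId):
    admin.site.register(model)
```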
4 | -------------------------------------------------------------------------------- /spiders/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SpidersConfig(AppConfig): 5 | name = 'spiders' 6 | -------------------------------------------------------------------------------- /spiders/apps.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/apps.pyc -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('../../../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from spiders.models import BaiKeRank 10 | from scrapy_djangoitem import DjangoItem 11 | 12 | class BaidurankItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | 18 | class BaiKeRankItem(DjangoItem): 19 | django_model = BaiKeRank -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidurankSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 
50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaidurankPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidurank project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidurank' 13 | 14 | SPIDER_MODULES = ['baidurank.spiders'] 15 | NEWSPIDER_MODULE = 'baidurank.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidurank (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'baidurank.middlewares.BaidurankSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'baidurank.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'baidurank.pipelines.BaidurankPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by 
default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/spiders/rank.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | from ..items import BaiKeRankItem 4 | import datetime 5 | class BaiduRank(scrapy.Spider): 6 | name = 'baidurank' 7 | start_urls = [ 8 | 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek' 9 | ] 10 | url_p = 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d' 11 | max_page = 50 12 | curr_page = 1 13 | curr_time = '' 14 | def start_requests(self): 15 | self.curr_time = datetime.datetime.now() 16 | for url in self.start_urls: 17 | yield self.make_requests_from_url(url) 18 | for pg in range(1, 50): 19 | yield self.make_requests_from_url(self.url_p % pg) 20 | 21 | def parse(self, response): 22 | 23 | rt = json.loads(response.body) 24 | 25 | this_week = rt['data']['thisWeek'] 26 | for record in this_week: 27 | baike_rank = BaiKeRankItem() 28 | baike_rank['rank'] = str(record['rank']) 29 | baike_rank['name'] = record['name'] 30 | baike_rank['ori_score'] = str(record['oriScore']) 31 | baike_rank['rank_time'] = self.curr_time.strftime('%Y-%m-%d %H:%M:%S') 32 | baike_rank.save() 33 | print(str(record['rank'])+'\t'+record['name']+'\t'+str(record['oriScore'])) 34 | 35 | -------------------------------------------------------------------------------- /spiders/baidurank/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidurank.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidurank 12 | -------------------------------------------------------------------------------- /spiders/bilibili/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/bilibili/__init__.py 
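Note: rank.py above builds its page requests with the old `Spider.make_requests_from_url` helper, which is deprecated in later Scrapy releases. A minimal sketch of the same pagination written directly against `scrapy.Request` (URL template, start URL and the `range(1, 50)` cap are copied from rank.py; nothing new is assumed about the API):

```python
# Sketch of rank.py's start_requests without make_requests_from_url.
# Parsing/persistence through BaiKeRankItem is unchanged, so parse() is stubbed.
import scrapy


class BaiduRankSketch(scrapy.Spider):
    name = 'baidurank_sketch'  # hypothetical name; the repo spider is 'baidurank'
    start_urls = [
        'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek'
    ]
    url_p = 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d'

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)
        for pg in range(1, 50):
            yield scrapy.Request(self.url_p % pg, callback=self.parse)

    def parse(self, response):
        # rank.py json-decodes response.body and walks data['thisWeek'].
        pass
```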
-------------------------------------------------------------------------------- /spiders/bilibili/bilibili_spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import sys 4 | import os 5 | import django 6 | 7 | sys.path.append('../../../Jpider') 8 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 9 | django.setup() 10 | from spiders.models import BilibiliMovie 11 | 12 | search_url = 'https://s.search.bilibili.com/cate/search' 13 | tag_url = 'https://api.bilibili.com/x/tag/hots' 14 | curr_page = 1 15 | t_params = dict( 16 | rid=145, 17 | type=0, 18 | jsonp='jsonp' 19 | ) 20 | q_params = dict( 21 | main_ver='v3', 22 | search_type='video', 23 | view_type='hot_rank', 24 | pic_size='160x100', 25 | order='hot', 26 | copy_righ='-1', 27 | cate_id=145, 28 | page=curr_page, 29 | pagesize=20, 30 | keyword='恐怖' 31 | ) 32 | req = requests.get(url=tag_url, params=t_params, verify=False) 33 | req_json = json.loads(req.text) 34 | tags = req_json['data'][0]['tags'] 35 | for tag in tags: 36 | print(tag) 37 | q_params['keyword'] = tag['tag_name'] 38 | req = requests.get(url=search_url, params=q_params, verify=False) 39 | req_json = json.loads(req.text) 40 | pages = req_json['numPages'] 41 | for r in req_json['result']: 42 | movie = BilibiliMovie() 43 | movie.arcurl = r['arcurl'] 44 | movie.author = r['author'] 45 | movie.description = r['description'] 46 | movie.favorites = r['favorites'] 47 | movie.play = r['play'] 48 | movie.video_review = r['video_review'] 49 | movie.tag = r['tag'] 50 | movie.title = r['title'] 51 | movie.id = r['id'] 52 | movie.save() 53 | print(movie) 54 | curr_page += 1 55 | while curr_page <= pages: 56 | q_params['page'] = curr_page 57 | req = requests.get(url=search_url, params=q_params, verify=False) 58 | req_json = json.loads(req.text) 59 | for r in req_json['result']: 60 | movie = BilibiliMovie() 61 | movie.arcurl = r['arcurl'] 62 | movie.author = r['author'] 63 | movie.description = r['description'] 64 | movie.favorites = r['favorites'] 65 | movie.play = r['play'] 66 | movie.video_review = r['video_review'] 67 | movie.tag = r['tag'] 68 | movie.title = r['title'] 69 | movie.id = r['id'] 70 | movie.save() 71 | print(movie) 72 | curr_page += 1 73 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('../../../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from spiders.models import ShopInfo, ReviewDedail, ShopId 10 | from scrapy_djangoitem import DjangoItem 11 | 12 | class DazongdianpingItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | class ShopInfoItem(DjangoItem): 18 | django_model = ShopInfo 19 | 20 | class ReviewDetailItem(DjangoItem): 21 | django_model = ReviewDedail 22 | 23 | class ShopIdItem(DjangoItem): 24 | django_model = ShopId 
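Note: as in the baidurank project, these `DjangoItem` subclasses only point at models in spiders/models.py and the Django ORM does the persistence, which is how dazong_repair.py uses `ReviewDetailItem` (assign by key, then `save()`). A minimal usage sketch; the `shop_id` field name is taken from the raw SQL against `spiders_shopinfo` in dazong_repair.py, and the id value is a placeholder:

```python
# Hedged sketch of persisting through a DjangoItem (scrapy_djangoitem):
# save() builds and saves the underlying Django model instance.
from dazongdianping.items import ShopInfoItem  # defined just above

item = ShopInfoItem()
item['shop_id'] = '12345678'   # placeholder id; field assumed from spiders_shopinfo
shop = item.save()             # returns the saved spiders.models.ShopInfo row
print(shop.shop_id)
```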
-------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from spiders.user_agent import agents 10 | import random 11 | 12 | class DazongdianpingSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class ChangeProxyMiddleware(object): 61 | def process_request(self, request, spider): 62 | 63 | request.headers.setdefault('User-Agent', random.choice(agents)) -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DazongdianpingPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dazongdianping project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dazongdianping' 13 | 14 | SPIDER_MODULES = ['dazongdianping.spiders'] 15 | NEWSPIDER_MODULE = 'dazongdianping.spiders' 16 | 17 | # LOG_FILE = 'dazongdianping.log' 18 | # LOG_LEVEL = 'ERROR' 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'dazongdianping (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 4 32 | 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | # COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | DEFAULT_REQUEST_HEADERS = { 45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 46 | # 'Accept-Encoding': 'gzip,deflate,sdch', 47 | 'Accept-Language': 'zdeprecatedh-CN,zh;q=0.8,en;q=0.6', 48 | 'Host': 'www.dianping.com', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 51 | 52 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 53 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 54 | } 55 | 56 | # Enable or disable spider middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 58 | # SPIDER_MIDDLEWARES = { 59 | # 'dazongdianping.middlewares.ChangeProxyMiddleware': 100, 60 | # } 61 | 62 | # Enable or disable downloader middlewares 63 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 64 | DOWNLOADER_MIDDLEWARES = { 65 | 'dazongdianping.middlewares.ChangeProxyMiddleware': 100, 66 | } 67 | 68 | # Enable or disable extensions 69 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 70 | #EXTENSIONS = { 71 | # 'scrapy.extensions.telnet.TelnetConsole': None, 72 | #} 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | #ITEM_PIPELINES = { 77 | # 'dazongdianping.pipelines.DazongdianpingPipeline': 300, 78 | #} 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by 
default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/spiders/dazong_repair.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from spiders.models import ShopInfo, ReviewDedail 3 | from django.db import connection 4 | import traceback 5 | from ..items import ReviewDetailItem 6 | from spiders.user_agent import agents 7 | import random 8 | 9 | class DazongRepair(scrapy.Spider): 10 | name = 'dazongrepair' 11 | 12 | url_pattern = 'http://www.dianping.com/shop/%s/review_more_newest#start=10' 13 | shop_url_p = 'http://www.dianping.com/shop/%s' 14 | 15 | def start_requests(self): 16 | with connection.cursor() as cursor: 17 | cursor.execute("select shop_id from spiders_shopinfo where shop_id not in (select shop_id from spiders_reviewdedail)") 18 | rows = cursor.fetchall() 19 | for row in rows: 20 | url = self.url_pattern % row[0] 21 | referer = self.shop_url_p % row[0] 22 | header = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 24 | 'Accept-Encoding': 'gzip,deflate,sdch', 25 | 'Accept-Language': 'zdeprecatedh-CN,zh;q=0.8,en;q=0.6', 26 | 'Host': 'www.dianping.com', 27 | 28 | 'User-Agent': random.choice(agents), 29 | 'Referer': referer, 30 | } 31 | 32 | 33 | yield scrapy.Request(url, callback=self.parse, headers=header) 34 | 35 | def parse(self, response): 36 | print(response.url) 37 | review_detail = ReviewDetailItem() 38 | try: 39 | shop_id = response.url.split('/')[-2] 40 | main_body = response.css('div.main') 41 | comment_tab = main_body.css('div.comment-tab span') 42 | cnt = '0' 43 | for c_t in comment_tab: 44 | title = c_t.css('a::text').extract()[0] 45 | if title.strip() == '全部点评': 46 | cnt = c_t.css('em.col-exp::text').extract()[0].strip()[1:-1] 47 | break 48 | if cnt == '0': 49 | review_detail['shop_id'] = shop_id 50 | review_detail['star_all'] = 0 51 | review_detail.save() 52 | self.logger.error('%s - %s: %s' % (response.url, '全部点评', cnt)) 53 | print('%s - %s: %s' % (response.url, '全部点评', cnt)) 54 | return None 55 | 56 | stars = main_body.css('div.comment-mode div.comment-star span em.col-exp::text').extract() 57 | first_review_time = main_body.css('div.comment-mode div.comment-list ul li span.time::text').extract_first().strip() 58 | first_review_content = main_body.css('div.comment-mode div.comment-list div.comment-txt div::text').extract_first().strip() 59 | review_detail['first_review_time'] = first_review_time 60 | review_detail['first_review_content'] = first_review_content 61 | review_detail['star_all'] = stars[0][1:-1] 62 | review_detail['star_5'] = stars[1][1:-1] 63 | review_detail['star_4'] = stars[2][1:-1] 64 | review_detail['star_3'] = stars[3][1:-1] 65 | 
review_detail['star_2'] = stars[4][1:-1] 66 | review_detail['star_1'] = stars[5][1:-1] 67 | review_detail['shop_id'] = shop_id 68 | review_detail.save() 69 | print(shop_id+'\t'+str(stars) + '\t' + first_review_time) 70 | except Exception: 71 | self.logger.error(traceback.format_exc()) 72 | 73 | -------------------------------------------------------------------------------- /spiders/dazongdianping/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dazongdianping.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dazongdianping 12 | -------------------------------------------------------------------------------- /spiders/dist_weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/conf/account.conf: -------------------------------------------------------------------------------- 1 | user password -------------------------------------------------------------------------------- /spiders/dist_weibo/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=DEBUG 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=DEBUG 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=DEBUG 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=DEBUG 31 | formatter=simpleFmt 32 | args=('./log/dist_weibo.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=DEBUG 37 | formatter=simpleFmt 38 | args=('./log/dist_weibo.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/__init__.py: -------------------------------------------------------------------------------- 1 | __dict__=""" 2 | data store module 3 | """ -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/redis_cookies.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import redis 3 | import json 4 | import datetime 5 | from logger import LOGGER 6 | class RedisCookies(object): 7 | redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 8 | 9 | @classmethod 10 | def save_cookies(cls, user_name, unique_id, cookies): 11 | 12 | pickled_cookies = json.dumps({ 13 | 'user_name': user_name, 14 | 'cookies': cookies, 
15 | 'unique_id': unique_id, 16 | 'login_time': datetime.datetime.now().timestamp() 17 | }) 18 | LOGGER.info('save cookie in redis: %s' % str(pickled_cookies)) 19 | r = redis.Redis(connection_pool=cls.redis_pool) 20 | r.hset('account', user_name, pickled_cookies) 21 | cls.user_in_queue(user_name) 22 | 23 | @classmethod 24 | def user_in_queue(cls, user_name): 25 | r = redis.Redis(connection_pool=cls.redis_pool) 26 | 27 | if not r.sismember('users', user_name): 28 | LOGGER.info('user in queue: %s' % user_name) 29 | r.sadd("users", user_name) 30 | else: 31 | LOGGER.info('user already in queue: %s' % user_name) 32 | 33 | @classmethod 34 | def fetch_cookies(cls): 35 | LOGGER.info('get cookies from reids') 36 | r = redis.Redis(connection_pool=cls.redis_pool) 37 | while True: 38 | user = r.spop('users') 39 | r.sadd('users', user) 40 | c = r.hget('account', user) 41 | if c: 42 | user_cookies = c.decode('utf-8') 43 | cookies_json = json.loads(user_cookies) 44 | LOGGER.info('cookies got-------') 45 | return cookies_json 46 | LOGGER.warn('cookies not get') 47 | 48 | @classmethod 49 | def clean(cls): 50 | r = redis.Redis(connection_pool=cls.redis_pool) 51 | r.delete('users') 52 | r.delete('account') 53 | -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/sqlalchemy_session.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy.orm import sessionmaker 4 | Base = declarative_base() 5 | engine = create_engine('mysql+pymysql://root:111111@localhost:3306/dist_weibo?charset=utf8') 6 | DBSession = sessionmaker(bind=engine) 7 | db_session = DBSession() 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /spiders/dist_weibo/headers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | user_agents = [ 5 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 6 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 7 | 'Chrome/57.0.2987.133 Safari/537.36', 8 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) ' 9 | 'Version/10.1 Safari/603.1.30', 10 | 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 11 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5', 12 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0', 13 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89' 14 | ' Safari/537.1 QIHU 360SE', 15 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 ' 16 | '2345Explorer/7.1.0.12633', 17 | 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Ubuntu/10.10 ' 18 | 'Chromium/8.0.552.237 Chrome/8.0.552.237 Safari/534.10', 19 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 ' 20 | 'Chrome/34.0.1847.116 Safari/537.36', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)', 22 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.39 (KHTML, like Gecko) Version/9.0 ' 23 | 'Safari/601.1.39', 24 | 'Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.14', 25 | 
'Opera/9.80 (Linux armv6l ; U; CE-HTML/1.0 NETTV/3.0.1;; en) Presto/2.6.33 Version/10.60', 26 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; baidubrowser 1.x)', 27 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 28 | 'Chrome/58.0.3029.110 Safari/537.36', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 30 | 'Version/5.1 Safari/534.50', 31 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 32 | 'Version/5.1 Safari/534.50', 33 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;', 34 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 35 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 36 | 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11', 37 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11', 38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)', 39 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)', 40 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 41 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 ' 42 | 'Safari/537.36', 43 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36' 44 | ] 45 | 46 | def get_header(): 47 | header = { 48 | 'User-Agent': random.choice(user_agents), 49 | 'Accept-Encoding': 'gzip, deflate', 50 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Connection': 'keep-alive' 53 | } 54 | return header 55 | 56 | 57 | def get_header2(): 58 | header = get_header() 59 | header['Proxy-Connection'] = 'keep-alive' 60 | header['Upgrade-Insecure-Requests'] = 1 61 | header['Host'] = 'weibo.com' 62 | -------------------------------------------------------------------------------- /spiders/dist_weibo/kill_celery.txt: -------------------------------------------------------------------------------- 1 | ps aux| grep celery| awk '{print $2}'|xargs kill -9 -------------------------------------------------------------------------------- /spiders/dist_weibo/logger.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import logging 3 | import logging.config 4 | import os 5 | 6 | def logger_conf(): 7 | """ 8 | load basic logger configure 9 | :return: configured logger 10 | """ 11 | 12 | if platform.system() == 'Windows': 13 | 14 | logging.config.fileConfig(os.path.abspath('.')+'\\conf\\logging.conf') 15 | elif platform.system() == 'Linux': 16 | 17 | logging.config.fileConfig(os.path.abspath('.')+'/conf/logging.conf') 18 | elif platform.system() == 'Darwin': 19 | print(os.path.abspath('../../')) 20 | logging.config.fileConfig(os.path.abspath('') + '/conf/logging.conf') 21 | logger = logging.getLogger('simpleLogger') 22 | 23 | return logger 24 | 25 | LOGGER = logger_conf() -------------------------------------------------------------------------------- /spiders/dist_weibo/login/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /spiders/dist_weibo/login/login.py: 
-------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from dao.redis_cookies import RedisCookies 7 | from headers import headers 8 | def get_session(): 9 | return requests.session() 10 | 11 | 12 | def get_js_exec(path): 13 | phantom = execjs.get('PhantomJS') 14 | with open(path, 'r') as f: 15 | source = f.read() 16 | return phantom.compile(source) 17 | 18 | 19 | def get_encodename(name, js_exec): 20 | return js_exec.call('get_name', name) 21 | 22 | 23 | def get_password(password, pre_obj, exec_js): 24 | nonce = pre_obj['nonce'] 25 | pubkey = pre_obj['pubkey'] 26 | servertime = pre_obj['servertime'] 27 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 28 | 29 | 30 | def get_prelogin_info(prelogin_url, session): 31 | json_pattern = r'.*?\((.*)\)' 32 | response_str = session.get(prelogin_url).text 33 | m = re.match(json_pattern, response_str) 34 | return json.loads(m.group(1)) 35 | 36 | 37 | def get_redirect(data, post_url, session): 38 | logining_page = session.post(post_url, data=data, headers=headers) 39 | login_loop = logining_page.content.decode('GBK') 40 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 41 | return re.findall(pa, login_loop)[0] 42 | 43 | 44 | def do_login(session, url): 45 | return session.get(url).text 46 | 47 | 48 | def login(name, password): 49 | name = '18270916129' 50 | password = 'VS7452014' 51 | json_pattern = r'.*?\((.*)\)' 52 | session = get_session() 53 | exec_js = get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 54 | su = get_encodename(name, exec_js) 55 | print(su) 56 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 57 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 58 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 59 | 60 | pre_obj = get_prelogin_info(prelogin_url, session) 61 | print(pre_obj) 62 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 63 | print(ps) 64 | data = { 65 | 'entry': 'weibo', 66 | 'gateway': '1', 67 | 'from': '', 68 | 'savestate': '7', 69 | 'useticket': '1', 70 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 
71 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 72 | 'vsnf': '1', 73 | 'su': su, 74 | 'service': 'miniblog', 75 | 'servertime': pre_obj['servertime'], 76 | 'nonce': pre_obj['nonce'], 77 | 'pwencode': 'rsa2', 78 | 'rsakv': pre_obj['rsakv'], 79 | 'sp': ps, 80 | 'sr': '1366*768', 81 | 'encoding': 'UTF-8', 82 | 'prelt': '115', 83 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 84 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 85 | 'returntype': 'META', 86 | } 87 | url = get_redirect(data, post_url, session) 88 | print(url) 89 | login_info = do_login(session, url) 90 | print(login_info) 91 | m = re.match(json_pattern, login_info) 92 | info = json.loads(m.group(1)) 93 | print(info) 94 | print(session.cookies.get_dict()) 95 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 96 | cookies=session.cookies.get_dict()) 97 | 98 | return session, info 99 | 100 | # session.get('http://weibo.com/u') 101 | if __name__ == '__main__': 102 | login() 103 | -------------------------------------------------------------------------------- /spiders/dist_weibo/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/model/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/model/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text 2 | from dao.sqlalchemy_session import Base 3 | 4 | 5 | 6 | class Weibo(Base): 7 | __tablename__ = 'weibo' 8 | 9 | id = Column(Integer, primary_key=True, autoincrement=True) 10 | source = Column(String(255)) 11 | date_time = Column(String(128)) 12 | url = Column(String(255)) 13 | content = Column(Text) 14 | 15 | class User(Base): 16 | __tablename__ = 'user' 17 | id = Column(Integer, primary_key=True, autoincrement=True) 18 | user_id = Column(String(128)) 19 | nickname = Column(String(255)) 20 | realname = Column(String(255)) 21 | location = Column(String(255)) 22 | gender = Column(String(4)) 23 | sexual_ori = Column(String(128)) 24 | emotion_state = Column(String(64)) 25 | birthday = Column(String(16)) 26 | blood_type = Column(String(2)) 27 | blog = Column(String(255)) 28 | domain_name = Column(String(255)) 29 | intro = Column(Text) 30 | register_time = Column(String(16)) 31 | email = Column(String(64)) 32 | company = Column(String(128)) 33 | college = Column(String(255)) 34 | high_school = Column(String(255)) 35 | mid_school = Column(String(255)) 36 | tags = Column(String(255)) 37 | 38 | class Relationship(Base): 39 | __tablename__ = 'relationship' 40 | id = Column(Integer, primary_key=True, autoincrement=True) 41 | user_id = Column(String(128)) 42 | fan_id = Column(String(128)) 43 | 44 | 45 | class CrawlInfo(Base): 46 | __tablename__ = 'crawl_info' 47 | id = Column(Integer, primary_key=True, autoincrement=True) 48 | user_id = Column(String(128)) 49 | last_crawl_date = Column(String(20)) 50 | 51 | if __name__ == '__main__': 52 | from dao.sqlalchemy_session import engine 53 | Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /spiders/dist_weibo/notebook/Request.md: -------------------------------------------------------------------------------- 1 | - python3 2 | - execjs 3 | - phantomjs 
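Note: Request.md above lists the runtime pieces the SSO login needs: Python 3, PyExecJS (`execjs`) and a PhantomJS binary that PyExecJS can drive to run js/ssologin.js. A minimal sketch of that call pattern, mirroring login/login.py; the `get_name`/`get_pass` helpers are whatever that JS file defines, and the credential and prelogin values below are placeholders only:

```python
# Sketch of the PyExecJS + PhantomJS pattern used by login/login.py.
# Assumes PhantomJS is on PATH and js/ssologin.js exposes get_name/get_pass.
import execjs

runtime = execjs.get('PhantomJS')
with open('js/ssologin.js', 'r') as f:
    ctx = runtime.compile(f.read())

# In login.py these three values come from the prelogin.php JSONP response;
# the literals here only illustrate the call signatures.
nonce, servertime, pubkey = 'PLACEHOLDER', 1494921600, '10001'
su = ctx.call('get_name', 'user@example.com')                   # encoded username
sp = ctx.call('get_pass', 'secret', nonce, servertime, pubkey)  # rsa2-encrypted password
print(su, sp)
```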
-------------------------------------------------------------------------------- /spiders/dist_weibo/sql/database.sql: -------------------------------------------------------------------------------- 1 | create database dist_weibo default character set utf8mb4 collate utf8mb4_unicode_ci; -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/tasks/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/home_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dao.redis_cookies import RedisCookies 3 | from headers import get_header 4 | import requests 5 | from tasks.workers import app 6 | from bs4 import BeautifulSoup 7 | import re 8 | from model.models import Weibo 9 | from dao.sqlalchemy_session import db_session 10 | 11 | @app.task 12 | def home_page(): 13 | home_url = 'http://weibo.com/u/{}?is_ori=1&is_tag=0&profile_ftype=1&page=1' 14 | 15 | cookies_json = RedisCookies.fetch_cookies() 16 | 17 | 18 | cookies = cookies_json['cookies'] 19 | 20 | unique_id = cookies_json['unique_id'] 21 | 22 | resp = requests.get(url=home_url.format(unique_id), headers=get_header(), cookies=cookies, verify=False).text 23 | 24 | home_html = BeautifulSoup(resp, 'html.parser') 25 | 26 | scripts = home_html.find_all('script') 27 | scripts.reverse() 28 | 29 | view = re.compile('FM.view\((.*)\)') 30 | weibo_html_content = '' 31 | for script in scripts: 32 | result = view.findall(script.string) 33 | if len(result): 34 | 35 | r_json = json.loads(result[0]) 36 | if 'pl.content.homeFeed.index' == r_json['ns']: 37 | weibo_html_content = r_json['html'] 38 | break 39 | weibo_info = [] 40 | 41 | if weibo_html_content != '': 42 | weibo_html = BeautifulSoup(weibo_html_content, 'html.parser') 43 | weibos = weibo_html.find_all('div', 'WB_detail') 44 | 45 | for weibo in weibos: 46 | 47 | source = '' 48 | date = '' 49 | weibo_url = '' 50 | all_a = weibo.find_all('a', attrs={'class': 'S_txt2'}) 51 | weibo_text = weibo.find('div', attrs={'class': 'WB_text'}) 52 | content = weibo_text.text 53 | 54 | 55 | for _a in all_a: 56 | 57 | 58 | if _a.has_attr('date') and _a.has_attr('href'): 59 | 60 | date = _a.get('date') 61 | weibo_url = _a.get('href') 62 | if _a.has_attr('action-type'): 63 | source = _a.text 64 | weibo = Weibo(source=source, url=weibo_url, date_time=date, content=content) 65 | db_session.add(weibo) 66 | db_session.commit() 67 | weibo_info.append('date:%s\tsource:%s\turl:%s' % (date, source, weibo_url)) 68 | return weibo_info 69 | 70 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/login.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from tasks.workers import app 7 | from dao.redis_cookies import RedisCookies 8 | from headers import get_header 9 | from logger import LOGGER 10 | 11 | def get_session(): 12 | return requests.session() 13 | 14 | 15 | def get_js_exec(path): 16 | phantom = execjs.get('PhantomJS') 17 | with open(path, 'r') as f: 18 | source = f.read() 19 | return phantom.compile(source) 20 | 21 | 22 | def get_encodename(name, js_exec): 23 | return 
js_exec.call('get_name', name) 24 | 25 | 26 | def get_password(password, pre_obj, exec_js): 27 | nonce = pre_obj['nonce'] 28 | pubkey = pre_obj['pubkey'] 29 | servertime = pre_obj['servertime'] 30 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 31 | 32 | 33 | def get_prelogin_info(prelogin_url, session): 34 | json_pattern = r'.*?\((.*)\)' 35 | response_str = session.get(prelogin_url).text 36 | m = re.match(json_pattern, response_str) 37 | return json.loads(m.group(1)) 38 | 39 | 40 | def get_redirect(data, post_url, session): 41 | print(data) 42 | print(post_url) 43 | logining_page = session.post(post_url, data=data, headers=get_header()) 44 | print(logining_page) 45 | login_loop = logining_page.content.decode('GBK') 46 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 47 | return re.findall(pa, login_loop)[0] 48 | 49 | 50 | def do_login(session, url): 51 | return session.get(url).text 52 | 53 | @app.task(ignore_result=True) 54 | def clean(): 55 | RedisCookies.clean() 56 | 57 | @app.task(ignore_result=True) 58 | def login(name='', password=''): 59 | json_pattern = r'.*?\((.*)\)' 60 | session = get_session() 61 | exec_js = get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 62 | su = get_encodename(name, exec_js) 63 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 64 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 65 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 66 | 67 | pre_obj = get_prelogin_info(prelogin_url, session) 68 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 69 | data = { 70 | 'entry': 'weibo', 71 | 'gateway': '1', 72 | 'from': '', 73 | 'savestate': '7', 74 | 'useticket': '1', 75 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 
76 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 77 | 'vsnf': '1', 78 | 'su': su, 79 | 'service': 'miniblog', 80 | 'servertime': pre_obj['servertime'], 81 | 'nonce': pre_obj['nonce'], 82 | 'pwencode': 'rsa2', 83 | 'rsakv': pre_obj['rsakv'], 84 | 'sp': ps, 85 | 'sr': '1366*768', 86 | 'encoding': 'UTF-8', 87 | 'prelt': '115', 88 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 89 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 90 | 'returntype': 'META', 91 | } 92 | url = get_redirect(data, post_url, session) 93 | print(url) 94 | login_info = do_login(session, url) 95 | m = re.match(json_pattern, login_info) 96 | info = json.loads(m.group(1)) 97 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 98 | cookies=session.cookies.get_dict()) 99 | 100 | # return session, info 101 | 102 | # session.get('http://weibo.com/u') 103 | if __name__ == '__main__': 104 | login() 105 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/mobile_login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | args = { 4 | 'username': '767543579@qq.com', 5 | 'password': 'JOPPER', 6 | 'savestate': 1, 7 | 'ec': 0, 8 | 'pagerefer': 'https://passport.weibo.cn/signin/' 9 | 'welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F&wm=3349&vt=4', 10 | 'entry': 'mweibo', 11 | 'wentry': '', 12 | 'loginfrom': '', 13 | 'client_id': '', 14 | 'code': '', 15 | 'qq': '', 16 | 'hff': '', 17 | 'hfp': '' 18 | } 19 | 20 | session = requests.session() 21 | session.post('https://passport.weibo.cn/sso/login', data=args) 22 | resp = session.get('https://m.weibo.cn/api/container/getIndex?containerid=2304132210643391_-_WEIBO_SECOND_PROFILE_MORE_WEIBO&page=1') 23 | 24 | print(session.cookies.get_dict()) 25 | print(session.get('http://weibo.com/47452014').text) 26 | print(resp.text) 27 | 28 | 29 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/workers.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import os 3 | from celery import platforms 4 | from celery import Celery 5 | # root权限启动 6 | platforms.C_FORCE_ROOT = True 7 | 8 | get_broker_or_backend = ('redis://:''@127.0.0.1:6379/0', 'redis://:''@127.0.0.1:6379/1') 9 | 10 | worker_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'celery.log') 11 | beat_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'beat.log') 12 | 13 | tasks = ['tasks.login', 'tasks.home_page', 'tasks.user'] 14 | app = Celery('weibo_task', include=tasks, broker=get_broker_or_backend[0], backend=get_broker_or_backend[1]) -------------------------------------------------------------------------------- /spiders/dist_weibo/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/workers.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/conf/account.conf: 
-------------------------------------------------------------------------------- 1 | user password -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/dao/__init__.py: -------------------------------------------------------------------------------- 1 | __dict__=""" 2 | data store module 3 | """ -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/dao/redis_cookies.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import redis 3 | import json 4 | import datetime 5 | 6 | class RedisCookies(object): 7 | redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 8 | 9 | @classmethod 10 | def save_cookies(cls, user_name, unique_id, cookies): 11 | pickled_cookies = json.dumps({ 12 | 'cookies': cookies, 13 | 'unique_id': unique_id, 14 | 'login_time': datetime.datetime.now().timestamp() 15 | }) 16 | r = redis.Redis(connection_pool=cls.redis_pool) 17 | r.hset('account', user_name, pickled_cookies) 18 | cls.user_in_queue(user_name) 19 | 20 | @classmethod 21 | def user_in_queue(cls, user_name): 22 | r = redis.Redis(connection_pool=cls.redis_pool) 23 | if not r.sismember('users', user_name): 24 | r.sadd("users", user_name) 25 | 26 | @classmethod 27 | def fetch_cookies(cls): 28 | r = redis.Redis(connection_pool=cls.redis_pool) 29 | user = r.spop('users') 30 | r.sadd('users', user) 31 | return r.hget('account', user).decode('utf-8') 32 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/headers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | user_agents = [ 5 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 6 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 7 | 'Chrome/57.0.2987.133 Safari/537.36', 8 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) ' 9 | 'Version/10.1 Safari/603.1.30', 10 | 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 11 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5', 12 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0', 13 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89' 14 | ' Safari/537.1 QIHU 360SE', 15 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 ' 16 | '2345Explorer/7.1.0.12633', 17 | 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Ubuntu/10.10 ' 18 | 'Chromium/8.0.552.237 Chrome/8.0.552.237 Safari/534.10', 19 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 ' 20 | 'Chrome/34.0.1847.116 Safari/537.36', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)', 22 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.39 (KHTML, like Gecko) Version/9.0 ' 23 | 'Safari/601.1.39', 24 | 'Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.14', 25 | 'Opera/9.80 (Linux armv6l ; U; CE-HTML/1.0 NETTV/3.0.1;; en) Presto/2.6.33 Version/10.60', 26 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; baidubrowser 1.x)', 27 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 
(KHTML, like Gecko) ' 28 | 'Chrome/58.0.3029.110 Safari/537.36', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 30 | 'Version/5.1 Safari/534.50', 31 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 32 | 'Version/5.1 Safari/534.50', 33 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;', 34 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 35 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 36 | 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11', 37 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11', 38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)', 39 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)', 40 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 41 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 ' 42 | 'Safari/537.36', 43 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36' 44 | ] 45 | 46 | 47 | headers = { 48 | 'User-Agent': random.choice(user_agents), 49 | 'Accept-Encoding': 'gzip, deflate, sdch', 50 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Connection': 'keep-alive' 53 | } 54 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/login/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/login/login.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from spiders.dist_weibo_spider.dao.redis_cookies import RedisCookies 7 | from spiders.dist_weibo_spider.headers import headers 8 | def get_session(): 9 | return requests.session() 10 | 11 | 12 | def get_js_exec(path): 13 | phantom = execjs.get('PhantomJS') 14 | with open(path, 'r') as f: 15 | source = f.read() 16 | return phantom.compile(source) 17 | 18 | 19 | def get_encodename(name, js_exec): 20 | return js_exec.call('get_name', name) 21 | 22 | 23 | def get_password(password, pre_obj, exec_js): 24 | nonce = pre_obj['nonce'] 25 | pubkey = pre_obj['pubkey'] 26 | servertime = pre_obj['servertime'] 27 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 28 | 29 | 30 | def get_prelogin_info(prelogin_url, session): 31 | json_pattern = r'.*?\((.*)\)' 32 | response_str = session.get(prelogin_url).text 33 | m = re.match(json_pattern, response_str) 34 | return json.loads(m.group(1)) 35 | 36 | 37 | def get_redirect(data, post_url, session): 38 | logining_page = session.post(post_url, data=data, headers=headers) 39 | login_loop = logining_page.content.decode('GBK') 40 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 41 | return re.findall(pa, login_loop)[0] 42 | 43 | 44 | def do_login(session, url): 45 | return session.get(url).text 46 | 47 | 48 | def login(): 49 | name = '18270916129' 50 | password = 'VS7452014' 51 | json_pattern = r'.*?\((.*)\)' 52 | session = get_session() 53 | exec_js = 
get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 54 | su = get_encodename(name, exec_js) 55 | print(su) 56 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 57 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 58 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 59 | 60 | pre_obj = get_prelogin_info(prelogin_url, session) 61 | print(pre_obj) 62 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 63 | print(ps) 64 | data = { 65 | 'entry': 'weibo', 66 | 'gateway': '1', 67 | 'from': '', 68 | 'savestate': '7', 69 | 'useticket': '1', 70 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 71 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 72 | 'vsnf': '1', 73 | 'su': su, 74 | 'service': 'miniblog', 75 | 'servertime': pre_obj['servertime'], 76 | 'nonce': pre_obj['nonce'], 77 | 'pwencode': 'rsa2', 78 | 'rsakv': pre_obj['rsakv'], 79 | 'sp': ps, 80 | 'sr': '1366*768', 81 | 'encoding': 'UTF-8', 82 | 'prelt': '115', 83 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 84 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 85 | 'returntype': 'META', 86 | } 87 | url = get_redirect(data, post_url, session) 88 | print(url) 89 | login_info = do_login(session, url) 90 | print(login_info) 91 | m = re.match(json_pattern, login_info) 92 | info = json.loads(m.group(1)) 93 | print(info) 94 | print(session.cookies.get_dict()) 95 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 96 | cookies=session.cookies.get_dict()) 97 | 98 | return session, info 99 | 100 | # session.get('http://weibo.com/u') 101 | if __name__ == '__main__': 102 | login() 103 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/notebook/Request.md: -------------------------------------------------------------------------------- 1 | - python3 2 | - execjs 3 | - phantomjs -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/tasks/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/home_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | from spiders.dist_weibo_spider.dao.redis_cookies import RedisCookies 3 | from spiders.dist_weibo_spider.headers import headers 4 | import requests 5 | 6 | home_url = 'http://weibo.com/u/{}?is_ori=1&is_tag=0&profile_ftype=1&page=1' 7 | 8 | user_cookies = RedisCookies.fetch_cookies() 9 | cookies_json = json.loads(user_cookies) 10 | 11 | cookies = cookies_json['cookies'] 12 | print(cookies) 13 | unique_id = cookies_json['unique_id'] 14 | print(home_url.format(unique_id)) 15 | resp = requests.get(url=home_url.format(unique_id), headers=headers, cookies=cookies, verify=False).text 16 | print(resp) 17 | 18 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/workers.py: -------------------------------------------------------------------------------- 1 | # coding:urf-8 2 | import os 3 | from celery import platforms 4 | 5 | # root权限启动 6 | platforms.C_FORCE_ROOT = True 7 | 8 | 
worker_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'celery.log') 9 | beat_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'beat.log') 10 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/workers.py -------------------------------------------------------------------------------- /spiders/distributed/README.md: -------------------------------------------------------------------------------- 1 | ## 分布式爬虫 -------------------------------------------------------------------------------- /spiders/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/celeryt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/celeryt/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/celeryt/celerybeat-schedule: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/celeryt/celerybeat-schedule -------------------------------------------------------------------------------- /spiders/distributed/celeryt/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | from celery import Celery 3 | 4 | app = Celery('tasks', broker='redist://'':''@127.0.0.1:6379/0', backend='redist://'':''@127.0.0.1:6379/1') 5 | 6 | app.conf.update( 7 | # 配置所在时区 8 | CELERY_TIMEZONE='Asia/Shanghai', 9 | CELERY_ENABLE_UTC=True, 10 | # 官网推荐消息序列化方式为json 11 | CELERY_ACCEPT_CONTENT=['json'], 12 | CELERY_TASK_SERIALIZER='json', 13 | CELERY_RESULT_SERIALIZER='json', 14 | # 配置定时任务 15 | CELERYBEAT_SCHEDULE={ 16 | 'my_task': { 17 | 'task': 'tasks.add', # tasks.py模块下的add方法 18 | 'schedule': 1, # 每隔60运行一次 19 | 'args': (23, 12), 20 | } 21 | } 22 | ) 23 | 24 | 25 | @app.task 26 | def add(x, y): 27 | return x + y 28 | 29 | 30 | @app.task 31 | def sub(x, y): 32 | return x - y -------------------------------------------------------------------------------- /spiders/distributed/celeryt/test.py: -------------------------------------------------------------------------------- 1 | from tasks import add, sub 2 | 3 | add_rs = add.delay(1, 2) 4 | sub_rs = sub.delay(3, 1) 5 | print add_rs.get() 6 | print sub_rs.get() 7 | -------------------------------------------------------------------------------- /spiders/distributed/redist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/redist/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/redist/test_redis.py: -------------------------------------------------------------------------------- 1 
| import redis 2 | r = redis.StrictRedis(host='localhost', port=6379, db=0) 3 | print(r.get('name')) 4 | -------------------------------------------------------------------------------- /spiders/distributed/task_dispatcher.py: -------------------------------------------------------------------------------- 1 | from workers import app 2 | 3 | crawl_urls = [ 4 | 'http://docs.celeryproject.org/en/latest/getting-started/introduction.html', 5 | 'http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html', 6 | 'http://docs.celeryproject.org/en/latest/getting-started/first-steps-with-celery.html', 7 | 'http://docs.celeryproject.org/en/latest/getting-started/next-steps.html', 8 | 'http://docs.celeryproject.org/en/latest/getting-started/resources.html', 9 | 'http://docs.celeryproject.org/en/latest/userguide/application.html', 10 | 'http://docs.celeryproject.org/en/latest/userguide/tasks.html', 11 | 'http://docs.celeryproject.org/en/latest/userguide/canvas.html', 12 | 'http://docs.celeryproject.org/en/latest/userguide/workers.html', 13 | 'http://docs.celeryproject.org/en/latest/userguide/daemonizing.html', 14 | 'http://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html' 15 | ] 16 | 17 | 18 | def manage_crawl_task(urls): 19 | for url in urls: 20 | app.send_task('tasks.crawl', args=(url,)) 21 | 22 | if __name__ == '__main__': 23 | manage_crawl_task(crawl_urls) 24 | -------------------------------------------------------------------------------- /spiders/distributed/tasks.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from workers import app 4 | 5 | 6 | @app.task 7 | def crawl(url): 8 | print 'crawl url:{}'.format(url) 9 | rsp_test = requests.get(url).text 10 | soup = BeautifulSoup(rsp_test, 'html.parser') 11 | return soup.find('h1').text 12 | 13 | -------------------------------------------------------------------------------- /spiders/distributed/workers.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | redis_host = '127.0.0.1' 4 | redis_port = 6379 5 | app = Celery('crawl_task', include=['tasks'], broker='redis://%s:%d/1' % (redis_host, redis_port), 6 | backend='redis://%s:%d/2' % (redis_host, redis_port)) 7 | app.conf.update( 8 | CELERY_TIMEZONE='Asia/Shanghai', 9 | CELERY_ENABLE_UTC=True, 10 | CELERY_ACCEPT_CONTENT=['json'], 11 | CELERY_TASK_SERIALIZER='json', 12 | CELERY_RESULT_SERIALIZER='json', 13 | ) 14 | -------------------------------------------------------------------------------- /spiders/logger.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import logging 3 | import logging.config 4 | import os 5 | 6 | def logger_conf(): 7 | """ 8 | load basic logger configure 9 | :return: configured logger 10 | """ 11 | 12 | if platform.system() == 'Windows': 13 | 14 | logging.config.fileConfig(os.path.abspath('../../')+'\\conf\\logging.conf') 15 | elif platform.system() == 'Linux': 16 | 17 | logging.config.fileConfig(os.path.abspath('../../')+'/conf/logging.conf') 18 | elif platform.system() == 'Darwin': 19 | print(os.path.abspath('../../')) 20 | logging.config.fileConfig(os.path.abspath('../../') + '/conf/logging.conf') 21 | logger = logging.getLogger('simpleLogger') 22 | 23 | return logger 24 | 25 | LOGGER = logger_conf() -------------------------------------------------------------------------------- /spiders/models.py: 
-------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.utils.translation import ugettext_lazy as _ 3 | import datetime 4 | # Create your models here. 5 | 6 | 7 | class ShopId(models.Model): 8 | shop_id = models.CharField(max_length=20, primary_key=True) 9 | from_url = models.CharField(max_length=200, null=True) 10 | 11 | 12 | class BaiKeRank(models.Model): 13 | rank = models.IntegerField(null=True) 14 | name = models.CharField(max_length=50, null=True) 15 | ori_score = models.CharField(max_length=50, null=True) 16 | rank_time = models.CharField(max_length=20) 17 | 18 | def __str__(self): 19 | return str(self.rank) +'\t' + self.name + '\t' + self.ori_score 20 | 21 | 22 | class ShopInfo(models.Model): 23 | shop_id = models.CharField(max_length=20, primary_key=True) 24 | shop_name = models.CharField(max_length=200, default='') 25 | review_count = models.CharField(max_length=20, default='') 26 | avg_price = models.CharField(max_length=20, default='') 27 | taste = models.CharField(max_length=10, default='') 28 | env = models.CharField(max_length=10, default='') 29 | service = models.CharField(max_length=10, default='') 30 | address = models.CharField(max_length=200, default='') 31 | open_time = models.CharField(max_length=200, default='') 32 | rank_star = models.CharField(max_length=20, default='') 33 | place = models.CharField(max_length=20, default='') 34 | classify = models.CharField(max_length=20, default='') 35 | star_all = models.CharField(max_length=20, default='') 36 | star_5 = models.CharField(max_length=20, default='') 37 | star_4 = models.CharField(max_length=20, default='') 38 | star_3 = models.CharField(max_length=20, default='') 39 | star_2 = models.CharField(max_length=20, default='') 40 | star_1 = models.CharField(max_length=20, default='') 41 | feature = models.BooleanField(default=False) 42 | feature2 = models.CharField(max_length=200, default='') 43 | 44 | 45 | class ReviewDedail(models.Model): 46 | shop_id = models.CharField(max_length=20, primary_key=True) 47 | star_all = models.CharField(max_length=20, null=True) 48 | star_5 = models.CharField(max_length=20, null=True) 49 | star_4 = models.CharField(max_length=20, null=True) 50 | star_3 = models.CharField(max_length=20, null=True) 51 | star_2 = models.CharField(max_length=20, null=True) 52 | star_1 = models.CharField(max_length=20, null=True) 53 | first_review_time = models.CharField(max_length=100, null=True) 54 | first_review_content = models.TextField(null=True) 55 | 56 | 57 | class WeiboUser(models.Model): 58 | GENDER = ('m', 'f', 'u') 59 | id = models.CharField(max_length=12, primary_key=True) # 用户id 60 | profile_url = models.CharField(max_length=400, null=True) # url 61 | description = models.CharField(max_length=1000, null=True) # 简介 62 | created_at = models.CharField(max_length=100, null=True) # 创建时间 63 | screen_name = models.CharField(max_length=100, null=True) # 昵称 64 | nativePlace = models.CharField(max_length=10, null=True) # 所在地 65 | mblogNum = models.CharField(max_length=20, null=True) # 微博数 66 | attNum = models.CharField(max_length=20, null=True) # 关注数 67 | fansNum = models.CharField(max_length=20, null=True) # 粉丝数 68 | gender = models.CharField(max_length=10, null=True) # 性别 69 | school = models.CharField(max_length=100, null=True) 70 | def __str__(self): 71 | return '\n\t'+'user: ' + self.screen_name + '\n\t'+'id: '+str(self.id)+'\n\t'\ 72 | + '昵称:'+self.screen_name + '\n\t'+'微博数:'+str(self.mblogNum)+'\n\t'+'关注:'+str(self.attNum) 73 
| 74 | 75 | class UserRelationship(models.Model): 76 | user = models.ForeignKey(WeiboUser, related_name='user') 77 | follower = models.ForeignKey(WeiboUser, related_name='follower') 78 | 79 | class Meta: 80 | unique_together = ('user', 'follower') 81 | primary = ('user', 'follower') 82 | 83 | def __str__(self): 84 | return self.follower.screen_name +'--->'+self.user.screen_name 85 | 86 | 87 | class Weibo(models.Model): 88 | id = models.CharField(max_length=20, primary_key=True) 89 | user = models.ForeignKey(WeiboUser) 90 | text = models.TextField(null=False) 91 | created_timestamp = models.CharField(max_length=20, null=True) 92 | retweented_status = models.ForeignKey('self', null=True) 93 | source = models.CharField(max_length=200, null=True) 94 | 95 | def __str__(self): 96 | return '\n\t'+'user:'+self.user.screen_name+'\n\t'+'blog_id:'+self.id 97 | 98 | 99 | class Comment(models.Model): 100 | name = models.CharField(_('name'), max_length=64) 101 | email_address = models.EmailField(_('email address')) 102 | homepage = models.URLField(_('home page'), blank=True) 103 | comment = models.TextField(_('comment')) 104 | pub_date = models.DateTimeField(_('Published date'), editable=False, auto_now_add=True) 105 | is_spam = models.BooleanField(_('spam?'), default=False, editable=False) 106 | 107 | class Meta: 108 | verbose_name = _('comment') 109 | verbose_name_plural = _('comment') 110 | 111 | 112 | class Step(models.Model): 113 | steps = models.IntegerField() 114 | curr_time = models.DateTimeField(default=datetime.datetime.now()) 115 | 116 | def __str__(self): 117 | return '{steps:%d, time:%s}' % (self.steps, self.curr_time.strftime('%Y-%m-%d %H:%M:%S')) 118 | 119 | 120 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/mzi/mzi/__init__.py -------------------------------------------------------------------------------- /spiders/mzi/mzi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MziItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MziSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 
26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class MziPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mzi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mzi' 13 | 14 | SPIDER_MODULES = ['mzi.spiders'] 15 | NEWSPIDER_MODULE = 'mzi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'mzi (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': '*/*', 44 | # 'Accept-Encoding': 'gzip,deflate,sdch', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 46 | 'Host': 'baike.baidu.com', 47 | 'Proxy-Connection': 'keep-alive', 48 | 'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 49 | 50 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 51 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 52 | } 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'mzi.middlewares.MziSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'mzi.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'mzi.pipelines.MziPipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | 
#HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/baikerank.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | class BaikeRank(scrapy.Spider): 4 | name = 'baikerank' 5 | start_urls = [ 6 | 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek' 7 | ] 8 | 9 | def parse(self, response): 10 | rt = json.loads(response.body()) 11 | print('_'*50) 12 | print(rt) -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/meizi.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import os 4 | import urllib.request 5 | 6 | class Meizi(scrapy.Spider): 7 | name = 'mzi' 8 | # allowed_domains = ['http://www.mzitu.com/'] 9 | start_urls = [ 10 | 'http://www.mzitu.com/' 11 | ] 12 | href_pattern = re.compile('(.*?)') 13 | base_dir = '/Users/didi/crawler/mzi/' 14 | 15 | def parse(self, response): 16 | 17 | header_hrefs = response.css('ul.menu li a::attr(href)').extract() 18 | for ref in header_hrefs: 19 | print(ref) 20 | yield scrapy.Request(ref, callback=self.parse_classify) 21 | 22 | def parse_classify(self, response): 23 | pic_hrefs = response.css('div.postlist ul li a::attr(href)').extract() 24 | for href in pic_hrefs: 25 | yield scrapy.Request(url=href, callback=self.parse_detail) 26 | 27 | max_page = int(response.css('a.page-numbers::text').extract()[-2]) 28 | base_url = (response.url if response.url.endswith('/') else response.url+'/') + 'page/' 29 | for pn in range(max_page+1): 30 | yield scrapy.Request(url=base_url+str(pn), callback=self.parse_classify) 31 | 32 | def parse_detail(self, response): 33 | title = response.css('div.main-image img::attr(alt)').extract()[0] 34 | if not os.path.exists(self.base_dir+title): 35 | os.mkdir(self.base_dir+title) 36 | 37 | img_src = response.css('div.main-image img::attr(src)').extract()[0] 38 | img_path = self.base_dir+title+'/'+img_src[img_src.rindex('/')+1:] 39 | urllib.request.urlretrieve(img_src, img_path) 40 | max_page = int(response.css('div.pagenavi a span::text').extract()[-2]) 41 | 42 | base_url = response.url if response.url.endswith('/') else response.url+'/' 43 | for pn in range(max_page+1): 44 | yield scrapy.Request(base_url+str(pn), callback=self.parse_detail) 45 | 46 | print(img_path) 47 | 48 | -------------------------------------------------------------------------------- /spiders/mzi/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mzi.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mzi 12 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class OnepieceItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class OnepieceSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class OnepiecePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for onepiece project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'onepiece' 13 | 14 | SPIDER_MODULES = ['onepiece.spiders'] 15 | NEWSPIDER_MODULE = 'onepiece.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'onepiece (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'onepiece.middlewares.OnepieceSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'onepiece.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'onepiece.pipelines.OnepiecePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/spiders/one_piece.py: 
-------------------------------------------------------------------------------- 1 | import scrapy 2 | import urllib.request 3 | 4 | class OnePiece(scrapy.Spider): 5 | name = 'onepiece' 6 | start_urls = [ 7 | 'http://www.dlkoo.com/down/3/2015/368456186.html', 8 | ] 9 | base_dir = '/Users/didi/crawler/onepiece/' 10 | def parse(self, response): 11 | url = response.url 12 | link_list = response.xpath("//div[@id='dlinklist']").css("a::attr(href)").extract() 13 | torrent_url = 'http://www.dlkoo.com/down/downfile.asp?act=subb&n=%s/downfile2.asp?act=down&n=%s' 14 | 15 | for link in link_list: 16 | id = link.split("=")[1] 17 | 18 | urllib.request.urlretrieve(torrent_url % (id, id)) 19 | 20 | pass 21 | -------------------------------------------------------------------------------- /spiders/onepiece/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = onepiece.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = onepiece 12 | -------------------------------------------------------------------------------- /spiders/pangxieyg/README.MD: -------------------------------------------------------------------------------- 1 | ## 螃蟹云购 2 | 监控商品有无货,有货时提醒 -------------------------------------------------------------------------------- /spiders/pangxieyg/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/README.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'pyinstaller -F pangxie2.py' -------------------------------------------------------------------------------- /spiders/pangxieyg/alert-templates.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 28 | 121 |
[alert-templates.xml, HTML body of the alert notification template (markup inside the CDATA section was stripped in this dump). The surviving Velocity directives show a "Services Reporting Alerts" summary that loops over $alertStates and prints $summary.getServicesByAlertState($alertState), followed by a per-$service section that loops over $summary.getAlerts($service,$alertState) and renders $alert.getAlertDefinition().getLabel() and $alert.getAlertText(); the closing ]]> and the remaining template elements were likewise stripped.]
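The pangxieyg README above describes the tool as a stock monitor for 螃蟹云购: it polls the listed goods and sends a reminder when an item comes back in stock, driven by the conf/info.conf shown further below (timeout, sleep interval, goods ids, mail recipients, thread count). A minimal sketch of that polling loop, assuming hypothetical check_in_stock() and notify() helpers in place of the project's own HTTP and mail code:

# Minimal polling-loop sketch for the pangxieyg stock monitor.
# check_in_stock() and notify() are hypothetical stand-ins for the
# project's real request and mail helpers; the conf keys follow conf/info.conf.
import configparser
import time


def check_in_stock(goods_id):
    """Placeholder: query the shop page for goods_id and return True if in stock."""
    return False


def notify(recipients, goods_id):
    """Placeholder: send an in-stock reminder to the configured mail addresses."""
    print('notify %s: goods %s is in stock' % (recipients, goods_id))


def monitor(conf_path='conf/info.conf'):
    conf = configparser.ConfigParser()
    conf.read(conf_path)
    goods = conf.get('user', 'goods').split()       # e.g. "109121 111592 121557 107494"
    recipients = conf.get('mail', 'to').split()
    sleep_sec = conf.getint('main', 'sleep')
    while True:
        for goods_id in goods:
            if check_in_stock(goods_id):
                notify(recipients, goods_id)
        time.sleep(sleep_sec)


if __name__ == '__main__':
    monitor()

In the real module the result of each check is also appended to a dated file under record/, as the record/*.txt excerpts further below show.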
-------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-EXE.toc: -------------------------------------------------------------------------------- 1 | ('E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\dist\\pangxie2.exe', 2 | True, 3 | False, 4 | False, 5 | None, 6 | None, 7 | False, 8 | False, 9 | u'', 10 | True, 11 | 'pangxie2.pkg', 12 | [('out00-PYZ.pyz', 13 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PYZ.pyz', 14 | 'PYZ'), 15 | ('struct', 'd:\\python27\\lib\\struct.pyc', 'PYMODULE'), 16 | ('pyimod01_os_path', 17 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod01_os_path.pyc', 18 | 'PYMODULE'), 19 | ('pyimod02_archive', 20 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod02_archive.pyc', 21 | 'PYMODULE'), 22 | ('pyimod03_importers', 23 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod03_importers.pyc', 24 | 'PYMODULE'), 25 | ('pyiboot01_bootstrap', 26 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyiboot01_bootstrap.py', 27 | 'PYSOURCE'), 28 | ('pangxie2', 29 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\pangxie2.py', 30 | 'PYSOURCE'), 31 | (u'Microsoft.VC90.CRT.manifest', 32 | u'C:\\windows\\WinSxS\\Manifests\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57.manifest', 33 | 'BINARY'), 34 | (u'msvcr90.dll', 35 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcr90.dll', 36 | 'BINARY'), 37 | (u'msvcp90.dll', 38 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcp90.dll', 39 | 'BINARY'), 40 | (u'msvcm90.dll', 41 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcm90.dll', 42 | 'BINARY'), 43 | ('python27.dll', 'C:\\windows\\system32\\python27.dll', 'BINARY'), 44 | ('unicodedata', 'd:\\python27\\DLLs\\unicodedata.pyd', 'EXTENSION'), 45 | ('bz2', 'd:\\python27\\DLLs\\bz2.pyd', 'EXTENSION'), 46 | ('_hashlib', 'd:\\python27\\DLLs\\_hashlib.pyd', 'EXTENSION'), 47 | ('_ssl', 'd:\\python27\\DLLs\\_ssl.pyd', 'EXTENSION'), 48 | ('_socket', 'd:\\python27\\DLLs\\_socket.pyd', 'EXTENSION'), 49 | ('select', 'd:\\python27\\DLLs\\select.pyd', 'EXTENSION'), 50 | ('pyexpat', 'd:\\python27\\DLLs\\pyexpat.pyd', 'EXTENSION'), 51 | ('_ctypes', 'd:\\python27\\DLLs\\_ctypes.pyd', 'EXTENSION'), 52 | ('win32pipe', 53 | 'd:\\python27\\lib\\site-packages\\win32\\win32pipe.pyd', 54 | 'EXTENSION'), 55 | ('_multiprocessing', 56 | 'd:\\python27\\DLLs\\_multiprocessing.pyd', 57 | 'EXTENSION'), 58 | ('win32evtlog', 59 | 'd:\\python27\\lib\\site-packages\\win32\\win32evtlog.pyd', 60 | 'EXTENSION'), 61 | ('win32api', 62 | 'd:\\python27\\lib\\site-packages\\win32\\win32api.pyd', 63 | 'EXTENSION'), 64 | ('_portaudio', 65 | 'd:\\python27\\lib\\site-packages\\_portaudio.pyd', 66 | 'EXTENSION'), 67 | ('win32wnet', 68 | 'd:\\python27\\lib\\site-packages\\win32\\win32wnet.pyd', 69 | 'EXTENSION'), 70 | ('pywintypes27.dll', 71 | 'd:\\python27\\lib\\site-packages\\pywin32_system32\\pywintypes27.dll', 72 | 'BINARY'), 73 | ('certifi\\cacert.pem', 74 | 'd:\\python27\\lib\\site-packages\\certifi\\cacert.pem', 75 | 'DATA'), 76 | ('certifi\\old_root.pem', 77 | 'd:\\python27\\lib\\site-packages\\certifi\\old_root.pem', 78 | 'DATA'), 79 | ('Include\\pyconfig.h', 'd:\\python27\\Include\\pyconfig.h', 'DATA'), 80 | ('certifi\\weak.pem', 81 | 
'd:\\python27\\lib\\site-packages\\certifi\\weak.pem', 82 | 'DATA'), 83 | ('pangxie2.exe.manifest', 84 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\pangxie2.exe.manifest', 85 | 'BINARY'), 86 | ('pyi-windows-manifest-filename pangxie2.exe.manifest', '', 'OPTION')], 87 | [], 88 | False, 89 | False, 90 | 1497602441L, 91 | [('run.exe', 92 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\bootloader\\Windows-32bit\\run.exe', 93 | 'EXECUTABLE')]) 94 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PKG.pkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/build/pangxie2/out00-PKG.pkg -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PKG.toc: -------------------------------------------------------------------------------- 1 | ('E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PKG.pkg', 2 | {'BINARY': 1, 3 | 'DATA': 1, 4 | 'EXECUTABLE': 1, 5 | 'EXTENSION': 1, 6 | 'PYMODULE': 1, 7 | 'PYSOURCE': 1, 8 | 'PYZ': 0}, 9 | [('out00-PYZ.pyz', 10 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PYZ.pyz', 11 | 'PYZ'), 12 | ('struct', 'd:\\python27\\lib\\struct.pyc', 'PYMODULE'), 13 | ('pyimod01_os_path', 14 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod01_os_path.pyc', 15 | 'PYMODULE'), 16 | ('pyimod02_archive', 17 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod02_archive.pyc', 18 | 'PYMODULE'), 19 | ('pyimod03_importers', 20 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod03_importers.pyc', 21 | 'PYMODULE'), 22 | ('pyiboot01_bootstrap', 23 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyiboot01_bootstrap.py', 24 | 'PYSOURCE'), 25 | ('pangxie2', 26 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\pangxie2.py', 27 | 'PYSOURCE'), 28 | (u'Microsoft.VC90.CRT.manifest', 29 | u'C:\\windows\\WinSxS\\Manifests\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57.manifest', 30 | 'BINARY'), 31 | (u'msvcr90.dll', 32 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcr90.dll', 33 | 'BINARY'), 34 | (u'msvcp90.dll', 35 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcp90.dll', 36 | 'BINARY'), 37 | (u'msvcm90.dll', 38 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcm90.dll', 39 | 'BINARY'), 40 | ('python27.dll', 'C:\\windows\\system32\\python27.dll', 'BINARY'), 41 | ('unicodedata', 'd:\\python27\\DLLs\\unicodedata.pyd', 'EXTENSION'), 42 | ('bz2', 'd:\\python27\\DLLs\\bz2.pyd', 'EXTENSION'), 43 | ('_hashlib', 'd:\\python27\\DLLs\\_hashlib.pyd', 'EXTENSION'), 44 | ('_ssl', 'd:\\python27\\DLLs\\_ssl.pyd', 'EXTENSION'), 45 | ('_socket', 'd:\\python27\\DLLs\\_socket.pyd', 'EXTENSION'), 46 | ('select', 'd:\\python27\\DLLs\\select.pyd', 'EXTENSION'), 47 | ('pyexpat', 'd:\\python27\\DLLs\\pyexpat.pyd', 'EXTENSION'), 48 | ('_ctypes', 'd:\\python27\\DLLs\\_ctypes.pyd', 'EXTENSION'), 49 | ('win32pipe', 50 | 'd:\\python27\\lib\\site-packages\\win32\\win32pipe.pyd', 51 | 'EXTENSION'), 52 | ('_multiprocessing', 53 | 'd:\\python27\\DLLs\\_multiprocessing.pyd', 54 | 'EXTENSION'), 55 | ('win32evtlog', 56 | 
'd:\\python27\\lib\\site-packages\\win32\\win32evtlog.pyd', 57 | 'EXTENSION'), 58 | ('win32api', 59 | 'd:\\python27\\lib\\site-packages\\win32\\win32api.pyd', 60 | 'EXTENSION'), 61 | ('_portaudio', 62 | 'd:\\python27\\lib\\site-packages\\_portaudio.pyd', 63 | 'EXTENSION'), 64 | ('win32wnet', 65 | 'd:\\python27\\lib\\site-packages\\win32\\win32wnet.pyd', 66 | 'EXTENSION'), 67 | ('pywintypes27.dll', 68 | 'd:\\python27\\lib\\site-packages\\pywin32_system32\\pywintypes27.dll', 69 | 'BINARY'), 70 | ('certifi\\cacert.pem', 71 | 'd:\\python27\\lib\\site-packages\\certifi\\cacert.pem', 72 | 'DATA'), 73 | ('certifi\\old_root.pem', 74 | 'd:\\python27\\lib\\site-packages\\certifi\\old_root.pem', 75 | 'DATA'), 76 | ('Include\\pyconfig.h', 'd:\\python27\\Include\\pyconfig.h', 'DATA'), 77 | ('certifi\\weak.pem', 78 | 'd:\\python27\\lib\\site-packages\\certifi\\weak.pem', 79 | 'DATA'), 80 | ('pangxie2.exe.manifest', 81 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\pangxie2.exe.manifest', 82 | 'BINARY'), 83 | ('pyi-windows-manifest-filename pangxie2.exe.manifest', '', 'OPTION')], 84 | False, 85 | False, 86 | False) 87 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PYZ.pyz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/build/pangxie2/out00-PYZ.pyz -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/pangxie2.exe.manifest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/warnpangxie2.txt: -------------------------------------------------------------------------------- 1 | missing module named org - imported by copy 2 | missing module named fcntl - imported by tempfile, subprocess 3 | missing module named readline - imported by cmd, pdb 4 | missing module named pwd - imported by posixpath, getpass, netrc, shutil, tarfile 5 | missing module named _dummy_threading - imported by dummy_threading 6 | missing module named _scproxy - imported by urllib 7 | missing module named EasyDialogs - imported by getpass 8 | missing module named termios - imported by getpass 9 | missing module named SOCKS - imported by ftplib 10 | missing module named rourl2path - imported by urllib 11 | missing module named vms_lib - imported by platform 12 | missing module named 'org.python' - imported by pickle, xml.sax 13 | missing module named 'java.lang' - imported by platform, xml.sax._exceptions 14 | missing module named java - imported by platform 15 | missing module named _xmlplus - imported by xml 16 | missing module named 'Carbon.File' - imported by plistlib 17 | missing module named 'Carbon.Files' - imported by plistlib 18 | missing module named Carbon - imported by plistlib 19 | missing module named MacOS - imported by platform 20 | missing module named macresource - imported by MacOS 21 | missing module named gestalt - imported by platform 22 | missing module named winreg.OpenKeyEx - imported by winreg, platform 23 | missing module named winreg.HKEY_LOCAL_MACHINE - imported by winreg, platform 24 | missing module named winreg.QueryValueEx - imported by winreg, platform 25 | missing module 
named winreg.CloseKey - imported by winreg, platform 26 | missing module named riscosenviron - imported by os 27 | missing module named riscospath - imported by os 28 | missing module named riscos - imported by os 29 | missing module named ce - imported by os 30 | missing module named _emx_link - imported by os 31 | missing module named os2 - imported by os 32 | missing module named posix - imported by os 33 | missing module named resource - imported by posix 34 | missing module named _xmlrpclib - imported by xmlrpclib 35 | missing module named _sysconfigdata - imported by distutils.sysconfig 36 | missing module named grp - imported by shutil, tarfile 37 | missing module named 'urllib.request' - imported by requests.compat 38 | missing module named 'urllib.parse' - imported by requests.compat 39 | missing module named ipaddress - imported by urllib3.packages.ssl_match_hostname._implementation 40 | missing module named backports - imported by urllib3.packages.ssl_match_hostname 41 | missing module named simplejson - imported by requests.compat 42 | missing module named 'urllib3.packages.six.moves' - imported by urllib3.exceptions, urllib3.connectionpool, urllib3.connection, urllib3.util.response, urllib3.request, urllib3.response, urllib3.poolmanager 43 | missing module named socks - imported by urllib3.contrib.socks 44 | missing module named 'OpenSSL.crypto' - imported by urllib3.contrib.pyopenssl 45 | missing module named 'cryptography.x509' - imported by urllib3.contrib.pyopenssl 46 | missing module named six - imported by urllib3.contrib.pyopenssl 47 | missing module named 'cryptography.hazmat' - imported by urllib3.contrib.pyopenssl 48 | missing module named cryptography - imported by urllib3.contrib.pyopenssl 49 | missing module named OpenSSL - imported by urllib3.contrib.pyopenssl 50 | -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/conf/README.MD -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/info.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | timeout=10 3 | sleep=1 4 | [user] 5 | username=13486178520 6 | password=**** 7 | goods=109121 111592 121557 107494 8 | [mail] 9 | to=767543579@qq.com jjzhu_ncu@163.com 10 | [thread] 11 | thread_num=4 -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=INFO 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=INFO 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=INFO 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=INFO 31 | formatter=simpleFmt 32 | 
args=('./log/pangxieyg.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=INFO 37 | formatter=simpleFmt 38 | args=('./log/pangxieyg.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/pangxieyg/dist.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist.zip -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist/conf/README.MD -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/info.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | timeout=10 3 | sleep=1 4 | [user] 5 | username=13486178520 6 | password=vs7452014 7 | goods=109121 111592 121557 107494 8 | [mail] 9 | to=767543579@qq.com jjzhu_ncu@163.com 10 | [thread] 11 | thread_num=4 -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=INFO 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=INFO 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=INFO 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=INFO 31 | formatter=simpleFmt 32 | args=('./log/pangxieyg.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=INFO 37 | formatter=simpleFmt 38 | args=('./log/pangxieyg.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/pangxie2.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist/pangxie2.exe -------------------------------------------------------------------------------- /spiders/pangxieyg/i.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['i', 'ico.ico', 'py'], 7 | pathex=['E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | 
runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher) 17 | pyz = PYZ(a.pure, a.zipped_data, 18 | cipher=block_cipher) 19 | exe = EXE(pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.zipfiles, 23 | a.datas, 24 | name='i', 25 | debug=False, 26 | strip=False, 27 | upx=True, 28 | console=True ) 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/ico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/ico.ico -------------------------------------------------------------------------------- /spiders/pangxieyg/notify.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/notify.wav -------------------------------------------------------------------------------- /spiders/pangxieyg/pangxie2.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['pangxie2.py'], 7 | pathex=['E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher) 17 | pyz = PYZ(a.pure, a.zipped_data, 18 | cipher=block_cipher) 19 | exe = EXE(pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.zipfiles, 23 | a.datas, 24 | name='pangxie2', 25 | debug=False, 26 | strip=False, 27 | upx=True, 28 | console=True ) 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/pangxieyg.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import urllib.request 3 | import http.cookiejar 4 | # from . 
import user_agent 5 | import random 6 | class PangXie: 7 | def make_opener(self): 8 | cj = http.cookiejar.CookieJar() 9 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 10 | header = [] 11 | head = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8', 15 | 'Connection': 'keep-alive', 16 | 'Content-Length': '254', 17 | 'Host': 'www.pangxieyg.com', 18 | 19 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,' 21 | # ' like Gecko) Chrome/37.0.2062.124 Safari/537.36' 22 | } 23 | for key, value in head.items(): 24 | elem = (key, value) 25 | header.append(elem) 26 | opener.addheaders = header 27 | return opener 28 | 29 | if __name__ == '__main__': 30 | px = PangXie() 31 | opener = px.make_opener() 32 | opener.open('http://www.pangxieyg.com/wap/') 33 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/104808_2017-06-02.txt: -------------------------------------------------------------------------------- 1 | 2017-06-02 16:02:57 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 2 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/104808_2017-06-06.txt: -------------------------------------------------------------------------------- 1 | 2017-06-06 20:08:32 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 2 | 2017-06-06 20:25:56 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 3 | 2017-06-06 20:26:21 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 4 | 2017-06-06 20:26:25 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 5 | 2017-06-06 20:26:36 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 6 | 2017-06-06 20:26:44 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 7 | 2017-06-06 20:26:57 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 8 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/107494_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/107494_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/107494_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/107494_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/109121_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/109121_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/109121_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/109121_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/111592_2017-06-02.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/111592_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/111592_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/111592_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/121557_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/121557_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/121557_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/121557_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/sound.py: -------------------------------------------------------------------------------- 1 | # from win32com.client import Dispatch 2 | # 3 | # 4 | # while True: 5 | # wmp = Dispatch("WMPlayer.OCX") 6 | # media = wmp.newMedia("D:/CloudMusic/双笙 - 小幸运.mp3") 7 | # wmp.currentPlaylist.appendItem(media) 8 | # wmp.controls.play() 9 | import pyaudio 10 | import wave 11 | 12 | 13 | chunk = 1024 14 | wf = wave.open(r'C:\Windows\Media\notify.wav', 'rb') 15 | p = pyaudio.PyAudio() 16 | stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), 17 | channels=wf.getnchannels(), 18 | rate=wf.getframerate(), 19 | output=True) 20 | 21 | # 写声音输出流进行播放 22 | while True: 23 | data = wf.readframes(chunk) 24 | if data == b'': 25 | break 26 | stream.write(data) 27 | stream.close() 28 | p.terminate() 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/winui.py: -------------------------------------------------------------------------------- 1 | 2 | # Creates a task-bar icon. Run from Python.exe to see the 3 | # messages printed. 4 | import win32api, win32gui 5 | import win32con, winerror 6 | import sys, os 7 | 8 | class MainWindow: 9 | def __init__(self): 10 | msg_TaskbarRestart = win32gui.RegisterWindowMessage("TaskbarCreated"); 11 | message_map = { 12 | msg_TaskbarRestart: self.OnRestart, 13 | win32con.WM_DESTROY: self.OnDestroy, 14 | win32con.WM_COMMAND: self.OnCommand, 15 | win32con.WM_USER+20 : self.OnTaskbarNotify, 16 | } 17 | # Register the Window class. 18 | wc = win32gui.WNDCLASS() 19 | hinst = wc.hInstance = win32api.GetModuleHandle(None) 20 | wc.lpszClassName = "PythonTaskbarDemo" 21 | wc.style = win32con.CS_VREDRAW | win32con.CS_HREDRAW; 22 | wc.hCursor = win32api.LoadCursor( 0, win32con.IDC_ARROW ) 23 | wc.hbrBackground = win32con.COLOR_WINDOW 24 | wc.lpfnWndProc = message_map # could also specify a wndproc. 25 | 26 | # Don't blow up if class already registered to make testing easier 27 | try: 28 | classAtom = win32gui.RegisterClass(wc) 29 | except win32gui.error, err_info: 30 | if err_info.winerror!=winerror.ERROR_CLASS_ALREADY_EXISTS: 31 | raise 32 | 33 | # Create the Window. 
34 | style = win32con.WS_OVERLAPPED | win32con.WS_SYSMENU 35 | self.hwnd = win32gui.CreateWindow( wc.lpszClassName, "Taskbar Demo", style, \ 36 | 0, 0, win32con.CW_USEDEFAULT, win32con.CW_USEDEFAULT, \ 37 | 0, 0, hinst, None) 38 | win32gui.UpdateWindow(self.hwnd) 39 | self._DoCreateIcons() 40 | def _DoCreateIcons(self): 41 | # Try and find a custom icon 42 | hinst = win32api.GetModuleHandle(None) 43 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "pyc.ico" )) 44 | if not os.path.isfile(iconPathName): 45 | # Look in DLLs dir, a-la py 2.5 46 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "DLLs", "pyc.ico" )) 47 | if not os.path.isfile(iconPathName): 48 | # Look in the source tree. 49 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "..\\PC\\pyc.ico" )) 50 | if os.path.isfile(iconPathName): 51 | icon_flags = win32con.LR_LOADFROMFILE | win32con.LR_DEFAULTSIZE 52 | hicon = win32gui.LoadImage(hinst, iconPathName, win32con.IMAGE_ICON, 0, 0, icon_flags) 53 | else: 54 | print "Can't find a Python icon file - using default" 55 | hicon = win32gui.LoadIcon(0, win32con.IDI_APPLICATION) 56 | 57 | flags = win32gui.NIF_ICON | win32gui.NIF_MESSAGE | win32gui.NIF_TIP 58 | nid = (self.hwnd, 0, flags, win32con.WM_USER+20, hicon, "Python Demo") 59 | try: 60 | win32gui.Shell_NotifyIcon(win32gui.NIM_ADD, nid) 61 | except win32gui.error: 62 | # This is common when windows is starting, and this code is hit 63 | # before the taskbar has been created. 64 | print "Failed to add the taskbar icon - is explorer running?" 65 | # but keep running anyway - when explorer starts, we get the 66 | # TaskbarCreated message. 67 | 68 | def OnRestart(self, hwnd, msg, wparam, lparam): 69 | self._DoCreateIcons() 70 | 71 | def OnDestroy(self, hwnd, msg, wparam, lparam): 72 | nid = (self.hwnd, 0) 73 | win32gui.Shell_NotifyIcon(win32gui.NIM_DELETE, nid) 74 | win32gui.PostQuitMessage(0) # Terminate the app. 75 | 76 | def OnTaskbarNotify(self, hwnd, msg, wparam, lparam): 77 | if lparam==win32con.WM_LBUTTONUP: 78 | print "You clicked me." 79 | elif lparam==win32con.WM_LBUTTONDBLCLK: 80 | print "You double-clicked me - goodbye" 81 | win32gui.DestroyWindow(self.hwnd) 82 | elif lparam==win32con.WM_RBUTTONUP: 83 | print "You right clicked me." 
84 | menu = win32gui.CreatePopupMenu() 85 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1023, "Display Dialog") 86 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1024, "Say Hello") 87 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1025, "Exit program" ) 88 | pos = win32gui.GetCursorPos() 89 | # See http://msdn.microsoft.com/library/default.asp?url=/library/en-us/winui/menus_0hdi.asp 90 | win32gui.SetForegroundWindow(self.hwnd) 91 | win32gui.TrackPopupMenu(menu, win32con.TPM_LEFTALIGN, pos[0], pos[1], 0, self.hwnd, None) 92 | win32gui.PostMessage(self.hwnd, win32con.WM_NULL, 0, 0) 93 | return 1 94 | 95 | def OnCommand(self, hwnd, msg, wparam, lparam): 96 | id = win32api.LOWORD(wparam) 97 | if id == 1023: 98 | import win32gui_dialog 99 | win32gui_dialog.DemoModal() 100 | elif id == 1024: 101 | print "Hello" 102 | elif id == 1025: 103 | print "Goodbye" 104 | win32gui.DestroyWindow(self.hwnd) 105 | else: 106 | print "Unknown command -", id 107 | 108 | def main(): 109 | w=MainWindow() 110 | win32gui.PumpMessages() 111 | 112 | if __name__=='__main__': 113 | main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /spiders/rank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/rank/__init__.py -------------------------------------------------------------------------------- /spiders/rank/baike_rank.py: -------------------------------------------------------------------------------- 1 | 2 | import http.cookiejar 3 | import urllib.request 4 | from spiders import user_agent 5 | import random 6 | import json 7 | 8 | class Rank(): 9 | def make_my_opener(self): 10 | """ 11 | 模拟浏览器发送请求 12 | :return: 13 | """ 14 | cj = http.cookiejar.CookieJar() 15 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 16 | header = [] 17 | head = { 18 | 'Accept': '*/*', 19 | # 'Accept-Encoding': 'gzip,deflate,sdch', 20 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 21 | 'Host': 'baike.baidu.com', 22 | 'Proxy-Connection': 'keep-alive', 23 | 'User-Agent': user_agent.agents[random.randint(0, len(user_agent.agents) - 1)] 24 | 25 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 26 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 27 | } 28 | for key, value in head.items(): 29 | elem = (key, value) 30 | header.append(elem) 31 | opener.addheaders = header 32 | return opener 33 | 34 | def start(self): 35 | opener = self.make_my_opener() 36 | max_page = 50 37 | rsp = opener.open('http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek') 38 | 39 | rsp_json = json.loads(rsp.read().decode()) 40 | 41 | print(rsp_json) 42 | for pn in range(1, max_page): 43 | 44 | rsp = opener.open('http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d' % pn) 45 | rsp_json = json.loads(rsp.read().decode()) 46 | 47 | print(rsp_json) 48 | 49 | 50 | if __name__ == '__main__': 51 | rank = Rank() 52 | rank.start() -------------------------------------------------------------------------------- /spiders/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | -------------------------------------------------------------------------------- /spiders/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create 
your views here. 4 | -------------------------------------------------------------------------------- /spiders/wechat_sport/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/wechat_sport/__init__.py -------------------------------------------------------------------------------- /spiders/wechat_sport/get_steps.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import json 4 | import datetime 5 | from pprint import pprint 6 | import sys 7 | import os 8 | import django 9 | from time import sleep 10 | sys.path.append('../../') 11 | sys.path.append('../') 12 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 13 | django.setup() 14 | from spiders.models import Step 15 | HEADERS = { 16 | 17 | 'Host': 'hw.weixin.qq.com', 18 | 'Connection': 'keep-alive', 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 20 | 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 22 | 'Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI ' 23 | 'WindowsWechat QBCore/3.43.556.400 QQBrowser/9.0.2524.400', 24 | 'Accept-Encoding': 'gzip, deflate', 25 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 26 | 'Cookie': 'hwstepranksk=uiNfWaL2l5E6ItwiqWdoU9gbuSnCWw2vxj-5_7i7U6QH6eWZ;' 27 | } 28 | 29 | url = 'https://hw.weixin.qq.com/steprank/step/personal' 30 | while True: 31 | resp = requests.get(url=url, params={ 32 | # 'pass_ticket': 'wHHOyL%2BvmKG1LE5VIuKgnrVj825Zv9dFN6HzwqXRZ9IpyQ6I6EcmRXkBtXTB5fAY' 33 | }, headers=HEADERS).text 34 | match_strings = re.findall(r"window.json = (\S+);", resp) 35 | 36 | resp_json = json.loads(match_strings[0]) 37 | step = Step() 38 | step.steps = resp_json['rankdesc']['score'] 39 | step.curr_time = datetime.datetime.now() 40 | step.save() 41 | pprint(step) 42 | sleep(2*60) 43 | -------------------------------------------------------------------------------- /spiders/weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__init__.py -------------------------------------------------------------------------------- /spiders/weibo/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /spiders/weibo/__pycache__/user_agent.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__pycache__/user_agent.cpython-35.pyc -------------------------------------------------------------------------------- /spiders/weibo/conf/README.md: -------------------------------------------------------------------------------- 1 | 在conf/目录下创建account.conf文件 2 | 用户名和密码空格分隔 3 | 一行一个用户 4 | e.g. 
5 | 6 | xxxx@163.com password_123 7 | bbbb@163.com password_234 -------------------------------------------------------------------------------- /spiders/weibo/conf/account.conf: -------------------------------------------------------------------------------- 1 | 767543579@qq.com JOPPER -------------------------------------------------------------------------------- /spiders/weibo/dao.py: -------------------------------------------------------------------------------- 1 | from spiders.models import Weibo, WeiboUser, UserRelationship 2 | from spiders.logger import LOGGER 3 | from django.db.models import Q 4 | 5 | def save_blog_info( blog_info): 6 | 7 | try: 8 | # 存在就更新,不存在就创建 9 | weibo = Weibo.objects.get(pk=blog_info['id']) 10 | except Weibo.DoesNotExist: 11 | weibo = Weibo() 12 | weibo.id = str(blog_info['id']) 13 | weibo.created_timestamp = blog_info['created_at'] 14 | # self.user_enqueue(ret_weibo.user.id) 15 | weibo.source = blog_info['source'] 16 | weibo.text = blog_info['text'] 17 | try: 18 | user = WeiboUser.objects.get(pk=blog_info['user']['id']) 19 | except WeiboUser.DoesNotExist: 20 | user = save_user_info(blog_info['user']) 21 | weibo.user = user 22 | weibo.save() 23 | LOGGER.info(weibo) 24 | if weibo.retweented_status is not None: # 添加关系 25 | 26 | try: 27 | r = UserRelationship.objects.get(Q(user=weibo.retweented_status.user) & Q(follower=weibo.user)) 28 | LOGGER.info('relationship already exist:' + str(r)) 29 | except UserRelationship.DoesNotExist: 30 | LOGGER.info('relationship not exist') 31 | relation = UserRelationship() 32 | relation.user = weibo.retweented_status.user 33 | relation.follower = weibo.user 34 | LOGGER.info(relation) 35 | relation.save() 36 | return weibo 37 | 38 | 39 | def save_user_info( user_info): 40 | try: 41 | user = WeiboUser.objects.get(pk=user_info['id']) 42 | except WeiboUser.DoesNotExist: 43 | user = WeiboUser() 44 | user.id = user_info['id'] 45 | 46 | user.attNum = user_info['attNum'] if 'attNum' in user_info \ 47 | else user_info['follow_count'] if 'follow_count' in user_info \ 48 | else '' 49 | user.created_at = user_info['created_at'] if 'created_at' in user_info else '' 50 | user.screen_name = user_info['screen_name'] if 'screen_name' in user_info else '' 51 | user.description = user_info['description'] if 'description' in user_info else '' 52 | user.fansNum = user_info['fansNum'] if 'fansNum' in user_info \ 53 | else user_info['followers_count'] if 'followers_count' in user_info \ 54 | else '' 55 | user.mblogNum = user_info['mblogNum'] if 'mblogNum' in user_info \ 56 | else user_info['statuses_count'] if 'statuses_count' in user_info \ 57 | else '' 58 | user.nativePlace = user_info['nativePlace'] if 'nativePlace' in user_info else '' 59 | user.profile_url = user_info['profile_url'] if 'profile_url' in user_info else '' 60 | user.gender = WeiboUser.GENDER.index(user_info['gender'] if 'gender' in user_info else 'u') 61 | user.save() 62 | 63 | LOGGER.info(user) 64 | return user 65 | 66 | 67 | def save_relationship(user, fan): 68 | try: 69 | r = UserRelationship.objects.get(Q(user=user) & Q(follower=fan)) 70 | LOGGER.info('relationship already exist:' + str(r)) 71 | except UserRelationship.DoesNotExist: 72 | LOGGER.info('relationship not exist') 73 | relation = UserRelationship() 74 | relation.user = user 75 | relation.follower = fan 76 | LOGGER.info(relation) 77 | relation.save() 78 | 79 | 80 | def insert_pic_info(self, pic_info): 81 | pass 82 | 83 | 84 | def insert_comment_info(self, comment_info): 85 | pass 86 | 87 | 88 | def save_pic(self): 
89 | url = 'http://ww2.sinaimg.cn/large/c0788b86jw1f2xfstebzaj20dc0hst9r.jpg' 90 | # opener = my_http.make_my_opener() 91 | # rsp = opener.open(url) 92 | # pic_data = rsp.read() 93 | # try: 94 | # file = open("d:\\weibo_pic\\1.jpg", 'wb') 95 | # file.write(pic_data) 96 | # file.close() 97 | # except FileNotFoundError: 98 | # os.mkdir("d:\\weibo_pic") 99 | # except FileExistsError: 100 | # pass -------------------------------------------------------------------------------- /spiders/weibo/multhread.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import queue 3 | import threading 4 | from time import sleep 5 | q = queue.Queue() 6 | result = deque() 7 | num_worker_threads = 10 8 | threads = [] 9 | def do_work(n): 10 | return n+2 11 | 12 | def workder(): 13 | while True: 14 | item = q.get() 15 | if item is None: 16 | print('break') 17 | break 18 | print(do_work(item)) 19 | q.task_done() 20 | sleep(10) 21 | q.put(12) 22 | 23 | def worker1(): 24 | while True: 25 | print(1) 26 | sleep(10) 27 | 28 | def worker2(): 29 | while True: 30 | print(2) 31 | 32 | t1 = threading.Thread(target=worker1) 33 | t2 = threading.Thread(target=worker2) 34 | t1.start() 35 | t2.start() 36 | -------------------------------------------------------------------------------- /spiders/weibo/weibo_conf.py: -------------------------------------------------------------------------------- 1 | 2 | def get_account(): 3 | """ 4 | 这里是去读配置文件,weibo账号,conf文件夹下有说明 5 | :return: 6 | """ 7 | accounts = [] 8 | conf_file = 'conf/account.conf' 9 | try: 10 | with open(conf_file, 'r') as f: 11 | for line in f.readlines(): 12 | fields = line.split(' ') 13 | accounts.append({'username': fields[0], 'password': fields[1]}) 14 | except FileNotFoundError: 15 | raise FileNotFoundError('No such file or directory:%s,' 16 | ' read conf/README.md to conf weibo account' % conf_file) 17 | return accounts -------------------------------------------------------------------------------- /spiders/weibo/weibo_http.py: -------------------------------------------------------------------------------- 1 | from spiders.logger import LOGGER 2 | from spiders.weibo import constants 3 | 4 | import traceback 5 | import random 6 | import json 7 | import http.cookiejar 8 | import urllib.parse 9 | import urllib.request 10 | from time import sleep 11 | import ssl 12 | 13 | 14 | def login(user_name, password, opener): 15 | LOGGER.info(user_name + ' login') 16 | args = { 17 | 'username': user_name, 18 | 'password': password, 19 | 'savestate': 1, 20 | 'ec': 0, 21 | 'pagerefer': 'https://passport.weibo.cn/signin/' 22 | 'welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F&wm=3349&vt=4', 23 | 'entry': 'mweibo', 24 | 'wentry': '', 25 | 'loginfrom': '', 26 | 'client_id': '', 27 | 'code': '', 28 | 'qq': '', 29 | 'hff': '', 30 | 'hfp': '' 31 | } 32 | 33 | post_data = urllib.parse.urlencode(args).encode() 34 | try_time = 0 35 | while try_time < constants.TRY_TIME: 36 | try: 37 | resp = opener.open(constants.LOGIN_URL, post_data) 38 | resp_json = json.loads(resp.read().decode()) 39 | if 'retcode' in resp_json and resp_json['retcode'] == 20000000: 40 | LOGGER.info("%s login successful" % user_name) 41 | break 42 | else: 43 | LOGGER.warn('login fail:%s' % str(resp_json)) 44 | sleep(10) 45 | try_time += 1 46 | except : 47 | LOGGER.error("login failed") 48 | LOGGER.error(traceback.print_exc()) 49 | sleep(10) 50 | try_time += 1 51 | LOGGER.info('try %d time' % try_time) 52 | 53 | 54 | 55 | def get_openner(): 56 | opener 
= make_my_opener() 57 | curr_index = random.randint(0, len(constants.USERS) - 1) # 随机选取用户 58 | LOGGER.info('user index : %d' % curr_index) 59 | login(constants.USERS[curr_index]['username'], constants.USERS[curr_index]['password'], opener) 60 | change_header(opener) 61 | return opener 62 | 63 | 64 | def change_header(opener, ext=None): 65 | head = { 66 | 'Accept': '*/*', 67 | 'Connection': 'keep-alive', 68 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 69 | 'Host': 'm.weibo.cn', 70 | 'Proxy-Connection': 'keep-alive', 71 | 'User-Agent': constants.USER_AGENTS[random.randint(0, len(constants.USER_AGENTS) - 1)] 72 | } 73 | if ext: 74 | head.update(ext) 75 | header = [] 76 | for key, value in head.items(): 77 | elem = (key, value) 78 | header.append(elem) 79 | opener.addheaders = header 80 | 81 | 82 | def change_proxy(opener): 83 | proxy_handler = urllib.request.ProxyHandler(constants.PROXIES[random.randint(0, len(constants.PROXIES) -1)]) 84 | opener.add_handler(proxy_handler) 85 | 86 | 87 | def make_my_opener(): 88 | """ 89 | 模拟浏览器发送请求 90 | :return: 91 | """ 92 | cj = http.cookiejar.CookieJar() 93 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 94 | 95 | header = [] 96 | head = { 97 | 'Accept': '*/*', 98 | 'Accept-Encoding': 'gzip,deflate', 99 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 100 | 'Connection': 'keep-alive', 101 | 'Content-Length': '254', 102 | 'Content-Type': 'application/x-www-form-urlencoded', 103 | 'Host': 'passport.weibo.cn', 104 | 'Origin': 'https://passport.weibo.cn', 105 | 'Referer': 'https://passport.weibo.cn/signin/login?' 106 | 'entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F', 107 | 'User-Agent': constants.USER_AGENTS[random.randint(0, len(constants.USER_AGENTS) - 1)] 108 | } 109 | for key, value in head.items(): 110 | elem = (key, value) 111 | header.append(elem) 112 | opener.addheaders = header 113 | return opener 114 | -------------------------------------------------------------------------------- /spiders/zju/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zju.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zju 12 | -------------------------------------------------------------------------------- /spiders/zju/zju/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ZjuItem(scrapy.Item): 12 | url = scrapy.Field() 13 | title = scrapy.Field() 14 | time = scrapy.Field() 15 | -------------------------------------------------------------------------------- /spiders/zju/zju/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZjuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/zju/zju/myemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import email.mime.multipart 3 | import email.mime.text 4 | # -*- coding:utf-8 -*- 5 | 6 | 7 | class Email(object): 8 | content_from = None 9 | content_to = None 10 | content_subject = None 11 | content_msg = None 12 | content_pwd = None 13 | 14 | def send_163(self): 15 | assert self.content_from is not None 16 | assert self.content_to is not None 17 | assert self.content_pwd is not None 18 | msg = email.mime.multipart.MIMEMultipart() 19 | msg['from'] = self.content_from 20 | msg['to'] = self.content_to 21 | msg['subject'] = self.content_subject 22 | txt = email.mime.text.MIMEText(self.content_msg, 'plain', 'utf-8') 23 | msg.attach(txt) 24 | smtp = smtplib.SMTP(host='smtp.163.com', port=25) 25 | 26 | smtp.login(self.content_from, self.content_pwd) 27 | smtp.sendmail(self.content_from, self.content_to, str(msg)) 28 | smtp.quit() 29 | 30 | 31 | def send_email(subject, msg): 32 | e = Email() 33 | e.content_from = 'jjzhu_ncu@163.com' 34 | e.content_to = 'jjzhu_zju@163.com' 35 | e.content_pwd = 'jvs7452014' 36 | e.content_subject = 'hello world' 37 | e.content_msg = 'hello word' 38 | e.send_163() 39 | -------------------------------------------------------------------------------- /spiders/zju/zju/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from .myemail import Email 8 | from .wechat import Wechat 9 | 10 | 11 | class ZjuPipeline(object): 12 | wc = Wechat() 13 | 14 | def process_item(self, item, spider): 15 | msg = item['title'] + '\n' + item['url'] + '\n' + item['time'] 16 | self.wc.send(msg) 17 | 18 | e = Email() 19 | e.content_from = 'jjzhu_ncu@163.com' 20 | 
e.content_to = 'jjzhu_zju@163.com' 21 | e.content_pwd = 'jvs7452014' 22 | e.content_subject = u'浙大研究生官网发布新消息啦!' 23 | e.content_msg = item['title'] + '\n' + item['url'] + '\n' + item['time'] 24 | e.send_163() 25 | 26 | return item 27 | -------------------------------------------------------------------------------- /spiders/zju/zju/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zju project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zju' 13 | 14 | SPIDER_MODULES = ['zju.spiders'] 15 | NEWSPIDER_MODULE = 'zju.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'zju (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | DEFAULT_REQUEST_HEADERS = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 24 | 'Accept-Encoding': 'gzip, deflate', 25 | 'Accept-Language': 'zh-CN,zh;q=0.8', 26 | 'Cache-Control': 'max-age=0', 27 | 'Connection': 'keep-alive', 28 | 'Host': 'grs.zju.edu.cn', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 31 | } 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | # CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | DOWNLOAD_DELAY = 2 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'zju.middlewares.ZjuSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | # DOWNLOADER_MIDDLEWARES = { 64 | # 'zju.middlewares.MyCustomDownloaderMiddleware': 543, 65 | # } 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | # EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | # } 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'zju.pipelines.ZjuPipeline': 300, 77 | } 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See 
http://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | # AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | # AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | # AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of requests Scrapy should be sending in parallel to 87 | # each remote server 88 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | # AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | # HTTPCACHE_ENABLED = True 95 | # HTTPCACHE_EXPIRATION_SECS = 0 96 | # HTTPCACHE_DIR = 'httpcache' 97 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /spiders/zju/zju/spiders/ZjuSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import scrapy 3 | from scrapy.http import Request 4 | import datetime 5 | from ..items import ZjuItem 6 | 7 | 8 | class ZjuSpider(scrapy.Spider): 9 | name = 'zju' 10 | url = 'http://grs.zju.edu.cn' 11 | url2 = 'http://grs.zju.edu.cn/redir.php?catalog_id=16313' 12 | url3 = 'http://grs.zju.edu.cn/redir.php?catalog_id=16313&page=1' 13 | notified = set() 14 | 15 | def start_requests(self): 16 | # ield Request(self.url, dont_filter=True, callback=self.parse) 17 | yield Request(self.url2, dont_filter=True, callback=self.parse2) 18 | yield Request(self.url3, dont_filter=True, callback=self.parse2) 19 | 20 | def parse2(self, response): 21 | lis = response.xpath('//ul[@id="artphs"]/li') 22 | for li in lis: 23 | c_url = self.url + '/' + li.xpath('h3/a/@href').extract_first() 24 | title = li.xpath('h3/a/@title').extract_first() 25 | time = li.xpath('span/text()').extract_first() 26 | publish_time = datetime.datetime.strptime(time, '%Y-%m-%d') 27 | now = datetime.datetime.now() 28 | 29 | if (now - publish_time).days < 1: 30 | if c_url not in self.notified: 31 | zju_item = ZjuItem() 32 | zju_item['url'] = c_url 33 | zju_item['title'] = title 34 | zju_item['time'] = time 35 | self.notified.add(c_url) 36 | yield zju_item 37 | yield Request(self.url2, dont_filter=True, callback=self.parse2) 38 | yield Request(self.url3, dont_filter=True, callback=self.parse2) 39 | 40 | def parse(self, response): 41 | 42 | lis = response.xpath('//ul[@id="arthd"]/li') 43 | datetime.datetime.strptime('2017-06-01', '%Y-%m-%d') 44 | for li in lis: 45 | c_url = self.url + '/' + li.xpath('a/@href').extract_first() 46 | title = li.xpath('a/@title').extract_first() 47 | time = li.xpath('span[@class="art-date"]/text()').extract_first() 48 | 49 | publish_time = datetime.datetime.strptime(time, '%Y-%m-%d') 50 | now = datetime.datetime.now() 51 | 52 | if(now - publish_time).days < 1: 53 | if c_url not in self.notified: 54 | zju_item = ZjuItem() 55 | zju_item['url'] = c_url 56 | zju_item['title'] = title 57 | zju_item['time'] = time 58 | self.notified.add(c_url) 59 | yield zju_item 60 | yield Request(self.url, dont_filter=True, callback=self.parse) 61 | -------------------------------------------------------------------------------- /spiders/zju/zju/wechat.py: -------------------------------------------------------------------------------- 1 | from wxpy import * 2 | bot = Bot() 3 | 4 | 5 | class 
Wechat(object): 6 | def __init__(self): 7 | self.group = bot.groups().search('English exam')[0] 8 | self.my_friends = [bot.friends().search('jopper')[0]] 9 | 10 | def send(self, msg): 11 | self.group.send(msg) 12 | 13 | def send2(self, msg): 14 | for f in self.my_friends: 15 | f.send(msg) 16 | -------------------------------------------------------------------------------- /usage/README.md: -------------------------------------------------------------------------------- 1 | ## usage 2 | 各种框架、库的使用 3 | - requests 4 | - celery -------------------------------------------------------------------------------- /usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/__init__.py -------------------------------------------------------------------------------- /usage/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__init__.py -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/celery.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/celery.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/tasks.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/tasks.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/add.py: -------------------------------------------------------------------------------- 1 | from tasks import add 2 | print(add.delay(3, 3).get(timeout=1)) 3 | -------------------------------------------------------------------------------- /usage/celery_u/celery3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/celery3.pdf -------------------------------------------------------------------------------- /usage/celery_u/celery_.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | app = Celery('celery', 4 | broker='redis://:''@127.0.0.1:6379/0', 5 | backend='redis://:''@127.0.0.1:6379/1', 6 | include=['tasks'] 7 | ) 8 | app.conf.update( 9 | CELERY_TASK_RESULT_EXPIRES=3600, 10 | 
CELERY_ROUTES={ 11 | 'tasks.add': {'queue': 'hipri'} 12 | } 13 | ) 14 | 15 | if __name__ == '__main__': 16 | app.start() 17 | -------------------------------------------------------------------------------- /usage/celery_u/celeryconfig.py: -------------------------------------------------------------------------------- 1 | BROKER_URL = 'amqp://' 2 | CELERY_RESULT_BACKEND = 'rpc://' 3 | CELERY_TASK_SERIALIZER = 'json' 4 | CELERY_RESULT_SERIALIZER = 'json' 5 | CELERY_ACCEPT_CONTENT=['json'] 6 | CELERY_TIMEZONE = 'Europe/Oslo' 7 | CELERY_ENABLE_UTC = True 8 | -------------------------------------------------------------------------------- /usage/celery_u/tasks.py: -------------------------------------------------------------------------------- 1 | from celery_ import app 2 | 3 | 4 | @app.task 5 | def add(x, y): 6 | 7 | return x + y 8 | 9 | 10 | @app.task 11 | def mul(x, y): 12 | return x * y 13 | 14 | 15 | @app.task 16 | def xsum(numbers): 17 | return sum(numbers) 18 | -------------------------------------------------------------------------------- /usage/kafka_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/kafka_u/__init__.py -------------------------------------------------------------------------------- /usage/kafka_u/consumer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | 3 | consumer = KafkaConsumer('my-topic', 4 | group_id='my-group', 5 | bootstrap_servers=['localhost:9092']) 6 | for message in consumer: 7 | print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, 8 | message.offset, message.key, 9 | message.value)) 10 | 11 | -------------------------------------------------------------------------------- /usage/kafka_u/producer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | from kafka.errors import KafkaError 3 | 4 | producer = KafkaProducer(bootstrap_servers=['localhost:9092']) 5 | 6 | # Asynchronous by default 7 | future = producer.send('my-topic', b'raw_bytes') 8 | 9 | # Block for 'synchronous' sends 10 | try: 11 | record_metadata = future.get(timeout=10) 12 | except KafkaError: 13 | # Decide what to do if produce request failed...
14 | 15 | pass 16 | 17 | # Successful result returns assigned partition and offset 18 | print(record_metadata.topic) 19 | print(record_metadata.partition) 20 | print(record_metadata.offset) 21 | -------------------------------------------------------------------------------- /usage/proj/tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | if __name__ == '__main__': 4 | a = 1 5 | b = 2 6 | while True: 7 | try: 8 | c = a/b 9 | print(c) 10 | break 11 | except: 12 | print('error') 13 | finally: 14 | print('finally') 15 | print('aaaa') -------------------------------------------------------------------------------- /usage/redis_u/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /usage/redis_u/redis_u.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | import redis 3 | 4 | r = redis.StrictRedis(db=0) 5 | print(r.get('foo')) 6 | 7 | -------------------------------------------------------------------------------- /usage/requests_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/requests_u/__init__.py -------------------------------------------------------------------------------- /usage/requests_u/req_usage.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def send_request(): 5 | result = requests.get('http://music.163.com/') 6 | print(result.text) 7 | 8 | send_request() --------------------------------------------------------------------------------