├── .gitignore ├── Jpider ├── __init__.py ├── __init__.pyc ├── settings.py ├── settings.pyc ├── urls.py └── wsgi.py ├── README.md ├── conf └── logging.conf ├── funny ├── 1111.py ├── __init__.py ├── add#.py ├── alert.conf ├── ambari_yinxin_popo_alert.py ├── exception_t.py ├── execjs_test │ ├── __init__.py │ └── exec_js.py ├── funny_comment ├── funny_comment# ├── info.conf ├── readconf.py └── wx.py ├── log └── jpider.log ├── manage.py ├── myutil ├── __init__.py ├── email2.py └── myemail.py ├── output ├── __init__.py ├── all-data-2017_04_27.xls ├── all-data.xls ├── baidu.py └── dazhongdianping.py ├── spiders ├── Sina_spider1 │ ├── Begin.py │ ├── Sina_spider1 │ │ ├── __init__.py │ │ ├── __init__.pyc │ │ ├── conf.py │ │ ├── conf │ │ │ └── weibo.yaml │ │ ├── constant.py │ │ ├── cookies.py │ │ ├── cookies.pyc │ │ ├── items.py │ │ ├── items.pyc │ │ ├── middleware.py │ │ ├── middleware.pyc │ │ ├── pipelines.py │ │ ├── pipelines.pyc │ │ ├── settings.py │ │ ├── settings.pyc │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ ├── __init__.pyc │ │ │ ├── spiders.py │ │ │ └── spiders.pyc │ │ ├── user_agents.py │ │ ├── user_agents.pyc │ │ ├── yumdama.py │ │ └── yumdama.pyc │ ├── aa.png │ └── scrapy.cfg ├── __init__.py ├── __init__.pyc ├── admin.py ├── apps.py ├── apps.pyc ├── baidurank │ ├── 123.html │ ├── baidurank │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── rank.py │ ├── hello.html │ └── scrapy.cfg ├── bilibili │ ├── __init__.py │ └── bilibili_spider.py ├── dazongdianping │ ├── dazongdianping.log │ ├── dazongdianping │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── dazhong_no.py │ │ │ ├── dazong.py │ │ │ ├── dazong_repair.py │ │ │ └── features.txt │ └── scrapy.cfg ├── dist_weibo │ ├── __init__.py │ ├── conf │ │ ├── account.conf │ │ └── logging.conf │ ├── dao │ │ ├── __init__.py │ │ ├── redis_cookies.py │ │ └── sqlalchemy_session.py │ ├── headers.py │ ├── js │ │ └── ssologin.js │ ├── kill_celery.txt │ ├── logger.py │ ├── login │ │ ├── __init__.py │ │ └── login.py │ ├── model │ │ ├── __init__.py │ │ └── models.py │ ├── notebook │ │ └── Request.md │ ├── sql │ │ └── database.sql │ ├── tasks │ │ ├── __init__.py │ │ ├── home_page.py │ │ ├── login.py │ │ ├── mobile_login.py │ │ ├── user.py │ │ └── workers.py │ └── workers.py ├── dist_weibo_spider │ ├── __init__.py │ ├── conf │ │ └── account.conf │ ├── dao │ │ ├── __init__.py │ │ └── redis_cookies.py │ ├── headers.py │ ├── js │ │ └── ssologin.js │ ├── login │ │ ├── __init__.py │ │ └── login.py │ ├── notebook │ │ └── Request.md │ ├── tasks │ │ ├── __init__.py │ │ ├── home_page.py │ │ └── workers.py │ └── workers.py ├── distributed │ ├── README.md │ ├── __init__.py │ ├── celeryt │ │ ├── __init__.py │ │ ├── celerybeat-schedule │ │ ├── tasks.py │ │ └── test.py │ ├── redist │ │ ├── __init__.py │ │ └── test_redis.py │ ├── task_dispatcher.py │ ├── tasks.py │ └── workers.py ├── logger.py ├── models.py ├── mzi │ ├── mzi │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── baikerank.py │ │ │ └── meizi.py │ └── scrapy.cfg ├── onepiece │ ├── aaa │ ├── onepiece │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ └── one_piece.py │ └── scrapy.cfg ├── pangxieyg │ ├── README.MD │ ├── README.txt │ ├── __init__.py │ ├── alert-templates.xml │ 
├── build │ │ └── pangxie2 │ │ │ ├── out00-Analysis.toc │ │ │ ├── out00-EXE.toc │ │ │ ├── out00-PKG.pkg │ │ │ ├── out00-PKG.toc │ │ │ ├── out00-PYZ.pyz │ │ │ ├── out00-PYZ.toc │ │ │ ├── pangxie2.exe.manifest │ │ │ └── warnpangxie2.txt │ ├── conf │ │ ├── README.MD │ │ ├── info.conf │ │ └── logging.conf │ ├── dist.zip │ ├── dist │ │ ├── conf │ │ │ ├── README.MD │ │ │ ├── info.conf │ │ │ └── logging.conf │ │ └── pangxie2.exe │ ├── i.spec │ ├── ico.ico │ ├── mp3player.py │ ├── notify.wav │ ├── pangxie2.py │ ├── pangxie2.spec │ ├── pangxieyg.py │ ├── record │ │ ├── 104808_2017-06-02.txt │ │ ├── 104808_2017-06-06.txt │ │ ├── 107494_2017-06-02.txt │ │ ├── 107494_2017-06-06.txt │ │ ├── 109121_2017-06-02.txt │ │ ├── 109121_2017-06-06.txt │ │ ├── 111592_2017-06-02.txt │ │ ├── 111592_2017-06-06.txt │ │ ├── 121557_2017-06-02.txt │ │ └── 121557_2017-06-06.txt │ ├── sound.py │ ├── user_agent.py │ └── winui.py ├── rank │ ├── __init__.py │ └── baike_rank.py ├── tests.py ├── user_agent.py ├── views.py ├── wechat_sport │ ├── __init__.py │ ├── get_steps.py │ └── wechat_login.py ├── weibo │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── user_agent.cpython-35.pyc │ ├── conf │ │ ├── README.md │ │ └── account.conf │ ├── constants.py │ ├── dao.py │ ├── multhread.py │ ├── weibo2.py │ ├── weibo_conf.py │ └── weibo_http.py └── zju │ ├── scrapy.cfg │ └── zju │ ├── items.py │ ├── middlewares.py │ ├── myemail.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ └── ZjuSpider.py │ └── wechat.py └── usage ├── README.md ├── __init__.py ├── __pycache__ └── __init__.cpython-35.pyc ├── celery_u ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── celery.cpython-35.pyc │ └── tasks.cpython-35.pyc ├── add.py ├── celery3.pdf ├── celery_.py ├── celeryconfig.py └── tasks.py ├── kafka_u ├── __init__.py ├── consumer.py └── producer.py ├── proj └── tasks.py ├── redis_u ├── __init__.py └── redis_u.py └── requests_u ├── __init__.py └── req_usage.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | .idea/ 3 | Jpider/__pycache__/ 4 | spiders/__pycache__/ 5 | spiders/migrations/ 6 | templates/ 7 | *.log 8 | */*.conf 9 | -------------------------------------------------------------------------------- /Jpider/__init__.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | pymysql.install_as_MySQLdb() 3 | -------------------------------------------------------------------------------- /Jpider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/Jpider/__init__.pyc -------------------------------------------------------------------------------- /Jpider/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for Jpider project. 3 | 4 | Generated by 'django-admin startproject' using Django 1.10.3. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/1.10/ref/settings/ 11 | """ 12 | 13 | import os 14 | 15 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...) 
16 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | 18 | 19 | # Quick-start development settings - unsuitable for production 20 | # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/ 21 | 22 | # SECURITY WARNING: keep the secret key used in production secret! 23 | SECRET_KEY = '$v@3$86$0^8cney1sk%ln+aa+zf!x@=6bhs$u1&ka(*!i%vub$' 24 | 25 | # SECURITY WARNING: don't run with debug turned on in production! 26 | DEBUG = True 27 | 28 | ALLOWED_HOSTS = [] 29 | 30 | 31 | # Application definition 32 | 33 | INSTALLED_APPS = [ 34 | 'django.contrib.admin', 35 | 'django.contrib.auth', 36 | 'django.contrib.contenttypes', 37 | 'django.contrib.sessions', 38 | 'django.contrib.messages', 39 | 'django.contrib.staticfiles', 40 | 'spiders.apps.SpidersConfig' 41 | ] 42 | 43 | MIDDLEWARE = [ 44 | 'django.middleware.security.SecurityMiddleware', 45 | 'django.contrib.sessions.middleware.SessionMiddleware', 46 | 'django.middleware.common.CommonMiddleware', 47 | 'django.middleware.csrf.CsrfViewMiddleware', 48 | 'django.contrib.auth.middleware.AuthenticationMiddleware', 49 | 'django.contrib.messages.middleware.MessageMiddleware', 50 | 'django.middleware.clickjacking.XFrameOptionsMiddleware', 51 | ] 52 | 53 | ROOT_URLCONF = 'Jpider.urls' 54 | 55 | TEMPLATES = [ 56 | { 57 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 58 | 'DIRS': [os.path.join(BASE_DIR, 'templates')] 59 | , 60 | 'APP_DIRS': True, 61 | 'OPTIONS': { 62 | 'context_processors': [ 63 | 'django.template.context_processors.debug', 64 | 'django.template.context_processors.request', 65 | 'django.contrib.auth.context_processors.auth', 66 | 'django.contrib.messages.context_processors.messages', 67 | ], 68 | }, 69 | }, 70 | ] 71 | 72 | WSGI_APPLICATION = 'Jpider.wsgi.application' 73 | 74 | 75 | # Database 76 | # https://docs.djangoproject.com/en/1.10/ref/settings/#databases 77 | 78 | DATABASES = { 79 | 'default': { 80 | 'ENGINE': 'django.db.backends.mysql', 81 | # 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'), 82 | 'NAME': 'spider', 83 | 'USER': 'root', 84 | 'HOST': '127.0.0.1', 85 | 'PASSWORD': '1111', 86 | 'PORT': 3306, 87 | # 在处理包含emoji微博的时候,可能会出错 88 | # 想去看下字符编码 89 | # use spider; 90 | # show variables like 'character_set_database'; 91 | # 修改字段字符编码 92 | # alter table spiders_weibo modify text longtext charset utf8mb4 collate utf8mb4_unicode_ci; 93 | 'OPTIONS': {'charset': 'utf8mb4'}, 94 | } 95 | } 96 | 97 | 98 | # Password validation 99 | # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators 100 | 101 | AUTH_PASSWORD_VALIDATORS = [ 102 | { 103 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 104 | }, 105 | { 106 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', 107 | }, 108 | { 109 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', 110 | }, 111 | { 112 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', 113 | }, 114 | ] 115 | 116 | 117 | # Internationalization 118 | # https://docs.djangoproject.com/en/1.10/topics/i18n/ 119 | 120 | LANGUAGE_CODE = 'en-us' 121 | 122 | TIME_ZONE = 'UTC' 123 | 124 | USE_I18N = True 125 | 126 | USE_L10N = True 127 | 128 | USE_TZ = True 129 | 130 | 131 | # Static files (CSS, JavaScript, Images) 132 | # https://docs.djangoproject.com/en/1.10/howto/static-files/ 133 | 134 | STATIC_URL = '/static/' 135 | -------------------------------------------------------------------------------- /Jpider/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/Jpider/settings.pyc -------------------------------------------------------------------------------- /Jpider/urls.py: -------------------------------------------------------------------------------- 1 | """Jpider URL Configuration 2 | 3 | The `urlpatterns` list routes URLs to views. For more information please see: 4 | https://docs.djangoproject.com/en/1.10/topics/http/urls/ 5 | Examples: 6 | Function views 7 | 1. Add an import: from my_app import views 8 | 2. Add a URL to urlpatterns: url(r'^$', views.home, name='home') 9 | Class-based views 10 | 1. Add an import: from other_app.views import Home 11 | 2. Add a URL to urlpatterns: url(r'^$', Home.as_view(), name='home') 12 | Including another URLconf 13 | 1. Import the include() function: from django.conf.urls import url, include 14 | 2. Add a URL to urlpatterns: url(r'^blog/', include('blog.urls')) 15 | """ 16 | from django.conf.urls import url 17 | from django.contrib import admin 18 | 19 | urlpatterns = [ 20 | url(r'^admin/', admin.site.urls), 21 | ] 22 | -------------------------------------------------------------------------------- /Jpider/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for Jpider project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/1.10/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Jpider.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jpider 2 | 新浪微博、拉钩网、大众点评各种爬虫 3 | 4 | 项目模块比较凌乱 5 | 6 | 所有爬虫都在 spiders包下 7 | 8 | - baidurank 9 | > 百度百科的明星排行 10 | - dazongdianping 11 | > 大众点评店铺信息 12 | - dist_weibo_spider 13 | > pc端微博分布式爬虫,还未完成.... 
14 | - mzi 15 | > 妹子网 16 | - pangxieyg 17 | > 小众电商抢购爬虫 18 | - wechat_sport 19 | > 微信运动步数获取,目前只能获取自己的步数 20 | - weibo 21 | > 移动端微博爬虫,多线程 22 | 23 | 所有爬虫的ORM用的是django自带的ORM 24 | 25 | - usage 26 | > 一些库的使用示例 27 | 28 | -------------------------------------------------------------------------------- /conf/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root, simpleLogger 3 | 4 | [handlers] 5 | keys=consoleHandler,fileHandler,rotatingFileHandler 6 | 7 | [formatters] 8 | keys=simpleFmt 9 | 10 | [logger_root] 11 | level=DEBUG 12 | handlers=rotatingFileHandler 13 | 14 | [logger_simpleLogger] 15 | level=DEBUG 16 | handlers=consoleHandler,rotatingFileHandler 17 | qualname=simpleLogger 18 | propagate=0 19 | 20 | [handler_consoleHandler] 21 | class=StreamHandler 22 | level=DEBUG 23 | formatter=simpleFmt 24 | args=(sys.stdout,) 25 | 26 | [handler_fileHandler] 27 | class=FileHandler 28 | level=DEBUG 29 | formatter=simpleFmt 30 | args=('../../log/jpider.log','a') 31 | 32 | [handler_rotatingFileHandler] 33 | class=handlers.RotatingFileHandler 34 | level=DEBUG 35 | formatter=simpleFmt 36 | args=('../../log/jpider.log','a',50*1024*1024, 10) 37 | 38 | [formatter_simpleFmt] 39 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 40 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /funny/1111.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | 4 | def is_perfect(input_str): 5 | found = False 6 | for index in range(len(input_str)-3): 7 | if not found: 8 | if input_str[index] == input_str[index+1]: 9 | if input_str[index+2] == input_str[index+3]: 10 | found = True 11 | else: 12 | return False 13 | else: 14 | if input_str[index] == input_str[index+1]: 15 | return False 16 | else: 17 | return found 18 | 19 | s = input() 20 | all_perm = set() 21 | for i in itertools.permutations(s, len(s)): 22 | print(i) 23 | all_perm.add(''.join(list(i))) 24 | count = 0 25 | for i in all_perm: 26 | if is_perfect(i): 27 | print(i) 28 | count += 1 29 | print(count) 30 | -------------------------------------------------------------------------------- /funny/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/funny/__init__.py -------------------------------------------------------------------------------- /funny/add#.py: -------------------------------------------------------------------------------- 1 | def try_expect_finally(): 2 | try: 3 | 1/0 4 | except Exception: 5 | print 'e' 6 | finally: 7 | print 'finally' 8 | try_expect_finally() 9 | -------------------------------------------------------------------------------- /funny/alert.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | type=15 3 | product=test 4 | account=xxxxxx@corp.netease.com 5 | mobile=134xxxxx520 6 | subject=alert send by python 7 | -------------------------------------------------------------------------------- /funny/ambari_yinxin_popo_alert.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | __doc__ = """ 3 | ambari alert by yixin & popo 4 | """ 5 | import sys 6 | import requests 7 | import ConfigParser 8 | 9 | netease_alert_url = 'https://xxxx/omnew/alert/sendMultiAlert' 10 | conf_path = 
'./alert.conf' 11 | 12 | 13 | def send_alert(data): 14 | if isinstance(data, dict): 15 | requests.post(url=netease_alert_url, json=data) 16 | 17 | 18 | def main(): 19 | cf = ConfigParser.ConfigParser() 20 | cf.read(conf_path) 21 | data = {} 22 | type = int(cf.get('main', 'type')) 23 | product = cf.get('main', 'product') 24 | data['product'] = product 25 | data['type'] = type 26 | # get message from ambari alert 27 | definitionName = sys.argv[1] 28 | definitionLabel = sys.argv[2] 29 | serviceName = sys.argv[3] 30 | alertState = sys.argv[4] 31 | alertText = sys.argv[5] 32 | alert_msg = "definitionName:%s\ndefinitionLabel:%s\nserviceName:%s\nalertState:%s\nalertText:%s\n" % \ 33 | (definitionName, definitionLabel, serviceName, alertState, alertText) 34 | 35 | if type >> 3 == 1: # send by yixin 36 | type &= 7 37 | mobile = cf.get('main', 'mobile') 38 | data['mobile'] = mobile 39 | data['yixinMsg'] = alert_msg 40 | 41 | if type >> 2 == 1: # duanxin 42 | type &= 3 43 | mobile = cf.get('main', 'mobile') 44 | data['mobile'] = mobile 45 | data['mobileMsg'] = alert_msg 46 | 47 | if type >> 1 == 1: # email 48 | type &= 1 49 | account = cf.get('main', 'account') 50 | data['account'] = account 51 | data['emailMsg'] = alert_msg 52 | data['subject'] = cf.get('main', 'subject') 53 | 54 | if type == 1: # popo 55 | data['account'] = cf.get('main', 'account') 56 | data['popoMsg'] = alert_msg 57 | 58 | send_alert(data=data) 59 | 60 | main() 61 | -------------------------------------------------------------------------------- /funny/exception_t.py: -------------------------------------------------------------------------------- 1 | import traceback 2 | 3 | def f(): 4 | try: 5 | 1/2 6 | return 7 | except Exception as e: 8 | print traceback.format_exc() 9 | finally: 10 | print 'finally' 11 | 12 | f() -------------------------------------------------------------------------------- /funny/execjs_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/funny/execjs_test/__init__.py -------------------------------------------------------------------------------- /funny/execjs_test/exec_js.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | 3 | print(execjs.eval("'1 2 3'.split()")) 4 | ctx = execjs.compile(""" 5 | function add(x, y){ 6 | return x + y; 7 | } 8 | """) 9 | print(ctx.call('add', 1, 2)) 10 | -------------------------------------------------------------------------------- /funny/funny_comment: -------------------------------------------------------------------------------- 1 | 2 | ,_-=(!7(7/zs_. 3 | .=' ' .`/,/!(=)Zm. 4 | .._,,._.. ,-`- `,\ ` -` -`\\7//WW. 5 | ,v=~/.-,-\- -!|V-s.)iT-|s|\-.' `///mK%. 6 | v!`i!-.e]-g`bT/i(/[=.Z/m)K(YNYi.. /-]i44M. 7 | v`/,`|v]-DvLcfZ/eV/iDLN\D/ZK@%8W[Z.. `/d!Z8m 8 | //,c\(2(X/NYNY8]ZZ/bZd\()/\7WY%WKKW) -'|(][%4. 9 | ,\\i\c(e)WX@WKKZKDKWMZ8(b5/ZK8]Z7%ffVM, -.Y!bNMi 10 | /-iit5N)KWG%%8%%%%W8%ZWM(8YZvD)XN(@. [ \]!/GXW[ 11 | / ))G8\NMN%W%%%%%%%%%%8KK@WZKYK*ZG5KMi,- vi[NZGM[ 12 | i\!(44Y8K%8%%%**~YZYZ@%%%%%4KWZ/PKN)ZDZ7 c=//WZK%! 13 | ,\v\YtMZW8W%%f`,`.t/bNZZK%%W%%ZXb*K(K5DZ -c\\/KM48 14 | -|c5PbM4DDW%f v./c\[tMY8W%PMW%D@KW)Gbf -/(=ZZKM8[ 15 | 2(N8YXWK85@K -'c|K4/KKK%@ V%@@WD8e~ .//ct)8ZK%8` 16 | =)b%]Nd)@KM[ !'\cG!iWYK%%| !M@KZf -c\))ZDKW%` 17 | YYKWZGNM4/Pb '-VscP4]b@W% 'Mf` -L\///KM(%W! 18 | !KKW4ZK/W7)Z. 
'/cttbY)DKW% -` .',\v)K(5KW%%f 19 | 'W)KWKZZg)Z2/,!/L(-DYYb54% ,,`, -\-/v(((KK5WW%f 20 | \M4NDDKZZ(e!/\7vNTtZd)8\Mi!\-,-/i-v((tKNGN%W%% 21 | 'M8M88(Zd))///((|D\tDY\\KK-`/-i(=)KtNNN@W%%%@%[ 22 | !8%@KW5KKN4///s(\Pd!ROBY8/=2(/4ZdzKD%K%%%M8@%% 23 | '%%%W%dGNtPK(c\/2\[Z(ttNYZ2NZW8W8K%%%%YKM%M%%. 24 | *%%W%GW5@/%!e]_tZdY()v)ZXMZW%W%%%*5Y]K%ZK%8[ 25 | '*%%%%8%8WK\)[/ZmZ/Zi]!/M%%%%@f\ \Y/NNMK%%! 26 | 'VM%%%%W%WN5Z/Gt5/b)((cV@f` - |cZbMKW%%| 27 | 'V*M%%%WZ/ZG\t5((+)L\'-,,/ -)X(NWW%% 28 | `~`MZ/DZGNZG5(((\, ,t\\Z)KW%@ 29 | 'M8K%8GN8\5(5///]i!v\K)85W%%f 30 | YWWKKKKWZ8G54X/GGMeK@WM8%@ 31 | !M8%8%48WG@KWYbW%WWW%%%@ 32 | VM%WKWK%8K%%8WWWW%%%@` 33 | ~*%%%%%%W%%%%%%%@~ 34 | ~*MM%%%%%%@f` 35 | ''''' 36 | 37 | -------------------------------------------------------------------------------- /funny/funny_comment#: -------------------------------------------------------------------------------- 1 | # 2 | # ,_-=(!7(7/zs_. 3 | # .=' ' .`/,/!(=)Zm. 4 | # .._,,._.. ,-`- `,\ ` -` -`\\7//WW. 5 | # ,v=~/.-,-\- -!|V-s.)iT-|s|\-.' `///mK%. 6 | # v!`i!-.e]-g`bT/i(/[=.Z/m)K(YNYi.. /-]i44M. 7 | # v`/,`|v]-DvLcfZ/eV/iDLN\D/ZK@%8W[Z.. `/d!Z8m 8 | # //,c\(2(X/NYNY8]ZZ/bZd\()/\7WY%WKKW) -'|(][%4. 9 | # ,\\i\c(e)WX@WKKZKDKWMZ8(b5/ZK8]Z7%ffVM, -.Y!bNMi 10 | # /-iit5N)KWG%%8%%%%W8%ZWM(8YZvD)XN(@. [ \]!/GXW[ 11 | # / ))G8\NMN%W%%%%%%%%%%8KK@WZKYK*ZG5KMi,- vi[NZGM[ 12 | # i\!(44Y8K%8%%%**~YZYZ@%%%%%4KWZ/PKN)ZDZ7 c=//WZK%! 13 | # ,\v\YtMZW8W%%f`,`.t/bNZZK%%W%%ZXb*K(K5DZ -c\\/KM48 14 | # -|c5PbM4DDW%f v./c\[tMY8W%PMW%D@KW)Gbf -/(=ZZKM8[ 15 | # 2(N8YXWK85@K -'c|K4/KKK%@ V%@@WD8e~ .//ct)8ZK%8` 16 | # =)b%]Nd)@KM[ !'\cG!iWYK%%| !M@KZf -c\))ZDKW%` 17 | # YYKWZGNM4/Pb '-VscP4]b@W% 'Mf` -L\///KM(%W! 18 | # !KKW4ZK/W7)Z. '/cttbY)DKW% -` .',\v)K(5KW%%f 19 | # 'W)KWKZZg)Z2/,!/L(-DYYb54% ,,`, -\-/v(((KK5WW%f 20 | # \M4NDDKZZ(e!/\7vNTtZd)8\Mi!\-,-/i-v((tKNGN%W%% 21 | # 'M8M88(Zd))///((|D\tDY\\KK-`/-i(=)KtNNN@W%%%@%[ 22 | # !8%@KW5KKN4///s(\Pd!ROBY8/=2(/4ZdzKD%K%%%M8@%% 23 | # '%%%W%dGNtPK(c\/2\[Z(ttNYZ2NZW8W8K%%%%YKM%M%%. 24 | # *%%W%GW5@/%!e]_tZdY()v)ZXMZW%W%%%*5Y]K%ZK%8[ 25 | # '*%%%%8%8WK\)[/ZmZ/Zi]!/M%%%%@f\ \Y/NNMK%%! 
26 | # 'VM%%%%W%WN5Z/Gt5/b)((cV@f` - |cZbMKW%%| 27 | # 'V*M%%%WZ/ZG\t5((+)L\'-,,/ -)X(NWW%% 28 | # `~`MZ/DZGNZG5(((\, ,t\\Z)KW%@ 29 | # 'M8K%8GN8\5(5///]i!v\K)85W%%f 30 | # YWWKKKKWZ8G54X/GGMeK@WM8%@ 31 | # !M8%8%48WG@KWYbW%WWW%%%@ 32 | # VM%WKWK%8K%%8WWWW%%%@` 33 | # ~*%%%%%%W%%%%%%%@~ 34 | # ~*MM%%%%%%@f` 35 | # ''''' 36 | # 37 | # -------------------------------------------------------------------------------- /funny/info.conf: -------------------------------------------------------------------------------- 1 | [user] 2 | username=13486178520 3 | password=vs7452014 4 | goods=104808 130459 5 | 6 | -------------------------------------------------------------------------------- /funny/readconf.py: -------------------------------------------------------------------------------- 1 | import ConfigParser 2 | 3 | 4 | cf = ConfigParser.ConfigParser() 5 | cf.read(r'.\info.conf') 6 | sections = cf.sections() 7 | username = cf.get('user', 'username') 8 | password = cf.get('user', 'password') 9 | goods = '|'.join(cf.get('user', 'goods').split(' ')) 10 | print username, password, goods 11 | -------------------------------------------------------------------------------- /funny/wx.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from wxpy import * 3 | from wechat_sender import * 4 | 5 | bot = Bot() 6 | my_friend = bot.friends().search('jopper')[0] 7 | my_friend.send('hello') 8 | group = bot.groups().search('Team of single dogs')[0] 9 | group.send('send from python, for test\n zhujiajunup@163.com') 10 | 11 | -------------------------------------------------------------------------------- /log/jpider.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/log/jpider.log -------------------------------------------------------------------------------- /manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import sys 4 | 5 | if __name__ == "__main__": 6 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "Jpider.settings") 7 | try: 8 | from django.core.management import execute_from_command_line 9 | except ImportError: 10 | # The above import may fail for some other reason. Ensure that the 11 | # issue is really that Django is missing to avoid masking other 12 | # exceptions on Python 2. 13 | try: 14 | import django 15 | except ImportError: 16 | raise ImportError( 17 | "Couldn't import Django. Are you sure it's installed and " 18 | "available on your PYTHONPATH environment variable? Did you " 19 | "forget to activate a virtual environment?" 
20 | ) 21 | raise 22 | execute_from_command_line(sys.argv) 23 | -------------------------------------------------------------------------------- /myutil/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/myutil/__init__.py -------------------------------------------------------------------------------- /myutil/email2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import smtplib 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.text import MIMEText 6 | 7 | SERVER = 'smtp.163.com' 8 | FROM = 'jjzhu_ncu@163.com' 9 | TO = ['jjzhu_zju@163.com'] 10 | 11 | SUBJECT = u'测试UTF8编码' 12 | TEXT = u'ABCDEFG一二三四五六七' 13 | 14 | msg = MIMEMultipart('alternative') 15 | # 注意包含了非ASCII字符,需要使用unicode 16 | msg['Subject'] = SUBJECT 17 | msg['From'] = FROM 18 | msg['To'] = ', '.join(TO) 19 | part = MIMEText(TEXT, 'plain', 'utf-8') 20 | msg.attach(part) 21 | 22 | server = smtplib.SMTP(SERVER, port=25) 23 | server.login(FROM, 'jvs7452014') 24 | server.sendmail(FROM, TO, msg.as_string().encode('ascii')) 25 | server.quit() 26 | -------------------------------------------------------------------------------- /myutil/myemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import email.mime.multipart 3 | import email.mime.text 4 | 5 | 6 | class Email(object): 7 | content_from = None 8 | content_to = None 9 | content_subject = None 10 | content_msg = None 11 | content_pwd = None 12 | 13 | def send_163(self): 14 | assert self.content_from is not None 15 | assert self.content_to is not None 16 | assert self.content_pwd is not None 17 | msg = email.mime.multipart.MIMEMultipart() 18 | msg['from'] = self.content_from 19 | msg['to'] = self.content_to 20 | msg['subject'] = self.content_subject 21 | txt = email.mime.text.MIMEText(self.content_msg, 'plain', 'utf-8') 22 | msg.attach(txt) 23 | smtp = smtplib.SMTP(host='smtp.163.com', port=25) 24 | 25 | smtp.login(self.content_from, self.content_pwd) 26 | smtp.sendmail(self.content_from, self.content_to, str(msg)) 27 | smtp.quit() 28 | 29 | 30 | def send_email(subject, msg): 31 | e = Email() 32 | e.content_from = 'jjzhu_ncu@163.com' 33 | e.content_to = '767543579@qq.com' 34 | e.content_pwd = 'xxxx' 35 | e.content_subject = 'hello world' 36 | e.content_msg = 'hello word' 37 | e.send_163() 38 | 39 | if __name__ == '__main__': 40 | send_email('', '') 41 | -------------------------------------------------------------------------------- /output/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('/../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /output/all-data-2017_04_27.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/output/all-data-2017_04_27.xls -------------------------------------------------------------------------------- /output/all-data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/output/all-data.xls 
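The three-line preamble in output/__init__.py and output/baidu.py is the standalone Django-ORM bootstrap the whole repository relies on (the README notes every spider stores data through Django's built-in ORM): put the repository root on sys.path, point DJANGO_SETTINGS_MODULE at Jpider.settings, then call django.setup() before importing any model. Below is a minimal sketch of that pattern that resolves the root from __file__ instead of a relative path; note the relative '../Jpider' used in baidu.py resolves correctly only when the checkout is named Jpider and the script is launched from the repository root, so the __file__-based root here is an assumption, not the repository's own code.

# Sketch: standalone Django ORM bootstrap for a script living in output/.
# Assumes the layout shown above: <repo root>/Jpider/settings.py and <repo root>/spiders/models.py.
import os
import sys

import django

REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # output/.. -> repo root
sys.path.append(REPO_ROOT)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Jpider.settings')
django.setup()  # configure installed apps before any model import

from spiders.models import BaiKeRank  # must come after django.setup()

for row in BaiKeRank.objects.all().order_by('rank'):
    print(row)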
-------------------------------------------------------------------------------- /output/baidu.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | import django.db.models 5 | sys.path.append('../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() 8 | 9 | from spiders.models import BaiKeRank 10 | 11 | ranks = BaiKeRank.objects.all().order_by('rank') 12 | for r in ranks: 13 | print(r) 14 | 15 | -------------------------------------------------------------------------------- /output/dazhongdianping.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import os 4 | import django 5 | import django.db.models 6 | sys.path.append('../Jpider') 7 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 8 | django.setup() 9 | 10 | from spiders.models import ShopInfo, ReviewDedail, ShopId 11 | 12 | import xlwt 13 | 14 | 15 | # 'http://www.dianping.com/search/category/2/10/g110', # 北京火锅 16 | # 'http://www.dianping.com/search/category/2/10/g107', # 北京台湾菜 17 | # 'http://www.dianping.com/search/category/2/10/g112', # 北京小吃快餐 18 | # 'http://www.dianping.com/search/category/2/10/g250', # 北京创意菜 19 | # 'http://www.dianping.com/search/category/2/10/g116', # 北京西餐 20 | # 'http://www.dianping.com/search/category/2/10/g113', # 北京日本菜 21 | # 'http://www.dianping.com/search/category/2/10/g103', # 北京粤菜 22 | # 'http://www.dianping.com/search/category/2/10/g115', # 北京东南亚菜 23 | # 'http://www.dianping.com/search/category/2/10/g102', # 北京川菜 24 | # 'http://www.dianping.com/search/category/1/10/g113', # 上海日本菜??? 25 | # 'http://www.dianping.com/search/category/1/10/g110', # 上海火锅 26 | # 'http://www.dianping.com/search/category/1/10/g107', # 上海台湾菜 27 | # 'http://www.dianping.com/search/category/1/10/g103', # 上海粤菜 28 | # 'http://www.dianping.com/search/category/1/10/g102', # 上海川菜 29 | # 'http://www.dianping.com/search/category/1/10/g112', # 上海小吃快餐 30 | # 'http://www.dianping.com/search/category/1/10/g115', # 上海东南亚菜 31 | # 'http://www.dianping.com/search/category/1/10/g116', # 上海西餐 32 | 33 | category_dict = {'g110':'火锅', 'g107':'台湾菜', 'g112':'小吃快餐', 'g250': '创意菜', 34 | 'g116': '西餐', 'g113': '日本菜', 'g103': '粤菜', 'g115': '东南亚菜', 'g102': '川菜'} 35 | 36 | rank_star_dict = { 37 | '五星商户': 5, 38 | '准五星商户':4.5, 39 | '四星商户': 4, 40 | '准四星商户': 3.5, 41 | '三星商户': 3, 42 | '准三星商户': 2.5, 43 | '二星商户': 2, 44 | '准二星商户': 1.5, 45 | '一星商户': 1, 46 | '准一星商户': 0.5, 47 | '该商户暂无星级': 0, 48 | '': '无' 49 | } 50 | 51 | 52 | workbook = xlwt.Workbook() 53 | sheet = workbook.add_sheet('dazongdianping',cell_overwrite_ok=True) 54 | title = ['餐厅id','城市', '餐厅名称', '餐厅地点', '餐厅地址', '餐厅类别', '人均价格', '是否参加营销活动', '营业时间', '点评数量', 55 | '总体评分', '口味评分', '环境评分', '服务评分', '五星', '四星', '三星', '二星', '一星', '第一条评论时间'] 56 | for i in range(len(title)): 57 | sheet.write(0, i, title[i] ) 58 | 59 | shops = ShopInfo.objects.all() 60 | 61 | result_dic = {} 62 | 63 | for j in range(1, len(shops)+1): 64 | shop = shops[j-1] 65 | info_list = [] 66 | info_list.append(str(shop.shop_id)) # id 67 | print(shop.shop_id) 68 | try: 69 | url = ShopId.objects.get(pk=shop.shop_id).from_url 70 | except ShopId.DoesNotExist: 71 | continue 72 | if url is None: 73 | continue 74 | city_no = url.split('/')[-3] 75 | city = '北京' if city_no == '2' else '上海' 76 | info_list.append(city) 77 | category = category_dict[url.split('/')[-1][:4]] 78 | info_list.append(shop.shop_name) 79 | info_list.append(shop.place if shop.place is not None else '') 80 
| info_list.append(shop.address if shop.address is not None else '') 81 | info_list.append(category) 82 | avg_price = shop.avg_price.split(':')[1] 83 | if len(avg_price) != 1: 84 | avg_price = avg_price[:-1] 85 | 86 | info_list.append(avg_price ) 87 | features = shop.feature2.split(';') 88 | print(features) 89 | f_l = [] 90 | for f in features: 91 | if f == 'huo': 92 | print('活动') 93 | f_l.append('活动') 94 | elif f == 'ka': 95 | print('会员卡') 96 | f_l.append('会员卡') 97 | else: 98 | f_l.append(f) 99 | info_list.append(';'.join(f_l)) 100 | f_l.clear() 101 | info_list.append(shop.open_time.replace('\t', ' ').replace('\r','').replace('\n', ';') if shop.open_time is not None else '') 102 | info_list.append(shop.review_count[:-3]) 103 | info_list.append(rank_star_dict[shop.rank_star]) 104 | info_list.append(shop.taste.split(':')[1]) 105 | info_list.append(shop.env.split(':')[1]) 106 | info_list.append(shop.service.split(':')[1]) 107 | 108 | review = ReviewDedail.objects.get(pk=shop.shop_id) 109 | info_list.append(review.star_5) 110 | info_list.append(review.star_4) 111 | info_list.append(review.star_3) 112 | info_list.append(review.star_2) 113 | info_list.append(review.star_1) 114 | if review.first_review_time is not None: 115 | f_r_t = review.first_review_time.split('\xa0')[0] 116 | if len(f_r_t) == 5: 117 | f_r_t = '2017-'+f_r_t 118 | else: 119 | f_r_t = '20'+f_r_t 120 | info_list.append(f_r_t) 121 | else: 122 | info_list.append('') 123 | for i in range(len(info_list)): 124 | if info_list[i] is None: 125 | info_list[i] = ' ' 126 | # 'http://www.dianping.com/search/category/2/10/g110', # 北京火锅 127 | # 'http://www.dianping.com/search/category/2/10/g107', # 北京台湾菜 128 | # 'http://www.dianping.com/search/category/2/10/g112', # 北京小吃快餐 129 | # 'http://www.dianping.com/search/category/2/10/g250', # 北京创意菜 130 | # 'http://www.dianping.com/search/category/2/10/g116', # 北京西餐 131 | # 'http://www.dianping.com/search/category/2/10/g113', # 北京日本菜 132 | # 'http://www.dianping.com/search/category/2/10/g103', # 北京粤菜 133 | # 'http://www.dianping.com/search/category/2/10/g115', # 北京东南亚菜 134 | # 'http://www.dianping.com/search/category/2/10/g102', # 北京川菜 135 | # 'http://www.dianping.com/search/category/1/10/g113', # 上海日本菜??? 
136 | # 'http://www.dianping.com/search/category/1/10/g110', # 上海火锅 137 | # 'http://www.dianping.com/search/category/1/10/g107', # 上海台湾菜 138 | # 'http://www.dianping.com/search/category/1/10/g103', # 上海粤菜 139 | # 'http://www.dianping.com/search/category/1/10/g102', # 上海川菜 140 | # 'http://www.dianping.com/search/category/1/10/g112', # 上海小吃快餐 141 | # 'http://www.dianping.com/search/category/1/10/g115', # 上海东南亚菜 142 | # 'http://www.dianping.com/search/category/1/10/g116', # 上海西餐 143 | li = result_dic.get(city+'_'+category, []) 144 | li.append(info_list.copy()) 145 | result_dic[city+'_'+category] = li 146 | # file = open('/Users/didi/crawler/output/%s_%s.txt' % (city, category), 'a') 147 | # 148 | # file.write('\t'.join([str(i) for i in info_list])+'\n') 149 | # file.close() 150 | # print(info_list) 151 | info_list.clear() 152 | 153 | book = xlwt.Workbook() 154 | for city_cate, infos in result_dic.items(): 155 | sheet = book.add_sheet(city_cate) 156 | for i in range(len(title)): 157 | sheet.write(0, i, title[i]) 158 | for i in range(1, len(infos)): 159 | for j in range(len(infos[i])): 160 | sheet.write(i, j, infos[i][j]) 161 | import datetime 162 | 163 | book.save('./all-data-'+ datetime.datetime.now().strftime('%Y_%m_%d')+'.xls') -------------------------------------------------------------------------------- /spiders/Sina_spider1/Begin.py: -------------------------------------------------------------------------------- 1 | from scrapy import cmdline 2 | 3 | cmdline.execute("scrapy crawl sinaSpider".split()) 4 | import requests 5 | requests.post() 6 | # import yaml 7 | # f = open('./Sina_spider1/conf/weibo.yaml') 8 | # print yaml.load(f) -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/__init__.py -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/__init__.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/conf.py -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/conf/weibo.yaml: -------------------------------------------------------------------------------- 1 | accounts: 2 | - user: 'jjzhu_zju@163.com' 3 | password: '***' 4 | users: 5 | - 2210643391 6 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/constant.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | REPOST = 0 # 转发 3 | ORIGINAL = 1 # 原创 4 | LIKE = 2 # 点赞 5 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/cookies.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 
3 | import base64 4 | import requests 5 | import sys 6 | import time 7 | import json 8 | from selenium import webdriver 9 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 10 | from selenium.common.exceptions import NoSuchElementException 11 | import logging 12 | from settings import PROPERTIES 13 | from yumdama import identify 14 | import traceback 15 | reload(sys) 16 | sys.setdefaultencoding('utf8') 17 | IDENTIFY = 1 # 验证码输入方式: 1:看截图aa.png,手动输入 2:云打码 18 | # 0 代表从https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18) 获取cookie 19 | # 1 代表从https://weibo.cn/login/获取Cookie 20 | COOKIE_GETWAY = 0 21 | dcap = dict(DesiredCapabilities.PHANTOMJS) # PhantomJS需要使用老版手机的user-agent,不然验证码会无法通过 22 | dcap["phantomjs.page.settings.userAgent"] = ( 23 | "Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1" 24 | ) 25 | logger = logging.getLogger(__name__) 26 | logging.getLogger("selenium").setLevel(logging.WARNING) # 将selenium的日志级别设成WARNING,太烦人 27 | 28 | 29 | def getCookie(account, password): 30 | if COOKIE_GETWAY == 0: 31 | return get_cookie_from_login_sina_com_cn(account, password) 32 | elif COOKIE_GETWAY == 1: 33 | return get_cookie_from_weibo_cn(account, password) 34 | else: 35 | logger.error("COOKIE_GETWAY Error!") 36 | 37 | 38 | def get_cookie_from_login_sina_com_cn(account, password): 39 | """ 获取一个账号的Cookie """ 40 | loginURL = "https://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)" 41 | username = base64.b64encode(account.encode("utf-8")).decode("utf-8") 42 | postData = { 43 | "entry": "sso", 44 | "gateway": "1", 45 | "from": "null", 46 | "savestate": "30", 47 | "useticket": "0", 48 | "pagerefer": "", 49 | "vsnf": "1", 50 | "su": username, 51 | "service": "sso", 52 | "sp": password, 53 | "sr": "1440*900", 54 | "encoding": "UTF-8", 55 | "cdult": "3", 56 | "domain": "sina.com.cn", 57 | "prelt": "0", 58 | "returntype": "TEXT", 59 | } 60 | session = requests.Session() 61 | r = session.post(loginURL, data=postData) 62 | jsonStr = r.content.decode("gbk") 63 | info = json.loads(jsonStr) 64 | if info["retcode"] == "0": 65 | logger.warning("Get Cookie Success!( Account:%s )" % account) 66 | cookie = session.cookies.get_dict() 67 | print cookie 68 | return json.dumps(cookie) 69 | else: 70 | logger.warning("Failed!( Reason:%s )" % info["reason"]) 71 | return "" 72 | 73 | 74 | def get_cookie_from_weibo_cn(account, password): 75 | """ 获取一个账号的Cookie """ 76 | try: 77 | browser = webdriver.PhantomJS(desired_capabilities=dcap) 78 | browser.get("https://weibo.cn/login/") 79 | time.sleep(1) 80 | 81 | failure = 0 82 | while "微博" in browser.title and failure < 5: 83 | failure += 1 84 | browser.save_screenshot("aa.png") 85 | print browser.w3c 86 | username = browser.find_element_by_id("loginName") 87 | username.clear() 88 | username.send_keys(account) 89 | 90 | psd = browser.find_element_by_xpath('//input[@type="password"]') 91 | psd.clear() 92 | psd.send_keys(password) 93 | try: 94 | code = browser.find_element_by_name("loginVCode") 95 | code.clear() 96 | if IDENTIFY == 1: 97 | code_txt = raw_input("请查看路径下新生成的aa.png,然后输入验证码:") # 手动输入验证码 98 | else: 99 | from PIL import Image 100 | img = browser.find_element_by_xpath('//form[@method="post"]/div/img[@alt="请打开图片显示"]') 101 | x = img.location["x"] 102 | y = img.location["y"] 103 | im = Image.open("aa.png") 104 | im.crop((x, y, 100 + x, y + 22)).save("ab.png") # 剪切出验证码 105 | code_txt = identify() # 验证码打码平台识别 106 | code.send_keys(code_txt) 
107 | except NoSuchElementException, e: 108 | print e 109 | pass 110 | 111 | commit = browser.find_element_by_id("loginAction") 112 | commit.click() 113 | time.sleep(3) 114 | # print browser.title 115 | # print browser.page_source 116 | # if "手机新浪网" not in browser.title: 117 | # time.sleep(4) 118 | # if '未激活微博' in browser.page_source: 119 | # print '账号未开通微博' 120 | # return {} 121 | 122 | cookie = {} 123 | browser.get("https://weibo.cn") 124 | # if "我的首页" in browser.title: 125 | for elem in browser.get_cookies(): 126 | cookie[elem["name"]] = elem["value"] 127 | logger.info("Get Cookie Success!( Account:%s )" % account) 128 | return json.dumps(cookie) 129 | except Exception, e: 130 | logger.warning("Failed %s!" % account) 131 | traceback.print_exc() 132 | return "" 133 | finally: 134 | try: 135 | browser.quit() 136 | except Exception, e: 137 | pass 138 | 139 | 140 | def getCookies(weibo): 141 | """ 获取Cookies """ 142 | cookies = [] 143 | for elem in weibo: 144 | account = elem['user'] 145 | password = elem['password'] 146 | print account, password 147 | cookie = getCookie(account, password) 148 | if cookie is not None and cookie != '': 149 | print '-' * 10 150 | print cookie 151 | print '-' * 10 152 | if isinstance(cookie, str): 153 | cookies.append(eval(cookie)) 154 | elif isinstance(cookie, dict): 155 | cookies.append(cookie) 156 | else: 157 | raise "unsupported type[%s] of cookie[%s]" % (type(cookie), cookie) 158 | 159 | return cookies 160 | cookies = getCookies(PROPERTIES['accounts']) 161 | logger.warning("Get Cookies Finish!( Num:%d)" % len(cookies)) 162 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/cookies.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/cookies.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/items.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | 3 | from scrapy import Item, Field 4 | 5 | 6 | class InformationItem(Item): 7 | """ 个人信息 """ 8 | _id = Field() # 用户ID 9 | NickName = Field() # 昵称 10 | Gender = Field() # 性别 11 | Province = Field() # 所在省 12 | City = Field() # 所在城市 13 | Signature = Field() # 个性签名 14 | Birthday = Field() # 生日 15 | Num_Tweets = Field() # 微博数 16 | Num_Follows = Field() # 关注数 17 | Num_Fans = Field() # 粉丝数 18 | Sex_Orientation = Field() # 性取向 19 | Marriage = Field() # 婚姻状况 20 | URL = Field() # 首页链接 21 | 22 | 23 | class FlagItem(Item): 24 | weibo_id = Field() 25 | 26 | 27 | class CommentItem(Item): 28 | weibo_id = Field() 29 | id = Field() # 评论id 30 | user = Field() # 评论用户 31 | content = Field() # 评论内容 32 | source = Field() # 评论来源 33 | time = Field() # 评论发表时间 34 | 35 | def __str__(self): 36 | return self['user'] + "\t"+self['content']+"...\t"+self['time'] 37 | 38 | 39 | class TweetsItem(Item): 40 | """ 微博信息 """ 41 | _id = Field() # 用户ID-微博ID 42 | ID = Field() # 用户ID 43 | Content = Field() # 微博内容 44 | PubTime = Field() # 发表时间 45 | Coordinates = Field() # 定位坐标 46 | Tools = Field() # 发表工具/平台 47 | Like = Field() # 点赞数 48 | Comment = Field() # 评论数 49 | Transfer = Field() # 转载数 50 | Type = Field() # 类型 转发|原创|点赞 51 | 52 | def __str__(self): 53 | return '--------------------------------------------------------------------------\n' \ 54 | '|\t用户\t|\t\t微博\t\t|\t来源\t|\t发布时间\t|\t微博id\t|\n' \ 55 | 
'------------------------------------------------------------------------------\n' \ 56 | '|%s\t|\t%s\t|\t%s\t|\t%s\t|\t%s\t|\n' \ 57 | '------------------------------------------------------------------------------\n'\ 58 | % (self["ID"], self["Content"][:20], self["Tools"] if 'Tools' in self else '', self['PubTime'], self['_id']) 59 | 60 | 61 | 62 | class FollowsItem(Item): 63 | """ 关注人列表 """ 64 | _id = Field() # 用户ID 65 | follows = Field() # 关注 66 | 67 | 68 | class FansItem(Item): 69 | """ 粉丝列表 """ 70 | _id = Field() # 用户ID 71 | fans = Field() # 粉丝 72 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/items.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/middleware.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import random 3 | from cookies import cookies 4 | from user_agents import agents 5 | import logging 6 | import re 7 | 8 | class UserAgentMiddleware(object): 9 | """ 换User-Agent """ 10 | 11 | def process_request(self, request, spider): 12 | agent = random.choice(agents) 13 | request.headers["User-Agent"] = agent 14 | 15 | 16 | class RefererMiddleware(object): 17 | page_pattern = re.compile('https://weibo.cn/(.*?)\?page=(\d+)') 18 | 19 | def process_request(self, request, spider): 20 | 21 | if 'Referer' in request.headers: 22 | page_result = self.page_pattern.findall(request.url) 23 | if len(page_result) == 1: 24 | curr_page = int(page_result[0][1]) - 1 25 | request.headers['Referer'] = 'https://weibo.cn/%s?page=%d' % (page_result[0][0], curr_page) 26 | print request.url 27 | print request.headers 28 | print request.cookies 29 | print request.headers['Referer'] if 'Referer' in request.headers else '' 30 | 31 | 32 | class CookiesMiddleware(object): 33 | """ 换Cookie """ 34 | 35 | def process_request(self, request, spider): 36 | cookie = random.choice(cookies) 37 | logging.info("use cookie %s" % cookie) 38 | request.cookies = cookie 39 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/middleware.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/middleware.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/pipelines.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import os 3 | from items import CommentItem, TweetsItem, FlagItem, FansItem 4 | 5 | 6 | class FilePipeline(object): 7 | FILE_CACHE = {} 8 | 9 | def process_item(self, item, spider): 10 | 11 | if isinstance(item, CommentItem): 12 | path = './' + item['weibo_id'] 13 | if not os.path.exists(path): 14 | os.makedirs(path) 15 | f = open(path + '/' + item['weibo_id'] + '.txt', 'a') 16 | self.FILE_CACHE[item['weibo_id']] = f 17 | f = self.FILE_CACHE[item['weibo_id']] 18 | f.write('%s\t%s\t%s\t%s\n' % ( 19 | item['user'], item['content'], item['source'] if 'source' in item else '', item['time'])) 20 | if isinstance(item, TweetsItem): 21 | path = './' + item['ID'] 22 
| if not os.path.exists(path): 23 | os.makedirs(path) 24 | if item['ID'] not in self.FILE_CACHE: 25 | f = open(path + '/' + item['ID'] + '.txt', 'a') 26 | self.FILE_CACHE[item['ID']] = f 27 | f = self.FILE_CACHE[item['ID']] 28 | f.write('%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (item['_id'], item['Content'], item['PubTime'], item['Tools'] if 'Tools' in item else '', 29 | item['Comment'], item['Like'], item['Transfer'])) 30 | f.flush() 31 | if isinstance(item, FlagItem): 32 | f = self.FILE_CACHE[item['weibo_id']] 33 | f.close() 34 | del self.FILE_CACHE[item['weibo_id']] 35 | 36 | # class MongoDBPipeline(object): 37 | # def __init__(self): 38 | # clinet = pymongo.MongoClient("localhost", 27017) 39 | # db = clinet["Sina"] 40 | # self.Information = db["Information"] 41 | # self.Tweets = db["Tweets"] 42 | # self.Follows = db["Follows"] 43 | # self.Fans = db["Fans"] 44 | # 45 | # def process_item(self, item, spider): 46 | # """ 判断item的类型,并作相应的处理,再入数据库 """ 47 | # if isinstance(item, InformationItem): 48 | # try: 49 | # self.Information.insert(dict(item)) 50 | # except Exception: 51 | # pass 52 | # elif isinstance(item, TweetsItem): 53 | # try: 54 | # self.Tweets.insert(dict(item)) 55 | # except Exception: 56 | # pass 57 | # elif isinstance(item, FollowsItem): 58 | # followsItems = dict(item) 59 | # follows = followsItems.pop("follows") 60 | # for i in range(len(follows)): 61 | # followsItems[str(i + 1)] = follows[i] 62 | # try: 63 | # self.Follows.insert(followsItems) 64 | # except Exception: 65 | # pass 66 | # elif isinstance(item, FansItem): 67 | # fansItems = dict(item) 68 | # fans = fansItems.pop("fans") 69 | # for i in range(len(fans)): 70 | # fansItems[str(i + 1)] = fans[i] 71 | # try: 72 | # self.Fans.insert(fansItems) 73 | # except Exception: 74 | # pass 75 | # return item 76 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/pipelines.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/settings.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import yaml 3 | import os 4 | import scrapy.core.scraper 5 | print os.path.split(os.path.realpath(__file__))[0] 6 | PROPERTIES = yaml.load(open(os.path.split(os.path.realpath(__file__))[0] + '/conf/weibo.yaml')) 7 | 8 | BOT_NAME = 'Sina_spider1' 9 | 10 | SPIDER_MODULES = ['Sina_spider1.spiders'] 11 | NEWSPIDER_MODULE = 'Sina_spider1.spiders' 12 | # HTTPCACHE_ENABLED = False 13 | DOWNLOADER_MIDDLEWARES = { 14 | "Sina_spider1.middleware.UserAgentMiddleware": 401, 15 | "Sina_spider1.middleware.CookiesMiddleware": 402, 16 | 'Sina_spider1.middleware.RefererMiddleware': 403, 17 | } 18 | 19 | ITEM_PIPELINES = { 20 | 'Sina_spider1.pipelines.FilePipeline': 300, 21 | } 22 | 23 | DOWNLOAD_DELAY = 1 # 间隔时间 24 | # CONCURRENT_ITEMS = 1000 25 | # CONCURRENT_REQUESTS = 100 26 | # REDIRECT_ENABLED = False 27 | # CONCURRENT_REQUESTS_PER_DOMAIN = 100 28 | # CONCURRENT_REQUESTS_PER_IP = 0 29 | # CONCURRENT_REQUESTS_PER_SPIDER=100 30 | # DNSCACHE_ENABLED = True 31 | # LOG_LEVEL = 'INFO' # 日志级别 32 | # CONCURRENT_REQUESTS = 70 33 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/settings.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/settings.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/spiders/__init__.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/spiders/spiders.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/spiders/spiders.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/user_agents.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/user_agents.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/yumdama.py: -------------------------------------------------------------------------------- 1 | # encoding=utf-8 2 | import httplib, mimetypes, urlparse, json, time 3 | 4 | ###################################################################### 5 | 6 | # 错误代码请查询 http://www.yundama.com/apidoc/YDM_ErrorCode.html 7 | # 所有函数请查询 http://www.yundama.com/apidoc 8 | 9 | # 1. http://www.yundama.com/index/reg/developer 注册开发者账号 10 | # 2. http://www.yundama.com/developer/myapp 添加新软件 11 | # 3. 使用添加的软件ID和密钥进行开发,享受丰厚分成 12 | 13 | # 用户名 14 | username = '' 15 | 16 | # 密码 17 | password = '' 18 | 19 | # 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得! 20 | appid = 1 21 | 22 | # 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得! 
23 | appkey = '22cc5376925e9387a23cf797cb9ba745' 24 | 25 | # 图片文件 26 | filename = 'ab.png' 27 | 28 | # 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html 29 | codetype = 1004 30 | 31 | # 超时时间,秒 32 | timeout = 60 33 | 34 | 35 | ###################################################################### 36 | 37 | class YDMHttp: 38 | apiurl = 'http://api.yundama.net:5678/api.php' 39 | 40 | username = '' 41 | password = '' 42 | appid = '' 43 | appkey = '' 44 | 45 | def __init__(self, username, password, appid, appkey): 46 | self.username = username 47 | self.password = password 48 | self.appid = str(appid) 49 | self.appkey = appkey 50 | 51 | def request(self, fields, files=[]): 52 | try: 53 | response = post_url(self.apiurl, fields, files) 54 | response = json.loads(response) 55 | except Exception as e: 56 | response = None 57 | return response 58 | 59 | def balance(self): 60 | data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid, 61 | 'appkey': self.appkey} 62 | response = self.request(data) 63 | if (response): 64 | if (response['ret'] and response['ret'] < 0): 65 | return response['ret'] 66 | else: 67 | return response['balance'] 68 | else: 69 | return -9001 70 | 71 | def login(self): 72 | data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid, 73 | 'appkey': self.appkey} 74 | response = self.request(data) 75 | if (response): 76 | if (response['ret'] and response['ret'] < 0): 77 | return response['ret'] 78 | else: 79 | return response['uid'] 80 | else: 81 | return -9001 82 | 83 | def upload(self, filename, codetype, timeout): 84 | data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid, 85 | 'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)} 86 | file = {'file': filename} 87 | response = self.request(data, file) 88 | if (response): 89 | if (response['ret'] and response['ret'] < 0): 90 | return response['ret'] 91 | else: 92 | return response['cid'] 93 | else: 94 | return -9001 95 | 96 | def result(self, cid): 97 | data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid, 98 | 'appkey': self.appkey, 'cid': str(cid)} 99 | response = self.request(data) 100 | return response and response['text'] or '' 101 | 102 | def decode(self, filename, codetype, timeout): 103 | cid = self.upload(filename, codetype, timeout) 104 | if (cid > 0): 105 | for i in range(0, timeout): 106 | result = self.result(cid) 107 | if (result != ''): 108 | return cid, result 109 | else: 110 | time.sleep(1) 111 | return -3003, '' 112 | else: 113 | return cid, '' 114 | 115 | 116 | ###################################################################### 117 | 118 | def post_url(url, fields, files=[]): 119 | urlparts = urlparse.urlsplit(url) 120 | return post_multipart(urlparts[1], urlparts[2], fields, files) 121 | 122 | 123 | def post_multipart(host, selector, fields, files): 124 | content_type, body = encode_multipart_formdata(fields, files) 125 | h = httplib.HTTP(host) 126 | h.putrequest('POST', selector) 127 | h.putheader('Host', host) 128 | h.putheader('Content-Type', content_type) 129 | h.putheader('Content-Length', str(len(body))) 130 | h.endheaders() 131 | h.send(body) 132 | errcode, errmsg, headers = h.getreply() 133 | return h.file.read() 134 | 135 | 136 | def encode_multipart_formdata(fields, files=[]): 137 | BOUNDARY = 'WebKitFormBoundaryJKrptX8yPbuAJLBQ' 138 | CRLF = '\r\n' 139 | 
L = [] 140 | for field in fields: 141 | key = field 142 | value = fields[key] 143 | L.append('--' + BOUNDARY) 144 | L.append('Content-Disposition: form-data; name="%s"' % key) 145 | L.append('') 146 | L.append(value) 147 | for field in files: 148 | key = field 149 | filepath = files[key] 150 | L.append('--' + BOUNDARY) 151 | L.append('Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filepath)) 152 | L.append('Content-Type: %s' % get_content_type(filepath)) 153 | L.append('') 154 | L.append(open(filepath, 'rb').read()) 155 | L.append('--' + BOUNDARY + '--') 156 | L.append('') 157 | body = CRLF.join(L) 158 | content_type = 'multipart/form-data; boundary=%s' % BOUNDARY 159 | return content_type, body 160 | 161 | 162 | def get_content_type(filename): 163 | return mimetypes.guess_type(filename)[0] or 'application/octet-stream' 164 | 165 | 166 | ###################################################################### 167 | 168 | 169 | def identify(): 170 | if (username == 'username'): 171 | print '请设置好相关参数再测试' 172 | else: 173 | # 初始化 174 | yundama = YDMHttp(username, password, appid, appkey) 175 | 176 | # 登陆云打码 177 | uid = yundama.login() 178 | # print 'uid: %s' % uid 179 | 180 | # 查询余额 181 | balance = yundama.balance() 182 | # print 'balance: %s' % balance 183 | 184 | # 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果 185 | cid, result = yundama.decode(filename, codetype, timeout) 186 | # print 'cid: %s, result: %s' % (cid, result) 187 | return result 188 | -------------------------------------------------------------------------------- /spiders/Sina_spider1/Sina_spider1/yumdama.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/Sina_spider1/yumdama.pyc -------------------------------------------------------------------------------- /spiders/Sina_spider1/aa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/Sina_spider1/aa.png -------------------------------------------------------------------------------- /spiders/Sina_spider1/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Sina_spider1.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Sina_spider1 12 | -------------------------------------------------------------------------------- /spiders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/__init__.py -------------------------------------------------------------------------------- /spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/__init__.pyc -------------------------------------------------------------------------------- /spiders/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
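Note: spiders/admin.py above is still the bare `startapp` stub. Purely as a hedged sketch (not part of the repo), the models that the items and spiders elsewhere in this dump import from spiders/models.py could be registered so scraped rows are browsable in the Django admin; the model names below are taken from those imports and nothing else is assumed about spiders/models.py:

```python
# Sketch only: register the spider models with the Django admin.
# BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail and ShopId are the names
# imported from spiders.models elsewhere in this repo; adjust to what that
# file actually defines.
from django.contrib import admin
from spiders.models import BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail, ShopId

for model in (BaiKeRank, BilibiliMovie, ShopInfo, ReviewDedail, ShopId):
    admin.site.register(model)
```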
4 | -------------------------------------------------------------------------------- /spiders/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SpidersConfig(AppConfig): 5 | name = 'spiders' 6 | -------------------------------------------------------------------------------- /spiders/apps.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/apps.pyc -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('../../../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from spiders.models import BaiKeRank 10 | from scrapy_djangoitem import DjangoItem 11 | 12 | class BaidurankItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | 18 | class BaiKeRankItem(DjangoItem): 19 | django_model = BaiKeRank -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaidurankSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 
50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaidurankPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for baidurank project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'baidurank' 13 | 14 | SPIDER_MODULES = ['baidurank.spiders'] 15 | NEWSPIDER_MODULE = 'baidurank.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'baidurank (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 1 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'baidurank.middlewares.BaidurankSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'baidurank.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'baidurank.pipelines.BaidurankPipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by 
default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/baidurank/baidurank/spiders/rank.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | from ..items import BaiKeRankItem 4 | import datetime 5 | class BaiduRank(scrapy.Spider): 6 | name = 'baidurank' 7 | start_urls = [ 8 | 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek' 9 | ] 10 | url_p = 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d' 11 | max_page = 50 12 | curr_page = 1 13 | curr_time = '' 14 | def start_requests(self): 15 | self.curr_time = datetime.datetime.now() 16 | for url in self.start_urls: 17 | yield self.make_requests_from_url(url) 18 | for pg in range(1, 50): 19 | yield self.make_requests_from_url(self.url_p % pg) 20 | 21 | def parse(self, response): 22 | 23 | rt = json.loads(response.body) 24 | 25 | this_week = rt['data']['thisWeek'] 26 | for record in this_week: 27 | baike_rank = BaiKeRankItem() 28 | baike_rank['rank'] = str(record['rank']) 29 | baike_rank['name'] = record['name'] 30 | baike_rank['ori_score'] = str(record['oriScore']) 31 | baike_rank['rank_time'] = self.curr_time.strftime('%Y-%m-%d %H:%M:%S') 32 | baike_rank.save() 33 | print(str(record['rank'])+'\t'+record['name']+'\t'+str(record['oriScore'])) 34 | 35 | -------------------------------------------------------------------------------- /spiders/baidurank/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = baidurank.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = baidurank 12 | -------------------------------------------------------------------------------- /spiders/bilibili/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/bilibili/__init__.py 
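Note: rank.py above builds its page requests with the old `Spider.make_requests_from_url` helper, which is deprecated in later Scrapy releases. A minimal sketch of the same pagination written directly against `scrapy.Request` (URL template, start URL and the `range(1, 50)` cap are copied from rank.py; nothing new is assumed about the API):

```python
# Sketch of rank.py's start_requests without make_requests_from_url.
# Parsing/persistence through BaiKeRankItem is unchanged, so parse() is stubbed.
import scrapy


class BaiduRankSketch(scrapy.Spider):
    name = 'baidurank_sketch'  # hypothetical name; the repo spider is 'baidurank'
    start_urls = [
        'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek'
    ]
    url_p = 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d'

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url, callback=self.parse)
        for pg in range(1, 50):
            yield scrapy.Request(self.url_p % pg, callback=self.parse)

    def parse(self, response):
        # rank.py json-decodes response.body and walks data['thisWeek'].
        pass
```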
-------------------------------------------------------------------------------- /spiders/bilibili/bilibili_spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import sys 4 | import os 5 | import django 6 | 7 | sys.path.append('../../../Jpider') 8 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 9 | django.setup() 10 | from spiders.models import BilibiliMovie 11 | 12 | search_url = 'https://s.search.bilibili.com/cate/search' 13 | tag_url = 'https://api.bilibili.com/x/tag/hots' 14 | curr_page = 1 15 | t_params = dict( 16 | rid=145, 17 | type=0, 18 | jsonp='jsonp' 19 | ) 20 | q_params = dict( 21 | main_ver='v3', 22 | search_type='video', 23 | view_type='hot_rank', 24 | pic_size='160x100', 25 | order='hot', 26 | copy_righ='-1', 27 | cate_id=145, 28 | page=curr_page, 29 | pagesize=20, 30 | keyword='恐怖' 31 | ) 32 | req = requests.get(url=tag_url, params=t_params, verify=False) 33 | req_json = json.loads(req.text) 34 | tags = req_json['data'][0]['tags'] 35 | for tag in tags: 36 | print(tag) 37 | q_params['keyword'] = tag['tag_name'] 38 | req = requests.get(url=search_url, params=q_params, verify=False) 39 | req_json = json.loads(req.text) 40 | pages = req_json['numPages'] 41 | for r in req_json['result']: 42 | movie = BilibiliMovie() 43 | movie.arcurl = r['arcurl'] 44 | movie.author = r['author'] 45 | movie.description = r['description'] 46 | movie.favorites = r['favorites'] 47 | movie.play = r['play'] 48 | movie.video_review = r['video_review'] 49 | movie.tag = r['tag'] 50 | movie.title = r['title'] 51 | movie.id = r['id'] 52 | movie.save() 53 | print(movie) 54 | curr_page += 1 55 | while curr_page <= pages: 56 | q_params['page'] = curr_page 57 | req = requests.get(url=search_url, params=q_params, verify=False) 58 | req_json = json.loads(req.text) 59 | for r in req_json['result']: 60 | movie = BilibiliMovie() 61 | movie.arcurl = r['arcurl'] 62 | movie.author = r['author'] 63 | movie.description = r['description'] 64 | movie.favorites = r['favorites'] 65 | movie.play = r['play'] 66 | movie.video_review = r['video_review'] 67 | movie.tag = r['tag'] 68 | movie.title = r['title'] 69 | movie.id = r['id'] 70 | movie.save() 71 | print(movie) 72 | curr_page += 1 73 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import django 4 | 5 | sys.path.append('../../../Jpider') 6 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 7 | django.setup() -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from spiders.models import ShopInfo, ReviewDedail, ShopId 10 | from scrapy_djangoitem import DjangoItem 11 | 12 | class DazongdianpingItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | class ShopInfoItem(DjangoItem): 18 | django_model = ShopInfo 19 | 20 | class ReviewDetailItem(DjangoItem): 21 | django_model = ReviewDedail 22 | 23 | class ShopIdItem(DjangoItem): 24 | django_model = ShopId 
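Note: as in the baidurank project, these `DjangoItem` subclasses only point at models in spiders/models.py and the Django ORM does the persistence, which is how dazong_repair.py uses `ReviewDetailItem` (assign by key, then `save()`). A minimal usage sketch; the `shop_id` field name is taken from the raw SQL against `spiders_shopinfo` in dazong_repair.py, and the id value is a placeholder:

```python
# Hedged sketch of persisting through a DjangoItem (scrapy_djangoitem):
# save() builds and saves the underlying Django model instance.
from dazongdianping.items import ShopInfoItem  # defined just above

item = ShopInfoItem()
item['shop_id'] = '12345678'   # placeholder id; field assumed from spiders_shopinfo
shop = item.save()             # returns the saved spiders.models.ShopInfo row
print(shop.shop_id)
```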
-------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from spiders.user_agent import agents 10 | import random 11 | 12 | class DazongdianpingSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.logger.info('Spider opened: %s' % spider.name) 58 | 59 | 60 | class ChangeProxyMiddleware(object): 61 | def process_request(self, request, spider): 62 | 63 | request.headers.setdefault('User-Agent', random.choice(agents)) -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DazongdianpingPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for dazongdianping project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'dazongdianping' 13 | 14 | SPIDER_MODULES = ['dazongdianping.spiders'] 15 | NEWSPIDER_MODULE = 'dazongdianping.spiders' 16 | 17 | # LOG_FILE = 'dazongdianping.log' 18 | # LOG_LEVEL = 'ERROR' 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | #USER_AGENT = 'dazongdianping (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | #CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 4 32 | 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | # COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | DEFAULT_REQUEST_HEADERS = { 45 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 46 | # 'Accept-Encoding': 'gzip,deflate,sdch', 47 | 'Accept-Language': 'zdeprecatedh-CN,zh;q=0.8,en;q=0.6', 48 | 'Host': 'www.dianping.com', 49 | 'Proxy-Connection': 'keep-alive', 50 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" 51 | 52 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 53 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 54 | } 55 | 56 | # Enable or disable spider middlewares 57 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 58 | # SPIDER_MIDDLEWARES = { 59 | # 'dazongdianping.middlewares.ChangeProxyMiddleware': 100, 60 | # } 61 | 62 | # Enable or disable downloader middlewares 63 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 64 | DOWNLOADER_MIDDLEWARES = { 65 | 'dazongdianping.middlewares.ChangeProxyMiddleware': 100, 66 | } 67 | 68 | # Enable or disable extensions 69 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 70 | #EXTENSIONS = { 71 | # 'scrapy.extensions.telnet.TelnetConsole': None, 72 | #} 73 | 74 | # Configure item pipelines 75 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 76 | #ITEM_PIPELINES = { 77 | # 'dazongdianping.pipelines.DazongdianpingPipeline': 300, 78 | #} 79 | 80 | # Enable and configure the AutoThrottle extension (disabled by default) 81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 82 | #AUTOTHROTTLE_ENABLED = True 83 | # The initial download delay 84 | #AUTOTHROTTLE_START_DELAY = 5 85 | # The maximum download delay to be set in case of high latencies 86 | #AUTOTHROTTLE_MAX_DELAY = 60 87 | # The average number of requests Scrapy should be sending in parallel to 88 | # each remote server 89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 90 | # Enable showing throttling stats for every response received: 91 | #AUTOTHROTTLE_DEBUG = False 92 | 93 | # Enable and configure HTTP caching (disabled by 
default) 94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 95 | #HTTPCACHE_ENABLED = True 96 | #HTTPCACHE_EXPIRATION_SECS = 0 97 | #HTTPCACHE_DIR = 'httpcache' 98 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 99 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 100 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/dazongdianping/dazongdianping/spiders/dazong_repair.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from spiders.models import ShopInfo, ReviewDedail 3 | from django.db import connection 4 | import traceback 5 | from ..items import ReviewDetailItem 6 | from spiders.user_agent import agents 7 | import random 8 | 9 | class DazongRepair(scrapy.Spider): 10 | name = 'dazongrepair' 11 | 12 | url_pattern = 'http://www.dianping.com/shop/%s/review_more_newest#start=10' 13 | shop_url_p = 'http://www.dianping.com/shop/%s' 14 | 15 | def start_requests(self): 16 | with connection.cursor() as cursor: 17 | cursor.execute("select shop_id from spiders_shopinfo where shop_id not in (select shop_id from spiders_reviewdedail)") 18 | rows = cursor.fetchall() 19 | for row in rows: 20 | url = self.url_pattern % row[0] 21 | referer = self.shop_url_p % row[0] 22 | header = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 24 | 'Accept-Encoding': 'gzip,deflate,sdch', 25 | 'Accept-Language': 'zdeprecatedh-CN,zh;q=0.8,en;q=0.6', 26 | 'Host': 'www.dianping.com', 27 | 28 | 'User-Agent': random.choice(agents), 29 | 'Referer': referer, 30 | } 31 | 32 | 33 | yield scrapy.Request(url, callback=self.parse, headers=header) 34 | 35 | def parse(self, response): 36 | print(response.url) 37 | review_detail = ReviewDetailItem() 38 | try: 39 | shop_id = response.url.split('/')[-2] 40 | main_body = response.css('div.main') 41 | comment_tab = main_body.css('div.comment-tab span') 42 | cnt = '0' 43 | for c_t in comment_tab: 44 | title = c_t.css('a::text').extract()[0] 45 | if title.strip() == '全部点评': 46 | cnt = c_t.css('em.col-exp::text').extract()[0].strip()[1:-1] 47 | break 48 | if cnt == '0': 49 | review_detail['shop_id'] = shop_id 50 | review_detail['star_all'] = 0 51 | review_detail.save() 52 | self.logger.error('%s - %s: %s' % (response.url, '全部点评', cnt)) 53 | print('%s - %s: %s' % (response.url, '全部点评', cnt)) 54 | return None 55 | 56 | stars = main_body.css('div.comment-mode div.comment-star span em.col-exp::text').extract() 57 | first_review_time = main_body.css('div.comment-mode div.comment-list ul li span.time::text').extract_first().strip() 58 | first_review_content = main_body.css('div.comment-mode div.comment-list div.comment-txt div::text').extract_first().strip() 59 | review_detail['first_review_time'] = first_review_time 60 | review_detail['first_review_content'] = first_review_content 61 | review_detail['star_all'] = stars[0][1:-1] 62 | review_detail['star_5'] = stars[1][1:-1] 63 | review_detail['star_4'] = stars[2][1:-1] 64 | review_detail['star_3'] = stars[3][1:-1] 65 | 
review_detail['star_2'] = stars[4][1:-1] 66 | review_detail['star_1'] = stars[5][1:-1] 67 | review_detail['shop_id'] = shop_id 68 | review_detail.save() 69 | print(shop_id+'\t'+str(stars) + '\t' + first_review_time) 70 | except Exception: 71 | self.logger.error(traceback.format_exc()) 72 | 73 | -------------------------------------------------------------------------------- /spiders/dazongdianping/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = dazongdianping.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = dazongdianping 12 | -------------------------------------------------------------------------------- /spiders/dist_weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/conf/account.conf: -------------------------------------------------------------------------------- 1 | user password -------------------------------------------------------------------------------- /spiders/dist_weibo/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=DEBUG 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=DEBUG 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=DEBUG 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=DEBUG 31 | formatter=simpleFmt 32 | args=('./log/dist_weibo.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=DEBUG 37 | formatter=simpleFmt 38 | args=('./log/dist_weibo.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/__init__.py: -------------------------------------------------------------------------------- 1 | __dict__=""" 2 | data store module 3 | """ -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/redis_cookies.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import redis 3 | import json 4 | import datetime 5 | from logger import LOGGER 6 | class RedisCookies(object): 7 | redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 8 | 9 | @classmethod 10 | def save_cookies(cls, user_name, unique_id, cookies): 11 | 12 | pickled_cookies = json.dumps({ 13 | 'user_name': user_name, 14 | 'cookies': cookies, 
15 | 'unique_id': unique_id, 16 | 'login_time': datetime.datetime.now().timestamp() 17 | }) 18 | LOGGER.info('save cookie in redis: %s' % str(pickled_cookies)) 19 | r = redis.Redis(connection_pool=cls.redis_pool) 20 | r.hset('account', user_name, pickled_cookies) 21 | cls.user_in_queue(user_name) 22 | 23 | @classmethod 24 | def user_in_queue(cls, user_name): 25 | r = redis.Redis(connection_pool=cls.redis_pool) 26 | 27 | if not r.sismember('users', user_name): 28 | LOGGER.info('user in queue: %s' % user_name) 29 | r.sadd("users", user_name) 30 | else: 31 | LOGGER.info('user already in queue: %s' % user_name) 32 | 33 | @classmethod 34 | def fetch_cookies(cls): 35 | LOGGER.info('get cookies from reids') 36 | r = redis.Redis(connection_pool=cls.redis_pool) 37 | while True: 38 | user = r.spop('users') 39 | r.sadd('users', user) 40 | c = r.hget('account', user) 41 | if c: 42 | user_cookies = c.decode('utf-8') 43 | cookies_json = json.loads(user_cookies) 44 | LOGGER.info('cookies got-------') 45 | return cookies_json 46 | LOGGER.warn('cookies not get') 47 | 48 | @classmethod 49 | def clean(cls): 50 | r = redis.Redis(connection_pool=cls.redis_pool) 51 | r.delete('users') 52 | r.delete('account') 53 | -------------------------------------------------------------------------------- /spiders/dist_weibo/dao/sqlalchemy_session.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine 2 | from sqlalchemy.ext.declarative import declarative_base 3 | from sqlalchemy.orm import sessionmaker 4 | Base = declarative_base() 5 | engine = create_engine('mysql+pymysql://root:111111@localhost:3306/dist_weibo?charset=utf8') 6 | DBSession = sessionmaker(bind=engine) 7 | db_session = DBSession() 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /spiders/dist_weibo/headers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | user_agents = [ 5 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 6 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 7 | 'Chrome/57.0.2987.133 Safari/537.36', 8 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) ' 9 | 'Version/10.1 Safari/603.1.30', 10 | 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 11 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5', 12 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0', 13 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89' 14 | ' Safari/537.1 QIHU 360SE', 15 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 ' 16 | '2345Explorer/7.1.0.12633', 17 | 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Ubuntu/10.10 ' 18 | 'Chromium/8.0.552.237 Chrome/8.0.552.237 Safari/534.10', 19 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 ' 20 | 'Chrome/34.0.1847.116 Safari/537.36', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)', 22 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.39 (KHTML, like Gecko) Version/9.0 ' 23 | 'Safari/601.1.39', 24 | 'Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.14', 25 | 
'Opera/9.80 (Linux armv6l ; U; CE-HTML/1.0 NETTV/3.0.1;; en) Presto/2.6.33 Version/10.60', 26 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; baidubrowser 1.x)', 27 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 28 | 'Chrome/58.0.3029.110 Safari/537.36', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 30 | 'Version/5.1 Safari/534.50', 31 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 32 | 'Version/5.1 Safari/534.50', 33 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;', 34 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 35 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 36 | 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11', 37 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11', 38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)', 39 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)', 40 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 41 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 ' 42 | 'Safari/537.36', 43 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36' 44 | ] 45 | 46 | def get_header(): 47 | header = { 48 | 'User-Agent': random.choice(user_agents), 49 | 'Accept-Encoding': 'gzip, deflate', 50 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Connection': 'keep-alive' 53 | } 54 | return header 55 | 56 | 57 | def get_header2(): 58 | header = get_header() 59 | header['Proxy-Connection'] = 'keep-alive' 60 | header['Upgrade-Insecure-Requests'] = 1 61 | header['Host'] = 'weibo.com' 62 | -------------------------------------------------------------------------------- /spiders/dist_weibo/kill_celery.txt: -------------------------------------------------------------------------------- 1 | ps aux| grep celery| awk '{print $2}'|xargs kill -9 -------------------------------------------------------------------------------- /spiders/dist_weibo/logger.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import logging 3 | import logging.config 4 | import os 5 | 6 | def logger_conf(): 7 | """ 8 | load basic logger configure 9 | :return: configured logger 10 | """ 11 | 12 | if platform.system() == 'Windows': 13 | 14 | logging.config.fileConfig(os.path.abspath('.')+'\\conf\\logging.conf') 15 | elif platform.system() == 'Linux': 16 | 17 | logging.config.fileConfig(os.path.abspath('.')+'/conf/logging.conf') 18 | elif platform.system() == 'Darwin': 19 | print(os.path.abspath('../../')) 20 | logging.config.fileConfig(os.path.abspath('') + '/conf/logging.conf') 21 | logger = logging.getLogger('simpleLogger') 22 | 23 | return logger 24 | 25 | LOGGER = logger_conf() -------------------------------------------------------------------------------- /spiders/dist_weibo/login/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /spiders/dist_weibo/login/login.py: 
-------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from dao.redis_cookies import RedisCookies 7 | from headers import headers 8 | def get_session(): 9 | return requests.session() 10 | 11 | 12 | def get_js_exec(path): 13 | phantom = execjs.get('PhantomJS') 14 | with open(path, 'r') as f: 15 | source = f.read() 16 | return phantom.compile(source) 17 | 18 | 19 | def get_encodename(name, js_exec): 20 | return js_exec.call('get_name', name) 21 | 22 | 23 | def get_password(password, pre_obj, exec_js): 24 | nonce = pre_obj['nonce'] 25 | pubkey = pre_obj['pubkey'] 26 | servertime = pre_obj['servertime'] 27 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 28 | 29 | 30 | def get_prelogin_info(prelogin_url, session): 31 | json_pattern = r'.*?\((.*)\)' 32 | response_str = session.get(prelogin_url).text 33 | m = re.match(json_pattern, response_str) 34 | return json.loads(m.group(1)) 35 | 36 | 37 | def get_redirect(data, post_url, session): 38 | logining_page = session.post(post_url, data=data, headers=headers) 39 | login_loop = logining_page.content.decode('GBK') 40 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 41 | return re.findall(pa, login_loop)[0] 42 | 43 | 44 | def do_login(session, url): 45 | return session.get(url).text 46 | 47 | 48 | def login(name, password): 49 | name = '18270916129' 50 | password = 'VS7452014' 51 | json_pattern = r'.*?\((.*)\)' 52 | session = get_session() 53 | exec_js = get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 54 | su = get_encodename(name, exec_js) 55 | print(su) 56 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 57 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 58 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 59 | 60 | pre_obj = get_prelogin_info(prelogin_url, session) 61 | print(pre_obj) 62 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 63 | print(ps) 64 | data = { 65 | 'entry': 'weibo', 66 | 'gateway': '1', 67 | 'from': '', 68 | 'savestate': '7', 69 | 'useticket': '1', 70 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 
71 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 72 | 'vsnf': '1', 73 | 'su': su, 74 | 'service': 'miniblog', 75 | 'servertime': pre_obj['servertime'], 76 | 'nonce': pre_obj['nonce'], 77 | 'pwencode': 'rsa2', 78 | 'rsakv': pre_obj['rsakv'], 79 | 'sp': ps, 80 | 'sr': '1366*768', 81 | 'encoding': 'UTF-8', 82 | 'prelt': '115', 83 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 84 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 85 | 'returntype': 'META', 86 | } 87 | url = get_redirect(data, post_url, session) 88 | print(url) 89 | login_info = do_login(session, url) 90 | print(login_info) 91 | m = re.match(json_pattern, login_info) 92 | info = json.loads(m.group(1)) 93 | print(info) 94 | print(session.cookies.get_dict()) 95 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 96 | cookies=session.cookies.get_dict()) 97 | 98 | return session, info 99 | 100 | # session.get('http://weibo.com/u') 101 | if __name__ == '__main__': 102 | login() 103 | -------------------------------------------------------------------------------- /spiders/dist_weibo/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/model/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/model/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, Integer, String, Text 2 | from dao.sqlalchemy_session import Base 3 | 4 | 5 | 6 | class Weibo(Base): 7 | __tablename__ = 'weibo' 8 | 9 | id = Column(Integer, primary_key=True, autoincrement=True) 10 | source = Column(String(255)) 11 | date_time = Column(String(128)) 12 | url = Column(String(255)) 13 | content = Column(Text) 14 | 15 | class User(Base): 16 | __tablename__ = 'user' 17 | id = Column(Integer, primary_key=True, autoincrement=True) 18 | user_id = Column(String(128)) 19 | nickname = Column(String(255)) 20 | realname = Column(String(255)) 21 | location = Column(String(255)) 22 | gender = Column(String(4)) 23 | sexual_ori = Column(String(128)) 24 | emotion_state = Column(String(64)) 25 | birthday = Column(String(16)) 26 | blood_type = Column(String(2)) 27 | blog = Column(String(255)) 28 | domain_name = Column(String(255)) 29 | intro = Column(Text) 30 | register_time = Column(String(16)) 31 | email = Column(String(64)) 32 | company = Column(String(128)) 33 | college = Column(String(255)) 34 | high_school = Column(String(255)) 35 | mid_school = Column(String(255)) 36 | tags = Column(String(255)) 37 | 38 | class Relationship(Base): 39 | __tablename__ = 'relationship' 40 | id = Column(Integer, primary_key=True, autoincrement=True) 41 | user_id = Column(String(128)) 42 | fan_id = Column(String(128)) 43 | 44 | 45 | class CrawlInfo(Base): 46 | __tablename__ = 'crawl_info' 47 | id = Column(Integer, primary_key=True, autoincrement=True) 48 | user_id = Column(String(128)) 49 | last_crawl_date = Column(String(20)) 50 | 51 | if __name__ == '__main__': 52 | from dao.sqlalchemy_session import engine 53 | Base.metadata.create_all(engine) -------------------------------------------------------------------------------- /spiders/dist_weibo/notebook/Request.md: -------------------------------------------------------------------------------- 1 | - python3 2 | - execjs 3 | - phantomjs 
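Note: Request.md above lists the runtime pieces the SSO login needs: Python 3, PyExecJS (`execjs`) and a PhantomJS binary that PyExecJS can drive to run js/ssologin.js. A minimal sketch of that call pattern, mirroring login/login.py; the `get_name`/`get_pass` helpers are whatever that JS file defines, and the credential and prelogin values below are placeholders only:

```python
# Sketch of the PyExecJS + PhantomJS pattern used by login/login.py.
# Assumes PhantomJS is on PATH and js/ssologin.js exposes get_name/get_pass.
import execjs

runtime = execjs.get('PhantomJS')
with open('js/ssologin.js', 'r') as f:
    ctx = runtime.compile(f.read())

# In login.py these three values come from the prelogin.php JSONP response;
# the literals here only illustrate the call signatures.
nonce, servertime, pubkey = 'PLACEHOLDER', 1494921600, '10001'
su = ctx.call('get_name', 'user@example.com')                   # encoded username
sp = ctx.call('get_pass', 'secret', nonce, servertime, pubkey)  # rsa2-encrypted password
print(su, sp)
```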
-------------------------------------------------------------------------------- /spiders/dist_weibo/sql/database.sql: -------------------------------------------------------------------------------- 1 | create database dist_weibo default character set utf8mb4 collate utf8mb4_unicode_ci; -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/tasks/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/home_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | from dao.redis_cookies import RedisCookies 3 | from headers import get_header 4 | import requests 5 | from tasks.workers import app 6 | from bs4 import BeautifulSoup 7 | import re 8 | from model.models import Weibo 9 | from dao.sqlalchemy_session import db_session 10 | 11 | @app.task 12 | def home_page(): 13 | home_url = 'http://weibo.com/u/{}?is_ori=1&is_tag=0&profile_ftype=1&page=1' 14 | 15 | cookies_json = RedisCookies.fetch_cookies() 16 | 17 | 18 | cookies = cookies_json['cookies'] 19 | 20 | unique_id = cookies_json['unique_id'] 21 | 22 | resp = requests.get(url=home_url.format(unique_id), headers=get_header(), cookies=cookies, verify=False).text 23 | 24 | home_html = BeautifulSoup(resp, 'html.parser') 25 | 26 | scripts = home_html.find_all('script') 27 | scripts.reverse() 28 | 29 | view = re.compile('FM.view\((.*)\)') 30 | weibo_html_content = '' 31 | for script in scripts: 32 | result = view.findall(script.string) 33 | if len(result): 34 | 35 | r_json = json.loads(result[0]) 36 | if 'pl.content.homeFeed.index' == r_json['ns']: 37 | weibo_html_content = r_json['html'] 38 | break 39 | weibo_info = [] 40 | 41 | if weibo_html_content != '': 42 | weibo_html = BeautifulSoup(weibo_html_content, 'html.parser') 43 | weibos = weibo_html.find_all('div', 'WB_detail') 44 | 45 | for weibo in weibos: 46 | 47 | source = '' 48 | date = '' 49 | weibo_url = '' 50 | all_a = weibo.find_all('a', attrs={'class': 'S_txt2'}) 51 | weibo_text = weibo.find('div', attrs={'class': 'WB_text'}) 52 | content = weibo_text.text 53 | 54 | 55 | for _a in all_a: 56 | 57 | 58 | if _a.has_attr('date') and _a.has_attr('href'): 59 | 60 | date = _a.get('date') 61 | weibo_url = _a.get('href') 62 | if _a.has_attr('action-type'): 63 | source = _a.text 64 | weibo = Weibo(source=source, url=weibo_url, date_time=date, content=content) 65 | db_session.add(weibo) 66 | db_session.commit() 67 | weibo_info.append('date:%s\tsource:%s\turl:%s' % (date, source, weibo_url)) 68 | return weibo_info 69 | 70 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/login.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from tasks.workers import app 7 | from dao.redis_cookies import RedisCookies 8 | from headers import get_header 9 | from logger import LOGGER 10 | 11 | def get_session(): 12 | return requests.session() 13 | 14 | 15 | def get_js_exec(path): 16 | phantom = execjs.get('PhantomJS') 17 | with open(path, 'r') as f: 18 | source = f.read() 19 | return phantom.compile(source) 20 | 21 | 22 | def get_encodename(name, js_exec): 23 | return 
js_exec.call('get_name', name) 24 | 25 | 26 | def get_password(password, pre_obj, exec_js): 27 | nonce = pre_obj['nonce'] 28 | pubkey = pre_obj['pubkey'] 29 | servertime = pre_obj['servertime'] 30 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 31 | 32 | 33 | def get_prelogin_info(prelogin_url, session): 34 | json_pattern = r'.*?\((.*)\)' 35 | response_str = session.get(prelogin_url).text 36 | m = re.match(json_pattern, response_str) 37 | return json.loads(m.group(1)) 38 | 39 | 40 | def get_redirect(data, post_url, session): 41 | print(data) 42 | print(post_url) 43 | logining_page = session.post(post_url, data=data, headers=get_header()) 44 | print(logining_page) 45 | login_loop = logining_page.content.decode('GBK') 46 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 47 | return re.findall(pa, login_loop)[0] 48 | 49 | 50 | def do_login(session, url): 51 | return session.get(url).text 52 | 53 | @app.task(ignore_result=True) 54 | def clean(): 55 | RedisCookies.clean() 56 | 57 | @app.task(ignore_result=True) 58 | def login(name='', password=''): 59 | json_pattern = r'.*?\((.*)\)' 60 | session = get_session() 61 | exec_js = get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 62 | su = get_encodename(name, exec_js) 63 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 64 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 65 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 66 | 67 | pre_obj = get_prelogin_info(prelogin_url, session) 68 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 69 | data = { 70 | 'entry': 'weibo', 71 | 'gateway': '1', 72 | 'from': '', 73 | 'savestate': '7', 74 | 'useticket': '1', 75 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 
76 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 77 | 'vsnf': '1', 78 | 'su': su, 79 | 'service': 'miniblog', 80 | 'servertime': pre_obj['servertime'], 81 | 'nonce': pre_obj['nonce'], 82 | 'pwencode': 'rsa2', 83 | 'rsakv': pre_obj['rsakv'], 84 | 'sp': ps, 85 | 'sr': '1366*768', 86 | 'encoding': 'UTF-8', 87 | 'prelt': '115', 88 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 89 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 90 | 'returntype': 'META', 91 | } 92 | url = get_redirect(data, post_url, session) 93 | print(url) 94 | login_info = do_login(session, url) 95 | m = re.match(json_pattern, login_info) 96 | info = json.loads(m.group(1)) 97 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 98 | cookies=session.cookies.get_dict()) 99 | 100 | # return session, info 101 | 102 | # session.get('http://weibo.com/u') 103 | if __name__ == '__main__': 104 | login() 105 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/mobile_login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | args = { 4 | 'username': '767543579@qq.com', 5 | 'password': 'JOPPER', 6 | 'savestate': 1, 7 | 'ec': 0, 8 | 'pagerefer': 'https://passport.weibo.cn/signin/' 9 | 'welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F&wm=3349&vt=4', 10 | 'entry': 'mweibo', 11 | 'wentry': '', 12 | 'loginfrom': '', 13 | 'client_id': '', 14 | 'code': '', 15 | 'qq': '', 16 | 'hff': '', 17 | 'hfp': '' 18 | } 19 | 20 | session = requests.session() 21 | session.post('https://passport.weibo.cn/sso/login', data=args) 22 | resp = session.get('https://m.weibo.cn/api/container/getIndex?containerid=2304132210643391_-_WEIBO_SECOND_PROFILE_MORE_WEIBO&page=1') 23 | 24 | print(session.cookies.get_dict()) 25 | print(session.get('http://weibo.com/47452014').text) 26 | print(resp.text) 27 | 28 | 29 | -------------------------------------------------------------------------------- /spiders/dist_weibo/tasks/workers.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import os 3 | from celery import platforms 4 | from celery import Celery 5 | # root权限启动 6 | platforms.C_FORCE_ROOT = True 7 | 8 | get_broker_or_backend = ('redis://:''@127.0.0.1:6379/0', 'redis://:''@127.0.0.1:6379/1') 9 | 10 | worker_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'celery.log') 11 | beat_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'beat.log') 12 | 13 | tasks = ['tasks.login', 'tasks.home_page', 'tasks.user'] 14 | app = Celery('weibo_task', include=tasks, broker=get_broker_or_backend[0], backend=get_broker_or_backend[1]) -------------------------------------------------------------------------------- /spiders/dist_weibo/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo/workers.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/conf/account.conf: 
-------------------------------------------------------------------------------- 1 | user password -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/dao/__init__.py: -------------------------------------------------------------------------------- 1 | __dict__=""" 2 | data store module 3 | """ -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/dao/redis_cookies.py: -------------------------------------------------------------------------------- 1 | #-*- coding:utf-8 -*- 2 | import redis 3 | import json 4 | import datetime 5 | 6 | class RedisCookies(object): 7 | redis_pool = redis.ConnectionPool(host='localhost', port=6379, db=0) 8 | 9 | @classmethod 10 | def save_cookies(cls, user_name, unique_id, cookies): 11 | pickled_cookies = json.dumps({ 12 | 'cookies': cookies, 13 | 'unique_id': unique_id, 14 | 'login_time': datetime.datetime.now().timestamp() 15 | }) 16 | r = redis.Redis(connection_pool=cls.redis_pool) 17 | r.hset('account', user_name, pickled_cookies) 18 | cls.user_in_queue(user_name) 19 | 20 | @classmethod 21 | def user_in_queue(cls, user_name): 22 | r = redis.Redis(connection_pool=cls.redis_pool) 23 | if not r.sismember('users', user_name): 24 | r.sadd("users", user_name) 25 | 26 | @classmethod 27 | def fetch_cookies(cls): 28 | r = redis.Redis(connection_pool=cls.redis_pool) 29 | user = r.spop('users') 30 | r.sadd('users', user) 31 | return r.hget('account', user).decode('utf-8') 32 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/headers.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | 4 | user_agents = [ 5 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', 6 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) ' 7 | 'Chrome/57.0.2987.133 Safari/537.36', 8 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) ' 9 | 'Version/10.1 Safari/603.1.30', 10 | 'Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0', 11 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5', 12 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0', 13 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89' 14 | ' Safari/537.1 QIHU 360SE', 15 | 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.108 Safari/537.36 ' 16 | '2345Explorer/7.1.0.12633', 17 | 'Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Ubuntu/10.10 ' 18 | 'Chromium/8.0.552.237 Chrome/8.0.552.237 Safari/534.10', 19 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/34.0.1847.116 ' 20 | 'Chrome/34.0.1847.116 Safari/537.36', 21 | 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)', 22 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11) AppleWebKit/601.1.39 (KHTML, like Gecko) Version/9.0 ' 23 | 'Safari/601.1.39', 24 | 'Opera/9.80 (Windows NT 5.1) Presto/2.12.388 Version/12.14', 25 | 'Opera/9.80 (Linux armv6l ; U; CE-HTML/1.0 NETTV/3.0.1;; en) Presto/2.6.33 Version/10.60', 26 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; baidubrowser 1.x)', 27 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 
(KHTML, like Gecko) ' 28 | 'Chrome/58.0.3029.110 Safari/537.36', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 30 | 'Version/5.1 Safari/534.50', 31 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) ' 32 | 'Version/5.1 Safari/534.50', 33 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;', 34 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 35 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 36 | 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11', 37 | 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11', 38 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)', 39 | 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)', 40 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36', 41 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 ' 42 | 'Safari/537.36', 43 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36' 44 | ] 45 | 46 | 47 | headers = { 48 | 'User-Agent': random.choice(user_agents), 49 | 'Accept-Encoding': 'gzip, deflate, sdch', 50 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 51 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | 'Connection': 'keep-alive' 53 | } 54 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/login/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/login/login.py: -------------------------------------------------------------------------------- 1 | import execjs 2 | import requests 3 | import re 4 | import json 5 | import os 6 | from spiders.dist_weibo_spider.dao.redis_cookies import RedisCookies 7 | from spiders.dist_weibo_spider.headers import headers 8 | def get_session(): 9 | return requests.session() 10 | 11 | 12 | def get_js_exec(path): 13 | phantom = execjs.get('PhantomJS') 14 | with open(path, 'r') as f: 15 | source = f.read() 16 | return phantom.compile(source) 17 | 18 | 19 | def get_encodename(name, js_exec): 20 | return js_exec.call('get_name', name) 21 | 22 | 23 | def get_password(password, pre_obj, exec_js): 24 | nonce = pre_obj['nonce'] 25 | pubkey = pre_obj['pubkey'] 26 | servertime = pre_obj['servertime'] 27 | return exec_js.call('get_pass', password, nonce, servertime, pubkey) 28 | 29 | 30 | def get_prelogin_info(prelogin_url, session): 31 | json_pattern = r'.*?\((.*)\)' 32 | response_str = session.get(prelogin_url).text 33 | m = re.match(json_pattern, response_str) 34 | return json.loads(m.group(1)) 35 | 36 | 37 | def get_redirect(data, post_url, session): 38 | logining_page = session.post(post_url, data=data, headers=headers) 39 | login_loop = logining_page.content.decode('GBK') 40 | pa = r'location\.replace\([\'"](.*?)[\'"]\)' 41 | return re.findall(pa, login_loop)[0] 42 | 43 | 44 | def do_login(session, url): 45 | return session.get(url).text 46 | 47 | 48 | def login(): 49 | name = '18270916129' 50 | password = 'VS7452014' 51 | json_pattern = r'.*?\((.*)\)' 52 | session = get_session() 53 | exec_js = 
get_js_exec(os.path.split(os.path.realpath(__file__))[0]+'/../js/ssologin.js') 54 | su = get_encodename(name, exec_js) 55 | print(su) 56 | post_url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.18)' 57 | prelogin_url = 'http://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&' \ 58 | 'su=' + su + '&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.18)' 59 | 60 | pre_obj = get_prelogin_info(prelogin_url, session) 61 | print(pre_obj) 62 | ps = get_password(password=password, pre_obj=pre_obj, exec_js=exec_js) 63 | print(ps) 64 | data = { 65 | 'entry': 'weibo', 66 | 'gateway': '1', 67 | 'from': '', 68 | 'savestate': '7', 69 | 'useticket': '1', 70 | 'pagerefer': "http://login.sina.com.cn/sso/logout.php?" 71 | "entry=miniblog&r=http%3A%2F%2Fweibo.com%2Flogout.php%3Fbackurl", 72 | 'vsnf': '1', 73 | 'su': su, 74 | 'service': 'miniblog', 75 | 'servertime': pre_obj['servertime'], 76 | 'nonce': pre_obj['nonce'], 77 | 'pwencode': 'rsa2', 78 | 'rsakv': pre_obj['rsakv'], 79 | 'sp': ps, 80 | 'sr': '1366*768', 81 | 'encoding': 'UTF-8', 82 | 'prelt': '115', 83 | 'url': 'http://weibo.com/ajaxlogin.php?framelogin=1&' 84 | 'callback=parent.sinaSSOController.feedBackUrlCallBack', 85 | 'returntype': 'META', 86 | } 87 | url = get_redirect(data, post_url, session) 88 | print(url) 89 | login_info = do_login(session, url) 90 | print(login_info) 91 | m = re.match(json_pattern, login_info) 92 | info = json.loads(m.group(1)) 93 | print(info) 94 | print(session.cookies.get_dict()) 95 | RedisCookies.save_cookies(name, info['userinfo']['uniqueid'], 96 | cookies=session.cookies.get_dict()) 97 | 98 | return session, info 99 | 100 | # session.get('http://weibo.com/u') 101 | if __name__ == '__main__': 102 | login() 103 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/notebook/Request.md: -------------------------------------------------------------------------------- 1 | - python3 2 | - execjs 3 | - phantomjs -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/tasks/__init__.py -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/home_page.py: -------------------------------------------------------------------------------- 1 | import json 2 | from spiders.dist_weibo_spider.dao.redis_cookies import RedisCookies 3 | from spiders.dist_weibo_spider.headers import headers 4 | import requests 5 | 6 | home_url = 'http://weibo.com/u/{}?is_ori=1&is_tag=0&profile_ftype=1&page=1' 7 | 8 | user_cookies = RedisCookies.fetch_cookies() 9 | cookies_json = json.loads(user_cookies) 10 | 11 | cookies = cookies_json['cookies'] 12 | print(cookies) 13 | unique_id = cookies_json['unique_id'] 14 | print(home_url.format(unique_id)) 15 | resp = requests.get(url=home_url.format(unique_id), headers=headers, cookies=cookies, verify=False).text 16 | print(resp) 17 | 18 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/tasks/workers.py: -------------------------------------------------------------------------------- 1 | # coding:urf-8 2 | import os 3 | from celery import platforms 4 | 5 | # root权限启动 6 | platforms.C_FORCE_ROOT = True 7 | 8 | 
worker_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'celery.log') 9 | beat_log_path = os.path.join(os.path.dirname(os.path.dirname(__file__))+'/logs', 'beat.log') 10 | -------------------------------------------------------------------------------- /spiders/dist_weibo_spider/workers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/dist_weibo_spider/workers.py -------------------------------------------------------------------------------- /spiders/distributed/README.md: -------------------------------------------------------------------------------- 1 | ## 分布式爬虫 -------------------------------------------------------------------------------- /spiders/distributed/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/celeryt/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/celeryt/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/celeryt/celerybeat-schedule: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/celeryt/celerybeat-schedule -------------------------------------------------------------------------------- /spiders/distributed/celeryt/tasks.py: -------------------------------------------------------------------------------- 1 | # -*- coding=utf-8 -*- 2 | from celery import Celery 3 | 4 | app = Celery('tasks', broker='redist://'':''@127.0.0.1:6379/0', backend='redist://'':''@127.0.0.1:6379/1') 5 | 6 | app.conf.update( 7 | # 配置所在时区 8 | CELERY_TIMEZONE='Asia/Shanghai', 9 | CELERY_ENABLE_UTC=True, 10 | # 官网推荐消息序列化方式为json 11 | CELERY_ACCEPT_CONTENT=['json'], 12 | CELERY_TASK_SERIALIZER='json', 13 | CELERY_RESULT_SERIALIZER='json', 14 | # 配置定时任务 15 | CELERYBEAT_SCHEDULE={ 16 | 'my_task': { 17 | 'task': 'tasks.add', # tasks.py模块下的add方法 18 | 'schedule': 1, # 每隔60运行一次 19 | 'args': (23, 12), 20 | } 21 | } 22 | ) 23 | 24 | 25 | @app.task 26 | def add(x, y): 27 | return x + y 28 | 29 | 30 | @app.task 31 | def sub(x, y): 32 | return x - y -------------------------------------------------------------------------------- /spiders/distributed/celeryt/test.py: -------------------------------------------------------------------------------- 1 | from tasks import add, sub 2 | 3 | add_rs = add.delay(1, 2) 4 | sub_rs = sub.delay(3, 1) 5 | print add_rs.get() 6 | print sub_rs.get() 7 | -------------------------------------------------------------------------------- /spiders/distributed/redist/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/distributed/redist/__init__.py -------------------------------------------------------------------------------- /spiders/distributed/redist/test_redis.py: -------------------------------------------------------------------------------- 1 
| import redis 2 | r = redis.StrictRedis(host='localhost', port=6379, db=0) 3 | print(r.get('name')) 4 | -------------------------------------------------------------------------------- /spiders/distributed/task_dispatcher.py: -------------------------------------------------------------------------------- 1 | from workers import app 2 | 3 | crawl_urls = [ 4 | 'http://docs.celeryproject.org/en/latest/getting-started/introduction.html', 5 | 'http://docs.celeryproject.org/en/latest/getting-started/brokers/index.html', 6 | 'http://docs.celeryproject.org/en/latest/getting-started/first-steps-with-celery.html', 7 | 'http://docs.celeryproject.org/en/latest/getting-started/next-steps.html', 8 | 'http://docs.celeryproject.org/en/latest/getting-started/resources.html', 9 | 'http://docs.celeryproject.org/en/latest/userguide/application.html', 10 | 'http://docs.celeryproject.org/en/latest/userguide/tasks.html', 11 | 'http://docs.celeryproject.org/en/latest/userguide/canvas.html', 12 | 'http://docs.celeryproject.org/en/latest/userguide/workers.html', 13 | 'http://docs.celeryproject.org/en/latest/userguide/daemonizing.html', 14 | 'http://docs.celeryproject.org/en/latest/userguide/periodic-tasks.html' 15 | ] 16 | 17 | 18 | def manage_crawl_task(urls): 19 | for url in urls: 20 | app.send_task('tasks.crawl', args=(url,)) 21 | 22 | if __name__ == '__main__': 23 | manage_crawl_task(crawl_urls) 24 | -------------------------------------------------------------------------------- /spiders/distributed/tasks.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | from workers import app 4 | 5 | 6 | @app.task 7 | def crawl(url): 8 | print 'crawl url:{}'.format(url) 9 | rsp_test = requests.get(url).text 10 | soup = BeautifulSoup(rsp_test, 'html.parser') 11 | return soup.find('h1').text 12 | 13 | -------------------------------------------------------------------------------- /spiders/distributed/workers.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | redis_host = '127.0.0.1' 4 | redis_port = 6379 5 | app = Celery('crawl_task', include=['tasks'], broker='redis://%s:%d/1' % (redis_host, redis_port), 6 | backend='redis://%s:%d/2' % (redis_host, redis_port)) 7 | app.conf.update( 8 | CELERY_TIMEZONE='Asia/Shanghai', 9 | CELERY_ENABLE_UTC=True, 10 | CELERY_ACCEPT_CONTENT=['json'], 11 | CELERY_TASK_SERIALIZER='json', 12 | CELERY_RESULT_SERIALIZER='json', 13 | ) 14 | -------------------------------------------------------------------------------- /spiders/logger.py: -------------------------------------------------------------------------------- 1 | import platform 2 | import logging 3 | import logging.config 4 | import os 5 | 6 | def logger_conf(): 7 | """ 8 | load basic logger configure 9 | :return: configured logger 10 | """ 11 | 12 | if platform.system() == 'Windows': 13 | 14 | logging.config.fileConfig(os.path.abspath('../../')+'\\conf\\logging.conf') 15 | elif platform.system() == 'Linux': 16 | 17 | logging.config.fileConfig(os.path.abspath('../../')+'/conf/logging.conf') 18 | elif platform.system() == 'Darwin': 19 | print(os.path.abspath('../../')) 20 | logging.config.fileConfig(os.path.abspath('../../') + '/conf/logging.conf') 21 | logger = logging.getLogger('simpleLogger') 22 | 23 | return logger 24 | 25 | LOGGER = logger_conf() -------------------------------------------------------------------------------- /spiders/models.py: 
-------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.utils.translation import ugettext_lazy as _ 3 | import datetime 4 | # Create your models here. 5 | 6 | 7 | class ShopId(models.Model): 8 | shop_id = models.CharField(max_length=20, primary_key=True) 9 | from_url = models.CharField(max_length=200, null=True) 10 | 11 | 12 | class BaiKeRank(models.Model): 13 | rank = models.IntegerField(null=True) 14 | name = models.CharField(max_length=50, null=True) 15 | ori_score = models.CharField(max_length=50, null=True) 16 | rank_time = models.CharField(max_length=20) 17 | 18 | def __str__(self): 19 | return str(self.rank) +'\t' + self.name + '\t' + self.ori_score 20 | 21 | 22 | class ShopInfo(models.Model): 23 | shop_id = models.CharField(max_length=20, primary_key=True) 24 | shop_name = models.CharField(max_length=200, default='') 25 | review_count = models.CharField(max_length=20, default='') 26 | avg_price = models.CharField(max_length=20, default='') 27 | taste = models.CharField(max_length=10, default='') 28 | env = models.CharField(max_length=10, default='') 29 | service = models.CharField(max_length=10, default='') 30 | address = models.CharField(max_length=200, default='') 31 | open_time = models.CharField(max_length=200, default='') 32 | rank_star = models.CharField(max_length=20, default='') 33 | place = models.CharField(max_length=20, default='') 34 | classify = models.CharField(max_length=20, default='') 35 | star_all = models.CharField(max_length=20, default='') 36 | star_5 = models.CharField(max_length=20, default='') 37 | star_4 = models.CharField(max_length=20, default='') 38 | star_3 = models.CharField(max_length=20, default='') 39 | star_2 = models.CharField(max_length=20, default='') 40 | star_1 = models.CharField(max_length=20, default='') 41 | feature = models.BooleanField(default=False) 42 | feature2 = models.CharField(max_length=200, default='') 43 | 44 | 45 | class ReviewDedail(models.Model): 46 | shop_id = models.CharField(max_length=20, primary_key=True) 47 | star_all = models.CharField(max_length=20, null=True) 48 | star_5 = models.CharField(max_length=20, null=True) 49 | star_4 = models.CharField(max_length=20, null=True) 50 | star_3 = models.CharField(max_length=20, null=True) 51 | star_2 = models.CharField(max_length=20, null=True) 52 | star_1 = models.CharField(max_length=20, null=True) 53 | first_review_time = models.CharField(max_length=100, null=True) 54 | first_review_content = models.TextField(null=True) 55 | 56 | 57 | class WeiboUser(models.Model): 58 | GENDER = ('m', 'f', 'u') 59 | id = models.CharField(max_length=12, primary_key=True) # 用户id 60 | profile_url = models.CharField(max_length=400, null=True) # url 61 | description = models.CharField(max_length=1000, null=True) # 简介 62 | created_at = models.CharField(max_length=100, null=True) # 创建时间 63 | screen_name = models.CharField(max_length=100, null=True) # 昵称 64 | nativePlace = models.CharField(max_length=10, null=True) # 所在地 65 | mblogNum = models.CharField(max_length=20, null=True) # 微博数 66 | attNum = models.CharField(max_length=20, null=True) # 关注数 67 | fansNum = models.CharField(max_length=20, null=True) # 粉丝数 68 | gender = models.CharField(max_length=10, null=True) # 性别 69 | school = models.CharField(max_length=100, null=True) 70 | def __str__(self): 71 | return '\n\t'+'user: ' + self.screen_name + '\n\t'+'id: '+str(self.id)+'\n\t'\ 72 | + '昵称:'+self.screen_name + '\n\t'+'微博数:'+str(self.mblogNum)+'\n\t'+'关注:'+str(self.attNum) 73 
| 74 | 75 | class UserRelationship(models.Model): 76 | user = models.ForeignKey(WeiboUser, related_name='user') 77 | follower = models.ForeignKey(WeiboUser, related_name='follower') 78 | 79 | class Meta: 80 | unique_together = ('user', 'follower') 81 | primary = ('user', 'follower') 82 | 83 | def __str__(self): 84 | return self.follower.screen_name +'--->'+self.user.screen_name 85 | 86 | 87 | class Weibo(models.Model): 88 | id = models.CharField(max_length=20, primary_key=True) 89 | user = models.ForeignKey(WeiboUser) 90 | text = models.TextField(null=False) 91 | created_timestamp = models.CharField(max_length=20, null=True) 92 | retweented_status = models.ForeignKey('self', null=True) 93 | source = models.CharField(max_length=200, null=True) 94 | 95 | def __str__(self): 96 | return '\n\t'+'user:'+self.user.screen_name+'\n\t'+'blog_id:'+self.id 97 | 98 | 99 | class Comment(models.Model): 100 | name = models.CharField(_('name'), max_length=64) 101 | email_address = models.EmailField(_('email address')) 102 | homepage = models.URLField(_('home page'), blank=True) 103 | comment = models.TextField(_('comment')) 104 | pub_date = models.DateTimeField(_('Published date'), editable=False, auto_now_add=True) 105 | is_spam = models.BooleanField(_('spam?'), default=False, editable=False) 106 | 107 | class Meta: 108 | verbose_name = _('comment') 109 | verbose_name_plural = _('comment') 110 | 111 | 112 | class Step(models.Model): 113 | steps = models.IntegerField() 114 | curr_time = models.DateTimeField(default=datetime.datetime.now()) 115 | 116 | def __str__(self): 117 | return '{steps:%d, time:%s}' % (self.steps, self.curr_time.strftime('%Y-%m-%d %H:%M:%S')) 118 | 119 | 120 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/mzi/mzi/__init__.py -------------------------------------------------------------------------------- /spiders/mzi/mzi/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class MziItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MziSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 
26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class MziPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for mzi project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'mzi' 13 | 14 | SPIDER_MODULES = ['mzi.spiders'] 15 | NEWSPIDER_MODULE = 'mzi.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'mzi (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | DEFAULT_REQUEST_HEADERS = { 43 | 'Accept': '*/*', 44 | # 'Accept-Encoding': 'gzip,deflate,sdch', 45 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 46 | 'Host': 'baike.baidu.com', 47 | 'Proxy-Connection': 'keep-alive', 48 | 'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5", 49 | 50 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 51 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 52 | } 53 | 54 | # Enable or disable spider middlewares 55 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 56 | #SPIDER_MIDDLEWARES = { 57 | # 'mzi.middlewares.MziSpiderMiddleware': 543, 58 | #} 59 | 60 | # Enable or disable downloader middlewares 61 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 62 | #DOWNLOADER_MIDDLEWARES = { 63 | # 'mzi.middlewares.MyCustomDownloaderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable extensions 67 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 68 | #EXTENSIONS = { 69 | # 'scrapy.extensions.telnet.TelnetConsole': None, 70 | #} 71 | 72 | # Configure item pipelines 73 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 74 | #ITEM_PIPELINES = { 75 | # 'mzi.pipelines.MziPipeline': 300, 76 | #} 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | #AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | #AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | #AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | #AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | #HTTPCACHE_ENABLED = True 94 | #HTTPCACHE_EXPIRATION_SECS = 0 95 | #HTTPCACHE_DIR = 'httpcache' 96 | 
#HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/baikerank.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | class BaikeRank(scrapy.Spider): 4 | name = 'baikerank' 5 | start_urls = [ 6 | 'http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek' 7 | ] 8 | 9 | def parse(self, response): 10 | rt = json.loads(response.body()) 11 | print('_'*50) 12 | print(rt) -------------------------------------------------------------------------------- /spiders/mzi/mzi/spiders/meizi.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import os 4 | import urllib.request 5 | 6 | class Meizi(scrapy.Spider): 7 | name = 'mzi' 8 | # allowed_domains = ['http://www.mzitu.com/'] 9 | start_urls = [ 10 | 'http://www.mzitu.com/' 11 | ] 12 | href_pattern = re.compile('(.*?)') 13 | base_dir = '/Users/didi/crawler/mzi/' 14 | 15 | def parse(self, response): 16 | 17 | header_hrefs = response.css('ul.menu li a::attr(href)').extract() 18 | for ref in header_hrefs: 19 | print(ref) 20 | yield scrapy.Request(ref, callback=self.parse_classify) 21 | 22 | def parse_classify(self, response): 23 | pic_hrefs = response.css('div.postlist ul li a::attr(href)').extract() 24 | for href in pic_hrefs: 25 | yield scrapy.Request(url=href, callback=self.parse_detail) 26 | 27 | max_page = int(response.css('a.page-numbers::text').extract()[-2]) 28 | base_url = (response.url if response.url.endswith('/') else response.url+'/') + 'page/' 29 | for pn in range(max_page+1): 30 | yield scrapy.Request(url=base_url+str(pn), callback=self.parse_classify) 31 | 32 | def parse_detail(self, response): 33 | title = response.css('div.main-image img::attr(alt)').extract()[0] 34 | if not os.path.exists(self.base_dir+title): 35 | os.mkdir(self.base_dir+title) 36 | 37 | img_src = response.css('div.main-image img::attr(src)').extract()[0] 38 | img_path = self.base_dir+title+'/'+img_src[img_src.rindex('/')+1:] 39 | urllib.request.urlretrieve(img_src, img_path) 40 | max_page = int(response.css('div.pagenavi a span::text').extract()[-2]) 41 | 42 | base_url = response.url if response.url.endswith('/') else response.url+'/' 43 | for pn in range(max_page+1): 44 | yield scrapy.Request(base_url+str(pn), callback=self.parse_detail) 45 | 46 | print(img_path) 47 | 48 | -------------------------------------------------------------------------------- /spiders/mzi/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = mzi.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = mzi 12 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/items.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class OnepieceItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class OnepieceSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class OnepiecePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for onepiece project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'onepiece' 13 | 14 | SPIDER_MODULES = ['onepiece.spiders'] 15 | NEWSPIDER_MODULE = 'onepiece.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'onepiece (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'onepiece.middlewares.OnepieceSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'onepiece.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'onepiece.pipelines.OnepiecePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /spiders/onepiece/onepiece/spiders/one_piece.py: 
-------------------------------------------------------------------------------- 1 | import scrapy 2 | import urllib.request 3 | 4 | class OnePiece(scrapy.Spider): 5 | name = 'onepiece' 6 | start_urls = [ 7 | 'http://www.dlkoo.com/down/3/2015/368456186.html', 8 | ] 9 | base_dir = '/Users/didi/crawler/onepiece/' 10 | def parse(self, response): 11 | url = response.url 12 | link_list = response.xpath("//div[@id='dlinklist']").css("a::attr(href)").extract() 13 | torrent_url = 'http://www.dlkoo.com/down/downfile.asp?act=subb&n=%s/downfile2.asp?act=down&n=%s' 14 | 15 | for link in link_list: 16 | id = link.split("=")[1] 17 | 18 | urllib.request.urlretrieve(torrent_url % (id, id)) 19 | 20 | pass 21 | -------------------------------------------------------------------------------- /spiders/onepiece/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = onepiece.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = onepiece 12 | -------------------------------------------------------------------------------- /spiders/pangxieyg/README.MD: -------------------------------------------------------------------------------- 1 | ## 螃蟹云购 2 | 监控商品有无货,有货时提醒 -------------------------------------------------------------------------------- /spiders/pangxieyg/README.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/README.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = 'pyinstaller -F pangxie2.py' -------------------------------------------------------------------------------- /spiders/pangxieyg/alert-templates.xml: -------------------------------------------------------------------------------- 1 | 2 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 28 | 121 |
[alert-templates.xml, HTML body of the alert notification template (markup inside the CDATA section was stripped in this dump). The surviving Velocity directives show a "Services Reporting Alerts" summary that loops over $alertStates and prints $summary.getServicesByAlertState($alertState), followed by a per-$service section that loops over $summary.getAlerts($service,$alertState) and renders $alert.getAlertDefinition().getLabel() and $alert.getAlertText(); the closing ]]> and the remaining template elements were likewise stripped.]
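The pangxieyg README above describes the tool as a stock monitor for 螃蟹云购: it polls the listed goods and sends a reminder when an item comes back in stock, driven by the conf/info.conf shown further below (timeout, sleep interval, goods ids, mail recipients, thread count). A minimal sketch of that polling loop, assuming hypothetical check_in_stock() and notify() helpers in place of the project's own HTTP and mail code:

# Minimal polling-loop sketch for the pangxieyg stock monitor.
# check_in_stock() and notify() are hypothetical stand-ins for the
# project's real request and mail helpers; the conf keys follow conf/info.conf.
import configparser
import time


def check_in_stock(goods_id):
    """Placeholder: query the shop page for goods_id and return True if in stock."""
    return False


def notify(recipients, goods_id):
    """Placeholder: send an in-stock reminder to the configured mail addresses."""
    print('notify %s: goods %s is in stock' % (recipients, goods_id))


def monitor(conf_path='conf/info.conf'):
    conf = configparser.ConfigParser()
    conf.read(conf_path)
    goods = conf.get('user', 'goods').split()       # e.g. "109121 111592 121557 107494"
    recipients = conf.get('mail', 'to').split()
    sleep_sec = conf.getint('main', 'sleep')
    while True:
        for goods_id in goods:
            if check_in_stock(goods_id):
                notify(recipients, goods_id)
        time.sleep(sleep_sec)


if __name__ == '__main__':
    monitor()

In the real module the result of each check is also appended to a dated file under record/, as the record/*.txt excerpts further below show.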
-------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-EXE.toc: -------------------------------------------------------------------------------- 1 | ('E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\dist\\pangxie2.exe', 2 | True, 3 | False, 4 | False, 5 | None, 6 | None, 7 | False, 8 | False, 9 | u'', 10 | True, 11 | 'pangxie2.pkg', 12 | [('out00-PYZ.pyz', 13 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PYZ.pyz', 14 | 'PYZ'), 15 | ('struct', 'd:\\python27\\lib\\struct.pyc', 'PYMODULE'), 16 | ('pyimod01_os_path', 17 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod01_os_path.pyc', 18 | 'PYMODULE'), 19 | ('pyimod02_archive', 20 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod02_archive.pyc', 21 | 'PYMODULE'), 22 | ('pyimod03_importers', 23 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod03_importers.pyc', 24 | 'PYMODULE'), 25 | ('pyiboot01_bootstrap', 26 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyiboot01_bootstrap.py', 27 | 'PYSOURCE'), 28 | ('pangxie2', 29 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\pangxie2.py', 30 | 'PYSOURCE'), 31 | (u'Microsoft.VC90.CRT.manifest', 32 | u'C:\\windows\\WinSxS\\Manifests\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57.manifest', 33 | 'BINARY'), 34 | (u'msvcr90.dll', 35 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcr90.dll', 36 | 'BINARY'), 37 | (u'msvcp90.dll', 38 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcp90.dll', 39 | 'BINARY'), 40 | (u'msvcm90.dll', 41 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcm90.dll', 42 | 'BINARY'), 43 | ('python27.dll', 'C:\\windows\\system32\\python27.dll', 'BINARY'), 44 | ('unicodedata', 'd:\\python27\\DLLs\\unicodedata.pyd', 'EXTENSION'), 45 | ('bz2', 'd:\\python27\\DLLs\\bz2.pyd', 'EXTENSION'), 46 | ('_hashlib', 'd:\\python27\\DLLs\\_hashlib.pyd', 'EXTENSION'), 47 | ('_ssl', 'd:\\python27\\DLLs\\_ssl.pyd', 'EXTENSION'), 48 | ('_socket', 'd:\\python27\\DLLs\\_socket.pyd', 'EXTENSION'), 49 | ('select', 'd:\\python27\\DLLs\\select.pyd', 'EXTENSION'), 50 | ('pyexpat', 'd:\\python27\\DLLs\\pyexpat.pyd', 'EXTENSION'), 51 | ('_ctypes', 'd:\\python27\\DLLs\\_ctypes.pyd', 'EXTENSION'), 52 | ('win32pipe', 53 | 'd:\\python27\\lib\\site-packages\\win32\\win32pipe.pyd', 54 | 'EXTENSION'), 55 | ('_multiprocessing', 56 | 'd:\\python27\\DLLs\\_multiprocessing.pyd', 57 | 'EXTENSION'), 58 | ('win32evtlog', 59 | 'd:\\python27\\lib\\site-packages\\win32\\win32evtlog.pyd', 60 | 'EXTENSION'), 61 | ('win32api', 62 | 'd:\\python27\\lib\\site-packages\\win32\\win32api.pyd', 63 | 'EXTENSION'), 64 | ('_portaudio', 65 | 'd:\\python27\\lib\\site-packages\\_portaudio.pyd', 66 | 'EXTENSION'), 67 | ('win32wnet', 68 | 'd:\\python27\\lib\\site-packages\\win32\\win32wnet.pyd', 69 | 'EXTENSION'), 70 | ('pywintypes27.dll', 71 | 'd:\\python27\\lib\\site-packages\\pywin32_system32\\pywintypes27.dll', 72 | 'BINARY'), 73 | ('certifi\\cacert.pem', 74 | 'd:\\python27\\lib\\site-packages\\certifi\\cacert.pem', 75 | 'DATA'), 76 | ('certifi\\old_root.pem', 77 | 'd:\\python27\\lib\\site-packages\\certifi\\old_root.pem', 78 | 'DATA'), 79 | ('Include\\pyconfig.h', 'd:\\python27\\Include\\pyconfig.h', 'DATA'), 80 | ('certifi\\weak.pem', 81 | 
'd:\\python27\\lib\\site-packages\\certifi\\weak.pem', 82 | 'DATA'), 83 | ('pangxie2.exe.manifest', 84 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\pangxie2.exe.manifest', 85 | 'BINARY'), 86 | ('pyi-windows-manifest-filename pangxie2.exe.manifest', '', 'OPTION')], 87 | [], 88 | False, 89 | False, 90 | 1497602441L, 91 | [('run.exe', 92 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\bootloader\\Windows-32bit\\run.exe', 93 | 'EXECUTABLE')]) 94 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PKG.pkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/build/pangxie2/out00-PKG.pkg -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PKG.toc: -------------------------------------------------------------------------------- 1 | ('E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PKG.pkg', 2 | {'BINARY': 1, 3 | 'DATA': 1, 4 | 'EXECUTABLE': 1, 5 | 'EXTENSION': 1, 6 | 'PYMODULE': 1, 7 | 'PYSOURCE': 1, 8 | 'PYZ': 0}, 9 | [('out00-PYZ.pyz', 10 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\out00-PYZ.pyz', 11 | 'PYZ'), 12 | ('struct', 'd:\\python27\\lib\\struct.pyc', 'PYMODULE'), 13 | ('pyimod01_os_path', 14 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod01_os_path.pyc', 15 | 'PYMODULE'), 16 | ('pyimod02_archive', 17 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod02_archive.pyc', 18 | 'PYMODULE'), 19 | ('pyimod03_importers', 20 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyimod03_importers.pyc', 21 | 'PYMODULE'), 22 | ('pyiboot01_bootstrap', 23 | 'd:\\python27\\lib\\site-packages\\PyInstaller\\loader\\pyiboot01_bootstrap.py', 24 | 'PYSOURCE'), 25 | ('pangxie2', 26 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\pangxie2.py', 27 | 'PYSOURCE'), 28 | (u'Microsoft.VC90.CRT.manifest', 29 | u'C:\\windows\\WinSxS\\Manifests\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57.manifest', 30 | 'BINARY'), 31 | (u'msvcr90.dll', 32 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcr90.dll', 33 | 'BINARY'), 34 | (u'msvcp90.dll', 35 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcp90.dll', 36 | 'BINARY'), 37 | (u'msvcm90.dll', 38 | u'C:\\windows\\WinSxS\\x86_microsoft.vc90.crt_1fc8b3b9a1e18e3b_9.0.30729.6161_none_50934f2ebcb7eb57\\msvcm90.dll', 39 | 'BINARY'), 40 | ('python27.dll', 'C:\\windows\\system32\\python27.dll', 'BINARY'), 41 | ('unicodedata', 'd:\\python27\\DLLs\\unicodedata.pyd', 'EXTENSION'), 42 | ('bz2', 'd:\\python27\\DLLs\\bz2.pyd', 'EXTENSION'), 43 | ('_hashlib', 'd:\\python27\\DLLs\\_hashlib.pyd', 'EXTENSION'), 44 | ('_ssl', 'd:\\python27\\DLLs\\_ssl.pyd', 'EXTENSION'), 45 | ('_socket', 'd:\\python27\\DLLs\\_socket.pyd', 'EXTENSION'), 46 | ('select', 'd:\\python27\\DLLs\\select.pyd', 'EXTENSION'), 47 | ('pyexpat', 'd:\\python27\\DLLs\\pyexpat.pyd', 'EXTENSION'), 48 | ('_ctypes', 'd:\\python27\\DLLs\\_ctypes.pyd', 'EXTENSION'), 49 | ('win32pipe', 50 | 'd:\\python27\\lib\\site-packages\\win32\\win32pipe.pyd', 51 | 'EXTENSION'), 52 | ('_multiprocessing', 53 | 'd:\\python27\\DLLs\\_multiprocessing.pyd', 54 | 'EXTENSION'), 55 | ('win32evtlog', 56 | 
'd:\\python27\\lib\\site-packages\\win32\\win32evtlog.pyd', 57 | 'EXTENSION'), 58 | ('win32api', 59 | 'd:\\python27\\lib\\site-packages\\win32\\win32api.pyd', 60 | 'EXTENSION'), 61 | ('_portaudio', 62 | 'd:\\python27\\lib\\site-packages\\_portaudio.pyd', 63 | 'EXTENSION'), 64 | ('win32wnet', 65 | 'd:\\python27\\lib\\site-packages\\win32\\win32wnet.pyd', 66 | 'EXTENSION'), 67 | ('pywintypes27.dll', 68 | 'd:\\python27\\lib\\site-packages\\pywin32_system32\\pywintypes27.dll', 69 | 'BINARY'), 70 | ('certifi\\cacert.pem', 71 | 'd:\\python27\\lib\\site-packages\\certifi\\cacert.pem', 72 | 'DATA'), 73 | ('certifi\\old_root.pem', 74 | 'd:\\python27\\lib\\site-packages\\certifi\\old_root.pem', 75 | 'DATA'), 76 | ('Include\\pyconfig.h', 'd:\\python27\\Include\\pyconfig.h', 'DATA'), 77 | ('certifi\\weak.pem', 78 | 'd:\\python27\\lib\\site-packages\\certifi\\weak.pem', 79 | 'DATA'), 80 | ('pangxie2.exe.manifest', 81 | 'E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg\\build\\pangxie2\\pangxie2.exe.manifest', 82 | 'BINARY'), 83 | ('pyi-windows-manifest-filename pangxie2.exe.manifest', '', 'OPTION')], 84 | False, 85 | False, 86 | False) 87 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/out00-PYZ.pyz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/build/pangxie2/out00-PYZ.pyz -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/pangxie2.exe.manifest: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /spiders/pangxieyg/build/pangxie2/warnpangxie2.txt: -------------------------------------------------------------------------------- 1 | missing module named org - imported by copy 2 | missing module named fcntl - imported by tempfile, subprocess 3 | missing module named readline - imported by cmd, pdb 4 | missing module named pwd - imported by posixpath, getpass, netrc, shutil, tarfile 5 | missing module named _dummy_threading - imported by dummy_threading 6 | missing module named _scproxy - imported by urllib 7 | missing module named EasyDialogs - imported by getpass 8 | missing module named termios - imported by getpass 9 | missing module named SOCKS - imported by ftplib 10 | missing module named rourl2path - imported by urllib 11 | missing module named vms_lib - imported by platform 12 | missing module named 'org.python' - imported by pickle, xml.sax 13 | missing module named 'java.lang' - imported by platform, xml.sax._exceptions 14 | missing module named java - imported by platform 15 | missing module named _xmlplus - imported by xml 16 | missing module named 'Carbon.File' - imported by plistlib 17 | missing module named 'Carbon.Files' - imported by plistlib 18 | missing module named Carbon - imported by plistlib 19 | missing module named MacOS - imported by platform 20 | missing module named macresource - imported by MacOS 21 | missing module named gestalt - imported by platform 22 | missing module named winreg.OpenKeyEx - imported by winreg, platform 23 | missing module named winreg.HKEY_LOCAL_MACHINE - imported by winreg, platform 24 | missing module named winreg.QueryValueEx - imported by winreg, platform 25 | missing module 
named winreg.CloseKey - imported by winreg, platform 26 | missing module named riscosenviron - imported by os 27 | missing module named riscospath - imported by os 28 | missing module named riscos - imported by os 29 | missing module named ce - imported by os 30 | missing module named _emx_link - imported by os 31 | missing module named os2 - imported by os 32 | missing module named posix - imported by os 33 | missing module named resource - imported by posix 34 | missing module named _xmlrpclib - imported by xmlrpclib 35 | missing module named _sysconfigdata - imported by distutils.sysconfig 36 | missing module named grp - imported by shutil, tarfile 37 | missing module named 'urllib.request' - imported by requests.compat 38 | missing module named 'urllib.parse' - imported by requests.compat 39 | missing module named ipaddress - imported by urllib3.packages.ssl_match_hostname._implementation 40 | missing module named backports - imported by urllib3.packages.ssl_match_hostname 41 | missing module named simplejson - imported by requests.compat 42 | missing module named 'urllib3.packages.six.moves' - imported by urllib3.exceptions, urllib3.connectionpool, urllib3.connection, urllib3.util.response, urllib3.request, urllib3.response, urllib3.poolmanager 43 | missing module named socks - imported by urllib3.contrib.socks 44 | missing module named 'OpenSSL.crypto' - imported by urllib3.contrib.pyopenssl 45 | missing module named 'cryptography.x509' - imported by urllib3.contrib.pyopenssl 46 | missing module named six - imported by urllib3.contrib.pyopenssl 47 | missing module named 'cryptography.hazmat' - imported by urllib3.contrib.pyopenssl 48 | missing module named cryptography - imported by urllib3.contrib.pyopenssl 49 | missing module named OpenSSL - imported by urllib3.contrib.pyopenssl 50 | -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/conf/README.MD -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/info.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | timeout=10 3 | sleep=1 4 | [user] 5 | username=13486178520 6 | password=**** 7 | goods=109121 111592 121557 107494 8 | [mail] 9 | to=767543579@qq.com jjzhu_ncu@163.com 10 | [thread] 11 | thread_num=4 -------------------------------------------------------------------------------- /spiders/pangxieyg/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=INFO 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=INFO 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=INFO 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=INFO 31 | formatter=simpleFmt 32 | 
args=('./log/pangxieyg.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=INFO 37 | formatter=simpleFmt 38 | args=('./log/pangxieyg.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/pangxieyg/dist.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist.zip -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/README.MD: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist/conf/README.MD -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/info.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | timeout=10 3 | sleep=1 4 | [user] 5 | username=13486178520 6 | password=vs7452014 7 | goods=109121 111592 121557 107494 8 | [mail] 9 | to=767543579@qq.com jjzhu_ncu@163.com 10 | [thread] 11 | thread_num=4 -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/conf/logging.conf: -------------------------------------------------------------------------------- 1 | #logging.conf 2 | ###################################################################### 3 | [loggers] # 配置了两个logger 4 | keys=root, simpleLogger 5 | 6 | [handlers] # 配置所需要的handler 7 | keys=consoleHandler,fileHandler,rotatingFileHandler 8 | 9 | [formatters] # 配置formatter 10 | keys=simpleFmt 11 | 12 | [logger_root] 13 | level=INFO 14 | handlers=rotatingFileHandler 15 | 16 | [logger_simpleLogger] # 对simpleLogger进行相关配置 17 | level=INFO 18 | handlers=consoleHandler,rotatingFileHandler 19 | qualname=simpleLogger 20 | propagate=0 21 | 22 | [handler_consoleHandler] # 在控制台输出日志信息的处理方式 23 | class=StreamHandler 24 | level=INFO 25 | formatter=simpleFmt 26 | args=(sys.stdout,) 27 | 28 | [handler_fileHandler] 29 | class=FileHandler 30 | level=INFO 31 | formatter=simpleFmt 32 | args=('./log/pangxieyg.log','a') 33 | 34 | [handler_rotatingFileHandler] # 设置日志备份 35 | class=handlers.RotatingFileHandler 36 | level=INFO 37 | formatter=simpleFmt 38 | args=('./log/pangxieyg.log','a',50*1024*1024, 10) 39 | 40 | [formatter_simpleFmt] 41 | format=%(asctime)s - %(levelname)s - [%(filename)s:%(lineno)s] - %(message)s 42 | datefmt='%Y-%m-%d %H:%M:%S' -------------------------------------------------------------------------------- /spiders/pangxieyg/dist/pangxie2.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/dist/pangxie2.exe -------------------------------------------------------------------------------- /spiders/pangxieyg/i.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['i', 'ico.ico', 'py'], 7 | pathex=['E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | 
runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher) 17 | pyz = PYZ(a.pure, a.zipped_data, 18 | cipher=block_cipher) 19 | exe = EXE(pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.zipfiles, 23 | a.datas, 24 | name='i', 25 | debug=False, 26 | strip=False, 27 | upx=True, 28 | console=True ) 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/ico.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/ico.ico -------------------------------------------------------------------------------- /spiders/pangxieyg/notify.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/notify.wav -------------------------------------------------------------------------------- /spiders/pangxieyg/pangxie2.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python -*- 2 | 3 | block_cipher = None 4 | 5 | 6 | a = Analysis(['pangxie2.py'], 7 | pathex=['E:\\codingspace\\python\\Jpider\\spiders\\pangxieyg'], 8 | binaries=[], 9 | datas=[], 10 | hiddenimports=[], 11 | hookspath=[], 12 | runtime_hooks=[], 13 | excludes=[], 14 | win_no_prefer_redirects=False, 15 | win_private_assemblies=False, 16 | cipher=block_cipher) 17 | pyz = PYZ(a.pure, a.zipped_data, 18 | cipher=block_cipher) 19 | exe = EXE(pyz, 20 | a.scripts, 21 | a.binaries, 22 | a.zipfiles, 23 | a.datas, 24 | name='pangxie2', 25 | debug=False, 26 | strip=False, 27 | upx=True, 28 | console=True ) 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/pangxieyg.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import urllib.request 3 | import http.cookiejar 4 | # from . 
import user_agent 5 | import random 6 | class PangXie: 7 | def make_opener(self): 8 | cj = http.cookiejar.CookieJar() 9 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 10 | header = [] 11 | head = { 12 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 13 | 'Accept-Encoding': 'gzip, deflate, sdch', 14 | 'Accept-Language': 'zh-CN,zh;q=0.8', 15 | 'Connection': 'keep-alive', 16 | 'Content-Length': '254', 17 | 'Host': 'www.pangxieyg.com', 18 | 19 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' 20 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML,' 21 | # ' like Gecko) Chrome/37.0.2062.124 Safari/537.36' 22 | } 23 | for key, value in head.items(): 24 | elem = (key, value) 25 | header.append(elem) 26 | opener.addheaders = header 27 | return opener 28 | 29 | if __name__ == '__main__': 30 | px = PangXie() 31 | opener = px.make_opener() 32 | opener.open('http://www.pangxieyg.com/wap/') 33 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/104808_2017-06-02.txt: -------------------------------------------------------------------------------- 1 | 2017-06-02 16:02:57 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 2 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/104808_2017-06-06.txt: -------------------------------------------------------------------------------- 1 | 2017-06-06 20:08:32 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 2 | 2017-06-06 20:25:56 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 3 | 2017-06-06 20:26:21 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 4 | 2017-06-06 20:26:25 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 5 | 2017-06-06 20:26:36 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 6 | 2017-06-06 20:26:44 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 7 | 2017-06-06 20:26:57 宗品汇系列 正宗农家散养余干乌黑鸡1200g以上 乌黑鸡 1200g以上 有货啦 8 | -------------------------------------------------------------------------------- /spiders/pangxieyg/record/107494_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/107494_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/107494_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/107494_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/109121_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/109121_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/109121_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/109121_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/111592_2017-06-02.txt: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/111592_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/111592_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/111592_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/121557_2017-06-02.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/121557_2017-06-02.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/record/121557_2017-06-06.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/pangxieyg/record/121557_2017-06-06.txt -------------------------------------------------------------------------------- /spiders/pangxieyg/sound.py: -------------------------------------------------------------------------------- 1 | # from win32com.client import Dispatch 2 | # 3 | # 4 | # while True: 5 | # wmp = Dispatch("WMPlayer.OCX") 6 | # media = wmp.newMedia("D:/CloudMusic/双笙 - 小幸运.mp3") 7 | # wmp.currentPlaylist.appendItem(media) 8 | # wmp.controls.play() 9 | import pyaudio 10 | import wave 11 | 12 | 13 | chunk = 1024 14 | wf = wave.open(r'C:\Windows\Media\notify.wav', 'rb') 15 | p = pyaudio.PyAudio() 16 | stream = p.open(format=p.get_format_from_width(wf.getsampwidth()), 17 | channels=wf.getnchannels(), 18 | rate=wf.getframerate(), 19 | output=True) 20 | 21 | # 写声音输出流进行播放 22 | while True: 23 | data = wf.readframes(chunk) 24 | if data == b'': 25 | break 26 | stream.write(data) 27 | stream.close() 28 | p.terminate() 29 | -------------------------------------------------------------------------------- /spiders/pangxieyg/winui.py: -------------------------------------------------------------------------------- 1 | 2 | # Creates a task-bar icon. Run from Python.exe to see the 3 | # messages printed. 4 | import win32api, win32gui 5 | import win32con, winerror 6 | import sys, os 7 | 8 | class MainWindow: 9 | def __init__(self): 10 | msg_TaskbarRestart = win32gui.RegisterWindowMessage("TaskbarCreated"); 11 | message_map = { 12 | msg_TaskbarRestart: self.OnRestart, 13 | win32con.WM_DESTROY: self.OnDestroy, 14 | win32con.WM_COMMAND: self.OnCommand, 15 | win32con.WM_USER+20 : self.OnTaskbarNotify, 16 | } 17 | # Register the Window class. 18 | wc = win32gui.WNDCLASS() 19 | hinst = wc.hInstance = win32api.GetModuleHandle(None) 20 | wc.lpszClassName = "PythonTaskbarDemo" 21 | wc.style = win32con.CS_VREDRAW | win32con.CS_HREDRAW; 22 | wc.hCursor = win32api.LoadCursor( 0, win32con.IDC_ARROW ) 23 | wc.hbrBackground = win32con.COLOR_WINDOW 24 | wc.lpfnWndProc = message_map # could also specify a wndproc. 25 | 26 | # Don't blow up if class already registered to make testing easier 27 | try: 28 | classAtom = win32gui.RegisterClass(wc) 29 | except win32gui.error, err_info: 30 | if err_info.winerror!=winerror.ERROR_CLASS_ALREADY_EXISTS: 31 | raise 32 | 33 | # Create the Window. 
34 | style = win32con.WS_OVERLAPPED | win32con.WS_SYSMENU 35 | self.hwnd = win32gui.CreateWindow( wc.lpszClassName, "Taskbar Demo", style, \ 36 | 0, 0, win32con.CW_USEDEFAULT, win32con.CW_USEDEFAULT, \ 37 | 0, 0, hinst, None) 38 | win32gui.UpdateWindow(self.hwnd) 39 | self._DoCreateIcons() 40 | def _DoCreateIcons(self): 41 | # Try and find a custom icon 42 | hinst = win32api.GetModuleHandle(None) 43 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "pyc.ico" )) 44 | if not os.path.isfile(iconPathName): 45 | # Look in DLLs dir, a-la py 2.5 46 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "DLLs", "pyc.ico" )) 47 | if not os.path.isfile(iconPathName): 48 | # Look in the source tree. 49 | iconPathName = os.path.abspath(os.path.join( os.path.split(sys.executable)[0], "..\\PC\\pyc.ico" )) 50 | if os.path.isfile(iconPathName): 51 | icon_flags = win32con.LR_LOADFROMFILE | win32con.LR_DEFAULTSIZE 52 | hicon = win32gui.LoadImage(hinst, iconPathName, win32con.IMAGE_ICON, 0, 0, icon_flags) 53 | else: 54 | print "Can't find a Python icon file - using default" 55 | hicon = win32gui.LoadIcon(0, win32con.IDI_APPLICATION) 56 | 57 | flags = win32gui.NIF_ICON | win32gui.NIF_MESSAGE | win32gui.NIF_TIP 58 | nid = (self.hwnd, 0, flags, win32con.WM_USER+20, hicon, "Python Demo") 59 | try: 60 | win32gui.Shell_NotifyIcon(win32gui.NIM_ADD, nid) 61 | except win32gui.error: 62 | # This is common when windows is starting, and this code is hit 63 | # before the taskbar has been created. 64 | print "Failed to add the taskbar icon - is explorer running?" 65 | # but keep running anyway - when explorer starts, we get the 66 | # TaskbarCreated message. 67 | 68 | def OnRestart(self, hwnd, msg, wparam, lparam): 69 | self._DoCreateIcons() 70 | 71 | def OnDestroy(self, hwnd, msg, wparam, lparam): 72 | nid = (self.hwnd, 0) 73 | win32gui.Shell_NotifyIcon(win32gui.NIM_DELETE, nid) 74 | win32gui.PostQuitMessage(0) # Terminate the app. 75 | 76 | def OnTaskbarNotify(self, hwnd, msg, wparam, lparam): 77 | if lparam==win32con.WM_LBUTTONUP: 78 | print "You clicked me." 79 | elif lparam==win32con.WM_LBUTTONDBLCLK: 80 | print "You double-clicked me - goodbye" 81 | win32gui.DestroyWindow(self.hwnd) 82 | elif lparam==win32con.WM_RBUTTONUP: 83 | print "You right clicked me." 
84 | menu = win32gui.CreatePopupMenu() 85 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1023, "Display Dialog") 86 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1024, "Say Hello") 87 | win32gui.AppendMenu( menu, win32con.MF_STRING, 1025, "Exit program" ) 88 | pos = win32gui.GetCursorPos() 89 | # See http://msdn.microsoft.com/library/default.asp?url=/library/en-us/winui/menus_0hdi.asp 90 | win32gui.SetForegroundWindow(self.hwnd) 91 | win32gui.TrackPopupMenu(menu, win32con.TPM_LEFTALIGN, pos[0], pos[1], 0, self.hwnd, None) 92 | win32gui.PostMessage(self.hwnd, win32con.WM_NULL, 0, 0) 93 | return 1 94 | 95 | def OnCommand(self, hwnd, msg, wparam, lparam): 96 | id = win32api.LOWORD(wparam) 97 | if id == 1023: 98 | import win32gui_dialog 99 | win32gui_dialog.DemoModal() 100 | elif id == 1024: 101 | print "Hello" 102 | elif id == 1025: 103 | print "Goodbye" 104 | win32gui.DestroyWindow(self.hwnd) 105 | else: 106 | print "Unknown command -", id 107 | 108 | def main(): 109 | w=MainWindow() 110 | win32gui.PumpMessages() 111 | 112 | if __name__=='__main__': 113 | main() 114 | 115 | 116 | -------------------------------------------------------------------------------- /spiders/rank/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/rank/__init__.py -------------------------------------------------------------------------------- /spiders/rank/baike_rank.py: -------------------------------------------------------------------------------- 1 | 2 | import http.cookiejar 3 | import urllib.request 4 | from spiders import user_agent 5 | import random 6 | import json 7 | 8 | class Rank(): 9 | def make_my_opener(self): 10 | """ 11 | 模拟浏览器发送请求 12 | :return: 13 | """ 14 | cj = http.cookiejar.CookieJar() 15 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 16 | header = [] 17 | head = { 18 | 'Accept': '*/*', 19 | # 'Accept-Encoding': 'gzip,deflate,sdch', 20 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 21 | 'Host': 'baike.baidu.com', 22 | 'Proxy-Connection': 'keep-alive', 23 | 'User-Agent': user_agent.agents[random.randint(0, len(user_agent.agents) - 1)] 24 | 25 | # 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36' 26 | # ' (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36' 27 | } 28 | for key, value in head.items(): 29 | elem = (key, value) 30 | header.append(elem) 31 | opener.addheaders = header 32 | return opener 33 | 34 | def start(self): 35 | opener = self.make_my_opener() 36 | max_page = 50 37 | rsp = opener.open('http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek') 38 | 39 | rsp_json = json.loads(rsp.read().decode()) 40 | 41 | print(rsp_json) 42 | for pn in range(1, max_page): 43 | 44 | rsp = opener.open('http://baike.baidu.com/starflower/api/starflowerstarlist?rankType=thisWeek&pg=%d' % pn) 45 | rsp_json = json.loads(rsp.read().decode()) 46 | 47 | print(rsp_json) 48 | 49 | 50 | if __name__ == '__main__': 51 | rank = Rank() 52 | rank.start() -------------------------------------------------------------------------------- /spiders/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | -------------------------------------------------------------------------------- /spiders/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create 
your views here. 4 | -------------------------------------------------------------------------------- /spiders/wechat_sport/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/wechat_sport/__init__.py -------------------------------------------------------------------------------- /spiders/wechat_sport/get_steps.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import json 4 | import datetime 5 | from pprint import pprint 6 | import sys 7 | import os 8 | import django 9 | from time import sleep 10 | sys.path.append('../../') 11 | sys.path.append('../') 12 | os.environ['DJANGO_SETTINGS_MODULE'] = 'Jpider.settings' 13 | django.setup() 14 | from spiders.models import Step 15 | HEADERS = { 16 | 17 | 'Host': 'hw.weixin.qq.com', 18 | 'Connection': 'keep-alive', 19 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 20 | 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 22 | 'Chrome/39.0.2171.95 Safari/537.36 MicroMessenger/6.5.2.501 NetType/WIFI ' 23 | 'WindowsWechat QBCore/3.43.556.400 QQBrowser/9.0.2524.400', 24 | 'Accept-Encoding': 'gzip, deflate', 25 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-us;q=0.6,en;q=0.5;q=0.4', 26 | 'Cookie': 'hwstepranksk=uiNfWaL2l5E6ItwiqWdoU9gbuSnCWw2vxj-5_7i7U6QH6eWZ;' 27 | } 28 | 29 | url = 'https://hw.weixin.qq.com/steprank/step/personal' 30 | while True: 31 | resp = requests.get(url=url, params={ 32 | # 'pass_ticket': 'wHHOyL%2BvmKG1LE5VIuKgnrVj825Zv9dFN6HzwqXRZ9IpyQ6I6EcmRXkBtXTB5fAY' 33 | }, headers=HEADERS).text 34 | match_strings = re.findall(r"window.json = (\S+);", resp) 35 | 36 | resp_json = json.loads(match_strings[0]) 37 | step = Step() 38 | step.steps = resp_json['rankdesc']['score'] 39 | step.curr_time = datetime.datetime.now() 40 | step.save() 41 | pprint(step) 42 | sleep(2*60) 43 | -------------------------------------------------------------------------------- /spiders/weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__init__.py -------------------------------------------------------------------------------- /spiders/weibo/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /spiders/weibo/__pycache__/user_agent.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/spiders/weibo/__pycache__/user_agent.cpython-35.pyc -------------------------------------------------------------------------------- /spiders/weibo/conf/README.md: -------------------------------------------------------------------------------- 1 | 在conf/目录下创建account.conf文件 2 | 用户名和密码空格分隔 3 | 一行一个用户 4 | e.g. 
5 | 6 | xxxx@163.com password_123 7 | bbbb@163.com password_234 -------------------------------------------------------------------------------- /spiders/weibo/conf/account.conf: -------------------------------------------------------------------------------- 1 | 767543579@qq.com JOPPER -------------------------------------------------------------------------------- /spiders/weibo/dao.py: -------------------------------------------------------------------------------- 1 | from spiders.models import Weibo, WeiboUser, UserRelationship 2 | from spiders.logger import LOGGER 3 | from django.db.models import Q 4 | 5 | def save_blog_info( blog_info): 6 | 7 | try: 8 | # 存在就更新,不存在就创建 9 | weibo = Weibo.objects.get(pk=blog_info['id']) 10 | except Weibo.DoesNotExist: 11 | weibo = Weibo() 12 | weibo.id = str(blog_info['id']) 13 | weibo.created_timestamp = blog_info['created_at'] 14 | # self.user_enqueue(ret_weibo.user.id) 15 | weibo.source = blog_info['source'] 16 | weibo.text = blog_info['text'] 17 | try: 18 | user = WeiboUser.objects.get(pk=blog_info['user']['id']) 19 | except WeiboUser.DoesNotExist: 20 | user = save_user_info(blog_info['user']) 21 | weibo.user = user 22 | weibo.save() 23 | LOGGER.info(weibo) 24 | if weibo.retweented_status is not None: # 添加关系 25 | 26 | try: 27 | r = UserRelationship.objects.get(Q(user=weibo.retweented_status.user) & Q(follower=weibo.user)) 28 | LOGGER.info('relationship already exist:' + str(r)) 29 | except UserRelationship.DoesNotExist: 30 | LOGGER.info('relationship not exist') 31 | relation = UserRelationship() 32 | relation.user = weibo.retweented_status.user 33 | relation.follower = weibo.user 34 | LOGGER.info(relation) 35 | relation.save() 36 | return weibo 37 | 38 | 39 | def save_user_info( user_info): 40 | try: 41 | user = WeiboUser.objects.get(pk=user_info['id']) 42 | except WeiboUser.DoesNotExist: 43 | user = WeiboUser() 44 | user.id = user_info['id'] 45 | 46 | user.attNum = user_info['attNum'] if 'attNum' in user_info \ 47 | else user_info['follow_count'] if 'follow_count' in user_info \ 48 | else '' 49 | user.created_at = user_info['created_at'] if 'created_at' in user_info else '' 50 | user.screen_name = user_info['screen_name'] if 'screen_name' in user_info else '' 51 | user.description = user_info['description'] if 'description' in user_info else '' 52 | user.fansNum = user_info['fansNum'] if 'fansNum' in user_info \ 53 | else user_info['followers_count'] if 'followers_count' in user_info \ 54 | else '' 55 | user.mblogNum = user_info['mblogNum'] if 'mblogNum' in user_info \ 56 | else user_info['statuses_count'] if 'statuses_count' in user_info \ 57 | else '' 58 | user.nativePlace = user_info['nativePlace'] if 'nativePlace' in user_info else '' 59 | user.profile_url = user_info['profile_url'] if 'profile_url' in user_info else '' 60 | user.gender = WeiboUser.GENDER.index(user_info['gender'] if 'gender' in user_info else 'u') 61 | user.save() 62 | 63 | LOGGER.info(user) 64 | return user 65 | 66 | 67 | def save_relationship(user, fan): 68 | try: 69 | r = UserRelationship.objects.get(Q(user=user) & Q(follower=fan)) 70 | LOGGER.info('relationship already exist:' + str(r)) 71 | except UserRelationship.DoesNotExist: 72 | LOGGER.info('relationship not exist') 73 | relation = UserRelationship() 74 | relation.user = user 75 | relation.follower = fan 76 | LOGGER.info(relation) 77 | relation.save() 78 | 79 | 80 | def insert_pic_info(self, pic_info): 81 | pass 82 | 83 | 84 | def insert_comment_info(self, comment_info): 85 | pass 86 | 87 | 88 | def save_pic(self): 
89 | url = 'http://ww2.sinaimg.cn/large/c0788b86jw1f2xfstebzaj20dc0hst9r.jpg' 90 | # opener = my_http.make_my_opener() 91 | # rsp = opener.open(url) 92 | # pic_data = rsp.read() 93 | # try: 94 | # file = open("d:\\weibo_pic\\1.jpg", 'wb') 95 | # file.write(pic_data) 96 | # file.close() 97 | # except FileNotFoundError: 98 | # os.mkdir("d:\\weibo_pic") 99 | # except FileExistsError: 100 | # pass -------------------------------------------------------------------------------- /spiders/weibo/multhread.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import queue 3 | import threading 4 | from time import sleep 5 | q = queue.Queue() 6 | result = deque() 7 | num_worker_threads = 10 8 | threads = [] 9 | def do_work(n): 10 | return n+2 11 | 12 | def workder(): 13 | while True: 14 | item = q.get() 15 | if item is None: 16 | print('break') 17 | break 18 | print(do_work(item)) 19 | q.task_done() 20 | sleep(10) 21 | q.put(12) 22 | 23 | def worker1(): 24 | while True: 25 | print(1) 26 | sleep(10) 27 | 28 | def worker2(): 29 | while True: 30 | print(2) 31 | 32 | t1 = threading.Thread(target=worker1) 33 | t2 = threading.Thread(target=worker2) 34 | t1.start() 35 | t2.start() 36 | -------------------------------------------------------------------------------- /spiders/weibo/weibo_conf.py: -------------------------------------------------------------------------------- 1 | 2 | def get_account(): 3 | """ 4 | 这里是去读配置文件,weibo账号,conf文件夹下有说明 5 | :return: 6 | """ 7 | accounts = [] 8 | conf_file = 'conf/account.conf' 9 | try: 10 | with open(conf_file, 'r') as f: 11 | for line in f.readlines(): 12 | fields = line.split(' ') 13 | accounts.append({'username': fields[0], 'password': fields[1]}) 14 | except FileNotFoundError: 15 | raise FileNotFoundError('No such file or directory:%s,' 16 | ' read conf/README.md to conf weibo account' % conf_file) 17 | return accounts -------------------------------------------------------------------------------- /spiders/weibo/weibo_http.py: -------------------------------------------------------------------------------- 1 | from spiders.logger import LOGGER 2 | from spiders.weibo import constants 3 | 4 | import traceback 5 | import random 6 | import json 7 | import http.cookiejar 8 | import urllib.parse 9 | import urllib.request 10 | from time import sleep 11 | import ssl 12 | 13 | 14 | def login(user_name, password, opener): 15 | LOGGER.info(user_name + ' login') 16 | args = { 17 | 'username': user_name, 18 | 'password': password, 19 | 'savestate': 1, 20 | 'ec': 0, 21 | 'pagerefer': 'https://passport.weibo.cn/signin/' 22 | 'welcome?entry=mweibo&r=http%3A%2F%2Fm.weibo.cn%2F&wm=3349&vt=4', 23 | 'entry': 'mweibo', 24 | 'wentry': '', 25 | 'loginfrom': '', 26 | 'client_id': '', 27 | 'code': '', 28 | 'qq': '', 29 | 'hff': '', 30 | 'hfp': '' 31 | } 32 | 33 | post_data = urllib.parse.urlencode(args).encode() 34 | try_time = 0 35 | while try_time < constants.TRY_TIME: 36 | try: 37 | resp = opener.open(constants.LOGIN_URL, post_data) 38 | resp_json = json.loads(resp.read().decode()) 39 | if 'retcode' in resp_json and resp_json['retcode'] == 20000000: 40 | LOGGER.info("%s login successful" % user_name) 41 | break 42 | else: 43 | LOGGER.warn('login fail:%s' % str(resp_json)) 44 | sleep(10) 45 | try_time += 1 46 | except : 47 | LOGGER.error("login failed") 48 | LOGGER.error(traceback.print_exc()) 49 | sleep(10) 50 | try_time += 1 51 | LOGGER.info('try %d time' % try_time) 52 | 53 | 54 | 55 | def get_openner(): 56 | opener 
= make_my_opener() 57 | curr_index = random.randint(0, len(constants.USERS) - 1) # 随机选取用户 58 | LOGGER.info('user index : %d' % curr_index) 59 | login(constants.USERS[curr_index]['username'], constants.USERS[curr_index]['password'], opener) 60 | change_header(opener) 61 | return opener 62 | 63 | 64 | def change_header(opener, ext=None): 65 | head = { 66 | 'Accept': '*/*', 67 | 'Connection': 'keep-alive', 68 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 69 | 'Host': 'm.weibo.cn', 70 | 'Proxy-Connection': 'keep-alive', 71 | 'User-Agent': constants.USER_AGENTS[random.randint(0, len(constants.USER_AGENTS) - 1)] 72 | } 73 | if ext: 74 | head.update(ext) 75 | header = [] 76 | for key, value in head.items(): 77 | elem = (key, value) 78 | header.append(elem) 79 | opener.addheaders = header 80 | 81 | 82 | def change_proxy(opener): 83 | proxy_handler = urllib.request.ProxyHandler(constants.PROXIES[random.randint(0, len(constants.PROXIES) -1)]) 84 | opener.add_handler(proxy_handler) 85 | 86 | 87 | def make_my_opener(): 88 | """ 89 | 模拟浏览器发送请求 90 | :return: 91 | """ 92 | cj = http.cookiejar.CookieJar() 93 | opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj)) 94 | 95 | header = [] 96 | head = { 97 | 'Accept': '*/*', 98 | 'Accept-Encoding': 'gzip,deflate', 99 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 100 | 'Connection': 'keep-alive', 101 | 'Content-Length': '254', 102 | 'Content-Type': 'application/x-www-form-urlencoded', 103 | 'Host': 'passport.weibo.cn', 104 | 'Origin': 'https://passport.weibo.cn', 105 | 'Referer': 'https://passport.weibo.cn/signin/login?' 106 | 'entry=mweibo&res=wel&wm=3349&r=http%3A%2F%2Fm.weibo.cn%2F', 107 | 'User-Agent': constants.USER_AGENTS[random.randint(0, len(constants.USER_AGENTS) - 1)] 108 | } 109 | for key, value in head.items(): 110 | elem = (key, value) 111 | header.append(elem) 112 | opener.addheaders = header 113 | return opener 114 | -------------------------------------------------------------------------------- /spiders/zju/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = zju.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = zju 12 | -------------------------------------------------------------------------------- /spiders/zju/zju/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ZjuItem(scrapy.Item): 12 | url = scrapy.Field() 13 | title = scrapy.Field() 14 | time = scrapy.Field() 15 | -------------------------------------------------------------------------------- /spiders/zju/zju/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ZjuSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 
15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /spiders/zju/zju/myemail.py: -------------------------------------------------------------------------------- 1 | import smtplib 2 | import email.mime.multipart 3 | import email.mime.text 4 | # -*- coding:utf-8 -*- 5 | 6 | 7 | class Email(object): 8 | content_from = None 9 | content_to = None 10 | content_subject = None 11 | content_msg = None 12 | content_pwd = None 13 | 14 | def send_163(self): 15 | assert self.content_from is not None 16 | assert self.content_to is not None 17 | assert self.content_pwd is not None 18 | msg = email.mime.multipart.MIMEMultipart() 19 | msg['from'] = self.content_from 20 | msg['to'] = self.content_to 21 | msg['subject'] = self.content_subject 22 | txt = email.mime.text.MIMEText(self.content_msg, 'plain', 'utf-8') 23 | msg.attach(txt) 24 | smtp = smtplib.SMTP(host='smtp.163.com', port=25) 25 | 26 | smtp.login(self.content_from, self.content_pwd) 27 | smtp.sendmail(self.content_from, self.content_to, str(msg)) 28 | smtp.quit() 29 | 30 | 31 | def send_email(subject, msg): 32 | e = Email() 33 | e.content_from = 'jjzhu_ncu@163.com' 34 | e.content_to = 'jjzhu_zju@163.com' 35 | e.content_pwd = 'jvs7452014' 36 | e.content_subject = 'hello world' 37 | e.content_msg = 'hello word' 38 | e.send_163() 39 | -------------------------------------------------------------------------------- /spiders/zju/zju/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | from .myemail import Email 8 | from .wechat import Wechat 9 | 10 | 11 | class ZjuPipeline(object): 12 | wc = Wechat() 13 | 14 | def process_item(self, item, spider): 15 | msg = item['title'] + '\n' + item['url'] + '\n' + item['time'] 16 | self.wc.send(msg) 17 | 18 | e = Email() 19 | e.content_from = 'jjzhu_ncu@163.com' 20 | 
e.content_to = 'jjzhu_zju@163.com' 21 | e.content_pwd = 'jvs7452014' 22 | e.content_subject = u'浙大研究生官网发布新消息啦!' 23 | e.content_msg = item['title'] + '\n' + item['url'] + '\n' + item['time'] 24 | e.send_163() 25 | 26 | return item 27 | -------------------------------------------------------------------------------- /spiders/zju/zju/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for zju project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'zju' 13 | 14 | SPIDER_MODULES = ['zju.spiders'] 15 | NEWSPIDER_MODULE = 'zju.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'zju (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = True 22 | DEFAULT_REQUEST_HEADERS = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 24 | 'Accept-Encoding': 'gzip, deflate', 25 | 'Accept-Language': 'zh-CN,zh;q=0.8', 26 | 'Cache-Control': 'max-age=0', 27 | 'Connection': 'keep-alive', 28 | 'Host': 'grs.zju.edu.cn', 29 | 'Upgrade-Insecure-Requests': '1', 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36', 31 | } 32 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 33 | # CONCURRENT_REQUESTS = 32 34 | 35 | # Configure a delay for requests for the same website (default: 0) 36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 37 | # See also autothrottle settings and docs 38 | DOWNLOAD_DELAY = 2 39 | # The download delay setting will honor only one of: 40 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 41 | # CONCURRENT_REQUESTS_PER_IP = 16 42 | 43 | # Disable cookies (enabled by default) 44 | # COOKIES_ENABLED = False 45 | 46 | # Disable Telnet Console (enabled by default) 47 | # TELNETCONSOLE_ENABLED = False 48 | 49 | # Override the default request headers: 50 | # DEFAULT_REQUEST_HEADERS = { 51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 52 | # 'Accept-Language': 'en', 53 | # } 54 | 55 | # Enable or disable spider middlewares 56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 57 | # SPIDER_MIDDLEWARES = { 58 | # 'zju.middlewares.ZjuSpiderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable downloader middlewares 62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 63 | # DOWNLOADER_MIDDLEWARES = { 64 | # 'zju.middlewares.MyCustomDownloaderMiddleware': 543, 65 | # } 66 | 67 | # Enable or disable extensions 68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 69 | # EXTENSIONS = { 70 | # 'scrapy.extensions.telnet.TelnetConsole': None, 71 | # } 72 | 73 | # Configure item pipelines 74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 75 | ITEM_PIPELINES = { 76 | 'zju.pipelines.ZjuPipeline': 300, 77 | } 78 | 79 | # Enable and configure the AutoThrottle extension (disabled by default) 80 | # See 
http://doc.scrapy.org/en/latest/topics/autothrottle.html 81 | # AUTOTHROTTLE_ENABLED = True 82 | # The initial download delay 83 | # AUTOTHROTTLE_START_DELAY = 5 84 | # The maximum download delay to be set in case of high latencies 85 | # AUTOTHROTTLE_MAX_DELAY = 60 86 | # The average number of requests Scrapy should be sending in parallel to 87 | # each remote server 88 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 89 | # Enable showing throttling stats for every response received: 90 | # AUTOTHROTTLE_DEBUG = False 91 | 92 | # Enable and configure HTTP caching (disabled by default) 93 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 94 | # HTTPCACHE_ENABLED = True 95 | # HTTPCACHE_EXPIRATION_SECS = 0 96 | # HTTPCACHE_DIR = 'httpcache' 97 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 98 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 99 | -------------------------------------------------------------------------------- /spiders/zju/zju/spiders/ZjuSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import scrapy 3 | from scrapy.http import Request 4 | import datetime 5 | from ..items import ZjuItem 6 | 7 | 8 | class ZjuSpider(scrapy.Spider): 9 | name = 'zju' 10 | url = 'http://grs.zju.edu.cn' 11 | url2 = 'http://grs.zju.edu.cn/redir.php?catalog_id=16313' 12 | url3 = 'http://grs.zju.edu.cn/redir.php?catalog_id=16313&page=1' 13 | notified = set() 14 | 15 | def start_requests(self): 16 | # ield Request(self.url, dont_filter=True, callback=self.parse) 17 | yield Request(self.url2, dont_filter=True, callback=self.parse2) 18 | yield Request(self.url3, dont_filter=True, callback=self.parse2) 19 | 20 | def parse2(self, response): 21 | lis = response.xpath('//ul[@id="artphs"]/li') 22 | for li in lis: 23 | c_url = self.url + '/' + li.xpath('h3/a/@href').extract_first() 24 | title = li.xpath('h3/a/@title').extract_first() 25 | time = li.xpath('span/text()').extract_first() 26 | publish_time = datetime.datetime.strptime(time, '%Y-%m-%d') 27 | now = datetime.datetime.now() 28 | 29 | if (now - publish_time).days < 1: 30 | if c_url not in self.notified: 31 | zju_item = ZjuItem() 32 | zju_item['url'] = c_url 33 | zju_item['title'] = title 34 | zju_item['time'] = time 35 | self.notified.add(c_url) 36 | yield zju_item 37 | yield Request(self.url2, dont_filter=True, callback=self.parse2) 38 | yield Request(self.url3, dont_filter=True, callback=self.parse2) 39 | 40 | def parse(self, response): 41 | 42 | lis = response.xpath('//ul[@id="arthd"]/li') 43 | datetime.datetime.strptime('2017-06-01', '%Y-%m-%d') 44 | for li in lis: 45 | c_url = self.url + '/' + li.xpath('a/@href').extract_first() 46 | title = li.xpath('a/@title').extract_first() 47 | time = li.xpath('span[@class="art-date"]/text()').extract_first() 48 | 49 | publish_time = datetime.datetime.strptime(time, '%Y-%m-%d') 50 | now = datetime.datetime.now() 51 | 52 | if(now - publish_time).days < 1: 53 | if c_url not in self.notified: 54 | zju_item = ZjuItem() 55 | zju_item['url'] = c_url 56 | zju_item['title'] = title 57 | zju_item['time'] = time 58 | self.notified.add(c_url) 59 | yield zju_item 60 | yield Request(self.url, dont_filter=True, callback=self.parse) 61 | -------------------------------------------------------------------------------- /spiders/zju/zju/wechat.py: -------------------------------------------------------------------------------- 1 | from wxpy import * 2 | bot = Bot() 3 | 4 | 5 | class 
Wechat(object): 6 | def __init__(self): 7 | self.group = bot.groups().search('English exam')[0] 8 | self.my_friends = [bot.friends().search('jopper')[0]] 9 | 10 | def send(self, msg): 11 | self.group.send(msg) 12 | 13 | def send2(self, msg): 14 | for f in self.my_friends: 15 | f.send(msg) 16 | -------------------------------------------------------------------------------- /usage/README.md: -------------------------------------------------------------------------------- 1 | ## usage 2 | 各种框架、库的使用 3 | - requests 4 | - celery -------------------------------------------------------------------------------- /usage/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/__init__.py -------------------------------------------------------------------------------- /usage/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__init__.py -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/celery.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/celery.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/__pycache__/tasks.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/__pycache__/tasks.cpython-35.pyc -------------------------------------------------------------------------------- /usage/celery_u/add.py: -------------------------------------------------------------------------------- 1 | from tasks import add 2 | print(add.delay(3, 3).get(timeout=1)) 3 | -------------------------------------------------------------------------------- /usage/celery_u/celery3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/celery_u/celery3.pdf -------------------------------------------------------------------------------- /usage/celery_u/celery_.py: -------------------------------------------------------------------------------- 1 | from celery import Celery 2 | 3 | app = Celery('celery', 4 | broker='redis://:''@127.0.0.1:6379/0', 5 | backend='redis://:''@127.0.0.1:6379/1', 6 | include=['tasks'] 7 | ) 8 | app.conf.update( 9 | CELERY_TASK_RESULT_EXPIRES=3600, 10 | 
CELERY_ROUTES={ 11 | 'tasks.add': {'queue': 'hipri'} 12 | } 13 | ) 14 | 15 | if __name__ == '__main__': 16 | app.start() 17 | -------------------------------------------------------------------------------- /usage/celery_u/celeryconfig.py: -------------------------------------------------------------------------------- 1 | BROKER_URL = 'amqp://' 2 | CELERY_RESULT_BACKEND = 'rpc://' 3 | CELERY_TASK_SERIALIZER = 'json' 4 | CELERY_RESULT_SERIALIZER = 'json' 5 | CELERY_ACCEPT_CONTENT=['json'] 6 | CELERY_TIMEZONE = 'Europe/Oslo' 7 | CELERY_ENABLE_UTC = True 8 | -------------------------------------------------------------------------------- /usage/celery_u/tasks.py: -------------------------------------------------------------------------------- 1 | from celery_ import app 2 | 3 | 4 | @app.task 5 | def add(x, y): 6 | 7 | return x + y 8 | 9 | 10 | @app.task 11 | def mul(x, y): 12 | return x * y 13 | 14 | 15 | @app.task 16 | def xsum(numbers): 17 | return sum(numbers) 18 | -------------------------------------------------------------------------------- /usage/kafka_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/kafka_u/__init__.py -------------------------------------------------------------------------------- /usage/kafka_u/consumer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | 3 | consumer = KafkaConsumer('my-topic', 4 | group_id='my-group', 5 | bootstrap_servers=['localhost:9092']) 6 | for message in consumer: 7 | print("%s:%d:%d: key=%s value=%s" % (message.topic, message.partition, 8 | message.offset, message.key, 9 | message.value)) 10 | 11 | -------------------------------------------------------------------------------- /usage/kafka_u/producer.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaProducer 2 | from kafka.errors import KafkaError 3 | 4 | producer = KafkaProducer(bootstrap_servers=['localhost:9092']) 5 | 6 | # Asynchronous by default 7 | future = producer.send('my-topic', b'raw_bytes') 8 | 9 | # Block for 'synchronous' sends 10 | try: 11 | record_metadata = future.get(timeout=10) 12 | except KafkaError: 13 | # Decide what to do if produce request failed...
14 | 15 | pass 16 | 17 | # Successful result returns assigned partition and offset 18 | print(record_metadata.topic) 19 | print(record_metadata.partition) 20 | print(record_metadata.offset) 21 | -------------------------------------------------------------------------------- /usage/proj/tasks.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | if __name__ == '__main__': 4 | a = 1 5 | b = 2 6 | while True: 7 | try: 8 | c = a/b 9 | print(c) 10 | break 11 | except: 12 | print('error') 13 | finally: 14 | print('finally') 15 | print('aaaa') -------------------------------------------------------------------------------- /usage/redis_u/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | -------------------------------------------------------------------------------- /usage/redis_u/redis_u.py: -------------------------------------------------------------------------------- 1 | __author__ = 'jjzhu' 2 | import redis 3 | 4 | r = redis.StrictRedis(db=0) 5 | print(r.get('foo')) 6 | 7 | -------------------------------------------------------------------------------- /usage/requests_u/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhujiajunup/Jpider/cc62d8b1976a7c629eeb316353a02f3092669add/usage/requests_u/__init__.py -------------------------------------------------------------------------------- /usage/requests_u/req_usage.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def send_request(): 5 | result = requests.get('http://music.163.com/') 6 | print(result.text) 7 | 8 | send_request() --------------------------------------------------------------------------------