├── .coveragerc ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── config_example.json ├── data └── .gitignore ├── docker-compose.yaml ├── docs ├── About-Projects.md ├── About-Tasks.md ├── Architecture.md ├── Command-Line.md ├── Deployment-demo.pyspider.org.md ├── Deployment.md ├── Frequently-Asked-Questions.md ├── Quickstart.md ├── Running-pyspider-with-Docker.md ├── Script-Environment.md ├── Working-with-Results.md ├── apis │ ├── @catch_status_code_error.md │ ├── @every.md │ ├── Response.md │ ├── index.md │ ├── self.crawl.md │ └── self.send_message.md ├── conf.py ├── imgs │ ├── creating_a_project.png │ ├── css_selector_helper.png │ ├── demo.png │ ├── developer-tools-network-filter.png │ ├── developer-tools-network.png │ ├── index_page.png │ ├── inspect_element.png │ ├── pyspider-arch.png │ ├── request-headers.png │ ├── run_one_step.png │ ├── search-for-request.png │ ├── tutorial_imdb_front.png │ └── twitch.png ├── index.md └── tutorial │ ├── AJAX-and-more-HTTP.md │ ├── HTML-and-CSS-Selector.md │ ├── Render-with-PhantomJS.md │ └── index.md ├── mkdocs.yml ├── pyspider ├── __init__.py ├── database │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── basedb.py │ ├── couchdb │ │ ├── __init__.py │ │ ├── couchdbbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── elasticsearch │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── local │ │ ├── __init__.py │ │ └── projectdb.py │ ├── mongodb │ │ ├── __init__.py │ │ ├── mongodbbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── mysql │ │ ├── __init__.py │ │ ├── mysqlbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── redis │ │ ├── __init__.py │ │ └── taskdb.py │ ├── sqlalchemy │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ ├── sqlalchemybase.py │ │ └── taskdb.py │ └── sqlite │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ ├── sqlitebase.py │ │ └── taskdb.py ├── fetcher │ ├── __init__.py │ ├── cookie_utils.py │ ├── phantomjs_fetcher.js │ ├── puppeteer_fetcher.js │ ├── splash_fetcher.lua │ └── tornado_fetcher.py ├── libs │ ├── ListIO.py │ ├── __init__.py │ ├── base_handler.py │ ├── bench.py │ ├── counter.py │ ├── dataurl.py │ ├── log.py │ ├── multiprocessing_queue.py │ ├── pprint.py │ ├── response.py │ ├── result_dump.py │ ├── sample_handler.py │ ├── url.py │ ├── utils.py │ └── wsgi_xmlrpc.py ├── logging.conf ├── message_queue │ ├── __init__.py │ ├── kombu_queue.py │ ├── rabbitmq.py │ └── redis_queue.py ├── processor │ ├── __init__.py │ ├── processor.py │ └── project_module.py ├── result │ ├── __init__.py │ └── result_worker.py ├── run.py ├── scheduler │ ├── __init__.py │ ├── scheduler.py │ ├── task_queue.py │ └── token_bucket.py └── webui │ ├── __init__.py │ ├── app.py │ ├── bench_test.py │ ├── debug.py │ ├── index.py │ ├── login.py │ ├── result.py │ ├── static │ ├── .babelrc │ ├── css_selector_helper.min.js │ ├── debug.min.css │ ├── debug.min.js │ ├── index.min.css │ ├── index.min.js │ ├── package.json │ ├── result.min.css │ ├── result.min.js │ ├── src │ │ ├── css_selector_helper.js │ │ ├── debug.js │ │ ├── debug.less │ │ ├── index.js │ │ ├── index.less │ │ ├── result.less │ │ ├── splitter.js │ │ ├── task.less │ │ ├── tasks.less │ │ └── variable.less │ ├── task.min.css │ ├── task.min.js │ ├── tasks.min.css │ ├── tasks.min.js │ └── webpack.config.js │ ├── task.py │ ├── templates │ ├── 
debug.html │ ├── index.html │ ├── result.html │ ├── task.html │ └── tasks.html │ └── webdav.py ├── requirements.txt ├── run.py ├── setup.py ├── tests ├── __init__.py ├── data_fetcher_processor_handler.py ├── data_handler.py ├── data_sample_handler.py ├── data_test_webpage.py ├── test_base_handler.py ├── test_bench.py ├── test_counter.py ├── test_database.py ├── test_fetcher.py ├── test_fetcher_processor.py ├── test_message_queue.py ├── test_processor.py ├── test_response.py ├── test_result_dump.py ├── test_result_worker.py ├── test_run.py ├── test_scheduler.py ├── test_task_queue.py ├── test_utils.py ├── test_webdav.py ├── test_webui.py └── test_xmlrpc.py ├── tools └── migrate.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | pyspider 4 | parallel = True 5 | 6 | [report] 7 | omit = 8 | pyspider/libs/sample_handler.py 9 | pyspider/libs/pprint.py 10 | 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | if self.debug: 15 | if settings.DEBUG 16 | raise AssertionError 17 | raise NotImplementedError 18 | if 0: 19 | if __name__ == .__main__.: 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | * pyspider version: 8 | * Operating system: 9 | * Start up command: 10 | 11 | ### Expected behavior 12 | 13 | 14 | 15 | ### Actual behavior 16 | 17 | 18 | 19 | ### How to reproduce 20 | 21 | 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | data/* 3 | .venv 4 | .idea 5 | # C extensions 6 | *.so 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | __pycache__ 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | .idea 40 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - 3.5 5 | - 3.6 6 | - 3.7 7 | #- 3.8 8 | services: 9 | - docker 10 | - mongodb 11 | - rabbitmq 12 | - redis 13 | - mysql 14 | # - elasticsearch 15 | - postgresql 16 | addons: 17 | postgresql: "9.4" 18 | apt: 19 | packages: 20 | - rabbitmq-server 21 | env: 22 | - IGNORE_COUCHDB=1 23 | 24 | before_install: 25 | - sudo apt-get update -qq 26 | - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart 27 | - npm install express puppeteer 28 | - sudo docker pull scrapinghub/splash 29 | - sudo docker run -d --net=host scrapinghub/splash 30 | before_script: 31 | - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres 32 | - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres 33 | - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U 
postgres 34 | - sleep 10 35 | install: 36 | - pip install https://github.com/marcus67/easywebdav/archive/master.zip 37 | - sudo apt-get install libgnutls28-dev 38 | - pip install -e .[all,test] 39 | - pip install coveralls 40 | script: 41 | - coverage run setup.py test 42 | after_success: 43 | - coverage combine 44 | - coveralls 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | MAINTAINER binux 3 | 4 | # install phantomjs 5 | RUN mkdir -p /opt/phantomjs \ 6 | && cd /opt/phantomjs \ 7 | && wget -O phantomjs.tar.bz2 https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \ 8 | && tar xavf phantomjs.tar.bz2 --strip-components 1 \ 9 | && ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \ 10 | && rm phantomjs.tar.bz2 11 | # Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory 12 | ENV OPENSSL_CONF=/etc/ssl/ 13 | 14 | # install nodejs 15 | ENV NODEJS_VERSION=8.15.0 \ 16 | PATH=$PATH:/opt/node/bin 17 | WORKDIR "/opt/node" 18 | RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \ 19 | curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \ 20 | rm -rf /var/lib/apt/lists/* 21 | RUN npm install puppeteer express 22 | 23 | # install requirements 24 | COPY requirements.txt /opt/pyspider/requirements.txt 25 | RUN pip install -r /opt/pyspider/requirements.txt 26 | 27 | # add all repo 28 | ADD ./ /opt/pyspider 29 | 30 | # run test 31 | WORKDIR /opt/pyspider 32 | RUN pip install -e .[all] 33 | 34 | # Create a symbolic link to node_modules 35 | RUN ln -s /opt/node/node_modules ./node_modules 36 | 37 | #VOLUME ["/opt/pyspider"] 38 | ENTRYPOINT ["pyspider"] 39 | 40 | EXPOSE 5000 23333 24444 25555 22222 41 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include Dockerfile 4 | include LICENSE 5 | include pyspider/logging.conf 6 | include pyspider/webui/static/* 7 | include pyspider/webui/templates/* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pyspider [![Build Status]][Travis CI] [![Coverage Status]][Coverage] 2 | ======== 3 | 4 | A Powerful Spider(Web Crawler) System in Python. 5 | 6 | - Write script in Python 7 | - Powerful WebUI with script editor, task monitor, project manager and result viewer 8 | - [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend 9 | - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue 10 | - Task priority, retry, periodical, recrawl by age, etc... 11 | - Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc... 
12 | 13 | Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) 14 | Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) 15 | Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) 16 | 17 | Sample Code 18 | ----------- 19 | 20 | ```python 21 | from pyspider.libs.base_handler import * 22 | 23 | 24 | class Handler(BaseHandler): 25 | crawl_config = { 26 | } 27 | 28 | @every(minutes=24 * 60) 29 | def on_start(self): 30 | self.crawl('http://scrapy.org/', callback=self.index_page) 31 | 32 | @config(age=10 * 24 * 60 * 60) 33 | def index_page(self, response): 34 | for each in response.doc('a[href^="http"]').items(): 35 | self.crawl(each.attr.href, callback=self.detail_page) 36 | 37 | def detail_page(self, response): 38 | return { 39 | "url": response.url, 40 | "title": response.doc('title').text(), 41 | } 42 | ``` 43 | 44 | 45 | Installation 46 | ------------ 47 | 48 | * `pip install pyspider` 49 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 50 | 51 | **WARNING:** WebUI is open to the public by default, it can be used to execute any command which may harm your system. Please use it in an internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). 52 | 53 | Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) 54 | 55 | Contribute 56 | ---------- 57 | 58 | * Use It 59 | * Open [Issue], send PR 60 | * [User Group] 61 | * [中文问答](http://segmentfault.com/t/pyspider) 62 | 63 | 64 | TODO 65 | ---- 66 | 67 | ### v0.4.0 68 | 69 | - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) 70 | 71 | 72 | License 73 | ------- 74 | Licensed under the Apache License, Version 2.0 75 | 76 | 77 | [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat 78 | [Travis CI]: https://travis-ci.org/binux/pyspider 79 | [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat 80 | [Coverage]: https://coveralls.io/r/binux/pyspider 81 | [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat 82 | [Issue]: https://github.com/binux/pyspider/issues 83 | [User Group]: https://groups.google.com/group/pyspider-users 84 | -------------------------------------------------------------------------------- /config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "taskdb": "couchdb+taskdb://user:password@couchdb:5984", 3 | "projectdb": "couchdb+projectdb://user:password@couchdb:5984", 4 | "resultdb": "couchdb+resultdb://user:password@couchdb:5984", 5 | "message_queue": "amqp://rabbitmq:5672/%2F", 6 | "webui": { 7 | "username": "username", 8 | "password": "password", 9 | "need-auth": true, 10 | "scheduler-rpc": "http://scheduler:23333", 11 | "fetcher-rpc": "http://fetcher:24444" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | # replace /path/to/dir/ to point to config.json 4 | 5 | # The RabbitMQ and CouchDB services can take some time to startup. 
6 | # During this time most of the pyspider services will exit and restart. 7 | # Once RabbitMQ and CouchDB are fully up and running everything should run as normal. 8 | 9 | services: 10 | rabbitmq: 11 | image: rabbitmq:alpine 12 | container_name: rabbitmq 13 | networks: 14 | - pyspider 15 | command: rabbitmq-server 16 | mysql: 17 | image: mysql:latest 18 | container_name: mysql 19 | volumes: 20 | - /tmp:/var/lib/mysql 21 | environment: 22 | - MYSQL_ALLOW_EMPTY_PASSWORD=yes 23 | networks: 24 | - pyspider 25 | phantomjs: 26 | image: pyspider:latest 27 | container_name: phantomjs 28 | networks: 29 | - pyspider 30 | volumes: 31 | - ./config_example.json:/opt/pyspider/config.json 32 | command: -c config.json phantomjs 33 | depends_on: 34 | - couchdb 35 | - rabbitmq 36 | restart: unless-stopped 37 | result: 38 | image: pyspider:latest 39 | container_name: result 40 | networks: 41 | - pyspider 42 | volumes: 43 | - ./config_example.json:/opt/pyspider/config.json 44 | command: -c config.json result_worker 45 | depends_on: 46 | - couchdb 47 | - rabbitmq 48 | restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start 49 | processor: 50 | container_name: processor 51 | image: pyspider:latest 52 | networks: 53 | - pyspider 54 | volumes: 55 | - ./config_example.json:/opt/pyspider/config.json 56 | command: -c config.json processor 57 | depends_on: 58 | - couchdb 59 | - rabbitmq 60 | restart: unless-stopped 61 | fetcher: 62 | image: pyspider:latest 63 | container_name: fetcher 64 | networks: 65 | - pyspider 66 | volumes: 67 | - ./config_example.json:/opt/pyspider/config.json 68 | command : -c config.json fetcher 69 | depends_on: 70 | - couchdb 71 | - rabbitmq 72 | restart: unless-stopped 73 | scheduler: 74 | image: pyspider:latest 75 | container_name: scheduler 76 | networks: 77 | - pyspider 78 | volumes: 79 | - ./config_example.json:/opt/pyspider/config.json 80 | command: -c config.json scheduler 81 | depends_on: 82 | - couchdb 83 | - rabbitmq 84 | restart: unless-stopped 85 | webui: 86 | image: pyspider:latest 87 | container_name: webui 88 | ports: 89 | - "5050:5000" 90 | networks: 91 | - pyspider 92 | volumes: 93 | - ./config_example.json:/opt/pyspider/config.json 94 | command: -c config.json webui 95 | depends_on: 96 | - couchdb 97 | - rabbitmq 98 | restart: unless-stopped 99 | 100 | networks: 101 | pyspider: 102 | external: 103 | name: pyspider 104 | default: 105 | driver: bridge 106 | -------------------------------------------------------------------------------- /docs/About-Projects.md: -------------------------------------------------------------------------------- 1 | About Projects 2 | ============== 3 | 4 | In most cases, a project is one script you write for one website. 5 | 6 | * Projects are independent, but you can import another project as a module with `from projects import other_project` 7 | * A project has 5 status: `TODO`, `STOP`, `CHECKING`, `DEBUG` and `RUNNING` 8 | - `TODO` - a script is just created to be written 9 | - `STOP` - you can mark a project as `STOP` if you want it to STOP (= =). 10 | - `CHECKING` - when a running project is modified, to prevent incomplete modification, project status will be set as `CHECKING` automatically. 11 | - `DEBUG`/`RUNNING` - these two status have no difference to spider. But it's good to mark it as `DEBUG` when it's running the first time then change it to `RUNNING` after being checked. 
12 | * The crawl rate is controlled by `rate` and `burst` with the [token-bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm. 13 |     - `rate` - how many requests per second 14 |     - `burst` - consider this situation: `rate/burst = 0.1/3` means the spider crawls 1 page every 10 seconds. All tasks are finished and the project is checking the last updated items every minute. If 3 new items are found, pyspider will "burst" and crawl the 3 tasks without waiting 3*10 seconds. The fourth task, however, still needs to wait 10 seconds. 15 | * To delete a project, set `group` to `delete` and status to `STOP`, then wait 24 hours. 16 | 17 | 18 | `on_finished` callback 19 | -------------------- 20 | You can override the `on_finished` method in the project; the method is triggered when the task_queue goes to 0. 21 | 22 | Example 1: When you start a project to crawl a website with 100 pages, the `on_finished` callback is fired when all 100 pages are successfully crawled or have failed after retries. 23 | 24 | Example 2: A project with `auto_recrawl` tasks will **NEVER** trigger the `on_finished` callback, because the time queue never becomes 0 while there are auto_recrawl tasks in it. 25 | 26 | Example 3: A project with an `@every` decorated method will trigger the `on_finished` callback every time the newly submitted tasks are finished. 27 | -------------------------------------------------------------------------------- /docs/About-Tasks.md: -------------------------------------------------------------------------------- 1 | About Tasks 2 | =========== 3 | 4 | Tasks are the basic units to be scheduled. 5 | 6 | Basis 7 | ----- 8 | 9 | * A task is identified by its `taskid`. (Default: `md5(url)`; this can be changed by overriding the `def get_taskid(self, task)` method.) 10 | * Tasks are isolated between different projects. 11 | * A task has one of 4 statuses: 12 |     - active 13 |     - failed 14 |     - success 15 |     - bad - not used 16 | * Only tasks in active status will be scheduled. 17 | * Tasks are served in order of `priority`. 18 | 19 | Schedule 20 | -------- 21 | 22 | #### new task 23 | 24 | When a new task (never seen before) comes in: 25 | 26 | * If `exetime` is set but has not arrived yet, it will be put into a time-based queue to wait. 27 | * Otherwise it will be accepted. 28 | 29 | When the task is already in the queue: 30 | 31 | * It is ignored unless `force_update` is set. 32 | 33 | When a completed task comes in again: 34 | 35 | * If `age` is set and `last_crawl_time + age < now`, it will be accepted. Otherwise it is discarded. 36 | * If `itag` is set and not equal to its previous value, it will be accepted. Otherwise it is discarded. 37 | 38 | 39 | #### task retry 40 | 41 | When a fetch error or script error happens, the task will be retried 3 times by default. 42 | 43 | The first retry is executed after 30 seconds, then after 1 hour, 6 hours and 12 hours; any further retries are postponed by 24 hours. 44 | 45 | If `age` is specified, the retry delay will not be larger than `age`. 46 | 47 | You can configure the retry delay by adding a variable named `retry_delay` to the handler. `retry_delay` is a dict that specifies the retry intervals. The items in the dict are {retried: seconds}, and a special key '' (empty string) specifies the default retry delay when not otherwise specified. 48 | 49 | e.g.
the default `retry_delay` is declared like this: 50 | 51 | 52 | ``` 53 | class MyHandler(BaseHandler): 54 |     retry_delay = { 55 |         0: 30, 56 |         1: 1*60*60, 57 |         2: 6*60*60, 58 |         3: 12*60*60, 59 |         '': 24*60*60 60 |     } 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/Frequently-Asked-Questions.md: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | Does pyspider Work with Windows? 5 | -------------------------------- 6 | Yes, it should; some users have made it work on Windows. But as I don't have a Windows development environment, I cannot test it. Some tips for users who want to use pyspider on Windows: 7 | 8 | - Some packages need binary libs (e.g. pycurl, lxml) that you may not be able to install from pip; Windows binary packages can be found at [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/). 9 | - Make a clean environment with [virtualenv](https://virtualenv.readthedocs.org/en/latest/). 10 | - Try the 32-bit version of Python, especially if you are facing crash issues. 11 | - Avoid using Python 3.4.1 ([#194](https://github.com/binux/pyspider/issues/194), [#217](https://github.com/binux/pyspider/issues/217)). 12 | 13 | Unreadable Code (乱码) Returned from Phantomjs 14 | --------------------------------------------- 15 | 16 | PhantomJS doesn't support gzip; don't set the `Accept-Encoding` header to `gzip`. 17 | 18 | 19 | How to Delete a Project? 20 | ------------------------ 21 | 22 | Set `group` to `delete` and `status` to `STOP`, then wait 24 hours. You can change the time before a project is deleted via `scheduler.DELETE_TIME`. 23 | 24 | How to Restart a Project? 25 | ------------------------- 26 | #### Why 27 | It happens when you have modified a script and want to crawl everything again with the new strategy, but as the [age](/apis/self.crawl/#age) of the URLs has not expired, the scheduler will discard all of the new requests. 28 | 29 | #### Solution 30 | 1. Create a new project. 31 | 2. Use an [itag](/apis/self.crawl/#itag) within `Handler.crawl_config` to specify the version of your script. 32 | 33 | How to Use WebDAV Mode? 34 | ----------------------- 35 | Mount `http://hostname/dav/` to your filesystem, then edit or create scripts with your favourite editor. 36 | 37 | > OSX: `mount_webdav http://hostname/dav/ /Volumes/dav` 38 | > Linux: Install davfs2, `mount.davfs http://hostname/dav/ /mnt/dav` 39 | > VIM: `vim http://hostname/dav/script_name.py` 40 | 41 | When you are editing a script outside the WebUI, you need to switch it to `WebDAV Mode` while debugging. After you save the script in your editor, the WebUI can load and use the latest script to debug your code. 42 | 43 | What does the progress bar mean on the dashboard? 44 | ------------------------------------------------- 45 | When you move the mouse onto the progress bar, you can see the explanations. 46 | 47 | For 5m, 1h and 1d, the numbers are the events triggered in the last 5 minutes, 1 hour and 1 day. For the all progress bar, they are the total number of tasks in the corresponding status. 48 | 49 | Only the tasks in DEBUG/RUNNING status will show the progress. 50 | 51 | How many scheduler/fetcher/processor/result_worker do I need? or pyspider stops working 52 | -------------------------------------------------------------------------------------- 53 | You can have only one scheduler, and multiple fetcher/processor/result_worker instances depending on the bottleneck.
You can use the queue status on dashboard to view the bottleneck of the system: 54 | 55 | ![run one step](imgs/queue_status.png) 56 | 57 | For example, the number between scheduler and fetcher indicate the queue size of scheduler to fetchers, when it's hitting 100 (default maximum queue size), fetcher might crashed, or you should considered adding more fetchers. 58 | 59 | The number `0+0` below fetcher indicate the queue size of new tasks and status packs between processors and schduler. You can put your mouse over the numbers to see the tips. -------------------------------------------------------------------------------- /docs/Quickstart.md: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Installation 5 | ------------ 6 | 7 | * `pip install pyspider` 8 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 9 | 10 | if you are using ubuntu, try: 11 | ``` 12 | apt-get install python python-dev python-distribute python-pip \ 13 | libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml \ 14 | libssl-dev zlib1g-dev 15 | ``` 16 | to install binary packages first. 17 | 18 | 19 | please install PhantomJS if needed: http://phantomjs.org/build.html 20 | 21 | note that PhantomJS will be enabled only if it is excutable in the `PATH` or in the System Environment 22 | 23 | **Note:** `pyspider` command is running pyspider in `all` mode, which running components in threads or subprocesses. For production environment, please refer to [Deployment](Deployment). 24 | 25 | **WARNING:** WebUI is opened to public by default, it can be used to execute any command which may harm to you system. Please use it in internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). 26 | 27 | Your First Script 28 | ----------------- 29 | 30 | ```python 31 | from pyspider.libs.base_handler import * 32 | 33 | 34 | class Handler(BaseHandler): 35 | crawl_config = { 36 | } 37 | 38 | @every(minutes=24 * 60) 39 | def on_start(self): 40 | self.crawl('http://scrapy.org/', callback=self.index_page) 41 | 42 | @config(age=10 * 24 * 60 * 60) 43 | def index_page(self, response): 44 | for each in response.doc('a[href^="http"]').items(): 45 | self.crawl(each.attr.href, callback=self.detail_page) 46 | 47 | @config(priority=2) 48 | def detail_page(self, response): 49 | return { 50 | "url": response.url, 51 | "title": response.doc('title').text(), 52 | } 53 | ``` 54 | 55 | > * `def on_start(self)` is the entry point of the script. It will be called when you click the `run` button on dashboard. 56 | > * [`self.crawl(url, callback=self.index_page)`*](/apis/self.crawl) is the most important API here. It will add a new task to be crawled. Most of the options will be spicified via `self.crawl` arguments. 57 | > * `def index_page(self, response)` get a [`Response`*](/apis/Response) object. [`response.doc`*](/apis/Response/#responsedoc) is a [pyquery](https://pythonhosted.org/pyquery/) object which has jQuery-like API to select elements to be extracted. 58 | > * `def detail_page(self, response)` return a `dict` object as result. The result will be captured into `resultdb` by default. You can override `on_result(self, result)` method to manage the result yourself. 59 | 60 | 61 | More things you may want to know: 62 | 63 | > * [`@every(minutes=24*60, seconds=0)`*](/apis/@every/) is a helper to tell the scheduler that `on_start` method should be called everyday. 
64 | > * [`@config(age=10 * 24 * 60 * 60)`*](/apis/self.crawl/#configkwargs) specified the default `age` parameter of `self.crawl` with page type `index_page` (when `callback=self.index_page`). The parameter [`age`*](/apis/self.crawl/#age) can be specified via `self.crawl(url, age=10*24*60*60)` (highest priority) and `crawl_config` (lowest priority). 65 | > * [`age=10 * 24 * 60 * 60`*](/apis/self.crawl/#age) tell scheduler discard the request if it have been crawled in 10 days. pyspider will not crawl a same URL twice by default (discard forever), even you had modified the code, it's very common for beginners that runs the project the first time and modified it and run it the second time, it will not crawl again (read [`itag`](/apis/self.crawl/#itag) for solution) 66 | > * [`@config(priority=2)`*](/apis/self.crawl/#schedule) mark that detail pages should be crawled first. 67 | 68 | You can test your script step by step by click the green `run` button. Switch to `follows` panel, click the play button to move on. 69 | 70 | ![run one step](imgs/run_one_step.png) 71 | 72 | Start Running 73 | ------------- 74 | 75 | 1. Save your script. 76 | 2. Back to dashboard find your project. 77 | 3. Changing the `status` to `DEBUG` or `RUNNING`. 78 | 4. Click the `run` button. 79 | 80 | ![index demo](imgs/index_page.png) 81 | 82 | Your script is running now! 83 | -------------------------------------------------------------------------------- /docs/Running-pyspider-with-Docker.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | # mysql 3 | docker run --name mysql -d -v /data/mysql:/var/lib/mysql -e MYSQL_ALLOW_EMPTY_PASSWORD=yes mysql:latest 4 | # rabbitmq 5 | docker run --name rabbitmq -d rabbitmq:latest 6 | 7 | # phantomjs 8 | docker run --name phantomjs -d binux/pyspider:latest phantomjs 9 | 10 | # result worker 11 | docker run --name result_worker -m 128m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest result_worker 12 | # processor, run multiple instance if needed. 13 | docker run --name processor -m 256m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest processor 14 | # fetcher, run multiple instance if needed. 15 | docker run --name fetcher -m 256m -d --link phantomjs:phantomjs --link rabbitmq:rabbitmq binux/pyspider:latest fetcher --no-xmlrpc 16 | # scheduler 17 | docker run --name scheduler -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest scheduler 18 | # webui 19 | docker run --name webui -m 256m -d -p 5000:5000 --link mysql:mysql --link rabbitmq:rabbitmq --link scheduler:scheduler --link phantomjs:phantomjs binux/pyspider:latest webui 20 | ``` 21 | 22 | or running with [Docker Compose](https://docs.docker.com/compose/) with `docker-compose.yml`: 23 | 24 | NOTE: It's recommended to run mysql and rabbitmq outside compose as they may not been restarted with pyspider. You can find commands to start mysql and rabbitmq service above. 
25 | 26 | ``` 27 | phantomjs: 28 | image: binux/pyspider:latest 29 | command: phantomjs 30 | result: 31 | image: binux/pyspider:latest 32 | external_links: 33 | - mysql 34 | - rabbitmq 35 | command: result_worker 36 | processor: 37 | image: binux/pyspider:latest 38 | external_links: 39 | - mysql 40 | - rabbitmq 41 | command: processor 42 | fetcher: 43 | image: binux/pyspider:latest 44 | external_links: 45 | - rabbitmq 46 | links: 47 | - phantomjs 48 | command : fetcher 49 | scheduler: 50 | image: binux/pyspider:latest 51 | external_links: 52 | - mysql 53 | - rabbitmq 54 | command: scheduler 55 | webui: 56 | image: binux/pyspider:latest 57 | external_links: 58 | - mysql 59 | - rabbitmq 60 | links: 61 | - scheduler 62 | - phantomjs 63 | command: webui 64 | ports: 65 | - "5000:5000" 66 | ``` 67 | 68 | `docker-compose up` 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/Script-Environment.md: -------------------------------------------------------------------------------- 1 | Script Environment 2 | ================== 3 | 4 | Variables 5 | --------- 6 | * `self.project_name` 7 | * `self.project` information about current project 8 | * `self.response` 9 | * `self.task` 10 | 11 | About Script 12 | ------------ 13 | * The name of `Handler` is not matters, but you need at least one class inherit from `BaseHandler` 14 | * A third parameter can be set to get task object: `def callback(self, response, task)` 15 | * Non-200 response will not submit to callback by default. Use `@catch_status_code_error` 16 | 17 | About Environment 18 | ----------------- 19 | * `logging`, `print` and exceptions will be captured. 20 | * You can import other projects as module with `from projects import some_project` 21 | 22 | ### Web view 23 | 24 | * view the page as a browser would render (approximately) 25 | 26 | ### HTML view 27 | 28 | * view the HTML of the current callback (index_page, detail_page, etc.) 29 | 30 | ### Follows view 31 | 32 | * view the callbacks that can be made from the current callback 33 | * index_page follows view will show the detail_page callbacks that can be executed. 34 | 35 | ### Messages view 36 | 37 | * shows the messages send by [`self.send_message`](apis/self.send_message) API. 38 | 39 | ### Enable CSS Selector Helper 40 | 41 | * Enable a CSS Selector Helper of the Web view. It gets the CSS Selector of the element you clicked then add it to your script. 42 | -------------------------------------------------------------------------------- /docs/Working-with-Results.md: -------------------------------------------------------------------------------- 1 | Working with Results 2 | ==================== 3 | Downloading and viewing your data from WebUI is convenient, but may not suitable for computer. 4 | 5 | Working with ResultDB 6 | --------------------- 7 | Although resultdb is only designed for result preview, not suitable for large scale storage. But if you want to grab data from resultdb, there are some simple snippets using database API that can help you to connect and select the data. 8 | 9 | ``` 10 | from pyspider.database import connect_database 11 | resultdb = connect_database("") 12 | for project in resultdb.projects: 13 | for result in resultdb.select(project): 14 | assert result['taskid'] 15 | assert result['url'] 16 | assert result['result'] 17 | ``` 18 | 19 | The `result['result']` is the object submitted by `return` statement from your script. 
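For example, here is a minimal sketch that dumps every project's results to JSON-lines files using the snippet above. The connection URL and output filename pattern are assumptions (a local SQLite resultdb under `./data/`); replace the URL with the resultdb string from your own configuration.

```
import json
from pyspider.database import connect_database

# assumed connection URL -- point this at your own resultdb
resultdb = connect_database("sqlite+resultdb:///data/result.db")

for project in resultdb.projects:
    with open("%s_results.jsonl" % project, "w") as fp:
        for result in resultdb.select(project):
            # result['result'] is whatever your callback returned
            fp.write(json.dumps(result["result"]) + "\n")
```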
20 | 21 | Working with ResultWorker 22 | ------------------------- 23 | In a production environment, you may want to connect pyspider to your own system / post-processing pipeline rather than store results in resultdb. In that case it's highly recommended to override ResultWorker. 24 | 25 | ``` 26 | from pyspider.result import ResultWorker 27 | 28 | class MyResultWorker(ResultWorker): 29 |     def on_result(self, task, result): 30 |         assert task['taskid'] 31 |         assert task['project'] 32 |         assert task['url'] 33 |         assert result 34 |         # your processing code goes here 35 | ``` 36 | 37 | `result` is the object submitted by the `return` statement of your script. 38 | 39 | You can put this script (e.g. `my_result_worker.py`) in the folder where you launch pyspider, and pass it as an argument to the `result_worker` subcommand: 40 | 41 | `pyspider result_worker --result-cls=my_result_worker.MyResultWorker` 42 | 43 | Or 44 | 45 | ``` 46 | { 47 |   ... 48 |   "result_worker": { 49 |     "result_cls": "my_result_worker.MyResultWorker" 50 |   } 51 |   ... 52 | } 53 | ``` 54 | 55 | if you are using a config file. [Please refer to Deployment](/Deployment) 56 | 57 | Design Your Own Database Schema 58 | ------------------------------- 59 | The results stored in the database are encoded as JSON for compatibility. It's highly recommended to design your own database schema and override the ResultWorker described above. 60 | 61 | TIPS about Results 62 | ------------------- 63 | #### Want to return more than one result in a callback? 64 | As resultdb de-duplicates results by taskid (url), later results overwrite previous ones. 65 | 66 | One workaround is using the `send_message` API to make a `fake` taskid for each result. 67 | 68 | ``` 69 | def detail_page(self, response): 70 |     for li in response.doc('li').items(): 71 |         self.send_message(self.project_name, { 72 |             ... 73 |         }, url=response.url+"#"+li('a.product-sku').text()) 74 | 75 | def on_message(self, project, msg): 76 |     return msg 77 | ``` 78 | 79 | See Also: [apis/self.send_message](/apis/self.send_message) 80 | -------------------------------------------------------------------------------- /docs/apis/@catch_status_code_error.md: -------------------------------------------------------------------------------- 1 | @catch_status_code_error 2 | ======================== 3 | 4 | A non-200 response is regarded as a failed fetch and will not be passed to the callback. Use this decorator to override that behaviour. 5 | 6 | ```python 7 | def on_start(self): 8 |     self.crawl('http://httpbin.org/status/404', self.callback) 9 | 10 | @catch_status_code_error 11 | def callback(self, response): 12 |     ... 13 | ``` 14 | 15 | > Normally the `callback` would not be executed because the request failed (with status code 404). With the `@catch_status_code_error` decorator, the `callback` is executed even though the request failed. 16 | 17 | -------------------------------------------------------------------------------- /docs/apis/@every.md: -------------------------------------------------------------------------------- 1 | @every(minutes=0, seconds=0) 2 | ============================ 3 | 4 | The decorated method will be called every `minutes` or `seconds`. 5 | 6 | 7 | ```python 8 | @every(minutes=24 * 60) 9 | def on_start(self): 10 |     for url in urllist: 11 |         self.crawl(url, callback=self.index_page) 12 | ``` 13 | 14 | The URLs would be restarted every 24 hours.
Note that if `age` is also used and its period is longer than that of `@every`, the crawl request will be discarded because it is regarded as not changed: 15 | 16 | ```python 17 | @every(minutes=24 * 60) 18 | def on_start(self): 19 |     self.crawl('http://www.example.org/', callback=self.index_page) 20 | 21 | @config(age=10 * 24 * 60 * 60) 22 | def index_page(self): 23 |     ... 24 | ``` 25 | 26 | > Even though the crawl request is triggered every day, it is discarded and only restarted every 10 days. 27 | 28 | -------------------------------------------------------------------------------- /docs/apis/Response.md: -------------------------------------------------------------------------------- 1 | Response 2 | ======== 3 | 4 | The attributes of the Response object. 5 | 6 | ### Response.url 7 | 8 | The final URL. 9 | 10 | ### Response.text 11 | 12 | Content of the response, in unicode. 13 | 14 | If `Response.encoding` is None and the `chardet` module is available, the encoding of the content will be guessed. 15 | 16 | ### Response.content 17 | 18 | Content of the response, in bytes. 19 | 20 | ### Response.doc 21 | 22 | A [PyQuery](https://pythonhosted.org/pyquery/) object of the response's content. Links are made absolute by default. 23 | 24 | Refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) 25 | 26 | It's important, so I will repeat it: refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) 27 | 28 | ### Response.etree 29 | 30 | A [lxml](http://lxml.de/) object of the response's content. 31 | 32 | ### Response.json 33 | 34 | The JSON-encoded content of the response, if any. 35 | 36 | ### Response.status_code 37 | 38 | ### Response.orig_url 39 | 40 | If there was any redirection during the request, this is the URL you originally submitted via `self.crawl`. 41 | 42 | ### Response.headers 43 | 44 | A case-insensitive dict holding the headers of the response. 45 | 46 | ### Response.cookies 47 | 48 | ### Response.error 49 | 50 | The error message when the fetch failed. 51 | 52 | ### Response.time 53 | 54 | Time used during fetching. 55 | 56 | ### Response.ok 57 | 58 | True if `status_code` is 200 and there is no error. 59 | 60 | ### Response.encoding 61 | 62 | Encoding of Response.content. 63 | 64 | If Response.encoding is None, the encoding will be guessed from the headers or content, or by `chardet` (if available). 65 | 66 | Setting the encoding manually overrides the guessed encoding. 67 | 68 | ### Response.save 69 | 70 | The object saved via the [`self.crawl`](/apis/self.crawl/#save) API. 71 | 72 | ### Response.js_script_result 73 | 74 | Content returned by the JS script. 75 | 76 | ### Response.raise_for_status() 77 | 78 | Raise HTTPError if the status code is not 200 or `Response.error` exists. 79 | 80 | -------------------------------------------------------------------------------- /docs/apis/index.md: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | - [self.crawl](self.crawl) 5 | - [Response](Response) 6 | - [self.send_message](self.send_message) 7 | - [@every](@every) 8 | - [@catch_status_code_error](@catch_status_code_error) 9 | -------------------------------------------------------------------------------- /docs/apis/self.send_message.md: -------------------------------------------------------------------------------- 1 | self.send_message 2 | ================= 3 | 4 | self.send_message(project, msg, [url]) 5 | -------------------------------------- 6 | send messages to another project.
can been received by `def on_message(self, project, message)` callback. 7 | 8 | - `project` - other project name 9 | - `msg` - any json-able object 10 | - `url` - result will been overwrite if have same `taskid`. `send_message` share a same `taskid` by default. Change this to return multiple result by one response. 11 | 12 | ```python 13 | def detail_page(self, response): 14 | for i, each in enumerate(response.json['products']): 15 | self.send_message(self.project_name, { 16 | "name": each['name'], 17 | 'price': each['prices'], 18 | }, url="%s#%s" % (response.url, i)) 19 | 20 | def on_message(self, project, msg): 21 | return msg 22 | ``` 23 | 24 | pyspider send_message [OPTIONS] PROJECT MESSAGE 25 | ----------------------------------------------- 26 | 27 | You can also send message from command line. 28 | 29 | ``` 30 | Usage: pyspider send_message [OPTIONS] PROJECT MESSAGE 31 | 32 | Send Message to project from command line 33 | 34 | Options: 35 | --scheduler-rpc TEXT xmlrpc path of scheduler 36 | --help Show this message and exit. 37 | ``` 38 | 39 | def on_message(self, project, message) 40 | -------------------------------------- 41 | receive message from other project 42 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-11-10 01:31:54 7 | 8 | import sys 9 | from unittest.mock import MagicMock 10 | from recommonmark.parser import CommonMarkParser 11 | 12 | class Mock(MagicMock): 13 | @classmethod 14 | def __getattr__(cls, name): 15 | return Mock() 16 | 17 | MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2'] 18 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 19 | 20 | source_parsers = { 21 | '.md': CommonMarkParser, 22 | } 23 | 24 | source_suffix = ['.rst', '.md'] 25 | -------------------------------------------------------------------------------- /docs/imgs/creating_a_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/creating_a_project.png -------------------------------------------------------------------------------- /docs/imgs/css_selector_helper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/css_selector_helper.png -------------------------------------------------------------------------------- /docs/imgs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/demo.png -------------------------------------------------------------------------------- /docs/imgs/developer-tools-network-filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/developer-tools-network-filter.png -------------------------------------------------------------------------------- /docs/imgs/developer-tools-network.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/developer-tools-network.png -------------------------------------------------------------------------------- /docs/imgs/index_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/index_page.png -------------------------------------------------------------------------------- /docs/imgs/inspect_element.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/inspect_element.png -------------------------------------------------------------------------------- /docs/imgs/pyspider-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/pyspider-arch.png -------------------------------------------------------------------------------- /docs/imgs/request-headers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/request-headers.png -------------------------------------------------------------------------------- /docs/imgs/run_one_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/run_one_step.png -------------------------------------------------------------------------------- /docs/imgs/search-for-request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/search-for-request.png -------------------------------------------------------------------------------- /docs/imgs/tutorial_imdb_front.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/tutorial_imdb_front.png -------------------------------------------------------------------------------- /docs/imgs/twitch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/twitch.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | pyspider [![Build Status][Build Status]][Travis CI] [![Coverage Status][Coverage Status]][Coverage] [![Try][Try]][Demo] 2 | ======== 3 | 4 | A Powerful Spider(Web Crawler) System in Python. 
**[TRY IT NOW!][Demo]** 5 | 6 | - Write script in Python 7 | - Powerful WebUI with script editor, task monitor, project manager and result viewer 8 | - [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend 9 | - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue 10 | - Task priority, retry, periodical, recrawl by age, etc... 11 | - Distributed architecture, Crawl Javascript pages, Python 2&3, etc... 12 | 13 | Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) 14 | Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) 15 | Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) 16 | 17 | Sample Code 18 | ----------- 19 | 20 | ```python 21 | from pyspider.libs.base_handler import * 22 | 23 | 24 | class Handler(BaseHandler): 25 | crawl_config = { 26 | } 27 | 28 | @every(minutes=24 * 60) 29 | def on_start(self): 30 | self.crawl('http://scrapy.org/', callback=self.index_page) 31 | 32 | @config(age=10 * 24 * 60 * 60) 33 | def index_page(self, response): 34 | for each in response.doc('a[href^="http"]').items(): 35 | self.crawl(each.attr.href, callback=self.detail_page) 36 | 37 | def detail_page(self, response): 38 | return { 39 | "url": response.url, 40 | "title": response.doc('title').text(), 41 | } 42 | ``` 43 | 44 | [![Demo][Demo Img]][Demo] 45 | 46 | 47 | Installation 48 | ------------ 49 | 50 | * `pip install pyspider` 51 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 52 | 53 | Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) 54 | 55 | Contribute 56 | ---------- 57 | 58 | * Use It 59 | * Open [Issue], send PR 60 | * [User Group] 61 | * [中文问答](http://segmentfault.com/t/pyspider) 62 | 63 | 64 | TODO 65 | ---- 66 | 67 | ### v0.4.0 68 | 69 | - [x] local mode, load script from file. 
70 | - [x] works as a framework (all components running in one process, no threads) 71 | - [x] redis 72 | - [x] shell mode like `scrapy shell` 73 | - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) 74 | 75 | 76 | ### more 77 | 78 | - [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV) 79 | 80 | 81 | License 82 | ------- 83 | Licensed under the Apache License, Version 2.0 84 | 85 | 86 | [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat 87 | [Travis CI]: https://travis-ci.org/binux/pyspider 88 | [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat 89 | [Coverage]: https://coveralls.io/r/binux/pyspider 90 | [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat 91 | [Demo]: http://demo.pyspider.org/ 92 | [Demo Img]: imgs/demo.png 93 | [Issue]: https://github.com/binux/pyspider/issues 94 | [User Group]: https://groups.google.com/group/pyspider-users 95 | -------------------------------------------------------------------------------- /docs/tutorial/Render-with-PhantomJS.md: -------------------------------------------------------------------------------- 1 | Level 3: Render with PhantomJS 2 | ============================== 3 | 4 | Sometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS]. 5 | 6 | To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`. 7 | 8 | Make sure phantomjs is working by running 9 | ``` 10 | $ pyspider phantomjs 11 | ``` 12 | 13 | Continue with the rest of the tutorial if the output is 14 | ``` 15 | Web server running on port 25555 16 | ``` 17 | 18 | Use PhantomJS 19 | ------------- 20 | 21 | When pyspider with PhantomJS connected, you can enable this feature by adding a parameter `fetch_type='js'` to `self.crawl`. We use PhantomJS to scrape channel list of [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) which is loaded with AJAX we discussed in [Level 2](tutorial/AJAX-and-more-HTTP#ajax): 22 | 23 | ``` 24 | class Handler(BaseHandler): 25 | def on_start(self): 26 | self.crawl('http://www.twitch.tv/directory/game/Dota%202', 27 | fetch_type='js', callback=self.index_page) 28 | 29 | def index_page(self, response): 30 | return { 31 | "url": response.url, 32 | "channels": [{ 33 | "title": x('.title').text(), 34 | "viewers": x('.info').contents()[2], 35 | "name": x('.info a').text(), 36 | } for x in response.doc('.stream.item').items()] 37 | } 38 | ``` 39 | > I used some API to handle the list of streams. You can find complete API reference from [PyQuery complete API](https://pythonhosted.org/pyquery/api.html) 40 | 41 | Running JavaScript on Page 42 | -------------------------- 43 | 44 | We will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section. Only 25 images is shown at the beginning, more images would be loaded when you scroll to the bottom of the page. 
45 | 46 | To scrape images as many as posible we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function wrapped JavaScript codes to simulate the scroll action: 47 | 48 | ``` 49 | class Handler(BaseHandler): 50 | def on_start(self): 51 | self.crawl('http://www.pinterest.com/categories/popular/', 52 | fetch_type='js', js_script=""" 53 | function() { 54 | window.scrollTo(0,document.body.scrollHeight); 55 | } 56 | """, callback=self.index_page) 57 | 58 | def index_page(self, response): 59 | return { 60 | "url": response.url, 61 | "images": [{ 62 | "title": x('.richPinGridTitle').text(), 63 | "img": x('.pinImg').attr('src'), 64 | "author": x('.creditName').text(), 65 | } for x in response.doc('.item').items() if x('.pinImg')] 66 | } 67 | ``` 68 | 69 | > * Script would been executed after page loaded(can been changed via [`js_run_at` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher)) 70 | > * We scroll once after page loaded, you can scroll multiple times using [`setTimeout`](https://developer.mozilla.org/en-US/docs/Web/API/WindowTimers.setTimeout). PhantomJS will fetch as many items as possible before timeout arrived. 71 | 72 | Online demo: [http://demo.pyspider.org/debug/tutorial_pinterest](http://demo.pyspider.org/debug/tutorial_pinterest) 73 | 74 | 75 | 76 | [PhantomJS]: http://phantomjs.org/ 77 | -------------------------------------------------------------------------------- /docs/tutorial/index.md: -------------------------------------------------------------------------------- 1 | pyspider Tutorial 2 | ================= 3 | 4 | > The best way to learn how to scrap is learning how to make it. 5 | 6 | * [Level 1: HTML and CSS Selector](HTML-and-CSS-Selector) 7 | * [Level 2: AJAX and More HTTP](AJAX-and-more-HTTP) 8 | * [Level 3: Render with PhantomJS](Render-with-PhantomJS) 9 | 10 | If you have problem using pyspider, [user group](https://groups.google.com/group/pyspider-users) is a place for discussing. 11 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: pyspider 2 | site_description: A Powerful Spider(Web Crawler) System in Python. 
3 | site_author: binux 4 | repo_url: https://github.com/binux/pyspider 5 | pages: 6 | - Introduction: index.md 7 | - Quickstart: Quickstart.md 8 | - Command Line: Command-Line.md 9 | - Tutorial: 10 | - Index: tutorial/index.md 11 | - 'Level 1: HTML and CSS Selector': tutorial/HTML-and-CSS-Selector.md 12 | - 'Level 2: AJAX and More HTTP': tutorial/AJAX-and-more-HTTP.md 13 | - 'Level 3: Render with PhantomJS': tutorial/Render-with-PhantomJS.md 14 | - About pyspider: 15 | - Architecture: Architecture.md 16 | - About Tasks: About-Tasks.md 17 | - About Projects: About-Projects.md 18 | - Script Environment: Script-Environment.md 19 | - Working with Results: Working-with-Results.md 20 | - API Reference: 21 | - Index: apis/index.md 22 | - self.crawl: apis/self.crawl.md 23 | - Response: apis/Response.md 24 | - self.send_message: apis/self.send_message.md 25 | - '@catch_status_code_error': apis/@catch_status_code_error.md 26 | - '@every': apis/@every.md 27 | - Deployment: Deployment.md 28 | - Running pyspider with Docker: Running-pyspider-with-Docker.md 29 | - Deployment of demo.pyspider.org: Deployment-demo.pyspider.org.md 30 | - Frequently Asked Questions: Frequently-Asked-Questions.md 31 | 32 | theme: readthedocs 33 | markdown_extensions: ['toc(permalink=true)', ] 34 | -------------------------------------------------------------------------------- /pyspider/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-17 19:17:12 7 | 8 | __version__ = '0.4.0' 9 | -------------------------------------------------------------------------------- /pyspider/database/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/base/__init__.py -------------------------------------------------------------------------------- /pyspider/database/base/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 11:28:52 7 | 8 | import re 9 | 10 | # NOTE: When get/get_all/check_update from database with default fields, 11 | # all following fields should be included in output dict. 
12 | { 13 | 'project': { 14 | 'name': str, 15 | 'group': str, 16 | 'status': str, 17 | 'script': str, 18 | # 'config': str, 19 | 'comments': str, 20 | # 'priority': int, 21 | 'rate': int, 22 | 'burst': int, 23 | 'updatetime': int, 24 | } 25 | } 26 | 27 | 28 | class ProjectDB(object): 29 | status_str = [ 30 | 'TODO', 31 | 'STOP', 32 | 'CHECKING', 33 | 'DEBUG', 34 | 'RUNNING', 35 | ] 36 | 37 | def insert(self, name, obj={}): 38 | raise NotImplementedError 39 | 40 | def update(self, name, obj={}, **kwargs): 41 | raise NotImplementedError 42 | 43 | def get_all(self, fields=None): 44 | raise NotImplementedError 45 | 46 | def get(self, name, fields): 47 | raise NotImplementedError 48 | 49 | def drop(self, name): 50 | raise NotImplementedError 51 | 52 | def check_update(self, timestamp, fields=None): 53 | raise NotImplementedError 54 | 55 | def split_group(self, group, lower=True): 56 | if lower: 57 | return re.split("\W+", (group or '').lower()) 58 | else: 59 | return re.split("\W+", group or '') 60 | 61 | def verify_project_name(self, name): 62 | if len(name) > 64: 63 | return False 64 | if re.search(r"[^\w]", name): 65 | return False 66 | return True 67 | 68 | def copy(self): 69 | ''' 70 | database should be able to copy itself to create new connection 71 | 72 | it's implemented automatically by pyspider.database.connect_database 73 | if you are not create database connection via connect_database method, 74 | you should implement this 75 | ''' 76 | raise NotImplementedError 77 | -------------------------------------------------------------------------------- /pyspider/database/base/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-11 18:40:03 7 | 8 | # result schema 9 | { 10 | 'result': { 11 | 'taskid': str, # new, not changeable 12 | 'project': str, # new, not changeable 13 | 'url': str, # new, not changeable 14 | 'result': str, # json string 15 | 'updatetime': int, 16 | } 17 | } 18 | 19 | 20 | class ResultDB(object): 21 | """ 22 | database for result 23 | """ 24 | projects = set() # projects in resultdb 25 | 26 | def save(self, project, taskid, url, result): 27 | raise NotImplementedError 28 | 29 | def select(self, project, fields=None, offset=0, limit=None): 30 | raise NotImplementedError 31 | 32 | def count(self, project): 33 | raise NotImplementedError 34 | 35 | def get(self, project, taskid, fields=None): 36 | raise NotImplementedError 37 | 38 | def drop(self, project): 39 | raise NotImplementedError 40 | 41 | def copy(self): 42 | ''' 43 | database should be able to copy itself to create new connection 44 | 45 | it's implemented automatically by pyspider.database.connect_database 46 | if you are not create database connection via connect_database method, 47 | you should implement this 48 | ''' 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /pyspider/database/base/taskdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-08 10:28:48 7 | 8 | # task schema 9 | { 10 | 'task': { 11 | 'taskid': str, # new, not change 12 | 'project': str, # new, not change 13 | 'url': str, # new, not change 14 | 
'status': int, # change 15 | 'schedule': { 16 | 'priority': int, 17 | 'retries': int, 18 | 'retried': int, 19 | 'exetime': int, 20 | 'age': int, 21 | 'itag': str, 22 | # 'recrawl': int 23 | }, # new and restart 24 | 'fetch': { 25 | 'method': str, 26 | 'headers': dict, 27 | 'data': str, 28 | 'timeout': int, 29 | 'save': dict, 30 | }, # new and restart 31 | 'process': { 32 | 'callback': str, 33 | }, # new and restart 34 | 'track': { 35 | 'fetch': { 36 | 'ok': bool, 37 | 'time': int, 38 | 'status_code': int, 39 | 'headers': dict, 40 | 'encoding': str, 41 | 'content': str, 42 | }, 43 | 'process': { 44 | 'ok': bool, 45 | 'time': int, 46 | 'follows': int, 47 | 'outputs': int, 48 | 'logs': str, 49 | 'exception': str, 50 | }, 51 | 'save': object, # jsonable object saved by processor 52 | }, # finish 53 | 'lastcrawltime': int, # keep between request 54 | 'updatetime': int, # keep between request 55 | } 56 | } 57 | 58 | 59 | class TaskDB(object): 60 | ACTIVE = 1 61 | SUCCESS = 2 62 | FAILED = 3 63 | BAD = 4 64 | 65 | projects = set() # projects in taskdb 66 | 67 | def load_tasks(self, status, project=None, fields=None): 68 | raise NotImplementedError 69 | 70 | def get_task(self, project, taskid, fields=None): 71 | raise NotImplementedError 72 | 73 | def status_count(self, project): 74 | ''' 75 | return a dict 76 | ''' 77 | raise NotImplementedError 78 | 79 | def insert(self, project, taskid, obj={}): 80 | raise NotImplementedError 81 | 82 | def update(self, project, taskid, obj={}, **kwargs): 83 | raise NotImplementedError 84 | 85 | def drop(self, project): 86 | raise NotImplementedError 87 | 88 | @staticmethod 89 | def status_to_string(status): 90 | return { 91 | 1: 'ACTIVE', 92 | 2: 'SUCCESS', 93 | 3: 'FAILED', 94 | 4: 'BAD', 95 | }.get(status, 'UNKNOWN') 96 | 97 | @staticmethod 98 | def status_to_int(status): 99 | return { 100 | 'ACTIVE': 1, 101 | 'SUCCESS': 2, 102 | 'FAILED': 3, 103 | 'BAD': 4, 104 | }.get(status, 4) 105 | 106 | def copy(self): 107 | ''' 108 | database should be able to copy itself to create new connection 109 | 110 | it's implemented automatically by pyspider.database.connect_database 111 | if you are not create database connection via connect_database method, 112 | you should implement this 113 | ''' 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /pyspider/database/couchdb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/couchdb/__init__.py -------------------------------------------------------------------------------- /pyspider/database/couchdb/couchdbbase.py: -------------------------------------------------------------------------------- 1 | import time, requests, json 2 | from requests.auth import HTTPBasicAuth 3 | 4 | class SplitTableMixin(object): 5 | UPDATE_PROJECTS_TIME = 10 * 60 6 | 7 | def __init__(self): 8 | self.session = requests.session() 9 | if self.username: 10 | self.session.auth = HTTPBasicAuth(self.username, self.password) 11 | self.session.headers.update({'Content-Type': 'application/json'}) 12 | 13 | def _collection_name(self, project): 14 | if self.collection_prefix: 15 | return "%s_%s" % (self.collection_prefix, project) 16 | else: 17 | return project 18 | 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME: 23 | self._list_project() 24 | return 
self._projects 25 | 26 | 27 | @projects.setter 28 | def projects(self, value): 29 | self._projects = value 30 | 31 | 32 | def _list_project(self): 33 | self._last_update_projects = time.time() 34 | self.projects = set() 35 | if self.collection_prefix: 36 | prefix = "%s." % self.collection_prefix 37 | else: 38 | prefix = '' 39 | 40 | url = self.base_url + "_all_dbs" 41 | res = self.session.get(url, json={}).json() 42 | for each in res: 43 | if each.startswith('_'): 44 | continue 45 | if each.startswith(self.database): 46 | self.projects.add(each[len(self.database)+1+len(prefix):]) 47 | 48 | 49 | def create_database(self, name): 50 | url = self.base_url + name 51 | res = self.session.put(url).json() 52 | if 'error' in res and res['error'] == 'unauthorized': 53 | raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password)) 54 | return res 55 | 56 | 57 | def get_doc(self, db_name, doc_id): 58 | url = self.base_url + db_name + "/" + doc_id 59 | res = self.session.get(url).json() 60 | if "error" in res and res["error"] == "not_found": 61 | return None 62 | return res 63 | 64 | 65 | def get_docs(self, db_name, selector): 66 | url = self.base_url + db_name + "/_find" 67 | selector['use_index'] = self.index 68 | res = self.session.post(url, json=selector).json() 69 | if 'error' in res and res['error'] == 'not_found': 70 | return [] 71 | return res['docs'] 72 | 73 | 74 | def get_all_docs(self, db_name): 75 | return self.get_docs(db_name, {"selector": {}}) 76 | 77 | 78 | def insert_doc(self, db_name, doc_id, doc): 79 | url = self.base_url + db_name + "/" + doc_id 80 | return self.session.put(url, json=doc).json() 81 | 82 | 83 | def update_doc(self, db_name, doc_id, new_doc): 84 | doc = self.get_doc(db_name, doc_id) 85 | if doc is None: 86 | return self.insert_doc(db_name, doc_id, new_doc) 87 | for key in new_doc: 88 | doc[key] = new_doc[key] 89 | url = self.base_url + db_name + "/" + doc_id 90 | return self.session.put(url, json=doc).json() 91 | 92 | 93 | def delete(self, url): 94 | return self.session.delete(url).json() 95 | 96 | -------------------------------------------------------------------------------- /pyspider/database/couchdb/resultdb.py: -------------------------------------------------------------------------------- 1 | import time, json 2 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 3 | from .couchdbbase import SplitTableMixin 4 | 5 | 6 | class ResultDB(SplitTableMixin, BaseResultDB): 7 | collection_prefix = '' 8 | 9 | def __init__(self, url, database='resultdb', username=None, password=None): 10 | self.username = username 11 | self.password = password 12 | self.base_url = url 13 | self.url = url + database + "/" 14 | self.database = database 15 | 16 | super().__init__() 17 | self.create_database(database) 18 | self.index = None 19 | 20 | def _get_collection_name(self, project): 21 | return self.database + "_" + self._collection_name(project) 22 | 23 | def _create_project(self, project): 24 | collection_name = self._get_collection_name(project) 25 | self.create_database(collection_name) 26 | # create index 27 | payload = { 28 | 'index': { 29 | 'fields': ['taskid'] 30 | }, 31 | 'name': collection_name 32 | } 33 | 34 | res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json() 35 | self.index = res['id'] 36 | self._list_project() 37 | 38 | def save(self, project, taskid, url, result): 39 | if project not in self.projects: 40 | 
self._create_project(project) 41 | collection_name = self._get_collection_name(project) 42 | obj = { 43 | 'taskid': taskid, 44 | 'url': url, 45 | 'result': result, 46 | 'updatetime': time.time(), 47 | } 48 | return self.update_doc(collection_name, taskid, obj) 49 | 50 | def select(self, project, fields=None, offset=0, limit=0): 51 | if project not in self.projects: 52 | self._list_project() 53 | if project not in self.projects: 54 | return 55 | offset = offset or 0 56 | limit = limit or 0 57 | collection_name = self._get_collection_name(project) 58 | if fields is None: 59 | fields = [] 60 | if limit == 0: 61 | sel = { 62 | 'selector': {}, 63 | 'fields': fields, 64 | 'skip': offset 65 | } 66 | else: 67 | sel = { 68 | 'selector': {}, 69 | 'fields': fields, 70 | 'skip': offset, 71 | 'limit': limit 72 | } 73 | for result in self.get_docs(collection_name, sel): 74 | yield result 75 | 76 | def count(self, project): 77 | if project not in self.projects: 78 | self._list_project() 79 | if project not in self.projects: 80 | return 81 | collection_name = self._get_collection_name(project) 82 | return len(self.get_all_docs(collection_name)) 83 | 84 | def get(self, project, taskid, fields=None): 85 | if project not in self.projects: 86 | self._list_project() 87 | if project not in self.projects: 88 | return 89 | collection_name = self._get_collection_name(project) 90 | if fields is None: 91 | fields = [] 92 | sel = { 93 | 'selector': {'taskid': taskid}, 94 | 'fields': fields 95 | } 96 | ret = self.get_docs(collection_name, sel) 97 | if len(ret) == 0: 98 | return None 99 | return ret[0] 100 | 101 | def drop_database(self): 102 | return self.delete(self.url) 103 | 104 | def drop(self, project): 105 | # drop the project 106 | collection_name = self._get_collection_name(project) 107 | url = self.base_url + collection_name 108 | return self.delete(url) -------------------------------------------------------------------------------- /pyspider/database/couchdb/taskdb.py: -------------------------------------------------------------------------------- 1 | import json, time 2 | from pyspider.database.base.taskdb import TaskDB as BaseTaskDB 3 | from .couchdbbase import SplitTableMixin 4 | 5 | 6 | class TaskDB(SplitTableMixin, BaseTaskDB): 7 | collection_prefix = '' 8 | 9 | def __init__(self, url, database='taskdb', username=None, password=None): 10 | self.username = username 11 | self.password = password 12 | self.base_url = url 13 | self.url = url + database + "/" 14 | self.database = database 15 | self.index = None 16 | 17 | super().__init__() 18 | 19 | self.create_database(database) 20 | self.projects = set() 21 | self._list_project() 22 | 23 | def _get_collection_name(self, project): 24 | return self.database + "_" + self._collection_name(project) 25 | 26 | def _create_project(self, project): 27 | collection_name = self._get_collection_name(project) 28 | self.create_database(collection_name) 29 | # create index 30 | payload = { 31 | 'index': { 32 | 'fields': ['status', 'taskid'] 33 | }, 34 | 'name': collection_name 35 | } 36 | res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json() 37 | self.index = res['id'] 38 | self._list_project() 39 | 40 | def load_tasks(self, status, project=None, fields=None): 41 | if not project: 42 | self._list_project() 43 | if fields is None: 44 | fields = [] 45 | if project: 46 | projects = [project, ] 47 | else: 48 | projects = self.projects 49 | for project in projects: 50 | collection_name = self._get_collection_name(project) 51 | for task in 
self.get_docs(collection_name, {"selector": {"status": status}, "fields": fields}): 52 | yield task 53 | 54 | def get_task(self, project, taskid, fields=None): 55 | if project not in self.projects: 56 | self._list_project() 57 | if project not in self.projects: 58 | return 59 | if fields is None: 60 | fields = [] 61 | collection_name = self._get_collection_name(project) 62 | ret = self.get_docs(collection_name, {"selector": {"taskid": taskid}, "fields": fields}) 63 | if len(ret) == 0: 64 | return None 65 | return ret[0] 66 | 67 | def status_count(self, project): 68 | if project not in self.projects: 69 | self._list_project() 70 | if project not in self.projects: 71 | return {} 72 | collection_name = self._get_collection_name(project) 73 | 74 | def _count_for_status(collection_name, status): 75 | total = len(self.get_docs(collection_name, {"selector": {'status': status}})) 76 | return {'total': total, "_id": status} if total else None 77 | 78 | c = collection_name 79 | ret = filter(lambda x: x,map(lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED])) 80 | 81 | result = {} 82 | if isinstance(ret, dict): 83 | ret = ret.get('result', []) 84 | for each in ret: 85 | result[each['_id']] = each['total'] 86 | return result 87 | 88 | def insert(self, project, taskid, obj={}): 89 | if project not in self.projects: 90 | self._create_project(project) 91 | obj = dict(obj) 92 | obj['taskid'] = taskid 93 | obj['project'] = project 94 | obj['updatetime'] = time.time() 95 | return self.update(project, taskid, obj=obj) 96 | 97 | def update(self, project, taskid, obj={}, **kwargs): 98 | obj = dict(obj) 99 | obj.update(kwargs) 100 | obj['updatetime'] = time.time() 101 | collection_name = self._get_collection_name(project) 102 | return self.update_doc(collection_name, taskid, obj) 103 | 104 | def drop_database(self): 105 | return self.delete(self.url) 106 | 107 | def drop(self, project): 108 | collection_name = self._get_collection_name(project) 109 | url = self.base_url + collection_name 110 | return self.delete(url) -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:31:58 7 | -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:32:33 7 | 8 | import time 9 | 10 | import elasticsearch.helpers 11 | from elasticsearch import Elasticsearch 12 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 13 | 14 | 15 | class ProjectDB(BaseProjectDB): 16 | __type__ = 'project' 17 | 18 | def __init__(self, hosts, index='pyspider'): 19 | self.index = index 20 | self.es = Elasticsearch(hosts=hosts) 21 | 22 | self.es.indices.create(index=self.index, ignore=400) 23 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 24 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 25 | "_all": {"enabled": False}, 26 | "properties": { 27 | "updatetime": {"type": "double"} 28 | } 
29 | }) 30 | 31 | def insert(self, name, obj={}): 32 | obj = dict(obj) 33 | obj['name'] = name 34 | obj['updatetime'] = time.time() 35 | 36 | obj.setdefault('group', '') 37 | obj.setdefault('status', 'TODO') 38 | obj.setdefault('script', '') 39 | obj.setdefault('comments', '') 40 | obj.setdefault('rate', 0) 41 | obj.setdefault('burst', 0) 42 | 43 | return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, 44 | refresh=True) 45 | 46 | def update(self, name, obj={}, **kwargs): 47 | obj = dict(obj) 48 | obj.update(kwargs) 49 | obj['updatetime'] = time.time() 50 | return self.es.update(index=self.index, doc_type=self.__type__, 51 | body={'doc': obj}, id=name, refresh=True, ignore=404) 52 | 53 | def get_all(self, fields=None): 54 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 55 | query={'query': {"match_all": {}}}, 56 | _source_include=fields or []): 57 | yield record['_source'] 58 | 59 | def get(self, name, fields=None): 60 | ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, 61 | _source_include=fields or [], ignore=404) 62 | return ret.get('_source', None) 63 | 64 | def check_update(self, timestamp, fields=None): 65 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 66 | query={'query': {"range": { 67 | "updatetime": {"gte": timestamp} 68 | }}}, _source_include=fields or []): 69 | yield record['_source'] 70 | 71 | def drop(self, name): 72 | return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True) 73 | -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-18 19:41:24 7 | 8 | 9 | import time 10 | 11 | import elasticsearch.helpers 12 | from elasticsearch import Elasticsearch 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | 15 | 16 | class ResultDB(BaseResultDB): 17 | __type__ = 'result' 18 | 19 | def __init__(self, hosts, index='pyspider'): 20 | self.index = index 21 | self.es = Elasticsearch(hosts=hosts) 22 | 23 | self.es.indices.create(index=self.index, ignore=400) 24 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 25 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 26 | "_all": {"enabled": True}, 27 | "properties": { 28 | "taskid": {"enabled": False}, 29 | "project": {"type": "string", "index": "not_analyzed"}, 30 | "url": {"enabled": False}, 31 | } 32 | }) 33 | 34 | @property 35 | def projects(self): 36 | ret = self.es.search(index=self.index, doc_type=self.__type__, 37 | body={"aggs": {"projects": { 38 | "terms": {"field": "project"} 39 | }}}, _source=False) 40 | return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])] 41 | 42 | def save(self, project, taskid, url, result): 43 | obj = { 44 | 'taskid': taskid, 45 | 'project': project, 46 | 'url': url, 47 | 'result': result, 48 | 'updatetime': time.time(), 49 | } 50 | return self.es.index(index=self.index, doc_type=self.__type__, 51 | body=obj, id='%s:%s' % (project, taskid)) 52 | 53 | def select(self, project, fields=None, offset=0, limit=0): 54 | offset = offset or 0 55 | limit = limit or 0 56 | if not limit: 57 | for record in 
elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 58 | query={'query': {'term': {'project': project}}}, 59 | _source_include=fields or [], from_=offset, 60 | sort="updatetime:desc"): 61 | yield record['_source'] 62 | else: 63 | for record in self.es.search(index=self.index, doc_type=self.__type__, 64 | body={'query': {'term': {'project': project}}}, 65 | _source_include=fields or [], from_=offset, size=limit, 66 | sort="updatetime:desc" 67 | ).get('hits', {}).get('hits', []): 68 | yield record['_source'] 69 | 70 | def count(self, project): 71 | return self.es.count(index=self.index, doc_type=self.__type__, 72 | body={'query': {'term': {'project': project}}} 73 | ).get('count', 0) 74 | 75 | def get(self, project, taskid, fields=None): 76 | ret = self.es.get(index=self.index, doc_type=self.__type__, id="%s:%s" % (project, taskid), 77 | _source_include=fields or [], ignore=404) 78 | return ret.get('_source', None) 79 | 80 | def drop(self, project): 81 | self.refresh() 82 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 83 | query={'query': {'term': {'project': project}}}, 84 | _source=False): 85 | self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id']) 86 | 87 | def refresh(self): 88 | """ 89 | Explicitly refresh one or more index, making all operations 90 | performed since the last refresh available for search. 91 | """ 92 | self.es.indices.refresh(index=self.index) 93 | -------------------------------------------------------------------------------- /pyspider/database/local/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 20:56:50 7 | -------------------------------------------------------------------------------- /pyspider/database/local/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 12:32:17 7 | 8 | import os 9 | import re 10 | import six 11 | import glob 12 | import logging 13 | 14 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 15 | 16 | 17 | class ProjectDB(BaseProjectDB): 18 | """ProjectDB loading scripts from local file.""" 19 | 20 | def __init__(self, files): 21 | self.files = files 22 | self.projects = {} 23 | self.load_scripts() 24 | 25 | def load_scripts(self): 26 | project_names = set(self.projects.keys()) 27 | for path in self.files: 28 | for filename in glob.glob(path): 29 | name = os.path.splitext(os.path.basename(filename))[0] 30 | if name in project_names: 31 | project_names.remove(name) 32 | updatetime = os.path.getmtime(filename) 33 | if name not in self.projects or updatetime > self.projects[name]['updatetime']: 34 | project = self._build_project(filename) 35 | if not project: 36 | continue 37 | self.projects[project['name']] = project 38 | 39 | for name in project_names: 40 | del self.projects[name] 41 | 42 | rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M) 43 | burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M) 44 | 45 | def _build_project(self, filename): 46 | try: 47 | with open(filename) as fp: 48 | script = fp.read() 49 | m = self.rate_re.search(script) 50 | if m: 51 | 
rate = float(m.group(1)) 52 | else: 53 | rate = 1 54 | 55 | m = self.burst_re.search(script) 56 | if m: 57 | burst = float(m.group(1)) 58 | else: 59 | burst = 3 60 | 61 | return { 62 | 'name': os.path.splitext(os.path.basename(filename))[0], 63 | 'group': None, 64 | 'status': 'RUNNING', 65 | 'script': script, 66 | 'comments': None, 67 | 'rate': rate, 68 | 'burst': burst, 69 | 'updatetime': os.path.getmtime(filename), 70 | } 71 | except OSError as e: 72 | logging.error('loading project script error: %s', e) 73 | return None 74 | 75 | def get_all(self, fields=None): 76 | for projectname in self.projects: 77 | yield self.get(projectname, fields) 78 | 79 | def get(self, name, fields=None): 80 | if name not in self.projects: 81 | return None 82 | project = self.projects[name] 83 | result = {} 84 | for f in fields or project: 85 | if f in project: 86 | result[f] = project[f] 87 | else: 88 | result[f] = None 89 | return result 90 | 91 | def check_update(self, timestamp, fields=None): 92 | self.load_scripts() 93 | for projectname, project in six.iteritems(self.projects): 94 | if project['updatetime'] > timestamp: 95 | yield self.get(projectname, fields) 96 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/mongodb/__init__.py -------------------------------------------------------------------------------- /pyspider/database/mongodb/mongodbbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:42:01 7 | 8 | import time 9 | 10 | 11 | class SplitTableMixin(object): 12 | UPDATE_PROJECTS_TIME = 10 * 60 13 | 14 | def _collection_name(self, project): 15 | if self.collection_prefix: 16 | return "%s.%s" % (self.collection_prefix, project) 17 | else: 18 | return project 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME: 23 | self._list_project() 24 | return self._projects 25 | 26 | @projects.setter 27 | def projects(self, value): 28 | self._projects = value 29 | 30 | def _list_project(self): 31 | self._last_update_projects = time.time() 32 | self.projects = set() 33 | if self.collection_prefix: 34 | prefix = "%s." 
% self.collection_prefix 35 | else: 36 | prefix = '' 37 | for each in self.database.collection_names(): 38 | if each.startswith('system.'): 39 | continue 40 | if each.startswith(prefix): 41 | self.projects.add(each[len(prefix):]) 42 | 43 | def drop(self, project): 44 | if project not in self.projects: 45 | self._list_project() 46 | if project not in self.projects: 47 | return 48 | collection_name = self._collection_name(project) 49 | self.database[collection_name].drop() 50 | self._list_project() 51 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-12 12:22:42 7 | 8 | import time 9 | from pymongo import MongoClient 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | 13 | 14 | class ProjectDB(BaseProjectDB): 15 | __collection_name__ = 'projectdb' 16 | 17 | def __init__(self, url, database='projectdb'): 18 | self.conn = MongoClient(url) 19 | self.conn.admin.command("ismaster") 20 | self.database = self.conn[database] 21 | self.collection = self.database[self.__collection_name__] 22 | 23 | self.collection.ensure_index('name', unique=True) 24 | 25 | def _default_fields(self, each): 26 | if each is None: 27 | return each 28 | each.setdefault('group', None) 29 | each.setdefault('status', 'TODO') 30 | each.setdefault('script', '') 31 | each.setdefault('comments', None) 32 | each.setdefault('rate', 0) 33 | each.setdefault('burst', 0) 34 | each.setdefault('updatetime', 0) 35 | return each 36 | 37 | def insert(self, name, obj={}): 38 | obj = dict(obj) 39 | obj['name'] = name 40 | obj['updatetime'] = time.time() 41 | return self.collection.update({'name': name}, {'$set': obj}, upsert=True) 42 | 43 | def update(self, name, obj={}, **kwargs): 44 | obj = dict(obj) 45 | obj.update(kwargs) 46 | obj['updatetime'] = time.time() 47 | return self.collection.update({'name': name}, {'$set': obj}) 48 | 49 | def get_all(self, fields=None): 50 | for each in self.collection.find({}, fields): 51 | if each and '_id' in each: 52 | del each['_id'] 53 | yield self._default_fields(each) 54 | 55 | def get(self, name, fields=None): 56 | each = self.collection.find_one({'name': name}, fields) 57 | if each and '_id' in each: 58 | del each['_id'] 59 | return self._default_fields(each) 60 | 61 | def check_update(self, timestamp, fields=None): 62 | for project in self.get_all(fields=('updatetime', 'name')): 63 | if project['updatetime'] > timestamp: 64 | project = self.get(project['name'], fields) 65 | yield self._default_fields(project) 66 | 67 | def drop(self, name): 68 | return self.collection.remove({'name': name}) 69 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:18:36 7 | 8 | import json 9 | import time 10 | 11 | from pymongo import MongoClient 12 | 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from .mongodbbase import SplitTableMixin 15 | 16 | 17 | class ResultDB(SplitTableMixin, BaseResultDB): 18 
| collection_prefix = '' 19 | 20 | def __init__(self, url, database='resultdb'): 21 | self.conn = MongoClient(url) 22 | self.conn.admin.command("ismaster") 23 | self.database = self.conn[database] 24 | self.projects = set() 25 | 26 | self._list_project() 27 | # we suggest manually build index in advance, instead of indexing 28 | # in the startup process, 29 | # for project in self.projects: 30 | # collection_name = self._collection_name(project) 31 | # self.database[collection_name].ensure_index('taskid') 32 | pass 33 | 34 | def _create_project(self, project): 35 | collection_name = self._collection_name(project) 36 | self.database[collection_name].ensure_index('taskid') 37 | self._list_project() 38 | 39 | def _parse(self, data): 40 | data['_id'] = str(data['_id']) 41 | if 'result' in data: 42 | data['result'] = json.loads(data['result']) 43 | return data 44 | 45 | def _stringify(self, data): 46 | if 'result' in data: 47 | data['result'] = json.dumps(data['result']) 48 | return data 49 | 50 | def save(self, project, taskid, url, result): 51 | if project not in self.projects: 52 | self._create_project(project) 53 | collection_name = self._collection_name(project) 54 | obj = { 55 | 'taskid' : taskid, 56 | 'url' : url, 57 | 'result' : result, 58 | 'updatetime': time.time(), 59 | } 60 | return self.database[collection_name].update( 61 | {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True 62 | ) 63 | 64 | def select(self, project, fields=None, offset=0, limit=0): 65 | if project not in self.projects: 66 | self._list_project() 67 | if project not in self.projects: 68 | return 69 | offset = offset or 0 70 | limit = limit or 0 71 | collection_name = self._collection_name(project) 72 | for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit): 73 | yield self._parse(result) 74 | 75 | def count(self, project): 76 | if project not in self.projects: 77 | self._list_project() 78 | if project not in self.projects: 79 | return 80 | collection_name = self._collection_name(project) 81 | return self.database[collection_name].count() 82 | 83 | def get(self, project, taskid, fields=None): 84 | if project not in self.projects: 85 | self._list_project() 86 | if project not in self.projects: 87 | return 88 | collection_name = self._collection_name(project) 89 | ret = self.database[collection_name].find_one({'taskid': taskid}, fields) 90 | if not ret: 91 | return ret 92 | return self._parse(ret) 93 | -------------------------------------------------------------------------------- /pyspider/database/mysql/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 20:12:54 7 | -------------------------------------------------------------------------------- /pyspider/database/mysql/mysqlbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-05 10:42:24 7 | 8 | import time 9 | import mysql.connector 10 | 11 | 12 | class MySQLMixin(object): 13 | maxlimit = 18446744073709551615 14 | 15 | @property 16 | def dbcur(self): 17 | try: 18 | if self.conn.unread_result: 19 | self.conn.get_rows() 20 | if hasattr(self.conn, 'free_result'): 21 | self.conn.free_result() 22 
| return self.conn.cursor() 23 | except (mysql.connector.OperationalError, mysql.connector.InterfaceError): 24 | self.conn.ping(reconnect=True) 25 | self.conn.database = self.database_name 26 | return self.conn.cursor() 27 | 28 | 29 | class SplitTableMixin(object): 30 | UPDATE_PROJECTS_TIME = 10 * 60 31 | 32 | def _tablename(self, project): 33 | if self.__tablename__: 34 | return '%s_%s' % (self.__tablename__, project) 35 | else: 36 | return project 37 | 38 | @property 39 | def projects(self): 40 | if time.time() - getattr(self, '_last_update_projects', 0) \ 41 | > self.UPDATE_PROJECTS_TIME: 42 | self._list_project() 43 | return self._projects 44 | 45 | @projects.setter 46 | def projects(self, value): 47 | self._projects = value 48 | 49 | def _list_project(self): 50 | self._last_update_projects = time.time() 51 | self.projects = set() 52 | if self.__tablename__: 53 | prefix = '%s_' % self.__tablename__ 54 | else: 55 | prefix = '' 56 | for project, in self._execute('show tables;'): 57 | if project.startswith(prefix): 58 | project = project[len(prefix):] 59 | self.projects.add(project) 60 | 61 | def drop(self, project): 62 | if project not in self.projects: 63 | self._list_project() 64 | if project not in self.projects: 65 | return 66 | tablename = self._tablename(project) 67 | self._execute("DROP TABLE %s" % self.escape(tablename)) 68 | self._list_project() 69 | -------------------------------------------------------------------------------- /pyspider/database/mysql/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 21:06:43 7 | 8 | import time 9 | import mysql.connector 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | from .mysqlbase import MySQLMixin 14 | 15 | 16 | class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB): 17 | __tablename__ = 'projectdb' 18 | 19 | def __init__(self, host='localhost', port=3306, database='projectdb', 20 | user='root', passwd=None): 21 | self.database_name = database 22 | self.conn = mysql.connector.connect(user=user, password=passwd, 23 | host=host, port=port, autocommit=True) 24 | if database not in [x[0] for x in self._execute('show databases')]: 25 | self._execute('CREATE DATABASE %s' % self.escape(database)) 26 | self.conn.database = database 27 | 28 | self._execute('''CREATE TABLE IF NOT EXISTS %s ( 29 | `name` varchar(64) PRIMARY KEY, 30 | `group` varchar(64), 31 | `status` varchar(16), 32 | `script` TEXT, 33 | `comments` varchar(1024), 34 | `rate` float(11, 4), 35 | `burst` float(11, 4), 36 | `updatetime` double(16, 4) 37 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__)) 38 | 39 | def insert(self, name, obj={}): 40 | obj = dict(obj) 41 | obj['name'] = name 42 | obj['updatetime'] = time.time() 43 | return self._insert(**obj) 44 | 45 | def update(self, name, obj={}, **kwargs): 46 | obj = dict(obj) 47 | obj.update(kwargs) 48 | obj['updatetime'] = time.time() 49 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 50 | return ret.rowcount 51 | 52 | def get_all(self, fields=None): 53 | return self._select2dic(what=fields) 54 | 55 | def get(self, name, fields=None): 56 | where = "`name` = %s" % self.placeholder 57 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 58 | 
return each 59 | return None 60 | 61 | def drop(self, name): 62 | where = "`name` = %s" % self.placeholder 63 | return self._delete(where=where, where_values=(name, )) 64 | 65 | def check_update(self, timestamp, fields=None): 66 | where = "`updatetime` >= %f" % timestamp 67 | return self._select2dic(what=fields, where=where) 68 | -------------------------------------------------------------------------------- /pyspider/database/mysql/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:02:57 7 | 8 | import re 9 | import six 10 | import time 11 | import json 12 | import mysql.connector 13 | 14 | from pyspider.libs import utils 15 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 16 | from pyspider.database.basedb import BaseDB 17 | from .mysqlbase import MySQLMixin, SplitTableMixin 18 | 19 | 20 | class ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB): 21 | __tablename__ = '' 22 | 23 | def __init__(self, host='localhost', port=3306, database='resultdb', 24 | user='root', passwd=None): 25 | self.database_name = database 26 | self.conn = mysql.connector.connect(user=user, password=passwd, 27 | host=host, port=port, autocommit=True) 28 | if database not in [x[0] for x in self._execute('show databases')]: 29 | self._execute('CREATE DATABASE %s' % self.escape(database)) 30 | self.conn.database = database 31 | self._list_project() 32 | 33 | def _create_project(self, project): 34 | assert re.match(r'^\w+$', project) is not None 35 | tablename = self._tablename(project) 36 | if tablename in [x[0] for x in self._execute('show tables')]: 37 | return 38 | self._execute('''CREATE TABLE %s ( 39 | `taskid` varchar(64) PRIMARY KEY, 40 | `url` varchar(1024), 41 | `result` MEDIUMBLOB, 42 | `updatetime` double(16, 4) 43 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename)) 44 | 45 | def _parse(self, data): 46 | for key, value in list(six.iteritems(data)): 47 | if isinstance(value, (bytearray, six.binary_type)): 48 | data[key] = utils.text(value) 49 | if 'result' in data: 50 | data['result'] = json.loads(data['result']) 51 | return data 52 | 53 | def _stringify(self, data): 54 | if 'result' in data: 55 | data['result'] = json.dumps(data['result']) 56 | return data 57 | 58 | def save(self, project, taskid, url, result): 59 | tablename = self._tablename(project) 60 | if project not in self.projects: 61 | self._create_project(project) 62 | self._list_project() 63 | obj = { 64 | 'taskid': taskid, 65 | 'url': url, 66 | 'result': result, 67 | 'updatetime': time.time(), 68 | } 69 | return self._replace(tablename, **self._stringify(obj)) 70 | 71 | def select(self, project, fields=None, offset=0, limit=None): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 76 | tablename = self._tablename(project) 77 | 78 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 79 | offset=offset, limit=limit): 80 | yield self._parse(task) 81 | 82 | def count(self, project): 83 | if project not in self.projects: 84 | self._list_project() 85 | if project not in self.projects: 86 | return 0 87 | tablename = self._tablename(project) 88 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 89 | return count 90 | 91 | def get(self, project, taskid, fields=None): 92 | 
if project not in self.projects: 93 | self._list_project() 94 | if project not in self.projects: 95 | return 96 | tablename = self._tablename(project) 97 | where = "`taskid` = %s" % self.placeholder 98 | for task in self._select2dic(tablename, what=fields, 99 | where=where, where_values=(taskid, )): 100 | return self._parse(task) 101 | -------------------------------------------------------------------------------- /pyspider/database/redis/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-17 01:34:21 7 | 8 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 20:11:04 7 | 8 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 23:25:10 7 | 8 | import six 9 | import time 10 | import sqlalchemy.exc 11 | 12 | from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text 13 | from sqlalchemy.engine.url import make_url 14 | from pyspider.libs import utils 15 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 16 | from .sqlalchemybase import result2dict 17 | 18 | 19 | class ProjectDB(BaseProjectDB): 20 | __tablename__ = 'projectdb' 21 | 22 | def __init__(self, url): 23 | self.table = Table(self.__tablename__, MetaData(), 24 | Column('name', String(64), primary_key=True), 25 | Column('group', String(64)), 26 | Column('status', String(16)), 27 | Column('script', Text), 28 | Column('comments', String(1024)), 29 | Column('rate', Float(11)), 30 | Column('burst', Float(11)), 31 | Column('updatetime', Float(32)), 32 | mysql_engine='InnoDB', 33 | mysql_charset='utf8' 34 | ) 35 | 36 | self.url = make_url(url) 37 | if self.url.database: 38 | database = self.url.database 39 | self.url.database = None 40 | try: 41 | engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600) 42 | conn = engine.connect() 43 | conn.execute("commit") 44 | conn.execute("CREATE DATABASE %s" % database) 45 | except sqlalchemy.exc.SQLAlchemyError: 46 | pass 47 | self.url.database = database 48 | self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600) 49 | self.table.create(self.engine, checkfirst=True) 50 | 51 | @staticmethod 52 | def _parse(data): 53 | return data 54 | 55 | @staticmethod 56 | def _stringify(data): 57 | return data 58 | 59 | def insert(self, name, obj={}): 60 | obj = dict(obj) 61 | obj['name'] = name 62 | obj['updatetime'] = time.time() 63 | return self.engine.execute(self.table.insert() 64 | .values(**self._stringify(obj))) 65 | 66 | def update(self, name, obj={}, **kwargs): 67 | obj = dict(obj) 68 | obj.update(kwargs) 69 | obj['updatetime'] = time.time() 70 | return self.engine.execute(self.table.update() 71 | .where(self.table.c.name == name) 72 | 
.values(**self._stringify(obj))) 73 | 74 | def get_all(self, fields=None): 75 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 76 | for task in self.engine.execute(self.table.select() 77 | .with_only_columns(columns)): 78 | yield self._parse(result2dict(columns, task)) 79 | 80 | def get(self, name, fields=None): 81 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 82 | for task in self.engine.execute(self.table.select() 83 | .where(self.table.c.name == name) 84 | .limit(1) 85 | .with_only_columns(columns)): 86 | return self._parse(result2dict(columns, task)) 87 | 88 | def drop(self, name): 89 | return self.engine.execute(self.table.delete() 90 | .where(self.table.c.name == name)) 91 | 92 | def check_update(self, timestamp, fields=None): 93 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 94 | for task in self.engine.execute(self.table.select() 95 | .with_only_columns(columns) 96 | .where(self.table.c.updatetime >= timestamp)): 97 | yield self._parse(result2dict(columns, task)) 98 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/sqlalchemybase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 18:48:47 7 | 8 | import time 9 | 10 | 11 | def result2dict(columns, task): 12 | return dict(task) 13 | 14 | 15 | class SplitTableMixin(object): 16 | UPDATE_PROJECTS_TIME = 10 * 60 17 | 18 | def _tablename(self, project): 19 | if self.__tablename__: 20 | return '%s_%s' % (self.__tablename__, project) 21 | else: 22 | return project 23 | 24 | @property 25 | def projects(self): 26 | if time.time() - getattr(self, '_last_update_projects', 0) \ 27 | > self.UPDATE_PROJECTS_TIME: 28 | self._list_project() 29 | return self._projects 30 | 31 | @projects.setter 32 | def projects(self, value): 33 | self._projects = value 34 | 35 | def _list_project(self): 36 | self._last_update_projects = time.time() 37 | self.projects = set() 38 | if self.__tablename__: 39 | prefix = '%s_' % self.__tablename__ 40 | else: 41 | prefix = '' 42 | 43 | for project in self.engine.table_names(): 44 | if project.startswith(prefix): 45 | project = project[len(prefix):] 46 | self.projects.add(project) 47 | 48 | def drop(self, project): 49 | if project not in self.projects: 50 | self._list_project() 51 | if project not in self.projects: 52 | return 53 | self.table.name = self._tablename(project) 54 | self.table.drop(self.engine) 55 | self._list_project() 56 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/sqlite/__init__.py -------------------------------------------------------------------------------- /pyspider/database/sqlite/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 12:05:52 7 | 8 | import time 9 | 10 | from .sqlitebase import SQLiteMixin 11 | from 
pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | 14 | 15 | class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): 16 | __tablename__ = 'projectdb' 17 | placeholder = '?' 18 | 19 | def __init__(self, path): 20 | self.path = path 21 | self.last_pid = 0 22 | self.conn = None 23 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 24 | name PRIMARY KEY, 25 | `group`, 26 | status, script, comments, 27 | rate, burst, updatetime 28 | )''' % self.__tablename__) 29 | 30 | def insert(self, name, obj={}): 31 | obj = dict(obj) 32 | obj['name'] = name 33 | obj['updatetime'] = time.time() 34 | return self._insert(**obj) 35 | 36 | def update(self, name, obj={}, **kwargs): 37 | obj = dict(obj) 38 | obj.update(kwargs) 39 | obj['updatetime'] = time.time() 40 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 41 | return ret.rowcount 42 | 43 | def get_all(self, fields=None): 44 | return self._select2dic(what=fields) 45 | 46 | def get(self, name, fields=None): 47 | where = "`name` = %s" % self.placeholder 48 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 49 | return each 50 | return None 51 | 52 | def check_update(self, timestamp, fields=None): 53 | where = "`updatetime` >= %f" % timestamp 54 | return self._select2dic(what=fields, where=where) 55 | 56 | def drop(self, name): 57 | where = "`name` = %s" % self.placeholder 58 | return self._delete(where=where, where_values=(name, )) 59 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 17:08:43 7 | 8 | import re 9 | import time 10 | import json 11 | 12 | from .sqlitebase import SQLiteMixin, SplitTableMixin 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from pyspider.database.basedb import BaseDB 15 | 16 | 17 | class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): 18 | __tablename__ = 'resultdb' 19 | placeholder = '?' 
20 | 21 | def __init__(self, path): 22 | self.path = path 23 | self.last_pid = 0 24 | self.conn = None 25 | self._list_project() 26 | 27 | def _create_project(self, project): 28 | assert re.match(r'^\w+$', project) is not None 29 | tablename = self._tablename(project) 30 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 31 | taskid PRIMARY KEY, 32 | url, 33 | result, 34 | updatetime 35 | )''' % tablename) 36 | 37 | def _parse(self, data): 38 | if 'result' in data: 39 | data['result'] = json.loads(data['result']) 40 | return data 41 | 42 | def _stringify(self, data): 43 | if 'result' in data: 44 | data['result'] = json.dumps(data['result']) 45 | return data 46 | 47 | def save(self, project, taskid, url, result): 48 | tablename = self._tablename(project) 49 | if project not in self.projects: 50 | self._create_project(project) 51 | self._list_project() 52 | obj = { 53 | 'taskid': taskid, 54 | 'url': url, 55 | 'result': result, 56 | 'updatetime': time.time(), 57 | } 58 | return self._replace(tablename, **self._stringify(obj)) 59 | 60 | def select(self, project, fields=None, offset=0, limit=None): 61 | if project not in self.projects: 62 | self._list_project() 63 | if project not in self.projects: 64 | return 65 | tablename = self._tablename(project) 66 | 67 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 68 | offset=offset, limit=limit): 69 | yield self._parse(task) 70 | 71 | def count(self, project): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 0 76 | tablename = self._tablename(project) 77 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 78 | return count 79 | 80 | def get(self, project, taskid, fields=None): 81 | if project not in self.projects: 82 | self._list_project() 83 | if project not in self.projects: 84 | return 85 | tablename = self._tablename(project) 86 | where = "`taskid` = %s" % self.placeholder 87 | for task in self._select2dic(tablename, what=fields, 88 | where=where, where_values=(taskid, )): 89 | return self._parse(task) 90 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/sqlitebase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:30:44 7 | 8 | import os 9 | import time 10 | import sqlite3 11 | import threading 12 | 13 | 14 | class SQLiteMixin(object): 15 | 16 | @property 17 | def dbcur(self): 18 | pid = (os.getpid(), threading.current_thread().ident) 19 | if not (self.conn and pid == self.last_pid): 20 | self.last_pid = pid 21 | self.conn = sqlite3.connect(self.path, isolation_level=None) 22 | return self.conn.cursor() 23 | 24 | 25 | class SplitTableMixin(object): 26 | UPDATE_PROJECTS_TIME = 10 * 60 27 | 28 | def _tablename(self, project): 29 | if self.__tablename__: 30 | return '%s_%s' % (self.__tablename__, project) 31 | else: 32 | return project 33 | 34 | @property 35 | def projects(self): 36 | if time.time() - getattr(self, '_last_update_projects', 0) \ 37 | > self.UPDATE_PROJECTS_TIME: 38 | self._list_project() 39 | return self._projects 40 | 41 | @projects.setter 42 | def projects(self, value): 43 | self._projects = value 44 | 45 | def _list_project(self): 46 | self._last_update_projects = time.time() 47 | self.projects = set() 48 | if self.__tablename__: 49 
| prefix = '%s_' % self.__tablename__ 50 | else: 51 | prefix = '' 52 | for project, in self._select('sqlite_master', what='name', 53 | where='type = "table"'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /pyspider/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .tornado_fetcher import Fetcher 2 | -------------------------------------------------------------------------------- /pyspider/fetcher/cookie_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-14 09:07:11 7 | 8 | from requests.cookies import MockRequest 9 | 10 | 11 | class MockResponse(object): 12 | 13 | def __init__(self, headers): 14 | self._headers = headers 15 | 16 | def info(self): 17 | return self 18 | 19 | def getheaders(self, name): 20 | """make cookie python 2 version use this method to get cookie list""" 21 | return self._headers.get_list(name) 22 | 23 | def get_all(self, name, default=None): 24 | """make cookie python 3 version use this instead of getheaders""" 25 | if default is None: 26 | default = [] 27 | return self._headers.get_list(name) or default 28 | 29 | 30 | def extract_cookies_to_jar(jar, request, response): 31 | req = MockRequest(request) 32 | res = MockResponse(response) 33 | jar.extract_cookies(res, req) 34 | -------------------------------------------------------------------------------- /pyspider/libs/ListIO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-26 23:41:51 7 | 8 | 9 | class ListO(object): 10 | 11 | """A StringO write to list.""" 12 | 13 | def __init__(self, buffer=None): 14 | self._buffer = buffer 15 | if self._buffer is None: 16 | self._buffer = [] 17 | 18 | def isatty(self): 19 | return False 20 | 21 | def close(self): 22 | pass 23 | 24 | def flush(self): 25 | pass 26 | 27 | def seek(self, n, mode=0): 28 | pass 29 | 30 | def readline(self): 31 | pass 32 | 33 | def reset(self): 34 | pass 35 | 36 | def write(self, x): 37 | self._buffer.append(x) 38 | 39 | def writelines(self, x): 40 | self._buffer.extend(x) 41 | -------------------------------------------------------------------------------- /pyspider/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/libs/__init__.py -------------------------------------------------------------------------------- /pyspider/libs/dataurl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-11-16 10:33:20 7 | 8 | 
import six 9 | from base64 import b64encode, b64decode 10 | from . import utils 11 | from six.moves.urllib.parse import quote, unquote 12 | 13 | 14 | def encode(data, mime_type='', charset='utf-8', base64=True): 15 | """ 16 | Encode data to DataURL 17 | """ 18 | if isinstance(data, six.text_type): 19 | data = data.encode(charset) 20 | else: 21 | charset = None 22 | if base64: 23 | data = utils.text(b64encode(data)) 24 | else: 25 | data = utils.text(quote(data)) 26 | 27 | result = ['data:', ] 28 | if mime_type: 29 | result.append(mime_type) 30 | if charset: 31 | result.append(';charset=') 32 | result.append(charset) 33 | if base64: 34 | result.append(';base64') 35 | result.append(',') 36 | result.append(data) 37 | 38 | return ''.join(result) 39 | 40 | 41 | def decode(data_url): 42 | """ 43 | Decode DataURL data 44 | """ 45 | metadata, data = data_url.rsplit(',', 1) 46 | _, metadata = metadata.split('data:', 1) 47 | parts = metadata.split(';') 48 | if parts[-1] == 'base64': 49 | data = b64decode(data) 50 | else: 51 | data = unquote(data) 52 | 53 | for part in parts: 54 | if part.startswith("charset="): 55 | data = data.decode(part[8:]) 56 | return data 57 | -------------------------------------------------------------------------------- /pyspider/libs/log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-10-24 16:08:17 7 | 8 | import logging 9 | 10 | try: 11 | import curses 12 | except ImportError: 13 | curses = None 14 | 15 | from tornado.log import LogFormatter as _LogFormatter 16 | 17 | 18 | class LogFormatter(_LogFormatter, object): 19 | """Init tornado.log.LogFormatter from logging.config.fileConfig""" 20 | def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): 21 | if fmt is None: 22 | fmt = _LogFormatter.DEFAULT_FORMAT 23 | super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) 24 | 25 | 26 | class SaveLogHandler(logging.Handler): 27 | """LogHandler that save records to a list""" 28 | 29 | def __init__(self, saveto=None, *args, **kwargs): 30 | self.saveto = saveto 31 | logging.Handler.__init__(self, *args, **kwargs) 32 | 33 | def emit(self, record): 34 | if self.saveto is not None: 35 | self.saveto.append(record) 36 | 37 | handle = emit 38 | 39 | 40 | def enable_pretty_logging(logger=logging.getLogger()): 41 | channel = logging.StreamHandler() 42 | channel.setFormatter(LogFormatter()) 43 | logger.addHandler(channel) 44 | -------------------------------------------------------------------------------- /pyspider/libs/multiprocessing_queue.py: -------------------------------------------------------------------------------- 1 | import six 2 | import platform 3 | import multiprocessing 4 | from multiprocessing.queues import Queue as BaseQueue 5 | 6 | 7 | # The SharedCounter and Queue classes come from: 8 | # https://github.com/vterron/lemon/commit/9ca6b4b 9 | 10 | class SharedCounter(object): 11 | """ A synchronized shared counter. 12 | The locking done by multiprocessing.Value ensures that only a single 13 | process or thread may read or write the in-memory ctypes object. However, 14 | in order to do n += 1, Python performs a read followed by a write, so a 15 | second process may read the old value before the new one is written by the 16 | first process. 
The solution is to use a multiprocessing.Lock to guarantee 17 | the atomicity of the modifications to Value. 18 | This class comes almost entirely from Eli Bendersky's blog: 19 | http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/ 20 | """ 21 | 22 | def __init__(self, n=0): 23 | self.count = multiprocessing.Value('i', n) 24 | 25 | def increment(self, n=1): 26 | """ Increment the counter by n (default = 1) """ 27 | with self.count.get_lock(): 28 | self.count.value += n 29 | 30 | @property 31 | def value(self): 32 | """ Return the value of the counter """ 33 | return self.count.value 34 | 35 | 36 | class MultiProcessingQueue(BaseQueue): 37 | """ A portable implementation of multiprocessing.Queue. 38 | Because of multithreading / multiprocessing semantics, Queue.qsize() may 39 | raise the NotImplementedError exception on Unix platforms like Mac OS X 40 | where sem_getvalue() is not implemented. This subclass addresses this 41 | problem by using a synchronized shared counter (initialized to zero) and 42 | increasing / decreasing its value every time the put() and get() methods 43 | are called, respectively. This not only prevents NotImplementedError from 44 | being raised, but also allows us to implement a reliable version of both 45 | qsize() and empty(). 46 | """ 47 | def __init__(self, *args, **kwargs): 48 | super(MultiProcessingQueue, self).__init__(*args, **kwargs) 49 | self.size = SharedCounter(0) 50 | 51 | def put(self, *args, **kwargs): 52 | self.size.increment(1) 53 | super(MultiProcessingQueue, self).put(*args, **kwargs) 54 | 55 | def get(self, *args, **kwargs): 56 | v = super(MultiProcessingQueue, self).get(*args, **kwargs) 57 | self.size.increment(-1) 58 | return v 59 | 60 | def qsize(self): 61 | """ Reliable implementation of multiprocessing.Queue.qsize() """ 62 | return self.size.value 63 | 64 | 65 | if platform.system() == 'Darwin': 66 | if hasattr(multiprocessing, 'get_context'): # for py34 67 | def Queue(maxsize=0): 68 | return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context()) 69 | else: 70 | def Queue(maxsize=0): 71 | return MultiProcessingQueue(maxsize) 72 | else: 73 | from multiprocessing import Queue # flake8: noqa 74 | -------------------------------------------------------------------------------- /pyspider/libs/sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('__START_URL__', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /pyspider/libs/wsgi_xmlrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2006-2007 Open Source Applications Foundation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Origin: https://code.google.com/p/wsgi-xmlrpc/ 16 | 17 | 18 | from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class WSGIXMLRPCApplication(object): 25 | """Application to handle requests to the XMLRPC service""" 26 | 27 | def __init__(self, instance=None, methods=None): 28 | """Create windmill xmlrpc dispatcher""" 29 | if methods is None: 30 | methods = [] 31 | try: 32 | self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None) 33 | except TypeError: 34 | # python 2.4 35 | self.dispatcher = SimpleXMLRPCDispatcher() 36 | if instance is not None: 37 | self.dispatcher.register_instance(instance) 38 | for method in methods: 39 | self.dispatcher.register_function(method) 40 | self.dispatcher.register_introspection_functions() 41 | 42 | def register_instance(self, instance): 43 | return self.dispatcher.register_instance(instance) 44 | 45 | def register_function(self, function, name=None): 46 | return self.dispatcher.register_function(function, name) 47 | 48 | def handler(self, environ, start_response): 49 | """XMLRPC service for windmill browser core to communicate with""" 50 | 51 | if environ['REQUEST_METHOD'] == 'POST': 52 | return self.handle_POST(environ, start_response) 53 | else: 54 | start_response("400 Bad request", [('Content-Type', 'text/plain')]) 55 | return [''] 56 | 57 | def handle_POST(self, environ, start_response): 58 | """Handles the HTTP POST request. 59 | 60 | Attempts to interpret all HTTP POST requests as XML-RPC calls, 61 | which are forwarded to the server's _dispatch method for handling. 62 | 63 | Most code taken from SimpleXMLRPCServer with modifications for wsgi and my custom dispatcher. 64 | """ 65 | 66 | try: 67 | # Get arguments by reading body of request. 68 | # We read this in chunks to avoid straining 69 | # socket.read(); around the 10 or 15Mb mark, some platforms 70 | # begin to have problems (bug #792570). 71 | 72 | length = int(environ['CONTENT_LENGTH']) 73 | data = environ['wsgi.input'].read(length) 74 | 75 | # In previous versions of SimpleXMLRPCServer, _dispatch 76 | # could be overridden in this class, instead of in 77 | # SimpleXMLRPCDispatcher. To maintain backwards compatibility, 78 | # check to see if a subclass implements _dispatch and 79 | # using that method if present. 
80 | response = self.dispatcher._marshaled_dispatch( 81 | data, getattr(self.dispatcher, '_dispatch', None) 82 | ) 83 | response += b'\n' 84 | except Exception as e: # This should only happen if the module is buggy 85 | # internal error, report as HTTP server error 86 | logger.exception(e) 87 | start_response("500 Server error", [('Content-Type', 'text/plain')]) 88 | return [] 89 | else: 90 | # got a valid XML RPC response 91 | start_response("200 OK", [('Content-Type', 'text/xml'), ('Content-Length', str(len(response)),)]) 92 | return [response] 93 | 94 | def __call__(self, environ, start_response): 95 | return self.handler(environ, start_response) 96 | -------------------------------------------------------------------------------- /pyspider/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,scheduler,fetcher,processor,webui,bench,werkzeug 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=screen 7 | 8 | [logger_scheduler] 9 | level=INFO 10 | handlers=screen 11 | qualname=scheduler 12 | propagate=0 13 | 14 | [logger_fetcher] 15 | level=DEBUG 16 | handlers=screen 17 | qualname=fetcher 18 | propagate=0 19 | 20 | [logger_processor] 21 | level=DEBUG 22 | handlers=screen 23 | qualname=processor 24 | propagate=0 25 | 26 | [logger_webui] 27 | level=DEBUG 28 | handlers=screen 29 | qualname=webui 30 | propagate=0 31 | 32 | [logger_bench] 33 | level=DEBUG 34 | handlers=screen 35 | qualname=bench 36 | propagate=0 37 | 38 | [logger_werkzeug] 39 | level=INFO 40 | handlers=screen 41 | qualname=werkzeug 42 | propagate=0 43 | 44 | [handlers] 45 | keys=screen 46 | 47 | [handler_screen] 48 | class=logging.StreamHandler 49 | formatter=pretty 50 | level=DEBUG 51 | args=(sys.stderr, ) 52 | 53 | [formatters] 54 | keys=pretty 55 | 56 | [formatter_pretty] 57 | class=pyspider.libs.log.LogFormatter 58 | -------------------------------------------------------------------------------- /pyspider/message_queue/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-30 21:47:08 7 | 8 | import logging 9 | 10 | try: 11 | from urllib import parse as urlparse 12 | except ImportError: 13 | import urlparse 14 | 15 | 16 | def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): 17 | """ 18 | create connection to message queue 19 | 20 | name: 21 | name of message queue 22 | 23 | rabbitmq: 24 | amqp://username:password@host:5672/%2F 25 | see https://www.rabbitmq.com/uri-spec.html 26 | redis: 27 | redis://host:6379/db 28 | redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode) 29 | kombu: 30 | kombu+transport://userid:password@hostname:port/virtual_host 31 | see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 32 | builtin: 33 | None 34 | """ 35 | 36 | if not url: 37 | from pyspider.libs.multiprocessing_queue import Queue 38 | return Queue(maxsize=maxsize) 39 | 40 | parsed = urlparse.urlparse(url) 41 | if parsed.scheme == 'amqp': 42 | from .rabbitmq import Queue 43 | return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) 44 | elif parsed.scheme == 'redis': 45 | from .redis_queue import Queue 46 | if ',' in parsed.netloc: 47 | """ 48 | redis in cluster mode (there is no concept of 'db' in cluster mode) 49 | ex. 
redis://host1:port1,host2:port2,...,hostn:portn 50 | """ 51 | cluster_nodes = [] 52 | for netloc in parsed.netloc.split(','): 53 | cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])}) 54 | 55 | return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes) 56 | 57 | else: 58 | db = parsed.path.lstrip('/').split('/') 59 | try: 60 | db = int(db[0]) 61 | except: 62 | logging.warning('redis DB must zero-based numeric index, using 0 instead') 63 | db = 0 64 | 65 | password = parsed.password or None 66 | 67 | return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit) 68 | elif url.startswith('kombu+'): 69 | url = url[len('kombu+'):] 70 | from .kombu_queue import Queue 71 | return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) 72 | else: 73 | raise Exception('unknown connection url: %s', url) 74 | -------------------------------------------------------------------------------- /pyspider/message_queue/kombu_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-22 20:54:01 7 | 8 | import time 9 | import umsgpack 10 | from kombu import Connection, enable_insecure_serializers 11 | from kombu.serialization import register 12 | from kombu.exceptions import ChannelError 13 | from six.moves import queue as BaseQueue 14 | 15 | 16 | register('umsgpack', umsgpack.packb, umsgpack.unpackb, 'application/x-msgpack') 17 | enable_insecure_serializers(['umsgpack']) 18 | 19 | 20 | class KombuQueue(object): 21 | """ 22 | kombu is a high-level interface for multiple message queue backends. 23 | 24 | KombuQueue is built on top of kombu API. 25 | """ 26 | 27 | Empty = BaseQueue.Empty 28 | Full = BaseQueue.Full 29 | max_timeout = 0.3 30 | 31 | def __init__(self, name, url="amqp://", maxsize=0, lazy_limit=True): 32 | """ 33 | Constructor for KombuQueue 34 | 35 | url: http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 36 | maxsize: an integer that sets the upperbound limit on the number of 37 | items that can be placed in the queue. 
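        A minimal usage sketch (the queue name and broker URL below are
        illustrative assumptions, and a reachable AMQP broker is required):

            q = KombuQueue('demo_queue', url='amqp://guest:guest@localhost:5672/%2F')
            q.put({'taskid': 'abc'})
            q.get()   # -> {'taskid': 'abc'}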
38 | """ 39 | self.name = name 40 | self.conn = Connection(url) 41 | self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack') 42 | 43 | self.maxsize = maxsize 44 | self.lazy_limit = lazy_limit 45 | if self.lazy_limit and self.maxsize: 46 | self.qsize_diff_limit = int(self.maxsize * 0.1) 47 | else: 48 | self.qsize_diff_limit = 0 49 | self.qsize_diff = 0 50 | 51 | def qsize(self): 52 | try: 53 | return self.queue.qsize() 54 | except ChannelError: 55 | return 0 56 | 57 | def empty(self): 58 | if self.qsize() == 0: 59 | return True 60 | else: 61 | return False 62 | 63 | def full(self): 64 | if self.maxsize and self.qsize() >= self.maxsize: 65 | return True 66 | else: 67 | return False 68 | 69 | def put(self, obj, block=True, timeout=None): 70 | if not block: 71 | return self.put_nowait(obj) 72 | 73 | start_time = time.time() 74 | while True: 75 | try: 76 | return self.put_nowait(obj) 77 | except BaseQueue.Full: 78 | if timeout: 79 | lasted = time.time() - start_time 80 | if timeout > lasted: 81 | time.sleep(min(self.max_timeout, timeout - lasted)) 82 | else: 83 | raise 84 | else: 85 | time.sleep(self.max_timeout) 86 | 87 | def put_nowait(self, obj): 88 | if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit: 89 | pass 90 | elif self.full(): 91 | raise BaseQueue.Full 92 | else: 93 | self.qsize_diff = 0 94 | return self.queue.put(obj) 95 | 96 | def get(self, block=True, timeout=None): 97 | try: 98 | ret = self.queue.get(block, timeout) 99 | return ret.payload 100 | except self.queue.Empty: 101 | raise BaseQueue.Empty 102 | 103 | def get_nowait(self): 104 | try: 105 | ret = self.queue.get_nowait() 106 | return ret.payload 107 | except self.queue.Empty: 108 | raise BaseQueue.Empty 109 | 110 | def delete(self): 111 | self.queue.queue.delete() 112 | 113 | def __del__(self): 114 | self.queue.close() 115 | 116 | 117 | Queue = KombuQueue 118 | -------------------------------------------------------------------------------- /pyspider/message_queue/redis_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-27 22:48:04 7 | 8 | import time 9 | import redis 10 | import umsgpack 11 | from six.moves import queue as BaseQueue 12 | 13 | 14 | class RedisQueue(object): 15 | """ 16 | A Queue like message built over redis 17 | """ 18 | 19 | Empty = BaseQueue.Empty 20 | Full = BaseQueue.Full 21 | max_timeout = 0.3 22 | 23 | def __init__(self, name, host='localhost', port=6379, db=0, 24 | maxsize=0, lazy_limit=True, password=None, cluster_nodes=None): 25 | """ 26 | Constructor for RedisQueue 27 | 28 | maxsize: an integer that sets the upperbound limit on the number of 29 | items that can be placed in the queue. 30 | lazy_limit: redis queue is shared via instance, a lazy size limit is used 31 | for better performance. 
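        A minimal usage sketch (assumes a Redis server reachable on
        localhost:6379; the queue name is an illustrative placeholder):

            q = RedisQueue('demo_queue', host='localhost', port=6379)
            q.put({'taskid': 'abc'})
            q.get()   # -> {'taskid': 'abc'}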
32 | """ 33 | self.name = name 34 | if(cluster_nodes is not None): 35 | from rediscluster import StrictRedisCluster 36 | self.redis = StrictRedisCluster(startup_nodes=cluster_nodes) 37 | else: 38 | self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password) 39 | self.maxsize = maxsize 40 | self.lazy_limit = lazy_limit 41 | self.last_qsize = 0 42 | 43 | def qsize(self): 44 | self.last_qsize = self.redis.llen(self.name) 45 | return self.last_qsize 46 | 47 | def empty(self): 48 | if self.qsize() == 0: 49 | return True 50 | else: 51 | return False 52 | 53 | def full(self): 54 | if self.maxsize and self.qsize() >= self.maxsize: 55 | return True 56 | else: 57 | return False 58 | 59 | def put_nowait(self, obj): 60 | if self.lazy_limit and self.last_qsize < self.maxsize: 61 | pass 62 | elif self.full(): 63 | raise self.Full 64 | self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj)) 65 | return True 66 | 67 | def put(self, obj, block=True, timeout=None): 68 | if not block: 69 | return self.put_nowait(obj) 70 | 71 | start_time = time.time() 72 | while True: 73 | try: 74 | return self.put_nowait(obj) 75 | except self.Full: 76 | if timeout: 77 | lasted = time.time() - start_time 78 | if timeout > lasted: 79 | time.sleep(min(self.max_timeout, timeout - lasted)) 80 | else: 81 | raise 82 | else: 83 | time.sleep(self.max_timeout) 84 | 85 | def get_nowait(self): 86 | ret = self.redis.lpop(self.name) 87 | if ret is None: 88 | raise self.Empty 89 | return umsgpack.unpackb(ret) 90 | 91 | def get(self, block=True, timeout=None): 92 | if not block: 93 | return self.get_nowait() 94 | 95 | start_time = time.time() 96 | while True: 97 | try: 98 | return self.get_nowait() 99 | except self.Empty: 100 | if timeout: 101 | lasted = time.time() - start_time 102 | if timeout > lasted: 103 | time.sleep(min(self.max_timeout, timeout - lasted)) 104 | else: 105 | raise 106 | else: 107 | time.sleep(self.max_timeout) 108 | 109 | Queue = RedisQueue 110 | -------------------------------------------------------------------------------- /pyspider/processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .processor import ProcessorResult, Processor 2 | -------------------------------------------------------------------------------- /pyspider/result/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:10:19 7 | 8 | from .result_worker import ResultWorker, OneResultWorker 9 | -------------------------------------------------------------------------------- /pyspider/result/result_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 15:37:46 7 | 8 | import time 9 | import json 10 | import logging 11 | from six.moves import queue as Queue 12 | logger = logging.getLogger("result") 13 | 14 | 15 | class ResultWorker(object): 16 | 17 | """ 18 | do with result 19 | override this if needed. 
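    For example, a subclass can override on_result() to ship results somewhere
    other than resultdb (a sketch only; the subclass name and storage helper
    below are hypothetical):

        class CSVResultWorker(ResultWorker):
            def on_result(self, task, result):
                if not result:
                    return
                append_to_csv(task['url'], result)  # hypothetical helper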
20 | """ 21 | 22 | def __init__(self, resultdb, inqueue): 23 | self.resultdb = resultdb 24 | self.inqueue = inqueue 25 | self._quit = False 26 | 27 | def on_result(self, task, result): 28 | '''Called every result''' 29 | if not result: 30 | return 31 | if 'taskid' in task and 'project' in task and 'url' in task: 32 | logger.info('result %s:%s %s -> %.30r' % ( 33 | task['project'], task['taskid'], task['url'], result)) 34 | return self.resultdb.save( 35 | project=task['project'], 36 | taskid=task['taskid'], 37 | url=task['url'], 38 | result=result 39 | ) 40 | else: 41 | logger.warning('result UNKNOW -> %.30r' % result) 42 | return 43 | 44 | def quit(self): 45 | self._quit = True 46 | 47 | def run(self): 48 | '''Run loop''' 49 | logger.info("result_worker starting...") 50 | 51 | while not self._quit: 52 | try: 53 | task, result = self.inqueue.get(timeout=1) 54 | self.on_result(task, result) 55 | except Queue.Empty as e: 56 | continue 57 | except KeyboardInterrupt: 58 | break 59 | except AssertionError as e: 60 | logger.error(e) 61 | continue 62 | except Exception as e: 63 | logger.exception(e) 64 | continue 65 | 66 | logger.info("result_worker exiting...") 67 | 68 | 69 | class OneResultWorker(ResultWorker): 70 | '''Result Worker for one mode, write results to stdout''' 71 | def on_result(self, task, result): 72 | '''Called every result''' 73 | if not result: 74 | return 75 | if 'taskid' in task and 'project' in task and 'url' in task: 76 | logger.info('result %s:%s %s -> %.30r' % ( 77 | task['project'], task['taskid'], task['url'], result)) 78 | print(json.dumps({ 79 | 'taskid': task['taskid'], 80 | 'project': task['project'], 81 | 'url': task['url'], 82 | 'result': result, 83 | 'updatetime': time.time() 84 | })) 85 | else: 86 | logger.warning('result UNKNOW -> %.30r' % result) 87 | return 88 | -------------------------------------------------------------------------------- /pyspider/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler # NOQA 2 | -------------------------------------------------------------------------------- /pyspider/scheduler/token_bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-07 16:53:08 7 | 8 | import time 9 | try: 10 | import threading as _threading 11 | except ImportError: 12 | import dummy_threading as _threading 13 | 14 | 15 | class Bucket(object): 16 | 17 | ''' 18 | traffic flow control with token bucket 19 | ''' 20 | 21 | update_interval = 30 22 | 23 | def __init__(self, rate=1, burst=None): 24 | self.rate = float(rate) 25 | if burst is None: 26 | self.burst = float(rate) * 10 27 | else: 28 | self.burst = float(burst) 29 | self.mutex = _threading.Lock() 30 | self.bucket = self.burst 31 | self.last_update = time.time() 32 | 33 | def get(self): 34 | '''Get the number of tokens in bucket''' 35 | now = time.time() 36 | if self.bucket >= self.burst: 37 | self.last_update = now 38 | return self.bucket 39 | bucket = self.rate * (now - self.last_update) 40 | self.mutex.acquire() 41 | if bucket > 1: 42 | self.bucket += bucket 43 | if self.bucket > self.burst: 44 | self.bucket = self.burst 45 | self.last_update = now 46 | self.mutex.release() 47 | return self.bucket 48 | 49 | def set(self, value): 50 | '''Set number of tokens in 
bucket''' 51 | self.bucket = value 52 | 53 | def desc(self, value=1): 54 | '''Use value tokens''' 55 | self.bucket -= value 56 | -------------------------------------------------------------------------------- /pyspider/webui/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:20:40 7 | 8 | from . import app, index, debug, task, result, login 9 | -------------------------------------------------------------------------------- /pyspider/webui/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:17:13 7 | 8 | import os 9 | import sys 10 | import logging 11 | logger = logging.getLogger("webui") 12 | 13 | from six import reraise 14 | from six.moves import builtins 15 | from six.moves.urllib.parse import urljoin 16 | from flask import Flask 17 | from pyspider.fetcher import tornado_fetcher 18 | 19 | if os.name == 'nt': 20 | import mimetypes 21 | mimetypes.add_type("text/css", ".css", True) 22 | 23 | 24 | class QuitableFlask(Flask): 25 | """Add quit() method to Flask object""" 26 | 27 | @property 28 | def logger(self): 29 | return logger 30 | 31 | def run(self, host=None, port=None, debug=None, **options): 32 | import tornado.wsgi 33 | import tornado.ioloop 34 | import tornado.httpserver 35 | import tornado.web 36 | 37 | if host is None: 38 | host = '127.0.0.1' 39 | if port is None: 40 | server_name = self.config['SERVER_NAME'] 41 | if server_name and ':' in server_name: 42 | port = int(server_name.rsplit(':', 1)[1]) 43 | else: 44 | port = 5000 45 | if debug is not None: 46 | self.debug = bool(debug) 47 | 48 | hostname = host 49 | port = port 50 | application = self 51 | use_reloader = self.debug 52 | use_debugger = self.debug 53 | 54 | if use_debugger: 55 | from werkzeug.debug import DebuggedApplication 56 | application = DebuggedApplication(application, True) 57 | 58 | try: 59 | from .webdav import dav_app 60 | except ImportError as e: 61 | logger.warning('WebDav interface not enabled: %r', e) 62 | dav_app = None 63 | if dav_app: 64 | from werkzeug.wsgi import DispatcherMiddleware 65 | application = DispatcherMiddleware(application, { 66 | '/dav': dav_app 67 | }) 68 | 69 | container = tornado.wsgi.WSGIContainer(application) 70 | self.http_server = tornado.httpserver.HTTPServer(container) 71 | self.http_server.listen(port, hostname) 72 | if use_reloader: 73 | from tornado import autoreload 74 | autoreload.start() 75 | 76 | self.logger.info('webui running on %s:%s', hostname, port) 77 | self.ioloop = tornado.ioloop.IOLoop.current() 78 | self.ioloop.start() 79 | 80 | def quit(self): 81 | if hasattr(self, 'ioloop'): 82 | self.ioloop.add_callback(self.http_server.stop) 83 | self.ioloop.add_callback(self.ioloop.stop) 84 | self.logger.info('webui exiting...') 85 | 86 | 87 | app = QuitableFlask('webui', 88 | static_folder=os.path.join(os.path.dirname(__file__), 'static'), 89 | template_folder=os.path.join(os.path.dirname(__file__), 'templates')) 90 | app.secret_key = os.urandom(24) 91 | app.jinja_env.line_statement_prefix = '#' 92 | app.jinja_env.globals.update(builtins.__dict__) 93 | 94 | app.config.update({ 95 | 'fetch': lambda x: tornado_fetcher.Fetcher(None, None, 
async_mode=False).fetch(x), 96 | 'taskdb': None, 97 | 'projectdb': None, 98 | 'scheduler_rpc': None, 99 | 'queues': dict(), 100 | 'process_time_limit': 30, 101 | }) 102 | 103 | 104 | def cdn_url_handler(error, endpoint, kwargs): 105 | if endpoint == 'cdn': 106 | path = kwargs.pop('path') 107 | # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/') 108 | # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') 109 | cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') 110 | return urljoin(cdn, path) 111 | else: 112 | exc_type, exc_value, tb = sys.exc_info() 113 | if exc_value is error: 114 | reraise(exc_type, exc_value, tb) 115 | else: 116 | raise error 117 | app.handle_url_build_error = cdn_url_handler 118 | -------------------------------------------------------------------------------- /pyspider/webui/bench_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-08 22:31:17 7 | 8 | import random 9 | try: 10 | from urllib import urlencode 11 | except ImportError: 12 | from urllib.parse import urlencode 13 | 14 | from flask import request 15 | from .app import app 16 | 17 | 18 | @app.route('/bench') 19 | def bench_test(): 20 | total = int(request.args.get('total', 10000)) 21 | show = int(request.args.get('show', 20)) 22 | nlist = [random.randint(1, total) for _ in range(show)] 23 | result = [] 24 | result.append("") 25 | args = dict(request.args) 26 | for nl in nlist: 27 | args['n'] = nl 28 | argstr = urlencode(sorted(args.items()), doseq=True) 29 | result.append("follow {1}
".format(argstr, nl)) 30 | result.append("") 31 | return "".join(result) 32 | -------------------------------------------------------------------------------- /pyspider/webui/login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 20:36:27 7 | 8 | import base64 9 | from flask import Response 10 | try: 11 | import flask_login as login 12 | except ImportError: 13 | from flask.ext import login 14 | from .app import app 15 | 16 | login_manager = login.LoginManager() 17 | login_manager.init_app(app) 18 | 19 | 20 | class AnonymousUser(login.AnonymousUserMixin): 21 | 22 | def is_anonymous(self): 23 | return True 24 | 25 | def is_active(self): 26 | return False 27 | 28 | def is_authenticated(self): 29 | return False 30 | 31 | def get_id(self): 32 | return 33 | 34 | 35 | class User(login.UserMixin): 36 | 37 | def __init__(self, id, password): 38 | self.id = id 39 | self.password = password 40 | 41 | def is_authenticated(self): 42 | if not app.config.get('webui_username'): 43 | return True 44 | if self.id == app.config.get('webui_username') \ 45 | and self.password == app.config.get('webui_password'): 46 | return True 47 | return False 48 | 49 | def is_active(self): 50 | return self.is_authenticated() 51 | 52 | 53 | login_manager.anonymous_user = AnonymousUser 54 | 55 | 56 | @login_manager.request_loader 57 | def load_user_from_request(request): 58 | api_key = request.headers.get('Authorization') 59 | if api_key: 60 | api_key = api_key[len("Basic "):] 61 | try: 62 | api_key = base64.b64decode(api_key).decode('utf8') 63 | return User(*api_key.split(":", 1)) 64 | except Exception as e: 65 | app.logger.error('wrong api key: %r, %r', api_key, e) 66 | return None 67 | return None 68 | app.login_response = Response( 69 | "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} 70 | ) 71 | 72 | 73 | @app.before_request 74 | def before_request(): 75 | if app.config.get('need_auth', False): 76 | if not login.current_user.is_active(): 77 | return app.login_response 78 | -------------------------------------------------------------------------------- /pyspider/webui/result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:23:55 7 | 8 | from __future__ import unicode_literals 9 | 10 | from flask import render_template, request, json 11 | from flask import Response 12 | from .app import app 13 | from pyspider.libs import result_dump 14 | 15 | 16 | @app.route('/results') 17 | def result(): 18 | resultdb = app.config['resultdb'] 19 | project = request.args.get('project') 20 | offset = int(request.args.get('offset', 0)) 21 | limit = int(request.args.get('limit', 20)) 22 | 23 | count = resultdb.count(project) 24 | results = list(resultdb.select(project, offset=offset, limit=limit)) 25 | 26 | return render_template( 27 | "result.html", count=count, results=results, 28 | result_formater=result_dump.result_formater, 29 | project=project, offset=offset, limit=limit, json=json 30 | ) 31 | 32 | 33 | @app.route('/results/dump/.<_format>') 34 | def dump_result(project, _format): 35 | resultdb = app.config['resultdb'] 36 | # force update project list 37 | resultdb.get(project, 'any') 38 
| if project not in resultdb.projects: 39 | return "no such project.", 404 40 | 41 | offset = int(request.args.get('offset', 0)) or None 42 | limit = int(request.args.get('limit', 0)) or None 43 | results = resultdb.select(project, offset=offset, limit=limit) 44 | 45 | if _format == 'json': 46 | valid = request.args.get('style', 'rows') == 'full' 47 | return Response(result_dump.dump_as_json(results, valid), 48 | mimetype='application/json') 49 | elif _format == 'txt': 50 | return Response(result_dump.dump_as_txt(results), 51 | mimetype='text/plain') 52 | elif _format == 'csv': 53 | return Response(result_dump.dump_as_csv(results), 54 | mimetype='text/csv') 55 | -------------------------------------------------------------------------------- /pyspider/webui/static/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["es2015"] 3 | } 4 | -------------------------------------------------------------------------------- /pyspider/webui/static/css_selector_helper.min.js: -------------------------------------------------------------------------------- 1 | !function(e){function t(n){if(r[n])return r[n].exports;var a=r[n]={exports:{},id:n,loaded:!1};return e[n].call(a.exports,a,a.exports,t),a.loaded=!0,a.exports}var r={};return t.m=e,t.c=r,t.p="",t(0)}([function(e,t){"use strict";function r(e,t){function r(e,t){if(!e||!t)return!1;if(e.length!=t.length)return!1;for(var r=0,n=e.length;r=0&&a>t))if(e.invalid)n=null;else if(e.selected){n&&(r+=" >");var o="";e.features.forEach(function(e){e.selected&&(o+=e.pattern)}),""===o&&(o="*"),r+=" "+o,n=e}else n=null}),""===r&&(r="*"),r}function i(t){var n=[];do{var a=[];if(a.push({name:t.tagName.toLowerCase(),pattern:t.tagName.toLowerCase(),selected:!0}),t.getAttribute("id")&&a.push({name:"#"+t.getAttribute("id"),pattern:"#"+t.getAttribute("id"),selected:!0}),t.classList.length>0)for(var i=0;i1&&ispan{border:1px solid gray;padding:1px 5px 0;background:#999;color:#fff}.projects span.status-TODO{border:1px solid #ec971f;padding:1px 5px 0;background:#f0ad4e;color:#fff}.projects span.status-STOP{border:1px solid #c9302c;padding:1px 5px 0;background:#d9534f;color:#fff}.projects span.status-CHECKING{border:1px solid #dcbe00;padding:1px 5px 0;background:#ffde10;color:#fff}.projects span.status-DEBUG{border:1px solid #3071a9;padding:1px 5px 0;background:#428bca;color:#fff}.projects span.status-RUNNING{border:1px solid #449d44;padding:1px 5px 0;background:#5cb85c;color:#fff}.projects span.status-PAUSED{border:1px solid #3c3c3c;padding:1px 5px 0;background:#555;color:#fff}.projects .project-rate,.projects .project-time{width:110px}.projects th.project-progress{position:relative}.projects th.project-progress span{position:absolute}.projects td.project-progress{position:relative;min-width:5%}.projects td.project-progress.progress-all{min-width:10%}.projects td.project-progress .progress{position:relative;margin:0;background-color:#aaa}.projects td.project-progress .progress .progress-text{width:100%;text-align:center;position:absolute;font-weight:700;color:#fff;pointer-events:none}.projects td.project-progress .progress .progress-bar{-webkit-transition:none;transition:none}.projects .project-actions{width:200px}.global-btn{margin-top:-5px;padding:10px}.global-btn .create-btn-div{float:right}.global-btn .active-btn-div{float:left} 2 | /*# sourceMappingURL=index.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/index.min.js: 
-------------------------------------------------------------------------------- 1 | !function(t){function e(r){if(a[r])return a[r].exports;var n=a[r]={exports:{},id:r,loaded:!1};return t[r].call(n.exports,n,n.exports,e),n.loaded=!0,n.exports}var a={};return e.m=t,e.c=a,e.p="",e(0)}({0:function(t,e,a){"use strict";a(10),$(function(){function t(t){$(".project-group>span").editable({name:"group",pk:function(t){return $(this).parents("tr").data("name")},emptytext:"[group]",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name");t.projects[r].group=a,$(this).attr("style","")}}),$(".project-status>span").editable({type:"select",name:"status",source:[{value:"TODO",text:"TODO"},{value:"STOP",text:"STOP"},{value:"CHECKING",text:"CHECKING"},{value:"DEBUG",text:"DEBUG"},{value:"RUNNING",text:"RUNNING"}],pk:function(t){return $(this).parents("tr").data("name")},emptytext:"[status]",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name");t.projects[r].status=a,$(this).removeClass("status-"+$(this).attr("data-value")).addClass("status-"+a).attr("data-value",a).attr("style","")}}),$(".project-rate>span").editable({name:"rate",pk:function(t){return $(this).parents("tr").data("name")},validate:function(t){var e=t.split("/");return 2!=e.length?"format error: rate/burst":$.isNumeric(e[0])&&$.isNumeric(e[1])?void 0:"format error: rate/burst"},highlight:!1,emptytext:"0/0",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name"),n=a.split("/");t.projects[r].rate=parseFloat(n[0]),t.projects[r].burst=parseFloat(n[1]),$(this).attr("style","")}})}function e(){Sortable.getColumnType=function(t,e){var a=$($(t).find("th").get(e)).data("type");return"num"==a?Sortable.types.numeric:"date"==a?Sortable.types.date:Sortable.types.alpha},$("table.projects").attr("data-sortable",!0),Sortable.init()}function a(){$.get("/counter",function(t){for(var e in t){var a=t[e];if(void 0!==s.projects[e]){var r="5m,1h,1d,all".split(","),n=!0,o=!1,i=void 0;try{for(var u,c=r[Symbol.iterator]();!(n=(u=c.next()).done);n=!0){var l=u.value,p=a[l];if(void 0!==p){var d=p.pending||0,f=p.success||0,m=p.retry||0,v=p.failed||0,h=p.task||d+f+m+v;p.task=h,p.title=""+l+" of "+h+" tasks:\n"+("all"==l?"pending("+(d/h*100).toFixed(1)+"%): \t"+d+"\n":"new("+(d/h*100).toFixed(1)+"%): \t\t"+d+"\n")+"success("+(f/h*100).toFixed(1)+"%): \t"+f+"\nretry("+(m/h*100).toFixed(1)+"%): \t"+m+"\nfailed("+(v/h*100).toFixed(1)+"%): \t"+v}}}catch($){o=!0,i=$}finally{try{!n&&c["return"]&&c["return"]()}finally{if(o)throw i}}s.projects[e].paused=a.paused,s.projects[e].time=a["5m_time"],s.projects[e].progress=a}}})}function r(){$.get("/queues",function(t){$(".queue_value").each(function(e,a){var r=$(a).attr("title");void 0!==t[r]?$(a).text(t[r]):$(a).text("???")})})}$("#create-project-modal form").on("submit",function(t){var e=$(this),a=e.find("[name=project-name]").val();return 0==a.length||a.search(/[^\w]/)!=-1?(e.find("[name=project-name]").parents(".form-group").addClass("has-error"),e.find("[name=project-name] ~ .help-block").show(),!1):(e.find("[name=script-mode]:checked").val(),e.attr("action","/debug/"+a),!0)});var n={};projects.forEach(function(t){t.paused=!1,t.time={},t.progress={},n[t.name]=t});var s=new 
Vue({el:".projects",data:{projects:n},ready:function(){t(this),e(this),a(),window.setInterval(a,15e3),r(),window.setInterval(r,15e3)},methods:{project_run:function(t,e){$("#need-set-status-alert").hide(),"RUNNING"!=t.status&&"DEBUG"!=t.status&&$("#need-set-status-alert").show();var a=e.target;$(a).addClass("btn-warning"),$.ajax({type:"POST",url:"/run",data:{project:t.name},success:function(t){$(a).removeClass("btn-warning"),t.result||$(a).addClass("btn-danger")},error:function(){$(a).removeClass("btn-warning").addClass("btn-danger")}})}}})})},10:function(t,e){}}); 2 | //# sourceMappingURL=index.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pyspider-webui", 3 | "version": "0.3.9", 4 | "description": "webui of pyspider", 5 | "scripts": { 6 | "build": "webpack --progress --colors --optimize-minimize", 7 | "dev": "webpack --progress --colors --optimize-minimize --watch" 8 | }, 9 | "keywords": [ 10 | "pyspider" 11 | ], 12 | "author": "binux", 13 | "license": "MIT", 14 | "devDependencies": { 15 | "babel-core": "^6.14.0", 16 | "babel-loader": "^6.2.5", 17 | "babel-preset-es2015": "^6.14.0", 18 | "css-loader": "^0.25.0", 19 | "extract-text-webpack-plugin": "^1.0.1", 20 | "less": "^2.7.1", 21 | "less-loader": "^2.2.3", 22 | "style-loader": "^0.13.1", 23 | "webpack": "^1.13.2" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pyspider/webui/static/result.min.css: -------------------------------------------------------------------------------- 1 | .top-bar{padding:10px 15px 2px;height:46px;background-color:#f5f5f5;border-bottom:1px solid #ddd;position:relative}.top-bar h1{margin:0 0 10px;font-size:18px}.top-bar .btn-group{margin:8px 10px 0 0;position:absolute;right:0;top:0}.pagination-wrap{text-align:right;padding-right:15px}table{border-bottom:1px solid #ddd}table td{word-break:break-all} 2 | /*# sourceMappingURL=result.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/result.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=result.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/src/index.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | 6 | @import "variable"; 7 | 8 | h1 { 9 | margin-top: 5px; 10 | } 11 | 12 | header .alert { 13 | position: absolute;; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | 19 | .queue-info { 20 | th, td { 21 | text-align: center; 22 | border: 1px solid #ddd; 23 | } 24 | } 25 | 26 | [v-cloak] { 27 | display: none; 28 | } 29 | 30 | .projects { 31 | min-width: 850px; 32 | border-top: 1px solid #ddd; 33 | border-bottom: 1px solid #ddd; 34 | 35 | .project-group { 36 | width: 80px; 37 | } 38 | 39 | .project-name { 40 | font-weight: bold; 41 | } 42 | 43 | .project-status { 44 | width: 100px; 45 | } 46 | 
.project-status-span(@color) { 47 | border: solid 1px darken(@color, 10%); 48 | padding: 1px 5px 0 5px; 49 | background: @color; 50 | color: white; 51 | } 52 | .project-status>span { 53 | .project-status-span(@gray-light); 54 | } 55 | span.status-TODO { 56 | .project-status-span(@orange); 57 | } 58 | span.status-STOP { 59 | .project-status-span(@red); 60 | } 61 | span.status-CHECKING { 62 | .project-status-span(darken(@yellow, 10%)); 63 | } 64 | span.status-DEBUG { 65 | .project-status-span(@blue); 66 | } 67 | span.status-RUNNING { 68 | .project-status-span(@green); 69 | } 70 | span.status-PAUSED { 71 | .project-status-span(@gray); 72 | } 73 | 74 | .project-rate { 75 | width: 110px; 76 | } 77 | 78 | .project-time { 79 | width: 110px; 80 | } 81 | 82 | th.project-progress { 83 | position: relative; 84 | span { 85 | position: absolute; 86 | } 87 | } 88 | 89 | td.project-progress { 90 | position: relative; 91 | min-width: 5%; 92 | &.progress-all { 93 | min-width: 10%; 94 | } 95 | 96 | .progress { 97 | position: relative; 98 | margin: 0; 99 | background-color: #aaa; 100 | .progress-text { 101 | width: 100%; 102 | text-align: center; 103 | position: absolute; 104 | font-weight: bold; 105 | color: #fff; 106 | pointer-events: none; 107 | } 108 | .progress-bar { 109 | -webkit-transition: none; 110 | transition: none; 111 | } 112 | } 113 | } 114 | 115 | .project-actions { 116 | width: 200px; 117 | } 118 | } 119 | 120 | .global-btn { 121 | margin-top: -5px; 122 | padding: 10px 10px 10px 10px; 123 | 124 | .create-btn-div { 125 | float: right; 126 | } 127 | 128 | .active-btn-div { 129 | float: left; 130 | } 131 | } 132 | 133 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/result.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | 6 | @import "variable"; 7 | 8 | .top-bar { 9 | padding: 10px 15px 2px 15px; 10 | height: 46px; 11 | background-color: #f5f5f5; 12 | border-bottom: 1px solid #ddd; 13 | position: relative; 14 | 15 | h1 { 16 | margin: 0 0 10px 0; 17 | font-size: 18px; 18 | } 19 | 20 | .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | 26 | a.btn { 27 | } 28 | } 29 | } 30 | 31 | .pagination-wrap { 32 | text-align: right; 33 | padding-right: 15px; 34 | } 35 | 36 | table { 37 | border-bottom: 1px solid #ddd; 38 | 39 | td { 40 | word-break: break-all; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/task.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | 6 | @import "variable"; 7 | 8 | .base-info { 9 | padding: 10px 15px 2px 15px; 10 | background-color: #f5f5f5; 11 | border-bottom: 1px solid #ddd; 12 | } 13 | 14 | .more-info { 15 | padding: 10px 15px; 16 | } 17 | 18 | .more-info dd { 19 | display: block; 20 | font-family: monospace; 21 | white-space: pre; 22 | word-break: break-all; 23 | word-wrap: break-word; 24 | margin: 1em 0px; 25 | } 26 | 27 | .status_mix(@color: lighten(black, 50%)) { 28 | border: solid 1px darken(@color, 10%); 29 | padding: 1px 5px 0 5px; 30 | background: @color; 31 | color: white; 32 | } 33 | .status { 34 | &-1 { 35 | 
.status_mix(@blue); 36 | } 37 | &-2 { 38 | .status_mix(@green); 39 | } 40 | &-3 { 41 | .status_mix(@red); 42 | } 43 | &-4 { 44 | .status_mix; 45 | } 46 | } 47 | 48 | .url { 49 | font-size: 120%; 50 | text-decoration: underline; 51 | } 52 | 53 | .callback { 54 | color: @orange; 55 | font-weight: bold; 56 | 57 | &:hover, &:focus { 58 | color: darken(@orange, 10%); 59 | } 60 | } 61 | 62 | dt .glyphicon-ok { 63 | color: @green; 64 | } 65 | dt .glyphicon-remove { 66 | color: @red; 67 | } 68 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/tasks.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | 6 | @import "variable"; 7 | @import "task"; 8 | 9 | .tasks { 10 | margin: 0; 11 | padding: 0; 12 | list-style-type: none; 13 | 14 | li { 15 | .base-info; 16 | 17 | &:nth-child(even) { 18 | background-color: white; 19 | } 20 | } 21 | 22 | .url { 23 | display: inline-block; 24 | vertical-align: bottom; 25 | max-width: 40em; 26 | overflow: hidden; 27 | white-space: nowrap; 28 | text-overflow: ellipsis; 29 | } 30 | 31 | .update-time { 32 | font-weight: bold; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/variable.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:18:30 */ 5 | 6 | // colors 7 | @gray-darker: lighten(#000, 13.5%); // #222 8 | @gray-dark: lighten(#000, 20%); // #333 9 | @gray: lighten(#000, 33.5%); // #555 10 | @gray-light: lighten(#000, 60%); // #999 11 | @gray-lighter: lighten(#000, 93.5%); // #eee 12 | 13 | @blue: #428bca; 14 | @green: #5cb85c; 15 | @blue-light: #5bc0de; 16 | @orange: #f0ad4e; 17 | @yellow: #ffe543; 18 | @red: #d9534f; 19 | -------------------------------------------------------------------------------- /pyspider/webui/static/task.min.css: -------------------------------------------------------------------------------- 1 | .base-info{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.more-info{padding:10px 15px}.more-info dd{display:block;font-family:monospace;white-space:pre;word-break:break-all;word-wrap:break-word;margin:1em 0}.status-1{border:1px solid #3071a9;background:#428bca}.status-1,.status-2{padding:1px 5px 0;color:#fff}.status-2{border:1px solid #449d44;background:#5cb85c}.status-3{border:1px solid #c9302c;background:#d9534f}.status-3,.status-4{padding:1px 5px 0;color:#fff}.status-4{border:1px solid #666;background:gray}.url{font-size:120%;text-decoration:underline}.callback{color:#f0ad4e;font-weight:700}.callback:focus,.callback:hover{color:#ec971f}dt .glyphicon-ok{color:#5cb85c}dt .glyphicon-remove{color:#d9534f} 2 | /*# sourceMappingURL=task.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/task.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=task.min.js.map 
-------------------------------------------------------------------------------- /pyspider/webui/static/tasks.min.css: -------------------------------------------------------------------------------- 1 | .base-info{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.more-info{padding:10px 15px}.more-info dd{display:block;font-family:monospace;white-space:pre;word-break:break-all;word-wrap:break-word;margin:1em 0}.status-1{border:1px solid #3071a9;background:#428bca}.status-1,.status-2{padding:1px 5px 0;color:#fff}.status-2{border:1px solid #449d44;background:#5cb85c}.status-3{border:1px solid #c9302c;background:#d9534f}.status-3,.status-4{padding:1px 5px 0;color:#fff}.status-4{border:1px solid #666;background:gray}.url{font-size:120%;text-decoration:underline}.callback{color:#f0ad4e;font-weight:700}.callback:focus,.callback:hover{color:#ec971f}dt .glyphicon-ok{color:#5cb85c}dt .glyphicon-remove{color:#d9534f}.tasks{margin:0;padding:0;list-style-type:none}.tasks li{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.tasks li:nth-child(even){background-color:#fff}.tasks .url{display:inline-block;vertical-align:bottom;max-width:40em;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.tasks .update-time{font-weight:700} 2 | /*# sourceMappingURL=tasks.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/tasks.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=tasks.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/webpack.config.js: -------------------------------------------------------------------------------- 1 | var webpack = require("webpack"); 2 | var ExtractTextPlugin = require("extract-text-webpack-plugin"); 3 | 4 | module.exports = { 5 | entry: { 6 | index: "./src/index", 7 | debug: "./src/debug", 8 | result: "./src/result.less", 9 | task: "./src/task.less", 10 | tasks: "./src/tasks.less", 11 | }, 12 | output: { 13 | //path: "./dist", 14 | filename: "[name].min.js" 15 | }, 16 | module: { 17 | loaders: [ 18 | { test: /\.js$/, loader: "babel-loader" }, 19 | { test: /\.less$/, loader: ExtractTextPlugin.extract("style-loader", "css-loader?sourceMap!less-loader?sourceMap") } 20 | ] 21 | }, 22 | devtool: 'source-map', 23 | plugins: [ 24 | new ExtractTextPlugin("[name].min.css"), 25 | new webpack.optimize.UglifyJsPlugin({ compress: { warnings: false } }), 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /pyspider/webui/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-16 15:30:57 7 | 8 | import socket 9 | from flask import abort, render_template, request, json 10 | 11 | from pyspider.libs import utils 12 | from .app import app 13 | 14 | 15 | @app.route('/task/<taskid>') 16 | def task(taskid): 17 | if ':' not in taskid: 18 | abort(400) 19 | project, taskid = taskid.split(':', 1) 20 | 21 | taskdb = app.config['taskdb'] 22 | task = taskdb.get_task(project, taskid) 23 | 24 | if
not task: 25 | abort(404) 26 | resultdb = app.config['resultdb'] 27 | result = {} 28 | if resultdb: 29 | result = resultdb.get(project, taskid) 30 | 31 | return render_template("task.html", task=task, json=json, result=result, 32 | status_to_string=app.config['taskdb'].status_to_string) 33 | 34 | 35 | @app.route('/task/<taskid>.json') 36 | def task_in_json(taskid): 37 | if ':' not in taskid: 38 | return json.jsonify({'code': 400, 'error': 'bad project:task_id format'}) 39 | project, taskid = taskid.split(':', 1) 40 | 41 | taskdb = app.config['taskdb'] 42 | task = taskdb.get_task(project, taskid) 43 | 44 | if not task: 45 | return json.jsonify({'code': 404, 'error': 'not found'}) 46 | task['status_string'] = app.config['taskdb'].status_to_string(task['status']) 47 | return json.jsonify(task) 48 | 49 | 50 | @app.route('/tasks') 51 | def tasks(): 52 | rpc = app.config['scheduler_rpc'] 53 | taskdb = app.config['taskdb'] 54 | project = request.args.get('project', "") 55 | limit = int(request.args.get('limit', 100)) 56 | 57 | try: 58 | updatetime_tasks = rpc.get_active_tasks(project, limit) 59 | except socket.error as e: 60 | app.logger.warning('connect to scheduler rpc error: %r', e) 61 | return 'connect to scheduler error', 502 62 | 63 | tasks = {} 64 | result = [] 65 | for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]): 66 | key = '%(project)s:%(taskid)s' % task 67 | task['updatetime'] = updatetime 68 | if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE: 69 | result.append(tasks[key]) 70 | tasks[key] = task 71 | result.extend(tasks.values()) 72 | 73 | return render_template( 74 | "tasks.html", 75 | tasks=result, 76 | status_to_string=taskdb.status_to_string 77 | ) 78 | 79 | 80 | @app.route('/active_tasks') 81 | def active_tasks(): 82 | rpc = app.config['scheduler_rpc'] 83 | taskdb = app.config['taskdb'] 84 | project = request.args.get('project', "") 85 | limit = int(request.args.get('limit', 100)) 86 | 87 | try: 88 | tasks = rpc.get_active_tasks(project, limit) 89 | except socket.error as e: 90 | app.logger.warning('connect to scheduler rpc error: %r', e) 91 | return '{}', 502, {'Content-Type': 'application/json'} 92 | 93 | result = [] 94 | for updatetime, task in tasks: 95 | task['updatetime'] = updatetime 96 | task['updatetime_text'] = utils.format_date(updatetime) 97 | if 'status' in task: 98 | task['status_text'] = taskdb.status_to_string(task['status']) 99 | result.append(task) 100 | 101 | return json.dumps(result), 200, {'Content-Type': 'application/json'} 102 | 103 | app.template_filter('format_date')(utils.format_date) 104 | -------------------------------------------------------------------------------- /pyspider/webui/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Results - {{ project }} - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 |

{{ project }} - Results

22 |
23 | 25 | 26 | JSON 27 | URL-JSON 29 | CSV 31 |
32 |
33 | # set common_fields, results = result_formater(results) 34 | 35 | 36 | 37 | 38 | # for field in common_fields|sort 39 | 42 | # endfor 43 | 46 | 47 | 48 | # for result in results 49 | 50 | 53 | 56 | # for field in common_fields|sort 57 | 58 | # endfor 59 | 62 | # endfor 63 | 64 |
url 40 | {{ field }} 41 | 44 | ... 45 |
51 | {{ result.url }} 52 | 54 | 55 | {{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }} 60 | {{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }} 61 |
65 | 66 |
67 |
    68 | # set current_page = int(offset/limit) + (1 if offset%limit else 0) 69 | # set count = count if count is not none else 0 70 | # set total_page = int(count/limit) + (1 if count%limit else 0) 71 |
  • 72 | « 73 |
  • 74 | # set prev = 0 75 | # for i in range(0, total_page): 76 | # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5: 77 | # set prev = i 78 |
  • 79 | {{ i + 1 }} 80 |
  • 81 | # elif prev == i-1: 82 |
  • 83 | # endif 84 | # endfor 85 |
  • = total_page else "" }}"> 86 | » 87 |
  • 88 |
89 |
90 | 91 | 92 | -------------------------------------------------------------------------------- /pyspider/webui/templates/task.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Task - {{ task.project }}:{{ task.taskid }} - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 |

22 | {{ status_to_string(task.status) }} 23 | {{ task.project }}.{{ task.process.callback }} 24 | > 25 | {{ task.url }} 26 | {% if task.status in (2, 3, 4) %} 27 | ({{ task.lastcrawltime | format_date }} crawled ) 28 | {% else %} 29 | ({{ task.updatetime | format_date }} updated ) 30 | {% endif %} 31 |

32 |
33 |
34 |
35 |
taskid
36 |
{{ task.taskid }}
37 |
lastcrawltime
38 |
{{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})
39 |
updatetime
40 |
{{ task.updatetime }} ({{ task.updatetime | format_date }})
41 | # if task.schedule and task.schedule.exetime 42 |
exetime
43 |
{{ task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})
44 | # endif 45 | 46 | # if task.track and task.track.fetch 47 |
48 | track.fetch 49 | 50 | {{ (task.track.fetch.time * 1000) | round(2) }}ms 51 |
52 |
{{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}
53 | # endif 54 | 55 | # if task.track and task.track.process 56 |
57 | track.process 58 | 59 | {{ (task.track.process.time * 1000) | round(2) }}ms 60 | # if task.track.process.follows 61 | +{{ task.track.process.follows | int }} 62 | # endif 63 |
64 |
65 | #- if task.track.process.exception 66 | {{- task.track.process.exception or '' }} 67 | # endif 68 | #- if task.track.process.logs 69 | {{- task.track.process.logs or '' }} 70 | # endif 71 | {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}} 72 |
73 | # endif 74 |
75 |
76 | #- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', ) 77 | #- for key, value in task.items() if key not in not_shown_keys 78 |
{{ key }}
79 |
{{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}
80 | #- endfor 81 |
82 | # if result and result.get('result'): 83 |
84 |
result
85 |
{{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}
86 |
87 | # endif 88 |
89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /pyspider/webui/templates/tasks.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tasks - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
    21 | {% for task in tasks | sort(reverse=True, attribute='updatetime') %} 22 |
  1. 23 | {% if task.status %} 24 | {{ status_to_string(task.status) }} 25 | {% elif task.track %} 26 | 27 | {% set fetchok = task.track.fetch and task.track.fetch.ok %} 28 | {% set processok = task.track.process and task.track.process.ok %} 29 | {%- if not fetchok -%} 30 | FETCH_ERROR 31 | {%- elif not processok -%} 32 | PROCESS_ERROR 33 | {%- endif -%} 34 | 35 | {% else %} 36 | ERROR 37 | {% endif %} 38 | 39 | {{ task.project }} 40 | > 41 | {{ task.url }} 42 | 43 | {{ task.updatetime | format_date }} 44 | 45 | {% if task.track and task.track.fetch %} 46 | 47 | {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms 48 | 49 | {% endif %} 50 | 51 | 52 | {% if task.track and task.track.process %} 53 | +{{ task.track.process.follows | int }} 54 | {% endif %} 55 | 56 |
  2. 57 | {% endfor %} 58 |
59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10 2 | Jinja2==2.7 3 | chardet==3.0.4 4 | cssselect==0.9 5 | lxml==4.3.3 6 | pycurl==7.43.0.3 7 | pyquery==1.4.0 8 | requests==2.24.0 9 | tornado==4.5.3 10 | mysql-connector-python==8.0.16 11 | pika==1.1.0 12 | pymongo==3.9.0 13 | Flask-Login==0.2.11 14 | u-msgpack-python==1.6 15 | click==6.6 16 | SQLAlchemy==1.3.10 17 | six==1.10.0 18 | amqp==2.4.0 19 | redis==2.10.6 20 | redis-py-cluster==1.3.6 21 | kombu==4.4.0 22 | psycopg2==2.8.2 23 | elasticsearch==2.3.0 24 | tblib==1.4.0 25 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-24 23:11:49 7 | 8 | from pyspider.run import main 9 | 10 | if __name__ == '__main__': 11 | main() 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-24 22:27:45 7 | 8 | 9 | import sys 10 | from setuptools import setup, find_packages 11 | from codecs import open 12 | from os import path 13 | 14 | here = path.abspath(path.dirname(__file__)) 15 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 16 | long_description = f.read() 17 | 18 | import pyspider 19 | 20 | install_requires = [ 21 | 'Flask==0.10', 22 | 'Jinja2==2.7', 23 | 'chardet==3.0.4', 24 | 'cssselect==0.9', 25 | "lxml==4.3.3", 26 | 'pycurl==7.43.0.3', 27 | 'requests==2.24.0', 28 | 'Flask-Login==0.2.11', 29 | 'u-msgpack-python==1.6', 30 | 'click==3.3', 31 | 'six==1.10.0', 32 | 'tblib==1.4.0', 33 | 'wsgidav==2.3.0', 34 | 'tornado>=3.2,<=4.5.3', 35 | 'pyquery', 36 | ] 37 | 38 | extras_require_all = [ 39 | 'mysql-connector-python==8.0.16', 40 | 'pymongo==3.9.0', 41 | 'redis==2.10.6', 42 | 'redis-py-cluster==1.3.6', 43 | 'psycopg2==2.8.2', 44 | 'elasticsearch==2.3.0', 45 | 'kombu==4.4.0', 46 | 'amqp==2.4.0', 47 | 'SQLAlchemy==1.3.10', 48 | 'pika==1.1.0' 49 | ] 50 | 51 | setup( 52 | name='pyspider', 53 | version=pyspider.__version__, 54 | 55 | description='A Powerful Spider System in Python', 56 | long_description=long_description, 57 | 58 | url='https://github.com/binux/pyspider', 59 | 60 | author='Roy Binux', 61 | author_email='roy@binux.me', 62 | 63 | license='Apache License, Version 2.0', 64 | 65 | classifiers=[ 66 | 'Development Status :: 4 - Beta', 67 | 'Programming Language :: Python :: 3.5', 68 | 'Programming Language :: Python :: 3.6', 69 | 'Programming Language :: Python :: 3.7', 70 | 71 | 'License :: OSI Approved :: Apache Software License', 72 | 73 | 'Intended Audience :: Developers', 74 | 'Operating System :: OS Independent', 75 | 'Environment :: Web Environment', 76 | 77 | 'Topic :: Internet :: WWW/HTTP', 78 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 79 | 'Topic :: Software Development :: Libraries :: Python Modules', 80 | ], 81 | 82 | keywords='scrapy crawler spider webui', 83 | 84 | packages=find_packages(exclude=['data', 'tests*']), 85 | 86 | 
install_requires=install_requires, 87 | 88 | extras_require={ 89 | 'all': extras_require_all, 90 | 'test': [ 91 | 'coverage', 92 | 'Werkzeug==0.16.1', 93 | 'httpbin==0.7.0', 94 | 'pyproxy==0.1.6', 95 | 'easywebdav==1.2.0', 96 | ] 97 | }, 98 | 99 | package_data={ 100 | 'pyspider': [ 101 | 'logging.conf', 102 | 'fetcher/phantomjs_fetcher.js', 103 | 'fetcher/splash_fetcher.lua', 104 | 'webui/static/*.js', 105 | 'webui/static/*.css', 106 | 'webui/templates/*' 107 | ], 108 | }, 109 | 110 | entry_points={ 111 | 'console_scripts': [ 112 | 'pyspider=pyspider.run:main' 113 | ] 114 | }, 115 | 116 | test_suite='tests.all_suite', 117 | ) 118 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 10:53:19 7 | 8 | import os 9 | import unittest 10 | 11 | all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py") 12 | -------------------------------------------------------------------------------- /tests/data_fetcher_processor_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 14:12:55 7 | 8 | from pyspider.libs.base_handler import * 9 | 10 | class Handler(BaseHandler): 11 | 12 | @not_send_status 13 | def not_send_status(self, response): 14 | self.crawl('http://www.baidu.com/') 15 | return response.text 16 | 17 | def url_deduplicated(self, response): 18 | self.crawl('http://www.baidu.com/') 19 | self.crawl('http://www.google.com/') 20 | self.crawl('http://www.baidu.com/') 21 | self.crawl('http://www.google.com/') 22 | self.crawl('http://www.google.com/') 23 | 24 | @catch_status_code_error 25 | def catch_http_error(self, response): 26 | self.crawl('http://www.baidu.com/') 27 | return response.status_code 28 | 29 | def json(self, response): 30 | return response.json 31 | 32 | def html(self, response): 33 | return response.doc('h1').text() 34 | 35 | def links(self, response): 36 | self.crawl([x.attr.href for x in response.doc('a').items()], callback=self.links) 37 | 38 | def cookies(self, response): 39 | return response.cookies 40 | 41 | def get_save(self, response): 42 | return response.save 43 | 44 | def get_process_save(self, response): 45 | return self.save 46 | 47 | def set_process_save(self, response): 48 | self.save['roy'] = 'binux' 49 | 50 | class IgnoreHandler(BaseHandler): 51 | pass 52 | 53 | __handler_cls__ = Handler 54 | -------------------------------------------------------------------------------- /tests/data_handler.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | # -*- encoding: utf-8 -*- 4 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 5 | # Author: Binux 6 | # http://binux.me 7 | # Created on 2014-02-22 14:02:21 8 | 9 | import time 10 | from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every 11 | 12 | class IgnoreHandler(object): 13 | pass 14 | 15 | class TestHandler(BaseHandler): 16 | retry_delay = { 17 | 1: 10, 18 | '': -1 19 | } 20 | 21 | def hello(self): 22 | return "hello world!" 
23 | 24 | def echo(self, response): 25 | return response.content 26 | 27 | def saved(self, response): 28 | return response.save 29 | 30 | def echo_task(self, response, task): 31 | return task['project'] 32 | 33 | @catch_status_code_error 34 | def catch_status_code(self, response): 35 | return response.status_code 36 | 37 | def raise_exception(self): 38 | print('print') 39 | logger.info("info") 40 | logger.warning("warning") 41 | logger.error("error") 42 | raise Exception('exception') 43 | 44 | def add_task(self, response): 45 | self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'}) 46 | self.send_message('some_project', {'some': 'message'}) 47 | 48 | @every 49 | def on_cronjob1(self, response): 50 | logger.info('on_cronjob1') 51 | 52 | @every(seconds=10) 53 | def on_cronjob2(self, response): 54 | logger.info('on_cronjob2') 55 | 56 | def generator(self, response): 57 | yield "a" 58 | yield "b" 59 | 60 | def sleep(self, response): 61 | time.sleep(response.save) 62 | 63 | -------------------------------------------------------------------------------- /tests/data_sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('http://127.0.0.1:14887/pyspider/test.html', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /tests/data_test_webpage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-24 13:44:10 7 | 8 | from httpbin import app 9 | 10 | @app.route('/pyspider/test.html') 11 | def test_page(): 12 | return ''' 13 | 404 14 | 0 15 | 1 16 | 2 17 | 3 18 | 4 19 | gzip 20 | get 21 | deflate 22 | html 23 | xml 24 | robots 25 | cache 26 | stream 27 | ''' 28 | 29 | @app.route('/pyspider/ajax.html') 30 | def test_ajax(): 31 | return ''' 32 |
loading...
33 |
34 |
35 | 46 | ''' 47 | 48 | @app.route('/pyspider/ajax_click.html') 49 | def test_ajax_click(): 50 | return ''' 51 |
loading...
52 |
53 |
54 |
load 55 | 68 | ''' 69 | -------------------------------------------------------------------------------- /tests/test_base_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2017-02-26 10:35:23 7 | 8 | import unittest 9 | 10 | from pyspider.libs.base_handler import BaseHandler 11 | 12 | 13 | class TestBaseHandler(unittest.TestCase): 14 | sample_task_http = { 15 | 'taskid': 'taskid', 16 | 'project': 'project', 17 | 'url': '', 18 | 'fetch': { 19 | 'method': 'GET', 20 | 'headers': { 21 | 'Cookie': 'a=b', 22 | 'a': 'b' 23 | }, 24 | 'cookies': { 25 | 'c': 'd', 26 | }, 27 | 'timeout': 60, 28 | 'save': 'abc', 29 | }, 30 | 'process': { 31 | 'callback': 'callback', 32 | 'save': [1, 2, 3], 33 | }, 34 | } 35 | 36 | def test_task_join_crawl_config(self): 37 | task = dict(self.sample_task_http) 38 | crawl_config = { 39 | 'taskid': 'xxxx', # should not affect finial task 40 | 'proxy': 'username:password@hostname:port', # should add proxy 41 | 'headers': { # should merge headers 42 | 'Cookie': 'abc', # should not affect cookie 43 | 'c': 'd', # should add header c 44 | } 45 | } 46 | 47 | ret = BaseHandler.task_join_crawl_config(task, crawl_config) 48 | self.assertDictEqual(ret, { 49 | 'taskid': 'taskid', 50 | 'project': 'project', 51 | 'url': '', 52 | 'fetch': { 53 | 'method': 'GET', 54 | 'proxy': 'username:password@hostname:port', 55 | 'headers': { 56 | 'Cookie': 'a=b', 57 | 'a': 'b', 58 | 'c': 'd' 59 | }, 60 | 'cookies': { 61 | 'c': 'd', 62 | }, 63 | 'timeout': 60, 64 | 'save': 'abc', 65 | }, 66 | 'process': { 67 | 'callback': 'callback', 68 | 'save': [1, 2, 3], 69 | }, 70 | }); 71 | -------------------------------------------------------------------------------- /tests/test_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 01:34:09 7 | 8 | import os 9 | import sys 10 | import time 11 | import click 12 | import shutil 13 | import inspect 14 | import unittest 15 | 16 | from pyspider import run 17 | from pyspider.libs import utils 18 | 19 | class TestBench(unittest.TestCase): 20 | 21 | @classmethod 22 | def setUpClass(self): 23 | shutil.rmtree('./data/bench', ignore_errors=True) 24 | os.makedirs('./data/bench') 25 | 26 | @classmethod 27 | def tearDownClass(self): 28 | shutil.rmtree('./data/bench', ignore_errors=True) 29 | 30 | def test_10_bench(self): 31 | import subprocess 32 | #cmd = [sys.executable] 33 | cmd = ['coverage', 'run'] 34 | p = subprocess.Popen(cmd+[ 35 | inspect.getsourcefile(run), 36 | '--queue-maxsize=0', 37 | 'bench', 38 | '--total=500' 39 | ], close_fds=True, stderr=subprocess.PIPE) 40 | 41 | stdout, stderr = p.communicate() 42 | stderr = utils.text(stderr) 43 | print(stderr) 44 | 45 | self.assertEqual(p.returncode, 0, stderr) 46 | self.assertIn('Crawled', stderr) 47 | self.assertIn('Fetched', stderr) 48 | self.assertIn('Processed', stderr) 49 | self.assertIn('Saved', stderr) 50 | -------------------------------------------------------------------------------- /tests/test_counter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix 
fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-05 00:05:58 7 | 8 | import sys 9 | import time 10 | import unittest 11 | 12 | from pyspider.libs import counter 13 | 14 | class TestCounter(unittest.TestCase): 15 | def test_010_TimebaseAverageEventCounter(self): 16 | c = counter.TimebaseAverageEventCounter(2, 1) 17 | for i in range(100): 18 | time.sleep(0.1) 19 | c.event(100+i) 20 | 21 | self.assertEqual(c.sum, float(180+199)*20/2) 22 | self.assertEqual(c.avg, float(180+199)/2) 23 | 24 | def test_020_TotalCounter(self): 25 | c = counter.TotalCounter() 26 | for i in range(3): 27 | c.event(i) 28 | self.assertEqual(c.avg, 3) 29 | self.assertEqual(c.sum, 3) 30 | 31 | def test_030_AverageWindowCounter(self): 32 | c = counter.AverageWindowCounter(10) 33 | self.assertTrue(c.empty()) 34 | 35 | for i in range(20): 36 | c.event(i) 37 | 38 | self.assertFalse(c.empty()) 39 | self.assertEqual(c.avg, 14.5) 40 | self.assertEqual(c.sum, 145) 41 | 42 | def test_020_delete(self): 43 | c = counter.CounterManager() 44 | c.event(('a', 'b'), 1) 45 | c.event(('a', 'c'), 1) 46 | c.event(('b', 'c'), 1) 47 | 48 | self.assertIsNotNone(c['a']) 49 | self.assertIsNotNone(c['b']) 50 | 51 | del c['a'] 52 | 53 | self.assertNotIn('a', c) 54 | self.assertIsNotNone(c['b']) 55 | -------------------------------------------------------------------------------- /tests/test_response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 11:10:27 7 | 8 | 9 | import os 10 | import copy 11 | import time 12 | import httpbin 13 | import unittest 14 | 15 | import logging 16 | import logging.config 17 | logging.config.fileConfig("pyspider/logging.conf") 18 | 19 | from pyspider.libs import utils 20 | from pyspider.libs.response import rebuild_response 21 | from pyspider.fetcher.tornado_fetcher import Fetcher 22 | 23 | class TestResponse(unittest.TestCase): 24 | sample_task_http = { 25 | 'taskid': 'taskid', 26 | 'project': 'project', 27 | 'url': '', 28 | } 29 | 30 | @classmethod 31 | def setUpClass(self): 32 | self.fetcher = Fetcher(None, None, async_mode=False) 33 | self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) 34 | self.httpbin = 'http://127.0.0.1:14887' 35 | time.sleep(0.5) 36 | 37 | @classmethod 38 | def tearDownClass(self): 39 | self.httpbin_thread.terminate() 40 | 41 | def get(self, url, **kwargs): 42 | if not url.startswith('http://'): 43 | url = self.httpbin + url 44 | request = copy.deepcopy(self.sample_task_http) 45 | request['url'] = url 46 | request.update(kwargs) 47 | result = self.fetcher.fetch(request) 48 | response = rebuild_response(result) 49 | return response 50 | 51 | def test_10_html(self): 52 | response = self.get('/html') 53 | self.assertEqual(response.status_code, 200) 54 | self.assertIsNotNone(response.doc('h1')) 55 | 56 | def test_20_xml(self): 57 | response = self.get('/xml') 58 | self.assertEqual(response.status_code, 200) 59 | self.assertIsNotNone(response.doc('item')) 60 | 61 | def test_30_gzip(self): 62 | response = self.get('/gzip') 63 | self.assertEqual(response.status_code, 200) 64 | self.assertIn('gzipped', response.text) 65 | 66 | def test_40_deflate(self): 67 | response = self.get('/deflate') 68 | self.assertEqual(response.status_code, 200) 69 | self.assertIn('deflated', response.text) 70 | 71 | def 
test_50_ok(self): 72 | response = self.get('/status/200') 73 | self.assertTrue(response.ok) 74 | self.assertTrue(response) 75 | response = self.get('/status/302') 76 | self.assertTrue(response.ok) 77 | self.assertTrue(response) 78 | with self.assertRaises(Exception): 79 | self.raise_for_status(allow_redirects=False) 80 | 81 | def test_60_not_ok(self): 82 | response = self.get('/status/400') 83 | self.assertFalse(response.ok) 84 | self.assertFalse(response) 85 | response = self.get('/status/500') 86 | self.assertFalse(response.ok) 87 | self.assertFalse(response) 88 | response = self.get('/status/600') 89 | self.assertFalse(response.ok) 90 | self.assertFalse(response) 91 | 92 | def test_70_reraise_exception(self): 93 | response = self.get('file://abc') 94 | with self.assertRaisesRegex(Exception, 'HTTP 599'): 95 | response.raise_for_status() 96 | -------------------------------------------------------------------------------- /tests/test_result_dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-10-12 22:17:57 7 | 8 | from __future__ import unicode_literals, division 9 | 10 | import six 11 | import csv 12 | import time 13 | import json 14 | import unittest 15 | from six import StringIO 16 | 17 | from pyspider.libs import result_dump 18 | 19 | results1 = [ 20 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 21 | 'result': {'a': 1, 'b': 2} }, 22 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 23 | 'result': {'a': 1, 'b': 2, 'c': 3} }, 24 | ] 25 | 26 | results2 = results1 + [ 27 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 28 | 'result': [1, 2, '中文', u'中文'] }, 29 | ] 30 | 31 | results_error = results2 + [ 32 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 33 | 'result': None}, 34 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time() }, 35 | {'taskid': 'taskid1', 'pdatetime': time.time() }, 36 | ] 37 | 38 | result_list_error = [ 39 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 40 | 'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]}, 41 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 42 | 'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]}, 43 | ] 44 | 45 | class TestResultDump(unittest.TestCase): 46 | def test_result_formater_1(self): 47 | common_fields, results = result_dump.result_formater(results1) 48 | self.assertEqual(common_fields, set(('a', 'b'))) 49 | 50 | def test_result_formater_2(self): 51 | common_fields, results = result_dump.result_formater(results2) 52 | self.assertEqual(common_fields, set()) 53 | 54 | def test_result_formater_error(self): 55 | common_fields, results = result_dump.result_formater(results_error) 56 | self.assertEqual(common_fields, set()) 57 | 58 | def test_dump_as_json(self): 59 | for i, line in enumerate((''.join( 60 | result_dump.dump_as_json(results2))).splitlines()): 61 | self.assertDictEqual(results2[i], json.loads(line)) 62 | 63 | def test_dump_as_json_valid(self): 64 | ret = json.loads(''.join(result_dump.dump_as_json(results2, True))) 65 | for i, j in zip(results2, ret): 66 | self.assertDictEqual(i, j) 67 | 68 | def test_dump_as_txt(self): 69 | for i, line in 
enumerate((''.join( 70 | result_dump.dump_as_txt(results2))).splitlines()): 71 | url, json_data = line.split('\t', 2) 72 | self.assertEqual(results2[i]['result'], json.loads(json_data)) 73 | 74 | def test_dump_as_csv(self): 75 | reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(results1)))) 76 | for row in reader: 77 | self.assertEqual(len(row), 4) 78 | 79 | def test_dump_as_csv_case_1(self): 80 | reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(result_list_error)))) 81 | for row in reader: 82 | self.assertEqual(len(row), 2) 83 | -------------------------------------------------------------------------------- /tests/test_result_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-11 20:52:53 7 | 8 | import os 9 | import time 10 | import unittest 11 | import logging.config 12 | logging.config.fileConfig("pyspider/logging.conf") 13 | 14 | import shutil 15 | from pyspider.database.sqlite import resultdb 16 | from pyspider.result.result_worker import ResultWorker 17 | from pyspider.libs.multiprocessing_queue import Queue 18 | from pyspider.libs.utils import run_in_thread 19 | 20 | 21 | class TestProcessor(unittest.TestCase): 22 | resultdb_path = './data/tests/result.db' 23 | 24 | @classmethod 25 | def setUpClass(self): 26 | shutil.rmtree('./data/tests/', ignore_errors=True) 27 | os.makedirs('./data/tests/') 28 | 29 | def get_resultdb(): 30 | return resultdb.ResultDB(self.resultdb_path) 31 | self.resultdb = get_resultdb() 32 | self.inqueue = Queue(10) 33 | 34 | def run_result_worker(): 35 | self.result_worker = ResultWorker(get_resultdb(), self.inqueue) 36 | self.result_worker.run() 37 | self.process = run_in_thread(run_result_worker) 38 | time.sleep(1) 39 | 40 | @classmethod 41 | def tearDownClass(self): 42 | if self.process.is_alive(): 43 | self.result_worker.quit() 44 | self.process.join(2) 45 | assert not self.process.is_alive() 46 | shutil.rmtree('./data/tests/', ignore_errors=True) 47 | 48 | def test_10_bad_result(self): 49 | self.inqueue.put(({'project': 'test_project'}, {})) 50 | self.resultdb._list_project() 51 | self.assertEqual(len(self.resultdb.projects), 0) 52 | self.assertEqual(self.resultdb.count('test_project'), 0) 53 | 54 | def test_10_bad_result_2(self): 55 | self.inqueue.put(({'project': 'test_project'}, {'a': 'b'})) 56 | self.resultdb._list_project() 57 | self.assertEqual(len(self.resultdb.projects), 0) 58 | self.assertEqual(self.resultdb.count('test_project'), 0) 59 | 60 | def test_20_insert_result(self): 61 | data = { 62 | 'a': 'b' 63 | } 64 | self.inqueue.put(({ 65 | 'project': 'test_project', 66 | 'taskid': 'id1', 67 | 'url': 'url1' 68 | }, data)) 69 | time.sleep(0.5) 70 | self.resultdb._list_project() 71 | self.assertEqual(len(self.resultdb.projects), 1) 72 | self.assertEqual(self.resultdb.count('test_project'), 1) 73 | 74 | result = self.resultdb.get('test_project', 'id1') 75 | self.assertEqual(result['result'], data) 76 | 77 | def test_30_overwrite(self): 78 | self.inqueue.put(({ 79 | 'project': 'test_project', 80 | 'taskid': 'id1', 81 | 'url': 'url1' 82 | }, "abc")) 83 | time.sleep(0.1) 84 | result = self.resultdb.get('test_project', 'id1') 85 | self.assertEqual(result['result'], "abc") 86 | 87 | def test_40_insert_list(self): 88 | self.inqueue.put(({ 89 | 'project': 'test_project', 90 | 'taskid': 'id2', 91 | 'url': 'url1' 92 | }, 
['a', 'b'])) 93 | time.sleep(0.1) 94 | result = self.resultdb.get('test_project', 'id2') 95 | self.assertEqual(result['result'], ['a', 'b']) 96 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 16:53:49 7 | 8 | import sys 9 | import time 10 | import unittest 11 | 12 | from pyspider.libs import utils 13 | 14 | class TestFetcher(unittest.TestCase): 15 | def test_readonlydict(self): 16 | data = dict(a='a', b=123) 17 | data['c'] = self 18 | data = utils.ReadOnlyDict(data) 19 | 20 | with self.assertRaises(Exception): 21 | data['d'] = 9 22 | 23 | def test_getitem(self): 24 | l = [1, 2] 25 | self.assertEqual(utils.getitem(l, 0), 1) 26 | self.assertEqual(utils.getitem(l, 1), 2) 27 | self.assertEqual(utils.getitem(l, 3), None) 28 | self.assertEqual(utils.getitem(l, 3, 9), 9) 29 | self.assertEqual(utils.getitem(l, 'key'), None) 30 | self.assertEqual(utils.getitem(l, 'key', 8), 8) 31 | data = dict(a='a', b=123) 32 | self.assertEqual(utils.getitem(data, 'a'), 'a') 33 | self.assertEqual(utils.getitem(data, 'b'), 123) 34 | self.assertEqual(utils.getitem(data, 'c'), None) 35 | self.assertEqual(utils.getitem(data, 'c', 9), 9) 36 | 37 | def test_format_data(self): 38 | now = time.time() 39 | self.assertEqual(utils.format_date(now - 30), '30 seconds ago') 40 | self.assertEqual(utils.format_date(now - 60), '1 minute ago') 41 | self.assertEqual(utils.format_date(now - 2*60), '2 minutes ago') 42 | self.assertEqual(utils.format_date(now - 30*60), '30 minutes ago') 43 | self.assertEqual(utils.format_date(now - 60*60), '1 hour ago') 44 | self.assertEqual(utils.format_date(1963475336), 'Mar 21, 2032 at 9:48') 45 | self.assertEqual(utils.format_date(now - 12*60*60), '12 hours ago') 46 | self.assertRegex(utils.format_date(now - 24*60*60), r'^yesterday at \d{1,2}:\d{2}$') 47 | self.assertRegex(utils.format_date(now - 2*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 48 | self.assertRegex(utils.format_date(now - 3*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 49 | self.assertRegex(utils.format_date(now - 4*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 50 | self.assertRegex(utils.format_date(now - 5*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$') 51 | self.assertRegex(utils.format_date(now - 333*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$') 52 | self.assertRegex(utils.format_date(now - 334*24*60*60), r'^[A-Z][a-z]+ \d{1,2}, \d{4} at \d{1,2}:\d{2}$') 53 | -------------------------------------------------------------------------------- /tests/test_xmlrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2006-2007 Open Source Applications Foundation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Origin: https://code.google.com/p/wsgi-xmlrpc/ 16 | 17 | import unittest 18 | import tornado.wsgi 19 | import tornado.ioloop 20 | import tornado.httpserver 21 | from pyspider.libs import utils 22 | 23 | class TestXMLRPCServer(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | from pyspider.libs import wsgi_xmlrpc 27 | 28 | def test_1(): 29 | return 'test_1' 30 | 31 | class Test2(object): 32 | def test_3(self, obj): 33 | return obj 34 | 35 | test = Test2() 36 | 37 | application = wsgi_xmlrpc.WSGIXMLRPCApplication() 38 | application.register_instance(Test2()) 39 | application.register_function(test_1) 40 | 41 | container = tornado.wsgi.WSGIContainer(application) 42 | self.io_loop = tornado.ioloop.IOLoop.current() 43 | http_server = tornado.httpserver.HTTPServer(container, io_loop=self.io_loop) 44 | http_server.listen(3423) 45 | self.thread = utils.run_in_thread(self.io_loop.start) 46 | 47 | @classmethod 48 | def tearDownClass(self): 49 | self.io_loop.add_callback(self.io_loop.stop) 50 | self.thread.join() 51 | 52 | def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'): 53 | from six.moves.xmlrpc_client import ServerProxy 54 | 55 | client = ServerProxy(uri) 56 | 57 | assert client.test_1() == 'test_1' 58 | assert client.test_3({'asdf':4}) == {'asdf':4} 59 | -------------------------------------------------------------------------------- /tools/migrate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-09-30 23:22:46 7 | 8 | import click 9 | import logging 10 | from pyspider.database.base.projectdb import ProjectDB 11 | from pyspider.database.base.taskdb import TaskDB 12 | from pyspider.database.base.resultdb import ResultDB 13 | from pyspider.database import connect_database 14 | from pyspider.libs.utils import unicode_obj 15 | from multiprocessing.pool import ThreadPool as Pool 16 | 17 | logging.getLogger().setLevel(logging.INFO) 18 | 19 | 20 | def taskdb_migrating(project, from_connection, to_connection): 21 | logging.info("taskdb: %s", project) 22 | f = connect_database(from_connection) 23 | t = connect_database(to_connection) 24 | t.drop(project) 25 | for status in range(1, 5): 26 | for task in f.load_tasks(status, project=project): 27 | t.insert(project, task['taskid'], task) 28 | 29 | 30 | def resultdb_migrating(project, from_connection, to_connection): 31 | logging.info("resultdb: %s", project) 32 | f = connect_database(from_connection) 33 | t = connect_database(to_connection) 34 | t.drop(project) 35 | for result in f.select(project): 36 | t.save(project, result['taskid'], result['url'], result['result']) 37 | 38 | 39 | @click.command() 40 | @click.option('--pool', default=10, help='cocurrent worker size.') 41 | @click.argument('from_connection', required=1) 42 | @click.argument('to_connection', required=1) 43 | def migrate(pool, from_connection, to_connection): 44 | """ 45 | Migrate tool for pyspider 46 | """ 47 | f = connect_database(from_connection) 48 | t = connect_database(to_connection) 49 | 50 | if isinstance(f, ProjectDB): 51 | for each in f.get_all(): 52 | each = unicode_obj(each) 53 | logging.info("projectdb: %s", each['name']) 54 | t.drop(each['name']) 55 | t.insert(each['name'], each) 56 | elif isinstance(f, TaskDB): 57 | pool = Pool(pool) 58 | pool.map( 59 | lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t), 60 
| f.projects) 61 | elif isinstance(f, ResultDB): 62 | pool = Pool(pool) 63 | pool.map( 64 | lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t), 65 | f.projects) 66 | 67 | 68 | if __name__ == '__main__': 69 | migrate() 70 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py35,py36,py37,py38 3 | [testenv] 4 | install_command = 5 | pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1' {opts} -e .[all,test] {packages} 6 | commands = 7 | python setup.py test [] 8 | --------------------------------------------------------------------------------
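
For context on how tools/migrate.py above is typically driven, the sketch below replays its per-project task-migration loop directly against pyspider's connect_database(). It is only an illustration: the connection URIs, paths and credentials are placeholders, not values taken from this repository.

# Minimal sketch (assumption: run from the repository root with pyspider importable).
# Both connection URIs are placeholders in pyspider's "<engine>+<dbtype>://..." form.
from pyspider.database import connect_database

src = connect_database('sqlite+taskdb:///data/task.db')                    # placeholder source taskdb
dst = connect_database('mysql+taskdb://root:root@127.0.0.1:3306/taskdb')   # placeholder target taskdb

for project in src.projects:
    dst.drop(project)                           # same reset step taskdb_migrating() performs
    for status in range(1, 5):                  # taskdb statuses 1..4
        for task in src.load_tasks(status, project=project):
            dst.insert(project, task['taskid'], task)

Running the click command itself (python tools/migrate.py <from_connection> <to_connection>) does the same work for every project, parallelised over the --pool thread pool.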