├── .coveragerc ├── .github └── ISSUE_TEMPLATE.md ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── config_example.json ├── data └── .gitignore ├── docker-compose.yaml ├── docs ├── About-Projects.md ├── About-Tasks.md ├── Architecture.md ├── Command-Line.md ├── Deployment-demo.pyspider.org.md ├── Deployment.md ├── Frequently-Asked-Questions.md ├── Quickstart.md ├── Running-pyspider-with-Docker.md ├── Script-Environment.md ├── Working-with-Results.md ├── apis │ ├── @catch_status_code_error.md │ ├── @every.md │ ├── Response.md │ ├── index.md │ ├── self.crawl.md │ └── self.send_message.md ├── conf.py ├── imgs │ ├── creating_a_project.png │ ├── css_selector_helper.png │ ├── demo.png │ ├── developer-tools-network-filter.png │ ├── developer-tools-network.png │ ├── index_page.png │ ├── inspect_element.png │ ├── pyspider-arch.png │ ├── request-headers.png │ ├── run_one_step.png │ ├── search-for-request.png │ ├── tutorial_imdb_front.png │ └── twitch.png ├── index.md └── tutorial │ ├── AJAX-and-more-HTTP.md │ ├── HTML-and-CSS-Selector.md │ ├── Render-with-PhantomJS.md │ └── index.md ├── mkdocs.yml ├── pyspider ├── __init__.py ├── database │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── basedb.py │ ├── couchdb │ │ ├── __init__.py │ │ ├── couchdbbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── elasticsearch │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── local │ │ ├── __init__.py │ │ └── projectdb.py │ ├── mongodb │ │ ├── __init__.py │ │ ├── mongodbbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── mysql │ │ ├── __init__.py │ │ ├── mysqlbase.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ └── taskdb.py │ ├── redis │ │ ├── __init__.py │ │ └── taskdb.py │ ├── sqlalchemy │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ ├── sqlalchemybase.py │ │ └── taskdb.py │ └── sqlite │ │ ├── __init__.py │ │ ├── projectdb.py │ │ ├── resultdb.py │ │ ├── sqlitebase.py │ │ └── taskdb.py ├── fetcher │ ├── __init__.py │ ├── cookie_utils.py │ ├── phantomjs_fetcher.js │ ├── puppeteer_fetcher.js │ ├── splash_fetcher.lua │ └── tornado_fetcher.py ├── libs │ ├── ListIO.py │ ├── __init__.py │ ├── base_handler.py │ ├── bench.py │ ├── counter.py │ ├── dataurl.py │ ├── log.py │ ├── multiprocessing_queue.py │ ├── pprint.py │ ├── response.py │ ├── result_dump.py │ ├── sample_handler.py │ ├── url.py │ ├── utils.py │ └── wsgi_xmlrpc.py ├── logging.conf ├── message_queue │ ├── __init__.py │ ├── kombu_queue.py │ ├── rabbitmq.py │ └── redis_queue.py ├── processor │ ├── __init__.py │ ├── processor.py │ └── project_module.py ├── result │ ├── __init__.py │ └── result_worker.py ├── run.py ├── scheduler │ ├── __init__.py │ ├── scheduler.py │ ├── task_queue.py │ └── token_bucket.py └── webui │ ├── __init__.py │ ├── app.py │ ├── bench_test.py │ ├── debug.py │ ├── index.py │ ├── login.py │ ├── result.py │ ├── static │ ├── .babelrc │ ├── css_selector_helper.min.js │ ├── debug.min.css │ ├── debug.min.js │ ├── index.min.css │ ├── index.min.js │ ├── package.json │ ├── result.min.css │ ├── result.min.js │ ├── src │ │ ├── css_selector_helper.js │ │ ├── debug.js │ │ ├── debug.less │ │ ├── index.js │ │ ├── index.less │ │ ├── result.less │ │ ├── splitter.js │ │ ├── task.less │ │ ├── tasks.less │ │ └── variable.less │ ├── task.min.css │ ├── task.min.js │ ├── tasks.min.css │ ├── tasks.min.js │ └── webpack.config.js │ ├── task.py │ ├── templates │ ├── 
debug.html │ ├── index.html │ ├── result.html │ ├── task.html │ └── tasks.html │ └── webdav.py ├── requirements.txt ├── run.py ├── setup.py ├── tests ├── __init__.py ├── data_fetcher_processor_handler.py ├── data_handler.py ├── data_sample_handler.py ├── data_test_webpage.py ├── test_base_handler.py ├── test_bench.py ├── test_counter.py ├── test_database.py ├── test_fetcher.py ├── test_fetcher_processor.py ├── test_message_queue.py ├── test_processor.py ├── test_response.py ├── test_result_dump.py ├── test_result_worker.py ├── test_run.py ├── test_scheduler.py ├── test_task_queue.py ├── test_utils.py ├── test_webdav.py ├── test_webui.py └── test_xmlrpc.py ├── tools └── migrate.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | pyspider 4 | parallel = True 5 | 6 | [report] 7 | omit = 8 | pyspider/libs/sample_handler.py 9 | pyspider/libs/pprint.py 10 | 11 | exclude_lines = 12 | pragma: no cover 13 | def __repr__ 14 | if self.debug: 15 | if settings.DEBUG 16 | raise AssertionError 17 | raise NotImplementedError 18 | if 0: 19 | if __name__ == .__main__.: 20 | except ImportError: 21 | pass 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | * pyspider version: 8 | * Operating system: 9 | * Start up command: 10 | 11 | ### Expected behavior 12 | 13 | 14 | 15 | ### Actual behavior 16 | 17 | 18 | 19 | ### How to reproduce 20 | 21 | 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | data/* 3 | .venv 4 | .idea 5 | # C extensions 6 | *.so 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | __pycache__ 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | .idea 40 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | cache: pip 3 | python: 4 | - 3.5 5 | - 3.6 6 | - 3.7 7 | #- 3.8 8 | services: 9 | - docker 10 | - mongodb 11 | - rabbitmq 12 | - redis 13 | - mysql 14 | # - elasticsearch 15 | - postgresql 16 | addons: 17 | postgresql: "9.4" 18 | apt: 19 | packages: 20 | - rabbitmq-server 21 | env: 22 | - IGNORE_COUCHDB=1 23 | 24 | before_install: 25 | - sudo apt-get update -qq 26 | - curl -O https://download.elastic.co/elasticsearch/release/org/elasticsearch/distribution/deb/elasticsearch/2.4.0/elasticsearch-2.4.0.deb && sudo dpkg -i --force-confnew elasticsearch-2.4.0.deb && sudo service elasticsearch restart 27 | - npm install express puppeteer 28 | - sudo docker pull scrapinghub/splash 29 | - sudo docker run -d --net=host scrapinghub/splash 30 | before_script: 31 | - psql -c "CREATE DATABASE pyspider_test_taskdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres 32 | - psql -c "CREATE DATABASE pyspider_test_projectdb ENCODING 'UTF8' TEMPLATE=template0;" -U postgres 33 | - psql -c "CREATE DATABASE pyspider_test_resultdb ENCODING 'UTF8' TEMPLATE=template0;" -U 
postgres 34 | - sleep 10 35 | install: 36 | - pip install https://github.com/marcus67/easywebdav/archive/master.zip 37 | - sudo apt-get install libgnutls28-dev 38 | - pip install -e .[all,test] 39 | - pip install coveralls 40 | script: 41 | - coverage run setup.py test 42 | after_success: 43 | - coverage combine 44 | - coveralls 45 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | MAINTAINER binux 3 | 4 | # install phantomjs 5 | RUN mkdir -p /opt/phantomjs \ 6 | && cd /opt/phantomjs \ 7 | && wget -O phantomjs.tar.bz2 https://bitbucket.org/ariya/phantomjs/downloads/phantomjs-2.1.1-linux-x86_64.tar.bz2 \ 8 | && tar xavf phantomjs.tar.bz2 --strip-components 1 \ 9 | && ln -s /opt/phantomjs/bin/phantomjs /usr/local/bin/phantomjs \ 10 | && rm phantomjs.tar.bz2 11 | # Fix Error: libssl_conf.so: cannot open shared object file: No such file or directory 12 | ENV OPENSSL_CONF=/etc/ssl/ 13 | 14 | # install nodejs 15 | ENV NODEJS_VERSION=8.15.0 \ 16 | PATH=$PATH:/opt/node/bin 17 | WORKDIR "/opt/node" 18 | RUN apt-get -qq update && apt-get -qq install -y curl ca-certificates libx11-xcb1 libxtst6 libnss3 libasound2 libatk-bridge2.0-0 libgtk-3-0 --no-install-recommends && \ 19 | curl -sL https://nodejs.org/dist/v${NODEJS_VERSION}/node-v${NODEJS_VERSION}-linux-x64.tar.gz | tar xz --strip-components=1 && \ 20 | rm -rf /var/lib/apt/lists/* 21 | RUN npm install puppeteer express 22 | 23 | # install requirements 24 | COPY requirements.txt /opt/pyspider/requirements.txt 25 | RUN pip install -r /opt/pyspider/requirements.txt 26 | 27 | # add all repo 28 | ADD ./ /opt/pyspider 29 | 30 | # run test 31 | WORKDIR /opt/pyspider 32 | RUN pip install -e .[all] 33 | 34 | # Create a symbolic link to node_modules 35 | RUN ln -s /opt/node/node_modules ./node_modules 36 | 37 | #VOLUME ["/opt/pyspider"] 38 | ENTRYPOINT ["pyspider"] 39 | 40 | EXPOSE 5000 23333 24444 25555 22222 41 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include requirements.txt 3 | include Dockerfile 4 | include LICENSE 5 | include pyspider/logging.conf 6 | include pyspider/webui/static/* 7 | include pyspider/webui/templates/* 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pyspider [![Build Status]][Travis CI] [![Coverage Status]][Coverage] 2 | ======== 3 | 4 | A Powerful Spider(Web Crawler) System in Python. 5 | 6 | - Write script in Python 7 | - Powerful WebUI with script editor, task monitor, project manager and result viewer 8 | - [MySQL](https://www.mysql.com/), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend 9 | - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue 10 | - Task priority, retry, periodical, recrawl by age, etc... 11 | - Distributed architecture, Crawl Javascript pages, Python 2.{6,7}, 3.{3,4,5,6} support, etc... 
12 | 13 | Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) 14 | Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) 15 | Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) 16 | 17 | Sample Code 18 | ----------- 19 | 20 | ```python 21 | from pyspider.libs.base_handler import * 22 | 23 | 24 | class Handler(BaseHandler): 25 | crawl_config = { 26 | } 27 | 28 | @every(minutes=24 * 60) 29 | def on_start(self): 30 | self.crawl('http://scrapy.org/', callback=self.index_page) 31 | 32 | @config(age=10 * 24 * 60 * 60) 33 | def index_page(self, response): 34 | for each in response.doc('a[href^="http"]').items(): 35 | self.crawl(each.attr.href, callback=self.detail_page) 36 | 37 | def detail_page(self, response): 38 | return { 39 | "url": response.url, 40 | "title": response.doc('title').text(), 41 | } 42 | ``` 43 | 44 | 45 | Installation 46 | ------------ 47 | 48 | * `pip install pyspider` 49 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 50 | 51 | **WARNING:** WebUI is open to the public by default, it can be used to execute any command which may harm your system. Please use it in an internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). 52 | 53 | Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) 54 | 55 | Contribute 56 | ---------- 57 | 58 | * Use It 59 | * Open [Issue], send PR 60 | * [User Group] 61 | * [中文问答](http://segmentfault.com/t/pyspider) 62 | 63 | 64 | TODO 65 | ---- 66 | 67 | ### v0.4.0 68 | 69 | - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) 70 | 71 | 72 | License 73 | ------- 74 | Licensed under the Apache License, Version 2.0 75 | 76 | 77 | [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat 78 | [Travis CI]: https://travis-ci.org/binux/pyspider 79 | [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat 80 | [Coverage]: https://coveralls.io/r/binux/pyspider 81 | [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat 82 | [Issue]: https://github.com/binux/pyspider/issues 83 | [User Group]: https://groups.google.com/group/pyspider-users 84 | -------------------------------------------------------------------------------- /config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "taskdb": "couchdb+taskdb://user:password@couchdb:5984", 3 | "projectdb": "couchdb+projectdb://user:password@couchdb:5984", 4 | "resultdb": "couchdb+resultdb://user:password@couchdb:5984", 5 | "message_queue": "amqp://rabbitmq:5672/%2F", 6 | "webui": { 7 | "username": "username", 8 | "password": "password", 9 | "need-auth": true, 10 | "scheduler-rpc": "http://scheduler:23333", 11 | "fetcher-rpc": "http://fetcher:24444" 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | *.db 2 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | # replace /path/to/dir/ to point to config.json 4 | 5 | # The RabbitMQ and CouchDB services can take some time to startup. 
6 | # During this time most of the pyspider services will exit and restart. 7 | # Once RabbitMQ and CouchDB are fully up and running everything should run as normal. 8 | 9 | services: 10 | rabbitmq: 11 | image: rabbitmq:alpine 12 | container_name: rabbitmq 13 | networks: 14 | - pyspider 15 | command: rabbitmq-server 16 | mysql: 17 | image: mysql:latest 18 | container_name: mysql 19 | volumes: 20 | - /tmp:/var/lib/mysql 21 | environment: 22 | - MYSQL_ALLOW_EMPTY_PASSWORD=yes 23 | networks: 24 | - pyspider 25 | phantomjs: 26 | image: pyspider:latest 27 | container_name: phantomjs 28 | networks: 29 | - pyspider 30 | volumes: 31 | - ./config_example.json:/opt/pyspider/config.json 32 | command: -c config.json phantomjs 33 | depends_on: 34 | - couchdb 35 | - rabbitmq 36 | restart: unless-stopped 37 | result: 38 | image: pyspider:latest 39 | container_name: result 40 | networks: 41 | - pyspider 42 | volumes: 43 | - ./config_example.json:/opt/pyspider/config.json 44 | command: -c config.json result_worker 45 | depends_on: 46 | - couchdb 47 | - rabbitmq 48 | restart: unless-stopped # Sometimes we'll get a connection refused error because couchdb has yet to fully start 49 | processor: 50 | container_name: processor 51 | image: pyspider:latest 52 | networks: 53 | - pyspider 54 | volumes: 55 | - ./config_example.json:/opt/pyspider/config.json 56 | command: -c config.json processor 57 | depends_on: 58 | - couchdb 59 | - rabbitmq 60 | restart: unless-stopped 61 | fetcher: 62 | image: pyspider:latest 63 | container_name: fetcher 64 | networks: 65 | - pyspider 66 | volumes: 67 | - ./config_example.json:/opt/pyspider/config.json 68 | command : -c config.json fetcher 69 | depends_on: 70 | - couchdb 71 | - rabbitmq 72 | restart: unless-stopped 73 | scheduler: 74 | image: pyspider:latest 75 | container_name: scheduler 76 | networks: 77 | - pyspider 78 | volumes: 79 | - ./config_example.json:/opt/pyspider/config.json 80 | command: -c config.json scheduler 81 | depends_on: 82 | - couchdb 83 | - rabbitmq 84 | restart: unless-stopped 85 | webui: 86 | image: pyspider:latest 87 | container_name: webui 88 | ports: 89 | - "5050:5000" 90 | networks: 91 | - pyspider 92 | volumes: 93 | - ./config_example.json:/opt/pyspider/config.json 94 | command: -c config.json webui 95 | depends_on: 96 | - couchdb 97 | - rabbitmq 98 | restart: unless-stopped 99 | 100 | networks: 101 | pyspider: 102 | external: 103 | name: pyspider 104 | default: 105 | driver: bridge 106 | -------------------------------------------------------------------------------- /docs/About-Projects.md: -------------------------------------------------------------------------------- 1 | About Projects 2 | ============== 3 | 4 | In most cases, a project is one script you write for one website. 5 | 6 | * Projects are independent, but you can import another project as a module with `from projects import other_project` 7 | * A project has 5 status: `TODO`, `STOP`, `CHECKING`, `DEBUG` and `RUNNING` 8 | - `TODO` - a script is just created to be written 9 | - `STOP` - you can mark a project as `STOP` if you want it to STOP (= =). 10 | - `CHECKING` - when a running project is modified, to prevent incomplete modification, project status will be set as `CHECKING` automatically. 11 | - `DEBUG`/`RUNNING` - these two status have no difference to spider. But it's good to mark it as `DEBUG` when it's running the first time then change it to `RUNNING` after being checked. 
12 | * The crawl rate is controlled by `rate` and `burst` with the [token-bucket](http://en.wikipedia.org/wiki/Token_bucket) algorithm. 13 |     - `rate` - how many requests per second 14 |     - `burst` - consider this situation: `rate/burst = 0.1/3` means the spider crawls 1 page every 10 seconds. All tasks are finished and the project is checking the last updated items every minute. If 3 new items are found, pyspider will "burst" and crawl the 3 tasks without waiting 3*10 seconds. The fourth task, however, still needs to wait 10 seconds. 15 | * To delete a project, set `group` to `delete` and status to `STOP`, then wait 24 hours. 16 | 17 | 18 | `on_finished` callback 19 | -------------------- 20 | You can override the `on_finished` method in the project; the method is triggered when the task_queue goes to 0. 21 | 22 | Example 1: When you start a project to crawl a website with 100 pages, the `on_finished` callback is fired when all 100 pages are successfully crawled or have failed after retries. 23 | 24 | Example 2: A project with `auto_recrawl` tasks will **NEVER** trigger the `on_finished` callback, because the time queue never becomes 0 while there are auto_recrawl tasks in it. 25 | 26 | Example 3: A project with an `@every` decorated method will trigger the `on_finished` callback every time the newly submitted tasks are finished. 27 | -------------------------------------------------------------------------------- /docs/About-Tasks.md: -------------------------------------------------------------------------------- 1 | About Tasks 2 | =========== 3 | 4 | Tasks are the basic units to be scheduled. 5 | 6 | Basis 7 | ----- 8 | 9 | * A task is identified by its `taskid`. (Default: `md5(url)`; this can be changed by overriding the `def get_taskid(self, task)` method.) 10 | * Tasks are isolated between different projects. 11 | * A task has one of 4 statuses: 12 |     - active 13 |     - failed 14 |     - success 15 |     - bad - not used 16 | * Only tasks in active status will be scheduled. 17 | * Tasks are served in order of `priority`. 18 | 19 | Schedule 20 | -------- 21 | 22 | #### new task 23 | 24 | When a new task (never seen before) comes in: 25 | 26 | * If `exetime` is set but has not arrived yet, it will be put into a time-based queue to wait. 27 | * Otherwise it will be accepted. 28 | 29 | When the task is already in the queue: 30 | 31 | * It is ignored unless `force_update` is set. 32 | 33 | When a completed task comes in again: 34 | 35 | * If `age` is set and `last_crawl_time + age < now`, it will be accepted. Otherwise it is discarded. 36 | * If `itag` is set and not equal to its previous value, it will be accepted. Otherwise it is discarded. 37 | 38 | 39 | #### task retry 40 | 41 | When a fetch error or script error happens, the task will be retried 3 times by default. 42 | 43 | The first retry is executed after 30 seconds, then after 1 hour, 6 hours and 12 hours; any further retries are postponed by 24 hours. 44 | 45 | If `age` is specified, the retry delay will not be larger than `age`. 46 | 47 | You can configure the retry delay by adding a variable named `retry_delay` to the handler. `retry_delay` is a dict that specifies the retry intervals. The items in the dict are {retried: seconds}, and a special key '' (empty string) specifies the default retry delay when not otherwise specified. 48 | 49 | e.g.
the default `retry_delay` is declared like this: 50 | 51 | 52 | ``` 53 | class MyHandler(BaseHandler): 54 |     retry_delay = { 55 |         0: 30, 56 |         1: 1*60*60, 57 |         2: 6*60*60, 58 |         3: 12*60*60, 59 |         '': 24*60*60 60 |     } 61 | ``` 62 | -------------------------------------------------------------------------------- /docs/Frequently-Asked-Questions.md: -------------------------------------------------------------------------------- 1 | Frequently Asked Questions 2 | ========================== 3 | 4 | Does pyspider Work with Windows? 5 | -------------------------------- 6 | Yes, it should; some users have made it work on Windows. But as I don't have a Windows development environment, I cannot test it. Some tips for users who want to use pyspider on Windows: 7 | 8 | - Some packages need binary libs (e.g. pycurl, lxml) that you may not be able to install from pip; Windows binary packages can be found at [http://www.lfd.uci.edu/~gohlke/pythonlibs/](http://www.lfd.uci.edu/~gohlke/pythonlibs/). 9 | - Make a clean environment with [virtualenv](https://virtualenv.readthedocs.org/en/latest/). 10 | - Try the 32-bit version of Python, especially if you are facing crash issues. 11 | - Avoid using Python 3.4.1 ([#194](https://github.com/binux/pyspider/issues/194), [#217](https://github.com/binux/pyspider/issues/217)). 12 | 13 | Unreadable Code (乱码) Returned from Phantomjs 14 | --------------------------------------------- 15 | 16 | PhantomJS doesn't support gzip; don't set the `Accept-Encoding` header to `gzip`. 17 | 18 | 19 | How to Delete a Project? 20 | ------------------------ 21 | 22 | Set `group` to `delete` and `status` to `STOP`, then wait 24 hours. You can change the time before a project is deleted via `scheduler.DELETE_TIME`. 23 | 24 | How to Restart a Project? 25 | ------------------------- 26 | #### Why 27 | It happens when you have modified a script and want to crawl everything again with the new strategy, but as the [age](/apis/self.crawl/#age) of the URLs has not expired, the scheduler will discard all of the new requests. 28 | 29 | #### Solution 30 | 1. Create a new project. 31 | 2. Use an [itag](/apis/self.crawl/#itag) within `Handler.crawl_config` to specify the version of your script. 32 | 33 | How to Use WebDAV Mode? 34 | ----------------------- 35 | Mount `http://hostname/dav/` to your filesystem, then edit or create scripts with your favourite editor. 36 | 37 | > OSX: `mount_webdav http://hostname/dav/ /Volumes/dav` 38 | > Linux: Install davfs2, `mount.davfs http://hostname/dav/ /mnt/dav` 39 | > VIM: `vim http://hostname/dav/script_name.py` 40 | 41 | When you are editing a script outside the WebUI, you need to switch it to `WebDAV Mode` while debugging. After you save the script in your editor, the WebUI can load and use the latest script to debug your code. 42 | 43 | What does the progress bar mean on the dashboard? 44 | ------------------------------------------------- 45 | When you move the mouse onto the progress bar, you can see the explanations. 46 | 47 | For 5m, 1h and 1d, the numbers are the events triggered in the last 5 minutes, 1 hour and 1 day. For the all progress bar, they are the total number of tasks in the corresponding status. 48 | 49 | Only the tasks in DEBUG/RUNNING status will show the progress. 50 | 51 | How many scheduler/fetcher/processor/result_worker do I need? or pyspider stops working 52 | -------------------------------------------------------------------------------------- 53 | You can have only one scheduler, and multiple fetcher/processor/result_worker instances depending on the bottleneck.
You can use the queue status on dashboard to view the bottleneck of the system: 54 | 55 | ![run one step](imgs/queue_status.png) 56 | 57 | For example, the number between scheduler and fetcher indicate the queue size of scheduler to fetchers, when it's hitting 100 (default maximum queue size), fetcher might crashed, or you should considered adding more fetchers. 58 | 59 | The number `0+0` below fetcher indicate the queue size of new tasks and status packs between processors and schduler. You can put your mouse over the numbers to see the tips. -------------------------------------------------------------------------------- /docs/Quickstart.md: -------------------------------------------------------------------------------- 1 | Quickstart 2 | ========== 3 | 4 | Installation 5 | ------------ 6 | 7 | * `pip install pyspider` 8 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 9 | 10 | if you are using ubuntu, try: 11 | ``` 12 | apt-get install python python-dev python-distribute python-pip \ 13 | libcurl4-openssl-dev libxml2-dev libxslt1-dev python-lxml \ 14 | libssl-dev zlib1g-dev 15 | ``` 16 | to install binary packages first. 17 | 18 | 19 | please install PhantomJS if needed: http://phantomjs.org/build.html 20 | 21 | note that PhantomJS will be enabled only if it is excutable in the `PATH` or in the System Environment 22 | 23 | **Note:** `pyspider` command is running pyspider in `all` mode, which running components in threads or subprocesses. For production environment, please refer to [Deployment](Deployment). 24 | 25 | **WARNING:** WebUI is opened to public by default, it can be used to execute any command which may harm to you system. Please use it in internal network or [enable `need-auth` for webui](http://docs.pyspider.org/en/latest/Command-Line/#-config). 26 | 27 | Your First Script 28 | ----------------- 29 | 30 | ```python 31 | from pyspider.libs.base_handler import * 32 | 33 | 34 | class Handler(BaseHandler): 35 | crawl_config = { 36 | } 37 | 38 | @every(minutes=24 * 60) 39 | def on_start(self): 40 | self.crawl('http://scrapy.org/', callback=self.index_page) 41 | 42 | @config(age=10 * 24 * 60 * 60) 43 | def index_page(self, response): 44 | for each in response.doc('a[href^="http"]').items(): 45 | self.crawl(each.attr.href, callback=self.detail_page) 46 | 47 | @config(priority=2) 48 | def detail_page(self, response): 49 | return { 50 | "url": response.url, 51 | "title": response.doc('title').text(), 52 | } 53 | ``` 54 | 55 | > * `def on_start(self)` is the entry point of the script. It will be called when you click the `run` button on dashboard. 56 | > * [`self.crawl(url, callback=self.index_page)`*](/apis/self.crawl) is the most important API here. It will add a new task to be crawled. Most of the options will be spicified via `self.crawl` arguments. 57 | > * `def index_page(self, response)` get a [`Response`*](/apis/Response) object. [`response.doc`*](/apis/Response/#responsedoc) is a [pyquery](https://pythonhosted.org/pyquery/) object which has jQuery-like API to select elements to be extracted. 58 | > * `def detail_page(self, response)` return a `dict` object as result. The result will be captured into `resultdb` by default. You can override `on_result(self, result)` method to manage the result yourself. 59 | 60 | 61 | More things you may want to know: 62 | 63 | > * [`@every(minutes=24*60, seconds=0)`*](/apis/@every/) is a helper to tell the scheduler that `on_start` method should be called everyday. 
64 | > * [`@config(age=10 * 24 * 60 * 60)`*](/apis/self.crawl/#configkwargs) specified the default `age` parameter of `self.crawl` with page type `index_page` (when `callback=self.index_page`). The parameter [`age`*](/apis/self.crawl/#age) can be specified via `self.crawl(url, age=10*24*60*60)` (highest priority) and `crawl_config` (lowest priority). 65 | > * [`age=10 * 24 * 60 * 60`*](/apis/self.crawl/#age) tell scheduler discard the request if it have been crawled in 10 days. pyspider will not crawl a same URL twice by default (discard forever), even you had modified the code, it's very common for beginners that runs the project the first time and modified it and run it the second time, it will not crawl again (read [`itag`](/apis/self.crawl/#itag) for solution) 66 | > * [`@config(priority=2)`*](/apis/self.crawl/#schedule) mark that detail pages should be crawled first. 67 | 68 | You can test your script step by step by click the green `run` button. Switch to `follows` panel, click the play button to move on. 69 | 70 | ![run one step](imgs/run_one_step.png) 71 | 72 | Start Running 73 | ------------- 74 | 75 | 1. Save your script. 76 | 2. Back to dashboard find your project. 77 | 3. Changing the `status` to `DEBUG` or `RUNNING`. 78 | 4. Click the `run` button. 79 | 80 | ![index demo](imgs/index_page.png) 81 | 82 | Your script is running now! 83 | -------------------------------------------------------------------------------- /docs/Running-pyspider-with-Docker.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | # mysql 3 | docker run --name mysql -d -v /data/mysql:/var/lib/mysql -e MYSQL_ALLOW_EMPTY_PASSWORD=yes mysql:latest 4 | # rabbitmq 5 | docker run --name rabbitmq -d rabbitmq:latest 6 | 7 | # phantomjs 8 | docker run --name phantomjs -d binux/pyspider:latest phantomjs 9 | 10 | # result worker 11 | docker run --name result_worker -m 128m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest result_worker 12 | # processor, run multiple instance if needed. 13 | docker run --name processor -m 256m -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest processor 14 | # fetcher, run multiple instance if needed. 15 | docker run --name fetcher -m 256m -d --link phantomjs:phantomjs --link rabbitmq:rabbitmq binux/pyspider:latest fetcher --no-xmlrpc 16 | # scheduler 17 | docker run --name scheduler -d --link mysql:mysql --link rabbitmq:rabbitmq binux/pyspider:latest scheduler 18 | # webui 19 | docker run --name webui -m 256m -d -p 5000:5000 --link mysql:mysql --link rabbitmq:rabbitmq --link scheduler:scheduler --link phantomjs:phantomjs binux/pyspider:latest webui 20 | ``` 21 | 22 | or running with [Docker Compose](https://docs.docker.com/compose/) with `docker-compose.yml`: 23 | 24 | NOTE: It's recommended to run mysql and rabbitmq outside compose as they may not been restarted with pyspider. You can find commands to start mysql and rabbitmq service above. 
25 | 26 | ``` 27 | phantomjs: 28 | image: binux/pyspider:latest 29 | command: phantomjs 30 | result: 31 | image: binux/pyspider:latest 32 | external_links: 33 | - mysql 34 | - rabbitmq 35 | command: result_worker 36 | processor: 37 | image: binux/pyspider:latest 38 | external_links: 39 | - mysql 40 | - rabbitmq 41 | command: processor 42 | fetcher: 43 | image: binux/pyspider:latest 44 | external_links: 45 | - rabbitmq 46 | links: 47 | - phantomjs 48 | command : fetcher 49 | scheduler: 50 | image: binux/pyspider:latest 51 | external_links: 52 | - mysql 53 | - rabbitmq 54 | command: scheduler 55 | webui: 56 | image: binux/pyspider:latest 57 | external_links: 58 | - mysql 59 | - rabbitmq 60 | links: 61 | - scheduler 62 | - phantomjs 63 | command: webui 64 | ports: 65 | - "5000:5000" 66 | ``` 67 | 68 | `docker-compose up` 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/Script-Environment.md: -------------------------------------------------------------------------------- 1 | Script Environment 2 | ================== 3 | 4 | Variables 5 | --------- 6 | * `self.project_name` 7 | * `self.project` information about current project 8 | * `self.response` 9 | * `self.task` 10 | 11 | About Script 12 | ------------ 13 | * The name of `Handler` is not matters, but you need at least one class inherit from `BaseHandler` 14 | * A third parameter can be set to get task object: `def callback(self, response, task)` 15 | * Non-200 response will not submit to callback by default. Use `@catch_status_code_error` 16 | 17 | About Environment 18 | ----------------- 19 | * `logging`, `print` and exceptions will be captured. 20 | * You can import other projects as module with `from projects import some_project` 21 | 22 | ### Web view 23 | 24 | * view the page as a browser would render (approximately) 25 | 26 | ### HTML view 27 | 28 | * view the HTML of the current callback (index_page, detail_page, etc.) 29 | 30 | ### Follows view 31 | 32 | * view the callbacks that can be made from the current callback 33 | * index_page follows view will show the detail_page callbacks that can be executed. 34 | 35 | ### Messages view 36 | 37 | * shows the messages send by [`self.send_message`](apis/self.send_message) API. 38 | 39 | ### Enable CSS Selector Helper 40 | 41 | * Enable a CSS Selector Helper of the Web view. It gets the CSS Selector of the element you clicked then add it to your script. 42 | -------------------------------------------------------------------------------- /docs/Working-with-Results.md: -------------------------------------------------------------------------------- 1 | Working with Results 2 | ==================== 3 | Downloading and viewing your data from WebUI is convenient, but may not suitable for computer. 4 | 5 | Working with ResultDB 6 | --------------------- 7 | Although resultdb is only designed for result preview, not suitable for large scale storage. But if you want to grab data from resultdb, there are some simple snippets using database API that can help you to connect and select the data. 8 | 9 | ``` 10 | from pyspider.database import connect_database 11 | resultdb = connect_database("") 12 | for project in resultdb.projects: 13 | for result in resultdb.select(project): 14 | assert result['taskid'] 15 | assert result['url'] 16 | assert result['result'] 17 | ``` 18 | 19 | The `result['result']` is the object submitted by `return` statement from your script. 
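For example, here is a minimal sketch that dumps every project's results to JSON-lines files using the snippet above. The connection URL and output filename pattern are assumptions (a local SQLite resultdb under `./data/`); replace the URL with the resultdb string from your own configuration.

```
import json
from pyspider.database import connect_database

# assumed connection URL -- point this at your own resultdb
resultdb = connect_database("sqlite+resultdb:///data/result.db")

for project in resultdb.projects:
    with open("%s_results.jsonl" % project, "w") as fp:
        for result in resultdb.select(project):
            # result['result'] is whatever your callback returned
            fp.write(json.dumps(result["result"]) + "\n")
```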
20 | 21 | Working with ResultWorker 22 | ------------------------- 23 | In a production environment, you may want to connect pyspider to your own system / post-processing pipeline rather than store results in resultdb. In that case it's highly recommended to override ResultWorker. 24 | 25 | ``` 26 | from pyspider.result import ResultWorker 27 | 28 | class MyResultWorker(ResultWorker): 29 |     def on_result(self, task, result): 30 |         assert task['taskid'] 31 |         assert task['project'] 32 |         assert task['url'] 33 |         assert result 34 |         # your processing code goes here 35 | ``` 36 | 37 | `result` is the object submitted by the `return` statement of your script. 38 | 39 | You can put this script (e.g. `my_result_worker.py`) in the folder where you launch pyspider, and pass it as an argument to the `result_worker` subcommand: 40 | 41 | `pyspider result_worker --result-cls=my_result_worker.MyResultWorker` 42 | 43 | Or 44 | 45 | ``` 46 | { 47 |   ... 48 |   "result_worker": { 49 |     "result_cls": "my_result_worker.MyResultWorker" 50 |   } 51 |   ... 52 | } 53 | ``` 54 | 55 | if you are using a config file. [Please refer to Deployment](/Deployment) 56 | 57 | Design Your Own Database Schema 58 | ------------------------------- 59 | The results stored in the database are encoded as JSON for compatibility. It's highly recommended to design your own database schema and override the ResultWorker described above. 60 | 61 | TIPS about Results 62 | ------------------- 63 | #### Want to return more than one result in a callback? 64 | As resultdb de-duplicates results by taskid (url), later results overwrite previous ones. 65 | 66 | One workaround is using the `send_message` API to make a `fake` taskid for each result. 67 | 68 | ``` 69 | def detail_page(self, response): 70 |     for li in response.doc('li').items(): 71 |         self.send_message(self.project_name, { 72 |             ... 73 |         }, url=response.url+"#"+li('a.product-sku').text()) 74 | 75 | def on_message(self, project, msg): 76 |     return msg 77 | ``` 78 | 79 | See Also: [apis/self.send_message](/apis/self.send_message) 80 | -------------------------------------------------------------------------------- /docs/apis/@catch_status_code_error.md: -------------------------------------------------------------------------------- 1 | @catch_status_code_error 2 | ======================== 3 | 4 | A non-200 response is regarded as a failed fetch and will not be passed to the callback. Use this decorator to override that behaviour. 5 | 6 | ```python 7 | def on_start(self): 8 |     self.crawl('http://httpbin.org/status/404', self.callback) 9 | 10 | @catch_status_code_error 11 | def callback(self, response): 12 |     ... 13 | ``` 14 | 15 | > Normally the `callback` would not be executed because the request failed (with status code 404). With the `@catch_status_code_error` decorator, the `callback` is executed even though the request failed. 16 | 17 | -------------------------------------------------------------------------------- /docs/apis/@every.md: -------------------------------------------------------------------------------- 1 | @every(minutes=0, seconds=0) 2 | ============================ 3 | 4 | The decorated method will be called every `minutes` or `seconds`. 5 | 6 | 7 | ```python 8 | @every(minutes=24 * 60) 9 | def on_start(self): 10 |     for url in urllist: 11 |         self.crawl(url, callback=self.index_page) 12 | ``` 13 | 14 | The URLs would be restarted every 24 hours.
Note that if `age` is also used and its period is longer than that of `@every`, the crawl request will be discarded because it is regarded as not changed: 15 | 16 | ```python 17 | @every(minutes=24 * 60) 18 | def on_start(self): 19 |     self.crawl('http://www.example.org/', callback=self.index_page) 20 | 21 | @config(age=10 * 24 * 60 * 60) 22 | def index_page(self): 23 |     ... 24 | ``` 25 | 26 | > Even though the crawl request is triggered every day, it is discarded and only restarted every 10 days. 27 | 28 | -------------------------------------------------------------------------------- /docs/apis/Response.md: -------------------------------------------------------------------------------- 1 | Response 2 | ======== 3 | 4 | The attributes of the Response object. 5 | 6 | ### Response.url 7 | 8 | The final URL. 9 | 10 | ### Response.text 11 | 12 | Content of the response, in unicode. 13 | 14 | If `Response.encoding` is None and the `chardet` module is available, the encoding of the content will be guessed. 15 | 16 | ### Response.content 17 | 18 | Content of the response, in bytes. 19 | 20 | ### Response.doc 21 | 22 | A [PyQuery](https://pythonhosted.org/pyquery/) object of the response's content. Links are made absolute by default. 23 | 24 | Refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) 25 | 26 | It's important, so I will repeat it: refer to the documentation of PyQuery: [https://pythonhosted.org/pyquery/](https://pythonhosted.org/pyquery/) 27 | 28 | ### Response.etree 29 | 30 | A [lxml](http://lxml.de/) object of the response's content. 31 | 32 | ### Response.json 33 | 34 | The JSON-encoded content of the response, if any. 35 | 36 | ### Response.status_code 37 | 38 | ### Response.orig_url 39 | 40 | If there was any redirection during the request, this is the URL you originally submitted via `self.crawl`. 41 | 42 | ### Response.headers 43 | 44 | A case-insensitive dict holding the headers of the response. 45 | 46 | ### Response.cookies 47 | 48 | ### Response.error 49 | 50 | The error message when the fetch failed. 51 | 52 | ### Response.time 53 | 54 | Time used during fetching. 55 | 56 | ### Response.ok 57 | 58 | True if `status_code` is 200 and there is no error. 59 | 60 | ### Response.encoding 61 | 62 | Encoding of Response.content. 63 | 64 | If Response.encoding is None, the encoding will be guessed from the headers or content, or by `chardet` (if available). 65 | 66 | Setting the encoding manually overrides the guessed encoding. 67 | 68 | ### Response.save 69 | 70 | The object saved via the [`self.crawl`](/apis/self.crawl/#save) API. 71 | 72 | ### Response.js_script_result 73 | 74 | Content returned by the JS script. 75 | 76 | ### Response.raise_for_status() 77 | 78 | Raise HTTPError if the status code is not 200 or `Response.error` exists. 79 | 80 | -------------------------------------------------------------------------------- /docs/apis/index.md: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | - [self.crawl](self.crawl) 5 | - [Response](Response) 6 | - [self.send_message](self.send_message) 7 | - [@every](@every) 8 | - [@catch_status_code_error](@catch_status_code_error) 9 | -------------------------------------------------------------------------------- /docs/apis/self.send_message.md: -------------------------------------------------------------------------------- 1 | self.send_message 2 | ================= 3 | 4 | self.send_message(project, msg, [url]) 5 | -------------------------------------- 6 | send messages to another project.
can been received by `def on_message(self, project, message)` callback. 7 | 8 | - `project` - other project name 9 | - `msg` - any json-able object 10 | - `url` - result will been overwrite if have same `taskid`. `send_message` share a same `taskid` by default. Change this to return multiple result by one response. 11 | 12 | ```python 13 | def detail_page(self, response): 14 | for i, each in enumerate(response.json['products']): 15 | self.send_message(self.project_name, { 16 | "name": each['name'], 17 | 'price': each['prices'], 18 | }, url="%s#%s" % (response.url, i)) 19 | 20 | def on_message(self, project, msg): 21 | return msg 22 | ``` 23 | 24 | pyspider send_message [OPTIONS] PROJECT MESSAGE 25 | ----------------------------------------------- 26 | 27 | You can also send message from command line. 28 | 29 | ``` 30 | Usage: pyspider send_message [OPTIONS] PROJECT MESSAGE 31 | 32 | Send Message to project from command line 33 | 34 | Options: 35 | --scheduler-rpc TEXT xmlrpc path of scheduler 36 | --help Show this message and exit. 37 | ``` 38 | 39 | def on_message(self, project, message) 40 | -------------------------------------- 41 | receive message from other project 42 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-11-10 01:31:54 7 | 8 | import sys 9 | from unittest.mock import MagicMock 10 | from recommonmark.parser import CommonMarkParser 11 | 12 | class Mock(MagicMock): 13 | @classmethod 14 | def __getattr__(cls, name): 15 | return Mock() 16 | 17 | MOCK_MODULES = ['pycurl', 'lxml', 'psycopg2'] 18 | sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) 19 | 20 | source_parsers = { 21 | '.md': CommonMarkParser, 22 | } 23 | 24 | source_suffix = ['.rst', '.md'] 25 | -------------------------------------------------------------------------------- /docs/imgs/creating_a_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/creating_a_project.png -------------------------------------------------------------------------------- /docs/imgs/css_selector_helper.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/css_selector_helper.png -------------------------------------------------------------------------------- /docs/imgs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/demo.png -------------------------------------------------------------------------------- /docs/imgs/developer-tools-network-filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/developer-tools-network-filter.png -------------------------------------------------------------------------------- /docs/imgs/developer-tools-network.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/developer-tools-network.png -------------------------------------------------------------------------------- /docs/imgs/index_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/index_page.png -------------------------------------------------------------------------------- /docs/imgs/inspect_element.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/inspect_element.png -------------------------------------------------------------------------------- /docs/imgs/pyspider-arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/pyspider-arch.png -------------------------------------------------------------------------------- /docs/imgs/request-headers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/request-headers.png -------------------------------------------------------------------------------- /docs/imgs/run_one_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/run_one_step.png -------------------------------------------------------------------------------- /docs/imgs/search-for-request.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/search-for-request.png -------------------------------------------------------------------------------- /docs/imgs/tutorial_imdb_front.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/tutorial_imdb_front.png -------------------------------------------------------------------------------- /docs/imgs/twitch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/docs/imgs/twitch.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | pyspider [![Build Status][Build Status]][Travis CI] [![Coverage Status][Coverage Status]][Coverage] [![Try][Try]][Demo] 2 | ======== 3 | 4 | A Powerful Spider(Web Crawler) System in Python. 
**[TRY IT NOW!][Demo]** 5 | 6 | - Write script in Python 7 | - Powerful WebUI with script editor, task monitor, project manager and result viewer 8 | - [MySQL](https://www.mysql.com/), [CouchDB](https://couchdb.apache.org), [MongoDB](https://www.mongodb.org/), [Redis](http://redis.io/), [SQLite](https://www.sqlite.org/), [Elasticsearch](https://www.elastic.co/products/elasticsearch); [PostgreSQL](http://www.postgresql.org/) with [SQLAlchemy](http://www.sqlalchemy.org/) as database backend 9 | - [RabbitMQ](http://www.rabbitmq.com/), [Redis](http://redis.io/) and [Kombu](http://kombu.readthedocs.org/) as message queue 10 | - Task priority, retry, periodical, recrawl by age, etc... 11 | - Distributed architecture, Crawl Javascript pages, Python 2&3, etc... 12 | 13 | Tutorial: [http://docs.pyspider.org/en/latest/tutorial/](http://docs.pyspider.org/en/latest/tutorial/) 14 | Documentation: [http://docs.pyspider.org/](http://docs.pyspider.org/) 15 | Release notes: [https://github.com/binux/pyspider/releases](https://github.com/binux/pyspider/releases) 16 | 17 | Sample Code 18 | ----------- 19 | 20 | ```python 21 | from pyspider.libs.base_handler import * 22 | 23 | 24 | class Handler(BaseHandler): 25 | crawl_config = { 26 | } 27 | 28 | @every(minutes=24 * 60) 29 | def on_start(self): 30 | self.crawl('http://scrapy.org/', callback=self.index_page) 31 | 32 | @config(age=10 * 24 * 60 * 60) 33 | def index_page(self, response): 34 | for each in response.doc('a[href^="http"]').items(): 35 | self.crawl(each.attr.href, callback=self.detail_page) 36 | 37 | def detail_page(self, response): 38 | return { 39 | "url": response.url, 40 | "title": response.doc('title').text(), 41 | } 42 | ``` 43 | 44 | [![Demo][Demo Img]][Demo] 45 | 46 | 47 | Installation 48 | ------------ 49 | 50 | * `pip install pyspider` 51 | * run command `pyspider`, visit [http://localhost:5000/](http://localhost:5000/) 52 | 53 | Quickstart: [http://docs.pyspider.org/en/latest/Quickstart/](http://docs.pyspider.org/en/latest/Quickstart/) 54 | 55 | Contribute 56 | ---------- 57 | 58 | * Use It 59 | * Open [Issue], send PR 60 | * [User Group] 61 | * [中文问答](http://segmentfault.com/t/pyspider) 62 | 63 | 64 | TODO 65 | ---- 66 | 67 | ### v0.4.0 68 | 69 | - [x] local mode, load script from file. 
70 | - [x] works as a framework (all components running in one process, no threads) 71 | - [x] redis 72 | - [x] shell mode like `scrapy shell` 73 | - [ ] a visual scraping interface like [portia](https://github.com/scrapinghub/portia) 74 | 75 | 76 | ### more 77 | 78 | - [x] edit script with vim via [WebDAV](http://en.wikipedia.org/wiki/WebDAV) 79 | 80 | 81 | License 82 | ------- 83 | Licensed under the Apache License, Version 2.0 84 | 85 | 86 | [Build Status]: https://img.shields.io/travis/binux/pyspider/master.svg?style=flat 87 | [Travis CI]: https://travis-ci.org/binux/pyspider 88 | [Coverage Status]: https://img.shields.io/coveralls/binux/pyspider.svg?branch=master&style=flat 89 | [Coverage]: https://coveralls.io/r/binux/pyspider 90 | [Try]: https://img.shields.io/badge/try-pyspider-blue.svg?style=flat 91 | [Demo]: http://demo.pyspider.org/ 92 | [Demo Img]: imgs/demo.png 93 | [Issue]: https://github.com/binux/pyspider/issues 94 | [User Group]: https://groups.google.com/group/pyspider-users 95 | -------------------------------------------------------------------------------- /docs/tutorial/Render-with-PhantomJS.md: -------------------------------------------------------------------------------- 1 | Level 3: Render with PhantomJS 2 | ============================== 3 | 4 | Sometimes web page is too complex to find out the API request. It's time to meet the power of [PhantomJS]. 5 | 6 | To use PhantomJS, you should have PhantomJS [installed](http://phantomjs.org/download.html). If you are running pyspider with `all` mode, PhantomJS is enabled if excutable in the `PATH`. 7 | 8 | Make sure phantomjs is working by running 9 | ``` 10 | $ pyspider phantomjs 11 | ``` 12 | 13 | Continue with the rest of the tutorial if the output is 14 | ``` 15 | Web server running on port 25555 16 | ``` 17 | 18 | Use PhantomJS 19 | ------------- 20 | 21 | When pyspider with PhantomJS connected, you can enable this feature by adding a parameter `fetch_type='js'` to `self.crawl`. We use PhantomJS to scrape channel list of [http://www.twitch.tv/directory/game/Dota%202](http://www.twitch.tv/directory/game/Dota%202) which is loaded with AJAX we discussed in [Level 2](tutorial/AJAX-and-more-HTTP#ajax): 22 | 23 | ``` 24 | class Handler(BaseHandler): 25 | def on_start(self): 26 | self.crawl('http://www.twitch.tv/directory/game/Dota%202', 27 | fetch_type='js', callback=self.index_page) 28 | 29 | def index_page(self, response): 30 | return { 31 | "url": response.url, 32 | "channels": [{ 33 | "title": x('.title').text(), 34 | "viewers": x('.info').contents()[2], 35 | "name": x('.info a').text(), 36 | } for x in response.doc('.stream.item').items()] 37 | } 38 | ``` 39 | > I used some API to handle the list of streams. You can find complete API reference from [PyQuery complete API](https://pythonhosted.org/pyquery/api.html) 40 | 41 | Running JavaScript on Page 42 | -------------------------- 43 | 44 | We will try to scrape images from [http://www.pinterest.com/categories/popular/](http://www.pinterest.com/categories/popular/) in this section. Only 25 images is shown at the beginning, more images would be loaded when you scroll to the bottom of the page. 
45 | 46 | To scrape images as many as posible we can use a [`js_script` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher) to set some function wrapped JavaScript codes to simulate the scroll action: 47 | 48 | ``` 49 | class Handler(BaseHandler): 50 | def on_start(self): 51 | self.crawl('http://www.pinterest.com/categories/popular/', 52 | fetch_type='js', js_script=""" 53 | function() { 54 | window.scrollTo(0,document.body.scrollHeight); 55 | } 56 | """, callback=self.index_page) 57 | 58 | def index_page(self, response): 59 | return { 60 | "url": response.url, 61 | "images": [{ 62 | "title": x('.richPinGridTitle').text(), 63 | "img": x('.pinImg').attr('src'), 64 | "author": x('.creditName').text(), 65 | } for x in response.doc('.item').items() if x('.pinImg')] 66 | } 67 | ``` 68 | 69 | > * Script would been executed after page loaded(can been changed via [`js_run_at` parameter](/apis/self.crawl/#enable-javascript-fetcher-need-support-by-fetcher)) 70 | > * We scroll once after page loaded, you can scroll multiple times using [`setTimeout`](https://developer.mozilla.org/en-US/docs/Web/API/WindowTimers.setTimeout). PhantomJS will fetch as many items as possible before timeout arrived. 71 | 72 | Online demo: [http://demo.pyspider.org/debug/tutorial_pinterest](http://demo.pyspider.org/debug/tutorial_pinterest) 73 | 74 | 75 | 76 | [PhantomJS]: http://phantomjs.org/ 77 | -------------------------------------------------------------------------------- /docs/tutorial/index.md: -------------------------------------------------------------------------------- 1 | pyspider Tutorial 2 | ================= 3 | 4 | > The best way to learn how to scrap is learning how to make it. 5 | 6 | * [Level 1: HTML and CSS Selector](HTML-and-CSS-Selector) 7 | * [Level 2: AJAX and More HTTP](AJAX-and-more-HTTP) 8 | * [Level 3: Render with PhantomJS](Render-with-PhantomJS) 9 | 10 | If you have problem using pyspider, [user group](https://groups.google.com/group/pyspider-users) is a place for discussing. 11 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: pyspider 2 | site_description: A Powerful Spider(Web Crawler) System in Python. 
3 | site_author: binux 4 | repo_url: https://github.com/binux/pyspider 5 | pages: 6 | - Introduction: index.md 7 | - Quickstart: Quickstart.md 8 | - Command Line: Command-Line.md 9 | - Tutorial: 10 | - Index: tutorial/index.md 11 | - 'Level 1: HTML and CSS Selector': tutorial/HTML-and-CSS-Selector.md 12 | - 'Level 2: AJAX and More HTTP': tutorial/AJAX-and-more-HTTP.md 13 | - 'Level 3: Render with PhantomJS': tutorial/Render-with-PhantomJS.md 14 | - About pyspider: 15 | - Architecture: Architecture.md 16 | - About Tasks: About-Tasks.md 17 | - About Projects: About-Projects.md 18 | - Script Environment: Script-Environment.md 19 | - Working with Results: Working-with-Results.md 20 | - API Reference: 21 | - Index: apis/index.md 22 | - self.crawl: apis/self.crawl.md 23 | - Response: apis/Response.md 24 | - self.send_message: apis/self.send_message.md 25 | - '@catch_status_code_error': apis/@catch_status_code_error.md 26 | - '@every': apis/@every.md 27 | - Deployment: Deployment.md 28 | - Running pyspider with Docker: Running-pyspider-with-Docker.md 29 | - Deployment of demo.pyspider.org: Deployment-demo.pyspider.org.md 30 | - Frequently Asked Questions: Frequently-Asked-Questions.md 31 | 32 | theme: readthedocs 33 | markdown_extensions: ['toc(permalink=true)', ] 34 | -------------------------------------------------------------------------------- /pyspider/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-17 19:17:12 7 | 8 | __version__ = '0.4.0' 9 | -------------------------------------------------------------------------------- /pyspider/database/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/base/__init__.py -------------------------------------------------------------------------------- /pyspider/database/base/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 11:28:52 7 | 8 | import re 9 | 10 | # NOTE: When get/get_all/check_update from database with default fields, 11 | # all following fields should be included in output dict. 
12 | { 13 | 'project': { 14 | 'name': str, 15 | 'group': str, 16 | 'status': str, 17 | 'script': str, 18 | # 'config': str, 19 | 'comments': str, 20 | # 'priority': int, 21 | 'rate': int, 22 | 'burst': int, 23 | 'updatetime': int, 24 | } 25 | } 26 | 27 | 28 | class ProjectDB(object): 29 | status_str = [ 30 | 'TODO', 31 | 'STOP', 32 | 'CHECKING', 33 | 'DEBUG', 34 | 'RUNNING', 35 | ] 36 | 37 | def insert(self, name, obj={}): 38 | raise NotImplementedError 39 | 40 | def update(self, name, obj={}, **kwargs): 41 | raise NotImplementedError 42 | 43 | def get_all(self, fields=None): 44 | raise NotImplementedError 45 | 46 | def get(self, name, fields): 47 | raise NotImplementedError 48 | 49 | def drop(self, name): 50 | raise NotImplementedError 51 | 52 | def check_update(self, timestamp, fields=None): 53 | raise NotImplementedError 54 | 55 | def split_group(self, group, lower=True): 56 | if lower: 57 | return re.split("\W+", (group or '').lower()) 58 | else: 59 | return re.split("\W+", group or '') 60 | 61 | def verify_project_name(self, name): 62 | if len(name) > 64: 63 | return False 64 | if re.search(r"[^\w]", name): 65 | return False 66 | return True 67 | 68 | def copy(self): 69 | ''' 70 | database should be able to copy itself to create new connection 71 | 72 | it's implemented automatically by pyspider.database.connect_database 73 | if you are not create database connection via connect_database method, 74 | you should implement this 75 | ''' 76 | raise NotImplementedError 77 | -------------------------------------------------------------------------------- /pyspider/database/base/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-11 18:40:03 7 | 8 | # result schema 9 | { 10 | 'result': { 11 | 'taskid': str, # new, not changeable 12 | 'project': str, # new, not changeable 13 | 'url': str, # new, not changeable 14 | 'result': str, # json string 15 | 'updatetime': int, 16 | } 17 | } 18 | 19 | 20 | class ResultDB(object): 21 | """ 22 | database for result 23 | """ 24 | projects = set() # projects in resultdb 25 | 26 | def save(self, project, taskid, url, result): 27 | raise NotImplementedError 28 | 29 | def select(self, project, fields=None, offset=0, limit=None): 30 | raise NotImplementedError 31 | 32 | def count(self, project): 33 | raise NotImplementedError 34 | 35 | def get(self, project, taskid, fields=None): 36 | raise NotImplementedError 37 | 38 | def drop(self, project): 39 | raise NotImplementedError 40 | 41 | def copy(self): 42 | ''' 43 | database should be able to copy itself to create new connection 44 | 45 | it's implemented automatically by pyspider.database.connect_database 46 | if you are not create database connection via connect_database method, 47 | you should implement this 48 | ''' 49 | raise NotImplementedError 50 | -------------------------------------------------------------------------------- /pyspider/database/base/taskdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-08 10:28:48 7 | 8 | # task schema 9 | { 10 | 'task': { 11 | 'taskid': str, # new, not change 12 | 'project': str, # new, not change 13 | 'url': str, # new, not change 14 | 
'status': int, # change 15 | 'schedule': { 16 | 'priority': int, 17 | 'retries': int, 18 | 'retried': int, 19 | 'exetime': int, 20 | 'age': int, 21 | 'itag': str, 22 | # 'recrawl': int 23 | }, # new and restart 24 | 'fetch': { 25 | 'method': str, 26 | 'headers': dict, 27 | 'data': str, 28 | 'timeout': int, 29 | 'save': dict, 30 | }, # new and restart 31 | 'process': { 32 | 'callback': str, 33 | }, # new and restart 34 | 'track': { 35 | 'fetch': { 36 | 'ok': bool, 37 | 'time': int, 38 | 'status_code': int, 39 | 'headers': dict, 40 | 'encoding': str, 41 | 'content': str, 42 | }, 43 | 'process': { 44 | 'ok': bool, 45 | 'time': int, 46 | 'follows': int, 47 | 'outputs': int, 48 | 'logs': str, 49 | 'exception': str, 50 | }, 51 | 'save': object, # jsonable object saved by processor 52 | }, # finish 53 | 'lastcrawltime': int, # keep between request 54 | 'updatetime': int, # keep between request 55 | } 56 | } 57 | 58 | 59 | class TaskDB(object): 60 | ACTIVE = 1 61 | SUCCESS = 2 62 | FAILED = 3 63 | BAD = 4 64 | 65 | projects = set() # projects in taskdb 66 | 67 | def load_tasks(self, status, project=None, fields=None): 68 | raise NotImplementedError 69 | 70 | def get_task(self, project, taskid, fields=None): 71 | raise NotImplementedError 72 | 73 | def status_count(self, project): 74 | ''' 75 | return a dict 76 | ''' 77 | raise NotImplementedError 78 | 79 | def insert(self, project, taskid, obj={}): 80 | raise NotImplementedError 81 | 82 | def update(self, project, taskid, obj={}, **kwargs): 83 | raise NotImplementedError 84 | 85 | def drop(self, project): 86 | raise NotImplementedError 87 | 88 | @staticmethod 89 | def status_to_string(status): 90 | return { 91 | 1: 'ACTIVE', 92 | 2: 'SUCCESS', 93 | 3: 'FAILED', 94 | 4: 'BAD', 95 | }.get(status, 'UNKNOWN') 96 | 97 | @staticmethod 98 | def status_to_int(status): 99 | return { 100 | 'ACTIVE': 1, 101 | 'SUCCESS': 2, 102 | 'FAILED': 3, 103 | 'BAD': 4, 104 | }.get(status, 4) 105 | 106 | def copy(self): 107 | ''' 108 | database should be able to copy itself to create new connection 109 | 110 | it's implemented automatically by pyspider.database.connect_database 111 | if you are not create database connection via connect_database method, 112 | you should implement this 113 | ''' 114 | raise NotImplementedError 115 | -------------------------------------------------------------------------------- /pyspider/database/couchdb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/couchdb/__init__.py -------------------------------------------------------------------------------- /pyspider/database/couchdb/couchdbbase.py: -------------------------------------------------------------------------------- 1 | import time, requests, json 2 | from requests.auth import HTTPBasicAuth 3 | 4 | class SplitTableMixin(object): 5 | UPDATE_PROJECTS_TIME = 10 * 60 6 | 7 | def __init__(self): 8 | self.session = requests.session() 9 | if self.username: 10 | self.session.auth = HTTPBasicAuth(self.username, self.password) 11 | self.session.headers.update({'Content-Type': 'application/json'}) 12 | 13 | def _collection_name(self, project): 14 | if self.collection_prefix: 15 | return "%s_%s" % (self.collection_prefix, project) 16 | else: 17 | return project 18 | 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME: 23 | self._list_project() 24 | return 
self._projects 25 | 26 | 27 | @projects.setter 28 | def projects(self, value): 29 | self._projects = value 30 | 31 | 32 | def _list_project(self): 33 | self._last_update_projects = time.time() 34 | self.projects = set() 35 | if self.collection_prefix: 36 | prefix = "%s." % self.collection_prefix 37 | else: 38 | prefix = '' 39 | 40 | url = self.base_url + "_all_dbs" 41 | res = self.session.get(url, json={}).json() 42 | for each in res: 43 | if each.startswith('_'): 44 | continue 45 | if each.startswith(self.database): 46 | self.projects.add(each[len(self.database)+1+len(prefix):]) 47 | 48 | 49 | def create_database(self, name): 50 | url = self.base_url + name 51 | res = self.session.put(url).json() 52 | if 'error' in res and res['error'] == 'unauthorized': 53 | raise Exception("Supplied credentials are incorrect. Reason: {} for User: {} Password: {}".format(res['reason'], self.username, self.password)) 54 | return res 55 | 56 | 57 | def get_doc(self, db_name, doc_id): 58 | url = self.base_url + db_name + "/" + doc_id 59 | res = self.session.get(url).json() 60 | if "error" in res and res["error"] == "not_found": 61 | return None 62 | return res 63 | 64 | 65 | def get_docs(self, db_name, selector): 66 | url = self.base_url + db_name + "/_find" 67 | selector['use_index'] = self.index 68 | res = self.session.post(url, json=selector).json() 69 | if 'error' in res and res['error'] == 'not_found': 70 | return [] 71 | return res['docs'] 72 | 73 | 74 | def get_all_docs(self, db_name): 75 | return self.get_docs(db_name, {"selector": {}}) 76 | 77 | 78 | def insert_doc(self, db_name, doc_id, doc): 79 | url = self.base_url + db_name + "/" + doc_id 80 | return self.session.put(url, json=doc).json() 81 | 82 | 83 | def update_doc(self, db_name, doc_id, new_doc): 84 | doc = self.get_doc(db_name, doc_id) 85 | if doc is None: 86 | return self.insert_doc(db_name, doc_id, new_doc) 87 | for key in new_doc: 88 | doc[key] = new_doc[key] 89 | url = self.base_url + db_name + "/" + doc_id 90 | return self.session.put(url, json=doc).json() 91 | 92 | 93 | def delete(self, url): 94 | return self.session.delete(url).json() 95 | 96 | -------------------------------------------------------------------------------- /pyspider/database/couchdb/resultdb.py: -------------------------------------------------------------------------------- 1 | import time, json 2 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 3 | from .couchdbbase import SplitTableMixin 4 | 5 | 6 | class ResultDB(SplitTableMixin, BaseResultDB): 7 | collection_prefix = '' 8 | 9 | def __init__(self, url, database='resultdb', username=None, password=None): 10 | self.username = username 11 | self.password = password 12 | self.base_url = url 13 | self.url = url + database + "/" 14 | self.database = database 15 | 16 | super().__init__() 17 | self.create_database(database) 18 | self.index = None 19 | 20 | def _get_collection_name(self, project): 21 | return self.database + "_" + self._collection_name(project) 22 | 23 | def _create_project(self, project): 24 | collection_name = self._get_collection_name(project) 25 | self.create_database(collection_name) 26 | # create index 27 | payload = { 28 | 'index': { 29 | 'fields': ['taskid'] 30 | }, 31 | 'name': collection_name 32 | } 33 | 34 | res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json() 35 | self.index = res['id'] 36 | self._list_project() 37 | 38 | def save(self, project, taskid, url, result): 39 | if project not in self.projects: 40 | 
self._create_project(project) 41 | collection_name = self._get_collection_name(project) 42 | obj = { 43 | 'taskid': taskid, 44 | 'url': url, 45 | 'result': result, 46 | 'updatetime': time.time(), 47 | } 48 | return self.update_doc(collection_name, taskid, obj) 49 | 50 | def select(self, project, fields=None, offset=0, limit=0): 51 | if project not in self.projects: 52 | self._list_project() 53 | if project not in self.projects: 54 | return 55 | offset = offset or 0 56 | limit = limit or 0 57 | collection_name = self._get_collection_name(project) 58 | if fields is None: 59 | fields = [] 60 | if limit == 0: 61 | sel = { 62 | 'selector': {}, 63 | 'fields': fields, 64 | 'skip': offset 65 | } 66 | else: 67 | sel = { 68 | 'selector': {}, 69 | 'fields': fields, 70 | 'skip': offset, 71 | 'limit': limit 72 | } 73 | for result in self.get_docs(collection_name, sel): 74 | yield result 75 | 76 | def count(self, project): 77 | if project not in self.projects: 78 | self._list_project() 79 | if project not in self.projects: 80 | return 81 | collection_name = self._get_collection_name(project) 82 | return len(self.get_all_docs(collection_name)) 83 | 84 | def get(self, project, taskid, fields=None): 85 | if project not in self.projects: 86 | self._list_project() 87 | if project not in self.projects: 88 | return 89 | collection_name = self._get_collection_name(project) 90 | if fields is None: 91 | fields = [] 92 | sel = { 93 | 'selector': {'taskid': taskid}, 94 | 'fields': fields 95 | } 96 | ret = self.get_docs(collection_name, sel) 97 | if len(ret) == 0: 98 | return None 99 | return ret[0] 100 | 101 | def drop_database(self): 102 | return self.delete(self.url) 103 | 104 | def drop(self, project): 105 | # drop the project 106 | collection_name = self._get_collection_name(project) 107 | url = self.base_url + collection_name 108 | return self.delete(url) -------------------------------------------------------------------------------- /pyspider/database/couchdb/taskdb.py: -------------------------------------------------------------------------------- 1 | import json, time 2 | from pyspider.database.base.taskdb import TaskDB as BaseTaskDB 3 | from .couchdbbase import SplitTableMixin 4 | 5 | 6 | class TaskDB(SplitTableMixin, BaseTaskDB): 7 | collection_prefix = '' 8 | 9 | def __init__(self, url, database='taskdb', username=None, password=None): 10 | self.username = username 11 | self.password = password 12 | self.base_url = url 13 | self.url = url + database + "/" 14 | self.database = database 15 | self.index = None 16 | 17 | super().__init__() 18 | 19 | self.create_database(database) 20 | self.projects = set() 21 | self._list_project() 22 | 23 | def _get_collection_name(self, project): 24 | return self.database + "_" + self._collection_name(project) 25 | 26 | def _create_project(self, project): 27 | collection_name = self._get_collection_name(project) 28 | self.create_database(collection_name) 29 | # create index 30 | payload = { 31 | 'index': { 32 | 'fields': ['status', 'taskid'] 33 | }, 34 | 'name': collection_name 35 | } 36 | res = self.session.post(self.base_url + collection_name + "/_index", json=payload).json() 37 | self.index = res['id'] 38 | self._list_project() 39 | 40 | def load_tasks(self, status, project=None, fields=None): 41 | if not project: 42 | self._list_project() 43 | if fields is None: 44 | fields = [] 45 | if project: 46 | projects = [project, ] 47 | else: 48 | projects = self.projects 49 | for project in projects: 50 | collection_name = self._get_collection_name(project) 51 | for task in 
self.get_docs(collection_name, {"selector": {"status": status}, "fields": fields}): 52 | yield task 53 | 54 | def get_task(self, project, taskid, fields=None): 55 | if project not in self.projects: 56 | self._list_project() 57 | if project not in self.projects: 58 | return 59 | if fields is None: 60 | fields = [] 61 | collection_name = self._get_collection_name(project) 62 | ret = self.get_docs(collection_name, {"selector": {"taskid": taskid}, "fields": fields}) 63 | if len(ret) == 0: 64 | return None 65 | return ret[0] 66 | 67 | def status_count(self, project): 68 | if project not in self.projects: 69 | self._list_project() 70 | if project not in self.projects: 71 | return {} 72 | collection_name = self._get_collection_name(project) 73 | 74 | def _count_for_status(collection_name, status): 75 | total = len(self.get_docs(collection_name, {"selector": {'status': status}})) 76 | return {'total': total, "_id": status} if total else None 77 | 78 | c = collection_name 79 | ret = filter(lambda x: x,map(lambda s: _count_for_status(c, s), [self.ACTIVE, self.SUCCESS, self.FAILED])) 80 | 81 | result = {} 82 | if isinstance(ret, dict): 83 | ret = ret.get('result', []) 84 | for each in ret: 85 | result[each['_id']] = each['total'] 86 | return result 87 | 88 | def insert(self, project, taskid, obj={}): 89 | if project not in self.projects: 90 | self._create_project(project) 91 | obj = dict(obj) 92 | obj['taskid'] = taskid 93 | obj['project'] = project 94 | obj['updatetime'] = time.time() 95 | return self.update(project, taskid, obj=obj) 96 | 97 | def update(self, project, taskid, obj={}, **kwargs): 98 | obj = dict(obj) 99 | obj.update(kwargs) 100 | obj['updatetime'] = time.time() 101 | collection_name = self._get_collection_name(project) 102 | return self.update_doc(collection_name, taskid, obj) 103 | 104 | def drop_database(self): 105 | return self.delete(self.url) 106 | 107 | def drop(self, project): 108 | collection_name = self._get_collection_name(project) 109 | url = self.base_url + collection_name 110 | return self.delete(url) -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:31:58 7 | -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-17 18:32:33 7 | 8 | import time 9 | 10 | import elasticsearch.helpers 11 | from elasticsearch import Elasticsearch 12 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 13 | 14 | 15 | class ProjectDB(BaseProjectDB): 16 | __type__ = 'project' 17 | 18 | def __init__(self, hosts, index='pyspider'): 19 | self.index = index 20 | self.es = Elasticsearch(hosts=hosts) 21 | 22 | self.es.indices.create(index=self.index, ignore=400) 23 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 24 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 25 | "_all": {"enabled": False}, 26 | "properties": { 27 | "updatetime": {"type": "double"} 28 | } 
29 | }) 30 | 31 | def insert(self, name, obj={}): 32 | obj = dict(obj) 33 | obj['name'] = name 34 | obj['updatetime'] = time.time() 35 | 36 | obj.setdefault('group', '') 37 | obj.setdefault('status', 'TODO') 38 | obj.setdefault('script', '') 39 | obj.setdefault('comments', '') 40 | obj.setdefault('rate', 0) 41 | obj.setdefault('burst', 0) 42 | 43 | return self.es.index(index=self.index, doc_type=self.__type__, body=obj, id=name, 44 | refresh=True) 45 | 46 | def update(self, name, obj={}, **kwargs): 47 | obj = dict(obj) 48 | obj.update(kwargs) 49 | obj['updatetime'] = time.time() 50 | return self.es.update(index=self.index, doc_type=self.__type__, 51 | body={'doc': obj}, id=name, refresh=True, ignore=404) 52 | 53 | def get_all(self, fields=None): 54 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 55 | query={'query': {"match_all": {}}}, 56 | _source_include=fields or []): 57 | yield record['_source'] 58 | 59 | def get(self, name, fields=None): 60 | ret = self.es.get(index=self.index, doc_type=self.__type__, id=name, 61 | _source_include=fields or [], ignore=404) 62 | return ret.get('_source', None) 63 | 64 | def check_update(self, timestamp, fields=None): 65 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 66 | query={'query': {"range": { 67 | "updatetime": {"gte": timestamp} 68 | }}}, _source_include=fields or []): 69 | yield record['_source'] 70 | 71 | def drop(self, name): 72 | return self.es.delete(index=self.index, doc_type=self.__type__, id=name, refresh=True) 73 | -------------------------------------------------------------------------------- /pyspider/database/elasticsearch/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2016-01-18 19:41:24 7 | 8 | 9 | import time 10 | 11 | import elasticsearch.helpers 12 | from elasticsearch import Elasticsearch 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | 15 | 16 | class ResultDB(BaseResultDB): 17 | __type__ = 'result' 18 | 19 | def __init__(self, hosts, index='pyspider'): 20 | self.index = index 21 | self.es = Elasticsearch(hosts=hosts) 22 | 23 | self.es.indices.create(index=self.index, ignore=400) 24 | if not self.es.indices.get_mapping(index=self.index, doc_type=self.__type__): 25 | self.es.indices.put_mapping(index=self.index, doc_type=self.__type__, body={ 26 | "_all": {"enabled": True}, 27 | "properties": { 28 | "taskid": {"enabled": False}, 29 | "project": {"type": "string", "index": "not_analyzed"}, 30 | "url": {"enabled": False}, 31 | } 32 | }) 33 | 34 | @property 35 | def projects(self): 36 | ret = self.es.search(index=self.index, doc_type=self.__type__, 37 | body={"aggs": {"projects": { 38 | "terms": {"field": "project"} 39 | }}}, _source=False) 40 | return [each['key'] for each in ret['aggregations']['projects'].get('buckets', [])] 41 | 42 | def save(self, project, taskid, url, result): 43 | obj = { 44 | 'taskid': taskid, 45 | 'project': project, 46 | 'url': url, 47 | 'result': result, 48 | 'updatetime': time.time(), 49 | } 50 | return self.es.index(index=self.index, doc_type=self.__type__, 51 | body=obj, id='%s:%s' % (project, taskid)) 52 | 53 | def select(self, project, fields=None, offset=0, limit=0): 54 | offset = offset or 0 55 | limit = limit or 0 56 | if not limit: 57 | for record in 
elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 58 | query={'query': {'term': {'project': project}}}, 59 | _source_include=fields or [], from_=offset, 60 | sort="updatetime:desc"): 61 | yield record['_source'] 62 | else: 63 | for record in self.es.search(index=self.index, doc_type=self.__type__, 64 | body={'query': {'term': {'project': project}}}, 65 | _source_include=fields or [], from_=offset, size=limit, 66 | sort="updatetime:desc" 67 | ).get('hits', {}).get('hits', []): 68 | yield record['_source'] 69 | 70 | def count(self, project): 71 | return self.es.count(index=self.index, doc_type=self.__type__, 72 | body={'query': {'term': {'project': project}}} 73 | ).get('count', 0) 74 | 75 | def get(self, project, taskid, fields=None): 76 | ret = self.es.get(index=self.index, doc_type=self.__type__, id="%s:%s" % (project, taskid), 77 | _source_include=fields or [], ignore=404) 78 | return ret.get('_source', None) 79 | 80 | def drop(self, project): 81 | self.refresh() 82 | for record in elasticsearch.helpers.scan(self.es, index=self.index, doc_type=self.__type__, 83 | query={'query': {'term': {'project': project}}}, 84 | _source=False): 85 | self.es.delete(index=self.index, doc_type=self.__type__, id=record['_id']) 86 | 87 | def refresh(self): 88 | """ 89 | Explicitly refresh one or more index, making all operations 90 | performed since the last refresh available for search. 91 | """ 92 | self.es.indices.refresh(index=self.index) 93 | -------------------------------------------------------------------------------- /pyspider/database/local/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 20:56:50 7 | -------------------------------------------------------------------------------- /pyspider/database/local/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-17 12:32:17 7 | 8 | import os 9 | import re 10 | import six 11 | import glob 12 | import logging 13 | 14 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 15 | 16 | 17 | class ProjectDB(BaseProjectDB): 18 | """ProjectDB loading scripts from local file.""" 19 | 20 | def __init__(self, files): 21 | self.files = files 22 | self.projects = {} 23 | self.load_scripts() 24 | 25 | def load_scripts(self): 26 | project_names = set(self.projects.keys()) 27 | for path in self.files: 28 | for filename in glob.glob(path): 29 | name = os.path.splitext(os.path.basename(filename))[0] 30 | if name in project_names: 31 | project_names.remove(name) 32 | updatetime = os.path.getmtime(filename) 33 | if name not in self.projects or updatetime > self.projects[name]['updatetime']: 34 | project = self._build_project(filename) 35 | if not project: 36 | continue 37 | self.projects[project['name']] = project 38 | 39 | for name in project_names: 40 | del self.projects[name] 41 | 42 | rate_re = re.compile(r'^\s*#\s*rate.*?(\d+(\.\d+)?)', re.I | re.M) 43 | burst_re = re.compile(r'^\s*#\s*burst.*?(\d+(\.\d+)?)', re.I | re.M) 44 | 45 | def _build_project(self, filename): 46 | try: 47 | with open(filename) as fp: 48 | script = fp.read() 49 | m = self.rate_re.search(script) 50 | if m: 51 | 
rate = float(m.group(1)) 52 | else: 53 | rate = 1 54 | 55 | m = self.burst_re.search(script) 56 | if m: 57 | burst = float(m.group(1)) 58 | else: 59 | burst = 3 60 | 61 | return { 62 | 'name': os.path.splitext(os.path.basename(filename))[0], 63 | 'group': None, 64 | 'status': 'RUNNING', 65 | 'script': script, 66 | 'comments': None, 67 | 'rate': rate, 68 | 'burst': burst, 69 | 'updatetime': os.path.getmtime(filename), 70 | } 71 | except OSError as e: 72 | logging.error('loading project script error: %s', e) 73 | return None 74 | 75 | def get_all(self, fields=None): 76 | for projectname in self.projects: 77 | yield self.get(projectname, fields) 78 | 79 | def get(self, name, fields=None): 80 | if name not in self.projects: 81 | return None 82 | project = self.projects[name] 83 | result = {} 84 | for f in fields or project: 85 | if f in project: 86 | result[f] = project[f] 87 | else: 88 | result[f] = None 89 | return result 90 | 91 | def check_update(self, timestamp, fields=None): 92 | self.load_scripts() 93 | for projectname, project in six.iteritems(self.projects): 94 | if project['updatetime'] > timestamp: 95 | yield self.get(projectname, fields) 96 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/mongodb/__init__.py -------------------------------------------------------------------------------- /pyspider/database/mongodb/mongodbbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:42:01 7 | 8 | import time 9 | 10 | 11 | class SplitTableMixin(object): 12 | UPDATE_PROJECTS_TIME = 10 * 60 13 | 14 | def _collection_name(self, project): 15 | if self.collection_prefix: 16 | return "%s.%s" % (self.collection_prefix, project) 17 | else: 18 | return project 19 | 20 | @property 21 | def projects(self): 22 | if time.time() - getattr(self, '_last_update_projects', 0) > self.UPDATE_PROJECTS_TIME: 23 | self._list_project() 24 | return self._projects 25 | 26 | @projects.setter 27 | def projects(self, value): 28 | self._projects = value 29 | 30 | def _list_project(self): 31 | self._last_update_projects = time.time() 32 | self.projects = set() 33 | if self.collection_prefix: 34 | prefix = "%s." 
% self.collection_prefix 35 | else: 36 | prefix = '' 37 | for each in self.database.collection_names(): 38 | if each.startswith('system.'): 39 | continue 40 | if each.startswith(prefix): 41 | self.projects.add(each[len(prefix):]) 42 | 43 | def drop(self, project): 44 | if project not in self.projects: 45 | self._list_project() 46 | if project not in self.projects: 47 | return 48 | collection_name = self._collection_name(project) 49 | self.database[collection_name].drop() 50 | self._list_project() 51 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-12 12:22:42 7 | 8 | import time 9 | from pymongo import MongoClient 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | 13 | 14 | class ProjectDB(BaseProjectDB): 15 | __collection_name__ = 'projectdb' 16 | 17 | def __init__(self, url, database='projectdb'): 18 | self.conn = MongoClient(url) 19 | self.conn.admin.command("ismaster") 20 | self.database = self.conn[database] 21 | self.collection = self.database[self.__collection_name__] 22 | 23 | self.collection.ensure_index('name', unique=True) 24 | 25 | def _default_fields(self, each): 26 | if each is None: 27 | return each 28 | each.setdefault('group', None) 29 | each.setdefault('status', 'TODO') 30 | each.setdefault('script', '') 31 | each.setdefault('comments', None) 32 | each.setdefault('rate', 0) 33 | each.setdefault('burst', 0) 34 | each.setdefault('updatetime', 0) 35 | return each 36 | 37 | def insert(self, name, obj={}): 38 | obj = dict(obj) 39 | obj['name'] = name 40 | obj['updatetime'] = time.time() 41 | return self.collection.update({'name': name}, {'$set': obj}, upsert=True) 42 | 43 | def update(self, name, obj={}, **kwargs): 44 | obj = dict(obj) 45 | obj.update(kwargs) 46 | obj['updatetime'] = time.time() 47 | return self.collection.update({'name': name}, {'$set': obj}) 48 | 49 | def get_all(self, fields=None): 50 | for each in self.collection.find({}, fields): 51 | if each and '_id' in each: 52 | del each['_id'] 53 | yield self._default_fields(each) 54 | 55 | def get(self, name, fields=None): 56 | each = self.collection.find_one({'name': name}, fields) 57 | if each and '_id' in each: 58 | del each['_id'] 59 | return self._default_fields(each) 60 | 61 | def check_update(self, timestamp, fields=None): 62 | for project in self.get_all(fields=('updatetime', 'name')): 63 | if project['updatetime'] > timestamp: 64 | project = self.get(project['name'], fields) 65 | yield self._default_fields(project) 66 | 67 | def drop(self, name): 68 | return self.collection.remove({'name': name}) 69 | -------------------------------------------------------------------------------- /pyspider/database/mongodb/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:18:36 7 | 8 | import json 9 | import time 10 | 11 | from pymongo import MongoClient 12 | 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from .mongodbbase import SplitTableMixin 15 | 16 | 17 | class ResultDB(SplitTableMixin, BaseResultDB): 18 
| collection_prefix = '' 19 | 20 | def __init__(self, url, database='resultdb'): 21 | self.conn = MongoClient(url) 22 | self.conn.admin.command("ismaster") 23 | self.database = self.conn[database] 24 | self.projects = set() 25 | 26 | self._list_project() 27 | # we suggest manually build index in advance, instead of indexing 28 | # in the startup process, 29 | # for project in self.projects: 30 | # collection_name = self._collection_name(project) 31 | # self.database[collection_name].ensure_index('taskid') 32 | pass 33 | 34 | def _create_project(self, project): 35 | collection_name = self._collection_name(project) 36 | self.database[collection_name].ensure_index('taskid') 37 | self._list_project() 38 | 39 | def _parse(self, data): 40 | data['_id'] = str(data['_id']) 41 | if 'result' in data: 42 | data['result'] = json.loads(data['result']) 43 | return data 44 | 45 | def _stringify(self, data): 46 | if 'result' in data: 47 | data['result'] = json.dumps(data['result']) 48 | return data 49 | 50 | def save(self, project, taskid, url, result): 51 | if project not in self.projects: 52 | self._create_project(project) 53 | collection_name = self._collection_name(project) 54 | obj = { 55 | 'taskid' : taskid, 56 | 'url' : url, 57 | 'result' : result, 58 | 'updatetime': time.time(), 59 | } 60 | return self.database[collection_name].update( 61 | {'taskid': taskid}, {"$set": self._stringify(obj)}, upsert=True 62 | ) 63 | 64 | def select(self, project, fields=None, offset=0, limit=0): 65 | if project not in self.projects: 66 | self._list_project() 67 | if project not in self.projects: 68 | return 69 | offset = offset or 0 70 | limit = limit or 0 71 | collection_name = self._collection_name(project) 72 | for result in self.database[collection_name].find({}, fields, skip=offset, limit=limit): 73 | yield self._parse(result) 74 | 75 | def count(self, project): 76 | if project not in self.projects: 77 | self._list_project() 78 | if project not in self.projects: 79 | return 80 | collection_name = self._collection_name(project) 81 | return self.database[collection_name].count() 82 | 83 | def get(self, project, taskid, fields=None): 84 | if project not in self.projects: 85 | self._list_project() 86 | if project not in self.projects: 87 | return 88 | collection_name = self._collection_name(project) 89 | ret = self.database[collection_name].find_one({'taskid': taskid}, fields) 90 | if not ret: 91 | return ret 92 | return self._parse(ret) 93 | -------------------------------------------------------------------------------- /pyspider/database/mysql/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 20:12:54 7 | -------------------------------------------------------------------------------- /pyspider/database/mysql/mysqlbase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-05 10:42:24 7 | 8 | import time 9 | import mysql.connector 10 | 11 | 12 | class MySQLMixin(object): 13 | maxlimit = 18446744073709551615 14 | 15 | @property 16 | def dbcur(self): 17 | try: 18 | if self.conn.unread_result: 19 | self.conn.get_rows() 20 | if hasattr(self.conn, 'free_result'): 21 | self.conn.free_result() 22 
| return self.conn.cursor() 23 | except (mysql.connector.OperationalError, mysql.connector.InterfaceError): 24 | self.conn.ping(reconnect=True) 25 | self.conn.database = self.database_name 26 | return self.conn.cursor() 27 | 28 | 29 | class SplitTableMixin(object): 30 | UPDATE_PROJECTS_TIME = 10 * 60 31 | 32 | def _tablename(self, project): 33 | if self.__tablename__: 34 | return '%s_%s' % (self.__tablename__, project) 35 | else: 36 | return project 37 | 38 | @property 39 | def projects(self): 40 | if time.time() - getattr(self, '_last_update_projects', 0) \ 41 | > self.UPDATE_PROJECTS_TIME: 42 | self._list_project() 43 | return self._projects 44 | 45 | @projects.setter 46 | def projects(self, value): 47 | self._projects = value 48 | 49 | def _list_project(self): 50 | self._last_update_projects = time.time() 51 | self.projects = set() 52 | if self.__tablename__: 53 | prefix = '%s_' % self.__tablename__ 54 | else: 55 | prefix = '' 56 | for project, in self._execute('show tables;'): 57 | if project.startswith(prefix): 58 | project = project[len(prefix):] 59 | self.projects.add(project) 60 | 61 | def drop(self, project): 62 | if project not in self.projects: 63 | self._list_project() 64 | if project not in self.projects: 65 | return 66 | tablename = self._tablename(project) 67 | self._execute("DROP TABLE %s" % self.escape(tablename)) 68 | self._list_project() 69 | -------------------------------------------------------------------------------- /pyspider/database/mysql/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-17 21:06:43 7 | 8 | import time 9 | import mysql.connector 10 | 11 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | from .mysqlbase import MySQLMixin 14 | 15 | 16 | class ProjectDB(MySQLMixin, BaseProjectDB, BaseDB): 17 | __tablename__ = 'projectdb' 18 | 19 | def __init__(self, host='localhost', port=3306, database='projectdb', 20 | user='root', passwd=None): 21 | self.database_name = database 22 | self.conn = mysql.connector.connect(user=user, password=passwd, 23 | host=host, port=port, autocommit=True) 24 | if database not in [x[0] for x in self._execute('show databases')]: 25 | self._execute('CREATE DATABASE %s' % self.escape(database)) 26 | self.conn.database = database 27 | 28 | self._execute('''CREATE TABLE IF NOT EXISTS %s ( 29 | `name` varchar(64) PRIMARY KEY, 30 | `group` varchar(64), 31 | `status` varchar(16), 32 | `script` TEXT, 33 | `comments` varchar(1024), 34 | `rate` float(11, 4), 35 | `burst` float(11, 4), 36 | `updatetime` double(16, 4) 37 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(self.__tablename__)) 38 | 39 | def insert(self, name, obj={}): 40 | obj = dict(obj) 41 | obj['name'] = name 42 | obj['updatetime'] = time.time() 43 | return self._insert(**obj) 44 | 45 | def update(self, name, obj={}, **kwargs): 46 | obj = dict(obj) 47 | obj.update(kwargs) 48 | obj['updatetime'] = time.time() 49 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 50 | return ret.rowcount 51 | 52 | def get_all(self, fields=None): 53 | return self._select2dic(what=fields) 54 | 55 | def get(self, name, fields=None): 56 | where = "`name` = %s" % self.placeholder 57 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 58 | 
return each 59 | return None 60 | 61 | def drop(self, name): 62 | where = "`name` = %s" % self.placeholder 63 | return self._delete(where=where, where_values=(name, )) 64 | 65 | def check_update(self, timestamp, fields=None): 66 | where = "`updatetime` >= %f" % timestamp 67 | return self._select2dic(what=fields, where=where) 68 | -------------------------------------------------------------------------------- /pyspider/database/mysql/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 22:02:57 7 | 8 | import re 9 | import six 10 | import time 11 | import json 12 | import mysql.connector 13 | 14 | from pyspider.libs import utils 15 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 16 | from pyspider.database.basedb import BaseDB 17 | from .mysqlbase import MySQLMixin, SplitTableMixin 18 | 19 | 20 | class ResultDB(MySQLMixin, SplitTableMixin, BaseResultDB, BaseDB): 21 | __tablename__ = '' 22 | 23 | def __init__(self, host='localhost', port=3306, database='resultdb', 24 | user='root', passwd=None): 25 | self.database_name = database 26 | self.conn = mysql.connector.connect(user=user, password=passwd, 27 | host=host, port=port, autocommit=True) 28 | if database not in [x[0] for x in self._execute('show databases')]: 29 | self._execute('CREATE DATABASE %s' % self.escape(database)) 30 | self.conn.database = database 31 | self._list_project() 32 | 33 | def _create_project(self, project): 34 | assert re.match(r'^\w+$', project) is not None 35 | tablename = self._tablename(project) 36 | if tablename in [x[0] for x in self._execute('show tables')]: 37 | return 38 | self._execute('''CREATE TABLE %s ( 39 | `taskid` varchar(64) PRIMARY KEY, 40 | `url` varchar(1024), 41 | `result` MEDIUMBLOB, 42 | `updatetime` double(16, 4) 43 | ) ENGINE=InnoDB CHARSET=utf8''' % self.escape(tablename)) 44 | 45 | def _parse(self, data): 46 | for key, value in list(six.iteritems(data)): 47 | if isinstance(value, (bytearray, six.binary_type)): 48 | data[key] = utils.text(value) 49 | if 'result' in data: 50 | data['result'] = json.loads(data['result']) 51 | return data 52 | 53 | def _stringify(self, data): 54 | if 'result' in data: 55 | data['result'] = json.dumps(data['result']) 56 | return data 57 | 58 | def save(self, project, taskid, url, result): 59 | tablename = self._tablename(project) 60 | if project not in self.projects: 61 | self._create_project(project) 62 | self._list_project() 63 | obj = { 64 | 'taskid': taskid, 65 | 'url': url, 66 | 'result': result, 67 | 'updatetime': time.time(), 68 | } 69 | return self._replace(tablename, **self._stringify(obj)) 70 | 71 | def select(self, project, fields=None, offset=0, limit=None): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 76 | tablename = self._tablename(project) 77 | 78 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 79 | offset=offset, limit=limit): 80 | yield self._parse(task) 81 | 82 | def count(self, project): 83 | if project not in self.projects: 84 | self._list_project() 85 | if project not in self.projects: 86 | return 0 87 | tablename = self._tablename(project) 88 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 89 | return count 90 | 91 | def get(self, project, taskid, fields=None): 92 | 
if project not in self.projects: 93 | self._list_project() 94 | if project not in self.projects: 95 | return 96 | tablename = self._tablename(project) 97 | where = "`taskid` = %s" % self.placeholder 98 | for task in self._select2dic(tablename, what=fields, 99 | where=where, where_values=(taskid, )): 100 | return self._parse(task) 101 | -------------------------------------------------------------------------------- /pyspider/database/redis/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-17 01:34:21 7 | 8 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 20:11:04 7 | 8 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 23:25:10 7 | 8 | import six 9 | import time 10 | import sqlalchemy.exc 11 | 12 | from sqlalchemy import create_engine, MetaData, Table, Column, String, Float, Text 13 | from sqlalchemy.engine.url import make_url 14 | from pyspider.libs import utils 15 | from pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 16 | from .sqlalchemybase import result2dict 17 | 18 | 19 | class ProjectDB(BaseProjectDB): 20 | __tablename__ = 'projectdb' 21 | 22 | def __init__(self, url): 23 | self.table = Table(self.__tablename__, MetaData(), 24 | Column('name', String(64), primary_key=True), 25 | Column('group', String(64)), 26 | Column('status', String(16)), 27 | Column('script', Text), 28 | Column('comments', String(1024)), 29 | Column('rate', Float(11)), 30 | Column('burst', Float(11)), 31 | Column('updatetime', Float(32)), 32 | mysql_engine='InnoDB', 33 | mysql_charset='utf8' 34 | ) 35 | 36 | self.url = make_url(url) 37 | if self.url.database: 38 | database = self.url.database 39 | self.url.database = None 40 | try: 41 | engine = create_engine(self.url, convert_unicode=True, pool_recycle=3600) 42 | conn = engine.connect() 43 | conn.execute("commit") 44 | conn.execute("CREATE DATABASE %s" % database) 45 | except sqlalchemy.exc.SQLAlchemyError: 46 | pass 47 | self.url.database = database 48 | self.engine = create_engine(url, convert_unicode=True, pool_recycle=3600) 49 | self.table.create(self.engine, checkfirst=True) 50 | 51 | @staticmethod 52 | def _parse(data): 53 | return data 54 | 55 | @staticmethod 56 | def _stringify(data): 57 | return data 58 | 59 | def insert(self, name, obj={}): 60 | obj = dict(obj) 61 | obj['name'] = name 62 | obj['updatetime'] = time.time() 63 | return self.engine.execute(self.table.insert() 64 | .values(**self._stringify(obj))) 65 | 66 | def update(self, name, obj={}, **kwargs): 67 | obj = dict(obj) 68 | obj.update(kwargs) 69 | obj['updatetime'] = time.time() 70 | return self.engine.execute(self.table.update() 71 | .where(self.table.c.name == name) 72 | 
.values(**self._stringify(obj))) 73 | 74 | def get_all(self, fields=None): 75 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 76 | for task in self.engine.execute(self.table.select() 77 | .with_only_columns(columns)): 78 | yield self._parse(result2dict(columns, task)) 79 | 80 | def get(self, name, fields=None): 81 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 82 | for task in self.engine.execute(self.table.select() 83 | .where(self.table.c.name == name) 84 | .limit(1) 85 | .with_only_columns(columns)): 86 | return self._parse(result2dict(columns, task)) 87 | 88 | def drop(self, name): 89 | return self.engine.execute(self.table.delete() 90 | .where(self.table.c.name == name)) 91 | 92 | def check_update(self, timestamp, fields=None): 93 | columns = [getattr(self.table.c, f, f) for f in fields] if fields else self.table.c 94 | for task in self.engine.execute(self.table.select() 95 | .with_only_columns(columns) 96 | .where(self.table.c.updatetime >= timestamp)): 97 | yield self._parse(result2dict(columns, task)) 98 | -------------------------------------------------------------------------------- /pyspider/database/sqlalchemy/sqlalchemybase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-04 18:48:47 7 | 8 | import time 9 | 10 | 11 | def result2dict(columns, task): 12 | return dict(task) 13 | 14 | 15 | class SplitTableMixin(object): 16 | UPDATE_PROJECTS_TIME = 10 * 60 17 | 18 | def _tablename(self, project): 19 | if self.__tablename__: 20 | return '%s_%s' % (self.__tablename__, project) 21 | else: 22 | return project 23 | 24 | @property 25 | def projects(self): 26 | if time.time() - getattr(self, '_last_update_projects', 0) \ 27 | > self.UPDATE_PROJECTS_TIME: 28 | self._list_project() 29 | return self._projects 30 | 31 | @projects.setter 32 | def projects(self, value): 33 | self._projects = value 34 | 35 | def _list_project(self): 36 | self._last_update_projects = time.time() 37 | self.projects = set() 38 | if self.__tablename__: 39 | prefix = '%s_' % self.__tablename__ 40 | else: 41 | prefix = '' 42 | 43 | for project in self.engine.table_names(): 44 | if project.startswith(prefix): 45 | project = project[len(prefix):] 46 | self.projects.add(project) 47 | 48 | def drop(self, project): 49 | if project not in self.projects: 50 | self._list_project() 51 | if project not in self.projects: 52 | return 53 | self.table.name = self._tablename(project) 54 | self.table.drop(self.engine) 55 | self._list_project() 56 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/database/sqlite/__init__.py -------------------------------------------------------------------------------- /pyspider/database/sqlite/projectdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 12:05:52 7 | 8 | import time 9 | 10 | from .sqlitebase import SQLiteMixin 11 | from 
pyspider.database.base.projectdb import ProjectDB as BaseProjectDB 12 | from pyspider.database.basedb import BaseDB 13 | 14 | 15 | class ProjectDB(SQLiteMixin, BaseProjectDB, BaseDB): 16 | __tablename__ = 'projectdb' 17 | placeholder = '?' 18 | 19 | def __init__(self, path): 20 | self.path = path 21 | self.last_pid = 0 22 | self.conn = None 23 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 24 | name PRIMARY KEY, 25 | `group`, 26 | status, script, comments, 27 | rate, burst, updatetime 28 | )''' % self.__tablename__) 29 | 30 | def insert(self, name, obj={}): 31 | obj = dict(obj) 32 | obj['name'] = name 33 | obj['updatetime'] = time.time() 34 | return self._insert(**obj) 35 | 36 | def update(self, name, obj={}, **kwargs): 37 | obj = dict(obj) 38 | obj.update(kwargs) 39 | obj['updatetime'] = time.time() 40 | ret = self._update(where="`name` = %s" % self.placeholder, where_values=(name, ), **obj) 41 | return ret.rowcount 42 | 43 | def get_all(self, fields=None): 44 | return self._select2dic(what=fields) 45 | 46 | def get(self, name, fields=None): 47 | where = "`name` = %s" % self.placeholder 48 | for each in self._select2dic(what=fields, where=where, where_values=(name, )): 49 | return each 50 | return None 51 | 52 | def check_update(self, timestamp, fields=None): 53 | where = "`updatetime` >= %f" % timestamp 54 | return self._select2dic(what=fields, where=where) 55 | 56 | def drop(self, name): 57 | where = "`name` = %s" % self.placeholder 58 | return self._delete(where=where, where_values=(name, )) 59 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/resultdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-13 17:08:43 7 | 8 | import re 9 | import time 10 | import json 11 | 12 | from .sqlitebase import SQLiteMixin, SplitTableMixin 13 | from pyspider.database.base.resultdb import ResultDB as BaseResultDB 14 | from pyspider.database.basedb import BaseDB 15 | 16 | 17 | class ResultDB(SQLiteMixin, SplitTableMixin, BaseResultDB, BaseDB): 18 | __tablename__ = 'resultdb' 19 | placeholder = '?' 
20 | 21 | def __init__(self, path): 22 | self.path = path 23 | self.last_pid = 0 24 | self.conn = None 25 | self._list_project() 26 | 27 | def _create_project(self, project): 28 | assert re.match(r'^\w+$', project) is not None 29 | tablename = self._tablename(project) 30 | self._execute('''CREATE TABLE IF NOT EXISTS `%s` ( 31 | taskid PRIMARY KEY, 32 | url, 33 | result, 34 | updatetime 35 | )''' % tablename) 36 | 37 | def _parse(self, data): 38 | if 'result' in data: 39 | data['result'] = json.loads(data['result']) 40 | return data 41 | 42 | def _stringify(self, data): 43 | if 'result' in data: 44 | data['result'] = json.dumps(data['result']) 45 | return data 46 | 47 | def save(self, project, taskid, url, result): 48 | tablename = self._tablename(project) 49 | if project not in self.projects: 50 | self._create_project(project) 51 | self._list_project() 52 | obj = { 53 | 'taskid': taskid, 54 | 'url': url, 55 | 'result': result, 56 | 'updatetime': time.time(), 57 | } 58 | return self._replace(tablename, **self._stringify(obj)) 59 | 60 | def select(self, project, fields=None, offset=0, limit=None): 61 | if project not in self.projects: 62 | self._list_project() 63 | if project not in self.projects: 64 | return 65 | tablename = self._tablename(project) 66 | 67 | for task in self._select2dic(tablename, what=fields, order='updatetime DESC', 68 | offset=offset, limit=limit): 69 | yield self._parse(task) 70 | 71 | def count(self, project): 72 | if project not in self.projects: 73 | self._list_project() 74 | if project not in self.projects: 75 | return 0 76 | tablename = self._tablename(project) 77 | for count, in self._execute("SELECT count(1) FROM %s" % self.escape(tablename)): 78 | return count 79 | 80 | def get(self, project, taskid, fields=None): 81 | if project not in self.projects: 82 | self._list_project() 83 | if project not in self.projects: 84 | return 85 | tablename = self._tablename(project) 86 | where = "`taskid` = %s" % self.placeholder 87 | for task in self._select2dic(tablename, what=fields, 88 | where=where, where_values=(taskid, )): 89 | return self._parse(task) 90 | -------------------------------------------------------------------------------- /pyspider/database/sqlite/sqlitebase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-22 20:30:44 7 | 8 | import os 9 | import time 10 | import sqlite3 11 | import threading 12 | 13 | 14 | class SQLiteMixin(object): 15 | 16 | @property 17 | def dbcur(self): 18 | pid = (os.getpid(), threading.current_thread().ident) 19 | if not (self.conn and pid == self.last_pid): 20 | self.last_pid = pid 21 | self.conn = sqlite3.connect(self.path, isolation_level=None) 22 | return self.conn.cursor() 23 | 24 | 25 | class SplitTableMixin(object): 26 | UPDATE_PROJECTS_TIME = 10 * 60 27 | 28 | def _tablename(self, project): 29 | if self.__tablename__: 30 | return '%s_%s' % (self.__tablename__, project) 31 | else: 32 | return project 33 | 34 | @property 35 | def projects(self): 36 | if time.time() - getattr(self, '_last_update_projects', 0) \ 37 | > self.UPDATE_PROJECTS_TIME: 38 | self._list_project() 39 | return self._projects 40 | 41 | @projects.setter 42 | def projects(self, value): 43 | self._projects = value 44 | 45 | def _list_project(self): 46 | self._last_update_projects = time.time() 47 | self.projects = set() 48 | if self.__tablename__: 49 
| prefix = '%s_' % self.__tablename__ 50 | else: 51 | prefix = '' 52 | for project, in self._select('sqlite_master', what='name', 53 | where='type = "table"'): 54 | if project.startswith(prefix): 55 | project = project[len(prefix):] 56 | self.projects.add(project) 57 | 58 | def drop(self, project): 59 | if project not in self.projects: 60 | self._list_project() 61 | if project not in self.projects: 62 | return 63 | tablename = self._tablename(project) 64 | self._execute("DROP TABLE %s" % self.escape(tablename)) 65 | self._list_project() 66 | -------------------------------------------------------------------------------- /pyspider/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | from .tornado_fetcher import Fetcher 2 | -------------------------------------------------------------------------------- /pyspider/fetcher/cookie_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-14 09:07:11 7 | 8 | from requests.cookies import MockRequest 9 | 10 | 11 | class MockResponse(object): 12 | 13 | def __init__(self, headers): 14 | self._headers = headers 15 | 16 | def info(self): 17 | return self 18 | 19 | def getheaders(self, name): 20 | """make cookie python 2 version use this method to get cookie list""" 21 | return self._headers.get_list(name) 22 | 23 | def get_all(self, name, default=None): 24 | """make cookie python 3 version use this instead of getheaders""" 25 | if default is None: 26 | default = [] 27 | return self._headers.get_list(name) or default 28 | 29 | 30 | def extract_cookies_to_jar(jar, request, response): 31 | req = MockRequest(request) 32 | res = MockResponse(response) 33 | jar.extract_cookies(res, req) 34 | -------------------------------------------------------------------------------- /pyspider/libs/ListIO.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-26 23:41:51 7 | 8 | 9 | class ListO(object): 10 | 11 | """A StringO write to list.""" 12 | 13 | def __init__(self, buffer=None): 14 | self._buffer = buffer 15 | if self._buffer is None: 16 | self._buffer = [] 17 | 18 | def isatty(self): 19 | return False 20 | 21 | def close(self): 22 | pass 23 | 24 | def flush(self): 25 | pass 26 | 27 | def seek(self, n, mode=0): 28 | pass 29 | 30 | def readline(self): 31 | pass 32 | 33 | def reset(self): 34 | pass 35 | 36 | def write(self, x): 37 | self._buffer.append(x) 38 | 39 | def writelines(self, x): 40 | self._buffer.extend(x) 41 | -------------------------------------------------------------------------------- /pyspider/libs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binux/pyspider/897891cafb21ea5b4ac08e728ad2ea212879f7fa/pyspider/libs/__init__.py -------------------------------------------------------------------------------- /pyspider/libs/dataurl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-11-16 10:33:20 7 | 8 | 
import six 9 | from base64 import b64encode, b64decode 10 | from . import utils 11 | from six.moves.urllib.parse import quote, unquote 12 | 13 | 14 | def encode(data, mime_type='', charset='utf-8', base64=True): 15 | """ 16 | Encode data to DataURL 17 | """ 18 | if isinstance(data, six.text_type): 19 | data = data.encode(charset) 20 | else: 21 | charset = None 22 | if base64: 23 | data = utils.text(b64encode(data)) 24 | else: 25 | data = utils.text(quote(data)) 26 | 27 | result = ['data:', ] 28 | if mime_type: 29 | result.append(mime_type) 30 | if charset: 31 | result.append(';charset=') 32 | result.append(charset) 33 | if base64: 34 | result.append(';base64') 35 | result.append(',') 36 | result.append(data) 37 | 38 | return ''.join(result) 39 | 40 | 41 | def decode(data_url): 42 | """ 43 | Decode DataURL data 44 | """ 45 | metadata, data = data_url.rsplit(',', 1) 46 | _, metadata = metadata.split('data:', 1) 47 | parts = metadata.split(';') 48 | if parts[-1] == 'base64': 49 | data = b64decode(data) 50 | else: 51 | data = unquote(data) 52 | 53 | for part in parts: 54 | if part.startswith("charset="): 55 | data = data.decode(part[8:]) 56 | return data 57 | -------------------------------------------------------------------------------- /pyspider/libs/log.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2012-10-24 16:08:17 7 | 8 | import logging 9 | 10 | try: 11 | import curses 12 | except ImportError: 13 | curses = None 14 | 15 | from tornado.log import LogFormatter as _LogFormatter 16 | 17 | 18 | class LogFormatter(_LogFormatter, object): 19 | """Init tornado.log.LogFormatter from logging.config.fileConfig""" 20 | def __init__(self, fmt=None, datefmt=None, color=True, *args, **kwargs): 21 | if fmt is None: 22 | fmt = _LogFormatter.DEFAULT_FORMAT 23 | super(LogFormatter, self).__init__(color=color, fmt=fmt, *args, **kwargs) 24 | 25 | 26 | class SaveLogHandler(logging.Handler): 27 | """LogHandler that save records to a list""" 28 | 29 | def __init__(self, saveto=None, *args, **kwargs): 30 | self.saveto = saveto 31 | logging.Handler.__init__(self, *args, **kwargs) 32 | 33 | def emit(self, record): 34 | if self.saveto is not None: 35 | self.saveto.append(record) 36 | 37 | handle = emit 38 | 39 | 40 | def enable_pretty_logging(logger=logging.getLogger()): 41 | channel = logging.StreamHandler() 42 | channel.setFormatter(LogFormatter()) 43 | logger.addHandler(channel) 44 | -------------------------------------------------------------------------------- /pyspider/libs/multiprocessing_queue.py: -------------------------------------------------------------------------------- 1 | import six 2 | import platform 3 | import multiprocessing 4 | from multiprocessing.queues import Queue as BaseQueue 5 | 6 | 7 | # The SharedCounter and Queue classes come from: 8 | # https://github.com/vterron/lemon/commit/9ca6b4b 9 | 10 | class SharedCounter(object): 11 | """ A synchronized shared counter. 12 | The locking done by multiprocessing.Value ensures that only a single 13 | process or thread may read or write the in-memory ctypes object. However, 14 | in order to do n += 1, Python performs a read followed by a write, so a 15 | second process may read the old value before the new one is written by the 16 | first process. 
The solution is to use a multiprocessing.Lock to guarantee 17 | the atomicity of the modifications to Value. 18 | This class comes almost entirely from Eli Bendersky's blog: 19 | http://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing/ 20 | """ 21 | 22 | def __init__(self, n=0): 23 | self.count = multiprocessing.Value('i', n) 24 | 25 | def increment(self, n=1): 26 | """ Increment the counter by n (default = 1) """ 27 | with self.count.get_lock(): 28 | self.count.value += n 29 | 30 | @property 31 | def value(self): 32 | """ Return the value of the counter """ 33 | return self.count.value 34 | 35 | 36 | class MultiProcessingQueue(BaseQueue): 37 | """ A portable implementation of multiprocessing.Queue. 38 | Because of multithreading / multiprocessing semantics, Queue.qsize() may 39 | raise the NotImplementedError exception on Unix platforms like Mac OS X 40 | where sem_getvalue() is not implemented. This subclass addresses this 41 | problem by using a synchronized shared counter (initialized to zero) and 42 | increasing / decreasing its value every time the put() and get() methods 43 | are called, respectively. This not only prevents NotImplementedError from 44 | being raised, but also allows us to implement a reliable version of both 45 | qsize() and empty(). 46 | """ 47 | def __init__(self, *args, **kwargs): 48 | super(MultiProcessingQueue, self).__init__(*args, **kwargs) 49 | self.size = SharedCounter(0) 50 | 51 | def put(self, *args, **kwargs): 52 | self.size.increment(1) 53 | super(MultiProcessingQueue, self).put(*args, **kwargs) 54 | 55 | def get(self, *args, **kwargs): 56 | v = super(MultiProcessingQueue, self).get(*args, **kwargs) 57 | self.size.increment(-1) 58 | return v 59 | 60 | def qsize(self): 61 | """ Reliable implementation of multiprocessing.Queue.qsize() """ 62 | return self.size.value 63 | 64 | 65 | if platform.system() == 'Darwin': 66 | if hasattr(multiprocessing, 'get_context'): # for py34 67 | def Queue(maxsize=0): 68 | return MultiProcessingQueue(maxsize, ctx=multiprocessing.get_context()) 69 | else: 70 | def Queue(maxsize=0): 71 | return MultiProcessingQueue(maxsize) 72 | else: 73 | from multiprocessing import Queue # flake8: noqa 74 | -------------------------------------------------------------------------------- /pyspider/libs/sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('__START_URL__', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /pyspider/libs/wsgi_xmlrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2006-2007 Open Source Applications Foundation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | # Origin: https://code.google.com/p/wsgi-xmlrpc/ 16 | 17 | 18 | from six.moves.xmlrpc_server import SimpleXMLRPCDispatcher 19 | import logging 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | class WSGIXMLRPCApplication(object): 25 | """Application to handle requests to the XMLRPC service""" 26 | 27 | def __init__(self, instance=None, methods=None): 28 | """Create windmill xmlrpc dispatcher""" 29 | if methods is None: 30 | methods = [] 31 | try: 32 | self.dispatcher = SimpleXMLRPCDispatcher(allow_none=True, encoding=None) 33 | except TypeError: 34 | # python 2.4 35 | self.dispatcher = SimpleXMLRPCDispatcher() 36 | if instance is not None: 37 | self.dispatcher.register_instance(instance) 38 | for method in methods: 39 | self.dispatcher.register_function(method) 40 | self.dispatcher.register_introspection_functions() 41 | 42 | def register_instance(self, instance): 43 | return self.dispatcher.register_instance(instance) 44 | 45 | def register_function(self, function, name=None): 46 | return self.dispatcher.register_function(function, name) 47 | 48 | def handler(self, environ, start_response): 49 | """XMLRPC service for windmill browser core to communicate with""" 50 | 51 | if environ['REQUEST_METHOD'] == 'POST': 52 | return self.handle_POST(environ, start_response) 53 | else: 54 | start_response("400 Bad request", [('Content-Type', 'text/plain')]) 55 | return [''] 56 | 57 | def handle_POST(self, environ, start_response): 58 | """Handles the HTTP POST request. 59 | 60 | Attempts to interpret all HTTP POST requests as XML-RPC calls, 61 | which are forwarded to the server's _dispatch method for handling. 62 | 63 | Most code taken from SimpleXMLRPCServer with modifications for wsgi and my custom dispatcher. 64 | """ 65 | 66 | try: 67 | # Get arguments by reading body of request. 68 | # We read this in chunks to avoid straining 69 | # socket.read(); around the 10 or 15Mb mark, some platforms 70 | # begin to have problems (bug #792570). 71 | 72 | length = int(environ['CONTENT_LENGTH']) 73 | data = environ['wsgi.input'].read(length) 74 | 75 | # In previous versions of SimpleXMLRPCServer, _dispatch 76 | # could be overridden in this class, instead of in 77 | # SimpleXMLRPCDispatcher. To maintain backwards compatibility, 78 | # check to see if a subclass implements _dispatch and 79 | # using that method if present. 
80 | response = self.dispatcher._marshaled_dispatch( 81 | data, getattr(self.dispatcher, '_dispatch', None) 82 | ) 83 | response += b'\n' 84 | except Exception as e: # This should only happen if the module is buggy 85 | # internal error, report as HTTP server error 86 | logger.exception(e) 87 | start_response("500 Server error", [('Content-Type', 'text/plain')]) 88 | return [] 89 | else: 90 | # got a valid XML RPC response 91 | start_response("200 OK", [('Content-Type', 'text/xml'), ('Content-Length', str(len(response)),)]) 92 | return [response] 93 | 94 | def __call__(self, environ, start_response): 95 | return self.handler(environ, start_response) 96 | -------------------------------------------------------------------------------- /pyspider/logging.conf: -------------------------------------------------------------------------------- 1 | [loggers] 2 | keys=root,scheduler,fetcher,processor,webui,bench,werkzeug 3 | 4 | [logger_root] 5 | level=INFO 6 | handlers=screen 7 | 8 | [logger_scheduler] 9 | level=INFO 10 | handlers=screen 11 | qualname=scheduler 12 | propagate=0 13 | 14 | [logger_fetcher] 15 | level=DEBUG 16 | handlers=screen 17 | qualname=fetcher 18 | propagate=0 19 | 20 | [logger_processor] 21 | level=DEBUG 22 | handlers=screen 23 | qualname=processor 24 | propagate=0 25 | 26 | [logger_webui] 27 | level=DEBUG 28 | handlers=screen 29 | qualname=webui 30 | propagate=0 31 | 32 | [logger_bench] 33 | level=DEBUG 34 | handlers=screen 35 | qualname=bench 36 | propagate=0 37 | 38 | [logger_werkzeug] 39 | level=INFO 40 | handlers=screen 41 | qualname=werkzeug 42 | propagate=0 43 | 44 | [handlers] 45 | keys=screen 46 | 47 | [handler_screen] 48 | class=logging.StreamHandler 49 | formatter=pretty 50 | level=DEBUG 51 | args=(sys.stderr, ) 52 | 53 | [formatters] 54 | keys=pretty 55 | 56 | [formatter_pretty] 57 | class=pyspider.libs.log.LogFormatter 58 | -------------------------------------------------------------------------------- /pyspider/message_queue/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-30 21:47:08 7 | 8 | import logging 9 | 10 | try: 11 | from urllib import parse as urlparse 12 | except ImportError: 13 | import urlparse 14 | 15 | 16 | def connect_message_queue(name, url=None, maxsize=0, lazy_limit=True): 17 | """ 18 | create connection to message queue 19 | 20 | name: 21 | name of message queue 22 | 23 | rabbitmq: 24 | amqp://username:password@host:5672/%2F 25 | see https://www.rabbitmq.com/uri-spec.html 26 | redis: 27 | redis://host:6379/db 28 | redis://host1:port1,host2:port2,...,hostn:portn (for redis 3.x in cluster mode) 29 | kombu: 30 | kombu+transport://userid:password@hostname:port/virtual_host 31 | see http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 32 | builtin: 33 | None 34 | """ 35 | 36 | if not url: 37 | from pyspider.libs.multiprocessing_queue import Queue 38 | return Queue(maxsize=maxsize) 39 | 40 | parsed = urlparse.urlparse(url) 41 | if parsed.scheme == 'amqp': 42 | from .rabbitmq import Queue 43 | return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) 44 | elif parsed.scheme == 'redis': 45 | from .redis_queue import Queue 46 | if ',' in parsed.netloc: 47 | """ 48 | redis in cluster mode (there is no concept of 'db' in cluster mode) 49 | ex. 
redis://host1:port1,host2:port2,...,hostn:portn 50 | """ 51 | cluster_nodes = [] 52 | for netloc in parsed.netloc.split(','): 53 | cluster_nodes.append({'host': netloc.split(':')[0], 'port': int(netloc.split(':')[1])}) 54 | 55 | return Queue(name=name, maxsize=maxsize, lazy_limit=lazy_limit, cluster_nodes=cluster_nodes) 56 | 57 | else: 58 | db = parsed.path.lstrip('/').split('/') 59 | try: 60 | db = int(db[0]) 61 | except: 62 | logging.warning('redis DB must zero-based numeric index, using 0 instead') 63 | db = 0 64 | 65 | password = parsed.password or None 66 | 67 | return Queue(name=name, host=parsed.hostname, port=parsed.port, db=db, maxsize=maxsize, password=password, lazy_limit=lazy_limit) 68 | elif url.startswith('kombu+'): 69 | url = url[len('kombu+'):] 70 | from .kombu_queue import Queue 71 | return Queue(name, url, maxsize=maxsize, lazy_limit=lazy_limit) 72 | else: 73 | raise Exception('unknown connection url: %s', url) 74 | -------------------------------------------------------------------------------- /pyspider/message_queue/kombu_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-05-22 20:54:01 7 | 8 | import time 9 | import umsgpack 10 | from kombu import Connection, enable_insecure_serializers 11 | from kombu.serialization import register 12 | from kombu.exceptions import ChannelError 13 | from six.moves import queue as BaseQueue 14 | 15 | 16 | register('umsgpack', umsgpack.packb, umsgpack.unpackb, 'application/x-msgpack') 17 | enable_insecure_serializers(['umsgpack']) 18 | 19 | 20 | class KombuQueue(object): 21 | """ 22 | kombu is a high-level interface for multiple message queue backends. 23 | 24 | KombuQueue is built on top of kombu API. 25 | """ 26 | 27 | Empty = BaseQueue.Empty 28 | Full = BaseQueue.Full 29 | max_timeout = 0.3 30 | 31 | def __init__(self, name, url="amqp://", maxsize=0, lazy_limit=True): 32 | """ 33 | Constructor for KombuQueue 34 | 35 | url: http://kombu.readthedocs.org/en/latest/userguide/connections.html#urls 36 | maxsize: an integer that sets the upperbound limit on the number of 37 | items that can be placed in the queue. 
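        A minimal usage sketch (the queue name and broker URL below are
        illustrative assumptions, and a reachable AMQP broker is required):

            q = KombuQueue('demo_queue', url='amqp://guest:guest@localhost:5672/%2F')
            q.put({'taskid': 'abc'})
            q.get()   # -> {'taskid': 'abc'}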
38 | """ 39 | self.name = name 40 | self.conn = Connection(url) 41 | self.queue = self.conn.SimpleQueue(self.name, no_ack=True, serializer='umsgpack') 42 | 43 | self.maxsize = maxsize 44 | self.lazy_limit = lazy_limit 45 | if self.lazy_limit and self.maxsize: 46 | self.qsize_diff_limit = int(self.maxsize * 0.1) 47 | else: 48 | self.qsize_diff_limit = 0 49 | self.qsize_diff = 0 50 | 51 | def qsize(self): 52 | try: 53 | return self.queue.qsize() 54 | except ChannelError: 55 | return 0 56 | 57 | def empty(self): 58 | if self.qsize() == 0: 59 | return True 60 | else: 61 | return False 62 | 63 | def full(self): 64 | if self.maxsize and self.qsize() >= self.maxsize: 65 | return True 66 | else: 67 | return False 68 | 69 | def put(self, obj, block=True, timeout=None): 70 | if not block: 71 | return self.put_nowait(obj) 72 | 73 | start_time = time.time() 74 | while True: 75 | try: 76 | return self.put_nowait(obj) 77 | except BaseQueue.Full: 78 | if timeout: 79 | lasted = time.time() - start_time 80 | if timeout > lasted: 81 | time.sleep(min(self.max_timeout, timeout - lasted)) 82 | else: 83 | raise 84 | else: 85 | time.sleep(self.max_timeout) 86 | 87 | def put_nowait(self, obj): 88 | if self.lazy_limit and self.qsize_diff < self.qsize_diff_limit: 89 | pass 90 | elif self.full(): 91 | raise BaseQueue.Full 92 | else: 93 | self.qsize_diff = 0 94 | return self.queue.put(obj) 95 | 96 | def get(self, block=True, timeout=None): 97 | try: 98 | ret = self.queue.get(block, timeout) 99 | return ret.payload 100 | except self.queue.Empty: 101 | raise BaseQueue.Empty 102 | 103 | def get_nowait(self): 104 | try: 105 | ret = self.queue.get_nowait() 106 | return ret.payload 107 | except self.queue.Empty: 108 | raise BaseQueue.Empty 109 | 110 | def delete(self): 111 | self.queue.queue.delete() 112 | 113 | def __del__(self): 114 | self.queue.close() 115 | 116 | 117 | Queue = KombuQueue 118 | -------------------------------------------------------------------------------- /pyspider/message_queue/redis_queue.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-27 22:48:04 7 | 8 | import time 9 | import redis 10 | import umsgpack 11 | from six.moves import queue as BaseQueue 12 | 13 | 14 | class RedisQueue(object): 15 | """ 16 | A Queue like message built over redis 17 | """ 18 | 19 | Empty = BaseQueue.Empty 20 | Full = BaseQueue.Full 21 | max_timeout = 0.3 22 | 23 | def __init__(self, name, host='localhost', port=6379, db=0, 24 | maxsize=0, lazy_limit=True, password=None, cluster_nodes=None): 25 | """ 26 | Constructor for RedisQueue 27 | 28 | maxsize: an integer that sets the upperbound limit on the number of 29 | items that can be placed in the queue. 30 | lazy_limit: redis queue is shared via instance, a lazy size limit is used 31 | for better performance. 
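        A minimal usage sketch (assumes a Redis server reachable on
        localhost:6379; the queue name is an illustrative placeholder):

            q = RedisQueue('demo_queue', host='localhost', port=6379)
            q.put({'taskid': 'abc'})
            q.get()   # -> {'taskid': 'abc'}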
32 | """ 33 | self.name = name 34 | if(cluster_nodes is not None): 35 | from rediscluster import StrictRedisCluster 36 | self.redis = StrictRedisCluster(startup_nodes=cluster_nodes) 37 | else: 38 | self.redis = redis.StrictRedis(host=host, port=port, db=db, password=password) 39 | self.maxsize = maxsize 40 | self.lazy_limit = lazy_limit 41 | self.last_qsize = 0 42 | 43 | def qsize(self): 44 | self.last_qsize = self.redis.llen(self.name) 45 | return self.last_qsize 46 | 47 | def empty(self): 48 | if self.qsize() == 0: 49 | return True 50 | else: 51 | return False 52 | 53 | def full(self): 54 | if self.maxsize and self.qsize() >= self.maxsize: 55 | return True 56 | else: 57 | return False 58 | 59 | def put_nowait(self, obj): 60 | if self.lazy_limit and self.last_qsize < self.maxsize: 61 | pass 62 | elif self.full(): 63 | raise self.Full 64 | self.last_qsize = self.redis.rpush(self.name, umsgpack.packb(obj)) 65 | return True 66 | 67 | def put(self, obj, block=True, timeout=None): 68 | if not block: 69 | return self.put_nowait(obj) 70 | 71 | start_time = time.time() 72 | while True: 73 | try: 74 | return self.put_nowait(obj) 75 | except self.Full: 76 | if timeout: 77 | lasted = time.time() - start_time 78 | if timeout > lasted: 79 | time.sleep(min(self.max_timeout, timeout - lasted)) 80 | else: 81 | raise 82 | else: 83 | time.sleep(self.max_timeout) 84 | 85 | def get_nowait(self): 86 | ret = self.redis.lpop(self.name) 87 | if ret is None: 88 | raise self.Empty 89 | return umsgpack.unpackb(ret) 90 | 91 | def get(self, block=True, timeout=None): 92 | if not block: 93 | return self.get_nowait() 94 | 95 | start_time = time.time() 96 | while True: 97 | try: 98 | return self.get_nowait() 99 | except self.Empty: 100 | if timeout: 101 | lasted = time.time() - start_time 102 | if timeout > lasted: 103 | time.sleep(min(self.max_timeout, timeout - lasted)) 104 | else: 105 | raise 106 | else: 107 | time.sleep(self.max_timeout) 108 | 109 | Queue = RedisQueue 110 | -------------------------------------------------------------------------------- /pyspider/processor/__init__.py: -------------------------------------------------------------------------------- 1 | from .processor import ProcessorResult, Processor 2 | -------------------------------------------------------------------------------- /pyspider/result/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:10:19 7 | 8 | from .result_worker import ResultWorker, OneResultWorker 9 | -------------------------------------------------------------------------------- /pyspider/result/result_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 15:37:46 7 | 8 | import time 9 | import json 10 | import logging 11 | from six.moves import queue as Queue 12 | logger = logging.getLogger("result") 13 | 14 | 15 | class ResultWorker(object): 16 | 17 | """ 18 | do with result 19 | override this if needed. 
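    For example, a subclass can override on_result() to ship results somewhere
    other than resultdb (a sketch only; the subclass name and storage helper
    below are hypothetical):

        class CSVResultWorker(ResultWorker):
            def on_result(self, task, result):
                if not result:
                    return
                append_to_csv(task['url'], result)  # hypothetical helper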
20 | """ 21 | 22 | def __init__(self, resultdb, inqueue): 23 | self.resultdb = resultdb 24 | self.inqueue = inqueue 25 | self._quit = False 26 | 27 | def on_result(self, task, result): 28 | '''Called every result''' 29 | if not result: 30 | return 31 | if 'taskid' in task and 'project' in task and 'url' in task: 32 | logger.info('result %s:%s %s -> %.30r' % ( 33 | task['project'], task['taskid'], task['url'], result)) 34 | return self.resultdb.save( 35 | project=task['project'], 36 | taskid=task['taskid'], 37 | url=task['url'], 38 | result=result 39 | ) 40 | else: 41 | logger.warning('result UNKNOW -> %.30r' % result) 42 | return 43 | 44 | def quit(self): 45 | self._quit = True 46 | 47 | def run(self): 48 | '''Run loop''' 49 | logger.info("result_worker starting...") 50 | 51 | while not self._quit: 52 | try: 53 | task, result = self.inqueue.get(timeout=1) 54 | self.on_result(task, result) 55 | except Queue.Empty as e: 56 | continue 57 | except KeyboardInterrupt: 58 | break 59 | except AssertionError as e: 60 | logger.error(e) 61 | continue 62 | except Exception as e: 63 | logger.exception(e) 64 | continue 65 | 66 | logger.info("result_worker exiting...") 67 | 68 | 69 | class OneResultWorker(ResultWorker): 70 | '''Result Worker for one mode, write results to stdout''' 71 | def on_result(self, task, result): 72 | '''Called every result''' 73 | if not result: 74 | return 75 | if 'taskid' in task and 'project' in task and 'url' in task: 76 | logger.info('result %s:%s %s -> %.30r' % ( 77 | task['project'], task['taskid'], task['url'], result)) 78 | print(json.dumps({ 79 | 'taskid': task['taskid'], 80 | 'project': task['project'], 81 | 'url': task['url'], 82 | 'result': result, 83 | 'updatetime': time.time() 84 | })) 85 | else: 86 | logger.warning('result UNKNOW -> %.30r' % result) 87 | return 88 | -------------------------------------------------------------------------------- /pyspider/scheduler/__init__.py: -------------------------------------------------------------------------------- 1 | from .scheduler import Scheduler, OneScheduler, ThreadBaseScheduler # NOQA 2 | -------------------------------------------------------------------------------- /pyspider/scheduler/token_bucket.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-07 16:53:08 7 | 8 | import time 9 | try: 10 | import threading as _threading 11 | except ImportError: 12 | import dummy_threading as _threading 13 | 14 | 15 | class Bucket(object): 16 | 17 | ''' 18 | traffic flow control with token bucket 19 | ''' 20 | 21 | update_interval = 30 22 | 23 | def __init__(self, rate=1, burst=None): 24 | self.rate = float(rate) 25 | if burst is None: 26 | self.burst = float(rate) * 10 27 | else: 28 | self.burst = float(burst) 29 | self.mutex = _threading.Lock() 30 | self.bucket = self.burst 31 | self.last_update = time.time() 32 | 33 | def get(self): 34 | '''Get the number of tokens in bucket''' 35 | now = time.time() 36 | if self.bucket >= self.burst: 37 | self.last_update = now 38 | return self.bucket 39 | bucket = self.rate * (now - self.last_update) 40 | self.mutex.acquire() 41 | if bucket > 1: 42 | self.bucket += bucket 43 | if self.bucket > self.burst: 44 | self.bucket = self.burst 45 | self.last_update = now 46 | self.mutex.release() 47 | return self.bucket 48 | 49 | def set(self, value): 50 | '''Set number of tokens in 
bucket''' 51 | self.bucket = value 52 | 53 | def desc(self, value=1): 54 | '''Use value tokens''' 55 | self.bucket -= value 56 | -------------------------------------------------------------------------------- /pyspider/webui/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:20:40 7 | 8 | from . import app, index, debug, task, result, login 9 | -------------------------------------------------------------------------------- /pyspider/webui/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-22 23:17:13 7 | 8 | import os 9 | import sys 10 | import logging 11 | logger = logging.getLogger("webui") 12 | 13 | from six import reraise 14 | from six.moves import builtins 15 | from six.moves.urllib.parse import urljoin 16 | from flask import Flask 17 | from pyspider.fetcher import tornado_fetcher 18 | 19 | if os.name == 'nt': 20 | import mimetypes 21 | mimetypes.add_type("text/css", ".css", True) 22 | 23 | 24 | class QuitableFlask(Flask): 25 | """Add quit() method to Flask object""" 26 | 27 | @property 28 | def logger(self): 29 | return logger 30 | 31 | def run(self, host=None, port=None, debug=None, **options): 32 | import tornado.wsgi 33 | import tornado.ioloop 34 | import tornado.httpserver 35 | import tornado.web 36 | 37 | if host is None: 38 | host = '127.0.0.1' 39 | if port is None: 40 | server_name = self.config['SERVER_NAME'] 41 | if server_name and ':' in server_name: 42 | port = int(server_name.rsplit(':', 1)[1]) 43 | else: 44 | port = 5000 45 | if debug is not None: 46 | self.debug = bool(debug) 47 | 48 | hostname = host 49 | port = port 50 | application = self 51 | use_reloader = self.debug 52 | use_debugger = self.debug 53 | 54 | if use_debugger: 55 | from werkzeug.debug import DebuggedApplication 56 | application = DebuggedApplication(application, True) 57 | 58 | try: 59 | from .webdav import dav_app 60 | except ImportError as e: 61 | logger.warning('WebDav interface not enabled: %r', e) 62 | dav_app = None 63 | if dav_app: 64 | from werkzeug.wsgi import DispatcherMiddleware 65 | application = DispatcherMiddleware(application, { 66 | '/dav': dav_app 67 | }) 68 | 69 | container = tornado.wsgi.WSGIContainer(application) 70 | self.http_server = tornado.httpserver.HTTPServer(container) 71 | self.http_server.listen(port, hostname) 72 | if use_reloader: 73 | from tornado import autoreload 74 | autoreload.start() 75 | 76 | self.logger.info('webui running on %s:%s', hostname, port) 77 | self.ioloop = tornado.ioloop.IOLoop.current() 78 | self.ioloop.start() 79 | 80 | def quit(self): 81 | if hasattr(self, 'ioloop'): 82 | self.ioloop.add_callback(self.http_server.stop) 83 | self.ioloop.add_callback(self.ioloop.stop) 84 | self.logger.info('webui exiting...') 85 | 86 | 87 | app = QuitableFlask('webui', 88 | static_folder=os.path.join(os.path.dirname(__file__), 'static'), 89 | template_folder=os.path.join(os.path.dirname(__file__), 'templates')) 90 | app.secret_key = os.urandom(24) 91 | app.jinja_env.line_statement_prefix = '#' 92 | app.jinja_env.globals.update(builtins.__dict__) 93 | 94 | app.config.update({ 95 | 'fetch': lambda x: tornado_fetcher.Fetcher(None, None, 
async_mode=False).fetch(x), 96 | 'taskdb': None, 97 | 'projectdb': None, 98 | 'scheduler_rpc': None, 99 | 'queues': dict(), 100 | 'process_time_limit': 30, 101 | }) 102 | 103 | 104 | def cdn_url_handler(error, endpoint, kwargs): 105 | if endpoint == 'cdn': 106 | path = kwargs.pop('path') 107 | # cdn = app.config.get('cdn', 'http://cdn.staticfile.org/') 108 | # cdn = app.config.get('cdn', '//cdnjs.cloudflare.com/ajax/libs/') 109 | cdn = app.config.get('cdn', '//cdnjscn.b0.upaiyun.com/libs/') 110 | return urljoin(cdn, path) 111 | else: 112 | exc_type, exc_value, tb = sys.exc_info() 113 | if exc_value is error: 114 | reraise(exc_type, exc_value, tb) 115 | else: 116 | raise error 117 | app.handle_url_build_error = cdn_url_handler 118 | -------------------------------------------------------------------------------- /pyspider/webui/bench_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-08 22:31:17 7 | 8 | import random 9 | try: 10 | from urllib import urlencode 11 | except ImportError: 12 | from urllib.parse import urlencode 13 | 14 | from flask import request 15 | from .app import app 16 | 17 | 18 | @app.route('/bench') 19 | def bench_test(): 20 | total = int(request.args.get('total', 10000)) 21 | show = int(request.args.get('show', 20)) 22 | nlist = [random.randint(1, total) for _ in range(show)] 23 | result = [] 24 | result.append("") 25 | args = dict(request.args) 26 | for nl in nlist: 27 | args['n'] = nl 28 | argstr = urlencode(sorted(args.items()), doseq=True) 29 | result.append("follow {1}
".format(argstr, nl)) 30 | result.append("") 31 | return "".join(result) 32 | -------------------------------------------------------------------------------- /pyspider/webui/login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 20:36:27 7 | 8 | import base64 9 | from flask import Response 10 | try: 11 | import flask_login as login 12 | except ImportError: 13 | from flask.ext import login 14 | from .app import app 15 | 16 | login_manager = login.LoginManager() 17 | login_manager.init_app(app) 18 | 19 | 20 | class AnonymousUser(login.AnonymousUserMixin): 21 | 22 | def is_anonymous(self): 23 | return True 24 | 25 | def is_active(self): 26 | return False 27 | 28 | def is_authenticated(self): 29 | return False 30 | 31 | def get_id(self): 32 | return 33 | 34 | 35 | class User(login.UserMixin): 36 | 37 | def __init__(self, id, password): 38 | self.id = id 39 | self.password = password 40 | 41 | def is_authenticated(self): 42 | if not app.config.get('webui_username'): 43 | return True 44 | if self.id == app.config.get('webui_username') \ 45 | and self.password == app.config.get('webui_password'): 46 | return True 47 | return False 48 | 49 | def is_active(self): 50 | return self.is_authenticated() 51 | 52 | 53 | login_manager.anonymous_user = AnonymousUser 54 | 55 | 56 | @login_manager.request_loader 57 | def load_user_from_request(request): 58 | api_key = request.headers.get('Authorization') 59 | if api_key: 60 | api_key = api_key[len("Basic "):] 61 | try: 62 | api_key = base64.b64decode(api_key).decode('utf8') 63 | return User(*api_key.split(":", 1)) 64 | except Exception as e: 65 | app.logger.error('wrong api key: %r, %r', api_key, e) 66 | return None 67 | return None 68 | app.login_response = Response( 69 | "need auth.", 401, {'WWW-Authenticate': 'Basic realm="Login Required"'} 70 | ) 71 | 72 | 73 | @app.before_request 74 | def before_request(): 75 | if app.config.get('need_auth', False): 76 | if not login.current_user.is_active(): 77 | return app.login_response 78 | -------------------------------------------------------------------------------- /pyspider/webui/result.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-10-19 16:23:55 7 | 8 | from __future__ import unicode_literals 9 | 10 | from flask import render_template, request, json 11 | from flask import Response 12 | from .app import app 13 | from pyspider.libs import result_dump 14 | 15 | 16 | @app.route('/results') 17 | def result(): 18 | resultdb = app.config['resultdb'] 19 | project = request.args.get('project') 20 | offset = int(request.args.get('offset', 0)) 21 | limit = int(request.args.get('limit', 20)) 22 | 23 | count = resultdb.count(project) 24 | results = list(resultdb.select(project, offset=offset, limit=limit)) 25 | 26 | return render_template( 27 | "result.html", count=count, results=results, 28 | result_formater=result_dump.result_formater, 29 | project=project, offset=offset, limit=limit, json=json 30 | ) 31 | 32 | 33 | @app.route('/results/dump/.<_format>') 34 | def dump_result(project, _format): 35 | resultdb = app.config['resultdb'] 36 | # force update project list 37 | resultdb.get(project, 'any') 38 
| if project not in resultdb.projects: 39 | return "no such project.", 404 40 | 41 | offset = int(request.args.get('offset', 0)) or None 42 | limit = int(request.args.get('limit', 0)) or None 43 | results = resultdb.select(project, offset=offset, limit=limit) 44 | 45 | if _format == 'json': 46 | valid = request.args.get('style', 'rows') == 'full' 47 | return Response(result_dump.dump_as_json(results, valid), 48 | mimetype='application/json') 49 | elif _format == 'txt': 50 | return Response(result_dump.dump_as_txt(results), 51 | mimetype='text/plain') 52 | elif _format == 'csv': 53 | return Response(result_dump.dump_as_csv(results), 54 | mimetype='text/csv') 55 | -------------------------------------------------------------------------------- /pyspider/webui/static/.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": ["es2015"] 3 | } 4 | -------------------------------------------------------------------------------- /pyspider/webui/static/css_selector_helper.min.js: -------------------------------------------------------------------------------- 1 | !function(e){function t(n){if(r[n])return r[n].exports;var a=r[n]={exports:{},id:n,loaded:!1};return e[n].call(a.exports,a,a.exports,t),a.loaded=!0,a.exports}var r={};return t.m=e,t.c=r,t.p="",t(0)}([function(e,t){"use strict";function r(e,t){function r(e,t){if(!e||!t)return!1;if(e.length!=t.length)return!1;for(var r=0,n=e.length;r=0&&a>t))if(e.invalid)n=null;else if(e.selected){n&&(r+=" >");var o="";e.features.forEach(function(e){e.selected&&(o+=e.pattern)}),""===o&&(o="*"),r+=" "+o,n=e}else n=null}),""===r&&(r="*"),r}function i(t){var n=[];do{var a=[];if(a.push({name:t.tagName.toLowerCase(),pattern:t.tagName.toLowerCase(),selected:!0}),t.getAttribute("id")&&a.push({name:"#"+t.getAttribute("id"),pattern:"#"+t.getAttribute("id"),selected:!0}),t.classList.length>0)for(var i=0;i1&&ispan{border:1px solid gray;padding:1px 5px 0;background:#999;color:#fff}.projects span.status-TODO{border:1px solid #ec971f;padding:1px 5px 0;background:#f0ad4e;color:#fff}.projects span.status-STOP{border:1px solid #c9302c;padding:1px 5px 0;background:#d9534f;color:#fff}.projects span.status-CHECKING{border:1px solid #dcbe00;padding:1px 5px 0;background:#ffde10;color:#fff}.projects span.status-DEBUG{border:1px solid #3071a9;padding:1px 5px 0;background:#428bca;color:#fff}.projects span.status-RUNNING{border:1px solid #449d44;padding:1px 5px 0;background:#5cb85c;color:#fff}.projects span.status-PAUSED{border:1px solid #3c3c3c;padding:1px 5px 0;background:#555;color:#fff}.projects .project-rate,.projects .project-time{width:110px}.projects th.project-progress{position:relative}.projects th.project-progress span{position:absolute}.projects td.project-progress{position:relative;min-width:5%}.projects td.project-progress.progress-all{min-width:10%}.projects td.project-progress .progress{position:relative;margin:0;background-color:#aaa}.projects td.project-progress .progress .progress-text{width:100%;text-align:center;position:absolute;font-weight:700;color:#fff;pointer-events:none}.projects td.project-progress .progress .progress-bar{-webkit-transition:none;transition:none}.projects .project-actions{width:200px}.global-btn{margin-top:-5px;padding:10px}.global-btn .create-btn-div{float:right}.global-btn .active-btn-div{float:left} 2 | /*# sourceMappingURL=index.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/index.min.js: 
-------------------------------------------------------------------------------- 1 | !function(t){function e(r){if(a[r])return a[r].exports;var n=a[r]={exports:{},id:r,loaded:!1};return t[r].call(n.exports,n,n.exports,e),n.loaded=!0,n.exports}var a={};return e.m=t,e.c=a,e.p="",e(0)}({0:function(t,e,a){"use strict";a(10),$(function(){function t(t){$(".project-group>span").editable({name:"group",pk:function(t){return $(this).parents("tr").data("name")},emptytext:"[group]",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name");t.projects[r].group=a,$(this).attr("style","")}}),$(".project-status>span").editable({type:"select",name:"status",source:[{value:"TODO",text:"TODO"},{value:"STOP",text:"STOP"},{value:"CHECKING",text:"CHECKING"},{value:"DEBUG",text:"DEBUG"},{value:"RUNNING",text:"RUNNING"}],pk:function(t){return $(this).parents("tr").data("name")},emptytext:"[status]",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name");t.projects[r].status=a,$(this).removeClass("status-"+$(this).attr("data-value")).addClass("status-"+a).attr("data-value",a).attr("style","")}}),$(".project-rate>span").editable({name:"rate",pk:function(t){return $(this).parents("tr").data("name")},validate:function(t){var e=t.split("/");return 2!=e.length?"format error: rate/burst":$.isNumeric(e[0])&&$.isNumeric(e[1])?void 0:"format error: rate/burst"},highlight:!1,emptytext:"0/0",placement:"right",url:"/update",success:function(e,a){var r=$(this).parents("tr").data("name"),n=a.split("/");t.projects[r].rate=parseFloat(n[0]),t.projects[r].burst=parseFloat(n[1]),$(this).attr("style","")}})}function e(){Sortable.getColumnType=function(t,e){var a=$($(t).find("th").get(e)).data("type");return"num"==a?Sortable.types.numeric:"date"==a?Sortable.types.date:Sortable.types.alpha},$("table.projects").attr("data-sortable",!0),Sortable.init()}function a(){$.get("/counter",function(t){for(var e in t){var a=t[e];if(void 0!==s.projects[e]){var r="5m,1h,1d,all".split(","),n=!0,o=!1,i=void 0;try{for(var u,c=r[Symbol.iterator]();!(n=(u=c.next()).done);n=!0){var l=u.value,p=a[l];if(void 0!==p){var d=p.pending||0,f=p.success||0,m=p.retry||0,v=p.failed||0,h=p.task||d+f+m+v;p.task=h,p.title=""+l+" of "+h+" tasks:\n"+("all"==l?"pending("+(d/h*100).toFixed(1)+"%): \t"+d+"\n":"new("+(d/h*100).toFixed(1)+"%): \t\t"+d+"\n")+"success("+(f/h*100).toFixed(1)+"%): \t"+f+"\nretry("+(m/h*100).toFixed(1)+"%): \t"+m+"\nfailed("+(v/h*100).toFixed(1)+"%): \t"+v}}}catch($){o=!0,i=$}finally{try{!n&&c["return"]&&c["return"]()}finally{if(o)throw i}}s.projects[e].paused=a.paused,s.projects[e].time=a["5m_time"],s.projects[e].progress=a}}})}function r(){$.get("/queues",function(t){$(".queue_value").each(function(e,a){var r=$(a).attr("title");void 0!==t[r]?$(a).text(t[r]):$(a).text("???")})})}$("#create-project-modal form").on("submit",function(t){var e=$(this),a=e.find("[name=project-name]").val();return 0==a.length||a.search(/[^\w]/)!=-1?(e.find("[name=project-name]").parents(".form-group").addClass("has-error"),e.find("[name=project-name] ~ .help-block").show(),!1):(e.find("[name=script-mode]:checked").val(),e.attr("action","/debug/"+a),!0)});var n={};projects.forEach(function(t){t.paused=!1,t.time={},t.progress={},n[t.name]=t});var s=new 
Vue({el:".projects",data:{projects:n},ready:function(){t(this),e(this),a(),window.setInterval(a,15e3),r(),window.setInterval(r,15e3)},methods:{project_run:function(t,e){$("#need-set-status-alert").hide(),"RUNNING"!=t.status&&"DEBUG"!=t.status&&$("#need-set-status-alert").show();var a=e.target;$(a).addClass("btn-warning"),$.ajax({type:"POST",url:"/run",data:{project:t.name},success:function(t){$(a).removeClass("btn-warning"),t.result||$(a).addClass("btn-danger")},error:function(){$(a).removeClass("btn-warning").addClass("btn-danger")}})}}})})},10:function(t,e){}}); 2 | //# sourceMappingURL=index.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pyspider-webui", 3 | "version": "0.3.9", 4 | "description": "webui of pyspider", 5 | "scripts": { 6 | "build": "webpack --progress --colors --optimize-minimize", 7 | "dev": "webpack --progress --colors --optimize-minimize --watch" 8 | }, 9 | "keywords": [ 10 | "pyspider" 11 | ], 12 | "author": "binux", 13 | "license": "MIT", 14 | "devDependencies": { 15 | "babel-core": "^6.14.0", 16 | "babel-loader": "^6.2.5", 17 | "babel-preset-es2015": "^6.14.0", 18 | "css-loader": "^0.25.0", 19 | "extract-text-webpack-plugin": "^1.0.1", 20 | "less": "^2.7.1", 21 | "less-loader": "^2.2.3", 22 | "style-loader": "^0.13.1", 23 | "webpack": "^1.13.2" 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /pyspider/webui/static/result.min.css: -------------------------------------------------------------------------------- 1 | .top-bar{padding:10px 15px 2px;height:46px;background-color:#f5f5f5;border-bottom:1px solid #ddd;position:relative}.top-bar h1{margin:0 0 10px;font-size:18px}.top-bar .btn-group{margin:8px 10px 0 0;position:absolute;right:0;top:0}.pagination-wrap{text-align:right;padding-right:15px}table{border-bottom:1px solid #ddd}table td{word-break:break-all} 2 | /*# sourceMappingURL=result.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/result.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=result.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/src/index.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-02-23 00:28:30 */ 5 | 6 | @import "variable"; 7 | 8 | h1 { 9 | margin-top: 5px; 10 | } 11 | 12 | header .alert { 13 | position: absolute;; 14 | width: 50rem; 15 | left: 50%; 16 | margin-left: -25rem; 17 | } 18 | 19 | .queue-info { 20 | th, td { 21 | text-align: center; 22 | border: 1px solid #ddd; 23 | } 24 | } 25 | 26 | [v-cloak] { 27 | display: none; 28 | } 29 | 30 | .projects { 31 | min-width: 850px; 32 | border-top: 1px solid #ddd; 33 | border-bottom: 1px solid #ddd; 34 | 35 | .project-group { 36 | width: 80px; 37 | } 38 | 39 | .project-name { 40 | font-weight: bold; 41 | } 42 | 43 | .project-status { 44 | width: 100px; 45 | } 46 | 
.project-status-span(@color) { 47 | border: solid 1px darken(@color, 10%); 48 | padding: 1px 5px 0 5px; 49 | background: @color; 50 | color: white; 51 | } 52 | .project-status>span { 53 | .project-status-span(@gray-light); 54 | } 55 | span.status-TODO { 56 | .project-status-span(@orange); 57 | } 58 | span.status-STOP { 59 | .project-status-span(@red); 60 | } 61 | span.status-CHECKING { 62 | .project-status-span(darken(@yellow, 10%)); 63 | } 64 | span.status-DEBUG { 65 | .project-status-span(@blue); 66 | } 67 | span.status-RUNNING { 68 | .project-status-span(@green); 69 | } 70 | span.status-PAUSED { 71 | .project-status-span(@gray); 72 | } 73 | 74 | .project-rate { 75 | width: 110px; 76 | } 77 | 78 | .project-time { 79 | width: 110px; 80 | } 81 | 82 | th.project-progress { 83 | position: relative; 84 | span { 85 | position: absolute; 86 | } 87 | } 88 | 89 | td.project-progress { 90 | position: relative; 91 | min-width: 5%; 92 | &.progress-all { 93 | min-width: 10%; 94 | } 95 | 96 | .progress { 97 | position: relative; 98 | margin: 0; 99 | background-color: #aaa; 100 | .progress-text { 101 | width: 100%; 102 | text-align: center; 103 | position: absolute; 104 | font-weight: bold; 105 | color: #fff; 106 | pointer-events: none; 107 | } 108 | .progress-bar { 109 | -webkit-transition: none; 110 | transition: none; 111 | } 112 | } 113 | } 114 | 115 | .project-actions { 116 | width: 200px; 117 | } 118 | } 119 | 120 | .global-btn { 121 | margin-top: -5px; 122 | padding: 10px 10px 10px 10px; 123 | 124 | .create-btn-div { 125 | float: right; 126 | } 127 | 128 | .active-btn-div { 129 | float: left; 130 | } 131 | } 132 | 133 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/result.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-10-22 22:38:45 */ 5 | 6 | @import "variable"; 7 | 8 | .top-bar { 9 | padding: 10px 15px 2px 15px; 10 | height: 46px; 11 | background-color: #f5f5f5; 12 | border-bottom: 1px solid #ddd; 13 | position: relative; 14 | 15 | h1 { 16 | margin: 0 0 10px 0; 17 | font-size: 18px; 18 | } 19 | 20 | .btn-group { 21 | margin: 8px 10px 0 0; 22 | position: absolute; 23 | right: 0; 24 | top: 0; 25 | 26 | a.btn { 27 | } 28 | } 29 | } 30 | 31 | .pagination-wrap { 32 | text-align: right; 33 | padding-right: 15px; 34 | } 35 | 36 | table { 37 | border-bottom: 1px solid #ddd; 38 | 39 | td { 40 | word-break: break-all; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/task.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:20:30 */ 5 | 6 | @import "variable"; 7 | 8 | .base-info { 9 | padding: 10px 15px 2px 15px; 10 | background-color: #f5f5f5; 11 | border-bottom: 1px solid #ddd; 12 | } 13 | 14 | .more-info { 15 | padding: 10px 15px; 16 | } 17 | 18 | .more-info dd { 19 | display: block; 20 | font-family: monospace; 21 | white-space: pre; 22 | word-break: break-all; 23 | word-wrap: break-word; 24 | margin: 1em 0px; 25 | } 26 | 27 | .status_mix(@color: lighten(black, 50%)) { 28 | border: solid 1px darken(@color, 10%); 29 | padding: 1px 5px 0 5px; 30 | background: @color; 31 | color: white; 32 | } 33 | .status { 34 | &-1 { 35 | 
.status_mix(@blue); 36 | } 37 | &-2 { 38 | .status_mix(@green); 39 | } 40 | &-3 { 41 | .status_mix(@red); 42 | } 43 | &-4 { 44 | .status_mix; 45 | } 46 | } 47 | 48 | .url { 49 | font-size: 120%; 50 | text-decoration: underline; 51 | } 52 | 53 | .callback { 54 | color: @orange; 55 | font-weight: bold; 56 | 57 | &:hover, &:focus { 58 | color: darken(@orange, 10%); 59 | } 60 | } 61 | 62 | dt .glyphicon-ok { 63 | color: @green; 64 | } 65 | dt .glyphicon-remove { 66 | color: @red; 67 | } 68 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/tasks.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-18 23:20:46 */ 5 | 6 | @import "variable"; 7 | @import "task"; 8 | 9 | .tasks { 10 | margin: 0; 11 | padding: 0; 12 | list-style-type: none; 13 | 14 | li { 15 | .base-info; 16 | 17 | &:nth-child(even) { 18 | background-color: white; 19 | } 20 | } 21 | 22 | .url { 23 | display: inline-block; 24 | vertical-align: bottom; 25 | max-width: 40em; 26 | overflow: hidden; 27 | white-space: nowrap; 28 | text-overflow: ellipsis; 29 | } 30 | 31 | .update-time { 32 | font-weight: bold; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /pyspider/webui/static/src/variable.less: -------------------------------------------------------------------------------- 1 | /* vim: set et sw=2 ts=2 sts=2 ff=unix fenc=utf8: */ 2 | /* Author: Binux */ 3 | /* http://binux.me */ 4 | /* Created on 2014-07-16 19:18:30 */ 5 | 6 | // colors 7 | @gray-darker: lighten(#000, 13.5%); // #222 8 | @gray-dark: lighten(#000, 20%); // #333 9 | @gray: lighten(#000, 33.5%); // #555 10 | @gray-light: lighten(#000, 60%); // #999 11 | @gray-lighter: lighten(#000, 93.5%); // #eee 12 | 13 | @blue: #428bca; 14 | @green: #5cb85c; 15 | @blue-light: #5bc0de; 16 | @orange: #f0ad4e; 17 | @yellow: #ffe543; 18 | @red: #d9534f; 19 | -------------------------------------------------------------------------------- /pyspider/webui/static/task.min.css: -------------------------------------------------------------------------------- 1 | .base-info{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.more-info{padding:10px 15px}.more-info dd{display:block;font-family:monospace;white-space:pre;word-break:break-all;word-wrap:break-word;margin:1em 0}.status-1{border:1px solid #3071a9;background:#428bca}.status-1,.status-2{padding:1px 5px 0;color:#fff}.status-2{border:1px solid #449d44;background:#5cb85c}.status-3{border:1px solid #c9302c;background:#d9534f}.status-3,.status-4{padding:1px 5px 0;color:#fff}.status-4{border:1px solid #666;background:gray}.url{font-size:120%;text-decoration:underline}.callback{color:#f0ad4e;font-weight:700}.callback:focus,.callback:hover{color:#ec971f}dt .glyphicon-ok{color:#5cb85c}dt .glyphicon-remove{color:#d9534f} 2 | /*# sourceMappingURL=task.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/task.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=task.min.js.map 
-------------------------------------------------------------------------------- /pyspider/webui/static/tasks.min.css: -------------------------------------------------------------------------------- 1 | .base-info{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.more-info{padding:10px 15px}.more-info dd{display:block;font-family:monospace;white-space:pre;word-break:break-all;word-wrap:break-word;margin:1em 0}.status-1{border:1px solid #3071a9;background:#428bca}.status-1,.status-2{padding:1px 5px 0;color:#fff}.status-2{border:1px solid #449d44;background:#5cb85c}.status-3{border:1px solid #c9302c;background:#d9534f}.status-3,.status-4{padding:1px 5px 0;color:#fff}.status-4{border:1px solid #666;background:gray}.url{font-size:120%;text-decoration:underline}.callback{color:#f0ad4e;font-weight:700}.callback:focus,.callback:hover{color:#ec971f}dt .glyphicon-ok{color:#5cb85c}dt .glyphicon-remove{color:#d9534f}.tasks{margin:0;padding:0;list-style-type:none}.tasks li{padding:10px 15px 2px;background-color:#f5f5f5;border-bottom:1px solid #ddd}.tasks li:nth-child(even){background-color:#fff}.tasks .url{display:inline-block;vertical-align:bottom;max-width:40em;overflow:hidden;white-space:nowrap;text-overflow:ellipsis}.tasks .update-time{font-weight:700} 2 | /*# sourceMappingURL=tasks.min.css.map*/ -------------------------------------------------------------------------------- /pyspider/webui/static/tasks.min.js: -------------------------------------------------------------------------------- 1 | !function(r){function t(o){if(e[o])return e[o].exports;var n=e[o]={exports:{},id:o,loaded:!1};return r[o].call(n.exports,n,n.exports,t),n.loaded=!0,n.exports}var e={};return t.m=r,t.c=e,t.p="",t(0)}([function(r,t){}]); 2 | //# sourceMappingURL=tasks.min.js.map -------------------------------------------------------------------------------- /pyspider/webui/static/webpack.config.js: -------------------------------------------------------------------------------- 1 | var webpack = require("webpack"); 2 | var ExtractTextPlugin = require("extract-text-webpack-plugin"); 3 | 4 | module.exports = { 5 | entry: { 6 | index: "./src/index", 7 | debug: "./src/debug", 8 | result: "./src/result.less", 9 | task: "./src/task.less", 10 | tasks: "./src/tasks.less", 11 | }, 12 | output: { 13 | //path: "./dist", 14 | filename: "[name].min.js" 15 | }, 16 | module: { 17 | loaders: [ 18 | { test: /\.js$/, loader: "babel-loader" }, 19 | { test: /\.less$/, loader: ExtractTextPlugin.extract("style-loader", "css-loader?sourceMap!less-loader?sourceMap") } 20 | ] 21 | }, 22 | devtool: 'source-map', 23 | plugins: [ 24 | new ExtractTextPlugin("[name].min.css"), 25 | new webpack.optimize.UglifyJsPlugin({ compress: { warnings: false } }), 26 | ] 27 | } 28 | -------------------------------------------------------------------------------- /pyspider/webui/task.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-07-16 15:30:57 7 | 8 | import socket 9 | from flask import abort, render_template, request, json 10 | 11 | from pyspider.libs import utils 12 | from .app import app 13 | 14 | 15 | @app.route('/task/<taskid>') 16 | def task(taskid): 17 | if ':' not in taskid: 18 | abort(400) 19 | project, taskid = taskid.split(':', 1) 20 | 21 | taskdb = app.config['taskdb'] 22 | task = taskdb.get_task(project, taskid) 23 | 24 | if
not task: 25 | abort(404) 26 | resultdb = app.config['resultdb'] 27 | result = {} 28 | if resultdb: 29 | result = resultdb.get(project, taskid) 30 | 31 | return render_template("task.html", task=task, json=json, result=result, 32 | status_to_string=app.config['taskdb'].status_to_string) 33 | 34 | 35 | @app.route('/task/<taskid>.json') 36 | def task_in_json(taskid): 37 | if ':' not in taskid: 38 | return json.jsonify({'code': 400, 'error': 'bad project:task_id format'}) 39 | project, taskid = taskid.split(':', 1) 40 | 41 | taskdb = app.config['taskdb'] 42 | task = taskdb.get_task(project, taskid) 43 | 44 | if not task: 45 | return json.jsonify({'code': 404, 'error': 'not found'}) 46 | task['status_string'] = app.config['taskdb'].status_to_string(task['status']) 47 | return json.jsonify(task) 48 | 49 | 50 | @app.route('/tasks') 51 | def tasks(): 52 | rpc = app.config['scheduler_rpc'] 53 | taskdb = app.config['taskdb'] 54 | project = request.args.get('project', "") 55 | limit = int(request.args.get('limit', 100)) 56 | 57 | try: 58 | updatetime_tasks = rpc.get_active_tasks(project, limit) 59 | except socket.error as e: 60 | app.logger.warning('connect to scheduler rpc error: %r', e) 61 | return 'connect to scheduler error', 502 62 | 63 | tasks = {} 64 | result = [] 65 | for updatetime, task in sorted(updatetime_tasks, key=lambda x: x[0]): 66 | key = '%(project)s:%(taskid)s' % task 67 | task['updatetime'] = updatetime 68 | if key in tasks and tasks[key].get('status', None) != taskdb.ACTIVE: 69 | result.append(tasks[key]) 70 | tasks[key] = task 71 | result.extend(tasks.values()) 72 | 73 | return render_template( 74 | "tasks.html", 75 | tasks=result, 76 | status_to_string=taskdb.status_to_string 77 | ) 78 | 79 | 80 | @app.route('/active_tasks') 81 | def active_tasks(): 82 | rpc = app.config['scheduler_rpc'] 83 | taskdb = app.config['taskdb'] 84 | project = request.args.get('project', "") 85 | limit = int(request.args.get('limit', 100)) 86 | 87 | try: 88 | tasks = rpc.get_active_tasks(project, limit) 89 | except socket.error as e: 90 | app.logger.warning('connect to scheduler rpc error: %r', e) 91 | return '{}', 502, {'Content-Type': 'application/json'} 92 | 93 | result = [] 94 | for updatetime, task in tasks: 95 | task['updatetime'] = updatetime 96 | task['updatetime_text'] = utils.format_date(updatetime) 97 | if 'status' in task: 98 | task['status_text'] = taskdb.status_to_string(task['status']) 99 | result.append(task) 100 | 101 | return json.dumps(result), 200, {'Content-Type': 'application/json'} 102 | 103 | app.template_filter('format_date')(utils.format_date) 104 | -------------------------------------------------------------------------------- /pyspider/webui/templates/result.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Results - {{ project }} - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 |

{{ project }} - Results

22 |
23 | 25 | 26 | JSON 27 | URL-JSON 29 | CSV 31 |
32 |
33 | # set common_fields, results = result_formater(results) 34 | 35 | 36 | 37 | 38 | # for field in common_fields|sort 39 | 42 | # endfor 43 | 46 | 47 | 48 | # for result in results 49 | 50 | 53 | 56 | # for field in common_fields|sort 57 | 58 | # endfor 59 | 62 | # endfor 63 | 64 |
url 40 | {{ field }} 41 | 44 | ... 45 |
51 | {{ result.url }} 52 | 54 | 55 | {{ json.dumps(result.result_formated[field], ensure_ascii=False) | truncate(100, True) }} 60 | {{ json.dumps(result.others, ensure_ascii=False) | truncate(100, True) }} 61 |
65 | 66 |
67 |
    68 | # set current_page = int(offset/limit) + (1 if offset%limit else 0) 69 | # set count = count if count is not none else 0 70 | # set total_page = int(count/limit) + (1 if count%limit else 0) 71 |
  • 72 | « 73 |
  • 74 | # set prev = 0 75 | # for i in range(0, total_page): 76 | # if abs(i-0) < 2 or abs(i-total_page) < 3 or -2 < i-current_page < 5: 77 | # set prev = i 78 |
  • 79 | {{ i + 1 }} 80 |
  • 81 | # elif prev == i-1: 82 |
  • 83 | # endif 84 | # endfor 85 |
  • = total_page else "" }}"> 86 | » 87 |
  • 88 |
89 |
90 | 91 | 92 | -------------------------------------------------------------------------------- /pyspider/webui/templates/task.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Task - {{ task.project }}:{{ task.taskid }} - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
21 |

22 | {{ status_to_string(task.status) }} 23 | {{ task.project }}.{{ task.process.callback }} 24 | > 25 | {{ task.url }} 26 | {% if task.status in (2, 3, 4) %} 27 | ({{ task.lastcrawltime | format_date }} crawled ) 28 | {% else %} 29 | ({{ task.updatetime | format_date }} updated ) 30 | {% endif %} 31 |

32 |
33 |
34 |
35 |
taskid
36 |
{{ task.taskid }}
37 |
lastcrawltime
38 |
{{ task.lastcrawltime }} ({{ task.lastcrawltime | format_date }})
39 |
updatetime
40 |
{{ task.updatetime }} ({{ task.updatetime | format_date }})
41 | # if task.schedule and task.schedule.exetime 42 |
exetime
43 |
{{ task.schedule.exetime }} ({{ task.schedule.exetime | format_date }})
44 | # endif 45 | 46 | # if task.track and task.track.fetch 47 |
48 | track.fetch 49 | 50 | {{ (task.track.fetch.time * 1000) | round(2) }}ms 51 |
52 |
{{ json.dumps(task.track.fetch, indent=2, ensure_ascii=False) }}
53 | # endif 54 | 55 | # if task.track and task.track.process 56 |
57 | track.process 58 | 59 | {{ (task.track.process.time * 1000) | round(2) }}ms 60 | # if task.track.process.follows 61 | +{{ task.track.process.follows | int }} 62 | # endif 63 |
64 |
65 | #- if task.track.process.exception 66 | {{- task.track.process.exception or '' }} 67 | # endif 68 | #- if task.track.process.logs 69 | {{- task.track.process.logs or '' }} 70 | # endif 71 | {{- json.dumps(task.track.process, indent=2, ensure_ascii=False) -}} 72 |
73 | # endif 74 |
75 |
76 | #- set not_shown_keys = ('status', 'url', 'project', 'taskid', 'lastcrawltime', 'updatetime', 'track', ) 77 | #- for key, value in task.items() if key not in not_shown_keys 78 |
{{ key }}
79 |
{{ json.dumps(value, indent=2, ensure_ascii=False) if value is mapping else value }}
80 | #- endfor 81 |
82 | # if result and result.get('result'): 83 |
84 |
result
85 |
{{ json.dumps(result['result'], indent=2, ensure_ascii=False) }}
86 |
87 | # endif 88 |
89 | 90 | 91 | 92 | 93 | -------------------------------------------------------------------------------- /pyspider/webui/templates/tasks.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tasks - pyspider 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 |
    21 | {% for task in tasks | sort(reverse=True, attribute='updatetime') %} 22 |
  1. 23 | {% if task.status %} 24 | {{ status_to_string(task.status) }} 25 | {% elif task.track %} 26 | 27 | {% set fetchok = task.track.fetch and task.track.fetch.ok %} 28 | {% set processok = task.track.process and task.track.process.ok %} 29 | {%- if not fetchok -%} 30 | FETCH_ERROR 31 | {%- elif not processok -%} 32 | PROCESS_ERROR 33 | {%- endif -%} 34 | 35 | {% else %} 36 | ERROR 37 | {% endif %} 38 | 39 | {{ task.project }} 40 | > 41 | {{ task.url }} 42 | 43 | {{ task.updatetime | format_date }} 44 | 45 | {% if task.track and task.track.fetch %} 46 | 47 | {{- '%.1f' | format(task.track.fetch.time * 1000) }}+{{ '%.2f' | format(task.track.process.time * 1000 if task.track and task.track.process else 0) }}ms 48 | 49 | {% endif %} 50 | 51 | 52 | {% if task.track and task.track.process %} 53 | +{{ task.track.process.follows | int }} 54 | {% endif %} 55 | 56 |
  2. 57 | {% endfor %} 58 |
59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Flask==0.10 2 | Jinja2==2.7 3 | chardet==3.0.4 4 | cssselect==0.9 5 | lxml==4.3.3 6 | pycurl==7.43.0.3 7 | pyquery==1.4.0 8 | requests==2.24.0 9 | tornado==4.5.3 10 | mysql-connector-python==8.0.16 11 | pika==1.1.0 12 | pymongo==3.9.0 13 | Flask-Login==0.2.11 14 | u-msgpack-python==1.6 15 | click==6.6 16 | SQLAlchemy==1.3.10 17 | six==1.10.0 18 | amqp==2.4.0 19 | redis==2.10.6 20 | redis-py-cluster==1.3.6 21 | kombu==4.4.0 22 | psycopg2==2.8.2 23 | elasticsearch==2.3.0 24 | tblib==1.4.0 25 | -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-24 23:11:49 7 | 8 | from pyspider.run import main 9 | 10 | if __name__ == '__main__': 11 | main() 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-24 22:27:45 7 | 8 | 9 | import sys 10 | from setuptools import setup, find_packages 11 | from codecs import open 12 | from os import path 13 | 14 | here = path.abspath(path.dirname(__file__)) 15 | with open(path.join(here, 'README.md'), encoding='utf-8') as f: 16 | long_description = f.read() 17 | 18 | import pyspider 19 | 20 | install_requires = [ 21 | 'Flask==0.10', 22 | 'Jinja2==2.7', 23 | 'chardet==3.0.4', 24 | 'cssselect==0.9', 25 | "lxml==4.3.3", 26 | 'pycurl==7.43.0.3', 27 | 'requests==2.24.0', 28 | 'Flask-Login==0.2.11', 29 | 'u-msgpack-python==1.6', 30 | 'click==3.3', 31 | 'six==1.10.0', 32 | 'tblib==1.4.0', 33 | 'wsgidav==2.3.0', 34 | 'tornado>=3.2,<=4.5.3', 35 | 'pyquery', 36 | ] 37 | 38 | extras_require_all = [ 39 | 'mysql-connector-python==8.0.16', 40 | 'pymongo==3.9.0', 41 | 'redis==2.10.6', 42 | 'redis-py-cluster==1.3.6', 43 | 'psycopg2==2.8.2', 44 | 'elasticsearch==2.3.0', 45 | 'kombu==4.4.0', 46 | 'amqp==2.4.0', 47 | 'SQLAlchemy==1.3.10', 48 | 'pika==1.1.0' 49 | ] 50 | 51 | setup( 52 | name='pyspider', 53 | version=pyspider.__version__, 54 | 55 | description='A Powerful Spider System in Python', 56 | long_description=long_description, 57 | 58 | url='https://github.com/binux/pyspider', 59 | 60 | author='Roy Binux', 61 | author_email='roy@binux.me', 62 | 63 | license='Apache License, Version 2.0', 64 | 65 | classifiers=[ 66 | 'Development Status :: 4 - Beta', 67 | 'Programming Language :: Python :: 3.5', 68 | 'Programming Language :: Python :: 3.6', 69 | 'Programming Language :: Python :: 3.7', 70 | 71 | 'License :: OSI Approved :: Apache Software License', 72 | 73 | 'Intended Audience :: Developers', 74 | 'Operating System :: OS Independent', 75 | 'Environment :: Web Environment', 76 | 77 | 'Topic :: Internet :: WWW/HTTP', 78 | 'Topic :: Software Development :: Libraries :: Application Frameworks', 79 | 'Topic :: Software Development :: Libraries :: Python Modules', 80 | ], 81 | 82 | keywords='scrapy crawler spider webui', 83 | 84 | packages=find_packages(exclude=['data', 'tests*']), 85 | 86 | 
install_requires=install_requires, 87 | 88 | extras_require={ 89 | 'all': extras_require_all, 90 | 'test': [ 91 | 'coverage', 92 | 'Werkzeug==0.16.1', 93 | 'httpbin==0.7.0', 94 | 'pyproxy==0.1.6', 95 | 'easywebdav==1.2.0', 96 | ] 97 | }, 98 | 99 | package_data={ 100 | 'pyspider': [ 101 | 'logging.conf', 102 | 'fetcher/phantomjs_fetcher.js', 103 | 'fetcher/splash_fetcher.lua', 104 | 'webui/static/*.js', 105 | 'webui/static/*.css', 106 | 'webui/templates/*' 107 | ], 108 | }, 109 | 110 | entry_points={ 111 | 'console_scripts': [ 112 | 'pyspider=pyspider.run:main' 113 | ] 114 | }, 115 | 116 | test_suite='tests.all_suite', 117 | ) 118 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-02-09 10:53:19 7 | 8 | import os 9 | import unittest 10 | 11 | all_suite = unittest.TestLoader().discover(os.path.dirname(__file__), "test_*.py") 12 | -------------------------------------------------------------------------------- /tests/data_fetcher_processor_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 14:12:55 7 | 8 | from pyspider.libs.base_handler import * 9 | 10 | class Handler(BaseHandler): 11 | 12 | @not_send_status 13 | def not_send_status(self, response): 14 | self.crawl('http://www.baidu.com/') 15 | return response.text 16 | 17 | def url_deduplicated(self, response): 18 | self.crawl('http://www.baidu.com/') 19 | self.crawl('http://www.google.com/') 20 | self.crawl('http://www.baidu.com/') 21 | self.crawl('http://www.google.com/') 22 | self.crawl('http://www.google.com/') 23 | 24 | @catch_status_code_error 25 | def catch_http_error(self, response): 26 | self.crawl('http://www.baidu.com/') 27 | return response.status_code 28 | 29 | def json(self, response): 30 | return response.json 31 | 32 | def html(self, response): 33 | return response.doc('h1').text() 34 | 35 | def links(self, response): 36 | self.crawl([x.attr.href for x in response.doc('a').items()], callback=self.links) 37 | 38 | def cookies(self, response): 39 | return response.cookies 40 | 41 | def get_save(self, response): 42 | return response.save 43 | 44 | def get_process_save(self, response): 45 | return self.save 46 | 47 | def set_process_save(self, response): 48 | self.save['roy'] = 'binux' 49 | 50 | class IgnoreHandler(BaseHandler): 51 | pass 52 | 53 | __handler_cls__ = Handler 54 | -------------------------------------------------------------------------------- /tests/data_handler.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env python 3 | # -*- encoding: utf-8 -*- 4 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 5 | # Author: Binux 6 | # http://binux.me 7 | # Created on 2014-02-22 14:02:21 8 | 9 | import time 10 | from pyspider.libs.base_handler import BaseHandler, catch_status_code_error, every 11 | 12 | class IgnoreHandler(object): 13 | pass 14 | 15 | class TestHandler(BaseHandler): 16 | retry_delay = { 17 | 1: 10, 18 | '': -1 19 | } 20 | 21 | def hello(self): 22 | return "hello world!" 
23 | 24 | def echo(self, response): 25 | return response.content 26 | 27 | def saved(self, response): 28 | return response.save 29 | 30 | def echo_task(self, response, task): 31 | return task['project'] 32 | 33 | @catch_status_code_error 34 | def catch_status_code(self, response): 35 | return response.status_code 36 | 37 | def raise_exception(self): 38 | print('print') 39 | logger.info("info") 40 | logger.warning("warning") 41 | logger.error("error") 42 | raise Exception('exception') 43 | 44 | def add_task(self, response): 45 | self.crawl('http://www.google.com', callback='echo', params={'wd': u'中文'}) 46 | self.send_message('some_project', {'some': 'message'}) 47 | 48 | @every 49 | def on_cronjob1(self, response): 50 | logger.info('on_cronjob1') 51 | 52 | @every(seconds=10) 53 | def on_cronjob2(self, response): 54 | logger.info('on_cronjob2') 55 | 56 | def generator(self, response): 57 | yield "a" 58 | yield "b" 59 | 60 | def sleep(self, response): 61 | time.sleep(response.save) 62 | 63 | -------------------------------------------------------------------------------- /tests/data_sample_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on __DATE__ 4 | # Project: __PROJECT_NAME__ 5 | 6 | from pyspider.libs.base_handler import * 7 | 8 | 9 | class Handler(BaseHandler): 10 | crawl_config = { 11 | } 12 | 13 | @every(minutes=24 * 60) 14 | def on_start(self): 15 | self.crawl('http://127.0.0.1:14887/pyspider/test.html', callback=self.index_page) 16 | 17 | @config(age=10 * 24 * 60 * 60) 18 | def index_page(self, response): 19 | for each in response.doc('a[href^="http"]').items(): 20 | self.crawl(each.attr.href, callback=self.detail_page) 21 | 22 | @config(priority=2) 23 | def detail_page(self, response): 24 | return { 25 | "url": response.url, 26 | "title": response.doc('title').text(), 27 | } 28 | -------------------------------------------------------------------------------- /tests/data_test_webpage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-24 13:44:10 7 | 8 | from httpbin import app 9 | 10 | @app.route('/pyspider/test.html') 11 | def test_page(): 12 | return ''' 13 | 404 14 | 0 15 | 1 16 | 2 17 | 3 18 | 4 19 | gzip 20 | get 21 | deflate 22 | html 23 | xml 24 | robots 25 | cache 26 | stream 27 | ''' 28 | 29 | @app.route('/pyspider/ajax.html') 30 | def test_ajax(): 31 | return ''' 32 |
loading...
33 |
34 |
35 | 46 | ''' 47 | 48 | @app.route('/pyspider/ajax_click.html') 49 | def test_ajax_click(): 50 | return ''' 51 |
loading...
52 |
53 |
54 |
load 55 | 68 | ''' 69 | -------------------------------------------------------------------------------- /tests/test_base_handler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2017-02-26 10:35:23 7 | 8 | import unittest 9 | 10 | from pyspider.libs.base_handler import BaseHandler 11 | 12 | 13 | class TestBaseHandler(unittest.TestCase): 14 | sample_task_http = { 15 | 'taskid': 'taskid', 16 | 'project': 'project', 17 | 'url': '', 18 | 'fetch': { 19 | 'method': 'GET', 20 | 'headers': { 21 | 'Cookie': 'a=b', 22 | 'a': 'b' 23 | }, 24 | 'cookies': { 25 | 'c': 'd', 26 | }, 27 | 'timeout': 60, 28 | 'save': 'abc', 29 | }, 30 | 'process': { 31 | 'callback': 'callback', 32 | 'save': [1, 2, 3], 33 | }, 34 | } 35 | 36 | def test_task_join_crawl_config(self): 37 | task = dict(self.sample_task_http) 38 | crawl_config = { 39 | 'taskid': 'xxxx', # should not affect finial task 40 | 'proxy': 'username:password@hostname:port', # should add proxy 41 | 'headers': { # should merge headers 42 | 'Cookie': 'abc', # should not affect cookie 43 | 'c': 'd', # should add header c 44 | } 45 | } 46 | 47 | ret = BaseHandler.task_join_crawl_config(task, crawl_config) 48 | self.assertDictEqual(ret, { 49 | 'taskid': 'taskid', 50 | 'project': 'project', 51 | 'url': '', 52 | 'fetch': { 53 | 'method': 'GET', 54 | 'proxy': 'username:password@hostname:port', 55 | 'headers': { 56 | 'Cookie': 'a=b', 57 | 'a': 'b', 58 | 'c': 'd' 59 | }, 60 | 'cookies': { 61 | 'c': 'd', 62 | }, 63 | 'timeout': 60, 64 | 'save': 'abc', 65 | }, 66 | 'process': { 67 | 'callback': 'callback', 68 | 'save': [1, 2, 3], 69 | }, 70 | }); 71 | -------------------------------------------------------------------------------- /tests/test_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-12-10 01:34:09 7 | 8 | import os 9 | import sys 10 | import time 11 | import click 12 | import shutil 13 | import inspect 14 | import unittest 15 | 16 | from pyspider import run 17 | from pyspider.libs import utils 18 | 19 | class TestBench(unittest.TestCase): 20 | 21 | @classmethod 22 | def setUpClass(self): 23 | shutil.rmtree('./data/bench', ignore_errors=True) 24 | os.makedirs('./data/bench') 25 | 26 | @classmethod 27 | def tearDownClass(self): 28 | shutil.rmtree('./data/bench', ignore_errors=True) 29 | 30 | def test_10_bench(self): 31 | import subprocess 32 | #cmd = [sys.executable] 33 | cmd = ['coverage', 'run'] 34 | p = subprocess.Popen(cmd+[ 35 | inspect.getsourcefile(run), 36 | '--queue-maxsize=0', 37 | 'bench', 38 | '--total=500' 39 | ], close_fds=True, stderr=subprocess.PIPE) 40 | 41 | stdout, stderr = p.communicate() 42 | stderr = utils.text(stderr) 43 | print(stderr) 44 | 45 | self.assertEqual(p.returncode, 0, stderr) 46 | self.assertIn('Crawled', stderr) 47 | self.assertIn('Fetched', stderr) 48 | self.assertIn('Processed', stderr) 49 | self.assertIn('Saved', stderr) 50 | -------------------------------------------------------------------------------- /tests/test_counter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix 
fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-04-05 00:05:58 7 | 8 | import sys 9 | import time 10 | import unittest 11 | 12 | from pyspider.libs import counter 13 | 14 | class TestCounter(unittest.TestCase): 15 | def test_010_TimebaseAverageEventCounter(self): 16 | c = counter.TimebaseAverageEventCounter(2, 1) 17 | for i in range(100): 18 | time.sleep(0.1) 19 | c.event(100+i) 20 | 21 | self.assertEqual(c.sum, float(180+199)*20/2) 22 | self.assertEqual(c.avg, float(180+199)/2) 23 | 24 | def test_020_TotalCounter(self): 25 | c = counter.TotalCounter() 26 | for i in range(3): 27 | c.event(i) 28 | self.assertEqual(c.avg, 3) 29 | self.assertEqual(c.sum, 3) 30 | 31 | def test_030_AverageWindowCounter(self): 32 | c = counter.AverageWindowCounter(10) 33 | self.assertTrue(c.empty()) 34 | 35 | for i in range(20): 36 | c.event(i) 37 | 38 | self.assertFalse(c.empty()) 39 | self.assertEqual(c.avg, 14.5) 40 | self.assertEqual(c.sum, 145) 41 | 42 | def test_020_delete(self): 43 | c = counter.CounterManager() 44 | c.event(('a', 'b'), 1) 45 | c.event(('a', 'c'), 1) 46 | c.event(('b', 'c'), 1) 47 | 48 | self.assertIsNotNone(c['a']) 49 | self.assertIsNotNone(c['b']) 50 | 51 | del c['a'] 52 | 53 | self.assertNotIn('a', c) 54 | self.assertIsNotNone(c['b']) 55 | -------------------------------------------------------------------------------- /tests/test_response.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 11:10:27 7 | 8 | 9 | import os 10 | import copy 11 | import time 12 | import httpbin 13 | import unittest 14 | 15 | import logging 16 | import logging.config 17 | logging.config.fileConfig("pyspider/logging.conf") 18 | 19 | from pyspider.libs import utils 20 | from pyspider.libs.response import rebuild_response 21 | from pyspider.fetcher.tornado_fetcher import Fetcher 22 | 23 | class TestResponse(unittest.TestCase): 24 | sample_task_http = { 25 | 'taskid': 'taskid', 26 | 'project': 'project', 27 | 'url': '', 28 | } 29 | 30 | @classmethod 31 | def setUpClass(self): 32 | self.fetcher = Fetcher(None, None, async_mode=False) 33 | self.httpbin_thread = utils.run_in_subprocess(httpbin.app.run, port=14887, passthrough_errors=False) 34 | self.httpbin = 'http://127.0.0.1:14887' 35 | time.sleep(0.5) 36 | 37 | @classmethod 38 | def tearDownClass(self): 39 | self.httpbin_thread.terminate() 40 | 41 | def get(self, url, **kwargs): 42 | if not url.startswith('http://'): 43 | url = self.httpbin + url 44 | request = copy.deepcopy(self.sample_task_http) 45 | request['url'] = url 46 | request.update(kwargs) 47 | result = self.fetcher.fetch(request) 48 | response = rebuild_response(result) 49 | return response 50 | 51 | def test_10_html(self): 52 | response = self.get('/html') 53 | self.assertEqual(response.status_code, 200) 54 | self.assertIsNotNone(response.doc('h1')) 55 | 56 | def test_20_xml(self): 57 | response = self.get('/xml') 58 | self.assertEqual(response.status_code, 200) 59 | self.assertIsNotNone(response.doc('item')) 60 | 61 | def test_30_gzip(self): 62 | response = self.get('/gzip') 63 | self.assertEqual(response.status_code, 200) 64 | self.assertIn('gzipped', response.text) 65 | 66 | def test_40_deflate(self): 67 | response = self.get('/deflate') 68 | self.assertEqual(response.status_code, 200) 69 | self.assertIn('deflated', response.text) 70 | 71 | def 
test_50_ok(self): 72 | response = self.get('/status/200') 73 | self.assertTrue(response.ok) 74 | self.assertTrue(response) 75 | response = self.get('/status/302') 76 | self.assertTrue(response.ok) 77 | self.assertTrue(response) 78 | with self.assertRaises(Exception): 79 | self.raise_for_status(allow_redirects=False) 80 | 81 | def test_60_not_ok(self): 82 | response = self.get('/status/400') 83 | self.assertFalse(response.ok) 84 | self.assertFalse(response) 85 | response = self.get('/status/500') 86 | self.assertFalse(response.ok) 87 | self.assertFalse(response) 88 | response = self.get('/status/600') 89 | self.assertFalse(response.ok) 90 | self.assertFalse(response) 91 | 92 | def test_70_reraise_exception(self): 93 | response = self.get('file://abc') 94 | with self.assertRaisesRegex(Exception, 'HTTP 599'): 95 | response.raise_for_status() 96 | -------------------------------------------------------------------------------- /tests/test_result_dump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-10-12 22:17:57 7 | 8 | from __future__ import unicode_literals, division 9 | 10 | import six 11 | import csv 12 | import time 13 | import json 14 | import unittest 15 | from six import StringIO 16 | 17 | from pyspider.libs import result_dump 18 | 19 | results1 = [ 20 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 21 | 'result': {'a': 1, 'b': 2} }, 22 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 23 | 'result': {'a': 1, 'b': 2, 'c': 3} }, 24 | ] 25 | 26 | results2 = results1 + [ 27 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 28 | 'result': [1, 2, '中文', u'中文'] }, 29 | ] 30 | 31 | results_error = results2 + [ 32 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 33 | 'result': None}, 34 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time() }, 35 | {'taskid': 'taskid1', 'pdatetime': time.time() }, 36 | ] 37 | 38 | result_list_error = [ 39 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 40 | 'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]}, 41 | {'taskid': 'taskid1', 'url': 'http://example.org/url1', 'pdatetime': time.time(), 42 | 'result': [{"rate": "8.2", "title": '1'}, {"rate": "8.2", "title": '1'}]}, 43 | ] 44 | 45 | class TestResultDump(unittest.TestCase): 46 | def test_result_formater_1(self): 47 | common_fields, results = result_dump.result_formater(results1) 48 | self.assertEqual(common_fields, set(('a', 'b'))) 49 | 50 | def test_result_formater_2(self): 51 | common_fields, results = result_dump.result_formater(results2) 52 | self.assertEqual(common_fields, set()) 53 | 54 | def test_result_formater_error(self): 55 | common_fields, results = result_dump.result_formater(results_error) 56 | self.assertEqual(common_fields, set()) 57 | 58 | def test_dump_as_json(self): 59 | for i, line in enumerate((''.join( 60 | result_dump.dump_as_json(results2))).splitlines()): 61 | self.assertDictEqual(results2[i], json.loads(line)) 62 | 63 | def test_dump_as_json_valid(self): 64 | ret = json.loads(''.join(result_dump.dump_as_json(results2, True))) 65 | for i, j in zip(results2, ret): 66 | self.assertDictEqual(i, j) 67 | 68 | def test_dump_as_txt(self): 69 | for i, line in 
enumerate((''.join( 70 | result_dump.dump_as_txt(results2))).splitlines()): 71 | url, json_data = line.split('\t', 2) 72 | self.assertEqual(results2[i]['result'], json.loads(json_data)) 73 | 74 | def test_dump_as_csv(self): 75 | reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(results1)))) 76 | for row in reader: 77 | self.assertEqual(len(row), 4) 78 | 79 | def test_dump_as_csv_case_1(self): 80 | reader = csv.reader(StringIO(''.join(result_dump.dump_as_csv(result_list_error)))) 81 | for row in reader: 82 | self.assertEqual(len(row), 2) 83 | -------------------------------------------------------------------------------- /tests/test_result_worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2014-11-11 20:52:53 7 | 8 | import os 9 | import time 10 | import unittest 11 | import logging.config 12 | logging.config.fileConfig("pyspider/logging.conf") 13 | 14 | import shutil 15 | from pyspider.database.sqlite import resultdb 16 | from pyspider.result.result_worker import ResultWorker 17 | from pyspider.libs.multiprocessing_queue import Queue 18 | from pyspider.libs.utils import run_in_thread 19 | 20 | 21 | class TestProcessor(unittest.TestCase): 22 | resultdb_path = './data/tests/result.db' 23 | 24 | @classmethod 25 | def setUpClass(self): 26 | shutil.rmtree('./data/tests/', ignore_errors=True) 27 | os.makedirs('./data/tests/') 28 | 29 | def get_resultdb(): 30 | return resultdb.ResultDB(self.resultdb_path) 31 | self.resultdb = get_resultdb() 32 | self.inqueue = Queue(10) 33 | 34 | def run_result_worker(): 35 | self.result_worker = ResultWorker(get_resultdb(), self.inqueue) 36 | self.result_worker.run() 37 | self.process = run_in_thread(run_result_worker) 38 | time.sleep(1) 39 | 40 | @classmethod 41 | def tearDownClass(self): 42 | if self.process.is_alive(): 43 | self.result_worker.quit() 44 | self.process.join(2) 45 | assert not self.process.is_alive() 46 | shutil.rmtree('./data/tests/', ignore_errors=True) 47 | 48 | def test_10_bad_result(self): 49 | self.inqueue.put(({'project': 'test_project'}, {})) 50 | self.resultdb._list_project() 51 | self.assertEqual(len(self.resultdb.projects), 0) 52 | self.assertEqual(self.resultdb.count('test_project'), 0) 53 | 54 | def test_10_bad_result_2(self): 55 | self.inqueue.put(({'project': 'test_project'}, {'a': 'b'})) 56 | self.resultdb._list_project() 57 | self.assertEqual(len(self.resultdb.projects), 0) 58 | self.assertEqual(self.resultdb.count('test_project'), 0) 59 | 60 | def test_20_insert_result(self): 61 | data = { 62 | 'a': 'b' 63 | } 64 | self.inqueue.put(({ 65 | 'project': 'test_project', 66 | 'taskid': 'id1', 67 | 'url': 'url1' 68 | }, data)) 69 | time.sleep(0.5) 70 | self.resultdb._list_project() 71 | self.assertEqual(len(self.resultdb.projects), 1) 72 | self.assertEqual(self.resultdb.count('test_project'), 1) 73 | 74 | result = self.resultdb.get('test_project', 'id1') 75 | self.assertEqual(result['result'], data) 76 | 77 | def test_30_overwrite(self): 78 | self.inqueue.put(({ 79 | 'project': 'test_project', 80 | 'taskid': 'id1', 81 | 'url': 'url1' 82 | }, "abc")) 83 | time.sleep(0.1) 84 | result = self.resultdb.get('test_project', 'id1') 85 | self.assertEqual(result['result'], "abc") 86 | 87 | def test_40_insert_list(self): 88 | self.inqueue.put(({ 89 | 'project': 'test_project', 90 | 'taskid': 'id2', 91 | 'url': 'url1' 92 | }, 
['a', 'b'])) 93 | time.sleep(0.1) 94 | result = self.resultdb.get('test_project', 'id2') 95 | self.assertEqual(result['result'], ['a', 'b']) 96 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-01-18 16:53:49 7 | 8 | import sys 9 | import time 10 | import unittest 11 | 12 | from pyspider.libs import utils 13 | 14 | class TestFetcher(unittest.TestCase): 15 | def test_readonlydict(self): 16 | data = dict(a='a', b=123) 17 | data['c'] = self 18 | data = utils.ReadOnlyDict(data) 19 | 20 | with self.assertRaises(Exception): 21 | data['d'] = 9 22 | 23 | def test_getitem(self): 24 | l = [1, 2] 25 | self.assertEqual(utils.getitem(l, 0), 1) 26 | self.assertEqual(utils.getitem(l, 1), 2) 27 | self.assertEqual(utils.getitem(l, 3), None) 28 | self.assertEqual(utils.getitem(l, 3, 9), 9) 29 | self.assertEqual(utils.getitem(l, 'key'), None) 30 | self.assertEqual(utils.getitem(l, 'key', 8), 8) 31 | data = dict(a='a', b=123) 32 | self.assertEqual(utils.getitem(data, 'a'), 'a') 33 | self.assertEqual(utils.getitem(data, 'b'), 123) 34 | self.assertEqual(utils.getitem(data, 'c'), None) 35 | self.assertEqual(utils.getitem(data, 'c', 9), 9) 36 | 37 | def test_format_data(self): 38 | now = time.time() 39 | self.assertEqual(utils.format_date(now - 30), '30 seconds ago') 40 | self.assertEqual(utils.format_date(now - 60), '1 minute ago') 41 | self.assertEqual(utils.format_date(now - 2*60), '2 minutes ago') 42 | self.assertEqual(utils.format_date(now - 30*60), '30 minutes ago') 43 | self.assertEqual(utils.format_date(now - 60*60), '1 hour ago') 44 | self.assertEqual(utils.format_date(1963475336), 'Mar 21, 2032 at 9:48') 45 | self.assertEqual(utils.format_date(now - 12*60*60), '12 hours ago') 46 | self.assertRegex(utils.format_date(now - 24*60*60), r'^yesterday at \d{1,2}:\d{2}$') 47 | self.assertRegex(utils.format_date(now - 2*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 48 | self.assertRegex(utils.format_date(now - 3*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 49 | self.assertRegex(utils.format_date(now - 4*24*60*60), r'^[A-Z][a-z]+ at \d{1,2}:\d{2}$') 50 | self.assertRegex(utils.format_date(now - 5*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$') 51 | self.assertRegex(utils.format_date(now - 333*24*60*60), r'^\d{1,2}-\d{1,2} at \d{1,2}:\d{2}$') 52 | self.assertRegex(utils.format_date(now - 334*24*60*60), r'^[A-Z][a-z]+ \d{1,2}, \d{4} at \d{1,2}:\d{2}$') 53 | -------------------------------------------------------------------------------- /tests/test_xmlrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2006-2007 Open Source Applications Foundation 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | # Origin: https://code.google.com/p/wsgi-xmlrpc/ 16 | 17 | import unittest 18 | import tornado.wsgi 19 | import tornado.ioloop 20 | import tornado.httpserver 21 | from pyspider.libs import utils 22 | 23 | class TestXMLRPCServer(unittest.TestCase): 24 | @classmethod 25 | def setUpClass(self): 26 | from pyspider.libs import wsgi_xmlrpc 27 | 28 | def test_1(): 29 | return 'test_1' 30 | 31 | class Test2(object): 32 | def test_3(self, obj): 33 | return obj 34 | 35 | test = Test2() 36 | 37 | application = wsgi_xmlrpc.WSGIXMLRPCApplication() 38 | application.register_instance(Test2()) 39 | application.register_function(test_1) 40 | 41 | container = tornado.wsgi.WSGIContainer(application) 42 | self.io_loop = tornado.ioloop.IOLoop.current() 43 | http_server = tornado.httpserver.HTTPServer(container, io_loop=self.io_loop) 44 | http_server.listen(3423) 45 | self.thread = utils.run_in_thread(self.io_loop.start) 46 | 47 | @classmethod 48 | def tearDownClass(self): 49 | self.io_loop.add_callback(self.io_loop.stop) 50 | self.thread.join() 51 | 52 | def test_xmlrpc_server(self, uri='http://127.0.0.1:3423'): 53 | from six.moves.xmlrpc_client import ServerProxy 54 | 55 | client = ServerProxy(uri) 56 | 57 | assert client.test_1() == 'test_1' 58 | assert client.test_3({'asdf':4}) == {'asdf':4} 59 | -------------------------------------------------------------------------------- /tools/migrate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8: 4 | # Author: Binux 5 | # http://binux.me 6 | # Created on 2015-09-30 23:22:46 7 | 8 | import click 9 | import logging 10 | from pyspider.database.base.projectdb import ProjectDB 11 | from pyspider.database.base.taskdb import TaskDB 12 | from pyspider.database.base.resultdb import ResultDB 13 | from pyspider.database import connect_database 14 | from pyspider.libs.utils import unicode_obj 15 | from multiprocessing.pool import ThreadPool as Pool 16 | 17 | logging.getLogger().setLevel(logging.INFO) 18 | 19 | 20 | def taskdb_migrating(project, from_connection, to_connection): 21 | logging.info("taskdb: %s", project) 22 | f = connect_database(from_connection) 23 | t = connect_database(to_connection) 24 | t.drop(project) 25 | for status in range(1, 5): 26 | for task in f.load_tasks(status, project=project): 27 | t.insert(project, task['taskid'], task) 28 | 29 | 30 | def resultdb_migrating(project, from_connection, to_connection): 31 | logging.info("resultdb: %s", project) 32 | f = connect_database(from_connection) 33 | t = connect_database(to_connection) 34 | t.drop(project) 35 | for result in f.select(project): 36 | t.save(project, result['taskid'], result['url'], result['result']) 37 | 38 | 39 | @click.command() 40 | @click.option('--pool', default=10, help='cocurrent worker size.') 41 | @click.argument('from_connection', required=1) 42 | @click.argument('to_connection', required=1) 43 | def migrate(pool, from_connection, to_connection): 44 | """ 45 | Migrate tool for pyspider 46 | """ 47 | f = connect_database(from_connection) 48 | t = connect_database(to_connection) 49 | 50 | if isinstance(f, ProjectDB): 51 | for each in f.get_all(): 52 | each = unicode_obj(each) 53 | logging.info("projectdb: %s", each['name']) 54 | t.drop(each['name']) 55 | t.insert(each['name'], each) 56 | elif isinstance(f, TaskDB): 57 | pool = Pool(pool) 58 | pool.map( 59 | lambda x, f=from_connection, t=to_connection: taskdb_migrating(x, f, t), 60 
| f.projects) 61 | elif isinstance(f, ResultDB): 62 | pool = Pool(pool) 63 | pool.map( 64 | lambda x, f=from_connection, t=to_connection: resultdb_migrating(x, f, t), 65 | f.projects) 66 | 67 | 68 | if __name__ == '__main__': 69 | migrate() 70 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py35,py36,py37,py38 3 | [testenv] 4 | install_command = 5 | pip install --allow-all-external 'https://dev.mysql.com/get/Downloads/Connector-Python/mysql-connector-python-2.1.5.zip#md5=ce4a24cb1746c1c8f6189a97087f21c1' {opts} -e .[all,test] {packages} 6 | commands = 7 | python setup.py test [] 8 | --------------------------------------------------------------------------------
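
For context on how tools/migrate.py above is typically driven, the sketch below replays its per-project task-migration loop directly against pyspider's connect_database(). It is only an illustration: the connection URIs, paths and credentials are placeholders, not values taken from this repository.

# Minimal sketch (assumption: run from the repository root with pyspider importable).
# Both connection URIs are placeholders in pyspider's "<engine>+<dbtype>://..." form.
from pyspider.database import connect_database

src = connect_database('sqlite+taskdb:///data/task.db')                    # placeholder source taskdb
dst = connect_database('mysql+taskdb://root:root@127.0.0.1:3306/taskdb')   # placeholder target taskdb

for project in src.projects:
    dst.drop(project)                           # same reset step taskdb_migrating() performs
    for status in range(1, 5):                  # taskdb statuses 1..4
        for task in src.load_tasks(status, project=project):
            dst.insert(project, task['taskid'], task)

Running the click command itself (python tools/migrate.py <from_connection> <to_connection>) does the same work for every project, parallelised over the --pool thread pool.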