├── .editorconfig ├── .github └── workflows │ └── workflow.yml ├── .gitignore ├── CHANGELOG.md ├── Cargo.toml ├── Justfile ├── LICENSE ├── Makefile ├── README.md ├── doc └── maman.adoc ├── man └── man1 │ └── maman.1 ├── rustfmt.toml ├── src ├── lib.rs ├── main.rs └── maman │ ├── mod.rs │ └── page.rs └── tests └── lib.rs /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [**] 4 | charset = utf-8 5 | end_of_line = lf 6 | insert_final_newline = true 7 | 8 | [**.rs] 9 | trim_trailing_whitespace = true 10 | indent_style = space 11 | indent_size = 4 12 | 13 | [**.yml] 14 | indent_style = space 15 | indent_size = 2 16 | 17 | [**.md] 18 | trim_trailing_whitespace = false 19 | -------------------------------------------------------------------------------- /.github/workflows/workflow.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: CI 3 | on: 4 | push: 5 | paths-ignore: 6 | - "**.md" 7 | 8 | pull_request: 9 | paths-ignore: 10 | - "**.md" 11 | 12 | jobs: 13 | # Run the `rustfmt` code formatter 14 | rustfmt: 15 | name: Rustfmt [Formatter] 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v2 19 | - uses: actions-rs/toolchain@v1 20 | with: 21 | profile: minimal 22 | toolchain: stable 23 | components: rustfmt 24 | override: true 25 | - run: rustup component add rustfmt 26 | - uses: actions-rs/cargo@v1 27 | with: 28 | command: fmt 29 | args: --all -- --check 30 | 31 | # Run the `clippy` linting tool 32 | clippy: 33 | name: Clippy [Linter] 34 | runs-on: ubuntu-latest 35 | steps: 36 | - uses: actions/checkout@v2 37 | - uses: actions-rs/toolchain@v1 38 | with: 39 | profile: minimal 40 | toolchain: nightly 41 | components: clippy 42 | override: true 43 | - uses: actions-rs/clippy-check@v1 44 | with: 45 | token: ${{ secrets.GITHUB_TOKEN }} 46 | args: --all-targets --all-features -- -D clippy::all 47 | 48 | # Run a security audit on dependencies 49 | cargo_audit: 50 | name: Cargo Audit [Security] 51 | runs-on: ubuntu-latest 52 | steps: 53 | - uses: actions/checkout@v2 54 | - uses: actions-rs/toolchain@v1 55 | with: 56 | toolchain: stable 57 | override: true 58 | - run: cargo install --force cargo-audit 59 | - run: cargo generate-lockfile 60 | - uses: actions-rs/cargo@v1 61 | with: 62 | command: audit 63 | 64 | # Run bench 65 | cargo_bench: 66 | name: Cargo Bench [Bench] 67 | runs-on: ubuntu-latest 68 | steps: 69 | - uses: actions/checkout@v2 70 | - uses: actions-rs/toolchain@v1 71 | with: 72 | toolchain: nightly 73 | override: true 74 | - run: cargo generate-lockfile 75 | - name: Start Redis 76 | uses: supercharge/redis-github-action@1.2.0 77 | with: 78 | redis-version: 6 79 | - uses: actions-rs/cargo@v1 80 | with: 81 | command: bench 82 | 83 | # Ensure that the project could be successfully compiled 84 | cargo_check: 85 | name: Compile 86 | runs-on: ubuntu-latest 87 | 88 | steps: 89 | - uses: actions/checkout@v2 90 | - uses: actions-rs/toolchain@v1 91 | with: 92 | profile: minimal 93 | toolchain: stable 94 | override: true 95 | 96 | - uses: actions-rs/cargo@v1 97 | with: 98 | command: check 99 | args: --all 100 | 101 | # Run tests on Linux, macOS, and Windows 102 | # On both Rust stable and Rust nightly 103 | test: 104 | name: Test Suite 105 | needs: [cargo_check] 106 | runs-on: ${{ matrix.os }} 107 | strategy: 108 | fail-fast: false 109 | matrix: 110 | os: [ubuntu-latest] 111 | rust: [stable, nightly] 112 | 113 | steps: 114 | # Checkout the branch being tested 115 | 
- uses: actions/checkout@v2 116 | 117 | # Cache files between builds 118 | - name: Cache cargo registry 119 | uses: actions/cache@v1 120 | with: 121 | path: ~/.cargo/registry 122 | key: ${{ runner.os }}-cargo-registry-${{ hashFiles('**/Cargo.lock') }} 123 | 124 | - name: Cache cargo index 125 | uses: actions/cache@v1 126 | with: 127 | path: ~/.cargo/git 128 | key: ${{ runner.os }}-cargo-index-${{ hashFiles('**/Cargo.lock') }} 129 | 130 | - name: Cache cargo build 131 | uses: actions/cache@v1 132 | with: 133 | path: target 134 | key: ${{ runner.os }}-cargo-build-target-${{ hashFiles('**/Cargo.lock') }} 135 | 136 | # Install all the required dependencies for testing 137 | - uses: actions-rs/toolchain@v1 138 | with: 139 | profile: minimal 140 | toolchain: stable 141 | override: true 142 | 143 | - name: Start Redis 144 | uses: supercharge/redis-github-action@1.2.0 145 | with: 146 | redis-version: 6 147 | 148 | - name: Run all tests 149 | uses: actions-rs/cargo@v1 150 | with: 151 | command: test 152 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | Cargo.lock 3 | *.bk 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 2 | 0.13.1 / 2018-12-01 3 | =================== 4 | 5 | * Fix mime filter when content type include charset 6 | * Update env_logger and mockito 7 | 8 | 0.13.0 / 2018-10-15 9 | =================== 10 | 11 | * Use mockito for integration tests 12 | * Update reqwest to 0.9 13 | * Drop encoding crate already done in reqwest now 14 | * Fix mime type filter when charset is present 15 | * remove `use std::ascii::AsciiExt` for Rust 1.23 16 | * Update env_logger to 0.5 17 | * Update html5ever to 0.22 18 | * Update to log and env_logger 0.4 19 | * Update sidekiq to 0.8 20 | * Add filter mime types and update deps 21 | * Update reqwest to 0.7 and robotparser to 0.9 22 | 23 | 0.12.1 / 2017-06-18 24 | =================== 25 | 26 | * Remove unused hyper_serde crate 27 | 28 | 0.12.0 / 2017-06-12 29 | =================== 30 | 31 | * Fix owned instance just for comparison 32 | * Update deps serde 1.0 33 | * Update clippy and add html_root_url 34 | * Add http status and http version to page 35 | * Less strict deps 36 | * Add clippy check 37 | 38 | 0.11.0 / 2017-03-12 39 | =================== 40 | 41 | * Use sidekiq pub Value 42 | * Update sidekiq client to 0.6 43 | * Add travis-ci badge to crates.io 44 | * Update serde to version 0.9 45 | 46 | 0.10.0 / 2017-01-08 47 | =================== 48 | 49 | * Update html5ever to 0.10 50 | * Update reqwest to 0.2.0 and sidekiq to 0.4.0 51 | 52 | 0.9.0 / 2016-11-19 53 | ================== 54 | 55 | * Use reqwest as http client and upgrade robotparser 56 | 57 | 0.8.0 / 2016-10-09 58 | ================== 59 | 60 | * Remove unused extra vector 61 | * Readme updates 62 | 63 | 0.7.0 / 2016-09-11 64 | ================== 65 | 66 | * Use makefile for install and add manpage 67 | * Cleanup main 68 | * Better error handling for redis pool 69 | * Use properly env_logger and fix tests 70 | * Use log and env_logger crate 71 | * Print sidekiq error to stderr 72 | * Use rust-url feature serde for serialization 73 | * Add continue_to_crawl fn 74 | * Move page to own file 75 | 76 | 0.6.0 / 2016-09-03 77 | ================== 78 | 79 | * Fix robots.txt path from base_url 80 | * Use encoding crate 81 | * 
Update robotparser to 0.5.0 82 | 83 | 0.5.1 / 2016-08-21 84 | ================== 85 | 86 | * Dont follow redirect on crawling 87 | * Add rustfmt.toml config 88 | 89 | 0.5.0 / 2016-08-20 90 | ================== 91 | 92 | * Fix sidekiq push error display 93 | * Update url and sidekiq move to serde 94 | * Add LIMIT option 95 | 96 | 0.4.0 / 2016-06-07 97 | ================== 98 | 99 | * Add urls and extra to Page 100 | * Move and fix private public functions 101 | * Use String instead of Url and cleanup 102 | * Update sidekiq to v0.1.2 103 | 104 | 0.3.0 / 2016-05-29 105 | ================== 106 | 107 | * Only follow StatusCode::Ok and StatusCode::NotModified 108 | * Move job logic from Page to Job 109 | * Use rust-sidekiq 110 | 111 | 0.2.0 / 2016-05-09 112 | ================== 113 | 114 | * Set redis per default to 127.0.0.1 115 | * Use env var for REDIS_URL or default to redis://localhost/ 116 | * Update rust-url to 1.1 117 | * use robotparser::RobotFileParser 118 | 119 | 0.1.0 / 2016-05-03 120 | ================== 121 | 122 | * Initial release 123 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "maman" 3 | # When updating version, also modify html_root_url in the src/lib.rs file. 4 | version = "0.13.1" 5 | authors = ["Laurent Arnoud "] 6 | description = "Rust Web Crawler" 7 | repository = "https://github.com/spk/maman.git" 8 | homepage = "https://github.com/spk/maman" 9 | keywords = ["http", "web", "crawler", "spider"] 10 | license = "MIT" 11 | readme = "README.md" 12 | edition = "2018" 13 | exclude = ["Justfile", "rustfmt.toml"] 14 | 15 | [badges] 16 | travis-ci = { repository = "spk/maman" } 17 | 18 | [dependencies] 19 | env_logger = "0.8" 20 | html5ever = "0.25" 21 | log = "0.4" 22 | mime = "0.3" 23 | reqwest = "0.9" 24 | robotparser = "0.10" 25 | serde = "1.0" 26 | serde_derive = "1.0" 27 | serde_json = "1.0" 28 | sidekiq = "0.9" 29 | url = { version = "2.0", features = ["serde"] } 30 | 31 | [dev-dependencies] 32 | mockito = "0.30" 33 | -------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | default_prefix := "/usr/local" 2 | default_manpage_path := "share/man/man1/" 3 | default_manpage := "maman.1" 4 | 5 | all: build test 6 | 7 | @build: 8 | cargo build 9 | 10 | @test: 11 | cargo test --all -- --quiet 12 | 13 | @bench: 14 | cargo bench 15 | 16 | @docs: build 17 | cargo doc --no-deps 18 | 19 | @format: 20 | cargo fmt --all -- --check 21 | 22 | @lint: 23 | cargo clippy -- -D warnings 24 | 25 | @install: 26 | cargo build --release 27 | find doc/ -type f -exec asciidoctor -b manpage -D man/man1 {} \; 28 | install -dm755 {{env_var_or_default("PREFIX", default_prefix)}}/bin/ 29 | install -dm755 {{env_var_or_default("PREFIX", default_prefix)}}/{{default_manpage_path}} 30 | install -sm755 target/release/maman {{env_var_or_default("PREFIX", default_prefix)}}/bin/ 31 | install -m644 man/man1/{{default_manpage}} {{env_var_or_default("PREFIX", default_prefix)}}/{{default_manpage_path}} 32 | 33 | @uninstall: 34 | rm -f {{env_var_or_default("PREFIX", default_prefix)}}/bin/maman 35 | rm -f {{env_var_or_default("PREFIX", default_prefix)}}/{{default_manpage_path}}{{default_manpage}} 36 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2016-2021 Laurent Arnoud 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PREFIX ?= /usr/local 2 | 3 | BUILD = target/release/maman 4 | MANPAGE = man/man1/maman.1 5 | 6 | all: build install ## Build and install 7 | 8 | help: ## Show this help 9 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ 10 | sort | \ 11 | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 12 | 13 | $(BUILD): ## Build 14 | @which cargo > /dev/null || { echo "https://www.rust-lang.org/"; exit 1; } 15 | @cargo build --release 16 | 17 | build: $(BUILD) 18 | 19 | INSTALL = $(PREFIX)/bin/maman 20 | 21 | $(INSTALL): ## Install 22 | install -dm755 $(PREFIX)/bin/ $(PREFIX)/share/man/man1/ 23 | install -sm755 $(BUILD) $(PREFIX)/bin/ 24 | install -m644 $(MANPAGE) $(PREFIX)/share/man/man1/ 25 | 26 | install: build $(INSTALL) 27 | 28 | clean: ## Clean 29 | rm -rf $(BUILD) 30 | 31 | uninstall: ## Uninstall 32 | rm $(PREFIX)/bin/maman $(PREFIX)/share/$(MANPAGE) 33 | 34 | manpage: ## Generate manpage 35 | @which asciidoctor > /dev/null || { echo "install asciidoctor"; exit 1; } 36 | @find doc/ -type f -exec asciidoctor -b manpage -D man/man1 {} \; 37 | 38 | .PHONY: all install clean uninstall help 39 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Maman 2 | 3 | Maman is a Rust Web Crawler saving pages on Redis. 
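Once a crawl has pushed pages, any Sidekiq-compatible worker can pick them back up from Redis. The sketch below is a minimal, hypothetical consumer using the `redis` crate (not a dependency of this project); it assumes the default `development` namespace, so the list key is `development:queue:maman`. The job payload format is shown below.

``` rust
// Hypothetical consumer sketch: maman only pushes jobs, reading them back is up to you.
fn main() -> redis::RedisResult<()> {
    let client = redis::Client::open("redis://127.0.0.1/")?;
    let mut con = client.get_connection()?;
    // Pop one queued page; the key assumes MAMAN_ENV=development.
    let job: Option<String> = redis::cmd("RPOP")
        .arg("development:queue:maman")
        .query(&mut con)?;
    if let Some(payload) = job {
        println!("{}", payload); // JSON in the Sidekiq job format shown below
    }
    Ok(())
}
```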
4 | 5 | Pages are send to list `:queue:maman` using 6 | [Sidekiq job format](https://github.com/mperham/sidekiq/wiki/Job-Format) 7 | 8 | ``` json 9 | { 10 | "class": "Maman", 11 | "jid": "b4a577edbccf1d805744efa9", 12 | "retry": true, 13 | "created_at": 1461789979, "enqueued_at": 1461789979, 14 | "args": { 15 | "document":"", 16 | "urls": ["https://example.net/new"], 17 | "headers": {"content-type": "text/html"}, 18 | "url": "https://example.net/" 19 | } 20 | } 21 | ``` 22 | 23 | ## Dependencies 24 | 25 | * [Redis](http://redis.io/) 26 | 27 | ## Installation 28 | 29 | ### With cargo 30 | 31 | ``` 32 | cargo install maman 33 | ``` 34 | 35 | ### With [just](https://github.com/casey/just) 36 | 37 | ``` 38 | PREFIX=~/.local just install 39 | ``` 40 | 41 | ## Usage 42 | 43 | ``` 44 | maman URL [LIMIT] [MIME_TYPES] 45 | ``` 46 | 47 | `LIMIT` must be an integer or `0` is the default, meaning no limit. 48 | 49 | ## Environment variables 50 | 51 | ### Defaults 52 | 53 | * MAMAN_ENV=development 54 | * REDIS_URL="redis://127.0.0.1/" 55 | 56 | ### Others 57 | 58 | * RUST_LOG=maman=info 59 | 60 | ## LICENSE 61 | 62 | The MIT License 63 | 64 | Copyright (c) 2016-2021 Laurent Arnoud 65 | 66 | --- 67 | [![Build](https://img.shields.io/github/workflow/status/spk/maman/CI/master.svg)](https://github.com/spk/maman/actions) 68 | [![Version](https://img.shields.io/crates/v/maman.svg)](https://crates.io/crates/maman) 69 | [![Documentation](https://img.shields.io/badge/doc-rustdoc-blue.svg)](https://docs.rs/maman/) 70 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT "MIT") 71 | [![Dependency status](https://deps.rs/repo/github/spk/maman/status.svg)](https://deps.rs/repo/github/spk/maman) 72 | -------------------------------------------------------------------------------- /doc/maman.adoc: -------------------------------------------------------------------------------- 1 | = maman(1) 2 | 3 | == NAME 4 | 5 | maman - Rust Web Crawler saving pages on Redis 6 | 7 | == SYNOPSIS 8 | 9 | *maman* URL ['LIMIT'] ['MIME_TYPES'] 10 | 11 | == DESCRIPTION 12 | 13 | Maman is a Rust Web Crawler saving pages on Redis. 14 | Pages are send to list :queue:maman using 15 | https://github.com/mperham/sidekiq/wiki/Job-Format[Sidekiq job format] 16 | 17 | == URL 18 | 19 | Must be a valid http url. 20 | 21 | == LIMIT 22 | 23 | Crawling limit must be an integer or 0 is the default, meaning no limit. 24 | 25 | == Environment variables 26 | 27 | === Defaults 28 | 29 | * MAMAN_ENV=development 30 | * REDIS_URL="redis://127.0.0.1/" 31 | 32 | === Others 33 | 34 | * RUST_LOG=maman=info 35 | 36 | == AUTHOR 37 | 38 | mailto:laurent@spkdev.net[Laurent Arnoud] 39 | 40 | == LICENSE 41 | 42 | The MIT License 43 | 44 | Copyright (c) 2016-2018 mailto:laurent@spkdev.net[Laurent Arnoud] 45 | 46 | // vim: set syntax=asciidoc: 47 | -------------------------------------------------------------------------------- /man/man1/maman.1: -------------------------------------------------------------------------------- 1 | '\" t 2 | .\" Title: maman 3 | .\" Author: [see the "AUTHOR(S)" section] 4 | .\" Generator: Asciidoctor 1.5.8 5 | .\" Date: 2018-01-20 6 | .\" Manual: \ \& 7 | .\" Source: \ \& 8 | .\" Language: English 9 | .\" 10 | .TH "MAMAN" "1" "2018-01-20" "\ \&" "\ \&" 11 | .ie \n(.g .ds Aq \(aq 12 | .el .ds Aq ' 13 | .ss \n[.ss] 0 14 | .nh 15 | .ad l 16 | .de URL 17 | \fI\\$2\fP <\\$1>\\$3 18 | .. 19 | .als MTO URL 20 | .if \n[.g] \{\ 21 | . mso www.tmac 22 | . am URL 23 | . ad l 24 | . . 25 | . 
am MTO 26 | . ad l 27 | . . 28 | . LINKSTYLE blue R < > 29 | .\} 30 | .SH "NAME" 31 | maman \- Rust Web Crawler saving pages on Redis 32 | .SH "SYNOPSIS" 33 | .sp 34 | \fBmaman\fP URL [\(aqLIMIT\(aq] [\(aqMIME_TYPES\(aq] 35 | .SH "DESCRIPTION" 36 | .sp 37 | Maman is a Rust Web Crawler saving pages on Redis. 38 | Pages are send to list :queue:maman using 39 | .URL "https://github.com/mperham/sidekiq/wiki/Job\-Format" "Sidekiq job format" "" 40 | .SH "URL" 41 | .sp 42 | Must be a valid http url. 43 | .SH "LIMIT" 44 | .sp 45 | Crawling limit must be an integer or 0 is the default, meaning no limit. 46 | .SH "ENVIRONMENT VARIABLES" 47 | .SS "Defaults" 48 | .sp 49 | .RS 4 50 | .ie n \{\ 51 | \h'-04'\(bu\h'+03'\c 52 | .\} 53 | .el \{\ 54 | . sp -1 55 | . IP \(bu 2.3 56 | .\} 57 | MAMAN_ENV=development 58 | .RE 59 | .sp 60 | .RS 4 61 | .ie n \{\ 62 | \h'-04'\(bu\h'+03'\c 63 | .\} 64 | .el \{\ 65 | . sp -1 66 | . IP \(bu 2.3 67 | .\} 68 | REDIS_URL="redis://127.0.0.1/" 69 | .RE 70 | .SS "Others" 71 | .sp 72 | .RS 4 73 | .ie n \{\ 74 | \h'-04'\(bu\h'+03'\c 75 | .\} 76 | .el \{\ 77 | . sp -1 78 | . IP \(bu 2.3 79 | .\} 80 | RUST_LOG=maman=info 81 | .RE 82 | .SH "AUTHOR" 83 | .sp 84 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" 85 | .SH "LICENSE" 86 | .sp 87 | The MIT License 88 | .sp 89 | Copyright (c) 2016\-2018 \c 90 | .MTO "laurent\(atspkdev.net" "Laurent Arnoud" "" -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | max_width = 101 2 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! Maman is a Rust Web Crawler saving pages on Redis. 2 | //! 3 | //! # Default environment variables 4 | //! 5 | //! * `MAMAN_ENV`=development 6 | //! * `REDIS_URL`="redis://127.0.0.1/" 7 | #![doc(html_root_url = "https://docs.rs/maman/0.13.1")] 8 | #![deny(warnings)] 9 | #![crate_name = "maman"] 10 | 11 | extern crate html5ever; 12 | #[macro_use] 13 | extern crate log; 14 | extern crate mime; 15 | extern crate reqwest; 16 | extern crate robotparser; 17 | extern crate serde; 18 | #[macro_use] 19 | extern crate serde_derive; 20 | #[macro_use] 21 | extern crate serde_json; 22 | extern crate sidekiq; 23 | extern crate url; 24 | 25 | #[macro_export] 26 | macro_rules! maman_name { 27 | () => { 28 | "Maman" 29 | }; 30 | } 31 | #[macro_export] 32 | macro_rules! maman_version { 33 | () => { 34 | env!("CARGO_PKG_VERSION") 35 | }; 36 | } 37 | #[macro_export] 38 | macro_rules! maman_version_string { 39 | () => { 40 | concat!(maman_name!(), " v", maman_version!()) 41 | }; 42 | } 43 | #[macro_export] 44 | macro_rules! 
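// Expands to the User-Agent header value sent with every crawl request,
// e.g. "Maman v0.13.1 (https://crates.io/crates/maman)" for the current version.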
maman_user_agent { 45 | () => { 46 | concat!(maman_version_string!(), " (https://crates.io/crates/maman)") 47 | }; 48 | } 49 | 50 | pub use crate::maman::{Page, Spider}; 51 | pub mod maman; 52 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate env_logger; 2 | #[macro_use] 3 | extern crate log; 4 | extern crate mime; 5 | extern crate url; 6 | 7 | #[macro_use] 8 | extern crate maman; 9 | extern crate sidekiq; 10 | 11 | use std::env; 12 | use std::process; 13 | use std::str::FromStr; 14 | 15 | use maman::Spider; 16 | use sidekiq::create_redis_pool; 17 | use url::Url; 18 | 19 | const DEFAULT_LIMIT: isize = 0; 20 | 21 | fn print_usage() { 22 | println!(maman_version_string!()); 23 | println!("Usage: maman URL [LIMIT] [MIME_TYPES]"); 24 | } 25 | 26 | fn fetch_url(url_arg: Option<String>) -> Url { 27 | match url_arg { 28 | Some(url) => match Url::parse(url.as_ref()) { 29 | Ok(u) => u, 30 | Err(_) => { 31 | print_usage(); 32 | process::exit(1); 33 | } 34 | }, 35 | None => { 36 | print_usage(); 37 | process::exit(1); 38 | } 39 | } 40 | } 41 | 42 | fn fetch_limit(limit_arg: Option<String>) -> isize { 43 | match limit_arg { 44 | Some(limit) => match limit.parse::<isize>() { 45 | Err(_) => DEFAULT_LIMIT, 46 | Ok(l) => l, 47 | }, 48 | None => DEFAULT_LIMIT, 49 | } 50 | } 51 | 52 | fn fetch_mime_types(mime_types_arg: Option<String>) -> Vec<mime::Mime> { 53 | let mut mime_types = Vec::new(); 54 | match mime_types_arg { 55 | Some(mts) => { 56 | let v: Vec<&str> = mts.split(' ').collect(); 57 | for m in v { 58 | if let Ok(mime) = mime::Mime::from_str(m) { 59 | mime_types.push(mime); 60 | } 61 | } 62 | mime_types 63 | } 64 | None => mime_types, 65 | } 66 | } 67 | 68 | fn main() { 69 | env_logger::init(); 70 | 71 | match create_redis_pool() { 72 | Ok(redis_pool) => { 73 | let mut spider = Spider::new( 74 | redis_pool, 75 | fetch_url(env::args().nth(1)), 76 | fetch_limit(env::args().nth(2)), 77 | fetch_mime_types(env::args().nth(3)), 78 | ); 79 | spider.crawl() 80 | } 81 | Err(err) => { 82 | error!("Redis error: {}", err); 83 | process::exit(1); 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/maman/mod.rs: -------------------------------------------------------------------------------- 1 | mod page; 2 | pub use self::page::Page; 3 | 4 | use std::collections::BTreeMap; 5 | use std::default::Default; 6 | use std::env; 7 | use std::str::FromStr; 8 | use std::time::Duration; 9 | 10 | use html5ever::tokenizer::BufferQueue; 11 | use html5ever::tokenizer::Tokenizer; 12 | use reqwest::header::{CONTENT_TYPE, USER_AGENT}; 13 | use reqwest::Client as HttpClient; 14 | use reqwest::Response as HttpResponse; 15 | use reqwest::StatusCode; 16 | use robotparser::RobotFileParser; 17 | use sidekiq::Client as SidekiqClient; 18 | use sidekiq::ClientOpts as SidekiqClientOpts; 19 | use sidekiq::RedisPool; 20 | use url::Url; 21 | 22 | const MAMAN_ENV: &str = "MAMAN_ENV"; 23 | const MAMAN_ENV_DEFAULT: &str = "development"; 24 | 25 | pub struct Spider<'a> { 26 | pub base_url: Url, 27 | pub visited_urls: Vec<Url>, 28 | pub unvisited_urls: Vec<Url>, 29 | pub env: String, 30 | pub limit: isize, 31 | pub mime_types: Vec<mime::Mime>, 32 | sidekiq: SidekiqClient, 33 | robot_parser: RobotFileParser<'a>, 34 | } 35 | 36 | impl<'a> Spider<'a> { 37 | pub fn new( 38 | redis_pool: RedisPool, 39 | base_url: Url, 40 | limit: isize, 41 | mime_types: Vec<mime::Mime>, 42 | ) -> Spider<'a> { 43 | let maman_env = 44 | 
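// MAMAN_ENV selects the Sidekiq namespace used when pushing jobs; it falls back to "development" when unset.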
env::var(&MAMAN_ENV.to_owned()).unwrap_or_else(|_| MAMAN_ENV_DEFAULT.to_owned()); 45 | let robots_txt = base_url.join("/robots.txt").unwrap(); 46 | let robot_file_parser = RobotFileParser::new(robots_txt); 47 | let client_opts = SidekiqClientOpts { 48 | namespace: Some(maman_env.to_string()), 49 | }; 50 | let sidekiq = SidekiqClient::new(redis_pool, client_opts); 51 | Spider { 52 | base_url, 53 | visited_urls: Vec::new(), 54 | unvisited_urls: Vec::new(), 55 | sidekiq, 56 | env: maman_env, 57 | robot_parser: robot_file_parser, 58 | limit, 59 | mime_types, 60 | } 61 | } 62 | 63 | pub fn visit_page(&mut self, page: Page) { 64 | self.visited_urls.push(page.url.clone()); 65 | for u in &page.urls { 66 | self.unvisited_urls.push(u.clone()); 67 | } 68 | if let Err(err) = self.sidekiq.push(page.to_job()) { 69 | error!("SidekiqClient push failed: {}", err); 70 | } 71 | } 72 | 73 | pub fn crawl(&mut self) { 74 | self.robot_parser.read(); 75 | let base_url = self.base_url.clone(); 76 | if let Some(response) = Spider::load_url(self.base_url.as_ref(), &self.mime_types) { 77 | self.visit(&base_url, response); 78 | while let Some(url) = self.unvisited_urls.pop() { 79 | if self.continue_to_crawl() { 80 | if !self.visited_urls.contains(&url) { 81 | if let Some(response) = Spider::load_url(url.as_ref(), &self.mime_types) { 82 | self.visit(&url, response); 83 | } 84 | } 85 | } else { 86 | break; 87 | } 88 | } 89 | } 90 | } 91 | 92 | pub fn read_page(page: Page, document: &str) -> Tokenizer<Page> { 93 | let mut tok = Tokenizer::new(page, Default::default()); 94 | let mut input = BufferQueue::new(); 95 | input.push_back(String::from(document).into()); 96 | let _ = tok.feed(&mut input); 97 | tok.end(); 98 | tok 99 | } 100 | 101 | fn visit(&mut self, page_url: &Url, response: HttpResponse) { 102 | if self.can_visit(page_url) { 103 | info!("{}", page_url); 104 | if let Some(page) = Spider::read_response(page_url, response) { 105 | self.visit_page(page); 106 | } 107 | } 108 | } 109 | 110 | fn continue_to_crawl(&self) -> bool { 111 | self.limit == 0 || (self.visited_urls.len() as isize) < self.limit 112 | } 113 | 114 | fn can_visit(&self, page_url: &Url) -> bool { 115 | self.robot_parser.can_fetch(maman_name!(), page_url.path()) 116 | } 117 | 118 | fn read_response(page_url: &Url, mut response: HttpResponse) -> Option<Page> { 119 | let mut headers = BTreeMap::new(); 120 | { 121 | for (key, value) in response.headers().iter() { 122 | headers.insert( 123 | key.as_str().to_string(), 124 | value.to_str().unwrap_or("").to_string(), 125 | ); 126 | } 127 | } 128 | match response.text() { 129 | Ok(content) => { 130 | let page = Page::new( 131 | page_url.clone(), 132 | content.to_string(), 133 | headers, 134 | response.status().to_string(), 135 | ); 136 | let read = Spider::read_page(page, &content); 137 | Some(read.sink) 138 | } 139 | _ => None, 140 | } 141 | } 142 | 143 | fn load_url(url: &str, mime_types: &[mime::Mime]) -> Option<HttpResponse> { 144 | let client = HttpClient::builder() 145 | .timeout(Duration::from_secs(5)) 146 | .build() 147 | .expect("HttpClient failed to construct"); 148 | match client 149 | .get(url) 150 | .header(USER_AGENT, maman_user_agent!()) 151 | .send() 152 | { 153 | Err(_) => None, 154 | Ok(response) => match response.status() { 155 | StatusCode::OK | StatusCode::NOT_MODIFIED => { 156 | if mime_types.is_empty() { 157 | Some(response) 158 | } else { 159 | let content_type = response 160 | .headers() 161 | .get(CONTENT_TYPE) 162 | .and_then(|value| value.to_str().ok()) 163 | .and_then(|value| value.parse::<mime::Mime>().ok()) 164 | 
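// Rebuild the mime as bare type/subtype so parameters are ignored when filtering,
// e.g. a response sent as "text/html; charset=utf-8" still matches a plain "text/html" filter.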
.and_then(|value| { 165 | let (type_, subtype) = (value.type_(), value.subtype()); 166 | { 167 | let mut text = type_.to_string(); 168 | text.push('/'); 169 | text.push_str(subtype.as_ref()); 170 | mime::Mime::from_str(&text).ok() 171 | } 172 | }); 173 | match content_type { 174 | Some(ct) => { 175 | if mime_types.contains(&ct) { 176 | Some(response) 177 | } else { 178 | None 179 | } 180 | } 181 | None => None, 182 | } 183 | } 184 | } 185 | _ => None, 186 | }, 187 | } 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /src/maman/page.rs: -------------------------------------------------------------------------------- 1 | use std::collections::BTreeMap; 2 | use std::default::Default; 3 | 4 | use html5ever::tokenizer::{TagToken, Token, TokenSink, TokenSinkResult}; 5 | use sidekiq::{Job, JobOpts, Value}; 6 | use url::{ParseError, Url}; 7 | 8 | #[derive(Serialize, Debug)] 9 | pub struct Page { 10 | pub url: Url, 11 | pub document: String, 12 | pub headers: BTreeMap<String, String>, 13 | pub status: String, 14 | pub urls: Vec<Url>, 15 | } 16 | 17 | impl TokenSink for Page { 18 | type Handle = (); 19 | 20 | fn process_token(&mut self, token: Token, _: u64) -> TokenSinkResult<()> { 21 | if let TagToken(tag) = token { 22 | if tag.name.as_ref() == "a" { 23 | for attr in &tag.attrs { 24 | if attr.name.local.as_ref() == "href" { 25 | if let Some(u) = self.can_enqueue(&attr.value) { 26 | self.urls.push(u); 27 | } 28 | } 29 | } 30 | } 31 | } 32 | TokenSinkResult::Continue 33 | } 34 | } 35 | 36 | impl Page { 37 | pub fn new( 38 | url: Url, 39 | document: String, 40 | headers: BTreeMap<String, String>, 41 | status: String, 42 | ) -> Self { 43 | Page { 44 | url, 45 | document, 46 | headers, 47 | status, 48 | urls: Vec::new(), 49 | } 50 | } 51 | 52 | pub fn to_job(&self) -> Job { 53 | let job_opts = JobOpts { 54 | queue: maman_name!().to_string().to_lowercase(), 55 | ..Default::default() 56 | }; 57 | Job::new(maman_name!().to_string(), vec![self.as_object()], job_opts) 58 | } 59 | 60 | pub fn as_object(&self) -> Value { 61 | json!({ 62 | "url": &self.url, 63 | "document": &self.document, 64 | "headers": &self.headers, 65 | "status": &self.status, 66 | "urls": &self.urls, 67 | }) 68 | } 69 | 70 | fn normalize_url(&self, url: &str) -> Option<Url> { 71 | match Url::parse(url) { 72 | Ok(u) => Some(u), 73 | Err(ParseError::RelativeUrlWithoutBase) => Some(self.url.join(url).unwrap()), 74 | Err(_) => None, 75 | } 76 | } 77 | 78 | fn url_without_fragment(&self, url: &str) -> Option<Url> { 79 | match self.normalize_url(url) { 80 | Some(mut u) => { 81 | u.set_fragment(None); 82 | Some(u) 83 | } 84 | None => None, 85 | } 86 | } 87 | 88 | fn url_eq(&self, url: &Url) -> bool { 89 | self.url == *url 90 | } 91 | 92 | fn domain_eq(&self, url: &Url) -> bool { 93 | self.url.domain() == url.domain() 94 | } 95 | 96 | fn can_enqueue(&self, url: &str) -> Option<Url> { 97 | match self.url_without_fragment(url) { 98 | Some(u) => match u.scheme() { 99 | "http" | "https" => { 100 | if !self.url_eq(&u) && self.domain_eq(&u) { 101 | Some(u) 102 | } else { 103 | None 104 | } 105 | } 106 | _ => None, 107 | }, 108 | None => None, 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /tests/lib.rs: -------------------------------------------------------------------------------- 1 | extern crate mockito; 2 | extern crate sidekiq; 3 | extern crate url; 4 | #[macro_use] 5 | extern crate maman; 6 | 7 | use maman::{Page, Spider}; 8 | use sidekiq::create_redis_pool; 9 | 10 | use 
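// NOTE: the tests below expect a reachable local Redis instance (the CI workflow starts one) and switch MAMAN_ENV to "test".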
std::collections::BTreeMap; 11 | use std::env; 12 | use std::str::FromStr; 13 | 14 | use url::Url; 15 | 16 | fn visit_page(input: &str) -> Spider { 17 | env::set_var("MAMAN_ENV", "test"); 18 | let url = Url::parse("https://example.net/").unwrap(); 19 | let redis_pool = create_redis_pool().unwrap(); 20 | let mut spider = Spider::new(redis_pool, url.clone(), 0, Vec::new()); 21 | let page = Page::new( 22 | url, 23 | input.to_string(), 24 | BTreeMap::new(), 25 | "200 OK".to_string(), 26 | ); 27 | let tok = Spider::read_page(page, input); 28 | spider.visit_page(tok.sink); 29 | spider 30 | } 31 | 32 | #[test] 33 | fn test_ignore_initial_url_link() { 34 | let input = ""; 35 | let spider = visit_page(input); 36 | assert_eq!(spider.visited_urls.len(), 1); 37 | assert_eq!(spider.unvisited_urls.len(), 1); 38 | } 39 | 40 | #[test] 41 | fn test_ignore_fragment_link() { 42 | let input = ""; 43 | let spider = visit_page(input); 44 | assert_eq!(spider.visited_urls.len(), 1); 45 | assert_eq!(spider.unvisited_urls.len(), 1); 46 | } 47 | 48 | #[test] 49 | fn test_ignore_mailto_link() { 50 | let input = ""; 51 | let spider = visit_page(input); 52 | assert_eq!(spider.visited_urls.len(), 1); 53 | assert_eq!(spider.unvisited_urls.len(), 1); 54 | } 55 | 56 | #[test] 57 | fn test_new_with_fragment_link() { 58 | let input = ""; 59 | let spider = visit_page(input); 60 | assert_eq!(spider.visited_urls.len(), 1); 61 | assert_eq!(spider.unvisited_urls.len(), 2); 62 | } 63 | 64 | #[test] 65 | fn test_other_domain_link() { 66 | let input = ""; 67 | let spider = visit_page(input); 68 | assert_eq!(spider.visited_urls.len(), 1); 69 | assert_eq!(spider.unvisited_urls.len(), 0); 70 | } 71 | 72 | #[test] 73 | fn test_json_job_format() { 74 | env::set_var("MAMAN_ENV", "test"); 75 | let input = ""; 76 | let url = Url::parse("http://example.net/").unwrap(); 77 | let mut headers = BTreeMap::new(); 78 | headers.insert("content-type".to_string(), "text/html".to_string()); 79 | let page = Page::new(url, input.to_string(), headers, "200 OK".to_string()); 80 | let page_object = page.as_object(); 81 | let job = page.to_job(); 82 | assert_eq!(job.class, maman_name!()); 83 | assert_eq!(job.retry, 25); 84 | assert_eq!(job.queue, maman_name!().to_string().to_lowercase()); 85 | assert_eq!(job.args, vec![page_object]); 86 | } 87 | 88 | #[test] 89 | fn test_integration() { 90 | use mockito::mock; 91 | let _r = mock("GET", "/robots.txt") 92 | .with_status(200) 93 | .with_header("content-type", "text/plain") 94 | .with_body("User-agent: *\nAllow: /") 95 | .create(); 96 | let _m1 = mock("GET", "/") 97 | .with_status(200) 98 | .with_header("content-type", "text/html") 99 | .with_body("hello") 100 | .create(); 101 | let _m2 = mock("GET", "/hello") 102 | .with_status(200) 103 | .with_header("content-type", "text/html") 104 | .with_body("world") 105 | .create(); 106 | let _m3 = mock("GET", "/world") 107 | .with_status(200) 108 | .with_header("content-type", "text/html") 109 | .with_body("!") 110 | .create(); 111 | let redis_pool = create_redis_pool().unwrap(); 112 | let url = Url::parse(&mockito::server_url()).unwrap(); 113 | let mut spider = Spider::new(redis_pool, url, 0, Vec::new()); 114 | spider.crawl(); 115 | assert_eq!(spider.visited_urls.len(), 3); 116 | } 117 | 118 | #[test] 119 | fn test_integration_filter() { 120 | use mockito::mock; 121 | let _r = mock("GET", "/robots.txt") 122 | .with_status(200) 123 | .with_header("content-type", "text/plain") 124 | .with_body("User-agent: *\nAllow: /") 125 | .create(); 126 | let _m1 = mock("GET", "/") 
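// This content-type deliberately carries a charset parameter; the plain "text/html" filter passed to Spider::new below must still match it.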
127 | .with_status(200) 128 | .with_header("content-type", "text/html; charset=utf-8") 129 | .with_body("hello") 130 | .create(); 131 | let _m2 = mock("GET", "/hello") 132 | .with_status(200) 133 | .with_header("content-type", "text/html; charset=utf-8") 134 | .with_body("world") 135 | .create(); 136 | let _m3 = mock("GET", "/world") 137 | .with_status(200) 138 | .with_header("content-type", "text/html; charset=utf-8") 139 | .with_body("!") 140 | .create(); 141 | let redis_pool = create_redis_pool().unwrap(); 142 | let url = Url::parse(&mockito::server_url()).unwrap(); 143 | let mut spider = Spider::new( 144 | redis_pool, 145 | url, 146 | 0, 147 | vec![mime::Mime::from_str("text/html").unwrap()], 148 | ); 149 | spider.crawl(); 150 | assert_eq!(spider.visited_urls.len(), 3); 151 | } 152 | --------------------------------------------------------------------------------