├── .backportrc.json ├── .buildkite ├── pipeline.yml ├── publish │ ├── build-and-push-multiarch-docker.sh │ ├── build-docker.sh │ ├── publish-common.sh │ ├── push-docker.sh │ └── test-docker.sh ├── pull-requests.json ├── release-pipeline.yml └── scripts │ ├── run_ci_step.sh │ └── run_command.sh ├── .bundler-version ├── .devcontainer └── devcontainer.json ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── enhancement.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── add-labels-main.yml │ └── backport.yml ├── .gitignore ├── .java-version ├── .jrubyrc ├── .rspec ├── .rubocop.yml ├── .ruby-version ├── Brewfile ├── Dockerfile ├── Dockerfile.wolfi ├── Gemfile ├── Gemfile.lock ├── Jarfile ├── Jars.lock ├── LICENSE ├── Makefile ├── NOTICE.txt ├── README.md ├── bin └── crawler ├── catalog-info.yaml ├── config ├── README.md ├── crawler.yml.example ├── elasticsearch.yml.example ├── examples │ ├── parks-australia.yml │ └── simple.yml └── filebeat.yml.example ├── docker-compose.yaml ├── docs ├── ADVANCED.md ├── CHANGELOG.md ├── CLI.md ├── CODE_OF_CONDUCT.md ├── CONFIG.md ├── CONTRIBUTING.md ├── DEVELOPER_GUIDE.md ├── ELASTICSEARCH.md ├── FEATURE_COMPARISON.md ├── RELEASING.md ├── SECURITY.md ├── SUPPORT.md └── features │ ├── BINARY_CONTENT_EXTRACTION.md │ ├── CRAWLER_DIRECTIVES.md │ ├── CRAWL_RULES.md │ ├── EXTRACTION_RULES.md │ ├── INGEST_PIPELINES.md │ ├── LOGGING.md │ └── SCHEDULING.md ├── lib ├── constants.rb ├── crawler.rb ├── crawler │ ├── api │ │ ├── config.rb │ │ └── crawl.rb │ ├── cli.rb │ ├── cli │ │ ├── crawl.rb │ │ ├── helpers.rb │ │ ├── schedule.rb │ │ ├── urltest.rb │ │ ├── validate.rb │ │ └── version.rb │ ├── content_engine │ │ ├── extractor.rb │ │ ├── transformer.rb │ │ └── utils.rb │ ├── coordinator.rb │ ├── core_ext.rb │ ├── data │ │ ├── crawl_result │ │ │ ├── base.rb │ │ │ ├── content_extractable_file.rb │ │ │ ├── error.rb │ │ │ ├── html.rb │ │ │ ├── http_auth_disallowed_error.rb │ │ │ ├── redirect.rb │ │ │ ├── redirect_error.rb │ │ │ ├── robots_txt.rb │ │ │ ├── sitemap.rb │ │ │ ├── success.rb │ │ │ └── unsupported_content_type.rb │ │ ├── crawl_task.rb │ │ ├── domain.rb │ │ ├── extraction │ │ │ ├── rule.rb │ │ │ ├── ruleset.rb │ │ │ └── url_filter.rb │ │ ├── link.rb │ │ ├── rule.rb │ │ ├── rule_engine_outcome.rb │ │ ├── seen_urls.rb │ │ ├── url.rb │ │ ├── url_queue.rb │ │ └── url_queue │ │ │ ├── base.rb │ │ │ └── memory_only.rb │ ├── document_mapper.rb │ ├── event_generator.rb │ ├── executor.rb │ ├── http_client.rb │ ├── http_executor.rb │ ├── http_header_service.rb │ ├── http_utils │ │ ├── all_trusting_trust_manager.rb │ │ ├── config.rb │ │ ├── exceptions.rb │ │ ├── filtering_dns_resolver.rb │ │ └── response.rb │ ├── logging │ │ ├── handler │ │ │ ├── base.rb │ │ │ ├── file.rb │ │ │ └── stdout.rb │ │ └── logger.rb │ ├── mock_event_logger.rb │ ├── mock_executor.rb │ ├── output_sink.rb │ ├── output_sink │ │ ├── base.rb │ │ ├── console.rb │ │ ├── elasticsearch.rb │ │ ├── file.rb │ │ ├── mock.rb │ │ └── null.rb │ ├── robots_txt_parser.rb │ ├── robots_txt_service.rb │ ├── rule_engine │ │ └── base.rb │ ├── stats.rb │ ├── url_validator.rb │ ├── url_validator │ │ ├── crawl_rules_check_concern.rb │ │ ├── dns_check_concern.rb │ │ ├── domain_access_check_concern.rb │ │ ├── domain_uniqueness_check_concern.rb │ │ ├── result.rb │ │ ├── robots_txt_check_concern.rb │ │ ├── tcp_check_concern.rb │ │ ├── url_check_concern.rb │ │ ├── url_content_check_concern.rb │ │ └── url_request_check_concern.rb │ └── utils.rb ├── environment.rb ├── errors.rb └── es │ ├── 
bulk_queue.rb │ └── client.rb ├── product_version ├── renovate.json ├── script ├── bundle ├── environment ├── functions.sh ├── licenses │ ├── README.md │ ├── generate_notice.rb │ ├── lib │ │ ├── third_party.rb │ │ └── third_party │ │ │ ├── base.rb │ │ │ ├── misc_dependencies.rb │ │ │ └── rubygems_dependencies.rb │ ├── misc_licenses │ │ ├── .gitkeep │ │ ├── _manually_added_jruby-LICENSE.txt │ │ └── _manually_added_tika-LICENSE.txt │ └── rubygems_licenses │ │ ├── .gitkeep │ │ ├── _manually_added_faux-LICENSE.txt │ │ ├── _manually_added_httpclient-LICENSE.txt │ │ ├── _manually_added_jruby-jars-LICENSE.txt │ │ ├── _manually_added_minitest-LICENSE.txt │ │ └── _manually_added_strscan-LICENSE.txt ├── rspec ├── support │ └── string_colors.rb └── vendor_jars ├── spec ├── factories │ └── crawl_results.rb ├── fixtures │ ├── crawl-flat-format.yml │ ├── crawl.yml │ ├── do-not-visit.txt │ ├── elasticsearch-flat-format.yml │ ├── elasticsearch-partially-flat-format.yml │ ├── elasticsearch.yml │ ├── gilacountyaz.gov.html │ ├── sitemap │ │ ├── sitemap_index.xml │ │ ├── sitemap_index_huge.xml │ │ ├── sitemap_no_urls.xml │ │ ├── sitemap_urlset.xml │ │ ├── sitemap_urlset.xml.gz │ │ ├── sitemap_urlset_10000_urls.xml │ │ └── sitemap_urlset_huge.xml │ ├── ssl │ │ ├── ca.crt │ │ ├── ca.key │ │ ├── ca.password.txt │ │ ├── config_with_cert.yml │ │ ├── expired │ │ │ ├── example.cnf │ │ │ ├── example.crt │ │ │ ├── example.csr │ │ │ ├── example.key │ │ │ └── generate.sh │ │ ├── invalid.crt │ │ └── self-signed │ │ │ ├── example.cnf │ │ │ ├── example.crt │ │ │ ├── example.csr │ │ │ ├── example.key │ │ │ └── generate.sh │ └── uncrate.com.html ├── integration │ ├── charset_spec.rb │ ├── content_extraction_spec.rb │ ├── headers_spec.rb │ ├── legacy_sitemaps_spec.rb │ ├── nofollow_spec.rb │ ├── redirects_spec.rb │ ├── response_content_type_spec.rb │ ├── response_limits_spec.rb │ ├── robots_txt_spec.rb │ ├── seed_spec.rb │ ├── sitemap_spec.rb │ ├── sitemap_xxe_spec.rb │ ├── timeouts │ │ ├── request_timeout_spec.rb │ │ └── socket_timeout_spec.rb │ └── url_fragments_spec.rb ├── lib │ ├── crawler │ │ ├── api │ │ │ ├── config_spec.rb │ │ │ └── crawl_spec.rb │ │ ├── cli │ │ │ ├── crawl_spec.rb │ │ │ ├── helpers_spec.rb │ │ │ ├── schedule_spec.rb │ │ │ ├── urltest_spec.rb │ │ │ ├── validate_spec.rb │ │ │ └── version_spec.rb │ │ ├── content_engine │ │ │ ├── extractor_spec.rb │ │ │ ├── transformer_spec.rb │ │ │ └── utils_spec.rb │ │ ├── coordinator_spec.rb │ │ ├── data │ │ │ ├── crawl_result │ │ │ │ ├── html_spec.rb │ │ │ │ └── sitemap_spec.rb │ │ │ ├── crawl_result_spec.rb │ │ │ ├── crawl_task_spec.rb │ │ │ ├── domain_spec.rb │ │ │ ├── extraction │ │ │ │ ├── rule_spec.rb │ │ │ │ ├── ruleset_spec.rb │ │ │ │ └── url_filter_spec.rb │ │ │ ├── link_spec.rb │ │ │ ├── rule_spec.rb │ │ │ ├── url_queue │ │ │ │ └── memory_only_spec.rb │ │ │ ├── url_queue_spec.rb │ │ │ └── url_spec.rb │ │ ├── document_mapper_spec.rb │ │ ├── event_generator_spec.rb │ │ ├── http_client_spec.rb │ │ ├── http_executor_spec.rb │ │ ├── http_utils │ │ │ ├── bad_ssl_spec.rb │ │ │ ├── config_spec.rb │ │ │ ├── filtering_dns_resolver_spec.rb │ │ │ └── response_spec.rb │ │ ├── logging │ │ │ └── crawllogger_spec.rb │ │ ├── output_sink │ │ │ ├── elasticsearch_spec.rb │ │ │ └── file_spec.rb │ │ ├── output_sink_spec.rb │ │ ├── robots_txt_parser_spec.rb │ │ ├── rule_engine │ │ │ └── base_spec.rb │ │ ├── stats_spec.rb │ │ ├── url_validator │ │ │ ├── crawl_rules_check_spec.rb │ │ │ ├── dns_check_spec.rb │ │ │ ├── domain_access_check_spec.rb │ │ │ ├── 
domain_uniqueness_check_spec.rb │ │ │ ├── robots_txt_check_spec.rb │ │ │ ├── tcp_check_spec.rb │ │ │ ├── url_check_spec.rb │ │ │ ├── url_content_check_spec.rb │ │ │ └── url_request_check_spec.rb │ │ └── url_validator_spec.rb │ ├── crawler_spec.rb │ ├── environment_spec.rb │ └── es │ │ ├── bulk_queue_spec.rb │ │ └── client_spec.rb ├── spec_helper.rb └── support │ ├── cli_helpers.rb │ ├── crawl_response_matchers.rb │ ├── faux │ ├── faux_crawl.rb │ └── results_collection.rb │ ├── fixtures.rb │ └── mock_response.rb └── vendor ├── faux ├── .gitignore ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── faux.gemspec ├── lib │ ├── faux.rb │ ├── faux │ │ ├── element │ │ │ ├── atom_feed.rb │ │ │ ├── base.rb │ │ │ ├── fixture.rb │ │ │ ├── page.rb │ │ │ ├── path_with_content_length.rb │ │ │ ├── robots.rb │ │ │ └── sitemap.rb │ │ ├── helpers │ │ │ └── url.rb │ │ ├── middleware │ │ │ └── reporter.rb │ │ └── version.rb │ └── site.rb ├── sites │ ├── fixture_site.rb │ ├── robots_txt_respect_rules.rb │ ├── simple_site.rb │ └── sitemap_pointing_to_sitemaps.rb └── spec │ ├── faux │ ├── element │ │ ├── atom_feed_spec.rb │ │ ├── base_spec.rb │ │ ├── fixture_spec.rb │ │ ├── page_spec.rb │ │ ├── path_with_content_length_spec.rb │ │ ├── robots_spec.rb │ │ └── sitemap_spec.rb │ ├── middleware │ │ └── reporter_spec.rb │ └── site_spec.rb │ ├── faux_spec.rb │ ├── fixtures │ ├── atom-feed-example-com.xml │ └── simple.html │ └── spec_helper.rb └── jars ├── com └── github │ └── crawler-commons │ └── crawler-commons │ └── 1.2 │ └── crawler-commons-1.2.jar ├── commons-codec └── commons-codec │ └── 1.15 │ └── commons-codec-1.15.jar ├── commons-io └── commons-io │ └── 2.16.1 │ └── commons-io-2.16.1.jar ├── isorelax └── isorelax │ └── 20030108 │ └── isorelax-20030108.jar ├── net ├── sf │ └── saxon │ │ └── Saxon-HE │ │ └── 9.6.0-4 │ │ └── Saxon-HE-9.6.0-4.jar └── sourceforge │ └── htmlunit │ └── neko-htmlunit │ └── 2.63.0 │ └── neko-htmlunit-2.63.0.jar ├── nu └── validator │ └── jing │ └── 20200702VNU │ └── jing-20200702VNU.jar ├── org ├── apache │ ├── commons │ │ ├── commons-compress │ │ │ └── 1.27.1 │ │ │ │ └── commons-compress-1.27.1.jar │ │ └── commons-lang3 │ │ │ └── 3.16.0 │ │ │ └── commons-lang3-3.16.0.jar │ └── httpcomponents │ │ ├── client5 │ │ └── httpclient5 │ │ │ └── 5.1 │ │ │ └── httpclient5-5.1.jar │ │ └── core5 │ │ ├── httpcore5-h2 │ │ └── 5.1.1 │ │ │ └── httpcore5-h2-5.1.1.jar │ │ └── httpcore5 │ │ └── 5.1.1 │ │ └── httpcore5-5.1.1.jar ├── brotli │ └── dec │ │ └── 0.1.2 │ │ └── dec-0.1.2.jar ├── nokogiri │ └── nekodtd │ │ └── 0.1.11.noko2 │ │ └── nekodtd-0.1.11.noko2.jar └── slf4j │ ├── slf4j-api │ └── 1.7.7 │ │ └── slf4j-api-1.7.7.jar │ └── slf4j-nop │ └── 1.7.26 │ └── slf4j-nop-1.7.26.jar ├── xalan ├── serializer │ └── 2.7.3 │ │ └── serializer-2.7.3.jar └── xalan │ └── 2.7.3 │ └── xalan-2.7.3.jar ├── xerces └── xercesImpl │ └── 2.12.2 │ └── xercesImpl-2.12.2.jar └── xml-apis └── xml-apis └── 1.4.01 └── xml-apis-1.4.01.jar /.backportrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "targetBranchChoices": [ 3 | { "name": "main", "checked": true }, 4 | "0.2", 5 | "0.1" 6 | ], 7 | "fork": false, 8 | "targetPRLabels": ["backport"], 9 | "branchLabelMapping": { 10 | "^v0.3.0(.0)?$": "main", 11 | "^v(\\d+).(\\d+)(.\\d+)+$": "$1.$2" 12 | }, 13 | "upstream": "elastic/crawler" 14 | } 15 | -------------------------------------------------------------------------------- /.buildkite/pipeline.yml: 
-------------------------------------------------------------------------------- 1 | agents: 2 | provider: "gcp" 3 | machineType: "n1-standard-8" 4 | 5 | defaultTimeoutInMinutes: 5 6 | 7 | notify: 8 | - if: 'build.branch =~ /^((main)|([0-9]+\.[0-9]+))\$/ && (build.state == "failed" || pipeline.started_passing)' 9 | slack: 10 | channels: 11 | - "#search-et-alerts" 12 | message: "${BUILDKITE_MESSAGE}" 13 | 14 | # TODO: change docker build steps into pulling a ci-agent-image to speed up build time 15 | steps: 16 | - label: ":rubocop: Lint" 17 | commands: 18 | - ".buildkite/scripts/run_command.sh docker" 19 | - ".buildkite/scripts/run_command.sh lint" 20 | - label: ":rspec: Test" 21 | commands: 22 | - ".buildkite/scripts/run_command.sh docker" 23 | - ".buildkite/scripts/run_command.sh test" 24 | -------------------------------------------------------------------------------- /.buildkite/publish/build-and-push-multiarch-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Builds the multiarch docker image and pushes it to the docker registry 5 | ######## 6 | 7 | set -exu 8 | set -o pipefail 9 | 10 | # Load our common environment variables for publishing 11 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 12 | export CURDIR 13 | 14 | # shellcheck source=./publish-common.sh 15 | source "$CURDIR/publish-common.sh" 16 | 17 | # Set our tag name as well as the tag names of the individual platform images 18 | TAG_NAME="${BASE_TAG_NAME}:${VERSION}" 19 | LATEST_TAG_NAME="${BASE_TAG_NAME}:latest" 20 | AMD64_TAG="${BASE_TAG_NAME}:${VERSION}-amd64" 21 | ARM64_TAG="${BASE_TAG_NAME}:${VERSION}-arm64" 22 | 23 | # Pull the images from the registry 24 | buildah pull "$AMD64_TAG" 25 | buildah pull "$ARM64_TAG" 26 | 27 | # ensure +x is set to avoid writing any sensitive information to the console 28 | set +x 29 | 30 | # Log into Docker 31 | echo "Logging into docker..." 32 | DOCKER_USER=$(vault read -address "${VAULT_ADDR}" -field "${DOCKER_USER_KEY}" "${VAULT_PATH}") 33 | vault read -address "${VAULT_ADDR}" -field "${DOCKER_PASS_KEY}" "${VAULT_PATH}" | \ 34 | buildah login --username="${DOCKER_USER}" --password-stdin docker.elastic.co 35 | 36 | # Create the manifest for the multiarch image 37 | echo "Creating ${VERSION} manifest..." 38 | buildah manifest create "$TAG_NAME" \ 39 | "$AMD64_TAG" \ 40 | "$ARM64_TAG" 41 | 42 | # ... and push it 43 | echo "Pushing ${VERSION} manifest..." 44 | buildah manifest push "$TAG_NAME" "docker://$TAG_NAME" 45 | 46 | # Write out the final manifest for debugging purposes 47 | echo "Built and pushed ${VERSION} multiarch image... dumping final manifest..." 48 | buildah manifest inspect "$TAG_NAME" 49 | 50 | # Repeat for latest tag if applicable 51 | if [[ "${APPLY_LATEST_TAG:-}" == "true" ]]; then 52 | echo "Creating :latest manifest..." 53 | buildah manifest create "$LATEST_TAG_NAME" \ 54 | "$AMD64_TAG" \ 55 | "$ARM64_TAG" 56 | 57 | echo "Pushing :latest manifest..." 58 | buildah manifest push "$LATEST_TAG_NAME" "docker://$LATEST_TAG_NAME" 59 | 60 | echo "Built and pushed :latest multiarch image... dumping final manifest..." 61 | buildah manifest inspect "$LATEST_TAG_NAME" 62 | else 63 | echo "No :latest manifest required." 
64 | fi 65 | -------------------------------------------------------------------------------- /.buildkite/publish/build-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Builds the docker image and saves it to an archive file 5 | # so it can be stored as an artifact in Buildkite 6 | ######## 7 | 8 | set -exu 9 | set -o pipefail 10 | 11 | if [[ "${ARCHITECTURE:-}" == "" ]]; then 12 | echo "!! ARCHITECTURE is not set. Exiting." 13 | exit 2 14 | fi 15 | 16 | # Load our common environment variables for publishing 17 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 18 | export CURDIR 19 | 20 | # shellcheck source=./publish-common.sh 21 | source "$CURDIR/publish-common.sh" 22 | 23 | pushd "$PROJECT_ROOT" 24 | 25 | # set our complete tag name and build the image 26 | TAG_NAME="$BASE_TAG_NAME:${VERSION}-${ARCHITECTURE}" 27 | docker build -f "$DOCKERFILE_PATH" -t "$TAG_NAME" . 28 | 29 | # save the image to an archive file 30 | OUTPUT_PATH="$PROJECT_ROOT/.artifacts" 31 | OUTPUT_FILE="$OUTPUT_PATH/${DOCKER_ARTIFACT_KEY}-${VERSION}-${ARCHITECTURE}.tar.gz" 32 | mkdir -p "$OUTPUT_PATH" 33 | docker save "$TAG_NAME" | gzip > "$OUTPUT_FILE" 34 | 35 | popd 36 | -------------------------------------------------------------------------------- /.buildkite/publish/publish-common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "${CURDIR:-}" == "" ]]; then 4 | echo "!! CURDIR is not set. Exiting." 5 | exit 2 6 | fi 7 | 8 | function realpath { 9 | echo "$(cd "$(dirname "$1")" || exit; pwd)"/"$(basename "$1")"; 10 | } 11 | 12 | export SCRIPT_DIR="$CURDIR" 13 | 14 | BUILDKITE_DIR=$(realpath "$(dirname "$SCRIPT_DIR")") 15 | PROJECT_ROOT=$(realpath "$(dirname "$BUILDKITE_DIR")") 16 | VERSION_PATH="$PROJECT_ROOT/product_version" 17 | VERSION=$(cat "$VERSION_PATH") 18 | IS_SNAPSHOT=$(buildkite-agent meta-data get is_snapshot) 19 | IS_LATEST=$(buildkite-agent meta-data get is_latest) 20 | 21 | if [[ "${IS_SNAPSHOT:-}" == "false" && "${IS_LATEST:-}" == "true" ]]; then 22 | # don't apply LATEST tag to SNAPSHOT builds 23 | export APPLY_LATEST_TAG="true" 24 | else 25 | export APPLY_LATEST_TAG="false" 26 | fi 27 | 28 | export BUILDKITE_DIR 29 | export PROJECT_ROOT 30 | export VERSION 31 | 32 | if [[ "${IS_SNAPSHOT:-}" == "true" ]]; then 33 | echo "Adding SNAPSHOT labeling" 34 | export VERSION="${VERSION}-SNAPSHOT" 35 | fi 36 | 37 | export BASE_TAG_NAME="${DOCKER_IMAGE_NAME:-docker.elastic.co/integrations/crawler}" 38 | export DOCKERFILE_PATH="${DOCKERFILE_PATH:-Dockerfile.wolfi}" 39 | export DOCKER_ARTIFACT_KEY="${DOCKER_ARTIFACT_KEY:-elastic-crawler-docker}" 40 | 41 | export VAULT_ADDR="${VAULT_ADDR:-https://vault-ci-prod.elastic.dev}" 42 | export VAULT_PATH="secret/ci/elastic-crawler/docker-ci-admin" 43 | export DOCKER_PASS_KEY="secret_20240823" 44 | export DOCKER_USER_KEY="user_20240823" 45 | -------------------------------------------------------------------------------- /.buildkite/publish/push-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Pushes the docker image to the docker registry 5 | ######## 6 | 7 | set -exu 8 | set -o pipefail 9 | 10 | if [[ "${ARCHITECTURE:-}" == "" ]]; then 11 | echo "!! ARCHITECTURE is not set. Exiting." 
12 | exit 2 13 | fi 14 | 15 | # Load our common environment variables for publishing 16 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 17 | export CURDIR 18 | 19 | # shellcheck source=./publish-common.sh 20 | source "$CURDIR"/publish-common.sh 21 | 22 | # Load the image from the artifact created in build-docker.sh 23 | echo "Loading image from archive file..." 24 | docker load < "$PROJECT_ROOT/.artifacts/${DOCKER_ARTIFACT_KEY}-${VERSION}-${ARCHITECTURE}.tar.gz" 25 | 26 | # ensure +x is set to avoid writing any sensitive information to the console 27 | set +x 28 | 29 | # Log into Docker 30 | echo "Logging into docker..." 31 | DOCKER_USER=$(vault read -address "${VAULT_ADDR}" -field "${DOCKER_USER_KEY}" "${VAULT_PATH}") 32 | vault read -address "${VAULT_ADDR}" -field "${DOCKER_PASS_KEY}" "${VAULT_PATH}" | \ 33 | docker login -u "$DOCKER_USER" --password-stdin docker.elastic.co 34 | 35 | # Set our tag name and push the image 36 | TAG_NAME="$BASE_TAG_NAME:${VERSION}-${ARCHITECTURE}" 37 | echo "Pushing image to docker with tag: $TAG_NAME" 38 | docker push "$TAG_NAME" 39 | -------------------------------------------------------------------------------- /.buildkite/pull-requests.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "enabled": true, 5 | "pipelineSlug": "elastic-crawler", 6 | "allow_org_users": true, 7 | "allowed_repo_permissions": ["admin", "write"], 8 | "allowed_list": [], 9 | "set_commit_status": true, 10 | "commit_status_context": "buildkite/elastic-crawler", 11 | "build_on_commit": false, 12 | "build_on_comment": true, 13 | "trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", 14 | "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", 15 | "skip_ci_labels": ["skip-ci"], 16 | "skip_target_branches": [], 17 | "always_require_ci_on_changed": [] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /.buildkite/scripts/run_ci_step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | RUBY_VERSION="$(cat .ruby-version)" 6 | JAVA_VERSION="$(cat .java-version)" 7 | 8 | export RUBY_VERSION 9 | export JAVA_VERSION 10 | 11 | case $1 in 12 | lint) 13 | echo "---- running linter" 14 | make install-gems lint 15 | ;; 16 | 17 | test) 18 | echo "---- running tests" 19 | make install test 20 | ;; 21 | 22 | *) 23 | echo "Usage: run_command {docker|lint}" 24 | exit 2 25 | ;; 26 | esac 27 | -------------------------------------------------------------------------------- /.buildkite/scripts/run_command.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | COMMAND_TO_RUN=${1:-} 6 | 7 | if [[ "${COMMAND_TO_RUN:-}" == "" ]]; then 8 | echo "Usage: run_command.sh {lint|docker}" 9 | exit 2 10 | fi 11 | 12 | function realpath { 13 | echo "$(cd "$(dirname "$1")"; pwd)"/"$(basename "$1")"; 14 | } 15 | 16 | SCRIPT_WORKING_DIR=$(realpath "$(dirname "$0")") 17 | BUILDKITE_DIR=$(realpath "$(dirname "$SCRIPT_WORKING_DIR")") 18 | PROJECT_ROOT=$(realpath "$(dirname "$BUILDKITE_DIR")") 19 | 20 | DOCKER_IMAGE="crawler-ci" 21 | SCRIPT_CMD="/ci/.buildkite/scripts/run_ci_step.sh" 22 | 23 | if [[ "${COMMAND_TO_RUN:-}" == "docker" ]]; then 24 | echo "---- running docker build" 25 | make build-docker-ci 26 | else 27 | docker run --interactive --rm \ 28 | 
--sig-proxy=true --init \ 29 | --user "root" \ 30 | --volume "$PROJECT_ROOT:/ci" \ 31 | --workdir /ci \ 32 | --env HOME=/ci \ 33 | --env CI \ 34 | --env GIT_REVISION="${BUILDKITE_COMMIT-}" \ 35 | --env BUILD_ID="${BUILDKITE_BUILD_NUMBER-}" \ 36 | --entrypoint "${SCRIPT_CMD}" \ 37 | "$DOCKER_IMAGE" \ 38 | "$COMMAND_TO_RUN" 39 | fi 40 | -------------------------------------------------------------------------------- /.bundler-version: -------------------------------------------------------------------------------- 1 | 2.6.6 2 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Crawler Dev Container", 3 | "image": "jruby:9.4.12.0-jdk21", // Same image as Dockerfile 4 | "postCreateCommand": "IS_DOCKER=true make install", 5 | "features": { 6 | "ghcr.io/devcontainers/features/git:1": { 7 | "version": "latest" 8 | }, 9 | "ghcr.io/devcontainers/features/docker-in-docker:2.12.2": { 10 | "version": "latest", 11 | "dockerSocketBindMount": true 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. 3 | * @elastic/search-extract-and-transform -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve. 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Bug Description 11 | 12 | 13 | ### To Reproduce 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | ## Expected behavior 21 | 22 | 23 | ## Screenshots 24 | 26 | 27 | ## Environment 28 | 29 | 30 | 31 | - OS: [e.g. iOS] 32 | - Browser [e.g. chrome, safari] 33 | - Version [e.g. 22] 34 | 35 | 36 | ## Additional context 37 | 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Question or Discussion 4 | url: https://discuss.elastic.co/c/search/ 5 | about: Please ask and answer questions here. 6 | - name: Security Vulnerability 7 | url: https://www.elastic.co/community/security 8 | about: DO NOT file issues related to security. Instead, please follow our security policy here. 
9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement 3 | about: It's not a bug, but some desired feature is missing 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Problem Description 11 | 14 | 15 | ### Proposed Solution 16 | 18 | 19 | 20 | ### Alternatives 21 | 23 | 24 | ### Additional Context 25 | 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Closes https://github.com/elastic/crawler/issues/### 2 | 3 | 9 | 10 | ### Checklists 11 | 12 | 14 | 15 | #### Pre-Review Checklist 16 | - [ ] This PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `crawler.yml.example` and `elasticsearch.yml.example`) 17 | - [ ] This PR has a meaningful title 18 | - [ ] This PR links to all relevant GitHub issues that it fixes or partially addresses 19 | - If there is no GitHub issue, please create it. Each PR should have a link to an issue 20 | - [ ] this PR has a thorough description 21 | - [ ] Covered the changes with automated tests 22 | - [ ] Tested the changes locally 23 | - [ ] Added a label for each target release version (example: `v0.1.0`) 24 | - [ ] Considered corresponding documentation changes 25 | - [ ] Contributed any configuration settings changes to the configuration reference 26 | - [ ] Ran `make notice` if any dependencies have been added 27 | 28 | #### Changes Requiring Extra Attention 29 | 30 | 33 | 34 | - [ ] Security-related changes (encryption, TLS, SSRF, etc) 35 | - [ ] New external service dependencies added. 
36 | 37 | ### Related Pull Requests 38 | 39 | 42 | 43 | ### Release Note 44 | 45 | 49 | -------------------------------------------------------------------------------- /.github/workflows/add-labels-main.yml: -------------------------------------------------------------------------------- 1 | name: Force backport labels for main 2 | 3 | on: 4 | pull_request_target: 5 | branches: 6 | - main 7 | types: 8 | - opened 9 | 10 | jobs: 11 | add_labels: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 15 | - id: version 16 | uses: juliangruber/read-file-action@386973d5b59f826915775874c7d1f82c4bbcfb07 17 | with: 18 | path: ./product_version 19 | - uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf # v1 20 | with: 21 | labels: | 22 | auto-backport 23 | v${{ steps.version.outputs.content }} 24 | -------------------------------------------------------------------------------- /.github/workflows/backport.yml: -------------------------------------------------------------------------------- 1 | name: Backport PR 2 | 3 | on: 4 | pull_request_target: 5 | branches: 6 | - main 7 | types: 8 | - labeled 9 | - closed 10 | 11 | jobs: 12 | backport: 13 | if: | 14 | github.event.pull_request.merged == true 15 | && contains(github.event.pull_request.labels.*.name, 'auto-backport') 16 | && ( 17 | (github.event.action == 'labeled' && github.event.label.name == 'auto-backport') 18 | || (github.event.action == 'closed') 19 | ) 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout Actions 23 | uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 24 | with: 25 | repository: 'swiftype/kibana-github-actions' 26 | ref: main 27 | path: ./actions 28 | 29 | - name: Install Actions 30 | run: npm install --production --prefix ./actions 31 | 32 | - name: Run Backport 33 | uses: ./actions/backport 34 | with: 35 | github_token: ${{ secrets.GITHUB_TOKEN }} 36 | approver_token: ${{ secrets.REPO_SCOPED_TOKEN }} 37 | auto_approve: 'true' 38 | commit_user: elastic 39 | commit_email: ent-search-backport@users.noreply.github.com 40 | auto_merge: 'true' 41 | auto_merge_method: 'squash' 42 | manual_backport_command_template: 'backport --pr %pullNumber% --autoMerge --autoMergeMethod squash' 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # bundler state 2 | /.bundle 3 | /vendor/bundle/ 4 | /vendor/ruby/ 5 | /vendor/filebeat/ 6 | /vendor/metricbeat/ 7 | /vendor/jruby/ 8 | 9 | # homebrew state 10 | .Brewfile.cached 11 | Brewfile.lock.json 12 | 13 | # Mac finder artifacts 14 | .DS_Store 15 | 16 | # Rubymine project files 17 | /.idea/ 18 | /.run/ 19 | *.iml 20 | 21 | # VSCode Workspace files 22 | /.vscode/ 23 | 24 | # Silver surfer ignore file 25 | .agignore 26 | 27 | # Sublime project files 28 | /*.sublime-project 29 | /*.sublime-workspace 30 | 31 | # vim artifacts 32 | *.swp 33 | *.swo 34 | *.un~ 35 | 36 | ngrok 37 | 38 | /data 39 | /.rvmrc 40 | /.envrc 41 | /.rbenv-vars 42 | /.irbrc 43 | /public/blog 44 | 45 | # honeypot dump 46 | dump/ 47 | 48 | # Emacs restclient files 49 | *.http 50 | 51 | # Codekit settings 52 | *.codekit 53 | 54 | # Static asset build system 55 | .tmp/ 56 | .m2/ 57 | .local/ 58 | .cache/ 59 | 60 | # Ignore example file outputs 61 | output 62 | 63 | # Config files 64 | config/*.yml 65 | 66 | # log files 67 | /logs 68 | 69 | # default File sink folder 70 | /crawled_docs 
71 | 72 | # Code coverage 73 | coverage 74 | 75 | # Downloaded license files 76 | script/licenses/**/_downloaded_*-LICENSE.txt 77 | 78 | # Buildkite 79 | bin/container-structure-test 80 | .artifacts 81 | .buildkite/publish/container-structure-test.yaml 82 | -------------------------------------------------------------------------------- /.java-version: -------------------------------------------------------------------------------- 1 | 21 2 | -------------------------------------------------------------------------------- /.jrubyrc: -------------------------------------------------------------------------------- 1 | # Make it possible to use timeout regexp matching 2 | regexp.interruptible=true 3 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --profile 4 | -r spec_helper 5 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | require: 2 | - rubocop-performance 3 | 4 | AllCops: 5 | TargetRubyVersion: 3.1 6 | NewCops: enable 7 | Exclude: 8 | - 'Jarfile' 9 | - 'vendor/**/*' 10 | - 'Gemfile' 11 | - 'bin/**/*' 12 | - 'spec/support/faux/**/*' 13 | 14 | Style/Documentation: 15 | Enabled: false 16 | 17 | Metrics/MethodLength: 18 | CountAsOne: ['array', 'hash', 'heredoc', 'method_call'] 19 | Max: 15 20 | 21 | Metrics/ClassLength: 22 | Max: 200 23 | 24 | Metrics/ModuleLength: 25 | Max: 200 26 | 27 | Metrics/AbcSize: 28 | Max: 20 29 | 30 | # Disable for specs 31 | Metrics/BlockLength: 32 | Exclude: 33 | - 'spec/**/*' 34 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | jruby-9.4.12.0 2 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Local dev utils 4 | brew 'rbenv' 5 | brew 'jenv' 6 | brew 'icu4c' 7 | 8 | # Stack services 9 | cask 'homebrew/cask-versions/temurin11' 10 | 11 | # For testing SSL locally 12 | brew 'mkcert' 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jruby:9.4.12.0-jdk21@sha256:5641622b488d298362b96fdaea0f328248ce55962e68e224118be11ddb48d16e 2 | RUN apt-get update && \ 3 | apt-get install -y --no-install-recommends \ 4 | libicu-dev netbase make 5 | 6 | # used for skipping jenv/rbenv setup 7 | ENV IS_DOCKER=1 8 | 9 | # Set up crawlergroup and crawleruser 10 | RUN groupadd -g 451 crawlergroup && \ 11 | useradd -m -u 451 -g crawlergroup crawleruser 12 | 13 | # Copy and set up Crawler as crawleruser 14 | USER crawleruser 15 | COPY --chown=crawleruser:crawlergroup --chmod=775 . 
/home/app 16 | WORKDIR /home/app 17 | RUN make clean install 18 | 19 | # Clean up build dependencies 20 | RUN rm -r /home/crawleruser/.m2 21 | 22 | ENTRYPOINT [ "/bin/bash" ] 23 | -------------------------------------------------------------------------------- /Jarfile: -------------------------------------------------------------------------------- 1 | # This file is used to control our Jar dependencies and is used with jar-dependencies to vendor 2 | # our java dependencies into vendor/jars (see https://github.com/mkristian/jar-dependencies for details) 3 | # 4 | # If you update this file, please run the following command to update the jars cache: 5 | # make clean install 6 | # 7 | # When adding a new dependency, please explain what it is and why we're adding it in a comment. 8 | 9 | # Functionality common to any web crawler 10 | jar 'com.github.crawler-commons:crawler-commons', '1.2' 11 | 12 | # Pinned dependency of crawler-commons to resolve vulnerability (updated to 2.16.1 for commons-compress compatibility) 13 | jar 'commons-io:commons-io', '2.16.1' 14 | 15 | # Apache HTTP client used for requests to websites 16 | jar 'org.apache.httpcomponents.client5:httpclient5', '5.1' 17 | 18 | # For managing Brotli input streams 19 | jar 'org.apache.commons:commons-compress', '1.27.1' 20 | jar 'org.brotli:dec', '0.1.2' 21 | 22 | # Cleaner Java logs handling 23 | jar 'org.slf4j:slf4j-nop', '1.7.26' 24 | -------------------------------------------------------------------------------- /Jars.lock: -------------------------------------------------------------------------------- 1 | isorelax:isorelax:20030108:compile: 2 | org.nokogiri:nekodtd:0.1.11.noko2:compile: 3 | net.sourceforge.htmlunit:neko-htmlunit:2.63.0:compile: 4 | nu.validator:jing:20200702VNU:compile: 5 | net.sf.saxon:Saxon-HE:9.6.0-4:compile: 6 | xalan:serializer:2.7.3:compile: 7 | xalan:xalan:2.7.3:compile: 8 | xerces:xercesImpl:2.12.2:compile: 9 | xml-apis:xml-apis:1.4.01:compile: 10 | com.github.crawler-commons:crawler-commons:1.2:compile: 11 | org.slf4j:slf4j-api:1.7.7:compile: 12 | commons-io:commons-io:2.16.1:compile: 13 | org.apache.httpcomponents.client5:httpclient5:5.1:compile: 14 | org.apache.httpcomponents.core5:httpcore5:5.1.1:compile: 15 | org.apache.httpcomponents.core5:httpcore5-h2:5.1.1:compile: 16 | commons-codec:commons-codec:1.15:compile: 17 | org.apache.commons:commons-compress:1.27.1:compile: 18 | org.apache.commons:commons-lang3:3.16.0:compile: 19 | org.brotli:dec:0.1.2:compile: 20 | org.slf4j:slf4j-nop:1.7.26:compile: 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .phony: test lint autocorrect install install-ci install-gems install-jars clean notice build-docker-ci list-gems list-jars 2 | 3 | test: 4 | script/rspec $(file) 5 | 6 | lint: 7 | rubocop 8 | 9 | autocorrect: 10 | rubocop --autocorrect 11 | 12 | install: 13 | script/environment 14 | make install-gems 15 | make install-jars 16 | 17 | install-ci: 18 | make install-gems 19 | make install-jars 20 | 21 | install-gems: 22 | script/bundle install 23 | 24 | install-jars: 25 | script/bundle exec script/vendor_jars 26 | 27 | clean: 28 | rm -rf Jars.lock vendor/jars 29 | 30 | notice: 31 | script/licenses/generate_notice.rb 32 | 33 | build-docker-ci: 34 | docker build -t crawler-ci . 
35 | 36 | list-gems: 37 | script/bundle exec gem dependency 38 | 39 | list-jars: 40 | script/bundle exec lock_jars --tree 41 | -------------------------------------------------------------------------------- /bin/crawler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # 4 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 5 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 6 | # you may not use this file except in compliance with the Elastic License 2.0. 7 | # 8 | 9 | # frozen_string_literal: true 10 | 11 | require "bundler/setup" 12 | require 'dry/cli' 13 | 14 | # Standard libraries 15 | require 'getoptlong' 16 | require 'yaml' 17 | 18 | require_relative File.expand_path('../../lib/environment', __FILE__) 19 | require 'crawler/cli' 20 | 21 | java_import 'java.lang.System' 22 | 23 | # These opts are to prevent an intermittent `bad_record_mac` error when indexing into ES. 24 | # One workaround is to force use of SSLv3 only. 25 | System.setProperty('force.http.jre.executor', 'true') 26 | System.setProperty('https.protocols', 'SSLv3') 27 | 28 | Dry::CLI.new(Crawler::CLI).call 29 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | See [CONFIG.md](../docs/CONFIG.md) for information on how to configure crawl jobs. 4 | -------------------------------------------------------------------------------- /config/examples/parks-australia.yml: -------------------------------------------------------------------------------- 1 | # This is a sample config file for crawling the parksaustralia.gov.au website writing output to an ES index 2 | # 3 | # The configuration options in this example are not exhaustive. To see all possible configuration options, 4 | # reference the config templates: 5 | # - config/crawler.yml.example 6 | # - config/elasticsearch.yml.example 7 | 8 | # Domains allowed for the crawl 9 | domains: 10 | - url: https://parksaustralia.gov.au 11 | seed_urls: 12 | - https://parksaustralia.gov.au 13 | - https://parksaustralia.gov.au/news/ 14 | 15 | # Where to send the results. Possible values are console, file, or elasticsearch 16 | output_sink: elasticsearch 17 | 18 | # Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch 19 | output_index: parks-australia 20 | 21 | # Crawl tuning 22 | max_crawl_depth: 2 23 | 24 | # Crawl result field size limits 25 | max_title_size: 500 26 | max_body_size: 5_242_880 # 5 megabytes 27 | max_keywords_size: 512 28 | max_description_size: 512 29 | max_indexed_links_count: 5 30 | max_headings_count: 5 31 | 32 | elasticsearch: 33 | host: http://localhost 34 | port: 9200 35 | username: elastic 36 | password: changeme 37 | bulk_api: 38 | max_items: 10 39 | max_size_bytes: 1_048_576 40 | -------------------------------------------------------------------------------- /config/examples/simple.yml: -------------------------------------------------------------------------------- 1 | # This is an example config using the bare minimum configuration options possible. 
2 | # 3 | # To see all possible configuration options, reference the config templates: 4 | # - config/crawler.yml.example 5 | # - config/elasticsearch.yml.example 6 | 7 | domains: 8 | - url: https://example.com 9 | 10 | output_sink: elasticsearch 11 | output_index: my-index 12 | 13 | elasticsearch: 14 | host: http://localhost 15 | # host: http://host.docker.internal # use this host instead if Elasticsearch is running on Docker on the same machine 16 | port: 9200 17 | username: elastic 18 | password: changeme 19 | -------------------------------------------------------------------------------- /config/filebeat.yml.example: -------------------------------------------------------------------------------- 1 | filebeat.inputs: 2 | - type: filestream 3 | id: crawler-events-filestream 4 | enabled: true 5 | paths: 6 | - "/path/to/opencrawler/crawler_event.log" 7 | fields: 8 | type: "event" 9 | processors: 10 | - decode_json_fields: 11 | fields: ["message"] 12 | target: "" 13 | overwrite_keys: true 14 | expand_keys: true 15 | 16 | - type: filestream 17 | id: crawler-system-log-filestream 18 | enabled: true 19 | paths: 20 | - "/path/to/opencrawler/crawler_system.log" 21 | fields: 22 | type: "system" 23 | processors: 24 | - dissect: 25 | tokenizer: "[%{@timestamp}] [crawl:%{crawl_id}] [%{crawl_stage}] %{message}" 26 | target_prefix: "" 27 | overwrite_keys: true 28 | trim_values: all 29 | 30 | setup.template.enabled: true 31 | setup.template.name: "filebeat" 32 | setup.template.pattern: "filebeat" 33 | 34 | setup.template.settings: 35 | index.number_of_shards: 1 36 | index.number_of_replicas: 1 37 | 38 | logging.level: debug 39 | 40 | output.elasticsearch: 41 | hosts: [""] 42 | api_key: "id:api_key" 43 | index: "logs-crawler-%{[fields.type]}" # see https://www.elastic.co/guide/en/fleet/8.17/data-streams.html#data-streams-naming-scheme 44 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | crawler: 3 | image: docker.elastic.co/integrations/crawler:${CRAWLER_VERSION:-latest} 4 | container_name: crawler 5 | volumes: 6 | - ./config:/home/app/config 7 | # - ./logs:/home/app/logs # Enable this to access log files outside the Docker container 8 | networks: 9 | - elastic 10 | stdin_open: true # Equivalent to -i 11 | tty: true # Required for interactive mode 12 | # Uncomment enviroment variable if running on MacOS M4 and experiencing Java Runtime errors 13 | # environment: 14 | # - "_JAVA_OPTIONS=-XX:UseSVE=0" 15 | 16 | networks: 17 | elastic: 18 | driver: bridge 19 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Open Crawler Changelog 2 | 3 | ## Legend 4 | 5 | - 🚀 Feature 6 | - 🐛 Bugfix 7 | - 🔨 Refactor 8 | 9 | ## `v0.2.0` 10 | 11 | - 🚀 Crawl jobs can now be scheduled using the CLI command `bin/crawler schedule`. See [CLI.md](./CLI.md#crawler-schedule). 12 | - 🚀 Crawler can now extract binary content from files. See [BINARY_CONTENT_EXTRACTION.md](./features/BINARY_CONTENT_EXTRACTION.md). 13 | - 🚀 Crawler will now purge outdated documents from the index at the end of the crawl. This is enabled by default. You can disable this by adding `purge_docs_enabled: false` to the crawler's yaml config file. 14 | - 🚀 Crawl rules can now be configured, allowing specified URLs to be allowed/denied. 
See [CRAWL_RULES.md](./features/CRAWL_RULES.md). 15 | - 🚀 Extraction rules using CSS, XPath, and URL selectors can now be applied to crawls. See [EXTRACTION_RULES.md](./features/EXTRACTION_RULES.md). 16 | - 🔨 The configuration field `content_extraction_enabled` is now `binary_content_extraction_enabled`. 17 | - 🔨 The configuration field `content_extraction_mime_types` is now `binary_content_extraction_mime_types`. 18 | - 🔨 The Elasticsearch document field `body_content` is now `body`. 19 | - 🔨 The format for config files has changed, so existing crawler configurations will not work. The new format can be referenced in the [crawler.yml.example](../config/crawler.yml.example) file. 20 | -------------------------------------------------------------------------------- /docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 303 See Other 2 | 3 | Location: https://www.elastic.co/community/codeofconduct 4 | -------------------------------------------------------------------------------- /docs/RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing 2 | 3 | This doc is a reference for Elastic employees. 4 | Non-Elastic users can not publish a release. 5 | 6 | The version scheme we use is **MAJOR.MINOR.PATCH** and stored in the [product_version](../product_version) file at the root of this repository. 7 | Open Crawler follows its own release versioning and does not follow the Elastic stack unified release schedule or versioning. 8 | 9 | ## How to publish a Docker image 10 | 11 | Releasing is done entirely through Buildkite. 12 | The Open Crawler build job is named `crawler-docker-build-publish`. 13 | 14 | Build steps in buildkite: 15 | 16 | 1. Go to the [Buildkite job for publishing Crawler](https://buildkite.com/elastic/crawler-docker-build-publish) 17 | 2. Click `New Build` 18 | 3. Enter a message (e.g. `x.y release`) 19 | 4. Choose a version branch with the pattern `x.y` 20 | - Building from `main` is possible, but it will yield a _snapshot_ build. 21 | - For example, if the version inside the `product_version` file is `0.3.0`, then a build triggered from `main` will result in `0.3.0-SNAPSHOT` images. 22 | 5. Choose a commit 23 | - The default `HEAD` is usually fine 24 | 6. Click `Create Build` 25 | 7. Wait a minute for the Buildkite configuration to be loaded 26 | - When it has loaded, a `Build Information` button will appear 27 | 8. Select whether or not the build should have a `:latest` tag 28 | 9. Wait for the build to finish 29 | 30 | Creating a release in GitHub 31 | 32 | 1. Go to https://github.com/elastic/crawler/releases 33 | 2. Click `Draft new release` 34 | 3. Create a tag for this release, following the pattern `v{major}.{minor}.{patch}` 35 | 4. Choose the target branch, this should match the `{major}.{minor}` of the tag 36 | 5. Click `Generate release notes`, this should autofill all changes 37 | 6. If this is the latest release, make sure `Set as latest release` is selected 38 | 7. Click `Publish release` 39 | -------------------------------------------------------------------------------- /docs/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Thanks for your interest in the security of our products. Our security policy can be found at [https://www.elastic.co/community/security](https://www.elastic.co/community/security). 
4 | 5 | ## Reporting a Vulnerability 6 | Please send security vulnerability reports to security@elastic.co. 7 | -------------------------------------------------------------------------------- /docs/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Getting Support 2 | 3 | ### Official Support Services 4 | If you have an Elastic subscription, you are entitled to Support services. See our welcome page for [working with our support team](https://www.elastic.co/support/welcome). 5 | 6 | ### Where do I report issues with Elastic Crawler? 7 | If something is not working as expected, please open an [issue](https://github.com/elastic/crawler/issues/new). 8 | 9 | ### Where else can I go to get help? 10 | The Ingestion team at Elastic maintains this repository and is happy to help. Try posting your question to the [Elastic discuss forums](https://discuss.elastic.co/c/search/84). 11 | Be sure to mention that you're using Elastic Crawler, let us know what you're trying to do, and include any errors/issues you are encountering. 12 | You can also find us in the `#search-enterprise` channel of the [Elastic Community Slack](http://elasticstack.slack.com). 13 | -------------------------------------------------------------------------------- /docs/features/BINARY_CONTENT_EXTRACTION.md: -------------------------------------------------------------------------------- 1 | # Binary Content Extraction 2 | 3 | The Elastic Open Web Crawler can extract content from downloadable binary files, such as PDF and DOCX files. 4 | Binary content is extracted by converting file contents to base64 and including the output in a document to index. 5 | This value is picked up by an [Elasticsearch ingest pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html) that will convert the base64 content into plain text, to store in the `body` field of the same document. 6 | 7 | ## Using this feature 8 | 9 | 1. Enable ingest pipelines in the Elasticsearch configuration 10 | 2. Enable binary content extraction in the crawler configuration 11 | 3. Select which MIME types should have their contents extracted 12 | - The MIME type is determined by the HTTP response’s `Content-Type` header when downloading a given file 13 | - While intended primarily for PDF and Microsoft Office formats, you can use any of the formats supported by [Apache Tika](https://tika.apache.org/) 14 | - No default MIME types are defined, so at least one MIME type must be configured in order to extract non-HTML content 15 | - The ingest attachment processor does not support compressed files, e.g., an archive file containing a set of PDFs 16 | 17 | For example, the following configuration allows for the binary content extraction of PDF and DOCX files, through the default pipeline `ent-search-generic-ingestion`: 18 | 19 | ```yaml 20 | binary_content_extraction_enabled: true 21 | binary_content_extraction_mime_types: 22 | - application/pdf 23 | - application/msword 24 | 25 | elasticsearch: 26 | pipeline: ent-search-generic-ingestion 27 | pipeline_enabled: true 28 | ``` 29 | 30 | Read more on ingest pipelines in Open Crawler [here](./INGEST_PIPELINES.md).
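To make the encoding step concrete, here is a minimal, hypothetical Ruby sketch; the `_attachment` field name, the document fields, and the sample bytes are illustrative assumptions rather than the crawler's actual implementation:

```ruby
require 'base64'

# Illustrative sketch only (not the crawler's real code or field names):
# the downloaded file's raw bytes are base64-encoded and attached to the
# document, so an ingest pipeline with an attachment processor can later
# decode them into the plain text that ends up in the `body` field.
raw_bytes = '%PDF-1.7 ...binary content fetched by the crawler...'

doc = {
  'id'          => 'example-doc-1',
  'url'         => 'https://example.com/files/report.pdf',
  '_attachment' => Base64.strict_encode64(raw_bytes) # decoded by the pipeline
}

# The document would then be indexed using the configured pipeline,
# e.g. `pipeline: ent-search-generic-ingestion` as in the YAML above.
puts doc['_attachment']
```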
31 | -------------------------------------------------------------------------------- /docs/features/INGEST_PIPELINES.md: -------------------------------------------------------------------------------- 1 | # Ingest Pipelines 2 | 3 | Open Crawler uses an [Elasticsearch ingest pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html) to power several content extraction features. 4 | The default pipeline, `ent-search-generic-ingestion`, is automatically created when Elasticsearch first starts. 5 | This pipeline does some pre-processing on documents from Open Crawler before they are ingested into Elasticsearch. 6 | See [Ingest pipelines for Search indices](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest-pipeline-search.html) for more details on this pipeline. 7 | -------------------------------------------------------------------------------- /docs/features/SCHEDULING.md: -------------------------------------------------------------------------------- 1 | # Scheduling Recurring Crawl Jobs 2 | 3 | Crawl jobs can be scheduled to recur. 4 | Scheduled crawl jobs run until terminated by the user. 5 | 6 | These schedules are defined through standard cron expressions. 7 | You can use the tool https://crontab.guru to test different cron expressions. 8 | 9 | For example, to schedule a crawl job that will execute once every 30 minutes, create a configuration file called `scheduled-crawl.yml` with the following contents: 10 | 11 | ```yaml 12 | domains: 13 | - url: "https://example.com" 14 | schedule: 15 | pattern: "*/30 * * * *" # run every 30th minute 16 | ``` 17 | 18 | Then, use the CLI to begin the crawl job schedule: 19 | 20 | ```bash 21 | docker run \ 22 | -v ./scheduled-crawl.yml:/scheduled-crawl.yml \ 23 | -it docker.elastic.co/integrations/crawler:latest jruby bin/crawler schedule /scheduled-crawl.yml 24 | ``` 25 | 26 | **Scheduled crawl jobs from a single execution will not overlap.** 27 | 28 | Scheduled jobs will also not wait for existing jobs to complete. 29 | That means if a crawl job is already in progress when another schedule is triggered, the new job will be dropped. 30 | For example, if you have a schedule that triggers every hour, but your crawl job takes 1.5 hours to complete, the crawl schedule will effectively trigger every second hour. 31 | 32 | **Executing multiple crawl schedules _can_ cause overlap.** 33 | 34 | Be wary of executing multiple schedules against the same index. 35 | As with ad-hoc triggered crawl jobs, two crawlers simultaneously interacting with a single index can lead to data loss. 36 | -------------------------------------------------------------------------------- /lib/constants.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0.
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Constants 10 | # Field names used in every crawl result when creating an ES doc 11 | RESERVED_FIELD_NAMES = %w[ 12 | id 13 | any 14 | all 15 | none 16 | or 17 | and 18 | not 19 | additional_urls 20 | body_content 21 | body 22 | domains 23 | headings 24 | last_crawled_at 25 | links 26 | meta_description 27 | meta_keywords 28 | title 29 | url 30 | url_host 31 | url_path 32 | url_path_dir1 33 | url_path_dir2 34 | url_path_dir3 35 | url_port 36 | url_scheme 37 | ].freeze 38 | end 39 | -------------------------------------------------------------------------------- /lib/crawler.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | # Current version of the crawler 11 | def self.version 12 | @version ||= File.read(File.join(__dir__, '../product_version')).strip 13 | end 14 | 15 | # A unique identifier of the crawler process 16 | def self.service_id 17 | @service_id ||= BSON::ObjectId.new.to_s 18 | end 19 | end 20 | 21 | # Load other parts of the crawler 22 | # Ignore Crawler CLI 23 | files = Dir[File.join(__dir__, 'crawler/**/*.rb')].reject do |file| 24 | file.include?('/crawler/cli/') 25 | end 26 | 27 | files.each { |f| require_dependency(f) } 28 | -------------------------------------------------------------------------------- /lib/crawler/cli.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # load CLI dependencies 10 | Dir[File.join(__dir__, 'cli/**/*.rb')].each { |f| require(f) } 11 | 12 | module Crawler 13 | module CLI 14 | extend Dry::CLI::Registry 15 | 16 | register 'version', Crawler::CLI::Version, aliases: ['v', '--version'] 17 | register 'crawl', Crawler::CLI::Crawl, aliases: %w[r run] 18 | register 'schedule', Crawler::CLI::Schedule 19 | register 'validate', Crawler::CLI::Validate 20 | register 'urltest', Crawler::CLI::UrlTest 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/crawler/cli/crawl.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | 12 | module Crawler 13 | module CLI 14 | class Crawl < Dry::CLI::Command 15 | desc 'Run a crawl of the site' 16 | 17 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 18 | 19 | option :es_config, desc: 'Path to elasticsearch config file' 20 | 21 | def call(crawl_config:, es_config: nil, **) 22 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 23 | crawl = Crawler::API::Crawl.new(crawl_config) 24 | 25 | crawl.start! 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/crawler/cli/schedule.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | require 'rufus-scheduler' 12 | 13 | module Crawler 14 | module CLI 15 | class Schedule < Dry::CLI::Command 16 | desc 'Schedule a recurring crawl of the site' 17 | 18 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 19 | 20 | option :es_config, desc: 'Path to elasticsearch config file' 21 | 22 | def call(crawl_config:, es_config: nil, **) 23 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 24 | if crawl_config.schedule.nil? || crawl_config.schedule[:pattern].nil? 25 | raise ArgumentError, 'No schedule found in config file' 26 | end 27 | 28 | run_schedule(crawl_config.schedule[:pattern], crawl_config) 29 | end 30 | 31 | def run_schedule(pattern, crawl_config) 32 | crawl_config.system_logger.info("Crawler initialized with a cron schedule of #{pattern}") 33 | 34 | # Schedule a recurrent task based on the config value `schedule.pattern`. 35 | # The arg `overlap: false` prevents multiple tasks from spawning when a crawl 36 | # task is longer than the schedule pattern. 37 | # This will run until the Crawler is terminated. 38 | scheduler = Rufus::Scheduler.new 39 | scheduler.cron(pattern, overlap: false) do |job| 40 | crawl_config.system_logger.info( 41 | "Beginning scheduled crawl for #{job.previous_time} (actual trigger time: #{Time.now})." 42 | ) 43 | crawl = Crawler::API::Crawl.new(crawl_config) 44 | crawl.start! 45 | crawl_config.system_logger.info( 46 | "Scheduled crawl ended at #{Time.now}. Next scheduled crawl should trigger around #{job.next_time}." 47 | ) 48 | end 49 | scheduler.join 50 | end 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/crawler/cli/urltest.rb: -------------------------------------------------------------------------------- 1 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 2 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 3 | # you may not use this file except in compliance with the Elastic License 2.0. 
4 | # 5 | 6 | # frozen_string_literal: true 7 | 8 | require 'dry/cli' 9 | require 'yaml' 10 | 11 | module Crawler 12 | module CLI 13 | class UrlTest < Dry::CLI::Command 14 | desc 'Test a URL against a configuration' 15 | 16 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 17 | 18 | argument :endpoint, required: true, desc: 'Endpoint to test' 19 | 20 | option :es_config, desc: 'Path to elasticsearch config file' 21 | 22 | def call(crawl_config:, endpoint:, es_config: nil, **) 23 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 24 | crawl = Crawler::API::Crawl.new(crawl_config) 25 | 26 | crawl.start_url_test!(endpoint) 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/cli/validate.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | 12 | module Crawler 13 | module CLI 14 | class Validate < Dry::CLI::Command 15 | desc 'Validate crawler configuration' 16 | 17 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 18 | 19 | def call(crawl_config:, es_config: nil, **) 20 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 21 | 22 | crawl_config.domain_allowlist.each do |domain| 23 | validator = Crawler::UrlValidator.new( 24 | url: domain.raw_url, 25 | crawl_config: 26 | ) 27 | 28 | print_validation_result(domain, validator) 29 | end 30 | end 31 | 32 | private 33 | 34 | def print_validation_result(domain, validator) 35 | if validator.valid? 36 | puts "Domain #{domain.raw_url} is valid" 37 | else 38 | puts "Domain #{domain.raw_url} is invalid:" 39 | puts validator.failed_checks.map(&:comment).join("\n") 40 | end 41 | rescue Crawler::UrlValidator::Error => e 42 | puts "Error validating domain #{domain.raw_url}: #{e}" 43 | end 44 | end 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /lib/crawler/cli/version.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | 11 | module Crawler 12 | module CLI 13 | class Version < Dry::CLI::Command 14 | VERSION_PATH = File.expand_path('../../../product_version', __dir__).freeze 15 | 16 | desc 'Print version' 17 | 18 | def call(*) 19 | puts File.read(VERSION_PATH).strip 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/crawler/content_engine/transformer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
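For reference, the `Validate` command above can also be mirrored programmatically, which is convenient in specs or one-off scripts. This sketch relies only on the API the command itself uses (`load_crawl_config`, `UrlValidator#valid?`, `#failed_checks`); the config path is hypothetical.

```ruby
# Hypothetical config path; substitute your own crawl config file.
config = Crawler::CLI::Helpers.load_crawl_config('config/my-crawl.yml', nil)

config.domain_allowlist.each do |domain|
  validator = Crawler::UrlValidator.new(url: domain.raw_url, crawl_config: config)

  if validator.valid?
    puts "#{domain.raw_url} OK"
  else
    validator.failed_checks.each { |check| puts "#{domain.raw_url}: #{check.comment}" }
  end
end
```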
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module ContentEngine 11 | module Transformer 12 | INCLUDE_ATTR = 'data-elastic-include' 13 | EXCLUDE_ATTR = 'data-elastic-exclude' 14 | EXCLUDE_ATTR_SELECTOR = "[#{EXCLUDE_ATTR}]".freeze 15 | 16 | def self.transform(doc) 17 | transform!(doc.dup) 18 | end 19 | 20 | def self.transform!(doc) 21 | loop do 22 | node = doc.has_attribute?(EXCLUDE_ATTR) ? doc : doc.at_css(EXCLUDE_ATTR_SELECTOR) 23 | break unless node 24 | 25 | traverse!(node, mode: :exclude) 26 | end 27 | 28 | doc 29 | end 30 | 31 | def self.traverse!(node, mode:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity 32 | # The exclusion attribute is used to determine what to traverse next in the parent loop, 33 | # so we should remove the attribute while traversing to avoid an infinite loop. 34 | node.remove_attribute(EXCLUDE_ATTR) if node.has_attribute?(EXCLUDE_ATTR) 35 | 36 | node.children.each do |child_node| 37 | if child_node.text? && mode == :exclude 38 | child_node.unlink 39 | elsif child_node.element? 40 | new_mode = 41 | if child_node.has_attribute?(INCLUDE_ATTR) 42 | :include 43 | elsif child_node.has_attribute?(EXCLUDE_ATTR) 44 | :exclude 45 | else 46 | mode # mode is unchanged 47 | end 48 | 49 | traverse!(child_node, mode: new_mode) 50 | end 51 | end 52 | end 53 | end 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /lib/crawler/core_ext.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # 10 | # This file contains useful extensions for core classes 11 | # 12 | class Time 13 | # Returns the number of seconds since the system boot 14 | # 15 | # This method is useful for calculating elapsed time or difference between 16 | # two events without having to worry about daylight savings, leap seconds, etc. 17 | # 18 | def self.monotonic_now 19 | Process.clock_gettime(Process::CLOCK_MONOTONIC) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/content_extractable_file.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
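The `Time.monotonic_now` extension above is the clock the crawler should use for durations, because the monotonic clock never jumps backwards the way wall-clock time can (NTP adjustments, DST, leap seconds). A minimal usage sketch, assuming `lib/crawler/core_ext.rb` has been loaded:

```ruby
started_at = Time.monotonic_now

sleep(0.25) # stand-in for real work, e.g. fetching and parsing a page

elapsed = Time.monotonic_now - started_at
puts format('took %.3f seconds', elapsed)
```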
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'digest' 10 | 11 | require_dependency(File.join(__dir__, 'success')) 12 | 13 | module Crawler 14 | module Data 15 | module CrawlResult 16 | class ContentExtractableFile < Success 17 | # Allow constructor to be called on concrete result classes 18 | public_class_method :new 19 | 20 | attr_reader :content_length, :content_type 21 | 22 | def initialize(status_code:, content_length:, content_type:, **kwargs) 23 | super(status_code:, **kwargs) 24 | 25 | @content_length = content_length 26 | @content_type = content_type 27 | end 28 | 29 | def content_hash 30 | @content_hash ||= Digest::SHA1.hexdigest(content) 31 | end 32 | 33 | def base64_encoded_content 34 | @base64_encoded_content ||= Base64.strict_encode64(content) 35 | end 36 | 37 | def file_name 38 | @file_name ||= File.basename(url) 39 | end 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class Error < Base 15 | # Fake status code to be used for unexpected internal errors 16 | MISCELLANEOUS_ERROR = 599 17 | 18 | attr_reader :error, :suggestion_message 19 | 20 | # INTERNAL_ERROR_STATUS is used by default for unexpected internal errors that 21 | # were not part of the HTTP response from a crawled web page. 22 | def initialize(error:, status_code: MISCELLANEOUS_ERROR, suggestion_message: nil, **kwargs) 23 | super(status_code:, **kwargs) 24 | @error = error 25 | @suggestion_message = suggestion_message 26 | end 27 | 28 | # Allow constructor to be called on concrete result classes 29 | public_class_method :new 30 | 31 | #--------------------------------------------------------------------------------------------- 32 | def to_h 33 | super.merge(error:) 34 | end 35 | 36 | def to_s 37 | "" 38 | end 39 | end 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/http_auth_disallowed_error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class HttpAuthDisallowedError < Error 15 | def initialize(error: nil, **kwargs) 16 | suggestion_message = <<~MSG 17 | Set `http_auth_allowed: true` if you want to 18 | allow authenticated crawling of non-HTTPS URLs. 
19 | MSG 20 | 21 | super( 22 | error: error || 'Authenticated crawling of non-HTTPS URLs is not allowed', 23 | suggestion_message:, 24 | **kwargs 25 | ) 26 | end 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/redirect.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class Redirect < Base 15 | VALID_STATUS_CODES = (300..399) 16 | 17 | attr_reader :redirect_chain, :location 18 | 19 | def initialize(status_code:, location:, redirect_chain:, **kwargs) 20 | super(status_code:, **kwargs) 21 | 22 | unless status_code.in?(VALID_STATUS_CODES) 23 | error = "Redirects have to have a 3xx response code, received #{status_code.inspect}" 24 | raise ArgumentError, error 25 | end 26 | 27 | raise ArgumentError, 'Location needs to be a Crawler URL object!' unless location.is_a?(Crawler::Data::URL) 28 | 29 | @location = location 30 | @redirect_chain = redirect_chain 31 | end 32 | 33 | # Allow constructor to be called on concrete result classes 34 | public_class_method :new 35 | 36 | def to_h 37 | super.merge( 38 | location:, 39 | redirect_chain: 40 | ) 41 | end 42 | 43 | def to_s 44 | "" # rubocop:disable Layout/LineLength 45 | end 46 | 47 | def original_url 48 | redirect_chain.first || url 49 | end 50 | 51 | def redirect_count 52 | redirect_chain.size + 1 53 | end 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/redirect_error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class RedirectError < Error 15 | def initialize(**kwargs) 16 | suggestion = <<~LOG 17 | Check the URL content in your browser and make sure it is something 18 | the crawler could understand. 19 | LOG 20 | 21 | super(suggestion_message: suggestion, **kwargs) 22 | end 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/robots_txt.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
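The `content_hash` and `base64_encoded_content` readers on `ContentExtractableFile` above are thin wrappers over the Ruby standard library. This standalone sketch shows what they produce for a small stand-in payload:

```ruby
require 'digest'
require 'base64'

content = 'example file bytes' # stand-in for a downloaded file body

Digest::SHA1.hexdigest(content)  # => a stable 40-character hex digest of the content
Base64.strict_encode64(content)  # => single-line Base64 with no trailing newline
```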
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'success')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class RobotsTxt < Success 15 | # Allow constructor to be called on concrete result classes 16 | public_class_method :new 17 | end 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/success.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | # The base class for all successful responses 12 | module Crawler 13 | module Data 14 | module CrawlResult 15 | class Success < Base 16 | VALID_STATUS_CODES = (200..299) 17 | 18 | attr_reader :content 19 | 20 | def initialize(status_code:, content:, **kwargs) 21 | super(status_code:, **kwargs) 22 | 23 | unless status_code.in?(VALID_STATUS_CODES) 24 | error = "Successful responses have to have a 2xx response code, received #{status_code.inspect}" 25 | raise ArgumentError, error 26 | end 27 | 28 | @content = content 29 | end 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/unsupported_content_type.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class UnsupportedContentType < Error 15 | def initialize(content_type:, error: nil, **kwargs) 16 | suggestion = <<~MSG 17 | Check the URL content in your browser and make sure it is something 18 | the crawler could understand. 19 | MSG 20 | 21 | super( 22 | content_type:, 23 | error: error || "Unsupported content type: #{content_type}", 24 | suggestion_message: suggestion, 25 | **kwargs 26 | ) 27 | end 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/crawler/data/domain.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class Domain 12 | attr_reader :scheme, :host, :port 13 | 14 | def initialize(domain) 15 | @url = Crawler::Data::URL.parse(domain) 16 | @scheme = url.scheme 17 | @host = url.host 18 | @port = url.port || standard_port_for_scheme(url.scheme) 19 | end 20 | 21 | def raw_url 22 | url 23 | end 24 | 25 | def robots_txt_url 26 | url.join('/robots.txt') 27 | end 28 | 29 | def standard_port_for_scheme(scheme) 30 | case scheme 31 | when 'http' then 80 32 | when 'https' then 443 33 | end 34 | end 35 | 36 | def ==(other) 37 | to_s == other.to_s 38 | end 39 | 40 | def to_s 41 | "#{scheme}://#{host}:#{port}" 42 | end 43 | 44 | private 45 | 46 | attr_reader :url 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/crawler/data/extraction/ruleset.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'rule')) 10 | require_dependency(File.join(__dir__, 'url_filter')) 11 | require_dependency(File.join(__dir__, '..', '..', 'utils')) 12 | 13 | module Crawler 14 | module Data 15 | module Extraction 16 | class Ruleset 17 | def initialize(ruleset, domain) 18 | @ruleset = ruleset 19 | @domain = domain 20 | validate_ruleset 21 | 22 | # initialize these after validating they are arrays 23 | rules 24 | url_filters 25 | end 26 | 27 | def rules 28 | @rules ||= 29 | if @ruleset[:rules]&.any? 30 | @ruleset[:rules].map do |rule| 31 | Crawler::Data::Extraction::Rule.new(rule) 32 | end 33 | else 34 | [] 35 | end 36 | end 37 | 38 | def url_filters 39 | @url_filters ||= 40 | if @ruleset[:url_filters]&.any? 41 | @ruleset[:url_filters].map do |filter| 42 | Crawler::Data::Extraction::UrlFilter.new(filter) 43 | end 44 | else 45 | [] 46 | end 47 | end 48 | 49 | def url_filtering_rules 50 | @url_filtering_rules ||= url_filters.map do |filter| 51 | pattern = Regexp.new(Crawler::Utils.url_pattern(@domain, filter.type, filter.pattern)) 52 | Crawler::Data::Rule.new(Crawler::Data::Rule::ALLOW, url_pattern: pattern) 53 | end 54 | end 55 | 56 | private 57 | 58 | def validate_ruleset 59 | if !@ruleset[:rules].nil? && !@ruleset[:rules].is_a?(Array) 60 | raise ArgumentError, 'Extraction ruleset rules must be an array' 61 | end 62 | 63 | return unless !@ruleset[:url_filters].nil? && !@ruleset[:url_filters].is_a?(Array) 64 | 65 | raise ArgumentError, 'Extraction ruleset url_filters must be an array' 66 | end 67 | end 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/crawler/data/extraction/url_filter.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
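A quick sketch of the `Domain` value object above: the port falls back to the scheme's standard port when the configured domain does not specify one, and `robots_txt_url` is resolved against the site root. The return values in the comments are what the code above implies and are shown for illustration only.

```ruby
domain = Crawler::Data::Domain.new('https://example.com')

domain.scheme          # => "https"
domain.port            # => 443 (standard port inferred for https)
domain.to_s            # => "https://example.com:443"
domain.robots_txt_url  # => URL object pointing at https://example.com/robots.txt

Crawler::Data::Domain.new('http://example.com:8080').to_s # => "http://example.com:8080"
```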
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | module Extraction 12 | class UrlFilter 13 | REGEX_TIMEOUT = 0.5 # seconds 14 | TYPES = %w[begins ends contains regex].freeze 15 | 16 | attr_reader :type, :pattern 17 | 18 | def initialize(url_filter) 19 | @type = url_filter[:type] 20 | @pattern = url_filter[:pattern] 21 | validate_url_filter 22 | end 23 | 24 | private 25 | 26 | def validate_url_filter 27 | unless TYPES.include?(@type) 28 | raise ArgumentError, 29 | "Extraction ruleset url_filter `#{@type}` is invalid; value must be one of #{TYPES.join(', ')}" 30 | end 31 | 32 | raise ArgumentError, 'Extraction ruleset url_filter pattern can not be blank' if @pattern.blank? 33 | 34 | case @type 35 | when 'begins' 36 | unless @pattern.start_with?('/') 37 | raise ArgumentError, 38 | 'Extraction ruleset url_filter pattern must begin with a slash (/) if type is `begins`' 39 | end 40 | when 'regex' then validate_regex 41 | end 42 | end 43 | 44 | def validate_regex 45 | _ = Regexp.new(@pattern) 46 | rescue RegexpError => e 47 | raise ArgumentError, "Extraction ruleset url_filter pattern regex is invalid: #{e.message}" 48 | end 49 | end 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/crawler/data/rule.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class Rule 12 | ALLOW = :allow 13 | DENY = :deny 14 | REGEX_TIMEOUT = 1.second 15 | 16 | SUPPORTED_POLICIES = [ALLOW, DENY].freeze 17 | 18 | attr_reader :policy, :source 19 | 20 | def initialize(policy, url_pattern:, source: nil) 21 | unless SUPPORTED_POLICIES.include?(policy) 22 | raise ArgumentError, "policy: #{policy.inspect} is not a supported value" 23 | end 24 | 25 | unless url_pattern.is_a?(Regexp) 26 | raise ArgumentError, "url_pattern: must be a Regexp, it was #{url_pattern.class}" 27 | end 28 | 29 | @policy = policy 30 | @url_pattern = url_pattern 31 | @source = source 32 | end 33 | 34 | def url_match?(url) 35 | Timeout.timeout(REGEX_TIMEOUT) do 36 | @url_pattern.match?(url.to_s) 37 | end 38 | end 39 | 40 | def description 41 | @description ||= "policy: #{@policy}, url_pattern: #{@url_pattern}" 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /lib/crawler/data/seen_urls.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
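To make the two classes above concrete, here is a small sketch: a single `url_filters` entry as it would arrive from the YAML config (symbolized keys), and a crawl `Rule` matching URLs against an already-compiled pattern. The domain and patterns are made up for illustration.

```ruby
# A url_filters entry is validated as soon as it is constructed.
filter = Crawler::Data::Extraction::UrlFilter.new({ type: 'begins', pattern: '/blog/*' })
filter.type     # => "begins"
filter.pattern  # => "/blog/*"

# An unknown type raises immediately, e.g.:
#   UrlFilter.new({ type: 'starts', pattern: '/blog' })
#   # => ArgumentError: ... value must be one of begins, ends, contains, regex

# A crawl rule built from a compiled pattern; url_match? accepts anything
# that responds to #to_s, including plain strings.
rule = Crawler::Data::Rule.new(Crawler::Data::Rule::ALLOW, url_pattern: %r{\Ahttps://example\.com/blog/})
rule.url_match?('https://example.com/blog/post-1') # => true
rule.url_match?('https://example.com/admin')       # => false
```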
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class SeenUrls 12 | def initialize 13 | @seen_urls = Concurrent::Set.new 14 | end 15 | 16 | def clear 17 | @seen_urls.clear 18 | end 19 | 20 | def count 21 | @seen_urls.size 22 | end 23 | 24 | def delete(url) 25 | @seen_urls.delete(url_hash(url)) 26 | end 27 | 28 | # A method called when the crawler needs to stop and persist its state 29 | def save 30 | # nothing to do by default 31 | end 32 | 33 | # Tries to add an item to the set 34 | # Returns +true+ if this is a new URL and we should visit it 35 | # Returns +false+ if we have already seen this URL 36 | def add?(url) 37 | !!@seen_urls.add?(url_hash(url)) 38 | end 39 | 40 | private 41 | 42 | def url_hash(url) 43 | raise ArgumentError, 'Needs a URL' unless url.is_a?(Crawler::Data::URL) 44 | 45 | url.normalized_hash 46 | end 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/crawler/data/url_queue.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | module UrlQueue 12 | class Error < StandardError; end 13 | 14 | class TransientError < Error; end 15 | 16 | class QueueFullError < TransientError; end 17 | 18 | def self.create(config) 19 | queue_type = config.url_queue.to_s 20 | queue_class_for_type(queue_type).new(config) 21 | end 22 | 23 | def self.queue_class_for_type(queue_type) 24 | queue_class_name = "Crawler::Data::UrlQueue::#{queue_type.classify}" 25 | queue_class_name.safe_constantize.tap do |queue_class| 26 | raise ArgumentError, "Unknown URL queue type: #{queue_type}" unless queue_class 27 | end 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/crawler/executor.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # An Executor fetches content by making requests described by CrawlTasks. 10 | module Crawler 11 | class Executor 12 | def run(_crawl_task) 13 | raise NotImplementError 14 | end 15 | 16 | # Override to provide stats about the HTTP client 17 | def http_client_status 18 | raise NotImplementedError 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/http_utils/all_trusting_trust_manager.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
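The `SeenUrls` set above deduplicates by a normalized URL hash, so a page that is discovered through several links is only visited once. A sketch, assuming `Crawler::Data::URL.parse` as used elsewhere in this codebase:

```ruby
seen = Crawler::Data::SeenUrls.new
url  = Crawler::Data::URL.parse('https://example.com/page?a=1')

seen.add?(url)  # => true  (first sighting: crawl it)
seen.add?(url)  # => false (already seen: skip it)
seen.count      # => 1

seen.delete(url)
seen.add?(url)  # => true again once the URL has been removed
```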
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | java_import javax.net.ssl.X509TrustManager 10 | 11 | # A simple implementation of the trust manager interface that trusts everyone 12 | # Used by the Crawler HTTP client to implement ssl_verification_mode=none. 13 | module Crawler 14 | module HttpUtils 15 | class AllTrustingTrustManager 16 | include X509TrustManager 17 | 18 | # rubocop:disable Naming/MethodName 19 | def checkClientTrusted(*) 20 | true 21 | end 22 | 23 | def checkServerTrusted(*) 24 | true 25 | end 26 | 27 | def getAcceptedIssuers 28 | [] 29 | end 30 | # rubocop:enable Naming/MethodName 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/crawler/logging/handler/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # base class for all log handlers 10 | module Crawler 11 | module Logging 12 | module Handler 13 | class Base 14 | def initialize(log_level, filename = nil, rotation_period = 'weekly') 15 | @log_level = log_level 16 | @filename = filename 17 | @rotation_period = rotation_period 18 | end 19 | 20 | def log 21 | raise NotImplementedError 22 | end 23 | 24 | def add_tags 25 | raise NotImplementedError 26 | end 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/logging/handler/stdout.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
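`Handler::Base` above defines the contract every log handler must satisfy: `log(message, level)` and `add_tags(*tags)`. As an illustration, a hypothetical in-memory handler (not part of the codebase, but handy in specs) following that contract might look like this; the level filtering shown is an assumption of this sketch, since the base class leaves that decision to each handler.

```ruby
require_relative 'base' # assuming this sits alongside the other handlers

module Crawler
  module Logging
    module Handler
      # Hypothetical handler that collects log lines in memory.
      class Memory < Handler::Base
        attr_reader :messages

        def initialize(log_level)
          super
          @messages = []
        end

        def log(message, message_log_level)
          # Messages below the configured level are dropped; untagged writes
          # (message_log_level of nil) are always kept.
          return if message_log_level && message_log_level < @log_level

          @messages << message
        end

        def add_tags(*tags)
          @tags = tags.flatten
        end
      end
    end
  end
end
```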
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module Logging 13 | module Handler 14 | attr_reader :event_logger, :logger_instance 15 | 16 | class StdoutHandler < Handler::Base 17 | def initialize(log_level) 18 | super 19 | # logger instance setup 20 | logger_instance = Logger.new($stdout) 21 | logger_instance.level = log_level 22 | # Set a base format to include timestamp 23 | format_logger(logger_instance) 24 | # convert logger instance to a StaticallyTaggedLogger so we can support tagging 25 | @logger_instance = logger_instance 26 | end 27 | 28 | def log(message, message_log_level) 29 | case message_log_level 30 | when Logger::DEBUG 31 | @logger_instance.debug(message) 32 | when Logger::INFO 33 | @logger_instance.info(message) 34 | when Logger::WARN 35 | @logger_instance.warn(message) 36 | when Logger::ERROR 37 | @logger_instance.error(message) 38 | when Logger::FATAL 39 | @logger_instance.fatal(message) 40 | else 41 | @logger_instance << message 42 | end 43 | end 44 | 45 | def add_tags(*tags) 46 | # this function re-formats the log format with the provided tags 47 | format_logger(@logger_instance, tags.join(' ')) 48 | end 49 | 50 | def format_logger(logger_instance, tags = nil) 51 | logger_instance.formatter = proc do |_severity, datetime, _progname, msg| 52 | timestamp = datetime.strftime('%Y-%m-%dT%H:%M:%S.%LZ') 53 | if tags 54 | "[#{timestamp}] #{tags} #{msg}\n" 55 | else 56 | "[#{timestamp}] #{msg}\n" 57 | end 58 | end 59 | end 60 | 61 | def level(log_level) 62 | @logger_instance.level = log_level 63 | end 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/logging/logger.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsarch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # logging monolith class that maintains 10 | # a. list of log handlers 11 | # b. 
routing function to route log messages to all handlers 12 | module Crawler 13 | module Logging 14 | class CrawlLogger 15 | attr_reader :all_handlers 16 | 17 | def initialize 18 | # initialize with no handlers by default 19 | @all_handlers = [] 20 | end 21 | 22 | def route_logs_to_handlers(message, message_log_level) 23 | all_handlers.each do |handler| 24 | handler.log(message, message_log_level) 25 | end 26 | end 27 | 28 | def debug(message) 29 | route_logs_to_handlers(message, Logger::DEBUG) 30 | end 31 | 32 | def info(message) 33 | route_logs_to_handlers(message, Logger::INFO) 34 | end 35 | 36 | def warn(message) 37 | route_logs_to_handlers(message, Logger::WARN) 38 | end 39 | 40 | def error(message) 41 | route_logs_to_handlers(message, Logger::ERROR) 42 | end 43 | 44 | def fatal(message) 45 | route_logs_to_handlers(message, Logger::FATAL) 46 | end 47 | 48 | def add(custom_log_level, message) 49 | route_logs_to_handlers(message, custom_log_level) 50 | end 51 | 52 | def <<(message) 53 | route_logs_to_handlers(message, nil) 54 | end 55 | 56 | def add_handler(new_handler) 57 | all_handlers.append(new_handler) 58 | end 59 | 60 | def add_tags_to_log_handlers(tags) 61 | all_handlers.each do |handler| 62 | handler.add_tags(tags) 63 | end 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/mock_event_logger.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'logger' 10 | 11 | module Crawler 12 | class MockEventLogger 13 | # Array of accumulated events (hash objects). 14 | attr_reader :mock_events 15 | 16 | def initialize 17 | @mock_events = [] 18 | end 19 | 20 | def <<(event) 21 | # Since we receive an already serialized event, but want to run tests against raw events 22 | original_event = JSON.parse(event) 23 | mock_events << original_event 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/crawler/mock_executor.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'executor') 10 | 11 | # MockExecutor returns pre-populated results for specified URIs. 12 | module Crawler 13 | class MockExecutor < Crawler::Executor 14 | attr_reader :mock_results 15 | 16 | def initialize(mock_results = {}) # rubocop:disable Lint/MissingSuper 17 | @mock_results = mock_results # Hash of normalized URL strings to CrawlResponse objects. 18 | end 19 | 20 | def http_client_status 21 | {} 22 | end 23 | 24 | # The arg `follow_redirects` is required despite not being used within the method. 25 | # This is because the mock is called using expected args in specs. 
26 | def run(crawl_task, follow_redirects: false) # rubocop:disable Lint/UnusedMethodArgument 27 | url = crawl_task.url 28 | mock_results.fetch(url.to_s, mock_404_result(url)) 29 | end 30 | 31 | def mock_404_result(url) 32 | Crawler::Data::CrawlResult::Error.new( 33 | url:, 34 | status_code: 404, 35 | error: 'Not found' 36 | ) 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/crawler/output_sink.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module OutputSink 11 | def self.create(config) 12 | sink_type = config.output_sink.to_s 13 | sink_class_for_type(sink_type).new(config) 14 | end 15 | 16 | def self.sink_class_for_type(sink_type) 17 | sink_class_name = "::Crawler::OutputSink::#{sink_type.classify}" 18 | sink_class_name.safe_constantize.tap do |sink_class| 19 | raise ArgumentError, "Unknown output sink: #{sink_type.inspect}" unless sink_class 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, '..', 'output_sink') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Base 14 | attr_reader :config, :rule_engine 15 | 16 | delegate :crawl_id, :document_mapper, :events, :system_logger, to: :config 17 | 18 | def initialize(config) 19 | @config = config 20 | @rule_engine = create_rule_engine 21 | end 22 | 23 | def create_rule_engine 24 | Crawler::RuleEngine::Base.new(config) 25 | end 26 | 27 | def write(_crawl_result) 28 | raise NotImplementedError 29 | end 30 | 31 | def fetch_purge_docs(_crawl_start_time) 32 | raise NotImplementedError 33 | end 34 | 35 | def purge(_crawl_start_time) 36 | raise NotImplementedError 37 | end 38 | 39 | def to_doc(crawl_result) 40 | document_mapper.create_doc(crawl_result) 41 | end 42 | 43 | def close 44 | # To be implemented by the sink if needed. 45 | # Does nothing by default. 46 | end 47 | 48 | def flush 49 | # To be implemented by the sink if needed. 50 | # Does nothing by default. 51 | end 52 | 53 | # Returns a hash with the outcome of crawl result ingestion (to be used for logging above) 54 | def outcome(outcome, message) 55 | { outcome:, message: } 56 | end 57 | 58 | def success(message = 'Successfully ingested crawl result') 59 | outcome(:success, message) 60 | end 61 | 62 | def failure(message) 63 | outcome(:failure, message) 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/console.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Console < OutputSink::Base 14 | def write(crawl_result) 15 | puts "# #{crawl_result.id}, #{crawl_result.url}, #{crawl_result.status_code}" 16 | 17 | if crawl_result.content_extractable_file? 18 | puts "** [Content extractable file (content type: #{crawl_result.content_type}, " \ 19 | "content length: #{crawl_result.content.bytesize})] **" 20 | else 21 | puts crawl_result.content 22 | end 23 | 24 | success 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/file.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class File < OutputSink::Base 14 | attr_reader :dir 15 | 16 | def initialize(*) 17 | super 18 | 19 | @dir = config.output_dir 20 | raise ArgumentError, 'Missing or invalid output directory' if !dir.is_a?(String) || dir.empty? 21 | 22 | FileUtils.mkdir_p(dir) 23 | end 24 | 25 | def write(crawl_result) 26 | doc = to_doc(crawl_result) 27 | result_file = "#{dir}/#{crawl_result.url_hash}.json" 28 | ::File.write(result_file, doc.to_json) 29 | 30 | success 31 | end 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/mock.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Mock < OutputSink::Base 14 | attr_reader :results 15 | 16 | def initialize(*) 17 | super 18 | 19 | @results = config.results_collection 20 | raise ArgumentError, 'Needs a ResultsCollection' unless results.is_a?(ResultsCollection) 21 | end 22 | 23 | def write(crawl_result) 24 | results.append(crawl_result) 25 | 26 | success 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/null.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
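For context on how the sinks above are selected: `OutputSink.create` (shown earlier) turns the configured `output_sink` value into a class name with ActiveSupport's `classify` and `safe_constantize`. A sketch of that mapping:

```ruby
require 'active_support'
require 'active_support/core_ext/string'

'console'.classify        # => "Console"
'elasticsearch'.classify  # => "Elasticsearch"
'file'.classify           # => "File"

# OutputSink.create then resolves "::Crawler::OutputSink::Console" and friends
# with safe_constantize, which returns nil (and triggers the ArgumentError)
# when the configured name does not map to a sink class:
'::Crawler::OutputSink::NoSuchSink'.safe_constantize # => nil
```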
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Null < OutputSink::Base 14 | def write(_) 15 | # Discard the results 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/crawl_rules_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::CrawlRulesCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_crawl_rules 14 | rule_engine = Crawler::RuleEngine::Elasticsearch.new(crawler_api_config) 15 | outcome = rule_engine.crawl_rules_outcome(normalized_url) 16 | rule = outcome.details[:rule] 17 | 18 | if outcome.allowed? 19 | validation_ok(:crawl_rules, 'The URL is allowed by one of the crawl rules', rule: rule.source) 20 | elsif rule 21 | validation_fail(:crawl_rules, 'The URL is denied by a crawl rule', rule: rule.source) 22 | else 23 | # This should never happen, but we're including it here to be safe 24 | validation_fail(:crawl_rules, 'The URL is denied because it did not match any rules') 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/dns_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'resolv' 10 | 11 | module Crawler 12 | module UrlValidator::DnsCheckConcern # rubocop:disable Style/ClassAndModuleChildren 13 | extend ActiveSupport::Concern 14 | 15 | def validate_dns 16 | if proxy_configured? 17 | warning = 'DNS resolution check could not be performed via an HTTP proxy.' 18 | return validation_warn(:dns, warning) 19 | end 20 | 21 | # Prepare DNS resolvers 22 | resolv = Resolv.new([ 23 | Resolv::Hosts.new, 24 | Resolv::DNS.new.tap do |dns| 25 | dns.timeouts = Crawler::UrlValidator::DNS_CHECK_TIMEOUT 26 | end 27 | ]) 28 | 29 | # Check DNS 30 | addresses = resolv.getaddresses(url.host) 31 | 32 | if addresses.empty? 33 | validation_fail(:dns, 'DNS name resolution failed. No suitable addresses found!') 34 | else 35 | validation_ok(:dns, "Domain name resolution successful: #{addresses.count} addresses found", 36 | addresses:) 37 | end 38 | rescue Resolv::ResolvError, ArgumentError => e 39 | validation_fail(:dns, <<~MESSAGE) 40 | DNS resolution failure: #{e}. Please check the spelling of your domain 41 | or your DNS configuration. 42 | MESSAGE 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/domain_access_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::DomainAccessCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_domain_access 14 | if crawler_api_config.domain_allowlist.include?(url.domain) 15 | validation_ok(:domain_access, 'The URL matches one of the configured domains', domain: url.domain_name) 16 | else 17 | validation_fail(:domain_access, 'The URL does not match any configured domains') 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/domain_uniqueness_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::DomainUniquenessCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_domain_uniqueness 14 | if crawler_api_config.domain_allowlist.include?(url.domain) 15 | validation_fail(:domain_uniqueness, 'Domain name already exists') 16 | else 17 | validation_ok(:domain_uniqueness, 'Domain name is new', domain: url.domain_name) 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/result.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | class UrlValidator::Result # rubocop:disable Style/ClassAndModuleChildren 11 | attr_reader :name, :result, :comment, :details 12 | 13 | def initialize(name:, result:, comment:, details: {}) 14 | @name = name 15 | @result = result 16 | @comment = comment 17 | @details = details 18 | end 19 | 20 | def failure? 21 | result == :failure 22 | end 23 | 24 | def to_h 25 | { name:, result:, comment: }.tap do |res| 26 | res[:details] = details if details.any? 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/tcp_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::TcpCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_tcp 14 | if proxy_configured? 15 | warning = 'TCP connection check could not be performed via an HTTP proxy.' 
16 | return validation_warn(:tcp, warning) 17 | end 18 | 19 | host = url.host 20 | port = url.inferred_port 21 | details = { host:, port: } 22 | 23 | Socket.tcp(host, port, connect_timeout: Crawler::UrlValidator::TCP_CHECK_TIMEOUT) do 24 | validation_ok(:tcp, 'TCP connection successful', details) 25 | end 26 | rescue Errno::ETIMEDOUT 27 | validation_fail(:tcp, <<~MESSAGE, details) 28 | TCP connection to #{host}:#{port} timed out. Please make sure the crawler 29 | instance is allowed to connect to your servers. 30 | MESSAGE 31 | rescue SocketError, SystemCallError => e 32 | validation_fail(:tcp, "TCP connection to #{host}:#{port} failed: #{e}", details) 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/url_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::UrlCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_url # rubocop:disable Metrics/AbcSize 14 | if url.scheme.blank? 15 | validation_fail(:url, 'URL scheme is missing. Domain URLs must start with https:// or http://') 16 | elsif !url.supported_scheme? 17 | validation_fail(:url, "Unsupported URL scheme: #{url.scheme}", scheme: url.scheme) 18 | elsif url.path.present? && !configuration 19 | validation_fail(:url, 'Domain URLs cannot contain a path') 20 | else 21 | validation_ok(:url, 'URL structure looks valid') 22 | end 23 | rescue Addressable::URI::InvalidURIError => e 24 | validation_fail(:url, "Error parsing domain name: #{e}") 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/crawler/utils.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | class Utils 11 | def self.url_pattern(domain, type, pattern) 12 | "\\A#{Regexp.escape(domain)}#{path_pattern(type, pattern)}" 13 | end 14 | 15 | def self.path_pattern(type, pattern) 16 | case type 17 | when 'begins' 18 | pattern_with_wildcard(pattern) 19 | when 'ends' 20 | ".*#{pattern_with_wildcard(pattern)}\\z" 21 | when 'contains' 22 | ".*#{pattern_with_wildcard(pattern)}" 23 | when 'regex' 24 | pattern 25 | end 26 | end 27 | 28 | def self.pattern_with_wildcard(pattern) 29 | Regexp.escape(pattern).gsub('\*', '.*') 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/environment.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
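The `Utils.url_pattern` helper above is what turns an extraction-rule URL filter into an anchored regex string: wildcards (`*`) in begins/ends/contains patterns become `.*`, literal characters are escaped, and regex patterns are passed through as-is. A few illustrative calls against a made-up domain:

```ruby
domain = 'https://example.com'

Crawler::Utils.url_pattern(domain, 'begins',   '/blog/*') # => "\\Ahttps://example\\.com/blog/.*"
Crawler::Utils.url_pattern(domain, 'ends',     '.html')   # => "\\Ahttps://example\\.com.*\\.html\\z"
Crawler::Utils.url_pattern(domain, 'contains', 'docs')    # => "\\Ahttps://example\\.com.*docs"

Regexp.new(Crawler::Utils.url_pattern(domain, 'begins', '/blog/*'))
      .match?('https://example.com/blog/2024/hello') # => true
```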
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Add the lib directory to the load path 10 | $LOAD_PATH << __dir__.to_s 11 | 12 | # Calculate the current environment 13 | CRAWLER_ENV = ENV.fetch('CRAWLER_ENV', 'development') 14 | 15 | # Set up bundler 16 | require 'rubygems' 17 | require 'bundler' 18 | Bundler.setup(:default, CRAWLER_ENV) 19 | 20 | # Load common dependencies 21 | require 'active_support' 22 | require 'active_support/core_ext' 23 | require 'active_support/dependencies' 24 | 25 | # Load crawler components 26 | require 'crawler' 27 | -------------------------------------------------------------------------------- /lib/errors.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | class Errors 10 | # Raised only if the queue item added somehow overflows the queue threshold. 11 | # The queue threshold is checked before an item is added so this error shouldn't occur. 12 | # If this error occurs, something is wrong with the interaction between the Elasticsearch sink and BulkQueue. 13 | class BulkQueueOverflowError < StandardError; end 14 | 15 | # Raised when attempting to add a crawl result to the sink, but it is currently locked. 16 | # This is specific for Elasticsearch sink. Basically the sink is single-threaded but 17 | # receives crawl results from multi-threaded processes. This error is raised to prevent 18 | # overloading the queue if Elasticsearch indexing is failing repeatedly and performing 19 | # exponential backoff. This error should be treated as retryable. 20 | class SinkLockedError < StandardError; end 21 | 22 | # Raised when there is a connection error to Elasticsearch. Specific for Elasticsearch sink. 23 | # During initialization of the Elasticsearch sink, it will attempt to make contact to 24 | # the host provided in the configuration. If contact cannot be established, a system exit will occur. 25 | class ExitIfESConnectionError < SystemExit; end 26 | 27 | # Raised when the desired output index does not exist. This is specific for Elasticsearch 28 | # sink. During initialization of the Elasticsearch sink, it will call indices.exists() 29 | # against the output_index value, and will continue if the index is found. 30 | # If it is not found, this error will be raised, which causes a system exit to occur. 
31 | class ExitIfUnableToCreateIndex < SystemExit; end 32 | end 33 | -------------------------------------------------------------------------------- /product_version: -------------------------------------------------------------------------------- 1 | 0.3.0 2 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "github>elastic/renovate-config:only-chainguard" 5 | ], 6 | "schedule": [ 7 | "* * * * 0,6" 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /script/bundle: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source "$(dirname $0)/functions.sh" 4 | 5 | set -e 6 | 7 | # Used by Gemfile to limit direct access to bundle commands 8 | export SCRIPT_BUNDLE=true 9 | 10 | # Tune for faster startup 11 | export JRUBY_OPTS="${JRUBY_OPTS:-} --dev --debug" 12 | export JAVA_OPTS="-Xmx2g ${JAVA_OPTS:-} -Djava.awt.headless=true -Dsun.jnu.encoding=UTF-8 -Dfile.encoding=UTF-8 -XX:+HeapDumpOnOutOfMemoryError" 13 | 14 | function bundle_command() { 15 | if ! bundle "$@"; then 16 | set +x 17 | echo 18 | red_echo "ERROR: Bundle command failed!" 19 | yellow_echo "Try to run 'make install' and then retry this command" 20 | echo 21 | exit 42 22 | fi 23 | } 24 | 25 | BUNDLER_VERSION="$(cat .bundler-version)" 26 | BUNDLER_CONSTRAINT="~> $BUNDLER_VERSION" 27 | 28 | blue_echo "Bundling jruby gems..." 29 | bundle_command config cache_all true 30 | 31 | blue_echo "Running the bundle command..." 32 | bundle_command "$@" 33 | 34 | green_echo "Done!" 35 | echo 36 | -------------------------------------------------------------------------------- /script/environment: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source "$(dirname $0)/functions.sh" 4 | 5 | set -e 6 | 7 | load_version_constraints 8 | check_bundle 9 | -------------------------------------------------------------------------------- /script/licenses/README.md: -------------------------------------------------------------------------------- 1 | # 3rd Party :tada: dependencies 2 | 3 | This directory contains scripts and files for generating a `NOTICE.txt` file containing all licenses for the third-party dependencies that Crawler uses. 4 | It will look at the SPDX license for Ruby gems. 5 | If this cannot be found, it will attempt to download the LICENSE file and add it to the project for future reference. 6 | When a LICENSE file doesn't exist (or is in an unexpected location or format), a manual override must be added. 7 | 8 | Downloaded license files are added to the directories `rubygems_licenses` or `misc_licneses`. 9 | 10 | All license texts are then added to the repository's [NOTICE.txt](../../NOTICE.txt) file. 11 | 12 | ## Types of dependencies 13 | 14 | - Ruby Gems from `Gemfile` and `Gemfile.lock` 15 | - Misc. dependencies, like JRuby, Tika, etc. not managed by a package manager 16 | 17 | ## Generate NOTICE.txt 18 | 19 | ```bash 20 | ./script/licenses/generate_notice_txt.rb 21 | ``` 22 | -------------------------------------------------------------------------------- /script/licenses/generate_notice.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # 4 | # Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one 5 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 6 | # you may not use this file except in compliance with the Elastic License 2.0. 7 | # 8 | 9 | # frozen_string_literal: true 10 | 11 | NOTICE_TXT_PATH = File.expand_path('../../NOTICE.txt', __dir__) 12 | 13 | require_relative 'lib/third_party' 14 | 15 | def write_header_to_file(io) 16 | io.puts 'Elastic Open Web Crawler' 17 | io.puts 'Copyright 2024 Elasticsearch B.V.' 18 | io.puts 19 | io.puts 'The Elastic Open Web Crawler contains the following third-party dependencies:' 20 | io.puts 21 | end 22 | 23 | def write_license_to_file(io, klass_instance, identifier, dependency) 24 | io.puts '-' * 80 25 | io.puts "Library: #{klass_instance.format_library_for_notice_txt(identifier, dependency)}" 26 | io.puts "URL: #{dependency[:url]}" if dependency[:url] 27 | io.puts "License: #{dependency[:license]}" if dependency[:license] 28 | io.puts 29 | File.open(dependency[:license_file_path], 'r') do |license_file| 30 | io.puts(license_file.read) 31 | io.puts 32 | end 33 | end 34 | 35 | File.open(NOTICE_TXT_PATH, 'w') do |io| 36 | write_header_to_file(io) 37 | 38 | [ 39 | ThirdParty::RubygemsDependencies, 40 | ThirdParty::MiscDependencies 41 | ].each do |klass| 42 | klass_instance = klass.new 43 | dependencies = klass_instance.get(with_license_files: true) 44 | dependencies.keys.sort.each do |identifier| 45 | dependency = dependencies.fetch(identifier) 46 | 47 | unless dependency[:license_file_path] 48 | ThirdParty::LOGGER.error("There is no license file for #{identifier}!") 49 | exit(1) 50 | end 51 | 52 | unless File.exist?(dependency[:license_file_path]) 53 | err = "License file for #{identifier} does not exist locally (path: #{dependency[:license_file_path]})" 54 | ThirdParty::LOGGER.error(err) 55 | exit(2) 56 | end 57 | 58 | write_license_to_file(io, klass_instance, identifier, dependency) 59 | end 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /script/licenses/lib/third_party.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'logger' 10 | 11 | module ThirdParty 12 | LOGGER = Logger.new($stdout, level: Logger::DEBUG) 13 | 14 | LICENSE_FILE_NAME_OPTIONS = %w[ 15 | LICENSE 16 | LICENSE.md 17 | LICENSE.txt 18 | License.txt 19 | LICENCE 20 | LICENSE-MIT 21 | Licence.md 22 | Licence.rdoc 23 | MIT_LICENSE 24 | MIT-LICENSE 25 | MIT-LICENSE.txt 26 | BSDL 27 | COPYING 28 | COPYING.txt 29 | ].freeze 30 | UNKNOWN_LICENSE = 'UNKNOWN' 31 | 32 | module SPDX 33 | class << self 34 | def normalize_license(license) 35 | return license if SUPPORTED_IDENTIFIERS.include?(license) || license.match?(/\s+OR|AND|WITH\s+/) 36 | 37 | ALIASES.fetch(license, nil) 38 | end 39 | end 40 | 41 | SUPPORTED_IDENTIFIERS = %w[ 42 | 0BSD 43 | Apache-2.0 44 | AFL-2.1 45 | BSD-2-Clause 46 | BSD-3-Clause 47 | CC0-1.0 48 | CC-BY-3.0 49 | CC-BY-4.0 50 | Elastic-2.0 51 | EPL-1.0 52 | ISC 53 | GPL-2.0 54 | LGPL-2.1 55 | MIT 56 | MPL-2.0 57 | Ruby 58 | Unlicense 59 | ].freeze 60 | 61 | IDENTIFIER_TO_ALIASES = { 62 | 'AFL-2.1' => [ 63 | 'AFLv2.1' 64 | ], 65 | 'BSD-2-Clause' => [ 66 | 'BSD 2-Clause', 67 | 'BSD', 68 | 'BSD*', 69 | '2-clause BSDL' 70 | ], 71 | 'Apache-2.0' => [ 72 | 'Apache License Version 2.0', 73 | 'Apache License (2.0)' 74 | ], 75 | 'Ruby' => [ 76 | 'ruby' 77 | ], 78 | 'Python-2.0' => [ 79 | 'PSFL' 80 | ], 81 | 'MIT' => [ 82 | 'MIT*' 83 | ] 84 | }.freeze 85 | 86 | ALIASES = IDENTIFIER_TO_ALIASES.each_with_object({}) do |(spdx_identifier, aliases), out| 87 | aliases.each do |a| 88 | out[a] = spdx_identifier 89 | end 90 | end 91 | end 92 | end 93 | 94 | require_relative 'third_party/misc_dependencies' 95 | require_relative 'third_party/rubygems_dependencies' 96 | -------------------------------------------------------------------------------- /script/licenses/lib/third_party/misc_dependencies.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'pathname' 10 | require_relative 'base' 11 | 12 | module ThirdParty 13 | class MiscDependencies < Base 14 | def type 15 | 'Misc. 
Dependency' 16 | end 17 | 18 | def licenses_path 19 | LICENSES_PATH 20 | end 21 | 22 | def license_fallbacks 23 | {} 24 | end 25 | 26 | def license_file_fallbacks 27 | DEPENDENCIES.transform_values do |dependency| 28 | dependency.fetch(:license_file_override) 29 | end 30 | end 31 | 32 | def get(with_license_files: false) 33 | DEPENDENCIES.each_with_object({}) do |(identifier, dependency), out| 34 | out[identifier] = dependency.slice(:name, :version, :license, :url) 35 | 36 | out[identifier][:license_file_path] = license_file_path_for_dependency(identifier) if with_license_files 37 | end 38 | end 39 | 40 | LICENSES_PATH = Pathname.new(__dir__).join('..', '..', 'misc_licenses') 41 | JRUBY_VERSION = File.read(File.expand_path('../../../../.ruby-version', __dir__)).strip.delete_prefix('jruby-') 42 | 43 | DEPENDENCIES = { 44 | 'jruby' => { 45 | name: 'jruby', 46 | version: JRUBY_VERSION, 47 | license: 'EPL-2.0 OR GPL-2.0 OR LGPL-2.1', 48 | license_file_override: { manually_added: true }, 49 | url: 'https://www.jruby.org' 50 | }, 51 | 'tika' => { 52 | name: 'tika', 53 | version: '1.23', 54 | license: 'Apache-2.0', 55 | license_file_override: { manually_added: true }, 56 | url: 'https://github.com/apache/tika' 57 | } 58 | }.freeze 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /script/licenses/misc_licenses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/script/licenses/misc_licenses/.gitkeep -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/script/licenses/rubygems_licenses/.gitkeep -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_faux-LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2024 Elasticsearch B.V. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_httpclient-LICENSE.txt: -------------------------------------------------------------------------------- 1 | This program is copyrighted free software by NAKAMURA, Hiroshi. You can 2 | redistribute it and/or modify it under the same terms of Ruby's license; 3 | either the dual license version in 2003, or any later version. 4 | 5 | httpclient/session.rb is based on http-access.rb in http-access/0.0.4. Some 6 | part of it is copyrighted by Maebashi-san who made and published 7 | http-access/0.0.4. http-access/0.0.4 did not include license notice but when 8 | I asked Maebashi-san he agreed that I can redistribute it under the same terms 9 | of Ruby. Many thanks to Maebashi-san. 10 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_minitest-LICENSE.txt: -------------------------------------------------------------------------------- 1 | (The MIT License) 2 | 3 | Copyright (c) Ryan Davis, seattle.rb 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_strscan-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 1999-2006 Minero Aoki. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 1. Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 13 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 14 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 15 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 16 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 17 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 18 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 19 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 20 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 21 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 22 | SUCH DAMAGE. -------------------------------------------------------------------------------- /script/rspec: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | export JRUBY_OPTS="${JRUBY_OPTS} --debug" 6 | 7 | BUNDLE_CMD=${BUNDLE_CMD:-bundle} 8 | $BUNDLE_CMD exec rspec $* 9 | -------------------------------------------------------------------------------- /script/support/string_colors.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # String colorization extensions 4 | class String 5 | def colorize(color_code) 6 | "\e[#{color_code}m#{self}\e[0m" 7 | end 8 | 9 | def red 10 | colorize(31) 11 | end 12 | 13 | def green 14 | colorize(32) 15 | end 16 | 17 | def yellow 18 | colorize(33) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /script/vendor_jars: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env jruby 2 | # frozen_string_literal: true 3 | 4 | require 'jar-dependencies' 5 | 6 | Jars.lock_down( 7 | debug: ENV['JARS_DEBUG'] == 'true', 8 | verbose: ENV['JARS_VERBOSE'] == 'true', 9 | vendor_dir: 'vendor/jars' 10 | ) 11 | -------------------------------------------------------------------------------- /spec/fixtures/crawl-flat-format.yml: -------------------------------------------------------------------------------- 1 | domains: [{url: "https://localhost:80", seed_urls: ["https://localhost:80", "https://localhost:80/news/"]}] 2 | schedule.pattern: '* * * * *' 3 | # Where to send the results. Possible values are console, file, or elasticsearch 4 | output_sink: elasticsearch 5 | # Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch 6 | output_index: test-index 7 | # Crawl tuning 8 | max_crawl_depth: 2 9 | # Crawl result field size limits 10 | max_title_size: 500 11 | max_body_size: 5_242_880 # 5 megabytes 12 | max_keywords_size: 512 13 | max_description_size: 512 14 | max_indexed_links_count: 5 15 | max_headings_count: 5 16 | # elasticsearch settings 17 | elasticsearch.host: http://localhost 18 | elasticsearch.port: 9200 19 | elasticsearch.username: elastic 20 | elasticsearch.password: changeme 21 | elasticsearch.bulk_api.max_items: 10 22 | elasticsearch.bulk_api.max_size_bytes: 1_048_576 -------------------------------------------------------------------------------- /spec/fixtures/crawl.yml: -------------------------------------------------------------------------------- 1 | # Domains allowed for the crawl 2 | domains: 3 | - url: https://localhost:80 4 | seed_urls: 5 | - https://localhost:80 6 | - https://localhost:80/news/ 7 | 8 | schedule: 9 | pattern: '* * * * *' 10 | 11 | # Where to send the results. Possible values are console, file, or elasticsearch 12 | output_sink: elasticsearch 13 | 14 | # Elasticsearch index name to ingest crawl results into. 
Required if output_sink is elasticsearch 15 | output_index: test-index 16 | 17 | # Crawl tuning 18 | max_crawl_depth: 2 19 | 20 | # Crawl result field size limits 21 | max_title_size: 500 22 | max_body_size: 5_242_880 # 5 megabytes 23 | max_keywords_size: 512 24 | max_description_size: 512 25 | max_indexed_links_count: 5 26 | max_headings_count: 5 27 | 28 | elasticsearch: 29 | host: http://localhost 30 | port: 9200 31 | username: elastic 32 | password: changeme 33 | bulk_api: 34 | max_items: 10 35 | max_size_bytes: 1_048_576 36 | -------------------------------------------------------------------------------- /spec/fixtures/do-not-visit.txt: -------------------------------------------------------------------------------- 1 | http://127.0.0.1:9393/do-not-visit-here 2 | -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch-flat-format.yml: -------------------------------------------------------------------------------- 1 | elasticsearch.host: http://test:9200 2 | elasticsearch.username: test 3 | elasticsearch.password: changeme-test 4 | elasticsearch.api_key: 1234 5 | elasticsearch.pipeline: ent-search-generic-ingestion 6 | elasticsearch.pipeline_enabled: true 7 | elasticsearch.pipeline_params._reduce_whitespace: true 8 | elasticsearch.pipeline_params._run_ml_inference: true 9 | elasticsearch.pipeline_params._extract_binary_content: true 10 | elasticsearch.bulk_api.max_items: 10 11 | elasticsearch.bulk_api.max_size_bytes: 1_048_576 -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch-partially-flat-format.yml: -------------------------------------------------------------------------------- 1 | elasticsearch: 2 | host: http://test:9200 3 | username: test 4 | password: changeme-test 5 | api_key: 1234 6 | pipeline: ent-search-generic-ingestion 7 | pipeline_enabled: true 8 | pipeline_params._reduce_whitespace: true 9 | pipeline_params._run_ml_inference: true 10 | pipeline_params._extract_binary_content: true 11 | bulk_api: 12 | max_items: 10 13 | max_size_bytes: 1_048_576 14 | -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | elasticsearch: 2 | host: http://test:9200 3 | username: test 4 | password: changeme-test 5 | api_key: 1234 6 | pipeline: ent-search-generic-ingestion 7 | pipeline_enabled: true 8 | pipeline_params: 9 | _reduce_whitespace: true 10 | _run_ml_inference: true 11 | _extract_binary_content: true 12 | bulk_api: 13 | max_items: 10 14 | max_size_bytes: 1_048_576 15 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_index.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <sitemap> 4 | <loc>http://www.example.com/sitemap1.xml</loc> 5 | <lastmod>2004-10-01T18:23:17+00:00</lastmod> 6 | </sitemap> 7 | <sitemap> 8 | <loc>http://www.example.com/sitemap2.xml</loc> 9 | <lastmod>2005-01-01</lastmod> 10 | </sitemap> 11 | </sitemapindex> 12 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_no_urls.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | </urlset> 4 |
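The sitemap fixtures in this directory follow the sitemaps.org protocol format. The gzipped fixture further below, sitemap_urlset.xml.gz, appears to be simply the compressed form of sitemap_urlset.xml; a minimal sketch of how it could be regenerated with the Ruby standard library (an assumption for illustration, not an existing repo script):

    require 'zlib'

    # Rebuild the gzipped sitemap fixture from its plain-XML counterpart
    Zlib::GzipWriter.open('spec/fixtures/sitemap/sitemap_urlset.xml.gz') do |gz|
      gz.write(File.read('spec/fixtures/sitemap/sitemap_urlset.xml'))
    end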
-------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_urlset.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <url> 4 | <loc>http://www.example.com/</loc> 5 | <lastmod>2005-01-01</lastmod> 6 | <changefreq>monthly</changefreq> 7 | <priority>0.8</priority> 8 | </url> 9 | <url> 10 | <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc> 11 | <changefreq>weekly</changefreq> 12 | </url> 13 | <url> 14 | <loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc> 15 | <lastmod>2004-12-23</lastmod> 16 | <changefreq>weekly</changefreq> 17 | </url> 18 | <url> 19 | <loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc> 20 | <lastmod>2004-12-23T18:00:15+00:00</lastmod> 21 | <priority>0.3</priority> 22 | </url> 23 | <url> 24 | <loc>http://www.example.com/catalog?item=83&amp;desc=vacation_usa</loc> 25 | <lastmod>2004-11-23</lastmod> 26 | </url> 27 | </urlset> 28 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_urlset.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/spec/fixtures/sitemap/sitemap_urlset.xml.gz -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDwDCCAqgCCQCgaeTT+pTAQzANBgkqhkiG9w0BAQsFADCBoTELMAkGA1UEBhMC 3 | VVMxCzAJBgNVBAgMAkNBMRYwFAYDVQQHDA1TYW4gRnJhbmNpc2NvMRAwDgYDVQQK 4 | DAdFbGFzdGljMRowGAYDVQQLDBFFbnRlcnByaXNlIFNlYXJjaDESMBAGA1UEAwwJ 5 | Y3VzdG9tLWNhMSswKQYJKoZIhvcNAQkBFhxlbnRlcnByaXNlLXNlYXJjaEBlbGFz 6 | dGljLmNvMB4XDTIxMDYxMDE2MTcwNFoXDTQ4MTAyNTE2MTcwNFowgaExCzAJBgNV 7 | BAYTAlVTMQswCQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4G 8 | A1UECgwHRWxhc3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNV 9 | BAMMCWN1c3RvbS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hA 10 | ZWxhc3RpYy5jbzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKX79WVB 11 | kDDq/TLCvJWWsTVjuHz4y0Z+iddYazQP2UCPng6uLiUWDmxu8Im+PdVb6iQDYw8N 12 | YgOZm0wUeXoozegs3RfcQHFTGosMVtD7bZrY24+3D4+XagIEe9rKiWBDtK7pHAcC 13 | kQg+2Z53tNu9h1TV8jE/GzjwedMfidHUTTQLMx853AywUEIZTusihrskkQeoWsXI 14 | CfWPWl8vKR1S7IdtnjR21H0RdyWGt7iQZHVy3ChrIWIInaq50qw7OOqzE/JNclOH 15 | 7bL/xBsZbGBIxnpOgMrpJak6NWcouoqH0sCisAqwQnn6kOI7GIrhAxhZa7c9Dbx0 16 | z7MYQfczUoWI6oMCAwEAATANBgkqhkiG9w0BAQsFAAOCAQEApNJwMB5gFQhRkkcz 17 | EQkC5n7ReMWQLyoRl3g8kUyMS9iYMxeJB+tnB8BMICUInpKcRbDlW1pCrstyW311 18 | O1FJweszWP3QRWBz49Cu5EPnFG75PJGnC2lOGcSC81M91yl5EjjvLTTWUcfuoMYF 19 | U2XrSo0LQpZdpzqjnG3ELMrcieplpiz7c/D7YIUK1wA8qy7Aif5uAjueY3NUfYzg 20 | wLdHRX5eRG6e4xV6iMI9ApetT1j2xoUeFHPO6yMRBcsdG+L20w8AAK6dqCa7vYhY 21 | fOKtgLhIR++qThawUwkb8HHHfXaJzP++0swXc0ljE/Uz0FFwRB9zbipVc0p6FAd7 22 | oFJU+w== 23 | -----END CERTIFICATE----- 24 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | Proc-Type: 4,ENCRYPTED 3 | DEK-Info: DES-EDE3-CBC,9A2A4E541C52C308 4 | 5 | vr8Lcl7IK6neqrY6dwJiI864mWl1bVVH+kvPguAMVYp5W+m2KmYnBjvRCok5UFGY 6 | DAPWbtlY3X5eZrL6VKw7kaYEcr6DIlBniet9XOoJzzmGdRIbj1I5O+irdMgZg8SD 7 | RGyEYf6a3rtJj8tSrDjGlmf44xBnowN79QvBbKzCZI8vqlHcW90NZw9b8vpjj2IP 8 | cJXt91m71RwxWvjAOw2SFTXk4okmymrAlAB2+L8GQD/YgacQSihq9vri8aXwzmh2 9 | 8tmzcqGI+AUfxU77n7+dUS6rgJ2yzTLThaStPZiJBOAqrePuc9pyHV+yIYN28+3H 10 | H265/GWJwjvdc8iu0T73JVWplI5yM3xhofLzLXdFhjDoSdlzK9MqX5YhP/eBrolR 11 | Hm1Ly3Gi8WJgvn1LBn3bIZWZPH5Ch7UZt6kTG/TkpC1AKwATIKorWKetCACt2EeO 12 | txPMQt4XAykkzyAiyK4FW4Sh4KAqEGoXN0ELV8TTKk2YEV6tRk0XB3YIQgnDgBgS 13 | 0SvyjVTZ9cbuVAXN6oIhOtsrxU9NSidkAVw/wy5jbPhYHBYCGuM3QvUiMP0f4W69 14 | a4zqPjl36bGd/SDBgwbJmMW2qjzrx7Og0xpJccVQSr+N8JB4/AYBJIxYT4niER+V 15 | rlnEu/7OyVKjuS+EzDgSpoDRyiROgHaq9L5NfRum6b5ZtSdh9bSMoJq55aps+tUe 16 | pyp5ftO6l9ffJbuVdJfA63kvAq0MbwZ7Om10rsjGfM/XYITQNMlrEuGKEIyJIyv+ 17 |
DnP8GUeHapPSTrjzk3cpwYnwknLeYtwM6MdgMSG/rW/Ksd2HoH8kA2WtGOrh34UQ 18 | Q1r9JO3Smog6Iq2DBl77w8oBIIx6LcMe5osUveEIsgoUyQoUMisVOP2kbdbQyN5X 19 | lzHajl/rORV7V2iKhgJ/yEroTwu8XgSBl6mSQFaRSwyqyNJTvqbFdIdN3dXvSqf+ 20 | qmfubcxGS3DRbI0pFGifUAPrD0hej/Dtm1dF9+4edB10hxYoDMiBidWgUaocZD6i 21 | zvKj5/JmOp6LboaW+VzcKFiHJ16Ntpu5I0opZZTjLiOE2l6Xcji9vCMRZcm8UFqo 22 | vWCMsll5cn7w8IluND0vCZvD7xsOjoluSW+XEXzmCSgRVH0Exi0voSuNtousl96i 23 | v5QOn/pIM4+pgBlDlO3qjbtkamZDiHtLepgWFwxb1IK7tmcmUZex/c6YHAheQdq/ 24 | SdpcG2QjvpGvTDndpoK64SP9roZpajnJOeqx3NX3rPs2y14Vu3p504aY9YWHp4QZ 25 | xSgURu0fBP4PFoWlGClJWtoFaCdqeiXSCfTliiR2H+LjYHiVpZ5hcC9yAXx1IRBt 26 | +JtZ1sPVUSJCTJzxWuyjCSMhjbmftG1MC/659CDuoyrGnDYS92iu6wM/tw80LEFm 27 | kwhSxWP4D0yyWBnbp0KdrAOu2cMPhkLMyJ/bGxvBupuVsrXubbfvqVnWn6CI62VW 28 | gAupwn0LnY58/7SrzT1prJKPLBJ0UuxOlfkpPcUxjzc3pyN6VwKHm0fxb9QaaXFN 29 | GWrmjpmJAEcFR5GrHmGiyjKMGvCrH33u5wkoLCD2Dxx4aoC2Mc2U0Q== 30 | -----END RSA PRIVATE KEY----- 31 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.password.txt: -------------------------------------------------------------------------------- 1 | 13243546 -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.cnf: -------------------------------------------------------------------------------- 1 | FQDN = example.org 2 | ORGNAME = Elastic 3 | ALTNAMES = DNS:$FQDN, DNS:www.$FQDN 4 | 5 | [ req ] 6 | default_bits = 2048 7 | default_md = sha256 8 | prompt = no 9 | encrypt_key = no 10 | distinguished_name = dn 11 | req_extensions = req_ext 12 | 13 | [ dn ] 14 | C = CH 15 | O = $ORGNAME 16 | CN = $FQDN 17 | 18 | [ req_ext ] 19 | subjectAltName = $ALTNAMES 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDTjCCAjYCBAdbzRUwDQYJKoZIhvcNAQELBQAwgaExCzAJBgNVBAYTAlVTMQsw 3 | CQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4GA1UECgwHRWxh 4 | c3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNVBAMMCWN1c3Rv 5 | bS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hAZWxhc3RpYy5j 6 | bzAeFw0yMTA2MDExOTM1MzJaFw0yMTA2MDIxOTM1MzJaMDUxCzAJBgNVBAYTAkNI 7 | MRAwDgYDVQQKDAdFbGFzdGljMRQwEgYDVQQDDAtleGFtcGxlLm9yZzCCASIwDQYJ 8 | KoZIhvcNAQEBBQADggEPADCCAQoCggEBAM2U6kepk57OfVBId1b7kkgKF5CIvKWr 9 | v4O9xCh+LnMpWxmpA4IyN66qd2G9aCiXK9d0bCNvue8TC3P5LHlcrfrI+yHmAgRj 10 | YAe249ifkFcQ0HPqPMe3B3+l50kCRkn2Wd0x2Clpz/tGLXVJ2AR/iYsTAOGMzC2O 11 | Ldv/F1pbDJIw4PTLTQBfDTh2m/S/GHz4b5ZetONlbHPXo2H/wj6/OHFAkcvvlzQa 12 | Qr8YT0/uyHzJBa6AQPQqUljjJSSAHnnC4fASwaFLUGjULrQmhwJzktukFk2eXGiu 13 | oo3prgoDQqRvknu9S4skTxn6Ku59VGjMfUMTExSOy3znSqye3HSVJBECAwEAATAN 14 | BgkqhkiG9w0BAQsFAAOCAQEAgjFqbtHkq6LsN1PGKKXlyJadr90AyD0TF1yA5tTA 15 | dtrixhgQFdnigR85Nyd9aKb8x7ocxmwotX3+WLwNb/+SmyICATJ5qCjuYACxx78z 16 | tmGqQEqYL4xF/gPxClkqPnCGM5kocu9Ct+3G5HJejghA4fbspx/2QtVbMa69Ac5B 17 | vZpFHXXiHtoWmK2skBxHJ5LAgq2LEWQVXzX9IDKy01qn+Jv+rD+G5vfdYaunldrI 18 | JrnRVsbt0ufCGzzqyesHUIUY8UTWLWeZ0Gr8XS0U5mfHqgwMR8PGrF/0sOdv/Jg1 19 | 910D5RGAo6niyi8fcdfxIPqmjhec6XbaJAvTQNgDmzVrkQ== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIICtDCCAZwCAQAwNTELMAkGA1UEBhMCQ0gxEDAOBgNVBAoMB0VsYXN0aWMxFDAS 3 | 
BgNVBAMMC2V4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC 4 | AQEAzZTqR6mTns59UEh3VvuSSAoXkIi8pau/g73EKH4ucylbGakDgjI3rqp3Yb1o 5 | KJcr13RsI2+57xMLc/kseVyt+sj7IeYCBGNgB7bj2J+QVxDQc+o8x7cHf6XnSQJG 6 | SfZZ3THYKWnP+0YtdUnYBH+JixMA4YzMLY4t2/8XWlsMkjDg9MtNAF8NOHab9L8Y 7 | fPhvll6042Vsc9ejYf/CPr84cUCRy++XNBpCvxhPT+7IfMkFroBA9CpSWOMlJIAe 8 | ecLh8BLBoUtQaNQutCaHAnOS26QWTZ5caK6ijemuCgNCpG+Se71LiyRPGfoq7n1U 9 | aMx9QxMTFI7LfOdKrJ7cdJUkEQIDAQABoDowOAYJKoZIhvcNAQkOMSswKTAnBgNV 10 | HREEIDAeggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUub3JnMA0GCSqGSIb3DQEB 11 | CwUAA4IBAQAohzpolBHDmzHgG/AaOTbQhqYKdkh0tm5fLrC/Ve/2KBZU1pcLuTPk 12 | FIuSyQNrebeDIO8VHDLfRJrnjqIU7+fBWDgdxgkLezPqlX5WUFJiXvxuSRrD52Lk 13 | SPJVuHCs2BEimlRAxp937N/sWPdWD/A+wyzKVM+bD20krhpZoMMMXE6LQiKOnRan 14 | JToRgCAmL7fionmgzKwD2+k2nN3EFt+e6FaYKJqB3fkBX78FG1ijOftGlRD+D0hP 15 | r/Rc2b31nHNEhLHKvcYFwYTvE8EVIzYNJrYV+N/c6t3aOirWfL9xSW3VA9a/AqFB 16 | rYiGX+f4EmugECbO7KmSDxH6YDQeMg+N 17 | -----END CERTIFICATE REQUEST----- 18 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpAIBAAKCAQEAzZTqR6mTns59UEh3VvuSSAoXkIi8pau/g73EKH4ucylbGakD 3 | gjI3rqp3Yb1oKJcr13RsI2+57xMLc/kseVyt+sj7IeYCBGNgB7bj2J+QVxDQc+o8 4 | x7cHf6XnSQJGSfZZ3THYKWnP+0YtdUnYBH+JixMA4YzMLY4t2/8XWlsMkjDg9MtN 5 | AF8NOHab9L8YfPhvll6042Vsc9ejYf/CPr84cUCRy++XNBpCvxhPT+7IfMkFroBA 6 | 9CpSWOMlJIAeecLh8BLBoUtQaNQutCaHAnOS26QWTZ5caK6ijemuCgNCpG+Se71L 7 | iyRPGfoq7n1UaMx9QxMTFI7LfOdKrJ7cdJUkEQIDAQABAoIBAG648olQMrqIWgPA 8 | U84cRkfYb6KfkoLkAozQyvJIK3pI3tDuL36Sz1yaYRvaKFwcNzeec5OOXCUAK931 9 | aNega++zCVbTi2iToSfmf8avAc1yt+KGWN/zmu1MDEpNGFBDh1jTvKlpXOPngxo1 10 | gEvD6O9nd8UC0QEEH3zqYch/W0DsbK1GL+P8D32UzGrZFNiZPa3MzVLz+JDXgSak 11 | +Vpy6M5wJ9jNIQGtylM7COpXDay3TU0dvKsXKh41R45fbw9GXHHbG+bUHFdIPCAI 12 | xElcw/v3igHEm7m3kRo8+KLkcRwCFMCWsspYB0mKmk5CSu5Z8EeGszawTPvPCkwE 13 | l3roOaUCgYEA9mUsNBGJ9GBKgoC+0Y1vbBKXf0r2Zc5evQg2O7kJVlK+iMirB4X5 14 | Rcve4wbXJHVSG1BC1yD6OGVxd781eTPYAJ/7JcajOmGK739ycwpAg/0ObC11GmCC 15 | 9h5jFB6DyPHyBLkjbAe68Qdceixb2G7t1Fw0sKpN6baucNXxB03CQp8CgYEA1Zh0 16 | 0A5f+8y/Qe7o3+1lIAocJTLizDZf0AhwhvCdLbgfDDgCins7CFvG92el+6EiwStl 17 | 3QztnS3pqxs4K3NwnY5hyqY2QMfQLmKDNOr0n7zXUQm5VloLb7tLbVLkeEPif8d+ 18 | T1fREeDoD6lUT2OHVQrSF8ntiSqxsgGQt20iS08CgYAkEeeIr9CcP8RommRU1Pms 19 | voQDFHxBpxZjYeJel9XwFyjhaU7wSQKW3yN/5K52Hd6pNPSz/ZXjz6Xuu8UeOyKx 20 | 5LmlbBDVKRZnvSaKBMQxDkigOX9dHyfM0+H5PgZY0mJ1ooy28eLCCivVjszbQFH8 21 | torYGfZR8nZS+l7QjOeVTQKBgQC59GC+QcWOklJwNG7JhQPlQOf/+q043J3Nn4tX 22 | 72LnysQ8/wY3SdG7FSvDeJko2MBJLF8ic37quG4WaTwdmAMTKEI7CzlwbITx3RId 23 | n/AYoW9TPgP9CaerPoQMSX5etbsbQ7LToMCDsCpYeDLOavgHMcR2sXX7VRAeyP4U 24 | sw6IQQKBgQCBbg28NlJisJZ5InviYD6riuBZEf6h9SAFijdOSnb25vmdAOVKvG8d 25 | TFcxGNJnRcW2evdbr8Wg4AfhZQe+IqtDQIK/tITd1aUMhIX9Ij0xTOSABQPfFNvX 26 | UhERyqD/y4R3CCrwRIZ7mO995A8SXjPQowyid/GXlehtdjSPqeQEKQ== 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Generating an SSL key..." 6 | openssl genrsa -out example.key 2048 7 | 8 | echo "Generating a CSR..." 9 | openssl req -new -key example.key -out example.csr -config example.cnf 10 | 11 | echo "Generating a Certificate (enter 13243546 when asked for a password)..." 
12 | openssl x509 -req \ 13 | -in example.csr \ 14 | -CA ../ca.crt \ 15 | -CAkey ../ca.key \ 16 | -set_serial 123456789 \ 17 | -out example.crt \ 18 | -days 1 \ 19 | -sha256 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/invalid.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | This is not a valid certificate! 3 | -----END CERTIFICATE----- 4 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.cnf: -------------------------------------------------------------------------------- 1 | FQDN = example.org 2 | ORGNAME = Elastic 3 | ALTNAMES = DNS:$FQDN, DNS:www.$FQDN 4 | 5 | [ req ] 6 | default_bits = 2048 7 | default_md = sha256 8 | prompt = no 9 | encrypt_key = no 10 | distinguished_name = dn 11 | req_extensions = req_ext 12 | 13 | [ dn ] 14 | C = CH 15 | O = $ORGNAME 16 | CN = $FQDN 17 | 18 | [ req_ext ] 19 | subjectAltName = $ALTNAMES 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDTjCCAjYCBAdbzRUwDQYJKoZIhvcNAQELBQAwgaExCzAJBgNVBAYTAlVTMQsw 3 | CQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4GA1UECgwHRWxh 4 | c3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNVBAMMCWN1c3Rv 5 | bS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hAZWxhc3RpYy5j 6 | bzAeFw0yMTA2MTAxNjMwNTBaFw00ODEwMjUxNjMwNTBaMDUxCzAJBgNVBAYTAkNI 7 | MRAwDgYDVQQKDAdFbGFzdGljMRQwEgYDVQQDDAtleGFtcGxlLm9yZzCCASIwDQYJ 8 | KoZIhvcNAQEBBQADggEPADCCAQoCggEBAOLX3PWCHRPmg4Zp70lkEuJqpzE/8Oa3 9 | 2G9+YCD02+dMJxqyCDsN3gS38OC8nPaHXrEKHVngUDHFrpWeJ70IZTK5yV/TlYM2 10 | Xtjgmq0Mwe7X96SF6lxBax6/zlbAFU0xJNG1KKxx8mUV35eIIkmN1/64HvvSDae5 11 | fTM0NVmv7TYcv8XNXTEDtQR+fkQhN5fZqFWd7/WNiW6nvhi3L/2X4jiS6BEWNQL/ 12 | tprVyqxQAwIYUxsgFx8WyWvKEJyoylbS/vqfaJaayNkUid1655zrGKpZLKWY66U/ 13 | 9DoUHeuJQ/SDKM5Aa1QmYkGojyUjpbRJ2jDqOLxBAUCrs+f1yDhCArkCAwEAATAN 14 | BgkqhkiG9w0BAQsFAAOCAQEAjMHM0yvEjR478ZyewC4TDNtcv2Eky9zZYz4H/NIJ 15 | vCNzz/PMXoWKZJzGNd+R4OBOghriO6mhXl7qYb1Ci12XASTmxi1fR7/HVhtBuNIX 16 | QYMWitFDGtOAiGvoNwmc1Uh24SrH7E30HW3fsiXk9UF/8uxn7kNBApJ7rg3PcsRs 17 | bNqHVULVU2I6q0NX/Y1igi1PdfKBwYKJAO/LDiXV0iafZYUfBBFb7qBgikl6g62X 18 | ulVVvCGZpZSg6YmwrLGVCT++ESnw5ejNs+3OWIYnE8tGVRwMOiHUiEVu9PtOp3Ag 19 | YOXa0egoUU7pbUzP9J438OXuiqfx+riGOqVVERv/EXm67w== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIICtDCCAZwCAQAwNTELMAkGA1UEBhMCQ0gxEDAOBgNVBAoMB0VsYXN0aWMxFDAS 3 | BgNVBAMMC2V4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC 4 | AQEA4tfc9YIdE+aDhmnvSWQS4mqnMT/w5rfYb35gIPTb50wnGrIIOw3eBLfw4Lyc 5 | 9odesQodWeBQMcWulZ4nvQhlMrnJX9OVgzZe2OCarQzB7tf3pIXqXEFrHr/OVsAV 6 | TTEk0bUorHHyZRXfl4giSY3X/rge+9INp7l9MzQ1Wa/tNhy/xc1dMQO1BH5+RCE3 7 | l9moVZ3v9Y2Jbqe+GLcv/ZfiOJLoERY1Av+2mtXKrFADAhhTGyAXHxbJa8oQnKjK 8 | VtL++p9olprI2RSJ3XrnnOsYqlkspZjrpT/0OhQd64lD9IMozkBrVCZiQaiPJSOl 9 | tEnaMOo4vEEBQKuz5/XIOEICuQIDAQABoDowOAYJKoZIhvcNAQkOMSswKTAnBgNV 10 | HREEIDAeggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUub3JnMA0GCSqGSIb3DQEB 11 | CwUAA4IBAQAqA+uQgUZ5TQzDylCJjKTh3zgFHuSwOhlpPy930XUfccE+AmjF3VKD 12 | 
y4bCDc21IhQzYv1TD/2TXkTDoL4aIENP0b0AxgRFEV5reDhh0/RgcojgwdasNrG4 13 | Wymcqzdai+ZRaUCDvx9Llgus5qyajeeQ2z6SahCKerwOHo7WPO+s0q/yNVYCpQC7 14 | bq3vTimKbjDtX8HrYyLE6DDcvqevtwJCGhiH/YKyfHA75mYp5MJiCrHSG/grdt89 15 | qK8TUKvDm5xfvfEIY65nzKhMHz4RKG8WVyEMURnqpsQEJP+bYjbqDJZCb/sckTUj 16 | K2mLd3Ik/YUnvuWHetuBdw6fQIHR/j2y 17 | -----END CERTIFICATE REQUEST----- 18 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEA4tfc9YIdE+aDhmnvSWQS4mqnMT/w5rfYb35gIPTb50wnGrII 3 | Ow3eBLfw4Lyc9odesQodWeBQMcWulZ4nvQhlMrnJX9OVgzZe2OCarQzB7tf3pIXq 4 | XEFrHr/OVsAVTTEk0bUorHHyZRXfl4giSY3X/rge+9INp7l9MzQ1Wa/tNhy/xc1d 5 | MQO1BH5+RCE3l9moVZ3v9Y2Jbqe+GLcv/ZfiOJLoERY1Av+2mtXKrFADAhhTGyAX 6 | HxbJa8oQnKjKVtL++p9olprI2RSJ3XrnnOsYqlkspZjrpT/0OhQd64lD9IMozkBr 7 | VCZiQaiPJSOltEnaMOo4vEEBQKuz5/XIOEICuQIDAQABAoIBABTjkepdv/W8LXJs 8 | QOe+Omr1LU5AuBtW5Kxns8x1H+btwVAZAt8FSOOgWKMpWz7selDNQKStHlVnAcuv 9 | U7N5mXARYbTcVBFQKW3JSRUUbqti4eAZoNo7//RF72dXqt5/3wccqpEusQaT/BIF 10 | LDsfv7sqE1hXIIDIePoFHcCTjcGEudIFEHDTZQ6Ip/zKdl6oeOTIhqTejdX3KWwj 11 | ERDn7L9QjGW2lgpNJzfzYqKHL8lrpsGYgPf4HN9LmF25tLOus/ZIDJ/1RvWDjtYp 12 | WNcmnDFqV24mXOizGMv0i4KRrLq1GY1dmb1CiHERM1rejSIfN/aIWNU4oF2nTvCd 13 | kodd6u0CgYEA96zj0fR/Zrh0f0Bbmzm2qMecdKAUCo+0gXElSzJdZkpyMTsZ265G 14 | nFXeNBetcPzi4Lh+AbkdAUcpu+z231QzKbybf1Q+zVrKLNMphBq6xLscIOVttAYE 15 | vrHPIgAPBZ0KEejHA6xUqRyNdkoP+x6f4wsE/tFyLFINQXSgT7/0orsCgYEA6ne5 16 | YOgcC1zOi3IgrZW2bI+vTDB6BGJzKCdF/0O96d7jAab8Zt4Re3KKzOxdlViAZOKv 17 | jIir0m8CgNTELry+Iqhx0sbWtGhWewALcHe8Cs1+FMPeOVDs1vpHILFFP89mPoni 18 | eQ2n/tBzxxxoCb6rG21Q9J9d+MO0WkZZBxX/exsCgYAR83f3qa7qNQhMiM+a6o3w 19 | obcXRNrvAQdmMlsvnhDi7xZjtxLitzjq79ZRFD4/6DRRcU3AtjgB9bRyqHQkL6gd 20 | qEvk6Kg8ng31PcDOkFllFOKvB7Hx0FXbtGt83WA5We526dYyz/S65RTjs+6AlvGj 21 | tRLBnVCXIcNQMTHFVfZXLQKBgQCvedrrg8s5VcPe6RM71ogox4BSbRVkoqm8q2ff 22 | mztPBNiwK+FKu3gqA5eNtnhzhUDSQDVR4Bd37kzZTmNk9yz1k0tcjCOz8UKH24i0 23 | K2g4TYLG17BBBSe73KPO+9zv3LCQrXEpV+ca9bcwlTnn0SMN68piycLyosUfqvaG 24 | Lxh9cQKBgFSEMrHyrf0I9ZvyWB7FTqKgq9O99S2YQg79S0MrD/IRwv8eMR5ISjG8 25 | JUC8S2yLSzXq5FSjZN+T5nZoL/QLywCJoRZz8W2UobZV01afu+LC6uQ8jNrKhlYN 26 | iQVlB9BkEO3nm/xGd+Ay8C1nHcGvNVjVdsL1Rqa9KjxGxNHOfuHb 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Generating an SSL key..." 6 | openssl genrsa -out example.key 2048 7 | 8 | echo "Generating a CSR..." 9 | openssl req -new -key example.key -out example.csr -config example.cnf 10 | 11 | echo "Generating a Certificate (enter 13243546 when asked for a password)..." 12 | openssl x509 -req \ 13 | -in example.csr \ 14 | -CA ../ca.crt \ 15 | -CAkey ../ca.key \ 16 | -set_serial 123456789 \ 17 | -out example.crt \ 18 | -days 9999 \ 19 | -sha256 20 | -------------------------------------------------------------------------------- /spec/integration/charset_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Content charset' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/utf8-without-charset' 15 | link_to '/utf8-with-charset' 16 | end 17 | end 18 | 19 | page '/utf8-with-charset' do 20 | headers 'Content-Type' => 'text/html; charset=UTF-8' 21 | body do 22 | text { "ma\u00F1ana ol\u00E9" } 23 | end 24 | end 25 | 26 | page '/utf8-without-charset' do 27 | headers 'Content-Type' => 'text/html' 28 | body do 29 | text { "ma\u00F1ana ol\u00E9" } 30 | end 31 | end 32 | end 33 | end 34 | 35 | it 'defaults to UTF-8' do 36 | results = FauxCrawl.run(site) 37 | 38 | expect(results).to have_only_these_results [ 39 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 40 | mock_response(url: 'http://127.0.0.1:9393/utf8-with-charset', status_code: 200, 41 | content: "ma\u00F1ana ol\u00E9"), 42 | mock_response(url: 'http://127.0.0.1:9393/utf8-without-charset', status_code: 200, 43 | content: "ma\u00F1ana ol\u00E9") 44 | ] 45 | end 46 | 47 | it 'can override fallback encoding' do 48 | results = FauxCrawl.run(site, default_encoding: 'ISO-8859-1') 49 | 50 | expect(results).to have_only_these_results [ 51 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 52 | mock_response(url: 'http://127.0.0.1:9393/utf8-with-charset', status_code: 200, 53 | content: "ma\u00F1ana ol\u00E9"), 54 | mock_response(url: 'http://127.0.0.1:9393/utf8-without-charset', status_code: 200, 55 | content: String.new("ma\xC3\xB1ana ol\xC3\xA9", encoding: 'ISO-8859-1')) 56 | ] 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/integration/content_extraction_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Content extractable file support' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/html' 15 | link_to '/pdf' 16 | link_to '/powerpoint' 17 | link_to '/word' 18 | end 19 | end 20 | 21 | page '/html' do 22 | headers 'Content-Type' => 'text/html; charset=UTF-8' 23 | end 24 | 25 | page '/pdf' do 26 | headers 'Content-Type' => 'application/pdf' 27 | end 28 | 29 | page '/powerpoint' do 30 | headers 'Content-Type' => 'application/vnd.ms-powerpoint' 31 | end 32 | 33 | page '/word' do 34 | headers 'Content-Type' => 'application/msword' 35 | end 36 | end 37 | end 38 | 39 | it 'supports single and multiple Content-Type headers' do 40 | results = FauxCrawl.run( 41 | site, 42 | content_extraction: { 43 | enabled: true, 44 | mime_types: [ 45 | 'application/pdf', 46 | 'application/vnd.ms-powerpoint' 47 | ] 48 | } 49 | ) 50 | 51 | expect(results).to have_only_these_results [ 52 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 53 | mock_response(url: 'http://127.0.0.1:9393/html', status_code: 200), 54 | mock_response(url: 'http://127.0.0.1:9393/pdf', status_code: 200), 55 | mock_response(url: 'http://127.0.0.1:9393/powerpoint', status_code: 200) 56 | ] 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/integration/nofollow_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Robots meta support' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/noindex' 15 | link_to '/nofollow' 16 | 17 | # This link will not be followed 18 | link_to '/unreachable', rel: :nofollow 19 | end 20 | end 21 | 22 | # Should not be indexed, but the links should be followed 23 | page '/noindex' do 24 | head { robots 'noindex' } 25 | body { link_to '/foo' } 26 | end 27 | 28 | # Should be indexed, but the links should not be followed 29 | page '/nofollow' do 30 | head { robots 'nofollow' } 31 | body { link_to '/unreachable' } 32 | end 33 | 34 | # Only reachable via /noindex 35 | page '/foo' 36 | 37 | # Only reachable via nofollow links and pages, so the crawler won't ever find this 38 | page '/unreachable' 39 | end 40 | end 41 | 42 | it 'crawls all pages given the constraints specified by robots meta tags' do 43 | expect(results).to have_only_these_results [ 44 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 45 | mock_response(url: 'http://127.0.0.1:9393/nofollow', status_code: 200), 46 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200) 47 | ] 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/integration/response_content_type_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Response Content-Type support' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/html' 15 | link_to '/pdf' 16 | link_to '/pdf-multi-header' 17 | end 18 | end 19 | 20 | page '/html' do 21 | headers 'Content-Type' => 'text/html; charset=UTF-8' 22 | end 23 | 24 | page '/pdf' do 25 | headers 'Content-Type' => 'application/pdf' 26 | end 27 | 28 | page '/pdf-multi-header' do 29 | headers 'Content-Type' => ['application/pdf', 'text/html; charset=UTF-8'] 30 | end 31 | end 32 | end 33 | 34 | it 'supports single and multiple Content-Type headers' do 35 | expect(results).to have_only_these_results [ 36 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 37 | mock_response(url: 'http://127.0.0.1:9393/html', status_code: 200) 38 | ] 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/integration/response_limits_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Generate a large enough random string that would require multiple TCP-packets to download 10 | require 'securerandom' 11 | MULTI_CHUNK_BODY = SecureRandom.alphanumeric(12_345) 12 | 13 | RSpec.describe 'Per-request resource limits support' do 14 | let(:results) do 15 | FauxCrawl.crawl_site do 16 | page '/' do 17 | body do 18 | link_to '/multi-chunk' 19 | link_to '/too-big' 20 | end 21 | end 22 | 23 | # Should be indexed, downloads will produce multiple chunks 24 | page '/multi-chunk' do 25 | def response_body 26 | [MULTI_CHUNK_BODY] 27 | end 28 | end 29 | 30 | # Should not be indexed because it is too big 31 | page '/too-big' do 32 | def response_body 33 | ['x' * 11_000_000] 34 | end 35 | end 36 | end 37 | end 38 | 39 | it 'crawls all pages given the constraints specified by resource limits' do 40 | expect(results).to have_only_these_results [ 41 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 42 | mock_response(url: 'http://127.0.0.1:9393/multi-chunk', status_code: 200) 43 | ] 44 | end 45 | 46 | it 'should correctly download multi-chunk responses' do 47 | multi_chunk_response = results.find { |r| r.url.to_s =~ /multi-chunk$/ } 48 | expect(multi_chunk_response.content).to eq(MULTI_CHUNK_BODY) 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/integration/seed_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Seed URLs' do 10 | let(:site) do 11 | Faux.site do 12 | page '/foo' 13 | page '/baz' 14 | end 15 | end 16 | 17 | it 'crawls all of the seed urls specified by the config' do 18 | results = FauxCrawl.run(site, seed_urls: %w[http://127.0.0.1:9393/foo http://127.0.0.1:9393/baz]) 19 | 20 | expect(results).to have_only_these_results [ 21 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 22 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 23 | ] 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/integration/sitemap_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Sitemaps Support' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/foo' 15 | end 16 | end 17 | 18 | # Could be discovered via the home page or the sitemap 19 | page '/foo' 20 | 21 | # Not linked directly, but discoverable via the sitemap 22 | page '/bar' do 23 | body do 24 | link_to '/baz' 25 | end 26 | end 27 | 28 | # Not linked directly, but discoverable via '/bar' 29 | page '/baz' 30 | 31 | sitemap '/sitemap.xml' do 32 | link_to '/' 33 | link_to '/foo' 34 | link_to '/bar' 35 | end 36 | end 37 | end 38 | 39 | it 'makes it possible to use sitemap seed URLs for discovering links on a site' do 40 | results = FauxCrawl.run( 41 | site, 42 | seed_urls: ['http://127.0.0.1:9393/'], 43 | sitemap_urls: ['http://127.0.0.1:9393/sitemap.xml'] 44 | ) 45 | 46 | expect(results).to have_only_these_results [ 47 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 48 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 49 | mock_response(url: 'http://127.0.0.1:9393/bar', status_code: 200), 50 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 51 | ] 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /spec/integration/timeouts/socket_timeout_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Request to a site that is very slow to send us any data' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/timeout' 15 | end 16 | end 17 | 18 | page '/timeout' do 19 | def response_body 20 | sleep 5 21 | 22 | ['Output'] 23 | end 24 | end 25 | end 26 | end 27 | 28 | it 'times out' do 29 | results = FauxCrawl.run(site, timeouts: { socket_timeout: 2 }) 30 | 31 | expect(results).to have_only_these_results [ 32 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200) 33 | ] 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/integration/url_fragments_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'URL normalization in the presence of URL fragments' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/foo' 15 | link_to '/foo#bar' 16 | link_to '/baz#hello' 17 | end 18 | end 19 | 20 | page '/foo' 21 | page '/baz' 22 | end 23 | end 24 | 25 | it 'crawls discovered URLs while stripping out the fragments' do 26 | expect(results).to have_only_these_results [ 27 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 28 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 29 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 30 | ] 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/lib/crawler/cli/version_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::CLI::Version) do 10 | describe '.call' do 11 | let(:version_path) { File.expand_path('../../../../product_version', __dir__) } 12 | 13 | it 'prints the current version from product_version_file' do 14 | expect(File).to receive(:read).with(version_path).and_return('1.0.0') 15 | expect { described_class.new.call }.to output("1.0.0\n").to_stdout 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/lib/crawler/content_engine/utils_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::ContentEngine::Utils) do 10 | describe '.node_descendant_text' do 11 | it 'should raise an error unless given a node object' do 12 | expect do 13 | Crawler::ContentEngine::Utils.node_descendant_text('something') 14 | end.to raise_error(ArgumentError, /node-like/) 15 | end 16 | 17 | it 'should replace break tags with spaces' do 18 | node = Nokogiri::HTML('Hello,<br>World!') 19 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node)).to eq('Hello, World!') 20 | end 21 | 22 | context 'with uncrate.com pages' do 23 | let(:content) { read_fixture('uncrate.com.html') } 24 | let(:html) { Nokogiri::HTML(content) } 25 | 26 | it 'should have a reasonable performance' do 27 | duration = Benchmark.measure do 28 | Crawler::ContentEngine::Utils.node_descendant_text(html) 29 | end 30 | 31 | # It usually takes ~250 msec, used to take 180 sec before we fixed it, so let's aim for something reasonable 32 | expect(duration.real).to be < 5 33 | end 34 | end 35 | 36 | context 'with ignore_tags' do 37 | it 'ignores <script> tags' do 38 | node = Nokogiri::HTML('<script>Script body</script><p>P body</p>') 39 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node)).to eq('P body') 40 | end 41 | end 42 | 43 | context 'without ignore_tags' do 44 | it 'does not ignore <script> tags' do 45 | node = Nokogiri::HTML('<script>Script body</script><p>P body</p>
') 46 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node, [])).to eq('Script body P body') 47 | end 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/crawl_task_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe Crawler::Data::CrawlTask do 10 | let(:url) { Crawler::Data::URL.parse('https://example.com/') } 11 | let(:task) { Crawler::Data::CrawlTask.new(url:, type: :content, depth: 1) } 12 | 13 | describe '#inspect' do 14 | it 'should return a nice representation of the object for logging' do 15 | expect(task.inspect).to be_a(String) 16 | expect(task.inspect).to match(/CrawlTask/) 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/domain_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::Domain) do 10 | def domain(url) 11 | Crawler::Data::Domain.new(url) 12 | end 13 | 14 | it 'should include the standard port in the normalized version' do 15 | expect(domain('http://google.com').to_s).to eq('http://google.com:80') 16 | expect(domain('https://google.com').to_s).to eq('https://google.com:443') 17 | end 18 | 19 | it 'should include the custom port in the normalized version' do 20 | expect(domain('https://google.com:123').to_s).to eq('https://google.com:123') 21 | end 22 | 23 | it 'should strip out the path' do 24 | expect(domain('https://google.com/something').to_s).to eq('https://google.com:443') 25 | end 26 | 27 | it 'should strip out the URL fragment' do 28 | expect(domain('https://google.com/something#foo').to_s).to eq('https://google.com:443') 29 | end 30 | 31 | context 'when compared to other objects' do 32 | it 'should use the normalized version for comparison' do 33 | expect(domain('https://google.com/something#foo') == 'https://google.com:443').to be(true) 34 | end 35 | end 36 | 37 | describe '#robots_txt_url' do 38 | it 'should return URL with /robots.txt as the path' do 39 | expect(domain('https://google.com').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 40 | expect(domain('https://google.com/something#foo').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 41 | expect(domain('https://google.com/something?q=v').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 42 | expect(domain('https://google.com:123').robots_txt_url.to_s).to eq('https://google.com:123/robots.txt') 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/rule_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::Rule) do 10 | describe '#url_match?' do 11 | it 'allows rule' do 12 | rule = Crawler::Data::Rule.new(:allow, url_pattern: %r{\Ahttp://example.com/test[0-9]}) 13 | 14 | expect(rule.policy).to eq(:allow) 15 | expect(rule.url_match?(Crawler::Data::URL.parse('http://example.com/test1'))).to eq(true) 16 | expect(rule.url_match?(Crawler::Data::URL.parse('http://example.com/testx'))).to eq(false) 17 | end 18 | 19 | it 'denies rule' do 20 | rule = Crawler::Data::Rule.new(:deny, url_pattern: %r{\Ahttp://test[0-9].example.com}) 21 | 22 | expect(rule.policy).to eq(:deny) 23 | expect(rule.url_match?(Crawler::Data::URL.parse('http://test1.example.com'))).to eq(true) 24 | expect(rule.url_match?(Crawler::Data::URL.parse('http://testx.example.com'))).to eq(false) 25 | end 26 | 27 | it 'should time out on really complex matching rules' do 28 | regex = /((((((a*)*)*)*)*)*)*((((((a*)*)*)*)*)*)*((((((a*)*)*)*)*)*)*$/ 29 | rule = Crawler::Data::Rule.new(:deny, url_pattern: regex) 30 | url = Crawler::Data::URL.parse('http://test1.example.com//aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab') 31 | 32 | expect { rule.url_match?(url) }.to raise_error(Timeout::Error) 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/url_queue_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::UrlQueue) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | let(:config) do 13 | Crawler::API::Config.new( 14 | domains: 15 | ) 16 | end 17 | 18 | describe '.create' do 19 | it 'should return a queue object' do 20 | queue = Crawler::Data::UrlQueue.create(config) 21 | expect(queue).to be_kind_of(Crawler::Data::UrlQueue::Base) 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /spec/lib/crawler/http_utils/config_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::HttpUtils::Config) do 10 | describe 'constructor' do 11 | let(:valid_config) do 12 | { 13 | loopback_allowed: false, 14 | private_networks_allowed: false, 15 | logger: Logger.new($stdout) 16 | } 17 | end 18 | 19 | described_class::REQUIRED_OPTIONS.each do |opt| 20 | it "requires #{opt} option" do 21 | expect do 22 | described_class.new(valid_config.except(opt)) 23 | end.to raise_error(ArgumentError, "#{opt} is a required option") 24 | end 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/lib/crawler/http_utils/response_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::HttpUtils::Response) do 10 | let(:url) { Crawler::Data::URL.parse('http://example.org/') } 11 | let(:response) do 12 | Crawler::HttpUtils::Response.new( 13 | apache_response:, 14 | url:, 15 | request_start_time: 1.second.ago, 16 | request_end_time: Time.now 17 | ) 18 | end 19 | 20 | #------------------------------------------------------------------------------------------------- 21 | describe '#check_content_encoding' do 22 | let(:response_entity) { double(:response_entity, content_encoding: encoding) } 23 | let(:apache_response) { double(:apache_response, entity: response_entity) } 24 | 25 | def check_content_encoding 26 | response.send(:check_content_encoding) 27 | end 28 | 29 | context 'when given a supported content encoding' do 30 | let(:encoding) { 'gzip' } 31 | it 'should succeed' do 32 | expect { check_content_encoding }.to_not raise_error 33 | end 34 | end 35 | 36 | context 'when given a list of supported content encodings' do 37 | let(:encoding) { 'gzip,deflate' } 38 | it 'should succeed' do 39 | expect { check_content_encoding }.to_not raise_error 40 | end 41 | end 42 | 43 | context 'when given an unsupported content encoding' do 44 | let(:encoding) { 'banana' } 45 | it 'should fail' do 46 | expect { check_content_encoding }.to raise_error(Crawler::HttpUtils::InvalidEncoding) 47 | end 48 | end 49 | 50 | context 'when given a list with an unsupported content encoding' do 51 | let(:encoding) { 'gzip,banana' } 52 | it 'should fail' do 53 | expect { check_content_encoding }.to raise_error(Crawler::HttpUtils::InvalidEncoding) 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /spec/lib/crawler/output_sink/file_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::OutputSink::File) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | context '#initialize' do 13 | def new_sink(config) 14 | Crawler::OutputSink::File.new(config) 15 | end 16 | 17 | it 'has a default output directory of ./crawled_docs' do 18 | config = Crawler::API::Config.new( 19 | domains:, 20 | output_sink: './crawled_docs' 21 | ) 22 | 23 | expect { new_sink(config) }.to_not raise_error 24 | expect(config.output_dir).to eq('./crawled_docs') 25 | end 26 | 27 | it 'should create the output directory' do 28 | dir = '/some/directory' 29 | config = Crawler::API::Config.new( 30 | domains:, 31 | output_sink: 'file', 32 | output_dir: dir 33 | ) 34 | expect(FileUtils).to receive(:mkdir_p).with(dir) 35 | new_sink(config) 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/lib/crawler/output_sink_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::OutputSink) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | let(:es_client) { double } 13 | let(:es_client_indices) { double(:es_client_indices, exists: double) } 14 | let(:build_info) { { version: { number: '8.99.0', build_flavor: 'default' } }.deep_stringify_keys } 15 | 16 | before(:each) do 17 | allow(ES::Client).to receive(:new).and_return(es_client) 18 | allow(es_client).to receive(:indices).and_return(es_client_indices) 19 | allow(es_client).to receive(:info).and_return(build_info) 20 | end 21 | 22 | context '.create' do 23 | it 'should validate the sync name' do 24 | config = Crawler::API::Config.new( 25 | domains:, 26 | output_sink: 'magnetic-tape' 27 | ) 28 | 29 | expect do 30 | Crawler::OutputSink.create(config) 31 | end.to raise_error(/Unknown output sink/) 32 | end 33 | 34 | it 'should return a new sink object of a correct type' do 35 | config = Crawler::API::Config.new( 36 | domains:, 37 | output_sink: :elasticsearch, 38 | output_index: 'some-index-name', 39 | elasticsearch: { 40 | host: 'http://localhost', 41 | port: 1234, 42 | api_key: 'key' 43 | } 44 | ) 45 | 46 | sink = Crawler::OutputSink.create(config) 47 | expect(sink).to be_kind_of(Crawler::OutputSink::Elasticsearch) 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/crawl_rules_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Mock class definitions 10 | module Crawler 11 | module RuleEngine 12 | class Elasticsearch < Crawler::RuleEngine::Base 13 | def crawl_rules_outcome(url) end 14 | end 15 | end 16 | end 17 | 18 | RSpec.describe(Crawler::UrlValidator) do 19 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 20 | let(:domain_allowlist) { ['example.com'] } 21 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 22 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 23 | let(:rule_engine) { double('Crawler::RuleEngine::Elasticsearch') } 24 | let(:outcome) { double('Outcome', allowed?: allowed, details: { rule: }) } 25 | let(:rule) { double('Rule', source: 'some_rule_source') } 26 | 27 | describe '#validate_crawl_rules' do 28 | before do 29 | allow(Crawler::RuleEngine::Elasticsearch).to receive(:new).with(crawl_config).and_return(rule_engine) 30 | allow(rule_engine).to receive(:crawl_rules_outcome).with(validator.normalized_url).and_return(outcome) 31 | allow(validator).to receive(:validation_ok) 32 | allow(validator).to receive(:validation_fail) 33 | end 34 | 35 | context 'when the URL is allowed by a crawl rule' do 36 | let(:allowed) { true } 37 | 38 | it 'calls validation_ok' do 39 | validator.validate_crawl_rules 40 | expect(validator) 41 | .to have_received(:validation_ok) 42 | end 43 | end 44 | 45 | context 'when the URL is denied by a crawl rule' do 46 | let(:allowed) { false } 47 | 48 | it 'calls validation_fail' do 49 | validator.validate_crawl_rules 50 | expect(validator) 51 | .to have_received(:validation_fail) 52 | end 53 | end 54 | 55 | context 'when the URL is denied because it did not match any rules' do 56 | let(:allowed) { false } 57 | let(:rule) { nil } 58 | 59 | it 'calls validation_fail' do 60 | validator.validate_crawl_rules 61 | expect(validator) 62 | .to have_received(:validation_fail) 63 | end 64 | end 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/domain_access_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::UrlValidator) do 10 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 11 | let(:domain_allowlist) { ['example.com'] } 12 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 13 | let(:url) { instance_double('Crawler::Data::URL', domain: domain_allowlist[0], domain_name: domain_allowlist[0]) } 14 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 15 | 16 | describe '#validate_domain_access' do 17 | before do 18 | validator.singleton_class.include(Crawler::UrlValidator::DomainAccessCheckConcern) 19 | allow(validator).to receive(:crawler_api_config).and_return(crawl_config) 20 | allow(validator).to receive(:url).and_return(url) 21 | allow(validator).to receive(:validation_ok) 22 | allow(validator).to receive(:validation_fail) 23 | end 24 | 25 | context 'when the URL matches one of the configured domains' do 26 | it 'calls validation_ok with the correct parameters' do 27 | validator.validate_domain_access 28 | expect(validator) 29 | .to have_received(:validation_ok) 30 | .with(:domain_access, 'The URL matches one of the configured domains', domain: 'example.com') 31 | end 32 | end 33 | 34 | context 'when the URL does not match any configured domains' do 35 | let(:url) { instance_double('Crawler::Data::URL', domain: 'notexample.com', domain_name: 'notexample.com') } 36 | 37 | it 'calls validation_fail with the correct parameters' do 38 | validator.validate_domain_access 39 | expect(validator) 40 | .to have_received(:validation_fail) 41 | .with(:domain_access, 'The URL does not match any configured domains') 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/domain_uniqueness_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::UrlValidator) do 10 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 11 | let(:domain_allowlist) { ['example.com'] } 12 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 13 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 14 | let(:url) { instance_double('Crawler::Data::URL', domain: domain_allowlist[0], domain_name: domain_allowlist[0]) } 15 | 16 | describe '#validate_domain_uniqueness' do 17 | before do 18 | validator.singleton_class.include(Crawler::UrlValidator::DomainUniquenessCheckConcern) 19 | allow(validator).to receive(:crawler_api_config).and_return(crawl_config) 20 | allow(validator).to receive(:url).and_return(url) 21 | allow(validator).to receive(:validation_ok) 22 | allow(validator).to receive(:validation_fail) 23 | end 24 | 25 | context 'when the domain name already exists' do 26 | it 'calls validation_fail with the correct parameters' do 27 | validator.validate_domain_uniqueness 28 | expect(validator) 29 | .to have_received(:validation_fail) 30 | .with(:domain_uniqueness, 'Domain name already exists') 31 | end 32 | end 33 | 34 | context 'when the domain name is new' do 35 | let(:url) { instance_double('Crawler::Data::URL', domain: 'newexample.com', domain_name: 'newexample.com') } 36 | 37 | it 'calls validation_ok with the correct parameters' do 38 | validator.validate_domain_uniqueness 39 | expect(validator) 40 | .to have_received(:validation_ok) 41 | .with(:domain_uniqueness, 'Domain name is new', domain: 'newexample.com') 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler) do 10 | it 'should define a version' do 11 | expect(Crawler.version).to be_a(String) 12 | end 13 | 14 | context '.service_id' do 15 | it 'should be cached' do 16 | expect(Crawler.service_id).to be(Crawler.service_id) 17 | end 18 | 19 | it 'should be process-scoped (not thread-local)' do 20 | id1 = Crawler.service_id 21 | 22 | t = Thread.new { Thread.current[:service_id] = Crawler.service_id }.join 23 | id2 = t[:service_id] 24 | expect(id1).to be(id2) 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/lib/environment_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Crawler Environment' do 10 | it 'should have CRAWLER_ENV defined' do 11 | expect(defined?(CRAWLER_ENV)).to eq('constant') 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /spec/support/cli_helpers.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'stringio' 10 | 11 | module RSpec 12 | module Support 13 | module Helpers 14 | def capture_output 15 | output = StringIO.new 16 | original_stdout = $stdout 17 | $stdout = output 18 | yield 19 | output.string 20 | rescue SystemExit 21 | output.string 22 | ensure 23 | $stdout = original_stdout 24 | end 25 | 26 | def capture_error 27 | error = StringIO.new 28 | original_stderr = $stderr 29 | $stderr = error 30 | yield 31 | error.string 32 | rescue SystemExit 33 | error.string 34 | ensure 35 | $stderr = original_stderr 36 | end 37 | end 38 | end 39 | end 40 | 41 | RSpec.configure do |config| 42 | config.include(RSpec::Support::Helpers) 43 | end 44 | -------------------------------------------------------------------------------- /spec/support/faux/results_collection.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'concurrent' 10 | 11 | # A simple wrapper class for a collection of crawl results gathered by the mock crawler sink 12 | class ResultsCollection 13 | attr_accessor :crawl_config, :crawl, :collection 14 | 15 | delegate :outcome, :outcome_message, to: :crawl 16 | 17 | def initialize 18 | @collection = Concurrent::Array.new 19 | end 20 | 21 | # Do not allow the collection to be duplicated when passed through config validation, etc 22 | # This is needed so that we could pass a collection as a config parameter to a Crawler instance 23 | # in tests and get it propagated to the sink itself and back. 24 | def dup 25 | self 26 | end 27 | 28 | def method_missing(meth, *args, &block) 29 | @collection.send(meth, *args, &block) 30 | end 31 | 32 | def respond_to_missing?(method_name, include_private = false) 33 | @collection.respond_to?(method_name, include_private) || super 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/support/fixtures.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | FIXTURES_HOME = File.join(__dir__, '..', 'fixtures') 10 | 11 | def fixture_file(*file_path) 12 | File.join(FIXTURES_HOME, *file_path) 13 | end 14 | 15 | def read_fixture(*file_path) 16 | File.read(fixture_file(*file_path)) 17 | end 18 | 19 | def fixture_xml(*file_path) 20 | file_name = file_path.pop 21 | file_name = "#{file_name}.xml" 22 | read_fixture(*file_path, file_name) 23 | end 24 | 25 | def fixture_xml_gz(*file_path) 26 | file_name = file_path.pop 27 | file_name = "#{file_name}.xml.gz" 28 | read_fixture(*file_path, file_name) 29 | end 30 | -------------------------------------------------------------------------------- /spec/support/mock_response.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'ostruct' 10 | 11 | class MockResponse < OpenStruct # rubocop:disable Style/OpenStructUse 12 | def equal_for_specified_keys?(response) 13 | to_h.all? do |key, val| 14 | val.to_s == response.send(key).to_s 15 | end 16 | end 17 | end 18 | 19 | def mock_response(args) 20 | MockResponse.new(args) 21 | end 22 | -------------------------------------------------------------------------------- /vendor/faux/.gitignore: -------------------------------------------------------------------------------- 1 | Gemfile.lock 2 | 3 | *.gem 4 | .bundle 5 | pkg/* 6 | .DS_Store 7 | .rvmrc 8 | .ruby-version 9 | .ruby-gemset 10 | -------------------------------------------------------------------------------- /vendor/faux/Gemfile: -------------------------------------------------------------------------------- 1 | # A sample Gemfile 2 | source "https://rubygems.org" 3 | 4 | gemspec 5 | 6 | group 'test' do 7 | gem 'pry' 8 | gem 'rack-test' 9 | gem 'rspec' 10 | gem 'awesome_print' 11 | end 12 | -------------------------------------------------------------------------------- /vendor/faux/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2024 Elasticsearch B.V. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vendor/faux/README.md: -------------------------------------------------------------------------------- 1 | # Faux 2 | 3 | Faux is little Rack-based DSL for generating websites. Here's a simple example: 4 | 5 | ``` ruby 6 | class SimpleSite < Faux::Base 7 | page '/foo' do 8 | status 200 9 | link_to '/foobar' 10 | end 11 | 12 | page '/bar' do 13 | status 200 14 | link_to '/bang' 15 | link_to '/baz' 16 | end 17 | 18 | sitemap '/sitemap.xml' do 19 | link_to 'http://localhost:9393/foo' 20 | link_to '/bar' 21 | end 22 | 23 | # Adds a /robots.txt file with the specified rules. 24 | robots do 25 | user_agent '*' 26 | disallow '/foo' 27 | sitemap 'http://localhost:9393/sitemap.xml' 28 | end 29 | end 30 | ``` 31 | 32 | To boot the example site locally: 33 | ``` shell 34 | $ bundle exec rackup 35 | ``` 36 | 37 | The site will be running at `localhost:9393` 38 | 39 | ### Request Counter 40 | 41 | After booting an app, visit `/status` for a JSON report of which URLs have been visited and how many times they've been visited while the app has been running. It'll look like this: 42 | 43 | ``` json 44 | { 45 | "/bar": 7, 46 | "/foo": 5 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /vendor/faux/Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | 3 | begin 4 | require 'rspec/core/rake_task' 5 | RSpec::Core::RakeTask.new(:spec) 6 | rescue LoadError 7 | end 8 | 9 | task :default => :spec 10 | -------------------------------------------------------------------------------- /vendor/faux/faux.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | 5 | require 'faux/version' 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = "faux" 9 | spec.version = Faux::VERSION 10 | spec.authors = ["Elastic Enterprise Search Team"] 11 | spec.email = ["enterprise-search@elastic.co"] 12 | spec.description = "Artisan faux web pages, by Wes Andreson" 13 | spec.summary = "Faux is little Rack-based DSL for generating websites" 14 | spec.homepage = "https://swiftype.com" 15 | spec.license = "MIT" 16 | 17 | spec.files = Dir.glob("{lib,sites}/**/*", File::FNM_DOTMATCH).reject {|f| File.directory?(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | if spec.respond_to?(:metadata) 22 | spec.metadata['allowed_push_host'] = 'https://artifactory.elastic.dev/artifactory/api/gems/swiftype-gems' 23 | else 24 | raise "RubyGems 2.0 or newer is required to protect against public gem pushes." 25 | end 26 | 27 | spec.add_development_dependency 'rake' 28 | spec.add_development_dependency 'geminabox' 29 | 30 | spec.add_runtime_dependency 'activesupport' 31 | spec.add_runtime_dependency 'nokogiri' 32 | spec.add_runtime_dependency 'rack' 33 | spec.add_runtime_dependency 'rack-mount' 34 | end 35 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/atom_feed.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class AtomFeed < Base 10 | def call(env) 11 | @entries = [] 12 | super 13 | end 14 | 15 | def response_headers 16 | @headers.merge!({'Content-Type' => 'text/xml'}) 17 | super 18 | end 19 | 20 | def response_body 21 | builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') do |xml| 22 | xml.feed(:xmlns => "http://www.w3.org/2005/Atom") { 23 | xml.title 'Faux Feed' 24 | @entries.each do |tags| 25 | xml.entry { 26 | tags.each do |tag| 27 | if tag[:name] == 'content' # FIXME: Rewrite this as it makes me cry on the inside 28 | xml.send(tag[:name], {:type => 'html'}, tag[:text]) 29 | elsif tag[:text] # generated from method_missing 30 | xml.send(tag[:name], tag[:text]) 31 | else # generated from link_to 32 | xml.send(tag[:name], tag.reject{|k, _| k == :name}) 33 | end 34 | end 35 | } 36 | end 37 | } 38 | end 39 | 40 | builder.to_xml.split("\n") 41 | end 42 | 43 | def entry(&block) 44 | @tags = [] # Holds hashes with tags defined inside &block 45 | block.call 46 | @entries << @tags 47 | end 48 | 49 | def link_to(url, rel='self') 50 | @tags << {:name => :link, :href => absolute_url_for(url), :rel => rel} 51 | end 52 | 53 | def html_content(html) 54 | @tags << {:name => 'content', :text => html} 55 | end 56 | 57 | def method_missing(method, *args, &block) 58 | @tags << {:name => method, :text => args[0]} 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Base 10 | include Faux::Helpers::Url 11 | 12 | attr_reader :content_block, :env, :options 13 | 14 | def initialize(options, &content_block) 15 | @content_block = content_block 16 | @options = options 17 | @status = 200 18 | end 19 | 20 | def call(env) 21 | @env = env 22 | @headers = {} 23 | 24 | instance_exec(&content_block) if content_block 25 | [response_status, response_headers, response_body] 26 | end 27 | 28 | # Get methods (used in `call`) 29 | def response_status 30 | @status 31 | end 32 | 33 | def response_headers 34 | unless @headers.keys.find { |k| k.downcase == 'content-type' } 35 | @headers['Content-Type'] = 'text/html' 36 | end 37 | @headers 38 | end 39 | 40 | # Set methods (used by DSL) 41 | def status(code) 42 | @status = code.to_i 43 | end 44 | 45 | def headers(headers_hash) 46 | @headers.merge!(headers_hash || {}) 47 | end 48 | 49 | def response_body 50 | raise 'Must be defined in a subclass' 51 | end 52 | 53 | def redirect(location, options = {}) 54 | @status = options[:permanent] ? 301 : 302 55 | @headers['Location'] = options[:relative] ? location : absolute_url_for(location) 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/fixture.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Fixture < Base 10 | 11 | attr_reader :fixture_content 12 | 13 | def call(env) 14 | @fixture_content = nil 15 | super 16 | end 17 | 18 | def response_body 19 | [ @fixture_content ] 20 | end 21 | 22 | def path(fixture_file_path) 23 | begin 24 | full_path = File.join(Dir.pwd, fixture_file_path) 25 | file = File.open(full_path) 26 | rescue => e 27 | message = <<-EOL 28 | Please provide correct path to fixture: 29 | 30 | example: `path: 'fixture/simple.html'` 31 | 32 | error: #{e} #{e.message} 33 | backtrace: #{e.backtrace} 34 | EOL 35 | raise ArgumentError, message 36 | end 37 | @fixture_content = file.read 38 | end 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/page.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Page < Base 10 | 11 | attr_reader :canonical, :links, :meta_robots_rules, :base_url 12 | 13 | def call(env) 14 | @body_content = [] 15 | @head_content = [] 16 | @head_html = '' 17 | @body_html = '' 18 | super 19 | end 20 | 21 | def response_body 22 | [ '%s%s' % [ @head_html, @body_html ] ] 23 | end 24 | 25 | def head(&block) 26 | @head_html = begin 27 | block.call 28 | '%s' % @head_content.join("\n") 29 | end 30 | end 31 | 32 | def body(&block) 33 | @body_html = begin 34 | block.call 35 | '%s' % @body_content.join("\n") 36 | end 37 | end 38 | 39 | def text(&block) 40 | @body_content << block.call.to_s 41 | end 42 | 43 | private 44 | 45 | def canonical_to(url_or_path) 46 | @head_content << %Q() 51 | end 52 | 53 | def atom_to(path) 54 | @head_content << %Q() 55 | end 56 | 57 | def base(url_or_path) 58 | @head_content << %Q() 59 | end 60 | 61 | def link_to(url_or_path, options = {}) 62 | relative = options.delete(:relative) 63 | url_or_path = absolute_url_for(url_or_path) if relative == false 64 | 65 | attributes = [''] + options.map { |k,v| "#{k}='#{v}'"} 66 | @body_content << %Q(#{url_or_path}) 67 | end 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/path_with_content_length.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'active_support/core_ext/numeric' 8 | 9 | module Faux 10 | module Element 11 | 12 | # This element is used primarily in testing against pages of given size. 13 | # Do NOT add functionality to this file to cater to other cases, use 14 | # `page` element instead. 
15 | class PathWithContentLength < Base 16 | attr_reader :size 17 | 18 | def call(env) 19 | @size = options[:size] 20 | super 21 | end 22 | 23 | def response_body 24 | content = 'a' * (size || 0) 25 | [content] 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/robots.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Robots < Base 10 | 11 | def call(env) 12 | @rules = [] 13 | super 14 | end 15 | 16 | def sitemap(url_or_path, options = {}) 17 | if options[:relative] == true 18 | url_or_path = absolute_url_for(url_or_path) 19 | end 20 | @rules << "Sitemap: #{url_or_path}\n" 21 | end 22 | 23 | def method_missing(name, *args, &block) 24 | @rules << "#{normalize_name(name)}: #{args.first}\n" 25 | end 26 | 27 | def response_body 28 | @rules 29 | end 30 | 31 | def response_headers 32 | @headers.merge!({'Content-Type' => 'text/plain'}) 33 | super 34 | end 35 | 36 | private 37 | 38 | def normalize_name(name) 39 | name.to_s.gsub('_', '-').capitalize 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/sitemap.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'stringio' 8 | require 'nokogiri' 9 | require 'zlib' 10 | 11 | module Faux 12 | module Element 13 | class Sitemap < Base 14 | def call(env) 15 | @links = [] 16 | super 17 | end 18 | 19 | def response_headers 20 | @headers.merge!({'Content-Type' => 'application/xml'}) 21 | super 22 | end 23 | 24 | def link_to(url_or_path, options = {}) 25 | if options[:relative] 26 | @links << url_or_path 27 | else 28 | @links << absolute_url_for(url_or_path) 29 | end 30 | end 31 | 32 | def response_body 33 | builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') do |xml| 34 | if options[:index] 35 | xml.sitemapindex(:xmlns => "http://www.sitemaps.org/schemas/sitemap/0.9") { 36 | @links.each do |link| 37 | xml.sitemap { 38 | xml.loc "#{link}" 39 | } 40 | end 41 | } 42 | else 43 | xml.urlset(:xmlns => "http://www.sitemaps.org/schemas/sitemap/0.9") { 44 | @links.each do |link| 45 | xml.url { 46 | xml.loc "#{link}" 47 | } 48 | end 49 | } 50 | end 51 | end 52 | 53 | sitemap_txt = builder.to_xml 54 | 55 | if options[:gzip] 56 | [gzip(sitemap_txt)] 57 | else 58 | sitemap_txt.split("\n") 59 | end 60 | end 61 | 62 | def gzip(contents) 63 | file = StringIO.new 64 | file.set_encoding("BINARY") 65 | 66 | writer = Zlib::GzipWriter.new(file) 67 | writer.write(contents) 68 | writer.close 69 | 70 | file.string 71 | end 72 | end 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/helpers/url.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Helpers 9 | module Url 10 | def absolute_url_for(path) 11 | "#{env['rack.url_scheme']}://#{env['HTTP_HOST']}#{path}" 12 | end 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/middleware/reporter.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Middleware 9 | 10 | # Rack middleware to intercept requests and increments a counter based on 11 | # the request path. If the path is '/status', we'll return a JSON report 12 | # of the request counts since the application has been running. 13 | class Reporter 14 | 15 | def self.counter 16 | @counter ||= Hash.new(0) 17 | end 18 | 19 | def self.reset! 20 | @counter = Hash.new(0) 21 | end 22 | 23 | def initialize(app) 24 | @app = app 25 | end 26 | 27 | def call(env) 28 | if env['PATH_INFO'] == '/status' 29 | [200, { 'Content-Type' => 'application/json' }, [ Reporter.counter.to_json ]] 30 | else 31 | Reporter.counter[env['PATH_INFO']] += 1 32 | @app.call(env) 33 | end 34 | end 35 | 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/version.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | module Faux 7 | VERSION = '0.1.0' 8 | end 9 | -------------------------------------------------------------------------------- /vendor/faux/lib/site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'rack' 10 | require 'webrick' 11 | require 'webrick/https' 12 | 13 | module Faux 14 | # Class to manage creation and destruction of mounted Rack instances 15 | class Site 16 | attr_reader :site, :options, :server, :server_thread 17 | 18 | def initialize(site, options = {}) 19 | @site = site 20 | @options = options 21 | start if options.fetch(:start, true) 22 | end 23 | 24 | def start 25 | if options[:debug] 26 | puts "Faux: INFO: Starting Faux for #{site} (#{options.inspect})" 27 | end 28 | 29 | start_queue = Queue.new 30 | rack_opts = { 31 | :app => site, 32 | :Port => options[:port] || 9393, 33 | :server => :webrick, 34 | :StartCallback => proc { start_queue << :start } 35 | } 36 | 37 | if options[:ssl] 38 | key = OpenSSL::PKey::RSA.new(File.read(options.fetch(:ssl_key))) 39 | cert = OpenSSL::X509::Certificate.new(File.read(options.fetch(:ssl_certificate))) 40 | rack_opts.merge!( 41 | :SSLEnable => true, 42 | :SSLPrivateKey => key, 43 | :SSLCertificate => cert, 44 | :SSLCACertificateFile => options[:ssl_ca_certificate] 45 | ) 46 | end 47 | 48 | @server ||= Rack::Server.new(rack_opts) 49 | @server_thread = Thread.new { server.start } 50 | start_queue.pop 51 | end 52 | 53 | def stop 54 | # Stop Webrick 55 | server.server.shutdown 56 | 57 | # Make sure the thread has stopped or kill it within a second 58 | 10.times do 59 | break unless server_thread.alive? 60 | sleep(0.1) 61 | end 62 | server_thread.kill 63 | 64 | # Reset the state of the site 65 | @server_thread = nil 66 | @server = nil 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /vendor/faux/sites/fixture_site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class FixtureSite < Faux::Base 7 | fixture '/' do 8 | path 'spec/fixtures/simple.html' 9 | end 10 | 11 | fixture '/foo' do 12 | headers 'Content-Type' => 'application/xml' 13 | path 'spec/fixtures/atom-feed-example-com.xml' 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /vendor/faux/sites/robots_txt_respect_rules.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class RobotsTxtRespectRules < Faux::Base 7 | page '/' do 8 | body do 9 | link_to '/bar' 10 | link_to '/foo' 11 | end 12 | end 13 | 14 | page '/bar' 15 | page '/foo' 16 | 17 | robots do 18 | user_agent '*' 19 | disallow '/foo' 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /vendor/faux/sites/simple_site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class SimpleSite < Faux::Base 7 | page '/' do 8 | head { atom_to '/feed' } 9 | body { link_to '/foo' } 10 | end 11 | 12 | page '/foo' do 13 | status 200 14 | body { link_to '/foobar' } 15 | end 16 | 17 | path_with_content_length '/large_page', 10.megabytes 18 | 19 | atom_feed '/feed' do 20 | entry do 21 | title 'Another Post' 22 | link_to '/foo' 23 | link_to '/wow' 24 | end 25 | 26 | entry do 27 | link_to '/bar' 28 | end 29 | end 30 | 31 | page '/bar' do 32 | status 200 33 | body do 34 | link_to '/bang', :relative => false 35 | link_to '/baz' 36 | end 37 | end 38 | 39 | page '/redirect' do 40 | redirect '/foo' 41 | end 42 | 43 | sitemap '/sitemap.xml' do 44 | link_to '/foo' 45 | link_to '/bar' 46 | end 47 | 48 | robots do 49 | user_agent '*' 50 | disallow '/foo' 51 | 52 | # Sitemap urls should be absolute. Pass :relative => true 53 | # so the url will be converted from relative to absolute. 54 | sitemap '/sitemap.xml', :relative => true 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /vendor/faux/sites/sitemap_pointing_to_sitemaps.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class SitemapPointingToSitemaps < Faux::Base 7 | robots do 8 | user_agent '*' 9 | 10 | sitemap '/sitemap.xml' 11 | end 12 | 13 | sitemap_index '/sitemap.xml' do 14 | link_to '/sitemap_1.xml' 15 | link_to '/sitemap_2.xml' 16 | end 17 | 18 | sitemap '/sitemap_1.xml' do 19 | link_to '/foo' 20 | end 21 | 22 | sitemap '/sitemap_2.xml' do 23 | link_to '/bar' 24 | end 25 | 26 | page '/foo' 27 | page '/bar' 28 | end 29 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/base_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Base do 10 | before :each do 11 | allow_any_instance_of(Faux::Element::Base).to receive(:response_body).and_return('body') 12 | end 13 | 14 | it 'should set defaults' do 15 | base = Faux::Element::Base.new({}) 16 | expect(base.call(double)).to eq [200, {'Content-Type' => 'text/html'}, 'body'] 17 | end 18 | 19 | it 'sets status' do 20 | content = Proc.new { status 400 } 21 | base = Faux::Element::Base.new({}, &content) 22 | expect(base.call(double)).to eq [400, {'Content-Type' => 'text/html'}, 'body'] 23 | end 24 | 25 | it 'sets headers' do 26 | content = Proc.new { headers 'Content-Type' => 'text/plain' } 27 | base = Faux::Element::Base.new({}, &content) 28 | expect(base.call(double)).to eq [200, {'Content-Type' => 'text/plain'}, 'body'] 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/fixture_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Fixture do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'accepts path as argument' do 17 | app.fixture '/foo' do 18 | path 'spec/fixtures/simple.html' 19 | end 20 | 21 | get '/foo' 22 | expect(last_response.body).to match 'example' 23 | expect(last_response.body).to match 'another link' 24 | end 25 | 26 | it 'allows headers and status to be specified' do 27 | app.fixture '/foo' do 28 | status 404 29 | headers "Content-Type" => 'text/plain' 30 | path 'spec/fixtures/simple.html' 31 | end 32 | 33 | get '/foo' 34 | expect(last_response.body).to match 'example' 35 | expect(last_response.header['Content-Type']).to match 'text/plain' 36 | expect(last_response.status).to eq(404) 37 | end 38 | 39 | it 'works with xml files' do 40 | app.fixture '/foo' do 41 | headers 'Content-Type' => 'application/xml' 42 | path 'spec/fixtures/atom-feed-example-com.xml' 43 | end 44 | 45 | get '/foo' 46 | expect(last_response.body).to match '' 47 | end 48 | 49 | it 'raises error if path is wrong' do 50 | app.fixture '/foo' do 51 | path 'doesnt-exist' 52 | end 53 | 54 | expect do 55 | get '/foo' 56 | end.to raise_error ArgumentError 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/path_with_content_length_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::PathWithContentLength do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should return valid page' do 17 | site.path_with_content_length '/large_page' 18 | 19 | get '/large_page' 20 | expect(last_response.content_type).to eq 'text/html' 21 | end 22 | 23 | it 'should return page of specified size' do 24 | site.path_with_content_length '/large_page', 10.megabytes 25 | 26 | get '/large_page' 27 | expect(last_response.content_length).to eq 10.megabytes 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/robots_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Robots do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should be accessible on \robots.txt' do 17 | site.robots 18 | 19 | get '/robots.txt' 20 | expect(last_response).to_not be_empty 21 | end 22 | 23 | it 'should render names' do 24 | site.robots do 25 | disallow '/blocked' 26 | end 27 | 28 | get '/robots.txt' 29 | expect(last_response.body).to eq "Disallow: /blocked\n" 30 | end 31 | 32 | it 'should render names with dashes / underscores' do 33 | site.robots do 34 | user_agent '*' 35 | end 36 | 37 | get '/robots.txt' 38 | expect(last_response.body).to eq "User-agent: *\n" 39 | end 40 | 41 | it 'combines multiple declarations on one file' do 42 | site.robots do 43 | disallow '/blocked' 44 | sitemap 'http://example.com/sitemap.xml' 45 | end 46 | 47 | get '/robots.txt' 48 | expect(last_response.body).to eq "Disallow: /blocked\nSitemap: http://example.com/sitemap.xml\n" 49 | end 50 | 51 | it 'returns correct content-type' do 52 | site.robots do 53 | disallow '/blocked' 54 | sitemap 'http://example.com/sitemap.xml' 55 | end 56 | 57 | get '/robots.txt' 58 | expect(last_response.content_type).to eq "text/plain" 59 | end 60 | 61 | it 'supports converting relative sitemap paths to absolute paths' do 62 | site.robots do 63 | sitemap '/sitemap.xml', :relative => true 64 | end 65 | 66 | get '/robots.txt' 67 | expect(last_response.body).to match 'http://example.org/sitemap.xml' 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/sitemap_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Sitemap do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should return 200 by default for sitemap' do 17 | site.sitemap '/sitemap.xml' 18 | 19 | get '/sitemap.xml' 20 | expect(last_response.status).to eq 200 21 | end 22 | 23 | it 'should return xml' do 24 | site.sitemap '/sitemap.xml' 25 | 26 | get '/sitemap.xml' 27 | expect(last_response.content_type).to eq 'application/xml' 28 | end 29 | 30 | context 'sitemap of URLs' do 31 | it 'includes links into generated sitemap' do 32 | site.sitemap '/sitemap.xml' do 33 | link_to '/anothersite' 34 | end 35 | 36 | get '/sitemap.xml' 37 | expect(last_response.body).to match 'http://example.org/anothersite' 38 | end 39 | 40 | it 'supports creating relative links' do 41 | site.sitemap '/sitemap.xml' do 42 | link_to '/anothersite', :relative => true 43 | end 44 | 45 | get '/sitemap.xml' 46 | expect(last_response.body).to match '/anothersite' 47 | end 48 | end 49 | 50 | context 'sitemap index' do 51 | it 'defines an index' do 52 | site.sitemap_index '/sitemap.xml' 53 | 54 | get '/sitemap.xml' 55 | expect(last_response.body).to match 'sitemapindex' 56 | end 57 | 58 | it 'supports creating links' do 59 | site.sitemap_index '/sitemap.xml' do 60 | link_to '/sitemap_2.xml' 61 | end 62 | 63 | get '/sitemap.xml' 64 | expect(last_response.body).to match 'http://example.org/sitemap_2.xml' 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/middleware/reporter_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Middleware::Reporter do 10 | let(:site) do 11 | build_rack_test_session(:status) 12 | Class.new(Faux::Base) 13 | end 14 | 15 | def app 16 | site 17 | end 18 | 19 | it 'reports a count of the routes that have been visited' do 20 | pending "Intermittent error comes up (probably due to status not being cleared between test runs)" 21 | 22 | site.page '/foo' 23 | 24 | get '/foo' 25 | get '/foo' 26 | get '/status' 27 | 28 | expect(last_response.status).to eq(200) 29 | expect(JSON.parse(last_response.body)).to eq('/foo' => 2) 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/site_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Site do 10 | let(:site) { Faux.site } 11 | 12 | it 'starts a Webrick handler for Rack' do 13 | server = double("server") 14 | expect(::Rack::Server).to receive(:new).with(:Port => 9393, :app => site, :server => :webrick).and_return(server) 15 | expect(server).to receive(:start) 16 | 17 | faux = Faux::Site.new(site, {}) 18 | sleep(1) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Base do 10 | 11 | let(:site) { Class.new(Faux::Base) } 12 | 13 | def app 14 | site 15 | end 16 | 17 | it 'adds a /status route by default' do 18 | get '/status' 19 | expect(last_response.status).to eq(200) 20 | end 21 | 22 | end 23 | -------------------------------------------------------------------------------- /vendor/faux/spec/fixtures/atom-feed-example-com.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example.com 5 | 6 | 7 | 2012-10-11T12:46:09-07:00 8 | http://www.example.com/ 9 | 10 | Example.com 11 | 12 | 13 | 14 | Example.com Stuff thing Blah 15 | 16 | 2012-10-11T00:00:00-07:00 17 | 2012-10-11T00:00:00-07:00 18 | http://www.example.com/atom-feed-page-1 19 | blah blah blah 20 | 21 | Example.com Author 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /vendor/faux/spec/fixtures/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | example 4 | 5 | 6 |

example 7 | 8 | link 9 | another link 10 |
11 | 12 | 13 | -------------------------------------------------------------------------------- /vendor/faux/spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'bundler/setup' 8 | require 'rspec' 9 | require 'rack/test' 10 | require 'pry' 11 | require 'awesome_print' 12 | 13 | require 'faux' 14 | 15 | RSpec.configure do |config| 16 | config.include Rack::Test::Methods 17 | config.color = true 18 | config.order = 'random' 19 | end 20 | -------------------------------------------------------------------------------- /vendor/jars/com/github/crawler-commons/crawler-commons/1.2/crawler-commons-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/com/github/crawler-commons/crawler-commons/1.2/crawler-commons-1.2.jar -------------------------------------------------------------------------------- /vendor/jars/commons-codec/commons-codec/1.15/commons-codec-1.15.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/commons-codec/commons-codec/1.15/commons-codec-1.15.jar -------------------------------------------------------------------------------- /vendor/jars/commons-io/commons-io/2.16.1/commons-io-2.16.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/commons-io/commons-io/2.16.1/commons-io-2.16.1.jar -------------------------------------------------------------------------------- /vendor/jars/isorelax/isorelax/20030108/isorelax-20030108.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/isorelax/isorelax/20030108/isorelax-20030108.jar -------------------------------------------------------------------------------- /vendor/jars/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar -------------------------------------------------------------------------------- /vendor/jars/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar -------------------------------------------------------------------------------- /vendor/jars/nu/validator/jing/20200702VNU/jing-20200702VNU.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/nu/validator/jing/20200702VNU/jing-20200702VNU.jar 
-------------------------------------------------------------------------------- /vendor/jars/org/apache/commons/commons-compress/1.27.1/commons-compress-1.27.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/commons/commons-compress/1.27.1/commons-compress-1.27.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/commons/commons-lang3/3.16.0/commons-lang3-3.16.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/commons/commons-lang3/3.16.0/commons-lang3-3.16.0.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/client5/httpclient5/5.1/httpclient5-5.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/client5/httpclient5/5.1/httpclient5-5.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/core5/httpcore5-h2/5.1.1/httpcore5-h2-5.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/core5/httpcore5-h2/5.1.1/httpcore5-h2-5.1.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/core5/httpcore5/5.1.1/httpcore5-5.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/core5/httpcore5/5.1.1/httpcore5-5.1.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/brotli/dec/0.1.2/dec-0.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/brotli/dec/0.1.2/dec-0.1.2.jar -------------------------------------------------------------------------------- /vendor/jars/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar -------------------------------------------------------------------------------- /vendor/jars/org/slf4j/slf4j-api/1.7.7/slf4j-api-1.7.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/slf4j/slf4j-api/1.7.7/slf4j-api-1.7.7.jar -------------------------------------------------------------------------------- /vendor/jars/org/slf4j/slf4j-nop/1.7.26/slf4j-nop-1.7.26.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/slf4j/slf4j-nop/1.7.26/slf4j-nop-1.7.26.jar -------------------------------------------------------------------------------- /vendor/jars/xalan/serializer/2.7.3/serializer-2.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xalan/serializer/2.7.3/serializer-2.7.3.jar -------------------------------------------------------------------------------- /vendor/jars/xalan/xalan/2.7.3/xalan-2.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xalan/xalan/2.7.3/xalan-2.7.3.jar -------------------------------------------------------------------------------- /vendor/jars/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar -------------------------------------------------------------------------------- /vendor/jars/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar --------------------------------------------------------------------------------