├── .backportrc.json ├── .buildkite ├── pipeline.yml ├── publish │ ├── build-and-push-multiarch-docker.sh │ ├── build-docker.sh │ ├── publish-common.sh │ ├── push-docker.sh │ └── test-docker.sh ├── pull-requests.json ├── release-pipeline.yml └── scripts │ ├── run_ci_step.sh │ └── run_command.sh ├── .bundler-version ├── .devcontainer └── devcontainer.json ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── config.yml │ └── enhancement.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ ├── add-labels-main.yml │ └── backport.yml ├── .gitignore ├── .java-version ├── .jrubyrc ├── .rspec ├── .rubocop.yml ├── .ruby-version ├── Brewfile ├── Dockerfile ├── Dockerfile.wolfi ├── Gemfile ├── Gemfile.lock ├── Jarfile ├── Jars.lock ├── LICENSE ├── Makefile ├── NOTICE.txt ├── README.md ├── bin └── crawler ├── catalog-info.yaml ├── config ├── README.md ├── crawler.yml.example ├── elasticsearch.yml.example ├── examples │ ├── parks-australia.yml │ └── simple.yml └── filebeat.yml.example ├── docker-compose.yaml ├── docs ├── ADVANCED.md ├── CHANGELOG.md ├── CLI.md ├── CODE_OF_CONDUCT.md ├── CONFIG.md ├── CONTRIBUTING.md ├── DEVELOPER_GUIDE.md ├── ELASTICSEARCH.md ├── FEATURE_COMPARISON.md ├── RELEASING.md ├── SECURITY.md ├── SUPPORT.md └── features │ ├── BINARY_CONTENT_EXTRACTION.md │ ├── CRAWLER_DIRECTIVES.md │ ├── CRAWL_RULES.md │ ├── EXTRACTION_RULES.md │ ├── INGEST_PIPELINES.md │ ├── LOGGING.md │ └── SCHEDULING.md ├── lib ├── constants.rb ├── crawler.rb ├── crawler │ ├── api │ │ ├── config.rb │ │ └── crawl.rb │ ├── cli.rb │ ├── cli │ │ ├── crawl.rb │ │ ├── helpers.rb │ │ ├── schedule.rb │ │ ├── urltest.rb │ │ ├── validate.rb │ │ └── version.rb │ ├── content_engine │ │ ├── extractor.rb │ │ ├── transformer.rb │ │ └── utils.rb │ ├── coordinator.rb │ ├── core_ext.rb │ ├── data │ │ ├── crawl_result │ │ │ ├── base.rb │ │ │ ├── content_extractable_file.rb │ │ │ ├── error.rb │ │ │ ├── html.rb │ │ │ ├── http_auth_disallowed_error.rb │ │ │ ├── redirect.rb │ │ │ ├── redirect_error.rb │ │ │ ├── robots_txt.rb │ │ │ ├── sitemap.rb │ │ │ ├── success.rb │ │ │ └── unsupported_content_type.rb │ │ ├── crawl_task.rb │ │ ├── domain.rb │ │ ├── extraction │ │ │ ├── rule.rb │ │ │ ├── ruleset.rb │ │ │ └── url_filter.rb │ │ ├── link.rb │ │ ├── rule.rb │ │ ├── rule_engine_outcome.rb │ │ ├── seen_urls.rb │ │ ├── url.rb │ │ ├── url_queue.rb │ │ └── url_queue │ │ │ ├── base.rb │ │ │ └── memory_only.rb │ ├── document_mapper.rb │ ├── event_generator.rb │ ├── executor.rb │ ├── http_client.rb │ ├── http_executor.rb │ ├── http_header_service.rb │ ├── http_utils │ │ ├── all_trusting_trust_manager.rb │ │ ├── config.rb │ │ ├── exceptions.rb │ │ ├── filtering_dns_resolver.rb │ │ └── response.rb │ ├── logging │ │ ├── handler │ │ │ ├── base.rb │ │ │ ├── file.rb │ │ │ └── stdout.rb │ │ └── logger.rb │ ├── mock_event_logger.rb │ ├── mock_executor.rb │ ├── output_sink.rb │ ├── output_sink │ │ ├── base.rb │ │ ├── console.rb │ │ ├── elasticsearch.rb │ │ ├── file.rb │ │ ├── mock.rb │ │ └── null.rb │ ├── robots_txt_parser.rb │ ├── robots_txt_service.rb │ ├── rule_engine │ │ └── base.rb │ ├── stats.rb │ ├── url_validator.rb │ ├── url_validator │ │ ├── crawl_rules_check_concern.rb │ │ ├── dns_check_concern.rb │ │ ├── domain_access_check_concern.rb │ │ ├── domain_uniqueness_check_concern.rb │ │ ├── result.rb │ │ ├── robots_txt_check_concern.rb │ │ ├── tcp_check_concern.rb │ │ ├── url_check_concern.rb │ │ ├── url_content_check_concern.rb │ │ └── url_request_check_concern.rb │ └── utils.rb ├── environment.rb ├── errors.rb └── es │ ├── 
bulk_queue.rb │ └── client.rb ├── product_version ├── renovate.json ├── script ├── bundle ├── environment ├── functions.sh ├── licenses │ ├── README.md │ ├── generate_notice.rb │ ├── lib │ │ ├── third_party.rb │ │ └── third_party │ │ │ ├── base.rb │ │ │ ├── misc_dependencies.rb │ │ │ └── rubygems_dependencies.rb │ ├── misc_licenses │ │ ├── .gitkeep │ │ ├── _manually_added_jruby-LICENSE.txt │ │ └── _manually_added_tika-LICENSE.txt │ └── rubygems_licenses │ │ ├── .gitkeep │ │ ├── _manually_added_faux-LICENSE.txt │ │ ├── _manually_added_httpclient-LICENSE.txt │ │ ├── _manually_added_jruby-jars-LICENSE.txt │ │ ├── _manually_added_minitest-LICENSE.txt │ │ └── _manually_added_strscan-LICENSE.txt ├── rspec ├── support │ └── string_colors.rb └── vendor_jars ├── spec ├── factories │ └── crawl_results.rb ├── fixtures │ ├── crawl-flat-format.yml │ ├── crawl.yml │ ├── do-not-visit.txt │ ├── elasticsearch-flat-format.yml │ ├── elasticsearch-partially-flat-format.yml │ ├── elasticsearch.yml │ ├── gilacountyaz.gov.html │ ├── sitemap │ │ ├── sitemap_index.xml │ │ ├── sitemap_index_huge.xml │ │ ├── sitemap_no_urls.xml │ │ ├── sitemap_urlset.xml │ │ ├── sitemap_urlset.xml.gz │ │ ├── sitemap_urlset_10000_urls.xml │ │ └── sitemap_urlset_huge.xml │ ├── ssl │ │ ├── ca.crt │ │ ├── ca.key │ │ ├── ca.password.txt │ │ ├── config_with_cert.yml │ │ ├── expired │ │ │ ├── example.cnf │ │ │ ├── example.crt │ │ │ ├── example.csr │ │ │ ├── example.key │ │ │ └── generate.sh │ │ ├── invalid.crt │ │ └── self-signed │ │ │ ├── example.cnf │ │ │ ├── example.crt │ │ │ ├── example.csr │ │ │ ├── example.key │ │ │ └── generate.sh │ └── uncrate.com.html ├── integration │ ├── charset_spec.rb │ ├── content_extraction_spec.rb │ ├── headers_spec.rb │ ├── legacy_sitemaps_spec.rb │ ├── nofollow_spec.rb │ ├── redirects_spec.rb │ ├── response_content_type_spec.rb │ ├── response_limits_spec.rb │ ├── robots_txt_spec.rb │ ├── seed_spec.rb │ ├── sitemap_spec.rb │ ├── sitemap_xxe_spec.rb │ ├── timeouts │ │ ├── request_timeout_spec.rb │ │ └── socket_timeout_spec.rb │ └── url_fragments_spec.rb ├── lib │ ├── crawler │ │ ├── api │ │ │ ├── config_spec.rb │ │ │ └── crawl_spec.rb │ │ ├── cli │ │ │ ├── crawl_spec.rb │ │ │ ├── helpers_spec.rb │ │ │ ├── schedule_spec.rb │ │ │ ├── urltest_spec.rb │ │ │ ├── validate_spec.rb │ │ │ └── version_spec.rb │ │ ├── content_engine │ │ │ ├── extractor_spec.rb │ │ │ ├── transformer_spec.rb │ │ │ └── utils_spec.rb │ │ ├── coordinator_spec.rb │ │ ├── data │ │ │ ├── crawl_result │ │ │ │ ├── html_spec.rb │ │ │ │ └── sitemap_spec.rb │ │ │ ├── crawl_result_spec.rb │ │ │ ├── crawl_task_spec.rb │ │ │ ├── domain_spec.rb │ │ │ ├── extraction │ │ │ │ ├── rule_spec.rb │ │ │ │ ├── ruleset_spec.rb │ │ │ │ └── url_filter_spec.rb │ │ │ ├── link_spec.rb │ │ │ ├── rule_spec.rb │ │ │ ├── url_queue │ │ │ │ └── memory_only_spec.rb │ │ │ ├── url_queue_spec.rb │ │ │ └── url_spec.rb │ │ ├── document_mapper_spec.rb │ │ ├── event_generator_spec.rb │ │ ├── http_client_spec.rb │ │ ├── http_executor_spec.rb │ │ ├── http_utils │ │ │ ├── bad_ssl_spec.rb │ │ │ ├── config_spec.rb │ │ │ ├── filtering_dns_resolver_spec.rb │ │ │ └── response_spec.rb │ │ ├── logging │ │ │ └── crawllogger_spec.rb │ │ ├── output_sink │ │ │ ├── elasticsearch_spec.rb │ │ │ └── file_spec.rb │ │ ├── output_sink_spec.rb │ │ ├── robots_txt_parser_spec.rb │ │ ├── rule_engine │ │ │ └── base_spec.rb │ │ ├── stats_spec.rb │ │ ├── url_validator │ │ │ ├── crawl_rules_check_spec.rb │ │ │ ├── dns_check_spec.rb │ │ │ ├── domain_access_check_spec.rb │ │ │ ├── 
domain_uniqueness_check_spec.rb │ │ │ ├── robots_txt_check_spec.rb │ │ │ ├── tcp_check_spec.rb │ │ │ ├── url_check_spec.rb │ │ │ ├── url_content_check_spec.rb │ │ │ └── url_request_check_spec.rb │ │ └── url_validator_spec.rb │ ├── crawler_spec.rb │ ├── environment_spec.rb │ └── es │ │ ├── bulk_queue_spec.rb │ │ └── client_spec.rb ├── spec_helper.rb └── support │ ├── cli_helpers.rb │ ├── crawl_response_matchers.rb │ ├── faux │ ├── faux_crawl.rb │ └── results_collection.rb │ ├── fixtures.rb │ └── mock_response.rb └── vendor ├── faux ├── .gitignore ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── faux.gemspec ├── lib │ ├── faux.rb │ ├── faux │ │ ├── element │ │ │ ├── atom_feed.rb │ │ │ ├── base.rb │ │ │ ├── fixture.rb │ │ │ ├── page.rb │ │ │ ├── path_with_content_length.rb │ │ │ ├── robots.rb │ │ │ └── sitemap.rb │ │ ├── helpers │ │ │ └── url.rb │ │ ├── middleware │ │ │ └── reporter.rb │ │ └── version.rb │ └── site.rb ├── sites │ ├── fixture_site.rb │ ├── robots_txt_respect_rules.rb │ ├── simple_site.rb │ └── sitemap_pointing_to_sitemaps.rb └── spec │ ├── faux │ ├── element │ │ ├── atom_feed_spec.rb │ │ ├── base_spec.rb │ │ ├── fixture_spec.rb │ │ ├── page_spec.rb │ │ ├── path_with_content_length_spec.rb │ │ ├── robots_spec.rb │ │ └── sitemap_spec.rb │ ├── middleware │ │ └── reporter_spec.rb │ └── site_spec.rb │ ├── faux_spec.rb │ ├── fixtures │ ├── atom-feed-example-com.xml │ └── simple.html │ └── spec_helper.rb └── jars ├── com └── github │ └── crawler-commons │ └── crawler-commons │ └── 1.2 │ └── crawler-commons-1.2.jar ├── commons-codec └── commons-codec │ └── 1.15 │ └── commons-codec-1.15.jar ├── commons-io └── commons-io │ └── 2.16.1 │ └── commons-io-2.16.1.jar ├── isorelax └── isorelax │ └── 20030108 │ └── isorelax-20030108.jar ├── net ├── sf │ └── saxon │ │ └── Saxon-HE │ │ └── 9.6.0-4 │ │ └── Saxon-HE-9.6.0-4.jar └── sourceforge │ └── htmlunit │ └── neko-htmlunit │ └── 2.63.0 │ └── neko-htmlunit-2.63.0.jar ├── nu └── validator │ └── jing │ └── 20200702VNU │ └── jing-20200702VNU.jar ├── org ├── apache │ ├── commons │ │ ├── commons-compress │ │ │ └── 1.27.1 │ │ │ │ └── commons-compress-1.27.1.jar │ │ └── commons-lang3 │ │ │ └── 3.16.0 │ │ │ └── commons-lang3-3.16.0.jar │ └── httpcomponents │ │ ├── client5 │ │ └── httpclient5 │ │ │ └── 5.1 │ │ │ └── httpclient5-5.1.jar │ │ └── core5 │ │ ├── httpcore5-h2 │ │ └── 5.1.1 │ │ │ └── httpcore5-h2-5.1.1.jar │ │ └── httpcore5 │ │ └── 5.1.1 │ │ └── httpcore5-5.1.1.jar ├── brotli │ └── dec │ │ └── 0.1.2 │ │ └── dec-0.1.2.jar ├── nokogiri │ └── nekodtd │ │ └── 0.1.11.noko2 │ │ └── nekodtd-0.1.11.noko2.jar └── slf4j │ ├── slf4j-api │ └── 1.7.7 │ │ └── slf4j-api-1.7.7.jar │ └── slf4j-nop │ └── 1.7.26 │ └── slf4j-nop-1.7.26.jar ├── xalan ├── serializer │ └── 2.7.3 │ │ └── serializer-2.7.3.jar └── xalan │ └── 2.7.3 │ └── xalan-2.7.3.jar ├── xerces └── xercesImpl │ └── 2.12.2 │ └── xercesImpl-2.12.2.jar └── xml-apis └── xml-apis └── 1.4.01 └── xml-apis-1.4.01.jar /.backportrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "targetBranchChoices": [ 3 | { "name": "main", "checked": true }, 4 | "0.2", 5 | "0.1" 6 | ], 7 | "fork": false, 8 | "targetPRLabels": ["backport"], 9 | "branchLabelMapping": { 10 | "^v0.3.0(.0)?$": "main", 11 | "^v(\\d+).(\\d+)(.\\d+)+$": "$1.$2" 12 | }, 13 | "upstream": "elastic/crawler" 14 | } 15 | -------------------------------------------------------------------------------- /.buildkite/pipeline.yml: 
-------------------------------------------------------------------------------- 1 | agents: 2 | provider: "gcp" 3 | machineType: "n1-standard-8" 4 | 5 | defaultTimeoutInMinutes: 5 6 | 7 | notify: 8 | - if: 'build.branch =~ /^((main)|([0-9]+\.[0-9]+))\$/ && (build.state == "failed" || pipeline.started_passing)' 9 | slack: 10 | channels: 11 | - "#search-et-alerts" 12 | message: "${BUILDKITE_MESSAGE}" 13 | 14 | # TODO: change docker build steps into pulling a ci-agent-image to speed up build time 15 | steps: 16 | - label: ":rubocop: Lint" 17 | commands: 18 | - ".buildkite/scripts/run_command.sh docker" 19 | - ".buildkite/scripts/run_command.sh lint" 20 | - label: ":rspec: Test" 21 | commands: 22 | - ".buildkite/scripts/run_command.sh docker" 23 | - ".buildkite/scripts/run_command.sh test" 24 | -------------------------------------------------------------------------------- /.buildkite/publish/build-and-push-multiarch-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Builds the multiarch docker image and pushes it to the docker registry 5 | ######## 6 | 7 | set -exu 8 | set -o pipefail 9 | 10 | # Load our common environment variables for publishing 11 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 12 | export CURDIR 13 | 14 | # shellcheck source=./publish-common.sh 15 | source "$CURDIR/publish-common.sh" 16 | 17 | # Set our tag name as well as the tag names of the individual platform images 18 | TAG_NAME="${BASE_TAG_NAME}:${VERSION}" 19 | LATEST_TAG_NAME="${BASE_TAG_NAME}:latest" 20 | AMD64_TAG="${BASE_TAG_NAME}:${VERSION}-amd64" 21 | ARM64_TAG="${BASE_TAG_NAME}:${VERSION}-arm64" 22 | 23 | # Pull the images from the registry 24 | buildah pull "$AMD64_TAG" 25 | buildah pull "$ARM64_TAG" 26 | 27 | # ensure +x is set to avoid writing any sensitive information to the console 28 | set +x 29 | 30 | # Log into Docker 31 | echo "Logging into docker..." 32 | DOCKER_USER=$(vault read -address "${VAULT_ADDR}" -field "${DOCKER_USER_KEY}" "${VAULT_PATH}") 33 | vault read -address "${VAULT_ADDR}" -field "${DOCKER_PASS_KEY}" "${VAULT_PATH}" | \ 34 | buildah login --username="${DOCKER_USER}" --password-stdin docker.elastic.co 35 | 36 | # Create the manifest for the multiarch image 37 | echo "Creating ${VERSION} manifest..." 38 | buildah manifest create "$TAG_NAME" \ 39 | "$AMD64_TAG" \ 40 | "$ARM64_TAG" 41 | 42 | # ... and push it 43 | echo "Pushing ${VERSION} manifest..." 44 | buildah manifest push "$TAG_NAME" "docker://$TAG_NAME" 45 | 46 | # Write out the final manifest for debugging purposes 47 | echo "Built and pushed ${VERSION} multiarch image... dumping final manifest..." 48 | buildah manifest inspect "$TAG_NAME" 49 | 50 | # Repeat for latest tag if applicable 51 | if [[ "${APPLY_LATEST_TAG:-}" == "true" ]]; then 52 | echo "Creating :latest manifest..." 53 | buildah manifest create "$LATEST_TAG_NAME" \ 54 | "$AMD64_TAG" \ 55 | "$ARM64_TAG" 56 | 57 | echo "Pushing :latest manifest..." 58 | buildah manifest push "$LATEST_TAG_NAME" "docker://$LATEST_TAG_NAME" 59 | 60 | echo "Built and pushed :latest multiarch image... dumping final manifest..." 61 | buildah manifest inspect "$LATEST_TAG_NAME" 62 | else 63 | echo "No :latest manifest required." 
64 | fi 65 | -------------------------------------------------------------------------------- /.buildkite/publish/build-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Builds the docker image and saves it to an archive file 5 | # so it can be stored as an artifact in Buildkite 6 | ######## 7 | 8 | set -exu 9 | set -o pipefail 10 | 11 | if [[ "${ARCHITECTURE:-}" == "" ]]; then 12 | echo "!! ARCHITECTURE is not set. Exiting." 13 | exit 2 14 | fi 15 | 16 | # Load our common environment variables for publishing 17 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 18 | export CURDIR 19 | 20 | # shellcheck source=./publish-common.sh 21 | source "$CURDIR/publish-common.sh" 22 | 23 | pushd "$PROJECT_ROOT" 24 | 25 | # set our complete tag name and build the image 26 | TAG_NAME="$BASE_TAG_NAME:${VERSION}-${ARCHITECTURE}" 27 | docker build -f "$DOCKERFILE_PATH" -t "$TAG_NAME" . 28 | 29 | # save the image to an archive file 30 | OUTPUT_PATH="$PROJECT_ROOT/.artifacts" 31 | OUTPUT_FILE="$OUTPUT_PATH/${DOCKER_ARTIFACT_KEY}-${VERSION}-${ARCHITECTURE}.tar.gz" 32 | mkdir -p "$OUTPUT_PATH" 33 | docker save "$TAG_NAME" | gzip > "$OUTPUT_FILE" 34 | 35 | popd 36 | -------------------------------------------------------------------------------- /.buildkite/publish/publish-common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "${CURDIR:-}" == "" ]]; then 4 | echo "!! CURDIR is not set. Exiting." 5 | exit 2 6 | fi 7 | 8 | function realpath { 9 | echo "$(cd "$(dirname "$1")" || exit; pwd)"/"$(basename "$1")"; 10 | } 11 | 12 | export SCRIPT_DIR="$CURDIR" 13 | 14 | BUILDKITE_DIR=$(realpath "$(dirname "$SCRIPT_DIR")") 15 | PROJECT_ROOT=$(realpath "$(dirname "$BUILDKITE_DIR")") 16 | VERSION_PATH="$PROJECT_ROOT/product_version" 17 | VERSION=$(cat "$VERSION_PATH") 18 | IS_SNAPSHOT=$(buildkite-agent meta-data get is_snapshot) 19 | IS_LATEST=$(buildkite-agent meta-data get is_latest) 20 | 21 | if [[ "${IS_SNAPSHOT:-}" == "false" && "${IS_LATEST:-}" == "true" ]]; then 22 | # don't apply LATEST tag to SNAPSHOT builds 23 | export APPLY_LATEST_TAG="true" 24 | else 25 | export APPLY_LATEST_TAG="false" 26 | fi 27 | 28 | export BUILDKITE_DIR 29 | export PROJECT_ROOT 30 | export VERSION 31 | 32 | if [[ "${IS_SNAPSHOT:-}" == "true" ]]; then 33 | echo "Adding SNAPSHOT labeling" 34 | export VERSION="${VERSION}-SNAPSHOT" 35 | fi 36 | 37 | export BASE_TAG_NAME="${DOCKER_IMAGE_NAME:-docker.elastic.co/integrations/crawler}" 38 | export DOCKERFILE_PATH="${DOCKERFILE_PATH:-Dockerfile.wolfi}" 39 | export DOCKER_ARTIFACT_KEY="${DOCKER_ARTIFACT_KEY:-elastic-crawler-docker}" 40 | 41 | export VAULT_ADDR="${VAULT_ADDR:-https://vault-ci-prod.elastic.dev}" 42 | export VAULT_PATH="secret/ci/elastic-crawler/docker-ci-admin" 43 | export DOCKER_PASS_KEY="secret_20240823" 44 | export DOCKER_USER_KEY="user_20240823" 45 | -------------------------------------------------------------------------------- /.buildkite/publish/push-docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ######## 4 | # Pushes the docker image to the docker registry 5 | ######## 6 | 7 | set -exu 8 | set -o pipefail 9 | 10 | if [[ "${ARCHITECTURE:-}" == "" ]]; then 11 | echo "!! ARCHITECTURE is not set. Exiting." 
12 | exit 2 13 | fi 14 | 15 | # Load our common environment variables for publishing 16 | CURDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 17 | export CURDIR 18 | 19 | # shellcheck source=./publish-common.sh 20 | source "$CURDIR"/publish-common.sh 21 | 22 | # Load the image from the artifact created in build-docker.sh 23 | echo "Loading image from archive file..." 24 | docker load < "$PROJECT_ROOT/.artifacts/${DOCKER_ARTIFACT_KEY}-${VERSION}-${ARCHITECTURE}.tar.gz" 25 | 26 | # ensure +x is set to avoid writing any sensitive information to the console 27 | set +x 28 | 29 | # Log into Docker 30 | echo "Logging into docker..." 31 | DOCKER_USER=$(vault read -address "${VAULT_ADDR}" -field "${DOCKER_USER_KEY}" "${VAULT_PATH}") 32 | vault read -address "${VAULT_ADDR}" -field "${DOCKER_PASS_KEY}" "${VAULT_PATH}" | \ 33 | docker login -u "$DOCKER_USER" --password-stdin docker.elastic.co 34 | 35 | # Set our tag name and push the image 36 | TAG_NAME="$BASE_TAG_NAME:${VERSION}-${ARCHITECTURE}" 37 | echo "Pushing image to docker with tag: $TAG_NAME" 38 | docker push "$TAG_NAME" 39 | -------------------------------------------------------------------------------- /.buildkite/pull-requests.json: -------------------------------------------------------------------------------- 1 | { 2 | "jobs": [ 3 | { 4 | "enabled": true, 5 | "pipelineSlug": "elastic-crawler", 6 | "allow_org_users": true, 7 | "allowed_repo_permissions": ["admin", "write"], 8 | "allowed_list": [], 9 | "set_commit_status": true, 10 | "commit_status_context": "buildkite/elastic-crawler", 11 | "build_on_commit": false, 12 | "build_on_comment": true, 13 | "trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", 14 | "always_trigger_comment_regex": "^(?:(?:buildkite\\W+)?(?:build|test)\\W+(?:this|it))", 15 | "skip_ci_labels": ["skip-ci"], 16 | "skip_target_branches": [], 17 | "always_require_ci_on_changed": [] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /.buildkite/scripts/run_ci_step.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | RUBY_VERSION="$(cat .ruby-version)" 6 | JAVA_VERSION="$(cat .java-version)" 7 | 8 | export RUBY_VERSION 9 | export JAVA_VERSION 10 | 11 | case $1 in 12 | lint) 13 | echo "---- running linter" 14 | make install-gems lint 15 | ;; 16 | 17 | test) 18 | echo "---- running tests" 19 | make install test 20 | ;; 21 | 22 | *) 23 | echo "Usage: run_command {docker|lint}" 24 | exit 2 25 | ;; 26 | esac 27 | -------------------------------------------------------------------------------- /.buildkite/scripts/run_command.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euxo pipefail 4 | 5 | COMMAND_TO_RUN=${1:-} 6 | 7 | if [[ "${COMMAND_TO_RUN:-}" == "" ]]; then 8 | echo "Usage: run_command.sh {lint|docker}" 9 | exit 2 10 | fi 11 | 12 | function realpath { 13 | echo "$(cd "$(dirname "$1")"; pwd)"/"$(basename "$1")"; 14 | } 15 | 16 | SCRIPT_WORKING_DIR=$(realpath "$(dirname "$0")") 17 | BUILDKITE_DIR=$(realpath "$(dirname "$SCRIPT_WORKING_DIR")") 18 | PROJECT_ROOT=$(realpath "$(dirname "$BUILDKITE_DIR")") 19 | 20 | DOCKER_IMAGE="crawler-ci" 21 | SCRIPT_CMD="/ci/.buildkite/scripts/run_ci_step.sh" 22 | 23 | if [[ "${COMMAND_TO_RUN:-}" == "docker" ]]; then 24 | echo "---- running docker build" 25 | make build-docker-ci 26 | else 27 | docker run --interactive --rm \ 28 | 
--sig-proxy=true --init \ 29 | --user "root" \ 30 | --volume "$PROJECT_ROOT:/ci" \ 31 | --workdir /ci \ 32 | --env HOME=/ci \ 33 | --env CI \ 34 | --env GIT_REVISION="${BUILDKITE_COMMIT-}" \ 35 | --env BUILD_ID="${BUILDKITE_BUILD_NUMBER-}" \ 36 | --entrypoint "${SCRIPT_CMD}" \ 37 | "$DOCKER_IMAGE" \ 38 | "$COMMAND_TO_RUN" 39 | fi 40 | -------------------------------------------------------------------------------- /.bundler-version: -------------------------------------------------------------------------------- 1 | 2.6.6 2 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Crawler Dev Container", 3 | "image": "jruby:9.4.12.0-jdk21", // Same image as Dockerfile 4 | "postCreateCommand": "IS_DOCKER=true make install", 5 | "features": { 6 | "ghcr.io/devcontainers/features/git:1": { 7 | "version": "latest" 8 | }, 9 | "ghcr.io/devcontainers/features/docker-in-docker:2.12.2": { 10 | "version": "latest", 11 | "dockerSocketBindMount": true 12 | } 13 | } 14 | } -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # These owners will be the default owners for everything in 2 | # the repo. 3 | * @elastic/search-extract-and-transform -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve. 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Bug Description 11 | 12 | 13 | ### To Reproduce 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | ## Expected behavior 21 | 22 | 23 | ## Screenshots 24 | 26 | 27 | ## Environment 28 | 29 | 30 | 31 | - OS: [e.g. iOS] 32 | - Browser [e.g. chrome, safari] 33 | - Version [e.g. 22] 34 | 35 | 36 | ## Additional context 37 | 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Question or Discussion 4 | url: https://discuss.elastic.co/c/search/ 5 | about: Please ask and answer questions here. 6 | - name: Security Vulnerability 7 | url: https://www.elastic.co/community/security 8 | about: DO NOT file issues related to security. Instead, please follow our security policy here. 
9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/enhancement.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Enhancement 3 | about: It's not a bug, but some desired feature is missing 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ### Problem Description 11 | 14 | 15 | ### Proposed Solution 16 | 18 | 19 | 20 | ### Alternatives 21 | 23 | 24 | ### Additional Context 25 | 26 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### Closes https://github.com/elastic/crawler/issues/### 2 | 3 | 9 | 10 | ### Checklists 11 | 12 | 14 | 15 | #### Pre-Review Checklist 16 | - [ ] This PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `crawler.yml.example` and `elasticsearch.yml.example`) 17 | - [ ] This PR has a meaningful title 18 | - [ ] This PR links to all relevant GitHub issues that it fixes or partially addresses 19 | - If there is no GitHub issue, please create it. Each PR should have a link to an issue 20 | - [ ] this PR has a thorough description 21 | - [ ] Covered the changes with automated tests 22 | - [ ] Tested the changes locally 23 | - [ ] Added a label for each target release version (example: `v0.1.0`) 24 | - [ ] Considered corresponding documentation changes 25 | - [ ] Contributed any configuration settings changes to the configuration reference 26 | - [ ] Ran `make notice` if any dependencies have been added 27 | 28 | #### Changes Requiring Extra Attention 29 | 30 | 33 | 34 | - [ ] Security-related changes (encryption, TLS, SSRF, etc) 35 | - [ ] New external service dependencies added. 
36 | 37 | ### Related Pull Requests 38 | 39 | 42 | 43 | ### Release Note 44 | 45 | 49 | -------------------------------------------------------------------------------- /.github/workflows/add-labels-main.yml: -------------------------------------------------------------------------------- 1 | name: Force backport labels for main 2 | 3 | on: 4 | pull_request_target: 5 | branches: 6 | - main 7 | types: 8 | - opened 9 | 10 | jobs: 11 | add_labels: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 15 | - id: version 16 | uses: juliangruber/read-file-action@386973d5b59f826915775874c7d1f82c4bbcfb07 17 | with: 18 | path: ./product_version 19 | - uses: actions-ecosystem/action-add-labels@18f1af5e3544586314bbe15c0273249c770b2daf # v1 20 | with: 21 | labels: | 22 | auto-backport 23 | v${{ steps.version.outputs.content }} 24 | -------------------------------------------------------------------------------- /.github/workflows/backport.yml: -------------------------------------------------------------------------------- 1 | name: Backport PR 2 | 3 | on: 4 | pull_request_target: 5 | branches: 6 | - main 7 | types: 8 | - labeled 9 | - closed 10 | 11 | jobs: 12 | backport: 13 | if: | 14 | github.event.pull_request.merged == true 15 | && contains(github.event.pull_request.labels.*.name, 'auto-backport') 16 | && ( 17 | (github.event.action == 'labeled' && github.event.label.name == 'auto-backport') 18 | || (github.event.action == 'closed') 19 | ) 20 | runs-on: ubuntu-latest 21 | steps: 22 | - name: Checkout Actions 23 | uses: actions/checkout@ee0669bd1cc54295c223e0bb666b733df41de1c5 # v2 24 | with: 25 | repository: 'swiftype/kibana-github-actions' 26 | ref: main 27 | path: ./actions 28 | 29 | - name: Install Actions 30 | run: npm install --production --prefix ./actions 31 | 32 | - name: Run Backport 33 | uses: ./actions/backport 34 | with: 35 | github_token: ${{ secrets.GITHUB_TOKEN }} 36 | approver_token: ${{ secrets.REPO_SCOPED_TOKEN }} 37 | auto_approve: 'true' 38 | commit_user: elastic 39 | commit_email: ent-search-backport@users.noreply.github.com 40 | auto_merge: 'true' 41 | auto_merge_method: 'squash' 42 | manual_backport_command_template: 'backport --pr %pullNumber% --autoMerge --autoMergeMethod squash' 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # bundler state 2 | /.bundle 3 | /vendor/bundle/ 4 | /vendor/ruby/ 5 | /vendor/filebeat/ 6 | /vendor/metricbeat/ 7 | /vendor/jruby/ 8 | 9 | # homebrew state 10 | .Brewfile.cached 11 | Brewfile.lock.json 12 | 13 | # Mac finder artifacts 14 | .DS_Store 15 | 16 | # Rubymine project files 17 | /.idea/ 18 | /.run/ 19 | *.iml 20 | 21 | # VSCode Workspace files 22 | /.vscode/ 23 | 24 | # Silver surfer ignore file 25 | .agignore 26 | 27 | # Sublime project files 28 | /*.sublime-project 29 | /*.sublime-workspace 30 | 31 | # vim artifacts 32 | *.swp 33 | *.swo 34 | *.un~ 35 | 36 | ngrok 37 | 38 | /data 39 | /.rvmrc 40 | /.envrc 41 | /.rbenv-vars 42 | /.irbrc 43 | /public/blog 44 | 45 | # honeypot dump 46 | dump/ 47 | 48 | # Emacs restclient files 49 | *.http 50 | 51 | # Codekit settings 52 | *.codekit 53 | 54 | # Static asset build system 55 | .tmp/ 56 | .m2/ 57 | .local/ 58 | .cache/ 59 | 60 | # Ignore example file outputs 61 | output 62 | 63 | # Config files 64 | config/*.yml 65 | 66 | # log files 67 | /logs 68 | 69 | # default File sink folder 70 | /crawled_docs 
71 | 72 | # Code coverage 73 | coverage 74 | 75 | # Downloaded license files 76 | script/licenses/**/_downloaded_*-LICENSE.txt 77 | 78 | # Buildkite 79 | bin/container-structure-test 80 | .artifacts 81 | .buildkite/publish/container-structure-test.yaml 82 | -------------------------------------------------------------------------------- /.java-version: -------------------------------------------------------------------------------- 1 | 21 2 | -------------------------------------------------------------------------------- /.jrubyrc: -------------------------------------------------------------------------------- 1 | # Make it possible to use timeout regexp matching 2 | regexp.interruptible=true 3 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --profile 4 | -r spec_helper 5 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | require: 2 | - rubocop-performance 3 | 4 | AllCops: 5 | TargetRubyVersion: 3.1 6 | NewCops: enable 7 | Exclude: 8 | - 'Jarfile' 9 | - 'vendor/**/*' 10 | - 'Gemfile' 11 | - 'bin/**/*' 12 | - 'spec/support/faux/**/*' 13 | 14 | Style/Documentation: 15 | Enabled: false 16 | 17 | Metrics/MethodLength: 18 | CountAsOne: ['array', 'hash', 'heredoc', 'method_call'] 19 | Max: 15 20 | 21 | Metrics/ClassLength: 22 | Max: 200 23 | 24 | Metrics/ModuleLength: 25 | Max: 200 26 | 27 | Metrics/AbcSize: 28 | Max: 20 29 | 30 | # Disable for specs 31 | Metrics/BlockLength: 32 | Exclude: 33 | - 'spec/**/*' 34 | -------------------------------------------------------------------------------- /.ruby-version: -------------------------------------------------------------------------------- 1 | jruby-9.4.12.0 2 | -------------------------------------------------------------------------------- /Brewfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Local dev utils 4 | brew 'rbenv' 5 | brew 'jenv' 6 | brew 'icu4c' 7 | 8 | # Stack services 9 | cask 'homebrew/cask-versions/temurin11' 10 | 11 | # For testing SSL locally 12 | brew 'mkcert' 13 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM jruby:9.4.12.0-jdk21@sha256:5641622b488d298362b96fdaea0f328248ce55962e68e224118be11ddb48d16e 2 | RUN apt-get update && \ 3 | apt-get install -y --no-install-recommends \ 4 | libicu-dev netbase make 5 | 6 | # used for skipping jenv/rbenv setup 7 | ENV IS_DOCKER=1 8 | 9 | # Set up crawlergroup and crawleruser 10 | RUN groupadd -g 451 crawlergroup && \ 11 | useradd -m -u 451 -g crawlergroup crawleruser 12 | 13 | # Copy and set up Crawler as crawleruser 14 | USER crawleruser 15 | COPY --chown=crawleruser:crawlergroup --chmod=775 . 
/home/app 16 | WORKDIR /home/app 17 | RUN make clean install 18 | 19 | # Clean up build dependencies 20 | RUN rm -r /home/crawleruser/.m2 21 | 22 | ENTRYPOINT [ "/bin/bash" ] 23 | -------------------------------------------------------------------------------- /Jarfile: -------------------------------------------------------------------------------- 1 | # This file is used to control our Jar dependencies and is used with jar-dependencies to vendor 2 | # our java dependencies into vendor/jars (see https://github.com/mkristian/jar-dependencies for details) 3 | # 4 | # If you update this file, please run the following command to update the jars cache: 5 | # make clean install 6 | # 7 | # When adding a new dependency, please explain what it is and why we're adding it in a comment. 8 | 9 | # Functionality common to any web crawler 10 | jar 'com.github.crawler-commons:crawler-commons', '1.2' 11 | 12 | # Pinned dependency of crawler-commons to resolve vulnerability (updated to 2.16.1 for commons-compress compatibility) 13 | jar 'commons-io:commons-io', '2.16.1' 14 | 15 | # Apache HTTP client used for requests to websites 16 | jar 'org.apache.httpcomponents.client5:httpclient5', '5.1' 17 | 18 | # For managing Brotli input streams 19 | jar 'org.apache.commons:commons-compress', '1.27.1' 20 | jar 'org.brotli:dec', '0.1.2' 21 | 22 | # Cleaner Java logs handling 23 | jar 'org.slf4j:slf4j-nop', '1.7.26' 24 | -------------------------------------------------------------------------------- /Jars.lock: -------------------------------------------------------------------------------- 1 | isorelax:isorelax:20030108:compile: 2 | org.nokogiri:nekodtd:0.1.11.noko2:compile: 3 | net.sourceforge.htmlunit:neko-htmlunit:2.63.0:compile: 4 | nu.validator:jing:20200702VNU:compile: 5 | net.sf.saxon:Saxon-HE:9.6.0-4:compile: 6 | xalan:serializer:2.7.3:compile: 7 | xalan:xalan:2.7.3:compile: 8 | xerces:xercesImpl:2.12.2:compile: 9 | xml-apis:xml-apis:1.4.01:compile: 10 | com.github.crawler-commons:crawler-commons:1.2:compile: 11 | org.slf4j:slf4j-api:1.7.7:compile: 12 | commons-io:commons-io:2.16.1:compile: 13 | org.apache.httpcomponents.client5:httpclient5:5.1:compile: 14 | org.apache.httpcomponents.core5:httpcore5:5.1.1:compile: 15 | org.apache.httpcomponents.core5:httpcore5-h2:5.1.1:compile: 16 | commons-codec:commons-codec:1.15:compile: 17 | org.apache.commons:commons-compress:1.27.1:compile: 18 | org.apache.commons:commons-lang3:3.16.0:compile: 19 | org.brotli:dec:0.1.2:compile: 20 | org.slf4j:slf4j-nop:1.7.26:compile: 21 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .phony: test lint autocorrect install install-ci install-gems install-jars clean notice build-docker-ci list-gems list-jars 2 | 3 | test: 4 | script/rspec $(file) 5 | 6 | lint: 7 | rubocop 8 | 9 | autocorrect: 10 | rubocop --autocorrect 11 | 12 | install: 13 | script/environment 14 | make install-gems 15 | make install-jars 16 | 17 | install-ci: 18 | make install-gems 19 | make install-jars 20 | 21 | install-gems: 22 | script/bundle install 23 | 24 | install-jars: 25 | script/bundle exec script/vendor_jars 26 | 27 | clean: 28 | rm -rf Jars.lock vendor/jars 29 | 30 | notice: 31 | script/licenses/generate_notice.rb 32 | 33 | build-docker-ci: 34 | docker build -t crawler-ci . 
35 | 36 | list-gems: 37 | script/bundle exec gem dependency 38 | 39 | list-jars: 40 | script/bundle exec lock_jars --tree 41 | -------------------------------------------------------------------------------- /bin/crawler: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # 4 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 5 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 6 | # you may not use this file except in compliance with the Elastic License 2.0. 7 | # 8 | 9 | # frozen_string_literal: true 10 | 11 | require "bundler/setup" 12 | require 'dry/cli' 13 | 14 | # Standard libraries 15 | require 'getoptlong' 16 | require 'yaml' 17 | 18 | require_relative File.expand_path('../../lib/environment', __FILE__) 19 | require 'crawler/cli' 20 | 21 | java_import 'java.lang.System' 22 | 23 | # These opts are to prevent an intermittent `bad_record_mac` error when indexing into ES. 24 | # One workaround is to force use of SSLv3 only. 25 | System.setProperty('force.http.jre.executor', 'true') 26 | System.setProperty('https.protocols', 'SSLv3') 27 | 28 | Dry::CLI.new(Crawler::CLI).call 29 | -------------------------------------------------------------------------------- /config/README.md: -------------------------------------------------------------------------------- 1 | # Config 2 | 3 | See [CONFIG.md](../docs/CONFIG.md) for information on how to configure crawl jobs. 4 | -------------------------------------------------------------------------------- /config/examples/parks-australia.yml: -------------------------------------------------------------------------------- 1 | # This is a sample config file for crawling the parksaustralia.gov.au website writing output to an ES index 2 | # 3 | # The configuration options in this example are not exhaustive. To see all possible configuration options, 4 | # reference the config templates: 5 | # - config/crawler.yml.example 6 | # - config/elasticsearch.yml.example 7 | 8 | # Domains allowed for the crawl 9 | domains: 10 | - url: https://parksaustralia.gov.au 11 | seed_urls: 12 | - https://parksaustralia.gov.au 13 | - https://parksaustralia.gov.au/news/ 14 | 15 | # Where to send the results. Possible values are console, file, or elasticsearch 16 | output_sink: elasticsearch 17 | 18 | # Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch 19 | output_index: parks-australia 20 | 21 | # Crawl tuning 22 | max_crawl_depth: 2 23 | 24 | # Crawl result field size limits 25 | max_title_size: 500 26 | max_body_size: 5_242_880 # 5 megabytes 27 | max_keywords_size: 512 28 | max_description_size: 512 29 | max_indexed_links_count: 5 30 | max_headings_count: 5 31 | 32 | elasticsearch: 33 | host: http://localhost 34 | port: 9200 35 | username: elastic 36 | password: changeme 37 | bulk_api: 38 | max_items: 10 39 | max_size_bytes: 1_048_576 40 | -------------------------------------------------------------------------------- /config/examples/simple.yml: -------------------------------------------------------------------------------- 1 | # This is an example config using the bare minimum configuration options possible. 
2 | # 3 | # To see all possible configuration options, reference the config templates: 4 | # - config/crawler.yml.example 5 | # - config/elasticsearch.yml.example 6 | 7 | domains: 8 | - url: https://example.com 9 | 10 | output_sink: elasticsearch 11 | output_index: my-index 12 | 13 | elasticsearch: 14 | host: http://localhost 15 | # host: http://host.docker.internal # use this host instead if Elasticsearch is running on Docker on the same machine 16 | port: 9200 17 | username: elastic 18 | password: changeme 19 | -------------------------------------------------------------------------------- /config/filebeat.yml.example: -------------------------------------------------------------------------------- 1 | filebeat.inputs: 2 | - type: filestream 3 | id: crawler-events-filestream 4 | enabled: true 5 | paths: 6 | - "/path/to/opencrawler/crawler_event.log" 7 | fields: 8 | type: "event" 9 | processors: 10 | - decode_json_fields: 11 | fields: ["message"] 12 | target: "" 13 | overwrite_keys: true 14 | expand_keys: true 15 | 16 | - type: filestream 17 | id: crawler-system-log-filestream 18 | enabled: true 19 | paths: 20 | - "/path/to/opencrawler/crawler_system.log" 21 | fields: 22 | type: "system" 23 | processors: 24 | - dissect: 25 | tokenizer: "[%{@timestamp}] [crawl:%{crawl_id}] [%{crawl_stage}] %{message}" 26 | target_prefix: "" 27 | overwrite_keys: true 28 | trim_values: all 29 | 30 | setup.template.enabled: true 31 | setup.template.name: "filebeat" 32 | setup.template.pattern: "filebeat" 33 | 34 | setup.template.settings: 35 | index.number_of_shards: 1 36 | index.number_of_replicas: 1 37 | 38 | logging.level: debug 39 | 40 | output.elasticsearch: 41 | hosts: [""] 42 | api_key: "id:api_key" 43 | index: "logs-crawler-%{[fields.type]}" # see https://www.elastic.co/guide/en/fleet/8.17/data-streams.html#data-streams-naming-scheme 44 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | crawler: 3 | image: docker.elastic.co/integrations/crawler:${CRAWLER_VERSION:-latest} 4 | container_name: crawler 5 | volumes: 6 | - ./config:/home/app/config 7 | # - ./logs:/home/app/logs # Enable this to access log files outside the Docker container 8 | networks: 9 | - elastic 10 | stdin_open: true # Equivalent to -i 11 | tty: true # Required for interactive mode 12 | # Uncomment enviroment variable if running on MacOS M4 and experiencing Java Runtime errors 13 | # environment: 14 | # - "_JAVA_OPTIONS=-XX:UseSVE=0" 15 | 16 | networks: 17 | elastic: 18 | driver: bridge 19 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Open Crawler Changelog 2 | 3 | ## Legend 4 | 5 | - 🚀 Feature 6 | - 🐛 Bugfix 7 | - 🔨 Refactor 8 | 9 | ## `v0.2.0` 10 | 11 | - 🚀 Crawl jobs can now be scheduled using the CLI command `bin/crawler schedule`. See [CLI.md](./CLI.md#crawler-schedule). 12 | - 🚀 Crawler can now extract binary content from files. See [BINARY_CONTENT_EXTRACTION.md](./features/BINARY_CONTENT_EXTRACTION.md). 13 | - 🚀 Crawler will now purge outdated documents from the index at the end of the crawl. This is enabled by default. You can disable this by adding `purge_docs_enabled: false` to the crawler's yaml config file. 14 | - 🚀 Crawl rules can now be configured, allowing specified URLs to be allowed/denied. 
See [CRAWL_RULES.md](./features/CRAWL_RULES.md). 15 | - 🚀 Extraction rules using CSS, XPath, and URL selectors can now be applied to crawls. See [EXTRACTION_RULES.md](./features/EXTRACTION_RULES.md). 16 | - 🔨 The configuration field `content_extraction_enabled` is now `binary_content_extraction_enabled`. 17 | - 🔨 The configuration field `content_extraction_mime_types` is now `binary_content_extraction_mime_types`. 18 | - 🔨 The Elasticsearch document field `body_content` is now `body`. 19 | - 🔨 The format for config files has changed, so existing crawler configurations will not work. The new format can be referenced in the [crawler.yml.example](../config/crawler.yml.example) file. 20 | -------------------------------------------------------------------------------- /docs/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 303 See Other 2 | 3 | Location: https://www.elastic.co/community/codeofconduct 4 | -------------------------------------------------------------------------------- /docs/RELEASING.md: -------------------------------------------------------------------------------- 1 | # Releasing 2 | 3 | This doc is a reference for Elastic employees. 4 | Non-Elastic users can not publish a release. 5 | 6 | The version scheme we use is **MAJOR.MINOR.PATCH** and stored in the [product_version](../product_version) file at the root of this repository. 7 | Open Crawler follows its own release versioning and does not follow the Elastic stack unified release schedule or versioning. 8 | 9 | ## How to publish a Docker image 10 | 11 | Releasing is done entirely through Buildkite. 12 | The Open Crawler build job is named `crawler-docker-build-publish`. 13 | 14 | Build steps in buildkite: 15 | 16 | 1. Go to the [Buildkite job for publishing Crawler](https://buildkite.com/elastic/crawler-docker-build-publish) 17 | 2. Click `New Build` 18 | 3. Enter a message (e.g. `x.y release`) 19 | 4. Choose a version branch with the pattern `x.y` 20 | - Building from `main` is possible, but it will yield a _snapshot_ build. 21 | - For example, if the version inside the `product_version` file is `0.3.0`, then a build triggered from `main` will result in `0.3.0-SNAPSHOT` images. 22 | 5. Choose a commit 23 | - The default `HEAD` is usually fine 24 | 6. Click `Create Build` 25 | 7. Wait a minute for the Buildkite configuration to be loaded 26 | - When it has loaded, a `Build Information` button will appear 27 | 8. Select whether or not the build should have a `:latest` tag 28 | 9. Wait for the build to finish 29 | 30 | Creating a release in GitHub 31 | 32 | 1. Go to https://github.com/elastic/crawler/releases 33 | 2. Click `Draft new release` 34 | 3. Create a tag for this release, following the pattern `v{major}.{minor}.{patch}` 35 | 4. Choose the target branch, this should match the `{major}.{minor}` of the tag 36 | 5. Click `Generate release notes`, this should autofill all changes 37 | 6. If this is the latest release, make sure `Set as latest release` is selected 38 | 7. Click `Publish release` 39 | -------------------------------------------------------------------------------- /docs/SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | Thanks for your interest in the security of our products. Our security policy can be found at [https://www.elastic.co/community/security](https://www.elastic.co/community/security). 
4 | 5 | ## Reporting a Vulnerability 6 | Please send security vulnerability reports to security@elastic.co. 7 | -------------------------------------------------------------------------------- /docs/SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Getting Support 2 | 3 | ### Official Support Services 4 | If you have an Elastic subscription, you are entitled to Support services. See our welcome page for [working with our support team](https://www.elastic.co/support/welcome). 5 | 6 | ### Where do I report issues with Elastic Crawler? 7 | If something is not working as expected, please open an [issue](https://github.com/elastic/crawler/issues/new). 8 | 9 | ### Where else can I go to get help? 10 | The Ingestion team at Elastic maintains this repository and is happy to help. Try posting your question to the [Elastic discuss forums](https://discuss.elastic.co/c/search/84). 11 | Be sure to mention that you're using Elastic Crawler, let us know what you're trying to do, and include any errors/issues you are encountering. 12 | You can also find us in the `#search-enterprise` channel of the [Elastic Community Slack](http://elasticstack.slack.com). 13 | -------------------------------------------------------------------------------- /docs/features/BINARY_CONTENT_EXTRACTION.md: -------------------------------------------------------------------------------- 1 | # Binary Content Extraction 2 | 3 | The Elastic Open Web Crawler can extract content from downloadable binary files, such as PDF and DOCX files. 4 | Binary content is extracted by converting file contents to base64 and including the output in a document to index. 5 | This value is picked up by an [Elasticsearch ingest pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html) that will convert the base64 content into plain text, to store in the `body` field of the same document. 6 | 7 | ## Using this feature 8 | 9 | 1. Enable ingest pipelines in the Elasticsearch configuration 10 | 2. Enable binary content extraction in the crawler configuration 11 | 3. Select which MIME types should have their contents extracted 12 | - The MIME type is determined by the HTTP response’s `Content-Type` header when downloading a given file 13 | - While intended primarily for PDF and Microsoft Office formats, you can use any of the formats supported by [Apache Tika](https://tika.apache.org/) 14 | - No default MIME types are defined, so at least one MIME type must be configured in order to extract non-HTML content 15 | - The ingest attachment processor does not support compressed files, e.g., an archive file containing a set of PDFs 16 | 17 | For example, the following configuration allows for the binary content extraction of PDF and DOCX files, through the default pipeline `ent-search-generic-ingestion`: 18 | 19 | ```yaml 20 | binary_content_extraction_enabled: true 21 | binary_content_extraction_mime_types: 22 | - application/pdf 23 | - application/msword 24 | 25 | elasticsearch: 26 | pipeline: ent-search-generic-ingestion 27 | pipeline_enabled: true 28 | ``` 29 | 30 | Read more on ingest pipelines in Open Crawler [here](./INGEST_PIPELINES.md).
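To make the encoding step concrete, here is a minimal, hypothetical Ruby sketch; the `_attachment` field name, the document fields, and the sample bytes are illustrative assumptions rather than the crawler's actual implementation:

```ruby
require 'base64'

# Illustrative sketch only (not the crawler's real code or field names):
# the downloaded file's raw bytes are base64-encoded and attached to the
# document, so an ingest pipeline with an attachment processor can later
# decode them into the plain text that ends up in the `body` field.
raw_bytes = '%PDF-1.7 ...binary content fetched by the crawler...'

doc = {
  'id'          => 'example-doc-1',
  'url'         => 'https://example.com/files/report.pdf',
  '_attachment' => Base64.strict_encode64(raw_bytes) # decoded by the pipeline
}

# The document would then be indexed using the configured pipeline,
# e.g. `pipeline: ent-search-generic-ingestion` as in the YAML above.
puts doc['_attachment']
```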
31 | -------------------------------------------------------------------------------- /docs/features/INGEST_PIPELINES.md: -------------------------------------------------------------------------------- 1 | # Ingest Pipelines 2 | 3 | Open Crawler uses an [Elasticsearch ingest pipeline](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest.html) to power several content extraction features. 4 | The default pipeline, `ent-search-generic-ingestion`, is automatically created when Elasticsearch first starts. 5 | This pipeline does some pre-processing on documents from Open Crawler before they are ingested into Elasticsearch. 6 | See [Ingest pipelines for Search indices](https://www.elastic.co/guide/en/elasticsearch/reference/current/ingest-pipeline-search.html) for more details on this pipeline. 7 | -------------------------------------------------------------------------------- /docs/features/SCHEDULING.md: -------------------------------------------------------------------------------- 1 | # Scheduling Recurring Crawl Jobs 2 | 3 | Crawl jobs can be scheduled to recur. 4 | Scheduled crawl jobs run until terminated by the user. 5 | 6 | These schedules are defined through standard cron expressions. 7 | You can use the tool https://crontab.guru to test different cron expressions. 8 | 9 | For example, to schedule a crawl job that will execute once every 30 minutes, create a configuration file called `scheduled-crawl.yml` with the following contents: 10 | 11 | ```yaml 12 | domains: 13 | - url: "https://example.com" 14 | schedule: 15 | pattern: "*/30 * * * *" # run every 30th minute 16 | ``` 17 | 18 | Then, use the CLI to begin the crawl job schedule: 19 | 20 | ```bash 21 | docker run \ 22 | -v ./scheduled-crawl.yml:/scheduled-crawl.yml \ 23 | -it docker.elastic.co/integrations/crawler:latest jruby bin/crawler schedule /scheduled-crawl.yml 24 | ``` 25 | 26 | **Scheduled crawl jobs from a single execution will not overlap.** 27 | 28 | Scheduled jobs will also not wait for existing jobs to complete. 29 | That means if a crawl job is already in progress when another schedule is triggered, the new job will be dropped. 30 | For example, if you have a schedule that triggers every hour, but your crawl job takes 1.5 hours to complete, the crawl schedule will effectively trigger every second hour. 31 | 32 | **Executing multiple crawl schedules _can_ cause overlap.** 33 | 34 | Be wary of executing multiple schedules against the same index. 35 | As with ad-hoc triggered crawl jobs, two crawlers simultaneously interacting with a single index can lead to data loss. 36 | -------------------------------------------------------------------------------- /lib/constants.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0.
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Constants 10 | # Field names used in every crawl result when creating an ES doc 11 | RESERVED_FIELD_NAMES = %w[ 12 | id 13 | any 14 | all 15 | none 16 | or 17 | and 18 | not 19 | additional_urls 20 | body_content 21 | body 22 | domains 23 | headings 24 | last_crawled_at 25 | links 26 | meta_description 27 | meta_keywords 28 | title 29 | url 30 | url_host 31 | url_path 32 | url_path_dir1 33 | url_path_dir2 34 | url_path_dir3 35 | url_port 36 | url_scheme 37 | ].freeze 38 | end 39 | -------------------------------------------------------------------------------- /lib/crawler.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | # Current version of the crawler 11 | def self.version 12 | @version ||= File.read(File.join(__dir__, '../product_version')).strip 13 | end 14 | 15 | # A unique identifier of the crawler process 16 | def self.service_id 17 | @service_id ||= BSON::ObjectId.new.to_s 18 | end 19 | end 20 | 21 | # Load other parts of the crawler 22 | # Ignore Crawler CLI 23 | files = Dir[File.join(__dir__, 'crawler/**/*.rb')].reject do |file| 24 | file.include?('/crawler/cli/') 25 | end 26 | 27 | files.each { |f| require_dependency(f) } 28 | -------------------------------------------------------------------------------- /lib/crawler/cli.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # load CLI dependencies 10 | Dir[File.join(__dir__, 'cli/**/*.rb')].each { |f| require(f) } 11 | 12 | module Crawler 13 | module CLI 14 | extend Dry::CLI::Registry 15 | 16 | register 'version', Crawler::CLI::Version, aliases: ['v', '--version'] 17 | register 'crawl', Crawler::CLI::Crawl, aliases: %w[r run] 18 | register 'schedule', Crawler::CLI::Schedule 19 | register 'validate', Crawler::CLI::Validate 20 | register 'urltest', Crawler::CLI::UrlTest 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/crawler/cli/crawl.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | 12 | module Crawler 13 | module CLI 14 | class Crawl < Dry::CLI::Command 15 | desc 'Run a crawl of the site' 16 | 17 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 18 | 19 | option :es_config, desc: 'Path to elasticsearch config file' 20 | 21 | def call(crawl_config:, es_config: nil, **) 22 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 23 | crawl = Crawler::API::Crawl.new(crawl_config) 24 | 25 | crawl.start! 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/crawler/cli/schedule.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | require 'rufus-scheduler' 12 | 13 | module Crawler 14 | module CLI 15 | class Schedule < Dry::CLI::Command 16 | desc 'Schedule a recurring crawl of the site' 17 | 18 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 19 | 20 | option :es_config, desc: 'Path to elasticsearch config file' 21 | 22 | def call(crawl_config:, es_config: nil, **) 23 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 24 | if crawl_config.schedule.nil? || crawl_config.schedule[:pattern].nil? 25 | raise ArgumentError, 'No schedule found in config file' 26 | end 27 | 28 | run_schedule(crawl_config.schedule[:pattern], crawl_config) 29 | end 30 | 31 | def run_schedule(pattern, crawl_config) 32 | crawl_config.system_logger.info("Crawler initialized with a cron schedule of #{pattern}") 33 | 34 | # Schedule a recurrent task based on the config value `schedule.pattern`. 35 | # The arg `overlap: false` prevents multiple tasks from spawning when a crawl 36 | # task is longer than the schedule pattern. 37 | # This will run until the Crawler is terminated. 38 | scheduler = Rufus::Scheduler.new 39 | scheduler.cron(pattern, overlap: false) do |job| 40 | crawl_config.system_logger.info( 41 | "Beginning scheduled crawl for #{job.previous_time} (actual trigger time: #{Time.now})." 42 | ) 43 | crawl = Crawler::API::Crawl.new(crawl_config) 44 | crawl.start! 45 | crawl_config.system_logger.info( 46 | "Scheduled crawl ended at #{Time.now}. Next scheduled crawl should trigger around #{job.next_time}." 47 | ) 48 | end 49 | scheduler.join 50 | end 51 | end 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /lib/crawler/cli/urltest.rb: -------------------------------------------------------------------------------- 1 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 2 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 3 | # you may not use this file except in compliance with the Elastic License 2.0. 
4 | # 5 | 6 | # frozen_string_literal: true 7 | 8 | require 'dry/cli' 9 | require 'yaml' 10 | 11 | module Crawler 12 | module CLI 13 | class UrlTest < Dry::CLI::Command 14 | desc 'Test a URL against a configuration' 15 | 16 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 17 | 18 | argument :endpoint, required: true, desc: 'Endpoint to test' 19 | 20 | option :es_config, desc: 'Path to elasticsearch config file' 21 | 22 | def call(crawl_config:, endpoint:, es_config: nil, **) 23 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 24 | crawl = Crawler::API::Crawl.new(crawl_config) 25 | 26 | crawl.start_url_test!(endpoint) 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/cli/validate.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | require 'yaml' 11 | 12 | module Crawler 13 | module CLI 14 | class Validate < Dry::CLI::Command 15 | desc 'Validate crawler configuration' 16 | 17 | argument :crawl_config, required: true, desc: 'Path to crawl config file' 18 | 19 | def call(crawl_config:, es_config: nil, **) 20 | crawl_config = Crawler::CLI::Helpers.load_crawl_config(crawl_config, es_config) 21 | 22 | crawl_config.domain_allowlist.each do |domain| 23 | validator = Crawler::UrlValidator.new( 24 | url: domain.raw_url, 25 | crawl_config: 26 | ) 27 | 28 | print_validation_result(domain, validator) 29 | end 30 | end 31 | 32 | private 33 | 34 | def print_validation_result(domain, validator) 35 | if validator.valid? 36 | puts "Domain #{domain.raw_url} is valid" 37 | else 38 | puts "Domain #{domain.raw_url} is invalid:" 39 | puts validator.failed_checks.map(&:comment).join("\n") 40 | end 41 | rescue Crawler::UrlValidator::Error => e 42 | puts "Error validating domain #{domain.raw_url}: #{e}" 43 | end 44 | end 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /lib/crawler/cli/version.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'dry/cli' 10 | 11 | module Crawler 12 | module CLI 13 | class Version < Dry::CLI::Command 14 | VERSION_PATH = File.expand_path('../../../product_version', __dir__).freeze 15 | 16 | desc 'Print version' 17 | 18 | def call(*) 19 | puts File.read(VERSION_PATH).strip 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/crawler/content_engine/transformer.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
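For reference, the `Validate` command above can also be mirrored programmatically, which is convenient in specs or one-off scripts. This sketch relies only on the API the command itself uses (`load_crawl_config`, `UrlValidator#valid?`, `#failed_checks`); the config path is hypothetical.

```ruby
# Hypothetical config path; substitute your own crawl config file.
config = Crawler::CLI::Helpers.load_crawl_config('config/my-crawl.yml', nil)

config.domain_allowlist.each do |domain|
  validator = Crawler::UrlValidator.new(url: domain.raw_url, crawl_config: config)

  if validator.valid?
    puts "#{domain.raw_url} OK"
  else
    validator.failed_checks.each { |check| puts "#{domain.raw_url}: #{check.comment}" }
  end
end
```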
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module ContentEngine 11 | module Transformer 12 | INCLUDE_ATTR = 'data-elastic-include' 13 | EXCLUDE_ATTR = 'data-elastic-exclude' 14 | EXCLUDE_ATTR_SELECTOR = "[#{EXCLUDE_ATTR}]".freeze 15 | 16 | def self.transform(doc) 17 | transform!(doc.dup) 18 | end 19 | 20 | def self.transform!(doc) 21 | loop do 22 | node = doc.has_attribute?(EXCLUDE_ATTR) ? doc : doc.at_css(EXCLUDE_ATTR_SELECTOR) 23 | break unless node 24 | 25 | traverse!(node, mode: :exclude) 26 | end 27 | 28 | doc 29 | end 30 | 31 | def self.traverse!(node, mode:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity 32 | # The exclusion attribute is used to determine what to traverse next in the parent loop, 33 | # so we should remove the attribute while traversing to avoid an infinite loop. 34 | node.remove_attribute(EXCLUDE_ATTR) if node.has_attribute?(EXCLUDE_ATTR) 35 | 36 | node.children.each do |child_node| 37 | if child_node.text? && mode == :exclude 38 | child_node.unlink 39 | elsif child_node.element? 40 | new_mode = 41 | if child_node.has_attribute?(INCLUDE_ATTR) 42 | :include 43 | elsif child_node.has_attribute?(EXCLUDE_ATTR) 44 | :exclude 45 | else 46 | mode # mode is unchanged 47 | end 48 | 49 | traverse!(child_node, mode: new_mode) 50 | end 51 | end 52 | end 53 | end 54 | end 55 | end 56 | -------------------------------------------------------------------------------- /lib/crawler/core_ext.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # 10 | # This file contains useful extensions for core classes 11 | # 12 | class Time 13 | # Returns the number of seconds since the system boot 14 | # 15 | # This method is useful for calculating elapsed time or difference between 16 | # two events without having to worry about daylight savings, leap seconds, etc. 17 | # 18 | def self.monotonic_now 19 | Process.clock_gettime(Process::CLOCK_MONOTONIC) 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/content_extractable_file.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
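The `Time.monotonic_now` extension above is the clock the crawler should use for durations, because the monotonic clock never jumps backwards the way wall-clock time can (NTP adjustments, DST, leap seconds). A minimal usage sketch, assuming `lib/crawler/core_ext.rb` has been loaded:

```ruby
started_at = Time.monotonic_now

sleep(0.25) # stand-in for real work, e.g. fetching and parsing a page

elapsed = Time.monotonic_now - started_at
puts format('took %.3f seconds', elapsed)
```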
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'digest' 10 | 11 | require_dependency(File.join(__dir__, 'success')) 12 | 13 | module Crawler 14 | module Data 15 | module CrawlResult 16 | class ContentExtractableFile < Success 17 | # Allow constructor to be called on concrete result classes 18 | public_class_method :new 19 | 20 | attr_reader :content_length, :content_type 21 | 22 | def initialize(status_code:, content_length:, content_type:, **kwargs) 23 | super(status_code:, **kwargs) 24 | 25 | @content_length = content_length 26 | @content_type = content_type 27 | end 28 | 29 | def content_hash 30 | @content_hash ||= Digest::SHA1.hexdigest(content) 31 | end 32 | 33 | def base64_encoded_content 34 | @base64_encoded_content ||= Base64.strict_encode64(content) 35 | end 36 | 37 | def file_name 38 | @file_name ||= File.basename(url) 39 | end 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class Error < Base 15 | # Fake status code to be used for unexpected internal errors 16 | MISCELLANEOUS_ERROR = 599 17 | 18 | attr_reader :error, :suggestion_message 19 | 20 | # INTERNAL_ERROR_STATUS is used by default for unexpected internal errors that 21 | # were not part of the HTTP response from a crawled web page. 22 | def initialize(error:, status_code: MISCELLANEOUS_ERROR, suggestion_message: nil, **kwargs) 23 | super(status_code:, **kwargs) 24 | @error = error 25 | @suggestion_message = suggestion_message 26 | end 27 | 28 | # Allow constructor to be called on concrete result classes 29 | public_class_method :new 30 | 31 | #--------------------------------------------------------------------------------------------- 32 | def to_h 33 | super.merge(error:) 34 | end 35 | 36 | def to_s 37 | "" 38 | end 39 | end 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/http_auth_disallowed_error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class HttpAuthDisallowedError < Error 15 | def initialize(error: nil, **kwargs) 16 | suggestion_message = <<~MSG 17 | Set `http_auth_allowed: true` if you want to 18 | allow authenticated crawling of non-HTTPS URLs. 
19 | MSG 20 | 21 | super( 22 | error: error || 'Authenticated crawling of non-HTTPS URLs is not allowed', 23 | suggestion_message:, 24 | **kwargs 25 | ) 26 | end 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/redirect.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class Redirect < Base 15 | VALID_STATUS_CODES = (300..399) 16 | 17 | attr_reader :redirect_chain, :location 18 | 19 | def initialize(status_code:, location:, redirect_chain:, **kwargs) 20 | super(status_code:, **kwargs) 21 | 22 | unless status_code.in?(VALID_STATUS_CODES) 23 | error = "Redirects have to have a 3xx response code, received #{status_code.inspect}" 24 | raise ArgumentError, error 25 | end 26 | 27 | raise ArgumentError, 'Location needs to be a Crawler URL object!' unless location.is_a?(Crawler::Data::URL) 28 | 29 | @location = location 30 | @redirect_chain = redirect_chain 31 | end 32 | 33 | # Allow constructor to be called on concrete result classes 34 | public_class_method :new 35 | 36 | def to_h 37 | super.merge( 38 | location:, 39 | redirect_chain: 40 | ) 41 | end 42 | 43 | def to_s 44 | "" # rubocop:disable Layout/LineLength 45 | end 46 | 47 | def original_url 48 | redirect_chain.first || url 49 | end 50 | 51 | def redirect_count 52 | redirect_chain.size + 1 53 | end 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/redirect_error.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class RedirectError < Error 15 | def initialize(**kwargs) 16 | suggestion = <<~LOG 17 | Check the URL content in your browser and make sure it is something 18 | the crawler could understand. 19 | LOG 20 | 21 | super(suggestion_message: suggestion, **kwargs) 22 | end 23 | end 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/robots_txt.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
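The `content_hash` and `base64_encoded_content` readers on `ContentExtractableFile` above are thin wrappers over the Ruby standard library. This standalone sketch shows what they produce for a small stand-in payload:

```ruby
require 'digest'
require 'base64'

content = 'example file bytes' # stand-in for a downloaded file body

Digest::SHA1.hexdigest(content)  # => a stable 40-character hex digest of the content
Base64.strict_encode64(content)  # => single-line Base64 with no trailing newline
```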
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'success')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class RobotsTxt < Success 15 | # Allow constructor to be called on concrete result classes 16 | public_class_method :new 17 | end 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/success.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'base')) 10 | 11 | # The base class for all successful responses 12 | module Crawler 13 | module Data 14 | module CrawlResult 15 | class Success < Base 16 | VALID_STATUS_CODES = (200..299) 17 | 18 | attr_reader :content 19 | 20 | def initialize(status_code:, content:, **kwargs) 21 | super(status_code:, **kwargs) 22 | 23 | unless status_code.in?(VALID_STATUS_CODES) 24 | error = "Successful responses have to have a 2xx response code, received #{status_code.inspect}" 25 | raise ArgumentError, error 26 | end 27 | 28 | @content = content 29 | end 30 | end 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/crawler/data/crawl_result/unsupported_content_type.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'error')) 10 | 11 | module Crawler 12 | module Data 13 | module CrawlResult 14 | class UnsupportedContentType < Error 15 | def initialize(content_type:, error: nil, **kwargs) 16 | suggestion = <<~MSG 17 | Check the URL content in your browser and make sure it is something 18 | the crawler could understand. 19 | MSG 20 | 21 | super( 22 | content_type:, 23 | error: error || "Unsupported content type: #{content_type}", 24 | suggestion_message: suggestion, 25 | **kwargs 26 | ) 27 | end 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/crawler/data/domain.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class Domain 12 | attr_reader :scheme, :host, :port 13 | 14 | def initialize(domain) 15 | @url = Crawler::Data::URL.parse(domain) 16 | @scheme = url.scheme 17 | @host = url.host 18 | @port = url.port || standard_port_for_scheme(url.scheme) 19 | end 20 | 21 | def raw_url 22 | url 23 | end 24 | 25 | def robots_txt_url 26 | url.join('/robots.txt') 27 | end 28 | 29 | def standard_port_for_scheme(scheme) 30 | case scheme 31 | when 'http' then 80 32 | when 'https' then 443 33 | end 34 | end 35 | 36 | def ==(other) 37 | to_s == other.to_s 38 | end 39 | 40 | def to_s 41 | "#{scheme}://#{host}:#{port}" 42 | end 43 | 44 | private 45 | 46 | attr_reader :url 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/crawler/data/extraction/ruleset.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency(File.join(__dir__, 'rule')) 10 | require_dependency(File.join(__dir__, 'url_filter')) 11 | require_dependency(File.join(__dir__, '..', '..', 'utils')) 12 | 13 | module Crawler 14 | module Data 15 | module Extraction 16 | class Ruleset 17 | def initialize(ruleset, domain) 18 | @ruleset = ruleset 19 | @domain = domain 20 | validate_ruleset 21 | 22 | # initialize these after validating they are arrays 23 | rules 24 | url_filters 25 | end 26 | 27 | def rules 28 | @rules ||= 29 | if @ruleset[:rules]&.any? 30 | @ruleset[:rules].map do |rule| 31 | Crawler::Data::Extraction::Rule.new(rule) 32 | end 33 | else 34 | [] 35 | end 36 | end 37 | 38 | def url_filters 39 | @url_filters ||= 40 | if @ruleset[:url_filters]&.any? 41 | @ruleset[:url_filters].map do |filter| 42 | Crawler::Data::Extraction::UrlFilter.new(filter) 43 | end 44 | else 45 | [] 46 | end 47 | end 48 | 49 | def url_filtering_rules 50 | @url_filtering_rules ||= url_filters.map do |filter| 51 | pattern = Regexp.new(Crawler::Utils.url_pattern(@domain, filter.type, filter.pattern)) 52 | Crawler::Data::Rule.new(Crawler::Data::Rule::ALLOW, url_pattern: pattern) 53 | end 54 | end 55 | 56 | private 57 | 58 | def validate_ruleset 59 | if !@ruleset[:rules].nil? && !@ruleset[:rules].is_a?(Array) 60 | raise ArgumentError, 'Extraction ruleset rules must be an array' 61 | end 62 | 63 | return unless !@ruleset[:url_filters].nil? && !@ruleset[:url_filters].is_a?(Array) 64 | 65 | raise ArgumentError, 'Extraction ruleset url_filters must be an array' 66 | end 67 | end 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /lib/crawler/data/extraction/url_filter.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
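A quick sketch of the `Domain` value object above: the port falls back to the scheme's standard port when the configured domain does not specify one, and `robots_txt_url` is resolved against the site root. The return values in the comments are what the code above implies and are shown for illustration only.

```ruby
domain = Crawler::Data::Domain.new('https://example.com')

domain.scheme          # => "https"
domain.port            # => 443 (standard port inferred for https)
domain.to_s            # => "https://example.com:443"
domain.robots_txt_url  # => URL object pointing at https://example.com/robots.txt

Crawler::Data::Domain.new('http://example.com:8080').to_s # => "http://example.com:8080"
```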
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | module Extraction 12 | class UrlFilter 13 | REGEX_TIMEOUT = 0.5 # seconds 14 | TYPES = %w[begins ends contains regex].freeze 15 | 16 | attr_reader :type, :pattern 17 | 18 | def initialize(url_filter) 19 | @type = url_filter[:type] 20 | @pattern = url_filter[:pattern] 21 | validate_url_filter 22 | end 23 | 24 | private 25 | 26 | def validate_url_filter 27 | unless TYPES.include?(@type) 28 | raise ArgumentError, 29 | "Extraction ruleset url_filter `#{@type}` is invalid; value must be one of #{TYPES.join(', ')}" 30 | end 31 | 32 | raise ArgumentError, 'Extraction ruleset url_filter pattern can not be blank' if @pattern.blank? 33 | 34 | case @type 35 | when 'begins' 36 | unless @pattern.start_with?('/') 37 | raise ArgumentError, 38 | 'Extraction ruleset url_filter pattern must begin with a slash (/) if type is `begins`' 39 | end 40 | when 'regex' then validate_regex 41 | end 42 | end 43 | 44 | def validate_regex 45 | _ = Regexp.new(@pattern) 46 | rescue RegexpError => e 47 | raise ArgumentError, "Extraction ruleset url_filter pattern regex is invalid: #{e.message}" 48 | end 49 | end 50 | end 51 | end 52 | end 53 | -------------------------------------------------------------------------------- /lib/crawler/data/rule.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class Rule 12 | ALLOW = :allow 13 | DENY = :deny 14 | REGEX_TIMEOUT = 1.second 15 | 16 | SUPPORTED_POLICIES = [ALLOW, DENY].freeze 17 | 18 | attr_reader :policy, :source 19 | 20 | def initialize(policy, url_pattern:, source: nil) 21 | unless SUPPORTED_POLICIES.include?(policy) 22 | raise ArgumentError, "policy: #{policy.inspect} is not a supported value" 23 | end 24 | 25 | unless url_pattern.is_a?(Regexp) 26 | raise ArgumentError, "url_pattern: must be a Regexp, it was #{url_pattern.class}" 27 | end 28 | 29 | @policy = policy 30 | @url_pattern = url_pattern 31 | @source = source 32 | end 33 | 34 | def url_match?(url) 35 | Timeout.timeout(REGEX_TIMEOUT) do 36 | @url_pattern.match?(url.to_s) 37 | end 38 | end 39 | 40 | def description 41 | @description ||= "policy: #{@policy}, url_pattern: #{@url_pattern}" 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /lib/crawler/data/seen_urls.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
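To make the two classes above concrete, here is a small sketch: a single `url_filters` entry as it would arrive from the YAML config (symbolized keys), and a crawl `Rule` matching URLs against an already-compiled pattern. The domain and patterns are made up for illustration.

```ruby
# A url_filters entry is validated as soon as it is constructed.
filter = Crawler::Data::Extraction::UrlFilter.new({ type: 'begins', pattern: '/blog/*' })
filter.type     # => "begins"
filter.pattern  # => "/blog/*"

# An unknown type raises immediately, e.g.:
#   UrlFilter.new({ type: 'starts', pattern: '/blog' })
#   # => ArgumentError: ... value must be one of begins, ends, contains, regex

# A crawl rule built from a compiled pattern; url_match? accepts anything
# that responds to #to_s, including plain strings.
rule = Crawler::Data::Rule.new(Crawler::Data::Rule::ALLOW, url_pattern: %r{\Ahttps://example\.com/blog/})
rule.url_match?('https://example.com/blog/post-1') # => true
rule.url_match?('https://example.com/admin')       # => false
```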
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | class SeenUrls 12 | def initialize 13 | @seen_urls = Concurrent::Set.new 14 | end 15 | 16 | def clear 17 | @seen_urls.clear 18 | end 19 | 20 | def count 21 | @seen_urls.size 22 | end 23 | 24 | def delete(url) 25 | @seen_urls.delete(url_hash(url)) 26 | end 27 | 28 | # A method called when the crawler needs to stop and persist its state 29 | def save 30 | # nothing to do by default 31 | end 32 | 33 | # Tries to add an item to the set 34 | # Returns +true+ if this is a new URL and we should visit it 35 | # Returns +false+ if we have already seen this URL 36 | def add?(url) 37 | !!@seen_urls.add?(url_hash(url)) 38 | end 39 | 40 | private 41 | 42 | def url_hash(url) 43 | raise ArgumentError, 'Needs a URL' unless url.is_a?(Crawler::Data::URL) 44 | 45 | url.normalized_hash 46 | end 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /lib/crawler/data/url_queue.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module Data 11 | module UrlQueue 12 | class Error < StandardError; end 13 | 14 | class TransientError < Error; end 15 | 16 | class QueueFullError < TransientError; end 17 | 18 | def self.create(config) 19 | queue_type = config.url_queue.to_s 20 | queue_class_for_type(queue_type).new(config) 21 | end 22 | 23 | def self.queue_class_for_type(queue_type) 24 | queue_class_name = "Crawler::Data::UrlQueue::#{queue_type.classify}" 25 | queue_class_name.safe_constantize.tap do |queue_class| 26 | raise ArgumentError, "Unknown URL queue type: #{queue_type}" unless queue_class 27 | end 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/crawler/executor.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # An Executor fetches content by making requests described by CrawlTasks. 10 | module Crawler 11 | class Executor 12 | def run(_crawl_task) 13 | raise NotImplementError 14 | end 15 | 16 | # Override to provide stats about the HTTP client 17 | def http_client_status 18 | raise NotImplementedError 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/http_utils/all_trusting_trust_manager.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
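The `SeenUrls` set above deduplicates by a normalized URL hash, so a page that is discovered through several links is only visited once. A sketch, assuming `Crawler::Data::URL.parse` as used elsewhere in this codebase:

```ruby
seen = Crawler::Data::SeenUrls.new
url  = Crawler::Data::URL.parse('https://example.com/page?a=1')

seen.add?(url)  # => true  (first sighting: crawl it)
seen.add?(url)  # => false (already seen: skip it)
seen.count      # => 1

seen.delete(url)
seen.add?(url)  # => true again once the URL has been removed
```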
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | java_import javax.net.ssl.X509TrustManager 10 | 11 | # A simple implementation of the trust manager interface that trusts everyone 12 | # Used by the Crawler HTTP client to implement ssl_verification_mode=none. 13 | module Crawler 14 | module HttpUtils 15 | class AllTrustingTrustManager 16 | include X509TrustManager 17 | 18 | # rubocop:disable Naming/MethodName 19 | def checkClientTrusted(*) 20 | true 21 | end 22 | 23 | def checkServerTrusted(*) 24 | true 25 | end 26 | 27 | def getAcceptedIssuers 28 | [] 29 | end 30 | # rubocop:enable Naming/MethodName 31 | end 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /lib/crawler/logging/handler/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # base class for all log handlers 10 | module Crawler 11 | module Logging 12 | module Handler 13 | class Base 14 | def initialize(log_level, filename = nil, rotation_period = 'weekly') 15 | @log_level = log_level 16 | @filename = filename 17 | @rotation_period = rotation_period 18 | end 19 | 20 | def log 21 | raise NotImplementedError 22 | end 23 | 24 | def add_tags 25 | raise NotImplementedError 26 | end 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/logging/handler/stdout.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
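`Handler::Base` above defines the contract every log handler must satisfy: `log(message, level)` and `add_tags(*tags)`. As an illustration, a hypothetical in-memory handler (not part of the codebase, but handy in specs) following that contract might look like this; the level filtering shown is an assumption of this sketch, since the base class leaves that decision to each handler.

```ruby
require_relative 'base' # assuming this sits alongside the other handlers

module Crawler
  module Logging
    module Handler
      # Hypothetical handler that collects log lines in memory.
      class Memory < Handler::Base
        attr_reader :messages

        def initialize(log_level)
          super
          @messages = []
        end

        def log(message, message_log_level)
          # Messages below the configured level are dropped; untagged writes
          # (message_log_level of nil) are always kept.
          return if message_log_level && message_log_level < @log_level

          @messages << message
        end

        def add_tags(*tags)
          @tags = tags.flatten
        end
      end
    end
  end
end
```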
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module Logging 13 | module Handler 14 | attr_reader :event_logger, :logger_instance 15 | 16 | class StdoutHandler < Handler::Base 17 | def initialize(log_level) 18 | super 19 | # logger instance setup 20 | logger_instance = Logger.new($stdout) 21 | logger_instance.level = log_level 22 | # Set a base format to include timestamp 23 | format_logger(logger_instance) 24 | # convert logger instance to a StaticallyTaggedLogger so we can support tagging 25 | @logger_instance = logger_instance 26 | end 27 | 28 | def log(message, message_log_level) 29 | case message_log_level 30 | when Logger::DEBUG 31 | @logger_instance.debug(message) 32 | when Logger::INFO 33 | @logger_instance.info(message) 34 | when Logger::WARN 35 | @logger_instance.warn(message) 36 | when Logger::ERROR 37 | @logger_instance.error(message) 38 | when Logger::FATAL 39 | @logger_instance.fatal(message) 40 | else 41 | @logger_instance << message 42 | end 43 | end 44 | 45 | def add_tags(*tags) 46 | # this function re-formats the log format with the provided tags 47 | format_logger(@logger_instance, tags.join(' ')) 48 | end 49 | 50 | def format_logger(logger_instance, tags = nil) 51 | logger_instance.formatter = proc do |_severity, datetime, _progname, msg| 52 | timestamp = datetime.strftime('%Y-%m-%dT%H:%M:%S.%LZ') 53 | if tags 54 | "[#{timestamp}] #{tags} #{msg}\n" 55 | else 56 | "[#{timestamp}] #{msg}\n" 57 | end 58 | end 59 | end 60 | 61 | def level(log_level) 62 | @logger_instance.level = log_level 63 | end 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/logging/logger.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsarch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # logging monolith class that maintains 10 | # a. list of log handlers 11 | # b. 
routing function to route log messages to all handlers 12 | module Crawler 13 | module Logging 14 | class CrawlLogger 15 | attr_reader :all_handlers 16 | 17 | def initialize 18 | # initialize with no handlers by default 19 | @all_handlers = [] 20 | end 21 | 22 | def route_logs_to_handlers(message, message_log_level) 23 | all_handlers.each do |handler| 24 | handler.log(message, message_log_level) 25 | end 26 | end 27 | 28 | def debug(message) 29 | route_logs_to_handlers(message, Logger::DEBUG) 30 | end 31 | 32 | def info(message) 33 | route_logs_to_handlers(message, Logger::INFO) 34 | end 35 | 36 | def warn(message) 37 | route_logs_to_handlers(message, Logger::WARN) 38 | end 39 | 40 | def error(message) 41 | route_logs_to_handlers(message, Logger::ERROR) 42 | end 43 | 44 | def fatal(message) 45 | route_logs_to_handlers(message, Logger::FATAL) 46 | end 47 | 48 | def add(custom_log_level, message) 49 | route_logs_to_handlers(message, custom_log_level) 50 | end 51 | 52 | def <<(message) 53 | route_logs_to_handlers(message, nil) 54 | end 55 | 56 | def add_handler(new_handler) 57 | all_handlers.append(new_handler) 58 | end 59 | 60 | def add_tags_to_log_handlers(tags) 61 | all_handlers.each do |handler| 62 | handler.add_tags(tags) 63 | end 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/mock_event_logger.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'logger' 10 | 11 | module Crawler 12 | class MockEventLogger 13 | # Array of accumulated events (hash objects). 14 | attr_reader :mock_events 15 | 16 | def initialize 17 | @mock_events = [] 18 | end 19 | 20 | def <<(event) 21 | # Since we receive an already serialized event, but want to run tests against raw events 22 | original_event = JSON.parse(event) 23 | mock_events << original_event 24 | end 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/crawler/mock_executor.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'executor') 10 | 11 | # MockExecutor returns pre-populated results for specified URIs. 12 | module Crawler 13 | class MockExecutor < Crawler::Executor 14 | attr_reader :mock_results 15 | 16 | def initialize(mock_results = {}) # rubocop:disable Lint/MissingSuper 17 | @mock_results = mock_results # Hash of normalized URL strings to CrawlResponse objects. 18 | end 19 | 20 | def http_client_status 21 | {} 22 | end 23 | 24 | # The arg `follow_redirects` is required despite not being used within the method. 25 | # This is because the mock is called using expected args in specs. 
26 | def run(crawl_task, follow_redirects: false) # rubocop:disable Lint/UnusedMethodArgument 27 | url = crawl_task.url 28 | mock_results.fetch(url.to_s, mock_404_result(url)) 29 | end 30 | 31 | def mock_404_result(url) 32 | Crawler::Data::CrawlResult::Error.new( 33 | url:, 34 | status_code: 404, 35 | error: 'Not found' 36 | ) 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/crawler/output_sink.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module OutputSink 11 | def self.create(config) 12 | sink_type = config.output_sink.to_s 13 | sink_class_for_type(sink_type).new(config) 14 | end 15 | 16 | def self.sink_class_for_type(sink_type) 17 | sink_class_name = "::Crawler::OutputSink::#{sink_type.classify}" 18 | sink_class_name.safe_constantize.tap do |sink_class| 19 | raise ArgumentError, "Unknown output sink: #{sink_type.inspect}" unless sink_class 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, '..', 'output_sink') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Base 14 | attr_reader :config, :rule_engine 15 | 16 | delegate :crawl_id, :document_mapper, :events, :system_logger, to: :config 17 | 18 | def initialize(config) 19 | @config = config 20 | @rule_engine = create_rule_engine 21 | end 22 | 23 | def create_rule_engine 24 | Crawler::RuleEngine::Base.new(config) 25 | end 26 | 27 | def write(_crawl_result) 28 | raise NotImplementedError 29 | end 30 | 31 | def fetch_purge_docs(_crawl_start_time) 32 | raise NotImplementedError 33 | end 34 | 35 | def purge(_crawl_start_time) 36 | raise NotImplementedError 37 | end 38 | 39 | def to_doc(crawl_result) 40 | document_mapper.create_doc(crawl_result) 41 | end 42 | 43 | def close 44 | # To be implemented by the sink if needed. 45 | # Does nothing by default. 46 | end 47 | 48 | def flush 49 | # To be implemented by the sink if needed. 50 | # Does nothing by default. 51 | end 52 | 53 | # Returns a hash with the outcome of crawl result ingestion (to be used for logging above) 54 | def outcome(outcome, message) 55 | { outcome:, message: } 56 | end 57 | 58 | def success(message = 'Successfully ingested crawl result') 59 | outcome(:success, message) 60 | end 61 | 62 | def failure(message) 63 | outcome(:failure, message) 64 | end 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/console.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Console < OutputSink::Base 14 | def write(crawl_result) 15 | puts "# #{crawl_result.id}, #{crawl_result.url}, #{crawl_result.status_code}" 16 | 17 | if crawl_result.content_extractable_file? 18 | puts "** [Content extractable file (content type: #{crawl_result.content_type}, " \ 19 | "content length: #{crawl_result.content.bytesize})] **" 20 | else 21 | puts crawl_result.content 22 | end 23 | 24 | success 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/file.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class File < OutputSink::Base 14 | attr_reader :dir 15 | 16 | def initialize(*) 17 | super 18 | 19 | @dir = config.output_dir 20 | raise ArgumentError, 'Missing or invalid output directory' if !dir.is_a?(String) || dir.empty? 21 | 22 | FileUtils.mkdir_p(dir) 23 | end 24 | 25 | def write(crawl_result) 26 | doc = to_doc(crawl_result) 27 | result_file = "#{dir}/#{crawl_result.url_hash}.json" 28 | ::File.write(result_file, doc.to_json) 29 | 30 | success 31 | end 32 | end 33 | end 34 | end 35 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/mock.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Mock < OutputSink::Base 14 | attr_reader :results 15 | 16 | def initialize(*) 17 | super 18 | 19 | @results = config.results_collection 20 | raise ArgumentError, 'Needs a ResultsCollection' unless results.is_a?(ResultsCollection) 21 | end 22 | 23 | def write(crawl_result) 24 | results.append(crawl_result) 25 | 26 | success 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/output_sink/null.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
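For context on how the sinks above are selected: `OutputSink.create` (shown earlier) turns the configured `output_sink` value into a class name with ActiveSupport's `classify` and `safe_constantize`. A sketch of that mapping:

```ruby
require 'active_support'
require 'active_support/core_ext/string'

'console'.classify        # => "Console"
'elasticsearch'.classify  # => "Elasticsearch"
'file'.classify           # => "File"

# OutputSink.create then resolves "::Crawler::OutputSink::Console" and friends
# with safe_constantize, which returns nil (and triggers the ArgumentError)
# when the configured name does not map to a sink class:
'::Crawler::OutputSink::NoSuchSink'.safe_constantize # => nil
```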
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require_dependency File.join(__dir__, 'base') 10 | 11 | module Crawler 12 | module OutputSink 13 | class Null < OutputSink::Base 14 | def write(_) 15 | # Discard the results 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/crawl_rules_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::CrawlRulesCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_crawl_rules 14 | rule_engine = Crawler::RuleEngine::Elasticsearch.new(crawler_api_config) 15 | outcome = rule_engine.crawl_rules_outcome(normalized_url) 16 | rule = outcome.details[:rule] 17 | 18 | if outcome.allowed? 19 | validation_ok(:crawl_rules, 'The URL is allowed by one of the crawl rules', rule: rule.source) 20 | elsif rule 21 | validation_fail(:crawl_rules, 'The URL is denied by a crawl rule', rule: rule.source) 22 | else 23 | # This should never happen, but we're including it here to be safe 24 | validation_fail(:crawl_rules, 'The URL is denied because it did not match any rules') 25 | end 26 | end 27 | end 28 | end 29 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/dns_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'resolv' 10 | 11 | module Crawler 12 | module UrlValidator::DnsCheckConcern # rubocop:disable Style/ClassAndModuleChildren 13 | extend ActiveSupport::Concern 14 | 15 | def validate_dns 16 | if proxy_configured? 17 | warning = 'DNS resolution check could not be performed via an HTTP proxy.' 18 | return validation_warn(:dns, warning) 19 | end 20 | 21 | # Prepare DNS resolvers 22 | resolv = Resolv.new([ 23 | Resolv::Hosts.new, 24 | Resolv::DNS.new.tap do |dns| 25 | dns.timeouts = Crawler::UrlValidator::DNS_CHECK_TIMEOUT 26 | end 27 | ]) 28 | 29 | # Check DNS 30 | addresses = resolv.getaddresses(url.host) 31 | 32 | if addresses.empty? 33 | validation_fail(:dns, 'DNS name resolution failed. No suitable addresses found!') 34 | else 35 | validation_ok(:dns, "Domain name resolution successful: #{addresses.count} addresses found", 36 | addresses:) 37 | end 38 | rescue Resolv::ResolvError, ArgumentError => e 39 | validation_fail(:dns, <<~MESSAGE) 40 | DNS resolution failure: #{e}. Please check the spelling of your domain 41 | or your DNS configuration. 42 | MESSAGE 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/domain_access_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. 
under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::DomainAccessCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_domain_access 14 | if crawler_api_config.domain_allowlist.include?(url.domain) 15 | validation_ok(:domain_access, 'The URL matches one of the configured domains', domain: url.domain_name) 16 | else 17 | validation_fail(:domain_access, 'The URL does not match any configured domains') 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/domain_uniqueness_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::DomainUniquenessCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_domain_uniqueness 14 | if crawler_api_config.domain_allowlist.include?(url.domain) 15 | validation_fail(:domain_uniqueness, 'Domain name already exists') 16 | else 17 | validation_ok(:domain_uniqueness, 'Domain name is new', domain: url.domain_name) 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/result.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | class UrlValidator::Result # rubocop:disable Style/ClassAndModuleChildren 11 | attr_reader :name, :result, :comment, :details 12 | 13 | def initialize(name:, result:, comment:, details: {}) 14 | @name = name 15 | @result = result 16 | @comment = comment 17 | @details = details 18 | end 19 | 20 | def failure? 21 | result == :failure 22 | end 23 | 24 | def to_h 25 | { name:, result:, comment: }.tap do |res| 26 | res[:details] = details if details.any? 27 | end 28 | end 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/tcp_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::TcpCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_tcp 14 | if proxy_configured? 15 | warning = 'TCP connection check could not be performed via an HTTP proxy.' 
16 | return validation_warn(:tcp, warning) 17 | end 18 | 19 | host = url.host 20 | port = url.inferred_port 21 | details = { host:, port: } 22 | 23 | Socket.tcp(host, port, connect_timeout: Crawler::UrlValidator::TCP_CHECK_TIMEOUT) do 24 | validation_ok(:tcp, 'TCP connection successful', details) 25 | end 26 | rescue Errno::ETIMEDOUT 27 | validation_fail(:tcp, <<~MESSAGE, details) 28 | TCP connection to #{host}:#{port} timed out. Please make sure the crawler 29 | instance is allowed to connect to your servers. 30 | MESSAGE 31 | rescue SocketError, SystemCallError => e 32 | validation_fail(:tcp, "TCP connection to #{host}:#{port} failed: #{e}", details) 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /lib/crawler/url_validator/url_check_concern.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | module UrlValidator::UrlCheckConcern # rubocop:disable Style/ClassAndModuleChildren 11 | extend ActiveSupport::Concern 12 | 13 | def validate_url # rubocop:disable Metrics/AbcSize 14 | if url.scheme.blank? 15 | validation_fail(:url, 'URL scheme is missing. Domain URLs must start with https:// or http://') 16 | elsif !url.supported_scheme? 17 | validation_fail(:url, "Unsupported URL scheme: #{url.scheme}", scheme: url.scheme) 18 | elsif url.path.present? && !configuration 19 | validation_fail(:url, 'Domain URLs cannot contain a path') 20 | else 21 | validation_ok(:url, 'URL structure looks valid') 22 | end 23 | rescue Addressable::URI::InvalidURIError => e 24 | validation_fail(:url, "Error parsing domain name: #{e}") 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /lib/crawler/utils.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | module Crawler 10 | class Utils 11 | def self.url_pattern(domain, type, pattern) 12 | "\\A#{Regexp.escape(domain)}#{path_pattern(type, pattern)}" 13 | end 14 | 15 | def self.path_pattern(type, pattern) 16 | case type 17 | when 'begins' 18 | pattern_with_wildcard(pattern) 19 | when 'ends' 20 | ".*#{pattern_with_wildcard(pattern)}\\z" 21 | when 'contains' 22 | ".*#{pattern_with_wildcard(pattern)}" 23 | when 'regex' 24 | pattern 25 | end 26 | end 27 | 28 | def self.pattern_with_wildcard(pattern) 29 | Regexp.escape(pattern).gsub('\*', '.*') 30 | end 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /lib/environment.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
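The `Utils.url_pattern` helper above is what turns an extraction-rule URL filter into an anchored regex string: wildcards (`*`) in begins/ends/contains patterns become `.*`, literal characters are escaped, and regex patterns are passed through as-is. A few illustrative calls against a made-up domain:

```ruby
domain = 'https://example.com'

Crawler::Utils.url_pattern(domain, 'begins',   '/blog/*') # => "\\Ahttps://example\\.com/blog/.*"
Crawler::Utils.url_pattern(domain, 'ends',     '.html')   # => "\\Ahttps://example\\.com.*\\.html\\z"
Crawler::Utils.url_pattern(domain, 'contains', 'docs')    # => "\\Ahttps://example\\.com.*docs"

Regexp.new(Crawler::Utils.url_pattern(domain, 'begins', '/blog/*'))
      .match?('https://example.com/blog/2024/hello') # => true
```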
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Add the lib directory to the load path 10 | $LOAD_PATH << __dir__.to_s 11 | 12 | # Calculate the current environment 13 | CRAWLER_ENV = ENV.fetch('CRAWLER_ENV', 'development') 14 | 15 | # Set up bundler 16 | require 'rubygems' 17 | require 'bundler' 18 | Bundler.setup(:default, CRAWLER_ENV) 19 | 20 | # Load common dependencies 21 | require 'active_support' 22 | require 'active_support/core_ext' 23 | require 'active_support/dependencies' 24 | 25 | # Load crawler components 26 | require 'crawler' 27 | -------------------------------------------------------------------------------- /lib/errors.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | class Errors 10 | # Raised only if the queue item added somehow overflows the queue threshold. 11 | # The queue threshold is checked before an item is added so this error shouldn't occur. 12 | # If this error occurs, something is wrong with the interaction between the Elasticsearch sink and BulkQueue. 13 | class BulkQueueOverflowError < StandardError; end 14 | 15 | # Raised when attempting to add a crawl result to the sink, but it is currently locked. 16 | # This is specific for Elasticsearch sink. Basically the sink is single-threaded but 17 | # receives crawl results from multi-threaded processes. This error is raised to prevent 18 | # overloading the queue if Elasticsearch indexing is failing repeatedly and performing 19 | # exponential backoff. This error should be treated as retryable. 20 | class SinkLockedError < StandardError; end 21 | 22 | # Raised when there is a connection error to Elasticsearch. Specific for Elasticsearch sink. 23 | # During initialization of the Elasticsearch sink, it will attempt to make contact to 24 | # the host provided in the configuration. If contact cannot be established, a system exit will occur. 25 | class ExitIfESConnectionError < SystemExit; end 26 | 27 | # Raised when the desired output index does not exist. This is specific for Elasticsearch 28 | # sink. During initialization of the Elasticsearch sink, it will call indices.exists() 29 | # against the output_index value, and will continue if the index is found. 30 | # If it is not found, this error will be raised, which causes a system exit to occur. 
31 | class ExitIfUnableToCreateIndex < SystemExit; end 32 | end 33 | -------------------------------------------------------------------------------- /product_version: -------------------------------------------------------------------------------- 1 | 0.3.0 2 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://docs.renovatebot.com/renovate-schema.json", 3 | "extends": [ 4 | "github>elastic/renovate-config:only-chainguard" 5 | ], 6 | "schedule": [ 7 | "* * * * 0,6" 8 | ] 9 | } 10 | -------------------------------------------------------------------------------- /script/bundle: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source "$(dirname $0)/functions.sh" 4 | 5 | set -e 6 | 7 | # Used by Gemfile to limit direct access to bundle commands 8 | export SCRIPT_BUNDLE=true 9 | 10 | # Tune for faster startup 11 | export JRUBY_OPTS="${JRUBY_OPTS:-} --dev --debug" 12 | export JAVA_OPTS="-Xmx2g ${JAVA_OPTS:-} -Djava.awt.headless=true -Dsun.jnu.encoding=UTF-8 -Dfile.encoding=UTF-8 -XX:+HeapDumpOnOutOfMemoryError" 13 | 14 | function bundle_command() { 15 | if ! bundle "$@"; then 16 | set +x 17 | echo 18 | red_echo "ERROR: Bundle command failed!" 19 | yellow_echo "Try to run 'make install' and then retry this command" 20 | echo 21 | exit 42 22 | fi 23 | } 24 | 25 | BUNDLER_VERSION="$(cat .bundler-version)" 26 | BUNDLER_CONSTRAINT="~> $BUNDLER_VERSION" 27 | 28 | blue_echo "Bundling jruby gems..." 29 | bundle_command config cache_all true 30 | 31 | blue_echo "Running the bundle command..." 32 | bundle_command "$@" 33 | 34 | green_echo "Done!" 35 | echo 36 | -------------------------------------------------------------------------------- /script/environment: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source "$(dirname $0)/functions.sh" 4 | 5 | set -e 6 | 7 | load_version_constraints 8 | check_bundle 9 | -------------------------------------------------------------------------------- /script/licenses/README.md: -------------------------------------------------------------------------------- 1 | # 3rd Party :tada: dependencies 2 | 3 | This directory contains scripts and files for generating a `NOTICE.txt` file containing all licenses for the third-party dependencies that Crawler uses. 4 | It will look at the SPDX license for Ruby gems. 5 | If this cannot be found, it will attempt to download the LICENSE file and add it to the project for future reference. 6 | When a LICENSE file doesn't exist (or is in an unexpected location or format), a manual override must be added. 7 | 8 | Downloaded license files are added to the directories `rubygems_licenses` or `misc_licneses`. 9 | 10 | All license texts are then added to the repository's [NOTICE.txt](../../NOTICE.txt) file. 11 | 12 | ## Types of dependencies 13 | 14 | - Ruby Gems from `Gemfile` and `Gemfile.lock` 15 | - Misc. dependencies, like JRuby, Tika, etc. not managed by a package manager 16 | 17 | ## Generate NOTICE.txt 18 | 19 | ```bash 20 | ./script/licenses/generate_notice_txt.rb 21 | ``` 22 | -------------------------------------------------------------------------------- /script/licenses/generate_notice.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | # 4 | # Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one 5 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 6 | # you may not use this file except in compliance with the Elastic License 2.0. 7 | # 8 | 9 | # frozen_string_literal: true 10 | 11 | NOTICE_TXT_PATH = File.expand_path('../../NOTICE.txt', __dir__) 12 | 13 | require_relative 'lib/third_party' 14 | 15 | def write_header_to_file(io) 16 | io.puts 'Elastic Open Web Crawler' 17 | io.puts 'Copyright 2024 Elasticsearch B.V.' 18 | io.puts 19 | io.puts 'The Elastic Open Web Crawler contains the following third-party dependencies:' 20 | io.puts 21 | end 22 | 23 | def write_license_to_file(io, klass_instance, identifier, dependency) 24 | io.puts '-' * 80 25 | io.puts "Library: #{klass_instance.format_library_for_notice_txt(identifier, dependency)}" 26 | io.puts "URL: #{dependency[:url]}" if dependency[:url] 27 | io.puts "License: #{dependency[:license]}" if dependency[:license] 28 | io.puts 29 | File.open(dependency[:license_file_path], 'r') do |license_file| 30 | io.puts(license_file.read) 31 | io.puts 32 | end 33 | end 34 | 35 | File.open(NOTICE_TXT_PATH, 'w') do |io| 36 | write_header_to_file(io) 37 | 38 | [ 39 | ThirdParty::RubygemsDependencies, 40 | ThirdParty::MiscDependencies 41 | ].each do |klass| 42 | klass_instance = klass.new 43 | dependencies = klass_instance.get(with_license_files: true) 44 | dependencies.keys.sort.each do |identifier| 45 | dependency = dependencies.fetch(identifier) 46 | 47 | unless dependency[:license_file_path] 48 | ThirdParty::LOGGER.error("There is no license file for #{identifier}!") 49 | exit(1) 50 | end 51 | 52 | unless File.exist?(dependency[:license_file_path]) 53 | err = "License file for #{identifier} does not exist locally (path: #{dependency[:license_file_path]})" 54 | ThirdParty::LOGGER.error(err) 55 | exit(2) 56 | end 57 | 58 | write_license_to_file(io, klass_instance, identifier, dependency) 59 | end 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /script/licenses/lib/third_party.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'logger' 10 | 11 | module ThirdParty 12 | LOGGER = Logger.new($stdout, level: Logger::DEBUG) 13 | 14 | LICENSE_FILE_NAME_OPTIONS = %w[ 15 | LICENSE 16 | LICENSE.md 17 | LICENSE.txt 18 | License.txt 19 | LICENCE 20 | LICENSE-MIT 21 | Licence.md 22 | Licence.rdoc 23 | MIT_LICENSE 24 | MIT-LICENSE 25 | MIT-LICENSE.txt 26 | BSDL 27 | COPYING 28 | COPYING.txt 29 | ].freeze 30 | UNKNOWN_LICENSE = 'UNKNOWN' 31 | 32 | module SPDX 33 | class << self 34 | def normalize_license(license) 35 | return license if SUPPORTED_IDENTIFIERS.include?(license) || license.match?(/\s+OR|AND|WITH\s+/) 36 | 37 | ALIASES.fetch(license, nil) 38 | end 39 | end 40 | 41 | SUPPORTED_IDENTIFIERS = %w[ 42 | 0BSD 43 | Apache-2.0 44 | AFL-2.1 45 | BSD-2-Clause 46 | BSD-3-Clause 47 | CC0-1.0 48 | CC-BY-3.0 49 | CC-BY-4.0 50 | Elastic-2.0 51 | EPL-1.0 52 | ISC 53 | GPL-2.0 54 | LGPL-2.1 55 | MIT 56 | MPL-2.0 57 | Ruby 58 | Unlicense 59 | ].freeze 60 | 61 | IDENTIFIER_TO_ALIASES = { 62 | 'AFL-2.1' => [ 63 | 'AFLv2.1' 64 | ], 65 | 'BSD-2-Clause' => [ 66 | 'BSD 2-Clause', 67 | 'BSD', 68 | 'BSD*', 69 | '2-clause BSDL' 70 | ], 71 | 'Apache-2.0' => [ 72 | 'Apache License Version 2.0', 73 | 'Apache License (2.0)' 74 | ], 75 | 'Ruby' => [ 76 | 'ruby' 77 | ], 78 | 'Python-2.0' => [ 79 | 'PSFL' 80 | ], 81 | 'MIT' => [ 82 | 'MIT*' 83 | ] 84 | }.freeze 85 | 86 | ALIASES = IDENTIFIER_TO_ALIASES.each_with_object({}) do |(spdx_identifier, aliases), out| 87 | aliases.each do |a| 88 | out[a] = spdx_identifier 89 | end 90 | end 91 | end 92 | end 93 | 94 | require_relative 'third_party/misc_dependencies' 95 | require_relative 'third_party/rubygems_dependencies' 96 | -------------------------------------------------------------------------------- /script/licenses/lib/third_party/misc_dependencies.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'pathname' 10 | require_relative 'base' 11 | 12 | module ThirdParty 13 | class MiscDependencies < Base 14 | def type 15 | 'Misc. 
Dependency' 16 | end 17 | 18 | def licenses_path 19 | LICENSES_PATH 20 | end 21 | 22 | def license_fallbacks 23 | {} 24 | end 25 | 26 | def license_file_fallbacks 27 | DEPENDENCIES.transform_values do |dependency| 28 | dependency.fetch(:license_file_override) 29 | end 30 | end 31 | 32 | def get(with_license_files: false) 33 | DEPENDENCIES.each_with_object({}) do |(identifier, dependency), out| 34 | out[identifier] = dependency.slice(:name, :version, :license, :url) 35 | 36 | out[identifier][:license_file_path] = license_file_path_for_dependency(identifier) if with_license_files 37 | end 38 | end 39 | 40 | LICENSES_PATH = Pathname.new(__dir__).join('..', '..', 'misc_licenses') 41 | JRUBY_VERSION = File.read(File.expand_path('../../../../.ruby-version', __dir__)).strip.delete_prefix('jruby-') 42 | 43 | DEPENDENCIES = { 44 | 'jruby' => { 45 | name: 'jruby', 46 | version: JRUBY_VERSION, 47 | license: 'EPL-2.0 OR GPL-2.0 OR LGPL-2.1', 48 | license_file_override: { manually_added: true }, 49 | url: 'https://www.jruby.org' 50 | }, 51 | 'tika' => { 52 | name: 'tika', 53 | version: '1.23', 54 | license: 'Apache-2.0', 55 | license_file_override: { manually_added: true }, 56 | url: 'https://github.com/apache/tika' 57 | } 58 | }.freeze 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /script/licenses/misc_licenses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/script/licenses/misc_licenses/.gitkeep -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/script/licenses/rubygems_licenses/.gitkeep -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_faux-LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2024 Elasticsearch B.V. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_httpclient-LICENSE.txt: -------------------------------------------------------------------------------- 1 | This program is copyrighted free software by NAKAMURA, Hiroshi. You can 2 | redistribute it and/or modify it under the same terms of Ruby's license; 3 | either the dual license version in 2003, or any later version. 4 | 5 | httpclient/session.rb is based on http-access.rb in http-access/0.0.4. Some 6 | part of it is copyrighted by Maebashi-san who made and published 7 | http-access/0.0.4. http-access/0.0.4 did not include license notice but when 8 | I asked Maebashi-san he agreed that I can redistribute it under the same terms 9 | of Ruby. Many thanks to Maebashi-san. 10 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_minitest-LICENSE.txt: -------------------------------------------------------------------------------- 1 | (The MIT License) 2 | 3 | Copyright (c) Ryan Davis, seattle.rb 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining 6 | a copy of this software and associated documentation files (the 7 | 'Software'), to deal in the Software without restriction, including 8 | without limitation the rights to use, copy, modify, merge, publish, 9 | distribute, sublicense, and/or sell copies of the Software, and to 10 | permit persons to whom the Software is furnished to do so, subject to 11 | the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /script/licenses/rubygems_licenses/_manually_added_strscan-LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (C) 1999-2006 Minero Aoki. All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 1. Redistributions of source code must retain the above copyright 7 | notice, this list of conditions and the following disclaimer. 8 | 2. Redistributions in binary form must reproduce the above copyright 9 | notice, this list of conditions and the following disclaimer in the 10 | documentation and/or other materials provided with the distribution. 11 | 12 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 13 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 14 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 15 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 16 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 17 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 18 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 19 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 20 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 21 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 22 | SUCH DAMAGE. -------------------------------------------------------------------------------- /script/rspec: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | export JRUBY_OPTS="${JRUBY_OPTS} --debug" 6 | 7 | BUNDLE_CMD=${BUNDLE_CMD:-bundle} 8 | $BUNDLE_CMD exec rspec $* 9 | -------------------------------------------------------------------------------- /script/support/string_colors.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # String colorization extensions 4 | class String 5 | def colorize(color_code) 6 | "\e[#{color_code}m#{self}\e[0m" 7 | end 8 | 9 | def red 10 | colorize(31) 11 | end 12 | 13 | def green 14 | colorize(32) 15 | end 16 | 17 | def yellow 18 | colorize(33) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /script/vendor_jars: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env jruby 2 | # frozen_string_literal: true 3 | 4 | require 'jar-dependencies' 5 | 6 | Jars.lock_down( 7 | debug: ENV['JARS_DEBUG'] == 'true', 8 | verbose: ENV['JARS_VERBOSE'] == 'true', 9 | vendor_dir: 'vendor/jars' 10 | ) 11 | -------------------------------------------------------------------------------- /spec/fixtures/crawl-flat-format.yml: -------------------------------------------------------------------------------- 1 | domains: [{url: "https://localhost:80", seed_urls: ["https://localhost:80", "https://localhost:80/news/"]}] 2 | schedule.pattern: '* * * * *' 3 | # Where to send the results. Possible values are console, file, or elasticsearch 4 | output_sink: elasticsearch 5 | # Elasticsearch index name to ingest crawl results into. Required if output_sink is elasticsearch 6 | output_index: test-index 7 | # Crawl tuning 8 | max_crawl_depth: 2 9 | # Crawl result field size limits 10 | max_title_size: 500 11 | max_body_size: 5_242_880 # 5 megabytes 12 | max_keywords_size: 512 13 | max_description_size: 512 14 | max_indexed_links_count: 5 15 | max_headings_count: 5 16 | # elasticsearch settings 17 | elasticsearch.host: http://localhost 18 | elasticsearch.port: 9200 19 | elasticsearch.username: elastic 20 | elasticsearch.password: changeme 21 | elasticsearch.bulk_api.max_items: 10 22 | elasticsearch.bulk_api.max_size_bytes: 1_048_576 -------------------------------------------------------------------------------- /spec/fixtures/crawl.yml: -------------------------------------------------------------------------------- 1 | # Domains allowed for the crawl 2 | domains: 3 | - url: https://localhost:80 4 | seed_urls: 5 | - https://localhost:80 6 | - https://localhost:80/news/ 7 | 8 | schedule: 9 | pattern: '* * * * *' 10 | 11 | # Where to send the results. Possible values are console, file, or elasticsearch 12 | output_sink: elasticsearch 13 | 14 | # Elasticsearch index name to ingest crawl results into. 
Required if output_sink is elasticsearch 15 | output_index: test-index 16 | 17 | # Crawl tuning 18 | max_crawl_depth: 2 19 | 20 | # Crawl result field size limits 21 | max_title_size: 500 22 | max_body_size: 5_242_880 # 5 megabytes 23 | max_keywords_size: 512 24 | max_description_size: 512 25 | max_indexed_links_count: 5 26 | max_headings_count: 5 27 | 28 | elasticsearch: 29 | host: http://localhost 30 | port: 9200 31 | username: elastic 32 | password: changeme 33 | bulk_api: 34 | max_items: 10 35 | max_size_bytes: 1_048_576 36 | -------------------------------------------------------------------------------- /spec/fixtures/do-not-visit.txt: -------------------------------------------------------------------------------- 1 | http://127.0.0.1:9393/do-not-visit-here 2 | -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch-flat-format.yml: -------------------------------------------------------------------------------- 1 | elasticsearch.host: http://test:9200 2 | elasticsearch.username: test 3 | elasticsearch.password: changeme-test 4 | elasticsearch.api_key: 1234 5 | elasticsearch.pipeline: ent-search-generic-ingestion 6 | elasticsearch.pipeline_enabled: true 7 | elasticsearch.pipeline_params._reduce_whitespace: true 8 | elasticsearch.pipeline_params._run_ml_inference: true 9 | elasticsearch.pipeline_params._extract_binary_content: true 10 | elasticsearch.bulk_api.max_items: 10 11 | elasticsearch.bulk_api.max_size_bytes: 1_048_576 -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch-partially-flat-format.yml: -------------------------------------------------------------------------------- 1 | elasticsearch: 2 | host: http://test:9200 3 | username: test 4 | password: changeme-test 5 | api_key: 1234 6 | pipeline: ent-search-generic-ingestion 7 | pipeline_enabled: true 8 | pipeline_params._reduce_whitespace: true 9 | pipeline_params._run_ml_inference: true 10 | pipeline_params._extract_binary_content: true 11 | bulk_api: 12 | max_items: 10 13 | max_size_bytes: 1_048_576 14 | -------------------------------------------------------------------------------- /spec/fixtures/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | elasticsearch: 2 | host: http://test:9200 3 | username: test 4 | password: changeme-test 5 | api_key: 1234 6 | pipeline: ent-search-generic-ingestion 7 | pipeline_enabled: true 8 | pipeline_params: 9 | _reduce_whitespace: true 10 | _run_ml_inference: true 11 | _extract_binary_content: true 12 | bulk_api: 13 | max_items: 10 14 | max_size_bytes: 1_048_576 15 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_index.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <sitemap> 4 | <loc>http://www.example.com/sitemap1.xml</loc> 5 | <lastmod>2004-10-01T18:23:17+00:00</lastmod> 6 | </sitemap> 7 | <sitemap> 8 | <loc>http://www.example.com/sitemap2.xml</loc> 9 | <lastmod>2005-01-01</lastmod> 10 | </sitemap> 11 | </sitemapindex> 12 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_no_urls.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | </urlset> 4 |
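The sitemap fixtures in this directory follow the sitemaps.org protocol format. The gzipped fixture further below, sitemap_urlset.xml.gz, appears to be simply the compressed form of sitemap_urlset.xml; a minimal sketch of how it could be regenerated with the Ruby standard library (an assumption for illustration, not an existing repo script):

    require 'zlib'

    # Rebuild the gzipped sitemap fixture from its plain-XML counterpart
    Zlib::GzipWriter.open('spec/fixtures/sitemap/sitemap_urlset.xml.gz') do |gz|
      gz.write(File.read('spec/fixtures/sitemap/sitemap_urlset.xml'))
    end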
-------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_urlset.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> 3 | <url> 4 | <loc>http://www.example.com/</loc> 5 | <lastmod>2005-01-01</lastmod> 6 | <changefreq>monthly</changefreq> 7 | <priority>0.8</priority> 8 | </url> 9 | <url> 10 | <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc> 11 | <changefreq>weekly</changefreq> 12 | </url> 13 | <url> 14 | <loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc> 15 | <lastmod>2004-12-23</lastmod> 16 | <changefreq>weekly</changefreq> 17 | </url> 18 | <url> 19 | <loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc> 20 | <lastmod>2004-12-23T18:00:15+00:00</lastmod> 21 | <priority>0.3</priority> 22 | </url> 23 | <url> 24 | <loc>http://www.example.com/catalog?item=83&amp;desc=vacation_usa</loc> 25 | <lastmod>2004-11-23</lastmod> 26 | </url> 27 | </urlset> 28 | -------------------------------------------------------------------------------- /spec/fixtures/sitemap/sitemap_urlset.xml.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/spec/fixtures/sitemap/sitemap_urlset.xml.gz -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDwDCCAqgCCQCgaeTT+pTAQzANBgkqhkiG9w0BAQsFADCBoTELMAkGA1UEBhMC 3 | VVMxCzAJBgNVBAgMAkNBMRYwFAYDVQQHDA1TYW4gRnJhbmNpc2NvMRAwDgYDVQQK 4 | DAdFbGFzdGljMRowGAYDVQQLDBFFbnRlcnByaXNlIFNlYXJjaDESMBAGA1UEAwwJ 5 | Y3VzdG9tLWNhMSswKQYJKoZIhvcNAQkBFhxlbnRlcnByaXNlLXNlYXJjaEBlbGFz 6 | dGljLmNvMB4XDTIxMDYxMDE2MTcwNFoXDTQ4MTAyNTE2MTcwNFowgaExCzAJBgNV 7 | BAYTAlVTMQswCQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4G 8 | A1UECgwHRWxhc3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNV 9 | BAMMCWN1c3RvbS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hA 10 | ZWxhc3RpYy5jbzCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAKX79WVB 11 | kDDq/TLCvJWWsTVjuHz4y0Z+iddYazQP2UCPng6uLiUWDmxu8Im+PdVb6iQDYw8N 12 | YgOZm0wUeXoozegs3RfcQHFTGosMVtD7bZrY24+3D4+XagIEe9rKiWBDtK7pHAcC 13 | kQg+2Z53tNu9h1TV8jE/GzjwedMfidHUTTQLMx853AywUEIZTusihrskkQeoWsXI 14 | CfWPWl8vKR1S7IdtnjR21H0RdyWGt7iQZHVy3ChrIWIInaq50qw7OOqzE/JNclOH 15 | 7bL/xBsZbGBIxnpOgMrpJak6NWcouoqH0sCisAqwQnn6kOI7GIrhAxhZa7c9Dbx0 16 | z7MYQfczUoWI6oMCAwEAATANBgkqhkiG9w0BAQsFAAOCAQEApNJwMB5gFQhRkkcz 17 | EQkC5n7ReMWQLyoRl3g8kUyMS9iYMxeJB+tnB8BMICUInpKcRbDlW1pCrstyW311 18 | O1FJweszWP3QRWBz49Cu5EPnFG75PJGnC2lOGcSC81M91yl5EjjvLTTWUcfuoMYF 19 | U2XrSo0LQpZdpzqjnG3ELMrcieplpiz7c/D7YIUK1wA8qy7Aif5uAjueY3NUfYzg 20 | wLdHRX5eRG6e4xV6iMI9ApetT1j2xoUeFHPO6yMRBcsdG+L20w8AAK6dqCa7vYhY 21 | fOKtgLhIR++qThawUwkb8HHHfXaJzP++0swXc0ljE/Uz0FFwRB9zbipVc0p6FAd7 22 | oFJU+w== 23 | -----END CERTIFICATE----- 24 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | Proc-Type: 4,ENCRYPTED 3 | DEK-Info: DES-EDE3-CBC,9A2A4E541C52C308 4 | 5 | vr8Lcl7IK6neqrY6dwJiI864mWl1bVVH+kvPguAMVYp5W+m2KmYnBjvRCok5UFGY 6 | DAPWbtlY3X5eZrL6VKw7kaYEcr6DIlBniet9XOoJzzmGdRIbj1I5O+irdMgZg8SD 7 | RGyEYf6a3rtJj8tSrDjGlmf44xBnowN79QvBbKzCZI8vqlHcW90NZw9b8vpjj2IP 8 | cJXt91m71RwxWvjAOw2SFTXk4okmymrAlAB2+L8GQD/YgacQSihq9vri8aXwzmh2 9 | 8tmzcqGI+AUfxU77n7+dUS6rgJ2yzTLThaStPZiJBOAqrePuc9pyHV+yIYN28+3H 10 | H265/GWJwjvdc8iu0T73JVWplI5yM3xhofLzLXdFhjDoSdlzK9MqX5YhP/eBrolR 11 | Hm1Ly3Gi8WJgvn1LBn3bIZWZPH5Ch7UZt6kTG/TkpC1AKwATIKorWKetCACt2EeO 12 | txPMQt4XAykkzyAiyK4FW4Sh4KAqEGoXN0ELV8TTKk2YEV6tRk0XB3YIQgnDgBgS 13 | 0SvyjVTZ9cbuVAXN6oIhOtsrxU9NSidkAVw/wy5jbPhYHBYCGuM3QvUiMP0f4W69 14 | a4zqPjl36bGd/SDBgwbJmMW2qjzrx7Og0xpJccVQSr+N8JB4/AYBJIxYT4niER+V 15 | rlnEu/7OyVKjuS+EzDgSpoDRyiROgHaq9L5NfRum6b5ZtSdh9bSMoJq55aps+tUe 16 | pyp5ftO6l9ffJbuVdJfA63kvAq0MbwZ7Om10rsjGfM/XYITQNMlrEuGKEIyJIyv+ 17 |
DnP8GUeHapPSTrjzk3cpwYnwknLeYtwM6MdgMSG/rW/Ksd2HoH8kA2WtGOrh34UQ 18 | Q1r9JO3Smog6Iq2DBl77w8oBIIx6LcMe5osUveEIsgoUyQoUMisVOP2kbdbQyN5X 19 | lzHajl/rORV7V2iKhgJ/yEroTwu8XgSBl6mSQFaRSwyqyNJTvqbFdIdN3dXvSqf+ 20 | qmfubcxGS3DRbI0pFGifUAPrD0hej/Dtm1dF9+4edB10hxYoDMiBidWgUaocZD6i 21 | zvKj5/JmOp6LboaW+VzcKFiHJ16Ntpu5I0opZZTjLiOE2l6Xcji9vCMRZcm8UFqo 22 | vWCMsll5cn7w8IluND0vCZvD7xsOjoluSW+XEXzmCSgRVH0Exi0voSuNtousl96i 23 | v5QOn/pIM4+pgBlDlO3qjbtkamZDiHtLepgWFwxb1IK7tmcmUZex/c6YHAheQdq/ 24 | SdpcG2QjvpGvTDndpoK64SP9roZpajnJOeqx3NX3rPs2y14Vu3p504aY9YWHp4QZ 25 | xSgURu0fBP4PFoWlGClJWtoFaCdqeiXSCfTliiR2H+LjYHiVpZ5hcC9yAXx1IRBt 26 | +JtZ1sPVUSJCTJzxWuyjCSMhjbmftG1MC/659CDuoyrGnDYS92iu6wM/tw80LEFm 27 | kwhSxWP4D0yyWBnbp0KdrAOu2cMPhkLMyJ/bGxvBupuVsrXubbfvqVnWn6CI62VW 28 | gAupwn0LnY58/7SrzT1prJKPLBJ0UuxOlfkpPcUxjzc3pyN6VwKHm0fxb9QaaXFN 29 | GWrmjpmJAEcFR5GrHmGiyjKMGvCrH33u5wkoLCD2Dxx4aoC2Mc2U0Q== 30 | -----END RSA PRIVATE KEY----- 31 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/ca.password.txt: -------------------------------------------------------------------------------- 1 | 13243546 -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.cnf: -------------------------------------------------------------------------------- 1 | FQDN = example.org 2 | ORGNAME = Elastic 3 | ALTNAMES = DNS:$FQDN, DNS:www.$FQDN 4 | 5 | [ req ] 6 | default_bits = 2048 7 | default_md = sha256 8 | prompt = no 9 | encrypt_key = no 10 | distinguished_name = dn 11 | req_extensions = req_ext 12 | 13 | [ dn ] 14 | C = CH 15 | O = $ORGNAME 16 | CN = $FQDN 17 | 18 | [ req_ext ] 19 | subjectAltName = $ALTNAMES 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDTjCCAjYCBAdbzRUwDQYJKoZIhvcNAQELBQAwgaExCzAJBgNVBAYTAlVTMQsw 3 | CQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4GA1UECgwHRWxh 4 | c3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNVBAMMCWN1c3Rv 5 | bS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hAZWxhc3RpYy5j 6 | bzAeFw0yMTA2MDExOTM1MzJaFw0yMTA2MDIxOTM1MzJaMDUxCzAJBgNVBAYTAkNI 7 | MRAwDgYDVQQKDAdFbGFzdGljMRQwEgYDVQQDDAtleGFtcGxlLm9yZzCCASIwDQYJ 8 | KoZIhvcNAQEBBQADggEPADCCAQoCggEBAM2U6kepk57OfVBId1b7kkgKF5CIvKWr 9 | v4O9xCh+LnMpWxmpA4IyN66qd2G9aCiXK9d0bCNvue8TC3P5LHlcrfrI+yHmAgRj 10 | YAe249ifkFcQ0HPqPMe3B3+l50kCRkn2Wd0x2Clpz/tGLXVJ2AR/iYsTAOGMzC2O 11 | Ldv/F1pbDJIw4PTLTQBfDTh2m/S/GHz4b5ZetONlbHPXo2H/wj6/OHFAkcvvlzQa 12 | Qr8YT0/uyHzJBa6AQPQqUljjJSSAHnnC4fASwaFLUGjULrQmhwJzktukFk2eXGiu 13 | oo3prgoDQqRvknu9S4skTxn6Ku59VGjMfUMTExSOy3znSqye3HSVJBECAwEAATAN 14 | BgkqhkiG9w0BAQsFAAOCAQEAgjFqbtHkq6LsN1PGKKXlyJadr90AyD0TF1yA5tTA 15 | dtrixhgQFdnigR85Nyd9aKb8x7ocxmwotX3+WLwNb/+SmyICATJ5qCjuYACxx78z 16 | tmGqQEqYL4xF/gPxClkqPnCGM5kocu9Ct+3G5HJejghA4fbspx/2QtVbMa69Ac5B 17 | vZpFHXXiHtoWmK2skBxHJ5LAgq2LEWQVXzX9IDKy01qn+Jv+rD+G5vfdYaunldrI 18 | JrnRVsbt0ufCGzzqyesHUIUY8UTWLWeZ0Gr8XS0U5mfHqgwMR8PGrF/0sOdv/Jg1 19 | 910D5RGAo6niyi8fcdfxIPqmjhec6XbaJAvTQNgDmzVrkQ== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIICtDCCAZwCAQAwNTELMAkGA1UEBhMCQ0gxEDAOBgNVBAoMB0VsYXN0aWMxFDAS 3 | 
BgNVBAMMC2V4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC 4 | AQEAzZTqR6mTns59UEh3VvuSSAoXkIi8pau/g73EKH4ucylbGakDgjI3rqp3Yb1o 5 | KJcr13RsI2+57xMLc/kseVyt+sj7IeYCBGNgB7bj2J+QVxDQc+o8x7cHf6XnSQJG 6 | SfZZ3THYKWnP+0YtdUnYBH+JixMA4YzMLY4t2/8XWlsMkjDg9MtNAF8NOHab9L8Y 7 | fPhvll6042Vsc9ejYf/CPr84cUCRy++XNBpCvxhPT+7IfMkFroBA9CpSWOMlJIAe 8 | ecLh8BLBoUtQaNQutCaHAnOS26QWTZ5caK6ijemuCgNCpG+Se71LiyRPGfoq7n1U 9 | aMx9QxMTFI7LfOdKrJ7cdJUkEQIDAQABoDowOAYJKoZIhvcNAQkOMSswKTAnBgNV 10 | HREEIDAeggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUub3JnMA0GCSqGSIb3DQEB 11 | CwUAA4IBAQAohzpolBHDmzHgG/AaOTbQhqYKdkh0tm5fLrC/Ve/2KBZU1pcLuTPk 12 | FIuSyQNrebeDIO8VHDLfRJrnjqIU7+fBWDgdxgkLezPqlX5WUFJiXvxuSRrD52Lk 13 | SPJVuHCs2BEimlRAxp937N/sWPdWD/A+wyzKVM+bD20krhpZoMMMXE6LQiKOnRan 14 | JToRgCAmL7fionmgzKwD2+k2nN3EFt+e6FaYKJqB3fkBX78FG1ijOftGlRD+D0hP 15 | r/Rc2b31nHNEhLHKvcYFwYTvE8EVIzYNJrYV+N/c6t3aOirWfL9xSW3VA9a/AqFB 16 | rYiGX+f4EmugECbO7KmSDxH6YDQeMg+N 17 | -----END CERTIFICATE REQUEST----- 18 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/example.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEpAIBAAKCAQEAzZTqR6mTns59UEh3VvuSSAoXkIi8pau/g73EKH4ucylbGakD 3 | gjI3rqp3Yb1oKJcr13RsI2+57xMLc/kseVyt+sj7IeYCBGNgB7bj2J+QVxDQc+o8 4 | x7cHf6XnSQJGSfZZ3THYKWnP+0YtdUnYBH+JixMA4YzMLY4t2/8XWlsMkjDg9MtN 5 | AF8NOHab9L8YfPhvll6042Vsc9ejYf/CPr84cUCRy++XNBpCvxhPT+7IfMkFroBA 6 | 9CpSWOMlJIAeecLh8BLBoUtQaNQutCaHAnOS26QWTZ5caK6ijemuCgNCpG+Se71L 7 | iyRPGfoq7n1UaMx9QxMTFI7LfOdKrJ7cdJUkEQIDAQABAoIBAG648olQMrqIWgPA 8 | U84cRkfYb6KfkoLkAozQyvJIK3pI3tDuL36Sz1yaYRvaKFwcNzeec5OOXCUAK931 9 | aNega++zCVbTi2iToSfmf8avAc1yt+KGWN/zmu1MDEpNGFBDh1jTvKlpXOPngxo1 10 | gEvD6O9nd8UC0QEEH3zqYch/W0DsbK1GL+P8D32UzGrZFNiZPa3MzVLz+JDXgSak 11 | +Vpy6M5wJ9jNIQGtylM7COpXDay3TU0dvKsXKh41R45fbw9GXHHbG+bUHFdIPCAI 12 | xElcw/v3igHEm7m3kRo8+KLkcRwCFMCWsspYB0mKmk5CSu5Z8EeGszawTPvPCkwE 13 | l3roOaUCgYEA9mUsNBGJ9GBKgoC+0Y1vbBKXf0r2Zc5evQg2O7kJVlK+iMirB4X5 14 | Rcve4wbXJHVSG1BC1yD6OGVxd781eTPYAJ/7JcajOmGK739ycwpAg/0ObC11GmCC 15 | 9h5jFB6DyPHyBLkjbAe68Qdceixb2G7t1Fw0sKpN6baucNXxB03CQp8CgYEA1Zh0 16 | 0A5f+8y/Qe7o3+1lIAocJTLizDZf0AhwhvCdLbgfDDgCins7CFvG92el+6EiwStl 17 | 3QztnS3pqxs4K3NwnY5hyqY2QMfQLmKDNOr0n7zXUQm5VloLb7tLbVLkeEPif8d+ 18 | T1fREeDoD6lUT2OHVQrSF8ntiSqxsgGQt20iS08CgYAkEeeIr9CcP8RommRU1Pms 19 | voQDFHxBpxZjYeJel9XwFyjhaU7wSQKW3yN/5K52Hd6pNPSz/ZXjz6Xuu8UeOyKx 20 | 5LmlbBDVKRZnvSaKBMQxDkigOX9dHyfM0+H5PgZY0mJ1ooy28eLCCivVjszbQFH8 21 | torYGfZR8nZS+l7QjOeVTQKBgQC59GC+QcWOklJwNG7JhQPlQOf/+q043J3Nn4tX 22 | 72LnysQ8/wY3SdG7FSvDeJko2MBJLF8ic37quG4WaTwdmAMTKEI7CzlwbITx3RId 23 | n/AYoW9TPgP9CaerPoQMSX5etbsbQ7LToMCDsCpYeDLOavgHMcR2sXX7VRAeyP4U 24 | sw6IQQKBgQCBbg28NlJisJZ5InviYD6riuBZEf6h9SAFijdOSnb25vmdAOVKvG8d 25 | TFcxGNJnRcW2evdbr8Wg4AfhZQe+IqtDQIK/tITd1aUMhIX9Ij0xTOSABQPfFNvX 26 | UhERyqD/y4R3CCrwRIZ7mO995A8SXjPQowyid/GXlehtdjSPqeQEKQ== 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/expired/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Generating an SSL key..." 6 | openssl genrsa -out example.key 2048 7 | 8 | echo "Generating a CSR..." 9 | openssl req -new -key example.key -out example.csr -config example.cnf 10 | 11 | echo "Generating a Certificate (enter 13243546 when asked for a password)..." 
12 | openssl x509 -req \ 13 | -in example.csr \ 14 | -CA ../ca.crt \ 15 | -CAkey ../ca.key \ 16 | -set_serial 123456789 \ 17 | -out example.crt \ 18 | -days 1 \ 19 | -sha256 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/invalid.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | This is not a valid certificate! 3 | -----END CERTIFICATE----- 4 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.cnf: -------------------------------------------------------------------------------- 1 | FQDN = example.org 2 | ORGNAME = Elastic 3 | ALTNAMES = DNS:$FQDN, DNS:www.$FQDN 4 | 5 | [ req ] 6 | default_bits = 2048 7 | default_md = sha256 8 | prompt = no 9 | encrypt_key = no 10 | distinguished_name = dn 11 | req_extensions = req_ext 12 | 13 | [ dn ] 14 | C = CH 15 | O = $ORGNAME 16 | CN = $FQDN 17 | 18 | [ req_ext ] 19 | subjectAltName = $ALTNAMES 20 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIDTjCCAjYCBAdbzRUwDQYJKoZIhvcNAQELBQAwgaExCzAJBgNVBAYTAlVTMQsw 3 | CQYDVQQIDAJDQTEWMBQGA1UEBwwNU2FuIEZyYW5jaXNjbzEQMA4GA1UECgwHRWxh 4 | c3RpYzEaMBgGA1UECwwRRW50ZXJwcmlzZSBTZWFyY2gxEjAQBgNVBAMMCWN1c3Rv 5 | bS1jYTErMCkGCSqGSIb3DQEJARYcZW50ZXJwcmlzZS1zZWFyY2hAZWxhc3RpYy5j 6 | bzAeFw0yMTA2MTAxNjMwNTBaFw00ODEwMjUxNjMwNTBaMDUxCzAJBgNVBAYTAkNI 7 | MRAwDgYDVQQKDAdFbGFzdGljMRQwEgYDVQQDDAtleGFtcGxlLm9yZzCCASIwDQYJ 8 | KoZIhvcNAQEBBQADggEPADCCAQoCggEBAOLX3PWCHRPmg4Zp70lkEuJqpzE/8Oa3 9 | 2G9+YCD02+dMJxqyCDsN3gS38OC8nPaHXrEKHVngUDHFrpWeJ70IZTK5yV/TlYM2 10 | Xtjgmq0Mwe7X96SF6lxBax6/zlbAFU0xJNG1KKxx8mUV35eIIkmN1/64HvvSDae5 11 | fTM0NVmv7TYcv8XNXTEDtQR+fkQhN5fZqFWd7/WNiW6nvhi3L/2X4jiS6BEWNQL/ 12 | tprVyqxQAwIYUxsgFx8WyWvKEJyoylbS/vqfaJaayNkUid1655zrGKpZLKWY66U/ 13 | 9DoUHeuJQ/SDKM5Aa1QmYkGojyUjpbRJ2jDqOLxBAUCrs+f1yDhCArkCAwEAATAN 14 | BgkqhkiG9w0BAQsFAAOCAQEAjMHM0yvEjR478ZyewC4TDNtcv2Eky9zZYz4H/NIJ 15 | vCNzz/PMXoWKZJzGNd+R4OBOghriO6mhXl7qYb1Ci12XASTmxi1fR7/HVhtBuNIX 16 | QYMWitFDGtOAiGvoNwmc1Uh24SrH7E30HW3fsiXk9UF/8uxn7kNBApJ7rg3PcsRs 17 | bNqHVULVU2I6q0NX/Y1igi1PdfKBwYKJAO/LDiXV0iafZYUfBBFb7qBgikl6g62X 18 | ulVVvCGZpZSg6YmwrLGVCT++ESnw5ejNs+3OWIYnE8tGVRwMOiHUiEVu9PtOp3Ag 19 | YOXa0egoUU7pbUzP9J438OXuiqfx+riGOqVVERv/EXm67w== 20 | -----END CERTIFICATE----- 21 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.csr: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE REQUEST----- 2 | MIICtDCCAZwCAQAwNTELMAkGA1UEBhMCQ0gxEDAOBgNVBAoMB0VsYXN0aWMxFDAS 3 | BgNVBAMMC2V4YW1wbGUub3JnMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKC 4 | AQEA4tfc9YIdE+aDhmnvSWQS4mqnMT/w5rfYb35gIPTb50wnGrIIOw3eBLfw4Lyc 5 | 9odesQodWeBQMcWulZ4nvQhlMrnJX9OVgzZe2OCarQzB7tf3pIXqXEFrHr/OVsAV 6 | TTEk0bUorHHyZRXfl4giSY3X/rge+9INp7l9MzQ1Wa/tNhy/xc1dMQO1BH5+RCE3 7 | l9moVZ3v9Y2Jbqe+GLcv/ZfiOJLoERY1Av+2mtXKrFADAhhTGyAXHxbJa8oQnKjK 8 | VtL++p9olprI2RSJ3XrnnOsYqlkspZjrpT/0OhQd64lD9IMozkBrVCZiQaiPJSOl 9 | tEnaMOo4vEEBQKuz5/XIOEICuQIDAQABoDowOAYJKoZIhvcNAQkOMSswKTAnBgNV 10 | HREEIDAeggtleGFtcGxlLm9yZ4IPd3d3LmV4YW1wbGUub3JnMA0GCSqGSIb3DQEB 11 | CwUAA4IBAQAqA+uQgUZ5TQzDylCJjKTh3zgFHuSwOhlpPy930XUfccE+AmjF3VKD 12 | 
y4bCDc21IhQzYv1TD/2TXkTDoL4aIENP0b0AxgRFEV5reDhh0/RgcojgwdasNrG4 13 | Wymcqzdai+ZRaUCDvx9Llgus5qyajeeQ2z6SahCKerwOHo7WPO+s0q/yNVYCpQC7 14 | bq3vTimKbjDtX8HrYyLE6DDcvqevtwJCGhiH/YKyfHA75mYp5MJiCrHSG/grdt89 15 | qK8TUKvDm5xfvfEIY65nzKhMHz4RKG8WVyEMURnqpsQEJP+bYjbqDJZCb/sckTUj 16 | K2mLd3Ik/YUnvuWHetuBdw6fQIHR/j2y 17 | -----END CERTIFICATE REQUEST----- 18 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/example.key: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | MIIEowIBAAKCAQEA4tfc9YIdE+aDhmnvSWQS4mqnMT/w5rfYb35gIPTb50wnGrII 3 | Ow3eBLfw4Lyc9odesQodWeBQMcWulZ4nvQhlMrnJX9OVgzZe2OCarQzB7tf3pIXq 4 | XEFrHr/OVsAVTTEk0bUorHHyZRXfl4giSY3X/rge+9INp7l9MzQ1Wa/tNhy/xc1d 5 | MQO1BH5+RCE3l9moVZ3v9Y2Jbqe+GLcv/ZfiOJLoERY1Av+2mtXKrFADAhhTGyAX 6 | HxbJa8oQnKjKVtL++p9olprI2RSJ3XrnnOsYqlkspZjrpT/0OhQd64lD9IMozkBr 7 | VCZiQaiPJSOltEnaMOo4vEEBQKuz5/XIOEICuQIDAQABAoIBABTjkepdv/W8LXJs 8 | QOe+Omr1LU5AuBtW5Kxns8x1H+btwVAZAt8FSOOgWKMpWz7selDNQKStHlVnAcuv 9 | U7N5mXARYbTcVBFQKW3JSRUUbqti4eAZoNo7//RF72dXqt5/3wccqpEusQaT/BIF 10 | LDsfv7sqE1hXIIDIePoFHcCTjcGEudIFEHDTZQ6Ip/zKdl6oeOTIhqTejdX3KWwj 11 | ERDn7L9QjGW2lgpNJzfzYqKHL8lrpsGYgPf4HN9LmF25tLOus/ZIDJ/1RvWDjtYp 12 | WNcmnDFqV24mXOizGMv0i4KRrLq1GY1dmb1CiHERM1rejSIfN/aIWNU4oF2nTvCd 13 | kodd6u0CgYEA96zj0fR/Zrh0f0Bbmzm2qMecdKAUCo+0gXElSzJdZkpyMTsZ265G 14 | nFXeNBetcPzi4Lh+AbkdAUcpu+z231QzKbybf1Q+zVrKLNMphBq6xLscIOVttAYE 15 | vrHPIgAPBZ0KEejHA6xUqRyNdkoP+x6f4wsE/tFyLFINQXSgT7/0orsCgYEA6ne5 16 | YOgcC1zOi3IgrZW2bI+vTDB6BGJzKCdF/0O96d7jAab8Zt4Re3KKzOxdlViAZOKv 17 | jIir0m8CgNTELry+Iqhx0sbWtGhWewALcHe8Cs1+FMPeOVDs1vpHILFFP89mPoni 18 | eQ2n/tBzxxxoCb6rG21Q9J9d+MO0WkZZBxX/exsCgYAR83f3qa7qNQhMiM+a6o3w 19 | obcXRNrvAQdmMlsvnhDi7xZjtxLitzjq79ZRFD4/6DRRcU3AtjgB9bRyqHQkL6gd 20 | qEvk6Kg8ng31PcDOkFllFOKvB7Hx0FXbtGt83WA5We526dYyz/S65RTjs+6AlvGj 21 | tRLBnVCXIcNQMTHFVfZXLQKBgQCvedrrg8s5VcPe6RM71ogox4BSbRVkoqm8q2ff 22 | mztPBNiwK+FKu3gqA5eNtnhzhUDSQDVR4Bd37kzZTmNk9yz1k0tcjCOz8UKH24i0 23 | K2g4TYLG17BBBSe73KPO+9zv3LCQrXEpV+ca9bcwlTnn0SMN68piycLyosUfqvaG 24 | Lxh9cQKBgFSEMrHyrf0I9ZvyWB7FTqKgq9O99S2YQg79S0MrD/IRwv8eMR5ISjG8 25 | JUC8S2yLSzXq5FSjZN+T5nZoL/QLywCJoRZz8W2UobZV01afu+LC6uQ8jNrKhlYN 26 | iQVlB9BkEO3nm/xGd+Ay8C1nHcGvNVjVdsL1Rqa9KjxGxNHOfuHb 27 | -----END RSA PRIVATE KEY----- 28 | -------------------------------------------------------------------------------- /spec/fixtures/ssl/self-signed/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | echo "Generating an SSL key..." 6 | openssl genrsa -out example.key 2048 7 | 8 | echo "Generating a CSR..." 9 | openssl req -new -key example.key -out example.csr -config example.cnf 10 | 11 | echo "Generating a Certificate (enter 13243546 when asked for a password)..." 12 | openssl x509 -req \ 13 | -in example.csr \ 14 | -CA ../ca.crt \ 15 | -CAkey ../ca.key \ 16 | -set_serial 123456789 \ 17 | -out example.crt \ 18 | -days 9999 \ 19 | -sha256 20 | -------------------------------------------------------------------------------- /spec/integration/charset_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Content charset' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/utf8-without-charset' 15 | link_to '/utf8-with-charset' 16 | end 17 | end 18 | 19 | page '/utf8-with-charset' do 20 | headers 'Content-Type' => 'text/html; charset=UTF-8' 21 | body do 22 | text { "ma\u00F1ana ol\u00E9" } 23 | end 24 | end 25 | 26 | page '/utf8-without-charset' do 27 | headers 'Content-Type' => 'text/html' 28 | body do 29 | text { "ma\u00F1ana ol\u00E9" } 30 | end 31 | end 32 | end 33 | end 34 | 35 | it 'defaults to UTF-8' do 36 | results = FauxCrawl.run(site) 37 | 38 | expect(results).to have_only_these_results [ 39 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 40 | mock_response(url: 'http://127.0.0.1:9393/utf8-with-charset', status_code: 200, 41 | content: "ma\u00F1ana ol\u00E9"), 42 | mock_response(url: 'http://127.0.0.1:9393/utf8-without-charset', status_code: 200, 43 | content: "ma\u00F1ana ol\u00E9") 44 | ] 45 | end 46 | 47 | it 'can override fallback encoding' do 48 | results = FauxCrawl.run(site, default_encoding: 'ISO-8859-1') 49 | 50 | expect(results).to have_only_these_results [ 51 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 52 | mock_response(url: 'http://127.0.0.1:9393/utf8-with-charset', status_code: 200, 53 | content: "ma\u00F1ana ol\u00E9"), 54 | mock_response(url: 'http://127.0.0.1:9393/utf8-without-charset', status_code: 200, 55 | content: String.new("ma\xC3\xB1ana ol\xC3\xA9", encoding: 'ISO-8859-1')) 56 | ] 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/integration/content_extraction_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Content extractable file support' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/html' 15 | link_to '/pdf' 16 | link_to '/powerpoint' 17 | link_to '/word' 18 | end 19 | end 20 | 21 | page '/html' do 22 | headers 'Content-Type' => 'text/html; charset=UTF-8' 23 | end 24 | 25 | page '/pdf' do 26 | headers 'Content-Type' => 'application/pdf' 27 | end 28 | 29 | page '/powerpoint' do 30 | headers 'Content-Type' => 'application/vnd.ms-powerpoint' 31 | end 32 | 33 | page '/word' do 34 | headers 'Content-Type' => 'application/msword' 35 | end 36 | end 37 | end 38 | 39 | it 'supports single and multiple Content-Type headers' do 40 | results = FauxCrawl.run( 41 | site, 42 | content_extraction: { 43 | enabled: true, 44 | mime_types: [ 45 | 'application/pdf', 46 | 'application/vnd.ms-powerpoint' 47 | ] 48 | } 49 | ) 50 | 51 | expect(results).to have_only_these_results [ 52 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 53 | mock_response(url: 'http://127.0.0.1:9393/html', status_code: 200), 54 | mock_response(url: 'http://127.0.0.1:9393/pdf', status_code: 200), 55 | mock_response(url: 'http://127.0.0.1:9393/powerpoint', status_code: 200) 56 | ] 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /spec/integration/nofollow_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Robots meta support' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/noindex' 15 | link_to '/nofollow' 16 | 17 | # This link will not be followed 18 | link_to '/unreachable', rel: :nofollow 19 | end 20 | end 21 | 22 | # Should not be indexed, but the links should be followed 23 | page '/noindex' do 24 | head { robots 'noindex' } 25 | body { link_to '/foo' } 26 | end 27 | 28 | # Should be indexed, but the links should not be followed 29 | page '/nofollow' do 30 | head { robots 'nofollow' } 31 | body { link_to '/unreachable' } 32 | end 33 | 34 | # Only reachable via /noindex 35 | page '/foo' 36 | 37 | # Only reachable via nofollow links and pages, so the crawler won't ever find this 38 | page '/unreachable' 39 | end 40 | end 41 | 42 | it 'crawls all pages given the constraints specified by robots meta tags' do 43 | expect(results).to have_only_these_results [ 44 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 45 | mock_response(url: 'http://127.0.0.1:9393/nofollow', status_code: 200), 46 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200) 47 | ] 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/integration/response_content_type_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Response Content-Type support' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/html' 15 | link_to '/pdf' 16 | link_to '/pdf-multi-header' 17 | end 18 | end 19 | 20 | page '/html' do 21 | headers 'Content-Type' => 'text/html; charset=UTF-8' 22 | end 23 | 24 | page '/pdf' do 25 | headers 'Content-Type' => 'application/pdf' 26 | end 27 | 28 | page '/pdf-multi-header' do 29 | headers 'Content-Type' => ['application/pdf', 'text/html; charset=UTF-8'] 30 | end 31 | end 32 | end 33 | 34 | it 'supports single and multiple Content-Type headers' do 35 | expect(results).to have_only_these_results [ 36 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 37 | mock_response(url: 'http://127.0.0.1:9393/html', status_code: 200) 38 | ] 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/integration/response_limits_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Generate a large enough random string that would require multiple TCP-packets to download 10 | require 'securerandom' 11 | MULTI_CHUNK_BODY = SecureRandom.alphanumeric(12_345) 12 | 13 | RSpec.describe 'Per-request resource limits support' do 14 | let(:results) do 15 | FauxCrawl.crawl_site do 16 | page '/' do 17 | body do 18 | link_to '/multi-chunk' 19 | link_to '/too-big' 20 | end 21 | end 22 | 23 | # Should be indexed, downloads will produce multiple chunks 24 | page '/multi-chunk' do 25 | def response_body 26 | [MULTI_CHUNK_BODY] 27 | end 28 | end 29 | 30 | # Should not be indexed because it is too big 31 | page '/too-big' do 32 | def response_body 33 | ['x' * 11_000_000] 34 | end 35 | end 36 | end 37 | end 38 | 39 | it 'crawls all pages given the constraints specified by resource limits' do 40 | expect(results).to have_only_these_results [ 41 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 42 | mock_response(url: 'http://127.0.0.1:9393/multi-chunk', status_code: 200) 43 | ] 44 | end 45 | 46 | it 'should correctly download multi-chunk responses' do 47 | multi_chunk_response = results.find { |r| r.url.to_s =~ /multi-chunk$/ } 48 | expect(multi_chunk_response.content).to eq(MULTI_CHUNK_BODY) 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/integration/seed_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Seed URLs' do 10 | let(:site) do 11 | Faux.site do 12 | page '/foo' 13 | page '/baz' 14 | end 15 | end 16 | 17 | it 'crawls all of the seed urls specified by the config' do 18 | results = FauxCrawl.run(site, seed_urls: %w[http://127.0.0.1:9393/foo http://127.0.0.1:9393/baz]) 19 | 20 | expect(results).to have_only_these_results [ 21 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 22 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 23 | ] 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /spec/integration/sitemap_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Sitemaps Support' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/foo' 15 | end 16 | end 17 | 18 | # Could be discovered via the home page or the sitemap 19 | page '/foo' 20 | 21 | # Not linked directly, but discoverable via the sitemap 22 | page '/bar' do 23 | body do 24 | link_to '/baz' 25 | end 26 | end 27 | 28 | # Not linked directly, but discoverable via '/bar' 29 | page '/baz' 30 | 31 | sitemap '/sitemap.xml' do 32 | link_to '/' 33 | link_to '/foo' 34 | link_to '/bar' 35 | end 36 | end 37 | end 38 | 39 | it 'makes it possible to use sitemap seed URLs for discovering links on a site' do 40 | results = FauxCrawl.run( 41 | site, 42 | seed_urls: ['http://127.0.0.1:9393/'], 43 | sitemap_urls: ['http://127.0.0.1:9393/sitemap.xml'] 44 | ) 45 | 46 | expect(results).to have_only_these_results [ 47 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 48 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 49 | mock_response(url: 'http://127.0.0.1:9393/bar', status_code: 200), 50 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 51 | ] 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /spec/integration/timeouts/socket_timeout_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Request to a site that is very slow to send us any data' do 10 | let(:site) do 11 | Faux.site do 12 | page '/' do 13 | body do 14 | link_to '/timeout' 15 | end 16 | end 17 | 18 | page '/timeout' do 19 | def response_body 20 | sleep 5 21 | 22 | ['Output'] 23 | end 24 | end 25 | end 26 | end 27 | 28 | it 'times out' do 29 | results = FauxCrawl.run(site, timeouts: { socket_timeout: 2 }) 30 | 31 | expect(results).to have_only_these_results [ 32 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200) 33 | ] 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/integration/url_fragments_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'URL normalization in the presence of URL fragments' do 10 | let(:results) do 11 | FauxCrawl.crawl_site do 12 | page '/' do 13 | body do 14 | link_to '/foo' 15 | link_to '/foo#bar' 16 | link_to '/baz#hello' 17 | end 18 | end 19 | 20 | page '/foo' 21 | page '/baz' 22 | end 23 | end 24 | 25 | it 'crawls discovered URLs while stripping out the fragments' do 26 | expect(results).to have_only_these_results [ 27 | mock_response(url: 'http://127.0.0.1:9393/', status_code: 200), 28 | mock_response(url: 'http://127.0.0.1:9393/foo', status_code: 200), 29 | mock_response(url: 'http://127.0.0.1:9393/baz', status_code: 200) 30 | ] 31 | end 32 | end 33 | -------------------------------------------------------------------------------- /spec/lib/crawler/cli/version_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::CLI::Version) do 10 | describe '.call' do 11 | let(:version_path) { File.expand_path('../../../../product_version', __dir__) } 12 | 13 | it 'prints the current version from product_version_file' do 14 | expect(File).to receive(:read).with(version_path).and_return('1.0.0') 15 | expect { described_class.new.call }.to output("1.0.0\n").to_stdout 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/lib/crawler/content_engine/utils_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::ContentEngine::Utils) do 10 | describe '.node_descendant_text' do 11 | it 'should raise an error unless given a node object' do 12 | expect do 13 | Crawler::ContentEngine::Utils.node_descendant_text('something') 14 | end.to raise_error(ArgumentError, /node-like/) 15 | end 16 | 17 | it 'should replace break tags with spaces' do 18 | node = Nokogiri::HTML('Hello,<br>World!') 19 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node)).to eq('Hello, World!') 20 | end 21 | 22 | context 'with uncrate.com pages' do 23 | let(:content) { read_fixture('uncrate.com.html') } 24 | let(:html) { Nokogiri::HTML(content) } 25 | 26 | it 'should have a reasonable performance' do 27 | duration = Benchmark.measure do 28 | Crawler::ContentEngine::Utils.node_descendant_text(html) 29 | end 30 | 31 | # It usually takes ~250 msec, used to take 180 sec before we fixed it, so let's aim for something reasonable 32 | expect(duration.real).to be < 5 33 | end 34 | end 35 | 36 | context 'with ignore_tags' do 37 | it 'ignores <script> tags' do 38 | node = Nokogiri::HTML('<script>Script body</script><p>P body</p>') 39 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node)).to eq('P body') 40 | end 41 | end 42 | 43 | context 'without ignore_tags' do 44 | it 'does not ignore <script> tags' do 45 | node = Nokogiri::HTML('<script>Script body</script><p>P body</p>
') 46 | expect(Crawler::ContentEngine::Utils.node_descendant_text(node, [])).to eq('Script body P body') 47 | end 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/crawl_task_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe Crawler::Data::CrawlTask do 10 | let(:url) { Crawler::Data::URL.parse('https://example.com/') } 11 | let(:task) { Crawler::Data::CrawlTask.new(url:, type: :content, depth: 1) } 12 | 13 | describe '#inspect' do 14 | it 'should return a nice representation of the object for logging' do 15 | expect(task.inspect).to be_a(String) 16 | expect(task.inspect).to match(/CrawlTask/) 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/domain_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::Domain) do 10 | def domain(url) 11 | Crawler::Data::Domain.new(url) 12 | end 13 | 14 | it 'should include the standard port in the normalized version' do 15 | expect(domain('http://google.com').to_s).to eq('http://google.com:80') 16 | expect(domain('https://google.com').to_s).to eq('https://google.com:443') 17 | end 18 | 19 | it 'should include the custom port in the normalized version' do 20 | expect(domain('https://google.com:123').to_s).to eq('https://google.com:123') 21 | end 22 | 23 | it 'should strip out the path' do 24 | expect(domain('https://google.com/something').to_s).to eq('https://google.com:443') 25 | end 26 | 27 | it 'should strip out the URL fragment' do 28 | expect(domain('https://google.com/something#foo').to_s).to eq('https://google.com:443') 29 | end 30 | 31 | context 'when compared to other objects' do 32 | it 'should use the normalized version for comparison' do 33 | expect(domain('https://google.com/something#foo') == 'https://google.com:443').to be(true) 34 | end 35 | end 36 | 37 | describe '#robots_txt_url' do 38 | it 'should return URL with /robots.txt as the path' do 39 | expect(domain('https://google.com').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 40 | expect(domain('https://google.com/something#foo').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 41 | expect(domain('https://google.com/something?q=v').robots_txt_url.to_s).to eq('https://google.com/robots.txt') 42 | expect(domain('https://google.com:123').robots_txt_url.to_s).to eq('https://google.com:123/robots.txt') 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/rule_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::Rule) do 10 | describe '#url_match?' do 11 | it 'allows rule' do 12 | rule = Crawler::Data::Rule.new(:allow, url_pattern: %r{\Ahttp://example.com/test[0-9]}) 13 | 14 | expect(rule.policy).to eq(:allow) 15 | expect(rule.url_match?(Crawler::Data::URL.parse('http://example.com/test1'))).to eq(true) 16 | expect(rule.url_match?(Crawler::Data::URL.parse('http://example.com/testx'))).to eq(false) 17 | end 18 | 19 | it 'denies rule' do 20 | rule = Crawler::Data::Rule.new(:deny, url_pattern: %r{\Ahttp://test[0-9].example.com}) 21 | 22 | expect(rule.policy).to eq(:deny) 23 | expect(rule.url_match?(Crawler::Data::URL.parse('http://test1.example.com'))).to eq(true) 24 | expect(rule.url_match?(Crawler::Data::URL.parse('http://testx.example.com'))).to eq(false) 25 | end 26 | 27 | it 'should time out on really complex matching rules' do 28 | regex = /((((((a*)*)*)*)*)*)*((((((a*)*)*)*)*)*)*((((((a*)*)*)*)*)*)*$/ 29 | rule = Crawler::Data::Rule.new(:deny, url_pattern: regex) 30 | url = Crawler::Data::URL.parse('http://test1.example.com//aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab') 31 | 32 | expect { rule.url_match?(url) }.to raise_error(Timeout::Error) 33 | end 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/lib/crawler/data/url_queue_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::Data::UrlQueue) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | let(:config) do 13 | Crawler::API::Config.new( 14 | domains: 15 | ) 16 | end 17 | 18 | describe '.create' do 19 | it 'should return a queue object' do 20 | queue = Crawler::Data::UrlQueue.create(config) 21 | expect(queue).to be_kind_of(Crawler::Data::UrlQueue::Base) 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /spec/lib/crawler/http_utils/config_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::HttpUtils::Config) do 10 | describe 'constructor' do 11 | let(:valid_config) do 12 | { 13 | loopback_allowed: false, 14 | private_networks_allowed: false, 15 | logger: Logger.new($stdout) 16 | } 17 | end 18 | 19 | described_class::REQUIRED_OPTIONS.each do |opt| 20 | it "requires #{opt} option" do 21 | expect do 22 | described_class.new(valid_config.except(opt)) 23 | end.to raise_error(ArgumentError, "#{opt} is a required option") 24 | end 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/lib/crawler/http_utils/response_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::HttpUtils::Response) do 10 | let(:url) { Crawler::Data::URL.parse('http://example.org/') } 11 | let(:response) do 12 | Crawler::HttpUtils::Response.new( 13 | apache_response:, 14 | url:, 15 | request_start_time: 1.second.ago, 16 | request_end_time: Time.now 17 | ) 18 | end 19 | 20 | #------------------------------------------------------------------------------------------------- 21 | describe '#check_content_encoding' do 22 | let(:response_entity) { double(:response_entity, content_encoding: encoding) } 23 | let(:apache_response) { double(:apache_response, entity: response_entity) } 24 | 25 | def check_content_encoding 26 | response.send(:check_content_encoding) 27 | end 28 | 29 | context 'when given a supported content encoding' do 30 | let(:encoding) { 'gzip' } 31 | it 'should succeed' do 32 | expect { check_content_encoding }.to_not raise_error 33 | end 34 | end 35 | 36 | context 'when given a list of supported content encodings' do 37 | let(:encoding) { 'gzip,deflate' } 38 | it 'should succeed' do 39 | expect { check_content_encoding }.to_not raise_error 40 | end 41 | end 42 | 43 | context 'when given an unsupported content encoding' do 44 | let(:encoding) { 'banana' } 45 | it 'should fail' do 46 | expect { check_content_encoding }.to raise_error(Crawler::HttpUtils::InvalidEncoding) 47 | end 48 | end 49 | 50 | context 'when given a list with an unsupported content encoding' do 51 | let(:encoding) { 'gzip,banana' } 52 | it 'should fail' do 53 | expect { check_content_encoding }.to raise_error(Crawler::HttpUtils::InvalidEncoding) 54 | end 55 | end 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /spec/lib/crawler/output_sink/file_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::OutputSink::File) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | context '#initialize' do 13 | def new_sink(config) 14 | Crawler::OutputSink::File.new(config) 15 | end 16 | 17 | it 'has a default output directory of ./crawled_docs' do 18 | config = Crawler::API::Config.new( 19 | domains:, 20 | output_sink: './crawled_docs' 21 | ) 22 | 23 | expect { new_sink(config) }.to_not raise_error 24 | expect(config.output_dir).to eq('./crawled_docs') 25 | end 26 | 27 | it 'should create the output directory' do 28 | dir = '/some/directory' 29 | config = Crawler::API::Config.new( 30 | domains:, 31 | output_sink: 'file', 32 | output_dir: dir 33 | ) 34 | expect(FileUtils).to receive(:mkdir_p).with(dir) 35 | new_sink(config) 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /spec/lib/crawler/output_sink_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::OutputSink) do 10 | let(:domains) { [{ url: 'http://example.com' }] } 11 | 12 | let(:es_client) { double } 13 | let(:es_client_indices) { double(:es_client_indices, exists: double) } 14 | let(:build_info) { { version: { number: '8.99.0', build_flavor: 'default' } }.deep_stringify_keys } 15 | 16 | before(:each) do 17 | allow(ES::Client).to receive(:new).and_return(es_client) 18 | allow(es_client).to receive(:indices).and_return(es_client_indices) 19 | allow(es_client).to receive(:info).and_return(build_info) 20 | end 21 | 22 | context '.create' do 23 | it 'should validate the sync name' do 24 | config = Crawler::API::Config.new( 25 | domains:, 26 | output_sink: 'magnetic-tape' 27 | ) 28 | 29 | expect do 30 | Crawler::OutputSink.create(config) 31 | end.to raise_error(/Unknown output sink/) 32 | end 33 | 34 | it 'should return a new sink object of a correct type' do 35 | config = Crawler::API::Config.new( 36 | domains:, 37 | output_sink: :elasticsearch, 38 | output_index: 'some-index-name', 39 | elasticsearch: { 40 | host: 'http://localhost', 41 | port: 1234, 42 | api_key: 'key' 43 | } 44 | ) 45 | 46 | sink = Crawler::OutputSink.create(config) 47 | expect(sink).to be_kind_of(Crawler::OutputSink::Elasticsearch) 48 | end 49 | end 50 | end 51 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/crawl_rules_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | # Mock class definitions 10 | module Crawler 11 | module RuleEngine 12 | class Elasticsearch < Crawler::RuleEngine::Base 13 | def crawl_rules_outcome(url) end 14 | end 15 | end 16 | end 17 | 18 | RSpec.describe(Crawler::UrlValidator) do 19 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 20 | let(:domain_allowlist) { ['example.com'] } 21 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 22 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 23 | let(:rule_engine) { double('Crawler::RuleEngine::Elasticsearch') } 24 | let(:outcome) { double('Outcome', allowed?: allowed, details: { rule: }) } 25 | let(:rule) { double('Rule', source: 'some_rule_source') } 26 | 27 | describe '#validate_crawl_rules' do 28 | before do 29 | allow(Crawler::RuleEngine::Elasticsearch).to receive(:new).with(crawl_config).and_return(rule_engine) 30 | allow(rule_engine).to receive(:crawl_rules_outcome).with(validator.normalized_url).and_return(outcome) 31 | allow(validator).to receive(:validation_ok) 32 | allow(validator).to receive(:validation_fail) 33 | end 34 | 35 | context 'when the URL is allowed by a crawl rule' do 36 | let(:allowed) { true } 37 | 38 | it 'calls validation_ok' do 39 | validator.validate_crawl_rules 40 | expect(validator) 41 | .to have_received(:validation_ok) 42 | end 43 | end 44 | 45 | context 'when the URL is denied by a crawl rule' do 46 | let(:allowed) { false } 47 | 48 | it 'calls validation_fail' do 49 | validator.validate_crawl_rules 50 | expect(validator) 51 | .to have_received(:validation_fail) 52 | end 53 | end 54 | 55 | context 'when the URL is denied because it did not match any rules' do 56 | let(:allowed) { false } 57 | let(:rule) { nil } 58 | 59 | it 'calls validation_fail' do 60 | validator.validate_crawl_rules 61 | expect(validator) 62 | .to have_received(:validation_fail) 63 | end 64 | end 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/domain_access_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::UrlValidator) do 10 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 11 | let(:domain_allowlist) { ['example.com'] } 12 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 13 | let(:url) { instance_double('Crawler::Data::URL', domain: domain_allowlist[0], domain_name: domain_allowlist[0]) } 14 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 15 | 16 | describe '#validate_domain_access' do 17 | before do 18 | validator.singleton_class.include(Crawler::UrlValidator::DomainAccessCheckConcern) 19 | allow(validator).to receive(:crawler_api_config).and_return(crawl_config) 20 | allow(validator).to receive(:url).and_return(url) 21 | allow(validator).to receive(:validation_ok) 22 | allow(validator).to receive(:validation_fail) 23 | end 24 | 25 | context 'when the URL matches one of the configured domains' do 26 | it 'calls validation_ok with the correct parameters' do 27 | validator.validate_domain_access 28 | expect(validator) 29 | .to have_received(:validation_ok) 30 | .with(:domain_access, 'The URL matches one of the configured domains', domain: 'example.com') 31 | end 32 | end 33 | 34 | context 'when the URL does not match any configured domains' do 35 | let(:url) { instance_double('Crawler::Data::URL', domain: 'notexample.com', domain_name: 'notexample.com') } 36 | 37 | it 'calls validation_fail with the correct parameters' do 38 | validator.validate_domain_access 39 | expect(validator) 40 | .to have_received(:validation_fail) 41 | .with(:domain_access, 'The URL does not match any configured domains') 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler/url_validator/domain_uniqueness_check_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler::UrlValidator) do 10 | let(:valid_url) { Crawler::Data::URL.parse('http://example.com') } 11 | let(:domain_allowlist) { ['example.com'] } 12 | let(:crawl_config) { double('CrawlConfig', domain_allowlist:) } 13 | let(:validator) { described_class.new(url: valid_url, crawl_config:) } 14 | let(:url) { instance_double('Crawler::Data::URL', domain: domain_allowlist[0], domain_name: domain_allowlist[0]) } 15 | 16 | describe '#validate_domain_uniqueness' do 17 | before do 18 | validator.singleton_class.include(Crawler::UrlValidator::DomainUniquenessCheckConcern) 19 | allow(validator).to receive(:crawler_api_config).and_return(crawl_config) 20 | allow(validator).to receive(:url).and_return(url) 21 | allow(validator).to receive(:validation_ok) 22 | allow(validator).to receive(:validation_fail) 23 | end 24 | 25 | context 'when the domain name already exists' do 26 | it 'calls validation_fail with the correct parameters' do 27 | validator.validate_domain_uniqueness 28 | expect(validator) 29 | .to have_received(:validation_fail) 30 | .with(:domain_uniqueness, 'Domain name already exists') 31 | end 32 | end 33 | 34 | context 'when the domain name is new' do 35 | let(:url) { instance_double('Crawler::Data::URL', domain: 'newexample.com', domain_name: 'newexample.com') } 36 | 37 | it 'calls validation_ok with the correct parameters' do 38 | validator.validate_domain_uniqueness 39 | expect(validator) 40 | .to have_received(:validation_ok) 41 | .with(:domain_uniqueness, 'Domain name is new', domain: 'newexample.com') 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /spec/lib/crawler_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe(Crawler) do 10 | it 'should define a version' do 11 | expect(Crawler.version).to be_a(String) 12 | end 13 | 14 | context '.service_id' do 15 | it 'should be cached' do 16 | expect(Crawler.service_id).to be(Crawler.service_id) 17 | end 18 | 19 | it 'should be process-scoped (not thread-local)' do 20 | id1 = Crawler.service_id 21 | 22 | t = Thread.new { Thread.current[:service_id] = Crawler.service_id }.join 23 | id2 = t[:service_id] 24 | expect(id1).to be(id2) 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/lib/environment_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | RSpec.describe 'Crawler Environment' do 10 | it 'should have CRAWLER_ENV defined' do 11 | expect(defined?(CRAWLER_ENV)).to eq('constant') 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /spec/support/cli_helpers.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. 
and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'stringio' 10 | 11 | module RSpec 12 | module Support 13 | module Helpers 14 | def capture_output 15 | output = StringIO.new 16 | original_stdout = $stdout 17 | $stdout = output 18 | yield 19 | output.string 20 | rescue SystemExit 21 | output.string 22 | ensure 23 | $stdout = original_stdout 24 | end 25 | 26 | def capture_error 27 | error = StringIO.new 28 | original_stderr = $stderr 29 | $stderr = error 30 | yield 31 | error.string 32 | rescue SystemExit 33 | error.string 34 | ensure 35 | $stderr = original_stderr 36 | end 37 | end 38 | end 39 | end 40 | 41 | RSpec.configure do |config| 42 | config.include(RSpec::Support::Helpers) 43 | end 44 | -------------------------------------------------------------------------------- /spec/support/faux/results_collection.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'concurrent' 10 | 11 | # A simple wrapper class for a collection of crawl results gathered by the mock crawler sink 12 | class ResultsCollection 13 | attr_accessor :crawl_config, :crawl, :collection 14 | 15 | delegate :outcome, :outcome_message, to: :crawl 16 | 17 | def initialize 18 | @collection = Concurrent::Array.new 19 | end 20 | 21 | # Do not allow the collection to be duplicated when passed through config validation, etc 22 | # This is needed so that we could pass a collection as a config parameter to a Crawler instance 23 | # in tests and get it propagated to the sink itself and back. 24 | def dup 25 | self 26 | end 27 | 28 | def method_missing(meth, *args, &block) 29 | @collection.send(meth, *args, &block) 30 | end 31 | 32 | def respond_to_missing?(method_name, include_private = false) 33 | @collection.respond_to?(method_name, include_private) || super 34 | end 35 | end 36 | -------------------------------------------------------------------------------- /spec/support/fixtures.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 
5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | FIXTURES_HOME = File.join(__dir__, '..', 'fixtures') 10 | 11 | def fixture_file(*file_path) 12 | File.join(FIXTURES_HOME, *file_path) 13 | end 14 | 15 | def read_fixture(*file_path) 16 | File.read(fixture_file(*file_path)) 17 | end 18 | 19 | def fixture_xml(*file_path) 20 | file_name = file_path.pop 21 | file_name = "#{file_name}.xml" 22 | read_fixture(*file_path, file_name) 23 | end 24 | 25 | def fixture_xml_gz(*file_path) 26 | file_name = file_path.pop 27 | file_name = "#{file_name}.xml.gz" 28 | read_fixture(*file_path, file_name) 29 | end 30 | -------------------------------------------------------------------------------- /spec/support/mock_response.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the Elastic License 2.0; 4 | # you may not use this file except in compliance with the Elastic License 2.0. 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'ostruct' 10 | 11 | class MockResponse < OpenStruct # rubocop:disable Style/OpenStructUse 12 | def equal_for_specified_keys?(response) 13 | to_h.all? do |key, val| 14 | val.to_s == response.send(key).to_s 15 | end 16 | end 17 | end 18 | 19 | def mock_response(args) 20 | MockResponse.new(args) 21 | end 22 | -------------------------------------------------------------------------------- /vendor/faux/.gitignore: -------------------------------------------------------------------------------- 1 | Gemfile.lock 2 | 3 | *.gem 4 | .bundle 5 | pkg/* 6 | .DS_Store 7 | .rvmrc 8 | .ruby-version 9 | .ruby-gemset 10 | -------------------------------------------------------------------------------- /vendor/faux/Gemfile: -------------------------------------------------------------------------------- 1 | # A sample Gemfile 2 | source "https://rubygems.org" 3 | 4 | gemspec 5 | 6 | group 'test' do 7 | gem 'pry' 8 | gem 'rack-test' 9 | gem 'rspec' 10 | gem 'awesome_print' 11 | end 12 | -------------------------------------------------------------------------------- /vendor/faux/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2024 Elasticsearch B.V. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /vendor/faux/README.md: -------------------------------------------------------------------------------- 1 | # Faux 2 | 3 | Faux is little Rack-based DSL for generating websites. Here's a simple example: 4 | 5 | ``` ruby 6 | class SimpleSite < Faux::Base 7 | page '/foo' do 8 | status 200 9 | link_to '/foobar' 10 | end 11 | 12 | page '/bar' do 13 | status 200 14 | link_to '/bang' 15 | link_to '/baz' 16 | end 17 | 18 | sitemap '/sitemap.xml' do 19 | link_to 'http://localhost:9393/foo' 20 | link_to '/bar' 21 | end 22 | 23 | # Adds a /robots.txt file with the specified rules. 24 | robots do 25 | user_agent '*' 26 | disallow '/foo' 27 | sitemap 'http://localhost:9393/sitemap.xml' 28 | end 29 | end 30 | ``` 31 | 32 | To boot the example site locally: 33 | ``` shell 34 | $ bundle exec rackup 35 | ``` 36 | 37 | The site will be running at `localhost:9393` 38 | 39 | ### Request Counter 40 | 41 | After booting an app, visit `/status` for a JSON report of which URLs have been visited and how many times they've been visited while the app has been running. It'll look like this: 42 | 43 | ``` json 44 | { 45 | "/bar": 7, 46 | "/foo": 5 47 | } 48 | ``` 49 | -------------------------------------------------------------------------------- /vendor/faux/Rakefile: -------------------------------------------------------------------------------- 1 | require "bundler/gem_tasks" 2 | 3 | begin 4 | require 'rspec/core/rake_task' 5 | RSpec::Core::RakeTask.new(:spec) 6 | rescue LoadError 7 | end 8 | 9 | task :default => :spec 10 | -------------------------------------------------------------------------------- /vendor/faux/faux.gemspec: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | lib = File.expand_path('../lib', __FILE__) 3 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 4 | 5 | require 'faux/version' 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = "faux" 9 | spec.version = Faux::VERSION 10 | spec.authors = ["Elastic Enterprise Search Team"] 11 | spec.email = ["enterprise-search@elastic.co"] 12 | spec.description = "Artisan faux web pages, by Wes Andreson" 13 | spec.summary = "Faux is little Rack-based DSL for generating websites" 14 | spec.homepage = "https://swiftype.com" 15 | spec.license = "MIT" 16 | 17 | spec.files = Dir.glob("{lib,sites}/**/*", File::FNM_DOTMATCH).reject {|f| File.directory?(f) } 18 | spec.test_files = spec.files.grep(%r{^(test|spec|features)/}) 19 | spec.require_paths = ["lib"] 20 | 21 | if spec.respond_to?(:metadata) 22 | spec.metadata['allowed_push_host'] = 'https://artifactory.elastic.dev/artifactory/api/gems/swiftype-gems' 23 | else 24 | raise "RubyGems 2.0 or newer is required to protect against public gem pushes." 25 | end 26 | 27 | spec.add_development_dependency 'rake' 28 | spec.add_development_dependency 'geminabox' 29 | 30 | spec.add_runtime_dependency 'activesupport' 31 | spec.add_runtime_dependency 'nokogiri' 32 | spec.add_runtime_dependency 'rack' 33 | spec.add_runtime_dependency 'rack-mount' 34 | end 35 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/atom_feed.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class AtomFeed < Base 10 | def call(env) 11 | @entries = [] 12 | super 13 | end 14 | 15 | def response_headers 16 | @headers.merge!({'Content-Type' => 'text/xml'}) 17 | super 18 | end 19 | 20 | def response_body 21 | builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') do |xml| 22 | xml.feed(:xmlns => "http://www.w3.org/2005/Atom") { 23 | xml.title 'Faux Feed' 24 | @entries.each do |tags| 25 | xml.entry { 26 | tags.each do |tag| 27 | if tag[:name] == 'content' # FIXME: Rewrite this as it makes me cry on the inside 28 | xml.send(tag[:name], {:type => 'html'}, tag[:text]) 29 | elsif tag[:text] # generated from method_missing 30 | xml.send(tag[:name], tag[:text]) 31 | else # generated from link_to 32 | xml.send(tag[:name], tag.reject{|k, _| k == :name}) 33 | end 34 | end 35 | } 36 | end 37 | } 38 | end 39 | 40 | builder.to_xml.split("\n") 41 | end 42 | 43 | def entry(&block) 44 | @tags = [] # Holds hashes with tags defined inside &block 45 | block.call 46 | @entries << @tags 47 | end 48 | 49 | def link_to(url, rel='self') 50 | @tags << {:name => :link, :href => absolute_url_for(url), :rel => rel} 51 | end 52 | 53 | def html_content(html) 54 | @tags << {:name => 'content', :text => html} 55 | end 56 | 57 | def method_missing(method, *args, &block) 58 | @tags << {:name => method, :text => args[0]} 59 | end 60 | end 61 | end 62 | end 63 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/base.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Base 10 | include Faux::Helpers::Url 11 | 12 | attr_reader :content_block, :env, :options 13 | 14 | def initialize(options, &content_block) 15 | @content_block = content_block 16 | @options = options 17 | @status = 200 18 | end 19 | 20 | def call(env) 21 | @env = env 22 | @headers = {} 23 | 24 | instance_exec(&content_block) if content_block 25 | [response_status, response_headers, response_body] 26 | end 27 | 28 | # Get methods (used in `call`) 29 | def response_status 30 | @status 31 | end 32 | 33 | def response_headers 34 | unless @headers.keys.find { |k| k.downcase == 'content-type' } 35 | @headers['Content-Type'] = 'text/html' 36 | end 37 | @headers 38 | end 39 | 40 | # Set methods (used by DSL) 41 | def status(code) 42 | @status = code.to_i 43 | end 44 | 45 | def headers(headers_hash) 46 | @headers.merge!(headers_hash || {}) 47 | end 48 | 49 | def response_body 50 | raise 'Must be defined in a subclass' 51 | end 52 | 53 | def redirect(location, options = {}) 54 | @status = options[:permanent] ? 301 : 302 55 | @headers['Location'] = options[:relative] ? location : absolute_url_for(location) 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/fixture.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Fixture < Base 10 | 11 | attr_reader :fixture_content 12 | 13 | def call(env) 14 | @fixture_content = nil 15 | super 16 | end 17 | 18 | def response_body 19 | [ @fixture_content ] 20 | end 21 | 22 | def path(fixture_file_path) 23 | begin 24 | full_path = File.join(Dir.pwd, fixture_file_path) 25 | file = File.open(full_path) 26 | rescue => e 27 | message = <<-EOL 28 | Please provide correct path to fixture: 29 | 30 | example: `path: 'fixture/simple.html'` 31 | 32 | error: #{e} #{e.message} 33 | backtrace: #{e.backtrace} 34 | EOL 35 | raise ArgumentError, message 36 | end 37 | @fixture_content = file.read 38 | end 39 | end 40 | end 41 | end 42 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/page.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Page < Base 10 | 11 | attr_reader :canonical, :links, :meta_robots_rules, :base_url 12 | 13 | def call(env) 14 | @body_content = [] 15 | @head_content = [] 16 | @head_html = '' 17 | @body_html = '' 18 | super 19 | end 20 | 21 | def response_body 22 | [ '%s%s' % [ @head_html, @body_html ] ] 23 | end 24 | 25 | def head(&block) 26 | @head_html = begin 27 | block.call 28 | '%s' % @head_content.join("\n") 29 | end 30 | end 31 | 32 | def body(&block) 33 | @body_html = begin 34 | block.call 35 | '%s' % @body_content.join("\n") 36 | end 37 | end 38 | 39 | def text(&block) 40 | @body_content << block.call.to_s 41 | end 42 | 43 | private 44 | 45 | def canonical_to(url_or_path) 46 | @head_content << %Q() 51 | end 52 | 53 | def atom_to(path) 54 | @head_content << %Q() 55 | end 56 | 57 | def base(url_or_path) 58 | @head_content << %Q() 59 | end 60 | 61 | def link_to(url_or_path, options = {}) 62 | relative = options.delete(:relative) 63 | url_or_path = absolute_url_for(url_or_path) if relative == false 64 | 65 | attributes = [''] + options.map { |k,v| "#{k}='#{v}'"} 66 | @body_content << %Q(#{url_or_path}) 67 | end 68 | end 69 | end 70 | end 71 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/path_with_content_length.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'active_support/core_ext/numeric' 8 | 9 | module Faux 10 | module Element 11 | 12 | # This element is used primarily in testing against pages of given size. 13 | # Do NOT add functionality to this file to cater to other cases, use 14 | # `page` element instead. 
15 | class PathWithContentLength < Base 16 | attr_reader :size 17 | 18 | def call(env) 19 | @size = options[:size] 20 | super 21 | end 22 | 23 | def response_body 24 | content = 'a' * (size || 0) 25 | [content] 26 | end 27 | end 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/robots.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Element 9 | class Robots < Base 10 | 11 | def call(env) 12 | @rules = [] 13 | super 14 | end 15 | 16 | def sitemap(url_or_path, options = {}) 17 | if options[:relative] == true 18 | url_or_path = absolute_url_for(url_or_path) 19 | end 20 | @rules << "Sitemap: #{url_or_path}\n" 21 | end 22 | 23 | def method_missing(name, *args, &block) 24 | @rules << "#{normalize_name(name)}: #{args.first}\n" 25 | end 26 | 27 | def response_body 28 | @rules 29 | end 30 | 31 | def response_headers 32 | @headers.merge!({'Content-Type' => 'text/plain'}) 33 | super 34 | end 35 | 36 | private 37 | 38 | def normalize_name(name) 39 | name.to_s.gsub('_', '-').capitalize 40 | end 41 | end 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/element/sitemap.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'stringio' 8 | require 'nokogiri' 9 | require 'zlib' 10 | 11 | module Faux 12 | module Element 13 | class Sitemap < Base 14 | def call(env) 15 | @links = [] 16 | super 17 | end 18 | 19 | def response_headers 20 | @headers.merge!({'Content-Type' => 'application/xml'}) 21 | super 22 | end 23 | 24 | def link_to(url_or_path, options = {}) 25 | if options[:relative] 26 | @links << url_or_path 27 | else 28 | @links << absolute_url_for(url_or_path) 29 | end 30 | end 31 | 32 | def response_body 33 | builder = Nokogiri::XML::Builder.new(:encoding => 'UTF-8') do |xml| 34 | if options[:index] 35 | xml.sitemapindex(:xmlns => "http://www.sitemaps.org/schemas/sitemap/0.9") { 36 | @links.each do |link| 37 | xml.sitemap { 38 | xml.loc "#{link}" 39 | } 40 | end 41 | } 42 | else 43 | xml.urlset(:xmlns => "http://www.sitemaps.org/schemas/sitemap/0.9") { 44 | @links.each do |link| 45 | xml.url { 46 | xml.loc "#{link}" 47 | } 48 | end 49 | } 50 | end 51 | end 52 | 53 | sitemap_txt = builder.to_xml 54 | 55 | if options[:gzip] 56 | [gzip(sitemap_txt)] 57 | else 58 | sitemap_txt.split("\n") 59 | end 60 | end 61 | 62 | def gzip(contents) 63 | file = StringIO.new 64 | file.set_encoding("BINARY") 65 | 66 | writer = Zlib::GzipWriter.new(file) 67 | writer.write(contents) 68 | writer.close 69 | 70 | file.string 71 | end 72 | end 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/helpers/url.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Helpers 9 | module Url 10 | def absolute_url_for(path) 11 | "#{env['rack.url_scheme']}://#{env['HTTP_HOST']}#{path}" 12 | end 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/middleware/reporter.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | module Faux 8 | module Middleware 9 | 10 | # Rack middleware to intercept requests and increments a counter based on 11 | # the request path. If the path is '/status', we'll return a JSON report 12 | # of the request counts since the application has been running. 13 | class Reporter 14 | 15 | def self.counter 16 | @counter ||= Hash.new(0) 17 | end 18 | 19 | def self.reset! 20 | @counter = Hash.new(0) 21 | end 22 | 23 | def initialize(app) 24 | @app = app 25 | end 26 | 27 | def call(env) 28 | if env['PATH_INFO'] == '/status' 29 | [200, { 'Content-Type' => 'application/json' }, [ Reporter.counter.to_json ]] 30 | else 31 | Reporter.counter[env['PATH_INFO']] += 1 32 | @app.call(env) 33 | end 34 | end 35 | 36 | end 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /vendor/faux/lib/faux/version.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | module Faux 7 | VERSION = '0.1.0' 8 | end 9 | -------------------------------------------------------------------------------- /vendor/faux/lib/site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | # frozen_string_literal: true 8 | 9 | require 'rack' 10 | require 'webrick' 11 | require 'webrick/https' 12 | 13 | module Faux 14 | # Class to manage creation and destruction of mounted Rack instances 15 | class Site 16 | attr_reader :site, :options, :server, :server_thread 17 | 18 | def initialize(site, options = {}) 19 | @site = site 20 | @options = options 21 | start if options.fetch(:start, true) 22 | end 23 | 24 | def start 25 | if options[:debug] 26 | puts "Faux: INFO: Starting Faux for #{site} (#{options.inspect})" 27 | end 28 | 29 | start_queue = Queue.new 30 | rack_opts = { 31 | :app => site, 32 | :Port => options[:port] || 9393, 33 | :server => :webrick, 34 | :StartCallback => proc { start_queue << :start } 35 | } 36 | 37 | if options[:ssl] 38 | key = OpenSSL::PKey::RSA.new(File.read(options.fetch(:ssl_key))) 39 | cert = OpenSSL::X509::Certificate.new(File.read(options.fetch(:ssl_certificate))) 40 | rack_opts.merge!( 41 | :SSLEnable => true, 42 | :SSLPrivateKey => key, 43 | :SSLCertificate => cert, 44 | :SSLCACertificateFile => options[:ssl_ca_certificate] 45 | ) 46 | end 47 | 48 | @server ||= Rack::Server.new(rack_opts) 49 | @server_thread = Thread.new { server.start } 50 | start_queue.pop 51 | end 52 | 53 | def stop 54 | # Stop Webrick 55 | server.server.shutdown 56 | 57 | # Make sure the thread has stopped or kill it within a second 58 | 10.times do 59 | break unless server_thread.alive? 60 | sleep(0.1) 61 | end 62 | server_thread.kill 63 | 64 | # Reset the state of the site 65 | @server_thread = nil 66 | @server = nil 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /vendor/faux/sites/fixture_site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class FixtureSite < Faux::Base 7 | fixture '/' do 8 | path 'spec/fixtures/simple.html' 9 | end 10 | 11 | fixture '/foo' do 12 | headers 'Content-Type' => 'application/xml' 13 | path 'spec/fixtures/atom-feed-example-com.xml' 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /vendor/faux/sites/robots_txt_respect_rules.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class RobotsTxtRespectRules < Faux::Base 7 | page '/' do 8 | body do 9 | link_to '/bar' 10 | link_to '/foo' 11 | end 12 | end 13 | 14 | page '/bar' 15 | page '/foo' 16 | 17 | robots do 18 | user_agent '*' 19 | disallow '/foo' 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /vendor/faux/sites/simple_site.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class SimpleSite < Faux::Base 7 | page '/' do 8 | head { atom_to '/feed' } 9 | body { link_to '/foo' } 10 | end 11 | 12 | page '/foo' do 13 | status 200 14 | body { link_to '/foobar' } 15 | end 16 | 17 | path_with_content_length '/large_page', 10.megabytes 18 | 19 | atom_feed '/feed' do 20 | entry do 21 | title 'Another Post' 22 | link_to '/foo' 23 | link_to '/wow' 24 | end 25 | 26 | entry do 27 | link_to '/bar' 28 | end 29 | end 30 | 31 | page '/bar' do 32 | status 200 33 | body do 34 | link_to '/bang', :relative => false 35 | link_to '/baz' 36 | end 37 | end 38 | 39 | page '/redirect' do 40 | redirect '/foo' 41 | end 42 | 43 | sitemap '/sitemap.xml' do 44 | link_to '/foo' 45 | link_to '/bar' 46 | end 47 | 48 | robots do 49 | user_agent '*' 50 | disallow '/foo' 51 | 52 | # Sitemap urls should be absolute. Pass :relative => true 53 | # so the url will be converted from relative to absolute. 54 | sitemap '/sitemap.xml', :relative => true 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /vendor/faux/sites/sitemap_pointing_to_sitemaps.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | class SitemapPointingToSitemaps < Faux::Base 7 | robots do 8 | user_agent '*' 9 | 10 | sitemap '/sitemap.xml' 11 | end 12 | 13 | sitemap_index '/sitemap.xml' do 14 | link_to '/sitemap_1.xml' 15 | link_to '/sitemap_2.xml' 16 | end 17 | 18 | sitemap '/sitemap_1.xml' do 19 | link_to '/foo' 20 | end 21 | 22 | sitemap '/sitemap_2.xml' do 23 | link_to '/bar' 24 | end 25 | 26 | page '/foo' 27 | page '/bar' 28 | end 29 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/base_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Base do 10 | before :each do 11 | allow_any_instance_of(Faux::Element::Base).to receive(:response_body).and_return('body') 12 | end 13 | 14 | it 'should set defaults' do 15 | base = Faux::Element::Base.new({}) 16 | expect(base.call(double)).to eq [200, {'Content-Type' => 'text/html'}, 'body'] 17 | end 18 | 19 | it 'sets status' do 20 | content = Proc.new { status 400 } 21 | base = Faux::Element::Base.new({}, &content) 22 | expect(base.call(double)).to eq [400, {'Content-Type' => 'text/html'}, 'body'] 23 | end 24 | 25 | it 'sets headers' do 26 | content = Proc.new { headers 'Content-Type' => 'text/plain' } 27 | base = Faux::Element::Base.new({}, &content) 28 | expect(base.call(double)).to eq [200, {'Content-Type' => 'text/plain'}, 'body'] 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/fixture_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Fixture do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'accepts path as argument' do 17 | app.fixture '/foo' do 18 | path 'spec/fixtures/simple.html' 19 | end 20 | 21 | get '/foo' 22 | expect(last_response.body).to match 'example' 23 | expect(last_response.body).to match 'another link' 24 | end 25 | 26 | it 'allows headers and status to be specified' do 27 | app.fixture '/foo' do 28 | status 404 29 | headers "Content-Type" => 'text/plain' 30 | path 'spec/fixtures/simple.html' 31 | end 32 | 33 | get '/foo' 34 | expect(last_response.body).to match 'example' 35 | expect(last_response.header['Content-Type']).to match 'text/plain' 36 | expect(last_response.status).to eq(404) 37 | end 38 | 39 | it 'works with xml files' do 40 | app.fixture '/foo' do 41 | headers 'Content-Type' => 'application/xml' 42 | path 'spec/fixtures/atom-feed-example-com.xml' 43 | end 44 | 45 | get '/foo' 46 | expect(last_response.body).to match '' 47 | end 48 | 49 | it 'raises error if path is wrong' do 50 | app.fixture '/foo' do 51 | path 'doesnt-exist' 52 | end 53 | 54 | expect do 55 | get '/foo' 56 | end.to raise_error ArgumentError 57 | end 58 | end 59 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/path_with_content_length_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::PathWithContentLength do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should return valid page' do 17 | site.path_with_content_length '/large_page' 18 | 19 | get '/large_page' 20 | expect(last_response.content_type).to eq 'text/html' 21 | end 22 | 23 | it 'should return page of specified size' do 24 | site.path_with_content_length '/large_page', 10.megabytes 25 | 26 | get '/large_page' 27 | expect(last_response.content_length).to eq 10.megabytes 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/robots_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Robots do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should be accessible on \robots.txt' do 17 | site.robots 18 | 19 | get '/robots.txt' 20 | expect(last_response).to_not be_empty 21 | end 22 | 23 | it 'should render names' do 24 | site.robots do 25 | disallow '/blocked' 26 | end 27 | 28 | get '/robots.txt' 29 | expect(last_response.body).to eq "Disallow: /blocked\n" 30 | end 31 | 32 | it 'should render names with dashes / underscores' do 33 | site.robots do 34 | user_agent '*' 35 | end 36 | 37 | get '/robots.txt' 38 | expect(last_response.body).to eq "User-agent: *\n" 39 | end 40 | 41 | it 'combines multiple declarations on one file' do 42 | site.robots do 43 | disallow '/blocked' 44 | sitemap 'http://example.com/sitemap.xml' 45 | end 46 | 47 | get '/robots.txt' 48 | expect(last_response.body).to eq "Disallow: /blocked\nSitemap: http://example.com/sitemap.xml\n" 49 | end 50 | 51 | it 'returns correct content-type' do 52 | site.robots do 53 | disallow '/blocked' 54 | sitemap 'http://example.com/sitemap.xml' 55 | end 56 | 57 | get '/robots.txt' 58 | expect(last_response.content_type).to eq "text/plain" 59 | end 60 | 61 | it 'supports converting relative sitemap paths to absolute paths' do 62 | site.robots do 63 | sitemap '/sitemap.xml', :relative => true 64 | end 65 | 66 | get '/robots.txt' 67 | expect(last_response.body).to match 'http://example.org/sitemap.xml' 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/element/sitemap_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Element::Sitemap do 10 | let(:site) { Class.new(Faux::Base) } 11 | 12 | def app 13 | site 14 | end 15 | 16 | it 'should return 200 by default for sitemap' do 17 | site.sitemap '/sitemap.xml' 18 | 19 | get '/sitemap.xml' 20 | expect(last_response.status).to eq 200 21 | end 22 | 23 | it 'should return xml' do 24 | site.sitemap '/sitemap.xml' 25 | 26 | get '/sitemap.xml' 27 | expect(last_response.content_type).to eq 'application/xml' 28 | end 29 | 30 | context 'sitemap of URLs' do 31 | it 'includes links into generated sitemap' do 32 | site.sitemap '/sitemap.xml' do 33 | link_to '/anothersite' 34 | end 35 | 36 | get '/sitemap.xml' 37 | expect(last_response.body).to match 'http://example.org/anothersite' 38 | end 39 | 40 | it 'supports creating relative links' do 41 | site.sitemap '/sitemap.xml' do 42 | link_to '/anothersite', :relative => true 43 | end 44 | 45 | get '/sitemap.xml' 46 | expect(last_response.body).to match '/anothersite' 47 | end 48 | end 49 | 50 | context 'sitemap index' do 51 | it 'defines an index' do 52 | site.sitemap_index '/sitemap.xml' 53 | 54 | get '/sitemap.xml' 55 | expect(last_response.body).to match 'sitemapindex' 56 | end 57 | 58 | it 'supports creating links' do 59 | site.sitemap_index '/sitemap.xml' do 60 | link_to '/sitemap_2.xml' 61 | end 62 | 63 | get '/sitemap.xml' 64 | expect(last_response.body).to match 'http://example.org/sitemap_2.xml' 65 | end 66 | end 67 | end 68 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/middleware/reporter_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Middleware::Reporter do 10 | let(:site) do 11 | build_rack_test_session(:status) 12 | Class.new(Faux::Base) 13 | end 14 | 15 | def app 16 | site 17 | end 18 | 19 | it 'reports a count of the routes that have been visited' do 20 | pending "Intermittent error comes up (probably due to status not being cleared between test runs)" 21 | 22 | site.page '/foo' 23 | 24 | get '/foo' 25 | get '/foo' 26 | get '/status' 27 | 28 | expect(last_response.status).to eq(200) 29 | expect(JSON.parse(last_response.body)).to eq('/foo' => 2) 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux/site_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. 
Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Site do 10 | let(:site) { Faux.site } 11 | 12 | it 'starts a Webrick handler for Rack' do 13 | server = double("server") 14 | expect(::Rack::Server).to receive(:new).with(:Port => 9393, :app => site, :server => :webrick).and_return(server) 15 | expect(server).to receive(:start) 16 | 17 | faux = Faux::Site.new(site, {}) 18 | sleep(1) 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /vendor/faux/spec/faux_spec.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'spec_helper' 8 | 9 | describe Faux::Base do 10 | 11 | let(:site) { Class.new(Faux::Base) } 12 | 13 | def app 14 | site 15 | end 16 | 17 | it 'adds a /status route by default' do 18 | get '/status' 19 | expect(last_response.status).to eq(200) 20 | end 21 | 22 | end 23 | -------------------------------------------------------------------------------- /vendor/faux/spec/fixtures/atom-feed-example-com.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Example.com 5 | 6 | 7 | 2012-10-11T12:46:09-07:00 8 | http://www.example.com/ 9 | 10 | Example.com 11 | 12 | 13 | 14 | Example.com Stuff thing Blah 15 | 16 | 2012-10-11T00:00:00-07:00 17 | 2012-10-11T00:00:00-07:00 18 | http://www.example.com/atom-feed-page-1 19 | blah blah blah 20 | 21 | Example.com Author 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /vendor/faux/spec/fixtures/simple.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | example 4 | 5 | 6 |

example 7 | 8 | link 9 | another link 10 |
11 | 12 | 13 | -------------------------------------------------------------------------------- /vendor/faux/spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one 3 | # or more contributor license agreements. Licensed under the MIT License; 4 | # see LICENSE file in the project root for details 5 | # 6 | 7 | require 'bundler/setup' 8 | require 'rspec' 9 | require 'rack/test' 10 | require 'pry' 11 | require 'awesome_print' 12 | 13 | require 'faux' 14 | 15 | RSpec.configure do |config| 16 | config.include Rack::Test::Methods 17 | config.color = true 18 | config.order = 'random' 19 | end 20 | -------------------------------------------------------------------------------- /vendor/jars/com/github/crawler-commons/crawler-commons/1.2/crawler-commons-1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/com/github/crawler-commons/crawler-commons/1.2/crawler-commons-1.2.jar -------------------------------------------------------------------------------- /vendor/jars/commons-codec/commons-codec/1.15/commons-codec-1.15.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/commons-codec/commons-codec/1.15/commons-codec-1.15.jar -------------------------------------------------------------------------------- /vendor/jars/commons-io/commons-io/2.16.1/commons-io-2.16.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/commons-io/commons-io/2.16.1/commons-io-2.16.1.jar -------------------------------------------------------------------------------- /vendor/jars/isorelax/isorelax/20030108/isorelax-20030108.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/isorelax/isorelax/20030108/isorelax-20030108.jar -------------------------------------------------------------------------------- /vendor/jars/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar -------------------------------------------------------------------------------- /vendor/jars/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar -------------------------------------------------------------------------------- /vendor/jars/nu/validator/jing/20200702VNU/jing-20200702VNU.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/nu/validator/jing/20200702VNU/jing-20200702VNU.jar 
-------------------------------------------------------------------------------- /vendor/jars/org/apache/commons/commons-compress/1.27.1/commons-compress-1.27.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/commons/commons-compress/1.27.1/commons-compress-1.27.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/commons/commons-lang3/3.16.0/commons-lang3-3.16.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/commons/commons-lang3/3.16.0/commons-lang3-3.16.0.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/client5/httpclient5/5.1/httpclient5-5.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/client5/httpclient5/5.1/httpclient5-5.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/core5/httpcore5-h2/5.1.1/httpcore5-h2-5.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/core5/httpcore5-h2/5.1.1/httpcore5-h2-5.1.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/apache/httpcomponents/core5/httpcore5/5.1.1/httpcore5-5.1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/apache/httpcomponents/core5/httpcore5/5.1.1/httpcore5-5.1.1.jar -------------------------------------------------------------------------------- /vendor/jars/org/brotli/dec/0.1.2/dec-0.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/brotli/dec/0.1.2/dec-0.1.2.jar -------------------------------------------------------------------------------- /vendor/jars/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar -------------------------------------------------------------------------------- /vendor/jars/org/slf4j/slf4j-api/1.7.7/slf4j-api-1.7.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/slf4j/slf4j-api/1.7.7/slf4j-api-1.7.7.jar -------------------------------------------------------------------------------- /vendor/jars/org/slf4j/slf4j-nop/1.7.26/slf4j-nop-1.7.26.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/org/slf4j/slf4j-nop/1.7.26/slf4j-nop-1.7.26.jar -------------------------------------------------------------------------------- /vendor/jars/xalan/serializer/2.7.3/serializer-2.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xalan/serializer/2.7.3/serializer-2.7.3.jar -------------------------------------------------------------------------------- /vendor/jars/xalan/xalan/2.7.3/xalan-2.7.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xalan/xalan/2.7.3/xalan-2.7.3.jar -------------------------------------------------------------------------------- /vendor/jars/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar -------------------------------------------------------------------------------- /vendor/jars/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/elastic/crawler/0a5ab5b74eae12f96b312d7cea39103a64b28700/vendor/jars/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar --------------------------------------------------------------------------------