├── .gitignore ├── README.md └── cloud-crawler ├── .project ├── CHANGELOG.rdoc ├── Gemfile ├── Gemfile.lock ├── INSTALL.aws.rdoc ├── INSTALL.local.rdoc ├── LICENSE ├── README.rdoc ├── Rakefile ├── VERSION ├── bin ├── restart_workers.rb ├── run_worker.rb ├── standalone_crawl.rb ├── start_batch_crawl.rb ├── start_crawl.rb ├── stop_worker.rb └── test_logging.rb ├── cloud-crawler.gemspec ├── config ├── master_schedule.rb └── worker_schedule.rb ├── examples ├── count_listings.rb ├── crossfit_crawl.rb ├── find_404s.rb ├── link_selector.rb ├── sugary_word_count.rb └── word_count.rb ├── lib ├── cloud-crawler.rb └── cloud-crawler │ ├── batch_api_job.rb │ ├── batch_crawl_job.rb │ ├── batch_curl_job.rb │ ├── batch_job.rb │ ├── browser.rb │ ├── cookie_store.rb │ ├── crawl_job.rb │ ├── driver.rb │ ├── dsl_common.rb │ ├── dsl_core.rb │ ├── dsl_front_end.rb │ ├── exceptions.rb │ ├── http.rb │ ├── http_party.rb │ ├── logger.rb │ ├── mozilla_agents.rb │ ├── page.rb │ ├── redis_doc_store.rb │ ├── redis_dsl_core.rb │ ├── redis_page_store.rb │ ├── redis_url_bloomfilter.rb │ ├── test_worker.rb │ └── worker.rb ├── logs └── master.log ├── spec ├── batch_crawl_job_spec.rb ├── batch_curl_job_spec.rb ├── batch_job_spec.rb ├── child_spawning_batch_job.rb ├── child_spawning_batch_job_spec.rb ├── cloud_crawler_spec.rb ├── cookie_store_spec.rb ├── crawl_job_spec.rb ├── driver_spec.rb ├── dsl_common_spec.rb ├── fakeweb_helper.rb ├── headless_http_spec.rb ├── headless_test_endpoint.rb ├── http_spec.rb ├── make_test_data.rb ├── page_spec.rb ├── redis_page_store_spec.rb ├── redis_url_bloomfilter_spec.rb ├── spec_helper.rb ├── test_batch_curl_job.rb ├── test_batch_job.rb └── test_crawl_job.rb └── test ├── crawl_by_selector.rb ├── test_bloom_filter.rb ├── test_crawl.rb ├── testbf.rb └── testbf_2.rb /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/README.md -------------------------------------------------------------------------------- /cloud-crawler/.project: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/.project -------------------------------------------------------------------------------- /cloud-crawler/CHANGELOG.rdoc: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cloud-crawler/Gemfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/Gemfile -------------------------------------------------------------------------------- /cloud-crawler/Gemfile.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/Gemfile.lock -------------------------------------------------------------------------------- /cloud-crawler/INSTALL.aws.rdoc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/INSTALL.aws.rdoc -------------------------------------------------------------------------------- /cloud-crawler/INSTALL.local.rdoc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/INSTALL.local.rdoc -------------------------------------------------------------------------------- /cloud-crawler/LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/LICENSE -------------------------------------------------------------------------------- /cloud-crawler/README.rdoc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/README.rdoc -------------------------------------------------------------------------------- /cloud-crawler/Rakefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/Rakefile -------------------------------------------------------------------------------- /cloud-crawler/VERSION: -------------------------------------------------------------------------------- 1 | 0.2 -------------------------------------------------------------------------------- /cloud-crawler/bin/restart_workers.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/restart_workers.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/run_worker.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/run_worker.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/standalone_crawl.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/standalone_crawl.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/start_batch_crawl.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/start_batch_crawl.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/start_crawl.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/start_crawl.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/stop_worker.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/stop_worker.rb -------------------------------------------------------------------------------- /cloud-crawler/bin/test_logging.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/bin/test_logging.rb -------------------------------------------------------------------------------- /cloud-crawler/cloud-crawler.gemspec: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/cloud-crawler.gemspec -------------------------------------------------------------------------------- /cloud-crawler/config/master_schedule.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/config/master_schedule.rb -------------------------------------------------------------------------------- /cloud-crawler/config/worker_schedule.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/config/worker_schedule.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/count_listings.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/count_listings.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/crossfit_crawl.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/crossfit_crawl.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/find_404s.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/find_404s.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/link_selector.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/link_selector.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/sugary_word_count.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/sugary_word_count.rb -------------------------------------------------------------------------------- /cloud-crawler/examples/word_count.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/examples/word_count.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/batch_api_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/batch_api_job.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/batch_crawl_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/batch_crawl_job.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/batch_curl_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/batch_curl_job.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/batch_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/batch_job.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/browser.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/browser.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/cookie_store.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/cookie_store.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/crawl_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/crawl_job.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/driver.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/driver.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/dsl_common.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/dsl_common.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/dsl_core.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/dsl_core.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/dsl_front_end.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/dsl_front_end.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/exceptions.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/exceptions.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/http.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/http.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/http_party.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/http_party.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/logger.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/logger.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/mozilla_agents.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/mozilla_agents.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/page.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/page.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/redis_doc_store.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/redis_doc_store.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/redis_dsl_core.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/redis_dsl_core.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/redis_page_store.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/redis_page_store.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/redis_url_bloomfilter.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/redis_url_bloomfilter.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/test_worker.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/test_worker.rb -------------------------------------------------------------------------------- /cloud-crawler/lib/cloud-crawler/worker.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/lib/cloud-crawler/worker.rb -------------------------------------------------------------------------------- /cloud-crawler/logs/master.log: -------------------------------------------------------------------------------- 1 | # master.log -------------------------------------------------------------------------------- /cloud-crawler/spec/batch_crawl_job_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/batch_crawl_job_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/batch_curl_job_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/batch_curl_job_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/batch_job_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/batch_job_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/child_spawning_batch_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/child_spawning_batch_job.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/child_spawning_batch_job_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/child_spawning_batch_job_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/cloud_crawler_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/cloud_crawler_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/cookie_store_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/cookie_store_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/crawl_job_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/crawl_job_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/driver_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/driver_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/dsl_common_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/dsl_common_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/fakeweb_helper.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/fakeweb_helper.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/headless_http_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/headless_http_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/headless_test_endpoint.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/headless_test_endpoint.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/http_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/http_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/make_test_data.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/make_test_data.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/page_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/page_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/redis_page_store_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/redis_page_store_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/redis_url_bloomfilter_spec.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/redis_url_bloomfilter_spec.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/spec_helper.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/spec_helper.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/test_batch_curl_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/test_batch_curl_job.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/test_batch_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/test_batch_job.rb -------------------------------------------------------------------------------- /cloud-crawler/spec/test_crawl_job.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/spec/test_crawl_job.rb -------------------------------------------------------------------------------- /cloud-crawler/test/crawl_by_selector.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/test/crawl_by_selector.rb -------------------------------------------------------------------------------- /cloud-crawler/test/test_bloom_filter.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/test/test_bloom_filter.rb -------------------------------------------------------------------------------- /cloud-crawler/test/test_crawl.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/test/test_crawl.rb -------------------------------------------------------------------------------- /cloud-crawler/test/testbf.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/test/testbf.rb -------------------------------------------------------------------------------- /cloud-crawler/test/testbf_2.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CalculatedContent/cloud-crawler/HEAD/cloud-crawler/test/testbf_2.rb --------------------------------------------------------------------------------