├── log
│   └── .keep
├── app
│   ├── sites
│   │   └── .keep
│   ├── workers
│   │   ├── worker.rb
│   │   ├── scheduler
│   │   │   ├── base.rb
│   │   │   ├── reindexer.rb
│   │   │   └── clearer.rb
│   │   ├── crawler
│   │   │   ├── slider.rb
│   │   │   ├── socializer.rb
│   │   │   ├── screener.rb
│   │   │   ├── stretcher.rb
│   │   │   ├── scrimper.rb
│   │   │   ├── base.rb
│   │   │   ├── sampler.rb
│   │   │   ├── spider.rb
│   │   │   ├── scraper.rb
│   │   │   └── sitemapper.rb
│   │   ├── syncer
│   │   │   ├── reslider.rb
│   │   │   ├── resocializer.rb
│   │   │   ├── respider.rb
│   │   │   ├── rescrimper.rb
│   │   │   ├── resampler.rb
│   │   │   ├── base.rb
│   │   │   ├── reindexer.rb
│   │   │   ├── mover.rb
│   │   │   ├── rescreener.rb
│   │   │   └── refixer.rb
│   │   ├── recorder
│   │   │   ├── collector.rb
│   │   │   ├── base.rb
│   │   │   ├── uploader.rb
│   │   │   └── fixer.rb
│   │   └── mapper
│   │       ├── id_availability.rb
│   │       ├── base.rb
│   │       ├── cleaner.rb
│   │       ├── url_availability.rb
│   │       └── indexer.rb
│   ├── models
│   │   ├── page
│   │   │   ├── base.rb
│   │   │   ├── url.rb
│   │   │   └── parse.rb
│   │   ├── persist.rb
│   │   ├── record
│   │   │   ├── screenshot.rb
│   │   │   ├── export.rb
│   │   │   ├── addons.rb
│   │   │   ├── base.rb
│   │   │   ├── search.rb
│   │   │   ├── trends.rb
│   │   │   ├── match.rb
│   │   │   └── upload.rb
│   │   ├── crawl
│   │   │   ├── sitemap.rb
│   │   │   ├── social.rb
│   │   │   ├── capture.rb
│   │   │   ├── base.rb
│   │   │   └── google.rb
│   │   ├── flattener.rb
│   │   └── cloud.rb
│   ├── controllers
│   │   ├── application_controller.rb
│   │   └── v1
│   │       ├── batch_controller.rb
│   │       ├── trends_controller.rb
│   │       ├── match_controller.rb
│   │       ├── search_controller.rb
│   │       ├── status_controller.rb
│   │       └── record_controller.rb
│   └── helpers
│       ├── counts_helper.rb
│       ├── page_helper.rb
│       └── schema_org_helper.rb
├── test
│   ├── helpers
│   │   └── .keep
│   ├── mailers
│   │   └── .keep
│   ├── models
│   │   ├── .keep
│   │   └── url_test.rb
│   ├── controllers
│   │   └── .keep
│   ├── fixtures
│   │   └── .keep
│   ├── integration
│   │   └── .keep
│   └── test_helper.rb
├── .ruby-gemset
├── .ruby-version
├── roles
│   ├── chruby
│   │   ├── .gitignore
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   ├── chruby.fact
│   │   │   └── chruby.sh
│   │   ├── test.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── .travis.yml
│   │   ├── README.md
│   │   ├── LICENSE
│   │   └── tasks
│   │       └── main.yml
│   ├── elasticsearch
│   │   ├── ansible.cfg
│   │   ├── files
│   │   │   ├── scripts
│   │   │   │   └── calculate-score.groovy
│   │   │   └── templates
│   │   │       └── basic.json
│   │   ├── test
│   │   │   └── integration
│   │   │       ├── multi-1x
│   │   │       │   ├── multi.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── multi-2x
│   │   │       │   ├── multi.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── config-1x
│   │   │       │   ├── config.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── config-2x
│   │   │       │   ├── config.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── package-1x
│   │   │       │   ├── package.yaml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── package-2x
│   │   │       │   ├── package.yaml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── standard-1x
│   │   │       │   ├── standard.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── standard-2x
│   │   │       │   ├── standard.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── helpers
│   │   │       │   └── serverspec
│   │   │       │       ├── Gemfile
│   │   │       │       ├── spec_helper.rb
│   │   │       │       ├── standard_spec.rb
│   │   │       │       └── package_spec.rb
│   │   │       ├── standard.yml
│   │   │       ├── package.yml
│   │   │       ├── config.yml
│   │   │       └── multi.yml
│   │   ├── .gitignore
│   │   ├── tasks
│   │   │   ├── elasticsearch-Debian-version-lock.yml
│   │   │   ├── elasticsearch-service.yml
│   │   │   ├── elasticsearch-RedHat-version-lock.yml
│   │   │   ├── elasticsearch-version-lock.yml
│   │   │   ├── java.yml
│   │   │   ├── elasticsearch-optional-user.yml
│   │   │   ├── elasticsearch-scripts.yml
│   │   │   ├── main.yml
│   │   │   ├── elasticsearch-RedHat.yml
│   │   │   ├── elasticsearch-Debian.yml
│   │   │   ├── elasticsearch-templates.yml
│   │   │   ├── elasticsearch.yml
│   │   │   ├── checkParameters.yml
│   │   │   ├── elasticsearch-plugins.yml
│   │   │   └── elasticsearch-config.yml
│   │   ├── vars
│   │   │   ├── Debian.yml
│   │   │   ├── RedHat.yml
│   │   │   └── main.yml
│   │   ├── Gemfile
│   │   ├── templates
│   │   │   ├── elasticsearch.repo
│   │   │   ├── elasticsearch.yml.j2
│   │   │   ├── systemd
│   │   │   │   └── elasticsearch.j2
│   │   │   ├── logging.yml.j2
│   │   │   └── elasticsearch.j2
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── filter_plugins
│   │   │   └── custom.py
│   │   ├── Gemfile.lock
│   │   └── .kitchen.yml
│   ├── ruby-install
│   │   ├── .gitignore
│   │   ├── meta
│   │   │   ├── .galaxy_install_info
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   └── ruby_install.fact
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── test.yml
│   │   ├── .travis.yml
│   │   ├── README.md
│   │   ├── LICENSE
│   │   └── tasks
│   │       └── main.yml
│   ├── logrotate
│   │   ├── tests
│   │   │   ├── inventory
│   │   │   └── test.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   └── logrotate.d.j2
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── .travis.yml
│   │   ├── LICENSE
│   │   └── README.md
│   ├── build-ruby
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── vars
│   │   │   └── main.yml
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── README.md
│   │   └── meta
│   │       └── main.yml
│   ├── nginx-unicorn
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── tasks
│   │   │   ├── main.yml
│   │   │   ├── redhat.yml
│   │   │   └── debian.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── LICENSE
│   │   └── README.md
│   ├── swapfile
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── LICENSE
│   │   ├── tasks
│   │   │   └── main.yml
│   │   └── README.md
│   ├── imagemagick
│   │   ├── .gitignore
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   └── README.md
│   ├── ubuntu-common
│   │   ├── templates
│   │   │   ├── locale
│   │   │   └── sources.list
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── README.md
│   │   └── tasks
│   │       └── main.yml
│   └── letsencrypt
│       ├── meta
│       │   ├── .galaxy_install_info
│       │   └── main.yml
│       ├── defaults
│       │   └── main.yml
│       ├── tasks
│       │   └── main.yml
│       └── README.md
├── .guardrc
├── bin
│   ├── rake
│   ├── bundle
│   ├── rails
│   └── spring
├── config
│   ├── initializers
│   │   ├── cookies_serializer.rb
│   │   ├── elasticsearch.rb
│   │   ├── session_store.rb
│   │   ├── redis.rb
│   │   ├── filter_parameter_logging.rb
│   │   ├── mime_types.rb
│   │   ├── vcr.rb
│   │   ├── assets.rb
│   │   ├── backtrace_silencers.rb
│   │   ├── wrap_parameters.rb
│   │   ├── inflections.rb
│   │   └── sidekiq.rb
│   ├── environment.rb
│   ├── boot.rb
│   ├── database.yml
│   ├── application.rb
│   ├── locales
│   │   └── en.yml
│   ├── sidekiq-slim.yml.example
│   ├── secrets.yml.example
│   ├── config.yml.example
│   ├── sitemap.rb
│   ├── unicorn.rb
│   ├── routes.rb
│   ├── environments
│   │   ├── development.rb
│   │   ├── test.rb
│   │   └── production.rb
│   └── sidekiq.yml.example
├── restart.sh
├── lib
│   └── tasks
│       ├── map.rake
│       ├── report.rake
│       ├── crawl.rake
│       └── sync.rake
├── Rakefile
├── run
├── .gitignore
├── config.ru
├── Gemfile
├── production.yml
└── Guardfile
/log/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/app/sites/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/helpers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/mailers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/models/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.ruby-gemset:
--------------------------------------------------------------------------------
skynet
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
2.7.2
--------------------------------------------------------------------------------
/test/controllers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/fixtures/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/integration/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/roles/chruby/.gitignore:
--------------------------------------------------------------------------------
.vagrant
--------------------------------------------------------------------------------
/roles/elasticsearch/ansible.cfg:
--------------------------------------------------------------------------------
[defaults]
--------------------------------------------------------------------------------
/roles/ruby-install/.gitignore:
--------------------------------------------------------------------------------
.vagrant
--------------------------------------------------------------------------------
/roles/logrotate/tests/inventory:
--------------------------------------------------------------------------------
localhost
--------------------------------------------------------------------------------
/roles/build-ruby/defaults/main.yml:
--------------------------------------------------------------------------------
---
version: 2.2.1
--------------------------------------------------------------------------------
/roles/chruby/defaults/main.yml:
--------------------------------------------------------------------------------
---
chruby_version: '0.3.9'
--------------------------------------------------------------------------------
/roles/nginx-unicorn/defaults/main.yml:
--------------------------------------------------------------------------------
---
nginx_sites: []
--------------------------------------------------------------------------------
/.guardrc:
--------------------------------------------------------------------------------
require File.expand_path('../config/environment', __FILE__)
--------------------------------------------------------------------------------
/roles/build-ruby/vars/main.yml:
--------------------------------------------------------------------------------
---
# vars file for build-ruby
--------------------------------------------------------------------------------
/app/workers/worker.rb:
--------------------------------------------------------------------------------
class Worker
  include Sidekiq::Worker
end
--------------------------------------------------------------------------------
/roles/build-ruby/handlers/main.yml:
--------------------------------------------------------------------------------
---
# handlers file for build-ruby
--------------------------------------------------------------------------------
/roles/elasticsearch/files/scripts/calculate-score.groovy:
--------------------------------------------------------------------------------
log(_score * 2) + my_modifier
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-1x/multi.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-2x/multi.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/chruby/templates/chruby.fact:
--------------------------------------------------------------------------------
{"version": "{{ installed_chruby_version.stdout }}"}
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-1x/config.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-2x/config.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-1x/package.yaml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-2x/package.yaml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/swapfile/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: Reload sysctl
  command: sysctl -p
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-1x/standard.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-2x/standard.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/imagemagick/.gitignore:
--------------------------------------------------------------------------------
*.sublime-project
*.sublime-workspace
.DS_Store
.idea
--------------------------------------------------------------------------------
/roles/logrotate/defaults/main.yml:
--------------------------------------------------------------------------------
logrotate_conf_dir: "/etc/logrotate.d/"
logrotate_scripts: []
--------------------------------------------------------------------------------
/roles/ubuntu-common/templates/locale:
--------------------------------------------------------------------------------
LC_ALL={{ common_locale_all }}
LANG={{ common_locale_lang }}
--------------------------------------------------------------------------------
/app/workers/scheduler/base.rb:
--------------------------------------------------------------------------------
class Scheduler::Base < Worker
  include Sidetiq::Schedulable
end
--------------------------------------------------------------------------------
/roles/letsencrypt/meta/.galaxy_install_info:
--------------------------------------------------------------------------------
{install_date: 'Thu Feb 4 19:04:37 2016', version: master}
--------------------------------------------------------------------------------
/roles/ruby-install/meta/.galaxy_install_info:
--------------------------------------------------------------------------------
{install_date: 'Thu Oct 2 20:55:31 2014', version: v0.1.0}
--------------------------------------------------------------------------------
/roles/ruby-install/templates/ruby_install.fact:
--------------------------------------------------------------------------------
{"version": "{{ installed_ruby_install_version.stdout }}"}
--------------------------------------------------------------------------------
/bin/rake:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require_relative '../config/boot'
require 'rake'
Rake.application.run
--------------------------------------------------------------------------------
/roles/nginx-unicorn/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: reload nginx
  service: name=nginx state=reloaded
--------------------------------------------------------------------------------
/roles/ruby-install/defaults/main.yml:
--------------------------------------------------------------------------------
# file: ruby-install/defaults/main.yml

ruby_install_version: '0.4.3'
--------------------------------------------------------------------------------
/roles/chruby/templates/chruby.sh:
--------------------------------------------------------------------------------
source /usr/local/share/chruby/chruby.sh
source /usr/local/share/chruby/auto.sh
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/helpers/serverspec/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'rspec-retry'
--------------------------------------------------------------------------------
/roles/chruby/test.yml:
--------------------------------------------------------------------------------
- hosts: all
  vars_files:
    - defaults/main.yml
  tasks:
    - include: tasks/main.yml
--------------------------------------------------------------------------------
/roles/elasticsearch/.gitignore:
--------------------------------------------------------------------------------
.kitchen/
*.pyc
.vendor
.bundle
Converging
TODO
.idea/
elasticsearch.iml
--------------------------------------------------------------------------------
/roles/ruby-install/test.yml:
--------------------------------------------------------------------------------
- hosts: all
  vars_files:
    - defaults/main.yml
  tasks:
    - include: tasks/main.yml
--------------------------------------------------------------------------------
/bin/bundle:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)
load Gem.bin_path('bundler', 'bundle')
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-Debian-version-lock.yml:
--------------------------------------------------------------------------------
---
- name: Debian - hold elasticsearch version
  command: apt-mark hold elasticsearch
--------------------------------------------------------------------------------
/bin/rails:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
APP_PATH = File.expand_path('../../config/application', __FILE__)
require_relative '../config/boot'
require 'rails/commands'
--------------------------------------------------------------------------------
/config/initializers/cookies_serializer.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

Rails.application.config.action_dispatch.cookies_serializer = :json
--------------------------------------------------------------------------------
/config/initializers/elasticsearch.rb:
--------------------------------------------------------------------------------
require 'typhoeus/adapters/faraday'
Elasticsearch::Model.client = Elasticsearch::Client.new(Rails.configuration.config[:elasticsearch])
--------------------------------------------------------------------------------
/config/initializers/session_store.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

Rails.application.config.session_store :cookie_store, key: '_crawler_session'
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/Debian.yml:
--------------------------------------------------------------------------------
---
java: "{{ es_java | default('openjdk-7-jre-headless') }}"
default_file: "/etc/default/elasticsearch"
es_home: "/usr/share/elasticsearch"
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/main.yml:
--------------------------------------------------------------------------------
---
- include: debian.yml
  when: ansible_os_family == 'Debian'

- include: redhat.yml
  when: ansible_os_family == 'RedHat'
--------------------------------------------------------------------------------
/config/environment.rb:
--------------------------------------------------------------------------------
# Load the Rails application.
require File.expand_path('../application', __FILE__)

# Initialize the Rails application.
Rails.application.initialize!
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/RedHat.yml:
--------------------------------------------------------------------------------
---
java: "{{ es_java | default('java-1.8.0-openjdk.x86_64') }}"
default_file: "/etc/sysconfig/elasticsearch"
es_home: "/usr/share/elasticsearch"
--------------------------------------------------------------------------------
/roles/elasticsearch/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'test-kitchen', '1.4.2'
gem "kitchen-docker", '2.1.0'
gem 'kitchen-ansible', '0.40.1'
gem 'net-ssh', '~> 2.0'
--------------------------------------------------------------------------------
/roles/swapfile/defaults/main.yml:
--------------------------------------------------------------------------------
---
swapfile_location: /swapfile
swapfile_size: 512MB
swapfile_swappiness: False
swapfile_vfs_cache_pressure: False
swapfile_use_dd: False
--------------------------------------------------------------------------------
/config/boot.rb:
--------------------------------------------------------------------------------
# Set up gems listed in the Gemfile.
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)

require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'config_spec'

describe 'Config Tests v 1.x' do
  include_examples 'config::init', "1.7.3"
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'config_spec'

describe 'Config Tests v 2.x' do
  include_examples 'config::init', "2.2.0"
end
--------------------------------------------------------------------------------
/config/initializers/redis.rb:
--------------------------------------------------------------------------------
Redis.current = Redis.new(
  host: Rails.configuration.config[:redis][:host],
  port: Rails.configuration.config[:redis][:port],
  password: Rails.configuration.config[:redis][:password]
)
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'multi_spec'

describe 'Multi Tests v 1.x' do
  include_examples 'multi::init', "1.7.3", ["kopf","marvel"]
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'package_spec'

describe 'Package Tests v 1.x' do
  include_examples 'package::init', "1.7.3", ["kopf","marvel"]
end
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
ENV['RAILS_ENV'] ||= 'test'
require File.expand_path('../../config/environment', __FILE__)
require 'rails/test_help'

class ActiveSupport::TestCase
  fixtures :all
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'standard_spec'

describe 'Standard Tests v 1.x' do
  include_examples 'standard::init', "1.7.3"
end
--------------------------------------------------------------------------------
/app/models/page/base.rb:
--------------------------------------------------------------------------------
class Page::Base < Page::Url
  attr_accessor :page

  def parser
    page.parser
  end

  def base
    "#{page.uri.scheme}://#{page.uri.host}"
  end
end
--------------------------------------------------------------------------------
/restart.sh:
--------------------------------------------------------------------------------
git checkout . && git pull origin master && cd app/sites && git pull origin master && cd ../.. && cp config/sidekiq.yml.example config/sidekiq.yml && bundle && RAILS_ENV=production bundle exec sidekiq -d -L log/sidekiq.log
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'package_spec'

describe 'Package Tests v 2.x' do
  include_examples 'package::init', "2.2.0", ["kopf","license","marvel-agent"]
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'standard_spec'

describe 'Standard Tests v 2.x' do
  include_examples 'standard::init', "2.2.0"
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'multi_spec'

describe 'Multi Tests v 2.x' do
  include_examples 'multi::init', "2.2.0", ["kopf","license","marvel-agent"]
end
--------------------------------------------------------------------------------
/roles/swapfile/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: "Kamal Nasser"
  description: swapfile
  license: MIT
  min_ansible_version: 1.4
  version: 0.4
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/config/initializers/filter_parameter_logging.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Configure sensitive parameters which will be filtered from the log file.
Rails.application.config.filter_parameters += [:password]
--------------------------------------------------------------------------------
/roles/elasticsearch/files/templates/basic.json:
--------------------------------------------------------------------------------
{
  "template" : "te*",
  "settings" : {
    "number_of_shards" : 1
  },
  "mappings" : {
    "type1" : {
      "_source" : { "enabled" : false }
    }
  }
}
--------------------------------------------------------------------------------
/lib/tasks/map.rake:
--------------------------------------------------------------------------------
namespace :map do
  desc 'Run the crawler in Mapper::Reader mode'
  task :reader, [:bucket] => :environment do |_task, args|
    Redis::List.new('visited').clear
    Mapper::Reader.perform_async args.bucket
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard.yml:
--------------------------------------------------------------------------------
---
- name: wrapper playbook for kitchen testing "elasticsearch"
  hosts: localhost
  roles:
    - { role: elasticsearch, es_instance_name: "node1" }
  vars:
    es_use_repository: "true"
--------------------------------------------------------------------------------
/app/controllers/application_controller.rb:
--------------------------------------------------------------------------------
class ApplicationController < ActionController::Base
  respond_to :json

  def index
    redirect_to Rails.configuration.config[:admin][:docs] || 'https://github.com/bastosmichael/skynet'
  end
end
--------------------------------------------------------------------------------
/lib/tasks/report.rake:
--------------------------------------------------------------------------------
namespace :report do
  desc 'Get List of Api keys'
  task :api_keys do
    Cloud.new('api-keys').files.map(&:key).map { |key| Record::Base.new('api-keys', key.gsub('.json','')).data.merge(api_key: key).symbolize_keys! }
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/main.yml:
--------------------------------------------------------------------------------
---
es_package_url: "https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch"
es_conf_dir: "/etc/elasticsearch"
sysd_script: "/usr/lib/systemd/system/elasticsearch.service"
init_script: "/etc/init.d/elasticsearch"
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.

require File.expand_path('../config/application', __FILE__)

Rails.application.load_tasks
--------------------------------------------------------------------------------
/roles/ubuntu-common/defaults/main.yml:
--------------------------------------------------------------------------------
---
common_apt_mirror: http://archive.ubuntu.com/ubuntu/
common_release_code: trusty

common_timezone: Asia/Shanghai

common_locale_all: en_US.UTF-8
common_locale_lang: en_US.UTF-8

common_apt_cache_time: 3600
--------------------------------------------------------------------------------
/app/workers/crawler/slider.rb:
--------------------------------------------------------------------------------
class Crawler::Slider < Crawler::Sampler
  sidekiq_options queue: :slider,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60
end
--------------------------------------------------------------------------------
/config/initializers/mime_types.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Add new mime types for use in respond_to blocks:
# Mime::Type.register "text/richtext", :rtf

Mime::Type.register 'application/xls', :xls
Mime::Type.register 'application/jpeg', :jpg
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-service.yml:
--------------------------------------------------------------------------------
# Make sure the service is started, and restart if necessary
- name: Start elasticsearch service
  service: name={{instance_init_script | basename}} state=started enabled=yes
  when: es_start_service
  register: elasticsearch_started
--------------------------------------------------------------------------------
/app/workers/syncer/reslider.rb:
--------------------------------------------------------------------------------
class Syncer::Reslider < Syncer::Base
  def perform(container)
    @container = container
    records.with_progress("Reslide Crawling #{container}").each do |r|
      Crawler::Slider.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/ubuntu-common/meta/main.yml:
--------------------------------------------------------------------------------
galaxy_info:
  author: AR
  description: common setup routine for ubuntu
  license: MIT
  min_ansible_version: 1.2
  platforms:
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/roles/elasticsearch/templates/elasticsearch.repo:
--------------------------------------------------------------------------------
[elasticsearch-{{ es_major_version }}]
name=Elasticsearch repository for {{ es_major_version }} packages
baseurl=http://packages.elastic.co/elasticsearch/{{ es_major_version }}/centos
gpgcheck=1
gpgkey=http://packages.elastic.co/GPG-KEY-elasticsearch
enabled=1
--------------------------------------------------------------------------------
/app/models/persist.rb:
--------------------------------------------------------------------------------
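# Minimal key/value adapter over the Cloud storage model. The commented-out
# lines in config/initializers/vcr.rb suggest it is intended as a custom VCR
# cassette persister (which, as far as the VCR API goes, only needs []/[]=
# style lookups), though that wiring is currently disabled.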
class Persist
  def initialize(cloud)
    @cloud = cloud
  end

  def [](key)
    @cloud.get(key).try(:body)
  end

  def []=(key, content)
    @cloud.sync(key, content)
  end

  def exists?(key)
    @cloud.head(key)
  end
end
--------------------------------------------------------------------------------
/app/workers/syncer/resocializer.rb:
--------------------------------------------------------------------------------
class Syncer::Resocializer < Syncer::Base
  def perform(container)
    @container = container
    records.with_progress("Resocialize Crawling #{container}").each do |r|
      Crawler::Socializer.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/handlers/main.yml:
--------------------------------------------------------------------------------

- name: restart elasticsearch
  service: name={{instance_init_script | basename}} state=restarted enabled=yes
  when: es_restart_on_change and es_start_service and not elasticsearch_started.changed and ((plugin_installed is defined and plugin_installed.changed) or elasticsearch_install.changed)
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/redhat.yml:
--------------------------------------------------------------------------------
---
- name: Update/Install nginx
  yum: name=nginx state=latest

- name: Generate site configurations
  template: src=nginx-site.j2 dest=/etc/nginx/conf.d/{{ item.name }}.conf owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites
--------------------------------------------------------------------------------
/roles/logrotate/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Nick Hammond
  description: Role to configure logrotate scripts
  license: BSD
  min_ansible_version: 1.5
  platforms:
    - name: Ubuntu
      versions:
        - lucid
        - precise
        - trusty
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/app/workers/syncer/respider.rb:
--------------------------------------------------------------------------------
class Syncer::Respider < Syncer::Base
  def perform(container, spider_type = 'Spider')
    @container = container
    records.with_progress("Respider Crawling #{container}").each do |r|
      ('Crawler::' + spider_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/app/workers/recorder/collector.rb:
--------------------------------------------------------------------------------
class Recorder::Collector < Recorder::Base
  def perform(container)
    @container = container
    # collections.each do |r|
    #   ap record(r.key)
    # end
  end
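
  # Collection blobs live under keys prefixed with "_", the same convention
  # that Mapper::Base#indexes relies on; everything else is a plain record.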
  def collections
    @collections ||= cloud.files.map { |f| f if f.key.starts_with? '_' }.compact
  end
end
--------------------------------------------------------------------------------
/app/workers/recorder/base.rb:
--------------------------------------------------------------------------------
class Recorder::Base < Worker
  sidekiq_options queue: :recorder,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def cloud
    @cloud ||= Cloud.new(@container)
  end
end
--------------------------------------------------------------------------------
/app/workers/syncer/rescrimper.rb:
--------------------------------------------------------------------------------
class Syncer::Rescrimper < Syncer::Base
  def perform(container, scrimper_type = 'Scrimper')
    @container = container
    records.with_progress("Rescrimp Crawling #{container}").each do |r|
      ('Crawler::' + scrimper_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/ruby-install/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Andrew Angelo Ang
  description: "Installs ruby-install."
  company: InnoHub, Inc.
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Ubuntu
      versions:
        - trusty
  categories:
    - development
    - system
dependencies: []
--------------------------------------------------------------------------------
/app/workers/scheduler/reindexer.rb:
--------------------------------------------------------------------------------
# class Scheduler::Reindexer < Scheduler::Base
#   recurrence { daily }

#   def perform
#     containers = Rails.configuration.config[:admin][:api_containers]
#     if containers.any?
#       containers.each {|c| Syncer::Reindexer.perform_async c }
#     end if Rails.env.production?
#   end
# end
--------------------------------------------------------------------------------
/config/initializers/vcr.rb:
--------------------------------------------------------------------------------
VCR.configure do |c|
  c.cassette_library_dir = 'tmp/cache'
  c.hook_into :typhoeus
  c.default_cassette_options = { match_requests_on: [:uri, :body, :method] }
  # c.cassette_persisters[:cloud] = Persist.new(Cloud.new('semanticvcr'))
  # c.default_cassette_options[:persist_with] = :cloud
  c.allow_http_connections_when_no_cassette = true
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/helpers/serverspec/spec_helper.rb:
--------------------------------------------------------------------------------
require 'serverspec'
set :backend, :exec

require 'rspec/retry'

RSpec.configure do |config|
  # show retry status in spec process
  config.verbose_retry = true
  # show exception that triggers a retry if verbose_retry is set to true
  config.display_try_failure_messages = true
end
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-RedHat-version-lock.yml:
--------------------------------------------------------------------------------
---
- name: RedHat - install yum-version-lock
  yum: name=yum-plugin-versionlock state=present update_cache=yes
- name: RedHat - lock elasticsearch version
  shell: yum versionlock delete 0:elasticsearch* ; yum versionlock add elasticsearch{% if es_version is defined and es_version != "" %}-{{ es_version }}{% endif %}
--------------------------------------------------------------------------------
/roles/imagemagick/tasks/main.yml:
--------------------------------------------------------------------------------
---

- name: Install ImageMagick (Debian)
  apt: pkg=imagemagick
  when: ansible_os_family == "Debian"
  become: true

- name: Install ImageMagick (Red Hat)
  yum: pkg={{ item }} state=installed
  when: ansible_os_family == "RedHat"
  become: true
  with_items:
    - ImageMagick
    - ImageMagick-devel
--------------------------------------------------------------------------------
/app/workers/syncer/resampler.rb:
--------------------------------------------------------------------------------
class Syncer::Resampler < Syncer::Base
  def perform(container, sampler_type = 'Sampler', scrimper_type = 'Scrimper')
    @container = container
    records.with_progress("Resample Crawling #{container}").each do |r|
      ('Crawler::' + sampler_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url), scrimper_type
    end
  end
end
--------------------------------------------------------------------------------
/roles/chruby/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Andrew Angelo Ang
  description: Installs chruby on Ubuntu systems.
  company: InnoHub, Inc.
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - development
    - system
dependencies: []
--------------------------------------------------------------------------------
/roles/chruby/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible==1.7.1
script:
  - echo localhost > inventory
  - ansible-playbook --syntax-check -i inventory test.yml
  - ansible-playbook -i inventory test.yml --connection=local --sudo
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-version-lock.yml:
--------------------------------------------------------------------------------
---
# Trigger Debian section
- name: Include Debian specific Elasticsearch
  include: elasticsearch-Debian-version-lock.yml
  when: ansible_os_family == 'Debian'

# Trigger Redhat section
- name: Include RedHat specific Elasticsearch
  include: elasticsearch-RedHat-version-lock.yml
  when: ansible_os_family == 'RedHat'
--------------------------------------------------------------------------------
/roles/ruby-install/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible==1.7.1
script:
  - echo localhost > inventory
  - ansible-playbook --syntax-check -i inventory test.yml
  - ansible-playbook -i inventory test.yml --connection=local --sudo
--------------------------------------------------------------------------------
/config/initializers/assets.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Version of your assets, change this if you want to expire all your assets.
Rails.application.config.assets.version = '1.0'

# Precompile additional assets.
# application.js, application.css, and all non-JS/CSS in app/assets folder are already added.
# Rails.application.config.assets.precompile += %w( search.js )
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package.yml:
--------------------------------------------------------------------------------
---
- name: Elasticsearch Package tests
  hosts: localhost
  roles:
    - { role: elasticsearch, es_config: { "http.port": 9200, "transport.tcp.port":9300, discovery.zen.ping.unicast.hosts: "localhost:9300" }, es_instance_name: "node1" }
  vars:
    es_scripts: true
    es_templates: true
    #Plugins installed for this test are specified in .kitchen.yml under suite
--------------------------------------------------------------------------------
/roles/logrotate/templates/logrotate.d.j2:
--------------------------------------------------------------------------------
# {{ ansible_managed }}

"{{ item.path }}" {
  {% if item.options is defined -%}
  {% for option in item.options -%}
  {{ option }}
  {% endfor -%}
  {% endif %}
  {%- if item.scripts is defined -%}
  {%- for name, script in item.scripts.iteritems() -%}
  {{ name }}
    {{ script }}
  endscript
  {% endfor -%}
  {% endif -%}
}
--------------------------------------------------------------------------------
/roles/imagemagick/meta/main.yml:
--------------------------------------------------------------------------------
---

dependencies: []

galaxy_info:
  author: danbohea
  description: ImageMagick for Linux
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Debian
      versions:
        - jessie
        - wheezy
    - name: EL
      versions:
        - 6
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - web
--------------------------------------------------------------------------------
/app/workers/mapper/id_availability.rb:
--------------------------------------------------------------------------------
class Mapper::IdAvailability < Mapper::Base
  def perform(container, id)
    @container = container
    types = container.split('-').last.pluralize.gsub(':', '')
    index = Rails.env + '-' + types
    cloud.head(id + '.json').try(:destroy)
    Elasticsearch::Model.client.delete index: index, type: container, id: id
    Elasticsearch::Model.client.indices.refresh index: index
  end
end
--------------------------------------------------------------------------------
/config/initializers/backtrace_silencers.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# You can add backtrace silencers for libraries that you're using but don't wish to see in your backtraces.
# Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ }

# You can also remove all the silencers if you're trying to debug a problem that might stem from framework code.
# Rails.backtrace_cleaner.remove_silencers!
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/java.yml:
--------------------------------------------------------------------------------
---

- set_fact: java_state="present"

- set_fact: java_state="latest"
  when: update_java == true

- name: RedHat - Ensure Java is installed
  yum: name={{ java }} state={{java_state}}
  when: ansible_os_family == 'RedHat'

- name: Debian - Ensure Java is installed
  apt: name={{ java }} state={{java_state}} update_cache=yes force=yes
  when: ansible_os_family == 'Debian'
--------------------------------------------------------------------------------
/roles/logrotate/tasks/main.yml:
--------------------------------------------------------------------------------
---
- name: nickhammond.logrotate | Install logrotate
  action: "{{ansible_pkg_mgr}} pkg=logrotate state=present"
  when: logrotate_scripts is defined and logrotate_scripts | length > 0

- name: nickhammond.logrotate | Setup logrotate.d scripts
  template:
    src: logrotate.d.j2
    dest: "{{ logrotate_conf_dir }}{{ item.name }}"
  with_items: logrotate_scripts
  when: logrotate_scripts is defined
--------------------------------------------------------------------------------
/roles/build-ruby/tasks/main.yml:
--------------------------------------------------------------------------------
---
# tasks file for build-ruby


#- command: /usr/bin/test -e /opt/rubies/ruby-{{version}}
#  register: ruby_installed
#  ignore_errors: True

- name: Install ruby-{{version}}
  shell: ruby-install --install-dir /opt/rubies/ruby-{{version}} --no-reinstall ruby {{version}}
  #when: ruby_installed|failed

- name: Install bundler
  shell: chruby-exec ruby {{version}} -- gem install bundler --conservative
--------------------------------------------------------------------------------
/roles/logrotate/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible
script:
  - "printf '[defaults]\nroles_path = ../' > ansible.cfg"
  - ansible-playbook -i tests/inventory --syntax-check tests/test.yml
  - ansible-playbook -i tests/inventory --connection=local --sudo -vvvv tests/test.yml
notifications:
  email: false
--------------------------------------------------------------------------------
/roles/ubuntu-common/README.md:
--------------------------------------------------------------------------------
# ubuntu-common (ansible role)

This role intends to provide a common setup routine for Ubuntu LTS.

## Requirements

This role requires Ansible 1.2 or higher, and platform requirements are listed in the metadata file.

## Role Variables

TODO

## Examples

TODO

## Dependencies

None

## License

MIT

## Author Information

AR
--------------------------------------------------------------------------------
/app/workers/recorder/uploader.rb:
--------------------------------------------------------------------------------
class Recorder::Uploader < Recorder::Base
  def perform(metadata = {})
    if url = metadata['url']
      uploader = Record::Upload.new(url)
      uploader.id = metadata['id']
      uploader.metadata = metadata
      hash = uploader.sync

      Mapper::Indexer.perform_async uploader.container,
                                    uploader.id,
                                    hash
    end unless metadata.nil?
  end
end
--------------------------------------------------------------------------------
/roles/nginx-unicorn/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Alexandros Giouzenis
  description: Nginx installation with Unicorn integration
  license: MIT
  min_ansible_version: 1.2
  platforms:
    - name: EL
      versions:
        - all
    - name: Fedora
      versions:
        - all
    - name: Ubuntu
      versions:
        - all
    - name: Debian
      versions:
        - all
  categories:
    - system
    - web
dependencies: []
--------------------------------------------------------------------------------
/app/workers/syncer/base.rb:
--------------------------------------------------------------------------------
class Syncer::Base < Worker
  sidekiq_options queue: :syncer,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def cloud
    @cloud ||= Cloud.new(@container)
  end

  def records
    @records ||= cloud.files
  end

  def record(record)
    Record::Base.new(@container, record)
  end
end
--------------------------------------------------------------------------------
/roles/logrotate/tests/test.yml:
--------------------------------------------------------------------------------
---
- hosts: all
  sudo: True
  roles:
    - ansible-logrotate
    - role: ansible-logrotate
      logrotate_scripts:
        - name: nginx-options
          path: /var/log/nginx/options.log
          options:
            - daily

    - role: ansible-logrotate
      logrotate_scripts:
        - name: nginx-scripts
          path: /var/log/nginx/scripts.log
          scripts:
            postrotate: "echo test"
--------------------------------------------------------------------------------
/app/helpers/counts_helper.rb:
--------------------------------------------------------------------------------
module CountsHelper
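  # Abbreviates counters for display. A few sample outputs of the
  # implementation below:
  #   pretty_integer(nil)        #=> "0"
  #   pretty_integer(950)        #=> "950"
  #   pretty_integer(1_500)      #=> "1.5 K"
  #   pretty_integer(2_000_000)  #=> "2 M"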
  def pretty_integer(integer)
    return '0' if integer.nil? || integer == 0
    if integer > 999 && integer <= 999_999
      ('%.1f K' % (integer / 1000.0)).sub('.0', '')
    elsif integer > 999_999 && integer <= 999_999_999
      ('%.1f M' % (integer / 1_000_000.0)).sub('.0', '')
    elsif integer > 999_999_999
      ('%.1f B' % (integer / 1_000_000_000.0)).sub('.0', '')
    else
      integer.to_s
    end
  end
end
--------------------------------------------------------------------------------
/roles/letsencrypt/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Finn Herzfeld
  description: Generate TLS certificates and get them signed by Let's Encrypt.
  issue_tracker_url: https://github.com/thefinn93/ansible-letsencrypt/issues
  license: GPL
  min_ansible_version: 1.2

  platforms:
    - name: Ubuntu
      versions:
        - trusty
    - name: Debian
      versions:
        - jessie

  categories:
    - networking
    - web

dependencies: []
--------------------------------------------------------------------------------
/app/workers/crawler/socializer.rb:
--------------------------------------------------------------------------------
class Crawler::Socializer < Crawler::Sampler
  sidekiq_options queue: :socializer,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def upload
    scraper.clear
    @parsed = parsed.merge(parser.save) if parser.build
    if parsed.presence && parsed['type']
      Recorder::Uploader.perform_async parsed.merge(social.shares)
    end
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/meta/main.yml:
--------------------------------------------------------------------------------
---

allow_duplicates: yes

galaxy_info:
  author: Robin Clarke, Jakob Reiter, Dale McDiarmid
  description: Elasticsearch for Linux
  company: "Elastic.co"
  license: "license (Apache)"
  # Require 1.6 for apt deb install
  min_ansible_version: 1.6
  platforms:
    - name: EL
      versions:
        - 6
        - 7
    - name: Debian
      versions:
        - all
    - name: Ubuntu
      versions:
        - all
  categories:
    - system

dependencies: []
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/debian.yml:
--------------------------------------------------------------------------------
---
- name: Update/Install nginx
  apt: name=nginx state=latest

- name: Generate site configurations
  template: src=nginx-site.j2 dest=/etc/nginx/sites-available/{{ item.name }} owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites

- name: Enable sites
  file: src=/etc/nginx/sites-available/{{ item.name }} dest=/etc/nginx/sites-enabled/{{ item.name }} state=link owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites
--------------------------------------------------------------------------------
/bin/spring:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

# This file loads spring without using Bundler, in order to be fast
# It gets overwritten when you run the `spring binstub` command

unless defined?(Spring)
  require 'rubygems'
  require 'bundler'

  if match = Bundler.default_lockfile.read.match(/^GEM$.*?^    spring \((.*?)\)$.*?^$/m)
Bundler.default_lockfile.read.match(/^GEM$.*?^  spring \((.*?)\)$.*?^$/m) 11 |   ENV['GEM_PATH'] = ([Bundler.bundle_path.to_s] + Gem.path).join(File::PATH_SEPARATOR) 12 |   ENV['GEM_HOME'] = '' 13 |   Gem.paths = ENV 14 | 15 |   gem 'spring', match[1] 16 |   require 'spring/binstub' 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /config/initializers/wrap_parameters.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # This file contains settings for ActionController::ParamsWrapper which 4 | # is enabled by default. 5 | 6 | # Enable parameter wrapping for JSON. You can disable this by setting :format to an empty array. 7 | ActiveSupport.on_load(:action_controller) do 8 |   wrap_parameters format: [:json] if respond_to?(:wrap_parameters) 9 | end 10 | 11 | # To enable root element in JSON for ActiveRecord objects. 12 | # ActiveSupport.on_load(:active_record) do 13 | #   self.include_root_in_json = true 14 | # end 15 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-optional-user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Add the elasticsearch user before installing from packages. 3 | - name: Ensure optional elasticsearch group is created with the correct id. 4 |   group: 5 |     state: present 6 |     name: "{{ es_group }}" 7 |     system: yes 8 |     gid: "{{ es_group_id }}" 9 | 10 | - name: Ensure optional elasticsearch user is created with the correct id. 11 |   user: 12 |     state: present 13 |     name: "{{ es_user }}" 14 |     comment: elasticsearch system user 15 |     system: yes 16 |     createhome: no 17 |     uid: "{{ es_user_id }}" 18 |     group: "{{ es_group }}" 19 | -------------------------------------------------------------------------------- /roles/letsencrypt/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | letsencrypt_src_directory: /usr/local/share/letsencrypt 3 | letsencrypt_venv: "{{ letsencrypt_src_directory }}/env" 4 | letsencrypt_cert_domains: 5 |  - "{{ ansible_fqdn }}" 6 | letsencrypt_webroot_path: /var/www 7 | letsencrypt_authenticator: webroot 8 | letsencrypt_email: "webmaster@{{ ansible_domain }}" 9 | letsencrypt_command: "{{ letsencrypt_venv }}/bin/letsencrypt --agree-tos --text {% for domain in letsencrypt_cert_domains %}-d {{ domain }} {% endfor %}--email {{ letsencrypt_email }} {% if letsencrypt_server is defined %}--server {{ letsencrypt_server }}{% endif %} --expand" 10 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'trollop' 4 | require 'ap' 5 | 6 | opts = Trollop.options do 7 |   banner 'Usage: ruby crawl.rb -u http://amazon.com [options]' 8 |   opt :host, 'Set the host api to grab from', type: :string 9 |   opt :urls, 'Set the URL you want to crawl', type: :strings 10 |   opt :api_key, 'Set the api key to grab from', type: :string 11 |   opt :depth, 'Set the depth you want to crawl', type: :integer 12 |   opt :file, 'Read URLs from a url.txt file in the data folder', default: false 13 |   opt :ua, 'Set a custom user agent. 
Ex: -ua Googlebot' 13 | end 14 | 15 | trap('INT') { exit } 16 | 17 | ap opts 18 | -------------------------------------------------------------------------------- /app/models/record/screenshot.rb: -------------------------------------------------------------------------------- 1 | class Record::Screenshot < Record::Base 2 |   def initialize(container, record, date) 3 |     if match = container.match(/(.+?)-/) 4 |       @container = match[1] + '-screenshots' 5 |     end 6 |     @record_id = record 7 |     @record = record + '/' + date + '.jpg' 8 |   end 9 | 10 |   def screenshot 11 |     @screenshot ||= cloud.get(@record) 12 |   end 13 | 14 |   def link 15 |     screenshot.url(Date.tomorrow.to_time.to_i) 16 |   end 17 | 18 |   def data 19 |     if screenshot 20 |       { id: @record_id, redirect_url: link } 21 |     else 22 |       { error: 'screenshot not available' } 23 |     end 24 |   end 25 | end 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files for more about ignoring files. 2 | # 3 | # If you find yourself ignoring temporary files generated by your text editor 4 | # or operating system, you probably want to add a global ignore instead: 5 | #   git config --global core.excludesfile '~/.gitignore_global' 6 | 7 | # Ignore bundler config. 8 | /.bundle 9 | 10 | # Ignore the default SQLite database. 11 | /db/*.sqlite3 12 | /db/*.sqlite3-journal 13 | 14 | # Ignore all logfiles and tempfiles. 15 | /log/*.log 16 | /tmp 17 | /app/sites 18 | /config/application.yml 19 | /config/sidekiq.yml 20 | /config/config.yml 21 | /config/secrets.yml 22 | /production 23 | -------------------------------------------------------------------------------- /app/workers/mapper/base.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Base < Worker 2 |   sidekiq_options queue: :mapper, 3 |                   retry: true, 4 |                   backtrace: true, 5 |                   unique: :until_and_while_executing, 6 |                   unique_expiration: 120 * 60 7 | 8 |   def cloud 9 |     @cloud ||= Cloud.new(@container) 10 |   end 11 | 12 |   def records 13 |     @records ||= cloud.files.reject { |f| f.key.starts_with? '_' } 14 |   end 15 | 16 |   def indexes 17 |     @indexes ||= cloud.files.select { |f| f.key.starts_with? '_' } 18 |   end 19 | 20 |   def record(record) 21 |     Record::Base.new(@container, record) 22 |   end 23 | end 24 | -------------------------------------------------------------------------------- /config/database.yml: -------------------------------------------------------------------------------- 1 | # SQLite version 3.x 2 | #   gem install sqlite3 3 | # 4 | #   Ensure the SQLite 3 gem is defined in your Gemfile 5 | #   gem 'sqlite3' 6 | # 7 | default: &default 8 |   adapter: sqlite3 9 |   pool: 5 10 |   timeout: 5000 11 | 12 | development: 13 |   <<: *default 14 |   database: db/development.sqlite3 15 | 16 | # Warning: The database defined as "test" will be erased and 17 | # re-generated from your development database when you run "rake". 18 | # Do not set this db to the same as development or production. 
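
A note on the `_`-prefix convention that `Mapper::Base` relies on above: files whose keys start with `_` are treated as index documents, and everything else as plain records. A minimal standalone sketch of that partition (the `OpenStruct` stand-ins are hypothetical; the real objects are Fog files, of which only `#key` matters here):

```ruby
require 'ostruct'

# Stand-ins for Fog cloud files; only #key is relevant to the split.
files = [OpenStruct.new(key: '_catalog.json'),
         OpenStruct.new(key: 'abc123.json'),
         OpenStruct.new(key: 'def456.json')]

indexes, records = files.partition { |f| f.key.start_with?('_') }

records.map(&:key) # => ["abc123.json", "def456.json"]
indexes.map(&:key) # => ["_catalog.json"]
```
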
19 | test: 20 | <<: *default 21 | database: db/test.sqlite3 22 | 23 | production: 24 | <<: *default 25 | database: db/production.sqlite3 26 | -------------------------------------------------------------------------------- /roles/elasticsearch/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | es_major_version: "2.x" 3 | es_version: "2.2.0" 4 | es_version_lock: false 5 | es_use_repository: true 6 | es_start_service: true 7 | update_java: false 8 | es_restart_on_change: true 9 | es_plugins_reinstall: false 10 | es_scripts: false 11 | es_templates: false 12 | es_user: elasticsearch 13 | es_group: elasticsearch 14 | es_config: {} 15 | #Need to provide default directories 16 | es_pid_dir: "/var/run/elasticsearch" 17 | es_data_dirs: "/var/lib/elasticsearch" 18 | es_log_dir: "/var/log/elasticsearch" 19 | es_work_dir: "/tmp/elasticsearch" 20 | es_plugin_dir: "/usr/share/elasticsearch/plugins" 21 | es_max_open_files: 65536 22 | 23 | -------------------------------------------------------------------------------- /roles/chruby/README.md: -------------------------------------------------------------------------------- 1 | # InnoHub Ansible : chruby [![Build Status](https://travis-ci.org/innohub-ansible/chruby.svg?branch=master)](https://travis-ci.org/innohub-ansible/chruby) 2 | 3 | Installs chruby. 4 | 5 | Requirements 6 | ------------ 7 | 8 | Tested on Ubuntu 12.04 and 14.04 only. 9 | 10 | Role Variables 11 | -------------- 12 | 13 | chruby_version : defaults to '0.3.9' 14 | 15 | Example Playbook 16 | ---------------- 17 | 18 | Example Playbook: 19 | 20 | - hosts: servers 21 | roles: 22 | - { role: innohub-ansible.chruby } 23 | 24 | Example Role: 25 | 26 | dependencies: 27 | - { role: chruby } 28 | 29 | License 30 | ------- 31 | 32 | MIT 33 | -------------------------------------------------------------------------------- /roles/imagemagick/README.md: -------------------------------------------------------------------------------- 1 | # Ansible Role: ImageMagick 2 | 3 | An Ansible role that installs [ImageMagick](http://www.imagemagick.org/script/index.php) on RHEL/CentOS and Debian/Ubuntu. 4 | 5 | 6 | ## Requirements 7 | 8 | None. 9 | 10 | 11 | ## Role Variables 12 | 13 | None. 14 | 15 | 16 | ## Dependencies 17 | 18 | None. 19 | 20 | 21 | ## Example Playbook 22 | 23 | ``` 24 | - hosts: servers 25 | roles: 26 | - { role: hashbangcode.imagemagick } 27 | ``` 28 | 29 | ## License 30 | 31 | MIT 32 | 33 | 34 | ## Author Information 35 | 36 | This role was created by [Dan Bohea](http://bohea.co.uk) originally for use with [Vlad](https://github.com/hashbangcode/vlad). 37 | -------------------------------------------------------------------------------- /app/controllers/v1/batch_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::BatchController < V1::AccessController 2 | def index 3 | container = Api::V1.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 
7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.batch(new_params[:batch], default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) } } 11 | status = 200 12 | end 13 | respond_to do |format| 14 | format.json { json_response(status, results) } 15 | format.xml { xml_response(status, results) } 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /app/controllers/v1/trends_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::TrendsController < V1::AccessController 2 | def index 3 | container = Record::Trends.new(params[:container]) 4 | if params[:array].empty? 5 | results = errors_response('no results found') 6 | status = 404 7 | else 8 | results = { results: container.sort(params[:array].split(','), default_options.merge(social: params[:social] || true)).map { |h| Record::Addons.append(h) }, 9 | pagination: pagination(container.total) } 10 | status = 200 11 | end 12 | respond_to do |format| 13 | format.json { json_response(status, results) } 14 | format.xml { xml_response(status, results) } 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /config/application.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path('../boot', __FILE__) 2 | 3 | require 'rails/all' 4 | 5 | # Require the gems listed in Gemfile, including any gems 6 | # you've limited to :test, :development, or :production. 7 | Bundler.require(*Rails.groups) 8 | 9 | module Crawler 10 | class Application < Rails::Application 11 | config.middleware.insert_before 0, Rack::Health, :path => '/elb-status' 12 | config.config = config_for(:config).deep_symbolize_keys! 13 | require_relative '../app/sites/initializer.rb' if File.exists?('../app/sites/initializer.rb') 14 | config.autoload_paths += Dir[Rails.root.join('app', 'sites', '{**}')] 15 | config.autoload_paths += %W(#{config.root}/helpers) 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/elasticsearch.yml.j2: -------------------------------------------------------------------------------- 1 | 2 | {% if es_config %} 3 | {{ es_config | to_nice_yaml }} 4 | {% endif %} 5 | 6 | {% if es_config['cluster.name'] is not defined %} 7 | cluster.name: elasticsearch 8 | {% endif %} 9 | 10 | {% if es_config['node.name'] is not defined %} 11 | node.name: {{inventory_hostname}}-{{es_instance_name}} 12 | {% endif %} 13 | 14 | #################################### Paths #################################### 15 | 16 | # Path to directory containing configuration (this file and logging.yml): 17 | path.conf: {{ conf_dir }} 18 | 19 | path.data: {{ data_dirs | array_to_str }} 20 | 21 | path.work: {{ work_dir }} 22 | 23 | path.logs: {{ log_dir }} 24 | 25 | path.plugins: {{ plugin_dir }} -------------------------------------------------------------------------------- /config.ru: -------------------------------------------------------------------------------- 1 | # This file is used by Rack-based servers to start the application. 
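
The V1 controllers above all follow the same shape: build a result set, pick an HTTP status, then render it as JSON or XML. The helpers they call (`errors_response`, `json_response`, `xml_response`, `pagination`) live in the unshown `V1::AccessController`, so the sketch below is only an assumption about their likely shape, not the actual implementation:

```ruby
# Hypothetical sketch of the shared response helpers assumed by the
# V1 controllers; the real versions live in V1::AccessController.
class V1::AccessController < ApplicationController
  private

  def errors_response(message)
    { errors: [message] } # always an array, so clients can iterate
  end

  def json_response(status, results)
    render json: results, status: status
  end

  def xml_response(status, results)
    render xml: results, status: status
  end

  def pagination(total)
    { total: total, page: (params[:page] || 1).to_i }
  end
end
```
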
2 | 3 | # --- Start of unicorn worker killer code --- 4 | 5 | if ENV['RAILS_ENV'] == 'production' 6 | require 'unicorn/worker_killer' 7 | 8 | max_request_min = 500 9 | max_request_max = 600 10 | 11 | # Max requests per worker 12 | use Unicorn::WorkerKiller::MaxRequests, max_request_min, max_request_max 13 | 14 | oom_min = (240) * (1024**2) 15 | oom_max = (260) * (1024**2) 16 | 17 | # Max memory size (RSS) per worker 18 | use Unicorn::WorkerKiller::Oom, oom_min, oom_max 19 | end 20 | 21 | # --- End of unicorn worker killer code --- 22 | 23 | require ::File.expand_path('../config/environment', __FILE__) 24 | run Rails.application 25 | -------------------------------------------------------------------------------- /config/initializers/inflections.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Add new inflection rules using the following format. Inflections 4 | # are locale specific, and you may define rules for as many different 5 | # locales as you wish. All of these examples are active by default: 6 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 7 | # inflect.plural /^(ox)$/i, '\1en' 8 | # inflect.singular /^(ox)en/i, '\1' 9 | # inflect.irregular 'person', 'people' 10 | # inflect.uncountable %w( fish sheep ) 11 | # end 12 | 13 | # These inflection rules are supported but not enabled by default: 14 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 15 | # inflect.acronym 'RESTful' 16 | # end 17 | -------------------------------------------------------------------------------- /config/locales/en.yml: -------------------------------------------------------------------------------- 1 | # Files in the config/locales directory are used for internationalization 2 | # and are automatically loaded by Rails. If you want to use locales other 3 | # than English, add the necessary files in this directory. 4 | # 5 | # To use the locales, use `I18n.t`: 6 | # 7 | # I18n.t 'hello' 8 | # 9 | # In views, this is aliased to just `t`: 10 | # 11 | # <%= t('hello') %> 12 | # 13 | # To use a different locale, set it with `I18n.locale`: 14 | # 15 | # I18n.locale = :es 16 | # 17 | # This would use the information in config/locales/es.yml. 18 | # 19 | # To learn more, please read the Rails Internationalization guide 20 | # available at http://guides.rubyonrails.org/i18n.html. 
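
For context on the worker-killer block in `config.ru` above: each Unicorn worker draws its own limit from the configured range, so restarts are staggered instead of every worker dying at the same request count. The arithmetic, spelled out as a sketch:

```ruby
# Thresholds from config.ru, spelled out. Each worker samples its own
# limit from the range, which staggers restarts across workers.
max_request_min = 500
max_request_max = 600
request_limit   = rand(max_request_min..max_request_max)

oom_min = 240 * (1024**2) # 240 MiB => 251_658_240 bytes
oom_max = 260 * (1024**2) # 260 MiB => 272_629_760 bytes

puts "restart after #{request_limit} requests or ~#{oom_max} bytes RSS"
```
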
21 | 22 | en: 23 | hello: "Hello world" 24 | -------------------------------------------------------------------------------- /roles/ubuntu-common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: set locale 3 | template: src=locale 4 | dest=/etc/default/locale 5 | 6 | - name: set timezone 7 | file: src=/usr/share/zoneinfo/{{ common_timezone }} 8 | dest=/etc/localtime 9 | force=yes 10 | state=link 11 | 12 | - name: update source.list 13 | template: src=sources.list 14 | dest=/etc/apt/sources.list 15 | 16 | - name: update apt cache 17 | apt: update_cache=yes 18 | cache_valid_time={{ common_apt_cache_time }} 19 | 20 | - name: install common packages 21 | apt: pkg={{ item }} 22 | state=present 23 | with_items: 24 | - build-essential 25 | - git 26 | - sqlite 27 | - libsqlite3-dev 28 | - libmagickwand-dev 29 | -------------------------------------------------------------------------------- /app/models/record/export.rb: -------------------------------------------------------------------------------- 1 | class Record::Export 2 | def initialize(container, headers = ['name', 'url']) 3 | @container = container 4 | @headers = headers 5 | end 6 | 7 | def csv 8 | require 'csv' 9 | CSV.open('test.csv', 'w') do |csv| 10 | indexes.with_progress.each do |index| 11 | id = index.key.gsub('.json','') 12 | hash = record(id).current_data({ crawl: false, social: false }) 13 | csv << hash.values 14 | end # of hsh's (rows) 15 | end # of csv open 16 | end 17 | 18 | def cloud 19 | @cloud ||= Cloud.new(@container) 20 | end 21 | 22 | def indexes 23 | @records ||= cloud.files 24 | end 25 | 26 | def record(record) 27 | Api::V1.new(@container, record) 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /app/workers/scheduler/clearer.rb: -------------------------------------------------------------------------------- 1 | class Scheduler::Clearer < Scheduler::Base 2 | recurrence { daily } 3 | 4 | def perform 5 | Redis::List.new('sampler_visited').clear 6 | Redis::List.new('sampler_one_visited').clear 7 | Redis::List.new('sampler_two_visited').clear 8 | Redis::List.new('sampler_three_visited').clear 9 | Redis::List.new('sampler_four_visited').clear 10 | Redis::List.new('sampler_five_visited').clear 11 | Redis::List.new('spider_visited').clear 12 | Redis::List.new('spider_one_visited').clear 13 | Redis::List.new('spider_two_visited').clear 14 | Redis::List.new('spider_three_visited').clear 15 | Redis::List.new('spider_four_visited').clear 16 | Redis::List.new('spider_five_visited').clear 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /app/controllers/v1/match_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::MatchController < V1::AccessController 2 | def index 3 | container = Record::Match.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 
7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.best(new_params, default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) }, 11 | pagination: pagination(container.total) } 12 | status = 200 13 | end 14 | respond_to do |format| 15 | format.json { json_response(status, results) } 16 | format.xml { xml_response(status, results) } 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /app/controllers/v1/search_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::SearchController < V1::AccessController 2 | def index 3 | container = Record::Search.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.search(new_params, default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) }, 11 | pagination: pagination(container.total) } 12 | status = 200 13 | end 14 | respond_to do |format| 15 | format.json { json_response(status, results) } 16 | format.xml { xml_response(status, results) } 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /app/workers/syncer/reindexer.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Reindexer < Syncer::Base 2 | def perform(container) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | index = Rails.env + '-' + types 6 | Elasticsearch::Model.client.indices.refresh index: index 7 | records.with_progress("Remapping #{container}").each do |r| 8 | id = r.key.gsub('.json','') 9 | begin 10 | unless Elasticsearch::Model.client.exists? 
index: index, type: container, id: id 11 | # temp = Mapper::Indexer.new 12 | # temp.perform @container, id 13 | Mapper::Indexer.perform_async @container, id 14 | end 15 | rescue 16 | Mapper::Indexer.perform_async @container, id 17 | end 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /config/sidekiq-slim.yml.example: -------------------------------------------------------------------------------- 1 | --- 2 | :concurrency: 1 3 | :pidfile: tmp/pids/sidekiq.pid 4 | :queues: 5 | - [mapper, 4_000_000] 6 | - [recorder, 300_000] 7 | - [default, 20] 8 | - [sitemapper_one, 1] 9 | - [sitemapper_two, 1] 10 | - [sitemapper_three, 1] 11 | - [sitemapper_four, 1] 12 | - [sitemapper_five, 1] 13 | - [sitemapper_six, 1] 14 | - [sitemapper_seven, 1] 15 | - [sitemapper_eight, 1] 16 | - [sitemapper_nine, 1] 17 | - [sitemapper_ten, 1] 18 | - [sitemapper, 1] 19 | :limits: 20 | sitemapper_one: 1 21 | sitemapper_two: 1 22 | sitemapper_three: 1 23 | sitemapper_four: 1 24 | sitemapper_five: 1 25 | sitemapper_six: 1 26 | sitemapper_seven: 1 27 | sitemapper_eight: 1 28 | sitemapper_nine: 1 29 | sitemapper_ten: 1 30 | sitemapper: 1 31 | stretcher: 1 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-scripts.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: es_script_dir={{ es_conf_dir }}/{{es_instance_name}} 4 | tags: 5 | - always 6 | 7 | - set_fact: es_script_dir={{es_config['path.scripts']}} 8 | when: es_config['path.scripts'] is defined 9 | tags: 10 | - always 11 | 12 | - name: Create script dir 13 | file: state=directory path={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 14 | 15 | - name: Copy default scripts to elasticsearch 16 | copy: src=scripts dest={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 17 | when: es_scripts_fileglob is not defined 18 | 19 | - name: Copy scripts to elasticsearch 20 | copy: src={{ item }} dest={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 21 | with_fileglob: es_scripts_fileglob 22 | -------------------------------------------------------------------------------- /app/workers/syncer/mover.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Mover < Syncer::Base 2 | def perform(from_container, to_container) 3 | @container = from_container 4 | @to_container = to_container 5 | records.with_progress("Move from #{from_container} to #{to_container}").each do |r| 6 | from_record = record(r.key) 7 | old_data = from_record.data 8 | old_data['type'] = new_type 9 | to_record(r.key).data = old_data 10 | from_record.delete 11 | end 12 | end 13 | 14 | def new_type 15 | @new_type ||= @to_container.match(/-(.+)/)[1].try(:singularize).try(:capitalize) rescue nil 16 | end 17 | 18 | def to_record(new_record) 19 | Record::Base.new(@to_container, new_record) 20 | end 21 | 22 | def to_cloud 23 | @to_cloud ||= Cloud.new(@to_container) 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /app/workers/recorder/fixer.rb: -------------------------------------------------------------------------------- 1 | class Recorder::Fixer < Recorder::Base 2 | def perform(container = nil, record = nil) 3 | if container && record 4 | new_hash = Record::Base.new(container, record).data 5 | new_hash.delete('screenshot') 6 | 7 | if new_hash['price'] 8 | new_hash['price'] = 
new_hash['price'].delete_if {|k,v| v.include?('-') } 9 | new_hash.delete('price') if new_hash['price'].blank? 10 | end 11 | 12 | if new_hash['original_price'] 13 | new_hash['original_price'] = new_hash['original_price'].delete_if {|k,v| v.include?('-') } 14 | new_hash.delete('original_price') if new_hash['original_price'].blank? 15 | end 16 | 17 | Record::Base.new(container, record).data = new_hash 18 | Crawler::Scrimper.perform_async new_hash['url'] 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Test explicit setting of parameters and variables 3 | - name: Elasticsearch Config tests 4 | hosts: localhost 5 | roles: 6 | #expand to all available parameters 7 | - { role: elasticsearch, es_instance_name: "node1", es_data_dirs: ["/opt/elasticsearch/data-1","/opt/elasticsearch/data-2"], es_log_dir: "/opt/elasticsearch/logs", es_work_dir: "/opt/elasticsearch/temp", es_user_id: 333, es_group_id: 333, es_config: {node.name: "node1", cluster.name: "custom-cluster", discovery.zen.ping.unicast.hosts: "localhost:9301", http.port: 9201, transport.tcp.port: 9301, node.data: false, node.master: true, bootstrap.mlockall: true, discovery.zen.ping.multicast.enabled: false } } 8 | vars: 9 | es_scripts: false 10 | es_templates: false 11 | es_version_lock: false 12 | es_heap_size: 1g -------------------------------------------------------------------------------- /app/workers/mapper/cleaner.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Cleaner < Mapper::Base 2 | def perform(container, _standard = []) 3 | @container = container 4 | records.with_progress{"Cleaning #{container}"}.each do |r| 5 | data = record(r.key).data 6 | new_data = parse_record data 7 | record(r.key).data = new_data unless data == new_data 8 | end 9 | end 10 | 11 | def parse_record(data) 12 | if id = data['id'] 13 | data.each do |k, v| 14 | ap 'KEY' 15 | ap k 16 | if v.is_a?(Hash) 17 | v.each do |k2, v2| 18 | ap '!!!!!!!!!!!!INNER KEY' 19 | ap k2 20 | ap '!!!!!!!!!!!!INNER VALUE' 21 | ap v2 22 | end 23 | else 24 | ap 'VALUE' 25 | ap v 26 | end 27 | end 28 | end 29 | data 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: check-parameters 3 | include: checkParameters.yml 4 | tags: 5 | - check 6 | - name: os-specific vars 7 | include_vars: "{{ansible_os_family}}.yml" 8 | tags: 9 | - always 10 | - include: java.yml 11 | tags: 12 | - java 13 | - include: elasticsearch.yml 14 | tags: 15 | - install 16 | - include: elasticsearch-config.yml 17 | tags: 18 | - config 19 | - include: elasticsearch-scripts.yml 20 | when: es_scripts 21 | tags: 22 | - scripts 23 | - include: elasticsearch-plugins.yml 24 | when: es_plugins is defined or es_plugins_reinstall 25 | tags: 26 | - plugins 27 | - include: elasticsearch-service.yml 28 | tags: 29 | - service 30 | - include: elasticsearch-templates.yml 31 | when: es_templates 32 | tags: 33 | - templates 34 | - meta: flush_handlers 35 | -------------------------------------------------------------------------------- /app/workers/syncer/rescreener.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Rescreener < 
Syncer::Base 2 | def perform(container, cleanup = false) 3 | @container = container 4 | records.with_progress("Rescreen Crawling #{container}").each do |r| 5 | record(r.key).screenshots.each do |key, value| 6 | Crawler::Screener.perform_async value, key unless files.include? key 7 | files.delete(key) 8 | end 9 | end if screenshots_cloud 10 | files.each { |f| screenshots_cloud.head(f).try(:destroy) } if cleanup 11 | end 12 | 13 | def screenshots_container 14 | @screenshots_container ||= @container.match(/(.+)-/)[1] + '-screenshots' rescue nil 15 | end 16 | 17 | def screenshots_cloud 18 | @screenshots_cloud ||= Cloud.new(screenshots_container) rescue nil 19 | end 20 | 21 | def files 22 | @files ||= screenshots_cloud.files.map(&:key) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /app/models/record/addons.rb: -------------------------------------------------------------------------------- 1 | class Record::Addons 2 | def self.append hash 3 | if appends = Rails.configuration.config[:admin][:append][hash[:container].try(:to_sym)] 4 | appends.each do |key, value| 5 | hash[key] = hash[key] + value if hash[key] 6 | end 7 | end 8 | 9 | if inserts = Rails.configuration.config[:admin][:insert][hash[:container].try(:to_sym)] 10 | inserts.each do |key, value| 11 | if hash[key] && key == :url 12 | hash[key] = value + CGI.escape(hash[key]) 13 | elsif hash[key] 14 | hash[key] = value + hash[key] 15 | end 16 | end 17 | end 18 | 19 | if addons = Rails.configuration.config[:admin][:addons][hash[:container].try(:to_sym)] 20 | addons.each do |key, value| 21 | hash[key] = value 22 | end 23 | end 24 | 25 | return hash 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /app/workers/crawler/screener.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Screener < Crawler::Base 2 | sidekiq_options queue: :screener, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, path) 9 | return if url.nil? 
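
To make `Record::Addons.append` above concrete, here is a worked example with an invented config (the `:append`/`:insert`/`:addons` values are illustrative only; the real ones come from `config.yml`):

```ruby
require 'cgi'

# Hypothetical admin config for a 'shop-offers' container.
append = { name: ' | Shop' }               # concatenated after the value
insert = { url: 'https://r.example/?u=' }  # prefixed; :url gets CGI-escaped
addons = { source: 'crawler' }             # set unconditionally

hash = { container: 'shop-offers', name: 'Widget', url: 'http://shop.test/w' }

append.each { |k, v| hash[k] += v if hash[k] }
insert.each do |k, v|
  next unless hash[k]
  hash[k] = k == :url ? v + CGI.escape(hash[k]) : v + hash[k]
end
addons.each { |k, v| hash[k] = v }

hash[:name] # => "Widget | Shop"
hash[:url]  # => "https://r.example/?u=http%3A%2F%2Fshop.test%2Fw"
```
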
10 | @url = url 11 | capturer.relative_path = path 12 | capturer.screen 13 | rescue EOFError => e 14 | Crawler::Screener.perform_async url, path 15 | rescue Net::ReadTimeout => e 16 | Crawler::Screener.perform_async url, path 17 | rescue ChildProcess::TimeoutError => e 18 | Crawler::Screener.perform_async url, path 19 | rescue Selenium::WebDriver::Error::WebDriverError => e 20 | Crawler::Screener.perform_async url, path 21 | end 22 | 23 | def capturer 24 | @capturer ||= Crawl::Capture.new(@url) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /roles/elasticsearch/filter_plugins/custom.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dale mcdiarmid' 2 | 3 | import re 4 | 5 | def modify_list(values=[], pattern='', replacement='', ignorecase=False): 6 | ''' Perform a `re.sub` on every item in the list''' 7 | if ignorecase: 8 | flags = re.I 9 | else: 10 | flags = 0 11 | _re = re.compile(pattern, flags=flags) 12 | return [_re.sub(replacement, value) for value in values] 13 | 14 | def append_to_list(values=[], suffix=''): 15 | if isinstance(values, basestring): 16 | values = values.split(',') 17 | return [str(value+suffix) for value in values] 18 | 19 | def array_to_str(values=[],separator=','): 20 | return separator.join(values) 21 | 22 | class FilterModule(object): 23 | def filters(self): 24 | return {'modify_list': modify_list, 25 | 'append_to_list':append_to_list, 26 | 'array_to_str':array_to_str} -------------------------------------------------------------------------------- /app/workers/syncer/refixer.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Refixer < Syncer::Base 2 | def perform(container) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | index = Rails.env + '-' + types 6 | Elasticsearch::Model.client.indices.refresh index: index 7 | records.with_progress("Refixing #{container}").each do |r| 8 | id = r.key.gsub('.json','') 9 | if id.size > 20 10 | begin 11 | Elasticsearch::Model.client.delete index: index, type: container, id: id 12 | Elasticsearch::Model.client.indices.refresh index: index 13 | r = record(id) 14 | if url = r.try(:url) 15 | Crawler::Scrimper.perform_async url 16 | end 17 | r.delete 18 | rescue 19 | Mapper::IdAvailability.perform_async container, id 20 | end 21 | end 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /app/workers/crawler/stretcher.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Stretcher < Crawler::Base 2 | sidekiq_options queue: :stretcher, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, hash = {}) 9 | return if url.nil? 
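
`Crawler::Screener` above re-enqueues itself on transient browser and network failures instead of re-raising, which keeps those failures out of Sidekiq's retry accounting. A condensed sketch of the pattern, with the worker and error list generalized (trade-off: a permanently failing URL would requeue forever, since nothing counts attempts):

```ruby
require 'sidekiq'
require 'net/http' # defines Net::ReadTimeout

# Generalized requeue-on-transient-error pattern, as in Crawler::Screener.
class RequeueingWorker
  include Sidekiq::Worker

  TRANSIENT = [EOFError, Net::ReadTimeout].freeze

  def perform(url, path)
    return if url.nil?
    do_work(url, path)
  rescue *TRANSIENT
    # Push a fresh job instead of raising, so transient failures never
    # consume this job's Sidekiq retry budget.
    self.class.perform_async(url, path)
  end

  def do_work(url, path)
    # placeholder for the real capture work (Crawl::Capture in this app)
  end
end
```
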
10 | @parsed = hash 11 | 12 | @url = url 13 | parser.page = scraper.get 14 | upload 15 | rescue Mechanize::ResponseCodeError => e 16 | if e.response_code == '404' || 17 | e.response_code == '410' || 18 | e.response_code == '520' || 19 | e.response_code == '500' || 20 | e.response_code == '301' || 21 | e.response_code == '302' 22 | Mapper::UrlAvailability.perform_async url 23 | else 24 | raise 25 | end 26 | rescue Mechanize::RedirectLimitReachedError => e 27 | nil 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/tasks/crawl.rake: -------------------------------------------------------------------------------- 1 | namespace :crawl do 2 | desc 'Run the crawler in Crawler::Spider mode' 3 | task :spider, [:url] => :environment do |_task, args| 4 | Redis::List.new('visited').clear 5 | Crawler::Spider.perform_async args.url 6 | end 7 | 8 | desc 'Run the crawler in Crawler::Scrimper mode' 9 | task :scrimper, [:url] => :environment do |_task, args| 10 | Redis::List.new('visited').clear 11 | Crawler::Scrimper.perform_async args.url 12 | end 13 | 14 | desc 'Run the crawler in Crawler::Sampler mode' 15 | task :sampler, [:url] => :environment do |_task, args| 16 | Redis::List.new('visited').clear 17 | Crawler::Sampler.perform_async args.url 18 | end 19 | 20 | desc 'Run the crawler in Crawler::Sitemapper mode' 21 | task :sitemapper, [:url] => :environment do |_task, args| 22 | Redis::List.new('visited').clear 23 | Crawler::Sitemapper.perform_async args.url 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /app/helpers/page_helper.rb: -------------------------------------------------------------------------------- 1 | module PageHelper 2 | def build_page 3 | methods.grep(/page_helper/).each do |page| 4 | send(page) 5 | end 6 | # @id = @name.tr(" ", "_") if @type 7 | end 8 | 9 | def page_helper_id 10 | @id = md5 unless @id 11 | end 12 | 13 | def page_helper_url 14 | @url = parser.css("link[@rel='canonical']").first['href'].try(:squish) unless @url rescue nil 15 | @url = page.uri.to_s unless @url 16 | end 17 | 18 | def page_helper_name 19 | @name = parser.at('title').inner_html.try(:squish) unless @name rescue nil 20 | end 21 | 22 | def page_helper_description 23 | @description = parser.css("meta[@name='description']").first['content'].try(:squish) unless @description rescue nil 24 | end 25 | 26 | def page_helper_mobile_url 27 | @mobile_url = parser.css("link[@media='handheld']").first['href'].try(:squish) unless @mobile_url rescue nil 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /roles/ruby-install/README.md: -------------------------------------------------------------------------------- 1 | InnoHub Ansible : ruby-install [![Build Status](https://travis-ci.org/innohub-ansible/ruby-install.svg?branch=master)](https://travis-ci.org/innohub-ansible/ruby-install) 2 | ========================================================================================================================================================================== 3 | 4 | Installs ruby-install. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Works ONLY on Ubuntu 14.04. 
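
`PageHelper#build_page` above dispatches by reflection: every method whose name matches `/page_helper/` is treated as a field extractor and invoked, so adding a field is just adding a method. A minimal self-contained illustration of the idiom (all names here are invented):

```ruby
class Extractor
  def build
    # Every method matching the pattern is an extractor; they are called
    # in method-list order, so extractors should not depend on each other.
    methods.grep(/\Aextract_/).each { |m| send(m) }
    self
  end

  def extract_id
    @id ||= 'md5-of-url'
  end

  def extract_name
    @name ||= 'Example Page'
  end
end

Extractor.new.build # calls extract_id and extract_name via reflection
```
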
10 | 11 | Role Variables 12 | -------------- 13 | 14 | ruby_install_version : defaults to '0.4.3' 15 | 16 | Dependencies 17 | ------------ 18 | 19 | None 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Example Playbook: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: innohub-ansible.ruby-install } 29 | 30 | Example Role: 31 | 32 | dependencies: 33 | - { role: ruby_install } 34 | 35 | License 36 | ------- 37 | 38 | MIT 39 | -------------------------------------------------------------------------------- /app/models/crawl/sitemap.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Sitemap < Page::Url 2 | attr_accessor :xml 3 | 4 | def parser 5 | @parser ||= begin 6 | if uri.to_s.ends_with?('.gz') 7 | require 'zlib' 8 | require 'stringio' 9 | gz = Zlib::GzipReader.new(StringIO.new(xml.body.to_s)) 10 | Nokogiri::XML.parse(gz.read) 11 | else 12 | Nokogiri::XML.parse(xml.body) 13 | end 14 | rescue Zlib::GzipFile::Error 15 | Nokogiri::XML.parse(xml.body) 16 | end 17 | end 18 | 19 | def index_links 20 | @index_links ||= parser.css('//sitemap/loc').map(&:text).compact.uniq.shuffle 21 | end 22 | 23 | def site_links 24 | @site_links ||= parser.css('//url/loc').map(&:text).compact.uniq.shuffle 25 | end 26 | 27 | def base 28 | "#{uri.scheme}://#{uri.host}" 29 | end 30 | 31 | def indexes? 32 | !index_links.empty? 33 | end 34 | 35 | def sites? 36 | !site_links.empty? 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /app/models/flattener.rb: -------------------------------------------------------------------------------- 1 | class Flattener 2 | def initialize(hash) 3 | @hash = hash 4 | @result = {} 5 | @result_iter = {} 6 | @paths = hash.keys.map { |key| [key] } 7 | end 8 | 9 | def flatten(hash = @hash, old_path = []) 10 | hash.each do |key, value| 11 | current_path = old_path + [key] 12 | 13 | if !value.respond_to?(:keys) 14 | @result[current_path.join('_')] = value 15 | else 16 | flatten(value, current_path) 17 | end 18 | end 19 | 20 | @result 21 | end 22 | 23 | def flatten_iter 24 | until @paths.empty? 25 | path = @paths.shift 26 | value = @hash 27 | path.each { |step| value = value[step] } 28 | 29 | if value.respond_to?(:keys) 30 | value.keys.each { |key| @paths << path + [key] } 31 | else 32 | @result_iter[path.join('_')] = value 33 | end 34 | end 35 | 36 | @result_iter 37 | end 38 | 39 | def are_the_same? 40 | flatten == flatten_iter 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /app/workers/crawler/scrimper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Scrimper < Crawler::Base 2 | sidekiq_options queue: :scrimper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, hash = {}) 9 | return if url.nil? 
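
The `Flattener` above (`app/models/flattener.rb`) turns arbitrarily nested hashes into single-level hashes with underscore-joined keys, once recursively (`flatten`) and once with an explicit work queue (`flatten_iter`). A quick usage check based on the code as shown:

```ruby
hash = { 'price' => { 'usd' => 9.99, 'eur' => 8.99 }, 'name' => 'Widget' }

Flattener.new(hash).flatten
# => { "price_usd" => 9.99, "price_eur" => 8.99, "name" => "Widget" }

Flattener.new(hash).are_the_same?
# => true, the recursive and iterative walks agree
```
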
10 | @parsed = hash 11 | 12 | @url = url 13 | Timeout::timeout(60) do 14 | parser.page = scraper.get 15 | end 16 | upload 17 | rescue Mechanize::ResponseCodeError => e 18 | if e.response_code == '404' || 19 | e.response_code == '410' || 20 | e.response_code == '520' || 21 | e.response_code == '500' || 22 | e.response_code == '301' || 23 | e.response_code == '302' 24 | Mapper::UrlAvailability.perform_async url 25 | else 26 | raise 27 | end 28 | rescue Mechanize::RedirectLimitReachedError => e 29 | nil 30 | rescue Timeout::Error => e 31 | Crawler::Stretcher.perform_async url 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /config/secrets.yml.example: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Your secret key is used for verifying the integrity of signed cookies. 4 | # If you change this key, all old signed cookies will become invalid! 5 | 6 | # Make sure the secret is at least 30 characters and all random, 7 | # no regular words or you'll be exposed to dictionary attacks. 8 | # You can use `rake secret` to generate a secure secret key. 9 | 10 | # Make sure the secrets in this file are kept private 11 | # if you're sharing your code publicly. 12 | 13 | development: 14 | secret_key_base: 2be1e0b5ab9e3917ae9dd7511f040b9fcbaa9ddf145ecae46500065762c4f940ade359f8530dc49ec9c434ebe8fa9375f8beda49880c0202eb94de563d0ede19 15 | 16 | test: 17 | secret_key_base: d8e74cc35877031cfb84919c62d2cb486de7f77b70daa7d8e1ac8e7ed186a19d7b7ea0593cb1166ff0ed4f31fe272e73c7a682ce442a51423d4900e6afb0daa2 18 | 19 | # Do not keep production secrets in the repository, 20 | # instead read values from the environment. 21 | production: 22 | secret_key_base: <%= ENV["SECRET_KEY_BASE"] %> 23 | -------------------------------------------------------------------------------- /roles/elasticsearch/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | faraday (0.9.2) 5 | multipart-post (>= 1.2, < 3) 6 | highline (1.7.8) 7 | kitchen-ansible (0.40.1) 8 | librarian-ansible 9 | test-kitchen (~> 1.4) 10 | kitchen-docker (2.1.0) 11 | test-kitchen (>= 1.0.0) 12 | librarian (0.1.2) 13 | highline 14 | thor (~> 0.15) 15 | librarian-ansible (3.0.0) 16 | faraday 17 | librarian (~> 0.1.0) 18 | mixlib-shellout (2.2.6) 19 | multipart-post (2.0.0) 20 | net-scp (1.2.1) 21 | net-ssh (>= 2.6.5) 22 | net-ssh (2.9.4) 23 | safe_yaml (1.0.4) 24 | test-kitchen (1.4.2) 25 | mixlib-shellout (>= 1.2, < 3.0) 26 | net-scp (~> 1.1) 27 | net-ssh (~> 2.7, < 2.10) 28 | safe_yaml (~> 1.0) 29 | thor (~> 0.18) 30 | thor (0.19.1) 31 | 32 | PLATFORMS 33 | ruby 34 | 35 | DEPENDENCIES 36 | kitchen-ansible (= 0.40.1) 37 | kitchen-docker (= 2.1.0) 38 | net-ssh (~> 2.0) 39 | test-kitchen (= 1.4.2) 40 | 41 | BUNDLED WITH 42 | 1.11.2 43 | -------------------------------------------------------------------------------- /config/config.yml.example: -------------------------------------------------------------------------------- 1 | default: &default 2 | secret: 1234 3 | redis: 4 | host: 127.0.0.1 5 | port: 6379 6 | database: 0 7 | password: 8 | elasticsearch: 9 | host: 127.0.0.1 10 | port: 9200 11 | protocol: http 12 | fog: 13 | provider: 'Local' 14 | local_root: 'tmp/fog' 15 | # provider: 'AWS' 16 | # aws_access_key_id: '32 bit key' 17 | # aws_secret_access_key: '32 bit key' 18 | admin: 19 | username: admin 20 | 
password: password 21 | docs: https://github.com/bastosmichael/skynet 22 | api_containers: 23 | - 24 | api_keys: 25 | 'sample-key': 26 | customer: name 27 | permissions: 28 | - record_show 29 | - search_index 30 | - match_index 31 | - trends_index 32 | - batch_index 33 | - status_index 34 | limit: 20 35 | append: {} 36 | insert: {} 37 | tracker: {} 38 | app: 39 | name: crawler 40 | 41 | development: 42 | <<: *default 43 | 44 | production: 45 | <<: *default 46 | 47 | test: 48 | <<: *default 49 | -------------------------------------------------------------------------------- /app/workers/crawler/base.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Base < Worker 2 | def scraper 3 | @scraper ||= Crawl::Base.new(@url) 4 | end 5 | 6 | def parser 7 | @parser ||= scraper.name.capitalize.constantize.new(@url) 8 | rescue NameError 9 | @parser ||= Page::Parse.new(@url) 10 | end 11 | 12 | def upload 13 | scraper.clear 14 | @parsed = parsed.merge(parser.save) if parser.build 15 | if parsed.presence && parsed['type'] 16 | Recorder::Uploader.perform_async parsed 17 | end 18 | end 19 | 20 | def parsed 21 | @parsed ||= {} 22 | end 23 | 24 | def social 25 | @social ||= Crawl::Social.new(@url) 26 | rescue 27 | {} 28 | end 29 | 30 | def internal_links 31 | @internal_links ||= begin 32 | parser.internal_links.map do |url| 33 | scraper.name.capitalize.constantize.sanitize_url(url) 34 | end.compact 35 | rescue 36 | parser.internal_links 37 | end 38 | end 39 | 40 | def visit 41 | internal_links.each do |url| 42 | ('Crawler::' + next_type).constantize.perform_async url 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /roles/ubuntu-common/templates/sources.list: -------------------------------------------------------------------------------- 1 | # binary packages 2 | deb {{ common_apt_mirror }} {{ common_release_code }} main multiverse restricted universe 3 | deb {{ common_apt_mirror }} {{ common_release_code }}-updates main multiverse restricted universe 4 | deb {{ common_apt_mirror }} {{ common_release_code }}-proposed main multiverse restricted universe 5 | deb {{ common_apt_mirror }} {{ common_release_code }}-backports main multiverse restricted universe 6 | deb {{ common_apt_mirror }} {{ common_release_code }}-security main multiverse restricted universe 7 | 8 | # # sources 9 | # deb-src {{ common_apt_mirror }} {{ common_release_code }} main multiverse restricted universe 10 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-updates main multiverse restricted universe 11 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-proposed main multiverse restricted universe 12 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-backports main multiverse restricted universe 13 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-security main multiverse restricted universe 14 | -------------------------------------------------------------------------------- /app/models/page/url.rb: -------------------------------------------------------------------------------- 1 | class Page::Url 2 | URI_REGEX = /\A#{URI.regexp(%w(http https))}\z/ 3 | 4 | attr_accessor :date 5 | 6 | def initialize(url) 7 | @url = clean_up_url(url) 8 | self.date = Date.today.to_s if date.nil? 
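
`Page::Url` (continuing below) normalizes a raw URL and derives storage paths from it. Based on the methods shown, the behaviour for a bare host looks roughly like this (MD5 and date shown as placeholders):

```ruby
page = Page::Url.new('google.com') # no scheme, so 'http://' is prepended
page.url       # => "http://google.com"
page.host      # => "google.com"  (a leading 'www.' would be stripped)
page.name      # => "google"
page.md5       # => Digest::MD5.hexdigest("http://google.com")
page.cache_key # => "google.com/<md5>/<YYYY-MM-DD>"
```
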
9 | end 10 | 11 | def cache_key 12 | File.join(build_path, date) 13 | end 14 | 15 | def build_path 16 | File.join(host, md5) 17 | end 18 | 19 | def uri 20 | @uri ||= URI.parse(@url) 21 | end 22 | 23 | def url 24 | @url ||= uri.to_s 25 | end 26 | 27 | def md5 28 | Digest::MD5.hexdigest(url) 29 | end 30 | 31 | def host 32 | get_host_without_www uri 33 | end 34 | 35 | def name 36 | host.split('.').first 37 | end 38 | 39 | def get_host_without_www(new_uri) 40 | host = new_uri.host.downcase 41 | begin 42 | host.split(/\./)[1] + '.' + host.split(/\./)[2] 43 | rescue 44 | host.start_with?('www.') ? host[4..-1] : host 45 | end 46 | end 47 | 48 | def clean_up_url(url) 49 | url = URI.encode(url) 50 | url = "http://#{url}" if URI.parse(url).scheme.nil? 51 | url 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-RedHat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure libselinux-python on CentOS 6.x 3 | yum: name=libselinux-python state=present update_cache=yes 4 | when: ( ansible_distribution == "CentOS" ) and ( ansible_distribution_major_version == "6" ) 5 | 6 | - name: RedHat - add Elasticsearch repo 7 | template: src=elasticsearch.repo dest=/etc/yum.repos.d/elasticsearch-{{ es_major_version }}.repo 8 | when: es_use_repository 9 | 10 | - name: RedHat - include versionlock 11 | include: elasticsearch-RedHat-version-lock.yml 12 | when: es_version_lock 13 | 14 | - name: RedHat - Install Elasticsearch 15 | yum: name=elasticsearch{% if es_version is defined and es_version != "" %}-{{ es_version }}{% endif %} state=present update_cache=yes 16 | when: es_use_repository 17 | register: elasticsearch_install 18 | 19 | - name: RedHat - Install Elasticsearch from url 20 | yum: name={% if es_custom_package_url is defined %}{{ es_custom_package_url }}{% else %}{{ es_package_url }}-{{ es_version }}.noarch.rpm{% endif %} state=present 21 | when: not es_use_repository 22 | register: elasticsearch_install -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/multi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Test ability to deploy multiple instances to a machine 3 | - name: Elasticsearch Multi tests 4 | hosts: localhost 5 | roles: 6 | - { role: elasticsearch, es_instance_name: "master", es_data_dirs: ["/opt/elasticsearch/master"], es_heap_size: "1g", es_config: { "discovery.zen.ping.multicast.enabled": false, discovery.zen.ping.unicast.hosts: "localhost:9300", http.port: 9200, transport.tcp.port: 9300, node.data: false, node.master: true, bootstrap.mlockall: true, discovery.zen.ping.multicast.enabled: false } } 7 | - { role: elasticsearch, es_instance_name: "node1", es_data_dirs: "/opt/elasticsearch/data-1,/opt/elasticsearch/data-2", es_config: { "discovery.zen.ping.multicast.enabled": false, discovery.zen.ping.unicast.hosts: "localhost:9300", http.port: 9201, transport.tcp.port: 9301, node.data: true, node.master: false, discovery.zen.ping.multicast.enabled: false } } 8 | vars: 9 | es_scripts: true 10 | es_templates: true 11 | es_plugin_dir: "/opt/elasticsearch/plugins" 12 | #Plugins installed for this test are specified in .kitchen.yml under suite -------------------------------------------------------------------------------- /roles/chruby/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT 
License (MIT) 2 | 3 | Copyright (c) 2014 Andrew Angelo Ang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /roles/nginx-unicorn/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Alexandros Giouzenis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /roles/ruby-install/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Andrew Angelo Ang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /roles/swapfile/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Kamal Nasser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/tasks/sync.rake: -------------------------------------------------------------------------------- 1 | namespace :sync do 2 |   desc 'Run the crawler in Syncer::Mover mode' 3 |   task :mover, [:from_bucket, :to_bucket] => :environment do |_task, args| 4 |     Redis::List.new('visited').clear 5 |     Syncer::Mover.perform_async args.from_bucket, args.to_bucket 6 |   end 7 | 8 |   desc 'Run the crawler in Syncer::Rescreener mode' 9 |   task :rescreener, [:bucket] => :environment do |_task, args| 10 |     Redis::List.new('visited').clear 11 |     Syncer::Rescreener.perform_async args.bucket 12 |   end 13 | 14 |   desc 'Run the crawler in Syncer::Rescrimper mode' 15 |   task :rescrimper, [:bucket] => :environment do |_task, args| 16 |     Redis::List.new('visited').clear 17 |     Syncer::Rescrimper.perform_async args.bucket 18 |   end 19 | 20 |   desc 'Run the crawler in Syncer::Resampler mode' 21 |   task :resampler, [:bucket] => :environment do |_task, args| 22 |     Redis::List.new('visited').clear 23 |     Syncer::Resampler.perform_async args.bucket 24 |   end 25 | 26 |   desc 'Run the crawler in Syncer::Respider mode' 27 |   task :respider, [:bucket] => :environment do |_task, args| 28 |     Redis::List.new('visited').clear 29 |     Syncer::Respider.perform_async args.bucket 30 |   end 31 | end 32 | -------------------------------------------------------------------------------- /app/workers/mapper/url_availability.rb: -------------------------------------------------------------------------------- 1 | class Mapper::UrlAvailability < Mapper::Base 2 |   def perform(url) 3 |     @name = Page::Url.new(url).name 4 |     @container = Rails.configuration.config[:admin][:api_containers].find { |c| c.include?(@name) } 5 |     types = @container.split('-').last.pluralize.gsub(':', '') 6 |     @index = Rails.env + '-' + types 7 | 8 |     records = Elasticsearch::Model.client.search(index: @index, type: @container, body: { query: { match_phrase_prefix: { url: url } } }) 9 | 10 |     if records['hits']['total'] > 0 11 |       records['hits']['hits'].each do |record| 12 |         Recorder::Uploader.perform_async({ id: record['_id'], 13 |                                            available: false, 14 |                                            url: record['_source']['url'], 15 |                                            type: record['_type'].split('-').last.capitalize.singularize }) 16 |         # cloud.head(record['_id'] + '.json').try(:destroy) 17 |         # Elasticsearch::Model.client.delete index: @index, type: @container, id: record['_id'] 18 |         # Crawler::Scrimper.perform_async url 19 |       end 20 |     end 21 |   # rescue NoMethodError => e 22 |   #   nil 23 |   end 24 | end 25 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-Debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Debian - Add Elasticsearch repository key 3 |   apt_key: url="http://packages.elasticsearch.org/GPG-KEY-elasticsearch" state=present 4 |   when: es_use_repository 5 | 6 | - name: Debian - add elasticsearch repository 7 |   apt_repository: repo="deb http://packages.elastic.co/elasticsearch/{{ es_major_version }}/debian stable main" state=present 8 |   when: es_use_repository 9 | 10 | - name: Debian - Ensure elasticsearch is installed 11 |   apt: name=elasticsearch{% if es_version is defined and es_version != "" %}={{ es_version }}{% endif %} state=present cache_valid_time=86400 12 |   when: es_use_repository 13 |   register: elasticsearch_install 14 | 15 | - name: Debian - Download elasticsearch from url 16 |   get_url: url={% if es_custom_package_url is defined %}{{ 
es_custom_package_url }}{% else %}{{ es_package_url }}-{{ es_version }}.deb{% endif %} dest=/tmp/elasticsearch-{{ es_version }}.deb validate_certs=no 17 | when: not es_use_repository 18 | 19 | - name: Debian - Ensure elasticsearch is installed from downloaded package 20 | apt: deb=/tmp/elasticsearch-{{ es_version }}.deb 21 | when: not es_use_repository 22 | register: elasticsearch_install -------------------------------------------------------------------------------- /config/sitemap.rb: -------------------------------------------------------------------------------- 1 | # Set the host name for URL creation 2 | SitemapGenerator::Sitemap.default_host = "https://" + ENV['DOMAIN'] + '/' 3 | 4 | SitemapGenerator::Sitemap.sitemaps_path = "#{ENV['CONTAINER']}/" 5 | 6 | SitemapGenerator::Sitemap.create_index = true 7 | 8 | SitemapGenerator::Sitemap.public_path = "tmp/#{ENV['DOMAIN']}/#{ENV['CONTAINER']}/" 9 | 10 | SitemapGenerator::Sitemap.sitemaps_host = "https://#{ENV['DOMAIN']}/" 11 | 12 | SitemapGenerator::Sitemap.adapter = SitemapGenerator::S3Adapter.new(Rails.configuration.config[:fog].merge(fog_directory: "#{ENV['DOMAIN']}-sitemaps", 13 | fog_region: 'us-west-1', fog_provider: 'AWS')) 14 | sitemap_default_options = { 15 | changefreq: nil, 16 | priority: nil, 17 | lastmod: nil 18 | } 19 | 20 | SitemapGenerator::Sitemap.create do 21 | # ['boxed-offers'].each do |container| 22 | begin 23 | Cloud.new(ENV['CONTAINER']).files.each do |file| 24 | add (ENV['CONTAINER'] + '/' + file.key.gsub('.json','')), sitemap_default_options#.merge(lastmod: file.last_modified) 25 | end 26 | rescue => e 27 | ap e.message 28 | end 29 | # Example. DOMAIN=pricenometry.com CONTAINER=newegg-offers bundle exec rake sitemap:refresh 30 | end 31 | -------------------------------------------------------------------------------- /config/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | require 'sidekiq' 2 | 3 | # Build the Redis URL once; the server and client use identical connection settings. 4 | redis = Rails.configuration.config[:redis] 5 | auth = redis[:password].presence ? ":#{redis[:password]}@" : '' 6 | url = "redis://#{auth}#{redis[:host]}:#{redis[:port]}/#{redis[:database]}" 7 | 8 | Sidekiq.configure_server do |config| 9 | config.redis = { url: url, namespace: 'crawler' } 10 | end 11 | 12 | Sidekiq.configure_client do |config| 13 | config.redis = { url: url, namespace: 'crawler' } 14 | end 15 | 16 | # Assign the flag; calling the bare getter is a no-op. 17 | SidekiqUniqueJobs.config.unique_args_enabled = true 18 | -------------------------------------------------------------------------------- /test/models/url_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | 3 | class UrlTest < ActiveSupport::TestCase 4 | setup do 5 | assert @url = Page::Url.new('google.com') 6 | assert @url.date = '2014-08-09' 7 | end 8 | 9 | test 'cache_key method returns
correct cache path' do 10 | assert_equal @url.cache_key, 'google/google.com/c7b920f57e553df2bb68272f61570210/2014-08-09' 11 | end 12 | 13 | test 'build_path method returns correct hashed path' do 14 | assert_equal @url.build_path, 'google/google.com/c7b920f57e553df2bb68272f61570210' 15 | end 16 | 17 | # TODO 18 | # test 'uri method returns a URI object' do 19 | # pending 'Needs to match a URI object' 20 | # end 21 | 22 | test 'url method returns correctly formatted internet address' do 23 | assert_equal @url.url, 'http://google.com' 24 | end 25 | 26 | test 'md5 method returns correct checksum' do 27 | assert path = @url.url 28 | assert checksum = Digest::MD5.hexdigest(path) 29 | assert_equal @url.md5, checksum 30 | end 31 | 32 | test 'host method returns correct host name' do 33 | assert_equal @url.host, 'google.com' 34 | end 35 | 36 | test 'name method returns correct site name' do 37 | assert_equal @url.name, 'google' 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /app/models/crawl/social.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Social < Page::Url 2 | require 'social_shares' 3 | 4 | def shares 5 | if shares = all(url).presence 6 | return shares 7 | elsif url.starts_with?('https') && shares = all(url.gsub('https://','http://')).presence 8 | return shares 9 | else 10 | {} 11 | end 12 | end 13 | 14 | # Memoize per URL so the https fallback in #shares never sees a stale result. 15 | def all(new_url) 16 | @all ||= {} 17 | @all[new_url] ||= SocialShares.all(new_url).delete_if { |_k, v| v == 0 }.map { |k, v| { k.to_s + '_shares' => v.to_i } }.reduce({}, :merge) 18 | end 19 | 20 | def total 21 | all(url).values.sum 22 | end 23 | 24 | def has_shares? 25 | SocialShares.has_any?(url) 26 | end 27 | 28 | def facebook 29 | @facebook ||= sanitize_facebook JSON.parse(Crawl::Base.new("https://graph.facebook.com/?id=#{@url}").get.try(:body), quirks_mode: true) 30 | rescue 31 | {} 32 | end 33 | 34 | def sanitize_facebook(data) 35 | return nil if data['error_message'] || data['error_type'] || data['error_code'] 36 | return nil if data.empty? 37 | Flattener.new(data).flatten.delete_if { |_k, v| v == 0 || v == @url }.map { |k, v| { 'facebook_' + k.to_s => v.try(:squish) || v } }.reduce({}, :merge) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /app/workers/crawler/sampler.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Sampler < Crawler::Base 2 | sidekiq_options queue: :sampler, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = nil, hash = {}) 9 | return if url.nil? 10 | @parsed = hash 11 | 12 | if type.nil?
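# No parser type was passed in, so fall back to this worker's default ('Scrimper').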
13 | next_type 14 | else 15 | @type = type 16 | end 17 | 18 | @url = url 19 | 20 | Timeout::timeout(60) do 21 | parser.page = scraper.get 22 | end 23 | 24 | visit 25 | upload 26 | rescue Mechanize::ResponseCodeError => e 27 | if e.response_code == '404' || 28 | e.response_code == '410' || 29 | e.response_code == '520' || 30 | e.response_code == '500' || 31 | e.response_code == '301' || 32 | e.response_code == '302' 33 | Mapper::UrlAvailability.perform_async url 34 | else 35 | raise 36 | end 37 | rescue Mechanize::RedirectLimitReachedError => e 38 | nil 39 | rescue Timeout::Error => e 40 | Crawler::Stretcher.perform_async url 41 | end 42 | 43 | def next_type 44 | @type ||= 'Scrimper' 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /app/workers/crawler/spider.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Spider < Crawler::Base 2 | sidekiq_options queue: :spider, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 * 365 7 | 8 | def perform(url, type = nil, hash = {}) 9 | return if url.nil? 10 | @parsed = hash 11 | 12 | if type.nil? 13 | next_type 14 | else 15 | @type = type 16 | end 17 | 18 | @url = url 19 | 20 | Timeout::timeout(60) do 21 | parser.page = scraper.get 22 | end 23 | 24 | visit 25 | upload 26 | rescue Mechanize::ResponseCodeError => e 27 | if e.response_code == '404' || 28 | e.response_code == '410' || 29 | e.response_code == '520' || 30 | e.response_code == '500' || 31 | e.response_code == '301' || 32 | e.response_code == '302' 33 | Mapper::UrlAvailability.perform_async url 34 | else 35 | raise 36 | end 37 | rescue Mechanize::RedirectLimitReachedError => e 38 | nil 39 | rescue Timeout::Error => e 40 | Crawler::Stretcher.perform_async url 41 | end 42 | 43 | def next_type 44 | @type ||= 'Spider' 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-templates.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - file: path=/etc/elasticsearch/templates state=directory owner={{ es_user }} group={{ es_group }} 4 | 5 | - name: Copy default templates to elasticsearch 6 | copy: src=templates dest=/etc/elasticsearch/ owner={{ es_user }} group={{ es_group }} 7 | when: es_templates_fileglob is not defined 8 | 9 | - name: Copy templates to elasticsearch 10 | copy: src={{ item }} dest=/etc/elasticsearch/templates owner={{ es_user }} group={{ es_group }} 11 | with_fileglob: "{{ es_templates_fileglob }}" 12 | 13 | - set_fact: http_port=9200 14 | tags: 15 | - always 16 | 17 | - set_fact: http_port={{es_config['http.port']}} 18 | when: es_config['http.port'] is defined 19 | tags: 20 | - always 21 | 22 | - name: Wait for elasticsearch to startup 23 | wait_for: port={{http_port}} delay=10 24 | 25 | - name: Get template files 26 | shell: find . 
-maxdepth 1 -type f | sed "s#\./##" | sed "s/\.json$//" chdir=/etc/elasticsearch/templates 27 | register: resultstemplate 28 | 29 | - name: Install template(s) 30 | command: "curl -sL -XPUT http://localhost:{{http_port}}/_template/{{item}} -d @/etc/elasticsearch/templates/{{item}}.json" 31 | with_items: "{{ resultstemplate.stdout_lines }}" 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: instance_default_file={{default_file | dirname}}/{{es_instance_name}}_{{default_file | basename}} 4 | tags: 5 | - always 6 | - set_fact: instance_init_script={{init_script | dirname }}/{{es_instance_name}}_{{init_script | basename}} 7 | tags: 8 | - always 9 | - set_fact: conf_dir={{ es_conf_dir }}/{{es_instance_name}} 10 | tags: 11 | - always 12 | - set_fact: plugin_dir={{ es_plugin_dir }}/{{es_instance_name}} 13 | tags: 14 | - always 15 | - set_fact: m_lock_enabled={{ es_config['bootstrap.mlockall'] is defined and es_config['bootstrap.mlockall'] == True }} 16 | tags: 17 | - always 18 | 19 | - debug: msg="Node configuration {{ es_config }} " 20 | 21 | - name: Include optional user and group creation. 22 | when: (es_user_id is defined) and (es_group_id is defined) 23 | include: elasticsearch-optional-user.yml 24 | 25 | #- name: Include specific Elasticsearch 26 | # include: "elasticsearch-{{ansible_os_family}}.yml" 27 | 28 | #Install OS specific elasticsearch - this can be abbreviated in version 2.0.0 29 | - name: Include specific Elasticsearch 30 | include: elasticsearch-Debian.yml 31 | when: ansible_os_family == 'Debian' 32 | 33 | - name: Include specific Elasticsearch 34 | include: elasticsearch-RedHat.yml 35 | when: ansible_os_family == 'RedHat' 36 | -------------------------------------------------------------------------------- /config/unicorn.rb: -------------------------------------------------------------------------------- 1 | # paths 2 | app_path = '/home/ubuntu/skynet' 3 | working_directory "#{app_path}/current" 4 | pid "#{app_path}/current/tmp/pids/unicorn.pid" 5 | 6 | # listen 7 | listen "#{app_path}/current/tmp/sockets/unicorn.sock", backlog: 64 8 | 9 | # logging 10 | stderr_path 'log/unicorn.stderr.log' 11 | stdout_path 'log/unicorn.stdout.log' 12 | 13 | # workers 14 | worker_processes 2 # Use 2 with a nano server, 18 for a medium 15 | 16 | # use correct Gemfile on restarts 17 | before_exec do |_server| 18 | ENV['BUNDLE_GEMFILE'] = "#{app_path}/current/Gemfile" 19 | end 20 | 21 | # preload 22 | preload_app true 23 | 24 | before_fork do |server, _worker| 25 | # the following is highly recommended for Rails + "preload_app true" 26 | # as there's no need for the master process to hold a connection 27 | ActiveRecord::Base.connection.disconnect! if defined?(ActiveRecord::Base) 28 | 29 | # Before forking, kill the master process that belongs to the .oldbin PID. 30 | # This enables 0 downtime deploys.
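# (The old master keeps running under its .oldbin pidfile until the new master
# has booted; the QUIT below lets it finish in-flight requests and exit cleanly.)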
31 | old_pid = "#{server.config[:pid]}.oldbin" 32 | if File.exist?(old_pid) && server.pid != old_pid 33 | begin 34 | Process.kill('QUIT', File.read(old_pid).to_i) 35 | rescue Errno::ENOENT, Errno::ESRCH 36 | # someone else did our job for us 37 | end 38 | end 39 | end 40 | 41 | after_fork do |_server, _worker| 42 | ActiveRecord::Base.establish_connection if defined?(ActiveRecord::Base) 43 | end 44 | -------------------------------------------------------------------------------- /config/routes.rb: -------------------------------------------------------------------------------- 1 | Rails.application.routes.draw do 2 | 3 | root to: 'application#index' 4 | 5 | require "sidekiq/web" 6 | Sidekiq::Web.use Rack::Auth::Basic do |username, password| 7 | username == Rails.configuration.config[:admin][:username] && password == Rails.configuration.config[:admin][:password] 8 | end if Rails.env.production? 9 | mount Sidekiq::Web, at: "/sidekiq" 10 | 11 | namespace :v1, defaults: { format: 'json' } do 12 | get '/', to: 'status#index' 13 | get '/match', to: 'match#index', results: 10 14 | get '/search/:query', to: 'search#index' 15 | post '/batch' => 'batch#index' 16 | get '/trends/:array', to: 'trends#index' 17 | get '/:container/ids', to: 'record#ids' 18 | get '/:container/match', to: 'match#index' 19 | get '/:container/search/:query', to: 'search#index' 20 | get '/:container/trends/:array', to: 'trends#index' 21 | get '/:container/:record_id/related', to: 'record#related' 22 | get '/:container/:record_id/history', to: 'record#history' 23 | get '/:container/:record_id/news', to: 'record#news' 24 | get '/:container/:record_id/videos', to: 'record#videos' 25 | get '/:container/:record_id/references', to: 'record#references' 26 | get '/:container/:record_id/links', to: 'record#links' 27 | get '/:container/:record_id/:screenshot_id', to: 'record#screenshot' 28 | get '/:container/:record_id', to: 'record#index' 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /roles/ruby-install/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Download ruby-install 3 | shell: wget -O ruby-install-{{ ruby_install_version }}.tar.gz https://github.com/postmodern/ruby-install/archive/v{{ ruby_install_version }}.tar.gz 4 | when: 5 | ansible_local is not defined or 6 | ansible_local.ruby_install is not defined or 7 | ansible_local.ruby_install.version != ruby_install_version 8 | register: ruby_install_downloaded 9 | 10 | - name: Extract ruby-install 11 | shell: tar -xzvf ruby-install-{{ ruby_install_version }}.tar.gz 12 | when: ruby_install_downloaded | changed 13 | 14 | - name: Install ruby-install 15 | sudo: true 16 | shell: 17 | chdir=ruby-install-{{ ruby_install_version }} 18 | make install 19 | when: ruby_install_downloaded | changed 20 | register: ruby_install_installed 21 | 22 | - name: Clean up ruby-install sources 23 | shell: rm -rf ruby-install-* 24 | when: ruby_install_downloaded | changed 25 | 26 | # 27 | # Setup ruby-install facts.d 28 | # 29 | - name: Capture installed ruby-install version 30 | shell: ruby-install --version | awk '{ print $2 }' 31 | ignore_errors: yes 32 | register: installed_ruby_install_version 33 | 34 | - name: Create ansible facts directory 35 | sudo: true 36 | file: state=directory recurse=yes path=/etc/ansible/facts.d 37 | 38 | - name: Set ruby-install facts 39 | sudo: true 40 | template: src=ruby_install.fact dest=/etc/ansible/facts.d/ruby_install.fact 41 | 
-------------------------------------------------------------------------------- /roles/chruby/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Download chruby 4 | shell: wget -O chruby-{{ chruby_version }}.tar.gz https://github.com/postmodern/chruby/archive/v{{ chruby_version }}.tar.gz 5 | when: 6 | ansible_local is not defined or 7 | ansible_local.chruby is not defined or 8 | ansible_local.chruby.version != chruby_version 9 | register: chruby_downloaded 10 | 11 | - name: Extract chruby 12 | shell: tar -xzvf chruby-{{ chruby_version }}.tar.gz 13 | when: chruby_downloaded | changed 14 | register: chruby_extracted 15 | 16 | - name: Install chruby 17 | sudo: true 18 | shell: cd chruby-{{ chruby_version }}/ && make install 19 | when: chruby_extracted | changed 20 | register: chruby_installed 21 | 22 | - name: Clean up chruby sources 23 | shell: rm -rf chruby-* 24 | when: chruby_downloaded | changed 25 | 26 | - name: Attach chruby into shell 27 | sudo: true 28 | template: src=chruby.sh dest=/etc/profile.d/chruby.sh 29 | 30 | # 31 | # Setup chruby facts.d 32 | # 33 | - name: Capture installed chruby version 34 | shell: > 35 | executable=/bin/bash source /etc/profile; 36 | chruby --version | awk '{ print $2 }' 37 | ignore_errors: yes 38 | register: installed_chruby_version 39 | 40 | - name: Create ansible facts directory 41 | sudo: true 42 | file: state=directory recurse=yes path=/etc/ansible/facts.d 43 | 44 | - name: Set chruby facts 45 | sudo: true 46 | template: src=chruby.fact dest=/etc/ansible/facts.d/chruby.fact 47 | -------------------------------------------------------------------------------- /app/models/record/base.rb: -------------------------------------------------------------------------------- 1 | class Record::Base 2 | def initialize(container = nil, record = nil) 3 | @record = record 4 | @container = container 5 | @types = container.split('-').last.pluralize.gsub(':', '') if container 6 | @index = Rails.env + '-' + @types if @types 7 | end 8 | 9 | def delete 10 | cloud.head(@record + '.json').try(:destroy) 11 | end 12 | 13 | def url 14 | @url ||= data['url'] 15 | end 16 | 17 | def screenshots 18 | @screenshots ||= data['screenshot'].map { |_key, value| { value => url } }.reduce({}, :merge) 19 | end 20 | 21 | def data 22 | JSON.parse(cloud.get(@record + '.json').try(:body), quirks_mode: true) 23 | rescue 24 | {} 25 | end 26 | 27 | def data=(new_hash = {}) 28 | cloud.sync @record + '.json', new_hash.to_json 29 | end 30 | 31 | def cloud 32 | Cloud.new(@container) 33 | end 34 | 35 | private 36 | 37 | def sanitize_value value 38 | if value.is_a?(Array) || !!value == value 39 | return value 40 | elsif value.to_i.to_s == value.to_s 41 | return value.to_i 42 | elsif (Float(value) rescue false) 43 | return value.to_f 44 | else 45 | return value 46 | end 47 | end 48 | 49 | def recrawl(url, options) 50 | if options[:crawl] 51 | options[:social] ? Crawler::Socializer.perform_async(url) : Crawler::Slider.perform_async(url) 52 | end 53 | rescue #TODO find the correct error for Redis not responding 54 | nil 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /roles/build-ruby/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 
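In this repository the role is used alongside the chruby and ruby-install roles to compile the Ruby version passed in via the `version` parameter (production.yml installs 2.3.0 this way).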
5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /roles/swapfile/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Write swapfile 3 | command: | 4 | {% if swapfile_use_dd %} 5 | dd if=/dev/zero of={{ swapfile_location }} bs=1M count={{ swapfile_size }} creates={{ swapfile_location }} 6 | {% else %} 7 | fallocate -l {{ swapfile_size }} {{ swapfile_location }} creates={{ swapfile_location }} 8 | {% endif %} 9 | register: write_swapfile 10 | when: swapfile_size != false 11 | 12 | - name: Set swapfile permissions 13 | file: path={{ swapfile_location }} mode=600 14 | when: swapfile_size != false 15 | 16 | - name: Create swapfile 17 | command: mkswap {{ swapfile_location }} 18 | register: create_swapfile 19 | when: swapfile_size != false and write_swapfile.changed 20 | 21 | - name: Enable swapfile 22 | command: swapon {{ swapfile_location }} 23 | when: swapfile_size != false and create_swapfile.changed 24 | 25 | - name: Add swapfile to /etc/fstab 26 | lineinfile: dest=/etc/fstab line="{{ swapfile_location }} none swap sw 0 0" state=present 27 | when: swapfile_size != false 28 | 29 | - name: Configure vm.swappiness 30 | lineinfile: dest=/etc/sysctl.conf line="vm.swappiness = {{ swapfile_swappiness }}" regexp="^vm.swappiness[\s]?=" state=present 31 | notify: Reload sysctl 32 | when: swapfile_swappiness != false 33 | 34 | - name: Configure vm.vfs_cache_pressure 35 | lineinfile: dest=/etc/sysctl.conf line="vm.vfs_cache_pressure = {{ swapfile_vfs_cache_pressure }}" regexp="^vm.vfs_cache_pressure[\s]?=" state=present 36 | notify: Reload sysctl 37 | when: swapfile_vfs_cache_pressure != false 38 | -------------------------------------------------------------------------------- /roles/logrotate/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-14, Nick Hammond 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of ansiblebit nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /roles/letsencrypt/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - apt: update_cache=yes cache_valid_time=3600 3 | 4 | - name: Install depends 5 | apt: name={{ item }} state=present 6 | with_items: 7 | - python 8 | - python-dev 9 | - python-virtualenv 10 | - gcc 11 | - dialog 12 | - libaugeas0 13 | - libssl-dev 14 | - libffi-dev 15 | - ca-certificates 16 | - python-pip 17 | - git 18 | 19 | - name: Install virtualenv (Debian) 20 | apt: name={{ item }} state=present 21 | with_items: 22 | - virtualenv 23 | when: ansible_distribution == 'Debian' 24 | 25 | - name: Install python depends 26 | pip: virtualenv="{{ letsencrypt_venv }}" virtualenv_site_packages=no name={{ item }} state=latest 27 | with_items: 28 | - setuptools 29 | - pip 30 | 31 | - name: More python depends 32 | pip: virtualenv="{{ letsencrypt_venv }}" virtualenv_site_packages=no name=letsencrypt 33 | 34 | - name: Attempt to get the certificate using the webroot authenticator 35 | command: "{{ letsencrypt_command }} -a webroot --webroot-path {{ letsencrypt_webroot_path }} certonly" 36 | args: 37 | creates: "/etc/letsencrypt/live/{{ letsencrypt_cert_domains[0] }}" 38 | when: letsencrypt_authenticator == "webroot" 39 | ignore_errors: True 40 | 41 | - name: Attempt to get the certificate using the standalone authenticator (in case eg the webserver isn't running yet) 42 | command: "{{ letsencrypt_command }} -a standalone auth" 43 | args: 44 | creates: "/etc/letsencrypt/live/{{ letsencrypt_cert_domains[0] }}" 45 | -------------------------------------------------------------------------------- /app/models/crawl/capture.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Capture < Page::Url 2 | require 'rmagick' 3 | include Magick 4 | 5 | attr_accessor :relative_path 6 | 7 | PNG = '.png' 8 | JPG = '.jpg' 9 | 10 | def screen 11 | unless 
cloud.head relative_path 12 | check_temp_path 13 | get_png 14 | compress_png 15 | cloud.sync(relative_path, jpeg) 16 | delete_images 17 | end 18 | relative_path 19 | end 20 | 21 | def compress_png 22 | image.minify.write(jpg_file_path) do 23 | self.format = 'JPEG' 24 | end 25 | end 26 | 27 | def get_png 28 | headless = Headless.new 29 | headless.start 30 | driver = Selenium::WebDriver.for :firefox 31 | driver.navigate.to @url 32 | driver.save_screenshot(png_file_path) 33 | driver.close 34 | headless.destroy 35 | end 36 | 37 | def check_temp_path 38 | path = File.dirname temp_path 39 | FileUtils.mkdir_p(path) unless File.exist?(path) 40 | end 41 | 42 | def delete_images 43 | FileUtils.rm jpg_file_path 44 | FileUtils.rm png_file_path 45 | rescue Errno::ENOENT 46 | nil 47 | end 48 | 49 | def jpeg 50 | File.read jpg_file_path 51 | rescue Errno::ENOENT 52 | nil 53 | end 54 | 55 | def png_file_path 56 | temp_path + PNG 57 | end 58 | 59 | def jpg_file_path 60 | temp_path + JPG 61 | end 62 | 63 | def temp_path 64 | File.join(Rails.root, 'tmp/cache', md5) 65 | end 66 | 67 | def image 68 | @image ||= Image.read(png_file_path).first 69 | end 70 | 71 | def cloud 72 | @cloud ||= Cloud.new(name + '-screenshots') 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/checkParameters.yml: -------------------------------------------------------------------------------- 1 | # Check for mandatory parameters 2 | 3 | - fail: msg="es_instance_name must be specified and cannot be blank" 4 | when: es_instance_name is not defined or es_instance_name == '' 5 | 6 | - fail: msg="es_proxy_port must be specified and cannot be blank when es_proxy_host is defined" 7 | when: (es_proxy_port is not defined or es_proxy_port == '') and (es_proxy_host is defined and es_proxy_host != '') 8 | 9 | - set_fact: multi_cast={{ (es_version | version_compare('2.0', '<') and es_config['discovery.zen.ping.multicast.enabled'] is not defined) or (es_config['discovery.zen.ping.multicast.enabled'] is defined and es_config['discovery.zen.ping.multicast.enabled'])}} 10 | 11 | - debug: msg="WARNING - It is recommended you specify the parameter 'http.port' when multicast is disabled" 12 | when: not multi_cast and es_config['http.port'] is not defined 13 | 14 | - debug: msg="WARNING - It is recommended you specify the parameter 'transport.tcp.port' when multicast is disabled" 15 | when: not multi_cast and es_config['transport.tcp.port'] is not defined 16 | 17 | - debug: msg="WARNING - It is recommended you specify the parameter 'discovery.zen.ping.unicast.hosts' when multicast is disabled" 18 | when: not multi_cast and es_config['discovery.zen.ping.unicast.hosts'] is not defined 19 | 20 | #If the user attempts to lock memory they must specify a heap size 21 | - fail: msg="If locking memory with bootstrap.mlockall a heap size must be specified" 22 | when: es_config['bootstrap.mlockall'] is defined and es_config['bootstrap.mlockall'] == True and es_heap_size is not defined -------------------------------------------------------------------------------- /config/environments/development.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # Unlike the stock Rails development defaults, classes are cached and eager 5 | # loaded here (see below), so restart the server to pick up code changes. 6 | config.cache_classes = true 7 | 8 | # Eager load code on boot. 9 | config.eager_load = true 10 | 11 | # Show full error reports and enable caching. 12 | config.consider_all_requests_local = true 13 | config.action_controller.perform_caching = true 14 | # config.cache_store = :dalli_store 15 | 16 | # Don't care if the mailer can't send. 17 | config.action_mailer.raise_delivery_errors = false 18 | 19 | # Print deprecation notices to the Rails logger. 20 | config.active_support.deprecation = :log 21 | 22 | # Raise an error on page load if there are pending migrations. 23 | config.active_record.migration_error = :page_load 24 | 25 | # Debug mode disables concatenation and preprocessing of assets. 26 | # This option may cause significant delays in view rendering with a large 27 | # number of complex assets. 28 | config.assets.debug = true 29 | 30 | # Adds additional error checking when serving assets at runtime. 31 | # Checks for improperly declared sprockets dependencies. 32 | # Raises helpful error messages. 33 | config.assets.raise_runtime_errors = true 34 | 35 | # Raises error for missing translations 36 | # config.action_view.raise_on_missing_translations = true 37 | 38 | require 'sidekiq/testing/inline' 39 | end 40 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'bundler' 4 | 5 | gem 'rails' 6 | gem 'rack-health' 7 | gem 'responders' 8 | gem 'sqlite3' 9 | gem 'pry' 10 | 11 | gem 'mina', :require => false 12 | gem 'mina-sidekiq', :require => false 13 | gem 'mina-unicorn', :require => false 14 | gem 'sitemap_generator' 15 | 16 | gem 'oj' 17 | gem 'oj_mimic_json' 18 | 19 | gem 'awesome_print' 20 | gem 'progress' 21 | gem 'groupdate' 22 | 23 | gem 'nokogiri' 24 | gem 'mechanize' 25 | gem 'social_shares' 26 | gem 'user-agent-randomizer' 27 | 28 | gem 'selenium-webdriver' 29 | gem 'headless' 30 | gem 'rmagick' 31 | 32 | gem 'vcr' 33 | gem 'typhoeus' 34 | gem 'fog' 35 | 36 | gem 'redis-namespace' 37 | gem 'redis-objects' 38 | gem 'redis-rails' 39 | gem 'elasticsearch-model' 40 | 41 | # gem 'dalli' 42 | 43 | gem 'sidekiq' 44 | gem 'sidekiq-unique-jobs' 45 | gem 'sidekiq-limit_fetch' 46 | gem 'sidetiq' 47 | # gem 'sidekiq-statistic' 48 | gem 'sinatra', require: false 49 | 50 | gem 'google-search' 51 | gem 'wikipedia-client' 52 | 53 | group :doc do 54 | # bundle exec rake doc:rails generates the API under doc/api.
55 | gem 'sdoc', '~> 0.4.0' 56 | end 57 | 58 | group :development, :test do 59 | gem 'thin' 60 | end 61 | 62 | group :development do 63 | gem 'pry-rails' 64 | gem 'spring' 65 | gem 'better_errors' 66 | gem 'binding_of_caller' 67 | gem 'meta_request' 68 | gem 'quiet_assets' 69 | gem 'rails_layout' 70 | gem 'rubocop', require: false 71 | gem 'guard-test', require: false 72 | gem 'guard-livereload', require: false 73 | gem 'guard-rails', require: false 74 | gem 'guard-sidekiq', require: false 75 | gem 'rack-livereload' 76 | gem 'mock_redis' 77 | end 78 | 79 | group :production do 80 | gem 'unicorn' 81 | gem 'unicorn-worker-killer' 82 | end 83 | -------------------------------------------------------------------------------- /app/models/crawl/base.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Base < Page::Url 2 | require 'user_agent_randomizer' 3 | require 'timeout' 4 | 5 | def agent 6 | @agent ||= defaults 7 | end 8 | 9 | def get 10 | page = agent.get(url) 11 | 12 | return page if page.code == '200' 13 | 14 | if page.code == '301' || page.code == '302' 15 | page = agent.get(url.gsub('http://','https://')) 16 | 17 | return page if page.code == '200' 18 | end 19 | 20 | raise Mechanize::ResponseCodeError.new(page, 'Not 200') 21 | end 22 | 23 | def clear 24 | agent.shutdown 25 | end 26 | 27 | def post(params, headers = '') 28 | # TODO: change it back to cache_key when built 29 | VCR.use_cassette(File.join(cache_vcr, params.to_query + headers), record: :new_episodes) do 30 | # Rails.cache.fetch(build_path, params.to_query + headers) do 31 | @agent = defaults 32 | @agent.post(url, params, headers) 33 | end 34 | end 35 | 36 | private 37 | 38 | def get_with_vcr(record) 39 | # TODO: change it back to cache_key when built 40 | VCR.use_cassette(cache_vcr, record: record) do 41 | # Rails.cache.fetch(build_path) do 42 | @agent = defaults 43 | @agent.get(url) 44 | end 45 | end 46 | 47 | def cache_vcr 48 | File.join(host, date, md5) 49 | end 50 | 51 | def defaults 52 | agent = Mechanize.new 53 | agent.user_agent = UserAgentRandomizer::UserAgent.fetch(type: "desktop_browser").string 54 | agent.html_parser = Nokogiri::HTML 55 | agent.redirect_ok = false 56 | # agent.ssl_version = 'SSLv3' 57 | agent.open_timeout = 300 58 | agent.read_timeout = 300 59 | agent.idle_timeout = 300 60 | agent.max_history = 10 61 | agent.keep_alive = false 62 | agent 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /app/workers/crawler/scraper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Scraper < Crawler::Base 2 | sidekiq_options queue: :scraper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = nil) 9 | return if url.nil? 10 | 11 | if type.nil? 
12 | next_type 13 | else 14 | @type = type 15 | end 16 | 17 | @url = url 18 | 19 | Timeout::timeout(60) do 20 | parser.page = scraper.get 21 | end 22 | 23 | if scraping.presence 24 | scraping.each do |hash| 25 | if hash[:url].presence 26 | ('Crawler::' + next_type).constantize.perform_async hash[:url], hash 27 | else 28 | Recorder::Uploader.perform_async hash.merge(url: @url) 29 | end 30 | end 31 | else 32 | raise "Scraping not found" 33 | end 34 | 35 | paginate 36 | 37 | # upload 38 | rescue Mechanize::ResponseCodeError => e 39 | if e.response_code == '404' || 40 | e.response_code == '410' || 41 | e.response_code == '520' || 42 | e.response_code == '500' || 43 | e.response_code == '301' || 44 | e.response_code == '302' 45 | Mapper::UrlAvailability.perform_async url 46 | else 47 | raise 48 | end 49 | rescue Mechanize::RedirectLimitReachedError => e 50 | nil 51 | rescue Timeout::Error => e 52 | Crawler::Stretcher.perform_async url 53 | end 54 | 55 | def next_type 56 | @type ||= 'Scrimper' 57 | end 58 | 59 | def scraping 60 | @scraping ||= parser.scraping.compact 61 | end 62 | 63 | def paginate 64 | parser.paginate.each do |next_url| 65 | Crawler::Scraper.perform_async next_url 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /config/environments/test.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # The test environment is used exclusively to run your application's 5 | # test suite. You never need to work with it otherwise. Remember that 6 | # your test database is "scratch space" for the test suite and is wiped 7 | # and recreated between test runs. Don't rely on the data there! 8 | config.cache_classes = true 9 | 10 | # Do not eager load code on boot. This avoids loading your whole application 11 | # just for the purpose of running a single test. If you are using a tool that 12 | # preloads Rails for running tests, you may have to set it to true. 13 | config.eager_load = false 14 | 15 | # Configure static asset server for tests with Cache-Control for performance. 16 | config.serve_static_assets = true 17 | config.static_cache_control = 'public, max-age=3600' 18 | 19 | # Show full error reports and disable caching. 20 | config.consider_all_requests_local = true 21 | config.action_controller.perform_caching = false 22 | 23 | # Raise exceptions instead of rendering exception templates. 24 | config.action_dispatch.show_exceptions = false 25 | 26 | # Disable request forgery protection in test environment. 27 | config.action_controller.allow_forgery_protection = false 28 | 29 | # Tell Action Mailer not to deliver emails to the real world. 30 | # The :test delivery method accumulates sent emails in the 31 | # ActionMailer::Base.deliveries array. 32 | config.action_mailer.delivery_method = :test 33 | 34 | # Print deprecation notices to the stderr. 
35 | config.active_support.deprecation = :stderr 36 | 37 | # Raises error for missing translations 38 | # config.action_view.raise_on_missing_translations = true 39 | end 40 | -------------------------------------------------------------------------------- /production.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | user: ubuntu 4 | sudo: yes 5 | roles: 6 | - ubuntu-common 7 | - imagemagick 8 | - chruby 9 | - ruby-install 10 | - role: build-ruby 11 | version: 2.3.0 12 | - role: swapfile 13 | swapfile_size: 8GB 14 | swapfile_swappiness: 10 15 | swapfile_location: /mnt/swapfile 16 | # - role: letsencrypt 17 | # letsencrypt_webroot_path: /home/ubuntu/skynet/current/public 18 | # letsencrypt_email: bastosmichael@gmail.com 19 | # letsencrypt_cert_domains: 20 | # - api.pricenometry.com 21 | - role: nginx-unicorn 22 | nginx_sites: 23 | - name: 'production' 24 | server_name: 'localhost' 25 | root: '/home/ubuntu/skynet/current' 26 | listen: '0.0.0.0:80' 27 | access_log: 28 | format: 'combined' 29 | # ssl: 30 | # ssl_only: true 31 | # # sensitive_uris: 32 | # # - ^/sidekiq(.*) 33 | # certificate: /etc/letsencrypt/live/api.pricenometry.com/fullchain.pem 34 | # certificate_key: /etc/letsencrypt/live/api.pricenometry.com/privkey.pem 35 | - role: logrotate 36 | logrotate_scripts: 37 | - name: rails 38 | path: "/home/ubuntu/skynet/shared/log/*.log" 39 | options: 40 | - hourly 41 | - size 25M 42 | - missingok 43 | - compress 44 | - delaycompress 45 | - copytruncate 46 | - name: nginx 47 | path: /var/log/nginx/*.log 48 | options: 49 | - hourly 50 | - size 25M 51 | - rotate 7 52 | - missingok 53 | - compress 54 | - delaycompress 55 | - copytruncate 56 | scripts: 57 | postrotate: "[ -s /run/nginx.pid ] && kill -USR1 `cat /run/nginx.pid`" 58 | -------------------------------------------------------------------------------- /app/models/record/search.rb: -------------------------------------------------------------------------------- 1 | class Record::Search < Record::Match 2 | alias_method :search, :best 3 | 4 | def match_query 5 | [ 6 | { 7 | match: { 8 | name: @query_hash[:query] 9 | } 10 | }, 11 | # { 12 | # match: { 13 | # description: @query_hash[:query] 14 | # } 15 | # }, 16 | # { 17 | # match: { 18 | # url: @query_hash[:query] 19 | # } 20 | # }, 21 | # { 22 | # match: { 23 | # tags: @query_hash[:query] 24 | # } 25 | # }, 26 | # { 27 | # match: { 28 | # categories: @query_hash[:query] 29 | # } 30 | # }, 31 | # { 32 | # flt_field: { 33 | # name: { 34 | # like_text: @query_hash[:query], 35 | # analyzer: 'snowball', 36 | # fuzziness: 0.1, 37 | # boost: 5 38 | # } 39 | # } 40 | # }, 41 | # { 42 | # flt_field: { 43 | # description: { 44 | # like_text: @query_hash[:query], 45 | # analyzer: 'snowball', 46 | # fuzziness: 0.3 47 | # } 48 | # } 49 | # }, 50 | # { 51 | # flt_field: { 52 | # url: { 53 | # like_text: @query_hash[:query], 54 | # analyzer: 'snowball', 55 | # fuzziness: 0.5 56 | # } 57 | # } 58 | # }, 59 | # { 60 | # flt_field: { 61 | # tags: { 62 | # like_text: @query_hash[:query], 63 | # analyzer: 'snowball', 64 | # fuzziness: 0.7 65 | # } 66 | # } 67 | # }, 68 | # { 69 | # flt_field: { 70 | # categories: { 71 | # like_text: @query_hash[:query], 72 | # analyzer: 'snowball', 73 | # fuzziness: 0.9 74 | # } 75 | # } 76 | # }, 77 | ] 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /roles/logrotate/README.md: 
-------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/nickhammond/ansible-logrotate.svg?branch=master)](https://travis-ci.org/nickhammond/ansible-logrotate) 2 | 3 | Role Name 4 | ======== 5 | 6 | Installs logrotate and provides an easy way to setup additional logrotate scripts by specifying a list of directives. 7 | 8 | Requirements 9 | ------------ 10 | 11 | None 12 | 13 | Role Variables 14 | -------------- 15 | 16 | **logrotate_scripts**: A list of logrotate scripts and the directives to use for the rotation. 17 | 18 | * name - The name of the script that goes into /etc/logrotate.d/ 19 | * path - Path to point logrotate to for the log rotation 20 | * options - List of directives for logrotate, view the logrotate man page for specifics 21 | * scripts - Dict of scripts for logrotate (see Example below) 22 | 23 | ``` 24 | logrotate_scripts: 25 | - name: rails 26 | path: "/srv/current/log/*.log" 27 | options: 28 | - weekly 29 | - size 25M 30 | - missingok 31 | - compress 32 | - delaycompress 33 | - copytruncate 34 | ``` 35 | 36 | Dependencies 37 | ------------ 38 | 39 | None 40 | 41 | Example Playbook 42 | ------------------------- 43 | 44 | Setting up logrotate for additional Nginx logs, with postrotate script (assuming this role is located in `roles/logrotate`). 45 | 46 | ``` 47 | - role: logrotate 48 | logrotate_scripts: 49 | - name: nginx 50 | path: /var/log/nginx/*.log 51 | options: 52 | - weekly 53 | - size 25M 54 | - rotate 7 55 | - missingok 56 | - compress 57 | - delaycompress 58 | - copytruncate 59 | scripts: 60 | postrotate: "[ -s /run/nginx.pid ] && kill -USR1 `cat /run/nginx.pid`" 61 | ``` 62 | 63 | License 64 | ------- 65 | 66 | BSD 67 | 68 | Author Information 69 | ------------------ 70 | 71 | Find [Nick Hammond]( http://www.nickhammond.com ) on [Twitter](http://twitter.com/nickhammond). 
72 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/systemd/elasticsearch.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Elasticsearch-{{es_instance_name}} 3 | Documentation=http://www.elastic.co 4 | Wants=network-online.target 5 | After=network-online.target 6 | 7 | [Service] 8 | Environment=ES_HOME={{es_home}} 9 | Environment=CONF_DIR={{conf_dir}} 10 | Environment=DATA_DIR={{ data_dirs | array_to_str }} 11 | Environment=LOG_DIR={{log_dir}} 12 | Environment=PID_DIR={{pid_dir}} 13 | EnvironmentFile=-{{instance_default_file}} 14 | 15 | User={{es_user}} 16 | Group={{es_group}} 17 | 18 | ExecStart={{es_home}}/bin/elasticsearch \ 19 | -Des.pidfile=${PID_DIR}/elasticsearch.pid \ 20 | -Des.default.path.home=${ES_HOME} \ 21 | -Des.default.path.logs=${LOG_DIR} \ 22 | -Des.default.path.data=${DATA_DIR} \ 23 | -Des.default.path.conf=${CONF_DIR} 24 | 25 | 26 | 27 | # Connects standard output to /dev/null 28 | StandardOutput=null 29 | 30 | # Connects standard error to journal 31 | StandardError=journal 32 | 33 | # Specifies the maximum file descriptor number that can be opened by this process 34 | LimitNOFILE={{es_max_open_files}} 35 | 36 | # Specifies the maximum number of bytes of memory that may be locked into RAM 37 | # Set to "infinity" if you use the 'bootstrap.mlockall: true' option 38 | # in elasticsearch.yml and 'MAX_LOCKED_MEMORY=unlimited' in {{instance_default_file}} 39 | {% if m_lock_enabled %} 40 | LimitMEMLOCK=infinity 41 | {% endif %} 42 | 43 | # Disable timeout logic and wait until process is stopped 44 | TimeoutStopSec=0 45 | 46 | # SIGTERM signal is used to stop the Java process 47 | KillSignal=SIGTERM 48 | 49 | # Java process is never killed 50 | SendSIGKILL=no 51 | 52 | # When a JVM receives a SIGTERM signal it exits with code 143 53 | SuccessExitStatus=143 54 | 55 | [Install] 56 | WantedBy=multi-user.target 57 | -------------------------------------------------------------------------------- /app/models/cloud.rb: -------------------------------------------------------------------------------- 1 | class Cloud 2 | MAX_KEYS = 100_000_000_000 3 | 4 | attr_accessor :bucket 5 | attr_accessor :provider 6 | 7 | # Plain Ruby object: ActiveRecord-style callbacks such as after_initialize 8 | # never fire here, so the default 'crawler' bucket is set in the constructor. 9 | def initialize(bucket_name = nil) 10 | self.bucket = bucket_name || 'crawler' 11 | end 12 | 13 | # Memoized Fog storage connection, built from the app's :fog settings. 14 | def storage 15 | @storage ||= Fog::Storage.new(Rails.configuration.config[:fog]) 16 | end 17 | 18 | # Fetches the bucket's directory, creating the bucket on first use. 19 | def container 20 | @container ||= storage.directories.get(bucket) 21 | create_container if @container.nil? 22 | @container 23 | end 24 | 25 | def files 26 | @files ||= update_files 27 | end 28 | 29 | def keys 30 | @keys ||= files.map(&:key) 31 | end 32 | 33 | # Providers truncate long listings, so page through with markers until done. 34 | def update_files 35 | files = container.files 36 | truncated = files.try(:is_truncated) 37 | while truncated 38 | ap "Collecting #{files.count} from #{self.bucket}..."
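# The last key seen becomes the marker for the next page of results.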
39 | bucket_object = container.files.all(marker: files.last.key) 40 | truncated = bucket_object.is_truncated 41 | files += bucket_object 42 | end 43 | files 44 | end 45 | 46 | def listing(prefix) 47 | @listing ||= container.files.all delimiter: '/', prefix: prefix 48 | end 49 | 50 | def head(key) 51 | container.files.head key 52 | end 53 | 54 | def get(key) 55 | container.files.get key 56 | end 57 | 58 | def get_url(key) 59 | container.files.get_https_url(key, 300) 60 | end 61 | 62 | def sync(key, data) 63 | if data 64 | copy key, data 65 | else 66 | head = head key 67 | head.try :destroy 68 | end 69 | end 70 | 71 | def copy(key, data) 72 | file = container.files.new key: key 73 | file.body = data 74 | file.save 75 | end 76 | 77 | def create_container 78 | @container = storage.directories.create(key: bucket, public: true) 79 | end 80 | 81 | def delete_all 82 | files.with_progress("Deleting files in #{bucket}").each { |k| k.try(:destroy) } 83 | end 84 | 85 | def count 86 | files.count 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /app/workers/crawler/sitemapper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Sitemapper < Crawler::Base 2 | sidekiq_options queue: :sitemapper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = 'Scrimper') 9 | return if url.nil? 10 | @url = url 11 | @type = type 12 | @name = Page::Url.new(url).name 13 | @container = Rails.configuration.config[:admin][:api_containers].find { |c| c.include?(@name) } 14 | 15 | get_xml 16 | 17 | sitemap.site_links.each do |u| 18 | check_page(u) 19 | end if sitemap.sites? 20 | 21 | sitemap.index_links.each do |u| 22 | get_sitemap u 23 | end if sitemap.indexes? 
24 | end 25 | 26 | def get_xml 27 | sitemap.xml = scraper.get 28 | scraper.clear 29 | end 30 | 31 | def check_page(url) 32 | if new_url = @name.capitalize.constantize.sanitize_url(url) 33 | if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: new_url.gsub('https://','http://') } } })['hits']['total'] == 0 34 | get_page(new_url) if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: new_url } } })['hits']['total'] == 0 35 | end 36 | end 37 | rescue NoMethodError => e 38 | if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: url.gsub('https://','http://') } } })['hits']['total'] == 0 39 | get_page(url) if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: url } } })['hits']['total'] == 0 40 | end 41 | end 42 | 43 | def get_page(url) 44 | ('Crawler::' + @type).constantize.perform_async url 45 | end 46 | 47 | def get_sitemap(url) 48 | Crawler::Sitemapper.perform_async url, @type 49 | end 50 | 51 | def sitemap 52 | @sitemap ||= Crawl::Sitemap.new(@url) 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | guard 'livereload' do 2 | watch(%r{app/views/.+\.(erb|haml|slim)$}) 3 | watch(%r{app/helpers/.+\.rb}) 4 | watch(%r{public/.+\.(css|js|html)}) 5 | watch(%r{config/locales/.+\.yml}) 6 | # Rails Assets Pipeline 7 | watch(%r{(app|vendor)(/assets/\w+/(.+\.(css|js|html|png|jpg))).*}) { |m| "/assets/#{m[3]}" } 8 | end 9 | 10 | guard 'rails' do 11 | watch('Gemfile.lock') 12 | watch(%r{^(config|lib)/.*}) 13 | end 14 | 15 | ### Guard::Sidekiq 16 | # available options: 17 | # - :verbose 18 | # - :queue (defaults to "default") can be an array 19 | # - :concurrency (defaults to 1) 20 | # - :timeout 21 | # - :environment (corresponds to RAILS_ENV for the Sidekiq worker) 22 | 23 | guard 'sidekiq', concurrency: 1 do 24 | watch(%r{^app/workers/(.+)\.rb$}) 25 | watch(%r{^app/models/(.+)\.rb$}) 26 | watch(%r{^app/sites/(.+)\.rb$}) 27 | watch(%r{^app/helpers/(.+)\.rb$}) 28 | end 29 | 30 | guard :test do 31 | watch(%r{^test/.+_test\.rb$}) 32 | watch('test/test_helper.rb') { 'test' } 33 | 34 | # Non-rails 35 | watch(%r{^lib/(.+)\.rb$}) { |m| "test/#{m[1]}_test.rb" } 36 | 37 | # Rails 4 38 | # watch(%r{^app/(.+)\.rb}) { |m| "test/#{m[1]}_test.rb" } 39 | # watch(%r{^app/controllers/application_controller\.rb}) { 'test/controllers' } 40 | # watch(%r{^app/controllers/(.+)_controller\.rb}) { |m| "test/integration/#{m[1]}_test.rb" } 41 | # watch(%r{^app/views/(.+)_mailer/.+}) { |m| "test/mailers/#{m[1]}_mailer_test.rb" } 42 | # watch(%r{^lib/(.+)\.rb}) { |m| "test/lib/#{m[1]}_test.rb" } 43 | 44 | # Rails < 4 45 | # watch(%r{^app/models/(.+)\.rb$}) { |m| "test/unit/#{m[1]}_test.rb" } 46 | # watch(%r{^app/controllers/(.+)\.rb$}) { |m| "test/functional/#{m[1]}_test.rb" } 47 | # watch(%r{^app/views/(.+)/.+\.erb$}) { |m| "test/functional/#{m[1]}_controller_test.rb" } 48 | # watch(%r{^app/views/.+$}) { 'test/integration' } 49 | # watch('app/controllers/application_controller.rb') { ['test/functional', 'test/integration'] } 50 | end 51 | -------------------------------------------------------------------------------- /roles/letsencrypt/README.md: -------------------------------------------------------------------------------- 1 | # 
ansible-letsencrypt 2 | An ansible role to generate TLS certificates and get them signed by Let's Encrypt. 3 | 4 | It first attempts to use the `webroot` authenticator; if that fails to create certificates, 5 | it falls back to the standalone authenticator. This is handy for generating certs on a fresh machine before 6 | the web server has been configured or even installed. 7 | 8 | I've tested this on a couple of Debian Jessie boxes with nginx; if you test it on other things, please let me know 9 | the results (positive or otherwise) so I can document them here or fix the issue. 10 | 11 | # Usage 12 | First, read Let's Encrypt's TOS and EULA. Only proceed if you agree to them. 13 | 14 | The following variables are available: 15 | 16 | `letsencrypt_webroot_path` is the root path that gets served by your web server. Defaults to `/var/www`. 17 | 18 | `letsencrypt_email` needs to be set to your email address. Let's Encrypt wants it. Defaults to `webmaster@{{ ansible_fqdn }}`. 19 | 20 | `letsencrypt_cert_domains` is a list of domains you wish to get a certificate for. It defaults to a single item with the value of `{{ ansible_fqdn }}`. 21 | 22 | `letsencrypt_install_directory` should probably be left alone, but if you set it, it will change where the letsencrypt program is installed. 23 | 24 | `letsencrypt_server` sets the auth server. Set to `https://acme-staging.api.letsencrypt.org/directory` to use the staging server (far higher rate limits, but certs are not trusted, intended for testing). 25 | 26 | The [Let's Encrypt client](https://github.com/letsencrypt/letsencrypt) will put the certificate and accessories in `/etc/letsencrypt/live/<domain>/`. For more info, see the [Let's Encrypt documentation](https://letsencrypt.readthedocs.org/en/latest/using.html#where-are-my-certificates). 27 | 28 | # Example Playbook 29 | ``` 30 | --- 31 | - hosts: tls_servers 32 | user: root 33 | roles: 34 | - role: letsencrypt 35 | letsencrypt_webroot_path: /var/www/html 36 | letsencrypt_email: user@example.net 37 | letsencrypt_cert_domains: 38 | - www.example.net 39 | - example.net 40 | ``` 41 | -------------------------------------------------------------------------------- /app/models/page/parse.rb: -------------------------------------------------------------------------------- 1 | class Page::Parse < Page::Base 2 | include PageHelper 3 | include OpenGraphHelper 4 | include SchemaOrgHelper 5 | 6 | def build 7 | parent_build 8 | methods.grep(/find_/).each { |parse| send(parse) } if @type 9 | end 10 | 11 | def parent_build 12 | build_open_graph 13 | build_schema 14 | build_page 15 | end 16 | 17 | def self.sanitize_url url 18 | return url 19 | end 20 | 21 | def paginate 22 | parser.css('.next').map {|n| n[:href]}.compact.uniq 23 | end 24 | 25 | def scraping 26 | [] 27 | end 28 | 29 | def screenshot 30 | @screenshot ||= File.join(@id, date) + '.jpg' 31 | end 32 | 33 | def remove_extras(symbol) 34 | remove_instance_variable(symbol) rescue nil 35 | end 36 | 37 | def save 38 | remove_instance_variable(:@page) 39 | remove_instance_variable(:@links) if @links 40 | remove_instance_variable(:@internal_links) if @internal_links 41 | remove_instance_variable(:@external_links) if @external_links 42 | remove_instance_variable(:@uri) rescue nil 43 | hash = {} 44 | instance_variables.each do |var| 45 | value = instance_variable_get(var) 46 | hash[var.to_s.delete('@')] = value unless value.blank?
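# Blank values are skipped so the uploaded hash stays compact.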
47 | end 48 | hash 49 | end 50 | 51 | def links 52 | @links ||= page.links.map do |link| 53 | remove_hash_bangs(clean_up_link(link.href)) 54 | end.compact.uniq 55 | end 56 | 57 | def internal_links 58 | @internal_links ||= links.map { |link| link if internal? link }.compact 59 | end 60 | 61 | def external_links 62 | @external_links ||= links.map { |link| link unless internal? link }.compact 63 | end 64 | 65 | def clean_up_link(link) 66 | link_uri = URI.parse(link) 67 | if link_uri.scheme.nil? && link_uri.host.nil? 68 | link = (base + link) 69 | else 70 | link 71 | end 72 | rescue 73 | nil 74 | end 75 | 76 | def remove_hash_bangs(link) 77 | return if link.nil? 78 | if hash_bang = link.match(/(.+?)\#/) 79 | hash_bang[1] 80 | else 81 | link 82 | end 83 | end 84 | 85 | def internal?(link) 86 | get_host_without_www(URI.parse(link)) == host 87 | rescue 88 | nil 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /app/helpers/schema_org_helper.rb: -------------------------------------------------------------------------------- 1 | module SchemaOrgHelper 2 | def build_schema 3 | # schema = @page.doc.css('//*[contains(@itemtype, "schema.org")]').first["itemtype"] 4 | @schema_org = false 5 | methods.grep(/schema_org/).each do |schema| 6 | send(schema) rescue nil 7 | end 8 | @schema_org = true if @type 9 | end 10 | 11 | ############################################################### 12 | # Types that have multiple parents are expanded out only once 13 | # and have an asterisk 14 | ############################################################### 15 | 16 | def schema_org_type 17 | @type = cleanup_value page.body.match(/itemtype="http:\/\/schema.org\/(.+?)"/)[1] 18 | end 19 | 20 | ############################################################### 21 | # Grab Meta Data for Schema and assign instance variable 22 | ############################################################### 23 | 24 | def schema_org_meta 25 | parser.css('//meta').each do |m| 26 | unless m[:itemprop].nil? 27 | key = cleanup_key m[:itemprop] 28 | value = cleanup_value m[:content] 29 | instance_variable_set("@#{key}", value) 30 | end 31 | end 32 | end 33 | 34 | ############################################################### 35 | # Grab Span Data for Schema and assign instance variable 36 | ############################################################### 37 | 38 | def schema_org_span 39 | parser.css('//span').each do |m| 40 | unless m[:itemprop].nil? 41 | key = cleanup_key m[:itemprop] 42 | value = cleanup_value m.text 43 | instance_variable_set("@#{key}", value) 44 | end 45 | end 46 | end 47 | 48 | ############################################################### 49 | # Grabbing Keywords as Tags 50 | ############################################################### 51 | 52 | def schema_org_tags 53 | tags = parser.css("meta[@name='keywords']").first['content'].split(/ |,/) 54 | tags.delete_if { |x| x.match(/and|for|more/) || x.squish.blank? 
} 55 | @tags = tags.reject(&:empty?).uniq 56 | end 57 | 58 | def cleanup_key(key) 59 | key.tr(' ', '_') 60 | end 61 | 62 | def cleanup_value(value) 63 | value.try(:squish) 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/helpers/serverspec/standard_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | shared_examples 'standard::init' do |es_version| 4 | 5 | describe user('elasticsearch') do 6 | it { should exist } 7 | end 8 | 9 | describe service('node1_elasticsearch') do 10 | it { should be_running } 11 | end 12 | 13 | describe package('elasticsearch') do 14 | it { should be_installed } 15 | end 16 | 17 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 18 | it { should be_file } 19 | it { should be_owned_by 'elasticsearch' } 20 | end 21 | 22 | describe file('/etc/elasticsearch/node1/logging.yml') do 23 | it { should be_file } 24 | it { should be_owned_by 'elasticsearch' } 25 | end 26 | 27 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 28 | it { should contain 'node.name: localhost-node1' } 29 | it { should contain 'cluster.name: elasticsearch' } 30 | it { should contain 'path.conf: /etc/elasticsearch/node1' } 31 | it { should contain 'path.data: /var/lib/elasticsearch/localhost-node1' } 32 | it { should contain 'path.work: /tmp/elasticsearch/localhost-node1' } 33 | it { should contain 'path.logs: /var/log/elasticsearch/localhost-node1' } 34 | end 35 | 36 | describe 'Node listening' do 37 | it 'listens on port 9200' do 38 | expect(port 9200).to be_listening 39 | end 40 | end 41 | 42 | describe 'version check' do 43 | it 'should be reported as version '+es_version do 44 | command = command('curl -s localhost:9200 | grep number') 45 | expect(command.stdout).to match(es_version) 46 | expect(command.exit_status).to eq(0) 47 | end 48 | end 49 | 50 | describe file('/etc/init.d/elasticsearch') do 51 | it { should_not exist } 52 | end 53 | 54 | describe file('/etc/default/elasticsearch') do 55 | it { should_not exist } 56 | end 57 | 58 | describe file('/etc/sysconfig/elasticsearch') do 59 | it { should_not exist } 60 | end 61 | 62 | describe file('/usr/lib/systemd/system/elasticsearch.service') do 63 | it { should_not exist } 64 | end 65 | 66 | describe file('/etc/elasticsearch/elasticsearch.yml') do 67 | it { should_not exist } 68 | end 69 | 70 | describe file('/etc/elasticsearch/logging.yml') do 71 | it { should_not exist } 72 | end 73 | 74 | end 75 | 76 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/logging.yml.j2: -------------------------------------------------------------------------------- 1 | # you can override this by setting a system property, for example -Des.logger.level=DEBUG 2 | es.logger.level: INFO 3 | rootLogger: ${es.logger.level}, console, file 4 | logger: 5 | # log action execution errors for easier debugging 6 | action: DEBUG 7 | # reduce the logging for aws, too much is logged under the default INFO 8 | com.amazonaws: WARN 9 | org.apache.http: INFO 10 | 11 | # gateway 12 | #gateway: DEBUG 13 | #index.gateway: DEBUG 14 | 15 | # peer shard recovery 16 | #indices.recovery: DEBUG 17 | 18 | # discovery 19 | #discovery: TRACE 20 | 21 | index.search.slowlog: TRACE, index_search_slow_log_file 22 | index.indexing.slowlog: TRACE, index_indexing_slow_log_file 23 | 24 | additivity: 25 | index.search.slowlog: false 26 |
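# additivity: false keeps slowlog entries from also being duplicated into the rootLogger's console and file appenders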
index.indexing.slowlog: false 27 | 28 | appender: 29 | console: 30 | type: console 31 | layout: 32 | type: consolePattern 33 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 34 | 35 | file: 36 | type: dailyRollingFile 37 | file: ${path.logs}/${cluster.name}.log 38 | datePattern: "'.'yyyy-MM-dd" 39 | layout: 40 | type: pattern 41 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 42 | 43 | # Use the following log4j-extras RollingFileAppender to enable gzip compression of log files. 44 | # For more information see https://logging.apache.org/log4j/extras/apidocs/org/apache/log4j/rolling/RollingFileAppender.html 45 | #file: 46 | #type: extrasRollingFile 47 | #file: ${path.logs}/${cluster.name}.log 48 | #rollingPolicy: timeBased 49 | #rollingPolicy.FileNamePattern: ${path.logs}/${cluster.name}.log.%d{yyyy-MM-dd}.gz 50 | #layout: 51 | #type: pattern 52 | #conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 53 | 54 | index_search_slow_log_file: 55 | type: dailyRollingFile 56 | file: ${path.logs}/${cluster.name}_index_search_slowlog.log 57 | datePattern: "'.'yyyy-MM-dd" 58 | layout: 59 | type: pattern 60 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 61 | 62 | index_indexing_slow_log_file: 63 | type: dailyRollingFile 64 | file: ${path.logs}/${cluster.name}_index_indexing_slowlog.log 65 | datePattern: "'.'yyyy-MM-dd" 66 | layout: 67 | type: pattern 68 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" -------------------------------------------------------------------------------- /app/workers/mapper/indexer.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Indexer < Mapper::Base 2 | def perform(container, id, hash = nil) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | hash = record(id).data if hash.nil? 6 | index = Rails.env + '-' + types 7 | new_hash = {} 8 | 9 | # hash.each do |k, v| 10 | hash.each do |k, v| 11 | unless Record::Upload::EXCLUDE.include? k.to_sym 12 | if v.is_a?(Hash) 13 | value = v.values.last 14 | 15 | if value.is_a?(Array) || !!value == value 16 | new_hash[k] = value 17 | elsif value.to_i.to_s == value.to_s 18 | new_hash[k] = value.to_i 19 | elsif (Float(value) rescue false) 20 | new_hash[k] = value.to_f 21 | new_hash[k] = value if new_hash[k].infinite? 22 | else 23 | new_hash[k] = value 24 | end 25 | 26 | new_hash[k + '_history'] = v.count if v.count > 1 27 | elsif !!v == v # Check if Boolean 28 | new_hash[k] = v 29 | elsif v.is_a?(Array) 30 | new_hash[k] = v.map {|value| value.encode(Encoding.find('UTF-8'), {invalid: :replace, undef: :replace, replace: ''}) } 31 | else 32 | new_hash[k] = v.encode(Encoding.find('UTF-8'), {invalid: :replace, undef: :replace, replace: ''}) 33 | end 34 | end 35 | end 36 | 37 | # Delete bad keys from search... 38 | if bad_ids = Elasticsearch::Model.client.search(index: index, type: @container, body: { query: { match_phrase_prefix: { url: new_hash['url'] } } })['hits']['hits'].select do |hit| 39 | hit['_id'] != id 40 | end 41 | bad_ids.each do |bad_id| 42 | record(bad_id['_id']).delete 43 | Elasticsearch::Model.client.delete index: index, type: @container, id: bad_id['_id'] 44 | end unless bad_ids.empty? 
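# at this point any stale duplicates of this URL have been purged from both the record store and the search index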
45 | end 46 | 47 | Elasticsearch::Model.client.index index: index, type: container, id: id, body: new_hash.sort.to_h 48 | 49 | Elasticsearch::Model.client.indices.refresh index: index 50 | # rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e 51 | # # rescue Elasticsearch::Transport::Transport::Errors::NotFound 52 | # record(id).delete 53 | # Crawler::Scrimper.perform_async new_hash['url'] if new_hash['url'] 54 | rescue Elasticsearch::Transport::Transport::Errors::NotFound => e 55 | nil 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-plugins.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | #es_plugins_reinstall is set to true if elasticsearch_install.changed (i.e. the ES version has changed) or if no plugins are listed. Otherwise it is false and must be set explicitly. 4 | - set_fact: es_plugins_reinstall=true 5 | when: (elasticsearch_install is defined and elasticsearch_install.changed) or es_plugins is not defined or es_plugins is none 6 | tags: 7 | - always 8 | 9 | - set_fact: list_command="list" 10 | tags: 11 | - always 12 | - set_fact: list_command="--list" 13 | when: es_version | version_compare('2.0', '<') 14 | tags: 15 | - always 16 | 17 | #List currently installed plugins 18 | - shell: "{{es_home}}/bin/plugin {{list_command}} | sed -n '1!p' | cut -d '-' -f2-" 19 | register: installed_plugins 20 | changed_when: False 21 | environment: 22 | CONF_DIR: "{{ conf_dir }}" 23 | ES_INCLUDE: "{{ instance_default_file }}" 24 | 25 | #This removes any currently installed plugins 26 | - name: Remove elasticsearch plugins 27 | command: "{{es_home}}/bin/plugin remove {{item}} --silent" 28 | ignore_errors: yes 29 | with_items: "{{ installed_plugins.stdout_lines }}" 30 | when: es_plugins_reinstall and installed_plugins.stdout_lines | length > 0 and not 'No plugin detected' in installed_plugins.stdout_lines[0] 31 | notify: restart elasticsearch 32 | environment: 33 | CONF_DIR: "{{ conf_dir }}" 34 | ES_INCLUDE: "{{ instance_default_file }}" 35 | 36 | - name: Install elasticsearch plugins 37 | #debug: var=item 38 | command: > 39 | {{es_home}}/bin/plugin install {{ item.plugin }}{% if item.version is defined and item.version != '' %}/{{ item.version }}{% endif %} {% if item.proxy_host is defined and item.proxy_host != '' and item.proxy_port is defined and item.proxy_port != ''%} -DproxyHost={{ item.proxy_host }} -DproxyPort={{ item.proxy_port }} {% elif es_proxy_host is defined and es_proxy_host != '' %} -DproxyHost={{ es_proxy_host }} -DproxyPort={{ es_proxy_port }} {% endif %} --silent 40 | register: plugin_installed 41 | failed_when: "'ERROR' in plugin_installed.stdout" 42 | changed_when: plugin_installed.rc == 0 43 | with_items: "{{ es_plugins }}" 44 | when: es_plugins is defined and not es_plugins is none 45 | notify: restart elasticsearch 46 | environment: 47 | CONF_DIR: "{{ conf_dir }}" 48 | ES_INCLUDE: "{{ instance_default_file }}" 49 | 50 | #Set permissions on plugins directory 51 | - name: Set Plugin Directory Permissions 52 | file: state=directory path={{ plugin_dir }} owner={{ es_user }} group={{ es_group }} recurse=yes 53 | -------------------------------------------------------------------------------- /config/sidekiq.yml.example: -------------------------------------------------------------------------------- 1 | --- 2 | :concurrency: 1 3 | :pidfile: tmp/pids/sidekiq.pid 4 | :queues: 5 | - [socializer, 1200000]
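# weighted queues: Sidekiq polls higher-weighted queues proportionally more often, so socializer/recorder/slider work is picked up first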
6 | - [recorder, 1100000] 7 | - [slider, 1000000] 8 | - [scrimper_one, 90000] 9 | - [scrimper_two, 90000] 10 | - [scrimper_three, 90000] 11 | - [scrimper_four, 90000] 12 | - [scrimper_five, 90000] 13 | - [scrimper_six, 90000] 14 | - [scrimper_seven, 90000] 15 | - [scrimper_eight, 90000] 16 | - [scrimper_nine, 90000] 17 | - [scrimper_ten, 90000] 18 | - [scrimper, 80000] 19 | - [spider_one, 70000] 20 | - [spider_two, 70000] 21 | - [spider_three, 70000] 22 | - [spider_four, 70000] 23 | - [spider_five, 70000] 24 | - [spider_six, 70000] 25 | - [spider_seven, 70000] 26 | - [spider_eight, 70000] 27 | - [spider_nine, 70000] 28 | - [spider_ten, 70000] 29 | - [spider, 60000] 30 | - [scraper_one, 50000] 31 | - [scraper_two, 50000] 32 | - [scraper_three, 50000] 33 | - [scraper_four, 50000] 34 | - [scraper_five, 50000] 35 | - [scraper_six, 50000] 36 | - [scraper_seven, 50000] 37 | - [scraper_eight, 50000] 38 | - [scraper_nine, 50000] 39 | - [scraper_ten, 50000] 40 | - [scraper, 40000] 41 | - [sampler_one, 30000] 42 | - [sampler_two, 30000] 43 | - [sampler_three, 30000] 44 | - [sampler_four, 30000] 45 | - [sampler_five, 30000] 46 | - [sampler_six, 30000] 47 | - [sampler_seven, 30000] 48 | - [sampler_eight, 30000] 49 | - [sampler_nine, 30000] 50 | - [sampler_ten, 30000] 51 | - [sampler, 20000] 52 | - [stretcher, 1] 53 | :process_limits: 54 | stretcher: 1 55 | slider: 1 56 | socializer: 1 57 | scrimper_one: 1 58 | scrimper_two: 1 59 | scrimper_three: 1 60 | scrimper_four: 1 61 | scrimper_five: 1 62 | scrimper_six: 1 63 | scrimper_seven: 1 64 | scrimper_eight: 1 65 | scrimper_nine: 1 66 | scrimper_ten: 1 67 | scrimper: 1 68 | spider_one: 1 69 | spider_two: 1 70 | spider_three: 1 71 | spider_four: 1 72 | spider_five: 1 73 | spider_six: 1 74 | spider_seven: 1 75 | spider_eight: 1 76 | spider_nine: 1 77 | spider_ten: 1 78 | spider: 1 79 | scraper_one: 1 80 | scraper_two: 1 81 | scraper_three: 1 82 | scraper_four: 1 83 | scraper_five: 1 84 | scraper_six: 1 85 | scraper_seven: 1 86 | scraper_eight: 1 87 | scraper_nine: 1 88 | scraper_ten: 1 89 | scraper: 1 90 | sampler_one: 1 91 | sampler_two: 1 92 | sampler_three: 1 93 | sampler_four: 1 94 | sampler_five: 1 95 | sampler_six: 1 96 | sampler_seven: 1 97 | sampler_eight: 1 98 | sampler_nine: 1 99 | sampler_ten: 1 100 | sampler: 1 101 | -------------------------------------------------------------------------------- /app/models/record/trends.rb: -------------------------------------------------------------------------------- 1 | class Record::Trends < Record::Match 2 | def sort(query_array = ['date'], options = { crawl: true, social: true, results: 10, page: 1, fix: false }) 3 | @options = options 4 | @query_array = query_array 5 | 6 | if !@container.nil? && !@container.include?(Rails.env) 7 | types = @container.split('-').last.pluralize.gsub(':', '') 8 | @index = [ Rails.env + '-' + types ] 9 | elsif @container.nil? 10 | @index = Rails.configuration.config[:admin][:api_containers].map { |c| Rails.env + '-' + c.split('-').last.pluralize.gsub(':', '') }.uniq 11 | end 12 | 13 | @options = options 14 | sanitize_results 15 | end 16 | 17 | def elasticsearch_results 18 | @elasticsearch_results ||= Elasticsearch::Model.client.search(index: @index, type: @container, body: query).deep_symbolize_keys! 
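# memoized: total and sanitize_results both read this, so a single search round-trip serves the whole request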
19 | end 20 | 21 | def total 22 | ((elasticsearch_results[:hits][:total] || 0) / limit_results.to_f).ceil 23 | end 24 | 25 | def sanitize_results 26 | elasticsearch_results[:hits][:hits].map do |e| 27 | delete_bad_data e[:_source][:url] if @options[:fix] 28 | recrawl(e[:_source][:url], @options) if e[:_source][:url] 29 | 30 | new_data = { id: e[:_id], 31 | container: e[:_type], 32 | score: e[:_score], 33 | available: true, 34 | history: {}, 35 | social: {}, 36 | price: {} 37 | } 38 | 39 | e[:_source].each do |k,v| 40 | if k.to_s.include?('_history') 41 | new_data[:history][k.to_s.gsub('_history','')] = v 42 | elsif k.to_s.include?('facebook') || k.to_s.include?('_shares') 43 | new_data[:social][k] = v 44 | elsif k.to_s.include?('price') 45 | new_data[:price][k] = v 46 | else 47 | new_data[k] = v 48 | end 49 | end 50 | new_data 51 | end 52 | end 53 | 54 | def query 55 | @query = 56 | { 57 | filter: { 58 | match_all: { } 59 | }, 60 | sort: sort_query, 61 | size: limit_results, 62 | from: from_page 63 | } 64 | end 65 | 66 | def from_page 67 | ((@options[:page].try(:to_i) || 1) - 1) * limit_results 68 | end 69 | 70 | def sort_query 71 | @query_array.map do |n| 72 | { 73 | n => { 74 | order: "desc" 75 | } 76 | } 77 | end 78 | end 79 | 80 | def limit_results 81 | if !@options[:results] 82 | 10 83 | elsif @options[:results] > 25 84 | 25 85 | else 86 | @options[:results] 87 | end 88 | end 89 | 90 | def delete_bad_data url 91 | Mapper::UrlAvailability.perform_async url 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /app/models/record/match.rb: -------------------------------------------------------------------------------- 1 | class Record::Match < Record::Base 2 | def best(query_hash = {}, options = { crawl: true, social: false, results: 1, page: 1 }) 3 | @options = options 4 | @query_hash = query_hash.delete_if { |_k, v| v.nil? || v.blank? } 5 | 6 | if !@container.nil? && !@container.include?(Rails.env) 7 | types = container.split('-').last.pluralize.gsub(':', '') 8 | @index = [ Rails.env + '-' + types ] 9 | elsif @container.nil? 10 | @container = Rails.configuration.config[:admin][:api_containers] 11 | @index = @container.map { |c| Rails.env + '-' + c.split('-').last.pluralize.gsub(':', '') }.uniq 12 | end 13 | 14 | @options = options 15 | sanitize_results 16 | end 17 | 18 | def elasticsearch_results 19 | @elasticsearch_results ||= Elasticsearch::Model.client.search(index: @index, type: @container, body: query).deep_symbolize_keys! 
20 | end 21 | 22 | def total 23 | ((elasticsearch_results[:hits][:total] || 0) / limit_results.to_f).ceil 24 | end 25 | 26 | def sanitize_results 27 | elasticsearch_results[:hits][:hits].map do |e| 28 | recrawl(e[:_source][:url], @options) if e[:_source][:url] 29 | 30 | new_data = { id: e[:_id], 31 | container: e[:_type], 32 | score: e[:_score], 33 | available: true, 34 | history: {}, 35 | social: {}, 36 | price: {} 37 | } 38 | 39 | e[:_source].each do |k,v| 40 | if k.to_s.include?('_history') 41 | new_data[:history][k.to_s.gsub('_history','')] = v 42 | elsif k.to_s.include?('facebook') || k.to_s.include?('_shares') 43 | new_data[:social][k] = v 44 | elsif k.to_s.include?('price') 45 | new_data[:price][k] = v 46 | else 47 | new_data[k] = v 48 | end 49 | end 50 | new_data 51 | end 52 | end 53 | 54 | def query 55 | @query = { 56 | query: { 57 | bool: { 58 | must_not: { 59 | term: { 60 | available: false 61 | } 62 | }, 63 | should: match_query 64 | } 65 | }, 66 | size: limit_results, 67 | from: from_page 68 | } 69 | end 70 | 71 | def match_query 72 | @query_hash.map do |k, v| 73 | { 74 | match: { 75 | k => v 76 | } 77 | } 78 | end 79 | end 80 | 81 | def from_page 82 | ((@options[:page].try(:to_i) || 1) - 1) * limit_results 83 | end 84 | 85 | def limit_results 86 | if !@options[:results] 87 | 1 88 | elsif @options[:results] > 25 89 | 25 90 | else 91 | @options[:results] 92 | end 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /roles/build-ruby/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | author: your name 4 | description: 5 | company: your company (optional) 6 | # Some suggested licenses: 7 | # - BSD (default) 8 | # - MIT 9 | # - GPLv2 10 | # - GPLv3 11 | # - Apache 12 | # - CC-BY 13 | license: license (GPLv2, CC-BY, etc) 14 | min_ansible_version: 1.2 15 | # 16 | # Below are all platforms currently available. Just uncomment 17 | # the ones that apply to your role. If you don't see your 18 | # platform on this list, let us know and we'll get it added! 19 | # 20 | #platforms: 21 | #- name: EL 22 | # versions: 23 | # - all 24 | # - 5 25 | # - 6 26 | # - 7 27 | #- name: GenericUNIX 28 | # versions: 29 | # - all 30 | # - any 31 | #- name: Fedora 32 | # versions: 33 | # - all 34 | # - 16 35 | # - 17 36 | # - 18 37 | # - 19 38 | # - 20 39 | #- name: opensuse 40 | # versions: 41 | # - all 42 | # - 12.1 43 | # - 12.2 44 | # - 12.3 45 | # - 13.1 46 | # - 13.2 47 | #- name: Amazon 48 | # versions: 49 | # - all 50 | # - 2013.03 51 | # - 2013.09 52 | #- name: GenericBSD 53 | # versions: 54 | # - all 55 | # - any 56 | #- name: FreeBSD 57 | # versions: 58 | # - all 59 | # - 8.0 60 | # - 8.1 61 | # - 8.2 62 | # - 8.3 63 | # - 8.4 64 | # - 9.0 65 | # - 9.1 66 | # - 9.1 67 | # - 9.2 68 | #- name: Ubuntu 69 | # versions: 70 | # - all 71 | # - lucid 72 | # - maverick 73 | # - natty 74 | # - oneiric 75 | # - precise 76 | # - quantal 77 | # - raring 78 | # - saucy 79 | # - trusty 80 | #- name: SLES 81 | # versions: 82 | # - all 83 | # - 10SP3 84 | # - 10SP4 85 | # - 11 86 | # - 11SP1 87 | # - 11SP2 88 | # - 11SP3 89 | #- name: GenericLinux 90 | # versions: 91 | # - all 92 | # - any 93 | #- name: Debian 94 | # versions: 95 | # - all 96 | # - etch 97 | # - lenny 98 | # - squeeze 99 | # - wheezy 100 | # 101 | # Below are all categories currently available. Just as with 102 | # the platforms above, uncomment those that apply to your role. 
103 | # 104 | #categories: 105 | #- cloud 106 | #- cloud:ec2 107 | #- cloud:gce 108 | #- cloud:rax 109 | #- clustering 110 | #- database 111 | #- database:nosql 112 | #- database:sql 113 | #- development 114 | #- monitoring 115 | #- networking 116 | #- packaging 117 | #- system 118 | #- web 119 | dependencies: [] 120 | # List your role dependencies here, one per line. Only 121 | # dependencies available via galaxy should be listed here. 122 | # Be sure to remove the '[]' above if you add dependencies 123 | # to this list. 124 | 125 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/elasticsearch.j2: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Elasticsearch 3 | ################################ 4 | 5 | # Elasticsearch home directory 6 | ES_HOME={{es_home}} 7 | 8 | # Elasticsearch configuration directory 9 | CONF_DIR={{conf_dir}} 10 | 11 | # Elasticsearch data directory 12 | DATA_DIR={{ data_dirs | array_to_str }} 13 | 14 | # Elasticsearch logs directory 15 | LOG_DIR={{log_dir}} 16 | 17 | # Elasticsearch work directory 18 | WORK_DIR={{work_dir}} 19 | 20 | # Elasticsearch PID directory 21 | PID_DIR={{pid_dir}} 22 | 23 | # Heap size defaults to 256m min, 1g max 24 | # Set ES_HEAP_SIZE to 50% of available RAM, but no more than 31g 25 | {% if es_heap_size is defined %} 26 | ES_HEAP_SIZE={{es_heap_size}} 27 | {% endif %} 28 | 29 | # Heap new generation 30 | #ES_HEAP_NEWSIZE= 31 | 32 | # Maximum direct memory 33 | #ES_DIRECT_SIZE= 34 | 35 | # Additional Java OPTS 36 | #ES_JAVA_OPTS= 37 | 38 | # Configure restart on package upgrade (true, every other setting will lead to not restarting) 39 | #ES_RESTART_ON_UPGRADE=true 40 | 41 | # Path to the GC log file 42 | #ES_GC_LOG_FILE=/var/log/elasticsearch/gc.log 43 | 44 | ################################ 45 | # Elasticsearch service 46 | ################################ 47 | 48 | # SysV init.d 49 | # 50 | # When executing the init script, this user will be used to run the elasticsearch service. 51 | # The default value is 'elasticsearch' and is declared in the init.d file. 52 | # Note that this setting is only used by the init script. If changed, make sure that 53 | # the configured user can read and write into the data, work, plugins and log directories. 54 | # For systemd service, the user is usually configured in file /usr/lib/systemd/system/elasticsearch.service 55 | ES_USER={{es_user}} 56 | ES_GROUP={{es_group}} 57 | 58 | ################################ 59 | # System properties 60 | ################################ 61 | 62 | # Specifies the maximum file descriptor number that can be opened by this process 63 | # When using Systemd, this setting is ignored and the LimitNOFILE defined in 64 | # /usr/lib/systemd/system/elasticsearch.service takes precedence 65 | {% if es_max_open_files is defined %} 66 | #MAX_OPEN_FILES 67 | MAX_OPEN_FILES={{es_max_open_files}} 68 | {% endif %} 69 | 70 | # The maximum number of bytes of memory that may be locked into RAM 71 | # Set to "unlimited" if you use the 'bootstrap.mlockall: true' option 72 | # in elasticsearch.yml (ES_HEAP_SIZE must also be set). 
73 | # When using Systemd, the LimitMEMLOCK property must be set 74 | # in /usr/lib/systemd/system/elasticsearch.service 75 | {% if m_lock_enabled %} 76 | #MAX_LOCKED_MEMORY= 77 | MAX_LOCKED_MEMORY=unlimited 78 | {% endif %} 79 | 80 | # Maximum number of VMA (Virtual Memory Areas) a process can own 81 | # When using Systemd, this setting is ignored and the 'vm.max_map_count' 82 | # property is set at boot time in /usr/lib/sysctl.d/elasticsearch.conf 83 | #MAX_MAP_COUNT=262144 84 | -------------------------------------------------------------------------------- /app/models/record/upload.rb: -------------------------------------------------------------------------------- 1 | class Record::Upload < Page::Url 2 | CANONICAL = %i(site_name 3 | id 4 | url 5 | type 6 | date 7 | name 8 | image 9 | description 10 | tags 11 | categories 12 | open_graph 13 | schema_org 14 | available).freeze 15 | 16 | EXCLUDE = %i(site_name 17 | id 18 | type 19 | screenshot).freeze 20 | 21 | attr_accessor :metadata 22 | attr_accessor :id 23 | attr_accessor :screenshot 24 | 25 | def sync 26 | self.data = update_metadata(update_canonical(data)) 27 | end 28 | 29 | def update_canonical(new_data = {}) 30 | new_data['available'] = true unless metadata['available'] 31 | types 32 | set_date 33 | set_screenshot 34 | metadata.each do |key, value| 35 | if CANONICAL.include? key.to_sym 36 | unless new_data[key] == value 37 | new_data[key] = value 38 | end 39 | metadata.delete(key) 40 | end 41 | end 42 | new_data 43 | end 44 | 45 | def update_metadata(new_data = {}) 46 | metadata.each do |key, value| 47 | if new_data[key] 48 | original_hash = new_data[key] 49 | new_hash = {} 50 | last_key = original_hash.keys.last 51 | original_hash.each do |k, v| 52 | if k == last_key && v != value 53 | new_hash[date] = value 54 | if screenshot 55 | new_data['screenshot'][date] = screenshot 56 | launch_screener 57 | end 58 | end 59 | end 60 | new_data[key] = original_hash.merge!(new_hash) 61 | else 62 | new_data[key] = { date => value } 63 | 64 | if screenshot 65 | unless new_data['screenshot'] 66 | new_data['screenshot'] = { date => screenshot } 67 | launch_screener 68 | end 69 | end 70 | end 71 | end 72 | new_data 73 | end 74 | 75 | def set_date 76 | self.date = metadata['date'] if metadata['date'] 77 | end 78 | 79 | def set_screenshot 80 | if metadata['screenshot'] 81 | self.screenshot = metadata['screenshot'] 82 | metadata.delete('screenshot') 83 | end 84 | end 85 | 86 | def launch_screener 87 | Crawler::Screener.perform_async url, screenshot 88 | end 89 | 90 | def data 91 | return record.data if record.data 92 | {} 93 | end 94 | 95 | def data=(new_data) 96 | record.data = new_data 97 | end 98 | 99 | def record 100 | @data ||= Record::Base.new(container, json_relative_path) 101 | end 102 | 103 | def json_relative_path 104 | @json_relative_path ||= id ? 
id : md5 105 | end 106 | 107 | def types 108 | @types ||= metadata['type'].downcase.pluralize.gsub(':', '') 109 | end 110 | 111 | def container 112 | @container ||= name + '-' + types 113 | end 114 | end 115 | -------------------------------------------------------------------------------- /app/controllers/v1/status_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::StatusController < V1::AccessController 2 | def index 3 | respond_to do |format| 4 | format.json { json_response(200, status: (Rails.configuration.config[:admin][:api_keys][check_token.try(:to_sym)] || {}).merge(counts)) } 5 | format.xml { xml_response(200, status: (Rails.configuration.config[:admin][:api_keys][check_token.try(:to_sym)] || {}).merge(counts)) } 6 | end 7 | end 8 | 9 | private 10 | 11 | def counts 12 | { available: count_indexes, 13 | count: Rails.configuration.config[:admin][:api_containers].count.to_s, 14 | total: pretty_integer(@total || 0), 15 | indexing: pretty_integer(count_indexers), 16 | processing: pretty_integer(count_scrimpers), 17 | pending: pretty_integer(count_sitemappers * 50_000) } 18 | end 19 | 20 | def count_indexes 21 | Rails.configuration.config[:admin][:api_containers] 22 | .map { |c| [c, count_containers(c)] } 23 | .sort_by(&:last).reverse 24 | .map { |array| { array.first => pretty_integer(array.last) } }.inject(:merge) 25 | end 26 | 27 | def count_indexers 28 | Sidekiq::Queue.new('mapper').size + 29 | Sidekiq::Queue.new('recorder').size 30 | rescue Redis::CannotConnectError => e 31 | 0 32 | end 33 | 34 | def count_scrimpers 35 | Sidekiq::Queue.new('scrimper').size + 36 | Sidekiq::Queue.new('scrimper_one').size + 37 | Sidekiq::Queue.new('scrimper_two').size + 38 | Sidekiq::Queue.new('scrimper_three').size + 39 | Sidekiq::Queue.new('scrimper_four').size + 40 | Sidekiq::Queue.new('scrimper_five').size + 41 | Sidekiq::Queue.new('sampler').size + 42 | Sidekiq::Queue.new('sampler_one').size + 43 | Sidekiq::Queue.new('sampler_two').size + 44 | Sidekiq::Queue.new('sampler_three').size + 45 | Sidekiq::Queue.new('sampler_four').size + 46 | Sidekiq::Queue.new('sampler_five').size + 47 | Sidekiq::Queue.new('spider').size + 48 | Sidekiq::Queue.new('spider_one').size + 49 | Sidekiq::Queue.new('spider_two').size + 50 | Sidekiq::Queue.new('spider_three').size + 51 | Sidekiq::Queue.new('spider_four').size + 52 | Sidekiq::Queue.new('spider_five').size + 53 | Sidekiq::Queue.new('slider').size + 54 | Sidekiq::Queue.new('socializer').size 55 | rescue Redis::CannotConnectError => e 56 | 0 57 | end 58 | 59 | def count_sitemappers 60 | Sidekiq::Queue.new('sitemapper').size + 61 | Sidekiq::Queue.new('sitemapper_one').size + 62 | Sidekiq::Queue.new('sitemapper_two').size + 63 | Sidekiq::Queue.new('sitemapper_three').size + 64 | Sidekiq::Queue.new('sitemapper_four').size + 65 | Sidekiq::Queue.new('sitemapper_five').size 66 | rescue Redis::CannotConnectError => e 67 | 0 68 | end 69 | 70 | def count_containers(container) 71 | @total ||= 0 72 | index = Rails.env + '-' + container.split('-').last.pluralize.delete(':') 73 | count = Elasticsearch::Model.client.count(index: index, type: container)['count'] 74 | @total = @total + count 75 | count 76 | rescue Elasticsearch::Transport::Transport::Errors => e 77 | 0 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /roles/swapfile/README.md: -------------------------------------------------------------------------------- 1 | ansible-swapfile 2 | ================ 3 | 
4 | This role configures a swapfile (/swapfile) with a default size of 512MB. 5 | 6 | ## Dependencies 7 | 8 | None. 9 | 10 | ## Variables 11 | 12 | * `swapfile_use_dd` [default: `False`]: if set to False, `fallocate` is used to create the swapfile; otherwise, `dd` is used. You may need to set this to True if your filesystem does not support `fallocate` -- see Issue #3. 13 | 14 | * `swapfile_size` [default: `512MB`]: the size of the swapfile to create in the format that `fallocate` expects: 15 | 16 | The length and offset arguments may be followed by binary (2^N) suffixes KiB, MiB, GiB, TiB, PiB and EiB (the "iB" is optional, e.g. "K" has the same meaning as "KiB") or decimal (10^N) suffixes KB, MB, GB, PB and EB. 17 | 18 | If `swapfile_use_dd` is set to True, `swapfile_size` must be set to the number of megabytes to write, e.g. `512`. 19 | 20 | * `swapfile_location` [default: `/swapfile`]: the location where the swapfile will be created. 21 | 22 | ### Optional 23 | 24 | The following variables are set to `False` by default and will not have any effect on your hosts. Setting them to any value other than `False` will update your hosts' sysctl.conf file. 25 | 26 | * `swapfile_swappiness` [default: `False`]: the swappiness percentage (vm.swappiness) -- the lower it is, the less your system swaps memory pages 27 | 28 | * `swapfile_vfs_cache_pressure` [default: `False`]: "this percentage value controls the tendency of the kernel to reclaim the memory which is used for caching of directory and inode objects." 29 | 30 | ## Usage 31 | 32 | ```yaml 33 | - hosts: all 34 | roles: 35 | - kamaln7.swapfile 36 | ``` 37 | 38 | or: 39 | 40 | ```yaml 41 | - hosts: all 42 | roles: 43 | - { role: kamaln7.swapfile, swapfile_size: 1GB, swapfile_swappiness: 10, swapfile_location: /mnt/swapfile } 44 | ``` 45 | 46 | You can also set the variables described above in `group_vars` or `host_vars` (see `defaults/main.yml`). 47 | 48 | ## License 49 | 50 | The MIT License (MIT) 51 | 52 | Copyright (c) 2014 Kamal Nasser 53 | 54 | Permission is hereby granted, free of charge, to any person obtaining a copy 55 | of this software and associated documentation files (the "Software"), to deal 56 | in the Software without restriction, including without limitation the rights 57 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 58 | copies of the Software, and to permit persons to whom the Software is 59 | furnished to do so, subject to the following conditions: 60 | 61 | The above copyright notice and this permission notice shall be included in all 62 | copies or substantial portions of the Software. 63 | 64 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 65 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 66 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 67 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 68 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 69 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 70 | SOFTWARE.
71 | -------------------------------------------------------------------------------- /roles/nginx-unicorn/README.md: -------------------------------------------------------------------------------- 1 | Ansible Nginx/Unicorn setup 2 | =========================== 3 | 4 | This Ansible role installs Nginx and generates configuration for Unicorn 5 | applications. 6 | 7 | Requirements 8 | ------------ 9 | 10 | None 11 | 12 | Notes 13 | ----- 14 | 15 | This role does not install or configure Unicorn itself. It is designed 16 | to play nicely with a Unicorn role such as 17 | [Unicorn-RVM](https://github.com/agios/ansible-unicorn-rvm). 18 | 19 | Role Variables 20 | -------------- 21 | 22 | - `nginx_sites` is an array of Unicorn sites, defaults to `[]` 23 | 24 | Each nginx_sites entry is a dict with the following options: 25 | 26 | - `name` (eg `my_app`, required) 27 | - `server_name` (eg `my-app.my-domain.org`, required, in any 28 | format supported by nginx) 29 | - `root` defaults to `/var/www/{{ name }}/current` (Capistrano 30 | compatible) 31 | - `listen` defaults to `[::]:80` (Both IPv4 and IPv6) 32 | - `access_log` is a dict with the following options: 33 | - `path` defaults to `/var/log/nginx/{{ name }}.access.log` 34 | - `format` is optional, can be used to specify a custom nginx 35 | log output format 36 | - `error_log` see above 37 | - `ssl` if this option is given, an ssl config section will be 38 | generated. It contains the following options: 39 | - `certificate` required, path to ssl certificate 40 | - `certificate_key` required, path to ssl certificate key 41 | - `ssl_only` if set to `true`, always redirect to ssl 42 | - `spdy` if set to `true`, enable spdy support 43 | - `gzip_assets` if set to `true`, enable serving gzipped 44 | 'assets' folder, cached for 16w (useful for rails with asset 45 | precompilation) 46 | - `sensitive_uris` required unless `ssl_only`, nginx uri 47 | expressions that will be served using https 48 | - `access_log` as above, for https requests 49 | - `error_log` as above, for https requests 50 | 51 | 52 | 53 | 54 | Example Playbook 55 | ---------------- 56 | 57 | The role could be included in a playbook as follows (unicorn-rvm also 58 | shown): 59 | 60 | ```yaml 61 | --- 62 | - hosts: application 63 | roles: 64 | - role: unicorn-rvm 65 | rails_apps: 66 | - { name: 'my_app1', ruby_version: 'ruby-1.9.3' } 67 | - { name: 'my_app2', ruby_version: 'ruby-2.1.1', root: '/var/test_apps/app2', env: staging } 68 | - role: nginx-unicorn 69 | nginx_sites: 70 | - name: 'my_app1' 71 | server_name: 'my-app1.example.com' 72 | access_log: 73 | format: 'main' 74 | ssl: 75 | certificate: /etc/ssl/localcerts/my_app1.pem 76 | certificate_key: /etc/ssl/localcerts/my_app1.key 77 | sensitive_uris: 78 | - ^/user/sign_in(.*) 79 | - ^/user/password(.*) 80 | access_log: 81 | format: 'main' 82 | - name: 'my_app2' 83 | server_name: 'my-app2.example.com *.mydomain.com' 84 | root: '/var/test_apps/app2' 85 | ssl: 86 | certificate: /etc/ssl/localcerts/my_app2.crt 87 | certificate_key: /etc/ssl/localcerts/my_app2.key 88 | ssl_only: true 89 | ``` 90 | 91 | License 92 | ------- 93 | 94 | MIT 95 | 96 | -------------------------------------------------------------------------------- /app/models/crawl/google.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Google 2 | include ActionView::Helpers::DateHelper 3 | 4 | def initialize(query = nil) 5 | @query = query 6 | end 7 | 8 | def videos 9 | @videos ||= Rails.cache.fetch("#{@query}/google_videos",
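# cached under a query-scoped key for a week (presumably to limit calls to the Google Search API)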
expires_in: 7.days) do 10 | if google_hash = Google::Search::Video.new(query: @query).response.hash['responseData'] 11 | google_hash['results'].map do |hash| 12 | { 13 | title: hash['titleNoFormatting'], 14 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 15 | image: hash['tbUrl'], 16 | url: hash['url'], 17 | length: distance_of_time_in_words(hash['duration'].to_i), 18 | published: hash['published'].to_date.to_s 19 | } 20 | end 21 | else 22 | nil 23 | end 24 | end 25 | end 26 | 27 | def news 28 | @news ||= Rails.cache.fetch("#{@query}/google_news", expires_in: 7.days) do 29 | if google_hash = Google::Search::News.new(query: @query).response.hash['responseData'] 30 | google_hash['results'].map do |hash| 31 | image = hash['image']['tbUrl'] if hash['image'] 32 | if related = hash['relatedStories'] 33 | related = related.map do |h| 34 | { 35 | title: h['titleNoFormatting'], 36 | url: h['unescapedUrl'], 37 | publisher: h['publisher'], 38 | published: h['publishedDate'].to_date.to_s, 39 | language: hash['language'] 40 | } 41 | end 42 | end 43 | { 44 | title: hash['titleNoFormatting'], 45 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 46 | image: image, 47 | url: hash['unescapedUrl'], 48 | publisher: hash['publisher'], 49 | published: hash['publishedDate'].to_date.to_s, 50 | language: hash['language'], 51 | related: related || [] 52 | } 53 | end 54 | else 55 | nil 56 | end 57 | end 58 | end 59 | 60 | def references 61 | @references ||= Rails.cache.fetch("#{@query}/google_references", expires_in: 7.days) do 62 | if google_hash = Google::Search::Book.new(query: @query).response.hash['responseData'] 63 | google_hash['results'].map do |hash| 64 | description = "by #{hash['authors']} ISBN: #{hash['bookId']}" 65 | 66 | image = hash['tbUrl'] unless hash['tbUrl'] == "/googlebooks/images/no_cover_thumb.gif" 67 | 68 | { 69 | title: hash['titleNoFormatting'], 70 | description: description, 71 | image: image, 72 | url: hash['unescapedUrl'], 73 | length: hash['pageCount'] + ' pages', 74 | published: hash['publishedYear'] 75 | } 76 | end 77 | else 78 | nil 79 | end 80 | end 81 | end 82 | 83 | def links 84 | @links ||= Rails.cache.fetch("#{@query}/google_links", expires_in: 7.days) do 85 | if google_hash = Google::Search::Web.new(query: @query).response.hash['responseData'] 86 | google_hash['results'].map do |hash| 87 | { 88 | title: hash['titleNoFormatting'], 89 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 90 | url: hash['unescapedUrl'] 91 | } 92 | end 93 | else 94 | nil 95 | end 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/helpers/serverspec/package_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | shared_examples 'package::init' do |es_version,plugins| 4 | 5 | describe user('elasticsearch') do 6 | it { should exist } 7 | end 8 | 9 | describe service('node1_elasticsearch') do 10 | it { should be_running } 11 | end 12 | 13 | describe package('elasticsearch') do 14 | it { should be_installed } 15 | end 16 | 17 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 18 | it { should be_file } 19 | it { should contain 'path.plugins: /usr/share/elasticsearch/plugins/node1' } 20 | it { should contain 'http.port: 9200' } 21 | it { should contain 'transport.tcp.port: 9300' } 22 | it { should contain 'discovery.zen.ping.unicast.hosts: localhost:9300' } 
} 23 | end 24 | 25 | describe file('/etc/elasticsearch/node1/scripts') do 26 | it { should be_directory } 27 | it { should be_owned_by 'elasticsearch' } 28 | end 29 | 30 | 31 | 32 | describe file('/etc/elasticsearch/node1/scripts/calculate-score.groovy') do 33 | it { should be_file } 34 | it { should be_owned_by 'elasticsearch' } 35 | end 36 | 37 | describe 'Node listening' do 38 | it 'listens on port 9200' do 39 | expect(port 9200).to be_listening 40 | end 41 | end 42 | 43 | describe file('/etc/elasticsearch/templates') do 44 | it { should be_directory } 45 | it { should be_owned_by 'elasticsearch' } 46 | end 47 | 48 | describe file('/etc/elasticsearch/templates/basic.json') do 49 | it { should be_file } 50 | it { should be_owned_by 'elasticsearch' } 51 | end 52 | 53 | describe 'Template Installed' do 54 | it 'should be reported as being installed', :retry => 3, :retry_wait => 10 do 55 | command = command('curl -s "localhost:9200/_template/basic"') 56 | expect(command.stdout).to match(/basic/) 57 | expect(command.exit_status).to eq(0) 58 | end 59 | end 60 | 61 | describe 'version check' do 62 | it 'should be reported as version '+es_version do 63 | command = command('curl -s localhost:9200 | grep number') 64 | expect(command.stdout).to match(es_version) 65 | expect(command.exit_status).to eq(0) 66 | end 67 | end 68 | 69 | describe file('/usr/share/elasticsearch/plugins/node1') do 70 | it { should be_directory } 71 | it { should be_owned_by 'elasticsearch' } 72 | end 73 | 74 | 75 | for plugin in plugins 76 | describe file('/usr/share/elasticsearch/plugins/node1/'+plugin) do 77 | it { should be_directory } 78 | it { should be_owned_by 'elasticsearch' } 79 | end 80 | 81 | describe command('curl -s localhost:9200/_nodes/plugins?pretty=true | grep '+plugin) do 82 | its(:exit_status) { should eq 0 } 83 | end 84 | end 85 | 86 | describe file('/etc/init.d/elasticsearch') do 87 | it { should_not exist } 88 | end 89 | 90 | describe file('/etc/default/elasticsearch') do 91 | it { should_not exist } 92 | end 93 | 94 | describe file('/etc/sysconfig/elasticsearch') do 95 | it { should_not exist } 96 | end 97 | 98 | describe file('/usr/lib/systemd/system/elasticsearch.service') do 99 | it { should_not exist } 100 | end 101 | 102 | describe file('/etc/elasticsearch/elasticsearch.yml') do 103 | it { should_not exist } 104 | end 105 | 106 | describe file('/etc/elasticsearch/logging.yml') do 107 | it { should_not exist } 108 | end 109 | 110 | end 111 | 112 | -------------------------------------------------------------------------------- /config/environments/production.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # Code is not reloaded between requests. 5 | config.cache_classes = true 6 | 7 | # Eager load code on boot. This eager loads most of Rails and 8 | # your application in memory, allowing both threaded web servers 9 | # and those relying on copy on write to perform better. 10 | # Rake tasks automatically ignore this option for performance. 11 | config.eager_load = true 12 | 13 | # Full error reports are disabled and caching is turned on. 14 | config.consider_all_requests_local = false 15 | config.action_controller.perform_caching = true 16 | # config.cache_store = :dalli_store 17 | 18 | # Enable Rack::Cache to put a simple HTTP cache in front of your application 19 | # Add `rack-cache` to your Gemfile before enabling this.
20 | # For large-scale production use, consider using a caching reverse proxy like nginx, varnish or squid. 21 | # config.action_dispatch.rack_cache = true 22 | 23 | # Disable Rails's static asset server (Apache or nginx will already do this). 24 | config.serve_static_files = false 25 | 26 | # Compress JavaScripts and CSS. 27 | config.assets.js_compressor = :uglifier 28 | # config.assets.css_compressor = :sass 29 | 30 | # Do not fallback to assets pipeline if a precompiled asset is missed. 31 | config.assets.compile = false 32 | 33 | # Generate digests for assets URLs. 34 | config.assets.digest = true 35 | 36 | # `config.assets.precompile` has moved to config/initializers/assets.rb 37 | 38 | # Specifies the header that your server uses for sending files. 39 | # config.action_dispatch.x_sendfile_header = "X-Sendfile" # for apache 40 | # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for nginx 41 | 42 | # Force all access to the app over SSL, use Strict-Transport-Security, and use secure cookies. 43 | # config.force_ssl = true 44 | 45 | # Set to :debug to see everything in the log. 46 | config.log_level = :info 47 | 48 | # Prepend all log lines with the following tags. 49 | # config.log_tags = [ :subdomain, :uuid ] 50 | 51 | # Use a different logger for distributed setups. 52 | # config.logger = ActiveSupport::TaggedLogging.new(SyslogLogger.new) 53 | 54 | # Use a different cache store in production. 55 | # config.cache_store = :mem_cache_store 56 | 57 | # Enable serving of images, stylesheets, and JavaScripts from an asset server. 58 | # config.action_controller.asset_host = "http://assets.example.com" 59 | 60 | # Precompile additional assets. 61 | # application.js, application.css, and all non-JS/CSS in app/assets folder are already added. 62 | # config.assets.precompile += %w( search.js ) 63 | 64 | # Ignore bad email addresses and do not raise email delivery errors. 65 | # Set this to true and configure the email server for immediate delivery to raise delivery errors. 66 | # config.action_mailer.raise_delivery_errors = false 67 | 68 | # Enable locale fallbacks for I18n (makes lookups for any locale fall back to 69 | # the I18n.default_locale when a translation cannot be found). 70 | config.i18n.fallbacks = true 71 | 72 | # Send deprecation notices to registered listeners. 73 | config.active_support.deprecation = :notify 74 | 75 | # Disable automatic flushing of the log to improve performance. 76 | # config.autoflush_log = false 77 | 78 | # Use default logging formatter so that PID and timestamp are not suppressed. 79 | config.log_formatter = ::Logger::Formatter.new 80 | 81 | # Do not dump schema after migrations. 82 | config.active_record.dump_schema_after_migration = false 83 | end 84 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Configure Elasticsearch Node 4 | 5 | #Use systemd for the following distributions: 6 | # 7 | #Ubuntu 15 and up 8 | #Debian 8 and up 9 | #Centos 7 and up 10 | #Relies on the Elasticsearch distribution installing a systemd service script to determine whether one should be copied.
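# use_system_d ends up true only on distributions whose Elasticsearch packages ship systemd units (Debian >= 8, CentOS >= 7, Ubuntu >= 15); elsewhere the role copies SysV init scripts instead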
11 | 12 | 13 | - set_fact: use_system_d={{(ansible_distribution == 'Debian' and ansible_distribution_version | version_compare('8', '>=')) or (ansible_distribution == 'CentOS' and ansible_distribution_version | version_compare('7', '>=')) or (ansible_distribution == 'Ubuntu' and ansible_distribution_version | version_compare('15', '>=')) }} 14 | tags: 15 | - always 16 | 17 | - set_fact: instance_sysd_script={{sysd_script | dirname }}/{{es_instance_name}}_{{sysd_script | basename}} 18 | when: use_system_d 19 | tags: 20 | - always 21 | 22 | #For directories we also use the {{inventory_hostname}}-{{ es_instance_name }} - this helps if we have a shared SAN. 23 | 24 | - set_fact: instance_suffix={{inventory_hostname}}-{{ es_instance_name }} 25 | tags: 26 | - always 27 | 28 | - set_fact: pid_dir={{ es_pid_dir }}/{{instance_suffix}} 29 | tags: 30 | - always 31 | 32 | - set_fact: log_dir={{ es_log_dir }}/{{instance_suffix}} 33 | tags: 34 | - always 35 | 36 | - set_fact: work_dir={{ es_work_dir }}/{{instance_suffix}} 37 | tags: 38 | - always 39 | 40 | #Create required directories 41 | - name: Create Directories 42 | file: path={{ item }} state=directory owner={{ es_user }} group={{ es_group }} 43 | with_items: 44 | - "{{pid_dir}}" 45 | - "{{work_dir}}" 46 | - "{{log_dir}}" 47 | - "{{conf_dir}}" 48 | - "{{plugin_dir}}" 49 | 50 | - set_fact: data_dirs={{ es_data_dirs | append_to_list('/'+instance_suffix) }} 51 | tags: 52 | - always 53 | 54 | - name: Create Data Directories 55 | file: path={{ item }} state=directory owner={{ es_user }} group={{ es_group }} 56 | with_items: 57 | - "{{data_dirs}}" 58 | 59 | 60 | #Copy the config template 61 | - name: Copy Configuration File 62 | template: src=elasticsearch.yml.j2 dest={{conf_dir}}/elasticsearch.yml owner={{ es_user }} group={{ es_group }} mode=0644 force=yes 63 | notify: restart elasticsearch 64 | 65 | #Copy the instance specific default file 66 | - name: Copy Default File for Instance 67 | template: src=elasticsearch.j2 dest={{instance_default_file}} mode=0644 force=yes 68 | notify: restart elasticsearch 69 | 70 | #Copy the instance specific init file 71 | - name: Copy Debian Init File for Instance 72 | template: src=init/debian/elasticsearch.j2 dest={{instance_init_script}} mode=0755 force=yes 73 | when: ansible_os_family == 'Debian' and not use_system_d 74 | notify: restart elasticsearch 75 | 76 | #Copy the instance specific init file 77 | - name: Copy Redhat Init File for Instance 78 | template: src=init/redhat/elasticsearch.j2 dest={{instance_init_script}} mode=0755 force=yes 79 | when: ansible_os_family == 'RedHat' and not use_system_d 80 | notify: restart elasticsearch 81 | 82 | #Copy the systemd specific file if systemd is installed 83 | - name: Copy Systemd File for Instance 84 | template: src=systemd/elasticsearch.j2 dest={{instance_sysd_script}} mode=0644 force=yes 85 | when: use_system_d 86 | notify: restart elasticsearch 87 | 88 | #Copy the logging.yml 89 | - name: Copy Logging.yml File for Instance 90 | template: src=logging.yml.j2 dest={{conf_dir}}/logging.yml owner={{ es_user }} group={{ es_group }} mode=0644 force=yes 91 | notify: restart elasticsearch 92 | 93 | #Clean up un-wanted package scripts to avoid confusion 94 | 95 | - name: Delete Default Init 96 | file: dest=/etc/init.d/elasticsearch state=absent 97 | 98 | - name: Delete Default Environment File 99 | file: dest=/etc/default/elasticsearch state=absent 100 | when: ansible_os_family == 'Debian' 101 | 102 | - name: Delete Default Environment File 103 | file: 
dest=/etc/sysconfig/elasticsearch state=absent 104 | when: ansible_os_family == 'RedHat' 105 | 106 | - name: Delete Default Systemd Service File 107 | file: dest=/usr/lib/systemd/system/elasticsearch.service state=absent 108 | 109 | - name: Delete Default Configuration File 110 | file: dest=/etc/elasticsearch/elasticsearch.yml state=absent 111 | 112 | - name: Delete Default Logging File 113 | file: dest=/etc/elasticsearch/logging.yml state=absent 114 | 115 | - debug: msg="Data Dirs {{data_dirs}}" -------------------------------------------------------------------------------- /app/controllers/v1/record_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::RecordController < V1::AccessController 2 | def index 3 | object = Record::Addons.append(record.current_data(default_options)) 4 | respond_to do |format| 5 | format.json { json_response(200, result: object) } 6 | format.xml { xml_response(200, result: object) } 7 | end 8 | end 9 | 10 | def history 11 | history = record.historical_data(default_options) 12 | respond_to do |format| 13 | format.json { json_response(200, result: history) } 14 | # format.xml { xml_response(200, result: history) } 15 | # format.csv do 16 | # # history = Record::Base.new('bestbuy-offers','9071056').historical_data 17 | # headers_hash = history.keys.map {|k| {k => nil}}.inject({},:merge) 18 | # dates_hash = history.values.flat_map {|hash| if hash.try(:keys) then hash.keys end }.compact.uniq.sort.map {|date| {date.to_date => headers_hash} }.inject({},:merge) 19 | # dates_hash.each do |key,value| 20 | # puts key 21 | # value.each do |k,v| 22 | # ap k 23 | # ap v 24 | # puts history[k][key.to_date] 25 | # # # dates_hash[key][k] = history[k][key] 26 | # end 27 | # end 28 | # dates_hash 29 | 30 | # # history.each do |key,value| 31 | # # if value.is_a?
Hash 32 | # # ap key 33 | # # value.each do |k,v| 34 | # # ap k.to_date 35 | # # ap v 36 | # # dates_hash[k][key] = v 37 | # # end 38 | # # else 39 | # # # dates_hash.keys.each do |date| 40 | # # # dates_hash[date][key] = value 41 | # # # end 42 | # # end 43 | # # end 44 | 45 | # # dates = 46 | # # csv_string = history.keys.join(',') + "\n" + history.collect { |node| "#{node.collect { |_k, v| v }.join(',')}\n" }.join 47 | # # send_data csv_string, type: 'text/csv; charset=iso-8859-1; header=present', disposition: 'attachment;data=historical_data.csv' 48 | # end 49 | end 50 | end 51 | 52 | def related 53 | related = record.related_data(default_options) 54 | respond_to do |format| 55 | format.json { json_response(200, result: related) } 56 | format.xml { xml_response(200, result: related) } 57 | end 58 | end 59 | 60 | def news 61 | news = record.news_data(default_options) 62 | respond_to do |format| 63 | format.json { json_response(200, result: news) } 64 | format.xml { xml_response(200, result: news) } 65 | end 66 | end 67 | 68 | def videos 69 | videos = record.videos_data(default_options) 70 | respond_to do |format| 71 | format.json { json_response(200, result: videos) } 72 | format.xml { xml_response(200, result: videos) } 73 | end 74 | end 75 | 76 | def links 77 | links = record.links_data(default_options) 78 | # if links[:links] 79 | # links[:links] = links[:links].map {|h| Record::Addons.append(h) } 80 | # end 81 | respond_to do |format| 82 | format.json { json_response(200, result: links) } 83 | format.xml { xml_response(200, result: links) } 84 | end 85 | end 86 | 87 | def references 88 | references = record.references_data(default_options) 89 | respond_to do |format| 90 | format.json { json_response(200, result: references) } 91 | format.xml { xml_response(200, result: references) } 92 | end 93 | end 94 | 95 | def ids 96 | records = Record::Base.new(params[:container]).ids(default_options) 97 | respond_to do |format| 98 | format.json { json_response(200, result: records[:result], 99 | pagination: pagination(records[:total])) } 100 | format.xml { xml_response(200, result: records[:result], 101 | pagination: pagination(records[:total])) } 102 | end 103 | end 104 | 105 | def screenshot 106 | screenshot = Record::Screenshot.new(params[:container], params[:record_id], params[:screenshot_id]) 107 | respond_to do |format| 108 | format.json { json_response(200, screenshot.data) } 109 | format.xml { xml_response(200, screenshot.data) } 110 | format.jpg { redirect_to screenshot.link } 111 | end 112 | end 113 | 114 | private 115 | 116 | def record 117 | @record ||= Api::V1.new(params[:container], params[:record_id]) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /roles/elasticsearch/.kitchen.yml: -------------------------------------------------------------------------------- 1 | --- 2 | driver: 3 | name: docker 4 | 5 | provisioner: 6 | name: ansible_playbook 7 | hosts: localhost 8 | roles_path: ../ 9 | require_ansible_repo: true 10 | ansible_verbose: true 11 | http_proxy: <%= ENV['HTTP_PROXY'] %> 12 | https_proxy: <%= ENV['HTTPS_PROXY'] %> 13 | no_proxy: localhost,127.0.0.1 14 | 15 | platforms: 16 | - name: ubuntu-14.04 17 | driver_config: 18 | image: dliappis/ubuntu-devopsci:14.04 19 | privileged: true 20 | provision_command: 21 | - apt-get update && apt-get install -y software-properties-common && add-apt-repository -y ppa:ansible/ansible 22 | - apt-get update && apt-get -y -q install ansible python-apt python-pycurl 23 | use_sudo: 
false 24 | - name: debian-7 25 | driver_config: 26 | image: dliappis/debian-devopsci:7 27 | privileged: true 28 | provision_command: 29 | - apt-get update && apt-get -y install python python-dev python-pip build-essential libyaml-dev python-yaml 30 | - pip install ansible 31 | - apt-get install -y -q net-tools 32 | use_sudo: false 33 | - name: debian-8 34 | driver_config: 35 | image: dliappis/debian-devopsci:8 36 | privileged: true 37 | provision_command: 38 | - apt-get update && apt-get -y install python python-dev python-pip build-essential libyaml-dev python-yaml curl wget 39 | - pip install ansible 40 | - apt-get install -y -q net-tools 41 | - sed -ri 's/^#?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config 42 | - sed -ri 's/^#?PasswordAuthentication .*/PasswordAuthentication yes/' /etc/ssh/sshd_config 43 | - sed -ri 's/^#?UsePAM .*/UsePAM no/' /etc/ssh/sshd_config 44 | use_sudo: false 45 | run_command: "/sbin/init" 46 | - name: centos-6 47 | driver_config: 48 | image: dliappis/centos-devopsci:6 49 | privileged: true 50 | provision_command: 51 | use_sudo: false 52 | - name: centos-7 53 | driver_config: 54 | image: dliappis/centos-devopsci:7 55 | provision_command: 56 | - sed -ri 's/^#?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config 57 | - sed -ri 's/^#?PasswordAuthentication .*/PasswordAuthentication yes/' /etc/ssh/sshd_config 58 | - sed -ri 's/^#?UsePAM .*/UsePAM no/' /etc/ssh/sshd_config 59 | - yum -y install initscripts 60 | - yum clean all 61 | run_command: "/usr/sbin/init" 62 | privileged: true 63 | use_sudo: false 64 | 65 | suites: 66 | - name: standard-2x 67 | provisioner: 68 | playbook: test/integration/standard.yml 69 | run_list: 70 | attributes: 71 | - name: package-2x 72 | run_list: 73 | attributes: 74 | extra_vars: 75 | es_plugins: 76 | - plugin: lmenezes/elasticsearch-kopf 77 | version: master 78 | - plugin: license 79 | - plugin: marvel-agent 80 | provisioner: 81 | playbook: test/integration/package.yml 82 | - name: config-2x 83 | run_list: 84 | attributes: 85 | provisioner: 86 | playbook: test/integration/config.yml 87 | - name: multi-2x 88 | run_list: 89 | attributes: 90 | extra_vars: 91 | es_plugins: 92 | - plugin: lmenezes/elasticsearch-kopf 93 | version: master 94 | - plugin: license 95 | - plugin: marvel-agent 96 | provisioner: 97 | playbook: test/integration/multi.yml 98 | - name: standard-1x 99 | provisioner: 100 | playbook: test/integration/standard.yml 101 | run_list: 102 | attributes: 103 | extra_vars: 104 | es_major_version: 1.7 105 | es_version: 1.7.3 106 | - name: package-1x 107 | run_list: 108 | attributes: 109 | extra_vars: 110 | es_major_version: 1.7 111 | es_version: 1.7.3 112 | es_plugins: 113 | - plugin: lmenezes/elasticsearch-kopf 114 | version: master 115 | - plugin: elasticsearch/marvel 116 | version: latest 117 | provisioner: 118 | playbook: test/integration/package.yml 119 | - name: config-1x 120 | run_list: 121 | attributes: 122 | extra_vars: 123 | es_major_version: 1.7 124 | es_version: 1.7.3 125 | provisioner: 126 | playbook: test/integration/config.yml 127 | - name: multi-1x 128 | run_list: 129 | attributes: 130 | extra_vars: 131 | es_major_version: 1.7 132 | es_version: 1.7.3 133 | es_plugins: 134 | - plugin: lmenezes/elasticsearch-kopf 135 | version: master 136 | - plugin: elasticsearch/marvel 137 | version: latest 138 | provisioner: 139 | playbook: test/integration/multi.yml 140 | --------------------------------------------------------------------------------