├── log
│   └── .keep
├── app
│   ├── sites
│   │   └── .keep
│   ├── workers
│   │   ├── worker.rb
│   │   ├── scheduler
│   │   │   ├── base.rb
│   │   │   ├── reindexer.rb
│   │   │   └── clearer.rb
│   │   ├── crawler
│   │   │   ├── slider.rb
│   │   │   ├── socializer.rb
│   │   │   ├── screener.rb
│   │   │   ├── stretcher.rb
│   │   │   ├── scrimper.rb
│   │   │   ├── base.rb
│   │   │   ├── sampler.rb
│   │   │   ├── spider.rb
│   │   │   ├── scraper.rb
│   │   │   └── sitemapper.rb
│   │   ├── syncer
│   │   │   ├── reslider.rb
│   │   │   ├── resocializer.rb
│   │   │   ├── respider.rb
│   │   │   ├── rescrimper.rb
│   │   │   ├── resampler.rb
│   │   │   ├── base.rb
│   │   │   ├── reindexer.rb
│   │   │   ├── mover.rb
│   │   │   ├── rescreener.rb
│   │   │   └── refixer.rb
│   │   ├── recorder
│   │   │   ├── collector.rb
│   │   │   ├── base.rb
│   │   │   ├── uploader.rb
│   │   │   └── fixer.rb
│   │   └── mapper
│   │       ├── id_availability.rb
│   │       ├── base.rb
│   │       ├── cleaner.rb
│   │       ├── url_availability.rb
│   │       └── indexer.rb
│   ├── models
│   │   ├── page
│   │   │   ├── base.rb
│   │   │   ├── url.rb
│   │   │   └── parse.rb
│   │   ├── persist.rb
│   │   ├── record
│   │   │   ├── screenshot.rb
│   │   │   ├── export.rb
│   │   │   ├── addons.rb
│   │   │   ├── base.rb
│   │   │   ├── search.rb
│   │   │   ├── trends.rb
│   │   │   ├── match.rb
│   │   │   └── upload.rb
│   │   ├── crawl
│   │   │   ├── sitemap.rb
│   │   │   ├── social.rb
│   │   │   ├── capture.rb
│   │   │   ├── base.rb
│   │   │   └── google.rb
│   │   ├── flattener.rb
│   │   └── cloud.rb
│   ├── controllers
│   │   ├── application_controller.rb
│   │   └── v1
│   │       ├── batch_controller.rb
│   │       ├── trends_controller.rb
│   │       ├── match_controller.rb
│   │       ├── search_controller.rb
│   │       ├── status_controller.rb
│   │       └── record_controller.rb
│   └── helpers
│       ├── counts_helper.rb
│       ├── page_helper.rb
│       └── schema_org_helper.rb
├── test
│   ├── helpers
│   │   └── .keep
│   ├── mailers
│   │   └── .keep
│   ├── models
│   │   ├── .keep
│   │   └── url_test.rb
│   ├── controllers
│   │   └── .keep
│   ├── fixtures
│   │   └── .keep
│   ├── integration
│   │   └── .keep
│   └── test_helper.rb
├── .ruby-gemset
├── .ruby-version
├── roles
│   ├── chruby
│   │   ├── .gitignore
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   ├── chruby.fact
│   │   │   └── chruby.sh
│   │   ├── test.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── .travis.yml
│   │   ├── README.md
│   │   ├── LICENSE
│   │   └── tasks
│   │       └── main.yml
│   ├── elasticsearch
│   │   ├── ansible.cfg
│   │   ├── files
│   │   │   ├── scripts
│   │   │   │   └── calculate-score.groovy
│   │   │   └── templates
│   │   │       └── basic.json
│   │   ├── test
│   │   │   └── integration
│   │   │       ├── multi-1x
│   │   │       │   ├── multi.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── multi-2x
│   │   │       │   ├── multi.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── config-1x
│   │   │       │   ├── config.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── config-2x
│   │   │       │   ├── config.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── package-1x
│   │   │       │   ├── package.yaml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── package-2x
│   │   │       │   ├── package.yaml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── standard-1x
│   │   │       │   ├── standard.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── standard-2x
│   │   │       │   ├── standard.yml
│   │   │       │   └── serverspec
│   │   │       │       └── default_spec.rb
│   │   │       ├── helpers
│   │   │       │   └── serverspec
│   │   │       │       ├── Gemfile
│   │   │       │       ├── spec_helper.rb
│   │   │       │       ├── standard_spec.rb
│   │   │       │       └── package_spec.rb
│   │   │       ├── standard.yml
│   │   │       ├── package.yml
│   │   │       ├── config.yml
│   │   │       └── multi.yml
│   │   ├── .gitignore
│   │   ├── tasks
│   │   │   ├── elasticsearch-Debian-version-lock.yml
│   │   │   ├── elasticsearch-service.yml
│   │   │   ├── elasticsearch-RedHat-version-lock.yml
│   │   │   ├── elasticsearch-version-lock.yml
│   │   │   ├── java.yml
│   │   │   ├── elasticsearch-optional-user.yml
│   │   │   ├── elasticsearch-scripts.yml
│   │   │   ├── main.yml
│   │   │   ├── elasticsearch-RedHat.yml
│   │   │   ├── elasticsearch-Debian.yml
│   │   │   ├── elasticsearch-templates.yml
│   │   │   ├── elasticsearch.yml
│   │   │   ├── checkParameters.yml
│   │   │   ├── elasticsearch-plugins.yml
│   │   │   └── elasticsearch-config.yml
│   │   ├── vars
│   │   │   ├── Debian.yml
│   │   │   ├── RedHat.yml
│   │   │   └── main.yml
│   │   ├── Gemfile
│   │   ├── templates
│   │   │   ├── elasticsearch.repo
│   │   │   ├── elasticsearch.yml.j2
│   │   │   ├── systemd
│   │   │   │   └── elasticsearch.j2
│   │   │   ├── logging.yml.j2
│   │   │   └── elasticsearch.j2
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── filter_plugins
│   │   │   └── custom.py
│   │   ├── Gemfile.lock
│   │   └── .kitchen.yml
│   ├── ruby-install
│   │   ├── .gitignore
│   │   ├── meta
│   │   │   ├── .galaxy_install_info
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   └── ruby_install.fact
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── test.yml
│   │   ├── .travis.yml
│   │   ├── README.md
│   │   ├── LICENSE
│   │   └── tasks
│   │       └── main.yml
│   ├── logrotate
│   │   ├── tests
│   │   │   ├── inventory
│   │   │   └── test.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── templates
│   │   │   └── logrotate.d.j2
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── .travis.yml
│   │   ├── LICENSE
│   │   └── README.md
│   ├── build-ruby
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── vars
│   │   │   └── main.yml
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── README.md
│   │   └── meta
│   │       └── main.yml
│   ├── nginx-unicorn
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── tasks
│   │   │   ├── main.yml
│   │   │   ├── redhat.yml
│   │   │   └── debian.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── LICENSE
│   │   └── README.md
│   ├── swapfile
│   │   ├── handlers
│   │   │   └── main.yml
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── LICENSE
│   │   ├── tasks
│   │   │   └── main.yml
│   │   └── README.md
│   ├── imagemagick
│   │   ├── .gitignore
│   │   ├── tasks
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   └── README.md
│   ├── ubuntu-common
│   │   ├── templates
│   │   │   ├── locale
│   │   │   └── sources.list
│   │   ├── defaults
│   │   │   └── main.yml
│   │   ├── meta
│   │   │   └── main.yml
│   │   ├── README.md
│   │   └── tasks
│   │       └── main.yml
│   └── letsencrypt
│       ├── meta
│       │   ├── .galaxy_install_info
│       │   └── main.yml
│       ├── defaults
│       │   └── main.yml
│       ├── tasks
│       │   └── main.yml
│       └── README.md
├── .guardrc
├── bin
│   ├── rake
│   ├── bundle
│   ├── rails
│   └── spring
├── config
│   ├── initializers
│   │   ├── cookies_serializer.rb
│   │   ├── elasticsearch.rb
│   │   ├── session_store.rb
│   │   ├── redis.rb
│   │   ├── filter_parameter_logging.rb
│   │   ├── mime_types.rb
│   │   ├── vcr.rb
│   │   ├── assets.rb
│   │   ├── backtrace_silencers.rb
│   │   ├── wrap_parameters.rb
│   │   ├── inflections.rb
│   │   └── sidekiq.rb
│   ├── environment.rb
│   ├── boot.rb
│   ├── database.yml
│   ├── application.rb
│   ├── locales
│   │   └── en.yml
│   ├── sidekiq-slim.yml.example
│   ├── secrets.yml.example
│   ├── config.yml.example
│   ├── sitemap.rb
│   ├── unicorn.rb
│   ├── routes.rb
│   ├── environments
│   │   ├── development.rb
│   │   ├── test.rb
│   │   └── production.rb
│   └── sidekiq.yml.example
├── restart.sh
├── lib
│   └── tasks
│       ├── map.rake
│       ├── report.rake
│       ├── crawl.rake
│       └── sync.rake
├── Rakefile
├── run
├── .gitignore
├── config.ru
├── Gemfile
├── production.yml
└── Guardfile
/log/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/app/sites/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/helpers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/mailers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/models/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.ruby-gemset:
--------------------------------------------------------------------------------
skynet
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
2.7.2
--------------------------------------------------------------------------------
/test/controllers/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/fixtures/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/test/integration/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/roles/chruby/.gitignore:
--------------------------------------------------------------------------------
.vagrant
--------------------------------------------------------------------------------
/roles/elasticsearch/ansible.cfg:
--------------------------------------------------------------------------------
[defaults]
--------------------------------------------------------------------------------
/roles/ruby-install/.gitignore:
--------------------------------------------------------------------------------
.vagrant
--------------------------------------------------------------------------------
/roles/logrotate/tests/inventory:
--------------------------------------------------------------------------------
localhost
--------------------------------------------------------------------------------
/roles/build-ruby/defaults/main.yml:
--------------------------------------------------------------------------------
---
version: 2.2.1
--------------------------------------------------------------------------------
/roles/chruby/defaults/main.yml:
--------------------------------------------------------------------------------
---
chruby_version: '0.3.9'
--------------------------------------------------------------------------------
/roles/nginx-unicorn/defaults/main.yml:
--------------------------------------------------------------------------------
---
nginx_sites: []
--------------------------------------------------------------------------------
/.guardrc:
--------------------------------------------------------------------------------
require File.expand_path('../config/environment', __FILE__)
--------------------------------------------------------------------------------
/roles/build-ruby/vars/main.yml:
--------------------------------------------------------------------------------
---
# vars file for build-ruby
--------------------------------------------------------------------------------
/app/workers/worker.rb:
--------------------------------------------------------------------------------
class Worker
  include Sidekiq::Worker
end
--------------------------------------------------------------------------------
/roles/build-ruby/handlers/main.yml:
--------------------------------------------------------------------------------
---
# handlers file for build-ruby
--------------------------------------------------------------------------------
/roles/elasticsearch/files/scripts/calculate-score.groovy:
--------------------------------------------------------------------------------
log(_score * 2) + my_modifier
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-1x/multi.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-2x/multi.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/chruby/templates/chruby.fact:
--------------------------------------------------------------------------------
{"version": "{{ installed_chruby_version.stdout }}"}
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-1x/config.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-2x/config.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-1x/package.yaml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-2x/package.yaml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/swapfile/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: Reload sysctl
  command: sysctl -p
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-1x/standard.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-2x/standard.yml:
--------------------------------------------------------------------------------
---
- host: test-kitchen
--------------------------------------------------------------------------------
/roles/imagemagick/.gitignore:
--------------------------------------------------------------------------------
*.sublime-project
*.sublime-workspace
.DS_Store
.idea
--------------------------------------------------------------------------------
/roles/logrotate/defaults/main.yml:
--------------------------------------------------------------------------------
logrotate_conf_dir: "/etc/logrotate.d/"
logrotate_scripts: []
--------------------------------------------------------------------------------
/roles/ubuntu-common/templates/locale:
--------------------------------------------------------------------------------
LC_ALL={{ common_locale_all }}
LANG={{ common_locale_lang }}
--------------------------------------------------------------------------------
/app/workers/scheduler/base.rb:
--------------------------------------------------------------------------------
class Scheduler::Base < Worker
  include Sidetiq::Schedulable
end
--------------------------------------------------------------------------------
/roles/letsencrypt/meta/.galaxy_install_info:
--------------------------------------------------------------------------------
{install_date: 'Thu Feb 4 19:04:37 2016', version: master}
--------------------------------------------------------------------------------
/roles/ruby-install/meta/.galaxy_install_info:
--------------------------------------------------------------------------------
{install_date: 'Thu Oct 2 20:55:31 2014', version: v0.1.0}
--------------------------------------------------------------------------------
/roles/ruby-install/templates/ruby_install.fact:
--------------------------------------------------------------------------------
{"version": "{{ installed_ruby_install_version.stdout }}"}
--------------------------------------------------------------------------------
/bin/rake:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
require_relative '../config/boot'
require 'rake'
Rake.application.run
--------------------------------------------------------------------------------
/roles/nginx-unicorn/handlers/main.yml:
--------------------------------------------------------------------------------
---
- name: reload nginx
  service: name=nginx state=reloaded
--------------------------------------------------------------------------------
/roles/ruby-install/defaults/main.yml:
--------------------------------------------------------------------------------
# file: ruby-install/defaults/main.yml

ruby_install_version: '0.4.3'
--------------------------------------------------------------------------------
/roles/chruby/templates/chruby.sh:
--------------------------------------------------------------------------------
source /usr/local/share/chruby/chruby.sh
source /usr/local/share/chruby/auto.sh
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/helpers/serverspec/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'rspec-retry'
--------------------------------------------------------------------------------
/roles/chruby/test.yml:
--------------------------------------------------------------------------------
- hosts: all
  vars_files:
    - defaults/main.yml
  tasks:
    - include: tasks/main.yml
--------------------------------------------------------------------------------
/roles/elasticsearch/.gitignore:
--------------------------------------------------------------------------------
.kitchen/
*.pyc
.vendor
.bundle
Converging
TODO
.idea/
elasticsearch.iml
--------------------------------------------------------------------------------
/roles/ruby-install/test.yml:
--------------------------------------------------------------------------------
- hosts: all
  vars_files:
    - defaults/main.yml
  tasks:
    - include: tasks/main.yml
--------------------------------------------------------------------------------
/bin/bundle:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)
load Gem.bin_path('bundler', 'bundle')
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-Debian-version-lock.yml:
--------------------------------------------------------------------------------
---
- name: Debian - hold elasticsearch version
  command: apt-mark hold elasticsearch
--------------------------------------------------------------------------------
/bin/rails:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby
APP_PATH = File.expand_path('../../config/application', __FILE__)
require_relative '../config/boot'
require 'rails/commands'
--------------------------------------------------------------------------------
/config/initializers/cookies_serializer.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

Rails.application.config.action_dispatch.cookies_serializer = :json
--------------------------------------------------------------------------------
/config/initializers/elasticsearch.rb:
--------------------------------------------------------------------------------
require 'typhoeus/adapters/faraday'
Elasticsearch::Model.client = Elasticsearch::Client.new(Rails.configuration.config[:elasticsearch])
--------------------------------------------------------------------------------
/config/initializers/session_store.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

Rails.application.config.session_store :cookie_store, key: '_crawler_session'
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/Debian.yml:
--------------------------------------------------------------------------------
---
java: "{{ es_java | default('openjdk-7-jre-headless') }}"
default_file: "/etc/default/elasticsearch"
es_home: "/usr/share/elasticsearch"
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/main.yml:
--------------------------------------------------------------------------------
---
- include: debian.yml
  when: ansible_os_family == 'Debian'

- include: redhat.yml
  when: ansible_os_family == 'RedHat'
--------------------------------------------------------------------------------
/config/environment.rb:
--------------------------------------------------------------------------------
# Load the Rails application.
require File.expand_path('../application', __FILE__)

# Initialize the Rails application.
Rails.application.initialize!
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/RedHat.yml:
--------------------------------------------------------------------------------
---
java: "{{ es_java | default('java-1.8.0-openjdk.x86_64') }}"
default_file: "/etc/sysconfig/elasticsearch"
es_home: "/usr/share/elasticsearch"
--------------------------------------------------------------------------------
/roles/elasticsearch/Gemfile:
--------------------------------------------------------------------------------
source 'https://rubygems.org'

gem 'test-kitchen', '1.4.2'
gem "kitchen-docker", '2.1.0'
gem 'kitchen-ansible', '0.40.1'
gem 'net-ssh', '~> 2.0'
--------------------------------------------------------------------------------
/roles/swapfile/defaults/main.yml:
--------------------------------------------------------------------------------
---
swapfile_location: /swapfile
swapfile_size: 512MB
swapfile_swappiness: False
swapfile_vfs_cache_pressure: False
swapfile_use_dd: False
--------------------------------------------------------------------------------
/config/boot.rb:
--------------------------------------------------------------------------------
# Set up gems listed in the Gemfile.
ENV['BUNDLE_GEMFILE'] ||= File.expand_path('../../Gemfile', __FILE__)

require 'bundler/setup' if File.exist?(ENV['BUNDLE_GEMFILE'])
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'config_spec'

describe 'Config Tests v 1.x' do
  include_examples 'config::init', "1.7.3"
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/config-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'config_spec'

describe 'Config Tests v 2.x' do
  include_examples 'config::init', "2.2.0"
end
--------------------------------------------------------------------------------
/config/initializers/redis.rb:
--------------------------------------------------------------------------------
Redis.current = Redis.new(
  host: Rails.configuration.config[:redis][:host],
  port: Rails.configuration.config[:redis][:port],
  password: Rails.configuration.config[:redis][:password]
)
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'multi_spec'

describe 'Multi Tests v 1.x' do
  include_examples 'multi::init', "1.7.3", ["kopf","marvel"]
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'package_spec'

describe 'Package Tests v 1.x' do
  include_examples 'package::init', "1.7.3", ["kopf","marvel"]
end
--------------------------------------------------------------------------------
/test/test_helper.rb:
--------------------------------------------------------------------------------
ENV['RAILS_ENV'] ||= 'test'
require File.expand_path('../../config/environment', __FILE__)
require 'rails/test_help'

class ActiveSupport::TestCase
  fixtures :all
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-1x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'standard_spec'

describe 'Standard Tests v 1.x' do
  include_examples 'standard::init', "1.7.3"
end
--------------------------------------------------------------------------------
/app/models/page/base.rb:
--------------------------------------------------------------------------------
class Page::Base < Page::Url
  attr_accessor :page

  def parser
    page.parser
  end

  def base
    "#{page.uri.scheme}://#{page.uri.host}"
  end
end
--------------------------------------------------------------------------------
/restart.sh:
--------------------------------------------------------------------------------
git checkout . && git pull origin master && cd app/sites && git pull origin master && cd ../.. && cp config/sidekiq.yml.example config/sidekiq.yml && bundle && RAILS_ENV=production bundle exec sidekiq -d -L log/sidekiq.log
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'package_spec'

describe 'Package Tests v 2.x' do
  include_examples 'package::init', "2.2.0", ["kopf","license","marvel-agent"]
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'standard_spec'

describe 'Standard Tests v 2.x' do
  include_examples 'standard::init', "2.2.0"
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/multi-2x/serverspec/default_spec.rb:
--------------------------------------------------------------------------------
require 'multi_spec'

describe 'Multi Tests v 2.x' do
  include_examples 'multi::init', "2.2.0", ["kopf","license","marvel-agent"]
end
--------------------------------------------------------------------------------
/roles/swapfile/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: "Kamal Nasser"
  description: swapfile
  license: MIT
  min_ansible_version: 1.4
  version: 0.4
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/config/initializers/filter_parameter_logging.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Configure sensitive parameters which will be filtered from the log file.
Rails.application.config.filter_parameters += [:password]
--------------------------------------------------------------------------------
/roles/elasticsearch/files/templates/basic.json:
--------------------------------------------------------------------------------
{
  "template" : "te*",
  "settings" : {
    "number_of_shards" : 1
  },
  "mappings" : {
    "type1" : {
      "_source" : { "enabled" : false }
    }
  }
}
--------------------------------------------------------------------------------
/lib/tasks/map.rake:
--------------------------------------------------------------------------------
namespace :map do
  desc 'Run the crawler in Mapper::Reader mode'
  task :reader, [:bucket] => :environment do |_task, args|
    Redis::List.new('visited').clear
    Mapper::Reader.perform_async args.bucket
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/standard.yml:
--------------------------------------------------------------------------------
---
- name: wrapper playbook for kitchen testing "elasticsearch"
  hosts: localhost
  roles:
    - { role: elasticsearch, es_instance_name: "node1" }
  vars:
    es_use_repository: "true"
--------------------------------------------------------------------------------
/app/controllers/application_controller.rb:
--------------------------------------------------------------------------------
class ApplicationController < ActionController::Base
  respond_to :json

  def index
    redirect_to Rails.configuration.config[:admin][:docs] || 'https://github.com/bastosmichael/skynet'
  end
end
--------------------------------------------------------------------------------
/lib/tasks/report.rake:
--------------------------------------------------------------------------------
namespace :report do
  desc 'Get List of Api keys'
  task :api_keys do
    Cloud.new('api-keys').files.map(&:key).map { |key| Record::Base.new('api-keys', key.gsub('.json','')).data.merge(api_key: key).symbolize_keys! }
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/vars/main.yml:
--------------------------------------------------------------------------------
---
es_package_url: "https://download.elastic.co/elasticsearch/elasticsearch/elasticsearch"
es_conf_dir: "/etc/elasticsearch"
sysd_script: "/usr/lib/systemd/system/elasticsearch.service"
init_script: "/etc/init.d/elasticsearch"
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
# Add your own tasks in files placed in lib/tasks ending in .rake,
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.

require File.expand_path('../config/application', __FILE__)

Rails.application.load_tasks
--------------------------------------------------------------------------------
/roles/ubuntu-common/defaults/main.yml:
--------------------------------------------------------------------------------
---
common_apt_mirror: http://archive.ubuntu.com/ubuntu/
common_release_code: trusty

common_timezone: Asia/Shanghai

common_locale_all: en_US.UTF-8
common_locale_lang: en_US.UTF-8

common_apt_cache_time: 3600
--------------------------------------------------------------------------------
/app/workers/crawler/slider.rb:
--------------------------------------------------------------------------------
class Crawler::Slider < Crawler::Sampler
  sidekiq_options queue: :slider,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60
end
--------------------------------------------------------------------------------
/config/initializers/mime_types.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Add new mime types for use in respond_to blocks:
# Mime::Type.register "text/richtext", :rtf

Mime::Type.register 'application/xls', :xls
Mime::Type.register 'application/jpeg', :jpg
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-service.yml:
--------------------------------------------------------------------------------
# Make sure the service is started, and restart if necessary
- name: Start elasticsearch service
  service: name={{instance_init_script | basename}} state=started enabled=yes
  when: es_start_service
  register: elasticsearch_started
--------------------------------------------------------------------------------
/app/workers/syncer/reslider.rb:
--------------------------------------------------------------------------------
class Syncer::Reslider < Syncer::Base
  def perform(container)
    @container = container
    records.with_progress("Reslide Crawling #{container}").each do |r|
      Crawler::Slider.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/ubuntu-common/meta/main.yml:
--------------------------------------------------------------------------------
galaxy_info:
  author: AR
  description: common setup routine for ubuntu
  license: MIT
  min_ansible_version: 1.2
  platforms:
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/roles/elasticsearch/templates/elasticsearch.repo:
--------------------------------------------------------------------------------
[elasticsearch-{{ es_major_version }}]
name=Elasticsearch repository for {{ es_major_version }} packages
baseurl=http://packages.elastic.co/elasticsearch/{{ es_major_version }}/centos
gpgcheck=1
gpgkey=http://packages.elastic.co/GPG-KEY-elasticsearch
enabled=1
--------------------------------------------------------------------------------
/app/models/persist.rb:
--------------------------------------------------------------------------------
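# Minimal key/value adapter over the Cloud storage model. The commented-out
# lines in config/initializers/vcr.rb suggest it is intended as a custom VCR
# cassette persister (which, as far as the VCR API goes, only needs []/[]=
# style lookups), though that wiring is currently disabled.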
class Persist
  def initialize(cloud)
    @cloud = cloud
  end

  def [](key)
    @cloud.get(key).try(:body)
  end

  def []=(key, content)
    @cloud.sync(key, content)
  end

  def exists?(key)
    @cloud.head(key)
  end
end
--------------------------------------------------------------------------------
/app/workers/syncer/resocializer.rb:
--------------------------------------------------------------------------------
class Syncer::Resocializer < Syncer::Base
  def perform(container)
    @container = container
    records.with_progress("Resocialize Crawling #{container}").each do |r|
      Crawler::Socializer.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/handlers/main.yml:
--------------------------------------------------------------------------------

- name: restart elasticsearch
  service: name={{instance_init_script | basename}} state=restarted enabled=yes
  when: es_restart_on_change and es_start_service and not elasticsearch_started.changed and ((plugin_installed is defined and plugin_installed.changed) or elasticsearch_install.changed)
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/redhat.yml:
--------------------------------------------------------------------------------
---
- name: Update/Install nginx
  yum: name=nginx state=latest

- name: Generate site configurations
  template: src=nginx-site.j2 dest=/etc/nginx/conf.d/{{ item.name }}.conf owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites
--------------------------------------------------------------------------------
/roles/logrotate/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Nick Hammond
  description: Role to configure logrotate scripts
  license: BSD
  min_ansible_version: 1.5
  platforms:
    - name: Ubuntu
      versions:
        - lucid
        - precise
        - trusty
  categories:
    - system
dependencies: []
--------------------------------------------------------------------------------
/app/workers/syncer/respider.rb:
--------------------------------------------------------------------------------
class Syncer::Respider < Syncer::Base
  def perform(container, spider_type = 'Spider')
    @container = container
    records.with_progress("Respider Crawling #{container}").each do |r|
      ('Crawler::' + spider_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/app/workers/recorder/collector.rb:
--------------------------------------------------------------------------------
class Recorder::Collector < Recorder::Base
  def perform(container)
    @container = container
    # collections.each do |r|
    #   ap record(r.key)
    # end
  end
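
  # Collection blobs live under keys prefixed with "_", the same convention
  # that Mapper::Base#indexes relies on; everything else is a plain record.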
  def collections
    @collections ||= cloud.files.map { |f| f if f.key.starts_with? '_' }.compact
  end
end
--------------------------------------------------------------------------------
/app/workers/recorder/base.rb:
--------------------------------------------------------------------------------
class Recorder::Base < Worker
  sidekiq_options queue: :recorder,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def cloud
    @cloud ||= Cloud.new(@container)
  end
end
--------------------------------------------------------------------------------
/app/workers/syncer/rescrimper.rb:
--------------------------------------------------------------------------------
class Syncer::Rescrimper < Syncer::Base
  def perform(container, scrimper_type = 'Scrimper')
    @container = container
    records.with_progress("Rescrimp Crawling #{container}").each do |r|
      ('Crawler::' + scrimper_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url)
    end
  end
end
--------------------------------------------------------------------------------
/roles/ruby-install/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Andrew Angelo Ang
  description: "Installs ruby-install."
  company: InnoHub, Inc.
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Ubuntu
      versions:
        - trusty
  categories:
    - development
    - system
dependencies: []
--------------------------------------------------------------------------------
/app/workers/scheduler/reindexer.rb:
--------------------------------------------------------------------------------
# class Scheduler::Reindexer < Scheduler::Base
#   recurrence { daily }

#   def perform
#     containers = Rails.configuration.config[:admin][:api_containers]
#     if containers.any?
#       containers.each {|c| Syncer::Reindexer.perform_async c }
#     end if Rails.env.production?
#   end
# end
--------------------------------------------------------------------------------
/config/initializers/vcr.rb:
--------------------------------------------------------------------------------
VCR.configure do |c|
  c.cassette_library_dir = 'tmp/cache'
  c.hook_into :typhoeus
  c.default_cassette_options = { match_requests_on: [:uri, :body, :method] }
  # c.cassette_persisters[:cloud] = Persist.new(Cloud.new('semanticvcr'))
  # c.default_cassette_options[:persist_with] = :cloud
  c.allow_http_connections_when_no_cassette = true
end
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/helpers/serverspec/spec_helper.rb:
--------------------------------------------------------------------------------
require 'serverspec'
set :backend, :exec

require 'rspec/retry'

RSpec.configure do |config|
  # show retry status in spec process
  config.verbose_retry = true
  # show exception that triggers a retry if verbose_retry is set to true
  config.display_try_failure_messages = true
end
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-RedHat-version-lock.yml:
--------------------------------------------------------------------------------
---
- name: RedHat - install yum-version-lock
  yum: name=yum-plugin-versionlock state=present update_cache=yes
- name: RedHat - lock elasticsearch version
  shell: yum versionlock delete 0:elasticsearch* ; yum versionlock add elasticsearch{% if es_version is defined and es_version != "" %}-{{ es_version }}{% endif %}
--------------------------------------------------------------------------------
/roles/imagemagick/tasks/main.yml:
--------------------------------------------------------------------------------
---

- name: Install ImageMagick (Debian)
  apt: pkg=imagemagick
  when: ansible_os_family == "Debian"
  become: true

- name: Install ImageMagick (Red Hat)
  yum: pkg={{ item }} state=installed
  when: ansible_os_family == "RedHat"
  become: true
  with_items:
    - ImageMagick
    - ImageMagick-devel
--------------------------------------------------------------------------------
/app/workers/syncer/resampler.rb:
--------------------------------------------------------------------------------
class Syncer::Resampler < Syncer::Base
  def perform(container, sampler_type = 'Sampler', scrimper_type = 'Scrimper')
    @container = container
    records.with_progress("Resample Crawling #{container}").each do |r|
      ('Crawler::' + sampler_type).constantize.perform_async record(r.key.gsub('.json','')).try(:url), scrimper_type
    end
  end
end
--------------------------------------------------------------------------------
/roles/chruby/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Andrew Angelo Ang
  description: Installs chruby on Ubuntu systems.
  company: InnoHub, Inc.
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - development
    - system
dependencies: []
--------------------------------------------------------------------------------
/roles/chruby/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible==1.7.1
script:
  - echo localhost > inventory
  - ansible-playbook --syntax-check -i inventory test.yml
  - ansible-playbook -i inventory test.yml --connection=local --sudo
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/elasticsearch-version-lock.yml:
--------------------------------------------------------------------------------
---
# Trigger Debian section
- name: Include Debian specific Elasticsearch
  include: elasticsearch-Debian-version-lock.yml
  when: ansible_os_family == 'Debian'

# Trigger Redhat section
- name: Include RedHat specific Elasticsearch
  include: elasticsearch-RedHat-version-lock.yml
  when: ansible_os_family == 'RedHat'
--------------------------------------------------------------------------------
/roles/ruby-install/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible==1.7.1
script:
  - echo localhost > inventory
  - ansible-playbook --syntax-check -i inventory test.yml
  - ansible-playbook -i inventory test.yml --connection=local --sudo
--------------------------------------------------------------------------------
/config/initializers/assets.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# Version of your assets, change this if you want to expire all your assets.
Rails.application.config.assets.version = '1.0'

# Precompile additional assets.
# application.js, application.css, and all non-JS/CSS in app/assets folder are already added.
# Rails.application.config.assets.precompile += %w( search.js )
--------------------------------------------------------------------------------
/roles/elasticsearch/test/integration/package.yml:
--------------------------------------------------------------------------------
---
- name: Elasticsearch Package tests
  hosts: localhost
  roles:
    - { role: elasticsearch, es_config: { "http.port": 9200, "transport.tcp.port":9300, discovery.zen.ping.unicast.hosts: "localhost:9300" }, es_instance_name: "node1" }
  vars:
    es_scripts: true
    es_templates: true
    #Plugins installed for this test are specified in .kitchen.yml under suite
--------------------------------------------------------------------------------
/roles/logrotate/templates/logrotate.d.j2:
--------------------------------------------------------------------------------
# {{ ansible_managed }}

"{{ item.path }}" {
  {% if item.options is defined -%}
  {% for option in item.options -%}
  {{ option }}
  {% endfor -%}
  {% endif %}
  {%- if item.scripts is defined -%}
  {%- for name, script in item.scripts.iteritems() -%}
  {{ name }}
    {{ script }}
  endscript
  {% endfor -%}
  {% endif -%}
}
--------------------------------------------------------------------------------
/roles/imagemagick/meta/main.yml:
--------------------------------------------------------------------------------
---

dependencies: []

galaxy_info:
  author: danbohea
  description: ImageMagick for Linux
  license: MIT
  min_ansible_version: 1.4
  platforms:
    - name: Debian
      versions:
        - jessie
        - wheezy
    - name: EL
      versions:
        - 6
    - name: Ubuntu
      versions:
        - precise
        - trusty
  categories:
    - web
--------------------------------------------------------------------------------
/app/workers/mapper/id_availability.rb:
--------------------------------------------------------------------------------
class Mapper::IdAvailability < Mapper::Base
  def perform(container, id)
    @container = container
    types = container.split('-').last.pluralize.gsub(':', '')
    index = Rails.env + '-' + types
    cloud.head(id + '.json').try(:destroy)
    Elasticsearch::Model.client.delete index: index, type: container, id: id
    Elasticsearch::Model.client.indices.refresh index: index
  end
end
--------------------------------------------------------------------------------
/config/initializers/backtrace_silencers.rb:
--------------------------------------------------------------------------------
# Be sure to restart your server when you modify this file.

# You can add backtrace silencers for libraries that you're using but don't wish to see in your backtraces.
# Rails.backtrace_cleaner.add_silencer { |line| line =~ /my_noisy_library/ }

# You can also remove all the silencers if you're trying to debug a problem that might stem from framework code.
# Rails.backtrace_cleaner.remove_silencers!
--------------------------------------------------------------------------------
/roles/elasticsearch/tasks/java.yml:
--------------------------------------------------------------------------------
---

- set_fact: java_state="present"

- set_fact: java_state="latest"
  when: update_java == true

- name: RedHat - Ensure Java is installed
  yum: name={{ java }} state={{java_state}}
  when: ansible_os_family == 'RedHat'

- name: Debian - Ensure Java is installed
  apt: name={{ java }} state={{java_state}} update_cache=yes force=yes
  when: ansible_os_family == 'Debian'
--------------------------------------------------------------------------------
/roles/logrotate/tasks/main.yml:
--------------------------------------------------------------------------------
---
- name: nickhammond.logrotate | Install logrotate
  action: "{{ansible_pkg_mgr}} pkg=logrotate state=present"
  when: logrotate_scripts is defined and logrotate_scripts | length > 0

- name: nickhammond.logrotate | Setup logrotate.d scripts
  template:
    src: logrotate.d.j2
    dest: "{{ logrotate_conf_dir }}{{ item.name }}"
  with_items: logrotate_scripts
  when: logrotate_scripts is defined
--------------------------------------------------------------------------------
/roles/build-ruby/tasks/main.yml:
--------------------------------------------------------------------------------
---
# tasks file for build-ruby


#- command: /usr/bin/test -e /opt/rubies/ruby-{{version}}
#  register: ruby_installed
#  ignore_errors: True

- name: Install ruby-{{version}}
  shell: ruby-install --install-dir /opt/rubies/ruby-{{version}} --no-reinstall ruby {{version}}
  #when: ruby_installed|failed

- name: Install bundler
  shell: chruby-exec ruby {{version}} -- gem install bundler --conservative
--------------------------------------------------------------------------------
/roles/logrotate/.travis.yml:
--------------------------------------------------------------------------------
---
language: python
python: "2.7"
before_install:
  - sudo apt-get update -qq
  - sudo apt-get install -qq python-apt python-pycurl
install:
  - pip install ansible
script:
  - "printf '[defaults]\nroles_path = ../' > ansible.cfg"
  - ansible-playbook -i tests/inventory --syntax-check tests/test.yml
  - ansible-playbook -i tests/inventory --connection=local --sudo -vvvv tests/test.yml
notifications:
  email: false
--------------------------------------------------------------------------------
/roles/ubuntu-common/README.md:
--------------------------------------------------------------------------------
# ubuntu-common (ansible role)

This role intends to provide a common setup routine for Ubuntu LTS.

## Requirements

This role requires Ansible 1.2 or higher, and platform requirements are listed in the metadata file.

## Role Variables

TODO

## Examples

TODO

## Dependencies

None

## License

MIT

## Author Information

AR
--------------------------------------------------------------------------------
/app/workers/recorder/uploader.rb:
--------------------------------------------------------------------------------
class Recorder::Uploader < Recorder::Base
  def perform(metadata = {})
    if url = metadata['url']
      uploader = Record::Upload.new(url)
      uploader.id = metadata['id']
      uploader.metadata = metadata
      hash = uploader.sync

      Mapper::Indexer.perform_async uploader.container,
                                    uploader.id,
                                    hash
    end unless metadata.nil?
  end
end
--------------------------------------------------------------------------------
/roles/nginx-unicorn/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Alexandros Giouzenis
  description: Nginx installation with Unicorn integration
  license: MIT
  min_ansible_version: 1.2
  platforms:
    - name: EL
      versions:
        - all
    - name: Fedora
      versions:
        - all
    - name: Ubuntu
      versions:
        - all
    - name: Debian
      versions:
        - all
  categories:
    - system
    - web
dependencies: []
--------------------------------------------------------------------------------
/app/workers/syncer/base.rb:
--------------------------------------------------------------------------------
class Syncer::Base < Worker
  sidekiq_options queue: :syncer,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def cloud
    @cloud ||= Cloud.new(@container)
  end

  def records
    @records ||= cloud.files
  end

  def record(record)
    Record::Base.new(@container, record)
  end
end
--------------------------------------------------------------------------------
/roles/logrotate/tests/test.yml:
--------------------------------------------------------------------------------
---
- hosts: all
  sudo: True
  roles:
    - ansible-logrotate
    - role: ansible-logrotate
      logrotate_scripts:
        - name: nginx-options
          path: /var/log/nginx/options.log
          options:
            - daily

    - role: ansible-logrotate
      logrotate_scripts:
        - name: nginx-scripts
          path: /var/log/nginx/scripts.log
          scripts:
            postrotate: "echo test"
--------------------------------------------------------------------------------
/app/helpers/counts_helper.rb:
--------------------------------------------------------------------------------
module CountsHelper
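  # Abbreviates counters for display. A few sample outputs of the
  # implementation below:
  #   pretty_integer(nil)        #=> "0"
  #   pretty_integer(950)        #=> "950"
  #   pretty_integer(1_500)      #=> "1.5 K"
  #   pretty_integer(2_000_000)  #=> "2 M"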
  def pretty_integer(integer)
    return '0' if integer.nil? || integer == 0
    if integer > 999 && integer <= 999_999
      ('%.1f K' % (integer / 1000.0)).sub('.0', '')
    elsif integer > 999_999 && integer <= 999_999_999
      ('%.1f M' % (integer / 1_000_000.0)).sub('.0', '')
    elsif integer > 999_999_999
      ('%.1f B' % (integer / 1_000_000_000.0)).sub('.0', '')
    else
      integer.to_s
    end
  end
end
--------------------------------------------------------------------------------
/roles/letsencrypt/meta/main.yml:
--------------------------------------------------------------------------------
---
galaxy_info:
  author: Finn Herzfeld
  description: Generate TLS certificates and get them signed by Let's Encrypt.
  issue_tracker_url: https://github.com/thefinn93/ansible-letsencrypt/issues
  license: GPL
  min_ansible_version: 1.2

  platforms:
    - name: Ubuntu
      versions:
        - trusty
    - name: Debian
      versions:
        - jessie

  categories:
    - networking
    - web

dependencies: []
--------------------------------------------------------------------------------
/app/workers/crawler/socializer.rb:
--------------------------------------------------------------------------------
class Crawler::Socializer < Crawler::Sampler
  sidekiq_options queue: :socializer,
                  retry: true,
                  backtrace: true,
                  unique: :until_and_while_executing,
                  unique_expiration: 120 * 60

  def upload
    scraper.clear
    @parsed = parsed.merge(parser.save) if parser.build
    if parsed.presence && parsed['type']
      Recorder::Uploader.perform_async parsed.merge(social.shares)
    end
  end
end
--------------------------------------------------------------------------------
/roles/elasticsearch/meta/main.yml:
--------------------------------------------------------------------------------
---

allow_duplicates: yes

galaxy_info:
  author: Robin Clarke, Jakob Reiter, Dale McDiarmid
  description: Elasticsearch for Linux
  company: "Elastic.co"
  license: "license (Apache)"
  # Require 1.6 for apt deb install
  min_ansible_version: 1.6
  platforms:
    - name: EL
      versions:
        - 6
        - 7
    - name: Debian
      versions:
        - all
    - name: Ubuntu
      versions:
        - all
  categories:
    - system

dependencies: []
--------------------------------------------------------------------------------
/roles/nginx-unicorn/tasks/debian.yml:
--------------------------------------------------------------------------------
---
- name: Update/Install nginx
  apt: name=nginx state=latest

- name: Generate site configurations
  template: src=nginx-site.j2 dest=/etc/nginx/sites-available/{{ item.name }} owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites

- name: Enable sites
  file: src=/etc/nginx/sites-available/{{ item.name }} dest=/etc/nginx/sites-enabled/{{ item.name }} state=link owner=root group=root mode=0644
  notify:
    - reload nginx
  with_items: nginx_sites
--------------------------------------------------------------------------------
/bin/spring:
--------------------------------------------------------------------------------
#!/usr/bin/env ruby

# This file loads spring without using Bundler, in order to be fast
# It gets overwritten when you run the `spring binstub` command

unless defined?(Spring)
  require 'rubygems'
  require 'bundler'

  if match = Bundler.default_lockfile.read.match(/^GEM$.*?^    spring \((.*?)\)$.*?^$/m)
Bundler.default_lockfile.read.match(/^GEM$.*?^  spring \((.*?)\)$.*?^$/m) 11 |   ENV['GEM_PATH'] = ([Bundler.bundle_path.to_s] + Gem.path).join(File::PATH_SEPARATOR) 12 |   ENV['GEM_HOME'] = '' 13 |   Gem.paths = ENV 14 | 15 |   gem 'spring', match[1] 16 |   require 'spring/binstub' 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /config/initializers/wrap_parameters.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # This file contains settings for ActionController::ParamsWrapper which 4 | # is enabled by default. 5 | 6 | # Enable parameter wrapping for JSON. You can disable this by setting :format to an empty array. 7 | ActiveSupport.on_load(:action_controller) do 8 |   wrap_parameters format: [:json] if respond_to?(:wrap_parameters) 9 | end 10 | 11 | # To enable root element in JSON for ActiveRecord objects. 12 | # ActiveSupport.on_load(:active_record) do 13 | #   self.include_root_in_json = true 14 | # end 15 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-optional-user.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Add the elasticsearch user before installing from packages. 3 | - name: Ensure optional elasticsearch group is created with the correct id. 4 |   group: 5 |     state: present 6 |     name: "{{ es_group }}" 7 |     system: yes 8 |     gid: "{{ es_group_id }}" 9 | 10 | - name: Ensure optional elasticsearch user is created with the correct id. 11 |   user: 12 |     state: present 13 |     name: "{{ es_user }}" 14 |     comment: elasticsearch system user 15 |     system: yes 16 |     createhome: no 17 |     uid: "{{ es_user_id }}" 18 |     group: "{{ es_group }}" 19 | -------------------------------------------------------------------------------- /roles/letsencrypt/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | letsencrypt_src_directory: /usr/local/share/letsencrypt 3 | letsencrypt_venv: "{{ letsencrypt_src_directory }}/env" 4 | letsencrypt_cert_domains: 5 |  - "{{ ansible_fqdn }}" 6 | letsencrypt_webroot_path: /var/www 7 | letsencrypt_authenticator: webroot 8 | letsencrypt_email: "webmaster@{{ ansible_domain }}" 9 | letsencrypt_command: "{{ letsencrypt_venv }}/bin/letsencrypt --agree-tos --text {% for domain in letsencrypt_cert_domains %}-d {{ domain }} {% endfor %}--email {{ letsencrypt_email }} {% if letsencrypt_server is defined %}--server {{ letsencrypt_server }}{% endif %} --expand" 10 | -------------------------------------------------------------------------------- /run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'trollop' 4 | require 'ap' 5 | 6 | opts = Trollop.options do 7 |   banner 'Usage: ruby crawl.rb -u http://amazon.com [options]' 8 |   opt :host, 'Set the host api to grab from', type: :string 9 |   opt :urls, 'Set the URL you want to crawl', type: :strings 10 |   opt :api_key, 'Set the api key to grab from', type: :string 11 |   opt :depth, 'Set the depth you want to crawl', type: :integer 12 |   opt :file, 'Read URLs from a url.txt file in the data folder', default: false 13 |   opt :ua, 'Set a custom user agent. 
Ex: -ua Googlebot' 13 | end 14 | 15 | trap('INT') { exit } 16 | 17 | ap opts 18 | -------------------------------------------------------------------------------- /app/models/record/screenshot.rb: -------------------------------------------------------------------------------- 1 | class Record::Screenshot < Record::Base 2 |   def initialize(container, record, date) 3 |     if match = container.match(/(.+?)-/) 4 |       @container = match[1] + '-screenshots' 5 |     end 6 |     @record_id = record 7 |     @record = record + '/' + date + '.jpg' 8 |   end 9 | 10 |   def screenshot 11 |     @screenshot ||= cloud.get(@record) 12 |   end 13 | 14 |   def link 15 |     screenshot.url(Date.tomorrow.to_time.to_i) 16 |   end 17 | 18 |   def data 19 |     if screenshot 20 |       { id: @record_id, redirect_url: link } 21 |     else 22 |       { error: 'screenshot not available' } 23 |     end 24 |   end 25 | end 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files for more about ignoring files. 2 | # 3 | # If you find yourself ignoring temporary files generated by your text editor 4 | # or operating system, you probably want to add a global ignore instead: 5 | #   git config --global core.excludesfile '~/.gitignore_global' 6 | 7 | # Ignore bundler config. 8 | /.bundle 9 | 10 | # Ignore the default SQLite database. 11 | /db/*.sqlite3 12 | /db/*.sqlite3-journal 13 | 14 | # Ignore all logfiles and tempfiles. 15 | /log/*.log 16 | /tmp 17 | /app/sites 18 | /config/application.yml 19 | /config/sidekiq.yml 20 | /config/config.yml 21 | /config/secrets.yml 22 | /production 23 | -------------------------------------------------------------------------------- /app/workers/mapper/base.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Base < Worker 2 |   sidekiq_options queue: :mapper, 3 |                   retry: true, 4 |                   backtrace: true, 5 |                   unique: :until_and_while_executing, 6 |                   unique_expiration: 120 * 60 7 | 8 |   def cloud 9 |     @cloud ||= Cloud.new(@container) 10 |   end 11 | 12 |   def records 13 |     @records ||= cloud.files.reject { |f| f.key.starts_with? '_' } 14 |   end 15 | 16 |   def indexes 17 |     @indexes ||= cloud.files.select { |f| f.key.starts_with? '_' } 18 |   end 19 | 20 |   def record(record) 21 |     Record::Base.new(@container, record) 22 |   end 23 | end 24 | -------------------------------------------------------------------------------- /config/database.yml: -------------------------------------------------------------------------------- 1 | # SQLite version 3.x 2 | #   gem install sqlite3 3 | # 4 | #   Ensure the SQLite 3 gem is defined in your Gemfile 5 | #   gem 'sqlite3' 6 | # 7 | default: &default 8 |   adapter: sqlite3 9 |   pool: 5 10 |   timeout: 5000 11 | 12 | development: 13 |   <<: *default 14 |   database: db/development.sqlite3 15 | 16 | # Warning: The database defined as "test" will be erased and 17 | # re-generated from your development database when you run "rake". 18 | # Do not set this db to the same as development or production. 
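
A note on the `_`-prefix convention that `Mapper::Base` relies on above: files whose keys start with `_` are treated as index documents, and everything else as plain records. A minimal standalone sketch of that partition (the `OpenStruct` stand-ins are hypothetical; the real objects are Fog files, of which only `#key` matters here):

```ruby
require 'ostruct'

# Stand-ins for Fog cloud files; only #key is relevant to the split.
files = [OpenStruct.new(key: '_catalog.json'),
         OpenStruct.new(key: 'abc123.json'),
         OpenStruct.new(key: 'def456.json')]

indexes, records = files.partition { |f| f.key.start_with?('_') }

records.map(&:key) # => ["abc123.json", "def456.json"]
indexes.map(&:key) # => ["_catalog.json"]
```
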
19 | test: 20 | <<: *default 21 | database: db/test.sqlite3 22 | 23 | production: 24 | <<: *default 25 | database: db/production.sqlite3 26 | -------------------------------------------------------------------------------- /roles/elasticsearch/defaults/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | es_major_version: "2.x" 3 | es_version: "2.2.0" 4 | es_version_lock: false 5 | es_use_repository: true 6 | es_start_service: true 7 | update_java: false 8 | es_restart_on_change: true 9 | es_plugins_reinstall: false 10 | es_scripts: false 11 | es_templates: false 12 | es_user: elasticsearch 13 | es_group: elasticsearch 14 | es_config: {} 15 | #Need to provide default directories 16 | es_pid_dir: "/var/run/elasticsearch" 17 | es_data_dirs: "/var/lib/elasticsearch" 18 | es_log_dir: "/var/log/elasticsearch" 19 | es_work_dir: "/tmp/elasticsearch" 20 | es_plugin_dir: "/usr/share/elasticsearch/plugins" 21 | es_max_open_files: 65536 22 | 23 | -------------------------------------------------------------------------------- /roles/chruby/README.md: -------------------------------------------------------------------------------- 1 | # InnoHub Ansible : chruby [![Build Status](https://travis-ci.org/innohub-ansible/chruby.svg?branch=master)](https://travis-ci.org/innohub-ansible/chruby) 2 | 3 | Installs chruby. 4 | 5 | Requirements 6 | ------------ 7 | 8 | Tested on Ubuntu 12.04 and 14.04 only. 9 | 10 | Role Variables 11 | -------------- 12 | 13 | chruby_version : defaults to '0.3.9' 14 | 15 | Example Playbook 16 | ---------------- 17 | 18 | Example Playbook: 19 | 20 | - hosts: servers 21 | roles: 22 | - { role: innohub-ansible.chruby } 23 | 24 | Example Role: 25 | 26 | dependencies: 27 | - { role: chruby } 28 | 29 | License 30 | ------- 31 | 32 | MIT 33 | -------------------------------------------------------------------------------- /roles/imagemagick/README.md: -------------------------------------------------------------------------------- 1 | # Ansible Role: ImageMagick 2 | 3 | An Ansible role that installs [ImageMagick](http://www.imagemagick.org/script/index.php) on RHEL/CentOS and Debian/Ubuntu. 4 | 5 | 6 | ## Requirements 7 | 8 | None. 9 | 10 | 11 | ## Role Variables 12 | 13 | None. 14 | 15 | 16 | ## Dependencies 17 | 18 | None. 19 | 20 | 21 | ## Example Playbook 22 | 23 | ``` 24 | - hosts: servers 25 | roles: 26 | - { role: hashbangcode.imagemagick } 27 | ``` 28 | 29 | ## License 30 | 31 | MIT 32 | 33 | 34 | ## Author Information 35 | 36 | This role was created by [Dan Bohea](http://bohea.co.uk) originally for use with [Vlad](https://github.com/hashbangcode/vlad). 37 | -------------------------------------------------------------------------------- /app/controllers/v1/batch_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::BatchController < V1::AccessController 2 | def index 3 | container = Api::V1.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 
7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.batch(new_params[:batch], default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) } } 11 | status = 200 12 | end 13 | respond_to do |format| 14 | format.json { json_response(status, results) } 15 | format.xml { xml_response(status, results) } 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /app/controllers/v1/trends_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::TrendsController < V1::AccessController 2 | def index 3 | container = Record::Trends.new(params[:container]) 4 | if params[:array].empty? 5 | results = errors_response('no results found') 6 | status = 404 7 | else 8 | results = { results: container.sort(params[:array].split(','), default_options.merge(social: params[:social] || true)).map { |h| Record::Addons.append(h) }, 9 | pagination: pagination(container.total) } 10 | status = 200 11 | end 12 | respond_to do |format| 13 | format.json { json_response(status, results) } 14 | format.xml { xml_response(status, results) } 15 | end 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /config/application.rb: -------------------------------------------------------------------------------- 1 | require File.expand_path('../boot', __FILE__) 2 | 3 | require 'rails/all' 4 | 5 | # Require the gems listed in Gemfile, including any gems 6 | # you've limited to :test, :development, or :production. 7 | Bundler.require(*Rails.groups) 8 | 9 | module Crawler 10 | class Application < Rails::Application 11 | config.middleware.insert_before 0, Rack::Health, :path => '/elb-status' 12 | config.config = config_for(:config).deep_symbolize_keys! 13 | require_relative '../app/sites/initializer.rb' if File.exists?('../app/sites/initializer.rb') 14 | config.autoload_paths += Dir[Rails.root.join('app', 'sites', '{**}')] 15 | config.autoload_paths += %W(#{config.root}/helpers) 16 | end 17 | end 18 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/elasticsearch.yml.j2: -------------------------------------------------------------------------------- 1 | 2 | {% if es_config %} 3 | {{ es_config | to_nice_yaml }} 4 | {% endif %} 5 | 6 | {% if es_config['cluster.name'] is not defined %} 7 | cluster.name: elasticsearch 8 | {% endif %} 9 | 10 | {% if es_config['node.name'] is not defined %} 11 | node.name: {{inventory_hostname}}-{{es_instance_name}} 12 | {% endif %} 13 | 14 | #################################### Paths #################################### 15 | 16 | # Path to directory containing configuration (this file and logging.yml): 17 | path.conf: {{ conf_dir }} 18 | 19 | path.data: {{ data_dirs | array_to_str }} 20 | 21 | path.work: {{ work_dir }} 22 | 23 | path.logs: {{ log_dir }} 24 | 25 | path.plugins: {{ plugin_dir }} -------------------------------------------------------------------------------- /config.ru: -------------------------------------------------------------------------------- 1 | # This file is used by Rack-based servers to start the application. 
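
The V1 controllers above all follow the same shape: build a result set, pick an HTTP status, then render it as JSON or XML. The helpers they call (`errors_response`, `json_response`, `xml_response`, `pagination`) live in the unshown `V1::AccessController`, so the sketch below is only an assumption about their likely shape, not the actual implementation:

```ruby
# Hypothetical sketch of the shared response helpers assumed by the
# V1 controllers; the real versions live in V1::AccessController.
class V1::AccessController < ApplicationController
  private

  def errors_response(message)
    { errors: [message] } # always an array, so clients can iterate
  end

  def json_response(status, results)
    render json: results, status: status
  end

  def xml_response(status, results)
    render xml: results, status: status
  end

  def pagination(total)
    { total: total, page: (params[:page] || 1).to_i }
  end
end
```
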
2 | 3 | # --- Start of unicorn worker killer code --- 4 | 5 | if ENV['RAILS_ENV'] == 'production' 6 | require 'unicorn/worker_killer' 7 | 8 | max_request_min = 500 9 | max_request_max = 600 10 | 11 | # Max requests per worker 12 | use Unicorn::WorkerKiller::MaxRequests, max_request_min, max_request_max 13 | 14 | oom_min = (240) * (1024**2) 15 | oom_max = (260) * (1024**2) 16 | 17 | # Max memory size (RSS) per worker 18 | use Unicorn::WorkerKiller::Oom, oom_min, oom_max 19 | end 20 | 21 | # --- End of unicorn worker killer code --- 22 | 23 | require ::File.expand_path('../config/environment', __FILE__) 24 | run Rails.application 25 | -------------------------------------------------------------------------------- /config/initializers/inflections.rb: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Add new inflection rules using the following format. Inflections 4 | # are locale specific, and you may define rules for as many different 5 | # locales as you wish. All of these examples are active by default: 6 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 7 | # inflect.plural /^(ox)$/i, '\1en' 8 | # inflect.singular /^(ox)en/i, '\1' 9 | # inflect.irregular 'person', 'people' 10 | # inflect.uncountable %w( fish sheep ) 11 | # end 12 | 13 | # These inflection rules are supported but not enabled by default: 14 | # ActiveSupport::Inflector.inflections(:en) do |inflect| 15 | # inflect.acronym 'RESTful' 16 | # end 17 | -------------------------------------------------------------------------------- /config/locales/en.yml: -------------------------------------------------------------------------------- 1 | # Files in the config/locales directory are used for internationalization 2 | # and are automatically loaded by Rails. If you want to use locales other 3 | # than English, add the necessary files in this directory. 4 | # 5 | # To use the locales, use `I18n.t`: 6 | # 7 | # I18n.t 'hello' 8 | # 9 | # In views, this is aliased to just `t`: 10 | # 11 | # <%= t('hello') %> 12 | # 13 | # To use a different locale, set it with `I18n.locale`: 14 | # 15 | # I18n.locale = :es 16 | # 17 | # This would use the information in config/locales/es.yml. 18 | # 19 | # To learn more, please read the Rails Internationalization guide 20 | # available at http://guides.rubyonrails.org/i18n.html. 
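
For context on the worker-killer block in `config.ru` above: each Unicorn worker draws its own limit from the configured range, so restarts are staggered instead of every worker dying at the same request count. The arithmetic, spelled out as a sketch:

```ruby
# Thresholds from config.ru, spelled out. Each worker samples its own
# limit from the range, which staggers restarts across workers.
max_request_min = 500
max_request_max = 600
request_limit   = rand(max_request_min..max_request_max)

oom_min = 240 * (1024**2) # 240 MiB => 251_658_240 bytes
oom_max = 260 * (1024**2) # 260 MiB => 272_629_760 bytes

puts "restart after #{request_limit} requests or ~#{oom_max} bytes RSS"
```
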
21 | 22 | en: 23 | hello: "Hello world" 24 | -------------------------------------------------------------------------------- /roles/ubuntu-common/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: set locale 3 | template: src=locale 4 | dest=/etc/default/locale 5 | 6 | - name: set timezone 7 | file: src=/usr/share/zoneinfo/{{ common_timezone }} 8 | dest=/etc/localtime 9 | force=yes 10 | state=link 11 | 12 | - name: update source.list 13 | template: src=sources.list 14 | dest=/etc/apt/sources.list 15 | 16 | - name: update apt cache 17 | apt: update_cache=yes 18 | cache_valid_time={{ common_apt_cache_time }} 19 | 20 | - name: install common packages 21 | apt: pkg={{ item }} 22 | state=present 23 | with_items: 24 | - build-essential 25 | - git 26 | - sqlite 27 | - libsqlite3-dev 28 | - libmagickwand-dev 29 | -------------------------------------------------------------------------------- /app/models/record/export.rb: -------------------------------------------------------------------------------- 1 | class Record::Export 2 | def initialize(container, headers = ['name', 'url']) 3 | @container = container 4 | @headers = headers 5 | end 6 | 7 | def csv 8 | require 'csv' 9 | CSV.open('test.csv', 'w') do |csv| 10 | indexes.with_progress.each do |index| 11 | id = index.key.gsub('.json','') 12 | hash = record(id).current_data({ crawl: false, social: false }) 13 | csv << hash.values 14 | end # of hsh's (rows) 15 | end # of csv open 16 | end 17 | 18 | def cloud 19 | @cloud ||= Cloud.new(@container) 20 | end 21 | 22 | def indexes 23 | @records ||= cloud.files 24 | end 25 | 26 | def record(record) 27 | Api::V1.new(@container, record) 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /app/workers/scheduler/clearer.rb: -------------------------------------------------------------------------------- 1 | class Scheduler::Clearer < Scheduler::Base 2 | recurrence { daily } 3 | 4 | def perform 5 | Redis::List.new('sampler_visited').clear 6 | Redis::List.new('sampler_one_visited').clear 7 | Redis::List.new('sampler_two_visited').clear 8 | Redis::List.new('sampler_three_visited').clear 9 | Redis::List.new('sampler_four_visited').clear 10 | Redis::List.new('sampler_five_visited').clear 11 | Redis::List.new('spider_visited').clear 12 | Redis::List.new('spider_one_visited').clear 13 | Redis::List.new('spider_two_visited').clear 14 | Redis::List.new('spider_three_visited').clear 15 | Redis::List.new('spider_four_visited').clear 16 | Redis::List.new('spider_five_visited').clear 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /app/controllers/v1/match_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::MatchController < V1::AccessController 2 | def index 3 | container = Record::Match.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 
7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.best(new_params, default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) }, 11 | pagination: pagination(container.total) } 12 | status = 200 13 | end 14 | respond_to do |format| 15 | format.json { json_response(status, results) } 16 | format.xml { xml_response(status, results) } 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /app/controllers/v1/search_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::SearchController < V1::AccessController 2 | def index 3 | container = Record::Search.new(params[:container]) 4 | new_params = params 5 | new_params.delete(:container) if params[:container] 6 | if new_params.empty? 7 | results = errors_response('no results found') 8 | status = 404 9 | else 10 | results = { results: container.search(new_params, default_options.merge(results: current_results)).map { |h| Record::Addons.append(h) }, 11 | pagination: pagination(container.total) } 12 | status = 200 13 | end 14 | respond_to do |format| 15 | format.json { json_response(status, results) } 16 | format.xml { xml_response(status, results) } 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /app/workers/syncer/reindexer.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Reindexer < Syncer::Base 2 | def perform(container) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | index = Rails.env + '-' + types 6 | Elasticsearch::Model.client.indices.refresh index: index 7 | records.with_progress("Remapping #{container}").each do |r| 8 | id = r.key.gsub('.json','') 9 | begin 10 | unless Elasticsearch::Model.client.exists? 
index: index, type: container, id: id 11 | # temp = Mapper::Indexer.new 12 | # temp.perform @container, id 13 | Mapper::Indexer.perform_async @container, id 14 | end 15 | rescue 16 | Mapper::Indexer.perform_async @container, id 17 | end 18 | end 19 | end 20 | end 21 | -------------------------------------------------------------------------------- /config/sidekiq-slim.yml.example: -------------------------------------------------------------------------------- 1 | --- 2 | :concurrency: 1 3 | :pidfile: tmp/pids/sidekiq.pid 4 | :queues: 5 | - [mapper, 4_000_000] 6 | - [recorder, 300_000] 7 | - [default, 20] 8 | - [sitemapper_one, 1] 9 | - [sitemapper_two, 1] 10 | - [sitemapper_three, 1] 11 | - [sitemapper_four, 1] 12 | - [sitemapper_five, 1] 13 | - [sitemapper_six, 1] 14 | - [sitemapper_seven, 1] 15 | - [sitemapper_eight, 1] 16 | - [sitemapper_nine, 1] 17 | - [sitemapper_ten, 1] 18 | - [sitemapper, 1] 19 | :limits: 20 | sitemapper_one: 1 21 | sitemapper_two: 1 22 | sitemapper_three: 1 23 | sitemapper_four: 1 24 | sitemapper_five: 1 25 | sitemapper_six: 1 26 | sitemapper_seven: 1 27 | sitemapper_eight: 1 28 | sitemapper_nine: 1 29 | sitemapper_ten: 1 30 | sitemapper: 1 31 | stretcher: 1 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-scripts.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: es_script_dir={{ es_conf_dir }}/{{es_instance_name}} 4 | tags: 5 | - always 6 | 7 | - set_fact: es_script_dir={{es_config['path.scripts']}} 8 | when: es_config['path.scripts'] is defined 9 | tags: 10 | - always 11 | 12 | - name: Create script dir 13 | file: state=directory path={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 14 | 15 | - name: Copy default scripts to elasticsearch 16 | copy: src=scripts dest={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 17 | when: es_scripts_fileglob is not defined 18 | 19 | - name: Copy scripts to elasticsearch 20 | copy: src={{ item }} dest={{ es_script_dir }} owner={{ es_user }} group={{ es_group }} 21 | with_fileglob: es_scripts_fileglob 22 | -------------------------------------------------------------------------------- /app/workers/syncer/mover.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Mover < Syncer::Base 2 | def perform(from_container, to_container) 3 | @container = from_container 4 | @to_container = to_container 5 | records.with_progress("Move from #{from_container} to #{to_container}").each do |r| 6 | from_record = record(r.key) 7 | old_data = from_record.data 8 | old_data['type'] = new_type 9 | to_record(r.key).data = old_data 10 | from_record.delete 11 | end 12 | end 13 | 14 | def new_type 15 | @new_type ||= @to_container.match(/-(.+)/)[1].try(:singularize).try(:capitalize) rescue nil 16 | end 17 | 18 | def to_record(new_record) 19 | Record::Base.new(@to_container, new_record) 20 | end 21 | 22 | def to_cloud 23 | @to_cloud ||= Cloud.new(@to_container) 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /app/workers/recorder/fixer.rb: -------------------------------------------------------------------------------- 1 | class Recorder::Fixer < Recorder::Base 2 | def perform(container = nil, record = nil) 3 | if container && record 4 | new_hash = Record::Base.new(container, record).data 5 | new_hash.delete('screenshot') 6 | 7 | if new_hash['price'] 8 | new_hash['price'] = 
new_hash['price'].delete_if {|k,v| v.include?('-') } 9 | new_hash.delete('price') if new_hash['price'].blank? 10 | end 11 | 12 | if new_hash['original_price'] 13 | new_hash['original_price'] = new_hash['original_price'].delete_if {|k,v| v.include?('-') } 14 | new_hash.delete('original_price') if new_hash['original_price'].blank? 15 | end 16 | 17 | Record::Base.new(container, record).data = new_hash 18 | Crawler::Scrimper.perform_async new_hash['url'] 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Test explicit setting of parameters and variables 3 | - name: Elasticsearch Config tests 4 | hosts: localhost 5 | roles: 6 | #expand to all available parameters 7 | - { role: elasticsearch, es_instance_name: "node1", es_data_dirs: ["/opt/elasticsearch/data-1","/opt/elasticsearch/data-2"], es_log_dir: "/opt/elasticsearch/logs", es_work_dir: "/opt/elasticsearch/temp", es_user_id: 333, es_group_id: 333, es_config: {node.name: "node1", cluster.name: "custom-cluster", discovery.zen.ping.unicast.hosts: "localhost:9301", http.port: 9201, transport.tcp.port: 9301, node.data: false, node.master: true, bootstrap.mlockall: true, discovery.zen.ping.multicast.enabled: false } } 8 | vars: 9 | es_scripts: false 10 | es_templates: false 11 | es_version_lock: false 12 | es_heap_size: 1g -------------------------------------------------------------------------------- /app/workers/mapper/cleaner.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Cleaner < Mapper::Base 2 | def perform(container, _standard = []) 3 | @container = container 4 | records.with_progress{"Cleaning #{container}"}.each do |r| 5 | data = record(r.key).data 6 | new_data = parse_record data 7 | record(r.key).data = new_data unless data == new_data 8 | end 9 | end 10 | 11 | def parse_record(data) 12 | if id = data['id'] 13 | data.each do |k, v| 14 | ap 'KEY' 15 | ap k 16 | if v.is_a?(Hash) 17 | v.each do |k2, v2| 18 | ap '!!!!!!!!!!!!INNER KEY' 19 | ap k2 20 | ap '!!!!!!!!!!!!INNER VALUE' 21 | ap v2 22 | end 23 | else 24 | ap 'VALUE' 25 | ap v 26 | end 27 | end 28 | end 29 | data 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: check-parameters 3 | include: checkParameters.yml 4 | tags: 5 | - check 6 | - name: os-specific vars 7 | include_vars: "{{ansible_os_family}}.yml" 8 | tags: 9 | - always 10 | - include: java.yml 11 | tags: 12 | - java 13 | - include: elasticsearch.yml 14 | tags: 15 | - install 16 | - include: elasticsearch-config.yml 17 | tags: 18 | - config 19 | - include: elasticsearch-scripts.yml 20 | when: es_scripts 21 | tags: 22 | - scripts 23 | - include: elasticsearch-plugins.yml 24 | when: es_plugins is defined or es_plugins_reinstall 25 | tags: 26 | - plugins 27 | - include: elasticsearch-service.yml 28 | tags: 29 | - service 30 | - include: elasticsearch-templates.yml 31 | when: es_templates 32 | tags: 33 | - templates 34 | - meta: flush_handlers 35 | -------------------------------------------------------------------------------- /app/workers/syncer/rescreener.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Rescreener < 
Syncer::Base 2 | def perform(container, cleanup = false) 3 | @container = container 4 | records.with_progress("Rescreen Crawling #{container}").each do |r| 5 | record(r.key).screenshots.each do |key, value| 6 | Crawler::Screener.perform_async value, key unless files.include? key 7 | files.delete(key) 8 | end 9 | end if screenshots_cloud 10 | files.each { |f| screenshots_cloud.head(f).try(:destroy) } if cleanup 11 | end 12 | 13 | def screenshots_container 14 | @screenshots_container ||= @container.match(/(.+)-/)[1] + '-screenshots' rescue nil 15 | end 16 | 17 | def screenshots_cloud 18 | @screenshots_cloud ||= Cloud.new(screenshots_container) rescue nil 19 | end 20 | 21 | def files 22 | @files ||= screenshots_cloud.files.map(&:key) 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /app/models/record/addons.rb: -------------------------------------------------------------------------------- 1 | class Record::Addons 2 | def self.append hash 3 | if appends = Rails.configuration.config[:admin][:append][hash[:container].try(:to_sym)] 4 | appends.each do |key, value| 5 | hash[key] = hash[key] + value if hash[key] 6 | end 7 | end 8 | 9 | if inserts = Rails.configuration.config[:admin][:insert][hash[:container].try(:to_sym)] 10 | inserts.each do |key, value| 11 | if hash[key] && key == :url 12 | hash[key] = value + CGI.escape(hash[key]) 13 | elsif hash[key] 14 | hash[key] = value + hash[key] 15 | end 16 | end 17 | end 18 | 19 | if addons = Rails.configuration.config[:admin][:addons][hash[:container].try(:to_sym)] 20 | addons.each do |key, value| 21 | hash[key] = value 22 | end 23 | end 24 | 25 | return hash 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /app/workers/crawler/screener.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Screener < Crawler::Base 2 | sidekiq_options queue: :screener, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, path) 9 | return if url.nil? 
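
To make `Record::Addons.append` above concrete, here is a worked example with an invented config (the `:append`/`:insert`/`:addons` values are illustrative only; the real ones come from `config.yml`):

```ruby
require 'cgi'

# Hypothetical admin config for a 'shop-offers' container.
append = { name: ' | Shop' }               # concatenated after the value
insert = { url: 'https://r.example/?u=' }  # prefixed; :url gets CGI-escaped
addons = { source: 'crawler' }             # set unconditionally

hash = { container: 'shop-offers', name: 'Widget', url: 'http://shop.test/w' }

append.each { |k, v| hash[k] += v if hash[k] }
insert.each do |k, v|
  next unless hash[k]
  hash[k] = k == :url ? v + CGI.escape(hash[k]) : v + hash[k]
end
addons.each { |k, v| hash[k] = v }

hash[:name] # => "Widget | Shop"
hash[:url]  # => "https://r.example/?u=http%3A%2F%2Fshop.test%2Fw"
```
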
10 | @url = url 11 | capturer.relative_path = path 12 | capturer.screen 13 | rescue EOFError => e 14 | Crawler::Screener.perform_async url, path 15 | rescue Net::ReadTimeout => e 16 | Crawler::Screener.perform_async url, path 17 | rescue ChildProcess::TimeoutError => e 18 | Crawler::Screener.perform_async url, path 19 | rescue Selenium::WebDriver::Error::WebDriverError => e 20 | Crawler::Screener.perform_async url, path 21 | end 22 | 23 | def capturer 24 | @capturer ||= Crawl::Capture.new(@url) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /roles/elasticsearch/filter_plugins/custom.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dale mcdiarmid' 2 | 3 | import re 4 | 5 | def modify_list(values=[], pattern='', replacement='', ignorecase=False): 6 | ''' Perform a `re.sub` on every item in the list''' 7 | if ignorecase: 8 | flags = re.I 9 | else: 10 | flags = 0 11 | _re = re.compile(pattern, flags=flags) 12 | return [_re.sub(replacement, value) for value in values] 13 | 14 | def append_to_list(values=[], suffix=''): 15 | if isinstance(values, basestring): 16 | values = values.split(',') 17 | return [str(value+suffix) for value in values] 18 | 19 | def array_to_str(values=[],separator=','): 20 | return separator.join(values) 21 | 22 | class FilterModule(object): 23 | def filters(self): 24 | return {'modify_list': modify_list, 25 | 'append_to_list':append_to_list, 26 | 'array_to_str':array_to_str} -------------------------------------------------------------------------------- /app/workers/syncer/refixer.rb: -------------------------------------------------------------------------------- 1 | class Syncer::Refixer < Syncer::Base 2 | def perform(container) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | index = Rails.env + '-' + types 6 | Elasticsearch::Model.client.indices.refresh index: index 7 | records.with_progress("Refixing #{container}").each do |r| 8 | id = r.key.gsub('.json','') 9 | if id.size > 20 10 | begin 11 | Elasticsearch::Model.client.delete index: index, type: container, id: id 12 | Elasticsearch::Model.client.indices.refresh index: index 13 | r = record(id) 14 | if url = r.try(:url) 15 | Crawler::Scrimper.perform_async url 16 | end 17 | r.delete 18 | rescue 19 | Mapper::IdAvailability.perform_async container, id 20 | end 21 | end 22 | end 23 | end 24 | end 25 | -------------------------------------------------------------------------------- /app/workers/crawler/stretcher.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Stretcher < Crawler::Base 2 | sidekiq_options queue: :stretcher, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, hash = {}) 9 | return if url.nil? 
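
`Crawler::Screener` above re-enqueues itself on transient browser and network failures instead of re-raising, which keeps those failures out of Sidekiq's retry accounting. A condensed sketch of the pattern, with the worker and error list generalized (trade-off: a permanently failing URL would requeue forever, since nothing counts attempts):

```ruby
require 'sidekiq'
require 'net/http' # defines Net::ReadTimeout

# Generalized requeue-on-transient-error pattern, as in Crawler::Screener.
class RequeueingWorker
  include Sidekiq::Worker

  TRANSIENT = [EOFError, Net::ReadTimeout].freeze

  def perform(url, path)
    return if url.nil?
    do_work(url, path)
  rescue *TRANSIENT
    # Push a fresh job instead of raising, so transient failures never
    # consume this job's Sidekiq retry budget.
    self.class.perform_async(url, path)
  end

  def do_work(url, path)
    # placeholder for the real capture work (Crawl::Capture in this app)
  end
end
```
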
10 | @parsed = hash 11 | 12 | @url = url 13 | parser.page = scraper.get 14 | upload 15 | rescue Mechanize::ResponseCodeError => e 16 | if e.response_code == '404' || 17 | e.response_code == '410' || 18 | e.response_code == '520' || 19 | e.response_code == '500' || 20 | e.response_code == '301' || 21 | e.response_code == '302' 22 | Mapper::UrlAvailability.perform_async url 23 | else 24 | raise 25 | end 26 | rescue Mechanize::RedirectLimitReachedError => e 27 | nil 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /lib/tasks/crawl.rake: -------------------------------------------------------------------------------- 1 | namespace :crawl do 2 | desc 'Run the crawler in Crawler::Spider mode' 3 | task :spider, [:url] => :environment do |_task, args| 4 | Redis::List.new('visited').clear 5 | Crawler::Spider.perform_async args.url 6 | end 7 | 8 | desc 'Run the crawler in Crawler::Scrimper mode' 9 | task :scrimper, [:url] => :environment do |_task, args| 10 | Redis::List.new('visited').clear 11 | Crawler::Scrimper.perform_async args.url 12 | end 13 | 14 | desc 'Run the crawler in Crawler::Sampler mode' 15 | task :sampler, [:url] => :environment do |_task, args| 16 | Redis::List.new('visited').clear 17 | Crawler::Sampler.perform_async args.url 18 | end 19 | 20 | desc 'Run the crawler in Crawler::Sitemapper mode' 21 | task :sitemapper, [:url] => :environment do |_task, args| 22 | Redis::List.new('visited').clear 23 | Crawler::Sitemapper.perform_async args.url 24 | end 25 | end 26 | -------------------------------------------------------------------------------- /app/helpers/page_helper.rb: -------------------------------------------------------------------------------- 1 | module PageHelper 2 | def build_page 3 | methods.grep(/page_helper/).each do |page| 4 | send(page) 5 | end 6 | # @id = @name.tr(" ", "_") if @type 7 | end 8 | 9 | def page_helper_id 10 | @id = md5 unless @id 11 | end 12 | 13 | def page_helper_url 14 | @url = parser.css("link[@rel='canonical']").first['href'].try(:squish) unless @url rescue nil 15 | @url = page.uri.to_s unless @url 16 | end 17 | 18 | def page_helper_name 19 | @name = parser.at('title').inner_html.try(:squish) unless @name rescue nil 20 | end 21 | 22 | def page_helper_description 23 | @description = parser.css("meta[@name='description']").first['content'].try(:squish) unless @description rescue nil 24 | end 25 | 26 | def page_helper_mobile_url 27 | @mobile_url = parser.css("link[@media='handheld']").first['href'].try(:squish) unless @mobile_url rescue nil 28 | end 29 | end 30 | -------------------------------------------------------------------------------- /roles/ruby-install/README.md: -------------------------------------------------------------------------------- 1 | InnoHub Ansible : ruby-install [![Build Status](https://travis-ci.org/innohub-ansible/ruby-install.svg?branch=master)](https://travis-ci.org/innohub-ansible/ruby-install) 2 | ========================================================================================================================================================================== 3 | 4 | Installs ruby-install. 5 | 6 | Requirements 7 | ------------ 8 | 9 | Works ONLY on Ubuntu 14.04. 
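
`PageHelper#build_page` above dispatches by reflection: every method whose name matches `/page_helper/` is treated as a field extractor and invoked, so adding a field is just adding a method. A minimal self-contained illustration of the idiom (all names here are invented):

```ruby
class Extractor
  def build
    # Every method matching the pattern is an extractor; they are called
    # in method-list order, so extractors should not depend on each other.
    methods.grep(/\Aextract_/).each { |m| send(m) }
    self
  end

  def extract_id
    @id ||= 'md5-of-url'
  end

  def extract_name
    @name ||= 'Example Page'
  end
end

Extractor.new.build # calls extract_id and extract_name via reflection
```
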
10 | 11 | Role Variables 12 | -------------- 13 | 14 | ruby_install_version : defaults to '0.4.3' 15 | 16 | Dependencies 17 | ------------ 18 | 19 | None 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Example Playbook: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: innohub-ansible.ruby-install } 29 | 30 | Example Role: 31 | 32 | dependencies: 33 | - { role: ruby_install } 34 | 35 | License 36 | ------- 37 | 38 | MIT 39 | -------------------------------------------------------------------------------- /app/models/crawl/sitemap.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Sitemap < Page::Url 2 | attr_accessor :xml 3 | 4 | def parser 5 | @parser ||= begin 6 | if uri.to_s.ends_with?('.gz') 7 | require 'zlib' 8 | require 'stringio' 9 | gz = Zlib::GzipReader.new(StringIO.new(xml.body.to_s)) 10 | Nokogiri::XML.parse(gz.read) 11 | else 12 | Nokogiri::XML.parse(xml.body) 13 | end 14 | rescue Zlib::GzipFile::Error 15 | Nokogiri::XML.parse(xml.body) 16 | end 17 | end 18 | 19 | def index_links 20 | @index_links ||= parser.css('//sitemap/loc').map(&:text).compact.uniq.shuffle 21 | end 22 | 23 | def site_links 24 | @site_links ||= parser.css('//url/loc').map(&:text).compact.uniq.shuffle 25 | end 26 | 27 | def base 28 | "#{uri.scheme}://#{uri.host}" 29 | end 30 | 31 | def indexes? 32 | !index_links.empty? 33 | end 34 | 35 | def sites? 36 | !site_links.empty? 37 | end 38 | end 39 | -------------------------------------------------------------------------------- /app/models/flattener.rb: -------------------------------------------------------------------------------- 1 | class Flattener 2 | def initialize(hash) 3 | @hash = hash 4 | @result = {} 5 | @result_iter = {} 6 | @paths = hash.keys.map { |key| [key] } 7 | end 8 | 9 | def flatten(hash = @hash, old_path = []) 10 | hash.each do |key, value| 11 | current_path = old_path + [key] 12 | 13 | if !value.respond_to?(:keys) 14 | @result[current_path.join('_')] = value 15 | else 16 | flatten(value, current_path) 17 | end 18 | end 19 | 20 | @result 21 | end 22 | 23 | def flatten_iter 24 | until @paths.empty? 25 | path = @paths.shift 26 | value = @hash 27 | path.each { |step| value = value[step] } 28 | 29 | if value.respond_to?(:keys) 30 | value.keys.each { |key| @paths << path + [key] } 31 | else 32 | @result_iter[path.join('_')] = value 33 | end 34 | end 35 | 36 | @result_iter 37 | end 38 | 39 | def are_the_same? 40 | flatten == flatten_iter 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /app/workers/crawler/scrimper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Scrimper < Crawler::Base 2 | sidekiq_options queue: :scrimper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, hash = {}) 9 | return if url.nil? 
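
The `Flattener` above (`app/models/flattener.rb`) turns arbitrarily nested hashes into single-level hashes with underscore-joined keys, once recursively (`flatten`) and once with an explicit work queue (`flatten_iter`). A quick usage check based on the code as shown:

```ruby
hash = { 'price' => { 'usd' => 9.99, 'eur' => 8.99 }, 'name' => 'Widget' }

Flattener.new(hash).flatten
# => { "price_usd" => 9.99, "price_eur" => 8.99, "name" => "Widget" }

Flattener.new(hash).are_the_same?
# => true, the recursive and iterative walks agree
```
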
10 | @parsed = hash 11 | 12 | @url = url 13 | Timeout::timeout(60) do 14 | parser.page = scraper.get 15 | end 16 | upload 17 | rescue Mechanize::ResponseCodeError => e 18 | if e.response_code == '404' || 19 | e.response_code == '410' || 20 | e.response_code == '520' || 21 | e.response_code == '500' || 22 | e.response_code == '301' || 23 | e.response_code == '302' 24 | Mapper::UrlAvailability.perform_async url 25 | else 26 | raise 27 | end 28 | rescue Mechanize::RedirectLimitReachedError => e 29 | nil 30 | rescue Timeout::Error => e 31 | Crawler::Stretcher.perform_async url 32 | end 33 | end 34 | -------------------------------------------------------------------------------- /config/secrets.yml.example: -------------------------------------------------------------------------------- 1 | # Be sure to restart your server when you modify this file. 2 | 3 | # Your secret key is used for verifying the integrity of signed cookies. 4 | # If you change this key, all old signed cookies will become invalid! 5 | 6 | # Make sure the secret is at least 30 characters and all random, 7 | # no regular words or you'll be exposed to dictionary attacks. 8 | # You can use `rake secret` to generate a secure secret key. 9 | 10 | # Make sure the secrets in this file are kept private 11 | # if you're sharing your code publicly. 12 | 13 | development: 14 | secret_key_base: 2be1e0b5ab9e3917ae9dd7511f040b9fcbaa9ddf145ecae46500065762c4f940ade359f8530dc49ec9c434ebe8fa9375f8beda49880c0202eb94de563d0ede19 15 | 16 | test: 17 | secret_key_base: d8e74cc35877031cfb84919c62d2cb486de7f77b70daa7d8e1ac8e7ed186a19d7b7ea0593cb1166ff0ed4f31fe272e73c7a682ce442a51423d4900e6afb0daa2 18 | 19 | # Do not keep production secrets in the repository, 20 | # instead read values from the environment. 21 | production: 22 | secret_key_base: <%= ENV["SECRET_KEY_BASE"] %> 23 | -------------------------------------------------------------------------------- /roles/elasticsearch/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | faraday (0.9.2) 5 | multipart-post (>= 1.2, < 3) 6 | highline (1.7.8) 7 | kitchen-ansible (0.40.1) 8 | librarian-ansible 9 | test-kitchen (~> 1.4) 10 | kitchen-docker (2.1.0) 11 | test-kitchen (>= 1.0.0) 12 | librarian (0.1.2) 13 | highline 14 | thor (~> 0.15) 15 | librarian-ansible (3.0.0) 16 | faraday 17 | librarian (~> 0.1.0) 18 | mixlib-shellout (2.2.6) 19 | multipart-post (2.0.0) 20 | net-scp (1.2.1) 21 | net-ssh (>= 2.6.5) 22 | net-ssh (2.9.4) 23 | safe_yaml (1.0.4) 24 | test-kitchen (1.4.2) 25 | mixlib-shellout (>= 1.2, < 3.0) 26 | net-scp (~> 1.1) 27 | net-ssh (~> 2.7, < 2.10) 28 | safe_yaml (~> 1.0) 29 | thor (~> 0.18) 30 | thor (0.19.1) 31 | 32 | PLATFORMS 33 | ruby 34 | 35 | DEPENDENCIES 36 | kitchen-ansible (= 0.40.1) 37 | kitchen-docker (= 2.1.0) 38 | net-ssh (~> 2.0) 39 | test-kitchen (= 1.4.2) 40 | 41 | BUNDLED WITH 42 | 1.11.2 43 | -------------------------------------------------------------------------------- /config/config.yml.example: -------------------------------------------------------------------------------- 1 | default: &default 2 | secret: 1234 3 | redis: 4 | host: 127.0.0.1 5 | port: 6379 6 | database: 0 7 | password: 8 | elasticsearch: 9 | host: 127.0.0.1 10 | port: 9200 11 | protocol: http 12 | fog: 13 | provider: 'Local' 14 | local_root: 'tmp/fog' 15 | # provider: 'AWS' 16 | # aws_access_key_id: '32 bit key' 17 | # aws_secret_access_key: '32 bit key' 18 | admin: 19 | username: admin 20 | 
password: password 21 | docs: https://github.com/bastosmichael/skynet 22 | api_containers: 23 | - 24 | api_keys: 25 | 'sample-key': 26 | customer: name 27 | permissions: 28 | - record_show 29 | - search_index 30 | - match_index 31 | - trends_index 32 | - batch_index 33 | - status_index 34 | limit: 20 35 | append: {} 36 | insert: {} 37 | tracker: {} 38 | app: 39 | name: crawler 40 | 41 | development: 42 | <<: *default 43 | 44 | production: 45 | <<: *default 46 | 47 | test: 48 | <<: *default 49 | -------------------------------------------------------------------------------- /app/workers/crawler/base.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Base < Worker 2 | def scraper 3 | @scraper ||= Crawl::Base.new(@url) 4 | end 5 | 6 | def parser 7 | @parser ||= scraper.name.capitalize.constantize.new(@url) 8 | rescue NameError 9 | @parser ||= Page::Parse.new(@url) 10 | end 11 | 12 | def upload 13 | scraper.clear 14 | @parsed = parsed.merge(parser.save) if parser.build 15 | if parsed.presence && parsed['type'] 16 | Recorder::Uploader.perform_async parsed 17 | end 18 | end 19 | 20 | def parsed 21 | @parsed ||= {} 22 | end 23 | 24 | def social 25 | @social ||= Crawl::Social.new(@url) 26 | rescue 27 | {} 28 | end 29 | 30 | def internal_links 31 | @internal_links ||= begin 32 | parser.internal_links.map do |url| 33 | scraper.name.capitalize.constantize.sanitize_url(url) 34 | end.compact 35 | rescue 36 | parser.internal_links 37 | end 38 | end 39 | 40 | def visit 41 | internal_links.each do |url| 42 | ('Crawler::' + next_type).constantize.perform_async url 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /roles/ubuntu-common/templates/sources.list: -------------------------------------------------------------------------------- 1 | # binary packages 2 | deb {{ common_apt_mirror }} {{ common_release_code }} main multiverse restricted universe 3 | deb {{ common_apt_mirror }} {{ common_release_code }}-updates main multiverse restricted universe 4 | deb {{ common_apt_mirror }} {{ common_release_code }}-proposed main multiverse restricted universe 5 | deb {{ common_apt_mirror }} {{ common_release_code }}-backports main multiverse restricted universe 6 | deb {{ common_apt_mirror }} {{ common_release_code }}-security main multiverse restricted universe 7 | 8 | # # sources 9 | # deb-src {{ common_apt_mirror }} {{ common_release_code }} main multiverse restricted universe 10 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-updates main multiverse restricted universe 11 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-proposed main multiverse restricted universe 12 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-backports main multiverse restricted universe 13 | # deb-src {{ common_apt_mirror }} {{ common_release_code }}-security main multiverse restricted universe 14 | -------------------------------------------------------------------------------- /app/models/page/url.rb: -------------------------------------------------------------------------------- 1 | class Page::Url 2 | URI_REGEX = /\A#{URI.regexp(%w(http https))}\z/ 3 | 4 | attr_accessor :date 5 | 6 | def initialize(url) 7 | @url = clean_up_url(url) 8 | self.date = Date.today.to_s if date.nil? 
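
`Page::Url` (continuing below) normalizes a raw URL and derives storage paths from it. Based on the methods shown, the behaviour for a bare host looks roughly like this (MD5 and date shown as placeholders):

```ruby
page = Page::Url.new('google.com') # no scheme, so 'http://' is prepended
page.url       # => "http://google.com"
page.host      # => "google.com"  (a leading 'www.' would be stripped)
page.name      # => "google"
page.md5       # => Digest::MD5.hexdigest("http://google.com")
page.cache_key # => "google.com/<md5>/<YYYY-MM-DD>"
```
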
9 | end 10 | 11 | def cache_key 12 | File.join(build_path, date) 13 | end 14 | 15 | def build_path 16 | File.join(host, md5) 17 | end 18 | 19 | def uri 20 | @uri ||= URI.parse(@url) 21 | end 22 | 23 | def url 24 | @url ||= uri.to_s 25 | end 26 | 27 | def md5 28 | Digest::MD5.hexdigest(url) 29 | end 30 | 31 | def host 32 | get_host_without_www uri 33 | end 34 | 35 | def name 36 | host.split('.').first 37 | end 38 | 39 | def get_host_without_www(new_uri) 40 | host = new_uri.host.downcase 41 | begin 42 | host.split(/\./)[1] + '.' + host.split(/\./)[2] 43 | rescue 44 | host.start_with?('www.') ? host[4..-1] : host 45 | end 46 | end 47 | 48 | def clean_up_url(url) 49 | url = URI.encode(url) 50 | url = "http://#{url}" if URI.parse(url).scheme.nil? 51 | url 52 | end 53 | end 54 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-RedHat.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Ensure libselinux-python on CentOS 6.x 3 | yum: name=libselinux-python state=present update_cache=yes 4 | when: ( ansible_distribution == "CentOS" ) and ( ansible_distribution_major_version == "6" ) 5 | 6 | - name: RedHat - add Elasticsearch repo 7 | template: src=elasticsearch.repo dest=/etc/yum.repos.d/elasticsearch-{{ es_major_version }}.repo 8 | when: es_use_repository 9 | 10 | - name: RedHat - include versionlock 11 | include: elasticsearch-RedHat-version-lock.yml 12 | when: es_version_lock 13 | 14 | - name: RedHat - Install Elasticsearch 15 | yum: name=elasticsearch{% if es_version is defined and es_version != "" %}-{{ es_version }}{% endif %} state=present update_cache=yes 16 | when: es_use_repository 17 | register: elasticsearch_install 18 | 19 | - name: RedHat - Install Elasticsearch from url 20 | yum: name={% if es_custom_package_url is defined %}{{ es_custom_package_url }}{% else %}{{ es_package_url }}-{{ es_version }}.noarch.rpm{% endif %} state=present 21 | when: not es_use_repository 22 | register: elasticsearch_install -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/multi.yml: -------------------------------------------------------------------------------- 1 | --- 2 | #Test ability to deploy multiple instances to a machine 3 | - name: Elasticsearch Multi tests 4 | hosts: localhost 5 | roles: 6 | - { role: elasticsearch, es_instance_name: "master", es_data_dirs: ["/opt/elasticsearch/master"], es_heap_size: "1g", es_config: { "discovery.zen.ping.multicast.enabled": false, discovery.zen.ping.unicast.hosts: "localhost:9300", http.port: 9200, transport.tcp.port: 9300, node.data: false, node.master: true, bootstrap.mlockall: true, discovery.zen.ping.multicast.enabled: false } } 7 | - { role: elasticsearch, es_instance_name: "node1", es_data_dirs: "/opt/elasticsearch/data-1,/opt/elasticsearch/data-2", es_config: { "discovery.zen.ping.multicast.enabled": false, discovery.zen.ping.unicast.hosts: "localhost:9300", http.port: 9201, transport.tcp.port: 9301, node.data: true, node.master: false, discovery.zen.ping.multicast.enabled: false } } 8 | vars: 9 | es_scripts: true 10 | es_templates: true 11 | es_plugin_dir: "/opt/elasticsearch/plugins" 12 | #Plugins installed for this test are specified in .kitchen.yml under suite -------------------------------------------------------------------------------- /roles/chruby/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT 
License (MIT) 2 | 3 | Copyright (c) 2014 Andrew Angelo Ang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /roles/nginx-unicorn/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Alexandros Giouzenis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /roles/ruby-install/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Andrew Angelo Ang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /roles/swapfile/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Kamal Nasser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/tasks/sync.rake: -------------------------------------------------------------------------------- 1 | namespace :sync do 2 |   desc 'Run the crawler in Syncer::Mover mode' 3 |   task :mover, [:from_bucket, :to_bucket] => :environment do |_task, args| 4 |     Redis::List.new('visited').clear 5 |     Syncer::Mover.perform_async args.from_bucket, args.to_bucket 6 |   end 7 | 8 |   desc 'Run the crawler in Syncer::Rescreener mode' 9 |   task :rescreener, [:bucket] => :environment do |_task, args| 10 |     Redis::List.new('visited').clear 11 |     Syncer::Rescreener.perform_async args.bucket 12 |   end 13 | 14 |   desc 'Run the crawler in Syncer::Rescrimper mode' 15 |   task :rescrimper, [:bucket] => :environment do |_task, args| 16 |     Redis::List.new('visited').clear 17 |     Syncer::Rescrimper.perform_async args.bucket 18 |   end 19 | 20 |   desc 'Run the crawler in Syncer::Resampler mode' 21 |   task :resampler, [:bucket] => :environment do |_task, args| 22 |     Redis::List.new('visited').clear 23 |     Syncer::Resampler.perform_async args.bucket 24 |   end 25 | 26 |   desc 'Run the crawler in Syncer::Respider mode' 27 |   task :respider, [:bucket] => :environment do |_task, args| 28 |     Redis::List.new('visited').clear 29 |     Syncer::Respider.perform_async args.bucket 30 |   end 31 | end 32 | -------------------------------------------------------------------------------- /app/workers/mapper/url_availability.rb: -------------------------------------------------------------------------------- 1 | class Mapper::UrlAvailability < Mapper::Base 2 |   def perform(url) 3 |     @name = Page::Url.new(url).name 4 |     @container = Rails.configuration.config[:admin][:api_containers].find { |c| c.include?(@name) } 5 |     types = @container.split('-').last.pluralize.gsub(':', '') 6 |     @index = Rails.env + '-' + types 7 | 8 |     records = Elasticsearch::Model.client.search(index: @index, type: @container, body: { query: { match_phrase_prefix: { url: url } } }) 9 | 10 |     if records['hits']['total'] > 0 11 |       records['hits']['hits'].each do |record| 12 |         Recorder::Uploader.perform_async({ id: record['_id'], 13 |                                            available: false, 14 |                                            url: record['_source']['url'], 15 |                                            type: record['_type'].split('-').last.capitalize.singularize }) 16 |         # cloud.head(record['_id'] + '.json').try(:destroy) 17 |         # Elasticsearch::Model.client.delete index: @index, type: @container, id: record['_id'] 18 |         # Crawler::Scrimper.perform_async url 19 |       end 20 |     end 21 |   # rescue NoMethodError => e 22 |   #   nil 23 |   end 24 | end 25 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-Debian.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Debian - Add Elasticsearch repository key 3 |   apt_key: url="http://packages.elasticsearch.org/GPG-KEY-elasticsearch" state=present 4 |   when: es_use_repository 5 | 6 | - name: Debian - add elasticsearch repository 7 |   apt_repository: repo="deb http://packages.elastic.co/elasticsearch/{{ es_major_version }}/debian stable main" state=present 8 |   when: es_use_repository 9 | 10 | - name: Debian - Ensure elasticsearch is installed 11 |   apt: name=elasticsearch{% if es_version is defined and es_version != "" %}={{ es_version }}{% endif %} state=present cache_valid_time=86400 12 |   when: es_use_repository 13 |   register: elasticsearch_install 14 | 15 | - name: Debian - Download elasticsearch from url 16 |   get_url: url={% if es_custom_package_url is defined %}{{ 
es_custom_package_url }}{% else %}{{ es_package_url }}-{{ es_version }}.deb{% endif %} dest=/tmp/elasticsearch-{{ es_version }}.deb validate_certs=no 17 | when: not es_use_repository 18 | 19 | - name: Debian - Ensure elasticsearch is installed from downloaded package 20 | apt: deb=/tmp/elasticsearch-{{ es_version }}.deb 21 | when: not es_use_repository 22 | register: elasticsearch_install -------------------------------------------------------------------------------- /config/sitemap.rb: -------------------------------------------------------------------------------- 1 | # Set the host name for URL creation 2 | SitemapGenerator::Sitemap.default_host = "https://" + ENV['DOMAIN'] + '/' 3 | 4 | SitemapGenerator::Sitemap.sitemaps_path = "#{ENV['CONTAINER']}/" 5 | 6 | SitemapGenerator::Sitemap.create_index = true 7 | 8 | SitemapGenerator::Sitemap.public_path = "tmp/#{ENV['DOMAIN']}/#{ENV['CONTAINER']}/" 9 | 10 | SitemapGenerator::Sitemap.sitemaps_host = "https://#{ENV['DOMAIN']}/" 11 | 12 | SitemapGenerator::Sitemap.adapter = SitemapGenerator::S3Adapter.new(Rails.configuration.config[:fog].merge(fog_directory: "#{ENV['DOMAIN']}-sitemaps", 13 | fog_region: 'us-west-1', fog_provider: 'AWS')) 14 | sitemap_default_options = { 15 | changefreq: nil, 16 | priority: nil, 17 | lastmod: nil 18 | } 19 | 20 | SitemapGenerator::Sitemap.create do 21 | # ['boxed-offers'].each do |container| 22 | begin 23 | Cloud.new(ENV['CONTAINER']).files.each do |file| 24 | add (ENV['CONTAINER'] + '/' + file.key.gsub('.json','')), sitemap_default_options#.merge(lastmod: file.last_modified) 25 | end 26 | rescue => e 27 | ap e.message 28 | end 29 | # Example. DOMAIN=pricenometry.com CONTAINER=newegg-offers bundle exec rake sitemap:refresh 30 | end 31 | -------------------------------------------------------------------------------- /config/initializers/sidekiq.rb: -------------------------------------------------------------------------------- 1 | require 'sidekiq' 2 | 3 | # Build the Redis URL once; the server and client use identical connection settings. 4 | redis = Rails.configuration.config[:redis] 5 | auth = redis[:password].presence ? ":#{redis[:password]}@" : '' 6 | url = "redis://#{auth}#{redis[:host]}:#{redis[:port]}/#{redis[:database]}" 7 | 8 | Sidekiq.configure_server do |config| 9 | config.redis = { url: url, namespace: 'crawler' } 10 | end 11 | 12 | Sidekiq.configure_client do |config| 13 | config.redis = { url: url, namespace: 'crawler' } 14 | end 15 | 16 | # Assign the flag; calling the bare getter is a no-op. 17 | SidekiqUniqueJobs.config.unique_args_enabled = true 18 | -------------------------------------------------------------------------------- /test/models/url_test.rb: -------------------------------------------------------------------------------- 1 | require 'test_helper' 2 | 3 | class UrlTest < ActiveSupport::TestCase 4 | setup do 5 | assert @url = Page::Url.new('google.com') 6 | assert @url.date = '2014-08-09' 7 | end 8 | 9 | test 'cache_key method returns
correct cache path' do 10 | assert_equal @url.cache_key, 'google/google.com/c7b920f57e553df2bb68272f61570210/2014-08-09' 11 | end 12 | 13 | test 'build_path method returns correct hashed path' do 14 | assert_equal @url.build_path, 'google/google.com/c7b920f57e553df2bb68272f61570210' 15 | end 16 | 17 | # TODO 18 | # test 'uri method returns a URI object' do 19 | # pending 'Needs to match a URI object' 20 | # end 21 | 22 | test 'url method returns correctly formatted internet address' do 23 | assert_equal @url.url, 'http://google.com' 24 | end 25 | 26 | test 'md5 method returns correct checksum' do 27 | assert path = @url.url 28 | assert checksum = Digest::MD5.hexdigest(path) 29 | assert_equal @url.md5, checksum 30 | end 31 | 32 | test 'host method returns correct host name' do 33 | assert_equal @url.host, 'google.com' 34 | end 35 | 36 | test 'name method returns correct site name' do 37 | assert_equal @url.name, 'google' 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /app/models/crawl/social.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Social < Page::Url 2 | require 'social_shares' 3 | 4 | def shares 5 | if shares = all(url).presence 6 | return shares 7 | elsif url.starts_with?('https') && shares = all(url.gsub('https://','http://')).presence 8 | return shares 9 | else 10 | {} 11 | end 12 | end 13 | 14 | # Memoize per URL so the https fallback in #shares never sees a stale result. 15 | def all(new_url) 16 | @all ||= {} 17 | @all[new_url] ||= SocialShares.all(new_url).delete_if { |_k, v| v == 0 }.map { |k, v| { k.to_s + '_shares' => v.to_i } }.reduce({}, :merge) 18 | end 19 | 20 | def total 21 | all(url).values.sum 22 | end 23 | 24 | def has_shares? 25 | SocialShares.has_any?(url) 26 | end 27 | 28 | def facebook 29 | @facebook ||= sanitize_facebook JSON.parse(Crawl::Base.new("https://graph.facebook.com/?id=#{@url}").get.try(:body), quirks_mode: true) 30 | rescue 31 | {} 32 | end 33 | 34 | def sanitize_facebook(data) 35 | return nil if data['error_message'] || data['error_type'] || data['error_code'] 36 | return nil if data.empty? 37 | Flattener.new(data).flatten.delete_if { |_k, v| v == 0 || v == @url }.map { |k, v| { 'facebook_' + k.to_s => v.try(:squish) || v } }.reduce({}, :merge) 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /app/workers/crawler/sampler.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Sampler < Crawler::Base 2 | sidekiq_options queue: :sampler, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = nil, hash = {}) 9 | return if url.nil? 10 | @parsed = hash 11 | 12 | if type.nil?
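# No parser type was passed in, so fall back to this worker's default ('Scrimper').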
13 | next_type 14 | else 15 | @type = type 16 | end 17 | 18 | @url = url 19 | 20 | Timeout::timeout(60) do 21 | parser.page = scraper.get 22 | end 23 | 24 | visit 25 | upload 26 | rescue Mechanize::ResponseCodeError => e 27 | if e.response_code == '404' || 28 | e.response_code == '410' || 29 | e.response_code == '520' || 30 | e.response_code == '500' || 31 | e.response_code == '301' || 32 | e.response_code == '302' 33 | Mapper::UrlAvailability.perform_async url 34 | else 35 | raise 36 | end 37 | rescue Mechanize::RedirectLimitReachedError => e 38 | nil 39 | rescue Timeout::Error => e 40 | Crawler::Stretcher.perform_async url 41 | end 42 | 43 | def next_type 44 | @type ||= 'Scrimper' 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /app/workers/crawler/spider.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Spider < Crawler::Base 2 | sidekiq_options queue: :spider, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 * 365 7 | 8 | def perform(url, type = nil, hash = {}) 9 | return if url.nil? 10 | @parsed = hash 11 | 12 | if type.nil? 13 | next_type 14 | else 15 | @type = type 16 | end 17 | 18 | @url = url 19 | 20 | Timeout::timeout(60) do 21 | parser.page = scraper.get 22 | end 23 | 24 | visit 25 | upload 26 | rescue Mechanize::ResponseCodeError => e 27 | if e.response_code == '404' || 28 | e.response_code == '410' || 29 | e.response_code == '520' || 30 | e.response_code == '500' || 31 | e.response_code == '301' || 32 | e.response_code == '302' 33 | Mapper::UrlAvailability.perform_async url 34 | else 35 | raise 36 | end 37 | rescue Mechanize::RedirectLimitReachedError => e 38 | nil 39 | rescue Timeout::Error => e 40 | Crawler::Stretcher.perform_async url 41 | end 42 | 43 | def next_type 44 | @type ||= 'Spider' 45 | end 46 | end 47 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-templates.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - file: path=/etc/elasticsearch/templates state=directory owner={{ es_user }} group={{ es_group }} 4 | 5 | - name: Copy default templates to elasticsearch 6 | copy: src=templates dest=/etc/elasticsearch/ owner={{ es_user }} group={{ es_group }} 7 | when: es_templates_fileglob is not defined 8 | 9 | - name: Copy templates to elasticsearch 10 | copy: src={{ item }} dest=/etc/elasticsearch/templates owner={{ es_user }} group={{ es_group }} 11 | with_fileglob: "{{ es_templates_fileglob }}" 12 | 13 | - set_fact: http_port=9200 14 | tags: 15 | - always 16 | 17 | - set_fact: http_port={{es_config['http.port']}} 18 | when: es_config['http.port'] is defined 19 | tags: 20 | - always 21 | 22 | - name: Wait for elasticsearch to startup 23 | wait_for: port={{http_port}} delay=10 24 | 25 | - name: Get template files 26 | shell: find . 
-maxdepth 1 -type f | sed "s#\./##" | sed "s/\.json$//" chdir=/etc/elasticsearch/templates 27 | register: resultstemplate 28 | 29 | - name: Install template(s) 30 | command: "curl -sL -XPUT http://localhost:{{http_port}}/_template/{{item}} -d @/etc/elasticsearch/templates/{{item}}.json" 31 | with_items: "{{ resultstemplate.stdout_lines }}" 32 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - set_fact: instance_default_file={{default_file | dirname}}/{{es_instance_name}}_{{default_file | basename}} 4 | tags: 5 | - always 6 | - set_fact: instance_init_script={{init_script | dirname }}/{{es_instance_name}}_{{init_script | basename}} 7 | tags: 8 | - always 9 | - set_fact: conf_dir={{ es_conf_dir }}/{{es_instance_name}} 10 | tags: 11 | - always 12 | - set_fact: plugin_dir={{ es_plugin_dir }}/{{es_instance_name}} 13 | tags: 14 | - always 15 | - set_fact: m_lock_enabled={{ es_config['bootstrap.mlockall'] is defined and es_config['bootstrap.mlockall'] == True }} 16 | tags: 17 | - always 18 | 19 | - debug: msg="Node configuration {{ es_config }} " 20 | 21 | - name: Include optional user and group creation. 22 | when: (es_user_id is defined) and (es_group_id is defined) 23 | include: elasticsearch-optional-user.yml 24 | 25 | #- name: Include specific Elasticsearch 26 | # include: "elasticsearch-{{ansible_os_family}}.yml" 27 | 28 | #Install OS specific elasticsearch - this can be abbreviated in version 2.0.0 29 | - name: Include specific Elasticsearch 30 | include: elasticsearch-Debian.yml 31 | when: ansible_os_family == 'Debian' 32 | 33 | - name: Include specific Elasticsearch 34 | include: elasticsearch-RedHat.yml 35 | when: ansible_os_family == 'RedHat' 36 | -------------------------------------------------------------------------------- /config/unicorn.rb: -------------------------------------------------------------------------------- 1 | # paths 2 | app_path = '/home/ubuntu/skynet' 3 | working_directory "#{app_path}/current" 4 | pid "#{app_path}/current/tmp/pids/unicorn.pid" 5 | 6 | # listen 7 | listen "#{app_path}/current/tmp/sockets/unicorn.sock", backlog: 64 8 | 9 | # logging 10 | stderr_path 'log/unicorn.stderr.log' 11 | stdout_path 'log/unicorn.stdout.log' 12 | 13 | # workers 14 | worker_processes 2 # Use 2 with a nano server, 18 for a medium 15 | 16 | # use correct Gemfile on restarts 17 | before_exec do |_server| 18 | ENV['BUNDLE_GEMFILE'] = "#{app_path}/current/Gemfile" 19 | end 20 | 21 | # preload 22 | preload_app true 23 | 24 | before_fork do |server, _worker| 25 | # the following is highly recommended for Rails + "preload_app true" 26 | # as there's no need for the master process to hold a connection 27 | ActiveRecord::Base.connection.disconnect! if defined?(ActiveRecord::Base) 28 | 29 | # Before forking, kill the master process that belongs to the .oldbin PID. 30 | # This enables 0 downtime deploys.
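# (The old master keeps running under its .oldbin pidfile until the new master
# has booted; the QUIT below lets it finish in-flight requests and exit cleanly.)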
31 | old_pid = "#{server.config[:pid]}.oldbin" 32 | if File.exist?(old_pid) && server.pid != old_pid 33 | begin 34 | Process.kill('QUIT', File.read(old_pid).to_i) 35 | rescue Errno::ENOENT, Errno::ESRCH 36 | # someone else did our job for us 37 | end 38 | end 39 | end 40 | 41 | after_fork do |_server, _worker| 42 | ActiveRecord::Base.establish_connection if defined?(ActiveRecord::Base) 43 | end 44 | -------------------------------------------------------------------------------- /config/routes.rb: -------------------------------------------------------------------------------- 1 | Rails.application.routes.draw do 2 | 3 | root to: 'application#index' 4 | 5 | require "sidekiq/web" 6 | Sidekiq::Web.use Rack::Auth::Basic do |username, password| 7 | username == Rails.configuration.config[:admin][:username] && password == Rails.configuration.config[:admin][:password] 8 | end if Rails.env.production? 9 | mount Sidekiq::Web, at: "/sidekiq" 10 | 11 | namespace :v1, defaults: { format: 'json' } do 12 | get '/', to: 'status#index' 13 | get '/match', to: 'match#index', results: 10 14 | get '/search/:query', to: 'search#index' 15 | post '/batch' => 'batch#index' 16 | get '/trends/:array', to: 'trends#index' 17 | get '/:container/ids', to: 'record#ids' 18 | get '/:container/match', to: 'match#index' 19 | get '/:container/search/:query', to: 'search#index' 20 | get '/:container/trends/:array', to: 'trends#index' 21 | get '/:container/:record_id/related', to: 'record#related' 22 | get '/:container/:record_id/history', to: 'record#history' 23 | get '/:container/:record_id/news', to: 'record#news' 24 | get '/:container/:record_id/videos', to: 'record#videos' 25 | get '/:container/:record_id/references', to: 'record#references' 26 | get '/:container/:record_id/links', to: 'record#links' 27 | get '/:container/:record_id/:screenshot_id', to: 'record#screenshot' 28 | get '/:container/:record_id', to: 'record#index' 29 | end 30 | end 31 | -------------------------------------------------------------------------------- /roles/ruby-install/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Download ruby-install 3 | shell: wget -O ruby-install-{{ ruby_install_version }}.tar.gz https://github.com/postmodern/ruby-install/archive/v{{ ruby_install_version }}.tar.gz 4 | when: 5 | ansible_local is not defined or 6 | ansible_local.ruby_install is not defined or 7 | ansible_local.ruby_install.version != ruby_install_version 8 | register: ruby_install_downloaded 9 | 10 | - name: Extract ruby-install 11 | shell: tar -xzvf ruby-install-{{ ruby_install_version }}.tar.gz 12 | when: ruby_install_downloaded | changed 13 | 14 | - name: Install ruby-install 15 | sudo: true 16 | shell: 17 | chdir=ruby-install-{{ ruby_install_version }} 18 | make install 19 | when: ruby_install_downloaded | changed 20 | register: ruby_install_installed 21 | 22 | - name: Clean up ruby-install sources 23 | shell: rm -rf ruby-install-* 24 | when: ruby_install_downloaded | changed 25 | 26 | # 27 | # Setup ruby-install facts.d 28 | # 29 | - name: Capture installed ruby-install version 30 | shell: ruby-install --version | awk '{ print $2 }' 31 | ignore_errors: yes 32 | register: installed_ruby_install_version 33 | 34 | - name: Create ansible facts directory 35 | sudo: true 36 | file: state=directory recurse=yes path=/etc/ansible/facts.d 37 | 38 | - name: Set ruby-install facts 39 | sudo: true 40 | template: src=ruby_install.fact dest=/etc/ansible/facts.d/ruby_install.fact 41 | 
-------------------------------------------------------------------------------- /roles/chruby/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Download chruby 4 | shell: wget -O chruby-{{ chruby_version }}.tar.gz https://github.com/postmodern/chruby/archive/v{{ chruby_version }}.tar.gz 5 | when: 6 | ansible_local is not defined or 7 | ansible_local.chruby is not defined or 8 | ansible_local.chruby.version != chruby_version 9 | register: chruby_downloaded 10 | 11 | - name: Extract chruby 12 | shell: tar -xzvf chruby-{{ chruby_version }}.tar.gz 13 | when: chruby_downloaded | changed 14 | register: chruby_extracted 15 | 16 | - name: Install chruby 17 | sudo: true 18 | shell: cd chruby-{{ chruby_version }}/ && make install 19 | when: chruby_extracted | changed 20 | register: chruby_installed 21 | 22 | - name: Clean up chruby sources 23 | shell: rm -rf chruby-* 24 | when: chruby_downloaded | changed 25 | 26 | - name: Attach chruby into shell 27 | sudo: true 28 | template: src=chruby.sh dest=/etc/profile.d/chruby.sh 29 | 30 | # 31 | # Setup chruby facts.d 32 | # 33 | - name: Capture installed chruby version 34 | shell: > 35 | executable=/bin/bash source /etc/profile; 36 | chruby --version | awk '{ print $2 }' 37 | ignore_errors: yes 38 | register: installed_chruby_version 39 | 40 | - name: Create ansible facts directory 41 | sudo: true 42 | file: state=directory recurse=yes path=/etc/ansible/facts.d 43 | 44 | - name: Set chruby facts 45 | sudo: true 46 | template: src=chruby.fact dest=/etc/ansible/facts.d/chruby.fact 47 | -------------------------------------------------------------------------------- /app/models/record/base.rb: -------------------------------------------------------------------------------- 1 | class Record::Base 2 | def initialize(container = nil, record = nil) 3 | @record = record 4 | @container = container 5 | @types = container.split('-').last.pluralize.gsub(':', '') if container 6 | @index = Rails.env + '-' + @types if @types 7 | end 8 | 9 | def delete 10 | cloud.head(@record + '.json').try(:destroy) 11 | end 12 | 13 | def url 14 | @url ||= data['url'] 15 | end 16 | 17 | def screenshots 18 | @screenshots ||= data['screenshot'].map { |_key, value| { value => url } }.reduce({}, :merge) 19 | end 20 | 21 | def data 22 | JSON.parse(cloud.get(@record + '.json').try(:body), quirks_mode: true) 23 | rescue 24 | {} 25 | end 26 | 27 | def data=(new_hash = {}) 28 | cloud.sync @record + '.json', new_hash.to_json 29 | end 30 | 31 | def cloud 32 | Cloud.new(@container) 33 | end 34 | 35 | private 36 | 37 | def sanitize_value value 38 | if value.is_a?(Array) || !!value == value 39 | return value 40 | elsif value.to_i.to_s == value.to_s 41 | return value.to_i 42 | elsif (Float(value) rescue false) 43 | return value.to_f 44 | else 45 | return value 46 | end 47 | end 48 | 49 | def recrawl(url, options) 50 | if options[:crawl] 51 | options[:social] ? Crawler::Socializer.perform_async(url) : Crawler::Slider.perform_async(url) 52 | end 53 | rescue #TODO find the correct error for Redis not responding 54 | nil 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /roles/build-ruby/README.md: -------------------------------------------------------------------------------- 1 | Role Name 2 | ========= 3 | 4 | A brief description of the role goes here. 
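In this repository the role is used alongside the chruby and ruby-install roles to compile the Ruby version passed in via the `version` parameter (production.yml installs 2.3.0 this way).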
5 | 6 | Requirements 7 | ------------ 8 | 9 | Any pre-requisites that may not be covered by Ansible itself or the role should be mentioned here. For instance, if the role uses the EC2 module, it may be a good idea to mention in this section that the boto package is required. 10 | 11 | Role Variables 12 | -------------- 13 | 14 | A description of the settable variables for this role should go here, including any variables that are in defaults/main.yml, vars/main.yml, and any variables that can/should be set via parameters to the role. Any variables that are read from other roles and/or the global scope (ie. hostvars, group vars, etc.) should be mentioned here as well. 15 | 16 | Dependencies 17 | ------------ 18 | 19 | A list of other roles hosted on Galaxy should go here, plus any details in regards to parameters that may need to be set for other roles, or variables that are used from other roles. 20 | 21 | Example Playbook 22 | ---------------- 23 | 24 | Including an example of how to use your role (for instance, with variables passed in as parameters) is always nice for users too: 25 | 26 | - hosts: servers 27 | roles: 28 | - { role: username.rolename, x: 42 } 29 | 30 | License 31 | ------- 32 | 33 | BSD 34 | 35 | Author Information 36 | ------------------ 37 | 38 | An optional section for the role authors to include contact information, or a website (HTML is not allowed). 39 | -------------------------------------------------------------------------------- /roles/swapfile/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - name: Write swapfile 3 | command: | 4 | {% if swapfile_use_dd %} 5 | dd if=/dev/zero of={{ swapfile_location }} bs=1M count={{ swapfile_size }} creates={{ swapfile_location }} 6 | {% else %} 7 | fallocate -l {{ swapfile_size }} {{ swapfile_location }} creates={{ swapfile_location }} 8 | {% endif %} 9 | register: write_swapfile 10 | when: swapfile_size != false 11 | 12 | - name: Set swapfile permissions 13 | file: path={{ swapfile_location }} mode=600 14 | when: swapfile_size != false 15 | 16 | - name: Create swapfile 17 | command: mkswap {{ swapfile_location }} 18 | register: create_swapfile 19 | when: swapfile_size != false and write_swapfile.changed 20 | 21 | - name: Enable swapfile 22 | command: swapon {{ swapfile_location }} 23 | when: swapfile_size != false and create_swapfile.changed 24 | 25 | - name: Add swapfile to /etc/fstab 26 | lineinfile: dest=/etc/fstab line="{{ swapfile_location }} none swap sw 0 0" state=present 27 | when: swapfile_size != false 28 | 29 | - name: Configure vm.swappiness 30 | lineinfile: dest=/etc/sysctl.conf line="vm.swappiness = {{ swapfile_swappiness }}" regexp="^vm.swappiness[\s]?=" state=present 31 | notify: Reload sysctl 32 | when: swapfile_swappiness != false 33 | 34 | - name: Configure vm.vfs_cache_pressure 35 | lineinfile: dest=/etc/sysctl.conf line="vm.vfs_cache_pressure = {{ swapfile_vfs_cache_pressure }}" regexp="^vm.vfs_cache_pressure[\s]?=" state=present 36 | notify: Reload sysctl 37 | when: swapfile_vfs_cache_pressure != false 38 | -------------------------------------------------------------------------------- /roles/logrotate/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016-14, Nick Hammond 2 | All rights reserved. 
3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of ansiblebit nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /roles/letsencrypt/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - apt: update_cache=yes cache_valid_time=3600 3 | 4 | - name: Install depends 5 | apt: name={{ item }} state=present 6 | with_items: 7 | - python 8 | - python-dev 9 | - python-virtualenv 10 | - gcc 11 | - dialog 12 | - libaugeas0 13 | - libssl-dev 14 | - libffi-dev 15 | - ca-certificates 16 | - python-pip 17 | - git 18 | 19 | - name: Install virtualenv (Debian) 20 | apt: name={{ item }} state=present 21 | with_items: 22 | - virtualenv 23 | when: ansible_distribution == 'Debian' 24 | 25 | - name: Install python depends 26 | pip: virtualenv="{{ letsencrypt_venv }}" virtualenv_site_packages=no name={{ item }} state=latest 27 | with_items: 28 | - setuptools 29 | - pip 30 | 31 | - name: More python depends 32 | pip: virtualenv="{{ letsencrypt_venv }}" virtualenv_site_packages=no name=letsencrypt 33 | 34 | - name: Attempt to get the certificate using the webroot authenticator 35 | command: "{{ letsencrypt_command }} -a webroot --webroot-path {{ letsencrypt_webroot_path }} certonly" 36 | args: 37 | creates: "/etc/letsencrypt/live/{{ letsencrypt_cert_domains[0] }}" 38 | when: letsencrypt_authenticator == "webroot" 39 | ignore_errors: True 40 | 41 | - name: Attempt to get the certificate using the standalone authenticator (in case eg the webserver isn't running yet) 42 | command: "{{ letsencrypt_command }} -a standalone auth" 43 | args: 44 | creates: "/etc/letsencrypt/live/{{ letsencrypt_cert_domains[0] }}" 45 | -------------------------------------------------------------------------------- /app/models/crawl/capture.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Capture < Page::Url 2 | require 'rmagick' 3 | include Magick 4 | 5 | attr_accessor :relative_path 6 | 7 | PNG = '.png' 8 | JPG = '.jpg' 9 | 10 | def screen 11 | unless 
cloud.head relative_path 12 | check_temp_path 13 | get_png 14 | compress_png 15 | cloud.sync(relative_path, jpeg) 16 | delete_images 17 | end 18 | relative_path 19 | end 20 | 21 | def compress_png 22 | image.minify.write(jpg_file_path) do 23 | self.format = 'JPEG' 24 | end 25 | end 26 | 27 | def get_png 28 | headless = Headless.new 29 | headless.start 30 | driver = Selenium::WebDriver.for :firefox 31 | driver.navigate.to @url 32 | driver.save_screenshot(png_file_path) 33 | driver.close 34 | headless.destroy 35 | end 36 | 37 | def check_temp_path 38 | path = File.dirname temp_path 39 | FileUtils.mkdir_p(path) unless File.exist?(path) 40 | end 41 | 42 | def delete_images 43 | FileUtils.rm jpg_file_path 44 | FileUtils.rm png_file_path 45 | rescue Errno::ENOENT 46 | nil 47 | end 48 | 49 | def jpeg 50 | File.read jpg_file_path 51 | rescue Errno::ENOENT 52 | nil 53 | end 54 | 55 | def png_file_path 56 | temp_path + PNG 57 | end 58 | 59 | def jpg_file_path 60 | temp_path + JPG 61 | end 62 | 63 | def temp_path 64 | File.join(Rails.root, 'tmp/cache', md5) 65 | end 66 | 67 | def image 68 | @image ||= Image.read(png_file_path).first 69 | end 70 | 71 | def cloud 72 | @cloud ||= Cloud.new(name + '-screenshots') 73 | end 74 | end 75 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/checkParameters.yml: -------------------------------------------------------------------------------- 1 | # Check for mandatory parameters 2 | 3 | - fail: msg="es_instance_name must be specified and cannot be blank" 4 | when: es_instance_name is not defined or es_instance_name == '' 5 | 6 | - fail: msg="es_proxy_port must be specified and cannot be blank when es_proxy_host is defined" 7 | when: (es_proxy_port is not defined or es_proxy_port == '') and (es_proxy_host is defined and es_proxy_host != '') 8 | 9 | - set_fact: multi_cast={{ (es_version | version_compare('2.0', '<') and es_config['discovery.zen.ping.multicast.enabled'] is not defined) or (es_config['discovery.zen.ping.multicast.enabled'] is defined and es_config['discovery.zen.ping.multicast.enabled'])}} 10 | 11 | - debug: msg="WARNING - It is recommended you specify the parameter 'http.port' when multicast is disabled" 12 | when: not multi_cast and es_config['http.port'] is not defined 13 | 14 | - debug: msg="WARNING - It is recommended you specify the parameter 'transport.tcp.port' when multicast is disabled" 15 | when: not multi_cast and es_config['transport.tcp.port'] is not defined 16 | 17 | - debug: msg="WARNING - It is recommended you specify the parameter 'discovery.zen.ping.unicast.hosts' when multicast is disabled" 18 | when: not multi_cast and es_config['discovery.zen.ping.unicast.hosts'] is not defined 19 | 20 | #If the user attempts to lock memory they must specify a heap size 21 | - fail: msg="If locking memory with bootstrap.mlockall a heap size must be specified" 22 | when: es_config['bootstrap.mlockall'] is defined and es_config['bootstrap.mlockall'] == True and es_heap_size is not defined -------------------------------------------------------------------------------- /config/environments/development.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # Unlike the stock Rails development defaults, classes are cached and eager 5 | # loaded here (see below), so restart the server to pick up code changes. 6 | config.cache_classes = true 7 | 8 | # Eager load code on boot. 9 | config.eager_load = true 10 | 11 | # Show full error reports and enable caching. 12 | config.consider_all_requests_local = true 13 | config.action_controller.perform_caching = true 14 | # config.cache_store = :dalli_store 15 | 16 | # Don't care if the mailer can't send. 17 | config.action_mailer.raise_delivery_errors = false 18 | 19 | # Print deprecation notices to the Rails logger. 20 | config.active_support.deprecation = :log 21 | 22 | # Raise an error on page load if there are pending migrations. 23 | config.active_record.migration_error = :page_load 24 | 25 | # Debug mode disables concatenation and preprocessing of assets. 26 | # This option may cause significant delays in view rendering with a large 27 | # number of complex assets. 28 | config.assets.debug = true 29 | 30 | # Adds additional error checking when serving assets at runtime. 31 | # Checks for improperly declared sprockets dependencies. 32 | # Raises helpful error messages. 33 | config.assets.raise_runtime_errors = true 34 | 35 | # Raises error for missing translations 36 | # config.action_view.raise_on_missing_translations = true 37 | 38 | require 'sidekiq/testing/inline' 39 | end 40 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'bundler' 4 | 5 | gem 'rails' 6 | gem 'rack-health' 7 | gem 'responders' 8 | gem 'sqlite3' 9 | gem 'pry' 10 | 11 | gem 'mina', :require => false 12 | gem 'mina-sidekiq', :require => false 13 | gem 'mina-unicorn', :require => false 14 | gem 'sitemap_generator' 15 | 16 | gem 'oj' 17 | gem 'oj_mimic_json' 18 | 19 | gem 'awesome_print' 20 | gem 'progress' 21 | gem 'groupdate' 22 | 23 | gem 'nokogiri' 24 | gem 'mechanize' 25 | gem 'social_shares' 26 | gem 'user-agent-randomizer' 27 | 28 | gem 'selenium-webdriver' 29 | gem 'headless' 30 | gem 'rmagick' 31 | 32 | gem 'vcr' 33 | gem 'typhoeus' 34 | gem 'fog' 35 | 36 | gem 'redis-namespace' 37 | gem 'redis-objects' 38 | gem 'redis-rails' 39 | gem 'elasticsearch-model' 40 | 41 | # gem 'dalli' 42 | 43 | gem 'sidekiq' 44 | gem 'sidekiq-unique-jobs' 45 | gem 'sidekiq-limit_fetch' 46 | gem 'sidetiq' 47 | # gem 'sidekiq-statistic' 48 | gem 'sinatra', require: false 49 | 50 | gem 'google-search' 51 | gem 'wikipedia-client' 52 | 53 | group :doc do 54 | # bundle exec rake doc:rails generates the API under doc/api.
55 | gem 'sdoc', '~> 0.4.0' 56 | end 57 | 58 | group :development, :test do 59 | gem 'thin' 60 | end 61 | 62 | group :development do 63 | gem 'pry-rails' 64 | gem 'spring' 65 | gem 'better_errors' 66 | gem 'binding_of_caller' 67 | gem 'meta_request' 68 | gem 'quiet_assets' 69 | gem 'rails_layout' 70 | gem 'rubocop', require: false 71 | gem 'guard-test', require: false 72 | gem 'guard-livereload', require: false 73 | gem 'guard-rails', require: false 74 | gem 'guard-sidekiq', require: false 75 | gem 'rack-livereload' 76 | gem 'mock_redis' 77 | end 78 | 79 | group :production do 80 | gem 'unicorn' 81 | gem 'unicorn-worker-killer' 82 | end 83 | -------------------------------------------------------------------------------- /app/models/crawl/base.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Base < Page::Url 2 | require 'user_agent_randomizer' 3 | require 'timeout' 4 | 5 | def agent 6 | @agent ||= defaults 7 | end 8 | 9 | def get 10 | page = agent.get(url) 11 | 12 | return page if page.code == '200' 13 | 14 | if page.code == '301' || page.code == '302' 15 | page = agent.get(url.gsub('http://','https://')) 16 | 17 | return page if page.code == '200' 18 | end 19 | 20 | raise Mechanize::ResponseCodeError.new(page, 'Not 200') 21 | end 22 | 23 | def clear 24 | agent.shutdown 25 | end 26 | 27 | def post(params, headers = '') 28 | # TODO: change it back to cache_key when built 29 | VCR.use_cassette(File.join(cache_vcr, params.to_query + headers), record: :new_episodes) do 30 | # Rails.cache.fetch(build_path, params.to_query + headers) do 31 | @agent = defaults 32 | @agent.post(url, params, headers) 33 | end 34 | end 35 | 36 | private 37 | 38 | def get_with_vcr(record) 39 | # TODO: change it back to cache_key when built 40 | VCR.use_cassette(cache_vcr, record: record) do 41 | # Rails.cache.fetch(build_path) do 42 | @agent = defaults 43 | @agent.get(url) 44 | end 45 | end 46 | 47 | def cache_vcr 48 | File.join(host, date, md5) 49 | end 50 | 51 | def defaults 52 | agent = Mechanize.new 53 | agent.user_agent = UserAgentRandomizer::UserAgent.fetch(type: "desktop_browser").string 54 | agent.html_parser = Nokogiri::HTML 55 | agent.redirect_ok = false 56 | # agent.ssl_version = 'SSLv3' 57 | agent.open_timeout = 300 58 | agent.read_timeout = 300 59 | agent.idle_timeout = 300 60 | agent.max_history = 10 61 | agent.keep_alive = false 62 | agent 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /app/workers/crawler/scraper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Scraper < Crawler::Base 2 | sidekiq_options queue: :scraper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = nil) 9 | return if url.nil? 10 | 11 | if type.nil? 
12 | next_type 13 | else 14 | @type = type 15 | end 16 | 17 | @url = url 18 | 19 | Timeout::timeout(60) do 20 | parser.page = scraper.get 21 | end 22 | 23 | if scraping.presence 24 | scraping.each do |hash| 25 | if hash[:url].presence 26 | ('Crawler::' + next_type).constantize.perform_async hash[:url], hash 27 | else 28 | Recorder::Uploader.perform_async hash.merge(url: @url) 29 | end 30 | end 31 | else 32 | raise "Scraping not found" 33 | end 34 | 35 | paginate 36 | 37 | # upload 38 | rescue Mechanize::ResponseCodeError => e 39 | if e.response_code == '404' || 40 | e.response_code == '410' || 41 | e.response_code == '520' || 42 | e.response_code == '500' || 43 | e.response_code == '301' || 44 | e.response_code == '302' 45 | Mapper::UrlAvailability.perform_async url 46 | else 47 | raise 48 | end 49 | rescue Mechanize::RedirectLimitReachedError => e 50 | nil 51 | rescue Timeout::Error => e 52 | Crawler::Stretcher.perform_async url 53 | end 54 | 55 | def next_type 56 | @type ||= 'Scrimper' 57 | end 58 | 59 | def scraping 60 | @scraping ||= parser.scraping.compact 61 | end 62 | 63 | def paginate 64 | parser.paginate.each do |next_url| 65 | Crawler::Scraper.perform_async next_url 66 | end 67 | end 68 | end 69 | -------------------------------------------------------------------------------- /config/environments/test.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # The test environment is used exclusively to run your application's 5 | # test suite. You never need to work with it otherwise. Remember that 6 | # your test database is "scratch space" for the test suite and is wiped 7 | # and recreated between test runs. Don't rely on the data there! 8 | config.cache_classes = true 9 | 10 | # Do not eager load code on boot. This avoids loading your whole application 11 | # just for the purpose of running a single test. If you are using a tool that 12 | # preloads Rails for running tests, you may have to set it to true. 13 | config.eager_load = false 14 | 15 | # Configure static asset server for tests with Cache-Control for performance. 16 | config.serve_static_assets = true 17 | config.static_cache_control = 'public, max-age=3600' 18 | 19 | # Show full error reports and disable caching. 20 | config.consider_all_requests_local = true 21 | config.action_controller.perform_caching = false 22 | 23 | # Raise exceptions instead of rendering exception templates. 24 | config.action_dispatch.show_exceptions = false 25 | 26 | # Disable request forgery protection in test environment. 27 | config.action_controller.allow_forgery_protection = false 28 | 29 | # Tell Action Mailer not to deliver emails to the real world. 30 | # The :test delivery method accumulates sent emails in the 31 | # ActionMailer::Base.deliveries array. 32 | config.action_mailer.delivery_method = :test 33 | 34 | # Print deprecation notices to the stderr. 
35 | config.active_support.deprecation = :stderr 36 | 37 | # Raises error for missing translations 38 | # config.action_view.raise_on_missing_translations = true 39 | end 40 | -------------------------------------------------------------------------------- /production.yml: -------------------------------------------------------------------------------- 1 | --- 2 | - hosts: all 3 | user: ubuntu 4 | sudo: yes 5 | roles: 6 | - ubuntu-common 7 | - imagemagick 8 | - chruby 9 | - ruby-install 10 | - role: build-ruby 11 | version: 2.3.0 12 | - role: swapfile 13 | swapfile_size: 8GB 14 | swapfile_swappiness: 10 15 | swapfile_location: /mnt/swapfile 16 | # - role: letsencrypt 17 | # letsencrypt_webroot_path: /home/ubuntu/skynet/current/public 18 | # letsencrypt_email: bastosmichael@gmail.com 19 | # letsencrypt_cert_domains: 20 | # - api.pricenometry.com 21 | - role: nginx-unicorn 22 | nginx_sites: 23 | - name: 'production' 24 | server_name: 'localhost' 25 | root: '/home/ubuntu/skynet/current' 26 | listen: '0.0.0.0:80' 27 | access_log: 28 | format: 'combined' 29 | # ssl: 30 | # ssl_only: true 31 | # # sensitive_uris: 32 | # # - ^/sidekiq(.*) 33 | # certificate: /etc/letsencrypt/live/api.pricenometry.com/fullchain.pem 34 | # certificate_key: /etc/letsencrypt/live/api.pricenometry.com/privkey.pem 35 | - role: logrotate 36 | logrotate_scripts: 37 | - name: rails 38 | path: "/home/ubuntu/skynet/shared/log/*.log" 39 | options: 40 | - hourly 41 | - size 25M 42 | - missingok 43 | - compress 44 | - delaycompress 45 | - copytruncate 46 | - name: nginx 47 | path: /var/log/nginx/*.log 48 | options: 49 | - hourly 50 | - size 25M 51 | - rotate 7 52 | - missingok 53 | - compress 54 | - delaycompress 55 | - copytruncate 56 | scripts: 57 | postrotate: "[ -s /run/nginx.pid ] && kill -USR1 `cat /run/nginx.pid`" 58 | -------------------------------------------------------------------------------- /app/models/record/search.rb: -------------------------------------------------------------------------------- 1 | class Record::Search < Record::Match 2 | alias_method :search, :best 3 | 4 | def match_query 5 | [ 6 | { 7 | match: { 8 | name: @query_hash[:query] 9 | } 10 | }, 11 | # { 12 | # match: { 13 | # description: @query_hash[:query] 14 | # } 15 | # }, 16 | # { 17 | # match: { 18 | # url: @query_hash[:query] 19 | # } 20 | # }, 21 | # { 22 | # match: { 23 | # tags: @query_hash[:query] 24 | # } 25 | # }, 26 | # { 27 | # match: { 28 | # categories: @query_hash[:query] 29 | # } 30 | # }, 31 | # { 32 | # flt_field: { 33 | # name: { 34 | # like_text: @query_hash[:query], 35 | # analyzer: 'snowball', 36 | # fuzziness: 0.1, 37 | # boost: 5 38 | # } 39 | # } 40 | # }, 41 | # { 42 | # flt_field: { 43 | # description: { 44 | # like_text: @query_hash[:query], 45 | # analyzer: 'snowball', 46 | # fuzziness: 0.3 47 | # } 48 | # } 49 | # }, 50 | # { 51 | # flt_field: { 52 | # url: { 53 | # like_text: @query_hash[:query], 54 | # analyzer: 'snowball', 55 | # fuzziness: 0.5 56 | # } 57 | # } 58 | # }, 59 | # { 60 | # flt_field: { 61 | # tags: { 62 | # like_text: @query_hash[:query], 63 | # analyzer: 'snowball', 64 | # fuzziness: 0.7 65 | # } 66 | # } 67 | # }, 68 | # { 69 | # flt_field: { 70 | # categories: { 71 | # like_text: @query_hash[:query], 72 | # analyzer: 'snowball', 73 | # fuzziness: 0.9 74 | # } 75 | # } 76 | # }, 77 | ] 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /roles/logrotate/README.md: 
-------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/nickhammond/ansible-logrotate.svg?branch=master)](https://travis-ci.org/nickhammond/ansible-logrotate) 2 | 3 | Role Name 4 | ======== 5 | 6 | Installs logrotate and provides an easy way to setup additional logrotate scripts by specifying a list of directives. 7 | 8 | Requirements 9 | ------------ 10 | 11 | None 12 | 13 | Role Variables 14 | -------------- 15 | 16 | **logrotate_scripts**: A list of logrotate scripts and the directives to use for the rotation. 17 | 18 | * name - The name of the script that goes into /etc/logrotate.d/ 19 | * path - Path to point logrotate to for the log rotation 20 | * options - List of directives for logrotate, view the logrotate man page for specifics 21 | * scripts - Dict of scripts for logrotate (see Example below) 22 | 23 | ``` 24 | logrotate_scripts: 25 | - name: rails 26 | path: "/srv/current/log/*.log" 27 | options: 28 | - weekly 29 | - size 25M 30 | - missingok 31 | - compress 32 | - delaycompress 33 | - copytruncate 34 | ``` 35 | 36 | Dependencies 37 | ------------ 38 | 39 | None 40 | 41 | Example Playbook 42 | ------------------------- 43 | 44 | Setting up logrotate for additional Nginx logs, with postrotate script (assuming this role is located in `roles/logrotate`). 45 | 46 | ``` 47 | - role: logrotate 48 | logrotate_scripts: 49 | - name: nginx 50 | path: /var/log/nginx/*.log 51 | options: 52 | - weekly 53 | - size 25M 54 | - rotate 7 55 | - missingok 56 | - compress 57 | - delaycompress 58 | - copytruncate 59 | scripts: 60 | postrotate: "[ -s /run/nginx.pid ] && kill -USR1 `cat /run/nginx.pid`" 61 | ``` 62 | 63 | License 64 | ------- 65 | 66 | BSD 67 | 68 | Author Information 69 | ------------------ 70 | 71 | Find [Nick Hammond]( http://www.nickhammond.com ) on [Twitter](http://twitter.com/nickhammond). 
72 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/systemd/elasticsearch.j2: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Elasticsearch-{{es_instance_name}} 3 | Documentation=http://www.elastic.co 4 | Wants=network-online.target 5 | After=network-online.target 6 | 7 | [Service] 8 | Environment=ES_HOME={{es_home}} 9 | Environment=CONF_DIR={{conf_dir}} 10 | Environment=DATA_DIR={{ data_dirs | array_to_str }} 11 | Environment=LOG_DIR={{log_dir}} 12 | Environment=PID_DIR={{pid_dir}} 13 | EnvironmentFile=-{{instance_default_file}} 14 | 15 | User={{es_user}} 16 | Group={{es_group}} 17 | 18 | ExecStart={{es_home}}/bin/elasticsearch \ 19 | -Des.pidfile=${PID_DIR}/elasticsearch.pid \ 20 | -Des.default.path.home=${ES_HOME} \ 21 | -Des.default.path.logs=${LOG_DIR} \ 22 | -Des.default.path.data=${DATA_DIR} \ 23 | -Des.default.path.conf=${CONF_DIR} 24 | 25 | 26 | 27 | # Connects standard output to /dev/null 28 | StandardOutput=null 29 | 30 | # Connects standard error to journal 31 | StandardError=journal 32 | 33 | # Specifies the maximum file descriptor number that can be opened by this process 34 | LimitNOFILE={{es_max_open_files}} 35 | 36 | # Specifies the maximum number of bytes of memory that may be locked into RAM 37 | # Set to "infinity" if you use the 'bootstrap.mlockall: true' option 38 | # in elasticsearch.yml and 'MAX_LOCKED_MEMORY=unlimited' in {{instance_default_file}} 39 | {% if m_lock_enabled %} 40 | LimitMEMLOCK=infinity 41 | {% endif %} 42 | 43 | # Disable timeout logic and wait until process is stopped 44 | TimeoutStopSec=0 45 | 46 | # SIGTERM signal is used to stop the Java process 47 | KillSignal=SIGTERM 48 | 49 | # Java process is never killed 50 | SendSIGKILL=no 51 | 52 | # When a JVM receives a SIGTERM signal it exits with code 143 53 | SuccessExitStatus=143 54 | 55 | [Install] 56 | WantedBy=multi-user.target 57 | -------------------------------------------------------------------------------- /app/models/cloud.rb: -------------------------------------------------------------------------------- 1 | class Cloud 2 | MAX_KEYS = 100_000_000_000 3 | 4 | attr_accessor :bucket 5 | attr_accessor :provider 6 | 7 | # Plain Ruby object: ActiveRecord-style callbacks such as after_initialize 8 | # never fire here, so the default 'crawler' bucket is set in the constructor. 9 | def initialize(bucket_name = nil) 10 | self.bucket = bucket_name || 'crawler' 11 | end 12 | 13 | # Memoized Fog storage connection, built from the app's :fog settings. 14 | def storage 15 | @storage ||= Fog::Storage.new(Rails.configuration.config[:fog]) 16 | end 17 | 18 | # Fetches the bucket's directory, creating the bucket on first use. 19 | def container 20 | @container ||= storage.directories.get(bucket) 21 | create_container if @container.nil? 22 | @container 23 | end 24 | 25 | def files 26 | @files ||= update_files 27 | end 28 | 29 | def keys 30 | @keys ||= files.map(&:key) 31 | end 32 | 33 | # Providers truncate long listings, so page through with markers until done. 34 | def update_files 35 | files = container.files 36 | truncated = files.try(:is_truncated) 37 | while truncated 38 | ap "Collecting #{files.count} from #{self.bucket}..."
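# The last key seen becomes the marker for the next page of results.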
39 | bucket_object = container.files.all(marker: files.last.key) 40 | truncated = bucket_object.is_truncated 41 | files += bucket_object 42 | end 43 | files 44 | end 45 | 46 | def listing(prefix) 47 | @listing ||= container.files.all delimiter: '/', prefix: prefix 48 | end 49 | 50 | def head(key) 51 | container.files.head key 52 | end 53 | 54 | def get(key) 55 | container.files.get key 56 | end 57 | 58 | def get_url(key) 59 | container.files.get_https_url(key, 300) 60 | end 61 | 62 | def sync(key, data) 63 | if data 64 | copy key, data 65 | else 66 | head = head key 67 | head.try :destroy 68 | end 69 | end 70 | 71 | def copy(key, data) 72 | file = container.files.new key: key 73 | file.body = data 74 | file.save 75 | end 76 | 77 | def create_container 78 | @container = storage.directories.create(key: bucket, public: true) 79 | end 80 | 81 | def delete_all 82 | files.with_progress("Deleting files in #{bucket}").each { |k| k.try(:destroy) } 83 | end 84 | 85 | def count 86 | files.count 87 | end 88 | end 89 | -------------------------------------------------------------------------------- /app/workers/crawler/sitemapper.rb: -------------------------------------------------------------------------------- 1 | class Crawler::Sitemapper < Crawler::Base 2 | sidekiq_options queue: :sitemapper, 3 | retry: true, 4 | backtrace: true, 5 | unique: :until_and_while_executing, 6 | unique_expiration: 120 * 60 7 | 8 | def perform(url, type = 'Scrimper') 9 | return if url.nil? 10 | @url = url 11 | @type = type 12 | @name = Page::Url.new(url).name 13 | @container = Rails.configuration.config[:admin][:api_containers].find { |c| c.include?(@name) } 14 | 15 | get_xml 16 | 17 | sitemap.site_links.each do |u| 18 | check_page(u) 19 | end if sitemap.sites? 20 | 21 | sitemap.index_links.each do |u| 22 | get_sitemap u 23 | end if sitemap.indexes? 
24 | end 25 | 26 | def get_xml 27 | sitemap.xml = scraper.get 28 | scraper.clear 29 | end 30 | 31 | def check_page(url) 32 | if new_url = @name.capitalize.constantize.sanitize_url(url) 33 | if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: new_url.gsub('https://','http://') } } })['hits']['total'] == 0 34 | get_page(new_url) if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: new_url } } })['hits']['total'] == 0 35 | end 36 | end 37 | rescue NoMethodError => e 38 | if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: url.gsub('https://','http://') } } })['hits']['total'] == 0 39 | get_page(url) if Elasticsearch::Model.client.search(index: '_all', type: @container, body: { query: { match_phrase_prefix: { url: url } } })['hits']['total'] == 0 40 | end 41 | end 42 | 43 | def get_page(url) 44 | ('Crawler::' + @type).constantize.perform_async url 45 | end 46 | 47 | def get_sitemap(url) 48 | Crawler::Sitemapper.perform_async url, @type 49 | end 50 | 51 | def sitemap 52 | @sitemap ||= Crawl::Sitemap.new(@url) 53 | end 54 | end 55 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | guard 'livereload' do 2 | watch(%r{app/views/.+\.(erb|haml|slim)$}) 3 | watch(%r{app/helpers/.+\.rb}) 4 | watch(%r{public/.+\.(css|js|html)}) 5 | watch(%r{config/locales/.+\.yml}) 6 | # Rails Assets Pipeline 7 | watch(%r{(app|vendor)(/assets/\w+/(.+\.(css|js|html|png|jpg))).*}) { |m| "/assets/#{m[3]}" } 8 | end 9 | 10 | guard 'rails' do 11 | watch('Gemfile.lock') 12 | watch(%r{^(config|lib)/.*}) 13 | end 14 | 15 | ### Guard::Sidekiq 16 | # available options: 17 | # - :verbose 18 | # - :queue (defaults to "default") can be an array 19 | # - :concurrency (defaults to 1) 20 | # - :timeout 21 | # - :environment (corresponds to RAILS_ENV for the Sidekiq worker) 22 | 23 | guard 'sidekiq', concurrency: 1 do 24 | watch(%r{^app/workers/(.+)\.rb$}) 25 | watch(%r{^app/models/(.+)\.rb$}) 26 | watch(%r{^app/sites/(.+)\.rb$}) 27 | watch(%r{^app/helpers/(.+)\.rb$}) 28 | end 29 | 30 | guard :test do 31 | watch(%r{^test/.+_test\.rb$}) 32 | watch('test/test_helper.rb') { 'test' } 33 | 34 | # Non-rails 35 | watch(%r{^lib/(.+)\.rb$}) { |m| "test/#{m[1]}_test.rb" } 36 | 37 | # Rails 4 38 | # watch(%r{^app/(.+)\.rb}) { |m| "test/#{m[1]}_test.rb" } 39 | # watch(%r{^app/controllers/application_controller\.rb}) { 'test/controllers' } 40 | # watch(%r{^app/controllers/(.+)_controller\.rb}) { |m| "test/integration/#{m[1]}_test.rb" } 41 | # watch(%r{^app/views/(.+)_mailer/.+}) { |m| "test/mailers/#{m[1]}_mailer_test.rb" } 42 | # watch(%r{^lib/(.+)\.rb}) { |m| "test/lib/#{m[1]}_test.rb" } 43 | 44 | # Rails < 4 45 | # watch(%r{^app/models/(.+)\.rb$}) { |m| "test/unit/#{m[1]}_test.rb" } 46 | # watch(%r{^app/controllers/(.+)\.rb$}) { |m| "test/functional/#{m[1]}_test.rb" } 47 | # watch(%r{^app/views/(.+)/.+\.erb$}) { |m| "test/functional/#{m[1]}_controller_test.rb" } 48 | # watch(%r{^app/views/.+$}) { 'test/integration' } 49 | # watch('app/controllers/application_controller.rb') { ['test/functional', 'test/integration'] } 50 | end 51 | -------------------------------------------------------------------------------- /roles/letsencrypt/README.md: -------------------------------------------------------------------------------- 1 | # 
ansible-letsencrypt 2 | An ansible role to generate TLS certificates and get them signed by Let's Encrypt. 3 | 4 | It first attempts to use the `webroot` authenticator; if that fails to create certificates, 5 | it falls back to the standalone authenticator. This is handy for generating certs on a fresh machine before 6 | the web server has been configured or even installed. 7 | 8 | I've tested this on a couple of Debian Jessie boxes with nginx; if you test it on other things, please let me know 9 | the results (positive or otherwise) so I can document them here or fix the issue. 10 | 11 | # Usage 12 | First, read Let's Encrypt's TOS and EULA. Only proceed if you agree to them. 13 | 14 | The following variables are available: 15 | 16 | `letsencrypt_webroot_path` is the root path that gets served by your web server. Defaults to `/var/www`. 17 | 18 | `letsencrypt_email` needs to be set to your email address. Let's Encrypt wants it. Defaults to `webmaster@{{ ansible_fqdn }}`. 19 | 20 | `letsencrypt_cert_domains` is a list of domains you wish to get a certificate for. It defaults to a single item with the value of `{{ ansible_fqdn }}`. 21 | 22 | `letsencrypt_install_directory` should probably be left alone, but if you set it, it will change where the letsencrypt program is installed. 23 | 24 | `letsencrypt_server` sets the auth server. Set to `https://acme-staging.api.letsencrypt.org/directory` to use the staging server (far higher rate limits, but certs are not trusted, intended for testing). 25 | 26 | The [Let's Encrypt client](https://github.com/letsencrypt/letsencrypt) will put the certificate and accessories in `/etc/letsencrypt/live/<domain>/`. For more info, see the [Let's Encrypt documentation](https://letsencrypt.readthedocs.org/en/latest/using.html#where-are-my-certificates). 27 | 28 | # Example Playbook 29 | ``` 30 | --- 31 | - hosts: tls_servers 32 | user: root 33 | roles: 34 | - role: letsencrypt 35 | letsencrypt_webroot_path: /var/www/html 36 | letsencrypt_email: user@example.net 37 | letsencrypt_cert_domains: 38 | - www.example.net 39 | - example.net 40 | ``` 41 | -------------------------------------------------------------------------------- /app/models/page/parse.rb: -------------------------------------------------------------------------------- 1 | class Page::Parse < Page::Base 2 | include PageHelper 3 | include OpenGraphHelper 4 | include SchemaOrgHelper 5 | 6 | def build 7 | parent_build 8 | methods.grep(/find_/).each { |parse| send(parse) } if @type 9 | end 10 | 11 | def parent_build 12 | build_open_graph 13 | build_schema 14 | build_page 15 | end 16 | 17 | def self.sanitize_url url 18 | return url 19 | end 20 | 21 | def paginate 22 | parser.css('.next').map {|n| n[:href]}.compact.uniq 23 | end 24 | 25 | def scraping 26 | [] 27 | end 28 | 29 | def screenshot 30 | @screenshot ||= File.join(@id, date) + '.jpg' 31 | end 32 | 33 | def remove_extras(symbol) 34 | remove_instance_variable(symbol) rescue nil 35 | end 36 | 37 | def save 38 | remove_instance_variable(:@page) 39 | remove_instance_variable(:@links) if @links 40 | remove_instance_variable(:@internal_links) if @internal_links 41 | remove_instance_variable(:@external_links) if @external_links 42 | remove_instance_variable(:@uri) rescue nil 43 | hash = {} 44 | instance_variables.each do |var| 45 | value = instance_variable_get(var) 46 | hash[var.to_s.delete('@')] = value unless value.blank?
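# Blank values are skipped so the uploaded hash stays compact.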
47 | end 48 | hash 49 | end 50 | 51 | def links 52 | @links ||= page.links.map do |link| 53 | remove_hash_bangs(clean_up_link(link.href)) 54 | end.compact.uniq 55 | end 56 | 57 | def internal_links 58 | @internal_links ||= links.map { |link| link if internal? link }.compact 59 | end 60 | 61 | def external_links 62 | @external_links ||= links.map { |link| link unless internal? link }.compact 63 | end 64 | 65 | def clean_up_link(link) 66 | link_uri = URI.parse(link) 67 | if link_uri.scheme.nil? && link_uri.host.nil? 68 | link = (base + link) 69 | else 70 | link 71 | end 72 | rescue 73 | nil 74 | end 75 | 76 | def remove_hash_bangs(link) 77 | return if link.nil? 78 | if hash_bang = link.match(/(.+?)\#/) 79 | hash_bang[1] 80 | else 81 | link 82 | end 83 | end 84 | 85 | def internal?(link) 86 | get_host_without_www(URI.parse(link)) == host 87 | rescue 88 | nil 89 | end 90 | end 91 | -------------------------------------------------------------------------------- /app/helpers/schema_org_helper.rb: -------------------------------------------------------------------------------- 1 | module SchemaOrgHelper 2 | def build_schema 3 | # schema = @page.doc.css('//*[contains(@itemtype, "schema.org")]').first["itemtype"] 4 | @schema_org = false 5 | methods.grep(/schema_org/).each do |schema| 6 | send(schema) rescue nil 7 | end 8 | @schema_org = true if @type 9 | end 10 | 11 | ############################################################### 12 | # Types that have multiple parents are expanded out only once 13 | # and have an asterisk 14 | ############################################################### 15 | 16 | def schema_org_type 17 | @type = cleanup_value page.body.match(/itemtype="http:\/\/schema.org\/(.+?)"/)[1] 18 | end 19 | 20 | ############################################################### 21 | # Grab Meta Data for Schema and assign instance variable 22 | ############################################################### 23 | 24 | def schema_org_meta 25 | parser.css('//meta').each do |m| 26 | unless m[:itemprop].nil? 27 | key = cleanup_key m[:itemprop] 28 | value = cleanup_value m[:content] 29 | instance_variable_set("@#{key}", value) 30 | end 31 | end 32 | end 33 | 34 | ############################################################### 35 | # Grab Span Data for Schema and assign instance variable 36 | ############################################################### 37 | 38 | def schema_org_span 39 | parser.css('//span').each do |m| 40 | unless m[:itemprop].nil? 41 | key = cleanup_key m[:itemprop] 42 | value = cleanup_value m.text 43 | instance_variable_set("@#{key}", value) 44 | end 45 | end 46 | end 47 | 48 | ############################################################### 49 | # Grabbing Keywords as Tags 50 | ############################################################### 51 | 52 | def schema_org_tags 53 | tags = parser.css("meta[@name='keywords']").first['content'].split(/ |,/) 54 | tags.delete_if { |x| x.match(/and|for|more/) || x.squish.blank? 
} 55 | @tags = tags.reject(&:empty?).uniq 56 | end 57 | 58 | def cleanup_key(key) 59 | key.tr(' ', '_') 60 | end 61 | 62 | def cleanup_value(value) 63 | value.try(:squish) 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/helpers/serverspec/standard_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | shared_examples 'standard::init' do |es_version| 4 | 5 | describe user('elasticsearch') do 6 | it { should exist } 7 | end 8 | 9 | describe service('node1_elasticsearch') do 10 | it { should be_running } 11 | end 12 | 13 | describe package('elasticsearch') do 14 | it { should be_installed } 15 | end 16 | 17 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 18 | it { should be_file } 19 | it { should be_owned_by 'elasticsearch' } 20 | end 21 | 22 | describe file('/etc/elasticsearch/node1/logging.yml') do 23 | it { should be_file } 24 | it { should be_owned_by 'elasticsearch' } 25 | end 26 | 27 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 28 | it { should contain 'node.name: localhost-node1' } 29 | it { should contain 'cluster.name: elasticsearch' } 30 | it { should contain 'path.conf: /etc/elasticsearch/node1' } 31 | it { should contain 'path.data: /var/lib/elasticsearch/localhost-node1' } 32 | it { should contain 'path.work: /tmp/elasticsearch/localhost-node1' } 33 | it { should contain 'path.logs: /var/log/elasticsearch/localhost-node1' } 34 | end 35 | 36 | describe 'Node listening' do 37 | it 'listens on port 9200' do 38 | expect(port 9200).to be_listening 39 | end 40 | end 41 | 42 | describe 'version check' do 43 | it 'should be reported as version '+es_version do 44 | command = command('curl -s localhost:9200 | grep number') 45 | expect(command.stdout).to match(es_version) 46 | expect(command.exit_status).to eq(0) 47 | end 48 | end 49 | 50 | describe file('/etc/init.d/elasticsearch') do 51 | it { should_not exist } 52 | end 53 | 54 | describe file('/etc/default/elasticsearch') do 55 | it { should_not exist } 56 | end 57 | 58 | describe file('/etc/sysconfig/elasticsearch') do 59 | it { should_not exist } 60 | end 61 | 62 | describe file('/usr/lib/systemd/system/elasticsearch.service') do 63 | it { should_not exist } 64 | end 65 | 66 | describe file('/etc/elasticsearch/elasticsearch.yml') do 67 | it { should_not exist } 68 | end 69 | 70 | describe file('/etc/elasticsearch/logging.yml') do 71 | it { should_not exist } 72 | end 73 | 74 | end 75 | 76 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/logging.yml.j2: -------------------------------------------------------------------------------- 1 | # you can override this by setting a system property, for example -Des.logger.level=DEBUG 2 | es.logger.level: INFO 3 | rootLogger: ${es.logger.level}, console, file 4 | logger: 5 | # log action execution errors for easier debugging 6 | action: DEBUG 7 | # reduce the logging for aws, too much is logged under the default INFO 8 | com.amazonaws: WARN 9 | org.apache.http: INFO 10 | 11 | # gateway 12 | #gateway: DEBUG 13 | #index.gateway: DEBUG 14 | 15 | # peer shard recovery 16 | #indices.recovery: DEBUG 17 | 18 | # discovery 19 | #discovery: TRACE 20 | 21 | index.search.slowlog: TRACE, index_search_slow_log_file 22 | index.indexing.slowlog: TRACE, index_indexing_slow_log_file 23 | 24 | additivity: 25 | index.search.slowlog: false 26 |
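# additivity: false keeps slowlog entries from also being duplicated into the rootLogger's console and file appenders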
index.indexing.slowlog: false 27 | 28 | appender: 29 | console: 30 | type: console 31 | layout: 32 | type: consolePattern 33 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 34 | 35 | file: 36 | type: dailyRollingFile 37 | file: ${path.logs}/${cluster.name}.log 38 | datePattern: "'.'yyyy-MM-dd" 39 | layout: 40 | type: pattern 41 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 42 | 43 | # Use the following log4j-extras RollingFileAppender to enable gzip compression of log files. 44 | # For more information see https://logging.apache.org/log4j/extras/apidocs/org/apache/log4j/rolling/RollingFileAppender.html 45 | #file: 46 | #type: extrasRollingFile 47 | #file: ${path.logs}/${cluster.name}.log 48 | #rollingPolicy: timeBased 49 | #rollingPolicy.FileNamePattern: ${path.logs}/${cluster.name}.log.%d{yyyy-MM-dd}.gz 50 | #layout: 51 | #type: pattern 52 | #conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 53 | 54 | index_search_slow_log_file: 55 | type: dailyRollingFile 56 | file: ${path.logs}/${cluster.name}_index_search_slowlog.log 57 | datePattern: "'.'yyyy-MM-dd" 58 | layout: 59 | type: pattern 60 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" 61 | 62 | index_indexing_slow_log_file: 63 | type: dailyRollingFile 64 | file: ${path.logs}/${cluster.name}_index_indexing_slowlog.log 65 | datePattern: "'.'yyyy-MM-dd" 66 | layout: 67 | type: pattern 68 | conversionPattern: "[%d{ISO8601}][%-5p][%-25c] %m%n" -------------------------------------------------------------------------------- /app/workers/mapper/indexer.rb: -------------------------------------------------------------------------------- 1 | class Mapper::Indexer < Mapper::Base 2 | def perform(container, id, hash = nil) 3 | @container = container 4 | types = container.split('-').last.pluralize.gsub(':', '') 5 | hash = record(id).data if hash.nil? 6 | index = Rails.env + '-' + types 7 | new_hash = {} 8 | 9 | # hash.each do |k, v| 10 | hash.each do |k, v| 11 | unless Record::Upload::EXCLUDE.include? k.to_sym 12 | if v.is_a?(Hash) 13 | value = v.values.last 14 | 15 | if value.is_a?(Array) || !!value == value 16 | new_hash[k] = value 17 | elsif value.to_i.to_s == value.to_s 18 | new_hash[k] = value.to_i 19 | elsif (Float(value) rescue false) 20 | new_hash[k] = value.to_f 21 | new_hash[k] = value if new_hash[k].infinite? 22 | else 23 | new_hash[k] = value 24 | end 25 | 26 | new_hash[k + '_history'] = v.count if v.count > 1 27 | elsif !!v == v # Check if Boolean 28 | new_hash[k] = v 29 | elsif v.is_a?(Array) 30 | new_hash[k] = v.map {|value| value.encode(Encoding.find('UTF-8'), {invalid: :replace, undef: :replace, replace: ''}) } 31 | else 32 | new_hash[k] = v.encode(Encoding.find('UTF-8'), {invalid: :replace, undef: :replace, replace: ''}) 33 | end 34 | end 35 | end 36 | 37 | # Delete bad keys from search... 38 | if bad_ids = Elasticsearch::Model.client.search(index: index, type: @container, body: { query: { match_phrase_prefix: { url: new_hash['url'] } } })['hits']['hits'].select do |hit| 39 | hit['_id'] != id 40 | end 41 | bad_ids.each do |bad_id| 42 | record(bad_id['_id']).delete 43 | Elasticsearch::Model.client.delete index: index, type: @container, id: bad_id['_id'] 44 | end unless bad_ids.empty? 
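# at this point any stale duplicates of this URL have been purged from both the record store and the search index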
45 | end 46 | 47 | Elasticsearch::Model.client.index index: index, type: container, id: id, body: new_hash.sort.to_h 48 | 49 | Elasticsearch::Model.client.indices.refresh index: index 50 | # rescue Elasticsearch::Transport::Transport::Errors::BadRequest => e 51 | # # rescue Elasticsearch::Transport::Transport::Errors::NotFound 52 | # record(id).delete 53 | # Crawler::Scrimper.perform_async new_hash['url'] if new_hash['url'] 54 | rescue Elasticsearch::Transport::Transport::Errors::NotFound => e 55 | nil 56 | end 57 | end 58 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-plugins.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | #es_plugins_reinstall is set to true if elasticsearch_install.changed (i.e. the ES version has changed) or if no plugins are listed. Otherwise it is false and must be set explicitly. 4 | - set_fact: es_plugins_reinstall=true 5 | when: (elasticsearch_install is defined and elasticsearch_install.changed) or es_plugins is not defined or es_plugins is none 6 | tags: 7 | - always 8 | 9 | - set_fact: list_command="list" 10 | tags: 11 | - always 12 | - set_fact: list_command="--list" 13 | when: es_version | version_compare('2.0', '<') 14 | tags: 15 | - always 16 | 17 | #List currently installed plugins 18 | - shell: "{{es_home}}/bin/plugin {{list_command}} | sed -n '1!p' | cut -d '-' -f2-" 19 | register: installed_plugins 20 | changed_when: False 21 | environment: 22 | CONF_DIR: "{{ conf_dir }}" 23 | ES_INCLUDE: "{{ instance_default_file }}" 24 | 25 | #This removes any currently installed plugins 26 | - name: Remove elasticsearch plugins 27 | command: "{{es_home}}/bin/plugin remove {{item}} --silent" 28 | ignore_errors: yes 29 | with_items: "{{ installed_plugins.stdout_lines }}" 30 | when: es_plugins_reinstall and installed_plugins.stdout_lines | length > 0 and not 'No plugin detected' in installed_plugins.stdout_lines[0] 31 | notify: restart elasticsearch 32 | environment: 33 | CONF_DIR: "{{ conf_dir }}" 34 | ES_INCLUDE: "{{ instance_default_file }}" 35 | 36 | - name: Install elasticsearch plugins 37 | #debug: var=item 38 | command: > 39 | {{es_home}}/bin/plugin install {{ item.plugin }}{% if item.version is defined and item.version != '' %}/{{ item.version }}{% endif %} {% if item.proxy_host is defined and item.proxy_host != '' and item.proxy_port is defined and item.proxy_port != ''%} -DproxyHost={{ item.proxy_host }} -DproxyPort={{ item.proxy_port }} {% elif es_proxy_host is defined and es_proxy_host != '' %} -DproxyHost={{ es_proxy_host }} -DproxyPort={{ es_proxy_port }} {% endif %} --silent 40 | register: plugin_installed 41 | failed_when: "'ERROR' in plugin_installed.stdout" 42 | changed_when: plugin_installed.rc == 0 43 | with_items: "{{ es_plugins }}" 44 | when: es_plugins is defined and not es_plugins is none 45 | notify: restart elasticsearch 46 | environment: 47 | CONF_DIR: "{{ conf_dir }}" 48 | ES_INCLUDE: "{{ instance_default_file }}" 49 | 50 | #Set permissions on plugins directory 51 | - name: Set Plugin Directory Permissions 52 | file: state=directory path={{ plugin_dir }} owner={{ es_user }} group={{ es_group }} recurse=yes 53 | -------------------------------------------------------------------------------- /config/sidekiq.yml.example: -------------------------------------------------------------------------------- 1 | --- 2 | :concurrency: 1 3 | :pidfile: tmp/pids/sidekiq.pid 4 | :queues: 5 | - [socializer, 1200000]
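# weighted queues: Sidekiq polls higher-weighted queues proportionally more often, so socializer/recorder/slider work is picked up first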
6 | - [recorder, 1100000] 7 | - [slider, 1000000] 8 | - [scrimper_one, 90000] 9 | - [scrimper_two, 90000] 10 | - [scrimper_three, 90000] 11 | - [scrimper_four, 90000] 12 | - [scrimper_five, 90000] 13 | - [scrimper_six, 90000] 14 | - [scrimper_seven, 90000] 15 | - [scrimper_eight, 90000] 16 | - [scrimper_nine, 90000] 17 | - [scrimper_ten, 90000] 18 | - [scrimper, 80000] 19 | - [spider_one, 70000] 20 | - [spider_two, 70000] 21 | - [spider_three, 70000] 22 | - [spider_four, 70000] 23 | - [spider_five, 70000] 24 | - [spider_six, 70000] 25 | - [spider_seven, 70000] 26 | - [spider_eight, 70000] 27 | - [spider_nine, 70000] 28 | - [spider_ten, 70000] 29 | - [spider, 60000] 30 | - [scraper_one, 50000] 31 | - [scraper_two, 50000] 32 | - [scraper_three, 50000] 33 | - [scraper_four, 50000] 34 | - [scraper_five, 50000] 35 | - [scraper_six, 50000] 36 | - [scraper_seven, 50000] 37 | - [scraper_eight, 50000] 38 | - [scraper_nine, 50000] 39 | - [scraper_ten, 50000] 40 | - [scraper, 40000] 41 | - [sampler_one, 30000] 42 | - [sampler_two, 30000] 43 | - [sampler_three, 30000] 44 | - [sampler_four, 30000] 45 | - [sampler_five, 30000] 46 | - [sampler_six, 30000] 47 | - [sampler_seven, 30000] 48 | - [sampler_eight, 30000] 49 | - [sampler_nine, 30000] 50 | - [sampler_ten, 30000] 51 | - [sampler, 20000] 52 | - [stretcher, 1] 53 | :process_limits: 54 | stretcher: 1 55 | slider: 1 56 | socializer: 1 57 | scrimper_one: 1 58 | scrimper_two: 1 59 | scrimper_three: 1 60 | scrimper_four: 1 61 | scrimper_five: 1 62 | scrimper_six: 1 63 | scrimper_seven: 1 64 | scrimper_eight: 1 65 | scrimper_nine: 1 66 | scrimper_ten: 1 67 | scrimper: 1 68 | spider_one: 1 69 | spider_two: 1 70 | spider_three: 1 71 | spider_four: 1 72 | spider_five: 1 73 | spider_six: 1 74 | spider_seven: 1 75 | spider_eight: 1 76 | spider_nine: 1 77 | spider_ten: 1 78 | spider: 1 79 | scraper_one: 1 80 | scraper_two: 1 81 | scraper_three: 1 82 | scraper_four: 1 83 | scraper_five: 1 84 | scraper_six: 1 85 | scraper_seven: 1 86 | scraper_eight: 1 87 | scraper_nine: 1 88 | scraper_ten: 1 89 | scraper: 1 90 | sampler_one: 1 91 | sampler_two: 1 92 | sampler_three: 1 93 | sampler_four: 1 94 | sampler_five: 1 95 | sampler_six: 1 96 | sampler_seven: 1 97 | sampler_eight: 1 98 | sampler_nine: 1 99 | sampler_ten: 1 100 | sampler: 1 101 | -------------------------------------------------------------------------------- /app/models/record/trends.rb: -------------------------------------------------------------------------------- 1 | class Record::Trends < Record::Match 2 | def sort(query_array = ['date'], options = { crawl: true, social: true, results: 10, page: 1, fix: false }) 3 | @options = options 4 | @query_array = query_array 5 | 6 | if !@container.nil? && !@container.include?(Rails.env) 7 | types = @container.split('-').last.pluralize.gsub(':', '') 8 | @index = [ Rails.env + '-' + types ] 9 | elsif @container.nil? 10 | @index = Rails.configuration.config[:admin][:api_containers].map { |c| Rails.env + '-' + c.split('-').last.pluralize.gsub(':', '') }.uniq 11 | end 12 | 13 | @options = options 14 | sanitize_results 15 | end 16 | 17 | def elasticsearch_results 18 | @elasticsearch_results ||= Elasticsearch::Model.client.search(index: @index, type: @container, body: query).deep_symbolize_keys! 
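# memoized: total and sanitize_results both read this, so a single search round-trip serves the whole request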
19 | end 20 | 21 | def total 22 | ((elasticsearch_results[:hits][:total] || 0) / limit_results.to_f).ceil 23 | end 24 | 25 | def sanitize_results 26 | elasticsearch_results[:hits][:hits].map do |e| 27 | delete_bad_data e[:_source][:url] if @options[:fix] 28 | recrawl(e[:_source][:url], @options) if e[:_source][:url] 29 | 30 | new_data = { id: e[:_id], 31 | container: e[:_type], 32 | score: e[:_score], 33 | available: true, 34 | history: {}, 35 | social: {}, 36 | price: {} 37 | } 38 | 39 | e[:_source].each do |k,v| 40 | if k.to_s.include?('_history') 41 | new_data[:history][k.to_s.gsub('_history','')] = v 42 | elsif k.to_s.include?('facebook') || k.to_s.include?('_shares') 43 | new_data[:social][k] = v 44 | elsif k.to_s.include?('price') 45 | new_data[:price][k] = v 46 | else 47 | new_data[k] = v 48 | end 49 | end 50 | new_data 51 | end 52 | end 53 | 54 | def query 55 | @query = 56 | { 57 | filter: { 58 | match_all: { } 59 | }, 60 | sort: sort_query, 61 | size: limit_results, 62 | from: from_page 63 | } 64 | end 65 | 66 | def from_page 67 | ((@options[:page].try(:to_i) || 1) - 1) * limit_results 68 | end 69 | 70 | def sort_query 71 | @query_array.map do |n| 72 | { 73 | n => { 74 | order: "desc" 75 | } 76 | } 77 | end 78 | end 79 | 80 | def limit_results 81 | if !@options[:results] 82 | 10 83 | elsif @options[:results] > 25 84 | 25 85 | else 86 | @options[:results] 87 | end 88 | end 89 | 90 | def delete_bad_data url 91 | Mapper::UrlAvailability.perform_async url 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /app/models/record/match.rb: -------------------------------------------------------------------------------- 1 | class Record::Match < Record::Base 2 | def best(query_hash = {}, options = { crawl: true, social: false, results: 1, page: 1 }) 3 | @options = options 4 | @query_hash = query_hash.delete_if { |_k, v| v.nil? || v.blank? } 5 | 6 | if !@container.nil? && !@container.include?(Rails.env) 7 | types = container.split('-').last.pluralize.gsub(':', '') 8 | @index = [ Rails.env + '-' + types ] 9 | elsif @container.nil? 10 | @container = Rails.configuration.config[:admin][:api_containers] 11 | @index = @container.map { |c| Rails.env + '-' + c.split('-').last.pluralize.gsub(':', '') }.uniq 12 | end 13 | 14 | @options = options 15 | sanitize_results 16 | end 17 | 18 | def elasticsearch_results 19 | @elasticsearch_results ||= Elasticsearch::Model.client.search(index: @index, type: @container, body: query).deep_symbolize_keys! 
20 | end 21 | 22 | def total 23 | ((elasticsearch_results[:hits][:total] || 0) / limit_results.to_f).ceil 24 | end 25 | 26 | def sanitize_results 27 | elasticsearch_results[:hits][:hits].map do |e| 28 | recrawl(e[:_source][:url], @options) if e[:_source][:url] 29 | 30 | new_data = { id: e[:_id], 31 | container: e[:_type], 32 | score: e[:_score], 33 | available: true, 34 | history: {}, 35 | social: {}, 36 | price: {} 37 | } 38 | 39 | e[:_source].each do |k,v| 40 | if k.to_s.include?('_history') 41 | new_data[:history][k.to_s.gsub('_history','')] = v 42 | elsif k.to_s.include?('facebook') || k.to_s.include?('_shares') 43 | new_data[:social][k] = v 44 | elsif k.to_s.include?('price') 45 | new_data[:price][k] = v 46 | else 47 | new_data[k] = v 48 | end 49 | end 50 | new_data 51 | end 52 | end 53 | 54 | def query 55 | @query = { 56 | query: { 57 | bool: { 58 | must_not: { 59 | term: { 60 | available: false 61 | } 62 | }, 63 | should: match_query 64 | } 65 | }, 66 | size: limit_results, 67 | from: from_page 68 | } 69 | end 70 | 71 | def match_query 72 | @query_hash.map do |k, v| 73 | { 74 | match: { 75 | k => v 76 | } 77 | } 78 | end 79 | end 80 | 81 | def from_page 82 | ((@options[:page].try(:to_i) || 1) - 1) * limit_results 83 | end 84 | 85 | def limit_results 86 | if !@options[:results] 87 | 1 88 | elsif @options[:results] > 25 89 | 25 90 | else 91 | @options[:results] 92 | end 93 | end 94 | end 95 | -------------------------------------------------------------------------------- /roles/build-ruby/meta/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | galaxy_info: 3 | author: your name 4 | description: 5 | company: your company (optional) 6 | # Some suggested licenses: 7 | # - BSD (default) 8 | # - MIT 9 | # - GPLv2 10 | # - GPLv3 11 | # - Apache 12 | # - CC-BY 13 | license: license (GPLv2, CC-BY, etc) 14 | min_ansible_version: 1.2 15 | # 16 | # Below are all platforms currently available. Just uncomment 17 | # the ones that apply to your role. If you don't see your 18 | # platform on this list, let us know and we'll get it added! 19 | # 20 | #platforms: 21 | #- name: EL 22 | # versions: 23 | # - all 24 | # - 5 25 | # - 6 26 | # - 7 27 | #- name: GenericUNIX 28 | # versions: 29 | # - all 30 | # - any 31 | #- name: Fedora 32 | # versions: 33 | # - all 34 | # - 16 35 | # - 17 36 | # - 18 37 | # - 19 38 | # - 20 39 | #- name: opensuse 40 | # versions: 41 | # - all 42 | # - 12.1 43 | # - 12.2 44 | # - 12.3 45 | # - 13.1 46 | # - 13.2 47 | #- name: Amazon 48 | # versions: 49 | # - all 50 | # - 2013.03 51 | # - 2013.09 52 | #- name: GenericBSD 53 | # versions: 54 | # - all 55 | # - any 56 | #- name: FreeBSD 57 | # versions: 58 | # - all 59 | # - 8.0 60 | # - 8.1 61 | # - 8.2 62 | # - 8.3 63 | # - 8.4 64 | # - 9.0 65 | # - 9.1 66 | # - 9.1 67 | # - 9.2 68 | #- name: Ubuntu 69 | # versions: 70 | # - all 71 | # - lucid 72 | # - maverick 73 | # - natty 74 | # - oneiric 75 | # - precise 76 | # - quantal 77 | # - raring 78 | # - saucy 79 | # - trusty 80 | #- name: SLES 81 | # versions: 82 | # - all 83 | # - 10SP3 84 | # - 10SP4 85 | # - 11 86 | # - 11SP1 87 | # - 11SP2 88 | # - 11SP3 89 | #- name: GenericLinux 90 | # versions: 91 | # - all 92 | # - any 93 | #- name: Debian 94 | # versions: 95 | # - all 96 | # - etch 97 | # - lenny 98 | # - squeeze 99 | # - wheezy 100 | # 101 | # Below are all categories currently available. Just as with 102 | # the platforms above, uncomment those that apply to your role. 
103 | # 104 | #categories: 105 | #- cloud 106 | #- cloud:ec2 107 | #- cloud:gce 108 | #- cloud:rax 109 | #- clustering 110 | #- database 111 | #- database:nosql 112 | #- database:sql 113 | #- development 114 | #- monitoring 115 | #- networking 116 | #- packaging 117 | #- system 118 | #- web 119 | dependencies: [] 120 | # List your role dependencies here, one per line. Only 121 | # dependencies available via galaxy should be listed here. 122 | # Be sure to remove the '[]' above if you add dependencies 123 | # to this list. 124 | 125 | -------------------------------------------------------------------------------- /roles/elasticsearch/templates/elasticsearch.j2: -------------------------------------------------------------------------------- 1 | ################################ 2 | # Elasticsearch 3 | ################################ 4 | 5 | # Elasticsearch home directory 6 | ES_HOME={{es_home}} 7 | 8 | # Elasticsearch configuration directory 9 | CONF_DIR={{conf_dir}} 10 | 11 | # Elasticsearch data directory 12 | DATA_DIR={{ data_dirs | array_to_str }} 13 | 14 | # Elasticsearch logs directory 15 | LOG_DIR={{log_dir}} 16 | 17 | # Elasticsearch work directory 18 | WORK_DIR={{work_dir}} 19 | 20 | # Elasticsearch PID directory 21 | PID_DIR={{pid_dir}} 22 | 23 | # Heap size defaults to 256m min, 1g max 24 | # Set ES_HEAP_SIZE to 50% of available RAM, but no more than 31g 25 | {% if es_heap_size is defined %} 26 | ES_HEAP_SIZE={{es_heap_size}} 27 | {% endif %} 28 | 29 | # Heap new generation 30 | #ES_HEAP_NEWSIZE= 31 | 32 | # Maximum direct memory 33 | #ES_DIRECT_SIZE= 34 | 35 | # Additional Java OPTS 36 | #ES_JAVA_OPTS= 37 | 38 | # Configure restart on package upgrade (true, every other setting will lead to not restarting) 39 | #ES_RESTART_ON_UPGRADE=true 40 | 41 | # Path to the GC log file 42 | #ES_GC_LOG_FILE=/var/log/elasticsearch/gc.log 43 | 44 | ################################ 45 | # Elasticsearch service 46 | ################################ 47 | 48 | # SysV init.d 49 | # 50 | # When executing the init script, this user will be used to run the elasticsearch service. 51 | # The default value is 'elasticsearch' and is declared in the init.d file. 52 | # Note that this setting is only used by the init script. If changed, make sure that 53 | # the configured user can read and write into the data, work, plugins and log directories. 54 | # For systemd service, the user is usually configured in file /usr/lib/systemd/system/elasticsearch.service 55 | ES_USER={{es_user}} 56 | ES_GROUP={{es_group}} 57 | 58 | ################################ 59 | # System properties 60 | ################################ 61 | 62 | # Specifies the maximum file descriptor number that can be opened by this process 63 | # When using Systemd, this setting is ignored and the LimitNOFILE defined in 64 | # /usr/lib/systemd/system/elasticsearch.service takes precedence 65 | {% if es_max_open_files is defined %} 66 | #MAX_OPEN_FILES 67 | MAX_OPEN_FILES={{es_max_open_files}} 68 | {% endif %} 69 | 70 | # The maximum number of bytes of memory that may be locked into RAM 71 | # Set to "unlimited" if you use the 'bootstrap.mlockall: true' option 72 | # in elasticsearch.yml (ES_HEAP_SIZE must also be set). 
73 | # When using Systemd, the LimitMEMLOCK property must be set 74 | # in /usr/lib/systemd/system/elasticsearch.service 75 | {% if m_lock_enabled %} 76 | #MAX_LOCKED_MEMORY= 77 | MAX_LOCKED_MEMORY=unlimited 78 | {% endif %} 79 | 80 | # Maximum number of VMA (Virtual Memory Areas) a process can own 81 | # When using Systemd, this setting is ignored and the 'vm.max_map_count' 82 | # property is set at boot time in /usr/lib/sysctl.d/elasticsearch.conf 83 | #MAX_MAP_COUNT=262144 84 | -------------------------------------------------------------------------------- /app/models/record/upload.rb: -------------------------------------------------------------------------------- 1 | class Record::Upload < Page::Url 2 | CANONICAL = %i(site_name 3 | id 4 | url 5 | type 6 | date 7 | name 8 | image 9 | description 10 | tags 11 | categories 12 | open_graph 13 | schema_org 14 | available).freeze 15 | 16 | EXCLUDE = %i(site_name 17 | id 18 | type 19 | screenshot).freeze 20 | 21 | attr_accessor :metadata 22 | attr_accessor :id 23 | attr_accessor :screenshot 24 | 25 | def sync 26 | self.data = update_metadata(update_canonical(data)) 27 | end 28 | 29 | def update_canonical(new_data = {}) 30 | new_data['available'] = true unless metadata['available'] 31 | types 32 | set_date 33 | set_screenshot 34 | metadata.each do |key, value| 35 | if CANONICAL.include? key.to_sym 36 | unless new_data[key] == value 37 | new_data[key] = value 38 | end 39 | metadata.delete(key) 40 | end 41 | end 42 | new_data 43 | end 44 | 45 | def update_metadata(new_data = {}) 46 | metadata.each do |key, value| 47 | if new_data[key] 48 | original_hash = new_data[key] 49 | new_hash = {} 50 | last_key = original_hash.keys.last 51 | original_hash.each do |k, v| 52 | if k == last_key && v != value 53 | new_hash[date] = value 54 | if screenshot 55 | new_data['screenshot'][date] = screenshot 56 | launch_screener 57 | end 58 | end 59 | end 60 | new_data[key] = original_hash.merge!(new_hash) 61 | else 62 | new_data[key] = { date => value } 63 | 64 | if screenshot 65 | unless new_data['screenshot'] 66 | new_data['screenshot'] = { date => screenshot } 67 | launch_screener 68 | end 69 | end 70 | end 71 | end 72 | new_data 73 | end 74 | 75 | def set_date 76 | self.date = metadata['date'] if metadata['date'] 77 | end 78 | 79 | def set_screenshot 80 | if metadata['screenshot'] 81 | self.screenshot = metadata['screenshot'] 82 | metadata.delete('screenshot') 83 | end 84 | end 85 | 86 | def launch_screener 87 | Crawler::Screener.perform_async url, screenshot 88 | end 89 | 90 | def data 91 | return record.data if record.data 92 | {} 93 | end 94 | 95 | def data=(new_data) 96 | record.data = new_data 97 | end 98 | 99 | def record 100 | @data ||= Record::Base.new(container, json_relative_path) 101 | end 102 | 103 | def json_relative_path 104 | @json_relative_path ||= id ? 
id : md5 105 | end 106 | 107 | def types 108 | @types ||= metadata['type'].downcase.pluralize.gsub(':', '') 109 | end 110 | 111 | def container 112 | @container ||= name + '-' + types 113 | end 114 | end 115 | -------------------------------------------------------------------------------- /app/controllers/v1/status_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::StatusController < V1::AccessController 2 | def index 3 | respond_to do |format| 4 | format.json { json_response(200, status: (Rails.configuration.config[:admin][:api_keys][check_token.try(:to_sym)] || {}).merge(counts)) } 5 | format.xml { xml_response(200, status: (Rails.configuration.config[:admin][:api_keys][check_token.try(:to_sym)] || {}).merge(counts)) } 6 | end 7 | end 8 | 9 | private 10 | 11 | def counts 12 | { available: count_indexes, 13 | count: Rails.configuration.config[:admin][:api_containers].count.to_s, 14 | total: pretty_integer(@total || 0), 15 | indexing: pretty_integer(count_indexers), 16 | processing: pretty_integer(count_scrimpers), 17 | pending: pretty_integer(count_sitemappers * 50_000) } 18 | end 19 | 20 | def count_indexes 21 | Rails.configuration.config[:admin][:api_containers] 22 | .map { |c| [c, count_containers(c)] } 23 | .sort_by(&:last).reverse 24 | .map { |array| { array.first => pretty_integer(array.last) } }.inject(:merge) 25 | end 26 | 27 | def count_indexers 28 | Sidekiq::Queue.new('mapper').size + 29 | Sidekiq::Queue.new('recorder').size 30 | rescue Redis::CannotConnectError => e 31 | 0 32 | end 33 | 34 | def count_scrimpers 35 | Sidekiq::Queue.new('scrimper').size + 36 | Sidekiq::Queue.new('scrimper_one').size + 37 | Sidekiq::Queue.new('scrimper_two').size + 38 | Sidekiq::Queue.new('scrimper_three').size + 39 | Sidekiq::Queue.new('scrimper_four').size + 40 | Sidekiq::Queue.new('scrimper_five').size + 41 | Sidekiq::Queue.new('sampler').size + 42 | Sidekiq::Queue.new('sampler_one').size + 43 | Sidekiq::Queue.new('sampler_two').size + 44 | Sidekiq::Queue.new('sampler_three').size + 45 | Sidekiq::Queue.new('sampler_four').size + 46 | Sidekiq::Queue.new('sampler_five').size + 47 | Sidekiq::Queue.new('spider').size + 48 | Sidekiq::Queue.new('spider_one').size + 49 | Sidekiq::Queue.new('spider_two').size + 50 | Sidekiq::Queue.new('spider_three').size + 51 | Sidekiq::Queue.new('spider_four').size + 52 | Sidekiq::Queue.new('spider_five').size + 53 | Sidekiq::Queue.new('slider').size + 54 | Sidekiq::Queue.new('socializer').size 55 | rescue Redis::CannotConnectError => e 56 | 0 57 | end 58 | 59 | def count_sitemappers 60 | Sidekiq::Queue.new('sitemapper').size + 61 | Sidekiq::Queue.new('sitemapper_one').size + 62 | Sidekiq::Queue.new('sitemapper_two').size + 63 | Sidekiq::Queue.new('sitemapper_three').size + 64 | Sidekiq::Queue.new('sitemapper_four').size + 65 | Sidekiq::Queue.new('sitemapper_five').size 66 | rescue Redis::CannotConnectError => e 67 | 0 68 | end 69 | 70 | def count_containers(container) 71 | @total ||= 0 72 | index = Rails.env + '-' + container.split('-').last.pluralize.delete(':') 73 | count = Elasticsearch::Model.client.count(index: index, type: container)['count'] 74 | @total = @total + count 75 | count 76 | rescue Elasticsearch::Transport::Transport::Errors => e 77 | 0 78 | end 79 | end 80 | -------------------------------------------------------------------------------- /roles/swapfile/README.md: -------------------------------------------------------------------------------- 1 | ansible-swapfile 2 | ================ 3 | 
4 | This role configures a swapfile (/swapfile) with a default size of 512MB. 5 | 6 | ## Dependencies 7 | 8 | None. 9 | 10 | ## Variables 11 | 12 | * `swapfile_use_dd` [default: `False`]: if set to False, `fallocate` is used to create the swapfile; otherwise, `dd` is used. You may need to set this to True if your filesystem does not support `fallocate` -- see Issue #3. 13 | 14 | * `swapfile_size` [default: `512MB`]: the size of the swapfile to create in the format that `fallocate` expects: 15 | 16 | The length and offset arguments may be followed by binary (2^N) suffixes KiB, MiB, GiB, TiB, PiB and EiB (the "iB" is optional, e.g. "K" has the same meaning as "KiB") or decimal (10^N) suffixes KB, MB, GB, PB and EB. 17 | 18 | If `swapfile_use_dd` is set to True, `swapfile_size` must be set to the number of megabytes to write, e.g. `512`. 19 | 20 | * `swapfile_location` [default: `/swapfile`]: the location where the swapfile will be created. 21 | 22 | ### Optional 23 | 24 | The following variables are set to `False` by default and will not have any effect on your hosts. Setting them to any value other than `False` will update your hosts' sysctl.conf file. 25 | 26 | * `swapfile_swappiness` [default: `False`]: the swappiness percentage (vm.swappiness) -- the lower it is, the less your system swaps memory pages 27 | 28 | * `swapfile_vfs_cache_pressure` [default: `False`]: "this percentage value controls the tendency of the kernel to reclaim the memory which is used for caching of directory and inode objects." 29 | 30 | ## Usage 31 | 32 | ```yaml 33 | - hosts: all 34 | roles: 35 | - kamaln7.swapfile 36 | ``` 37 | 38 | or: 39 | 40 | ```yaml 41 | - hosts: all 42 | roles: 43 | - { role: kamaln7.swapfile, swapfile_size: 1GB, swapfile_swappiness: 10, swapfile_location: /mnt/swapfile } 44 | ``` 45 | 46 | You can also set the variables described above in `group_vars` or `host_vars` (see `defaults/main.yml`). 47 | 48 | ## License 49 | 50 | The MIT License (MIT) 51 | 52 | Copyright (c) 2014 Kamal Nasser 53 | 54 | Permission is hereby granted, free of charge, to any person obtaining a copy 55 | of this software and associated documentation files (the "Software"), to deal 56 | in the Software without restriction, including without limitation the rights 57 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 58 | copies of the Software, and to permit persons to whom the Software is 59 | furnished to do so, subject to the following conditions: 60 | 61 | The above copyright notice and this permission notice shall be included in all 62 | copies or substantial portions of the Software. 63 | 64 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 65 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 66 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 67 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 68 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 69 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 70 | SOFTWARE.
71 | -------------------------------------------------------------------------------- /roles/nginx-unicorn/README.md: -------------------------------------------------------------------------------- 1 | Ansible Nginx/Unicorn setup 2 | =========================== 3 | 4 | This Ansible role installs Nginx and generates configuration for Unicorn 5 | applications. 6 | 7 | Requirements 8 | ------------ 9 | 10 | None 11 | 12 | Notes 13 | ----- 14 | 15 | This role does not install or configure Unicorn itself. It is designed 16 | to play nicely with a Unicorn role such as 17 | [Unicorn-RVM](https://github.com/agios/ansible-unicorn-rvm). 18 | 19 | Role Variables 20 | -------------- 21 | 22 | - `nginx_sites` is an array of Unicorn sites, defaults to `[]` 23 | 24 | Each nginx_sites entry is a dict with the following options: 25 | 26 | - `name` (eg `my_app`, required) 27 | - `server_name` (eg `my-app.my-domain.org`, required, in any 28 | format supported by nginx) 29 | - `root` defaults to `/var/www/{{ name }}/current` (Capistrano 30 | compatible) 31 | - `listen` defaults to `[::]:80` (Both IPv4 and IPv6) 32 | - `access_log` is a dict with the following options: 33 | - `path` defaults to `/var/log/nginx/{{ name }}.access.log` 34 | - `format` is optional, can be used to specify a custom nginx 35 | log output format 36 | - `error_log` see above 37 | - `ssl` if this option is given, an ssl config section will be 38 | generated. It contains the following options: 39 | - `certificate` required, path to ssl certificate 40 | - `certificate_key` required, path to ssl certificate key 41 | - `ssl_only` if set to `true`, always redirect to ssl 42 | - `spdy` if set to `true`, enable spdy support 43 | - `gzip_assets` if set to `true`, enable serving gzipped 44 | 'assets' folder, cached for 16w (useful for rails with asset 45 | precompilation) 46 | - `sensitive_uris` required unless `ssl_only`, nginx uri 47 | expressions that will be served using https 48 | - `access_log` as above, for https requests 49 | - `error_log` as above, for https requests 50 | 51 | 52 | 53 | 54 | Example Playbook 55 | ---------------- 56 | 57 | The role could be included in a playbook as follows (unicorn-rvm also 58 | shown): 59 | 60 | ```yaml 61 | --- 62 | - hosts: application 63 | roles: 64 | - role: unicorn-rvm 65 | rails_apps: 66 | - { name: 'my_app1', ruby_version: 'ruby-1.9.3' } 67 | - { name: 'my_app2', ruby_version: 'ruby-2.1.1', root: '/var/test_apps/app2', env: staging } 68 | - role: nginx-unicorn 69 | nginx_sites: 70 | - name: 'my_app1' 71 | server_name: 'my-app1.example.com' 72 | access_log: 73 | format: 'main' 74 | ssl: 75 | certificate: /etc/ssl/localcerts/my_app1.pem 76 | certificate_key: /etc/ssl/localcerts/my_app1.key 77 | sensitive_uris: 78 | - ^/user/sign_in(.*) 79 | - ^/user/password(.*) 80 | access_log: 81 | format: 'main' 82 | - name: 'my_app2' 83 | server_name: 'my-app2.example.com *.mydomain.com' 84 | root: '/var/test_apps/app2' 85 | ssl: 86 | certificate: /etc/ssl/localcerts/my_app2.crt 87 | certificate_key: /etc/ssl/localcerts/my_app2.key 88 | ssl_only: true 89 | ``` 90 | 91 | License 92 | ------- 93 | 94 | MIT 95 | 96 | -------------------------------------------------------------------------------- /app/models/crawl/google.rb: -------------------------------------------------------------------------------- 1 | class Crawl::Google 2 | include ActionView::Helpers::DateHelper 3 | 4 | def initialize(query = nil) 5 | @query = query 6 | end 7 | 8 | def videos 9 | @videos ||= Rails.cache.fetch("#{@query}/google_videos",
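# cached under a query-scoped key for a week (presumably to limit calls to the Google Search API)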
expires_in: 7.days) do 10 | if google_hash = Google::Search::Video.new(query: @query).response.hash['responseData'] 11 | google_hash['results'].map do |hash| 12 | { 13 | title: hash['titleNoFormatting'], 14 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 15 | image: hash['tbUrl'], 16 | url: hash['url'], 17 | length: distance_of_time_in_words(hash['duration'].to_i), 18 | published: hash['published'].to_date.to_s 19 | } 20 | end 21 | else 22 | nil 23 | end 24 | end 25 | end 26 | 27 | def news 28 | @news ||= Rails.cache.fetch("#{@query}/google_news", expires_in: 7.days) do 29 | if google_hash = Google::Search::News.new(query: @query).response.hash['responseData'] 30 | google_hash['results'].map do |hash| 31 | image = hash['image']['tbUrl'] if hash['image'] 32 | if related = hash['relatedStories'] 33 | related = related.map do |h| 34 | { 35 | title: h['titleNoFormatting'], 36 | url: h['unescapedUrl'], 37 | publisher: h['publisher'], 38 | published: h['publishedDate'].to_date.to_s, 39 | language: hash['language'] 40 | } 41 | end 42 | end 43 | { 44 | title: hash['titleNoFormatting'], 45 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 46 | image: image, 47 | url: hash['unescapedUrl'], 48 | publisher: hash['publisher'], 49 | published: hash['publishedDate'].to_date.to_s, 50 | language: hash['language'], 51 | related: related || [] 52 | } 53 | end 54 | else 55 | nil 56 | end 57 | end 58 | end 59 | 60 | def references 61 | @references ||= Rails.cache.fetch("#{@query}/google_references", expires_in: 7.days) do 62 | if google_hash = Google::Search::Book.new(query: @query).response.hash['responseData'] 63 | google_hash['results'].map do |hash| 64 | description = "by #{hash['authors']} ISBN: #{hash['bookId']}" 65 | 66 | image = hash['tbUrl'] unless hash['tbUrl'] == "/googlebooks/images/no_cover_thumb.gif" 67 | 68 | { 69 | title: hash['titleNoFormatting'], 70 | description: description, 71 | image: image, 72 | url: hash['unescapedUrl'], 73 | length: hash['pageCount'] + ' pages', 74 | published: hash['publishedYear'] 75 | } 76 | end 77 | else 78 | nil 79 | end 80 | end 81 | end 82 | 83 | def links 84 | @links ||= Rails.cache.fetch("#{@query}/google_links", expires_in: 7.days) do 85 | if google_hash = Google::Search::Web.new(query: @query).response.hash['responseData'] 86 | google_hash['results'].map do |hash| 87 | { 88 | title: hash['titleNoFormatting'], 89 | description: ActionView::Base.full_sanitizer.sanitize(hash['content']), 90 | url: hash['unescapedUrl'] 91 | } 92 | end 93 | else 94 | nil 95 | end 96 | end 97 | end 98 | end 99 | -------------------------------------------------------------------------------- /roles/elasticsearch/test/integration/helpers/serverspec/package_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | shared_examples 'package::init' do |es_version,plugins| 4 | 5 | describe user('elasticsearch') do 6 | it { should exist } 7 | end 8 | 9 | describe service('node1_elasticsearch') do 10 | it { should be_running } 11 | end 12 | 13 | describe package('elasticsearch') do 14 | it { should be_installed } 15 | end 16 | 17 | describe file('/etc/elasticsearch/node1/elasticsearch.yml') do 18 | it { should be_file } 19 | it { should contain 'path.plugins: /usr/share/elasticsearch/plugins/node1' } 20 | it { should contain 'http.port: 9200' } 21 | it { should contain 'transport.tcp.port: 9300' } 22 | it { should contain 'discovery.zen.ping.unicast.hosts: localhost:9300' } 
} 23 | end 24 | 25 | describe file('/etc/elasticsearch/node1/scripts') do 26 | it { should be_directory } 27 | it { should be_owned_by 'elasticsearch' } 28 | end 29 | 30 | 31 | 32 | describe file('/etc/elasticsearch/node1/scripts/calculate-score.groovy') do 33 | it { should be_file } 34 | it { should be_owned_by 'elasticsearch' } 35 | end 36 | 37 | describe 'Node listening' do 38 | it 'listens on port 9200' do 39 | expect(port 9200).to be_listening 40 | end 41 | end 42 | 43 | describe file('/etc/elasticsearch/templates') do 44 | it { should be_directory } 45 | it { should be_owned_by 'elasticsearch' } 46 | end 47 | 48 | describe file('/etc/elasticsearch/templates/basic.json') do 49 | it { should be_file } 50 | it { should be_owned_by 'elasticsearch' } 51 | end 52 | 53 | describe 'Template Installed' do 54 | it 'should be reported as being installed', :retry => 3, :retry_wait => 10 do 55 | command = command('curl -s "localhost:9200/_template/basic"') 56 | expect(command.stdout).to match(/basic/) 57 | expect(command.exit_status).to eq(0) 58 | end 59 | end 60 | 61 | describe 'version check' do 62 | it 'should be reported as version '+es_version do 63 | command = command('curl -s localhost:9200 | grep number') 64 | expect(command.stdout).to match(es_version) 65 | expect(command.exit_status).to eq(0) 66 | end 67 | end 68 | 69 | describe file('/usr/share/elasticsearch/plugins/node1') do 70 | it { should be_directory } 71 | it { should be_owned_by 'elasticsearch' } 72 | end 73 | 74 | 75 | for plugin in plugins 76 | describe file('/usr/share/elasticsearch/plugins/node1/'+plugin) do 77 | it { should be_directory } 78 | it { should be_owned_by 'elasticsearch' } 79 | end 80 | 81 | describe command('curl -s localhost:9200/_nodes/plugins?pretty=true | grep '+plugin) do 82 | its(:exit_status) { should eq 0 } 83 | end 84 | end 85 | 86 | describe file('/etc/init.d/elasticsearch') do 87 | it { should_not exist } 88 | end 89 | 90 | describe file('/etc/default/elasticsearch') do 91 | it { should_not exist } 92 | end 93 | 94 | describe file('/etc/sysconfig/elasticsearch') do 95 | it { should_not exist } 96 | end 97 | 98 | describe file('/usr/lib/systemd/system/elasticsearch.service') do 99 | it { should_not exist } 100 | end 101 | 102 | describe file('/etc/elasticsearch/elasticsearch.yml') do 103 | it { should_not exist } 104 | end 105 | 106 | describe file('/etc/elasticsearch/logging.yml') do 107 | it { should_not exist } 108 | end 109 | 110 | end 111 | 112 | -------------------------------------------------------------------------------- /config/environments/production.rb: -------------------------------------------------------------------------------- 1 | Rails.application.configure do 2 | # Settings specified here will take precedence over those in config/application.rb. 3 | 4 | # Code is not reloaded between requests. 5 | config.cache_classes = true 6 | 7 | # Eager load code on boot. This eager loads most of Rails and 8 | # your application in memory, allowing both threaded web servers 9 | # and those relying on copy on write to perform better. 10 | # Rake tasks automatically ignore this option for performance. 11 | config.eager_load = true 12 | 13 | # Full error reports are disabled and caching is turned on. 14 | config.consider_all_requests_local = false 15 | config.action_controller.perform_caching = true 16 | # config.cache_store = :dalli_store 17 | 18 | # Enable Rack::Cache to put a simple HTTP cache in front of your application 19 | # Add `rack-cache` to your Gemfile before enabling this.
20 | # For large-scale production use, consider using a caching reverse proxy like nginx, varnish or squid. 21 | # config.action_dispatch.rack_cache = true 22 | 23 | # Disable Rails's static asset server (Apache or nginx will already do this). 24 | config.serve_static_files = false 25 | 26 | # Compress JavaScripts and CSS. 27 | config.assets.js_compressor = :uglifier 28 | # config.assets.css_compressor = :sass 29 | 30 | # Do not fallback to assets pipeline if a precompiled asset is missed. 31 | config.assets.compile = false 32 | 33 | # Generate digests for assets URLs. 34 | config.assets.digest = true 35 | 36 | # `config.assets.precompile` has moved to config/initializers/assets.rb 37 | 38 | # Specifies the header that your server uses for sending files. 39 | # config.action_dispatch.x_sendfile_header = "X-Sendfile" # for apache 40 | # config.action_dispatch.x_sendfile_header = 'X-Accel-Redirect' # for nginx 41 | 42 | # Force all access to the app over SSL, use Strict-Transport-Security, and use secure cookies. 43 | # config.force_ssl = true 44 | 45 | # Set to :debug to see everything in the log. 46 | config.log_level = :info 47 | 48 | # Prepend all log lines with the following tags. 49 | # config.log_tags = [ :subdomain, :uuid ] 50 | 51 | # Use a different logger for distributed setups. 52 | # config.logger = ActiveSupport::TaggedLogging.new(SyslogLogger.new) 53 | 54 | # Use a different cache store in production. 55 | # config.cache_store = :mem_cache_store 56 | 57 | # Enable serving of images, stylesheets, and JavaScripts from an asset server. 58 | # config.action_controller.asset_host = "http://assets.example.com" 59 | 60 | # Precompile additional assets. 61 | # application.js, application.css, and all non-JS/CSS in app/assets folder are already added. 62 | # config.assets.precompile += %w( search.js ) 63 | 64 | # Ignore bad email addresses and do not raise email delivery errors. 65 | # Set this to true and configure the email server for immediate delivery to raise delivery errors. 66 | # config.action_mailer.raise_delivery_errors = false 67 | 68 | # Enable locale fallbacks for I18n (makes lookups for any locale fall back to 69 | # the I18n.default_locale when a translation cannot be found). 70 | config.i18n.fallbacks = true 71 | 72 | # Send deprecation notices to registered listeners. 73 | config.active_support.deprecation = :notify 74 | 75 | # Disable automatic flushing of the log to improve performance. 76 | # config.autoflush_log = false 77 | 78 | # Use default logging formatter so that PID and timestamp are not suppressed. 79 | config.log_formatter = ::Logger::Formatter.new 80 | 81 | # Do not dump schema after migrations. 82 | config.active_record.dump_schema_after_migration = false 83 | end 84 | -------------------------------------------------------------------------------- /roles/elasticsearch/tasks/elasticsearch-config.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | # Configure Elasticsearch Node 4 | 5 | #Use systemd for the following distributions: 6 | # 7 | #Ubuntu 15 and up 8 | #Debian 8 and up 9 | #Centos 7 and up 10 | #Relies on the Elasticsearch distribution installing a systemd service script to determine whether one should be copied.
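# use_system_d ends up true only on distributions whose Elasticsearch packages ship systemd units (Debian >= 8, CentOS >= 7, Ubuntu >= 15); elsewhere the role copies SysV init scripts instead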
11 | 12 | 13 | - set_fact: use_system_d={{(ansible_distribution == 'Debian' and ansible_distribution_version | version_compare('8', '>=')) or (ansible_distribution == 'CentOS' and ansible_distribution_version | version_compare('7', '>=')) or (ansible_distribution == 'Ubuntu' and ansible_distribution_version | version_compare('15', '>=')) }} 14 | tags: 15 | - always 16 | 17 | - set_fact: instance_sysd_script={{sysd_script | dirname }}/{{es_instance_name}}_{{sysd_script | basename}} 18 | when: use_system_d 19 | tags: 20 | - always 21 | 22 | #For directories we also use the {{inventory_hostname}}-{{ es_instance_name }} - this helps if we have a shared SAN. 23 | 24 | - set_fact: instance_suffix={{inventory_hostname}}-{{ es_instance_name }} 25 | tags: 26 | - always 27 | 28 | - set_fact: pid_dir={{ es_pid_dir }}/{{instance_suffix}} 29 | tags: 30 | - always 31 | 32 | - set_fact: log_dir={{ es_log_dir }}/{{instance_suffix}} 33 | tags: 34 | - always 35 | 36 | - set_fact: work_dir={{ es_work_dir }}/{{instance_suffix}} 37 | tags: 38 | - always 39 | 40 | #Create required directories 41 | - name: Create Directories 42 | file: path={{ item }} state=directory owner={{ es_user }} group={{ es_group }} 43 | with_items: 44 | - "{{pid_dir}}" 45 | - "{{work_dir}}" 46 | - "{{log_dir}}" 47 | - "{{conf_dir}}" 48 | - "{{plugin_dir}}" 49 | 50 | - set_fact: data_dirs={{ es_data_dirs | append_to_list('/'+instance_suffix) }} 51 | tags: 52 | - always 53 | 54 | - name: Create Data Directories 55 | file: path={{ item }} state=directory owner={{ es_user }} group={{ es_group }} 56 | with_items: 57 | - "{{data_dirs}}" 58 | 59 | 60 | #Copy the config template 61 | - name: Copy Configuration File 62 | template: src=elasticsearch.yml.j2 dest={{conf_dir}}/elasticsearch.yml owner={{ es_user }} group={{ es_group }} mode=0644 force=yes 63 | notify: restart elasticsearch 64 | 65 | #Copy the instance specific default file 66 | - name: Copy Default File for Instance 67 | template: src=elasticsearch.j2 dest={{instance_default_file}} mode=0644 force=yes 68 | notify: restart elasticsearch 69 | 70 | #Copy the instance specific init file 71 | - name: Copy Debian Init File for Instance 72 | template: src=init/debian/elasticsearch.j2 dest={{instance_init_script}} mode=0755 force=yes 73 | when: ansible_os_family == 'Debian' and not use_system_d 74 | notify: restart elasticsearch 75 | 76 | #Copy the instance specific init file 77 | - name: Copy Redhat Init File for Instance 78 | template: src=init/redhat/elasticsearch.j2 dest={{instance_init_script}} mode=0755 force=yes 79 | when: ansible_os_family == 'RedHat' and not use_system_d 80 | notify: restart elasticsearch 81 | 82 | #Copy the systemd specific file if systemd is installed 83 | - name: Copy Systemd File for Instance 84 | template: src=systemd/elasticsearch.j2 dest={{instance_sysd_script}} mode=0644 force=yes 85 | when: use_system_d 86 | notify: restart elasticsearch 87 | 88 | #Copy the logging.yml 89 | - name: Copy Logging.yml File for Instance 90 | template: src=logging.yml.j2 dest={{conf_dir}}/logging.yml owner={{ es_user }} group={{ es_group }} mode=0644 force=yes 91 | notify: restart elasticsearch 92 | 93 | #Clean up un-wanted package scripts to avoid confusion 94 | 95 | - name: Delete Default Init 96 | file: dest=/etc/init.d/elasticsearch state=absent 97 | 98 | - name: Delete Default Environment File 99 | file: dest=/etc/default/elasticsearch state=absent 100 | when: ansible_os_family == 'Debian' 101 | 102 | - name: Delete Default Environment File 103 | file: 
dest=/etc/sysconfig/elasticsearch state=absent 104 | when: ansible_os_family == 'RedHat' 105 | 106 | - name: Delete Default Systemd Service File 107 | file: dest=/usr/lib/systemd/system/elasticsearch.service state=absent 108 | 109 | - name: Delete Default Configuration File 110 | file: dest=/etc/elasticsearch/elasticsearch.yml state=absent 111 | 112 | - name: Delete Default Logging File 113 | file: dest=/etc/elasticsearch/logging.yml state=absent 114 | 115 | - debug: msg="Data Dirs {{data_dirs}}" -------------------------------------------------------------------------------- /app/controllers/v1/record_controller.rb: -------------------------------------------------------------------------------- 1 | class V1::RecordController < V1::AccessController 2 | def index 3 | object = Record::Addons.append(record.current_data(default_options)) 4 | respond_to do |format| 5 | format.json { json_response(200, result: object) } 6 | format.xml { xml_response(200, result: object) } 7 | end 8 | end 9 | 10 | def history 11 | history = record.historical_data(default_options) 12 | respond_to do |format| 13 | format.json { json_response(200, result: history) } 14 | # format.xml { xml_response(200, result: history) } 15 | # format.csv do 16 | # # history = Record::Base.new('bestbuy-offers','9071056').historical_data 17 | # headers_hash = history.keys.map {|k| {k => nil}}.inject({},:merge) 18 | # dates_hash = history.values.flat_map {|hash| if hash.try(:keys) then hash.keys end }.compact.uniq.sort.map {|date| {date.to_date => headers_hash} }.inject({},:merge) 19 | # dates_hash.each do |key,value| 20 | # puts key 21 | # value.each do |k,v| 22 | # ap k 23 | # ap v 24 | # puts history[k][key.to_date] 25 | # # # dates_hash[key][k] = history[k][key] 26 | # end 27 | # end 28 | # dates_hash 29 | 30 | # # history.each do |key,value| 31 | # # if value.is_a?
Hash 32 | # # ap key 33 | # # value.each do |k,v| 34 | # # ap k.to_date 35 | # # ap v 36 | # # dates_hash[k][key] = v 37 | # # end 38 | # # else 39 | # # # dates_hash.keys.each do |date| 40 | # # # dates_hash[date][key] = value 41 | # # # end 42 | # # end 43 | # # end 44 | 45 | # # dates = 46 | # # csv_string = history.keys.join(',') + "\n" + history.collect { |node| "#{node.collect { |_k, v| v }.join(',')}\n" }.join 47 | # # send_data csv_string, type: 'text/csv; charset=iso-8859-1; header=present', disposition: 'attachment;data=historical_data.csv' 48 | # end 49 | end 50 | end 51 | 52 | def related 53 | related = record.related_data(default_options) 54 | respond_to do |format| 55 | format.json { json_response(200, result: related) } 56 | format.xml { xml_response(200, result: related) } 57 | end 58 | end 59 | 60 | def news 61 | news = record.news_data(default_options) 62 | respond_to do |format| 63 | format.json { json_response(200, result: news) } 64 | format.xml { xml_response(200, result: news) } 65 | end 66 | end 67 | 68 | def videos 69 | videos = record.videos_data(default_options) 70 | respond_to do |format| 71 | format.json { json_response(200, result: videos) } 72 | format.xml { xml_response(200, result: videos) } 73 | end 74 | end 75 | 76 | def links 77 | links = record.links_data(default_options) 78 | # if links[:links] 79 | # links[:links] = links[:links].map {|h| Record::Addons.append(h) } 80 | # end 81 | respond_to do |format| 82 | format.json { json_response(200, result: links) } 83 | format.xml { xml_response(200, result: links) } 84 | end 85 | end 86 | 87 | def references 88 | references = record.references_data(default_options) 89 | respond_to do |format| 90 | format.json { json_response(200, result: references) } 91 | format.xml { xml_response(200, result: references) } 92 | end 93 | end 94 | 95 | def ids 96 | records = Record::Base.new(params[:container]).ids(default_options) 97 | respond_to do |format| 98 | format.json { json_response(200, result: records[:result], 99 | pagination: pagination(records[:total])) } 100 | format.xml { xml_response(200, result: records[:result], 101 | pagination: pagination(records[:total])) } 102 | end 103 | end 104 | 105 | def screenshot 106 | screenshot = Record::Screenshot.new(params[:container], params[:record_id], params[:screenshot_id]) 107 | respond_to do |format| 108 | format.json { json_response(200, screenshot.data) } 109 | format.xml { xml_response(200, screenshot.data) } 110 | format.jpg { redirect_to screenshot.link } 111 | end 112 | end 113 | 114 | private 115 | 116 | def record 117 | @record ||= Api::V1.new(params[:container], params[:record_id]) 118 | end 119 | end 120 | -------------------------------------------------------------------------------- /roles/elasticsearch/.kitchen.yml: -------------------------------------------------------------------------------- 1 | --- 2 | driver: 3 | name: docker 4 | 5 | provisioner: 6 | name: ansible_playbook 7 | hosts: localhost 8 | roles_path: ../ 9 | require_ansible_repo: true 10 | ansible_verbose: true 11 | http_proxy: <%= ENV['HTTP_PROXY'] %> 12 | https_proxy: <%= ENV['HTTPS_PROXY'] %> 13 | no_proxy: localhost,127.0.0.1 14 | 15 | platforms: 16 | - name: ubuntu-14.04 17 | driver_config: 18 | image: dliappis/ubuntu-devopsci:14.04 19 | privileged: true 20 | provision_command: 21 | - apt-get update && apt-get install -y software-properties-common && add-apt-repository -y ppa:ansible/ansible 22 | - apt-get update && apt-get -y -q install ansible python-apt python-pycurl 23 | use_sudo: 
false 24 | - name: debian-7 25 | driver_config: 26 | image: dliappis/debian-devopsci:7 27 | privileged: true 28 | provision_command: 29 | - apt-get update && apt-get -y install python python-dev python-pip build-essential libyaml-dev python-yaml 30 | - pip install ansible 31 | - apt-get install -y -q net-tools 32 | use_sudo: false 33 | - name: debian-8 34 | driver_config: 35 | image: dliappis/debian-devopsci:8 36 | privileged: true 37 | provision_command: 38 | - apt-get update && apt-get -y install python python-dev python-pip build-essential libyaml-dev python-yaml curl wget 39 | - pip install ansible 40 | - apt-get install -y -q net-tools 41 | - sed -ri 's/^#?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config 42 | - sed -ri 's/^#?PasswordAuthentication .*/PasswordAuthentication yes/' /etc/ssh/sshd_config 43 | - sed -ri 's/^#?UsePAM .*/UsePAM no/' /etc/ssh/sshd_config 44 | use_sudo: false 45 | run_command: "/sbin/init" 46 | - name: centos-6 47 | driver_config: 48 | image: dliappis/centos-devopsci:6 49 | privileged: true 50 | provision_command: 51 | use_sudo: false 52 | - name: centos-7 53 | driver_config: 54 | image: dliappis/centos-devopsci:7 55 | provision_command: 56 | - sed -ri 's/^#?PermitRootLogin .*/PermitRootLogin yes/' /etc/ssh/sshd_config 57 | - sed -ri 's/^#?PasswordAuthentication .*/PasswordAuthentication yes/' /etc/ssh/sshd_config 58 | - sed -ri 's/^#?UsePAM .*/UsePAM no/' /etc/ssh/sshd_config 59 | - yum -y install initscripts 60 | - yum clean all 61 | run_command: "/usr/sbin/init" 62 | privileged: true 63 | use_sudo: false 64 | 65 | suites: 66 | - name: standard-2x 67 | provisioner: 68 | playbook: test/integration/standard.yml 69 | run_list: 70 | attributes: 71 | - name: package-2x 72 | run_list: 73 | attributes: 74 | extra_vars: 75 | es_plugins: 76 | - plugin: lmenezes/elasticsearch-kopf 77 | version: master 78 | - plugin: license 79 | - plugin: marvel-agent 80 | provisioner: 81 | playbook: test/integration/package.yml 82 | - name: config-2x 83 | run_list: 84 | attributes: 85 | provisioner: 86 | playbook: test/integration/config.yml 87 | - name: multi-2x 88 | run_list: 89 | attributes: 90 | extra_vars: 91 | es_plugins: 92 | - plugin: lmenezes/elasticsearch-kopf 93 | version: master 94 | - plugin: license 95 | - plugin: marvel-agent 96 | provisioner: 97 | playbook: test/integration/multi.yml 98 | - name: standard-1x 99 | provisioner: 100 | playbook: test/integration/standard.yml 101 | run_list: 102 | attributes: 103 | extra_vars: 104 | es_major_version: 1.7 105 | es_version: 1.7.3 106 | - name: package-1x 107 | run_list: 108 | attributes: 109 | extra_vars: 110 | es_major_version: 1.7 111 | es_version: 1.7.3 112 | es_plugins: 113 | - plugin: lmenezes/elasticsearch-kopf 114 | version: master 115 | - plugin: elasticsearch/marvel 116 | version: latest 117 | provisioner: 118 | playbook: test/integration/package.yml 119 | - name: config-1x 120 | run_list: 121 | attributes: 122 | extra_vars: 123 | es_major_version: 1.7 124 | es_version: 1.7.3 125 | provisioner: 126 | playbook: test/integration/config.yml 127 | - name: multi-1x 128 | run_list: 129 | attributes: 130 | extra_vars: 131 | es_major_version: 1.7 132 | es_version: 1.7.3 133 | es_plugins: 134 | - plugin: lmenezes/elasticsearch-kopf 135 | version: master 136 | - plugin: elasticsearch/marvel 137 | version: latest 138 | provisioner: 139 | playbook: test/integration/multi.yml 140 | --------------------------------------------------------------------------------