├── .yardopts ├── spec ├── support │ ├── sample_data │ │ ├── directories │ │ │ ├── simple │ │ │ │ ├── first.txt │ │ │ │ └── second.txt │ │ │ └── mixed-file-types │ │ │ │ ├── first.txt │ │ │ │ ├── another.md │ │ │ │ └── second.txt │ │ ├── sample.json │ │ ├── sample.jsonl │ │ ├── two-records.csv │ │ ├── test.csv │ │ └── long-csv.csv │ ├── mock_plugins │ │ ├── chronicle-error │ │ │ └── chronicle │ │ │ │ └── error.rb │ │ ├── chronicle-foo │ │ │ └── chronicle │ │ │ │ ├── foo.rb │ │ │ │ └── foo │ │ │ │ └── simple_authorizer.rb │ │ └── chronicle-empty │ │ │ └── chronicle │ │ │ └── empty.rb │ ├── mock_homedir │ │ └── .config │ │ │ └── chronicle │ │ │ └── etl │ │ │ ├── secrets │ │ │ ├── provider-two.yml │ │ │ └── provider-one.yml │ │ │ └── jobs │ │ │ └── command.yml │ ├── mocked_stdin.rb │ ├── run_extraction.rb │ ├── wait_until.rb │ ├── mocked_config_directory.rb │ ├── invoke_cli.rb │ └── capture_io.rb ├── chronicle │ ├── etl_spec.rb │ └── etl │ │ ├── extractor_spec.rb │ │ ├── transformers.rb │ │ └── null_transformer_spec.rb │ │ ├── loaders │ │ ├── table_loader_spec.rb │ │ ├── csv_loader_spec.rb │ │ └── json_loader_spec.rb │ │ ├── config_spec.rb │ │ ├── registry │ │ └── self_registering_spec.rb │ │ ├── cli │ │ ├── main_spec.rb │ │ ├── plugins_spec.rb │ │ ├── secrets_spec.rb │ │ ├── connectors_spec.rb │ │ ├── jobs_spec.rb │ │ └── authorizations_spec.rb │ │ ├── extractors │ │ ├── csv_extractor_spec.rb │ │ ├── json_extractor_spec.rb │ │ └── file_extractor_spec.rb │ │ ├── secrets_spec.rb │ │ ├── runner_spec.rb │ │ ├── oauth_authorizer_spec.rb │ │ └── configurable_spec.rb └── spec_helper.rb ├── .rubocop.yml ├── .rspec ├── lib └── chronicle │ ├── etl │ ├── version.rb │ ├── registry │ │ ├── registry.rb │ │ ├── plugin_registration.rb │ │ ├── self_registering.rb │ │ ├── connector_registration.rb │ │ ├── connectors.rb │ │ └── plugins.rb │ ├── transformers │ │ ├── null_transformer.rb │ │ ├── merge_meta_transformer.rb │ │ ├── multiply_transformer.rb │ │ ├── sampler_transformer.rb │ │ 
├── fields_limit_transformer.rb │ │ ├── buffer_transformer.rb │ │ ├── filter_transformer.rb │ │ ├── sort_transformer.rb │ │ ├── format_transformer.rb │ │ ├── chronicle_transformer.rb │ │ ├── filter_fields_transformer.rb │ │ ├── transformer.rb │ │ └── chronobase_transformer.rb │ ├── record.rb │ ├── cli.rb │ ├── extractors │ │ ├── stdin_extractor.rb │ │ ├── json_extractor.rb │ │ ├── csv_extractor.rb │ │ ├── file_extractor.rb │ │ ├── extractor.rb │ │ └── helpers │ │ │ └── input_reader.rb │ ├── loaders │ │ ├── helpers │ │ │ ├── encoding_helper.rb │ │ │ └── stdout_helper.rb │ │ ├── rest_loader.rb │ │ ├── loader.rb │ │ ├── csv_loader.rb │ │ ├── table_loader.rb │ │ └── json_loader.rb │ ├── extraction.rb │ ├── utils │ │ ├── binary_attachments.rb │ │ └── progress_bar.rb │ ├── logger.rb │ ├── authorizer.rb │ ├── cli │ │ ├── cli_base.rb │ │ ├── subcommand_base.rb │ │ ├── secrets.rb │ │ ├── connectors.rb │ │ ├── plugins.rb │ │ ├── authorizations.rb │ │ ├── main.rb │ │ └── jobs.rb │ ├── authorization_server.rb │ ├── exceptions.rb │ ├── secrets.rb │ ├── job_logger.rb │ ├── config.rb │ ├── job_log.rb │ ├── job.rb │ ├── job_definition.rb │ ├── oauth_authorizer.rb │ ├── runner.rb │ └── configurable.rb │ └── etl.rb ├── exe └── chronicle-etl ├── .travis.yml ├── bin ├── setup └── console ├── Rakefile ├── Gemfile ├── Guardfile ├── .gitignore ├── .github └── workflows │ └── ruby.yml ├── LICENSE.txt ├── CODE_OF_CONDUCT.md ├── chronicle-etl.gemspec └── README.md /.yardopts: -------------------------------------------------------------------------------- 1 | --markup=markdown -------------------------------------------------------------------------------- /spec/support/sample_data/directories/simple/first.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spec/support/sample_data/directories/simple/second.txt: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spec/support/sample_data/directories/mixed-file-types/first.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.rubocop.yml: -------------------------------------------------------------------------------- 1 | inherit_gem: 2 | chronicle-core: .rubocop.yml 3 | -------------------------------------------------------------------------------- /spec/support/sample_data/directories/mixed-file-types/another.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spec/support/sample_data/directories/mixed-file-types/second.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --format documentation 2 | --color 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /spec/support/sample_data/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "str": "foo", 3 | "num":40 4 | } 5 | -------------------------------------------------------------------------------- /spec/support/mock_plugins/chronicle-error/chronicle/error.rb: -------------------------------------------------------------------------------- 1 | raise "Plugin can't load" 2 | -------------------------------------------------------------------------------- /spec/support/mock_plugins/chronicle-foo/chronicle/foo.rb: -------------------------------------------------------------------------------- 1 | 
require_relative 'foo/simple_authorizer' 2 | -------------------------------------------------------------------------------- /spec/support/sample_data/sample.jsonl: -------------------------------------------------------------------------------- 1 | { "str": "foo", "num":40 } 2 | { "str": "bar", "num":50 } 3 | -------------------------------------------------------------------------------- /spec/support/mock_plugins/chronicle-empty/chronicle/empty.rb: -------------------------------------------------------------------------------- 1 | module EmptyPlugin 2 | # empty 3 | end 4 | -------------------------------------------------------------------------------- /lib/chronicle/etl/version.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | VERSION = '0.6.1'.freeze 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /spec/support/sample_data/two-records.csv: -------------------------------------------------------------------------------- 1 | id,end_at,value 2 | 45403503,2013-08-13,1 3 | 45403503,2013-08-13 06:00:39,3.1 4 | -------------------------------------------------------------------------------- /exe/chronicle-etl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'chronicle/etl/cli' 4 | 5 | Chronicle::ETL::CLI::Main.start(ARGV) 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | --- 2 | sudo: false 3 | language: ruby 4 | cache: bundler 5 | rvm: 6 | - 2.6.4 7 | before_install: gem install bundler -v 1.17.2 8 | -------------------------------------------------------------------------------- /spec/support/mock_homedir/.config/chronicle/etl/secrets/provider-two.yml: -------------------------------------------------------------------------------- 1 
| provider: provider-two 2 | secrets: 3 | foo: bar 4 | chronicle_etl_version: 0.4.4 5 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euo pipefail 3 | IFS=$'\n\t' 4 | set -vx 5 | 6 | bundle install 7 | 8 | # Do any other automated setup that you need to do here 9 | -------------------------------------------------------------------------------- /spec/chronicle/etl_spec.rb: -------------------------------------------------------------------------------- 1 | RSpec.describe Chronicle::ETL do 2 | it 'has a version number' do 3 | expect(Chronicle::ETL::VERSION).not_to be nil 4 | end 5 | end 6 | -------------------------------------------------------------------------------- /spec/support/sample_data/test.csv: -------------------------------------------------------------------------------- 1 | id,end_at,value 2 | 1,2013-08-13,1 3 | 6,2013-08-13 06:00:39,3.1 4 | 5,2013-08-13 06:00:39 +0200, 5 | 3,2013-08-16 03:45:08 +0200,4 6 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | require 'bundler/gem_tasks' 2 | require 'rspec/core/rake_task' 3 | RSpec::Core::RakeTask.new(:spec) 4 | 5 | require 'yard' 6 | YARD::Rake::YardocTask.new 7 | 8 | task default: :spec 9 | -------------------------------------------------------------------------------- /spec/support/mock_homedir/.config/chronicle/etl/secrets/provider-one.yml: -------------------------------------------------------------------------------- 1 | provider: provider-one 2 | secrets: 3 | foo: bar 4 | another: test 5 | third: 123 6 | chronicle_etl_version: 0.4.4 7 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | 
source 'https://rubygems.org' 2 | 3 | git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } 4 | 5 | # Specify your gem's dependencies in chronicle-etl.gemspec 6 | gemspec 7 | -------------------------------------------------------------------------------- /Guardfile: -------------------------------------------------------------------------------- 1 | guard :rspec, cmd: 'bundle exec rspec' do 2 | require 'guard/rspec/dsl' 3 | 4 | watch(%r{^spec/.+_spec\.rb$}) 5 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" } 6 | watch('spec/spec_helper.rb') { 'spec' } 7 | end 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.bundle/ 2 | /.yardoc 3 | /_yardoc/ 4 | /coverage/ 5 | /doc/ 6 | /pkg/ 7 | /spec/reports/ 8 | /tmp/ 9 | 10 | # https://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/ 11 | Gemfile.lock 12 | 13 | # rspec failure tracking 14 | .rspec_status 15 | .DS_Store -------------------------------------------------------------------------------- /spec/support/mock_plugins/chronicle-foo/chronicle/foo/simple_authorizer.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module Foo 3 | class SimpleAuthorizer < Chronicle::ETL::Authorizer 4 | provider :foo 5 | 6 | def authorize! 
7 | { token: 'abc' } 8 | end 9 | end 10 | end 11 | end 12 | -------------------------------------------------------------------------------- /lib/chronicle/etl/registry/registry.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | module Registry 4 | end 5 | end 6 | end 7 | 8 | require_relative 'self_registering' 9 | require_relative 'connector_registration' 10 | require_relative 'connectors' 11 | require_relative 'plugin_registration' 12 | require_relative 'plugins' 13 | -------------------------------------------------------------------------------- /spec/chronicle/etl/extractor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::Extractor do 4 | describe '#extract' do 5 | it 'raises an exception by default' do 6 | e = Chronicle::ETL::Extractor.new 7 | expect { e.extract }.to raise_error(NotImplementedError) 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /spec/support/mock_homedir/.config/chronicle/etl/jobs/command.yml: -------------------------------------------------------------------------------- 1 | extractor: 2 | name: shell 3 | options: 4 | since: 2022-01-10 5 | transformer: 6 | name: shell 7 | options: 8 | loader: 9 | name: table 10 | options: 11 | truncate_values_at: 50 12 | fields_include: 13 | - end_at 14 | - involved.body 15 | -------------------------------------------------------------------------------- /spec/support/mocked_stdin.rb: -------------------------------------------------------------------------------- 1 | require 'stringio' 2 | 3 | RSpec.shared_context 'mocked stdin' do 4 | let(:fake_stdin) { StringIO.new } 5 | 6 | def load_stdin(input) 7 | fake_stdin.puts(input) 8 | fake_stdin.rewind 9 | end 10 | 11 | around(:each) do |example| 12 | $stdin = fake_stdin 13 | example.run 14 | ensure 15 | $stdin = STDIN 
16 | end 17 | end 18 | -------------------------------------------------------------------------------- /spec/support/run_extraction.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | module SpecHelpers 4 | def run_extraction(klass, options = {}) 5 | extractor = klass.new(options) 6 | extractor.prepare 7 | results = [] 8 | extractor.extract do |extraction| 9 | results << extraction 10 | end 11 | results 12 | end 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/null_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class NullTransformer < Chronicle::ETL::Transformer 6 | register_connector do |r| 7 | r.identifier = :null 8 | r.description = 'in no way' 9 | end 10 | 11 | def transform(record) 12 | yield record.data 13 | end 14 | end 15 | end 16 | end 17 | -------------------------------------------------------------------------------- /lib/chronicle/etl/record.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # TODO: move this into chronicle-core after figuring out what to do about data vs properties 4 | module Chronicle 5 | module ETL 6 | class Record 7 | attr_accessor :data, :extraction 8 | 9 | def initialize(data: {}, extraction: nil) 10 | @data = data 11 | @extraction = extraction 12 | end 13 | end 14 | end 15 | end 16 | -------------------------------------------------------------------------------- /spec/support/sample_data/long-csv.csv: -------------------------------------------------------------------------------- 1 | id,end_at,value 2 | 45403503,2013-08-13,1 3 | 45403503,2013-08-13 06:00:39,3.1 4 | 45403503,2013-08-13 06:00:39 +0200, 5 | 1541631,2013-08-16 03:45:08 +0200,4 6 | 
1541632,2013-08-16 03:45:08 +0200,4 7 | 1541633,2013-08-16 03:45:08 +0200,4 8 | 1541634,2013-08-16 03:45:08 +0200,5 9 | 1541635,2013-08-16 03:45:08 +0200,7 10 | 1541636,2013-08-16 03:45:08 +0200,8 11 | 1541637,2013-08-16 03:45:08 +0200,9 -------------------------------------------------------------------------------- /lib/chronicle/etl/cli.rb: -------------------------------------------------------------------------------- 1 | require 'thor' 2 | require 'thor/hollaback' 3 | require 'chronicle/etl' 4 | 5 | require 'chronicle/etl/cli/cli_base' 6 | require 'chronicle/etl/cli/subcommand_base' 7 | require 'chronicle/etl/cli/authorizations' 8 | require 'chronicle/etl/cli/connectors' 9 | require 'chronicle/etl/cli/jobs' 10 | require 'chronicle/etl/cli/plugins' 11 | require 'chronicle/etl/cli/secrets' 12 | require 'chronicle/etl/cli/main' 13 | -------------------------------------------------------------------------------- /spec/chronicle/etl/transformers.rb/null_transformer_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::NullTransformer do 4 | let(:record) { Chronicle::ETL::Record.new(data: { foo: 'bar' }) } 5 | 6 | describe '#transform' do 7 | it 'does nothing' do 8 | Chronicle::ETL::NullTransformer.new.transform(record) do |result| 9 | expect(result).to eq(foo: 'bar') 10 | end 11 | end 12 | end 13 | end 14 | -------------------------------------------------------------------------------- /spec/support/wait_until.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | module SpecHelpers 4 | # https://stackoverflow.com/questions/19388474/how-can-i-use-sinatra-to-simulate-a-remote-server-in-rspec-vcr 5 | def wait_until(timeout = 1) 6 | start_time = Time.now 7 | 8 | loop do 9 | return if yield 10 | raise TimeoutError if (Time.now - start_time) > timeout 11 | 12 | sleep(0.1) 13 | end 14 | end 15 | end 
16 | end 17 | end 18 | -------------------------------------------------------------------------------- /lib/chronicle/etl/registry/plugin_registration.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | module Registry 4 | class PluginRegistration 5 | attr_accessor :name, :description, :gem, :version, :installed, :gemspec 6 | 7 | def initialize(name = nil) 8 | @installed = false 9 | @name = name 10 | yield self if block_given? 11 | end 12 | 13 | def installed? 14 | @installed || false 15 | end 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /.github/workflows/ruby.yml: -------------------------------------------------------------------------------- 1 | name: Ruby 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | 16 | - name: Set up Ruby 17 | uses: ruby/setup-ruby@v1 18 | with: 19 | ruby-version: 3.2 20 | 21 | - name: Install dependencies 22 | run: bundle install 23 | 24 | - name: Run tests 25 | run: bundle exec rake 26 | -------------------------------------------------------------------------------- /lib/chronicle/etl/extractors/stdin_extractor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class StdinExtractor < Chronicle::ETL::Extractor 6 | register_connector do |r| 7 | r.identifier = :stdin 8 | r.description = 'stdin' 9 | end 10 | 11 | def extract 12 | $stdin.read.each_line do |line| 13 | data = { line: line.strip } 14 | yield Chronicle::ETL::Extraction.new(data: data) 15 | end 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /spec/support/mocked_config_directory.rb: 
-------------------------------------------------------------------------------- 1 | require 'fakefs/spec_helpers' 2 | 3 | RSpec.shared_context 'mocked config directory' do 4 | around(:each) do |example| 5 | include FakeFS::SpecHelpers 6 | 7 | FakeFS.with_fresh do 8 | home = File.expand_path(File.join(RSPEC_ROOT, 'support/mock_homedir')) 9 | FakeFS::FileSystem.clone(home) 10 | 11 | Chronicle::ETL::Config.xdg_environment = { 'HOME' => home } 12 | 13 | example.run 14 | 15 | Chronicle::ETL::Config.xdg_environment = nil 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/chronicle/etl/loaders/helpers/encoding_helper.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | 3 | module Chronicle 4 | module ETL 5 | module Loaders 6 | module Helpers 7 | module EncodingHelper 8 | # Mostly useful for handling loading with binary data from a raw extraction 9 | def force_utf8(value) 10 | return value unless value.is_a?(String) 11 | 12 | value.encode('UTF-8', invalid: :replace, undef: :replace, replace: '') 13 | end 14 | end 15 | end 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/merge_meta_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class MergeMetaTransformer < Chronicle::ETL::Transformer 6 | register_connector do |r| 7 | r.identifier = :merge_meta 8 | r.description = 'merge extraction meta fields into the record' 9 | end 10 | 11 | def transform(record) 12 | record.data unless record.extraction&.meta 13 | 14 | record.data[:_meta] = record.extraction.meta 15 | record.data 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- 
/spec/chronicle/etl/loaders/table_loader_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::TableLoader do 4 | let(:record) do 5 | { 6 | provider: 'foo', 7 | verb: 'tested', 8 | actor: { 9 | represent: 'identity', 10 | provider: 'bar' 11 | } 12 | } 13 | end 14 | 15 | it 'can output a table' do 16 | l = Chronicle::ETL::TableLoader.new 17 | 18 | l.load(record) 19 | lines = capture do 20 | l.finish 21 | end.first.split("\n") 22 | 23 | # header + record 24 | expect(lines.count).to eql(2) 25 | end 26 | end 27 | -------------------------------------------------------------------------------- /lib/chronicle/etl/extraction.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class Extraction 6 | attr_accessor :data, :meta, :source, :type, :strategy, :extractor 7 | 8 | def initialize(data: {}, meta: {}, source: nil, type: nil, strategy: nil, extractor: nil) 9 | @data = data 10 | @meta = meta 11 | @source = source 12 | @type = type 13 | @strategy = strategy 14 | @extractor = extractor 15 | end 16 | 17 | def to_h 18 | { data: @data, meta: @meta, source: @source } 19 | end 20 | end 21 | end 22 | end 23 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/multiply_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class MultiplyTransformer < Chronicle::ETL::Transformer 6 | register_connector do |r| 7 | r.identifier = :multiply 8 | r.description = 'by taking a sample' 9 | end 10 | 11 | setting :n, default: 2, type: :numeric 12 | 13 | # return the result, sample_size percentage of the time. 
otherwise nil 14 | def transform(record) 15 | @config.n.to_i.times do 16 | yield record.data 17 | end 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/sampler_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class SamplerTransformer < Chronicle::ETL::Transformer 6 | register_connector do |r| 7 | r.identifier = :sampler 8 | r.description = 'by taking a sample' 9 | end 10 | 11 | setting :percent, default: 10, type: :numeric 12 | 13 | # return the result, `percent` percentage of the time. otherwise nil 14 | def transform(record) 15 | return unless rand(100) < @config.percent 16 | 17 | record.data 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/chronicle/etl/utils/binary_attachments.rb: -------------------------------------------------------------------------------- 1 | require 'marcel' 2 | require 'base64' 3 | 4 | module Chronicle 5 | module ETL 6 | module Utils 7 | # Utility methods for dealing with binary files 8 | module BinaryAttachments 9 | def self.filename_to_base64(filename:, mimetype: nil) 10 | mimetype ||= guess_mimetype(filename: filename) 11 | 12 | "data:#{mimetype};base64,#{Base64.strict_encode64(File.read(filename))}" 13 | end 14 | 15 | def self.guess_mimetype(filename:) 16 | Marcel::MimeType.for(filename) 17 | end 18 | end 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /spec/chronicle/etl/config_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::Config do 4 | include_context 'mocked config directory' 5 | 6 | # TODO: remove this after proper tests written for Config 7 | it 'can set a 
custom homedir' do 8 | data = {} 9 | data[Time.now.to_i] = Time.now 10 | Chronicle::ETL::Config.write('jobs', 'foo', data) 11 | expect(Chronicle::ETL::Config.available_jobs).to contain_exactly('command', 'foo') 12 | end 13 | 14 | describe '#available_jobs' do 15 | it 'can list jobs' do 16 | expect(Chronicle::ETL::Config.available_jobs).to eq(['command']) 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /spec/chronicle/etl/registry/self_registering_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::CLI::Connectors do 4 | describe '#register_connector' do 5 | it 'can register a new class' do 6 | expect do 7 | class TestExtractor < Chronicle::ETL::Extractor 8 | register_connector do |r| 9 | r.description = 'foobar' 10 | end 11 | end 12 | end.to change { Chronicle::ETL::Registry::Connectors.connectors.count }.by(1) 13 | 14 | expect(Chronicle::ETL::Registry::Connectors.connectors.map(&:description)) 15 | .to include('foobar') 16 | end 17 | end 18 | end 19 | -------------------------------------------------------------------------------- /spec/chronicle/etl/cli/main_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::CLI::Main do 4 | describe '#version' do 5 | it 'outputs correct version' do 6 | output, = invoke_cli(['version']) 7 | expect(output).to match("chronicle-etl #{Chronicle::ETL::VERSION}") 8 | end 9 | 10 | it 'can be shown by calling cli with `--version`' do 11 | output, = invoke_cli(['--version']) 12 | expect(output).to match("chronicle-etl #{Chronicle::ETL::VERSION}") 13 | end 14 | end 15 | 16 | describe '#help' do 17 | it 'outputs help menu' do 18 | output, = invoke_cli(['help']) 19 | expect(output).to match(/ALL COMMANDS/) 20 | end 21 | end 22 | end 23 | 
-------------------------------------------------------------------------------- /spec/support/invoke_cli.rb: -------------------------------------------------------------------------------- 1 | module Chronicle 2 | module ETL 3 | module SpecHelpers 4 | # Run the main CLI app with given args 5 | # 6 | # @param [Array] args the command line arguments to pass to the CLI 7 | # @param [Boolean] rescue_from_exit whether to rescue when CLI explicitly 8 | # exits. If set to false, example must include 9 | # `.to raise_error(SystemExit)`, otherwise tests will prematurely end 10 | def invoke_cli(args = [], rescue_from_exit = true) 11 | capture do 12 | Chronicle::ETL::CLI::Main.start(args) 13 | rescue SystemExit 14 | raise unless rescue_from_exit 15 | end 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /spec/chronicle/etl/extractors/csv_extractor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::CSVExtractor do 4 | let(:filename) { 'spec/support/sample_data/two-records.csv' } 5 | 6 | describe '#results_count' do 7 | it 'can extract from a CSV file' do 8 | e = Chronicle::ETL::CSVExtractor.new(input: filename) 9 | e.prepare 10 | expect(e.results_count).to eql(2) 11 | end 12 | end 13 | 14 | describe '#extract' do 15 | it 'can extract from a CSV file' do 16 | e = Chronicle::ETL::CSVExtractor.new(input: filename) 17 | e.prepare 18 | expect { |b| e.extract(&b) }.to yield_successive_args(Chronicle::ETL::Extraction, Chronicle::ETL::Extraction) 19 | end 20 | end 21 | end 22 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/fields_limit_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'chronicle/utils/hash_utils' 4 | 5 | module Chronicle 6 | module ETL 7
| # A transformer that flattens a record's data and returns a new hash containing only the first `limit` fields. 8 | class FieldsLimitTransformer < Chronicle::ETL::Transformer 9 | register_connector do |r| 10 | r.identifier = :fields_limit 11 | r.description = 'by taking first N fields' 12 | end 13 | 14 | setting :limit, type: :numeric, default: 10 15 | 16 | def transform(record) 17 | # flatten the hash and then take the first `limit` fields 18 | 19 | Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h 20 | end 21 | end 22 | end 23 | end 24 | -------------------------------------------------------------------------------- /bin/console: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | require 'bundler/setup' 4 | require 'chronicle/etl' 5 | 6 | # You can add fixtures and/or initialization code here to make experimenting 7 | # with your gem easier. You can also use a different console, if you like. 8 | 9 | # (If you use this, don't forget to add pry to your Gemfile!) 10 | require 'pry' 11 | Pry.start 12 | 13 | def reload!(print = true) 14 | puts 'Reloading ...' if print 15 | # Main project directory. 16 | root_dir = File.expand_path('..', __dir__) 17 | # Directories within the project that should be reloaded. 18 | reload_dirs = %w[lib] 19 | # Loop through and reload every file in all relevant project directories. 20 | reload_dirs.each do |dir| 21 | Dir.glob("#{root_dir}/#{dir}/**/*.rb").each { |f| load(f) } 22 | end 23 | # Return true when complete.
module Chronicle
  module ETL
    # Return only records that match all the conditions of the filters
    # setting.
    #
    # Each filter key is a dot-separated path into the record's data
    # (e.g. `actor.provider` or `items.0.name`); purely numeric segments are
    # used as integer indexes, every other segment as a symbol key.
    class FilterTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :filter
        r.description = 'by only accepting records that match conditions'
      end

      setting :filters, type: :hash

      # @param record the record to test; only `record.data` is read
      # @return the record's data when every filter matches, otherwise nil
      def transform(record)
        record_hash = record.data.to_h

        # `filters` has no default value, so an unconfigured transformer
        # means "no conditions": every record passes through.
        (@config.filters || {}).each do |key, value|
          return nil unless record_hash.dig(*filter_path(key)) == value
        end

        record.data
      end

      private

      # Convert a filter key into the argument list for #dig.
      # `to_s` lets filters be declared with symbol keys as well as strings.
      def filter_path(key)
        key.to_s.split('.').map do |segment|
          segment.match?(/^\d+$/) ? segment.to_i : segment.to_sym
        end
      end
    end
  end
end
module Chronicle
  module ETL
    # Serializes a record's data with the serializer selected by the
    # `format` setting ('jsonld' or 'jsonapi').
    class FormatTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :format
        r.description = 'records to a different hash/json format'
      end

      setting :format, default: nil

      # @param record the record to serialize; only `record.data` is read
      # @return whatever the selected serializer's `serialize` returns
      # @raise [RuntimeError] when the configured format is not supported
      def transform(record)
        serializer = find_serializer(@config.format)
        serializer.serialize(record.data)
      end

      private

      # Map a format name onto its serializer class.
      # `to_s` allows the format to be configured as a symbol or a string;
      # an unset (nil) format falls through to the error branch.
      def find_serializer(format)
        case format.to_s
        when 'jsonld'
          Chronicle::Serialization::JSONLDSerializer
        when 'jsonapi'
          Chronicle::Serialization::JSONAPISerializer
        else
          # Name the offending value so a misconfigured job is easy to fix
          raise "unknown format: #{format.inspect}"
        end
      end
    end
  end
end
module Chronicle
  module ETL
    # Converts extracted records to the Chronicle schema by delegating to the
    # converter registered for the extraction's source, type and strategy.
    class ChronicleTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :chronicle
        r.description = 'records to Chronicle schema'
      end

      # @param record the record to convert; its `extraction` selects the converter
      # @yield the data of each record produced by the converter
      # @raise [Chronicle::ETL::UntransformableRecordError] when no converter
      #   is registered for the record's extraction
      def transform(record)
        extraction = record.extraction
        converter_klass = find_converter(extraction)

        # The registry lookup returns nil when nothing matches; fail with a
        # descriptive error instead of a NoMethodError on nil.
        unless converter_klass
          raise Chronicle::ETL::UntransformableRecordError,
                'No converter to Chronicle schema found for ' \
                "source=#{extraction.source.inspect}, " \
                "type=#{extraction.type.inspect}, " \
                "strategy=#{extraction.strategy.inspect}"
        end

        converter_klass.new.call(record) do |transformed_record|
          yield transformed_record.data
        end
      end

      private

      # Look up the converter class targeting :chronicle for an extraction.
      # Returns nil when the registry has no matching converter.
      def find_converter(extraction)
        Chronicle::ETL::Registry::Connectors.find_converter_for_source(
          source: extraction.source,
          type: extraction.type,
          strategy: extraction.strategy,
          target: :chronicle
        )&.klass
      end
    end
  end
end
module Chronicle
  module ETL
    # Loads records by POSTing them as JSON to a REST endpoint.
    class RestLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.identifier = :rest
        r.description = 'a REST endpoint'
      end

      setting :hostname, required: true
      setting :endpoint, required: true
      setting :access_token

      # POST a single payload to `hostname` + `endpoint`.
      #
      # @param payload any object responding to `to_json`
      # @return the response object returned by Net::HTTP#request
      def load(payload)
        uri = URI.parse("#{@config.hostname}#{@config.endpoint}")

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Post.new(uri.request_uri, request_headers)
          request.body = payload.to_json
          http.request(request)
        end
      end

      private

      # Build the request headers. The Authorization header is only sent when
      # an access token is configured, so an unset token no longer produces a
      # malformed "Bearer " header.
      def request_headers
        headers = { 'Content-Type' => 'application/json' }
        token = @config.access_token.to_s
        headers['Authorization'] = "Bearer #{token}" unless token.empty?
        headers
      end
    end
  end
end
provider: 'foo', 9 | verb: 'tested', 10 | actor: { 11 | represent: 'identity', 12 | provider: 'bar' 13 | } 14 | } 15 | end 16 | 17 | context 'when destination is stdout' do 18 | it 'can output a CSV' do 19 | l = Chronicle::ETL::CSVLoader.new 20 | 21 | l.load(record) 22 | l.load(record) 23 | 24 | lines = capture do 25 | l.finish 26 | end.first.split("\n") 27 | 28 | expect(lines.count).to eql(3) 29 | end 30 | end 31 | 32 | context 'when destination is a file' do 33 | it 'writes json to a file' do 34 | FakeFS.with_fresh do 35 | l = Chronicle::ETL::CSVLoader.new(output: 'test.csv') 36 | l.load(record) 37 | l.load(record) 38 | l.finish 39 | 40 | csv = CSV.parse(File.read('test.csv')) 41 | expect(csv.count).to eql(3) 42 | end 43 | end 44 | end 45 | end 46 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2020 Andrew Louis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /spec/chronicle/etl/loaders/json_loader_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | require 'fakefs/safe' 3 | 4 | RSpec.describe Chronicle::ETL::JSONLoader do 5 | let(:record) do 6 | { foo: 'bar' } 7 | end 8 | 9 | context 'when using stdout as destination' do 10 | it 'can output JSON from a Raw model' do 11 | l = Chronicle::ETL::JSONLoader.new 12 | 13 | output, = capture do 14 | l.start 15 | l.load(record) 16 | l.load(record) 17 | l.finish 18 | end 19 | 20 | lines = output.split("\n") 21 | expect(lines.count).to eql(2) 22 | expect(JSON.parse(lines.first)).to include({ 'foo' => 'bar' }) 23 | end 24 | end 25 | 26 | context 'when using a file as destination' do 27 | it 'writes json to a file' do 28 | FakeFS.with_fresh do 29 | l = Chronicle::ETL::JSONLoader.new(output: 'output.jsonl') 30 | l.start 31 | l.load(record) 32 | l.load(record) 33 | l.finish 34 | 35 | contents = File.read('output.jsonl').split("\n") 36 | expect(JSON.parse(contents.first)).to include({ 'foo' => 'bar' }) 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/chronicle/etl/extractors/json_extractor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::JSONExtractor do 4 | let(:json_filename) { 'spec/support/sample_data/sample.json' } 5 | let(:jsonl_filename) { 'spec/support/sample_data/sample.jsonl' } 6 | let(:invalid_filename) { 'spec/support/sample_data/test.csv' } 7 | 8 | describe 
module Chronicle
  module ETL
    # An authorization strategy for a third-party data source
    class Authorizer
      class << self
        # @return [Symbol, nil] provider set through the `provider` macro
        attr_reader :provider_name

        # Macro for setting provider on an Authorizer
        #
        # @param provider_name [String, Symbol] stored as a symbol
        def provider(provider_name)
          @provider_name = provider_name.to_sym
        end

        # From all loaded Authorizers, return the first one that matches
        # a given provider
        #
        # @todo Have a proper identifier system for authorizers
        #   (to have more than one per plugin)
        #
        # @param provider [String, Symbol] provider to look for
        # @return [Class, nil] the matching Authorizer subclass, if any
        def find_by_provider(provider)
          # Provider names are stored as symbols (see `provider`), so normalise
          # the lookup key the same way; otherwise a string never matches.
          wanted = provider.respond_to?(:to_sym) ? provider.to_sym : provider

          ObjectSpace.each_object(::Class).find do |klass|
            klass < self && klass.provider_name == wanted
          end
        end
      end

      # Construct a new authorizer
      def initialize(args); end

      # Main entry-point for authorization flows. Implemented by subclass
      def authorize!
        raise NotImplementedError
      end
    end
  end
end
module Chronicle
  module ETL
    # Abstract class representing a Loader for an ETL job
    class Loader
      extend Chronicle::ETL::Registry::SelfRegistering
      include Chronicle::ETL::Configurable
      include Chronicle::ETL::Loaders::Helpers::EncodingHelper

      setting :output
      setting :fields
      setting :fields_limit, default: nil
      setting :fields_exclude

      # Construct a new instance of this loader. Options are passed in from a Runner
      # == Parameters:
      # options::
      #   Options for configuring this Loader
      def initialize(options = {})
        apply_options(options)
      end

      # Called once before processing records
      def start; end

      # Load a single record. Must be implemented by subclasses.
      #
      # The parameter is accepted (and ignored) so that calling
      # `load(record)` on a loader that forgot to override this method raises
      # NotImplementedError rather than an unrelated ArgumentError.
      def load(_record = nil)
        raise NotImplementedError
      end

      # Called once there are no more records to process
      def finish; end
    end
  end
end
module Chronicle
  module ETL
    module CLI
      # Base class for CLI commands
      class CLIBase < ::Thor
        no_commands do
          # Shorthand for cli_exit(status: :failure)
          #
          # @param message [String, nil] message to print before exiting
          # @param exception [Exception, nil] error that caused the failure
          def cli_fail(message: nil, exception: nil)
            if exception && Chronicle::ETL::Logger.log_level > Chronicle::ETL::Logger::DEBUG
              hint = 'Re-run the command with --verbose to see details.'
              # `message` is optional: build the text without calling `+=` on nil
              message = [message, hint].compact.join("\n")
            end

            cli_exit(status: :failure, message: message, exception: exception)
          end

          # Exit from CLI
          #
          # @param status Can be either :success or :failure
          # @param message [String, nil] message to print
          # @param exception [Exception, nil] its full message is logged at debug level
          def cli_exit(status: :success, message: nil, exception: nil)
            exit_code = status == :success ? 0 : 1
            log_level = status == :success ? :info : :fatal

            # Only colour the message when there is one; a failure without a
            # message must still exit with the right status code.
            message = message.red if message && status != :success

            Chronicle::ETL::Logger.debug(exception.full_message) if exception
            Chronicle::ETL::Logger.send(log_level, message) if message
            exit(exit_code)
          end
        end
      end
    end
  end
end
module Chronicle
  module ETL
    module CLI
      # Base class for CLI subcommands. Overrides Thor methods so that commands
      # are displayed with the command:subcommand syntax.
      class SubcommandBase < Chronicle::ETL::CLI::CLIBase
        # Print usage instructions for a subcommand: a sorted table of this
        # class's commands plus those of nested Thor classes, followed by the
        # class options.
        def self.help(shell, subcommand = false)
          own_commands = printable_commands(true, subcommand)
          commands = ::Thor::Util.thor_classes_in(self).reduce(own_commands) do |collected, klass|
            collected + klass.printable_commands(false)
          end
          commands.sort! { |left, right| left[0] <=> right[0] }

          shell.say 'COMMANDS'.bold
          shell.print_table(commands, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Usage banner for a command. The prefix and the usage are joined with
        # a colon, except for `help`, which is joined with a space.
        def self.banner(command, _namespace = nil, _subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Derive the display name of the subcommand from the class name:
        # strip the namespace, lower-case a leading capital and replace every
        # remaining capital with a dash followed by its lower-case form.
        def self.subcommand_prefix
          class_name = name.gsub(/.*::/, '')
          leading_lowered = class_name.gsub(/^[A-Z]/) { |letter| letter[0].downcase }
          leading_lowered.gsub(/[A-Z]/) { |letter| "-#{letter[0].downcase}" }
        end
      end
    end
  end
end
key] 45 | invoke_cli(args) 46 | expect(Chronicle::ETL::Secrets.read('foo')[:key]).to be_nil 47 | end 48 | end 49 | end 50 | -------------------------------------------------------------------------------- /spec/chronicle/etl/extractors/file_extractor_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::FileExtractor do 4 | let(:filename) { 'spec/support/sample_data/two-records.csv' } 5 | let(:directory) { 'spec/support/sample_data/directories/simple' } 6 | 7 | context 'for a simple directory' do 8 | describe '#results_count' do 9 | it 'can extract from a CSV file' do 10 | e = Chronicle::ETL::FileExtractor.new(input: directory, dir_glob_pattern: '**/*') 11 | e.prepare 12 | expect(e.results_count).to eql(2) 13 | end 14 | end 15 | 16 | describe '#extract' do 17 | it 'can yield filenames in the directory' do 18 | results = run_extraction(Chronicle::ETL::FileExtractor, { input: directory, dir_glob_pattern: '**/*' }) 19 | expect(results).to all(be_a(Chronicle::ETL::Extraction)) 20 | expect(results.count).to eql(2) 21 | end 22 | end 23 | end 24 | 25 | context 'when passed in files' do 26 | it 'will yield file back' do 27 | results = run_extraction(Chronicle::ETL::FileExtractor, { input: [filename] }) 28 | expect(results.count).to eql(1) 29 | expect(results.first.data).to eql(filename) 30 | end 31 | 32 | context 'when passed in two of the same files' do 33 | it 'will yield file once' do 34 | results = run_extraction(Chronicle::ETL::FileExtractor, { input: [filename, filename] }) 35 | expect(results.count).to eql(1) 36 | expect(results.first.data).to eql(filename) 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /lib/chronicle/etl/authorization_server.rb: -------------------------------------------------------------------------------- 1 | require 'sinatra' 2 | require 'omniauth' 3 | 4 | module 
Chronicle 5 | module ETL 6 | class AuthorizationServer < Sinatra::Base 7 | class << self 8 | attr_accessor :latest_authorization 9 | end 10 | 11 | configure do 12 | set :inline_templates, true 13 | set :dump_errors, false 14 | set :raise_errors, true 15 | disable :logging 16 | set :sessions, true 17 | set :quiet, true 18 | set :threaded, true 19 | set :environment, ENV['APP_ENV'] == 'test' ? :test : :production 20 | end 21 | 22 | use OmniAuth::Builder do 23 | Chronicle::ETL::OauthAuthorizer.all.each do |klass| 24 | args = [klass.client_id, klass.client_secret, klass.options].compact 25 | provider( 26 | klass.strategy, 27 | *args 28 | ) 29 | end 30 | end 31 | 32 | OmniAuth.config.logger = Chronicle::ETL::Logger 33 | OmniAuth.config.silence_get_warning = true 34 | OmniAuth.config.allowed_request_methods = %i[get] 35 | 36 | get '/auth/:provider/callback' do 37 | authorization = request.env['omniauth.auth'].to_h.deep_transform_keys(&:to_sym) 38 | self.class.latest_authorization = authorization 39 | erb "

Settings saved for #{params[:provider]}

You can now close this tab and return to your terminal!

" 40 | end 41 | 42 | get '/auth/failure' do 43 | # TODO: handle this 44 | erb "

Authentication Failed:

message:

#{params}
" 45 | end 46 | end 47 | end 48 | end 49 | -------------------------------------------------------------------------------- /spec/chronicle/etl/runner_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'spec_helper' 4 | 5 | RSpec.describe Chronicle::ETL::Runner do 6 | before(:all) do 7 | Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::FATAL 8 | end 9 | 10 | describe '#run!' do 11 | it 'runs' do 12 | filename = 'spec/support/sample_data/test.csv' 13 | 14 | # rows in sample CSV file (excluding header) 15 | file_record_count = File.read(filename).each_line.count - 1 16 | 17 | definition = Chronicle::ETL::JobDefinition.new 18 | definition.add_config( 19 | { 20 | extractor: { 21 | name: 'csv', 22 | options: { 23 | input: filename 24 | } 25 | }, 26 | transformers: [ 27 | { 28 | name: 'multiply', 29 | options: { 30 | n: 2 31 | } 32 | }, 33 | { 34 | name: 'multiply', 35 | options: { 36 | n: 2 37 | } 38 | }, 39 | { 40 | name: 'sort', 41 | options: { 42 | key: 'id', 43 | direction: 'desc' 44 | } 45 | } 46 | ], 47 | loader: { 48 | name: 'json' 49 | } 50 | } 51 | ) 52 | 53 | job = Chronicle::ETL::Job.new(definition) 54 | 55 | r = Chronicle::ETL::Runner.new(job) 56 | 57 | output, = capture do 58 | r.run! 
module Chronicle
  module ETL
    # Root of the Chronicle::ETL exception hierarchy
    class Error < StandardError
    end

    class SecretsError < Error
    end

    class AuthorizationError < Error
    end

    class ConfigError < Error
    end

    class RunnerError < Error
    end

    class RunInterruptedError < RunnerError
    end

    class RunnerTypeError < Error
    end

    # Error tied to a job definition. The job definition is kept on the
    # exception and is also passed to StandardError as the message.
    class JobDefinitionError < Error
      attr_reader :job_definition

      def initialize(job_definition)
        @job_definition = job_definition
        super(job_definition)
      end
    end

    # Error tied to a plugin. The plugin name is kept on the exception and is
    # also passed to StandardError as the message.
    class PluginError < Error
      attr_reader :name

      def initialize(name)
        super(name)
        @name = name
      end
    end

    class PluginNotInstalledError < PluginError
    end

    class PluginConflictError < PluginError
    end

    class PluginNotAvailableError < PluginError
    end

    class PluginLoadError < PluginError
    end

    class ConnectorConfigurationError < Error
    end

    # Error carrying the provider and name given when it was raised
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      def initialize(message, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    class ProviderNotAvailableError < ConnectorNotAvailableError
    end

    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError
    end

    class ExtractionError < Error
    end

    class TransformationError < Error
    end

    class UntransformableRecordError < TransformationError
    end

    class LoaderError < Error
    end
  end
end
-------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | OmniAuth.config.test_mode = true 4 | OmniAuth.config.mock_auth[:developer] = { 5 | token: 'abc' 6 | } 7 | # Make sure AuthorizationServer knows we're in test mode 8 | ENV['APP_ENV'] = 'test' 9 | 10 | # Prevent Launchy from attempt to open windows in oauth_authorizer.rb 11 | ENV['LAUNCHY_DRY_RUN'] = 'true' 12 | 13 | RSpec.describe Chronicle::ETL::OauthAuthorizer do 14 | let(:port) { 5678 } 15 | let(:authorizer) do 16 | Class.new(Chronicle::ETL::OauthAuthorizer) do 17 | provider :foo 18 | omniauth_strategy :developer 19 | scope 'email' 20 | pluck_secrets({ token: [:token] }) 21 | end 22 | end 23 | 24 | before do 25 | stub_const('FooAuthorizer', authorizer) 26 | end 27 | 28 | it 'returs an authorization after oauth flow completed' do 29 | a = authorizer.new(port: port) 30 | thread = Thread.new do 31 | wait_until do 32 | booted? 33 | end 34 | fetch("http://localhost:#{port}/auth/developer/") 35 | end 36 | 37 | result = suppress_output do 38 | a.authorize! 39 | end 40 | thread.join 41 | expect(result).to eql({ token: 'abc' }) 42 | end 43 | 44 | it 'raises an exception if flow aborts early' do 45 | # TODO: implement this somehow 46 | # send signal to sinatra? 47 | end 48 | 49 | def booted? 50 | fetch("http://localhost:#{port}/") 51 | true 52 | rescue Errno::ECONNREFUSED, Errno::EBADF 53 | false 54 | end 55 | 56 | # TODO: use library? put in SpecHelpers? 
57 | def fetch(uri, limit = 10) 58 | response = Net::HTTP.get_response(URI(uri)) 59 | fetch(response['location'], limit - 1) if response == Net::HTTPRedirection || response.code == '302' 60 | end 61 | end 62 | -------------------------------------------------------------------------------- /lib/chronicle/etl/registry/connector_registration.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | module Registry 6 | # Records details about a connector such as its source provider and a description 7 | class ConnectorRegistration 8 | attr_accessor :klass, :identifier, :source, :strategy, :type, :description, :from_schema, :to_schema 9 | 10 | # Create a new connector registration 11 | def initialize(klass) 12 | @klass = klass 13 | end 14 | 15 | # The ETL phase of this connector 16 | def phase 17 | if klass.ancestors.include? Chronicle::ETL::Extractor 18 | :extractor 19 | elsif klass.ancestors.include? Chronicle::ETL::Transformer 20 | :transformer 21 | elsif klass.ancestors.include? Chronicle::ETL::Loader 22 | :loader 23 | end 24 | end 25 | 26 | def to_s 27 | "#{phase}-#{identifier}" 28 | end 29 | 30 | # Whether this connector is built-in to Chronicle 31 | def built_in? 32 | @klass.to_s.include? 'Chronicle::ETL' 33 | end 34 | 35 | def klass_name 36 | @klass.to_s 37 | end 38 | 39 | # TODO: allow overriding here. 
Maybe through self-registration process 40 | def plugin 41 | @source 42 | end 43 | 44 | def descriptive_phrase 45 | prefix = case phase 46 | when :extractor 47 | 'Extracts from' 48 | when :transformer 49 | 'Transforms' 50 | when :loader 51 | 'Loads to' 52 | end 53 | 54 | "#{prefix} #{description}" 55 | end 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/chronicle/etl/loaders/csv_loader.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'csv' 4 | require 'chronicle/utils/hash_utils' 5 | 6 | module Chronicle 7 | module ETL 8 | class CSVLoader < Chronicle::ETL::Loader 9 | include Chronicle::ETL::Loaders::Helpers::StdoutHelper 10 | 11 | register_connector do |r| 12 | r.identifier = :csv 13 | r.description = 'CSV' 14 | end 15 | 16 | setting :output 17 | setting :headers, default: true 18 | setting :header_row, default: true 19 | 20 | def records 21 | @records ||= [] 22 | end 23 | 24 | def load(record) 25 | records << record 26 | end 27 | 28 | def finish 29 | return unless records.any? 30 | 31 | # headers = filtered_headers(records) 32 | headers = gather_headers(records) 33 | 34 | csv_options = {} 35 | if @config.headers 36 | csv_options[:write_headers] = @config.header_row 37 | csv_options[:headers] = headers 38 | end 39 | 40 | csv_output = CSV.generate(**csv_options) do |csv| 41 | records.each do |record| 42 | csv << Chronicle::Utils::HashUtils.flatten_hash(record.to_h) 43 | .values_at(*headers) 44 | .map { |value| force_utf8(value) } 45 | end 46 | end 47 | 48 | # TODO: just write to io directly 49 | if output_to_stdout? 
50 | write_to_stdout(csv_output) 51 | else 52 | File.write(@config.output, csv_output) 53 | end 54 | end 55 | 56 | private 57 | 58 | def gather_headers(records) 59 | records_flattened = records.map do |record| 60 | Chronicle::Utils::HashUtils.flatten_hash(record.to_h) 61 | end 62 | records_flattened.flat_map(&:keys).uniq 63 | end 64 | end 65 | end 66 | end 67 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'simplecov' 4 | SimpleCov.start 5 | 6 | require 'bundler/setup' 7 | require 'chronicle/etl' 8 | require 'chronicle/etl/cli' 9 | 10 | require 'vcr' 11 | VCR.configure do |config| 12 | config.cassette_library_dir = 'spec/fixtures/vcr_cassettes' 13 | config.allow_http_connections_when_no_cassette = true 14 | config.hook_into :webmock 15 | config.filter_sensitive_data('') { Gem.configuration.rubygems_api_key } 16 | end 17 | 18 | require_relative 'support/capture_io' 19 | require_relative 'support/invoke_cli' 20 | require_relative 'support/mocked_config_directory' 21 | require_relative 'support/mocked_stdin' 22 | require_relative 'support/run_extraction' 23 | require_relative 'support/wait_until' 24 | 25 | RSPEC_ROOT = File.dirname(__FILE__) 26 | 27 | RSpec.configure do |config| 28 | config.include Chronicle::ETL::SpecHelpers 29 | config.include_context 'mocked config directory', include_shared: true 30 | config.include_context 'mocked stdin', include_shared: true 31 | 32 | # Enable flags like --only-failures and --next-failure 33 | config.example_status_persistence_file_path = '.rspec_status' 34 | 35 | # Disable RSpec exposing methods globally on `Module` and `main` 36 | config.disable_monkey_patching! 
37 | 38 | config.mock_with :rspec 39 | 40 | config.filter_run focus: true 41 | config.run_all_when_everything_filtered = true 42 | 43 | config.expect_with :rspec do |c| 44 | c.syntax = :expect 45 | end 46 | end 47 | # This monkeypatch is required because of weird interactions between the 48 | # `tty-screen` used for CLI output and the way rspec captures stdout 49 | # see: https://github.com/rspec/rspec-expectations/issues/1305 50 | # and: https://github.com/emsk/bundle_outdated_formatter/blob/v0.7.0/spec/spec_helper.rb#L16-L21 51 | require 'stringio' 52 | class StringIO 53 | def ioctl(*) 54 | 0 55 | end 56 | end 57 | -------------------------------------------------------------------------------- /lib/chronicle/etl/extractors/file_extractor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'pathname' 4 | 5 | module Chronicle 6 | module ETL 7 | # Return filenames that match a pattern in a directory 8 | class FileExtractor < Chronicle::ETL::Extractor 9 | register_connector do |r| 10 | r.identifier = :file 11 | r.description = 'file or directory of files' 12 | end 13 | 14 | setting :input, default: ['.'] 15 | setting :dir_glob_pattern, default: '**/*' 16 | setting :larger_than 17 | setting :smaller_than 18 | 19 | def prepare 20 | @pathnames = gather_files 21 | end 22 | 23 | def extract 24 | @pathnames.each do |pathname| 25 | yield Chronicle::ETL::Extraction.new(data: pathname.to_path) 26 | end 27 | end 28 | 29 | def results_count 30 | @pathnames.count 31 | end 32 | 33 | private 34 | 35 | def gather_files 36 | roots = [@config.input].flatten.map { |filename| Pathname.new(filename) } 37 | raise(ExtractionError, 'Input must exist') unless roots.all?(&:exist?) 38 | 39 | directories, files = roots.partition(&:directory?) 
40 | 41 | directories.each do |directory| 42 | files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) } 43 | end 44 | 45 | files = files.uniq 46 | 47 | files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since 48 | files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until 49 | 50 | # pass in file sizes in bytes 51 | files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than 52 | files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than 53 | 54 | # # TODO: incorporate sort argument 55 | files.sort_by(&:mtime) 56 | end 57 | end 58 | end 59 | end 60 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/filter_fields_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | # A transformer that filters the fields of a record and returns a new hash with only the specified fields. 6 | class FilterFieldsTransformer < Chronicle::ETL::Transformer 7 | register_connector do |r| 8 | r.identifier = :filter_fields 9 | r.description = 'by taking a subset of the fields' 10 | end 11 | 12 | setting :fields, type: :array, default: [] 13 | 14 | def transform(record) 15 | hash = record.data.to_h.deep_transform_keys(&:to_sym) 16 | filter_hash(hash, @config.fields.map) 17 | end 18 | 19 | private 20 | 21 | def access_nested_value(data, path) 22 | keys = path.split('.') 23 | keys.reduce(data) do |acc, key| 24 | if acc.is_a?(Array) 25 | acc.map do |item| 26 | item[key.to_sym] 27 | rescue StandardError 28 | nil 29 | end 30 | .compact 31 | elsif key.include?('[') 32 | key, index = key.split(/\[|\]/).reject(&:empty?) 33 | acc = acc[key.to_sym] if acc 34 | acc.is_a?(Array) ? 
acc[index.to_i] : nil 35 | else 36 | acc&.dig(key.to_sym) 37 | end 38 | end 39 | end 40 | 41 | def filter_hash(original_hash, fields) 42 | fields.each_with_object({}) do |field, result| 43 | value = access_nested_value(original_hash, field) 44 | keys = field.split('.') 45 | last_key = keys.pop.to_sym 46 | 47 | current = result 48 | keys.each do |key| 49 | key = key.to_sym 50 | key, = key.to_s.split(/\[|\]/) if key.to_s.include?('[') 51 | current[key] ||= {} 52 | current = current[key] 53 | end 54 | 55 | current[last_key] = value 56 | end 57 | end 58 | end 59 | end 60 | end 61 | -------------------------------------------------------------------------------- /lib/chronicle/etl/utils/progress_bar.rb: -------------------------------------------------------------------------------- 1 | require 'tty/progressbar' 2 | require 'colorize' 3 | 4 | module Chronicle 5 | module ETL 6 | module Utils 7 | class ProgressBar 8 | FORMAT_WITH_TOTAL = [ 9 | ':bar ', 10 | ':percent'.light_white, 11 | ' | '.light_black, 12 | ':current'.light_white, 13 | '/'.light_black, 14 | ':total'.light_white, 15 | ' ('.light_black, 16 | 'ELAPSED:'.light_black, 17 | ':elapsed'.light_white, 18 | ' | ETA:'.light_black, 19 | ':eta'.light_white, 20 | ' | RATE: '.light_black, 21 | ':mean_rate'.light_white, 22 | '/s) '.light_black 23 | ].join.freeze 24 | 25 | FORMAT_WITHOUT_TOTAL = [ 26 | ':current'.light_white, 27 | '/'.light_black, 28 | '???'.light_white, 29 | ' ('.light_black, 30 | 'ELAPSED:'.light_black, 31 | ':elapsed'.light_white, 32 | ' | ETA:'.light_black, 33 | '??:??'.light_white, 34 | ' | RATE: '.light_black, 35 | ':mean_rate'.light_white, 36 | '/s) '.light_black 37 | ].join.freeze 38 | 39 | def initialize(total:, title: 'Loading') 40 | opts = { 41 | clear: true, 42 | complete: '▓'.light_blue, 43 | incomplete: '░'.blue, 44 | frequency: 10 45 | } 46 | 47 | if total 48 | opts[:total] = total 49 | format_str = "#{title} #{FORMAT_WITH_TOTAL}" 50 | @pbar = TTY::ProgressBar.new(FORMAT_WITH_TOTAL, opts) 
51 | else 52 | format_str = "#{title} #{FORMAT_WITHOUT_TOTAL}" 53 | opts[:no_width] = true 54 | end 55 | 56 | @pbar = TTY::ProgressBar.new(format_str, opts) 57 | 58 | @pbar.resize 59 | end 60 | 61 | def increment 62 | @pbar.advance(1) 63 | end 64 | 65 | def log(message) 66 | message.split("\n").each do |_line| 67 | @pbar.log message 68 | end 69 | end 70 | 71 | def finish 72 | @pbar.finish 73 | end 74 | end 75 | end 76 | end 77 | end 78 | -------------------------------------------------------------------------------- /lib/chronicle/etl/secrets.rb: -------------------------------------------------------------------------------- 1 | require 'active_support/core_ext/hash/keys' 2 | 3 | module Chronicle 4 | module ETL 5 | # Secret management module 6 | module Secrets 7 | module_function 8 | 9 | # Whether a given namespace exists 10 | def exists?(namespace) 11 | Chronicle::ETL::Config.exists?('secrets', namespace) 12 | end 13 | 14 | # Save a setting to a namespaced config file 15 | def set(namespace, key, value) 16 | config = read(namespace) 17 | config[key.to_sym] = value 18 | write(namespace, config) 19 | end 20 | 21 | # Save a hash to a secrets namespace 22 | def set_all(namespace, secrets) 23 | config = read(namespace) 24 | config = config.merge(secrets.deep_stringify_keys) 25 | write(namespace, config) 26 | end 27 | 28 | # Remove a setting from a namespaced config file 29 | def unset(namespace, key) 30 | config = read(namespace) 31 | config.delete(key.to_sym) 32 | write(namespace, config) 33 | end 34 | 35 | # Retrieve all secrets from all namespaces 36 | def all(namespace = nil) 37 | namespaces = namespace.nil? ? available_secrets : [namespace] 38 | namespaces 39 | .to_h { |namespace| [namespace.to_sym, read(namespace)] } 40 | .delete_if { |_, v| v.empty? 
} 41 | end 42 | 43 | # Return whether a namespace name is valid (lowercase alphanumeric and -) 44 | def valid_namespace_name?(namespace) 45 | namespace.match(/^[a-z0-9\-]+$/) 46 | end 47 | 48 | # Read secrets from a config file 49 | def read(namespace) 50 | definition = Chronicle::ETL::Config.load('secrets', namespace) 51 | definition[:secrets] || {} 52 | end 53 | 54 | # Write secrets to a config file 55 | def write(namespace, secrets) 56 | data = { 57 | secrets: (secrets || {}).transform_keys(&:to_s), 58 | chronicle_etl_version: Chronicle::ETL::VERSION 59 | } 60 | Chronicle::ETL::Config.write('secrets', namespace, data) 61 | end 62 | 63 | # Which config files are available in ~/.config/chronicle/etl/secrets 64 | def available_secrets 65 | Chronicle::ETL::Config.available_configs('secrets') 66 | end 67 | end 68 | end 69 | end 70 | -------------------------------------------------------------------------------- /lib/chronicle/etl/extractors/extractor.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'chronicle/etl' 4 | 5 | module Chronicle 6 | module ETL 7 | # Abstract class representing an Extractor for an ETL job 8 | class Extractor 9 | extend Chronicle::ETL::Registry::SelfRegistering 10 | include Chronicle::ETL::Configurable 11 | 12 | setting :since, type: :time 13 | setting :until, type: :time 14 | setting :limit, type: :numeric 15 | setting :load_after_id 16 | setting :input 17 | 18 | # Construct a new instance of this extractor. Options are passed in from a Runner 19 | # == Parameters: 20 | # options:: 21 | # Options for configuring this Extractor 22 | def initialize(options = {}) 23 | apply_options(options) 24 | end 25 | 26 | # Hook called before #extract. Useful for gathering data, initializing proxies, etc 27 | def prepare; end 28 | 29 | # An optional method to calculate how many records there are to extract. 
Used primarily for 30 | # building the progress bar 31 | def results_count; end 32 | 33 | # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded 34 | def extract 35 | raise NotImplementedError 36 | end 37 | 38 | protected 39 | 40 | def build_extraction(data:, meta: nil, source: nil, type: nil, strategy: nil) 41 | Extraction.new( 42 | extractor: self.class, 43 | data: data, 44 | meta: meta, 45 | source: source || self.class.connector_registration.source, 46 | type: type || self.class.connector_registration.type, 47 | strategy: strategy || self.class.connector_registration.strategy 48 | ) 49 | end 50 | 51 | # TODO: reimplemenet this 52 | # def handle_continuation 53 | # return unless @config.continuation 54 | 55 | # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp 56 | # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id 57 | # end 58 | end 59 | end 60 | end 61 | 62 | require_relative 'helpers/input_reader' 63 | require_relative 'csv_extractor' 64 | require_relative 'file_extractor' 65 | require_relative 'json_extractor' 66 | require_relative 'stdin_extractor' 67 | -------------------------------------------------------------------------------- /lib/chronicle/etl/extractors/helpers/input_reader.rb: -------------------------------------------------------------------------------- 1 | require 'pathname' 2 | 3 | module Chronicle 4 | module ETL 5 | module Extractors 6 | module Helpers 7 | module InputReader 8 | # Return an array of input filenames; converts a single string 9 | # to an array if necessary 10 | def filenames 11 | [@config.input].flatten.map 12 | end 13 | 14 | # Filenames as an array of pathnames 15 | def pathnames 16 | filenames.map { |filename| Pathname.new(filename) } 17 | end 18 | 19 | # Whether we're reading from files 20 | def read_from_files? 21 | filenames.any? 
22 | end 23 | 24 | # Whether we're reading input from stdin 25 | def read_from_stdin? 26 | !read_from_files? && $stdin.stat.pipe? 27 | end 28 | 29 | # Read input sources and yield each content 30 | def read_input 31 | if read_from_files? 32 | pathnames.each do |pathname| 33 | File.open(pathname) do |file| 34 | yield file.read, pathname.to_path 35 | end 36 | end 37 | elsif read_from_stdin? 38 | yield $stdin.read, $stdin 39 | else 40 | raise ExtractionError, 'No input files or stdin provided' 41 | end 42 | end 43 | 44 | # Read input sources line by line 45 | def read_input_as_lines(&block) 46 | if read_from_files? 47 | lines_from_files(&block) 48 | elsif read_from_stdin? 49 | lines_from_stdin(&block) 50 | else 51 | raise ExtractionError, 'No input files or stdin provided' 52 | end 53 | end 54 | 55 | private 56 | 57 | def lines_from_files(&block) 58 | pathnames.each do |pathname| 59 | File.open(pathname) do |file| 60 | lines_from_io(file, &block) 61 | end 62 | end 63 | end 64 | 65 | def lines_from_stdin(&block) 66 | lines_from_io($stdin, &block) 67 | end 68 | 69 | def lines_from_io(io, &block) 70 | io.each_line(&block) 71 | end 72 | end 73 | end 74 | end 75 | end 76 | end 77 | -------------------------------------------------------------------------------- /lib/chronicle/etl/job_logger.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | require 'sequel' 3 | require 'xdg' 4 | 5 | module Chronicle 6 | module ETL 7 | # Saves JobLogs to db and loads previous ones 8 | class JobLogger 9 | extend Forwardable 10 | 11 | def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success 12 | attr_accessor :job_log 13 | 14 | # For a given `job_id`, return the last successful log 15 | def self.load_latest(_job_id) 16 | with_db_connection do |db| 17 | attrs = db[:job_logs].reverse_order(:finished_at).where(success: true).first 18 | JobLog.build_from_serialized(attrs) if attrs 19 | end 20 | end 
21 | 22 | def self.with_db_connection 23 | initialize_db unless db_exists? 24 | Sequel.connect("sqlite://#{db_filename}") do |db| 25 | initialize_schema(db) unless schema_exists?(db) 26 | yield db 27 | end 28 | end 29 | 30 | def self.db_exists? 31 | File.exist?(db_filename) 32 | end 33 | 34 | def self.schema_exists?(db) 35 | db.tables.include? :job_logs 36 | end 37 | 38 | def self.db_filename 39 | base = Pathname.new(XDG::Data.new.home) 40 | base.join('job_log.db') 41 | end 42 | 43 | def self.initialize_db 44 | FileUtils.mkdir_p(File.dirname(db_filename)) 45 | end 46 | 47 | def self.initialize_schema(db) 48 | db.create_table :job_logs do 49 | primary_key :id 50 | String :job_id, null: false 51 | String :last_id 52 | Time :highest_timestamp 53 | Integer :num_records_processed 54 | boolean :success, default: false 55 | Time :started_at 56 | Time :finished_at 57 | end 58 | end 59 | 60 | # Create a new JobLogger 61 | def initialize(job) 62 | @job_log = JobLog.new do |job_log| 63 | job_log.job = job 64 | end 65 | end 66 | 67 | # Save this JobLogger's JobLog to db 68 | def save 69 | return unless @job_log.save_log? 
70 | 71 | JobLogger.with_db_connection do |db| 72 | dataset = db[:job_logs] 73 | dataset.insert(@job_log.serialize) 74 | end 75 | end 76 | 77 | def summarize 78 | @job_log.inspect 79 | end 80 | end 81 | end 82 | end 83 | -------------------------------------------------------------------------------- /spec/chronicle/etl/cli/connectors_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::CLI::Connectors do 4 | describe '#list' do 5 | it 'lists installed connectors' do 6 | expected_klasses = Chronicle::ETL::Registry::Connectors.connectors.map(&:klass_name) 7 | 8 | stdout, = invoke_cli(%w[connectors:list]) 9 | outputted_klasses = stdout 10 | .split("\n") # ignore the ascii table header 11 | .drop(1) # parse out the connector classes 12 | .map { |k| k.match(/(Chronicle::(\w+)::(\w+))/)&.captures&.first } 13 | .compact 14 | 15 | expect(expected_klasses).to match_array(outputted_klasses) 16 | end 17 | end 18 | 19 | describe '#show' do 20 | context 'with a a bad phase type' do 21 | it 'will exit with an error' do 22 | expect do 23 | invoke_cli(%w[connectors:show transmorpher foo], false) 24 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 25 | end 26 | 27 | it 'will show an error message' do 28 | _, stderr = invoke_cli(%w[connectors:show transmorpher foo]) 29 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/must be one of/) 30 | end 31 | end 32 | 33 | context 'for a connector that does not exist' do 34 | it 'will exit with an error' do 35 | expect do 36 | invoke_cli(%w[connectors:show extractor foo], false) 37 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 38 | end 39 | 40 | it 'will show an error' do 41 | _, stderr = invoke_cli(%w[connectors:show extractor unknown]) 42 | # puts stderr 43 | expect(stderr).to match(/Could not find/) 44 | end 45 | end 46 | 47 | context 'for a connector that exists' do 48 | it 
'can show basic information a connector' do 49 | output = invoke_cli(%w[connectors:show extractor csv]).first.split("\n").map(&:uncolorize) 50 | expect(output.first).to eql('Chronicle::ETL::CSVExtractor') 51 | end 52 | end 53 | end 54 | 55 | describe '#help' do 56 | it 'outputs help for connectors' do 57 | expect(invoke_cli(%w[connectors help]).first).to match(/COMMANDS/) 58 | end 59 | 60 | it 'outputs help for a connector subcommand' do 61 | expect(invoke_cli(%w[connectors help list]).first).to match(/Usage:/) 62 | end 63 | end 64 | end 65 | -------------------------------------------------------------------------------- /lib/chronicle/etl/loaders/table_loader.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tty/table' 4 | require 'chronicle/utils/hash_utils' 5 | require 'active_support/core_ext/string/filters' 6 | require 'active_support/core_ext/hash/reverse_merge' 7 | 8 | module Chronicle 9 | module ETL 10 | class TableLoader < Chronicle::ETL::Loader 11 | 12 | register_connector do |r| 13 | r.identifier = :table 14 | r.description = 'an ASCII table' 15 | end 16 | 17 | setting :truncate_values_at, default: 40 18 | setting :table_renderer, default: :basic 19 | setting :fields_exclude, default: ['type'] 20 | setting :header_row, default: true 21 | 22 | def load(record) 23 | records << record 24 | end 25 | 26 | def finish 27 | return if records.empty? 
28 | 29 | headers = gather_headers(records) 30 | rows = build_rows(records, headers) 31 | 32 | render_table(headers, rows) 33 | end 34 | 35 | def records 36 | @records ||= [] 37 | end 38 | 39 | private 40 | 41 | def render_table(headers, rows) 42 | @table = TTY::Table.new(header: (headers if @config.header_row), rows: rows) 43 | puts @table.render( 44 | @config.table_renderer.to_sym, 45 | padding: [0, 2, 0, 0] 46 | ) 47 | rescue TTY::Table::ResizeError 48 | # The library throws this error before trying to render the table 49 | # vertically. These options seem to work. 50 | puts @table.render( 51 | @config.table_renderer.to_sym, 52 | padding: [0, 2, 0, 0], 53 | width: 10_000, 54 | resize: false 55 | ) 56 | end 57 | 58 | def gather_headers(records) 59 | records_flattened = records.map do |record| 60 | Chronicle::Utils::HashUtils.flatten_hash(record.to_h) 61 | end 62 | records_flattened.flat_map(&:keys).uniq 63 | end 64 | 65 | def build_rows(records, headers) 66 | records.map do |record| 67 | values = Chronicle::Utils::HashUtils.flatten_hash(record.to_h) 68 | .values_at(*headers) 69 | .map { |value| force_utf8(value.to_s) } 70 | 71 | values = values.map { |value| value.truncate(@config.truncate_values_at) } if @config.truncate_values_at 72 | 73 | values 74 | end 75 | end 76 | end 77 | end 78 | end 79 | -------------------------------------------------------------------------------- /lib/chronicle/etl/cli/secrets.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tty-prompt' 4 | 5 | module Chronicle 6 | module ETL 7 | module CLI 8 | # CLI commands for working with ETL secrets 9 | class Secrets < SubcommandBase 10 | default_task 'list' 11 | namespace :secrets 12 | 13 | desc 'set NAMESPACE KEY [VALUE]', 'Add a secret. 
VALUE can be set as argument or from stdin' 14 | def set(namespace, key, value = nil) 15 | validate_namespace(namespace) 16 | 17 | if value 18 | # came as argument 19 | elsif $stdin.respond_to?(:stat) && $stdin.stat.pipe? 20 | value = $stdin.read 21 | else 22 | prompt = TTY::Prompt.new 23 | value = prompt.mask("Please enter #{key} for #{namespace}:") 24 | end 25 | 26 | Chronicle::ETL::Secrets.set(namespace, key, value.strip) 27 | cli_exit(message: 'Secret set') 28 | rescue TTY::Reader::InputInterrupt 29 | cli_fail(message: "\nSecret not set") 30 | end 31 | 32 | desc 'unset NAMESPACE KEY', 'Remove a secret' 33 | def unset(namespace, key) 34 | validate_namespace(namespace) 35 | 36 | Chronicle::ETL::Secrets.unset(namespace, key) 37 | cli_exit(message: 'Secret unset') 38 | end 39 | 40 | desc 'list', 'List available secrets' 41 | def list(namespace = nil) 42 | all_secrets = Chronicle::ETL::Secrets.all(namespace) 43 | cli_exit(message: 'No secrets are stored') unless all_secrets.any? 44 | 45 | rows = [] 46 | all_secrets.each do |namespace, secrets| 47 | rows += secrets.map do |key, value| 48 | # hidden_value = (value[0..5] + ("*" * [0, [value.length - 5, 30].min].max)).truncate(30) 49 | truncated_value = value&.truncate(30) 50 | [namespace, key, truncated_value] 51 | end 52 | end 53 | 54 | headers = %w[namespace key value].map { |h| h.upcase.bold } 55 | 56 | puts 'Available secrets:' 57 | table = TTY::Table.new(headers, rows) 58 | puts table.render(indent: 0, padding: [0, 2]) 59 | end 60 | 61 | private 62 | 63 | def validate_namespace(namespace) 64 | return if Chronicle::ETL::Secrets.valid_namespace_name?(namespace) 65 | 66 | cli_fail(message: "'#{namespace}' is not a valid namespace") 67 | end 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /spec/chronicle/etl/cli/jobs_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | 
RSpec.describe Chronicle::ETL::CLI::Jobs do 4 | let(:csv_filename) { 'spec/support/sample_data/test.csv' } 5 | let(:csv_job_args) do 6 | %w[ 7 | --extractor csv 8 | --log-level fatal 9 | --extractor-opts 10 | ] << "input:#{csv_filename}" 11 | end 12 | 13 | describe 'chronicle-etl jobs:run' do 14 | it 'run a simple job' do 15 | file_record_count = File.read(csv_filename).each_line.count - 1 16 | 17 | args = ['jobs:run'] << csv_job_args 18 | output, = invoke_cli(args) 19 | 20 | # jsonl output 21 | expect(output.split("\n").count).to eql(file_record_count) 22 | end 23 | 24 | context 'for jobs with required plugins not installed' do 25 | include_context 'mocked stdin' 26 | 27 | it 'will prompt to install plugin' do 28 | args = %w[jobs:run -e unknown:extractor --log-level fatal] 29 | load_stdin('n') 30 | output, = invoke_cli(args) 31 | expect(output).to match(/want to install/) 32 | end 33 | end 34 | end 35 | 36 | describe 'chronicle-etl jobs:show' do 37 | it 'shows details about a simple job' do 38 | args = ['jobs:show'] << csv_job_args 39 | output, = invoke_cli(args) 40 | 41 | expect(output).to match(/Extracting from/) 42 | expect(output).to match(/Transforming/) 43 | expect(output).to match(/Loading/) 44 | # TODO: do more precise matching based on job 45 | end 46 | end 47 | 48 | describe 'chronicle-etl jobs:edit' do 49 | it 'launches an editor' do 50 | # TODO 51 | end 52 | end 53 | 54 | describe 'chronicle-etl jobs:save' do 55 | include_context 'mocked config directory' 56 | 57 | it 'can save a job file' do 58 | args = %w[jobs:save test-job] 59 | expect { invoke_cli(args) } 60 | .to change { Chronicle::ETL::Config.available_jobs.count } 61 | .by(1) 62 | end 63 | end 64 | 65 | describe 'chronicle-etl jobs:list' do 66 | include_context 'mocked config directory' 67 | 68 | it 'lists available jobs' do 69 | output, = invoke_cli(%w[jobs list]) 70 | expect(output.split("\n").last).to match('^ command') 71 | end 72 | end 73 | 74 | describe 'chronicle-etl jobs help' do 75 | 
it 'outputs help for jobs' do 76 | expect(invoke_cli(%w[jobs help]).first).to match(/COMMANDS/) 77 | end 78 | 79 | it 'outputs help for a job subcommand' do 80 | expect(invoke_cli(%w[jobs help show]).first).to match(/Usage:/) 81 | end 82 | end 83 | end 84 | -------------------------------------------------------------------------------- /lib/chronicle/etl/cli/connectors.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | module CLI 6 | # CLI commands for working with ETL connectors 7 | # 8 | # @todo make this work with new plugin system (i.e. no loading of all plugins) 9 | class Connectors < SubcommandBase 10 | default_task 'list' 11 | namespace :connectors 12 | 13 | desc 'list', 'Lists available connectors' 14 | # Display all available connectors that chronicle-etl has access to 15 | def list 16 | connector_info = Chronicle::ETL::Registry::Connectors.connectors.map do |connector_registration| 17 | { 18 | identifier: connector_registration.identifier, 19 | phase: connector_registration.phase, 20 | description: connector_registration.descriptive_phrase, 21 | source: connector_registration.source, 22 | core: connector_registration.built_in? ? 
'✓' : '', 23 | class: connector_registration.klass_name 24 | } 25 | end 26 | 27 | connector_info = connector_info.sort_by do |a| 28 | [a[:core].to_s, a[:provider], a[:phase], a[:identifier]] 29 | end 30 | 31 | headers = connector_info.first.keys.map do |key| 32 | key.to_s.upcase.bold 33 | end 34 | 35 | table = TTY::Table.new(headers, connector_info.map(&:values)) 36 | puts table.render(indent: 0, padding: [0, 2]) 37 | end 38 | 39 | desc 'show PHASE IDENTIFIER', 'Show information about a connector' 40 | def show(phase, identifier) 41 | unless %w[extractor transformer loader].include?(phase) 42 | cli_fail(message: 'Phase argument must be one of: [extractor, transformer, loader]') 43 | end 44 | 45 | begin 46 | connector = Chronicle::ETL::Registry::Connectors.find_by_phase_and_identifier(phase.to_sym, identifier) 47 | rescue Chronicle::ETL::ConnectorNotAvailableError, Chronicle::ETL::PluginError => e 48 | cli_fail(message: "Could not find #{phase} #{identifier}", exception: e) 49 | end 50 | 51 | puts connector.klass.to_s.bold 52 | puts " #{connector.descriptive_phrase}" 53 | puts 54 | puts 'Settings:' 55 | 56 | headers = %w[name default required].map { |h| h.to_s.upcase.bold } 57 | 58 | settings = connector.klass.settings.map do |name, setting| 59 | [ 60 | name, 61 | setting.default, 62 | setting.required ? 
'yes' : 'no' 63 | ] 64 | end 65 | table = TTY::Table.new(headers, settings) 66 | puts table.render(indent: 0, padding: [0, 2]) 67 | end 68 | end 69 | end 70 | end 71 | end 72 | -------------------------------------------------------------------------------- /lib/chronicle/etl/cli/plugins.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tty-prompt' 4 | require 'tty-spinner' 5 | 6 | module Chronicle 7 | module ETL 8 | module CLI 9 | # CLI commands for working with ETL plugins 10 | class Plugins < SubcommandBase 11 | default_task 'list' 12 | namespace :plugins 13 | 14 | desc 'install', 'Install a plugin' 15 | def install(*plugins) 16 | cli_fail(message: 'Please specify a plugin to install') unless plugins.any? 17 | 18 | installed, not_installed = plugins.partition do |plugin| 19 | Chronicle::ETL::Registry::Plugins.installed?(plugin) 20 | end 21 | 22 | puts "Already installed: #{installed.join(', ')}" if installed.any? 23 | cli_exit unless not_installed.any? 
24 | 25 | spinner = TTY::Spinner.new("[:spinner] Installing #{not_installed.join(', ')}...", format: :dots_2) 26 | spinner.auto_spin 27 | 28 | not_installed.each do |plugin| 29 | spinner.update(title: "Installing #{plugin}") 30 | Chronicle::ETL::Registry::Plugins.install(plugin) 31 | rescue Chronicle::ETL::PluginError => e 32 | spinner.error('Error'.red) 33 | cli_fail(message: "Plugin '#{plugin}' could not be installed", exception: e) 34 | end 35 | 36 | spinner.success("(#{'successful'.green})") 37 | end 38 | 39 | desc 'uninstall', 'Unintall a plugin' 40 | def uninstall(name) 41 | spinner = TTY::Spinner.new("[:spinner] Uninstalling plugin #{name}...", format: :dots_2) 42 | spinner.auto_spin 43 | Chronicle::ETL::Registry::Plugins.uninstall(name) 44 | spinner.success("(#{'successful'.green})") 45 | rescue Chronicle::ETL::PluginError => e 46 | spinner.error('Error'.red) 47 | cli_fail(message: "Plugin '#{name}' could not be uninstalled (was it installed?)", exception: e) 48 | end 49 | 50 | desc 'list', 'Lists available plugins' 51 | # Display all available plugins that chronicle-etl has access to 52 | def list 53 | values = Chronicle::ETL::Registry::Plugins.all 54 | .map do |plugin| 55 | [ 56 | plugin.name, 57 | plugin.description, 58 | plugin.installed ? 
'✓' : '', 59 | plugin.version 60 | ] 61 | end 62 | 63 | headers = %w[name description installed version].map { |h| h.to_s.upcase.bold } 64 | table = TTY::Table.new(headers, values) 65 | puts 'Available plugins:' 66 | puts table.render( 67 | indent: 2, 68 | padding: [0, 0], 69 | alignments: %i[left left center left] 70 | ) 71 | end 72 | end 73 | end 74 | end 75 | end 76 | -------------------------------------------------------------------------------- /lib/chronicle/etl/config.rb: -------------------------------------------------------------------------------- 1 | require 'active_support/core_ext/hash/keys' 2 | require 'fileutils' 3 | require 'yaml' 4 | 5 | module Chronicle 6 | module ETL 7 | # Utility methods to read, write, and access config files 8 | module Config 9 | extend self 10 | 11 | attr_accessor :xdg_environment 12 | 13 | def load(type, identifier) 14 | base = config_pathname_for_type(type) 15 | path = base.join("#{identifier}.yml") 16 | return {} unless path.exist? 17 | 18 | YAML.safe_load_file(path, symbolize_names: true, permitted_classes: [Symbol, Date, Time]) 19 | end 20 | 21 | # Writes a hash as a yml config file 22 | def write(type, identifier, data) 23 | base = config_pathname_for_type(type) 24 | path = base.join("#{identifier}.yml") 25 | 26 | data.deep_stringify_keys! 27 | FileUtils.mkdir_p(File.dirname(path)) 28 | File.open(path, 'w', 0o600) do |f| 29 | # Ruby likes to add --- separators when writing yaml files 30 | f << data.to_yaml.gsub(/^-+\n/, '') 31 | end 32 | end 33 | 34 | # Returns path for a given config type and identifier 35 | def path(type, identifier) 36 | base = config_pathname_for_type(type) 37 | base.join("#{identifier}.yml") 38 | end 39 | 40 | # Whether a config exists for a given type and identifier 41 | def exists?(type, identifier) 42 | base = config_pathname_for_type(type) 43 | path = base.join("#{identifier}.yml") 44 | path.exist? 
45 | end 46 | 47 | # Returns all jobs available in ~/.config/chronicle/etl/jobs/*.yml 48 | def available_jobs 49 | Dir.glob(File.join(config_pathname_for_type('jobs'), '*.yml')).map do |filename| 50 | File.basename(filename, '.*') 51 | end 52 | end 53 | 54 | # Returns all configs available for a given type 55 | def available_configs(type) 56 | Dir.glob(File.join(config_pathname_for_type(type), '*.yml')).map do |filename| 57 | File.basename(filename, '.*') 58 | end 59 | end 60 | 61 | # Load a job definition from job config directory 62 | def read_job(job_name) 63 | definition = load('jobs', job_name) 64 | definition[:name] ||= job_name 65 | definition 66 | end 67 | 68 | def config_pathname 69 | base = Pathname.new(xdg_config.config_home) 70 | base.join('chronicle', 'etl') 71 | end 72 | 73 | def config_pathname_for_type(type) 74 | config_pathname.join(type) 75 | end 76 | 77 | def xdg_config 78 | # Only used for overriding ENV['HOME'] for XDG-related specs 79 | if @xdg_environment 80 | XDG::Environment.new(environment: @xdg_environment) 81 | else 82 | XDG::Environment.new 83 | end 84 | end 85 | end 86 | end 87 | end 88 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | # Abstract class representing an Transformer for an ETL job 6 | class Transformer 7 | extend Chronicle::ETL::Registry::SelfRegistering 8 | include Chronicle::ETL::Configurable 9 | 10 | attr_reader :stashed_records 11 | 12 | # Construct a new instance of this transformer. Options are passed in from a Runner 13 | # == Parameters: 14 | # options:: 15 | # Options for configuring this Transformer 16 | def initialize(options = {}) 17 | apply_options(options) 18 | end 19 | 20 | # Called once for each extracted record. Can return 0 or more transformed records. 
21 | def call(record, &block) 22 | raise ArgumentError, 'Input must be a Chronicle::ETL::Record' unless record.is_a?(Record) 23 | 24 | yielded = false 25 | 26 | transformed_data = transform(record) do |data| 27 | new_record = update_data(record, data) 28 | block.call(new_record) 29 | 30 | yielded = true 31 | end 32 | 33 | return if yielded 34 | 35 | # Handle transformers that don't yield anything and return 36 | # transformed data directly. Skip nil values. 37 | [transformed_data].flatten.compact.each do |data| 38 | new_record = update_data(record, data) 39 | block.call(new_record) 40 | end 41 | end 42 | 43 | def call_finish(&block) 44 | remaining_records = finish 45 | return if remaining_records.nil? 46 | 47 | remaining_records.each do |record| 48 | block.call(record) 49 | end 50 | end 51 | 52 | def transform(_record) 53 | raise NotImplementedError, 'You must implement the transform method' 54 | end 55 | 56 | # Called once after runner has processed all records 57 | def finish; end 58 | 59 | protected 60 | 61 | def stash_record(record) 62 | @stashed_records ||= [] 63 | @stashed_records << record 64 | nil 65 | end 66 | 67 | def flush_stashed_records 68 | @stashed_records.tap(&:clear) 69 | end 70 | 71 | def update_data(record, new_data) 72 | new_record = record.clone 73 | new_record.data = new_data 74 | new_record 75 | end 76 | end 77 | end 78 | end 79 | 80 | require_relative 'null_transformer' 81 | require_relative 'sampler_transformer' 82 | require_relative 'buffer_transformer' 83 | require_relative 'multiply_transformer' 84 | require_relative 'sort_transformer' 85 | require_relative 'chronicle_transformer' 86 | require_relative 'format_transformer' 87 | require_relative 'filter_fields_transformer' 88 | require_relative 'fields_limit_transformer' 89 | require_relative 'merge_meta_transformer' 90 | require_relative 'filter_transformer' 91 | require_relative 'chronobase_transformer' 92 | 
-------------------------------------------------------------------------------- /lib/chronicle/etl/cli/authorizations.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'sinatra' 4 | require 'launchy' 5 | 6 | module Chronicle 7 | module ETL 8 | module CLI 9 | # CLI commands for authorizing chronicle-etl with third-party services 10 | class Authorizations < SubcommandBase 11 | default_task 'new' 12 | namespace :authorizations 13 | 14 | desc 'authorize', 'Authorize with a third-party provider' 15 | option :port, desc: 'Port to run authorization server on', type: :numeric, default: 4567 16 | option :credentials, desc: 'Secrets namespace for where to read credentials from (default: PROVIDER)', 17 | type: :string, banner: 'NAMESPACE' 18 | option :secrets, desc: 'Secrets namespace for where authorization should be saved to (default: PROVIDER)', 19 | type: :string, banner: 'NAMESPACE' 20 | option :print, desc: 'Show authorization results (instead of just saving secrets)', type: :boolean, 21 | default: false 22 | def new(provider) 23 | authorizer_klass = find_authorizer_klass(provider) 24 | credentials = load_credentials(provider: provider, credentials_source: options[:credentials]) 25 | authorizer = authorizer_klass.new(port: options[:port], credentials: credentials) 26 | 27 | secrets = authorizer.authorize! 
28 | secrets_namespace = options[:secrets] || provider 29 | Chronicle::ETL::Secrets.set_all(secrets_namespace, secrets) 30 | 31 | pp secrets if options[:print] 32 | 33 | cli_exit(message: "Authorization saved to '#{secrets_namespace}' secrets") 34 | rescue StandardError => e 35 | cli_fail(message: "Authorization not successful.\n#{e.message}", exception: e) 36 | end 37 | 38 | private 39 | 40 | def find_authorizer_klass(provider) 41 | # TODO: this assumes provider:plugin one-to-one 42 | unless Chronicle::ETL::Registry::Plugins.installed?(provider) 43 | cli_fail(message: "Plugin for #{provider} is not installed.") 44 | end 45 | 46 | begin 47 | Chronicle::ETL::Registry::Plugins.activate(provider) 48 | rescue PluginError => e 49 | cli_fail(message: "Could not load plugin '#{provider}'.\n" + e.message, exception: e) 50 | end 51 | 52 | Authorizer.find_by_provider(provider.to_sym) || cli_fail(message: "No authorizer available for '#{provider}'") 53 | end 54 | 55 | def load_credentials(provider:, credentials_source: nil) 56 | if credentials_source && !Chronicle::ETL::Secrets.exists?(credentials_source) 57 | cli_fail(message: "OAuth credentials specified as '#{credentials_source}' but a secrets namespace with that name does not exist.") 58 | end 59 | 60 | Chronicle::ETL::Secrets.read(credentials_source || provider) 61 | end 62 | end 63 | end 64 | end 65 | end 66 | -------------------------------------------------------------------------------- /lib/chronicle/etl/job_log.rb: -------------------------------------------------------------------------------- 1 | require 'forwardable' 2 | 3 | module Chronicle 4 | module ETL 5 | # A record of what happened in the running of a job. 
We're interested in 6 | # tracking when it ran, if it was successful, and what the latest record 7 | # we found is (to use as a cursor for the next time) 8 | class JobLog 9 | extend Forwardable 10 | 11 | attr_accessor :job, 12 | :job_id, 13 | :last_id, 14 | :highest_timestamp, 15 | :num_records_processed, 16 | :started_at, 17 | :finished_at, 18 | :success 19 | 20 | def_delegators :@job, :save_log? 21 | 22 | # Create a new JobLog for a given Job 23 | def initialize 24 | @num_records_processed = 0 25 | @success = false 26 | yield self if block_given? 27 | end 28 | 29 | # Log the result of a single transformation in a job 30 | # @param transformer [Chronicle::ETL::Tranformer] The transformer that ran 31 | def log_transformation(_transformer) 32 | # @last_id = transformer.id if transformer.id 33 | 34 | # Save the highest timestamp that we've encountered so far 35 | # @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp 36 | 37 | # TODO: a transformer might yield nil. We might also want certain transformers to explode 38 | # records into multiple new ones. 
Therefore, this this variable will need more subtle behaviour 39 | @num_records_processed += 1 40 | end 41 | 42 | # Indicate that a job has started 43 | def start 44 | @started_at = Time.now 45 | end 46 | 47 | # Indicate that a job has finished 48 | def finish 49 | @finished_at = Time.now 50 | @success = true 51 | end 52 | 53 | def error 54 | @finished_at = Time.now 55 | end 56 | 57 | def job=(job) 58 | @job = job 59 | @job_id = job.id 60 | end 61 | 62 | def duration 63 | return unless @finished_at && @started_at 64 | 65 | @finished_at - @started_at 66 | end 67 | 68 | # Take a JobLog's instance variables and turn them into a hash representation 69 | def serialize 70 | { 71 | job_id: @job_id, 72 | last_id: @last_id, 73 | highest_timestamp: @highest_timestamp, 74 | num_records_processed: @num_records_processed, 75 | started_at: @started_at, 76 | finished_at: @finished_at, 77 | success: @success 78 | } 79 | end 80 | 81 | # Create a new JobLog and set its instance variables from a serialized hash 82 | def self.build_from_serialized(attrs) 83 | attrs.delete(:id) 84 | new do |job_log| 85 | attrs.each do |key, value| 86 | setter = :"#{key}=" 87 | job_log.send(setter, value) 88 | end 89 | end 90 | end 91 | end 92 | end 93 | end 94 | -------------------------------------------------------------------------------- /lib/chronicle/etl/job.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'forwardable' 4 | 5 | module Chronicle 6 | module ETL 7 | # A runner job 8 | # 9 | # TODO: this can probably be merged with JobDefinition. Not clear 10 | # where the boundaries are 11 | class Job 12 | extend Forwardable 13 | 14 | def_delegators :@job_definition, :dry_run? 
15 | 16 | attr_accessor :name, 17 | :extractor_klass, 18 | :extractor_options, 19 | :transformer_klasses, 20 | :transformer_options, 21 | :loader_klass, 22 | :loader_options, 23 | :job_definition 24 | 25 | # TODO: build a proper id system 26 | alias id name 27 | 28 | def initialize(job_definition) 29 | @job_definition = job_definition 30 | @name = @job_definition.definition[:name] 31 | @extractor_options = @job_definition.extractor_options 32 | @transformer_options = @job_definition.transformer_options 33 | @loader_options = @job_definition.loader_options 34 | 35 | set_continuation if use_continuation? 36 | yield self if block_given? 37 | end 38 | 39 | def instantiate_extractor 40 | @extractor_klass = @job_definition.extractor_klass 41 | @extractor_klass.new(@extractor_options) 42 | end 43 | 44 | def instantiate_transformers 45 | @job_definition.transformer_klasses.each_with_index.map do |klass, i| 46 | klass.new(@transformer_options[i] || {}) 47 | end 48 | end 49 | 50 | def instantiate_loader 51 | @loader_klass = @job_definition.loader_klass 52 | @loader_klass.new(@loader_options) 53 | end 54 | 55 | def save_log? 56 | # TODO: this needs more nuance 57 | !id.nil? 
58 | end 59 | 60 | def to_s 61 | output = "Job summary\n".upcase.bold 62 | # output = "" 63 | output += "#{name}:\n" if name 64 | output += "→ Extracting from #{@job_definition.extractor_klass.description}\n" 65 | output += options_to_s(@extractor_options) 66 | 67 | @job_definition.transformer_klasses.each do |klass| 68 | output += "→ Transforming #{klass.description}\n" 69 | end 70 | # TODO: transformer options 71 | output += "→ Loading to #{@job_definition.loader_klass.description}\n" 72 | output += options_to_s(@loader_options) 73 | output 74 | end 75 | 76 | private 77 | 78 | def options_to_s(options, indent: 4) 79 | output = '' 80 | options.each do |k, v| 81 | output += "#{' ' * indent}#{k.to_s.light_blue}: #{v}\n" 82 | end 83 | output 84 | end 85 | 86 | def set_continuation 87 | continuation = Chronicle::ETL::JobLogger.load_latest(@id) 88 | @extractor_options[:continuation] = continuation 89 | end 90 | 91 | def use_continuation? 92 | @job_definition.incremental? 93 | end 94 | end 95 | end 96 | end 97 | -------------------------------------------------------------------------------- /lib/chronicle/etl/transformers/chronobase_transformer.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Chronicle 4 | module ETL 5 | class ChronobaseTransformer < Chronicle::ETL::Transformer 6 | PROPERTY_MAP = { 7 | source: :provider, 8 | source_id: :provider_id, 9 | url: :provider_url, 10 | end_time: :end_at, 11 | start_time: :start_at, 12 | 13 | name: :title, 14 | description: :body, 15 | text: :body, 16 | 17 | recipient: :consumers, 18 | agent: :actor, 19 | object: :involved, 20 | 21 | # music ones 22 | by_artist: :creators, 23 | in_album: :containers 24 | }.freeze 25 | 26 | VERB_MAP = { 27 | ListenAction: 'listened', 28 | CommunicateAction: 'messaged' 29 | }.freeze 30 | 31 | ENTITY_MAP = { 32 | MusicRecording: 'song', 33 | MusicAlbum: 'album', 34 | MusicGroup: 'musicartist', 35 | Message: 
'message', 36 | Person: 'person' 37 | }.freeze 38 | 39 | register_connector do |r| 40 | r.identifier = :chronobase 41 | r.description = 'records to chronobase schema' 42 | end 43 | 44 | def transform(record) 45 | deeply_convert_record(record.data) 46 | end 47 | 48 | private 49 | 50 | def deeply_convert_record(record) 51 | type = activity?(record) ? 'activity' : 'entity' 52 | 53 | properties = record.properties.compact.each_with_object({}) do |(k, v), h| 54 | key = PROPERTY_MAP[k.to_sym] || k 55 | h[key] = v 56 | end 57 | 58 | properties[:verb] = VERB_MAP[record.type_id.to_sym] if VERB_MAP.key?(record.type_id.to_sym) 59 | properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym) 60 | 61 | properties.transform_values! do |v| 62 | case v 63 | when Chronicle::Models::Base 64 | deeply_convert_record(v) 65 | when Array 66 | v.map { |e| e.is_a?(Chronicle::Models::Base) ? deeply_convert_record(e) : e } 67 | else 68 | v 69 | end 70 | end 71 | 72 | Chronicle::Serialization::Record.new( 73 | id: record.id, 74 | type: type, 75 | properties: properties.compact, 76 | meta: { 77 | dedupe_on: transform_dedupe_on(record) 78 | }, 79 | schema: 'chronobase' 80 | ) 81 | end 82 | 83 | def activity?(record) 84 | record.type_id.end_with?('Action') 85 | end 86 | 87 | def transform_dedupe_on(record) 88 | property_map_with_type = PROPERTY_MAP.merge({ 89 | type: activity?(record) ? 
:verb : :represents 90 | }) 91 | 92 | record.dedupe_on.map do |set| 93 | set.map do |d| 94 | property_map_with_type[d] || d 95 | end.join(',') 96 | end 97 | end 98 | end 99 | end 100 | end 101 | -------------------------------------------------------------------------------- /lib/chronicle/etl/loaders/json_loader.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tempfile' 4 | 5 | module Chronicle 6 | module ETL 7 | class JSONLoader < Chronicle::ETL::Loader 8 | include Chronicle::ETL::Loaders::Helpers::StdoutHelper 9 | 10 | register_connector do |r| 11 | r.identifier = :json 12 | r.description = 'json' 13 | end 14 | 15 | setting :output 16 | 17 | # If true, one JSON record per line. If false, output a single json 18 | # object with an array of records 19 | setting :line_separated, default: true, type: :boolean 20 | 21 | def initialize(*args) 22 | super 23 | @first_line = true 24 | end 25 | 26 | def start 27 | @output_file = 28 | if output_to_stdout? 29 | create_stdout_temp_file 30 | else 31 | File.open(@config.output, 'w+') 32 | end 33 | 34 | @output_file.puts("[\n") unless @config.line_separated 35 | end 36 | 37 | def load(record) 38 | serialized = record.to_h 39 | 40 | # When dealing with raw data, we can get improperly encoded strings 41 | # (eg from sqlite database columns). We force conversion to UTF-8 42 | # before converting into JSON 43 | # encoded = serialized.transform_values do |value| 44 | # next value unless value.is_a?(String) 45 | 46 | # force_utf8(value) 47 | # end 48 | encoded = deeply_force_utf8(serialized) 49 | 50 | line = encoded.to_json 51 | # For line-separated output, we just put json + newline 52 | if @config.line_separated 53 | line = "#{line}\n" 54 | # Otherwise, we add a comma and newline and then add record to the 55 | # array we created in #start (unless it's the first line). 
56 | else 57 | line = ",\n#{line}" unless @first_line 58 | end 59 | 60 | @output_file.write(line) 61 | 62 | @first_line = false 63 | # rescue StandardError => e 64 | # binding.pry 65 | end 66 | 67 | def finish 68 | # Close the array unless we're doing line-separated JSON 69 | @output_file.puts("\n]") unless @config.line_separated 70 | 71 | write_to_stdout_from_temp_file(@output_file) if output_to_stdout? 72 | 73 | @output_file.close 74 | end 75 | 76 | private 77 | 78 | # TODO: Move this to a helper module 79 | def deeply_force_utf8(hash) 80 | # FIXME: probably shouldn't happen but it does 81 | return hash.map { |x| force_utf8(x) } if hash.is_a?(Array) 82 | return force_utf8(hash) unless hash.is_a?(Hash) 83 | 84 | hash.transform_values do |value| 85 | case value 86 | when String 87 | force_utf8(value) 88 | when Hash 89 | deeply_force_utf8(value) 90 | when Array 91 | value.map { |v| deeply_force_utf8(v) } 92 | else 93 | value 94 | end 95 | end 96 | end 97 | end 98 | end 99 | end 100 | -------------------------------------------------------------------------------- /spec/chronicle/etl/cli/authorizations_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::CLI::Authorizations do 4 | describe '#new' do 5 | context 'with an available plugin' do 6 | before do 7 | %w[foo empty error].each do |plugin| 8 | path = File.expand_path(File.join(RSPEC_ROOT, "support/mock_plugins/chronicle-#{plugin}")) 9 | $LOAD_PATH.unshift(path) 10 | Chronicle::ETL::Registry::Plugins.register_standalone(name: plugin) 11 | end 12 | end 13 | 14 | it 'can authorize' do 15 | FakeFS.with_fresh do 16 | invoke_cli(%w[authorizations:new foo]) 17 | 18 | expect(Chronicle::ETL::Secrets.read('foo')).to eql({ token: 'abc' }) 19 | end 20 | end 21 | 22 | it 'can print authorization results to stdout' do 23 | FakeFS.with_fresh do 24 | stdout, = invoke_cli(%w[authorizations:new foo --print]) 25 | 26 | 
expect(stdout).to match(/abc/) 27 | end 28 | end 29 | end 30 | 31 | context "for credentials specified that don't exist" do 32 | it 'will exit with an error' do 33 | expect do 34 | invoke_cli(%w[authorizations:new foo --credentials fake123], false) 35 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 36 | end 37 | 38 | it 'will show an error message' do 39 | _, stderr = invoke_cli(%w[authorizations:new foo --credentials fake123]) 40 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/name does not exist/) 41 | end 42 | end 43 | 44 | context "for a plugin that can't be loaded" do 45 | it 'will exit with an error' do 46 | expect do 47 | invoke_cli(%w[authorizations:new error], false) 48 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 49 | end 50 | 51 | it 'will show an error message' do 52 | _, stderr = invoke_cli(%w[authorizations:new error]) 53 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/Could not load/) 54 | end 55 | end 56 | 57 | context "for a plugin that doesn't have an authorizer" do 58 | it 'will exit with an error' do 59 | expect do 60 | invoke_cli(%w[authorizations:new empty], false) 61 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 62 | end 63 | 64 | it 'will show an error message' do 65 | _, stderr = invoke_cli(%w[authorizations:new empty]) 66 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/No authorizer available/) 67 | end 68 | end 69 | 70 | context "for a plugin that's not installed" do 71 | it 'will exit with an error' do 72 | expect do 73 | invoke_cli(%w[authorizations:new foobar123], false) 74 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) } 75 | end 76 | 77 | it 'will show an error message' do 78 | _, stderr = invoke_cli(%w[authorizations:new foobar123]) 79 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/is not installed/) 80 | end 81 | end 82 | end 83 | end 84 | 
-------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at andrew@hyfen.net. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /chronicle-etl.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | lib = File.expand_path('lib', __dir__) 4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) 5 | require 'chronicle/etl/version' 6 | 7 | Gem::Specification.new do |spec| 8 | spec.name = 'chronicle-etl' 9 | spec.version = Chronicle::ETL::VERSION 10 | spec.authors = ['Andrew Louis'] 11 | spec.email = ['andrew@hyfen.net'] 12 | 13 | spec.summary = 'ETL tool for personal data' 14 | spec.description = 'Chronicle-ETL allows you to extract personal data from a variety of services, transformer it, and load it.' 15 | spec.homepage = 'https://github.com/chronicle-app' 16 | spec.license = 'MIT' 17 | 18 | # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host' 19 | # to allow pushing to a single host or delete this section to allow pushing to any host. 20 | if spec.respond_to?(:metadata) 21 | spec.metadata['allowed_push_host'] = 'https://rubygems.org' 22 | 23 | spec.metadata['homepage_uri'] = spec.homepage 24 | spec.metadata['source_code_uri'] = 'https://github.com/chronicle-app/chronicle-etl' 25 | spec.metadata['changelog_uri'] = 'https://github.com/chronicle-app/chronicle-etl/releases' 26 | else 27 | raise 'RubyGems 2.0 or newer is required to protect against ' \ 28 | 'public gem pushes.' 29 | end 30 | 31 | # Specify which files should be added to the gem when it is released. 32 | # The `git ls-files -z` loads the files in the RubyGem that have been added into git. 
33 | spec.files = Dir.chdir(File.expand_path(__dir__)) do 34 | `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) } 35 | end 36 | spec.bindir = 'exe' 37 | spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) } 38 | spec.require_paths = ['lib'] 39 | spec.required_ruby_version = '>= 3.1' 40 | spec.metadata['rubygems_mfa_required'] = 'true' 41 | 42 | spec.add_dependency 'activesupport', '~> 7.0' 43 | spec.add_dependency 'chronic_duration', '~> 0.10.6' 44 | spec.add_dependency 'chronicle-core', '~> 0.3' 45 | spec.add_dependency 'colorize', '~> 0.8.1' 46 | spec.add_dependency 'gems', '>= 1' 47 | spec.add_dependency 'launchy' 48 | spec.add_dependency 'marcel', '~> 1.0.2' 49 | spec.add_dependency 'omniauth', '~> 2' 50 | spec.add_dependency 'sequel', '~> 5.35' 51 | spec.add_dependency 'sinatra', '~> 2' 52 | spec.add_dependency 'sqlite3', '~> 1.4' 53 | spec.add_dependency 'thor', '~> 1.2' 54 | spec.add_dependency 'thor-hollaback', '~> 0.2' 55 | spec.add_dependency 'tty-progressbar', '~> 0.17' 56 | spec.add_dependency 'tty-prompt', '~> 0.23' 57 | spec.add_dependency 'tty-spinner' 58 | spec.add_dependency 'tty-table', '~> 0.12' 59 | spec.add_dependency 'xdg', '>= 4.0' 60 | 61 | spec.add_development_dependency 'bundler', '~> 2.1' 62 | spec.add_development_dependency 'fakefs', '~> 1.4' 63 | spec.add_development_dependency 'guard-rspec', '~> 4.7.3' 64 | spec.add_development_dependency 'pry-byebug', '~> 3.9' 65 | spec.add_development_dependency 'rake', '~> 13.0' 66 | spec.add_development_dependency 'rspec', '~> 3.9' 67 | spec.add_development_dependency 'rubocop', '~> 1.57' 68 | spec.add_development_dependency 'simplecov', '~> 0.21' 69 | spec.add_development_dependency 'vcr', '~> 6.1' 70 | spec.add_development_dependency 'webmock', '~> 3' 71 | spec.add_development_dependency 'webrick', '~> 1.7' 72 | spec.add_development_dependency 'yard', '~> 0.9.7' 73 | end 74 | 
-------------------------------------------------------------------------------- /spec/chronicle/etl/configurable_spec.rb: -------------------------------------------------------------------------------- 1 | require 'spec_helper' 2 | 3 | RSpec.describe Chronicle::ETL::Configurable do 4 | let(:basic) do 5 | Class.new do 6 | include Chronicle::ETL::Configurable 7 | 8 | setting :foo 9 | 10 | def initialize(options = {}) 11 | apply_options(options) 12 | end 13 | end 14 | end 15 | 16 | let(:inherited) do 17 | Class.new(basic) 18 | end 19 | 20 | let(:inherited_inherited) do 21 | Class.new(inherited) 22 | end 23 | 24 | let(:with_required) do 25 | Class.new(basic) do 26 | setting :req, required: true 27 | end 28 | end 29 | 30 | let(:with_default) do 31 | Class.new(basic) do 32 | setting :def, default: 'default value' 33 | end 34 | end 35 | 36 | let(:with_type_time) do 37 | Class.new(basic) do 38 | setting :since, type: :time 39 | end 40 | end 41 | 42 | describe 'Basic use' do 43 | before do 44 | stub_const('BasicClass', basic) 45 | stub_const('InheritedFromBasicClass', inherited) 46 | stub_const('InheritedInheritedFromBasicClass', inherited_inherited) 47 | end 48 | 49 | it 'can be configured' do 50 | c = BasicClass.new(foo: 'bar') 51 | expect(c.config.foo).to eql('bar') 52 | end 53 | 54 | it 'can inherit settings from superclass' do 55 | c = InheritedFromBasicClass.new(foo: 'bar') 56 | expect(c.config.foo).to eql('bar') 57 | end 58 | 59 | it "can inherit settings from superclass's superclass" do 60 | c = InheritedInheritedFromBasicClass.new(foo: 'bar') 61 | expect(c.config.foo).to eql('bar') 62 | end 63 | 64 | it 'does not configure unrecognized settings' do 65 | c = BasicClass.new(arbitrary_setting: 'bar') 66 | expect(c.config.arbitrary_setting).to be_nil 67 | end 68 | end 69 | 70 | describe 'Required settings' do 71 | before do 72 | stub_const('RequiredSettingClass', with_required) 73 | stub_const('RequiredSettingSubclass', Class.new(RequiredSettingClass) { setting(:req, 
# A setting's `default:` is used when no option is supplied, and a subclass
# may redeclare the setting to replace that default.
describe 'Default values' do
  before do
    stub_const('DefaultSettingClass', with_default)

    # Built after the parent constant is stubbed, since it subclasses it
    overriding_subclass = Class.new(DefaultSettingClass) { setting(:def, default: 'new value') }
    stub_const('DefaultSettingSubclass', overriding_subclass)
  end

  it 'has a default value set' do
    instance = DefaultSettingClass.new(foo: 'bar')
    expect(instance.config.def).to eql('default value')
  end

  it 'can have a default value overriden by a subclass' do
    instance = DefaultSettingSubclass.new(foo: 'bar')
    expect(instance.config.def).to eql('new value')
  end
end
# Base connector class that connectors of the given phase descend from.
#
# @param [Symbol] phase one of :extractor, :transformer, :loader
# @return [Class, nil] the matching base class, or nil for any other value
def self.ancestor_for_phase(phase)
  # Guard clauses keep constant lookup lazy: only the matched class is resolved
  return Chronicle::ETL::Extractor if phase == :extractor
  return Chronicle::ETL::Transformer if phase == :transformer

  Chronicle::ETL::Loader if phase == :loader
end
# Find connector and load relevant plugin to find it if necessary
#
# Lookup order:
# 1. connectors already registered whose identifier matches exactly
# 2. a local ruby file, when the identifier contains a dot
# 3. a plugin connector addressed as `plugin[:type[:strategy]]`
#    (e.g. `lastfm:listens:api`); the plugin is activated first
#
# @param [Symbol, String] phase phase the connector must belong to
# @param [Symbol, String] identifier connector identifier or path to a file
# @return the matching connector registration
# @raise [Chronicle::ETL::PluginNotInstalledError] if the plugin is not installed
# @raise [ConnectorNotAvailableError] if no connector matches
def self.find_by_phase_and_identifier(phase, identifier)
  connector = find_by_phase_and_identifier_built_in(phase, identifier)
  return connector if connector

  # determine if we need to try to load a local file. if it has a dot in the identifier, we treat it as a file
  return find_by_phase_and_identifier_local(phase, identifier) if identifier.to_s.include?('.')

  # Example identifier: lastfm:listens:api
  # (to_s so that Symbol identifiers, accepted by the lookups above, work here too)
  plugin, type, strategy = identifier.to_s.split(':').map { |part| part.tr('-', '_').to_sym }

  plugin_identifier = plugin.to_s.tr('_', '-')

  unless Chronicle::ETL::Registry::Plugins.installed?(plugin_identifier)
    raise Chronicle::ETL::PluginNotInstalledError, plugin_identifier
  end

  Chronicle::ETL::Registry::Plugins.activate(plugin_identifier)

  # find most specific connector that matches the identifier. The phase has
  # to match as well, otherwise a plugin's extractor could be handed back
  # when a transformer or loader was asked for.
  wanted_phase = phase.to_sym
  connector = connectors.find do |c|
    c.phase == wanted_phase &&
      c.plugin == plugin &&
      (type.nil? || c.type == type) &&
      (strategy.nil? || c.strategy == strategy)
  end

  connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
end
# Describes a single ETL job: which extractor, transformers and loader to
# use and the options to hand to each of them.
#
# A definition starts out as a copy of {SKELETON_DEFINITION} and is built up
# by merging config hashes into it with {#add_config}.
class JobDefinition
  SKELETON_DEFINITION = {
    incremental: false,
    extractor: {
      name: 'stdin',
      options: {}
    },
    transformers: [
      {
        name: 'null',
        options: {}
      }
    ],
    loader: {
      name: 'json',
      options: {}
    }
  }.freeze

  attr_reader :errors
  attr_accessor :definition

  def initialize
    # Deep copy: SKELETON_DEFINITION is only frozen at the top level, so
    # sharing it would let writes such as the ones in #apply_default_secrets
    # leak into the constant's nested hashes (and into every other job).
    @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
  end

  # Runs validation and reports whether it produced any errors
  #
  # @return [Boolean]
  def valid?
    validate
    @errors.empty?
  end

  # Resolve every connector class named in the definition and collect
  # plugin errors under `errors[:plugins]`.
  #
  # Resolution stops at the first Chronicle::ETL::PluginError, so at most
  # one error is recorded per call.
  def validate
    @errors = {}

    extractor_klass
    transformer_klasses
    loader_klass
  rescue Chronicle::ETL::PluginError => e
    @errors[:plugins] ||= []
    @errors[:plugins] << e
  end

  # Whether validation failed because a required plugin is not installed
  #
  # @return [Boolean]
  def plugins_missing?
    validate

    return false unless @errors[:plugins]&.any?

    @errors[:plugins]
      .any? { |e| e.instance_of?(Chronicle::ETL::PluginNotInstalledError) }
  end

  # @return [true] when the definition is valid
  # @raise [Chronicle::ETL::JobDefinitionError] when it is not
  def validate!
    raise(Chronicle::ETL::JobDefinitionError.new(self), 'Job definition is invalid') unless valid?

    true
  end

  # Add config hash to this definition
  def add_config(config = {})
    @definition = @definition.deep_merge(config)
    load_credentials
  end

  # For each connector in this job, mix in secrets into the options
  def apply_default_secrets
    # FIXME: handle transformer secrets
    %i[extractor loader].each do |phase|
      # If the option have a `secrets` key, we look up those secrets and
      # mix them in. If not, use the connector's plugin name and look up
      # secrets with the same namespace
      if @definition[phase][:options][:secrets]
        namespace = @definition[phase][:options][:secrets]
      else
        # We don't want to do this lookup for built-in connectors
        next if __send__(:"#{phase}_klass").connector_registration.built_in?

        # infer plugin name from connector name and use it for secrets
        # namespace
        namespace = @definition[phase][:name].split(':').first
      end

      # Reverse merge secrets into connector's options (we want to preserve
      # options that came from job file or CLI options)
      secrets = Chronicle::ETL::Secrets.read(namespace)
      @definition[phase][:options] = secrets.merge(@definition[phase][:options])
    end
  end

  # Is this job continuing from a previous run?
  def incremental?
    @definition[:incremental]
  end

  # Value of the `dry_run` key (nil when the definition does not set it)
  def dry_run?
    @definition[:dry_run]
  end

  def extractor_klass
    find_connector_klass(:extractor, @definition[:extractor][:name])
  end

  # @return [Array] connector classes, one per configured transformer, in order
  def transformer_klasses
    @definition[:transformers].map do |transformer|
      find_connector_klass(:transformer, transformer[:name])
    end
  end

  def loader_klass
    find_connector_klass(:loader, @definition[:loader][:name])
  end

  def extractor_options
    @definition[:extractor][:options]
  end

  # @return [Array] options, one entry per configured transformer, in order
  def transformer_options
    @definition[:transformers].map do |transformer|
      transformer[:options]
    end
  end

  def loader_options
    @definition[:loader][:options]
  end

  private

  # NOTE(review): Connectors.find_converter_for_source only takes keyword
  # arguments, so the previous positional call could never succeed. This
  # assumes `source_klass` is the value registrations expose as `source` -
  # confirm against callers.
  def find_schema_transformer_klass(source_klass, target)
    Chronicle::ETL::Registry::Connectors
      .find_converter_for_source(source: source_klass, target: target)
      .klass
  end

  def find_connector_klass(phase, identifier)
    Chronicle::ETL::Registry::Connectors.find_by_phase_and_identifier(phase, identifier).klass
  end

  # Merge named credentials into the options of the extractor and loader
  def load_credentials
    %i[extractor loader].each do |phase|
      credentials_name = @definition[phase].dig(:options, :credentials)
      next unless credentials_name

      credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
      # deep_merge returns a new hash, so the result has to be stored back
      # (it used to be thrown away, which made this method a no-op)
      @definition[phase][:options] = @definition[phase][:options].deep_merge(credentials)
    end
  end
end
# Start up an authorization server and handle the oauth flow
#
# Blocks until the server has recorded an authorization or the server
# thread has stopped, then shuts the server down.
#
# @return the value produced by `extract_secrets` for the received
#   authorization and this class's `authorization_to_secret_map`
# @raise [Chronicle::ETL::AuthorizationError] if the server stopped without
#   having received an authorization
def authorize!
  associate_oauth_credentials
  @server = load_server
  spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
  spinner.auto_spin
  spinner.update(title: "Starting temporary authorization server on port #{@port}")

  server_thread = start_authorization_server(port: @port)
  start_oauth_flow

  spinner.update(title: 'Waiting for authorization to complete in your browser')
  sleep 0.1 while authorization_pending?(server_thread)

  @server.quit!
  server_thread.join

  # Only report success once we know an authorization actually arrived;
  # previously the spinner said "successful" right before the error was raised.
  # TODO: properly handle failed authorizations
  latest = @server.latest_authorization
  unless latest
    spinner.error("(#{'failed'.red})")
    raise Chronicle::ETL::AuthorizationError
  end

  spinner.success("(#{'successful'.green})")
  @authorization = latest

  extract_secrets(authorization: @authorization, pluck_values: self.class.authorization_to_secret_map)
end
# Run the job from start to finish: announce it, validate the definition,
# instantiate and prepare the connectors, set up the progress UI and then
# stream the extraction through the transformers into the loader.
#
# `finish_job` runs from the `ensure` clause, i.e. whether or not any of the
# steps above raised.
#
# @raise [Chronicle::ETL::RunnerError] when a Chronicle::ETL::ExtractionError
#   is raised by one of the steps
# @raise [Chronicle::ETL::RunInterruptedError] when the run is interrupted
def run!
  begin_job
  validate_job
  instantiate_connectors
  prepare_job
  prepare_ui
  run_extraction
rescue Chronicle::ETL::ExtractionError => e
  # Mark the job log as errored before re-raising as a runner-level error
  @job_logger&.error
  raise(Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}")
rescue Interrupt
  @job_logger&.error
  raise(Chronicle::ETL::RunInterruptedError, 'Job interrupted.')
# rescue StandardError => e
#   # Just throwing this in here until we have better exception handling in
#   # loaders, etc
#   @job_logger&.error
#   raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
ensure
  finish_job
end
# Build the progress bar for this run, sized by the extractor's reported
# result count, and route logger output through it.
def prepare_ui
  @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(
    title: 'Running job',
    total: @extractor.results_count
  )
  Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
end
# Wrap a stream of records in a lazy stream of transformed records.
#
# Every record of `stream` is passed to `transformer.call`; whatever the
# transformer yields is emitted downstream. Once the input is exhausted,
# `transformer.call_finish` gets a chance to yield any remaining records.
#
# @param [#each] stream upstream records
# @param [#call, #call_finish] transformer
# @return [Enumerator]
def transform_stream(stream, transformer)
  Enumerator.new do |downstream|
    emit = proc { |transformed_record| downstream << transformed_record }

    stream.each { |record| transformer.call(record, &emit) }
    transformer.call_finish(&emit)
  end
end
# Entrypoint for the CLI
#
# Accepts `subcommand:command` as the first argument (e.g. `jobs:run`) and
# splits it into separate arguments so Thor knows how to hand off to the
# subcommand class.
#
# @param [Array<String>] given_args command line arguments; not modified
# @param [Hash] config passed through to the superclass `start`
def self.start(given_args = ARGV, config = {})
  first, *rest = given_args

  # Only a leading subcommand is split: an option such as
  # `--extractor=foo:bar` also contains a colon but must stay intact.
  # A new array is built instead of shift/unshift so the caller's array
  # (ARGV by default) is not left holding a nested array.
  if first.is_a?(String) && !first.start_with?('-') && first.include?(':')
    given_args = first.split(':') + rest
  end

  super(given_args, config)
end
68 | shell.say 69 | shell.say ' Plugins: https://github.com/chronicle-app/chronicle-etl#currently-available' 70 | shell.say 71 | shell.say 'USAGE:'.bold 72 | shell.say ' # Basic job usage:'.italic.light_black 73 | shell.say ' $ chronicle-etl --extractor NAME --transformer NAME --loader NAME' 74 | shell.say 75 | shell.say ' # Read test.csv and display it to stdout as a table:'.italic.light_black 76 | shell.say ' $ chronicle-etl --extractor csv --input data.csv --loader table' 77 | shell.say 78 | shell.say ' # Show available plugins:'.italic.light_black 79 | shell.say ' $ chronicle-etl plugins:list' 80 | shell.say 81 | shell.say ' # Save an access token as a secret and use it in a job:'.italic.light_black 82 | shell.say ' $ chronicle-etl secrets:set pinboard access_token username:foo123' 83 | shell.say ' $ chronicle-etl secrets:list' 84 | shell.say ' $ chronicle-etl -e pinboard --since 1mo' 85 | shell.say 86 | shell.say ' # Show full job options:'.italic.light_black 87 | shell.say ' $ chronicle-etl jobs help run' 88 | shell.say 89 | shell.say 'FULL DOCUMENTATION:'.bold 90 | shell.say ' https://github.com/chronicle-app/chronicle-etl'.blue 91 | shell.say 92 | 93 | list = [] 94 | ::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class| 95 | list += thor_class.printable_tasks(false) 96 | end 97 | list.sort! 
# A mixin that gives a class
# a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
# settings and their properties (require, type, etc)
#
# @example Basic usage
#   class Test < Chronicle::ETL::Extractor
#     include Chronicle::ETL::Configurable
#     setting :when, type: :time, required: true
#   end
#
#   t = Test.new(when: '2022-02-24')
#   t.config.when
module Configurable
  # An individual setting for this Configurable
  Setting = Struct.new(:default, :required, :type, :description)
  private_constant :Setting

  # Collection of user-supplied options for this Configurable
  class Config < OpenStruct
    # Config values that aren't nil, as a hash
    def compacted_h
      to_h.compact
    end
  end

  # @private
  def self.included(klass)
    klass.extend(ClassMethods)
    klass.include(InstanceMethods)
    klass.prepend(Initializer)
  end

  # Initializer method for classes that have Configurable mixed in
  module Initializer
    # Make sure this class has a default @config ready to use
    #
    # Keyword arguments and the block are declared explicitly so that the
    # bare `super` forwards them unchanged; with only `*args`, keywords
    # arrive as a positional hash and a keyword-based `initialize` further
    # up the chain raises ArgumentError.
    def initialize(*args, **kwargs, &block)
      @config = initialize_default_config
      super
    end
  end

  # Instance methods for classes that have Configurable mixed in
  module InstanceMethods
    attr_reader :config

    # Take given options and apply them to this class's settings
    # and make them available in @config and validates that they
    # conform to setting rules
    #
    # @param [Hash] options option names (strings or symbols) to values.
    #   Keys are symbolized in place, unless the hash is frozen, in which
    #   case a symbolized copy is used and the argument is left untouched.
    # @return [Hash] the options with symbolized keys
    # @raise [Chronicle::ETL::ConnectorConfigurationError] if a value can't
    #   be coerced or a required setting is missing
    def apply_options(options)
      options = if options.frozen?
                  options.transform_keys(&:to_sym)
                else
                  options.transform_keys!(&:to_sym)
                end

      options.each do |name, value|
        setting = self.class.all_settings[name]

        # Do nothing with a given option if it's not a connector setting
        next unless setting

        @config[name] = coerced_value(setting, name, value)
      end
      validate_config
      options
    end

    # Name of all settings available to this class
    #
    # (Used to be declared as `def self.settings`, which defined it on the
    # InstanceMethods module itself, where it could only raise.)
    def settings
      self.class.all_settings.keys
    end

    private

    def initialize_default_config
      self.class.config_with_defaults
    end

    # Raise unless every required setting has a non-nil value in @config
    def validate_config
      missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
      raise Chronicle::ETL::ConnectorConfigurationError, "Missing options: #{missing}" if missing.any?
    end

    # Coerce a value with the `coerce_<type>` method matching the setting's
    # type; untyped settings are passed through. Any failure (including an
    # unknown type) is reported as a configuration error.
    def coerced_value(setting, name, value)
      setting.type ? __send__("coerce_#{setting.type}", value) : value
    rescue StandardError
      raise(
        Chronicle::ETL::ConnectorConfigurationError,
        "Could not convert value '#{value}' into a #{setting.type} for setting '#{name}'"
      )
    end

    def coerce_hash(value)
      value.is_a?(Hash) ? value : {}
    end

    def coerce_string(value)
      value.to_s
    end

    # TODO: think about whether to split up float, integer
    def coerce_numeric(value)
      value.to_f
    end

    # Strings are true only when they read "true" (any case); other values
    # are returned as they are
    def coerce_boolean(value)
      if value.is_a?(String)
        value.downcase == 'true'
      else
        value
      end
    end

    def coerce_array(value)
      value.is_a?(Array) ? value : [value]
    end

    def coerce_time(value)
      # parsing yml files might result in us getting Date objects
      # we convert to DateTime first to ensure UTC
      return value.to_datetime.to_time if value.is_a?(Date)

      return value unless value.is_a?(String)

      # Hacky check for duration strings like "60m": anything containing a
      # lowercase letter is treated as "that long ago"
      if value.match?(/[a-z]+/)
        ChronicDuration.raise_exceptions = true
        duration_ago = ChronicDuration.parse(value)
        Time.now - duration_ago
      else
        Time.parse(value)
      end
    end
  end

  # Class methods for classes that have Configurable mixed in
  module ClassMethods
    # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
    #
    # @param [Symbol] name Name of the setting
    # @param [Boolean] required whether setting is required
    # @param [Object] default Default value
    # @param [Symbol] type Type; selects the `coerce_<type>` method applied
    #   to supplied values
    # @param [String] description Description of the setting
    #
    # @example Basic usage
    #   setting :when, type: :time, required: true
    #
    # @see ::Chronicle::ETL::Configurable
    def setting(name, default: nil, required: false, type: nil, description: nil)
      s = Setting.new(default, required, type, description)
      settings[name] = s
    end

    # Collect all settings defined on this class and its ancestors (that
    # have Configurable mixin included). A setting redeclared on a subclass
    # replaces the inherited one.
    def all_settings
      if superclass.include?(Chronicle::ETL::Configurable)
        superclass.all_settings.merge(settings)
      else
        settings
      end
    end

    # Filters settings to those that are required.
    def all_required_settings
      all_settings.select { |_name, setting| setting.required }
    end

    # Settings declared directly on this class (not inherited ones)
    def settings
      @settings ||= {}
    end

    def setting_exists?(name)
      all_settings.key?(name)
    end

    # A fresh Config holding the default value of every setting
    def config_with_defaults
      s = all_settings.transform_values(&:default)
      Config.new(s)
    end
  end
end
Right now, we just 33 | # make registry aware of existence of name of non-gem plugin 34 | def self.register_standalone(name:) 35 | plugin = Chronicle::ETL::Registry::PluginRegistration.new do |p| 36 | p.name = name.to_sym 37 | p.installed = true 38 | end 39 | 40 | installed_standalone << plugin 41 | end 42 | 43 | # Plugins either installed as gems or manually loaded/registered 44 | def self.installed 45 | installed_standalone + installed_as_gem 46 | end 47 | 48 | # Check whether a given plugin is installed 49 | def self.installed?(name) 50 | installed.map(&:name).include?(name.to_sym) 51 | end 52 | 53 | # List of plugins installed as standalone 54 | def self.installed_standalone 55 | @installed_standalone ||= [] 56 | end 57 | 58 | # List of plugins installed as gems 59 | def self.installed_as_gem 60 | installed_gemspecs_latest.map do |gem| 61 | Chronicle::ETL::Registry::PluginRegistration.new do |p| 62 | p.name = gem.name.sub('chronicle-', '').to_sym 63 | p.gem = gem.name 64 | p.description = gem.description 65 | p.version = gem.version.to_s 66 | p.installed = true 67 | end 68 | end 69 | end 70 | 71 | # List of all plugins available to chronicle-etl 72 | def self.available 73 | available_as_gem 74 | end 75 | 76 | # List of plugins available through rubygems 77 | # TODO: make this concurrent 78 | def self.available_as_gem 79 | KNOWN_PLUGINS.map do |name| 80 | info = gem_info(name) 81 | Chronicle::ETL::Registry::PluginRegistration.new do |p| 82 | p.name = name 83 | p.gem = info['name'] 84 | p.version = info['version'] 85 | p.description = info['info'] 86 | end 87 | end 88 | end 89 | 90 | # Load info about a gem plugin from rubygems API 91 | def self.gem_info(name) 92 | gem_name = "chronicle-#{name}" 93 | Gems.info(gem_name) 94 | end 95 | 96 | # Union of installed gems (latest version) + available gems 97 | def self.all 98 | (installed + available) 99 | .group_by(&:name) 100 | .transform_values { |plugin| plugin.find(&:installed) || plugin.first } 101 | .values 102 | 
end 103 | 104 | # Does a plugin with a given name exist? 105 | def self.exists?(name) 106 | KNOWN_PLUGINS.include?(name) 107 | end 108 | 109 | # All versions of all plugins currently installed 110 | def self.installed_gemspecs 111 | # TODO: add check for chronicle-etl dependency 112 | Gem::Specification.filter do |s| 113 | s.name.match(/^chronicle-/) && s.name != 'chronicle-etl' && s.name != 'chronicle-core' 114 | end 115 | end 116 | 117 | # Latest version of each installed plugin 118 | def self.installed_gemspecs_latest 119 | installed_gemspecs.group_by(&:name) 120 | .transform_values { |versions| versions.sort_by(&:version).reverse.first } 121 | .values 122 | end 123 | 124 | # Activate a plugin with given name by `require`ing it 125 | def self.activate(name) 126 | # By default, activates the latest available version of a gem 127 | # so don't have to run Kernel#gem separately 128 | 129 | plugin_require_name = name.to_s.gsub('-', '_') 130 | require "chronicle/#{plugin_require_name}" 131 | rescue Gem::ConflictError => e 132 | # TODO: figure out if there's more we can do here 133 | raise Chronicle::ETL::PluginConflictError.new(name), 134 | "Plugin '#{plugin_require_name}' couldn't be loaded. 
#{e.message}" 135 | rescue StandardError, LoadError 136 | # StandardError to catch random non-loading problems that might occur 137 | # when requiring the plugin (eg class macro invoked the wrong way) 138 | # TODO: decide if this should be separated 139 | raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{plugin_require_name}' couldn't be loaded" 140 | end 141 | 142 | # Install a plugin to local gems 143 | def self.install(name) 144 | return if installed?(name) 145 | raise(Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist") unless exists?(name) 146 | 147 | gem_name = "chronicle-#{name}" 148 | 149 | Gem::DefaultUserInteraction.ui = Gem::SilentUI.new 150 | Gem.install(gem_name) 151 | 152 | activate(name) 153 | rescue Gem::UnsatisfiableDependencyError 154 | # TODO: we need to catch a lot more than this here 155 | raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed." 156 | end 157 | 158 | # Uninstall a plugin 159 | def self.uninstall(name) 160 | gem_name = "chronicle-#{name}" 161 | Gem::DefaultUserInteraction.ui = Gem::SilentUI.new 162 | uninstaller = Gem::Uninstaller.new(gem_name) 163 | uninstaller.uninstall 164 | rescue Gem::InstallError 165 | # TODO: strengthen this exception handling 166 | raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled") 167 | end 168 | end 169 | end 170 | end 171 | end 172 | -------------------------------------------------------------------------------- /lib/chronicle/etl/cli/jobs.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'tty-prompt' 4 | 5 | module Chronicle 6 | module ETL 7 | module CLI 8 | # CLI commands for working with ETL jobs 9 | class Jobs < SubcommandBase 10 | default_task 'start' 11 | namespace :jobs 12 | 13 | class_option :extractor, aliases: '-e', desc: 'Extractor class. 
Default: stdin', banner: 'NAME' 14 | class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {} 15 | class_option :transformer, 16 | aliases: '-t', 17 | desc: 'Transformer identifier. Default: null', 18 | banner: 'NAME', 19 | type: 'array', 20 | repeatable: true 21 | class_option :loader, aliases: '-l', desc: 'Loader class. Default: table', banner: 'NAME' 22 | class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {} 23 | 24 | # This is an array to deal with shell globbing 25 | class_option :input, 26 | aliases: '-i', 27 | desc: 'Input filename or directory', 28 | default: [], 29 | type: 'array', 30 | banner: 'FILENAME' 31 | class_option :since, desc: 'Load records SINCE this date (or fuzzy time duration)', banner: 'DATE' 32 | class_option :until, desc: 'Load records UNTIL this date (or fuzzy time duration)', banner: 'DATE' 33 | class_option :limit, desc: 'Only extract the first LIMIT records', banner: 'N' 34 | 35 | class_option :schema, 36 | desc: 'Which Schema to transform', 37 | banner: 'SCHEMA_NAME', 38 | type: 'string', 39 | enum: %w[chronicle activitystream schemaorg chronobase] 40 | class_option :format, 41 | desc: 'How to serialize results', 42 | banner: 'SCHEMA_NAME', 43 | type: 'string', 44 | enum: %w[jsonapi jsonld] 45 | 46 | class_option :output, aliases: '-o', desc: 'Output filename', type: 'string' 47 | class_option :fields, desc: 'Output only these fields', type: 'array', banner: 'field1 field2 ...' 
48 | class_option :'fields-limit', desc: 'Output first N fields', type: :numeric 49 | class_option :filter, desc: 'Filter records', type: 'array', banner: 'field=value' 50 | class_option :header_row, desc: 'Output the header row of tabular output', type: 'boolean' 51 | 52 | # Thor doesn't like `run` as a command name 53 | map run: :start 54 | desc 'run', 'Start a job' 55 | option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean 56 | long_desc <<-LONG_DESC 57 | This will run an ETL job. Each job needs three parts: 58 | 59 | 1. #{'Extractor'.underline}: pulls data from an external source. By default, this is stdout. Other common options including pulling data from an API or reading JSON from a file. 60 | 61 | 2. #{'Transformers'.underline}: transform data into a new format. If none is specified, we use the `null` transformer which does nothing to the data. 62 | 63 | 3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run. 64 | 65 | If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME` 66 | LONG_DESC 67 | # Run an ETL job 68 | def start(*args) 69 | name = args.first 70 | 71 | # If someone runs `$ chronicle-etl` with no arguments, show help menu. 72 | # TODO: decide if we should check that there's nothing in stdin pipe 73 | # in case user wants to actually run this sort of job stdin->null->stdout 74 | if name.nil? && options[:extractor].nil? 
75 | m = Chronicle::ETL::CLI::Main.new 76 | m.help 77 | cli_exit 78 | end 79 | 80 | cli_fail(message: "Job '#{name}' does not exist") if name && !Chronicle::ETL::Config.exists?('jobs', name) 81 | 82 | job_definition = build_job_definition(name, options) 83 | 84 | if job_definition.plugins_missing? 85 | missing_plugins = job_definition.errors[:plugins] 86 | .select { |error| error.is_a?(Chronicle::ETL::PluginNotInstalledError) } 87 | .map(&:name) 88 | .uniq 89 | install_missing_plugins(missing_plugins) 90 | end 91 | 92 | run_job(job_definition) 93 | rescue Chronicle::ETL::JobDefinitionError => e 94 | message = '' 95 | job_definition.errors.each_pair do |category, errors| 96 | message << "Problem with #{category}:\n - #{errors.map(&:to_s).join("\n - ")}" 97 | end 98 | cli_fail(message: "Error running job.\n#{message}", exception: e) 99 | end 100 | 101 | option :'skip-confirmation', aliases: '-y', type: :boolean 102 | desc 'save', 'Save a job' 103 | # Create an ETL job 104 | def save(name) 105 | write_config = true 106 | job_definition = build_job_definition(name, options) 107 | job_definition.validate! 108 | 109 | if Chronicle::ETL::Config.exists?('jobs', name) && !options[:'skip-confirmation'] 110 | prompt = TTY::Prompt.new 111 | write_config = false 112 | message = "Job '#{name}' exists already. Ovewrite it?" 113 | begin 114 | write_config = prompt.yes?(message) 115 | rescue TTY::Reader::InputInterrupt 116 | end 117 | end 118 | 119 | if write_config 120 | Chronicle::ETL::Config.write('jobs', name, job_definition.definition) 121 | cli_exit(message: "Job saved. 
Run it with `$ chronicle-etl jobs:run #{name}`") 122 | else 123 | cli_fail(message: "\nJob not saved") 124 | end 125 | rescue Chronicle::ETL::JobDefinitionError => e 126 | cli_fail(message: 'Job definition error', exception: e) 127 | end 128 | 129 | desc 'show', 'Show details about a job' 130 | # Show an ETL job 131 | def show(name = nil) 132 | cli_fail(message: "Job '#{name}' does not exist") if name && !Chronicle::ETL::Config.exists?('jobs', name) 133 | 134 | job_definition = build_job_definition(name, options) 135 | job_definition.validate! 136 | puts Chronicle::ETL::Job.new(job_definition) 137 | rescue Chronicle::ETL::JobDefinitionError => e 138 | cli_fail(message: 'Job definition error', exception: e) 139 | end 140 | 141 | desc 'edit', 'Edit a job in default editor ($EDITOR)' 142 | def edit(name = nil) 143 | cli_fail(message: "Job '#{name}' does not exist") if name && !Chronicle::ETL::Config.exists?('jobs', name) 144 | 145 | filename = Chronicle::ETL::Config.path('jobs', name) 146 | system "${VISUAL:-${EDITOR:-vi}} \"#{filename}\"" 147 | 148 | definition = Chronicle::ETL::JobDefinition.new 149 | definition.add_config(load_job_config(name)) 150 | definition.validate! 
151 | 152 | cli_exit(message: "Job '#{name}' saved") 153 | rescue Chronicle::ETL::JobDefinitionError => e 154 | cli_fail(message: 'Job definition error', exception: e) 155 | end 156 | 157 | desc 'list', 'List all available jobs' 158 | # List available ETL jobs 159 | def list 160 | jobs = Chronicle::ETL::Config.available_jobs 161 | 162 | job_details = jobs.map do |job| 163 | r = Chronicle::ETL::Config.load('jobs', job) 164 | 165 | extractor = r[:extractor][:name] if r[:extractor] 166 | transformer = r[:transformer][:name] if r[:transformer] 167 | loader = r[:loader][:name] if r[:loader] 168 | 169 | [job, extractor, transformer, loader] 170 | end 171 | 172 | headers = %w[name extractor transformer loader].map { |h| h.upcase.bold } 173 | 174 | puts 'Available jobs:' 175 | table = TTY::Table.new(headers, job_details) 176 | puts table.render(indent: 0, padding: [0, 2]) 177 | rescue Chronicle::ETL::ConfigError => e 178 | cli_fail(message: "Config error. #{e.message}", exception: e) 179 | end 180 | 181 | private 182 | 183 | def run_job(job_definition) 184 | # FIXME: have to validate here so next method can work. This is clumsy 185 | job_definition.validate! 186 | # FIXME: clumsy to make CLI responsible for setting secrets here. Think about a better way to do this 187 | job_definition.apply_default_secrets 188 | job = Chronicle::ETL::Job.new(job_definition) 189 | runner = Chronicle::ETL::Runner.new(job) 190 | runner.run! 191 | rescue RunnerError => e 192 | cli_fail(message: e.message.to_s, exception: e) 193 | end 194 | 195 | # TODO: probably could merge this with something in cli/plugin 196 | def install_missing_plugins(missing_plugins) 197 | prompt = TTY::Prompt.new 198 | message = "Plugin#{'s' if missing_plugins.count > 1} specified by job not installed.\n" 199 | message += 'Do you want to install ' 200 | message += missing_plugins.map { |name| "chronicle-#{name}".bold } 201 | .join(', ') 202 | message += ' and start the job?' 
203 | will_install = prompt.yes?(message) 204 | cli_fail(message: "Must install #{missing_plugins.join(', ')} plugin to run job") unless will_install 205 | 206 | Chronicle::ETL::CLI::Plugins.new.install(*missing_plugins) 207 | end 208 | 209 | # Create job definition by reading config file and then overwriting with flag options 210 | def build_job_definition(name, options) 211 | definition = Chronicle::ETL::JobDefinition.new 212 | definition.add_config(load_job_config(name)) 213 | definition.add_config(process_flag_options(options).transform_keys(&:to_sym)) 214 | definition 215 | end 216 | 217 | def load_job_config(name) 218 | Chronicle::ETL::Config.read_job(name) 219 | end 220 | 221 | # Takes flag options and turns them into a runner config 222 | # TODO: this needs a lot of refactoring 223 | def process_flag_options(options) 224 | extractor_options = options[:'extractor-opts'].transform_keys(&:to_sym).merge( 225 | { 226 | input: (options[:input] if options[:input].any?), 227 | since: options[:since], 228 | until: options[:until], 229 | limit: options[:limit] 230 | }.compact 231 | ) 232 | 233 | loader_options = options[:'loader-opts'].transform_keys(&:to_sym).merge( 234 | { 235 | output: options[:output], 236 | header_row: options[:header_row] 237 | }.compact 238 | ) 239 | 240 | processed_options = { 241 | dry_run: options[:dry_run], 242 | extractor: { 243 | name: options[:extractor], 244 | options: extractor_options 245 | }.compact, 246 | loader: { 247 | name: options[:loader], 248 | options: loader_options 249 | }.compact 250 | } 251 | 252 | add_transformer(processed_options, 'chronicle') if options[:schema] 253 | add_transformer(processed_options, options[:schema]) if options[:schema] && options[:schema] != 'chronicle' 254 | add_transformers_from_option(processed_options, options[:transformer]) if options[:transformer]&.any? 
255 | if options[:filter] 256 | add_transformer(processed_options, :filter, { filters: options[:filter].to_h do |f| 257 | f.split('=') 258 | end }) 259 | end 260 | add_transformer(processed_options, :format, { format: options[:format] }) if options[:format] 261 | add_transformer(processed_options, :filter_fields, { fields: options[:fields] }) if options[:fields] 262 | if options[:'fields-limit'] 263 | add_transformer(processed_options, :fields_limit, 264 | { limit: options[:'fields-limit'] }) 265 | end 266 | 267 | processed_options 268 | end 269 | 270 | def add_transformer(processed_options, name, options = {}) 271 | processed_options[:transformers] ||= [] 272 | processed_options[:transformers] << { name:, options: } 273 | end 274 | 275 | def add_transformers_from_option(processed_options, transformer_option) 276 | processed_options[:transformers] ||= [] 277 | processed_options[:transformers] += transformer_option.map do |transformer_args| 278 | transformer_name, *transformer_options = transformer_args 279 | transformer_options = transformer_options.filter { |opt| opt.include?('=') } 280 | 281 | { 282 | name: transformer_name, 283 | options: transformer_options.to_h do |opt| 284 | key, value = opt.split('=') 285 | [key.to_sym, value] 286 | end 287 | } 288 | end 289 | end 290 | end 291 | end 292 | end 293 | end 294 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## A CLI toolkit for extracting and working with your digital history 2 | 3 | ![chronicle-etl-banner](https://user-images.githubusercontent.com/6291/157330518-0f934c9a-9ec4-43d9-9cc2-12f156d09b37.png) 4 | 5 | [![Gem Version](https://badge.fury.io/rb/chronicle-etl.svg)](https://badge.fury.io/rb/chronicle-etl) [![Ruby](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml/badge.svg)](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml) 
[![Docs](https://img.shields.io/badge/docs-rubydoc.info-blue)](https://www.rubydoc.info/gems/chronicle-etl/) 6 | 7 | Are you trying to archive your digital history or incorporate it into your own projects? You’ve probably discovered how frustrating it is to get machine-readable access to your own data. While [building a memex](https://hyfen.net/memex/), I learned first-hand what great efforts must be made before you can begin using the data in interesting ways. 8 | 9 | If you don’t want to spend all your time writing scrapers, reverse-engineering APIs, or parsing export data, this tool is for you! (_If you do enjoy these things, please see the [open issues](https://github.com/chronicle-app/chronicle-etl/issues)._) 10 | 11 | **`chronicle-etl` is a CLI tool that gives you a unified interface to your personal data.** It uses the ETL pattern to _extract_ data from a source (e.g. your local browser history, a directory of images, goodreads.com reading history), _transform_ it (into a given schema), and _load_ it to a destination (e.g. a CSV file, JSON, external API). 12 | 13 | ## What does `chronicle-etl` give you? 14 | 15 | - **A CLI tool for working with personal data**. You can monitor progress of exports, manipulate the output, set up recurring jobs, manage credentials, and more. 16 | - **Plugins for many third-party sources** (see [list](#available-plugins-and-connectors)). This plugin system allows you to access data from dozens of third-party services, all accessible through a common CLI interface. 17 | - **A common, opinionated schema**: You can normalize different datasets into a single schema so that, for example, all your iMessages and emails are represented in a common schema. (Don’t want to use this schema? `chronicle-etl` always allows you to fall back on working with the raw extraction data.) 
18 | 19 | ## Chronicle-ETL in action 20 | 21 | ![demo](https://user-images.githubusercontent.com/6291/161410839-b5ce931a-2353-4585-b530-929f46e3f960.svg) 22 | 23 | ### Longer screencast 24 | 25 | [![asciicast](https://asciinema.org/a/483455.svg)](https://asciinema.org/a/483455) 26 | 27 | ## Installation 28 | 29 | Using homebrew: 30 | 31 | ```sh 32 | $ brew install chronicle-app/etl/chronicle-etl 33 | ``` 34 | 35 | Using rubygems: 36 | 37 | ```sh 38 | $ gem install chronicle-etl 39 | ``` 40 | 41 | Confirm it installed successfully: 42 | 43 | ```sh 44 | $ chronicle-etl --version 45 | ``` 46 | 47 | ## Basic usage and running jobs 48 | 49 | ```sh 50 | # Display help 51 | $ chronicle-etl help 52 | 53 | # Run a basic job 54 | $ chronicle-etl --extractor NAME --transformer NAME --loader NAME 55 | 56 | # Read data.csv and display it to stdout as a table 57 | $ chronicle-etl --extractor csv --input data.csv --loader table 58 | 59 | # Show available plugins and install one 60 | $ chronicle-etl plugins:list 61 | $ chronicle-etl plugins:install imessage 62 | 63 | # Retrieve imessage messages from the last 5 hours 64 | $ chronicle-etl -e imessage --since 5h 65 | 66 | # Get email senders from an .mbox email archive file 67 | $ chronicle-etl --extractor email:mbox -i sample-email-archive.mbox -t email --fields actor.slug 68 | 69 | # Save an access token as a secret and use it in a job 70 | $ chronicle-etl secrets:set pinboard access_token username:foo123 71 | $ chronicle-etl secrets:list # Verify that it's available 72 | $ chronicle-etl -e pinboard --since 1mo # Used automatically based on plugin name 73 | ``` 74 | 75 | ### Common options 76 | 77 | ```sh 78 | Options: 79 | -e, [--extractor=NAME] # Extractor class. Default: stdin 80 | [--extractor-opts=key:value] # Extractor options 81 | -t, [--transformer=NAME] # Transformer class. Default: null 82 | [--transformer-opts=key:value] # Transformer options 83 | -l, [--loader=NAME] # Loader class. 
Default: json 84 | [--loader-opts=key:value] # Loader options 85 | -i, [--input=FILENAME] # Input filename or directory 86 | [--since=DATE] # Load records SINCE this date (or fuzzy time duration) 87 | [--until=DATE] # Load records UNTIL this date (or fuzzy time duration) 88 | [--limit=N] # Only extract the first LIMIT records 89 | [--schema=SCHEMA_NAME] # Which Schema to transform 90 | # Possible values: chronicle, activitystream, schemaorg, chronobase 91 | [--format=SCHEMA_NAME] # How to serialize results 92 | # Possible values: jsonapi, jsonld 93 | -o, [--output=OUTPUT] # Output filename 94 | [--fields=field1 field2 ...] # Output only these fields 95 | [--header-row], [--no-header-row] # Output the header row of tabular output 96 | 97 | [--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal) 98 | # Default: info 99 | -v, [--verbose], [--no-verbose] # Set log level to verbose 100 | [--silent], [--no-silent] # Silence all output 101 | ``` 102 | 103 | ### Saving a job 104 | 105 | You can save details about a job to a local config file (saved by default in `~/.config/chronicle/etl/jobs/JOB_NAME.yml`) to save yourself the trouble specifying options each time. 106 | 107 | ```sh 108 | # Save a job named 'sample' to ~/.config/chronicle/etl/jobs/sample.yml 109 | $ chronicle-etl jobs:save sample --extractor pinboard --since 10d 110 | 111 | # Run the job 112 | $ chronicle-etl jobs:run sample 113 | 114 | # Show details about the job 115 | $ chronicle-etl jobs:show sample 116 | 117 | # Edit a job definition with default editor ($EDITOR) 118 | $ chronicle-etl jobs:edit sample 119 | 120 | # Show all saved jobs 121 | $ chronicle-etl jobs:list 122 | ``` 123 | 124 | ## Connectors and plugins 125 | 126 | Connectors let you work with different data formats or third-party sources. 127 | 128 | ### Built-in Connectors 129 | 130 | `chronicle-etl` comes with several built-in connectors for common formats and sources. 
131 | 132 | ```sh 133 | # List all available connectors 134 | $ chronicle-etl connectors:list 135 | ``` 136 | 137 | #### Extractors 138 | 139 | - [`csv`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/csv_extractor.rb) - Load records from CSV files or stdin 140 | - [`json`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/json_extractor.rb) - Load JSON (either [line-separated objects](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON) or one object) 141 | - [`file`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/file_extractor.rb) - load from a single file or directory (with a glob pattern) 142 | 143 | #### Transformers 144 | 145 | - [`null`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/null_transformer.rb) - (default) Don’t do anything and pass on raw extraction data 146 | - [`sampler`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/sampler_transformer.rb) - Sample `percent` records from the extraction 147 | - [`sort`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/sort_transformer.rb) - sort extracted results by `key` and `direction` 148 | 149 | 150 | #### Loaders 151 | 152 | - [`json`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/json_loader.rb) - (default) Load records serialized as JSON 153 | - [`table`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/table_loader.rb) - Output an ascii table of records. Useful for exploring data. 
154 | - [`csv`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/csv_loader.rb) - Load records to CSV 155 | - [`rest`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/rest_loader.rb) - Send JSON to a REST API 156 | 157 | ### Chronicle Plugins for third-party services 158 | 159 | Plugins provide access to data from third-party platforms, services, or formats. Plugins are packaged as separate gems and can be installed through the CLI (under the hood, it's a `gem install chronicle-PLUGINNAME`) 160 | 161 | #### Plugin usage 162 | 163 | ```bash 164 | # List available plugins 165 | $ chronicle-etl plugins:list 166 | 167 | # Install a plugin 168 | $ chronicle-etl plugins:install NAME 169 | 170 | # Use a plugin 171 | $ chronicle-etl plugins:install imessage 172 | $ chronicle-etl --extractor imessage --limit 10 173 | 174 | # Uninstall a plugin 175 | $ chronicle-etl plugins:uninstall NAME 176 | ``` 177 | 178 | #### Available plugins and connectors 179 | 180 | The following is the list of officially supported plugins and their available connectors: 181 | 182 | | Plugin | Type | Identifier | Description | 183 | | --------------------------------------------------------------------------- | ----------- | ---------------- | -------------------------------------------- | 184 | | [apple-podcasts](https://github.com/chronicle-app/chronicle-apple-podcasts) | extractor | listens | listening history of podcast episodes | 185 | | [apple-podcasts](https://github.com/chronicle-app/chronicle-apple-podcasts) | transformer | listen | a podcast episode listen to Chronicle Schema | 186 | | [email](https://github.com/chronicle-app/chronicle-email) | extractor | imap | emails over an IMAP connection | 187 | | [email](https://github.com/chronicle-app/chronicle-email) | extractor | mbox | emails from an .mbox file | 188 | | [email](https://github.com/chronicle-app/chronicle-email) | transformer | email | email to 
Chronicle Schema | 189 | | [foursquare](https://github.com/chronicle-app/chronicle-foursquare) | extractor | checkins | Foursquare visits | 190 | | [foursquare](https://github.com/chronicle-app/chronicle-foursquare) | transformer | checkin | checkin to Chronicle Schema | 191 | | [github](https://github.com/chronicle-app/chronicle-github) | extractor | activity | user activity stream | 192 | | [imessage](https://github.com/chronicle-app/chronicle-imessage) | extractor | messages | imessages from local macOS | 193 | | [imessage](https://github.com/chronicle-app/chronicle-imessage) | transformer | message | imessage to Chronicle Schema | 194 | | [pinboard](https://github.com/chronicle-app/chronicle-pinboard) | extractor | bookmarks | Pinboard.in bookmarks | 195 | | [pinboard](https://github.com/chronicle-app/chronicle-pinboard) | transformer | bookmark | bookmark to Chronicle Schema | 196 | | [safari](https://github.com/chronicle-app/chronicle-safari) | extractor | browser-history | browser history | 197 | | [safari](https://github.com/chronicle-app/chronicle-safari) | transformer | browser-history | browser history to Chronicle Schema | 198 | | [shell](https://github.com/chronicle-app/chronicle-shell) | extractor | history | shell command history (bash / zsh) | 199 | | [shell](https://github.com/chronicle-app/chronicle-shell) | transformer | command | command to Chronicle Schema | 200 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | liked-tracks | liked tracks | 201 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | saved-albums | saved albums | 202 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | listens | recently listened tracks (last 50 tracks) | 203 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | transformer | like | like to Chronicle Schema | 204 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | transformer | listen | listen to 
Chronicle Schema | 205 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | authorizer | | OAuth authorizer | 206 | | [zulip](https://github.com/chronicle-app/chronicle-zulip) | extractor | private-messages | private messages | 207 | | [zulip](https://github.com/chronicle-app/chronicle-zulip) | transformer | message | message to Chronicle Schema | 208 | 209 | ### Coming soon 210 | 211 | A few dozen importers exist [in my Memex project](https://hyfen.net/memex/) and I'm porting them over to the Chronicle system. The [Chronicle Plugin Tracker](https://github.com/orgs/chronicle-app/projects/1/views/1) lets you keep track of what's available and what's coming soon. 212 | 213 | If you don't see a plugin for a third-party provider or data source that you're interested in using with `chronicle-etl`, [please open an issue](https://github.com/chronicle-app/chronicle-etl/issues/new). If you want to work together on a plugin, please [get in touch](#get-in-touch)! 214 | 215 | In summary, the following **are coming soon**: 216 | anki, arc, bear, chrome, facebook, firefox, fitbit, foursquare, git, github, goodreads, google-calendar, images, instagram, lastfm, shazam, slack, strava, timing, things, twitter, whatsapp, youtube. 217 | 218 | ### Writing your own plugin 219 | 220 | Additional connectors are packaged as separate ruby gems. You can view the [iMessage plugin](https://github.com/chronicle-app/chronicle-imessage) for an example. 221 | 222 | If you want to load a custom connector without creating a gem, you can help by [completing this issue](https://github.com/chronicle-app/chronicle-etl/issues/23). 223 | 224 | If you want to work together on a connector, please [get in touch](#get-in-touch)! 225 | 226 | #### Sample custom Extractor class 227 | 228 | ```ruby 229 | # TODO 230 | ``` 231 | 232 | ## Secrets Management 233 | 234 | If your job needs secrets such as access tokens or passwords, `chronicle-etl` has a built-in secret management system. 
235 | 236 | Secrets are organized in namespaces. Typically, you use one namespace per plugin (`pinboard` secrets for the `pinboard` plugin). When you run a job that uses the `pinboard` plugin extractor, for example, the secrets from that namespace will automatically be included in the extractor's options. To override which secrets get included, you can do it in the connector options with `secrets: ALT-NAMESPACE`. 237 | 238 | Under the hood, secrets are stored in `~/.config/chronicle/etl/secrets/NAMESPACE.yml` with 0600 permissions on each file. 239 | 240 | ### Using the secret manager 241 | 242 | ```sh 243 | # Save a secret under the 'pinboard' namespace 244 | $ chronicle-etl secrets:set pinboard access_token username:foo123 245 | 246 | # Set a secret using stdin 247 | $ echo -n "username:foo123" | chronicle-etl secrets:set pinboard access_token 248 | 249 | # List available secrets 250 | $ chronicle-etl secrets:list 251 | 252 | # Use 'pinboard' secrets in the pinboard extractor's options (happens automatically) 253 | $ chronicle-etl -e pinboard --since 1mo 254 | 255 | # Use a custom secrets namespace 256 | $ chronicle-etl secrets:set pinboard-alt access_token different-username:foo123 257 | $ chronicle-etl -e pinboard --extractor-opts secrets:pinboard-alt --since 1mo 258 | 259 | # Remove a secret 260 | $ chronicle-etl secrets:unset pinboard access_token 261 | ``` 262 | 263 | ## Roadmap 264 | 265 | - Keep tackling **new plugins**. See: [Chronicle Plugin Tracker](https://github.com/orgs/chronicle-app/projects/1) 266 | - Add support for **incremental extractions** ([#37](https://github.com/chronicle-app/chronicle-etl/issues/37)) 267 | - **Improve stdin extractor and shell command transformer** so that users can easily integrate their own scripts/languages/tools into jobs ([#5](https://github.com/chronicle-app/chronicle-etl/issues/48)) 268 | - **Add documentation for Chronicle Schema**. It's found throughout this project but never explained. 
269 | 270 | ## Development 271 | 272 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment. 273 | 274 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org). 275 | 276 | ### Additional development commands 277 | 278 | ```bash 279 | # run tests 280 | bundle exec rake spec 281 | 282 | # generate docs 283 | bundle exec rake yard 284 | 285 | # use Guard to run specs automatically 286 | bundle exec guard 287 | ``` 288 | 289 | ## Get in touch 290 | 291 | - [@hyfen](https://twitter.com/hyfen) on Twitter 292 | - [@hyfen](https://github.com/hyfen) on Github 293 | - Email: andrew@hyfen.net 294 | 295 | ## Contributing 296 | 297 | Bug reports and pull requests are welcome on GitHub at https://github.com/chronicle-app/chronicle-etl. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct. 298 | 299 | ## License 300 | 301 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT). 302 | 303 | ## Code of Conduct 304 | 305 | Everyone interacting in the Chronicle::ETL project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/chronicle-app/chronicle-etl/blob/main/CODE_OF_CONDUCT.md). 306 | --------------------------------------------------------------------------------