├── .yardopts
├── spec
├── support
│ ├── sample_data
│ │ ├── directories
│ │ │ ├── simple
│ │ │ │ ├── first.txt
│ │ │ │ └── second.txt
│ │ │ └── mixed-file-types
│ │ │ │ ├── first.txt
│ │ │ │ ├── another.md
│ │ │ │ └── second.txt
│ │ ├── sample.json
│ │ ├── sample.jsonl
│ │ ├── two-records.csv
│ │ ├── test.csv
│ │ └── long-csv.csv
│ ├── mock_plugins
│ │ ├── chronicle-error
│ │ │ └── chronicle
│ │ │ │ └── error.rb
│ │ ├── chronicle-foo
│ │ │ └── chronicle
│ │ │ │ ├── foo.rb
│ │ │ │ └── foo
│ │ │ │ └── simple_authorizer.rb
│ │ └── chronicle-empty
│ │ │ └── chronicle
│ │ │ └── empty.rb
│ ├── mock_homedir
│ │ └── .config
│ │ │ └── chronicle
│ │ │ └── etl
│ │ │ ├── secrets
│ │ │ ├── provider-two.yml
│ │ │ └── provider-one.yml
│ │ │ └── jobs
│ │ │ └── command.yml
│ ├── mocked_stdin.rb
│ ├── run_extraction.rb
│ ├── wait_until.rb
│ ├── mocked_config_directory.rb
│ ├── invoke_cli.rb
│ └── capture_io.rb
├── chronicle
│ ├── etl_spec.rb
│ └── etl
│ │ ├── extractor_spec.rb
│ │ ├── transformers.rb
│ │ └── null_transformer_spec.rb
│ │ ├── loaders
│ │ ├── table_loader_spec.rb
│ │ ├── csv_loader_spec.rb
│ │ └── json_loader_spec.rb
│ │ ├── config_spec.rb
│ │ ├── registry
│ │ └── self_registering_spec.rb
│ │ ├── cli
│ │ ├── main_spec.rb
│ │ ├── plugins_spec.rb
│ │ ├── secrets_spec.rb
│ │ ├── connectors_spec.rb
│ │ ├── jobs_spec.rb
│ │ └── authorizations_spec.rb
│ │ ├── extractors
│ │ ├── csv_extractor_spec.rb
│ │ ├── json_extractor_spec.rb
│ │ └── file_extractor_spec.rb
│ │ ├── secrets_spec.rb
│ │ ├── runner_spec.rb
│ │ ├── oauth_authorizer_spec.rb
│ │ └── configurable_spec.rb
└── spec_helper.rb
├── .rubocop.yml
├── .rspec
├── lib
└── chronicle
│ ├── etl
│ ├── version.rb
│ ├── registry
│ │ ├── registry.rb
│ │ ├── plugin_registration.rb
│ │ ├── self_registering.rb
│ │ ├── connector_registration.rb
│ │ ├── connectors.rb
│ │ └── plugins.rb
│ ├── transformers
│ │ ├── null_transformer.rb
│ │ ├── merge_meta_transformer.rb
│ │ ├── multiply_transformer.rb
│ │ ├── sampler_transformer.rb
│ │ ├── fields_limit_transformer.rb
│ │ ├── buffer_transformer.rb
│ │ ├── filter_transformer.rb
│ │ ├── sort_transformer.rb
│ │ ├── format_transformer.rb
│ │ ├── chronicle_transformer.rb
│ │ ├── filter_fields_transformer.rb
│ │ ├── transformer.rb
│ │ └── chronobase_transformer.rb
│ ├── record.rb
│ ├── cli.rb
│ ├── extractors
│ │ ├── stdin_extractor.rb
│ │ ├── json_extractor.rb
│ │ ├── csv_extractor.rb
│ │ ├── file_extractor.rb
│ │ ├── extractor.rb
│ │ └── helpers
│ │ │ └── input_reader.rb
│ ├── loaders
│ │ ├── helpers
│ │ │ ├── encoding_helper.rb
│ │ │ └── stdout_helper.rb
│ │ ├── rest_loader.rb
│ │ ├── loader.rb
│ │ ├── csv_loader.rb
│ │ ├── table_loader.rb
│ │ └── json_loader.rb
│ ├── extraction.rb
│ ├── utils
│ │ ├── binary_attachments.rb
│ │ └── progress_bar.rb
│ ├── logger.rb
│ ├── authorizer.rb
│ ├── cli
│ │ ├── cli_base.rb
│ │ ├── subcommand_base.rb
│ │ ├── secrets.rb
│ │ ├── connectors.rb
│ │ ├── plugins.rb
│ │ ├── authorizations.rb
│ │ ├── main.rb
│ │ └── jobs.rb
│ ├── authorization_server.rb
│ ├── exceptions.rb
│ ├── secrets.rb
│ ├── job_logger.rb
│ ├── config.rb
│ ├── job_log.rb
│ ├── job.rb
│ ├── job_definition.rb
│ ├── oauth_authorizer.rb
│ ├── runner.rb
│ └── configurable.rb
│ └── etl.rb
├── exe
└── chronicle-etl
├── .travis.yml
├── bin
├── setup
└── console
├── Rakefile
├── Gemfile
├── Guardfile
├── .gitignore
├── .github
└── workflows
│ └── ruby.yml
├── LICENSE.txt
├── CODE_OF_CONDUCT.md
├── chronicle-etl.gemspec
└── README.md
/.yardopts:
--------------------------------------------------------------------------------
1 | --markup=markdown
--------------------------------------------------------------------------------
/spec/support/sample_data/directories/simple/first.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spec/support/sample_data/directories/simple/second.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spec/support/sample_data/directories/mixed-file-types/first.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.rubocop.yml:
--------------------------------------------------------------------------------
1 | inherit_gem:
2 | chronicle-core: .rubocop.yml
3 |
--------------------------------------------------------------------------------
/spec/support/sample_data/directories/mixed-file-types/another.md:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/spec/support/sample_data/directories/mixed-file-types/second.txt:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --format documentation
2 | --color
3 | --require spec_helper
4 |
--------------------------------------------------------------------------------
/spec/support/sample_data/sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "str": "foo",
3 | "num":40
4 | }
5 |
--------------------------------------------------------------------------------
/spec/support/mock_plugins/chronicle-error/chronicle/error.rb:
--------------------------------------------------------------------------------
1 | raise "Plugin can't load"
2 |
--------------------------------------------------------------------------------
/spec/support/mock_plugins/chronicle-foo/chronicle/foo.rb:
--------------------------------------------------------------------------------
1 | require_relative 'foo/simple_authorizer'
2 |
--------------------------------------------------------------------------------
/spec/support/sample_data/sample.jsonl:
--------------------------------------------------------------------------------
1 | { "str": "foo", "num":40 }
2 | { "str": "bar", "num":50 }
3 |
--------------------------------------------------------------------------------
/spec/support/mock_plugins/chronicle-empty/chronicle/empty.rb:
--------------------------------------------------------------------------------
1 | module EmptyPlugin
2 | # empty
3 | end
4 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/version.rb:
--------------------------------------------------------------------------------
1 | module Chronicle
2 | module ETL
3 | VERSION = '0.6.1'.freeze
4 | end
5 | end
6 |
--------------------------------------------------------------------------------
/spec/support/sample_data/two-records.csv:
--------------------------------------------------------------------------------
1 | id,end_at,value
2 | 45403503,2013-08-13,1
3 | 45403503,2013-08-13 06:00:39,3.1
4 |
--------------------------------------------------------------------------------
/exe/chronicle-etl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'chronicle/etl/cli'
4 |
5 | Chronicle::ETL::CLI::Main.start(ARGV)
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | ---
2 | sudo: false
3 | language: ruby
4 | cache: bundler
5 | rvm:
6 | - 2.6.4
7 | before_install: gem install bundler -v 1.17.2
8 |
--------------------------------------------------------------------------------
/spec/support/mock_homedir/.config/chronicle/etl/secrets/provider-two.yml:
--------------------------------------------------------------------------------
1 | provider: provider-two
2 | secrets:
3 | foo: bar
4 | chronicle_etl_version: 0.4.4
5 |
--------------------------------------------------------------------------------
/bin/setup:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 | IFS=$'\n\t'
4 | set -vx
5 |
6 | bundle install
7 |
8 | # Do any other automated setup that you need to do here
9 |
--------------------------------------------------------------------------------
/spec/chronicle/etl_spec.rb:
--------------------------------------------------------------------------------
1 | RSpec.describe Chronicle::ETL do
2 | it 'has a version number' do
3 | expect(Chronicle::ETL::VERSION).not_to be nil
4 | end
5 | end
6 |
--------------------------------------------------------------------------------
/spec/support/sample_data/test.csv:
--------------------------------------------------------------------------------
1 | id,end_at,value
2 | 1,2013-08-13,1
3 | 6,2013-08-13 06:00:39,3.1
4 | 5,2013-08-13 06:00:39 +0200,
5 | 3,2013-08-16 03:45:08 +0200,4
6 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | require 'bundler/gem_tasks'
2 | require 'rspec/core/rake_task'
3 | RSpec::Core::RakeTask.new(:spec)
4 |
5 | require 'yard'
6 | YARD::Rake::YardocTask.new
7 |
8 | task default: :spec
9 |
--------------------------------------------------------------------------------
/spec/support/mock_homedir/.config/chronicle/etl/secrets/provider-one.yml:
--------------------------------------------------------------------------------
1 | provider: provider-one
2 | secrets:
3 | foo: bar
4 | another: test
5 | third: 123
6 | chronicle_etl_version: 0.4.4
7 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | git_source(:github) { |repo_name| "https://github.com/#{repo_name}" }
4 |
5 | # Specify your gem's dependencies in chronicle-etl.gemspec
6 | gemspec
7 |
--------------------------------------------------------------------------------
/Guardfile:
--------------------------------------------------------------------------------
1 | guard :rspec, cmd: 'bundle exec rspec' do
2 | require 'guard/rspec/dsl'
3 |
4 | watch(%r{^spec/.+_spec\.rb$})
5 | watch(%r{^lib/(.+)\.rb$}) { |m| "spec/#{m[1]}_spec.rb" }
6 | watch('spec/spec_helper.rb') { 'spec' }
7 | end
8 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /.bundle/
2 | /.yardoc
3 | /_yardoc/
4 | /coverage/
5 | /doc/
6 | /pkg/
7 | /spec/reports/
8 | /tmp/
9 |
10 | # https://yehudakatz.com/2010/12/16/clarifying-the-roles-of-the-gemspec-and-gemfile/
11 | Gemfile.lock
12 |
13 | # rspec failure tracking
14 | .rspec_status
15 | .DS_Store
--------------------------------------------------------------------------------
/spec/support/mock_plugins/chronicle-foo/chronicle/foo/simple_authorizer.rb:
--------------------------------------------------------------------------------
1 | module Chronicle
2 | module Foo
3 | class SimpleAuthorizer < Chronicle::ETL::Authorizer
4 | provider :foo
5 |
6 | def authorize!
7 | { token: 'abc' }
8 | end
9 | end
10 | end
11 | end
12 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/registry.rb:
--------------------------------------------------------------------------------
1 | module Chronicle
2 | module ETL
3 | module Registry
4 | end
5 | end
6 | end
7 |
8 | require_relative 'self_registering'
9 | require_relative 'connector_registration'
10 | require_relative 'connectors'
11 | require_relative 'plugin_registration'
12 | require_relative 'plugins'
13 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/extractor_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::Extractor do
4 | describe '#extract' do
5 | it 'raises an exception by default' do
6 | e = Chronicle::ETL::Extractor.new
7 | expect { e.extract }.to raise_error(NotImplementedError)
8 | end
9 | end
10 | end
11 |
--------------------------------------------------------------------------------
/spec/support/mock_homedir/.config/chronicle/etl/jobs/command.yml:
--------------------------------------------------------------------------------
1 | extractor:
2 | name: shell
3 | options:
4 | since: 2022-01-10
5 | transformer:
6 | name: shell
7 | options:
8 | loader:
9 | name: table
10 | options:
11 | truncate_values_at: 50
12 | fields_include:
13 | - end_at
14 | - involved.body
15 |
--------------------------------------------------------------------------------
/spec/support/mocked_stdin.rb:
--------------------------------------------------------------------------------
1 | require 'stringio'
2 |
# Shared context that swaps $stdin for an in-memory StringIO for the
# duration of each example.
RSpec.shared_context 'mocked stdin' do
  let(:fake_stdin) { StringIO.new }

  # Write +input+ to the fake stdin and rewind it so the code under test
  # reads it from the beginning.
  def load_stdin(input)
    fake_stdin.puts(input)
    fake_stdin.rewind
  end

  around(:each) do |example|
    # Remember whatever $stdin currently is (it may already have been
    # redirected) instead of assuming it was the STDIN constant.
    original_stdin = $stdin
    $stdin = fake_stdin
    example.run
  ensure
    $stdin = original_stdin
  end
end
18 |
--------------------------------------------------------------------------------
/spec/support/run_extraction.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module SpecHelpers
      # Instantiate an extractor, prepare it, and gather everything it yields.
      #
      # @param klass [Class] extractor class; instantiated with +options+
      # @param options [Hash] options handed to +klass.new+
      # @return [Array] every object yielded by +#extract+, in order
      def run_extraction(klass, options = {})
        extractor = klass.new(options)
        extractor.prepare

        [].tap do |collected|
          extractor.extract { |extraction| collected << extraction }
        end
      end
    end
  end
end
16 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/null_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Pass-through transformer: hands each record's data to the caller's
    # block without modifying it.
    class NullTransformer < Chronicle::ETL::Transformer
      register_connector do |registration|
        registration.identifier = :null
        registration.description = 'in no way'
      end

      # @param record [#data] the record to pass through
      # @yield [data] the record's data, untouched
      def transform(record)
        untouched = record.data
        yield untouched
      end
    end
  end
end
17 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/record.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | # TODO: move this into chronicle-core after figuring out what to do about data vs properties
module Chronicle
  module ETL
    # Pairs a data payload with the extraction it originated from.
    class Record
      # The record's payload.
      attr_accessor :data
      # The extraction this record was built from, if any.
      attr_accessor :extraction

      # @param data [Object] payload; a fresh empty Hash when omitted
      # @param extraction [Object, nil] originating extraction
      def initialize(data: {}, extraction: nil)
        @extraction = extraction
        @data = data
      end
    end
  end
end
16 |
--------------------------------------------------------------------------------
/spec/support/sample_data/long-csv.csv:
--------------------------------------------------------------------------------
1 | id,end_at,value
2 | 45403503,2013-08-13,1
3 | 45403503,2013-08-13 06:00:39,3.1
4 | 45403503,2013-08-13 06:00:39 +0200,
5 | 1541631,2013-08-16 03:45:08 +0200,4
6 | 1541632,2013-08-16 03:45:08 +0200,4
7 | 1541633,2013-08-16 03:45:08 +0200,4
8 | 1541634,2013-08-16 03:45:08 +0200,5
9 | 1541635,2013-08-16 03:45:08 +0200,7
10 | 1541636,2013-08-16 03:45:08 +0200,8
11 | 1541637,2013-08-16 03:45:08 +0200,9
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli.rb:
--------------------------------------------------------------------------------
1 | require 'thor'
2 | require 'thor/hollaback'
3 | require 'chronicle/etl'
4 |
5 | require 'chronicle/etl/cli/cli_base'
6 | require 'chronicle/etl/cli/subcommand_base'
7 | require 'chronicle/etl/cli/authorizations'
8 | require 'chronicle/etl/cli/connectors'
9 | require 'chronicle/etl/cli/jobs'
10 | require 'chronicle/etl/cli/plugins'
11 | require 'chronicle/etl/cli/secrets'
12 | require 'chronicle/etl/cli/main'
13 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/transformers.rb/null_transformer_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe Chronicle::ETL::NullTransformer do
  let(:record) { Chronicle::ETL::Record.new(data: { foo: 'bar' }) }

  describe '#transform' do
    it 'does nothing' do
      # Assert on the yield itself. An expectation placed inside the block
      # passed to #transform would silently never run (and the example would
      # pass) if the transformer stopped yielding.
      expect { |probe| Chronicle::ETL::NullTransformer.new.transform(record, &probe) }
        .to yield_with_args({ foo: 'bar' })
    end
  end
end
14 |
--------------------------------------------------------------------------------
/spec/support/wait_until.rb:
--------------------------------------------------------------------------------
require 'timeout'

module Chronicle
  module ETL
    module SpecHelpers
      # Poll the given block every 0.1s until it returns a truthy value.
      #
      # https://stackoverflow.com/questions/19388474/how-can-i-use-sinatra-to-simulate-a-remote-server-in-rspec-vcr
      #
      # @param timeout [Numeric] seconds to keep polling before giving up
      # @yieldreturn [Boolean] truthy once the awaited condition holds
      # @return [nil] as soon as the block returns a truthy value
      # @raise [Timeout::Error] when the block is still falsy after +timeout+
      def wait_until(timeout = 1)
        # A monotonic clock keeps the deadline immune to wall-clock changes.
        started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)

        loop do
          return if yield

          elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - started_at
          # The bare top-level `TimeoutError` alias is not available on
          # current Rubies; Timeout::Error is the class it used to point at.
          raise Timeout::Error if elapsed > timeout

          sleep(0.1)
        end
      end
    end
  end
end
18 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/plugin_registration.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module Registry
      # Plain value object holding what is known about a single plugin.
      class PluginRegistration
        attr_accessor :name, :description, :gem, :version, :installed, :gemspec

        # @param name [Object, nil] the plugin's name
        # @yield [registration] the new instance, so callers can fill in
        #   further attributes
        def initialize(name = nil)
          @name = name
          @installed = false
          yield(self) if block_given?
        end

        # @return the +installed+ value when it is truthy, otherwise +false+
        def installed?
          return false unless @installed

          @installed
        end
      end
    end
  end
end
20 |
--------------------------------------------------------------------------------
/.github/workflows/ruby.yml:
--------------------------------------------------------------------------------
1 | name: Ruby
2 |
3 | on:
4 | push:
5 | branches: [main]
6 | pull_request:
7 | branches: [main]
8 |
9 | jobs:
10 | build:
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 |
16 | - name: Set up Ruby
17 | uses: ruby/setup-ruby@v1
18 | with:
19 | ruby-version: 3.2
20 |
21 | - name: Install dependencies
22 | run: bundle install
23 |
24 | - name: Run tests
25 | run: bundle exec rake
26 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/stdin_extractor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Extractor that turns each line read from $stdin into one extraction.
    class StdinExtractor < Chronicle::ETL::Extractor
      register_connector do |r|
        r.identifier = :stdin
        r.description = 'stdin'
      end

      # Read $stdin line by line.
      #
      # @yield [Chronicle::ETL::Extraction] one extraction per input line,
      #   whose data is +{ line: <line with surrounding whitespace stripped> }+
      def extract
        # Iterate the IO directly rather than `$stdin.read.each_line`, which
        # buffers the entire input in memory before the first record is
        # produced.
        $stdin.each_line do |line|
          data = { line: line.strip }
          yield Chronicle::ETL::Extraction.new(data: data)
        end
      end
    end
  end
end
20 |
--------------------------------------------------------------------------------
/spec/support/mocked_config_directory.rb:
--------------------------------------------------------------------------------
1 | require 'fakefs/spec_helpers'
2 |
# Shared context that runs each example against a fake filesystem seeded
# from spec/support/mock_homedir, with Chronicle::ETL::Config pointed at it.
RSpec.shared_context 'mocked config directory' do
  around(:each) do |example|
    # NOTE(review): inside an example-level hook `include` most likely
    # resolves to RSpec's `include` matcher rather than Module#include, which
    # would make this line a no-op — confirm and remove if so.
    include FakeFS::SpecHelpers

    FakeFS.with_fresh do
      # Copy the real fixture directory into the fake filesystem.
      home = File.expand_path(File.join(RSPEC_ROOT, 'support/mock_homedir'))
      FakeFS::FileSystem.clone(home)

      # Make Config treat the fixture directory as $HOME.
      Chronicle::ETL::Config.xdg_environment = { 'HOME' => home }

      example.run

      # Clear the override so later examples are unaffected.
      Chronicle::ETL::Config.xdg_environment = nil
    end
  end
end
19 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/helpers/encoding_helper.rb:
--------------------------------------------------------------------------------
1 | require 'pathname'
2 |
module Chronicle
  module ETL
    module Loaders
      module Helpers
        # Helpers for coercing values into text that is safe to serialise.
        module EncodingHelper
          # Return +value+ as a valid UTF-8 string, dropping every byte that
          # cannot be represented. Mostly useful for handling loading with
          # binary data from a raw extraction.
          #
          # @param value [Object] any value; non-Strings pass through as-is
          # @return [Object] a valid UTF-8 String, or +value+ unchanged when
          #   it is not a String
          def force_utf8(value)
            return value unless value.is_a?(String)

            # String#encode is a no-op when the source is already tagged as
            # UTF-8, so invalid byte sequences in such strings survive the
            # call untouched. #scrub removes them; it changes nothing for
            # strings that the transcoding step already cleaned.
            value
              .encode('UTF-8', invalid: :replace, undef: :replace, replace: '')
              .scrub('')
          end
        end
      end
    end
  end
end
19 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/merge_meta_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Copies the metadata of a record's originating extraction into the
    # record's data under the +:_meta+ key.
    class MergeMetaTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :merge_meta
        r.description = 'merge extraction meta fields into the record'
      end

      # @param record [#data, #extraction] the record to enrich
      # @return the record's data; +:_meta+ is set only when the record has
      #   an extraction with non-nil meta
      def transform(record)
        meta = record.extraction&.meta

        # Nothing to merge. Without the explicit `return` execution fell
        # through and called `.meta` on a nil extraction.
        return record.data unless meta

        record.data[:_meta] = meta
        record.data
      end
    end
  end
end
20 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/loaders/table_loader_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::TableLoader do
4 | let(:record) do
5 | {
6 | provider: 'foo',
7 | verb: 'tested',
8 | actor: {
9 | represent: 'identity',
10 | provider: 'bar'
11 | }
12 | }
13 | end
14 |
15 | it 'can output a table' do
16 | l = Chronicle::ETL::TableLoader.new
17 |
18 | l.load(record)
19 | lines = capture do
20 | l.finish
21 | end.first.split("\n")
22 |
23 | # header + record
24 | expect(lines.count).to eql(2)
25 | end
26 | end
27 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extraction.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Container for one unit of extracted data plus details about where it
    # came from.
    class Extraction
      attr_accessor :data, :meta, :source, :type, :strategy, :extractor

      # All arguments are optional keywords; +data+ and +meta+ default to a
      # fresh empty Hash, everything else to nil.
      def initialize(data: {}, meta: {}, source: nil, type: nil, strategy: nil, extractor: nil)
        # payload
        @data = data
        @meta = meta

        # provenance
        @source = source
        @extractor = extractor
        @strategy = strategy
        @type = type
      end

      # @return [Hash] the +data+, +meta+ and +source+ attributes only
      def to_h
        {
          data: @data,
          meta: @meta,
          source: @source
        }
      end
    end
  end
end
23 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/multiply_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Emits every incoming record +n+ times.
    class MultiplyTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :multiply
        # Previously 'by taking a sample', copy-pasted from the sampler
        # transformer and wrong for this connector.
        r.description = 'by emitting each record N times'
      end

      setting :n, default: 2, type: :numeric

      # Yield the record's data +n+ times (+n+ is truncated to an Integer).
      #
      # @param record [#data] the record to repeat
      # @yield [data] the record's data, once per repetition
      def transform(record)
        @config.n.to_i.times do
          yield record.data
        end
      end
    end
  end
end
22 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/sampler_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Randomly lets through a share of the records it is given.
    class SamplerTransformer < Chronicle::ETL::Transformer
      register_connector do |registration|
        registration.identifier = :sampler
        registration.description = 'by taking a sample'
      end

      setting :percent, default: 10, type: :numeric

      # Draws an integer in 0...100 and keeps the record when the draw is
      # below the configured +percent+.
      #
      # @return the record's data when kept, otherwise nil
      def transform(record)
        record.data if rand(100) < @config.percent
      end
    end
  end
end
22 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/utils/binary_attachments.rb:
--------------------------------------------------------------------------------
1 | require 'marcel'
2 | require 'base64'
3 |
module Chronicle
  module ETL
    module Utils
      # Utility methods for dealing with binary files
      module BinaryAttachments
        # Build a base64 `data:` URI for the contents of a file.
        #
        # @param filename [String] path of the file to encode
        # @param mimetype [String, nil] MIME type to embed; guessed from the
        #   file when omitted
        # @return [String] "data:<mimetype>;base64,<payload>"
        def self.filename_to_base64(filename:, mimetype: nil)
          mimetype ||= guess_mimetype(filename: filename)

          # binread: the payload is arbitrary bytes, so it must not go
          # through text-mode newline or encoding handling.
          "data:#{mimetype};base64,#{Base64.strict_encode64(File.binread(filename))}"
        end

        # Guess a file's MIME type from its contents, falling back to its
        # name.
        #
        # @param filename [String] path of the file to inspect
        # @return [String] the MIME type reported by Marcel
        def self.guess_mimetype(filename:)
          name = File.basename(filename.to_s)

          # A path that is not a readable file can only be judged by name.
          return Marcel::MimeType.for(name: name) unless File.file?(filename)

          # Marcel sniffs Pathname/IO arguments; a bare String would be
          # treated as the data itself, so hand it an opened IO instead.
          File.open(filename, 'rb') do |io|
            Marcel::MimeType.for(io, name: name)
          end
        end
      end
    end
  end
end
22 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/config_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::Config do
4 | include_context 'mocked config directory'
5 |
6 | # TODO: remove this after proper tests written for Config
7 | it 'can set a custom homedir' do
8 | data = {}
9 | data[Time.now.to_i] = Time.now
10 | Chronicle::ETL::Config.write('jobs', 'foo', data)
11 | expect(Chronicle::ETL::Config.available_jobs).to contain_exactly('command', 'foo')
12 | end
13 |
14 | describe '#available_jobs' do
15 | it 'can list jobs' do
16 | expect(Chronicle::ETL::Config.available_jobs).to eq(['command'])
17 | end
18 | end
19 | end
20 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/registry/self_registering_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::CLI::Connectors do
4 | describe '#register_connector' do
5 | it 'can register a new class' do
6 | expect do
7 | class TestExtractor < Chronicle::ETL::Extractor
8 | register_connector do |r|
9 | r.description = 'foobar'
10 | end
11 | end
12 | end.to change { Chronicle::ETL::Registry::Connectors.connectors.count }.by(1)
13 |
14 | expect(Chronicle::ETL::Registry::Connectors.connectors.map(&:description))
15 | .to include('foobar')
16 | end
17 | end
18 | end
19 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/main_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::CLI::Main do
4 | describe '#version' do
5 | it 'outputs correct version' do
6 | output, = invoke_cli(['version'])
7 | expect(output).to match("chronicle-etl #{Chronicle::ETL::VERSION}")
8 | end
9 |
10 | it 'can be shown by calling cli with `--version`' do
11 | output, = invoke_cli(['--version'])
12 | expect(output).to match("chronicle-etl #{Chronicle::ETL::VERSION}")
13 | end
14 | end
15 |
16 | describe '#help' do
17 | it 'outputs help menu' do
18 | output, = invoke_cli(['help'])
19 | expect(output).to match(/ALL COMMANDS/)
20 | end
21 | end
22 | end
23 |
--------------------------------------------------------------------------------
/spec/support/invoke_cli.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module SpecHelpers
      # Run the main CLI app with given args
      #
      # @param [Array] args the command line arguments to pass to the CLI
      # @param [Boolean] rescue_from_exit whether to rescue when the CLI
      #   explicitly exits. If set to false, the example must include
      #   `.to raise_error(SystemExit)`, otherwise tests will prematurely end
      # @return whatever the `capture` helper returns for the wrapped CLI run
      #   (helper is defined outside this file)
      def invoke_cli(args = [], rescue_from_exit = true)
        capture do
          Chronicle::ETL::CLI::Main.start(args)
        rescue SystemExit
          # Swallow the exit unless the caller asked to see it.
          raise unless rescue_from_exit
        end
      end
    end
  end
end
20 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/extractors/csv_extractor_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::CSVExtractor do
4 | let(:filename) { 'spec/support/sample_data/two-records.csv' }
5 |
6 | describe '#results_count' do
7 | it 'can extract from a CSV file' do
8 | e = Chronicle::ETL::CSVExtractor.new(input: filename)
9 | e.prepare
10 | expect(e.results_count).to eql(2)
11 | end
12 | end
13 |
14 | describe '#extract' do
15 | it 'can extract from a CSV file' do
16 | e = Chronicle::ETL::CSVExtractor.new(input: filename)
17 | e.prepare
18 | expect { |b| e.extract(&b) }.to yield_successive_args(Chronicle::ETL::Extraction, Chronicle::ETL::Extraction)
19 | end
20 | end
21 | end
22 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/fields_limit_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'chronicle/utils/hash_utils'
4 |
module Chronicle
  module ETL
    # A transformer that flattens a record's data and returns a new hash
    # containing only the first `limit` fields.
    class FieldsLimitTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :fields_limit
        r.description = 'by taking first N fields'
      end

      setting :limit, type: :numeric, default: 10

      # @param record [#data] record whose data responds to #to_h
      # @return [Hash] the first `limit` entries of the flattened data
      def transform(record)
        # flatten the hash and then take the first `limit` fields

        Chronicle::Utils::HashUtils.flatten_hash(record.data.to_h).first(@config.limit).to_h
      end
    end
  end
end
24 |
--------------------------------------------------------------------------------
/bin/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
require 'bundler/setup'
require 'chronicle/etl'

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
require 'pry'

# Re-load every Ruby file under the project's lib/ directory so code changes
# are picked up without restarting the console.
#
# @param print [Boolean] whether to announce the reload on stdout
# @return [true] once every file has been loaded
def reload!(print = true)
  puts 'Reloading ...' if print
  # Main project directory.
  root_dir = File.expand_path('..', __dir__)
  # Directories within the project that should be reloaded.
  reload_dirs = %w[lib]
  # Loop through and reload every file in all relevant project directories.
  reload_dirs.each do |dir|
    Dir.glob("#{root_dir}/#{dir}/**/*.rb").each { |f| load(f) }
  end
  # Return true when complete.
  true
end

# Start the session last: Pry.start blocks until the console exits, so any
# helper defined after it would never be available inside the session.
Pry.start
26 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/buffer_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Holds records back until `size` of them have accumulated, then
    # releases them together.
    class BufferTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :buffer
        r.description = 'by buffering'
      end

      setting :size, default: 10, description: 'The size of the buffer'

      # Stash the record and, once the buffer is full, flush it.
      #
      # @return [nil] while fewer than `size` records are stashed; otherwise
      #   the `data` of each flushed record
      def transform(record)
        stash_record(record)

        # FIXME: this doesn't seem to be working with the runner
        return if @stashed_records.size < @config.size

        # FIXME: this will result in the wrong extraction being associated with
        # the batch of flushed records
        flush_stashed_records.map(&:data)
      end

      # Flush whatever is still buffered.
      # NOTE(review): unlike #transform this returns the result of
      # flush_stashed_records directly (not mapped to `data`) — confirm the
      # caller expects that shape.
      def finish
        flush_stashed_records
      end
    end
  end
end
30 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/filter_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Return only records that match all the conditions of the filters
    # setting.
    class FilterTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :filter
        r.description = 'by only accepting records that match conditions'
      end

      setting :filters, type: :hash

      # Keep the record only when every filter matches.
      #
      # Each filter key is a dot-separated path into the record's data
      # (all-digit segments are array indexes, the rest are symbol keys) and
      # each filter value is compared with `==`.
      #
      # @param record [#data] record whose data responds to #to_h
      # @return the record's data when all filters match (or none are
      #   configured), otherwise nil
      def transform(record)
        record_hash = record.data.to_h
        # An unset `filters` setting means "no conditions", not a crash.
        filters = @config.filters || {}

        matches = filters.all? do |key, expected|
          value_at(record_hash, key) == expected
        end

        matches ? record.data : nil
      end

      private

      # Look up a dot-separated path inside +hash+; nil when the path cannot
      # be followed.
      def value_at(hash, key)
        # to_s lets filters keyed by Symbol work as well as String keys.
        path = key.to_s.split('.').map do |segment|
          segment.match?(/^\d+$/) ? segment.to_i : segment.to_sym
        end

        hash.dig(*path)
      rescue TypeError
        # The path descends into a value that cannot be dug into (e.g. a
        # String); treat it as "no value at this path".
        nil
      end
    end
  end
end
31 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/sort_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Transformer that stashes every record and emits them in sorted order
    # once the job finishes.
    class SortTransformer < Chronicle::ETL::Transformer
      register_connector do |registration|
        registration.identifier = :sort
        registration.description = 'sorts records by a given field'
      end

      setting :key, required: true, default: 'id'
      setting :direction, required: false, default: 'desc'

      # Nothing is emitted per record; records are only stashed.
      def transform(record)
        stash_record(record)
      end

      # Sort the stashed records by the configured key. Records whose value
      # is nil rank after all others before the direction is applied.
      def finish
        return unless @stashed_records&.any?

        ordered = @stashed_records.sort_by do |stashed|
          field = stashed.data[@config.key]
          field.nil? ? [1] : [0, field]
        end

        @config.direction == 'desc' ? ordered.reverse : ordered
      end
    end
  end
end
32 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/format_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Transformer that serializes a record's data with the serializer named
    # by the `format` setting.
    class FormatTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :format
        r.description = 'records to a different hash/json format'
      end

      setting :format, default: nil

      # Serialize the record's data using the configured format.
      #
      # @param record the record to serialize; must respond to #data
      # @return whatever the selected serializer's .serialize returns
      # @raise [RuntimeError] when the format setting is not recognized
      def transform(record)
        serializer = find_serializer(@config.format)
        serializer.serialize(record.data)
      end

      private

      # Map a format name ('jsonld' or 'jsonapi') to its serializer class.
      def find_serializer(format)
        case format
        when 'jsonld'
          Chronicle::Serialization::JSONLDSerializer
        when 'jsonapi'
          Chronicle::Serialization::JSONAPISerializer
        else
          raise 'unknown format'
        end
      end
    end
  end
end
33 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/self_registering.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
module Chronicle
  module ETL
    module Registry
      # Mixin (meant to be `extend`ed) that lets a connector class announce
      # itself to the Chronicle::ETL::Registry.
      module SelfRegistering
        extend Forwardable

        attr_accessor :connector_registration

        # Expose the registration's descriptive attributes on the connector.
        %i[description provider identifier].each do |attribute|
          def_delegator :@connector_registration, attribute
        end

        # Build (once) the ConnectorRegistration describing this connector,
        # let the caller's block customize it, then hand it to the registry.
        def register_connector
          registration = (@connector_registration ||= ::Chronicle::ETL::Registry::ConnectorRegistration.new(self))
          yield registration if block_given?
          ::Chronicle::ETL::Registry::Connectors.register(registration)
        end
      end
    end
  end
end
26 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/plugins_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the `plugins:*` CLI commands.
RSpec.describe Chronicle::ETL::CLI::Plugins do
  describe '#list' do
    it 'shows installed plugins' do
      VCR.use_cassette('plugins-on-rubygems') do
        stdout, = invoke_cli(%w[plugins:list])
        first_line = stdout.split("\n").first
        expect(first_line).to match(/Available plugins/)
      end
    end
  end

  describe '#uninstall' do
    context "for a plugin that doesn't exist" do
      it 'will exit with an error' do
        failing_call = -> { invoke_cli(%w[plugins:uninstall foobar123], false) }
        expect(&failing_call).to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
      end

      it 'will show an error message' do
        _, stderr = invoke_cli(%w[plugins:uninstall foobar123])
        first_error_line = stderr.split("\n").map(&:uncolorize).first
        expect(first_error_line).to match(/could not be uninstalled/)
      end
    end
  end
end
28 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/chronicle_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Transformer that converts extracted records into the Chronicle schema
    # using a converter looked up in the connector registry.
    class ChronicleTransformer < Chronicle::ETL::Transformer
      register_connector do |r|
        r.identifier = :chronicle
        r.description = 'records to Chronicle schema'
      end

      # Convert a record and yield the data of each converted record.
      #
      # @param record the record to convert; must respond to #extraction
      # @yield the data of every record produced by the converter
      # @raise [Chronicle::ETL::UntransformableRecordError] when the registry
      #   has no converter for the record's extraction
      def transform(record)
        converter_klass = find_converter(record.extraction)
        # Without this guard a missing converter surfaced as a NoMethodError
        # on nil, which hides the actual cause.
        unless converter_klass
          raise Chronicle::ETL::UntransformableRecordError,
                'No converter to the Chronicle schema is available for this record'
        end

        converter_klass.new.call(record) do |transformed_record|
          yield transformed_record.data
        end
      end

      private

      # Ask the registry for a converter matching the extraction's source,
      # type and strategy; returns its class, or nil when none is found.
      def find_converter(extraction)
        Chronicle::ETL::Registry::Connectors.find_converter_for_source(
          source: extraction.source,
          type: extraction.type,
          strategy: extraction.strategy,
          target: :chronicle
        )&.klass
      end
    end
  end
end
33 |
--------------------------------------------------------------------------------
/lib/chronicle/etl.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'chronicle/schema'
4 | require 'chronicle/models/base'
5 |
6 | require_relative 'etl/registry/registry'
7 | require_relative 'etl/authorizer'
8 | require_relative 'etl/config'
9 | require_relative 'etl/configurable'
10 | require_relative 'etl/exceptions'
11 | require_relative 'etl/extraction'
12 | require_relative 'etl/record'
13 | require_relative 'etl/job_definition'
14 | require_relative 'etl/job_log'
15 | require_relative 'etl/job_logger'
16 | require_relative 'etl/job'
17 | require_relative 'etl/logger'
18 | require_relative 'etl/runner'
19 | require_relative 'etl/secrets'
20 | require_relative 'etl/utils/binary_attachments'
21 | require_relative 'etl/utils/progress_bar'
22 | require_relative 'etl/version'
23 |
24 | require_relative 'etl/extractors/extractor'
25 | require_relative 'etl/loaders/loader'
26 | require_relative 'etl/transformers/transformer'
27 |
28 | begin
29 | require 'pry'
30 | rescue LoadError
31 | # Pry not available
32 | end
33 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/logger.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    # Minimal levelled logger. Messages go to an attached UI element when one
    # is present, otherwise to Kernel#warn.
    module Logger
      extend self

      DEBUG = 0
      INFO = 1
      WARN = 2
      ERROR = 3
      FATAL = 4
      SILENT = 5

      attr_accessor :log_level

      @log_level = INFO

      # Emit the message when its level is at or above the current log level.
      def output(message, level)
        return if level < @log_level

        @ui_element ? @ui_element.log(message) : warn(message)
      end

      # One convenience method per supported level: fatal, error, info, debug.
      { fatal: FATAL, error: ERROR, info: INFO, debug: DEBUG }.each do |level_name, level|
        define_method(level_name) do |message|
          output(message, level)
        end
      end

      # Route subsequent messages through the given UI element's #log.
      def attach_to_ui(ui_element)
        @ui_element = ui_element
      end

      # Stop routing messages through a UI element.
      def detach_from_ui
        @ui_element = nil
      end
    end
  end
end
53 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/rest_loader.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'net/http'
4 | require 'uri'
5 | require 'json'
6 | require 'chronicle/serialization'
7 |
module Chronicle
  module ETL
    # Loader that POSTs each payload as JSON to a REST endpoint.
    class RestLoader < Chronicle::ETL::Loader
      register_connector do |r|
        r.identifier = :rest
        r.description = 'a REST endpoint'
      end

      setting :hostname, required: true
      setting :endpoint, required: true
      setting :access_token

      # POST the payload, serialized with #to_json, to hostname + endpoint.
      # TLS is used when the resulting URI's scheme is https.
      #
      # @param payload the object to send; must respond to #to_json
      # @return the response object returned by Net::HTTP#request
      def load(payload)
        uri = URI.parse("#{@config.hostname}#{@config.endpoint}")

        Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
          request = Net::HTTP::Post.new(uri.request_uri, request_headers)
          request.body = payload.to_json
          http.request(request)
        end
      end

      private

      # Headers for every request. Keys are uniformly Strings, and the
      # Authorization header is only sent when an access token is configured
      # (previously a bare "Bearer " was sent without one).
      def request_headers
        headers = { 'Content-Type' => 'application/json' }
        headers['Authorization'] = "Bearer #{@config.access_token}" if @config.access_token
        headers
      end
    end
  end
end
38 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/loaders/csv_loader_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'csv'
3 |
# Specs for the CSV loader: two loaded records should produce a header row
# plus two data rows, on stdout or in a file.
RSpec.describe Chronicle::ETL::CSVLoader do
  # TODO: consolidate this with other specs
  let(:record) do
    {
      provider: 'foo',
      verb: 'tested',
      actor: {
        represent: 'identity',
        provider: 'bar'
      }
    }
  end

  context 'when destination is stdout' do
    it 'can output a CSV' do
      l = Chronicle::ETL::CSVLoader.new

      l.load(record)
      l.load(record)

      lines = capture do
        l.finish
      end.first.split("\n")

      expect(lines.count).to eql(3)
    end
  end

  context 'when destination is a file' do
    # Description previously said "json", copied from the JSON loader spec.
    it 'writes CSV to a file' do
      FakeFS.with_fresh do
        l = Chronicle::ETL::CSVLoader.new(output: 'test.csv')
        l.load(record)
        l.load(record)
        l.finish

        csv = CSV.parse(File.read('test.csv'))
        expect(csv.count).to eql(3)
      end
    end
  end
end
46 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2020 Andrew Louis
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/loaders/json_loader_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'fakefs/safe'
3 |
# Specs for the JSON loader: each loaded record becomes one line of JSON.
RSpec.describe Chronicle::ETL::JSONLoader do
  let(:record) { { foo: 'bar' } }

  context 'when using stdout as destination' do
    it 'can output JSON from a Raw model' do
      loader = Chronicle::ETL::JSONLoader.new

      output, = capture do
        loader.start
        2.times { loader.load(record) }
        loader.finish
      end

      lines = output.split("\n")
      expect(lines.count).to eql(2)
      expect(JSON.parse(lines.first)).to include({ 'foo' => 'bar' })
    end
  end

  context 'when using a file as destination' do
    it 'writes json to a file' do
      FakeFS.with_fresh do
        loader = Chronicle::ETL::JSONLoader.new(output: 'output.jsonl')
        loader.start
        2.times { loader.load(record) }
        loader.finish

        first_line = File.read('output.jsonl').split("\n").first
        expect(JSON.parse(first_line)).to include({ 'foo' => 'bar' })
      end
    end
  end
end
41 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/extractors/json_extractor_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the JSON extractor, driven by the sample JSONL fixture.
RSpec.describe Chronicle::ETL::JSONExtractor do
  let(:json_filename) { 'spec/support/sample_data/sample.json' }
  let(:jsonl_filename) { 'spec/support/sample_data/sample.jsonl' }
  let(:invalid_filename) { 'spec/support/sample_data/test.csv' }

  describe '#results_count' do
    # Description previously said "CSV file", copied from the CSV extractor spec.
    it 'can count the records in a JSONL file' do
      e = Chronicle::ETL::JSONExtractor.new(input: jsonl_filename)
      e.prepare
      expect(e.results_count).to eql(2)
    end
  end

  describe '#extract' do
    it 'can extract from a JSONL file' do
      e = Chronicle::ETL::JSONExtractor.new(input: jsonl_filename)
      e.prepare
      expect { |b| e.extract(&b) }.to yield_successive_args(Chronicle::ETL::Extraction, Chronicle::ETL::Extraction)
    end

    context 'for invalid json' do
      it 'will raise an exception' do
        e = Chronicle::ETL::JSONExtractor.new(input: invalid_filename)
        expect { e.prepare }.to raise_error(Chronicle::ETL::ExtractionError)
      end
    end
  end
end
31 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/authorizer.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    # An authorization strategy for a third-party data source
    class Authorizer
      class << self
        # @return [Symbol, nil] the provider set with the `provider` macro
        attr_reader :provider_name

        # Macro for setting provider on an Authorizer
        #
        # @param provider_name [String, Symbol] stored as a Symbol
        def provider(provider_name)
          @provider_name = provider_name.to_sym
        end

        # From all loaded Authorizers, return the first one that matches
        # a given provider
        #
        # Provider names are stored as Symbols, so the argument is
        # normalized the same way; a String such as 'foo' therefore matches
        # an authorizer declared with `provider :foo`.
        #
        # @todo Have a proper identifier system for authorizers
        #   (to have more than one per plugin)
        # @param provider [String, Symbol, nil] provider to look up
        # @return [Class, nil] the matching subclass, or nil when none matches
        def find_by_provider(provider)
          wanted = provider.respond_to?(:to_sym) ? provider.to_sym : provider

          ObjectSpace.each_object(::Class).select { |klass| klass < self }.find do |authorizer|
            authorizer.provider_name == wanted
          end
        end
      end

      # Construct a new authorizer
      def initialize(args); end

      # Main entry-point for authorization flows. Implemented by subclass
      def authorize!
        raise NotImplementedError
      end
    end
  end
end
35 |
36 | require_relative 'oauth_authorizer'
37 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/helpers/stdout_helper.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tempfile'
4 |
module Chronicle
  module ETL
    module Loaders
      module Helpers
        # Helpers for loaders that may write their output to stdout.
        module StdoutHelper
          # TODO: have option to immediately output to stdout

          # TODO: let users use "stdout" as an option for the `output` setting
          # stdout is assumed whenever no `output` setting is present.
          def output_to_stdout?
            @config.output ? false : true
          end

          # Create a temp file and unlink it straight away; the open handle
          # is what gets returned.
          def create_stdout_temp_file
            Tempfile.new('chronicle-stdout').tap(&:unlink)
          end

          # Replay the full contents of the temp file onto stdout.
          def write_to_stdout_from_temp_file(file)
            file.rewind
            contents = file.read
            write_to_stdout(contents)
          end

          # Write and flush the given output on a duplicate of $stdout.
          def write_to_stdout(output)
            # We .dup because rspec overwrites $stdout (in helper #capture) to
            # capture output.
            stream = $stdout.dup
            stream.write(output)
            stream.flush
          end
        end
      end
    end
  end
end
41 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/secrets_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'spec_helper'
4 |
# Specs for the secrets store, run against the mocked config directory.
RSpec.describe Chronicle::ETL::Secrets do
  include_context 'mocked config directory'

  describe '#all' do
    it 'can retrieve all secrets' do
      namespaces = described_class.all.keys
      expect(namespaces).to contain_exactly(:'provider-one', :'provider-two')
    end
  end

  describe '#set' do
    it 'can set a secret' do
      described_class.set('new-namespace', 'key', 'value')
      expect(described_class.read('new-namespace')[:key]).to eql('value')
    end
  end

  describe '#unset' do
    it 'can unset a secret' do
      described_class.set('new-namespace', 'key', 'value')
      expect(described_class.read('new-namespace')[:key]).to eql('value')

      described_class.unset('new-namespace', 'key')
      expect(described_class.read('new-namespace')[:key]).to eql(nil)
    end
  end

  describe '#available_secrets' do
    it 'can list all secrets' do
      listed = described_class.available_secrets
      expect(listed).to contain_exactly('provider-one', 'provider-two')
    end
  end
end
40 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/loader.rb:
--------------------------------------------------------------------------------
1 | require_relative 'helpers/encoding_helper'
2 | require_relative 'helpers/stdout_helper'
3 |
module Chronicle
  module ETL
    # Abstract class representing a Loader for an ETL job
    class Loader
      extend Chronicle::ETL::Registry::SelfRegistering
      include Chronicle::ETL::Configurable
      include Chronicle::ETL::Loaders::Helpers::EncodingHelper

      setting :output
      setting :fields
      setting :fields_limit, default: nil
      setting :fields_exclude

      # Construct a new instance of this loader. Options are passed in from a Runner
      # == Parameters:
      # options::
      #   Options for configuring this Loader
      def initialize(options = {})
        apply_options(options)
      end

      # Called once before processing records
      def start; end

      # Load a single record. Must be implemented by subclasses.
      #
      # The parameter is accepted (and ignored) so that calling the abstract
      # method the way loaders are invoked, `load(record)`, raises
      # NotImplementedError rather than an ArgumentError about arity.
      def load(_record = nil)
        raise NotImplementedError
      end

      # Called once there are no more records to process
      def finish; end
    end
  end
end
38 |
39 | require_relative 'csv_loader'
40 | require_relative 'json_loader'
41 | require_relative 'rest_loader'
42 | require_relative 'table_loader'
43 |
--------------------------------------------------------------------------------
/spec/support/capture_io.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module SpecHelpers
      # Run the block with $stdout/$stderr captured and return what was
      # written as [stdout_string, stderr_string].
      # Adapted from minitest
      # https://github.com/seattlerb/minitest/blob/7d2134a1d386a068f1c7705889c7764a47413861/lib/minitest/assertions.rb#L514
      def capture
        out_buffer = StringIO.new
        err_buffer = StringIO.new

        redirect_standard_streams(out_buffer, err_buffer) { yield }

        [out_buffer.string, err_buffer.string]
      end

      # Run the block with $stdout/$stderr discarded; returns the block's
      # own value.
      def suppress_output(&block)
        redirect_standard_streams(StringIO.new, StringIO.new, &block)
      end

      private

      # Point $stdout/$stderr at the given streams for the duration of the
      # block, restoring the previous streams afterwards even on error.
      def redirect_standard_streams(out_stream, err_stream)
        previous_out = $stdout
        previous_err = $stderr
        $stdout = out_stream
        $stderr = err_stream

        yield
      ensure
        $stdout = previous_out
        $stderr = previous_err
      end
    end
  end
end
45 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/cli_base.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module CLI
      # Base class for CLI commands
      class CLIBase < ::Thor
        no_commands do
          # Shorthand for cli_exit(status: :failure)
          #
          # When an exception is given and the log level is above DEBUG, a
          # hint about --verbose is appended to the message. A nil message is
          # tolerated: the hint then becomes the whole message.
          #
          # @param message [String, nil] text to print before exiting
          # @param exception [Exception, nil] error that caused the failure
          def cli_fail(message: nil, exception: nil)
            if exception && Chronicle::ETL::Logger.log_level > Chronicle::ETL::Logger::DEBUG
              hint = 'Re-run the command with --verbose to see details.'
              message = [message, hint].compact.join("\n")
            end

            cli_exit(status: :failure, message: message, exception: exception)
          end

          # Exit from CLI
          #
          # @param status Can be either :success or :failure
          # @param message to print (coloured red on failure); may be nil
          # @param exception whose full message is logged at debug level
          def cli_exit(status: :success, message: nil, exception: nil)
            succeeded = status == :success
            exit_code = succeeded ? 0 : 1
            log_level = succeeded ? :info : :fatal

            # Only colourize when there is something to print; calling .red
            # on nil used to raise NoMethodError and mask the real failure.
            message = message.red if message && !succeeded

            Chronicle::ETL::Logger.debug(exception.full_message) if exception
            Chronicle::ETL::Logger.send(log_level, message) if message
            exit(exit_code)
          end
        end
      end
    end
  end
end
36 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/json_extractor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
module Chronicle
  module ETL
    # Extractor that reads JSON (or, by default, line-delimited JSON) input
    # and yields one extraction per parsed value.
    class JSONExtractor < Chronicle::ETL::Extractor
      include Extractors::Helpers::InputReader

      register_connector do |registration|
        registration.identifier = :json
        registration.description = 'JSON'
      end

      setting :jsonl, default: true, type: :boolean
      setting :path, default: nil, type: :string

      # Parse all input up front; parsed arrays are flattened into the list
      # of values to extract.
      def prepare
        @jsons = []
        load_input do |input|
          @jsons.concat([parse_data(input)].flatten)
        end
      end

      # Yield an Extraction wrapping each parsed value.
      def extract
        @jsons.each { |json| yield Chronicle::ETL::Extraction.new(data: json) }
      end

      # Number of values collected by #prepare.
      def results_count
        @jsons.count
      end

      private

      # Parse a chunk of JSON text and, when the `path` setting is present,
      # dig into the result along its dot-separated segments.
      def parse_data(data)
        document = JSON.parse(data)
        return document unless @config.path

        document.dig(*@config.path.split('.'))
      rescue JSON::ParserError
        raise Chronicle::ETL::ExtractionError, 'Could not parse JSON'
      end

      # Read the input line by line for JSONL, or through read_input otherwise.
      def load_input(&block)
        reader = @config.jsonl ? :read_input_as_lines : :read_input
        send(reader, &block)
      end
    end
  end
end
57 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/csv_extractor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'csv'
4 |
module Chronicle
  module ETL
    # Extractor that reads CSV input and yields one extraction per row.
    class CSVExtractor < Chronicle::ETL::Extractor
      include Extractors::Helpers::InputReader

      register_connector do |registration|
        registration.identifier = :csv
        registration.description = 'CSV'
      end

      setting :headers, default: true

      # Build one CSV reader per input.
      def prepare
        @csvs = prepare_sources
      end

      # Yield an Extraction holding each row converted with #to_h.
      def extract
        @csvs.each do |csv|
          csv.read.each { |row| yield Chronicle::ETL::Extraction.new(data: row.to_h) }
        end
      end

      # Total number of rows across all readers; each reader is rewound
      # after being counted.
      def results_count
        @csvs.sum do |csv|
          csv.readlines.size.tap { csv.rewind }
        end
      end

      private

      # Every row of every reader, converted with #to_h, as one flat array.
      def all_rows
        @csvs.flat_map { |csv| csv.to_a.map(&:to_h) }
      end

      # Create a CSV reader for each input. A String `headers` setting is
      # split on commas; any other value is handed to CSV unchanged.
      def prepare_sources
        @csvs = []
        read_input do |csv_data|
          header_option = @config.headers
          header_option = header_option.split(',') if header_option.is_a?(String)
          @csvs << CSV.new(csv_data, headers: header_option, converters: :all)
        end
        @csvs
      end
    end
  end
end
59 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/subcommand_base.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    module CLI
      # Base class for CLI subcommands. Overrides Thor methods so we can use command:subcommand syntax
      class SubcommandBase < Chronicle::ETL::CLI::CLIBase
        # Print usage instructions for a subcommand: this class's commands
        # plus those of nested Thor classes, sorted by their first column.
        def self.help(shell, subcommand = false)
          own_commands = printable_commands(true, subcommand)
          nested_commands = ::Thor::Util.thor_classes_in(self).flat_map do |klass|
            klass.printable_commands(false)
          end
          rows = (own_commands + nested_commands).sort { |a, b| a[0] <=> b[0] }

          shell.say 'COMMANDS'.bold
          shell.print_table(rows, indent: 2, truncate: true)
          shell.say
          class_options_help(shell)
        end

        # Show docs with command:subcommand pattern.
        # For `help` command, don't use colon
        def self.banner(command, _namespace = nil, _subcommand = false)
          separator = command.name == 'help' ? ' ' : ':'
          "#{subcommand_prefix}#{separator}#{command.usage}"
        end

        # Use subcommand classname to derive display name for subcommand:
        # drop the namespace, lower-case the first letter, and turn every
        # remaining capital into "-" plus its lower-case form.
        def self.subcommand_prefix
          class_name = name.gsub(/.*::/, '')
          uncapitalized = class_name.gsub(/^[A-Z]/) { |first| first[0].downcase }
          uncapitalized.gsub(/[A-Z]/) { |capital| "-#{capital[0].downcase}" }
        end
      end
    end
  end
end
40 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/secrets_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the `secrets:*` CLI commands, run against the mocked config
# directory.
RSpec.describe Chronicle::ETL::CLI::Secrets do
  include_context 'mocked config directory'

  describe 'chronicle-etl secrets:list' do
    it 'can list available secrets' do
      output, = invoke_cli(['secrets:list'])

      secret_values = Chronicle::ETL::Secrets.all.values.map(&:values).flatten
      printed_lines = output.split("\n")

      expect(printed_lines.count).to eql(secret_values.count + 2)
    end
  end

  describe 'chronicle-etl secrets:set' do
    it 'can set a secret' do
      invoke_cli(%w[secrets:set foo key value])

      expect(Chronicle::ETL::Secrets.read('foo')[:key]).to eql('value')
    end

    context 'when value not provided' do
      include_context 'mocked stdin'

      it 'can set a secret with stdin' do
        load_stdin('baz')
        invoke_cli(%w[secrets:set foo key])

        expect(Chronicle::ETL::Secrets.read('foo')[:key]).to eql('baz')
      end
    end
  end

  describe 'chronicle-etl secrets:unset' do
    it 'can unset a secret' do
      invoke_cli(%w[secrets:set foo key value])
      expect(Chronicle::ETL::Secrets.read('foo')[:key]).to eql('value')

      invoke_cli(%w[secrets:unset foo key])
      expect(Chronicle::ETL::Secrets.read('foo')[:key]).to be_nil
    end
  end
end
50 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/extractors/file_extractor_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the file extractor, which yields filenames from a directory or
# from an explicit list of files.
RSpec.describe Chronicle::ETL::FileExtractor do
  let(:filename) { 'spec/support/sample_data/two-records.csv' }
  let(:directory) { 'spec/support/sample_data/directories/simple' }

  context 'for a simple directory' do
    describe '#results_count' do
      # Description previously said "can extract from a CSV file", copied
      # from the CSV extractor spec.
      it 'can count the files in the directory' do
        e = Chronicle::ETL::FileExtractor.new(input: directory, dir_glob_pattern: '**/*')
        e.prepare
        expect(e.results_count).to eql(2)
      end
    end

    describe '#extract' do
      it 'can yield filenames in the directory' do
        results = run_extraction(Chronicle::ETL::FileExtractor, { input: directory, dir_glob_pattern: '**/*' })
        expect(results).to all(be_a(Chronicle::ETL::Extraction))
        expect(results.count).to eql(2)
      end
    end
  end

  context 'when passed in files' do
    it 'will yield file back' do
      results = run_extraction(Chronicle::ETL::FileExtractor, { input: [filename] })
      expect(results.count).to eql(1)
      expect(results.first.data).to eql(filename)
    end

    context 'when passed in two of the same files' do
      it 'will yield file once' do
        results = run_extraction(Chronicle::ETL::FileExtractor, { input: [filename, filename] })
        expect(results.count).to eql(1)
        expect(results.first.data).to eql(filename)
      end
    end
  end
end
41 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/authorization_server.rb:
--------------------------------------------------------------------------------
1 | require 'sinatra'
2 | require 'omniauth'
3 |
module Chronicle
  module ETL
    # Sinatra app that mounts OmniAuth for every OauthAuthorizer and serves
    # the callback and failure pages of the authorization flow.
    class AuthorizationServer < Sinatra::Base
      class << self
        # The most recent authorization hash received by the callback route.
        attr_accessor :latest_authorization
      end

      # ERB sources for the two pages. Request data is supplied through
      # template locals and HTML-escaped; it must never be interpolated into
      # the template source itself, or a crafted URL could inject ERB/HTML.
      AUTHORIZED_TEMPLATE = "\nSettings saved for <%= Rack::Utils.escape_html(provider.to_s) %>\n" \
                            "You can now close this tab and return to your terminal!\n"
      FAILURE_TEMPLATE = "Authentication Failed:\nmessage: <%= Rack::Utils.escape_html(details.to_s) %>"

      configure do
        set :inline_templates, true
        set :dump_errors, false
        set :raise_errors, true
        disable :logging
        set :sessions, true
        set :quiet, true
        set :threaded, true
        set :environment, ENV['APP_ENV'] == 'test' ? :test : :production
      end

      # Register one OmniAuth provider per known OauthAuthorizer; nil
      # credentials/options are dropped from the argument list.
      use OmniAuth::Builder do
        Chronicle::ETL::OauthAuthorizer.all.each do |klass|
          args = [klass.client_id, klass.client_secret, klass.options].compact
          provider(
            klass.strategy,
            *args
          )
        end
      end

      OmniAuth.config.logger = Chronicle::ETL::Logger
      OmniAuth.config.silence_get_warning = true
      OmniAuth.config.allowed_request_methods = %i[get]

      # Store the authorization found in the request env (keys symbolized)
      # and confirm to the user that the flow is complete.
      get '/auth/:provider/callback' do
        authorization = request.env['omniauth.auth'].to_h.deep_transform_keys(&:to_sym)
        self.class.latest_authorization = authorization
        erb AUTHORIZED_TEMPLATE, locals: { provider: params[:provider] }
      end

      get '/auth/failure' do
        # TODO: handle this
        erb FAILURE_TEMPLATE, locals: { details: params.to_s }
      end
    end
  end
end
49 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/runner_spec.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'spec_helper'
4 |
# End-to-end spec: run a CSV -> multiply x2 -> multiply x2 -> sort -> JSON
# job and check the number of lines written to stdout.
RSpec.describe Chronicle::ETL::Runner do
  before(:all) do
    Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::FATAL
  end

  describe '#run!' do
    it 'runs' do
      filename = 'spec/support/sample_data/test.csv'

      # rows in sample CSV file (excluding header)
      file_record_count = File.read(filename).each_line.count - 1

      job_config = {
        extractor: { name: 'csv', options: { input: filename } },
        transformers: [
          { name: 'multiply', options: { n: 2 } },
          { name: 'multiply', options: { n: 2 } },
          { name: 'sort', options: { key: 'id', direction: 'desc' } }
        ],
        loader: { name: 'json' }
      }

      definition = Chronicle::ETL::JobDefinition.new
      definition.add_config(job_config)

      runner = Chronicle::ETL::Runner.new(Chronicle::ETL::Job.new(definition))

      output, = capture { runner.run! }

      # Two x2 multipliers: every input row is expected four times.
      expect(output.split("\n").count).to eql(file_record_count * 4)
    end
  end
end
65 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/exceptions.rb:
--------------------------------------------------------------------------------
module Chronicle
  module ETL
    # Root of every error raised by chronicle-etl.
    class Error < StandardError; end

    class SecretsError < Error; end

    class AuthorizationError < Error; end

    class ConfigError < Error; end

    class RunnerError < Error; end
    class RunInterruptedError < RunnerError; end

    class RunnerTypeError < Error; end

    # Carries the offending job definition, which is also passed on to
    # StandardError as the message.
    class JobDefinitionError < Error
      attr_reader :job_definition

      def initialize(job_definition)
        @job_definition = job_definition
        super(job_definition)
      end
    end

    # Carries the plugin name, which is also passed on to StandardError as
    # the message.
    class PluginError < Error
      attr_reader :name

      def initialize(name)
        super(name)
        @name = name
      end
    end

    class PluginNotInstalledError < PluginError; end
    class PluginConflictError < PluginError; end
    class PluginNotAvailableError < PluginError; end
    class PluginLoadError < PluginError; end

    class ConnectorConfigurationError < Error; end

    # Raised with a message plus the optional provider and connector name
    # that could not be found.
    class ConnectorNotAvailableError < Error
      attr_reader :name, :provider

      def initialize(message, provider: nil, name: nil)
        super(message)
        @provider = provider
        @name = name
      end
    end

    class ProviderNotAvailableError < ConnectorNotAvailableError; end
    class ProviderConnectorNotAvailableError < ConnectorNotAvailableError; end

    class ExtractionError < Error; end

    class TransformationError < Error; end
    class UntransformableRecordError < TransformationError; end

    class LoaderError < Error; end
  end
end
61 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/oauth_authorizer_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | OmniAuth.config.test_mode = true
4 | OmniAuth.config.mock_auth[:developer] = {
5 | token: 'abc'
6 | }
7 | # Make sure AuthorizationServer knows we're in test mode
8 | ENV['APP_ENV'] = 'test'
9 |
10 | # Prevent Launchy from attempting to open windows in oauth_authorizer.rb
11 | ENV['LAUNCHY_DRY_RUN'] = 'true'
12 |
13 | RSpec.describe Chronicle::ETL::OauthAuthorizer do
14 | let(:port) { 5678 }
15 | let(:authorizer) do
16 | Class.new(Chronicle::ETL::OauthAuthorizer) do
17 | provider :foo
18 | omniauth_strategy :developer
19 | scope 'email'
20 | pluck_secrets({ token: [:token] })
21 | end
22 | end
23 |
24 | before do
25 | stub_const('FooAuthorizer', authorizer)
26 | end
27 |
28 | it 'returs an authorization after oauth flow completed' do
29 | a = authorizer.new(port: port)
30 | thread = Thread.new do
31 | wait_until do
32 | booted?
33 | end
34 | fetch("http://localhost:#{port}/auth/developer/")
35 | end
36 |
37 | result = suppress_output do
38 | a.authorize!
39 | end
40 | thread.join
41 | expect(result).to eql({ token: 'abc' })
42 | end
43 |
44 | it 'raises an exception if flow aborts early' do
45 | # TODO: implement this somehow
46 | # send signal to sinatra?
47 | end
48 |
49 | def booted?
50 | fetch("http://localhost:#{port}/")
51 | true
52 | rescue Errno::ECONNREFUSED, Errno::EBADF
53 | false
54 | end
55 |
56 | # TODO: use library? put in SpecHelpers?
57 | def fetch(uri, limit = 10)
58 | response = Net::HTTP.get_response(URI(uri))
59 | fetch(response['location'], limit - 1) if response == Net::HTTPRedirection || response.code == '302'
60 | end
61 | end
62 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/connector_registration.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Chronicle
4 | module ETL
5 | module Registry
6 | # Records details about a connector such as its source provider and a description
7 | class ConnectorRegistration
8 | attr_accessor :klass, :identifier, :source, :strategy, :type, :description, :from_schema, :to_schema
9 |
10 | # Create a new connector registration
11 | def initialize(klass)
12 | @klass = klass
13 | end
14 |
15 | # The ETL phase of this connector
16 | def phase
17 | if klass.ancestors.include? Chronicle::ETL::Extractor
18 | :extractor
19 | elsif klass.ancestors.include? Chronicle::ETL::Transformer
20 | :transformer
21 | elsif klass.ancestors.include? Chronicle::ETL::Loader
22 | :loader
23 | end
24 | end
25 |
26 | def to_s
27 | "#{phase}-#{identifier}"
28 | end
29 |
30 | # Whether this connector is built-in to Chronicle
31 | def built_in?
32 | @klass.to_s.include? 'Chronicle::ETL'
33 | end
34 |
35 | def klass_name
36 | @klass.to_s
37 | end
38 |
39 | # TODO: allow overriding here. Maybe through self-registration process
40 | def plugin
41 | @source
42 | end
43 |
44 | def descriptive_phrase
45 | prefix = case phase
46 | when :extractor
47 | 'Extracts from'
48 | when :transformer
49 | 'Transforms'
50 | when :loader
51 | 'Loads to'
52 | end
53 |
54 | "#{prefix} #{description}"
55 | end
56 | end
57 | end
58 | end
59 | end
60 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/csv_loader.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'csv'
4 | require 'chronicle/utils/hash_utils'
5 |
6 | module Chronicle
7 | module ETL
8 | class CSVLoader < Chronicle::ETL::Loader
9 | include Chronicle::ETL::Loaders::Helpers::StdoutHelper
10 |
11 | register_connector do |r|
12 | r.identifier = :csv
13 | r.description = 'CSV'
14 | end
15 |
16 | setting :output
17 | setting :headers, default: true
18 | setting :header_row, default: true
19 |
20 | def records
21 | @records ||= []
22 | end
23 |
24 | def load(record)
25 | records << record
26 | end
27 |
28 | def finish
29 | return unless records.any?
30 |
31 | # headers = filtered_headers(records)
32 | headers = gather_headers(records)
33 |
34 | csv_options = {}
35 | if @config.headers
36 | csv_options[:write_headers] = @config.header_row
37 | csv_options[:headers] = headers
38 | end
39 |
40 | csv_output = CSV.generate(**csv_options) do |csv|
41 | records.each do |record|
42 | csv << Chronicle::Utils::HashUtils.flatten_hash(record.to_h)
43 | .values_at(*headers)
44 | .map { |value| force_utf8(value) }
45 | end
46 | end
47 |
48 | # TODO: just write to io directly
49 | if output_to_stdout?
50 | write_to_stdout(csv_output)
51 | else
52 | File.write(@config.output, csv_output)
53 | end
54 | end
55 |
56 | private
57 |
58 | def gather_headers(records)
59 | records_flattened = records.map do |record|
60 | Chronicle::Utils::HashUtils.flatten_hash(record.to_h)
61 | end
62 | records_flattened.flat_map(&:keys).uniq
63 | end
64 | end
65 | end
66 | end
67 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'simplecov'
4 | SimpleCov.start
5 |
6 | require 'bundler/setup'
7 | require 'chronicle/etl'
8 | require 'chronicle/etl/cli'
9 |
10 | require 'vcr'
11 | VCR.configure do |config|
12 | config.cassette_library_dir = 'spec/fixtures/vcr_cassettes'
13 | config.allow_http_connections_when_no_cassette = true
14 | config.hook_into :webmock
15 | config.filter_sensitive_data('') { Gem.configuration.rubygems_api_key }
16 | end
17 |
18 | require_relative 'support/capture_io'
19 | require_relative 'support/invoke_cli'
20 | require_relative 'support/mocked_config_directory'
21 | require_relative 'support/mocked_stdin'
22 | require_relative 'support/run_extraction'
23 | require_relative 'support/wait_until'
24 |
25 | RSPEC_ROOT = File.dirname(__FILE__)
26 |
27 | RSpec.configure do |config|
28 | config.include Chronicle::ETL::SpecHelpers
29 | config.include_context 'mocked config directory', include_shared: true
30 | config.include_context 'mocked stdin', include_shared: true
31 |
32 | # Enable flags like --only-failures and --next-failure
33 | config.example_status_persistence_file_path = '.rspec_status'
34 |
35 | # Disable RSpec exposing methods globally on `Module` and `main`
36 | config.disable_monkey_patching!
37 |
38 | config.mock_with :rspec
39 |
40 | config.filter_run focus: true
41 | config.run_all_when_everything_filtered = true
42 |
43 | config.expect_with :rspec do |c|
44 | c.syntax = :expect
45 | end
46 | end
47 | # This monkeypatch is required because of weird interactions between the
48 | # `tty-screen` used for CLI output and the way rspec captures stdout
49 | # see: https://github.com/rspec/rspec-expectations/issues/1305
50 | # and: https://github.com/emsk/bundle_outdated_formatter/blob/v0.7.0/spec/spec_helper.rb#L16-L21
51 | require 'stringio'
52 | class StringIO
53 | def ioctl(*)
54 | 0
55 | end
56 | end
57 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/file_extractor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'pathname'
4 |
5 | module Chronicle
6 | module ETL
7 | # Return filenames that match a pattern in a directory
8 | class FileExtractor < Chronicle::ETL::Extractor
9 | register_connector do |r|
10 | r.identifier = :file
11 | r.description = 'file or directory of files'
12 | end
13 |
14 | setting :input, default: ['.']
15 | setting :dir_glob_pattern, default: '**/*'
16 | setting :larger_than
17 | setting :smaller_than
18 |
19 | def prepare
20 | @pathnames = gather_files
21 | end
22 |
23 | def extract
24 | @pathnames.each do |pathname|
25 | yield Chronicle::ETL::Extraction.new(data: pathname.to_path)
26 | end
27 | end
28 |
29 | def results_count
30 | @pathnames.count
31 | end
32 |
33 | private
34 |
35 | def gather_files
36 | roots = [@config.input].flatten.map { |filename| Pathname.new(filename) }
37 | raise(ExtractionError, 'Input must exist') unless roots.all?(&:exist?)
38 |
39 | directories, files = roots.partition(&:directory?)
40 |
41 | directories.each do |directory|
42 | files += Dir.glob(File.join(directory, @config.dir_glob_pattern)).map { |filename| Pathname.new(filename) }
43 | end
44 |
45 | files = files.uniq
46 |
47 | files = files.keep_if { |f| (f.mtime > @config.since) } if @config.since
48 | files = files.keep_if { |f| (f.mtime < @config.until) } if @config.until
49 |
50 | # pass in file sizes in bytes
51 | files = files.keep_if { |f| (f.size < @config.smaller_than) } if @config.smaller_than
52 | files = files.keep_if { |f| (f.size > @config.larger_than) } if @config.larger_than
53 |
54 | # # TODO: incorporate sort argument
55 | files.sort_by(&:mtime)
56 | end
57 | end
58 | end
59 | end
60 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/filter_fields_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Chronicle
4 | module ETL
5 | # A transformer that filters the fields of a record and returns a new hash with only the specified fields.
6 | class FilterFieldsTransformer < Chronicle::ETL::Transformer
7 | register_connector do |r|
8 | r.identifier = :filter_fields
9 | r.description = 'by taking a subset of the fields'
10 | end
11 |
12 | setting :fields, type: :array, default: []
13 |
14 | def transform(record)
15 | hash = record.data.to_h.deep_transform_keys(&:to_sym)
16 | filter_hash(hash, @config.fields.map)
17 | end
18 |
19 | private
20 |
21 | def access_nested_value(data, path)
22 | keys = path.split('.')
23 | keys.reduce(data) do |acc, key|
24 | if acc.is_a?(Array)
25 | acc.map do |item|
26 | item[key.to_sym]
27 | rescue StandardError
28 | nil
29 | end
30 | .compact
31 | elsif key.include?('[')
32 | key, index = key.split(/\[|\]/).reject(&:empty?)
33 | acc = acc[key.to_sym] if acc
34 | acc.is_a?(Array) ? acc[index.to_i] : nil
35 | else
36 | acc&.dig(key.to_sym)
37 | end
38 | end
39 | end
40 |
41 | def filter_hash(original_hash, fields)
42 | fields.each_with_object({}) do |field, result|
43 | value = access_nested_value(original_hash, field)
44 | keys = field.split('.')
45 | last_key = keys.pop.to_sym
46 |
47 | current = result
48 | keys.each do |key|
49 | key = key.to_sym
50 | key, = key.to_s.split(/\[|\]/) if key.to_s.include?('[')
51 | current[key] ||= {}
52 | current = current[key]
53 | end
54 |
55 | current[last_key] = value
56 | end
57 | end
58 | end
59 | end
60 | end
61 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/utils/progress_bar.rb:
--------------------------------------------------------------------------------
1 | require 'tty/progressbar'
2 | require 'colorize'
3 |
4 | module Chronicle
5 | module ETL
6 | module Utils
7 | class ProgressBar
8 | FORMAT_WITH_TOTAL = [
9 | ':bar ',
10 | ':percent'.light_white,
11 | ' | '.light_black,
12 | ':current'.light_white,
13 | '/'.light_black,
14 | ':total'.light_white,
15 | ' ('.light_black,
16 | 'ELAPSED:'.light_black,
17 | ':elapsed'.light_white,
18 | ' | ETA:'.light_black,
19 | ':eta'.light_white,
20 | ' | RATE: '.light_black,
21 | ':mean_rate'.light_white,
22 | '/s) '.light_black
23 | ].join.freeze
24 |
25 | FORMAT_WITHOUT_TOTAL = [
26 | ':current'.light_white,
27 | '/'.light_black,
28 | '???'.light_white,
29 | ' ('.light_black,
30 | 'ELAPSED:'.light_black,
31 | ':elapsed'.light_white,
32 | ' | ETA:'.light_black,
33 | '??:??'.light_white,
34 | ' | RATE: '.light_black,
35 | ':mean_rate'.light_white,
36 | '/s) '.light_black
37 | ].join.freeze
38 |
39 | def initialize(total:, title: 'Loading')
40 | opts = {
41 | clear: true,
42 | complete: '▓'.light_blue,
43 | incomplete: '░'.blue,
44 | frequency: 10
45 | }
46 |
47 | if total
48 | opts[:total] = total
49 | format_str = "#{title} #{FORMAT_WITH_TOTAL}"
50 | @pbar = TTY::ProgressBar.new(FORMAT_WITH_TOTAL, opts)
51 | else
52 | format_str = "#{title} #{FORMAT_WITHOUT_TOTAL}"
53 | opts[:no_width] = true
54 | end
55 |
56 | @pbar = TTY::ProgressBar.new(format_str, opts)
57 |
58 | @pbar.resize
59 | end
60 |
61 | def increment
62 | @pbar.advance(1)
63 | end
64 |
65 | def log(message)
66 | message.split("\n").each do |_line|
67 | @pbar.log message
68 | end
69 | end
70 |
71 | def finish
72 | @pbar.finish
73 | end
74 | end
75 | end
76 | end
77 | end
78 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/secrets.rb:
--------------------------------------------------------------------------------
1 | require 'active_support/core_ext/hash/keys'
2 |
3 | module Chronicle
4 | module ETL
5 | # Secret management module
6 | module Secrets
7 | module_function
8 |
9 | # Whether a given namespace exists
10 | def exists?(namespace)
11 | Chronicle::ETL::Config.exists?('secrets', namespace)
12 | end
13 |
14 | # Save a setting to a namespaced config file
15 | def set(namespace, key, value)
16 | config = read(namespace)
17 | config[key.to_sym] = value
18 | write(namespace, config)
19 | end
20 |
21 | # Save a hash to a secrets namespace
22 | def set_all(namespace, secrets)
23 | config = read(namespace)
24 | config = config.merge(secrets.deep_stringify_keys)
25 | write(namespace, config)
26 | end
27 |
28 | # Remove a setting from a namespaced config file
29 | def unset(namespace, key)
30 | config = read(namespace)
31 | config.delete(key.to_sym)
32 | write(namespace, config)
33 | end
34 |
35 | # Retrieve all secrets from all namespaces
36 | def all(namespace = nil)
37 | namespaces = namespace.nil? ? available_secrets : [namespace]
38 | namespaces
39 | .to_h { |namespace| [namespace.to_sym, read(namespace)] }
40 | .delete_if { |_, v| v.empty? }
41 | end
42 |
43 | # Return whether a namespace name is valid (lowercase alphanumeric and -)
44 | def valid_namespace_name?(namespace)
45 | namespace.match(/^[a-z0-9\-]+$/)
46 | end
47 |
48 | # Read secrets from a config file
49 | def read(namespace)
50 | definition = Chronicle::ETL::Config.load('secrets', namespace)
51 | definition[:secrets] || {}
52 | end
53 |
54 | # Write secrets to a config file
55 | def write(namespace, secrets)
56 | data = {
57 | secrets: (secrets || {}).transform_keys(&:to_s),
58 | chronicle_etl_version: Chronicle::ETL::VERSION
59 | }
60 | Chronicle::ETL::Config.write('secrets', namespace, data)
61 | end
62 |
63 | # Which config files are available in ~/.config/chronicle/etl/secrets
64 | def available_secrets
65 | Chronicle::ETL::Config.available_configs('secrets')
66 | end
67 | end
68 | end
69 | end
70 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/extractor.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'chronicle/etl'
4 |
5 | module Chronicle
6 | module ETL
7 | # Abstract class representing an Extractor for an ETL job
8 | class Extractor
9 | extend Chronicle::ETL::Registry::SelfRegistering
10 | include Chronicle::ETL::Configurable
11 |
12 | setting :since, type: :time
13 | setting :until, type: :time
14 | setting :limit, type: :numeric
15 | setting :load_after_id
16 | setting :input
17 |
18 | # Construct a new instance of this extractor. Options are passed in from a Runner
19 | # == Parameters:
20 | # options::
21 | # Options for configuring this Extractor
22 | def initialize(options = {})
23 | apply_options(options)
24 | end
25 |
26 | # Hook called before #extract. Useful for gathering data, initializing proxies, etc
27 | def prepare; end
28 |
29 | # An optional method to calculate how many records there are to extract. Used primarily for
30 | # building the progress bar
31 | def results_count; end
32 |
33 | # Entrypoint for this Extractor. Called by a Runner. Expects a series of records to be yielded
34 | def extract
35 | raise NotImplementedError
36 | end
37 |
38 | protected
39 |
40 | def build_extraction(data:, meta: nil, source: nil, type: nil, strategy: nil)
41 | Extraction.new(
42 | extractor: self.class,
43 | data: data,
44 | meta: meta,
45 | source: source || self.class.connector_registration.source,
46 | type: type || self.class.connector_registration.type,
47 | strategy: strategy || self.class.connector_registration.strategy
48 | )
49 | end
50 |
51 | # TODO: reimplement this
52 | # def handle_continuation
53 | # return unless @config.continuation
54 |
55 | # @config.since = @config.continuation.highest_timestamp if @config.continuation.highest_timestamp
56 | # @config.load_after_id = @config.continuation.last_id if @config.continuation.last_id
57 | # end
58 | end
59 | end
60 | end
61 |
62 | require_relative 'helpers/input_reader'
63 | require_relative 'csv_extractor'
64 | require_relative 'file_extractor'
65 | require_relative 'json_extractor'
66 | require_relative 'stdin_extractor'
67 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/extractors/helpers/input_reader.rb:
--------------------------------------------------------------------------------
1 | require 'pathname'
2 |
3 | module Chronicle
4 | module ETL
5 | module Extractors
6 | module Helpers
7 | module InputReader
8 | # Return an array of input filenames; converts a single string
9 | # to an array if necessary
10 | def filenames
11 | [@config.input].flatten.map
12 | end
13 |
14 | # Filenames as an array of pathnames
15 | def pathnames
16 | filenames.map { |filename| Pathname.new(filename) }
17 | end
18 |
19 | # Whether we're reading from files
20 | def read_from_files?
21 | filenames.any?
22 | end
23 |
24 | # Whether we're reading input from stdin
25 | def read_from_stdin?
26 | !read_from_files? && $stdin.stat.pipe?
27 | end
28 |
29 | # Read input sources and yield each content
30 | def read_input
31 | if read_from_files?
32 | pathnames.each do |pathname|
33 | File.open(pathname) do |file|
34 | yield file.read, pathname.to_path
35 | end
36 | end
37 | elsif read_from_stdin?
38 | yield $stdin.read, $stdin
39 | else
40 | raise ExtractionError, 'No input files or stdin provided'
41 | end
42 | end
43 |
44 | # Read input sources line by line
45 | def read_input_as_lines(&block)
46 | if read_from_files?
47 | lines_from_files(&block)
48 | elsif read_from_stdin?
49 | lines_from_stdin(&block)
50 | else
51 | raise ExtractionError, 'No input files or stdin provided'
52 | end
53 | end
54 |
55 | private
56 |
57 | def lines_from_files(&block)
58 | pathnames.each do |pathname|
59 | File.open(pathname) do |file|
60 | lines_from_io(file, &block)
61 | end
62 | end
63 | end
64 |
65 | def lines_from_stdin(&block)
66 | lines_from_io($stdin, &block)
67 | end
68 |
69 | def lines_from_io(io, &block)
70 | io.each_line(&block)
71 | end
72 | end
73 | end
74 | end
75 | end
76 | end
77 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/job_logger.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 | require 'sequel'
3 | require 'xdg'
4 |
5 | module Chronicle
6 | module ETL
7 | # Saves JobLogs to db and loads previous ones
8 | class JobLogger
9 | extend Forwardable
10 |
11 | def_delegators :@job_log, :start, :finish, :error, :log_transformation, :duration, :success
12 | attr_accessor :job_log
13 |
14 | # For a given `job_id`, return the last successful log
15 | def self.load_latest(_job_id)
16 | with_db_connection do |db|
17 | attrs = db[:job_logs].reverse_order(:finished_at).where(success: true).first
18 | JobLog.build_from_serialized(attrs) if attrs
19 | end
20 | end
21 |
22 | def self.with_db_connection
23 | initialize_db unless db_exists?
24 | Sequel.connect("sqlite://#{db_filename}") do |db|
25 | initialize_schema(db) unless schema_exists?(db)
26 | yield db
27 | end
28 | end
29 |
30 | def self.db_exists?
31 | File.exist?(db_filename)
32 | end
33 |
34 | def self.schema_exists?(db)
35 | db.tables.include? :job_logs
36 | end
37 |
38 | def self.db_filename
39 | base = Pathname.new(XDG::Data.new.home)
40 | base.join('job_log.db')
41 | end
42 |
43 | def self.initialize_db
44 | FileUtils.mkdir_p(File.dirname(db_filename))
45 | end
46 |
47 | def self.initialize_schema(db)
48 | db.create_table :job_logs do
49 | primary_key :id
50 | String :job_id, null: false
51 | String :last_id
52 | Time :highest_timestamp
53 | Integer :num_records_processed
54 | boolean :success, default: false
55 | Time :started_at
56 | Time :finished_at
57 | end
58 | end
59 |
60 | # Create a new JobLogger
61 | def initialize(job)
62 | @job_log = JobLog.new do |job_log|
63 | job_log.job = job
64 | end
65 | end
66 |
67 | # Save this JobLogger's JobLog to db
68 | def save
69 | return unless @job_log.save_log?
70 |
71 | JobLogger.with_db_connection do |db|
72 | dataset = db[:job_logs]
73 | dataset.insert(@job_log.serialize)
74 | end
75 | end
76 |
77 | def summarize
78 | @job_log.inspect
79 | end
80 | end
81 | end
82 | end
83 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/connectors_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::CLI::Connectors do
4 | describe '#list' do
5 | it 'lists installed connectors' do
6 | expected_klasses = Chronicle::ETL::Registry::Connectors.connectors.map(&:klass_name)
7 |
8 | stdout, = invoke_cli(%w[connectors:list])
9 | outputted_klasses = stdout
10 | .split("\n")
11 | .drop(1) # ignore the ascii table header
12 | .map { |k| k.match(/(Chronicle::(\w+)::(\w+))/)&.captures&.first } # parse out the connector classes
13 | .compact
14 |
15 | expect(expected_klasses).to match_array(outputted_klasses)
16 | end
17 | end
18 |
19 | describe '#show' do
20 | context 'with a a bad phase type' do
21 | it 'will exit with an error' do
22 | expect do
23 | invoke_cli(%w[connectors:show transmorpher foo], false)
24 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
25 | end
26 |
27 | it 'will show an error message' do
28 | _, stderr = invoke_cli(%w[connectors:show transmorpher foo])
29 | expect(stderr.split("\n").map(&:uncolorize).first).to match(/must be one of/)
30 | end
31 | end
32 |
33 | context 'for a connector that does not exist' do
34 | it 'will exit with an error' do
35 | expect do
36 | invoke_cli(%w[connectors:show extractor foo], false)
37 | end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
38 | end
39 |
40 | it 'will show an error' do
41 | _, stderr = invoke_cli(%w[connectors:show extractor unknown])
42 | # puts stderr
43 | expect(stderr).to match(/Could not find/)
44 | end
45 | end
46 |
47 | context 'for a connector that exists' do
48 | it 'can show basic information a connector' do
49 | output = invoke_cli(%w[connectors:show extractor csv]).first.split("\n").map(&:uncolorize)
50 | expect(output.first).to eql('Chronicle::ETL::CSVExtractor')
51 | end
52 | end
53 | end
54 |
55 | describe '#help' do
56 | it 'outputs help for connectors' do
57 | expect(invoke_cli(%w[connectors help]).first).to match(/COMMANDS/)
58 | end
59 |
60 | it 'outputs help for a connector subcommand' do
61 | expect(invoke_cli(%w[connectors help list]).first).to match(/Usage:/)
62 | end
63 | end
64 | end
65 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/table_loader.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tty/table'
4 | require 'chronicle/utils/hash_utils'
5 | require 'active_support/core_ext/string/filters'
6 | require 'active_support/core_ext/hash/reverse_merge'
7 |
8 | module Chronicle
9 | module ETL
10 | class TableLoader < Chronicle::ETL::Loader
11 |
12 | register_connector do |r|
13 | r.identifier = :table
14 | r.description = 'an ASCII table'
15 | end
16 |
17 | setting :truncate_values_at, default: 40
18 | setting :table_renderer, default: :basic
19 | setting :fields_exclude, default: ['type']
20 | setting :header_row, default: true
21 |
22 | def load(record)
23 | records << record
24 | end
25 |
26 | def finish
27 | return if records.empty?
28 |
29 | headers = gather_headers(records)
30 | rows = build_rows(records, headers)
31 |
32 | render_table(headers, rows)
33 | end
34 |
35 | def records
36 | @records ||= []
37 | end
38 |
39 | private
40 |
41 | def render_table(headers, rows)
42 | @table = TTY::Table.new(header: (headers if @config.header_row), rows: rows)
43 | puts @table.render(
44 | @config.table_renderer.to_sym,
45 | padding: [0, 2, 0, 0]
46 | )
47 | rescue TTY::Table::ResizeError
48 | # The library throws this error before trying to render the table
49 | # vertically. These options seem to work.
50 | puts @table.render(
51 | @config.table_renderer.to_sym,
52 | padding: [0, 2, 0, 0],
53 | width: 10_000,
54 | resize: false
55 | )
56 | end
57 |
58 | def gather_headers(records)
59 | records_flattened = records.map do |record|
60 | Chronicle::Utils::HashUtils.flatten_hash(record.to_h)
61 | end
62 | records_flattened.flat_map(&:keys).uniq
63 | end
64 |
65 | def build_rows(records, headers)
66 | records.map do |record|
67 | values = Chronicle::Utils::HashUtils.flatten_hash(record.to_h)
68 | .values_at(*headers)
69 | .map { |value| force_utf8(value.to_s) }
70 |
71 | values = values.map { |value| value.truncate(@config.truncate_values_at) } if @config.truncate_values_at
72 |
73 | values
74 | end
75 | end
76 | end
77 | end
78 | end
79 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/secrets.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tty-prompt'
4 |
5 | module Chronicle
6 | module ETL
7 | module CLI
8 | # CLI commands for working with ETL secrets
9 | class Secrets < SubcommandBase
10 | default_task 'list'
11 | namespace :secrets
12 |
13 | desc 'set NAMESPACE KEY [VALUE]', 'Add a secret. VALUE can be set as argument or from stdin'
14 | def set(namespace, key, value = nil)
15 | validate_namespace(namespace)
16 |
17 | if value
18 | # came as argument
19 | elsif $stdin.respond_to?(:stat) && $stdin.stat.pipe?
20 | value = $stdin.read
21 | else
22 | prompt = TTY::Prompt.new
23 | value = prompt.mask("Please enter #{key} for #{namespace}:")
24 | end
25 |
26 | Chronicle::ETL::Secrets.set(namespace, key, value.strip)
27 | cli_exit(message: 'Secret set')
28 | rescue TTY::Reader::InputInterrupt
29 | cli_fail(message: "\nSecret not set")
30 | end
31 |
32 | desc 'unset NAMESPACE KEY', 'Remove a secret'
33 | def unset(namespace, key)
34 | validate_namespace(namespace)
35 |
36 | Chronicle::ETL::Secrets.unset(namespace, key)
37 | cli_exit(message: 'Secret unset')
38 | end
39 |
40 | desc 'list', 'List available secrets'
41 | def list(namespace = nil)
42 | all_secrets = Chronicle::ETL::Secrets.all(namespace)
43 | cli_exit(message: 'No secrets are stored') unless all_secrets.any?
44 |
45 | rows = []
46 | all_secrets.each do |namespace, secrets|
47 | rows += secrets.map do |key, value|
48 | # hidden_value = (value[0..5] + ("*" * [0, [value.length - 5, 30].min].max)).truncate(30)
49 | truncated_value = value&.truncate(30)
50 | [namespace, key, truncated_value]
51 | end
52 | end
53 |
54 | headers = %w[namespace key value].map { |h| h.upcase.bold }
55 |
56 | puts 'Available secrets:'
57 | table = TTY::Table.new(headers, rows)
58 | puts table.render(indent: 0, padding: [0, 2])
59 | end
60 |
61 | private
62 |
63 | def validate_namespace(namespace)
64 | return if Chronicle::ETL::Secrets.valid_namespace_name?(namespace)
65 |
66 | cli_fail(message: "'#{namespace}' is not a valid namespace")
67 | end
68 | end
69 | end
70 | end
71 | end
72 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/jobs_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | RSpec.describe Chronicle::ETL::CLI::Jobs do
4 | let(:csv_filename) { 'spec/support/sample_data/test.csv' }
5 | let(:csv_job_args) do
6 | %w[
7 | --extractor csv
8 | --log-level fatal
9 | --extractor-opts
10 | ] << "input:#{csv_filename}"
11 | end
12 |
13 | describe 'chronicle-etl jobs:run' do
14 | it 'run a simple job' do
15 | file_record_count = File.read(csv_filename).each_line.count - 1
16 |
17 | args = ['jobs:run'] << csv_job_args
18 | output, = invoke_cli(args)
19 |
20 | # jsonl output
21 | expect(output.split("\n").count).to eql(file_record_count)
22 | end
23 |
24 | context 'for jobs with required plugins not installed' do
25 | include_context 'mocked stdin'
26 |
27 | it 'will prompt to install plugin' do
28 | args = %w[jobs:run -e unknown:extractor --log-level fatal]
29 | load_stdin('n')
30 | output, = invoke_cli(args)
31 | expect(output).to match(/want to install/)
32 | end
33 | end
34 | end
35 |
36 | describe 'chronicle-etl jobs:show' do
37 | it 'shows details about a simple job' do
38 | args = ['jobs:show'] << csv_job_args
39 | output, = invoke_cli(args)
40 |
41 | expect(output).to match(/Extracting from/)
42 | expect(output).to match(/Transforming/)
43 | expect(output).to match(/Loading/)
44 | # TODO: do more precise matching based on job
45 | end
46 | end
47 |
48 | describe 'chronicle-etl jobs:edit' do
49 | it 'launches an editor' do
50 | # TODO
51 | end
52 | end
53 |
54 | describe 'chronicle-etl jobs:save' do
55 | include_context 'mocked config directory'
56 |
57 | it 'can save a job file' do
58 | args = %w[jobs:save test-job]
59 | expect { invoke_cli(args) }
60 | .to change { Chronicle::ETL::Config.available_jobs.count }
61 | .by(1)
62 | end
63 | end
64 |
65 | describe 'chronicle-etl jobs:list' do
66 | include_context 'mocked config directory'
67 |
68 | it 'lists available jobs' do
69 | output, = invoke_cli(%w[jobs list])
70 | expect(output.split("\n").last).to match('^ command')
71 | end
72 | end
73 |
describe 'chronicle-etl jobs help' do
  it 'outputs help for jobs' do
    stdout, = invoke_cli(%w[jobs help])

    expect(stdout).to match(/COMMANDS/)
  end

  it 'outputs help for a job subcommand' do
    stdout, = invoke_cli(%w[jobs help show])

    expect(stdout).to match(/Usage:/)
  end
end
83 | end
84 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/connectors.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Chronicle
4 | module ETL
5 | module CLI
6 | # CLI commands for working with ETL connectors
7 | #
8 | # @todo make this work with new plugin system (i.e. no loading of all plugins)
9 | class Connectors < SubcommandBase
10 | default_task 'list'
11 | namespace :connectors
12 |
13 | desc 'list', 'Lists available connectors'
# Display all available connectors that chronicle-etl has access to.
#
# Prints a table with one row per registered connector, ordered by core
# flag, source, phase and identifier. With no registered connectors, only
# the header row is rendered.
def list
  # Column order of the table; must match the key order of the row hashes below
  columns = %i[identifier phase description source core class]

  connector_info = Chronicle::ETL::Registry::Connectors.connectors.map do |connector_registration|
    {
      identifier: connector_registration.identifier,
      phase: connector_registration.phase,
      description: connector_registration.descriptive_phrase,
      source: connector_registration.source,
      core: connector_registration.built_in? ? '✓' : '',
      class: connector_registration.klass_name
    }
  end

  # Sort on :source (the rows have no :provider key). Keys are stringified so
  # that nil or mixed String/Symbol values cannot make the comparison fail.
  connector_info = connector_info.sort_by do |row|
    [row[:core], row[:source], row[:phase], row[:identifier]].map(&:to_s)
  end

  # Headers come from the fixed column list rather than from the first row,
  # so an empty connector list does not raise.
  headers = columns.map { |key| key.to_s.upcase.bold }

  table = TTY::Table.new(headers, connector_info.map(&:values))
  puts table.render(indent: 0, padding: [0, 2])
end
38 |
39 | desc 'show PHASE IDENTIFIER', 'Show information about a connector'
# Print the class, description and settings table of a single connector.
#
# @param phase [String] one of 'extractor', 'transformer', 'loader'
# @param identifier [String] identifier of the connector within that phase
def show(phase, identifier)
  known_phases = %w[extractor transformer loader]
  unless known_phases.include?(phase)
    cli_fail(message: 'Phase argument must be one of: [extractor, transformer, loader]')
  end

  begin
    connector = Chronicle::ETL::Registry::Connectors.find_by_phase_and_identifier(phase.to_sym, identifier)
  rescue Chronicle::ETL::ConnectorNotAvailableError, Chronicle::ETL::PluginError => e
    cli_fail(message: "Could not find #{phase} #{identifier}", exception: e)
  end

  puts connector.klass.to_s.bold
  puts " #{connector.descriptive_phrase}"
  puts
  puts 'Settings:'

  # One row per setting declared on the connector class
  rows = connector.klass.settings.map do |name, setting|
    required_label = setting.required ? 'yes' : 'no'
    [name, setting.default, required_label]
  end
  column_titles = %w[NAME DEFAULT REQUIRED].map(&:bold)

  puts TTY::Table.new(column_titles, rows).render(indent: 0, padding: [0, 2])
end
68 | end
69 | end
70 | end
71 | end
72 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/plugins.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tty-prompt'
4 | require 'tty-spinner'
5 |
6 | module Chronicle
7 | module ETL
8 | module CLI
9 | # CLI commands for working with ETL plugins
10 | class Plugins < SubcommandBase
11 | default_task 'list'
12 | namespace :plugins
13 |
14 | desc 'install', 'Install a plugin'
# Install one or more plugins, skipping those that are already installed.
#
# Fails the CLI when no plugin name is given or when an installation raises
# a PluginError; exits early when every requested plugin is already present.
#
# @param plugins [Array<String>] names of the plugins to install
def install(*plugins)
  cli_fail(message: 'Please specify a plugin to install') unless plugins.any?

  installed, not_installed = plugins.partition do |plugin|
    Chronicle::ETL::Registry::Plugins.installed?(plugin)
  end

  puts "Already installed: #{installed.join(', ')}" if installed.any?
  cli_exit unless not_installed.any?

  # The spinner message needs a :title token; without it the per-plugin
  # spinner.update(title: ...) calls below have no visible effect.
  summary = "Installing #{not_installed.join(', ')}..."
  spinner = TTY::Spinner.new('[:spinner] :title', format: :dots_2)
  spinner.update(title: summary)
  spinner.auto_spin

  not_installed.each do |plugin|
    spinner.update(title: "Installing #{plugin}...")
    Chronicle::ETL::Registry::Plugins.install(plugin)
  rescue Chronicle::ETL::PluginError => e
    spinner.error('Error'.red)
    cli_fail(message: "Plugin '#{plugin}' could not be installed", exception: e)
  end

  # Restore the full list so the final line summarises everything installed
  spinner.update(title: summary)
  spinner.success("(#{'successful'.green})")
end
38 |
desc 'uninstall', 'Uninstall a plugin'
# Uninstall a single plugin, showing a spinner while the registry works.
# Fails the CLI with an explanatory message if the registry raises a
# PluginError.
#
# @param name [String] name of the plugin to remove
def uninstall(name)
  spinner = TTY::Spinner.new("[:spinner] Uninstalling plugin #{name}...", format: :dots_2)
  spinner.auto_spin
  Chronicle::ETL::Registry::Plugins.uninstall(name)
  spinner.success("(#{'successful'.green})")
rescue Chronicle::ETL::PluginError => e
  spinner.error('Error'.red)
  cli_fail(message: "Plugin '#{name}' could not be uninstalled (was it installed?)", exception: e)
end
49 |
50 | desc 'list', 'Lists available plugins'
# Print a table of every plugin known to the plugin registry, with its
# description, an installed marker and its version.
def list
  rows = Chronicle::ETL::Registry::Plugins.all.map do |plugin|
    [
      plugin.name,
      plugin.description,
      plugin.installed ? '✓' : '',
      plugin.version
    ]
  end

  column_titles = %w[NAME DESCRIPTION INSTALLED VERSION].map(&:bold)
  table = TTY::Table.new(column_titles, rows)

  puts 'Available plugins:'
  puts table.render(indent: 2, padding: [0, 0], alignments: %i[left left center left])
end
72 | end
73 | end
74 | end
75 | end
76 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/config.rb:
--------------------------------------------------------------------------------
1 | require 'active_support/core_ext/hash/keys'
2 | require 'fileutils'
3 | require 'yaml'
4 |
5 | module Chronicle
6 | module ETL
7 | # Utility methods to read, write, and access config files
8 | module Config
9 | extend self
10 |
11 | attr_accessor :xdg_environment
12 |
# Reads the YAML config file for a given type and identifier.
#
# @param type [String] config subdirectory name (e.g. 'jobs')
# @param identifier [String] basename of the config file, without extension
# @return [Hash] parsed contents with symbolized keys; an empty hash when
#   the file is missing or contains no YAML document
def load(type, identifier)
  path = config_pathname_for_type(type).join("#{identifier}.yml")
  return {} unless path.exist?

  # An empty file parses to nil (false on older Psych versions). Fall back to
  # an empty hash so callers such as read_job can index into the result.
  YAML.safe_load_file(path, symbolize_names: true, permitted_classes: [Symbol, Date, Time]) || {}
end
20 |
# Writes a hash as a yml config file, creating the directory if needed.
# The file is opened with mode 0600.
#
# @param type [String] config subdirectory name (e.g. 'jobs')
# @param identifier [String] basename of the config file, without extension
# @param data [Hash] contents to write; the caller's hash is left unmodified
def write(type, identifier, data)
  path = config_pathname_for_type(type).join("#{identifier}.yml")

  # Stringify keys on a copy instead of mutating the caller's hash in place
  serializable = data.deep_stringify_keys

  FileUtils.mkdir_p(File.dirname(path))
  File.open(path, 'w', 0o600) do |f|
    # Ruby likes to add --- separators when writing yaml files
    f << serializable.to_yaml.gsub(/^-+\n/, '')
  end
end
33 |
# Builds the location of the `<identifier>.yml` file inside the config
# directory for the given type. The file is not required to exist.
def path(type, identifier)
  config_pathname_for_type(type).join("#{identifier}.yml")
end
39 |
# True when a `<identifier>.yml` file is present in the config directory
# for the given type.
def exists?(type, identifier)
  config_pathname_for_type(type).join("#{identifier}.yml").exist?
end
46 |
# Names (file basenames without extension) of all job definitions found as
# *.yml files in the 'jobs' config directory.
def available_jobs
  job_files = Dir.glob(File.join(config_pathname_for_type('jobs'), '*.yml'))
  job_files.map { |job_file| File.basename(job_file, '.*') }
end
53 |
# Names (file basenames without extension) of all *.yml config files found
# in the config directory for the given type.
def available_configs(type)
  pattern = File.join(config_pathname_for_type(type), '*.yml')
  Dir.glob(pattern).map { |config_file| File.basename(config_file, '.*') }
end
60 |
# Loads a job definition from the 'jobs' config directory. When the
# definition carries no :name of its own, the job name is used.
def read_job(job_name)
  load('jobs', job_name).tap do |job_definition|
    job_definition[:name] ||= job_name
  end
end
67 |
# Root of chronicle-etl's config tree: `chronicle/etl` beneath the XDG
# config home.
def config_pathname
  Pathname.new(xdg_config.config_home).join('chronicle', 'etl')
end
72 |
# Directory beneath the config root that holds configs of the given type.
def config_pathname_for_type(type)
  root = config_pathname
  root.join(type)
end
76 |
# Builds the XDG environment used to locate config files.
def xdg_config
  return XDG::Environment.new unless @xdg_environment

  # Only used for overriding ENV['HOME'] for XDG-related specs
  XDG::Environment.new(environment: @xdg_environment)
end
85 | end
86 | end
87 | end
88 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Chronicle
4 | module ETL
5 | # Abstract class representing an Transformer for an ETL job
6 | class Transformer
7 | extend Chronicle::ETL::Registry::SelfRegistering
8 | include Chronicle::ETL::Configurable
9 |
10 | attr_reader :stashed_records
11 |
# Build a new transformer and apply its configuration.
#
# @param options [Hash] settings for this Transformer, passed in from a Runner
def initialize(options = {})
  apply_options(options)
end
19 |
# Called once for each extracted record. Passes zero or more transformed
# records to the given block.
#
# A transformer can hand results back in two ways: by yielding data from
# #transform, or by returning data (a single value or an array) from it.
# Returned data is only used when nothing was yielded.
#
# @param record [Chronicle::ETL::Record] the record to transform
# @raise [ArgumentError] if record is not a Record
def call(record, &block)
  raise ArgumentError, 'Input must be a Chronicle::ETL::Record' unless record.is_a?(Record)

  did_yield = false

  returned = transform(record) do |data|
    block.call(update_data(record, data))

    did_yield = true
  end

  return if did_yield

  # Nothing was yielded, so treat the return value as the transformed data.
  # Nil entries are skipped.
  [returned].flatten.compact.each do |data|
    block.call(update_data(record, data))
  end
end
42 |
# Runs #finish and passes each record it returns to the given block.
# Does nothing when #finish returns nil.
def call_finish(&block)
  leftovers = finish
  return if leftovers.nil?

  leftovers.each { |leftover| block.call(leftover) }
end
51 |
# Transforms a single record. Subclasses must override this; the base
# implementation always raises.
#
# @raise [NotImplementedError] always
def transform(_record)
  raise(NotImplementedError, 'You must implement the transform method')
end
55 |
# Hook called once after the runner has processed all records.
# The default implementation does nothing and returns nil.
def finish
  nil
end
58 |
59 | protected
60 |
# Appends a record to the stash, creating the stash on first use.
# Always returns nil.
def stash_record(record)
  (@stashed_records ||= []) << record
  nil
end
66 |
# Empties the stash and returns the records it held.
#
# The stash is swapped for a fresh array instead of being cleared in place:
# clearing in place would also empty the array being returned.
#
# @return [Array] the previously stashed records (empty if none were stashed)
def flush_stashed_records
  flushed = @stashed_records || []
  @stashed_records = []
  flushed
end
70 |
# Returns a clone of the record whose data is replaced by new_data.
# The original record is left untouched.
def update_data(record, new_data)
  record.clone.tap { |copy| copy.data = new_data }
end
76 | end
77 | end
78 | end
79 |
80 | require_relative 'null_transformer'
81 | require_relative 'sampler_transformer'
82 | require_relative 'buffer_transformer'
83 | require_relative 'multiply_transformer'
84 | require_relative 'sort_transformer'
85 | require_relative 'chronicle_transformer'
86 | require_relative 'format_transformer'
87 | require_relative 'filter_fields_transformer'
88 | require_relative 'fields_limit_transformer'
89 | require_relative 'merge_meta_transformer'
90 | require_relative 'filter_transformer'
91 | require_relative 'chronobase_transformer'
92 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/authorizations.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'sinatra'
4 | require 'launchy'
5 |
6 | module Chronicle
7 | module ETL
8 | module CLI
9 | # CLI commands for authorizing chronicle-etl with third-party services
10 | class Authorizations < SubcommandBase
11 | default_task 'new'
12 | namespace :authorizations
13 |
# The usage string names the actual command (`new`) and its required argument
desc 'new PROVIDER', 'Authorize with a third-party provider'
option :port, desc: 'Port to run authorization server on', type: :numeric, default: 4567
option :credentials, desc: 'Secrets namespace for where to read credentials from (default: PROVIDER)',
                     type: :string, banner: 'NAMESPACE'
option :secrets, desc: 'Secrets namespace for where authorization should be saved to (default: PROVIDER)',
                 type: :string, banner: 'NAMESPACE'
option :print, desc: 'Show authorization results (instead of just saving secrets)', type: :boolean,
               default: false
# Runs the authorization flow for a provider and saves the resulting secrets.
#
# Credentials are read from the --credentials namespace (default: the
# provider name) and the secrets returned by the authorizer are stored under
# the --secrets namespace (default: the provider name). Any StandardError is
# reported through cli_fail.
#
# @param provider [String] name of the provider to authorize with
def new(provider)
  authorizer_klass = find_authorizer_klass(provider)
  credentials = load_credentials(provider: provider, credentials_source: options[:credentials])
  authorizer = authorizer_klass.new(port: options[:port], credentials: credentials)

  secrets = authorizer.authorize!
  secrets_namespace = options[:secrets] || provider
  Chronicle::ETL::Secrets.set_all(secrets_namespace, secrets)

  pp secrets if options[:print]

  cli_exit(message: "Authorization saved to '#{secrets_namespace}' secrets")
rescue StandardError => e
  cli_fail(message: "Authorization not successful.\n#{e.message}", exception: e)
end
37 |
38 | private
39 |
# Resolves the authorizer class for a provider. Fails the CLI when the
# plugin is not installed, cannot be activated, or offers no authorizer.
def find_authorizer_klass(provider)
  # TODO: this assumes provider:plugin one-to-one
  plugin_installed = Chronicle::ETL::Registry::Plugins.installed?(provider)
  cli_fail(message: "Plugin for #{provider} is not installed.") unless plugin_installed

  begin
    Chronicle::ETL::Registry::Plugins.activate(provider)
  rescue PluginError => e
    cli_fail(message: "Could not load plugin '#{provider}'.\n#{e.message}", exception: e)
  end

  authorizer_klass = Authorizer.find_by_provider(provider.to_sym)
  authorizer_klass || cli_fail(message: "No authorizer available for '#{provider}'")
end
54 |
# Reads credentials from the secrets store. An explicitly requested
# namespace must exist; otherwise the provider's own namespace is read.
def load_credentials(provider:, credentials_source: nil)
  source_missing = credentials_source && !Chronicle::ETL::Secrets.exists?(credentials_source)
  if source_missing
    cli_fail(message: "OAuth credentials specified as '#{credentials_source}' but a secrets namespace with that name does not exist.")
  end

  namespace = credentials_source || provider
  Chronicle::ETL::Secrets.read(namespace)
end
62 | end
63 | end
64 | end
65 | end
66 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/job_log.rb:
--------------------------------------------------------------------------------
1 | require 'forwardable'
2 |
3 | module Chronicle
4 | module ETL
5 | # A record of what happened in the running of a job. We're interested in
6 | # tracking when it ran, if it was successful, and what the latest record
7 | # we found is (to use as a cursor for the next time)
class JobLog
  extend Forwardable

  # :job has a custom writer below (it also records the job's id), so only
  # its reader is generated here; this avoids defining job= twice.
  attr_reader :job

  attr_accessor :job_id,
                :last_id,
                :highest_timestamp,
                :num_records_processed,
                :started_at,
                :finished_at,
                :success

  def_delegators :@job, :save_log?

  # Create a new JobLog with zero processed records and success unset.
  # Yields the new instance when a block is given so it can be configured.
  def initialize
    @num_records_processed = 0
    @success = false
    yield self if block_given?
  end

  # Log the result of a single transformation in a job
  # @param _transformer [Chronicle::ETL::Transformer] The transformer that ran (currently unused)
  def log_transformation(_transformer)
    # @last_id = transformer.id if transformer.id

    # Save the highest timestamp that we've encountered so far
    # @highest_timestamp = [transformer.timestamp, @highest_timestamp].compact.max if transformer.timestamp

    # TODO: a transformer might yield nil. We might also want certain transformers to explode
    # records into multiple new ones. Therefore, this variable will need more subtle behaviour
    @num_records_processed += 1
  end

  # Indicate that a job has started
  def start
    @started_at = Time.now
  end

  # Indicate that a job has finished successfully
  def finish
    @finished_at = Time.now
    @success = true
  end

  # Indicate that a job has stopped because of an error. Only the finish
  # time is recorded; the success flag is left as it was.
  def error
    @finished_at = Time.now
  end

  # Attach a job to this log and remember its id
  def job=(job)
    @job = job
    @job_id = job.id
  end

  # @return [Float, nil] seconds between start and finish, or nil if either
  #   timestamp is missing
  def duration
    return unless @finished_at && @started_at

    @finished_at - @started_at
  end

  # Take a JobLog's instance variables and turn them into a hash representation
  def serialize
    {
      job_id: @job_id,
      last_id: @last_id,
      highest_timestamp: @highest_timestamp,
      num_records_processed: @num_records_processed,
      started_at: @started_at,
      finished_at: @finished_at,
      success: @success
    }
  end

  # Create a new JobLog and set its instance variables from a serialized hash.
  #
  # The :id entry is skipped, and the caller's hash is not modified.
  # Only public setters are invoked.
  #
  # @param attrs [Hash] attribute names mapped to values
  # @return [JobLog]
  def self.build_from_serialized(attrs)
    new do |job_log|
      attrs.each do |key, value|
        next if key.to_sym == :id

        job_log.public_send(:"#{key}=", value)
      end
    end
  end
end
92 | end
93 | end
94 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/job.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'forwardable'
4 |
5 | module Chronicle
6 | module ETL
7 | # A runner job
8 | #
9 | # TODO: this can probably be merged with JobDefinition. Not clear
10 | # where the boundaries are
class Job
  extend Forwardable

  def_delegators :@job_definition, :dry_run?

  attr_accessor :name,
                :extractor_klass,
                :extractor_options,
                :transformer_klasses,
                :transformer_options,
                :loader_klass,
                :loader_options,
                :job_definition

  # TODO: build a proper id system
  alias id name

  # Build a job from a job definition. The name and the per-phase options are
  # copied from the definition; for incremental definitions the latest
  # continuation is loaded into the extractor options.
  # Yields the new job when a block is given.
  #
  # @param job_definition [Object] must respond to definition,
  #   extractor_options, transformer_options, loader_options and incremental?
  def initialize(job_definition)
    @job_definition = job_definition
    @name = @job_definition.definition[:name]
    @extractor_options = @job_definition.extractor_options
    @transformer_options = @job_definition.transformer_options
    @loader_options = @job_definition.loader_options

    set_continuation if use_continuation?
    yield self if block_given?
  end

  # Instantiate the definition's extractor class with the extractor options
  def instantiate_extractor
    @extractor_klass = @job_definition.extractor_klass
    @extractor_klass.new(@extractor_options)
  end

  # Instantiate every transformer class of the definition, pairing each with
  # the options at the same index (or an empty hash if there are none)
  def instantiate_transformers
    @job_definition.transformer_klasses.each_with_index.map do |klass, i|
      klass.new(@transformer_options[i] || {})
    end
  end

  # Instantiate the definition's loader class with the loader options
  def instantiate_loader
    @loader_klass = @job_definition.loader_klass
    @loader_klass.new(@loader_options)
  end

  # Whether a log should be saved for this job: true when the job has an id
  def save_log?
    # TODO: this needs more nuance
    !id.nil?
  end

  # Human-readable summary of the job: its name (if any) and the extractor,
  # transformers and loader it uses
  def to_s
    output = "Job summary\n".upcase.bold
    output += "#{name}:\n" if name
    output += "→ Extracting from #{@job_definition.extractor_klass.description}\n"
    output += options_to_s(@extractor_options)

    @job_definition.transformer_klasses.each do |klass|
      output += "→ Transforming #{klass.description}\n"
    end
    # TODO: transformer options
    output += "→ Loading to #{@job_definition.loader_klass.description}\n"
    output += options_to_s(@loader_options)
    output
  end

  private

  # Render an options hash as one indented "key: value" line per entry
  def options_to_s(options, indent: 4)
    output = ''
    options.each do |k, v|
      output += "#{' ' * indent}#{k.to_s.light_blue}: #{v}\n"
    end
    output
  end

  # Load the latest continuation for this job into the extractor options.
  # The lookup uses the `id` reader: there is no @id instance variable
  # (id is an alias of name), so reading @id would always pass nil.
  def set_continuation
    continuation = Chronicle::ETL::JobLogger.load_latest(id)
    @extractor_options[:continuation] = continuation
  end

  # Continuations are only used for incremental job definitions
  def use_continuation?
    @job_definition.incremental?
  end
end
95 | end
96 | end
97 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/transformers/chronobase_transformer.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | module Chronicle
4 | module ETL
5 | class ChronobaseTransformer < Chronicle::ETL::Transformer
6 | PROPERTY_MAP = {
7 | source: :provider,
8 | source_id: :provider_id,
9 | url: :provider_url,
10 | end_time: :end_at,
11 | start_time: :start_at,
12 |
13 | name: :title,
14 | description: :body,
15 | text: :body,
16 |
17 | recipient: :consumers,
18 | agent: :actor,
19 | object: :involved,
20 |
21 | # music ones
22 | by_artist: :creators,
23 | in_album: :containers
24 | }.freeze
25 |
26 | VERB_MAP = {
27 | ListenAction: 'listened',
28 | CommunicateAction: 'messaged'
29 | }.freeze
30 |
31 | ENTITY_MAP = {
32 | MusicRecording: 'song',
33 | MusicAlbum: 'album',
34 | MusicGroup: 'musicartist',
35 | Message: 'message',
36 | Person: 'person'
37 | }.freeze
38 |
39 | register_connector do |r|
40 | r.identifier = :chronobase
41 | r.description = 'records to chronobase schema'
42 | end
43 |
# Converts the data carried by a record into the chronobase schema.
def transform(record)
  payload = record.data
  deeply_convert_record(payload)
end
47 |
48 | private
49 |
# Recursively converts a model into a chronobase serialization record.
#
# Non-nil properties are renamed through PROPERTY_MAP, :verb / :represents
# are added when the model's type has an entry in VERB_MAP / ENTITY_MAP, and
# nested models (directly or inside arrays) are converted the same way.
def deeply_convert_record(record)
  type = activity?(record) ? 'activity' : 'entity'

  properties = {}
  record.properties.compact.each do |name, value|
    properties[PROPERTY_MAP[name.to_sym] || name] = value
  end

  type_key = record.type_id.to_sym
  properties[:verb] = VERB_MAP[type_key] if VERB_MAP.key?(type_key)
  properties[:represents] = ENTITY_MAP[record.type_id.to_sym] if ENTITY_MAP.key?(record.type_id.to_sym)

  properties.transform_values! do |value|
    if value.is_a?(Chronicle::Models::Base)
      deeply_convert_record(value)
    elsif value.is_a?(Array)
      value.map do |element|
        element.is_a?(Chronicle::Models::Base) ? deeply_convert_record(element) : element
      end
    else
      value
    end
  end

  Chronicle::Serialization::Record.new(
    id: record.id,
    type: type,
    properties: properties.compact,
    meta: { dedupe_on: transform_dedupe_on(record) },
    schema: 'chronobase'
  )
end
82 |
# A model counts as an activity when its type id ends with 'Action'.
def activity?(record)
  type_id = record.type_id
  type_id.end_with?('Action')
end
86 |
# Translates a model's dedupe_on field sets into chronobase property names.
# Each set becomes one comma-joined string; :type maps to :verb for
# activities and to :represents otherwise.
def transform_dedupe_on(record)
  type_property = activity?(record) ? :verb : :represents
  renames = PROPERTY_MAP.merge({ type: type_property })

  record.dedupe_on.map do |fields|
    fields.map { |field| renames[field] || field }.join(',')
  end
end
98 | end
99 | end
100 | end
101 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/loaders/json_loader.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tempfile'
4 |
5 | module Chronicle
6 | module ETL
7 | class JSONLoader < Chronicle::ETL::Loader
8 | include Chronicle::ETL::Loaders::Helpers::StdoutHelper
9 |
10 | register_connector do |r|
11 | r.identifier = :json
12 | r.description = 'json'
13 | end
14 |
15 | setting :output
16 |
17 | # If true, one JSON record per line. If false, output a single json
18 | # object with an array of records
19 | setting :line_separated, default: true, type: :boolean
20 |
# Forwards all arguments to the parent initializer, then marks that no
# record has been written yet.
def initialize(*args)
  super
  # Consulted by #load to decide whether a separating comma is needed
  @first_line = true
end
25 |
# Opens the output target: a temp file when writing to stdout, otherwise the
# configured output path. In array mode the opening bracket is written first.
def start
  @output_file = output_to_stdout? ? create_stdout_temp_file : File.open(@config.output, 'w+')

  @output_file.puts("[\n") unless @config.line_separated
end
36 |
# Serializes one record as JSON and writes it to the output file.
def load(record)
  # When dealing with raw data, we can get improperly encoded strings
  # (eg from sqlite database columns). We force conversion to UTF-8
  # before converting into JSON
  json = deeply_force_utf8(record.to_h).to_json

  chunk =
    if @config.line_separated
      # For line-separated output, we just put json + newline
      "#{json}\n"
    elsif @first_line
      # First element of the array opened in #start: no separator needed
      json
    else
      # Later elements are preceded by a comma and a newline
      ",\n#{json}"
    end

  @output_file.write(chunk)

  @first_line = false
end
66 |
# Finalizes the output: closes the JSON array when not in line-separated
# mode, copies the temp file to stdout when applicable, and closes the file.
#
# The file is closed in an ensure clause so that it is not leaked if writing
# the footer or copying to stdout raises.
def finish
  # Close the array unless we're doing line-separated JSON
  @output_file.puts("\n]") unless @config.line_separated

  write_to_stdout_from_temp_file(@output_file) if output_to_stdout?
  nil
ensure
  @output_file&.close
end
75 |
76 | private
77 |
# TODO: Move this to a helper module
#
# Recursively applies force_utf8 to every String found in a value.
#
# Hashes have their values converted, arrays have their elements converted,
# and both are walked to any depth, so hashes or arrays nested inside arrays
# are handled too. Non-String leaves are returned unchanged.
# NOTE(review): assumes force_utf8 leaves non-String values as they are, so
# skipping it for them changes nothing; confirm against its definition.
#
# @param value [Hash, Array, String, Object] value to convert
# @return [Object] a converted copy for hashes and arrays, the result of
#   force_utf8 for strings, and the value itself otherwise
def deeply_force_utf8(value)
  case value
  when Hash
    value.transform_values { |nested| deeply_force_utf8(nested) }
  when Array
    # FIXME: a top-level array probably shouldn't happen but it does
    value.map { |nested| deeply_force_utf8(nested) }
  when String
    force_utf8(value)
  else
    value
  end
end
97 | end
98 | end
99 | end
100 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/cli/authorizations_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
RSpec.describe Chronicle::ETL::CLI::Authorizations do
  describe '#new' do
    # Register the mock plugins for every example in this group. Several
    # contexts below (e.g. the 'error' and 'empty' plugin cases) need them to
    # be registered, so doing this in only one context made the examples
    # depend on execution order.
    before do
      %w[foo empty error].each do |plugin|
        path = File.expand_path(File.join(RSPEC_ROOT, "support/mock_plugins/chronicle-#{plugin}"))
        # Avoid growing $LOAD_PATH with duplicates on every example
        $LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
        Chronicle::ETL::Registry::Plugins.register_standalone(name: plugin)
      end
    end

    context 'with an available plugin' do
      it 'can authorize' do
        FakeFS.with_fresh do
          invoke_cli(%w[authorizations:new foo])

          expect(Chronicle::ETL::Secrets.read('foo')).to eql({ token: 'abc' })
        end
      end

      it 'can print authorization results to stdout' do
        FakeFS.with_fresh do
          stdout, = invoke_cli(%w[authorizations:new foo --print])

          expect(stdout).to match(/abc/)
        end
      end
    end

    context "for credentials specified that don't exist" do
      it 'will exit with an error' do
        expect do
          invoke_cli(%w[authorizations:new foo --credentials fake123], false)
        end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
      end

      it 'will show an error message' do
        _, stderr = invoke_cli(%w[authorizations:new foo --credentials fake123])
        expect(stderr.split("\n").map(&:uncolorize).first).to match(/name does not exist/)
      end
    end

    context "for a plugin that can't be loaded" do
      it 'will exit with an error' do
        expect do
          invoke_cli(%w[authorizations:new error], false)
        end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
      end

      it 'will show an error message' do
        _, stderr = invoke_cli(%w[authorizations:new error])
        expect(stderr.split("\n").map(&:uncolorize).first).to match(/Could not load/)
      end
    end

    context "for a plugin that doesn't have an authorizer" do
      it 'will exit with an error' do
        expect do
          invoke_cli(%w[authorizations:new empty], false)
        end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
      end

      it 'will show an error message' do
        _, stderr = invoke_cli(%w[authorizations:new empty])
        expect(stderr.split("\n").map(&:uncolorize).first).to match(/No authorizer available/)
      end
    end

    context "for a plugin that's not installed" do
      it 'will exit with an error' do
        expect do
          invoke_cli(%w[authorizations:new foobar123], false)
        end.to raise_error(SystemExit) { |exit| expect(exit.status).to be(1) }
      end

      it 'will show an error message' do
        _, stderr = invoke_cli(%w[authorizations:new foobar123])
        expect(stderr.split("\n").map(&:uncolorize).first).to match(/is not installed/)
      end
    end
  end
end
84 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | nationality, personal appearance, race, religion, or sexual identity and
10 | orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at andrew@hyfen.net. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 |
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 |
68 | ## Attribution
69 |
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 |
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 |
--------------------------------------------------------------------------------
/chronicle-etl.gemspec:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | lib = File.expand_path('lib', __dir__)
4 | $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5 | require 'chronicle/etl/version'
6 |
# Gem specification for chronicle-etl: metadata, packaged files, and the
# runtime / development dependency lists.
Gem::Specification.new do |spec|
  spec.name = 'chronicle-etl'
  spec.version = Chronicle::ETL::VERSION
  spec.authors = ['Andrew Louis']
  spec.email = ['andrew@hyfen.net']

  spec.summary = 'ETL tool for personal data'
  spec.description = 'Chronicle-ETL allows you to extract personal data from a variety of services, transform it, and load it.'
  spec.homepage = 'https://github.com/chronicle-app'
  spec.license = 'MIT'

  # Restrict pushes of this gem to RubyGems.org only. Change 'allowed_push_host'
  # to publish to a different single host, or delete this section to allow
  # pushing to any host.
  if spec.respond_to?(:metadata)
    spec.metadata['allowed_push_host'] = 'https://rubygems.org'

    spec.metadata['homepage_uri'] = spec.homepage
    spec.metadata['source_code_uri'] = 'https://github.com/chronicle-app/chronicle-etl'
    spec.metadata['changelog_uri'] = 'https://github.com/chronicle-app/chronicle-etl/releases'
  else
    raise 'RubyGems 2.0 or newer is required to protect against ' \
          'public gem pushes.'
  end

  # Specify which files should be added to the gem when it is released.
  # `git ls-files -z` lists the files tracked by git; anything under
  # test/, spec/ or features/ is left out of the packaged gem.
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  end
  spec.bindir = 'exe'
  # Every file under exe/ is exposed as an executable, by basename.
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']
  spec.required_ruby_version = '>= 3.1'
  spec.metadata['rubygems_mfa_required'] = 'true'

  # Runtime dependencies
  spec.add_dependency 'activesupport', '~> 7.0'
  spec.add_dependency 'chronic_duration', '~> 0.10.6'
  spec.add_dependency 'chronicle-core', '~> 0.3'
  spec.add_dependency 'colorize', '~> 0.8.1'
  spec.add_dependency 'gems', '>= 1'
  spec.add_dependency 'launchy'
  spec.add_dependency 'marcel', '~> 1.0.2'
  spec.add_dependency 'omniauth', '~> 2'
  spec.add_dependency 'sequel', '~> 5.35'
  spec.add_dependency 'sinatra', '~> 2'
  spec.add_dependency 'sqlite3', '~> 1.4'
  spec.add_dependency 'thor', '~> 1.2'
  spec.add_dependency 'thor-hollaback', '~> 0.2'
  spec.add_dependency 'tty-progressbar', '~> 0.17'
  spec.add_dependency 'tty-prompt', '~> 0.23'
  spec.add_dependency 'tty-spinner'
  spec.add_dependency 'tty-table', '~> 0.12'
  spec.add_dependency 'xdg', '>= 4.0'

  # Development-only dependencies
  spec.add_development_dependency 'bundler', '~> 2.1'
  spec.add_development_dependency 'fakefs', '~> 1.4'
  spec.add_development_dependency 'guard-rspec', '~> 4.7.3'
  spec.add_development_dependency 'pry-byebug', '~> 3.9'
  spec.add_development_dependency 'rake', '~> 13.0'
  spec.add_development_dependency 'rspec', '~> 3.9'
  spec.add_development_dependency 'rubocop', '~> 1.57'
  spec.add_development_dependency 'simplecov', '~> 0.21'
  spec.add_development_dependency 'vcr', '~> 6.1'
  spec.add_development_dependency 'webmock', '~> 3'
  spec.add_development_dependency 'webrick', '~> 1.7'
  spec.add_development_dependency 'yard', '~> 0.9.7'
end
74 |
--------------------------------------------------------------------------------
/spec/chronicle/etl/configurable_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
# Specs for the Configurable mixin: declaring settings, inheritance of
# settings, required/default handling and coercion of typed values.
RSpec.describe Chronicle::ETL::Configurable do
  # Smallest possible configurable class: one optional, untyped setting.
  let(:basic) do
    Class.new do
      include Chronicle::ETL::Configurable

      setting :foo

      def initialize(options = {})
        apply_options(options)
      end
    end
  end

  # One and two levels of plain subclassing, declaring no settings themselves.
  let(:inherited) { Class.new(basic) }
  let(:inherited_inherited) { Class.new(inherited) }

  let(:with_required) do
    Class.new(basic) { setting :req, required: true }
  end

  let(:with_default) do
    Class.new(basic) { setting :def, default: 'default value' }
  end

  let(:with_type_time) do
    Class.new(basic) { setting :since, type: :time }
  end

  describe 'Basic use' do
    before do
      stub_const('BasicClass', basic)
      stub_const('InheritedFromBasicClass', inherited)
      stub_const('InheritedInheritedFromBasicClass', inherited_inherited)
    end

    it 'can be configured' do
      instance = BasicClass.new(foo: 'bar')
      expect(instance.config.foo).to eql('bar')
    end

    it 'can inherit settings from superclass' do
      instance = InheritedFromBasicClass.new(foo: 'bar')
      expect(instance.config.foo).to eql('bar')
    end

    it "can inherit settings from superclass's superclass" do
      instance = InheritedInheritedFromBasicClass.new(foo: 'bar')
      expect(instance.config.foo).to eql('bar')
    end

    it 'does not configure unrecognized settings' do
      instance = BasicClass.new(arbitrary_setting: 'bar')
      expect(instance.config.arbitrary_setting).to be_nil
    end
  end

  describe 'Required settings' do
    before do
      stub_const('RequiredSettingClass', with_required)

      # Subclass that relaxes the parent's required setting
      relaxed = Class.new(RequiredSettingClass) { setting(:req, required: false) }
      stub_const('RequiredSettingSubclass', relaxed)
    end

    it 'raises an exception if missing an option' do
      expect { RequiredSettingClass.new(foo: 'bar') }
        .to raise_error(Chronicle::ETL::ConnectorConfigurationError)
    end

    it 'can override parent class required setting' do
      expect { RequiredSettingSubclass.new(foo: 'bar') }.not_to raise_error
    end
  end

  describe 'Default values' do
    before do
      stub_const('DefaultSettingClass', with_default)

      # Subclass that replaces the parent's default value
      overriding = Class.new(DefaultSettingClass) { setting(:def, default: 'new value') }
      stub_const('DefaultSettingSubclass', overriding)
    end

    it 'has a default value set' do
      instance = DefaultSettingClass.new(foo: 'bar')
      expect(instance.config.def).to eql('default value')
    end

    it 'can have a default value overriden by a subclass' do
      instance = DefaultSettingSubclass.new(foo: 'bar')
      expect(instance.config.def).to eql('new value')
    end
  end

  describe 'Typed settings' do
    context 'for type time' do
      before { stub_const('TypedSettingClass', with_type_time) }

      it 'does not change values that do not have to be coerced' do
        since = TypedSettingClass.new(since: Time.new(2022, 2, 24)).config.since
        expect(since).to be_a(Time)
        expect(since.to_date.iso8601).to eq('2022-02-24')
      end

      it 'coerces settings of type: time into Time objects' do
        since = TypedSettingClass.new(since: '2022-02-24 14:00-0500').config.since
        expect(since).to be_a(Time)
        expect(since.iso8601).to eq('2022-02-24T14:00:00-05:00')
      end

      it 'coerces Date values into Time objects' do
        since = TypedSettingClass.new(since: Date.new(2022, 4, 1)).config.since
        expect(since).to be_a(Time)
        expect(since.iso8601).to eq('2022-04-01T00:00:00+00:00')
      end

      it 'interprets fuzzy time ranges correctly' do
        instance = TypedSettingClass.new(since: '1d3h')
        # one day (86_400s) plus three hours (10_800s) before now
        expected_time = Time.now.to_i - 86_400 - 10_800
        expect(instance.config.since).to be_a(Time)
        expect(instance.config.since.to_i).to be_within(100).of(expected_time)
      end

      it "returns an error when a range can't be parsed" do
        expect { TypedSettingClass.new(since: 'foo') }
          .to raise_error(Chronicle::ETL::ConnectorConfigurationError)
      end
    end
  end
end
139 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/connectors.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'rubygems'
4 |
module Chronicle
  module ETL
    module Registry
      # A singleton module that acts as a registry of the connector
      # registrations available for ETL jobs.
      module Connectors
        PHASES = %i[extractor transformer loader].freeze
        public_constant :PHASES

        class << self
          # Only the writer is generated here; the reader is defined below
          # so that it can lazily initialise the list.
          attr_writer :connectors
        end

        # Add a connector registration to the registry.
        def self.register(connector)
          connectors << connector
        end

        # All registrations added so far (empty array until something registers).
        def self.connectors
          @connectors ||= []
        end

        # Base class that a connector of the given phase must descend from.
        #
        # @param phase [Symbol, String] one of PHASES
        # @return [Class, nil] nil when the phase is not recognised
        def self.ancestor_for_phase(phase)
          case phase&.to_sym
          when :extractor
            Chronicle::ETL::Extractor
          when :transformer
            Chronicle::ETL::Transformer
          when :loader
            Chronicle::ETL::Loader
          end
        end

        # Find the first registered transformer for a given source, optionally
        # narrowed by type, strategy and target schema. A registration without
        # a strategy matches any requested strategy.
        #
        # @return [Object, nil] the matching registration, or nil
        def self.find_converter_for_source(source:, type: nil, strategy: nil, target: nil)
          # FIXME: we're assuming extractor plugin has been loaded already
          # This may not be the case if the schema converter is running
          # off a json dump off extraction data.
          # plugin = source_klass.connector_registration.source
          # type = source_klass.connector_registration.type
          # strategy = source_klass.connector_registration.strategy

          connectors.find do |c|
            c.phase == :transformer &&
              c.source == source &&
              (type.nil? || c.type == type) &&
              (strategy.nil? || c.strategy == strategy || c.strategy.nil?) &&
              (target.nil? || c.to_schema == target)
          end
        end

        # Find connector from amongst those currently loaded
        def self.find_by_phase_and_identifier_built_in(phase, identifier)
          connectors.find { |c| c.phase == phase.to_sym && c.identifier == identifier.to_sym }
        end

        # Find connector and load relevant plugin to find it if necessary
        #
        # @param phase [Symbol, String] one of PHASES
        # @param identifier [Symbol, String] e.g. "csv", "lastfm:listens:api" or
        #   a path to a local ruby file (anything containing a dot)
        # @raise [Chronicle::ETL::PluginNotInstalledError] plugin isn't installed
        # @raise [ConnectorNotAvailableError] nothing matches the identifier
        def self.find_by_phase_and_identifier(phase, identifier)
          connector = find_by_phase_and_identifier_built_in(phase, identifier)
          return connector if connector

          # determine if we need to try to load a local file. if it has a dot in the identifier, we treat it as a file
          return find_by_phase_and_identifier_local(phase, identifier) if identifier.to_s.include?('.')

          # Example identifier: lastfm:listens:api
          # (to_s so that Symbol identifiers are accepted, as in the built-in lookup)
          plugin, type, strategy = identifier.to_s.split(':').map { |part| part.tr('-', '_').to_sym }

          plugin_identifier = plugin.to_s.tr('_', '-')

          unless Chronicle::ETL::Registry::Plugins.installed?(plugin_identifier)
            raise Chronicle::ETL::PluginNotInstalledError, plugin_identifier
          end

          Chronicle::ETL::Registry::Plugins.activate(plugin_identifier)

          # First connector of the requested phase that matches the identifier.
          # The phase check matters: a plugin may register an extractor and a
          # loader that share the same type/strategy.
          wanted_phase = phase.to_sym
          connector = connectors.find do |c|
            c.phase == wanted_phase &&
              c.plugin == plugin &&
              (type.nil? || c.type == type) &&
              (strategy.nil? || c.strategy == strategy)
          end

          connector || raise(ConnectorNotAvailableError, "Connector '#{identifier}' not found")
        end

        # Load a connector from a ruby script on the local file system.
        #
        # The script is evaluated at top level, then every `class Foo` named in
        # it is inspected; the first one descending from the base class of
        # +phase+ gets a fresh registration, which is returned.
        #
        # @raise [ConnectorNotAvailableError] file missing or no suitable class
        def self.find_by_phase_and_identifier_local(phase, identifier)
          path = identifier.to_s
          # File.read raises (it never returns nil), so check up front in order
          # to report a missing file with the registry's own error class.
          raise ConnectorNotAvailableError, "Connector '#{identifier}' not found" unless File.file?(path)

          script = File.read(path)

          # load the file by evaluating the contents
          # NOTE(review): this executes arbitrary code from a user-supplied path.
          # The script's own path is passed so that backtraces and __FILE__
          # inside the script point at the script rather than at this file.
          eval(script, TOPLEVEL_BINDING, path, 1) # rubocop:disable Security/Eval, Style/EvalWithLocation

          # read the file and look for all class definitions in the ruby script.
          class_names = script.scan(/class (\w+)/).flatten

          class_names.each do |class_name|
            klass = Object.const_get(class_name)

            next unless klass.ancestors.include?(ancestor_for_phase(phase))

            registration = ::Chronicle::ETL::Registry::ConnectorRegistration.new(klass)

            klass.connector_registration = registration
            return registration
          rescue NameError
            # ignore names that cannot be resolved at top level (e.g. nested classes)
          end

          raise ConnectorNotAvailableError, "Connector '#{identifier}' not found"
        end
      end
    end
  end
end
118 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/job_definition.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'active_support/core_ext/hash/deep_merge'
4 |
module Chronicle
  module ETL
    # Describes a job: which extractor, transformers and loader to use and the
    # options for each. Starts from SKELETON_DEFINITION and has user-supplied
    # config merged on top of it.
    class JobDefinition
      SKELETON_DEFINITION = {
        incremental: false,
        extractor: {
          name: 'stdin',
          options: {}
        },
        transformers: [
          {
            name: 'null',
            options: {}
          }
        ],
        loader: {
          name: 'json',
          options: {}
        }
      }.freeze

      attr_reader :errors
      attr_accessor :definition

      def initialize
        # Deep copy: `freeze` above is shallow, so sharing the constant would
        # let one job definition's nested option hashes leak into every other.
        @definition = Marshal.load(Marshal.dump(SKELETON_DEFINITION))
      end

      # @return [Boolean] true when validation recorded no errors
      def valid?
        validate
        @errors.empty?
      end

      # Resolve every connector class named in the definition, recording a
      # plugin error under errors[:plugins] if resolution raises one.
      # Resolution stops at the first error.
      def validate
        @errors = {}

        extractor_klass
        transformer_klasses
        loader_klass
      rescue Chronicle::ETL::PluginError => e
        @errors[:plugins] ||= []
        @errors[:plugins] << e
      end

      # @return [Boolean] true when validation failed because a plugin is not installed
      def plugins_missing?
        validate

        return false unless @errors[:plugins]&.any?

        @errors[:plugins]
          .any? { |e| e.instance_of?(Chronicle::ETL::PluginNotInstalledError) }
      end

      # @raise [Chronicle::ETL::JobDefinitionError] when the definition is invalid
      # @return [true]
      def validate!
        raise(Chronicle::ETL::JobDefinitionError.new(self), 'Job definition is invalid') unless valid?

        true
      end

      # Add config hash to this definition
      def add_config(config = {})
        @definition = @definition.deep_merge(config)
        load_credentials
      end

      # For each connector in this job, mix in secrets into the options
      def apply_default_secrets
        # FIXME: handle transformer secrets
        %i[extractor loader].each do |phase|
          # If the option have a `secrets` key, we look up those secrets and
          # mix them in. If not, use the connector's plugin name and look up
          # secrets with the same namespace
          if @definition[phase][:options][:secrets]
            namespace = @definition[phase][:options][:secrets]
          else
            # We don't want to do this lookup for built-in connectors
            next if __send__(:"#{phase}_klass").connector_registration.built_in?

            # infer plugin name from connector name and use it for secrets
            # namespace
            namespace = @definition[phase][:name].split(':').first
          end

          # Reverse merge secrets into connector's options (we want to preserve
          # options that came from job file or CLI options)
          secrets = Chronicle::ETL::Secrets.read(namespace)
          @definition[phase][:options] = secrets.merge(@definition[phase][:options])
        end
      end

      # Is this job continuing from a previous run?
      def incremental?
        @definition[:incremental]
      end

      def dry_run?
        @definition[:dry_run]
      end

      def extractor_klass
        find_connector_klass(:extractor, @definition[:extractor][:name])
      end

      def transformer_klasses
        @definition[:transformers].map do |transformer|
          find_connector_klass(:transformer, transformer[:name])
        end
      end

      def loader_klass
        find_connector_klass(:loader, @definition[:loader][:name])
      end

      def extractor_options
        @definition[:extractor][:options]
      end

      def transformer_options
        @definition[:transformers].map do |transformer|
          transformer[:options]
        end
      end

      def loader_options
        @definition[:loader][:options]
      end

      private

      # Look up the transformer that converts records produced by
      # +source_klass+ into the +target+ schema.
      #
      # The registry method takes keyword arguments only; source, type and
      # strategy are read from the source connector's registration.
      # NOTE(review): assumes the registration exposes #source, #type and
      # #strategy, as the registry's own matching code does -- confirm.
      def find_schema_transformer_klass(source_klass, target)
        registration = source_klass.connector_registration
        Chronicle::ETL::Registry::Connectors.find_converter_for_source(
          source: registration.source,
          type: registration.type,
          strategy: registration.strategy,
          target: target
        ).klass
      end

      def find_connector_klass(phase, identifier)
        Chronicle::ETL::Registry::Connectors.find_by_phase_and_identifier(phase, identifier).klass
      end

      # For extractor and loader, merge the named credentials (if any) into
      # that connector's options.
      def load_credentials
        %i[extractor loader].each do |phase|
          credentials_name = @definition[phase].dig(:options, :credentials)
          next unless credentials_name

          credentials = Chronicle::ETL::Config.load_credentials(credentials_name)
          next unless credentials

          # deep_merge is non-destructive, so the result has to be stored back
          @definition[phase][:options] = @definition[phase][:options].deep_merge(credentials)
        end
      end
    end
  end
end
154 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/oauth_authorizer.rb:
--------------------------------------------------------------------------------
1 | require 'omniauth'
2 | require 'tty-spinner'
3 |
module Chronicle
  module ETL
    # An authorization strategy that uses oauth2 (and omniauth under the hood)
    class OauthAuthorizer < Authorizer
      class << self
        attr_reader :strategy, :provider_name, :authorization_to_secret_map
        attr_accessor :client_id, :client_secret

        # Macro for specifying which omniauth strategy to use
        def omniauth_strategy(strategy)
          @strategy = strategy
        end

        # Macro for specifying which omniauth scopes to request
        def scope(value)
          options[:scope] = value
        end

        # Macro for specifying hash of returned authorization to secrets hash
        def pluck_secrets(map)
          @authorization_to_secret_map = map
        end

        # Options to pass to omniauth (lazily initialised to an empty hash)
        def options
          @options ||= {}
        end

        # Returns all subclasses of OauthAuthorizer
        # (Used by AuthorizationServer to build omniauth providers)
        def all
          ObjectSpace.each_object(::Class).select { |klass| klass < self }
        end
      end

      attr_reader :authorization

      # Create a new instance of OauthAuthorizer
      #
      # @param port [Integer] port the temporary authorization server listens on
      # @param credentials [Hash] read for :client_id and :client_secret
      def initialize(port:, credentials: {})
        @port = port
        @credentials = credentials
        super
      end

      # Start up an authorization server and handle the oauth flow
      #
      # @return [Hash] the authorization, or only the values selected through
      #   the `pluck_secrets` macro when one was declared
      # @raise [Chronicle::ETL::AuthorizationError] when the server stopped
      #   without having received an authorization
      def authorize!
        associate_oauth_credentials
        @server = load_server
        spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
        spinner.auto_spin
        spinner.update(title: "Starting temporary authorization server on port #{@port}")

        server_thread = start_authorization_server(port: @port)
        start_oauth_flow

        spinner.update(title: 'Waiting for authorization to complete in your browser')
        sleep 0.1 while authorization_pending?(server_thread)

        @server.quit!
        server_thread.join

        # Decide on the outcome before reporting it, so that a failed
        # authorization is not announced as successful.
        # TODO: properly handle failed authorizations
        latest = @server.latest_authorization
        unless latest
          spinner.error("(#{'failed'.red})")
          raise Chronicle::ETL::AuthorizationError
        end

        spinner.success("(#{'successful'.green})")
        @authorization = latest

        extract_secrets(authorization: @authorization, pluck_values: self.class.authorization_to_secret_map)
      end

      private

      # True while the server thread is alive and no authorization has arrived
      def authorization_pending?(server_thread)
        server_thread.status && !@server.latest_authorization
      end

      # Copy the client id/secret onto the class, where they are exposed
      # through the class-level accessors.
      def associate_oauth_credentials
        self.class.client_id = @credentials[:client_id]
        self.class.client_secret = @credentials[:client_secret]
      end

      def load_server
        # Load at runtime so that we can set omniauth strategies based on
        # which chronicle plugin has been loaded.
        require_relative 'authorization_server'
        Chronicle::ETL::AuthorizationServer
      end

      # Run the authorization server in a background thread.
      #
      # NOTE(review): the two Thread settings below are process-wide, not
      # scoped to the thread created here.
      #
      # @return [Thread] the thread running the server
      def start_authorization_server(port:)
        @server.settings.port = port
        suppress_webrick_logging(@server)
        Thread.abort_on_exception = true
        Thread.report_on_exception = false

        Thread.new do
          @server.run!({ port: @port }) do |s|
            s.silent = true if defined?(::Thin::Server) && s.instance_of?(::Thin::Server)
          end
        end
      end

      # Open the local /auth/<strategy> URL in a browser, or log the URL if
      # no browser launcher is available.
      def start_oauth_flow
        url = "http://localhost:#{@port}/auth/#{omniauth_strategy}"
        Launchy.open(url)
      rescue Launchy::CommandNotFoundError
        Chronicle::ETL::Logger.info("Please open #{url} in a browser to continue")
      end

      def suppress_webrick_logging(server)
        require 'webrick'
        server.set(
          :server_settings,
          {
            AccessLog: [],
            # File::NULL is the platform's null device ('/dev/null' on unix,
            # 'NUL' on windows)
            Logger: WEBrick::Log.new(File::NULL)
          }
        )
      rescue LoadError
        # no worries if we're not using WEBrick
      end

      # Reduce the authorization to the values named in +pluck_values+
      # (a hash of secret name => path to dig out of the authorization).
      # Returns the authorization untouched when no map was given.
      def extract_secrets(authorization:, pluck_values:)
        return authorization unless pluck_values&.any?

        pluck_values.transform_values do |identifiers|
          authorization.dig(*identifiers)
        end
      end

      def omniauth_strategy
        self.class.strategy
      end
    end
  end
end
141 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/runner.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'colorize'
4 | require 'chronic_duration'
5 | require 'tty-spinner'
6 |
module Chronicle
  module ETL
    # Runs a job: validates it, instantiates its connectors, streams extracted
    # records through the transformers into the loader, and reports progress
    # and completion along the way.
    class Runner
      def initialize(job)
        @job = job
        @job_logger = Chronicle::ETL::JobLogger.new(job)
      end

      # Execute the job. The job log is saved and the completion summary is
      # printed whether or not the run succeeds.
      #
      # @raise [Chronicle::ETL::RunnerError] when extraction fails
      # @raise [Chronicle::ETL::RunInterruptedError] when the run is interrupted
      def run!
        begin_job
        validate_job
        instantiate_connectors
        prepare_job
        prepare_ui
        run_extraction
      rescue Chronicle::ETL::ExtractionError => e
        @job_logger&.error
        raise Chronicle::ETL::RunnerError, "Extraction failed. #{e.message}"
      rescue Interrupt
        @job_logger&.error
        raise Chronicle::ETL::RunInterruptedError, 'Job interrupted.'
      # rescue StandardError => e
      #   # Just throwing this in here until we have better exception handling in
      #   # loaders, etc
      #   @job_logger&.error
      #   raise(Chronicle::ETL::RunnerError, "Error running job. #{e.message}")
      ensure
        finish_job
      end

      private

      # Announce the job and create the (not yet spinning) setup spinner
      def begin_job
        Chronicle::ETL::Logger.info(tty_log_job_initialize)
        @initialization_spinner = TTY::Spinner.new(':spinner :title', format: :dots_2)
      end

      def validate_job
        @initialization_spinner.update(title: 'Validating job')
        @job.job_definition.validate!
      end

      def instantiate_connectors
        @initialization_spinner.update(title: 'Initializing connectors')
        @extractor = @job.instantiate_extractor
        @transformers = @job.instantiate_transformers
        @loader = @job.instantiate_loader
      end

      # Start the job log and the loader, then let the extractor prepare
      # while the spinner is animated.
      def prepare_job
        spinner = @initialization_spinner

        spinner.update(title: 'Preparing job')
        @job_logger.start
        @loader.start

        spinner.update(title: 'Preparing extraction')
        spinner.auto_spin
        @extractor.prepare
        spinner.success("(#{'successful'.green})")
        Chronicle::ETL::Logger.info("\n")
      end

      # Create the progress bar and route log output through it
      def prepare_ui
        @progress_bar = Chronicle::ETL::Utils::ProgressBar.new(
          title: 'Running job',
          total: @extractor.results_count
        )
        Chronicle::ETL::Logger.attach_to_ui(@progress_bar)
      end

      def run_extraction
        # Pattern based on Kiba's StreamingRunner
        # https://github.com/thbar/kiba/blob/master/lib/kiba/streaming_runner.rb
        #
        # Each transformer wraps the stream produced by the previous stage.
        pipeline = @transformers.inject(extractor_stream) do |upstream, transformer|
          transform_stream(upstream, transformer)
        end

        pipeline.each do |record|
          Chronicle::ETL::Logger.debug(tty_log_transformation(record))
          @job_logger.log_transformation(record)
          @progress_bar.increment
          load_record(record)
        end

        @progress_bar.finish

        # This is typically a slow method (writing to stdout, writing a big file, etc)
        # TODO: consider adding a spinner?
        @loader.finish
        @job_logger.finish
      end

      # Initial stream of extracted data, each extraction wrapped in a Record
      def extractor_stream
        Enumerator.new do |yielder|
          @extractor.extract do |extraction|
            yielder << Chronicle::ETL::Record.new(data: extraction.data, extraction: extraction)
          end
        end
      end

      # Wrap +stream+ in a new stream that passes every record through
      # +transformer+, followed by whatever the transformer emits from
      # call_finish once the upstream is exhausted.
      def transform_stream(stream, transformer)
        Enumerator.new do |yielder|
          stream.each do |record|
            transformer.call(record) { |transformed| yielder << transformed }
          end

          transformer.call_finish { |transformed| yielder << transformed }
        end
      end

      # Hand the record's data to the loader, unless this is a dry run
      def load_record(record)
        return if @job.dry_run?

        @loader.load(record.data)
      end

      def finish_job
        @job_logger.save
        @progress_bar&.finish
        Chronicle::ETL::Logger.detach_from_ui
        Chronicle::ETL::Logger.info(tty_log_completion)
      end

      def tty_log_job_initialize
        banner = 'Beginning job '
        return banner unless @job.name

        banner + "'#{@job.name}'".bold
      end

      def tty_log_transformation(record)
        ' ✓'.green + " #{record}"
      end

      def tty_log_transformation_failure(exception, transformer)
        ' ✖'.red + " Failed to transform #{transformer}. #{exception.message}"
      end

      # Multi-line completion summary: headline, status, number of records
      # processed and, when known, the latest timestamp seen.
      def tty_log_completion
        status = @job_logger.success ? 'Success' : 'Failed'
        headline = @job_logger.success ? 'Completed' : 'Partially completed'

        parts = ["\n#{headline} job"]
        parts << " '#{@job.name}'".bold if @job.name
        parts << " in #{ChronicDuration.output(@job_logger.duration)}" if @job_logger.duration
        parts << ("\n Status:\t".light_black + status)
        parts << ("\n Completed:\t".light_black + @job_logger.job_log.num_records_processed.to_s)
        if @job_logger.job_log.highest_timestamp
          parts << ("\n Latest:\t".light_black + @job_logger.job_log.highest_timestamp.iso8601.to_s)
        end
        parts.join
      end
    end
  end
end
164 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/main.rb:
--------------------------------------------------------------------------------
1 | require 'colorize'
2 |
module Chronicle
  module ETL
    module CLI
      # Main entrypoint for CLI app
      class Main < Chronicle::ETL::CLI::CLIBase
        class_before :set_log_level
        class_before :set_color_output

        class_option :log_level, desc: 'Log level (debug, info, warn, error, fatal, silent)', default: 'info'
        class_option :verbose, aliases: '-v', desc: 'Set log level to verbose', type: :boolean
        class_option :silent, desc: 'Silence all output', type: :boolean
        class_option :'no-color', desc: 'Disable colour output', type: :boolean

        default_task 'jobs'

        desc 'connectors:COMMAND', 'Connectors available for ETL jobs', hide: true
        subcommand 'connectors', Connectors

        desc 'jobs:COMMAND', 'Configure and run jobs', hide: true
        subcommand 'jobs', Jobs

        desc 'plugins:COMMAND', 'Configure plugins', hide: true
        subcommand 'plugins', Plugins

        desc 'secrets:COMMAND', 'Manage secrets', hide: true
        subcommand 'secrets', Secrets

        desc 'authorizations', 'Authorize', hide: true
        subcommand 'authorizations', Authorizations

        # Entrypoint for the CLI
        #
        # A leading `subcommand:command` argument is split into two arguments
        # so Thor knows how to hand off to the subcommand class.
        #
        # @param given_args [Array<String>] command line arguments (not modified)
        # @param config [Hash] passed through to Thor
        def self.start(given_args = ARGV, config = {})
          first, *rest = given_args

          # Build a new array rather than shifting/unshifting in place, so the
          # caller's array (ARGV by default) is left untouched. A leading
          # option such as `--input=a:b` is not a subcommand and is not split.
          given_args = first.split(':') + rest if first && !first.start_with?('-') && first.include?(':')

          super(given_args, config)
        end

        def self.exit_on_failure?
          true
        end

        desc 'version', 'Show version'
        map %w[--version -v] => :version
        def version
          shell.say "chronicle-etl #{Chronicle::ETL::VERSION}"
        end

        # Displays help options for chronicle-etl
        #
        # When +meth+ is not a method of this class it is treated as a
        # subcommand namespace and that subcommand's own help is shown.
        def help(meth = nil, _subcommand = false)
          if meth && !respond_to?(meth)
            klass, task = ::Thor::Util.find_class_and_task_by_namespace("#{meth}:#{meth}")
            klass.start(['-h', task].compact, shell: shell)
          else
            shell.say 'ABOUT:'.bold
            shell.say " #{'chronicle-etl'.italic} is a toolkit for extracting and working with your digital"
            shell.say ' history. 📜'
            shell.say
            shell.say " A job #{'extracts'.underline} personal data from a source, #{'transforms'.underline} it (Chronicle"
            shell.say " Schema or preserves raw data), and then #{'loads'.underline} it to a destination. Use"
            shell.say ' built-in extractors (json, csv, stdin) and loaders (csv, json, table,'
            shell.say ' rest) or use plugins to connect to third-party services.'
            shell.say
            shell.say ' Plugins: https://github.com/chronicle-app/chronicle-etl#currently-available'
            shell.say
            shell.say 'USAGE:'.bold
            shell.say ' # Basic job usage:'.italic.light_black
            shell.say ' $ chronicle-etl --extractor NAME --transformer NAME --loader NAME'
            shell.say
            shell.say ' # Read test.csv and display it to stdout as a table:'.italic.light_black
            shell.say ' $ chronicle-etl --extractor csv --input data.csv --loader table'
            shell.say
            shell.say ' # Show available plugins:'.italic.light_black
            shell.say ' $ chronicle-etl plugins:list'
            shell.say
            shell.say ' # Save an access token as a secret and use it in a job:'.italic.light_black
            shell.say ' $ chronicle-etl secrets:set pinboard access_token username:foo123'
            shell.say ' $ chronicle-etl secrets:list'
            shell.say ' $ chronicle-etl -e pinboard --since 1mo'
            shell.say
            shell.say ' # Show full job options:'.italic.light_black
            shell.say ' $ chronicle-etl jobs help run'
            shell.say
            shell.say 'FULL DOCUMENTATION:'.bold
            shell.say ' https://github.com/chronicle-app/chronicle-etl'.blue
            shell.say

            # Collect the printable tasks of every Thor class in the CLI
            # namespace, sorted by command name, with `help` listed first.
            list = []
            ::Thor::Util.thor_classes_in(Chronicle::ETL::CLI).each do |thor_class|
              list += thor_class.printable_tasks(false)
            end
            list.sort! { |a, b| a[0] <=> b[0] }
            list.unshift ['help', '# This help menu']

            shell.say
            shell.say 'ALL COMMANDS:'.bold
            shell.print_table(list, indent: 2, truncate: true)
            shell.say
            shell.say 'VERSION:'.bold
            shell.say " #{Chronicle::ETL::VERSION}"
            shell.say
            shell.say ' Display current version:'.italic.light_black
            shell.say ' $ chronicle-etl --version'
          end
        end

        no_commands do
          # Turn off colourised output when --no-color is given or the
          # NO_COLOR environment variable is set.
          def set_color_output
            String.disable_colorization true if options[:'no-color'] || ENV['NO_COLOR']
          end

          # Pick the log level: --silent wins over --verbose, which wins over
          # --log-level (looked up as an upper-cased constant on the Logger).
          def set_log_level
            if options[:silent]
              Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::SILENT
            elsif options[:verbose]
              Chronicle::ETL::Logger.log_level = Chronicle::ETL::Logger::DEBUG
            elsif options[:log_level]
              level = Chronicle::ETL::Logger.const_get(options[:log_level].upcase)
              Chronicle::ETL::Logger.log_level = level
            end
          end
        end
      end
    end
  end
end
132 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/configurable.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'ostruct'
4 | require 'chronic_duration'
5 |
6 | module Chronicle
7 | module ETL
8 | # A mixin that gives a class
9 | # a {Chronicle::ETL::Configurable::ClassMethods#setting} macro to define
10 | # settings and their properties (require, type, etc)
11 | #
12 | # @example Basic usage
13 | # class Test < Chronicle::ETL::Extractor
14 | # include Chronicle::ETL::Configurable
15 | # setting :when, type: :date, required: true
16 | # end
17 | #
18 | # t = Test.new(when: '2022-02-24')
19 | # t.config.when
20 | module Configurable
# An individual setting for this Configurable: its default value, whether
# it is required, the type its value is coerced to, and a description.
Setting = Struct.new(:default, :required, :type, :description)
private_constant :Setting
24 |
# Holds the option values supplied for a Configurable
class Config < OpenStruct
  # Hash of the config entries whose value is not nil
  def compacted_h
    to_h.reject { |_name, value| value.nil? }
  end
end
32 |
# @private
# Hook run when Configurable is included: wires the class-level macros, the
# instance-level helpers and the prepended initializer into the host class.
def self.included(host)
  host.extend ClassMethods
  host.include InstanceMethods
  host.prepend Initializer
end
39 |
# Initializer method for classes that have Configurable mixed in
module Initializer
  # Make sure this class has a default @config ready to use, then continue
  # with the host class's own initializer.
  #
  # Keywords are captured separately from positional arguments so that the
  # bare `super` forwards them as keywords; with only `*args` they would
  # reach a keyword-argument initializer as a positional hash.
  def initialize(*args, **kwargs)
    @config = initialize_default_config
    super
  end
end
48 |
# Instance methods for classes that have Configurable mixed in
module InstanceMethods
  attr_reader :config

  # Take given options and apply them to this class's settings
  # and make them available in @config and validates that they
  # conform to setting rules
  #
  # Options that do not correspond to a declared setting are ignored.
  # Note that the keys of +options+ are converted to symbols in place.
  #
  # @param options [Hash] option name => value
  # @return [Hash] the same +options+ hash, with symbolized keys
  # @raise [Chronicle::ETL::ConnectorConfigurationError] when a required
  #   setting is missing or a value cannot be coerced to its type
  def apply_options(options)
    options.transform_keys!(&:to_sym)

    options.each do |name, value|
      setting = self.class.all_settings[name]

      # Do nothing with a given option if it's not a connector setting
      next unless setting

      @config[name] = coerced_value(setting, name, value)
    end
    validate_config
    options
  end

  # Name of all settings available to this class
  #
  # (An instance method: defined as `self.settings` it lived on the mixin
  # module itself, where `self.class` is Module and the call always failed.)
  #
  # @return [Array<Symbol>]
  def settings
    self.class.all_settings.keys
  end

  private

  def initialize_default_config
    self.class.config_with_defaults
  end

  # Raise unless every required setting has a non-nil value in @config
  def validate_config
    missing = (self.class.all_required_settings.keys - @config.compacted_h.keys)
    raise Chronicle::ETL::ConnectorConfigurationError, "Missing options: #{missing}" if missing.any?
  end

  # Coerce +value+ through the `coerce_<type>` helper for the setting's
  # type; untyped settings are returned as given. Any failure during
  # coercion is reported as a configuration error.
  def coerced_value(setting, name, value)
    setting.type ? __send__("coerce_#{setting.type}", value) : value
  rescue StandardError
    raise(
      Chronicle::ETL::ConnectorConfigurationError,
      "Could not convert value '#{value}' into a #{setting.type} for setting '#{name}'"
    )
  end

  # Anything that is not a Hash becomes an empty hash
  def coerce_hash(value)
    value.is_a?(Hash) ? value : {}
  end

  def coerce_string(value)
    value.to_s
  end

  # TODO: think about whether to split up float, integer
  def coerce_numeric(value)
    value.to_f
  end

  # Strings are true only when they read "true" (case-insensitively);
  # non-string values are passed through untouched.
  def coerce_boolean(value)
    if value.is_a?(String)
      value.downcase == 'true'
    else
      value
    end
  end

  # Wrap a single value in an array; arrays are returned as given
  def coerce_array(value)
    value.is_a?(Array) ? value : [value]
  end

  # Dates become Times, strings are parsed either as a duration before now
  # ("60m", "1d3h") or as a timestamp; anything else is returned as given.
  def coerce_time(value)
    # parsing yml files might result in us getting Date objects
    # we convert to DateTime first to ensure UTC
    return value.to_datetime.to_time if value.is_a?(Date)

    return value unless value.is_a?(String)

    # Hacky check for duration strings like "60m": any lowercase letter
    # sends the value down the duration path
    if value.match(/[a-z]+/)
      ChronicDuration.raise_exceptions = true
      duration_ago = ChronicDuration.parse(value)
      Time.now - duration_ago
    else
      Time.parse(value)
    end
  end
end
138 |
# Class methods for classes that have Configurable mixed in
module ClassMethods
  # Macro for creating a setting on a class {::Chronicle::ETL::Configurable}
  #
  # @param [Symbol] name Name of the setting. Applied options are looked
  #   up by symbolized key, so use a Symbol here.
  # @param [Object] default Default value
  # @param [Boolean] required whether setting is required
  # @param [Symbol, nil] type Type; selects the `coerce_<type>` method
  #   applied to supplied values. No coercion happens when nil.
  # @param [String, nil] description Human-readable description
  # @return [Setting] the setting that was registered
  #
  # @example Basic usage
  #   setting :when, type: :time, required: true
  #
  # @see ::Chronicle::ETL::Configurable
  def setting(name, default: nil, required: false, type: nil, description: nil)
    settings[name] = Setting.new(default, required, type, description)
  end

  # Collect all settings defined on this class and its ancestors (that
  # have Configurable mixin included). A setting redefined on a subclass
  # replaces the inherited one.
  #
  # @return [Hash] setting name => Setting
  def all_settings
    return settings unless superclass.include?(Chronicle::ETL::Configurable)

    superclass.all_settings.merge(settings)
  end

  # Filters settings to those that are required.
  #
  # @return [Hash] setting name => Setting (empty when none are required)
  def all_required_settings
    all_settings.select { |_name, setting| setting.required }
  end

  # Settings declared directly on this class (not inherited ones)
  #
  # @return [Hash] setting name => Setting
  def settings
    @settings ||= {}
  end

  # Whether a setting with this name is declared here or on an ancestor
  def setting_exists?(name)
    all_settings.key?(name)
  end

  # A fresh Config pre-populated with every setting's default value
  #
  # @return [Config]
  def config_with_defaults
    Config.new(all_settings.transform_values(&:default))
  end
end
185 | end
186 | end
187 | end
188 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/registry/plugins.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'rubygems'
4 | require 'rubygems/command'
5 | require 'rubygems/commands/install_command'
6 | require 'rubygems/uninstaller'
7 | require 'gems'
8 | require 'active_support/core_ext/hash/deep_merge'
9 |
10 | module Chronicle
11 | module ETL
12 | module Registry
# Responsible for managing plugins available to chronicle-etl
#
# Plugin names are handled as Symbols on registrations; the lookup
# helpers accept either a String or a Symbol.
#
# @todo Better validation for whether a gem is actually a plugin
# @todo Add ways to load a plugin that don't require a gem on rubygems.org
module Plugins
  KNOWN_PLUGINS = %w[
    apple-podcasts
    email
    foursquare
    github
    imessage
    pinboard
    safari
    shell
    spotify
    zulip
  ].freeze
  public_constant :KNOWN_PLUGINS

  # Start of a system for having non-gem plugins. Right now, we just
  # make registry aware of existence of name of non-gem plugin
  #
  # @param name [String, Symbol] plugin name
  # @return [Array] the updated list of standalone registrations
  def self.register_standalone(name:)
    plugin = Chronicle::ETL::Registry::PluginRegistration.new do |p|
      p.name = name.to_sym
      p.installed = true
    end

    installed_standalone << plugin
  end

  # Plugins either installed as gems or manually loaded/registered
  def self.installed
    installed_standalone + installed_as_gem
  end

  # Check whether a given plugin is installed
  #
  # @param name [String, Symbol] plugin name without the `chronicle-` prefix
  def self.installed?(name)
    wanted = name.to_sym
    installed.any? { |plugin| plugin.name == wanted }
  end

  # List of plugins installed as standalone
  def self.installed_standalone
    @installed_standalone ||= []
  end

  # List of plugins installed as gems
  def self.installed_as_gem
    installed_gemspecs_latest.map do |gem|
      Chronicle::ETL::Registry::PluginRegistration.new do |p|
        p.name = gem.name.sub('chronicle-', '').to_sym
        p.gem = gem.name
        p.description = gem.description
        p.version = gem.version.to_s
        p.installed = true
      end
    end
  end

  # List of all plugins available to chronicle-etl
  def self.available
    available_as_gem
  end

  # List of plugins available through rubygems
  # TODO: make this concurrent
  def self.available_as_gem
    KNOWN_PLUGINS.map do |name|
      info = gem_info(name)
      Chronicle::ETL::Registry::PluginRegistration.new do |p|
        # Symbol, like the installed registrations, so that `all` can
        # group an installed plugin with its rubygems entry
        p.name = name.to_sym
        p.gem = info['name']
        p.version = info['version']
        p.description = info['info']
      end
    end
  end

  # Load info about a gem plugin from rubygems API
  #
  # @param name [String, Symbol] plugin name without the `chronicle-` prefix
  def self.gem_info(name)
    Gems.info("chronicle-#{name}")
  end

  # Union of installed gems (latest version) + available gems
  #
  # When a plugin shows up in both lists, the installed registration wins.
  def self.all
    (installed + available)
      .group_by(&:name)
      .values
      .map { |registrations| registrations.find(&:installed) || registrations.first }
  end

  # Does a plugin with a given name exist?
  #
  # @param name [String, Symbol] plugin name without the `chronicle-` prefix
  def self.exists?(name)
    # KNOWN_PLUGINS holds Strings; normalise so Symbols work as they do
    # for `installed?`
    KNOWN_PLUGINS.include?(name.to_s)
  end

  # All versions of all plugins currently installed
  def self.installed_gemspecs
    # TODO: add check for chronicle-etl dependency
    not_plugins = %w[chronicle-etl chronicle-core]
    Gem::Specification.select do |spec|
      spec.name.start_with?('chronicle-') && !not_plugins.include?(spec.name)
    end
  end

  # Latest version of each installed plugin
  def self.installed_gemspecs_latest
    installed_gemspecs
      .group_by(&:name)
      .values
      .map { |versions| versions.max_by(&:version) }
  end

  # Activate a plugin with given name by `require`ing it
  #
  # @raise [Chronicle::ETL::PluginConflictError] on a gem version conflict
  # @raise [Chronicle::ETL::PluginLoadError] on any other failure to load
  def self.activate(name)
    # By default, activates the latest available version of a gem
    # so don't have to run Kernel#gem separately

    plugin_require_name = name.to_s.gsub('-', '_')
    require "chronicle/#{plugin_require_name}"
  rescue Gem::ConflictError => e
    # TODO: figure out if there's more we can do here
    raise Chronicle::ETL::PluginConflictError.new(name),
          "Plugin '#{plugin_require_name}' couldn't be loaded. #{e.message}"
  rescue StandardError, LoadError
    # StandardError to catch random non-loading problems that might occur
    # when requiring the plugin (eg class macro invoked the wrong way)
    # TODO: decide if this should be separated
    raise Chronicle::ETL::PluginLoadError.new(name), "Plugin '#{plugin_require_name}' couldn't be loaded"
  end

  # Install a plugin to local gems
  #
  # Does nothing when the plugin is already installed.
  #
  # @raise [Chronicle::ETL::PluginNotAvailableError] if the name is not a
  #   known plugin or its dependencies can't be satisfied
  def self.install(name)
    return if installed?(name)
    raise(Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} doesn't exist") unless exists?(name)

    gem_name = "chronicle-#{name}"

    # Replaces the process-wide RubyGems UI with a silent one
    Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
    Gem.install(gem_name)

    activate(name)
  rescue Gem::UnsatisfiableDependencyError
    # TODO: we need to catch a lot more than this here
    raise Chronicle::ETL::PluginNotAvailableError.new(name), "Plugin #{name} could not be installed."
  end

  # Uninstall a plugin
  #
  # @raise [Chronicle::ETL::PluginError] if RubyGems reports an install error
  def self.uninstall(name)
    gem_name = "chronicle-#{name}"
    # Replaces the process-wide RubyGems UI with a silent one
    Gem::DefaultUserInteraction.ui = Gem::SilentUI.new
    uninstaller = Gem::Uninstaller.new(gem_name)
    uninstaller.uninstall
  rescue Gem::InstallError
    # TODO: strengthen this exception handling
    raise(Chronicle::ETL::PluginError.new(name), "Plugin #{name} wasn't uninstalled")
  end
end
169 | end
170 | end
171 | end
172 |
--------------------------------------------------------------------------------
/lib/chronicle/etl/cli/jobs.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 |
3 | require 'tty-prompt'
4 |
5 | module Chronicle
6 | module ETL
7 | module CLI
8 | # CLI commands for working with ETL jobs
9 | class Jobs < SubcommandBase
# Command run when this group is invoked without a command name
default_task 'start'
namespace :jobs

# Connector selection. The *-opts hashes are merged with the individual
# convenience flags declared further down when the job definition is
# built from the command line.
class_option :extractor, aliases: '-e', desc: 'Extractor class. Default: stdin', banner: 'NAME'
class_option :'extractor-opts', desc: 'Extractor options', type: :hash, default: {}
class_option :transformer,
             aliases: '-t',
             desc: 'Transformer identifier. Default: null',
             banner: 'NAME',
             type: 'array',
             repeatable: true
class_option :loader, aliases: '-l', desc: 'Loader class. Default: table', banner: 'NAME'
class_option :'loader-opts', desc: 'Loader options', type: :hash, default: {}

# Convenience flags that are passed to the extractor as options.
# This is an array to deal with shell globbing
class_option :input,
             aliases: '-i',
             desc: 'Input filename or directory',
             default: [],
             type: 'array',
             banner: 'FILENAME'
class_option :since, desc: 'Load records SINCE this date (or fuzzy time duration)', banner: 'DATE'
class_option :until, desc: 'Load records UNTIL this date (or fuzzy time duration)', banner: 'DATE'
class_option :limit, desc: 'Only extract the first LIMIT records', banner: 'N'

# Each of these, when given, adds a transformer to the job
class_option :schema,
             desc: 'Which Schema to transform',
             banner: 'SCHEMA_NAME',
             type: 'string',
             enum: %w[chronicle activitystream schemaorg chronobase]
class_option :format,
             desc: 'How to serialize results',
             banner: 'SCHEMA_NAME',
             type: 'string',
             enum: %w[jsonapi jsonld]

# :output and :header_row are passed to the loader as options; :fields,
# :'fields-limit' and :filter each add a transformer
class_option :output, aliases: '-o', desc: 'Output filename', type: 'string'
class_option :fields, desc: 'Output only these fields', type: 'array', banner: 'field1 field2 ...'
class_option :'fields-limit', desc: 'Output first N fields', type: :numeric
class_option :filter, desc: 'Filter records', type: 'array', banner: 'field=value'
class_option :header_row, desc: 'Output the header row of tabular output', type: 'boolean'
51 |
# Thor doesn't like `run` as a command name
map run: :start
desc 'run', 'Start a job'
option :dry_run, desc: 'Only run the extraction and transform steps, not the loading', type: :boolean
long_desc <<-LONG_DESC
  This will run an ETL job. Each job needs three parts:

    1. #{'Extractor'.underline}: pulls data from an external source. By default, this is stdin. Other common options including pulling data from an API or reading JSON from a file.

    2. #{'Transformers'.underline}: transform data into a new format. If none is specified, we use the `null` transformer which does nothing to the data.

    3. #{'Loader'.underline}: takes that transformed data and loads it externally. This can be an API, flat files, (or by default), stdout. With the --dry-run option, this step won't be run.

  If you do not want to use the command line flags, you can also configure a job with a .yml config file. You can either specify the path to this file or use the filename and place the file in ~/.config/chronicle/etl/jobs/NAME.yml and call it with `--job NAME`
LONG_DESC
# Run an ETL job
#
# The first positional argument, when given, is the name of a saved
# job; command-line flags are layered on top of its config. Plugins the
# job needs but that aren't installed are offered for installation
# before the job runs.
def start(*args)
  name = args.first

  # If someone runs `$ chronicle-etl` with no arguments, show help menu.
  # TODO: decide if we should check that there's nothing in stdin pipe
  # in case user wants to actually run this sort of job stdin->null->stdout
  if name.nil? && options[:extractor].nil?
    m = Chronicle::ETL::CLI::Main.new
    m.help
    cli_exit
  end

  cli_fail(message: "Job '#{name}' does not exist") if name && !Chronicle::ETL::Config.exists?('jobs', name)

  job_definition = build_job_definition(name, options)

  if job_definition.plugins_missing?
    missing_plugins = job_definition.errors[:plugins]
      .select { |error| error.is_a?(Chronicle::ETL::PluginNotInstalledError) }
      .map(&:name)
      .uniq
    install_missing_plugins(missing_plugins)
  end

  run_job(job_definition)
rescue Chronicle::ETL::JobDefinitionError => e
  # job_definition is still nil when the error was raised while the
  # definition was being built
  errors = job_definition&.errors || {}

  # Assemble the report with map/join instead of appending to a string
  # literal: literals are frozen in this file, so `<<` on one raises
  # FrozenError and would mask the real error
  message = errors.each_pair.map do |category, category_errors|
    "Problem with #{category}:\n - #{category_errors.map(&:to_s).join("\n - ")}"
  end.join("\n")
  cli_fail(message: "Error running job.\n#{message}", exception: e)
end
100 |
option :'skip-confirmation', aliases: '-y', type: :boolean
desc 'save', 'Save a job'
# Create an ETL job
#
# Builds a job definition from the saved config for +name+ (if any)
# plus the command-line flags, validates it and writes it to the jobs
# config. When a job with that name already exists, asks before
# overwriting unless --skip-confirmation was given.
def save(name)
  write_config = true
  job_definition = build_job_definition(name, options)
  job_definition.validate!

  if Chronicle::ETL::Config.exists?('jobs', name) && !options[:'skip-confirmation']
    prompt = TTY::Prompt.new
    write_config = false
    message = "Job '#{name}' exists already. Overwrite it?"
    begin
      write_config = prompt.yes?(message)
    rescue TTY::Reader::InputInterrupt
      # Interrupting the prompt counts as "no": write_config stays false
    end
  end

  if write_config
    Chronicle::ETL::Config.write('jobs', name, job_definition.definition)
    cli_exit(message: "Job saved. Run it with `$ chronicle-etl jobs:run #{name}`")
  else
    cli_fail(message: "\nJob not saved")
  end
rescue Chronicle::ETL::JobDefinitionError => e
  cli_fail(message: 'Job definition error', exception: e)
end
128 |
desc 'show', 'Show details about a job'
# Print a job built from the saved config for +name+ (if given) with
# the command-line flags applied on top
def show(name = nil)
  unknown_job = name && !Chronicle::ETL::Config.exists?('jobs', name)
  cli_fail(message: "Job '#{name}' does not exist") if unknown_job

  definition = build_job_definition(name, options)
  definition.validate!
  puts Chronicle::ETL::Job.new(definition)
rescue Chronicle::ETL::JobDefinitionError => e
  cli_fail(message: 'Job definition error', exception: e)
end
140 |
desc 'edit', 'Edit a job in default editor ($EDITOR)'
# Open a saved job's config file in the user's editor ($VISUAL, then
# $EDITOR, then vi) and validate the result once the editor exits
def edit(name = nil)
  cli_fail(message: "Job '#{name}' does not exist") if name && !Chronicle::ETL::Config.exists?('jobs', name)

  filename = Chronicle::ETL::Config.path('jobs', name)

  # Pass the path through an environment variable instead of
  # interpolating it into the command string. The shell expands the
  # variable inside double quotes without re-parsing it, so paths
  # containing quotes, `$` or backticks reach the editor intact.
  system(
    { 'CHRONICLE_ETL_JOB_FILE' => filename.to_s },
    '${VISUAL:-${EDITOR:-vi}} "$CHRONICLE_ETL_JOB_FILE"'
  )

  definition = Chronicle::ETL::JobDefinition.new
  definition.add_config(load_job_config(name))
  definition.validate!

  cli_exit(message: "Job '#{name}' saved")
rescue Chronicle::ETL::JobDefinitionError => e
  cli_fail(message: 'Job definition error', exception: e)
end
156 |
desc 'list', 'List all available jobs'
# List available ETL jobs
#
# Prints a table with one row per saved job: its name and the names of
# its extractor, transformer(s) and loader.
def list
  jobs = Chronicle::ETL::Config.available_jobs

  job_details = jobs.map do |job|
    r = Chronicle::ETL::Config.load('jobs', job)

    extractor = r[:extractor][:name] if r[:extractor]
    transformer = r[:transformer][:name] if r[:transformer]
    loader = r[:loader][:name] if r[:loader]

    # Jobs built from command-line flags keep their transformers as an
    # array under :transformers; show those names when there is no
    # singular :transformer entry
    if transformer.nil? && r[:transformers].is_a?(Array)
      names = r[:transformers].filter_map { |t| t[:name] if t.is_a?(Hash) }
      transformer = names.join(', ') unless names.empty?
    end

    [job, extractor, transformer, loader]
  end

  headers = %w[name extractor transformer loader].map { |h| h.upcase.bold }

  puts 'Available jobs:'
  table = TTY::Table.new(headers, job_details)
  puts table.render(indent: 0, padding: [0, 2])
rescue Chronicle::ETL::ConfigError => e
  cli_fail(message: "Config error. #{e.message}", exception: e)
end
180 |
181 | private
182 |
# Validate the definition, apply default secrets, then build the job
# and run it. Runner errors are reported through cli_fail.
def run_job(job_definition)
  # FIXME: have to validate here so next method can work. This is clumsy
  job_definition.validate!
  # FIXME: clumsy to make CLI responsible for setting secrets here. Think about a better way to do this
  job_definition.apply_default_secrets

  Chronicle::ETL::Runner.new(Chronicle::ETL::Job.new(job_definition)).run!
rescue RunnerError => e
  cli_fail(message: e.message.to_s, exception: e)
end
194 |
# Ask whether to install the plugins a job needs; install them on
# "yes", fail the command otherwise
# TODO: probably could merge this with something in cli/plugin
def install_missing_plugins(missing_plugins)
  plural = missing_plugins.count > 1 ? 's' : ''
  gem_names = missing_plugins.map { |name| "chronicle-#{name}".bold }.join(', ')
  message = "Plugin#{plural} specified by job not installed.\n" \
            "Do you want to install #{gem_names} and start the job?"

  will_install = TTY::Prompt.new.yes?(message)
  cli_fail(message: "Must install #{missing_plugins.join(', ')} plugin to run job") unless will_install

  Chronicle::ETL::CLI::Plugins.new.install(*missing_plugins)
end
208 |
# Create job definition by reading config file and then overwriting with flag options
#
# @param name [String, nil] name of a saved job, if any
# @param options [Hash] command-line options
# @return [Chronicle::ETL::JobDefinition]
def build_job_definition(name, options)
  Chronicle::ETL::JobDefinition.new.tap do |definition|
    definition.add_config(load_job_config(name))
    definition.add_config(process_flag_options(options).transform_keys(&:to_sym))
  end
end
216 |
# Read the config for the job called +name+ by delegating to
# Chronicle::ETL::Config.read_job
def load_job_config(name)
  Chronicle::ETL::Config.read_job(name)
end
220 |
# Takes flag options and turns them into a runner config
#
# The result has :dry_run, :extractor and :loader entries, plus a
# :transformers array when any transformer-related flag was given.
# Transformers are added in this order: schema, explicit --transformer
# values, filter, format, fields, fields-limit.
# TODO: this needs a lot of refactoring
def process_flag_options(options)
  extractor_options = options[:'extractor-opts'].transform_keys(&:to_sym).merge(
    {
      input: (options[:input] if options[:input].any?),
      since: options[:since],
      until: options[:until],
      limit: options[:limit]
    }.compact
  )

  loader_options = options[:'loader-opts'].transform_keys(&:to_sym).merge(
    {
      output: options[:output],
      header_row: options[:header_row]
    }.compact
  )

  processed_options = {
    dry_run: options[:dry_run],
    extractor: {
      name: options[:extractor],
      options: extractor_options
    }.compact,
    loader: {
      name: options[:loader],
      options: loader_options
    }.compact
  }

  # Any schema implies the chronicle transformer first; other schemas
  # are then applied on top of it
  add_transformer(processed_options, 'chronicle') if options[:schema]
  add_transformer(processed_options, options[:schema]) if options[:schema] && options[:schema] != 'chronicle'
  add_transformers_from_option(processed_options, options[:transformer]) if options[:transformer]&.any?
  if options[:filter]
    # Split on the first '=' only, so a filter value may itself contain
    # '=' (e.g. a URL with a query string)
    filters = options[:filter].to_h { |f| f.split('=', 2) }
    add_transformer(processed_options, :filter, { filters: filters })
  end
  add_transformer(processed_options, :format, { format: options[:format] }) if options[:format]
  add_transformer(processed_options, :filter_fields, { fields: options[:fields] }) if options[:fields]
  if options[:'fields-limit']
    add_transformer(processed_options, :fields_limit,
                    { limit: options[:'fields-limit'] })
  end

  processed_options
end
269 |
# Append a transformer entry (name plus options) to the :transformers
# list of +processed_options+, creating the list on first use
#
# @return [Array] the updated transformers list
def add_transformer(processed_options, name, options = {})
  transformers = (processed_options[:transformers] ||= [])
  transformers << { name: name, options: options }
end
274 |
# Append the transformers given through the repeatable --transformer
# flag to the :transformers list of +processed_options+
#
# Each element of +transformer_option+ is an array whose first item is
# the transformer name; remaining items of the form `key=value` become
# its options and items without '=' are ignored.
#
# @param processed_options [Hash] runner config being assembled (modified)
# @param transformer_option [Array<Array<String>>] one array per flag use
# @return [Array] the updated transformers list
def add_transformers_from_option(processed_options, transformer_option)
  processed_options[:transformers] ||= []
  processed_options[:transformers] += transformer_option.map do |transformer_args|
    transformer_name, *raw_options = transformer_args

    parsed_options = raw_options
      .select { |opt| opt.include?('=') }
      .to_h do |opt|
        # Split on the first '=' only so values may contain '='
        key, value = opt.split('=', 2)
        [key.to_sym, value]
      end

    { name: transformer_name, options: parsed_options }
  end
end
290 | end
291 | end
292 | end
293 | end
294 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## A CLI toolkit for extracting and working with your digital history
2 |
3 | 
4 |
5 | [](https://badge.fury.io/rb/chronicle-etl) [](https://github.com/chronicle-app/chronicle-etl/actions/workflows/ruby.yml) [](https://www.rubydoc.info/gems/chronicle-etl/)
6 |
7 | Are you trying to archive your digital history or incorporate it into your own projects? You’ve probably discovered how frustrating it is to get machine-readable access to your own data. While [building a memex](https://hyfen.net/memex/), I learned first-hand what great efforts must be made before you can begin using the data in interesting ways.
8 |
9 | If you don’t want to spend all your time writing scrapers, reverse-engineering APIs, or parsing export data, this tool is for you! (_If you do enjoy these things, please see the [open issues](https://github.com/chronicle-app/chronicle-etl/issues)._)
10 |
11 | **`chronicle-etl` is a CLI tool that gives you a unified interface to your personal data.** It uses the ETL pattern to _extract_ data from a source (e.g. your local browser history, a directory of images, goodreads.com reading history), _transform_ it (into a given schema), and _load_ it to a destination (e.g. a CSV file, JSON, external API).
12 |
13 | ## What does `chronicle-etl` give you?
14 |
15 | - **A CLI tool for working with personal data**. You can monitor progress of exports, manipulate the output, set up recurring jobs, manage credentials, and more.
16 | - **Plugins for many third-party sources** (see [list](#available-plugins-and-connectors)). This plugin system allows you to access data from dozens of third-party services, all accessible through a common CLI interface.
17 | - **A common, opinionated schema**: You can normalize different datasets into a single schema so that, for example, all your iMessages and emails are represented in a common schema. (Don’t want to use this schema? `chronicle-etl` always allows you to fall back on working with the raw extraction data.)
18 |
19 | ## Chronicle-ETL in action
20 |
21 | 
22 |
23 | ### Longer screencast
24 |
25 | [](https://asciinema.org/a/483455)
26 |
27 | ## Installation
28 |
29 | Using homebrew:
30 |
31 | ```sh
32 | $ brew install chronicle-app/etl/chronicle-etl
33 | ```
34 |
35 | Using rubygems:
36 |
37 | ```sh
38 | $ gem install chronicle-etl
39 | ```
40 |
41 | Confirm it installed successfully:
42 |
43 | ```sh
44 | $ chronicle-etl --version
45 | ```
46 |
47 | ## Basic usage and running jobs
48 |
49 | ```sh
50 | # Display help
51 | $ chronicle-etl help
52 |
53 | # Run a basic job
54 | $ chronicle-etl --extractor NAME --transformer NAME --loader NAME
55 |
56 | # Read data.csv and display it to stdout as a table
57 | $ chronicle-etl --extractor csv --input data.csv --loader table
58 |
59 | # Show available plugins and install one
60 | $ chronicle-etl plugins:list
61 | $ chronicle-etl plugins:install imessage
62 |
63 | # Retrieve imessage messages from the last 5 hours
64 | $ chronicle-etl -e imessage --since 5h
65 |
66 | # Get email senders from an .mbox email archive file
67 | $ chronicle-etl --extractor email:mbox -i sample-email-archive.mbox -t email --fields actor.slug
68 |
69 | # Save an access token as a secret and use it in a job
70 | $ chronicle-etl secrets:set pinboard access_token username:foo123
71 | $ chronicle-etl secrets:list # Verify that it's available
72 | $ chronicle-etl -e pinboard --since 1mo # Used automatically based on plugin name
73 | ```
74 |
75 | ### Common options
76 |
77 | ```sh
78 | Options:
79 | -e, [--extractor=NAME] # Extractor class. Default: stdin
80 | [--extractor-opts=key:value] # Extractor options
81 | -t, [--transformer=NAME] # Transformer class. Default: null
82 | [--transformer-opts=key:value] # Transformer options
83 | -l, [--loader=NAME] # Loader class. Default: json
84 | [--loader-opts=key:value] # Loader options
85 | -i, [--input=FILENAME] # Input filename or directory
86 | [--since=DATE] # Load records SINCE this date (or fuzzy time duration)
87 | [--until=DATE] # Load records UNTIL this date (or fuzzy time duration)
88 | [--limit=N] # Only extract the first LIMIT records
89 | [--schema=SCHEMA_NAME] # Which Schema to transform
90 | # Possible values: chronicle, activitystream, schemaorg, chronobase
91 | [--format=SCHEMA_NAME] # How to serialize results
92 | # Possible values: jsonapi, jsonld
93 | -o, [--output=OUTPUT] # Output filename
94 | [--fields=field1 field2 ...] # Output only these fields
95 | [--header-row], [--no-header-row] # Output the header row of tabular output
96 |
97 | [--log-level=LOG_LEVEL] # Log level (debug, info, warn, error, fatal)
98 | # Default: info
99 | -v, [--verbose], [--no-verbose] # Set log level to verbose
100 | [--silent], [--no-silent] # Silence all output
101 | ```
102 |
103 | ### Saving a job
104 |
105 | You can save details about a job to a local config file (saved by default in `~/.config/chronicle/etl/jobs/JOB_NAME.yml`) to save yourself the trouble of specifying options each time.
106 |
107 | ```sh
108 | # Save a job named 'sample' to ~/.config/chronicle/etl/jobs/sample.yml
109 | $ chronicle-etl jobs:save sample --extractor pinboard --since 10d
110 |
111 | # Run the job
112 | $ chronicle-etl jobs:run sample
113 |
114 | # Show details about the job
115 | $ chronicle-etl jobs:show sample
116 |
117 | # Edit a job definition with default editor ($EDITOR)
118 | $ chronicle-etl jobs:edit sample
119 |
120 | # Show all saved jobs
121 | $ chronicle-etl jobs:list
122 | ```
123 |
124 | ## Connectors and plugins
125 |
126 | Connectors let you work with different data formats or third-party sources.
127 |
128 | ### Built-in Connectors
129 |
130 | `chronicle-etl` comes with several built-in connectors for common formats and sources.
131 |
132 | ```sh
133 | # List all available connectors
134 | $ chronicle-etl connectors:list
135 | ```
136 |
137 | #### Extractors
138 |
139 | - [`csv`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/csv_extractor.rb) - Load records from CSV files or stdin
140 | - [`json`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/json_extractor.rb) - Load JSON (either [line-separated objects](https://en.wikipedia.org/wiki/JSON_streaming#Line-delimited_JSON) or one object)
141 | - [`file`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/extractors/file_extractor.rb) - load from a single file or directory (with a glob pattern)
142 |
143 | #### Transformers
144 |
145 | - [`null`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/null_transformer.rb) - (default) Don’t do anything and pass on raw extraction data
146 | - [`sampler`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/sampler_transformer.rb) - Sample `percent` records from the extraction
147 | - [`sort`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/transformers/sort_transformer.rb) - sort extracted results by `key` and `direction`
148 |
149 |
150 | #### Loaders
151 |
152 | - [`json`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/json_loader.rb) - (default) Load records serialized as JSON
153 | - [`table`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/table_loader.rb) - Output an ascii table of records. Useful for exploring data.
154 | - [`csv`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/csv_loader.rb) - Load records to CSV
155 | - [`rest`](https://github.com/chronicle-app/chronicle-etl/blob/main/lib/chronicle/etl/loaders/rest_loader.rb) - Send JSON to a REST API
156 |
157 | ### Chronicle Plugins for third-party services
158 |
159 | Plugins provide access to data from third-party platforms, services, or formats. Plugins are packaged as separate gems and can be installed through the CLI (under the hood, it's a `gem install chronicle-PLUGINNAME`)
160 |
161 | #### Plugin usage
162 |
163 | ```bash
164 | # List available plugins
165 | $ chronicle-etl plugins:list
166 |
167 | # Install a plugin
168 | $ chronicle-etl plugins:install NAME
169 |
170 | # Use a plugin
171 | $ chronicle-etl plugins:install imessage
172 | $ chronicle-etl --extractor imessage --limit 10
173 |
174 | # Uninstall a plugin
175 | $ chronicle-etl plugins:uninstall NAME
176 | ```
177 |
178 | #### Available plugins and connectors
179 |
180 | The following is the officially supported list of plugins and their available connectors:
181 |
182 | | Plugin | Type | Identifier | Description |
183 | | --------------------------------------------------------------------------- | ----------- | ---------------- | -------------------------------------------- |
184 | | [apple-podcasts](https://github.com/chronicle-app/chronicle-apple-podcasts) | extractor | listens | listening history of podcast episodes |
185 | | [apple-podcasts](https://github.com/chronicle-app/chronicle-apple-podcasts) | transformer | listen | a podcast episode listen to Chronicle Schema |
186 | | [email](https://github.com/chronicle-app/chronicle-email) | extractor | imap | emails over an IMAP connection |
187 | | [email](https://github.com/chronicle-app/chronicle-email) | extractor | mbox | emails from an .mbox file |
188 | | [email](https://github.com/chronicle-app/chronicle-email) | transformer | email | email to Chronicle Schema |
189 | | [foursquare](https://github.com/chronicle-app/chronicle-foursquare) | extractor | checkins | Foursquare visits |
190 | | [foursquare](https://github.com/chronicle-app/chronicle-foursquare) | transformer | checkin | checkin to Chronicle Schema |
191 | | [github](https://github.com/chronicle-app/chronicle-github) | extractor | activity | user activity stream |
192 | | [imessage](https://github.com/chronicle-app/chronicle-imessage) | extractor | messages | imessages from local macOS |
193 | | [imessage](https://github.com/chronicle-app/chronicle-imessage) | transformer | message | imessage to Chronicle Schema |
194 | | [pinboard](https://github.com/chronicle-app/chronicle-pinboard) | extractor | bookmarks | Pinboard.in bookmarks |
195 | | [pinboard](https://github.com/chronicle-app/chronicle-pinboard) | transformer | bookmark | bookmark to Chronicle Schema |
196 | | [safari](https://github.com/chronicle-app/chronicle-safari) | extractor | browser-history | browser history |
197 | | [safari](https://github.com/chronicle-app/chronicle-safari) | transformer | browser-history | browser history to Chronicle Schema |
198 | | [shell](https://github.com/chronicle-app/chronicle-shell) | extractor | history | shell command history (bash / zsh) |
199 | | [shell](https://github.com/chronicle-app/chronicle-shell) | transformer | command | command to Chronicle Schema |
200 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | liked-tracks | liked tracks |
201 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | saved-albums | saved albums |
202 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | extractor | listens | recently listened tracks (last 50 tracks) |
203 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | transformer | like | like to Chronicle Schema |
204 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | transformer | listen | listen to Chronicle Schema |
205 | | [spotify](https://github.com/chronicle-app/chronicle-spotify) | authorizer | | OAuth authorizer |
206 | | [zulip](https://github.com/chronicle-app/chronicle-zulip) | extractor | private-messages | private messages |
207 | | [zulip](https://github.com/chronicle-app/chronicle-zulip) | transformer | message | message to Chronicle Schema |
208 |
209 | ### Coming soon
210 |
211 | A few dozen importers exist [in my Memex project](https://hyfen.net/memex/) and I'm porting them over to the Chronicle system. The [Chronicle Plugin Tracker](https://github.com/orgs/chronicle-app/projects/1/views/1) lets you keep track of what's available and what's coming soon.
212 |
213 | If you don't see a plugin for a third-party provider or data source that you're interested in using with `chronicle-etl`, [please open an issue](https://github.com/chronicle-app/chronicle-etl/issues/new). If you want to work together on a plugin, please [get in touch](#get-in-touch)!
214 |
215 | In summary, the following **are coming soon**:
216 | anki, arc, bear, chrome, facebook, firefox, fitbit, foursquare, git, github, goodreads, google-calendar, images, instagram, lastfm, shazam, slack, strava, timing, things, twitter, whatsapp, youtube.
217 |
218 | ### Writing your own plugin
219 |
220 | Additional connectors are packaged as separate ruby gems. You can view the [iMessage plugin](https://github.com/chronicle-app/chronicle-imessage) for an example.
221 |
222 | If you want to load a custom connector without creating a gem, you can help by [completing this issue](https://github.com/chronicle-app/chronicle-etl/issues/23).
223 |
224 | If you want to work together on a connector, please [get in touch](#get-in-touch)!
225 |
226 | #### Sample custom Extractor class
227 |
228 | ```ruby
229 | # TODO
230 | ```
231 |
232 | ## Secrets Management
233 |
234 | If your job needs secrets such as access tokens or passwords, `chronicle-etl` has a built-in secret management system.
235 |
236 | Secrets are organized in namespaces. Typically, you use one namespace per plugin (`pinboard` secrets for the `pinboard` plugin). When you run a job that uses the `pinboard` plugin extractor, for example, the secrets from that namespace will automatically be included in the extractor's options. To override which secrets get included, you can do it in the connector options with `secrets: ALT-NAMESPACE`.
237 |
238 | Under the hood, secrets are stored in `~/.config/chronicle/etl/secrets/NAMESPACE.yml` with 0600 permissions on each file.
239 |
240 | ### Using the secret manager
241 |
242 | ```sh
243 | # Save a secret under the 'pinboard' namespace
244 | $ chronicle-etl secrets:set pinboard access_token username:foo123
245 |
246 | # Set a secret using stdin
247 | $ echo -n "username:foo123" | chronicle-etl secrets:set pinboard access_token
248 |
249 | # List available secrets
250 | $ chronicle-etl secrets:list
251 |
252 | # Use 'pinboard' secrets in the pinboard extractor's options (happens automatically)
253 | $ chronicle-etl -e pinboard --since 1mo
254 |
255 | # Use a custom secrets namespace
256 | $ chronicle-etl secrets:set pinboard-alt access_token different-username:foo123
257 | $ chronicle-etl -e pinboard --extractor-opts secrets:pinboard-alt --since 1mo
258 |
259 | # Remove a secret
260 | $ chronicle-etl secrets:unset pinboard access_token
261 | ```
262 |
263 | ## Roadmap
264 |
265 | - Keep tackling **new plugins**. See: [Chronicle Plugin Tracker](https://github.com/orgs/chronicle-app/projects/1)
266 | - Add support for **incremental extractions** ([#37](https://github.com/chronicle-app/chronicle-etl/issues/37))
267 | - **Improve stdin extractor and shell command transformer** so that users can easily integrate their own scripts/languages/tools into jobs ([#48](https://github.com/chronicle-app/chronicle-etl/issues/48))
268 | - **Add documentation for Chronicle Schema**. It's found throughout this project but never explained.
269 |
270 | ## Development
271 |
272 | After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
273 |
274 | To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
275 |
276 | ### Additional development commands
277 |
278 | ```bash
279 | # run tests
280 | bundle exec rake spec
281 |
282 | # generate docs
283 | bundle exec rake yard
284 |
285 | # use Guard to run specs automatically
286 | bundle exec guard
287 | ```
288 |
289 | ## Get in touch
290 |
291 | - [@hyfen](https://twitter.com/hyfen) on Twitter
292 | - [@hyfen](https://github.com/hyfen) on GitHub
293 | - Email: andrew@hyfen.net
294 |
295 | ## Contributing
296 |
297 | Bug reports and pull requests are welcome on GitHub at https://github.com/chronicle-app/chronicle-etl. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
298 |
299 | ## License
300 |
301 | The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
302 |
303 | ## Code of Conduct
304 |
305 | Everyone interacting in the Chronicle::ETL project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/chronicle-app/chronicle-etl/blob/main/CODE_OF_CONDUCT.md).
306 |
--------------------------------------------------------------------------------