├── .gitignore
├── .ruby-version
├── .travis.yml
├── .yardopts
├── CHANGELOG.md
├── CONTRIBUTING.md
├── Gemfile
├── LICENSE.txt
├── README.md
├── Rakefile
├── bin
└── forklift
├── example
├── Gemfile
├── Gemfile.lock
├── config
│ ├── connections
│ │ ├── csv
│ │ │ └── csv.yml
│ │ ├── elasticsearch
│ │ │ └── source.yml
│ │ └── mysql
│ │ │ ├── destination.yml
│ │ │ └── source.yml
│ └── email.yml
├── plan.rb
├── template
│ └── email.erb
└── transformations
│ ├── cleanup.sql
│ ├── combined_name.sql
│ └── email_suffix.rb
├── forklift.jpg
├── forklift_etl.gemspec
├── forklift_small.jpg
├── lib
├── forklift.rb
└── forklift
│ ├── base
│ ├── connection.rb
│ ├── logger.rb
│ ├── mailer.rb
│ ├── pid.rb
│ └── utils.rb
│ ├── patterns
│ ├── elasticsearch_patterns.rb
│ └── mysql_patterns.rb
│ ├── plan.rb
│ ├── transports
│ ├── csv.rb
│ ├── elasticsearch.rb
│ └── mysql.rb
│ └── version.rb
├── spec
├── config
│ ├── connections
│ │ ├── csv
│ │ │ ├── forklift_test_destination.yml
│ │ │ └── forklift_test_source.yml
│ │ ├── elasticsearch
│ │ │ └── forklift_test.yml
│ │ └── mysql
│ │ │ ├── forklift_test_destination.yml
│ │ │ ├── forklift_test_source_a.yml
│ │ │ ├── forklift_test_source_b.yml
│ │ │ └── forklift_test_working.yml
│ └── email.yml
├── integration
│ ├── basic_spec.rb
│ ├── csv_spec.rb
│ ├── elasticsearch_patterns_spec.rb
│ ├── elasticsearch_spec.rb
│ ├── multi_transport_spec.rb
│ ├── mysql_patterns_spec.rb
│ ├── mysql_spec.rb
│ └── transformations_spec.rb
├── spec_helper.rb
├── support
│ ├── dumps
│ │ ├── csv
│ │ │ └── source.csv
│ │ ├── elasticsearch
│ │ │ └── forklift_test.json
│ │ └── mysql
│ │ │ ├── forklift_test_source_a.sql
│ │ │ └── forklift_test_source_b.sql
│ ├── spec_client.rb
│ ├── spec_plan.rb
│ └── spec_seeds.rb
├── template
│ ├── spec_email_template.erb
│ ├── spec_user_transformation.rb
│ └── spec_user_transformation.sql
└── unit
│ ├── connection
│ └── mysql_spec.rb
│ └── misc
│ ├── email_spec.rb
│ ├── error_spec.rb
│ ├── pid_spec.rb
│ └── step_spec.rb
└── template
├── destination.yml
├── email.erb
├── email.yml
├── plan.rb
└── source.yml
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | *.gem
3 | */.DS_Store
4 | Gemfile.lock
5 | .bundle/
6 | pid
7 | log
8 | config/databases.yml
9 | config/email.yml
10 | config/dump.yml
11 | /deploy
12 | /config
13 | .yardoc/
14 |
--------------------------------------------------------------------------------
/.ruby-version:
--------------------------------------------------------------------------------
1 | 2.1.5
2 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: false
2 |
3 | language: ruby
4 | rvm:
5 | - 2.0.0
6 | - 2.1.9
7 | - 2.2.5
8 | - 2.3.1
9 | # - jruby-19mode # TODO: We'll need a ODBC variant of the mysql2 driver
10 | services:
11 | - elasticsearch
12 | - mysql
13 |
--------------------------------------------------------------------------------
/.yardopts:
--------------------------------------------------------------------------------
1 | --asset forklift_small.jpg
2 | -
3 | README.md
4 | CHANGELOG.md
5 | CONTRIBUTING.md
6 | LICENSE.txt
7 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Change Log
2 |
3 | ## [2.0.0 - Unreleased]
4 | ### Major
5 | - Remove support for Ruby 1.9.3 and earlier. This version is no longer
6 | supported per [this announcement](https://www.ruby-lang.org/en/news/2015/02/23/support-for-ruby-1-9-3-has-ended/).
7 |
8 | ### Added
9 | - More docs around the Mysql code.
10 | - New maintainer contact information added to gem
11 |
12 | ### Fixed
13 | - Gem spec license was incorrectly referring to MIT while the license is
14 | Apache-2.0
15 |
16 | ### Changed
17 | - Transitioned `Forklift::Patterns::Mysql` methods to use an options
18 | `Hash` instead of positional parameters. See:
19 | - `.pipe`
20 | - `.incremental_pipe`
21 | - `.optimistic_pipe`
22 | - `.mysql_optimistic_import`
23 | - `.mysql_incremental_import`
24 | - `.mysql_import`
25 | - `.can_incremental_pipe?`
26 | - `.can_incremental_import?`
27 | - `Forklift::Patterns::Mysql.mysql_optimistic_import` no longer loops
28 | through all tables. This behavior was inconsistent with the semantics
29 | of similar methods and caused problems if the specific tables required
30 | different parameters to be imported properly
31 | - `Forklift::Connection::Mysql#max_timestamp` now accepts a symbol for
32 | the matcher and returns a `Time` object. If no timestamp is found
33 | either due to missing table, missing column, or empty table then the
34 | epoch is returned (`Time.at(0)`).
35 | - `Forklift::Connection::Mysql#read_since` expects a `Time` object for
36 | the second "since" parameter in accordance with the change to
37 | `#max_timestamp`.
38 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # How to Contribute
2 |
3 | ## Getting Started
4 | * Fork the repository and clone it to your local machine.
5 | * Install Ruby 2.1.5 (this should work on any 2.x version, but this is
6 | the preferred local setup)
7 | * From within the cloned forklift repository run `bundle install`
8 | * Install MySQL and Elasticsearch
9 | * You should be all set with a working dev environment.
10 |
11 | ## Running Tests
12 | To run this test suite, you will need access to both a MySQL and
13 | Elasticsearch database. Test configurations are saved in
14 | `/spec/config/connections`. They assume that you have MySQL listening on
15 | `127.0.0.1:3306` and can be accessed with a user named `root` and with
16 | no password. Elasticsearch is expected to be listening on
17 | `127.0.0.1:9200`.
18 |
19 | The MySQL tests will create and auto-populate 4 databases:
20 |
21 | * `forklift_test_destination`
22 | * `forklift_test_source_a`
23 | * `forklift_test_source_b`
24 | * `forklift_test_working`
25 |
26 | You can run the whole suite of tests by running `rake`.
27 |
28 | # Change Logs
29 | When submitting a patch ensure you include an update to the relevant
30 | section of the [CHANGELOG](CHANGELOG.md). Use
31 | [http://keepachangelog.com/](http://keepachangelog.com/) as a guide.
32 |
--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | # Specify your gem's dependencies in gemspec
4 | gemspec
5 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright {yyyy} {name of copyright owner}
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Forklift ETL
2 |
3 | Moving heavy databases around. [![Gem Version](https://badge.fury.io/rb/forklift_etl.svg)](http://badge.fury.io/rb/forklift_etl)
4 | [![Build Status](https://travis-ci.org/taskrabbit/forklift.svg?branch=master)](http://travis-ci.org/taskrabbit/forklift)
5 |
6 | ![Forklift](forklift.jpg)
7 |
8 | ## What?
9 |
10 | [Forklift](https://github.com/taskrabbit/forklift) is a Ruby gem that makes it easy for you to move your data around. Forklift can be an integral part of your data warehouse pipeline or a backup tool. Forklift can collect and collapse data from multiple sources or across a single source. In forklift's first version, it was only a MySQL tool, but now you can create transports to deal with the data of your choice.
11 |
12 | ## Set up
13 |
14 | Make a new directory with a `Gemfile` like this:
15 | ```ruby
16 | source 'http://rubygems.org'
17 | gem 'forklift_etl'
18 | ```
19 |
20 | Then `bundle`
21 |
22 | Use the generator by doing `(bundle exec) forklift --generate`
23 |
24 | Make your `plan.rb` using the examples below.
25 |
26 | Run your plan `forklift plan.rb`
27 | You can run specific parts of your plan like `forklift plan.rb step1 step5`
28 |
29 | ### Directory structure
30 | Forklift expects your project to be arranged like:
31 |
32 | ```bash
33 | ├── config/
34 | | ├── email.yml
35 | ├── connections/
36 | | ├── mysql/
37 | | ├── (DB).yml
38 | | ├── elasticsearch/
39 | | ├── (DB).yml
40 | | ├── csv/
41 | | ├── (file).yml
42 | ├── log/
43 | ├── pid/
44 | ├── template/
45 | ├── patterns/
46 | ├── transformations/
47 | ├── Gemfile
48 | ├── Gemfile.lock
49 | ├── plan.rb
50 | ```
51 |
52 | To enable a forklift connection, all you need to do is place the yml config file for it within `/config/connections/(type)/(name).yml`
53 | Files you place within `/patterns/` or `connections/(type)/` will be loaded automatically.
54 |
55 | ## Examples
56 |
57 | ### Example Project
58 |
59 | Visit the [`/example`](https://github.com/taskrabbit/forklift/tree/master/example) directory to see a whole forklift project.
60 |
61 | ### Simple extract and load (no transformations)
62 |
63 | If you have multiple databases and want to consolidate into one, this plan
64 | should suffice.
65 |
66 | ```ruby
67 | plan = Forklift::Plan.new
68 |
69 | plan.do! do
70 | # ==> Connections
71 | service1 = plan.connections[:mysql][:service1]
72 | service2 = plan.connections[:mysql][:service2]
73 | analytics_working = plan.connections[:mysql][:analytics_working]
74 | analytics = plan.connections[:mysql][:analytics]
75 |
76 | # ==> Extract
77 | # Load data from your services into your working database
78 | # If you want every table: service1.tables.each do |table|
79 | # Data will be extracted in 1000 row collections
80 | %w(users organizations).each do |table|
81 | service1.read("select * from `#{table}`") { |data| analytics_working.write(data, table) }
82 | end
83 |
84 | %w(orders line_items).each do |table|
85 | service2.read("select * from `#{table}`") { |data| analytics_working.write(data, table) }
86 | end
87 |
88 | # ==> Load
89 | # Load data from the working database to the final database
90 | analytics_working.tables.each do |table|
91 | # will attempt to do an incremental pipe, will fall back to a full table copy
92 | # by default, incremental updates happen off of the `updated_at` column, but you can modify this by setting the `matcher` in the options
93 | # If you want a full pipe instead of incremental, then just use `pipe` instead of `optimistic_pipe`
94 | # The `pipe pattern` works within the same database. To copy across databases, try the `mysql_optimistic_import` method
95 | # This example shows the options with their default values.
96 | Forklift::Patterns::Mysql.optimistic_pipe(analytics_working.current_database, table, analytics.current_database, table, matcher: 'updated_at', primary_key: 'id')
97 | end
98 | end
99 | ```
100 |
101 | ### Simple MySQL ETL
102 | ```ruby
103 | plan = Forklift::Plan.new
104 | plan.do! do
105 | # Do some SQL transformations
106 | # SQL transformations are done exactly as they are written
107 | destination = plan.connections[:mysql][:destination]
108 | destination.exec!("./transformations/combined_name.sql")
109 |
110 | # Do some Ruby transformations
111 | # Ruby transformations expect `do!(connection, forklift)` to be defined
112 | destination = plan.connections[:mysql][:destination]
113 | destination.exec!("./transformations/email_suffix.rb")
114 |
115 | # mySQL Dump the destination
116 | destination = plan.connections[:mysql][:destination]
117 | destination.dump('/tmp/destination.sql.gz')
118 | end
119 | ```
120 |
121 | ### Elasticsearch to MySQL
122 | ```ruby
123 | plan = Forklift::Plan.new
124 | plan.do! do
125 | source = plan.connections[:elasticsearch][:source]
126 | destination = plan.connections[:mysql][:destination]
127 | table = 'es_import'
128 | index = 'aaa'
129 | query = { query: { match_all: {} } } # pagination will happen automatically
130 | destination.truncate!(table) if destination.tables.include? table
131 | source.read(index, query) {|data| destination.write(data, table) }
132 | end
133 | ```
134 |
135 | ### MySQL to Elasticsearch
136 | ```ruby
137 | plan = Forklift::Plan.new
138 | plan.do! do
139 | source = plan.connections[:mysql][:source]
140 | destination = plan.connections[:elasticsearch][:source]
141 | table = 'users'
142 | index = 'users'
143 | query = "select * from users" # pagination will happen automatically
144 | source.read(query) {|data| destination.write(data, table, true, 'user') }
145 | end
146 | ```
147 |
148 | ## Forklift Emails
149 |
150 | #### Setup
151 | Put this at the end of your plan inside the `do!` block.
152 |
153 | ```ruby
154 | # ==> Email
155 | # Let your team know the outcome. Attaches the log.
156 | email_args = {
157 | to: "team@yourcompany.com",
158 | from: "Forklift",
159 | subject: "Forklift has moved your database @ #{Time.new}",
160 | body: "So much data!"
161 | }
162 | plan.mailer.send(email_args, plan.logger.messages)
163 | ```
164 |
165 | #### ERB templates
166 | You can get fancy by using an ERB template for your email and SQL variables:
167 |
168 | ```ruby
169 | # ==> Email
170 | # Let your team know the outcome. Attaches the log.
171 | email_args = {
172 | to: "team@yourcompany.com",
173 | from: "Forklift",
174 | subject: "Forklift has moved your database @ #{Time.new}"
175 | }
176 | email_variables = {
177 | total_users_count: service1.read('select count(1) as "count" from users')[0][:count]
178 | }
179 | email_template = "./template/email.erb"
180 | plan.mailer.send_template(email_args, email_template, email_variables, plan.logger.messages)
181 | ```
182 |
183 | Then in `template/email.erb`:
184 |
185 | ```erb
186 | <h1>Your forklift email</h1>
187 |
188 | <ul>
189 |   <li>Total Users: <%= @total_users_count %></li>
190 | </ul>
191 | ```
192 |
193 | #### Config
194 | When you run `forklift --generate`, we create `config/email.yml` for you:
195 |
196 | ```yml
197 | # Configuration is passed to Pony (https://github.com/benprew/pony)
198 |
199 | # ==> SMTP
200 | # If testing locally, mailcatcher (https://github.com/sj26/mailcatcher) is a helpful gem
201 | via: smtp
202 | via_options:
203 | address: localhost
204 | port: 1025
205 | # user_name: user
206 | # password: password
207 | # authentication: :plain # :plain, :login, :cram_md5, no auth by default
208 | # domain: "localhost.localdomain" # the HELO domain provided by the client to the server
209 |
210 | # ==> Sendmail
211 | # via: sendmail
212 | # via_options:
213 | # location: /usr/sbin/sendmail
214 | # arguments: '-t -i'
215 | ```
216 |
217 | ## Workflow
218 |
219 | ```ruby
220 | # do! is a wrapper around common setup methods (pidfile locking, setting up the logger, etc)
221 | # you don't need to use do! if you want finer control
222 | def do!
223 | # you can use `plan.logger.log` in your plan for logging
224 | self.logger.log "Starting forklift"
225 |
226 | # use a pidfile to ensure that only one instance of forklift is running at a time; store the file if OK
227 | self.pid.safe_to_run?
228 | self.pid.store!
229 |
230 | # this will load all connections in /config/connections/#{type}/#{name}.yml into the plan.connections hash
231 | # and build all the connection objects (and try to connect in some cases)
232 | self.connect!
233 |
234 | yield # your stuff here!
235 |
236 | # remove the pidfile
237 | self.logger.log "Completed forklift"
238 | self.pid.delete!
239 | end
240 |
241 | ```
242 |
243 | ### Steps
244 |
245 | You can optionally divide up your forklift plan into steps:
246 |
247 | ```ruby
248 | plan = Forklift::Plan.new
249 | plan.do! do
250 |
251 | plan.step('Mysql Import'){
252 | source = plan.connections[:mysql][:source]
253 | destination = plan.connections[:mysql][:destination]
254 | source.tables.each do |table|
255 | Forklift::Patterns::Mysql.optimistic_pipe(source, table, destination, table)
256 | end
257 | }
258 |
259 | plan.step('Elasticsearch Import'){
260 | source = plan.connections[:elasticsearch][:source]
261 | destination = plan.connections[:mysql][:destination]
262 | table = 'es_import'
263 | index = 'aaa'
264 | query = { query: { match_all: {} } } # pagination will happen automatically
265 | destination.truncate!(table) if destination.tables.include? table
266 | source.read(index, query) {|data| destination.write(data, table) }
267 | }
268 |
269 | end
270 | ```
271 |
272 | When you use steps, you can run your whole plan, or just part of it with command-line arguments. For example, `forklift plan.rb "Elasticsearch Import"` would just run that single portion of the plan. Note that any parts of your plan not within a step will be run each time.
273 |
274 | ### Error Handling
275 |
276 | By default, exceptions within your plan will raise and crash your application. However, you can pass an optional `error_handler` lambda to your step to define how to handle the error. The `error_handler` will be passed (`step_name`, `exception`). If you don't re-raise within your error handler, your plan will continue to execute. For example:
277 |
278 | ```ruby
279 |
280 | error_handler = lambda { |name, exception|
281 | if exception.class.to_s =~ /connection/
282 | # I can't connect, I should halt
283 | raise exception
284 | elsif exception.class.to_s =~ /SoftError/
285 | # this type of error is OK
286 | else
287 | raise exception
288 | end
289 | }
290 |
291 | plan.step('a_complex_step', error_handler){
292 | # ...
293 | }
294 |
295 | ```
296 |
297 | ## Transports
298 |
299 | Transports are how you interact with your data. Every transport defines `read` and `write` methods which handle arrays of data objects (and the helper methods required).
300 |
301 | Each transport should have a config file in `./config/connections/#{transport}/`. It will be loaded at boot.
302 |
303 | Transports optionally define helper methods which are a shortcut to copy data *within* a transport, like the mysql `pipe` methods (i.e.: `insert into #{to_db}.#{to_table} select * from #{from_db}.#{from_table}`). A transport may also define other helpers (like how to create a MySQL dump). These should be defined in `/patterns/#{type}.rb` within the `Forklift::Patterns::#{type}` namespace.
304 |
305 | ### Creating your own transport
306 |
307 | In the `/connections` directory in your project, create a file that defines at least the following:
308 |
309 | ```ruby
310 | module Forklift
311 | module Connection
312 | class Mixpanel < Forklift::Base::Connection
313 |
314 | def initialize(config, forklift)
315 | @config = config
316 | @forklift = forklift
317 | end
318 |
319 | def config
320 | @config
321 | end
322 |
323 | def forklift
324 | @forklift
325 | end
326 |
327 | def read(index, query, args)
328 | # ...
329 | data = [] # data is an array of hashes
330 | # ...
331 | if block_given?
332 | yield data
333 | else
334 | return data
335 | end
336 | end
337 |
338 | def write(data, table)
339 | # data is an array of hashes
340 | # "table" can be any argument(s) you need to know where/how to write
341 | # ...
342 | end
343 |
344 | def pipe(from_table, from_db, to_table, to_db)
345 | # ...
346 | end
347 |
348 | private
349 |
350 | #/private
351 |
352 | end
353 | end
354 | end
355 | ```
356 |
357 | Existing transports and patterns for them are documented [here](http://www.rubydoc.info/gems/forklift_etl)
358 | ### MySQL
359 |
360 | - [Transport](http://www.rubydoc.info/gems/forklift_etl/Forklift/Connection/Mysql)
361 | - [Patterns](http://www.rubydoc.info/gems/forklift_etl/Forklift/Patterns/Mysql)
362 |
363 | ### Elasticsearch
364 |
365 | - [Transport](http://www.rubydoc.info/gems/forklift_etl/Forklift/Connection/Elasticsearch)
366 | - [Patterns](http://www.rubydoc.info/gems/forklift_etl/Forklift/Patterns/Elasticsearch)
367 |
368 | ### Csv
369 |
370 | - [Transport](http://www.rubydoc.info/gems/forklift_etl/Forklift/Connection/Csv)
371 |
372 | ## Transformations
373 |
374 | Forklift allows you to create both Ruby transformations and script transformations.
375 |
376 | - It is up to the transport to define `exec_script`, and not all transports will support it. Mysql can run `.sql` files, but there is not an equivalent for elasticsearch. Mysql scripts evaluate statement by statement. The delimiter (by default `;`) can be redefined using the `delimiter` command as described [here](http://dev.mysql.com/doc/refman/5.7/en/stored-programs-defining.html)
377 | - `.exec` runs and logs exceptions, while `.exec!` will raise on an error. For example, `destination.exec("./transformations/cleanup.rb")` will run cleanup.rb on the destination database.
378 | - Script files are run as-is, but ruby transformations must define a `do!` method in their class and are passed `def do!(connection, forklift, args)`
379 | - args is optional, and can be passed in from your plan
380 |
381 | ```ruby
382 | # Example transformation to count users
383 | # count_users.rb
384 |
385 | class CountUsers
386 | def do!(connection, forklift, args)
387 | forklift.logger.log "counting users"
388 | count = connection.count('users')
389 | forklift.logger.log "[#{args.name}] found #{count} users"
390 | end
391 | end
392 | ```
393 |
394 | ```ruby
395 | # in your plan.rb
396 | plan = Forklift::Plan.new
397 | plan.do! do
398 | destination = plan.connections[:mysql][:destination]
destination.exec!("./transformations/count_users.rb", {name: 'user counter'})
400 |
401 | end
402 | ```
403 |
404 | ## Options & Notes
405 | - Thanks to [@rahilsondhi](https://github.com/rahilsondhi), [@rgarver](https://github.com/rgarver) and [Looksharp](https://www.looksharp.com/) for all their help
406 | - email_options is a hash consumed by the [Pony mail gem](https://github.com/benprew/pony)
407 | - Forklift's logger is [Lumberjack](https://github.com/bdurand/lumberjack) with a wrapper to also echo the log lines to stdout and save them to an array to be accessed later by the email system.
408 | - The mysql connections hash will be passed directly to a [mysql2](https://github.com/brianmario/mysql2) connection.
409 | - The elasticsearch connections hash will be passed directly to a [elasticsearch](https://github.com/elasticsearch/elasticsearch-ruby) connection.
410 | - Your databases must exist. Forklift will not create them for you.
- Ensure your databases have the right encoding (eg utf8) or you will get encoding errors from mysql2 (e.g. `Mysql2::Error: Incorrect string value`)
412 | - If testing locally, mailcatcher (https://github.com/sj26/mailcatcher) is a helpful gem to test your email sending
413 |
414 | ## Contributing and Testing
415 | See: [CONTRIBUTING](CONTRIBUTING.md)
416 |
417 | ## Alternatives
418 | If you want something similar for Node.js try [Empujar](https://github.com/taskrabbit/empujar)
419 |
--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env rake
2 | require 'rake'
3 | require "bundler/gem_tasks"
4 | require 'rspec/core/rake_task'
5 |
6 | RSpec::Core::RakeTask.new(:spec) do |spec|
7 | spec.pattern = [
8 | 'spec/*/*_spec.rb',
9 | 'spec/*/*/*_spec.rb',
10 | 'spec/*/*/*/*_spec.rb',
11 | ]
12 | spec.rspec_opts = '--format documentation'
13 | end
14 |
15 | task default: :spec
--------------------------------------------------------------------------------
/bin/forklift:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env ruby
2 |
3 | require 'rubygems'
4 | require 'fileutils'
5 |
6 | begin
7 | require 'forklift'
8 | rescue LoadError
9 | require "#{File.expand_path(File.dirname(__FILE__))}/../lib/forklift.rb"
10 | end
11 |
# Scaffold a new forklift project in the current working directory:
# config/, log/, pid/, template/, transformations/, transports/ and
# patterns/ directories plus example configuration and plan files.
def generate
  p = Dir.pwd

  # FileUtils.mkdir_p is idempotent (and creates parents), so re-running
  # --generate in an existing project no longer crashes with
  # Errno::EEXIST the way Dir.mkdir did.
  FileUtils.mkdir_p "#{p}/config/connections/mysql"
  FileUtils.mkdir_p "#{p}/config/connections/elasticsearch"
  FileUtils.mkdir_p "#{p}/config/connections/csv"
  FileUtils.mkdir_p "#{p}/log"
  FileUtils.mkdir_p "#{p}/pid"
  FileUtils.mkdir_p "#{p}/template"
  FileUtils.mkdir_p "#{p}/transformations"
  FileUtils.mkdir_p "#{p}/transports"
  FileUtils.mkdir_p "#{p}/patterns"

  template('source.yml', "#{p}/config/connections/mysql/source.yml")
  template('destination.yml', "#{p}/config/connections/mysql/destination.yml")
  template('email.yml', "#{p}/config/email.yml")
  template('email.erb', "#{p}/template/email.erb")
  template('plan.rb', "#{p}/plan.rb")
end
33 |
# Copy one bundled template file into the generated project.
#
# source      - file name inside the gem's template/ directory
# destination - absolute path to write the copy to
def template(source, destination)
  t = "#{File.expand_path(File.dirname(__FILE__))}/../template"
  FileUtils.copy("#{t}/#{source}", destination)
  # Report the individual file; the previous message claimed the whole
  # example plan was generated once per copied file.
  puts "Generated #{destination}"
end
39 |
40 |
# Load and execute the plan file given as ARGV[0]. The working directory
# is switched to the plan's directory first, and the project's bundle
# (if any) is loaded best-effort.
def run_plan
  # Validate the argument before building any paths from it.
  if ARGV[0].nil?
    puts "[error] Please provide a plan.rb as the first argument"
    exit(1)
  end
  file = "#{Dir.pwd}/#{ARGV[0]}"
  Dir.chdir File.expand_path(File.dirname(ARGV[0]))
  begin
    require 'bundler'
    Bundler.require(:default)
  rescue Exception => e
    # Best-effort: a project may legitimately run without a Gemfile.
    puts "cannot load bundler: #{e}"
  end
  require file
end
56 |
############

# CLI entry point: `forklift --generate` scaffolds a new project in the
# current directory; any other invocation treats ARGV[0] as a plan file
# to execute.
if ['--generate', '-generate'].include?(ARGV[0])
  generate
else
  run_plan
end
64 |
--------------------------------------------------------------------------------
/example/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | gem 'forklift_etl', path: '../'
4 |
--------------------------------------------------------------------------------
/example/Gemfile.lock:
--------------------------------------------------------------------------------
1 | PATH
2 | remote: ../
3 | specs:
4 | forklift (1.0.3)
5 | activesupport
6 | elasticsearch
7 | lumberjack
8 | mysql2
9 | pony
10 |
11 | GEM
12 | remote: https://rubygems.org/
13 | specs:
14 | activesupport (4.0.3)
15 | i18n (~> 0.6, >= 0.6.4)
16 | minitest (~> 4.2)
17 | multi_json (~> 1.3)
18 | thread_safe (~> 0.1)
19 | tzinfo (~> 0.3.37)
20 | atomic (1.1.15)
21 | elasticsearch (1.0.1)
22 | elasticsearch-api (= 1.0.1)
23 | elasticsearch-transport (= 1.0.1)
24 | elasticsearch-api (1.0.1)
25 | multi_json
26 | elasticsearch-transport (1.0.1)
27 | faraday
28 | multi_json
29 | faraday (0.9.0)
30 | multipart-post (>= 1.2, < 3)
31 | i18n (0.6.9)
32 | lumberjack (1.0.5)
33 | mail (2.5.4)
34 | mime-types (~> 1.16)
35 | treetop (~> 1.4.8)
36 | mime-types (1.25.1)
37 | minitest (4.7.5)
38 | multi_json (1.9.2)
39 | multipart-post (2.0.0)
40 | mysql2 (0.3.15)
41 | polyglot (0.3.4)
42 | pony (1.8)
43 | mail (>= 2.0)
44 | thread_safe (0.2.0)
45 | atomic (>= 1.1.7, < 2)
46 | treetop (1.4.15)
47 | polyglot
48 | polyglot (>= 0.3.1)
49 | tzinfo (0.3.39)
50 |
51 | PLATFORMS
52 | ruby
53 |
54 | DEPENDENCIES
55 | forklift!
56 |
--------------------------------------------------------------------------------
/example/config/connections/csv/csv.yml:
--------------------------------------------------------------------------------
1 | :file: /path/to/file.csv
--------------------------------------------------------------------------------
/example/config/connections/elasticsearch/source.yml:
--------------------------------------------------------------------------------
1 | :host: http://localhost:9200
2 |
--------------------------------------------------------------------------------
/example/config/connections/mysql/destination.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :database: destination
3 | :username: root
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/example/config/connections/mysql/source.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :database: source
3 | :username: root
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/example/config/email.yml:
--------------------------------------------------------------------------------
1 | # Configuration is passed to Pony (https://github.com/benprew/pony)
2 |
3 | # ==> SMTP
4 | # If testing locally, mailcatcher (https://github.com/sj26/mailcatcher) is a helpful gem
5 | via: smtp
6 | via_options:
7 | address: localhost
8 | port: 1025
9 | # user_name: user
10 | # password: password
11 | # authentication: :plain # :plain, :login, :cram_md5, no auth by default
12 | # domain: "localhost.localdomain" # the HELO domain provided by the client to the server
13 |
14 | # ==> Sendmail
15 | # via: sendmail
16 | # via_options:
17 | # location: /usr/sbin/sendmail
18 | # arguments: '-t -i'
19 |
--------------------------------------------------------------------------------
/example/plan.rb:
--------------------------------------------------------------------------------
# plan = Forklift::Plan.new
# Or, you can pass configs
plan = Forklift::Plan.new({
  # logger: {debug: true}
})

plan.do! {
  # do! is a wrapper around common setup methods (pidfile locking, setting up the logger, etc)
  # you don't need to use do! if you want finer control

  # cleanup from a previous run
  plan.step('Cleanup'){
    destination = plan.connections[:mysql][:destination]
    destination.exec("./transformations/cleanup.sql")
  } # FIX: this {} block was previously closed with `end`, a syntax error

  # mySQL -> mySQL
  plan.step('Mysql Import'){
    source = plan.connections[:mysql][:source]
    destination = plan.connections[:mysql][:destination]
    source.tables.each do |table|
      Forklift::Patterns::Mysql.optimistic_pipe(source, table, destination, table)
      # will attempt to do an incremental pipe, will fall back to a full table copy
      # by default, incremental updates happen off of the `created_at` column, but you can modify this with "matcher"
    end
  }

  # Elasticsearch -> mySQL
  plan.step('Elasticsearch Import'){
    source = plan.connections[:elasticsearch][:source]
    destination = plan.connections[:mysql][:destination]
    table = 'es_import'
    index = 'aaa'
    query = { query: { match_all: {} } } # pagination will happen automatically
    destination.truncate!(table) if destination.tables.include? table
    source.read(index, query) {|data| destination.write(data, table) }
  }

  # mySQL -> Elasticsearch
  plan.step('Elasticsearch Load'){
    source = plan.connections[:mysql][:source]
    destination = plan.connections[:elasticsearch][:source]
    table = 'users'
    index = 'users'
    query = "select * from users" # pagination will happen automatically
    source.read(query) {|data| destination.write(data, table, true, 'user') }
  }

  # ... and you can write your own connections (see the "Write Your Own Transport" section of the README)

  # Do some SQL transformations
  plan.step('Transformations'){
    # SQL transformations are done exactly as they are written
    destination = plan.connections[:mysql][:destination]
    destination.exec!("./transformations/combined_name.sql")

    # Do some Ruby transformations
    # Ruby transformations expect `do!(connection, forklift)` to be defined
    destination = plan.connections[:mysql][:destination]
    destination.exec!("./transformations/email_suffix.rb")
  }

  # mySQL Dump the destination
  plan.step('Mysql Dump'){
    destination = plan.connections[:mysql][:destination]
    destination.dump('/tmp/destination.sql.gz')
  }

  # email the logs and a summary
  plan.step('Email'){
    destination = plan.connections[:mysql][:destination]

    email_args = {
      to: "YOU@FAKE.com",
      from: "Forklift",
      # FIX: was `subject: "value", "Forklift has ..."` — a syntax error
      subject: "Forklift has moved your database @ #{Time.new}",
    }

    email_variables = {
      total_users_count: destination.read('select count(1) as "count" from users')[0][:count],
      new_users_count: destination.read('select count(1) as "count" from users where date(created_at) = date(NOW())')[0][:count],
    }

    email_template = "./template/email.erb"
    plan.mailer.send_template(email_args, email_template, email_variables, plan.logger.messages) unless ENV['EMAIL'] == 'false'
  }
}
88 |
--------------------------------------------------------------------------------
/example/template/email.erb:
--------------------------------------------------------------------------------
1 | Your forklift email
2 |
3 |
4 | - Total Users: <%= @total_users_count %>
5 | - New Users: <%= @new_users_count %>
6 |
7 |
--------------------------------------------------------------------------------
/example/transformations/cleanup.sql:
--------------------------------------------------------------------------------
1 | ALTER TABLE `users` DROP `combined_name`;
2 |
--------------------------------------------------------------------------------
/example/transformations/combined_name.sql:
--------------------------------------------------------------------------------
1 | ALTER TABLE `users` ADD `combined_name` VARCHAR(255) NULL DEFAULT NULL AFTER `last_name`;
2 |
3 | UPDATE `users` SET `combined_name` = (
4 | select CONCAT(first_name, " ", last_name)
5 | );
6 |
7 | CREATE INDEX combined_name ON users (combined_name);
8 |
--------------------------------------------------------------------------------
/example/transformations/email_suffix.rb:
--------------------------------------------------------------------------------
# Example ruby transformation: tally the domain portion of every user's
# email address and log any domain seen more than five times.
class EmailSuffix

  def do!(connection, forklift)
    forklift.logger.log "collecting email suffixes..."

    # A default-0 hash removes the explicit nil check per domain.
    counts = Hash.new(0)
    connection.read("select email from users") do |rows|
      rows.each { |user| counts[user[:email].split('@').last] += 1 }
    end

    counts.each do |domain, seen|
      forklift.logger.log " > #{domain}: #{seen}" if seen > 5
    end
  end

end
21 |
--------------------------------------------------------------------------------
/forklift.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taskrabbit/forklift/e4261da656cb0af77cea37deb07502e24bb1abe2/forklift.jpg
--------------------------------------------------------------------------------
/forklift_etl.gemspec:
--------------------------------------------------------------------------------
# -*- encoding: utf-8 -*-
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'forklift/version'

Gem::Specification.new do |s|
  s.name        = "forklift_etl"
  s.version     = Forklift::VERSION
  s.authors     = ["Evan Tahler", "Ryan Garver"]
  s.email       = ["evan@taskrabbit.com", "ragarver@gmail.com"]
  s.homepage    = "https://github.com/taskrabbit/forklift"
  s.summary     = %q{Forklift: Moving big databases around. A ruby ETL tool.}
  s.description = %q{A collection of ETL tools and patterns for mysql and elasticsearch.}
  s.license     = "Apache-2.0"

  # NOTE: the deprecated `rubyforge_project` attribute was removed;
  # RubyForge is defunct and RubyGems ignores/deprecates the setting.

  # Package everything tracked by git; bin/* become executables.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_dependency "activesupport", '~> 4.0', ">= 4.0.0"
  s.add_dependency "mysql2", '~> 0.0', ">= 0.0.1"
  s.add_dependency "elasticsearch", '~> 1.0', ">= 1.0.0"
  s.add_dependency "pony", '~> 1.0', ">= 1.0.0"
  s.add_dependency "lumberjack", '~> 1.0', ">= 1.0.0"

  # Development-only dependencies (spec suite).
  s.add_development_dependency 'rake'
  s.add_development_dependency 'rspec'
  s.add_development_dependency 'email_spec'
end
32 |
--------------------------------------------------------------------------------
/forklift_small.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/taskrabbit/forklift/e4261da656cb0af77cea37deb07502e24bb1abe2/forklift_small.jpg
--------------------------------------------------------------------------------
/lib/forklift.rb:
--------------------------------------------------------------------------------
1 | require 'forklift/version'
2 |
module Forklift

  lib = File.join(File.expand_path(File.dirname(__FILE__)), 'forklift')

  # Load order matters: utils/pid/logger/mailer are used by Connection,
  # and the transports below subclass Forklift::Base::Connection.
  require "#{lib}/base/utils.rb"
  require "#{lib}/base/pid.rb"
  require "#{lib}/base/logger.rb"
  require "#{lib}/base/mailer.rb"
  require "#{lib}/base/connection.rb"

  # Built-in transports and patterns...
  Dir["#{lib}/transports/*.rb"].each {|file| require file }
  Dir["#{lib}/patterns/*.rb"].each {|file| require file }
  # ...plus any project-local ones under the current working directory
  # (so a generated project's transports/ and patterns/ dirs are loaded).
  Dir["#{Dir.pwd}/transports/*.rb"].each {|file| require file } if File.directory?("#{Dir.pwd}/transports")
  Dir["#{Dir.pwd}/patterns/*.rb"].each {|file| require file } if File.directory?("#{Dir.pwd}/patterns")

  require "#{lib}/plan.rb"
end
20 |
--------------------------------------------------------------------------------
/lib/forklift/base/connection.rb:
--------------------------------------------------------------------------------
module Forklift
  module Base
    # Abstract base class for transports (mysql, elasticsearch, csv, ...).
    # Subclasses must implement connect/disconnect/read/write and may
    # implement pipe and exec_script.
    class Connection
      attr_reader :config, :forklift, :client

      # config   - Hash of connection settings (from config/connections/*)
      # forklift - the Forklift::Plan instance (provides logger, utils, ...)
      def initialize(config, forklift)
        @config = config
        @forklift = forklift
      end

      def connect
        # Will define @client
        raise 'not implemented'
      end

      def disconnect
        raise 'not implemented'
      end

      def read(query)
        # will return an array of data rows
        raise 'not implemented'
      end

      def write(data, collection)
        # will write array data to collection (table)
        raise 'not implemented'
      end

      def pipe
        # when copying within the same connection, this method can be defined to speed things up
        raise 'not implemented'
      end

      # Run a transformation, logging (rather than raising) any error.
      def exec(path, *args)
        begin
          # BUG FIX: was `exec!(path, &args)`, which tried to convert the
          # args Array into a block and always raised a TypeError.
          exec!(path, *args)
        rescue Exception => e
          forklift.logger.log(e)
        end
      end

      # Run a transformation, raising on error. Ruby files are instantiated
      # and sent #do!; anything else goes to the transport's exec_script.
      def exec!(path, *args)
        forklift.logger.log "Running script: #{path}"
        extension = path.split(".").last
        if(extension == "rb" || extension == "ruby")
          exec_ruby(path, *args)
        else
          exec_script(path, *args)
        end
      end

      # Require the ruby file at `path`, instantiate the class named after
      # the file (email_suffix.rb => EmailSuffix), and invoke #do!.
      def exec_ruby(path, *args)
        klass = forklift.utils.class_name_from_file(path)
        require path
        model = eval("#{klass}.new")
        model.do!(self, forklift, *args)
      end

      def exec_script(path, *args)
        raise 'not implemented'
      end

    end
  end
end
67 |
--------------------------------------------------------------------------------
/lib/forklift/base/logger.rb:
--------------------------------------------------------------------------------
1 | require 'lumberjack'
2 |
module Forklift
  module Base
    # Wrapper around Lumberjack that optionally echoes to stdout and keeps
    # every line in memory so the mailer can attach the run log later.
    class Logger

      def initialize(forklift)
        @forklift = forklift
      end

      def forklift
        @forklift
      end

      # All timestamped messages logged so far, oldest first.
      def messages
        @messages ||= []
      end

      # Lazily-built Lumberjack logger writing to <project_root>/log/forklift.log.
      def logger
        log_dir = "#{forklift.config[:project_root]}/log"
        @logger ||= ::Lumberjack::Logger.new("#{log_dir}/forklift.log", buffer_size: 0)
      end

      # Log a message to file + memory, echoing to stdout only when
      # config[:logger][:stdout] is exactly true.
      def log(message, severity="info")
        timed_message = "[Forklift @ #{Time.now}] #{message}"
        # Rewritten from the double negative `unless ... != true`.
        puts timed_message if forklift.config[:logger][:stdout] == true
        logger.send(severity.to_sym, message) unless logger.nil?
        messages << timed_message
      end

      # Log only when debug mode is enabled in the plan's config.
      def debug(message)
        if forklift.config[:logger][:debug] == true
          log("[debug] #{message}")
        end
      end

      # Log a message set off by blank lines and asterisks.
      def emphatically(message)
        log "" if message.length > 0
        log "*** #{message} ***"
        log ""
      end

      def fatal(message)
        log "!!! #{message} !!!"
      end

    end
  end
end
50 |
--------------------------------------------------------------------------------
/lib/forklift/base/mailer.rb:
--------------------------------------------------------------------------------
1 | require 'pony'
2 | require 'erb'
3 | require 'active_support/core_ext/hash/keys'
4 |
module Forklift
  module Base
    # Sends plan summary emails through Pony, configured by config/email.yml.
    class Mailer

      def initialize(forklift)
        @forklift = forklift
      end

      # Public: Pull out the settings from config/email.yml.
      #
      # Returns a Hash with all symbolized keys.
      def config
        @config ||= begin
          config_file = "#{forklift.config[:project_root]}/config/email.yml"
          forklift.utils.load_yml(config_file).deep_symbolize_keys
        end
      end

      def forklift
        @forklift
      end

      # Fallback values for any message field the caller omits.
      def message_defaults
        {
          from: "Forklift",
          subject: "Forklift has moved your database @ #{Time.new}",
          body: "Forklift has moved your database @ #{Time.new}",
        }
      end

      # Render an ERB template (with `variables` exposed as @-instance
      # variables) and send the result as the message body.
      def send_template(args, template_file, variables, attachment_lines=[])
        scope = ERBBinding.new(variables)
        rendered = ERB.new(File.read(template_file)).result(scope.get_binding)
        args[:body] = rendered
        send(args, attachment_lines)
      end

      # Deliver a message; non-nil :to/:from/:subject/:body entries in
      # `args` override message_defaults. Log lines become a log.txt attachment.
      def send(args, attachment_lines=[])
        params = message_defaults
        [:to, :from, :subject, :body].each do |field|
          value = args[field]
          params[field] = value unless value.nil?
        end
        params[:attachments] = {"log.txt" => attachment_lines.join("\r\n")} if attachment_lines.length > 0
        deliver(params)
      end

      private

      # Private: Actually deliver the message using Pony.
      #
      # Returns the raw email from Pony.
      def deliver(params)
        forklift.logger.log("Sending email via #{config[:via]}")
        if params[:html_body].nil?
          params[:html_body] = params.delete(:body)
        end
        params[:via] = config[:via].to_sym
        params[:via_options] = config[:via_options]
        Pony.mail(params)
      end

      # Exposes a Hash's entries as @-instance variables so an ERB template
      # can reference them; single quotes in String values become spaces.
      class ERBBinding
        def initialize(hash)
          hash.each do |key, value|
            value = value.gsub("'", " ") if value.class == String
            instance_variable_set("@#{key}", value)
          end
        end

        def get_binding
          binding()
        end
      end

    end
  end
end
84 |
--------------------------------------------------------------------------------
/lib/forklift/base/pid.rb:
--------------------------------------------------------------------------------
require 'fileutils'

module Forklift
  module Base
    # Pidfile management: ensures only one forklift run per project at a time.
    class Pid

      def initialize(forklift)
        @forklift = forklift
      end

      def forklift
        @forklift
      end

      def pid_dir
        "#{forklift.config[:project_root]}/pid"
      end

      # Create the pid directory if needed (idempotent).
      def ensure_pid_dir
        # FileUtils instead of shelling out to `mkdir -p`.
        FileUtils.mkdir_p(pid_dir)
      end

      def pidfile
        "#{pid_dir}/pidfile"
      end

      # Record the current process id in the pidfile.
      def store!
        forklift.logger.debug "Creating pidfile @ #{pidfile}"
        ensure_pid_dir
        File.open(pidfile, 'w') {|f| f << Process.pid}
      end

      # Read the pid recorded by a previous run, or nil if none exists.
      def recall
        ensure_pid_dir
        IO.read(pidfile).to_i rescue nil
      end

      def delete!
        forklift.logger.debug "Removing pidfile @ #{pidfile}"
        FileUtils.rm(pidfile) rescue nil
      end

      # Exit(1) if the process named in the pidfile is still alive;
      # otherwise clear the stale pidfile.
      def safe_to_run?
        return if recall.nil?
        # NOTE(review): relies on `ps -p` output line count — POSIX-only.
        count = `ps -p #{recall} | wc -l`.to_i
        if count >= 2
          forklift.logger.fatal "This application is already running (pidfile) #{recall}. Exiting now"
          exit(1)
        else
          forklift.logger.log "Clearing old pidfile from previous process #{recall}"
          delete!
        end
      end

    end
  end
end
56 |
--------------------------------------------------------------------------------
/lib/forklift/base/utils.rb:
--------------------------------------------------------------------------------
1 | require 'yaml'
2 | require 'erb'
3 |
module Forklift
  module Base
    # Small shared helpers: YAML+ERB config loading and file->class naming.
    class Utils

      # Load a YAML file, first rendering it through ERB so configs can
      # embed ruby (e.g. ENV lookups).
      def load_yml(file)
        YAML.load(ERB.new(File.read(file)).result)
      end

      # "path/to/email_suffix.rb" => "EmailSuffix"
      def class_name_from_file(file)
        basename = file.split("/").last.split(".").first
        basename.split("_").map(&:capitalize).join
      end

    end
  end
end
24 |
--------------------------------------------------------------------------------
/lib/forklift/patterns/elasticsearch_patterns.rb:
--------------------------------------------------------------------------------
module Forklift
  module Patterns
    # Placeholder for elasticsearch-specific ETL patterns; none are
    # implemented yet (see Patterns::Mysql for the mysql equivalents).
    class Elasticsearch

    end
  end
end
8 |
--------------------------------------------------------------------------------
/lib/forklift/patterns/mysql_patterns.rb:
--------------------------------------------------------------------------------
1 | module Forklift
2 | module Patterns
3 | class Mysql
4 | class< `#{to_db}`.`#{to_table}`")
34 |
35 | source.q("DROP TABLE IF EXISTS `#{to_db}`.`#{tmp_table}`")
36 | source.q("CREATE TABLE `#{to_db}`.`#{tmp_table}` LIKE `#{from_db}`.`#{from_table}`")
37 | source.q("INSERT INTO `#{to_db}`.`#{tmp_table}` SELECT * FROM `#{from_db}`.`#{from_table}`")
38 | source.q("DROP TABLE IF EXISTS `#{to_db}`.`#{to_table}`")
39 | source.q("RENAME TABLE `#{to_db}`.`#{tmp_table}` TO `#{to_db}`.`#{to_table}`")
40 |
41 | delta = Time.new.to_i - start
42 | source.forklift.logger.log(" ^ moved #{destination.count(to_table, to_db)} rows in #{delta}s")
43 | end
44 |
# Incrementally copy rows from `from_table` to `to_table`. Only records
# whose `matcher` column is newer than the maximum already present in the
# destination table are moved; "stale" previously-copied rows are deleted
# first so they are re-copied fresh. All statements except the final
# INSERT are issued through the `source` connection.
#
# @param (see .pipe)
# @option options [String] :matcher ('updated_at') The datetime
#   column used to find the "newest" records in the `from_table`
# @option options [String] :primary_key ('id') The column to use
#   to determine if the row should be updated or inserted. Updates
#   are performed by deleting the old version of the row and
#   reinserting the new, updated row.
#
# @see .mysql_incremental_import
# @see .pipe
def incremental_pipe(source, from_table, destination, to_table, options={})
  start = Time.new.to_i
  from_db = source.current_database
  to_db = destination.current_database
  matcher = options[:matcher] || source.default_matcher
  primary_key = options[:primary_key] || :id
  source.forklift.logger.log("mysql incremental_pipe: `#{from_db}`.`#{from_table}` => `#{to_db}`.`#{to_table}`")
  # Ensure the destination table exists with the same schema.
  source.q("CREATE TABLE IF NOT EXISTS `#{to_db}`.`#{to_table}` LIKE `#{from_db}`.`#{from_table}`")

  # Count the number of rows in to_table
  original_count = source.count(to_table, to_db)

  # Find the latest/max/newest timestamp from the final table
  # in order to determine the last copied row.
  latest_timestamp = source.max_timestamp(to_table, matcher, to_db)

  # If to_table has existing rows, ensure none of them are "stale."
  # A stale row in to_table means a previously copied row was
  # updated in from_table, so let's delete it from the to_table
  # so we can get a fresh copy of that row.
  if original_count > 0
    # Get the ids of rows in from_table that are newer than the newest row in to_table.
    # Some of these rows could either be a) stale or b) new.
    # NOTE(review): this SELECT interpolates latest_timestamp with its
    # default to_s, while the INSERT below uses to_s(:db) — presumably
    # both compare equally in mysql; confirm the formats agree.
    source.read("SELECT `#{primary_key}` FROM `#{from_db}`.`#{from_table}` WHERE `#{matcher}` > \"#{latest_timestamp}\" ORDER BY `#{matcher}`") do |stale_rows|
      if stale_rows.length > 0
        # Delete these ids from to_table.
        # If the ids are stale, then they'll be deleted. If they're new, they won't exist, and nothing will happen.
        stale_ids = stale_rows.map { |row| row[primary_key] }.join(',')
        source.q("DELETE FROM `#{to_db}`.`#{to_table}` WHERE `#{primary_key}` IN (#{stale_ids})")
        source.forklift.logger.log(" ^ deleted up to #{stale_rows.length} stale rows from `#{to_db}`.`#{to_table}`")
      end
    end
  end

  # Do the insert into to_table
  destination.q("INSERT INTO `#{to_db}`.`#{to_table}` SELECT * FROM `#{from_db}`.`#{from_table}` WHERE `#{matcher}` > \"#{latest_timestamp.to_s(:db)}\" ORDER BY `#{matcher}`")
  delta = Time.new.to_i - start
  new_count = destination.count(to_table, to_db) - original_count
  source.forklift.logger.log(" ^ created #{new_count} new rows in #{delta}s")
end
100 |
# Attempt an {.incremental_pipe} and fall back to a {.pipe} if unable
# to run incrementally, or if the incremental attempt raises.
#
# @param (see .pipe)
# @option (see .pipe)
# @option (see .incremental_pipe)
#
# @see .pipe
# @see .incremental_pipe
def optimistic_pipe(source, from_table, destination, to_table, options={})
  # (removed two unused locals that needlessly queried current_database)
  if self.can_incremental_pipe?(source, from_table, destination, to_table, options)
    begin
      incremental_pipe(source, from_table, destination, to_table, options)
    rescue Exception => e
      source.forklift.logger.log("! incremental_pipe failure on #{from_table} => #{to_table}: #{e} ")
      source.forklift.logger.log("! falling back to pipe...")
      # Consistency fix: forward `options` on the fallback path too, as
      # the non-incremental branch below already does.
      pipe(source, from_table, destination, to_table, options)
    end
  else
    pipe(source, from_table, destination, to_table, options)
  end
end
125 |
# Attempt a {.mysql_incremental_import} and fall back to {.mysql_import}
#
# @param (see .mysql_import)
# @option (see .mysql_import)
# @option (see .mysql_incremental_import)
#
# @see .mysql_import
# @see .mysql_incremental_import
def mysql_optimistic_import(source, from_table, destination, to_table, options={})
  # Guard clause: fall straight through to a full import when an
  # incremental one isn't possible.
  unless self.can_incremental_import?(source, from_table, destination, to_table, options)
    return self.mysql_import(source, from_table, destination, to_table, options)
  end
  begin
    self.mysql_incremental_import(source, from_table, destination, to_table, options)
  rescue Exception => e
    source.forklift.logger.log("! incremental import failure on #{from_table} => #{to_table}: #{e} ")
    source.forklift.logger.log("! falling back to import...")
    self.mysql_import(source, from_table, destination, to_table, options)
  end
end
147 |
# Look up the table's PRIMARY index column (as a symbol), defaulting to :id
# when no primary key can be determined.
def detect_primary_key_or_default(source, from_table)
  first_index_row = source.q("SHOW INDEX FROM `#{source.current_database}`.`#{from_table}` WHERE key_name = 'PRIMARY';").try(:first)
  first_index_row.try(:[], :Column_name).try(:to_sym) || :id
end
151 |
# Import table from one mysql instance to another incrementally.
#
# @param (see .mysql_import)
# @option options [String] :matcher ('updated_at') The datetime
#   column used to find the "newest" records in the `from_table`
#
# @see .mysql_import
# @see .incremental_pipe
def mysql_incremental_import(source, from_table, destination, to_table, options={})
  matcher = options[:matcher] || source.default_matcher
  primary_key = detect_primary_key_or_default(source, from_table)

  # Only rows newer than the destination's high-water mark are read.
  since = destination.max_timestamp(to_table, matcher)
  source.read_since(from_table, since, matcher) do |rows|
    destination.write(rows, to_table, true, destination.current_database, primary_key)
  end
end
167 |
# Pull a table from the `source` database in to the `destination` database.
# This is an unoptimized version of {.pipe}. Unlike {.pipe} this method can
# pull records from one mysql instance in to another. The `to_table` at the
# `destination` database will get a `DROP` if it exists.
#
# @param (see .pipe)
#
# @return
#
# @see .pipe
def mysql_import(source, from_table, destination, to_table, options={})
  primary_key = detect_primary_key_or_default(source, from_table)

  # destination.truncate table
  destination.drop!(to_table) if destination.tables.include?(to_table)
  source.read("SELECT * FROM #{from_table}") do |rows|
    destination.write(rows, to_table, true, destination.current_database, primary_key)
  end
end
185 |
# The high water method will stub a row in all tables with a
# `default_matcher` column pretending to have a record from `time`.
# This enables partial forklift runs which will only extract data
# "later than X".
#
# @todo assumes all columns have a default NULL setting
def write_high_water_mark(db, time, matcher=db.default_matcher)
  db.tables.each do |table|
    columns, types = db.columns(table, db.current_database, true)
    next unless columns.include?(matcher)
    row = {}
    columns.each_with_index do |column, idx|
      row[column] =
        if column == matcher
          time.to_s(:db)
        elsif types[idx] =~ /text/
          "~~stub~~"
        elsif types[idx] =~ /varchar/
          # NOTE(review): a Symbol (not a String) is written for varchar
          # columns in the original code; preserved as-is — confirm intent.
          "~~stub~~".to_sym
        elsif types[idx] =~ /float/ || types[idx] =~ /int/ || types[idx] =~ /decimal/
          0
        elsif types[idx] =~ /datetime/ || types[idx] =~ /timestamp/
          time.to_s(:db)
        elsif types[idx] =~ /date/
          time.to_s(:db).split(" ").first
        else
          "NULL"
        end
    end
    db.write([row], table)
  end
end
218 |
219 | # Tests if a particular pipe parameterization can be performed incrementally
220 | #
221 | # @param (see .incremental_pipe)
222 | #
223 | # @return [true|false]
224 | def can_incremental_pipe?(source, from_table, destination, to_table, options={})
225 | matcher = options[:matcher] || source.default_matcher
226 | return false unless source.tables.include?(from_table)
227 | return false unless destination.tables.include?(to_table)
228 | source_cols = source.columns(from_table, source.current_database)
229 | destination_cols = destination.columns(to_table, destination.current_database)
230 | return false unless source_cols.include?(matcher)
231 | return false unless destination_cols.include?(matcher)
232 | source_cols.each do |source_col|
233 | return false unless destination_cols.include?(source_col)
234 | end
235 | destination_cols.each do |destination_col|
236 | return false unless source_cols.include?(destination_col)
237 | end
238 | true
239 | end
240 |
241 | # Tests if a particular import parameterization can be performed incrementally
242 | #
243 | # @param (see .mysql_incremental_import)
244 | #
245 | # @return [true|false]
246 | def can_incremental_import?(source, from_table, destination, to_table, options={})
247 | matcher = options[:matcher] || source.default_matcher
248 | source.columns(from_table).include?(matcher) && destination.tables.include?(to_table) && destination.columns(to_table).include?(matcher)
249 | end
250 | end
251 | end
252 | end
253 | end
254 |
--------------------------------------------------------------------------------
/lib/forklift/plan.rb:
--------------------------------------------------------------------------------
1 | require 'active_support/all'
2 |
module Forklift
  # A Plan orchestrates a forklift run: it loads connection configs from the
  # project, registers named steps, and executes them in order while a pidfile
  # guarantees only one forklift instance runs at a time.
  class Plan

    # @param config [Hash] overrides merged over {#default_config}
    def initialize(config={})
      @config = default_config.merge(config)
      @utils = Forklift::Base::Utils.new
      @pid = Forklift::Base::Pid.new(self)
      @logger = Forklift::Base::Logger.new(self)
      @mailer = Forklift::Base::Mailer.new(self)
      @connections = {}
      @steps = {}
    end

    def connections; @connections end
    def steps; @steps end
    def config; @config end
    def logger; @logger end
    def mailer; @mailer end
    def utils; @utils end
    def pid; @pid end

    # Load every config/connections/#{type}/#{name}.yml into
    # connections[:type][:name], build the transport object, and connect it.
    def connect!
      files = Dir["#{config[:project_root]}/config/connections/**/*.yml"]
      files.each do |f|
        next if f.include?('example.yml')
        name = f.split("/")[-1].split('.')[0]
        type = f.split("/")[-2]
        connections[type.to_sym] = {} if connections[type.to_sym].nil?
        db_config = utils.load_yml(f)

        klass = "Forklift::Connection::#{type.camelcase}"
        begin
          # Resolve the transport class by constant lookup (e.g. "mysql" =>
          # Forklift::Connection::Mysql) instead of eval'ing a code string
          # assembled from the filesystem path.
          connection = Forklift::Connection.const_get(type.camelcase).new(db_config, self)
          connection.connect
          connections[type.to_sym][name.to_sym] = connection
          logger.debug "loaded a #{type.camelcase} connection from #{f}"
        rescue Exception => e
          logger.fatal "cannot create a class type of #{klass} from #{f} | #{e}"
          # raise e ## Don't raise here, but let a step fail so the error_handler can report
        end
      end
    end

    # Disconnect every connection built by {#connect!}.
    def disconnect!
      connections.each do |_type, collection|
        collection.each do |_name, connection|
          connection.disconnect
        end
      end
    end

    # By default a failing step re-raises, aborting the run.
    def default_error_handler
      return lambda {|name, e| raise e }
    end

    # Register a named step. args[0] is the step name; the optional args[1] is
    # an error handler lambda (name, exception). Steps run later via {#do_step!}.
    def step(*args, &block)
      name = args[0].to_sym
      error_handler = default_error_handler
      error_handler = args[1] unless args[1].nil?
      self.steps[name] = {
        ran: false,
        to_run: false,
        block: block,
        error_handler: error_handler,
      }
    end

    # Run a single registered step at most once, routing any exception to the
    # step's error handler.
    def do_step!(name)
      name = name.to_sym
      if self.steps[name].nil?
        self.logger.log "[error] step `#{name}` not found"
      else
        step = self.steps[name]
        if step[:ran] == true
          self.logger.log "step `#{name}` already ran"
        elsif step[:to_run] == false
          self.logger.log "skipping step `#{name}`"
        else
          self.logger.log "*** step: #{name} ***"
          begin
            step[:block].call
            step[:ran] = true
          rescue Exception => e
            step[:error_handler].call(name, e)
          end
        end
      end
    end

    # Wrapped so specs can stub the command line arguments.
    def argv
      ARGV
    end

    # Mark which steps will run:
    # all steps are run by default
    # step names are passed as ARGV
    # `forklift plan.rb` runs everything and `forklift plan.rb send_email` only sends the email
    def activate_steps
      if argv.length < 2 || ENV['FORKLIFT_RUN_ALL_STEPS'] == 'true'
        self.steps.each do |k,v|
          self.steps[k][:to_run] = true
        end
      else
        # argv[0] is the plan file; step names start at argv[1]
        i = 1
        while i < argv.length
          name = argv[i].to_sym
          unless self.steps[name].nil?
            self.steps[name][:to_run] = true
          else
            self.logger.log "[error] step `#{name}` not found"
            exit(1)
          end
          i = i + 1
        end
      end
    end

    def do!
      # you can use `plan.logger.log` in your plan for logging
      self.logger.log "Starting forklift"

      # use a pidfile to ensure that only one instance of forklift is running at a time; store the file if OK
      self.pid.safe_to_run?
      self.pid.store!

      # this will load all connections in /config/connections/#{type}/#{name}.yml into the plan.connections hash
      # and build all the connection objects (and try to connect in some cases)
      self.connect!

      yield # your stuff here!

      self.activate_steps
      self.steps.each do |k, v|
        do_step!(k)
      end

      # remove the pidfile
      self.logger.log "Completed forklift"
      self.pid.delete!
    end

    private

    # Baseline configuration; any key can be overridden via {#initialize}.
    def default_config
      return {
        project_root: Dir.pwd,
        batch_size: 1000,
        char_bytecode_max: 65535, # the utf8 char limit
        logger: {
          stdout: true,
          debug: false,
        },
      }
    end

    #/private

  end
end
161 |
--------------------------------------------------------------------------------
/lib/forklift/transports/csv.rb:
--------------------------------------------------------------------------------
1 | require 'csv'
2 | require 'fileutils'
3 |
module Forklift
  module Connection
    # CSV transport: reads and writes the single file named by config[:file].
    class Csv < Forklift::Base::Connection
      # File handles are opened per-operation, so (dis)connect are no-ops.
      def connect; end
      def disconnect; end

      # Read the file in batches of `size` rows (hashes with symbolized keys).
      # Yields each batch when a block is given; otherwise returns the first batch.
      def read(size=forklift.config[:batch_size])
        data = []
        CSV.foreach(config[:file], headers: true, converters: :all) do |row|
          data << row.to_hash.symbolize_keys
          if(data.length == size)
            if block_given?
              yield data
              data = []
            else
              return data
            end
          end
        end

        # flush the final (possibly short) batch
        if block_given?
          yield data
        else
          return data
        end
      end

      # Append `data` (array of hashes) to the file. With append=false the file
      # is removed first; a header row is written whenever the file is new.
      def write(data, append=true)
        if (append == false)
          FileUtils.rm(config[:file], {force: true})
        end

        # File.exist? — File.exists? is deprecated in modern Ruby
        if( !File.exist?(config[:file]) )
          keys = data.first.keys
          row = {}
          keys.each do |k|
            row[k] = k
          end
          # prepend a header row built from the first row's keys
          data = [row] + data
        end

        CSV.open(config[:file],'a') do |file|
          data.each do |row|
            file << row.values
          end
        end

      end

      private

      #/private

    end
  end
end
60 |
--------------------------------------------------------------------------------
/lib/forklift/transports/elasticsearch.rb:
--------------------------------------------------------------------------------
1 | require 'elasticsearch'
2 |
module Forklift
  module Connection
    # Elasticsearch transport backed by ::Elasticsearch::Client.
    class Elasticsearch < Forklift::Base::Connection
      def connect
        @client = ::Elasticsearch::Client.new(config)
      end

      def disconnect
        @client = nil
      end

      # Run `query` against `index`, yielding batches of `size` hits (or
      # returning only the first batch when no block is given). Pagination
      # starts at `from` and continues while `looping` is true and hits remain.
      def read(index, query, looping=true, from=0, size=forklift.config[:batch_size])
        offset = 0
        loop_count = 0

        while (looping == true || loop_count == 0)
          data = []
          # Merge pagination keys into a copy: assigning them into `query`
          # directly mutated the caller's hash, leaking :from/:size between
          # iterations and across calls.
          prepared_query = query.merge(from: from + offset, size: size)

          forklift.logger.debug " ELASTICSEARCH: #{prepared_query.to_json}"
          results = client.search( { index: index, body: prepared_query } )
          results["hits"]["hits"].each do |hit|
            data << hit["_source"]
          end

          data.map{|l| l.symbolize_keys! }

          if block_given?
            yield data
          else
            return data
          end

          looping = false if results["hits"]["hits"].length == 0
          offset = offset + size
          loop_count = loop_count + 1
        end
      end

      # Index each row of `data` into `index`. With update=true, rows carrying
      # a non-nil `primary_key` overwrite the existing document with that id.
      def write(data, index, update=false, type='forklift', primary_key=:id)
        data.map{|l| l.symbolize_keys! }

        data.each do |d|
          object = {
            index: index,
            body: d,
            type: type,
          }
          object[:id] = d[primary_key] if ( !d[primary_key].nil? && update == true )

          forklift.logger.debug " ELASTICSEARCH (store): #{object.to_json}"
          client.index object
        end
        # refresh so the writes are visible to immediately-following searches
        client.indices.refresh({ index: index })
      end

      # Delete `index` if it exists; a no-op otherwise.
      def delete_index(index)
        forklift.logger.debug " ELASTICSEARCH (delete index): #{index}"
        client.indices.delete({ index: index }) if client.indices.exists({ index: index })
      end

      private

      #/private

    end
  end
end
73 |
--------------------------------------------------------------------------------
/lib/forklift/transports/mysql.rb:
--------------------------------------------------------------------------------
1 | require 'mysql2'
2 | require 'open3'
3 |
module Forklift
  module Connection
    # MySQL transport. Wraps a Mysql2::Client and adds batched reads, lazy
    # table/column creation, and delete-then-insert ("upsert") writes.
    class Mysql < Forklift::Base::Connection
      # Open the client and select the configured database.
      def connect
        @client = Mysql2::Client.new(config)
        q("USE `#{config[:database]}`")
      end

      def disconnect
        @client.close
      end

      # Column used by incremental patterns to decide "newer than X".
      def default_matcher
        :updated_at
      end

      def drop!(table, database=current_database)
        q("DROP table `#{database}`.`#{table}`");
      end

      def rename(table, new_table, database=current_database, new_database=current_database)
        q("RENAME TABLE `#{database}`.`#{table}` TO `#{new_database}`.`#{new_table}`")
      end

      # Run `query`, yielding batches of `limit` rows (or returning the first
      # batch when no block is given). A LIMIT/OFFSET clause is appended to
      # SELECTs that don't already contain one.
      def read(query, database=current_database, looping=true, limit=forklift.config[:batch_size], offset=0)
        loop_count = 0
        # TODO: Detect limit/offset already present in query

        while ( looping == true || loop_count == 0 )
          data = []
          prepared_query = query
          if prepared_query.downcase.include?("select") && !prepared_query.downcase.include?("limit")
            prepared_query = "#{prepared_query} LIMIT #{offset}, #{limit}"
          end
          response = q(prepared_query)
          response.each do |row|
            data << row
          end

          if block_given?
            yield data
          else
            return data
          end

          offset = offset + limit
          looping = false if data.length == 0
          loop_count = loop_count + 1
        end
      end

      # Write `rows` (array of hashes) to `database`.`table`. The table is
      # lazily created when missing, and unknown columns are added on the fly
      # (unless crash_on_extral_col). With to_update=true, rows whose
      # `primary_key` is present are deleted first, making this an upsert.
      def write(rows, table, to_update=true, database=current_database, primary_key=:id, lazy=true, crash_on_extral_col=false)
        if tables.include? table
          ensure_row_types(rows, table, database)
        elsif(lazy == true && rows.length > 0)
          lazy_table_create(table, rows, database, primary_key)
        end

        if rows.length > 0
          columns = columns(table, database)
          rows.each do |row|
            if crash_on_extral_col == false
              row.each do |column, value|
                unless columns.include?(column)
                  q("ALTER TABLE `#{database}`.`#{table}` ADD `#{column}` #{sql_type(value)} NULL DEFAULT NULL;")
                  columns = columns(table, database)
                end
              end
            end
          end

          insert_values = []
          delete_keys = []
          rows.map do |row|
            delete_keys << row[primary_key] if to_update && row[primary_key].present?
            insert_values << safe_values(columns, row)
          end

          unless delete_keys.empty?
            q(%{DELETE FROM `#{database}`.`#{table}` WHERE `#{primary_key}` IN (#{delete_keys.join(',')})})
          end

          # Keep the statement in a local so the UTF8 retry below can rewrite
          # it (previously the rescue referenced an undefined `insert_q` and
          # raised NameError instead of retrying).
          insert_q = %{INSERT INTO `#{database}`.`#{table}` (#{safe_columns(columns)}) VALUES #{insert_values.join(',')}}
          begin
            q(insert_q)
          rescue Mysql2::Error => ex
            # UTF8 Safety. Open a PR if you don't want UTF8 data...
            # https://github.com/taskrabbit/demoji
            raise ex unless ex.message.match /Incorrect string value:/
            # replace characters above the configured bytecode max (e.g. 4-byte
            # chars in a utf8 column) and retry the insert
            safer_insert_q = ""
            for i in (0...insert_q.length)
              char = insert_q[i]
              char = '???' if char.ord > forklift.config[:char_bytecode_max]
              safer_insert_q << char
            end
            q(safer_insert_q)
          end

          forklift.logger.log "wrote #{rows.length} rows to `#{database}`.`#{table}`"
        end
      end

      # Create `table` with column types inferred from the sample rows in
      # `data` (see #sql_type). `primary_key` becomes an AUTO_INCREMENT bigint
      # when the data does not provide it, and `matcher` gets a KEY if present.
      def lazy_table_create(table, data, database=current_database, primary_key=:id, matcher=default_matcher)
        keys = {}
        data.each do |item|
          item.each do |k,v|
            # keep the first non-placeholder type seen for each column
            keys[k] = sql_type(v) if (keys[k].nil? || keys[k] == sql_type(nil))
          end
        end
        keys[primary_key] = 'bigint(20)' unless keys.has_key?(primary_key)

        col_defn = keys.map do |col, type|
          if col == primary_key
            "`#{col}` #{type} NOT NULL AUTO_INCREMENT"
          else
            "`#{col}` #{type} DEFAULT NULL"
          end
        end
        col_defn << "PRIMARY KEY (`#{primary_key}`)"
        col_defn << "KEY `#{matcher}` (`#{matcher}`)" if keys.include?(matcher)

        command = <<-EOS
          CREATE TABLE `#{database}`.`#{table}` (
            #{col_defn.join(', ')}
          )
        EOS

        q(command)
        forklift.logger.log "lazy-created table `#{database}`.`#{table}`"
      end

      # Map a ruby value to a mysql column type. NilClass maps to the
      # placeholder varchar(0), later widened by #ensure_row_types.
      def sql_type(v)
        return "bigint(20)" if v.class == Fixnum
        return "float" if v.class == Float
        return "float" if v.class == BigDecimal
        return "date" if v.class == Date
        return "datetime" if v.class == Time
        return "datetime" if v.class == DateTime
        return "varchar(255)" if v.class == Symbol
        return "tinyint(1)" if v.class == TrueClass
        return "tinyint(1)" if v.class == FalseClass
        return "text" if v.class == String
        return "varchar(0)" if v.class == NilClass
        return "text" # catchall
      end

      # Batched read of all rows whose `matcher` column is >= `since`.
      def read_since(table, since, matcher=default_matcher, database=current_database, limit=forklift.config[:batch_size])
        query = "SELECT * FROM `#{database}`.`#{table}` WHERE `#{matcher}` >= '#{since.to_s(:db)}' ORDER BY `#{matcher}` ASC"
        self.read(query, database, true, limit){|data|
          if block_given?
            yield data
          else
            return data
          end
        }
      end

      # Largest `matcher` value in the table, or Time.at(0) when the table is
      # missing or empty.
      def max_timestamp(table, matcher=default_matcher, database=current_database)
        return Time.at(0) unless tables.include?(table)
        last_copied_row = read("SELECT MAX(`#{matcher}`) AS \"#{matcher}\" FROM `#{database}`.`#{table}`")[0]
        if ( last_copied_row.nil? || last_copied_row[matcher].nil? )
          Time.at(0)
        else
          last_copied_row[matcher]
        end
      end

      def tables
        t = []
        client.query("show tables").each do |row|
          t << row.values[0]
        end
        t
      end

      # Cached name of the currently-selected database.
      def current_database
        @_current_database ||= q("SELECT DATABASE() AS 'db'").first[:db]
      end

      def count(table, database=current_database)
        q("SELECT COUNT(1) AS \"count\" FROM `#{database}`.`#{table}`").first[:count]
      end

      def truncate!(table, database=current_database)
        q("TRUNCATE TABLE `#{database}`.`#{table}`")
      end

      # Best-effort truncate: failures (e.g. missing table) are logged, not raised.
      def truncate(table, database=current_database)
        begin
          # pass the caller's database through (previously `database=current_database`
          # in the call re-assigned the local, always truncating in current_database)
          self.truncate!(table, database)
        rescue Exception => e
          forklift.logger.debug e
        end
      end

      # Column names (and, optionally, their mysql types) for a table.
      def columns(table, database=current_database, return_types=false)
        cols = []
        types = []
        read("DESCRIBE `#{database}`.`#{table}`").each do |row|
          cols << row[:Field].to_sym
          types << row[:Type]
        end
        return cols if return_types == false
        return cols, types
      end

      # Shell out to mysqldump and gzip the configured database to `file`.
      def dump(file, options=[])
        # example options:
        # options.push '--max_allowed_packet=512M'
        # options.push '--set-gtid-purged=OFF'
        cmd = "mysqldump"
        cmd << " -u#{config[:username]}" unless config[:username].nil?
        cmd << " -p#{config[:password]}" unless config[:password].nil?
        options.each do |o|
          cmd << " #{o} "
        end
        cmd << " #{config[:database]}"
        cmd << " | gzip > #{file}"
        # config uses symbol keys; config['database'] logged an empty name
        forklift.logger.log "Dumping #{config[:database]} to #{file}"
        forklift.logger.debug cmd

        stdin, stdout, stderr = Open3.popen3(cmd)
        stdout = stdout.readlines
        stderr = stderr.readlines
        if stderr.length > 0
          raise " > Dump error: #{stderr.join(" ")}"
        else
          forklift.logger.log " > Dump complete"
        end
      end

      # Run a .sql file, honoring DELIMITER directives, one statement at a time.
      def exec_script(path)
        body = File.read(path)
        delim = ';'
        body.split(/^(delimiter\s+.*)$/i).each do |section|
          if section =~ /^delimiter/i
            delim = section[/^delimiter\s+(.+)$/i,1]
            next
          end

          lines = section.split(delim)
          lines.each do |line|
            line.strip!
            q(line) if line.length > 0
          end
        end
      end

      # Execute raw SQL, returning rows with symbolized keys; logs at debug.
      def q(query, options={})
        forklift.logger.debug "\tSQL[#{config[:database]}]: #{query}"
        return client.query(query, {symbolize_keys: true}.merge(options))
      end

      private

      # Widen any varchar(0) placeholder columns (created for all-NULL sample
      # data) once a non-nil value shows up in `data`.
      def ensure_row_types(data, table, database=current_database)
        read("describe `#{database}`.`#{table}`").each do |row|
          if row[:Type] == 'varchar(0)'

            value = nil
            data.each do |r|
              if ( !r[row[:Field].to_sym].nil? )
                value = r[row[:Field].to_sym]
                break
              end
            end

            if !value.nil?
              sql_type = sql_type(value)
              alter_sql = "ALTER TABLE `#{database}`.`#{table}` CHANGE `#{row[:Field]}` `#{row[:Field]}` #{sql_type};"
              forklift.logger.log alter_sql
              q(alter_sql)
            end

          end
        end
      end

      # Backtick-quote column names for interpolation into SQL.
      def safe_columns(cols)
        a = []
        cols.each do |c|
          a << "`#{c}`"
        end
        return a.join(', ')
      end

      # Build one escaped "(v1, v2, ...)" VALUES tuple for `row`, in `columns` order.
      def safe_values(columns, row)
        "(" + columns.map do |column|
          v = row[column]
          case v
          when String, Symbol then %{"#{Mysql2::Client.escape(v.to_s)}"}
          when Date, Time, DateTime then %{"#{v.to_s(:db)}"}
          when Fixnum then v
          when Float, BigDecimal then v.to_f
          else 'NULL'
          end
        end.compact.join(', ') + ")"
      end

      #/private

    end
  end
end
307 |
--------------------------------------------------------------------------------
/lib/forklift/version.rb:
--------------------------------------------------------------------------------
module Forklift
  # Gem version (pre-release).
  VERSION = "2.0.0-alpha"
end
4 |
--------------------------------------------------------------------------------
/spec/config/connections/csv/forklift_test_destination.yml:
--------------------------------------------------------------------------------
1 | :file: /tmp/destination.csv
--------------------------------------------------------------------------------
/spec/config/connections/csv/forklift_test_source.yml:
--------------------------------------------------------------------------------
1 | :file: /tmp/source.csv
--------------------------------------------------------------------------------
/spec/config/connections/elasticsearch/forklift_test.yml:
--------------------------------------------------------------------------------
1 | :host: http://localhost:9200
2 |
--------------------------------------------------------------------------------
/spec/config/connections/mysql/forklift_test_destination.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :username: root
3 | :database: forklift_test_destination
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/spec/config/connections/mysql/forklift_test_source_a.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :username: root
3 | :database: forklift_test_source_a
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/spec/config/connections/mysql/forklift_test_source_b.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :username: root
3 | :database: forklift_test_source_b
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/spec/config/connections/mysql/forklift_test_working.yml:
--------------------------------------------------------------------------------
1 | :encoding: utf8
2 | :username: root
3 | :database: forklift_test_working
4 | :password:
5 | :host: 127.0.0.1
6 | :port: 3306
7 |
--------------------------------------------------------------------------------
/spec/config/email.yml:
--------------------------------------------------------------------------------
1 | via: sendmail
2 | via_options:
3 | location: /usr/sbin/sendmail
4 | arguments: '-t -i'
5 |
--------------------------------------------------------------------------------
/spec/integration/basic_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# Sanity checks that the suite's seed data (spec/support) was loaded.
describe 'basics' do

  describe 'test suite setup' do
    it 'seeded the mysql dbs' do
      # source_a is expected to hold 3 seeded tables
      client = SpecClient.mysql('forklift_test_source_a')
      tables = []
      client.query("show tables").each do |row|
        tables << row.values[0]
      end
      expect(tables.count).to eql 3
      client.close

      # source_b is expected to hold 1 seeded table
      client = SpecClient.mysql('forklift_test_source_b')
      tables = []
      client.query("show tables").each do |row|
        tables << row.values[0]
      end
      expect(tables.count).to eql 1
      client.close
    end

    it 'seeded the elasticsearch db' do
      client = SpecClient.elasticsearch('forklift_test')
      results = client.search({ index: 'forklift_test' , body: { query: { match_all: {} } } })
      expect(results['hits']['total']).to eql 5
    end
  end

end
--------------------------------------------------------------------------------
/spec/integration/csv_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'
require 'csv'

# Exercises the CSV transport (Forklift::Connection::Csv) against /tmp fixtures.
describe 'csv' do

  # re-seed the CSV fixture files after each example
  after(:each) do
    SpecSeeds.setup_csv
  end

  it "can read data (simple)" do
    plan = SpecPlan.new
    @rows = []

    plan.do! {
      source = plan.connections[:csv][:forklift_test_source]
      source.read {|data|
        @rows = (@rows + data)
      }
    }

    expect(@rows.length).to eql 5
    expect(@rows.first[:vendor_id]).to eql 1
    expect(@rows.last[:vendor_id]).to eql 5
  end

  it "can read partial data" do
    plan = SpecPlan.new
    @rows = []

    plan.do! {
      source = plan.connections[:csv][:forklift_test_source]
      # with no block, read returns only the first batch (3 rows)
      @rows = source.read(3)
    }

    expect(@rows.length).to eql 3
    expect(@rows.first[:vendor_id]).to eql 1
    expect(@rows.last[:vendor_id]).to eql 3
  end

  it "can write data (simple)" do
    plan = SpecPlan.new
    data = [
      {thing: 1, when: Time.now},
      {thing: 2, when: Time.now},
    ]

    plan.do! {
      destination = plan.connections[:csv][:forklift_test_destination]
      destination.write(data)
    }

    @rows = SpecClient.csv('/tmp/destination.csv')
    expect(@rows.length).to eql 2
    expect(@rows.first[:thing]).to eql 1
    expect(@rows.last[:thing]).to eql 2
  end

  it "can append data" do
    plan = SpecPlan.new

    plan.do! {
      destination = plan.connections[:csv][:forklift_test_destination]

      data = [
        {thing: 1, when: Time.now},
        {thing: 2, when: Time.now},
      ]

      destination.write(data)

      data = [
        {thing: 3, when: Time.now},
      ]

      # second write (append defaults to true) adds to the same file
      destination.write(data)
    }

    @rows = SpecClient.csv('/tmp/destination.csv')
    expect(@rows.length).to eql 3
    expect(@rows.first[:thing]).to eql 1
    expect(@rows.last[:thing]).to eql 3
  end

end
--------------------------------------------------------------------------------
/spec/integration/elasticsearch_patterns_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# TODO(review): placeholder — no coverage for Forklift::Patterns::Elasticsearch yet.
describe 'elasticsearch patterns' do

end
--------------------------------------------------------------------------------
/spec/integration/elasticsearch_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# Exercises the Elasticsearch transport (Forklift::Connection::Elasticsearch).
describe 'elasticsearch' do

  before(:each) do
    SpecSeeds.setup_elasticsearch
  end

  it "can read data (raw)" do
    index = 'forklift_test'
    query = { query: { match_all: {} } }
    plan = SpecPlan.new
    @rows = []
    plan.do! {
      source = plan.connections[:elasticsearch][:forklift_test]
      source.read(index, query) {|data|
        @rows = (@rows + data)
      }
    }
    plan.disconnect!

    expect(@rows.length).to eql 5
  end

  it "can read data (filtered)" do
    index = 'forklift_test'
    query = { query: { match_all: {} } }
    plan = SpecPlan.new
    @rows = []
    plan.do! {
      source = plan.connections[:elasticsearch][:forklift_test]
      # looping=false, from=0, size=3 => only the first 3 hits
      source.read(index, query, false, 0, 3) {|data|
        @rows = (@rows + data)
      }
    }
    plan.disconnect!

    expect(@rows.length).to eql 3
  end

  it "can write new data" do
    index = 'forklift_test'
    plan = SpecPlan.new
    data = [
      {id: 99, user_id: 99, product_id: 99, viewed_at: 99}
    ]
    plan.do! {
      destination = plan.connections[:elasticsearch][:forklift_test]
      destination.write(data, index)
    }
    plan.disconnect!

    destination = SpecClient.elasticsearch('forklift_test')
    count = destination.count({ index: index })["count"]

    expect(count).to eql 6
  end

  it "can overwrite existing data, provided a primary key" do
    index = 'forklift_test'
    plan = SpecPlan.new
    data = [
      {'id' => 1, 'user_id' => 1, 'product_id' => 1, 'viewed_at' => 99}
    ]
    plan.do! {
      destination = plan.connections[:elasticsearch][:forklift_test]
      # update=true + matching :id => document 1 is replaced, not duplicated
      destination.write(data, index, true)
    }
    plan.disconnect!

    destination = SpecClient.elasticsearch('forklift_test')
    count = destination.count({ index: index })["count"]
    expect(count).to eql 5
    result = destination.search({ index: index, body: { query: {term: {id: 1}} } })
    expect(result["hits"]["total"]).to eql 1
    obj = result["hits"]["hits"][0]["_source"]
    expect(obj["id"]).to eql 1
    expect(obj["user_id"]).to eql 1
    expect(obj["product_id"]).to eql 1
    expect(obj["viewed_at"]).to eql 99
  end

  it "can delete an index" do
    index = 'other_test_index'
    plan = SpecPlan.new
    client = SpecClient.elasticsearch('forklift_test')
    data = [
      {id: 1}
    ]
    plan.do! {
      destination = plan.connections[:elasticsearch][:forklift_test]
      expect { client.search({ index: index }) }.to raise_error(/index_not_found_exception|IndexMissingException/)
      destination.write(data, index, true)
      expect { client.search({ index: index }) }.to_not raise_error
      destination.delete_index(index)
      expect { client.search({ index: index }) }.to raise_error(/index_not_found_exception|IndexMissingException/)
    }
    plan.disconnect!
  end
end
101 |
--------------------------------------------------------------------------------
/spec/integration/multi_transport_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# Moves data between transports: elasticsearch => mysql and mysql => elasticsearch.
describe 'multiple transport types' do

  before(:each) do
    SpecSeeds.setup_mysql
    SpecSeeds.setup_elasticsearch
  end

  describe 'elasticsearch => mysql' do
    it 'can load in a full query' do
      table = 'es_import'
      index = 'forklift_test'
      query = { query: { match_all: {} } }
      plan = SpecPlan.new
      plan.do! {
        source = plan.connections[:elasticsearch][:forklift_test]
        destination = plan.connections[:mysql][:forklift_test_destination]
        source.read(index, query) {|data| destination.write(data, table) }
      }
      plan.disconnect!

      destination = SpecClient.mysql('forklift_test_destination')
      rows = destination.query("select count(1) as 'count' from es_import").first["count"]
      expect(rows).to eql 5
    end

    it 'can load in a partial query' do
      table = 'es_import'
      index = 'forklift_test'
      query = { query: { match_all: {} }, sort: [{ id: {order: "asc" } }] }
      plan = SpecPlan.new
      plan.do! {
        source = plan.connections[:elasticsearch][:forklift_test]
        destination = plan.connections[:mysql][:forklift_test_destination]
        # looping=false, from=0, size=3 => only ids 1..3 are copied
        source.read(index, query, false, 0, 3) {|data| destination.write(data, table) }
      }
      plan.disconnect!

      destination = SpecClient.mysql('forklift_test_destination')
      rows = destination.query("select count(1) as 'count' from es_import").first["count"]
      expect(rows).to eql 3
      min = destination.query("select min(id) as 'min' from es_import").first["min"]
      expect(min).to eql 1
      max = destination.query("select max(id) as 'max' from es_import").first["max"]
      expect(max).to eql 3
    end

    it 'can detect data types' do
      table = 'es_import'
      index = 'forklift_test'
      query = { query: { match_all: {} } }
      plan = SpecPlan.new
      plan.do! {
        source = plan.connections[:elasticsearch][:forklift_test]
        destination = plan.connections[:mysql][:forklift_test_destination]
        source.read(index, query) {|data|
          # convert epoch seconds to Time so mysql gets a datetime column
          clean_data = []
          data.each do |row|
            row[:viewed_at] = Time.at(row[:viewed_at])
            clean_data << row
          end
          destination.write(clean_data, table)
        }
      }
      plan.disconnect!

      destination = SpecClient.mysql('forklift_test_destination')
      max = destination.query("select max(viewed_at) as 'max' from es_import").first["max"]
      expect(max.class).to eql Time
    end

  end

  describe 'mysql => elasticsearch' do

    after(:each) do
      es = SpecClient.elasticsearch('forklift_test')
      es.indices.delete({ index: 'users' }) if es.indices.exists({ index: 'users' })
    end

    it 'can load in a full table' do
      table = 'users'
      index = 'users'
      plan = SpecPlan.new
      plan.do! {
        source = plan.connections[:mysql][:forklift_test_source_a]
        destination = plan.connections[:elasticsearch][:forklift_test]
        source.read("select * from #{table}") {|data| destination.write(data, index) }
      }
      plan.disconnect!

      destination = SpecClient.elasticsearch('forklift_test')
      count = destination.count({ index: index })["count"]
      expect(count).to eql 5
    end

    it 'can load in only some rows' do
      table = 'users'
      index = 'users'
      plan = SpecPlan.new
      plan.do! {
        source = plan.connections[:mysql][:forklift_test_source_a]
        destination = plan.connections[:elasticsearch][:forklift_test]
        # looping=false with limit=3 reads a single 3-row batch
        source.read("select * from #{table}", source.current_database, false, 3, 0) {|data|
          destination.write(data, index)
        }
      }
      plan.disconnect!

      destination = SpecClient.elasticsearch('forklift_test')
      count = destination.count({ index: index })["count"]
      expect(count).to eql 3
    end
  end

end
--------------------------------------------------------------------------------
/spec/integration/mysql_patterns_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# Integration specs for the canned MySQL patterns (pipe / incremental_pipe /
# optimistic import). (Typo fix: "incramental" -> "incremental" in titles.)
describe 'mysql patterns' do

  before(:each) do
    SpecSeeds.setup_mysql
  end

  it "can do a raw data pipe" do
    plan = SpecPlan.new
    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      destination = plan.connections[:mysql][:forklift_test_destination]

      expect(source.tables.length).to eql 3
      expect(destination.tables.length).to eql 0

      source.tables.each do |table|
        Forklift::Patterns::Mysql.pipe(source, table, destination, table)
      end

      expect(destination.tables.length).to eql 3
    }
    plan.disconnect!
  end

  it "can do an incremental data pipe with only updated data" do
    plan = SpecPlan.new
    table = 'users'
    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      destination = plan.connections[:mysql][:forklift_test_destination]
      Forklift::Patterns::Mysql.incremental_pipe(source, table, destination, table)

      expect(destination.count('users')).to eql 5
      expect(destination.read('select first_name from users where id = 1')[0][:first_name]).to eql 'Evan'

      # Touch one source row; a second incremental pipe should carry over
      # only that change (same row count, new first_name).
      source.q("UPDATE `users` SET `first_name` = 'EvanAgain' WHERE `id` = '1'")
      source.q("UPDATE `users` SET `updated_at` = NOW() WHERE `id` = '1'")

      Forklift::Patterns::Mysql.incremental_pipe(source, table, destination, table)

      expect(destination.count('users')).to eql 5
      expect(destination.read('select first_name from users where id = 1')[0][:first_name]).to eql 'EvanAgain'
    }
    plan.disconnect!
  end

  it "(optimistic_pipe) can determine if it should do an incremental or full pipe" do
    plan = SpecPlan.new
    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      # `sales` has no updated_at-style column, so it cannot pipe incrementally.
      expect(Forklift::Patterns::Mysql.can_incremental_pipe?(source, 'users', source, 'users')).to eql true
      expect(Forklift::Patterns::Mysql.can_incremental_pipe?(source, 'sales', source, 'sales')).to eql false
      expect(Forklift::Patterns::Mysql.can_incremental_pipe?(source, 'products', source, 'products')).to eql true
    }
    plan.disconnect!
  end

  it "can run the mysql_optimistic_import pattern" do
    plan = SpecPlan.new
    table = 'users'
    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      destination = plan.connections[:mysql][:forklift_test_destination]

      Forklift::Patterns::Mysql.mysql_optimistic_import(source, table, destination, table)

      expect(destination.tables.length).to eql 1

      source.q("UPDATE `users` SET `first_name` = 'EvanAgain' WHERE `id` = '1'")
      source.q("UPDATE `users` SET `updated_at` = NOW() WHERE `id` = '1'")

      Forklift::Patterns::Mysql.mysql_optimistic_import(source, table, destination, table)

      expect(destination.count('users')).to eql 5
      expect(destination.read('select first_name from users where id = 1')[0][:first_name]).to eql 'EvanAgain'
    }
    plan.disconnect!
  end

  # Pending spec (no body): documents an unimplemented expectation.
  it "can write the high_water_mark"
end
84 |
--------------------------------------------------------------------------------
/spec/integration/mysql_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 |
3 | describe 'mysql' do
4 |
5 | before(:each) do
6 | SpecSeeds.setup_mysql
7 | end
8 |
it "can read data (raw)" do
  sql  = 'select * from `users`'
  plan = SpecPlan.new
  @rows = []
  plan.do! do
    source = plan.connections[:mysql][:forklift_test_source_a]
    # Accumulate every batch the reader yields.
    source.read(sql) { |batch| @rows.concat(batch) }
  end
  plan.disconnect!

  expect(@rows.length).to eql 5
end
23 |
it "can read data (filtered)" do
  sql  = 'select * from `users`'
  plan = SpecPlan.new
  @rows = []
  plan.do! do
    source = plan.connections[:mysql][:forklift_test_source_a]
    # limit 3 / offset 0 => only the first three rows come back.
    source.read(sql, source.current_database, false, 3, 0) { |batch| @rows.concat(batch) }
  end
  plan.disconnect!

  expect(@rows.length).to eql 3
end
38 |
it "can write new data" do
  table = "users"
  new_rows = [
    {email: 'other@example.com', first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)},
    {email: 'else@example.com', first_name: 'else', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
  ]
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(new_rows, table)
  end
  plan.disconnect!

  # Two fresh rows on top of the five seeded users.
  raw = SpecClient.mysql('forklift_test_source_a')
  count = raw.query('select count(1) as "count" from users').first['count']
  expect(count).to eql 7
end
56 |
it "can update existing data" do
  table = "users"
  updated_rows = [
    {id: 1, email: 'evan@example.com', first_name: 'New Name', last_name: 'T', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
  ]
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(updated_rows, table)
  end
  plan.disconnect!

  # Writing a row with an existing id is an update, not an insert:
  # still 5 rows, but the first_name has changed.
  raw = SpecClient.mysql('forklift_test_source_a')
  expect(raw.query('select count(1) as "count" from users').first['count']).to eql 5
  expect(raw.query('select first_name from users where id = 1').first['first_name']).to eql 'New Name'
end
75 |
76 | describe 'lazy create' do
77 |
78 | after(:each) do
79 | destination = SpecClient.mysql('forklift_test_source_a')
80 | destination.query('drop table if exists `new_table`')
81 | end
82 |
it "can lazy-create a table with primary keys provided" do
  rows = [
    {id: 1, thing: 'stuff a', updated_at: Time.new},
    {id: 2, thing: 'stuff b', updated_at: Time.new},
    {id: 3, thing: 'stuff c', updated_at: Time.new},
  ]
  table = "new_table"
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(rows, table)
  end
  plan.disconnect!

  # Column types the lazy table creation is expected to infer.
  expected_types = {
    "id"         => "bigint(20)",
    "thing"      => "text",
    "updated_at" => "datetime",
  }
  raw = SpecClient.mysql('forklift_test_source_a')
  cols = []
  raw.query("describe #{table}").each do |row|
    cols << row["Field"]
    expected = expected_types[row["Field"]]
    expect(row["Type"]).to eql(expected) if expected
  end
  expect(cols).to eql ['id', 'thing', 'updated_at']
end
112 |
it "can lazy-create a table without primary keys provided" do
  rows = [
    {thing: 'stuff a', number: 1.123, updated_at: Time.new},
    {thing: 'stuff b', number: 1.123, updated_at: Time.new},
    {thing: 'stuff c', number: 1.123, updated_at: Time.new},
  ]
  table = "new_table"
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(rows, table)
  end
  plan.disconnect!

  # With no id supplied, one is added for us; floats map to "float".
  expected_types = {
    "id"         => "bigint(20)",
    "thing"      => "text",
    "number"     => "float",
    "updated_at" => "datetime",
  }
  raw = SpecClient.mysql('forklift_test_source_a')
  cols = []
  raw.query("describe #{table}").each do |row|
    cols << row["Field"]
    expected = expected_types[row["Field"]]
    expect(row["Type"]).to eql(expected) if expected
  end
  expect(cols).to include('id', 'thing', 'number', 'updated_at')
end
144 |
# (Typo fix in the description: "exiting" -> "existing".)
it "can add columns to existing tables when new keys are provided" do
  table = "users"
  raw = SpecClient.mysql('forklift_test_source_a')

  count = raw.query("SHOW COLUMNS FROM #{table}").count
  expect(count).to eql 6

  # `something_else` is not a column yet; the write should ALTER it in.
  data = [
    {email: 'other@example.com', something_else: :abc123, first_name: 'other', last_name: 'n', created_at: Time.new.to_s(:db), updated_at: Time.new.to_s(:db)}
  ]
  plan = SpecPlan.new
  plan.do! {
    destination = plan.connections[:mysql][:forklift_test_source_a]
    destination.write(data, table)
  }
  plan.disconnect!

  count = raw.query("SHOW COLUMNS FROM #{table}").count
  expect(count).to eql 7
end
165 |
# (Grammar fix in the description: "can will seek" -> "will seek".)
it "will seek further for null-ish values" do
  # The first rows carry NULL `number`; the writer must look further down
  # the batch (to row 3) to find a concrete value to infer the type from.
  data = [
    {id: 1, thing: 'stuff a', number: nil, updated_at: Time.new},
    {id: 2, thing: 'stuff b', number: nil, updated_at: Time.new},
    {id: 3, thing: 'stuff c', number: 100, updated_at: Time.new},
  ]
  table = "new_table"
  plan = SpecPlan.new
  plan.do! {
    destination = plan.connections[:mysql][:forklift_test_source_a]
    destination.write(data, table)
  }
  plan.disconnect!

  destination = SpecClient.mysql('forklift_test_source_a')
  cols = []
  destination.query("describe #{table}").each do |row|
    cols << row["Field"]
    case row["Field"]
    when "id"
      expect(row["Type"]).to eql "bigint(20)"
    when "thing"
      expect(row["Type"]).to eql "text"
    when "number"
      # Inferred from row 3's integer, not the leading NULLs.
      expect(row["Type"]).to eql "bigint(20)"
    when "updated_at"
      expect(row["Type"]).to eql "datetime"
    end
  end
  expect(cols).to include('id', 'thing', 'updated_at', 'number')
end
197 |
it "null rows will be text, and can be updated on subsequent writes" do
  table = "new_table"

  # First write: every `number` is NULL, so the column is created as a
  # zero-width varchar placeholder.
  first_batch = [
    {id: 1, number: nil, updated_at: Time.new},
    {id: 2, number: nil, updated_at: Time.new},
  ]
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(first_batch, table)
  end
  plan.disconnect!

  initial_types = {
    "id"         => "bigint(20)",
    "number"     => "varchar(0)",
    "updated_at" => "datetime",
  }
  raw = SpecClient.mysql('forklift_test_source_a')
  cols = []
  raw.query("describe #{table}").each do |row|
    cols << row["Field"]
    expected = initial_types[row["Field"]]
    expect(row["Type"]).to eql(expected) if expected
  end
  expect(cols).to include('id', 'updated_at', 'number')

  # Second write: a concrete integer arrives and the column gets upgraded.
  second_batch = [
    {id: 3, number: 123, updated_at: Time.new},
  ]
  plan = SpecPlan.new
  plan.do! do
    plan.connections[:mysql][:forklift_test_source_a].write(second_batch, table)
  end
  plan.disconnect!

  raw = SpecClient.mysql('forklift_test_source_a')
  cols = []
  raw.query("describe #{table}").each do |row|
    cols << row["Field"]
    expect(row["Type"]).to eql("bigint(20)") if row["Field"] == "number"
  end
  expect(cols).to include('id', 'updated_at', 'number')
end
252 |
253 | end
254 |
255 | end
256 |
--------------------------------------------------------------------------------
/spec/integration/transformations_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

# Specs for SQL- and Ruby-file transformations run via connection.exec!.
# (Cleanup: removed the unused `raw` client connections — they were opened
# and never used or closed — and the unused `@rows` accumulators.)
describe 'transformations' do

  before(:each) do
    SpecSeeds.setup_mysql
  end

  it "can run a native transformation" do
    plan = SpecPlan.new

    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      destination = plan.connections[:mysql][:forklift_test_destination]
      source.read('select * from `users`') {|data| destination.write(data, 'users') }

      expect( destination.columns("users").include?(:full_name) ).to eql false

      # Run the raw-SQL transformation; it should ALTER in a full_name column.
      transformation_file = "#{File.dirname(__FILE__)}/../template/spec_user_transformation.sql"
      destination.exec!(transformation_file)

      expect( destination.columns("users").include?(:full_name) ).to eql true
    }
    plan.disconnect!
  end

  it "can run a ruby transformation" do
    plan = SpecPlan.new

    plan.do! {
      source = plan.connections[:mysql][:forklift_test_source_a]
      destination = plan.connections[:mysql][:forklift_test_destination]
      source.read('select * from `users`') {|data| destination.write(data, 'users') }

      expect( destination.columns("users").include?(:full_name) ).to eql false

      # The Ruby transformation receives args (the prefix) through exec!.
      transformation_file = "#{File.dirname(__FILE__)}/../template/spec_user_transformation.rb"
      destination.exec!(transformation_file, {prefix: 'my_prefix' })

      expect( destination.columns("users").include?(:full_name) ).to eql true

      data = destination.read('select * from `users` where email="evan@example.com"')
      expect( data.first[:full_name] ).to eql 'my_prefix Evan T'
    }
    plan.disconnect!
  end

end
55 |
--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
1 | #############
2 | ## WARNING ##
3 | #############
4 |
5 | # THIS TEST SUITE IS VERY MEAN TO MYSQL AND ELASTICSEARCH
6 | # IT *WILL* DELETE ANY CONTENT IN THE TEST DBs
7 |
8 | $LOAD_PATH.unshift(File.dirname(__FILE__))
9 | $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
10 | APP_DIR ||= File.expand_path('../../', __FILE__)
11 |
12 | require 'forklift'
13 | require 'rspec'
14 | require 'fileutils'
15 |
16 | ENV["FORKLIFT_RUN_ALL_STEPS"] = 'true'
17 |
18 | Dir["#{APP_DIR}/spec/support/**/*.rb"].each {|f| require f}
19 |
20 | RSpec.configure do |config|
21 |
22 | config.before(:all) do
23 | piddir = "#{File.dirname(__FILE__)}/pid"
24 | FileUtils.rmdir(piddir) if File.exists?(piddir)
25 | SpecSeeds.setup_mysql
26 | SpecSeeds.setup_elasticsearch
27 | SpecSeeds.setup_csv
28 | end
29 |
30 | end
31 |
--------------------------------------------------------------------------------
/spec/support/dumps/csv/source.csv:
--------------------------------------------------------------------------------
1 | vendor_id,vendor_name,created_at,updated_at
2 | 1,Evan's Hats,2000-01-01 00:00:01,2000-01-01 00:00:01
3 | 2,Aaron's Scarves,2000-01-01 00:00:02,2000-01-01 00:00:02
4 | 3,Pablo's Shirts,2000-01-01 00:00:03,2000-01-01 00:00:03
5 | 4,Kevin's Headies,2000-01-01 00:00:04,2000-01-01 00:00:04
6 | 5,Brian's Boots,2000-01-01 00:00:05,2000-01-01 00:00:05
--------------------------------------------------------------------------------
/spec/support/dumps/elasticsearch/forklift_test.json:
--------------------------------------------------------------------------------
1 | [
2 | {"id": 1, "user_id": 1, "product_id": 1, "viewed_at": 1396552251},
3 | {"id": 2, "user_id": 1, "product_id": 2, "viewed_at": 1396552252},
4 | {"id": 3, "user_id": 2, "product_id": 5, "viewed_at": 1396552253},
5 | {"id": 4, "user_id": 2, "product_id": 5, "viewed_at": 1396552254},
6 | {"id": 5, "user_id": 2, "product_id": 5, "viewed_at": 1396552255}
7 | ]
--------------------------------------------------------------------------------
/spec/support/dumps/mysql/forklift_test_source_a.sql:
--------------------------------------------------------------------------------
1 | # Dump of table products
2 | # ------------------------------------------------------------
3 |
4 | DROP TABLE IF EXISTS `products`;
5 |
6 | CREATE TABLE `products` (
7 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
8 | `name` varchar(255) NOT NULL DEFAULT '',
9 | `description` text NOT NULL,
10 | `inventory` int(11) DEFAULT NULL,
11 | `created_at` datetime NOT NULL,
12 | `updated_at` datetime NOT NULL,
13 | PRIMARY KEY (`id`)
14 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
15 |
16 | LOCK TABLES `products` WRITE;
17 |
18 | INSERT INTO `products` (`id`, `name`, `description`, `inventory`, `created_at`, `updated_at`)
19 | VALUES
20 | (1,'car','a car',10,'2014-04-03 11:45:51','2014-04-03 11:45:51'),
21 | (2,'boat','a boat',3,'2014-04-03 11:45:52','2014-04-03 11:45:52'),
22 | (3,'bus','a bus',5,'2014-04-03 11:45:54','2014-04-03 11:45:54'),
23 | (4,'motorcycle','a motorcycle',23,'2014-04-03 11:45:56','2014-04-03 11:45:56'),
24 | (5,'hang_glider','awesome',2,'2014-04-03 11:46:19','2014-04-03 11:46:19');
25 |
26 | UNLOCK TABLES;
27 |
28 |
29 | # Dump of table sales
30 | # ------------------------------------------------------------
31 |
32 | DROP TABLE IF EXISTS `sales`;
33 |
34 | CREATE TABLE `sales` (
35 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
36 | `user_id` int(11) NOT NULL,
37 | `product_id` int(11) NOT NULL,
38 | `timestamp` datetime NOT NULL,
39 | PRIMARY KEY (`id`)
40 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
41 |
42 | LOCK TABLES `sales` WRITE;
43 |
44 | INSERT INTO `sales` (`id`, `user_id`, `product_id`, `timestamp`)
45 | VALUES
46 | (1,1,1,'2014-04-03 11:47:11'),
47 | (2,1,2,'2014-04-03 11:47:11'),
48 | (3,4,5,'2014-04-03 11:47:12'),
49 | (4,4,4,'2014-04-03 11:47:25'),
50 | (5,5,5,'2014-04-03 11:47:26');
51 |
52 | UNLOCK TABLES;
53 |
54 |
55 | # Dump of table users
56 | # ------------------------------------------------------------
57 |
58 | DROP TABLE IF EXISTS `users`;
59 |
60 | CREATE TABLE `users` (
61 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
62 | `email` varchar(255) NOT NULL DEFAULT '',
63 | `first_name` varchar(255) NOT NULL DEFAULT '',
64 | `last_name` varchar(255) NOT NULL DEFAULT '',
65 | `created_at` datetime NOT NULL,
66 | `updated_at` datetime NOT NULL,
67 | PRIMARY KEY (`id`)
68 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
69 |
70 | LOCK TABLES `users` WRITE;
71 |
72 | INSERT INTO `users` (`id`, `email`, `first_name`, `last_name`, `created_at`, `updated_at`)
73 | VALUES
74 | (1,'evan@example.com','Evan','T','2014-04-03 11:40:12','2014-04-03 11:39:28'),
75 | (2,'pablo@example.com','Pablo ','J','2014-04-03 11:41:08','2014-04-03 11:41:08'),
76 | (3,'kevin@example.com','Kevin','B','2014-04-03 11:41:10','2014-04-03 11:41:10'),
77 | (4,'brian@example.com','Brian','L','2014-04-03 11:41:12','2014-04-03 11:41:12'),
78 | (5,'aaront@example.com','Aaron','B','2014-04-03 11:41:13','2014-04-03 11:41:13');
79 |
80 | UNLOCK TABLES;
--------------------------------------------------------------------------------
/spec/support/dumps/mysql/forklift_test_source_b.sql:
--------------------------------------------------------------------------------
1 | # Dump of table admin_notes
2 | # ------------------------------------------------------------
3 |
4 | DROP TABLE IF EXISTS `admin_notes`;
5 |
6 | CREATE TABLE `admin_notes` (
7 | `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
8 | `user_id` int(11) NOT NULL,
9 | `note` text NOT NULL,
10 | `created_at` datetime NOT NULL,
11 | `updated_at` datetime NOT NULL,
12 | PRIMARY KEY (`id`)
13 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
14 |
15 | LOCK TABLES `admin_notes` WRITE;
16 |
17 | INSERT INTO `admin_notes` (`id`, `user_id`, `note`, `created_at`, `updated_at`)
18 | VALUES
19 | (1,1,'User 1 called customer support\n','2014-04-03 11:50:25','2014-04-03 11:50:25'),
20 | (2,2,'User 2 called customer support','2014-04-03 11:50:26','2014-04-03 11:50:26'),
21 | (3,5,'User 5 returned the purchase','2014-04-03 11:50:28','2014-04-03 11:50:28');
22 |
23 | UNLOCK TABLES;
--------------------------------------------------------------------------------
/spec/support/spec_client.rb:
--------------------------------------------------------------------------------
require 'yaml'
require 'erb'

# Thin, test-only clients that talk directly to the backing stores,
# bypassing Forklift's own transport classes.
class SpecClient

  # Load a YAML config file, rendering any embedded ERB first.
  def self.load_config(file)
    YAML.load(ERB.new(File.read(file)).result)
  end

  # Build a raw Mysql2 client from the named connection config and select
  # its database if it already exists.
  def self.mysql(name)
    file = File.join(File.dirname(__FILE__), '..', 'config', 'connections', 'mysql', "#{name}.yml")
    config = self.load_config(file)
    # Connect without a database so we can handle the not-yet-created case.
    db = config.delete(:database)
    connection = ::Mysql2::Client.new(config)
    begin
      connection.query("use `#{db}`")
    rescue StandardError => e
      # Was `rescue Exception`, which also swallows interrupts and exit;
      # Mysql2::Error is a StandardError, so this is sufficient.
      # (Also fixed typo: "databse" -> "database".)
      puts "#{e} => will create new database #{db}"
    end
    connection
  end

  # Build an Elasticsearch client from the named connection config.
  def self.elasticsearch(name)
    file = File.join(File.dirname(__FILE__), '..', 'config', 'connections', 'elasticsearch', "#{name}.yml")
    config = self.load_config(file)
    ::Elasticsearch::Client.new(config)
  end

  # Read a CSV file into an array of symbol-keyed row hashes.
  def self.csv(file)
    CSV.read(file, headers: true, converters: :all).map {|r| r = r.to_hash.symbolize_keys }
  end

end
--------------------------------------------------------------------------------
/spec/support/spec_plan.rb:
--------------------------------------------------------------------------------
# Builds Forklift plans rooted at the spec/ directory with logging silenced.
class SpecPlan
  # The shared configuration used by every spec-created plan.
  def self.config
    {
      project_root: File.join(Dir.pwd, 'spec'),
      logger: {
        stdout: false,
        debug: false,
      },
    }
  end

  # Convenience constructor: returns a Forklift::Plan, not a SpecPlan.
  def self.new
    Forklift::Plan.new(config)
  end
end
--------------------------------------------------------------------------------
/spec/support/spec_seeds.rb:
--------------------------------------------------------------------------------
require 'json'
require 'fileutils'

# Seeds the test MySQL databases, Elasticsearch indices, and CSV files from
# the dumps under spec/support/dumps. (File.exists? -> File.exist?: the old
# alias was deprecated and removed in Ruby 3.2; parallel-array while-loops
# collapsed into single `each` passes.)
class SpecSeeds

  # (Re)create one MySQL database per config/connections/mysql/*.yml file
  # and load its matching .sql dump, if one exists.
  def self.setup_mysql
    files = Dir["#{File.dirname(__FILE__)}/../config/connections/mysql/*.yml"]
    files.each do |f|
      db = f.split('/').last.gsub('.yml','')
      conn = ::SpecClient.mysql(db)
      seed = File.join(File.dirname(__FILE__), '..', 'support', 'dumps', 'mysql', "#{db}.sql")
      conn.query("drop database if exists `#{db}`")
      conn.query("create database `#{db}`")
      conn.query("use `#{db}`")
      if File.exist? seed
        # Naive splitter: statements are ";"-separated; chunks whose first
        # character is "#" are treated as comments and skipped.
        lines = File.read(seed).split(";")
        lines.each do |line|
          conn.query(line) if line[0] != "#"
        end
      end
    end
  end

  # Recreate one Elasticsearch index per config/connections/elasticsearch/*.yml
  # file, seeding it from the matching .json dump, if one exists.
  def self.setup_elasticsearch
    files = Dir["#{File.dirname(__FILE__)}/../config/connections/elasticsearch/*.yml"]
    files.each do |f|
      index = f.split('/').last.gsub('.yml','')
      conn = ::SpecClient.elasticsearch(index)
      seed = File.join(File.dirname(__FILE__), '..', 'support', 'dumps', 'elasticsearch', "#{index}.json")
      conn.indices.delete({ index: index }) if conn.indices.exists({ index: index })
      if File.exist? seed
        lines = JSON.parse(File.read(seed))
        lines.each do |line|
          object = {
            index: index,
            body: line,
            type: 'forklift',
            id: line['id']
          }
          conn.index object # assumes ES is setup to allow index creation on write
        end
        conn.indices.refresh({ index: index })
      end
    end
  end

  # Reset /tmp/source.csv from the dump and remove any prior destination file.
  def self.setup_csv
    seed = File.join(File.dirname(__FILE__), '..', 'support', 'dumps', 'csv', "source.csv")
    source = '/tmp/source.csv'
    destination = '/tmp/destination.csv'
    FileUtils.rm(source, {force: true})
    FileUtils.rm(destination, {force: true})
    FileUtils.copy(seed, source)
  end

end
--------------------------------------------------------------------------------
/spec/template/spec_email_template.erb:
--------------------------------------------------------------------------------
1 | ## Your forklift email ##
2 |
3 | Total Users: <%= @total_users_count %>
4 | New Users: <%= @new_users_count %>
--------------------------------------------------------------------------------
/spec/template/spec_user_transformation.rb:
--------------------------------------------------------------------------------
# Ruby transformation used by the transformation specs: adds a `full_name`
# column to `users` and fills it with "<prefix> <first_name> <last_name>".
class SpecUserTransformation

  # connection: the Forklift connection running this transformation (provides #q).
  # forklift:   the plan/runner handle (unused here).
  # args:       expects { prefix: String }; the prefix is interpolated
  #             directly into the UPDATE below — test-only input, not escaped.
  def do!(connection, forklift, args)
    connection.q("ALTER TABLE `users` ADD `full_name` VARCHAR(255) NULL DEFAULT NULL AFTER `updated_at`;")
    connection.q("UPDATE `users` SET full_name = CONCAT('#{args[:prefix]}', ' ', first_name, ' ', last_name);")
  end

end
--------------------------------------------------------------------------------
/spec/template/spec_user_transformation.sql:
--------------------------------------------------------------------------------
1 | ALTER TABLE `users` ADD `full_name` VARCHAR(255) NULL DEFAULT NULL AFTER `updated_at`;
2 | UPDATE `users` SET full_name = CONCAT(first_name, ' ', last_name);
--------------------------------------------------------------------------------
/spec/unit/connection/mysql_spec.rb:
--------------------------------------------------------------------------------
1 | require 'spec_helper'
2 | require 'zlib'
3 |
4 | describe Forklift::Connection::Mysql do
5 |
6 | describe "read/write utils" do
7 | before(:each) do
8 | SpecSeeds.setup_mysql
9 | end
10 |
it "can read a list of tables" do
  plan = SpecPlan.new
  plan.do! do
    source = plan.connections[:mysql][:forklift_test_source_a]
    # All three seeded tables should be visible.
    ['users', 'products', 'sales'].each do |table|
      expect(source.tables).to include table
    end
  end
  plan.disconnect!
end
21 |
it "can delete a table" do
  table = "users"
  plan  = SpecPlan.new
  plan.do! do
    conn = plan.connections[:mysql][:forklift_test_source_a]
    expect(conn.tables).to include 'users'
    conn.drop! table
    # Gone once dropped.
    expect(conn.tables).to_not include 'users'
  end
  plan.disconnect!
end
33 |
it "can count the rows in a table" do
  table = "users"
  plan  = SpecPlan.new
  plan.do! do
    conn = plan.connections[:mysql][:forklift_test_source_a]
    # The seed dump inserts exactly 5 users.
    expect(conn.count(table)).to eql 5
  end
  plan.disconnect!
end
43 |
it "can truncate a table (both with and without !)" do
  table = "users"
  plan  = SpecPlan.new
  plan.do! do
    conn = plan.connections[:mysql][:forklift_test_source_a]
    expect(conn.count(table)).to eql 5
    conn.truncate! table
    expect(conn.count(table)).to eql 0
    # The non-bang version should never raise, even on an empty table.
    expect { conn.truncate(table) }.to_not raise_error
  end
  plan.disconnect!
end
56 |
# (Typo fix in the description: "trunacte!" -> "truncate!".)
it 'truncate! will raise if the table does not exist' do
  plan = SpecPlan.new
  table = "other_table"
  plan.do! {
    source = plan.connections[:mysql][:forklift_test_source_a]
    expect { source.truncate!(table) }.to raise_error(/Table 'forklift_test_source_a.other_table' doesn't exist/)
  }
  plan.disconnect!
end
66 |
it "can get the columns of a table" do
  table = "sales"
  plan  = SpecPlan.new
  plan.do! do
    conn = plan.connections[:mysql][:forklift_test_source_a]
    cols = conn.columns(table)
    # Every column of the seeded `sales` table, as symbols.
    [:id, :user_id, :product_id, :timestamp].each do |col|
      expect(cols).to include col
    end
  end
  plan.disconnect!
end
80 |
it "can create a mysqldump" do
  dump = "/tmp/destination.sql.gz"
  plan = SpecPlan.new
  plan.do! {
    source = plan.connections[:mysql][:forklift_test_source_a]
    source.dump(dump)
  }
  plan.disconnect!

  # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
  expect(File.exist?(dump)).to eql true
  # Gunzip in-memory and look for a known seeded row.
  contents = Zlib::GzipReader.new(StringIO.new(File.read(dump))).read
  expect(contents).to include "(1,'evan@example.com','Evan','T','2014-04-03 11:40:12','2014-04-03 11:39:28')"
end
94 |
95 | end
96 |
describe "#safe_values" do
  subject { described_class.new({}, {}) }

  it "escapes one trailing backslash" do
    result = subject.send(:safe_values, [:col], { :col => "foo\\" })
    expect(result).to eq("(\"foo\\\\\")")
  end

  it "escapes two trailing backslashes" do
    result = subject.send(:safe_values, [:col], { :col => "foo\\\\" })
    expect(result).to eq("(\"foo\\\\\\\\\")")
  end
end
112 | end
113 |
--------------------------------------------------------------------------------
/spec/unit/misc/email_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'
require "email_spec"

describe 'misc forklift core' do
  describe 'email' do
    include EmailSpec::Helpers
    include EmailSpec::Matchers

    it "can send mail with an email template" do
      plan = SpecPlan.new
      plan.do! {
        args = {
          to: "YOU@FAKE.com",
          from: "Forklift",
          subject: "Forklift has moved your database",
        }
        variables = {
          total_users_count: 10,
          new_users_count: 5,
        }
        template = "#{File.dirname(__FILE__)}/../../template/spec_email_template.erb"
        @email = plan.mailer.send_template(args, template, variables).first
      }
      plan.disconnect!

      expect(@email).to deliver_to("YOU@FAKE.com")
      expect(@email).to have_subject(/Forklift has moved your database/)
      # Base template text plus both interpolated variables.
      expect(@email).to have_body_text(/Your forklift email/)
      expect(@email).to have_body_text(/Total Users: 10/)
      expect(@email).to have_body_text(/New Users: 5/)
    end

    it "can send mail with an attachment" do
      skip("how to test email attachments?")
    end
  end

end
--------------------------------------------------------------------------------
/spec/unit/misc/error_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

describe 'misc forklift core' do
  describe 'error handling' do

    it "un-caught errors will raise" do
      plan = SpecPlan.new
      expect {
        plan.do! {
          plan.step("step_a"){ raise 'BREAK' }
        }
      }.to raise_error 'BREAK'
      # The pidfile is left behind by the aborted run; clean it up by hand.
      plan.pid.delete!
      plan.disconnect!
    end

    it 'can make error handlers' do
      plan = SpecPlan.new
      captured_name  = ''
      captured_error = ''
      handler = lambda { |n, e|
        captured_error = e
        captured_name  = n
      }
      plan.do! {
        plan.step("step_a", handler){ raise 'BREAK' }
      }
      plan.disconnect!

      # The handler receives the step name (as a symbol) and the exception.
      expect(captured_name).to eql :step_a
      expect(captured_error.to_s).to eql 'BREAK'
    end

  end
end
--------------------------------------------------------------------------------
/spec/unit/misc/pid_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

describe 'misc forklift core' do

  describe 'pidfile' do
    it "can create a pidfile and will remove it when the plan is over" do
      plan = SpecPlan.new
      pid = "#{File.dirname(__FILE__)}/../../pid/pidfile"
      # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
      expect(File.exist?(pid)).to eql false
      plan.do! {
        # While the plan runs, the pidfile holds our own process id.
        expect(File.exist?(pid)).to eql true
        expect(File.read(pid).to_i).to eql Process.pid
      }
      plan.disconnect!
      expect(File.exist?(pid)).to eql false
    end

    it "will not run with an existing pidfile" do
      plan = SpecPlan.new
      plan.pid.store!
      expect { plan.do! }.to raise_error SystemExit
      plan.pid.delete!
      plan.disconnect!
    end
  end

end
--------------------------------------------------------------------------------
/spec/unit/misc/step_spec.rb:
--------------------------------------------------------------------------------
require 'spec_helper'

describe 'misc forklift core' do
  describe 'steps' do

    # Step selection only kicks in when FORKLIFT_RUN_ALL_STEPS is off.
    before(:each) do
      ENV['FORKLIFT_RUN_ALL_STEPS'] = 'false'
    end

    after(:each) do
      ENV['FORKLIFT_RUN_ALL_STEPS'] = 'true'
    end

    it "will run all steps with no extra ARGV" do
      plan = SpecPlan.new
      allow(plan).to receive(:argv){ ['/path/to/plan'] }
      executed = []
      plan.do! {
        plan.step("a"){ executed << 'a' }
        plan.step("b"){ executed << 'b' }
        plan.step("c"){ executed << 'c' }
      }
      plan.disconnect!
      ['a', 'b', 'c'].each do |name|
        expect(executed).to include name
      end
    end

    it "will only run steps named within ARGV" do
      plan = SpecPlan.new
      allow(plan).to receive(:argv){ ['/path/to/plan', 'a','c'] }
      executed = []
      plan.do! {
        plan.step("a"){ executed << 'a' }
        plan.step("b"){ executed << 'b' }
        plan.step("c"){ executed << 'c' }
      }
      plan.disconnect!
      expect(executed).to include 'a'
      expect(executed).to_not include 'b'
      expect(executed).to include 'c'
    end

    it "won't run on a badly defined step" do
      plan = SpecPlan.new
      allow(plan).to receive(:argv){ ['/path/to/plan', 'missing_step'] }
      expect {
        plan.do! {
          plan.step("a"){ raise 'never should get here' }
        }
        plan.disconnect!
      }.to raise_error SystemExit
    end
  end

end
--------------------------------------------------------------------------------
/template/destination.yml:
--------------------------------------------------------------------------------
1 | # Template MySQL connection config — presumably copied into
2 | # config/connections/mysql/ of a generated project; verify against `forklift` bin
3 | encoding: utf8
4 | database: destination
5 | username: root
6 | password:
7 | host: 127.0.0.1
8 | port: 3306
9 |
--------------------------------------------------------------------------------
/template/email.erb:
--------------------------------------------------------------------------------
1 | Your forklift email
2 |
--------------------------------------------------------------------------------
/template/email.yml:
--------------------------------------------------------------------------------
1 | # Configuration is passed to Pony (https://github.com/benprew/pony)
2 | # NOTE(review): template copy — projects appear to use this as config/email.yml; confirm
3 |
4 | # ==> SMTP
5 | # If testing locally, mailcatcher (https://github.com/sj26/mailcatcher) is a helpful gem
6 | via: smtp
7 | via_options:
8 |   address: localhost
9 |   port: 1025
10 |   # user_name: user
11 |   # password: password
12 |   # authentication: :plain # :plain, :login, :cram_md5, no auth by default
13 |   # domain: "localhost.localdomain" # the HELO domain provided by the client to the server
14 |
15 | # ==> Sendmail
16 | # via: sendmail
17 | # via_options:
18 |   # location: /usr/sbin/sendmail
19 |   # arguments: '-t -i'
20 |
--------------------------------------------------------------------------------
/template/plan.rb:
--------------------------------------------------------------------------------
1 | # Create a plan with default configuration:
2 | # plan = Forklift::Plan.new
3 |
4 | # Or, you can pass configuration options:
5 | plan = Forklift::Plan.new({
6 |   # logger: {debug: true}
7 | })
8 |
9 | # Steps declared inside this block run when the plan executes
10 | plan.do! do
11 |   # Your plan here.
12 | end
13 |
--------------------------------------------------------------------------------
/template/source.yml:
--------------------------------------------------------------------------------
1 | # Template MySQL connection config — presumably copied into
2 | # config/connections/mysql/ of a generated project; verify against `forklift` bin
3 | encoding: utf8
4 | database: source
5 | username: root
6 | password:
7 | host: 127.0.0.1
8 | port: 3306
9 |
--------------------------------------------------------------------------------