├── .gitignore
├── Makefile
├── README.md
├── circle.yml
├── logback.example.xml
├── project.clj
├── src
│   └── uswitch
│       └── big_replicate
│           ├── materialize.clj
│           └── sync.clj
└── test
    └── uswitch
        └── big_replicate
            └── sync_test.clj

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
.nrepl-port
.lein-failures

--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
SOURCES=$(wildcard *.clj) $(wildcard src/uswitch/*.clj)

./target/big-replicate-standalone.jar: $(SOURCES)
	lein uberjar

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Big Replicate

Command-line tool for [Google Cloud BigQuery](https://cloud.google.com/bigquery/).

Provides two core features:

1. **Sync**: copies/synchronises Google Analytics session tables between datasets and projects. For example, moving Google Analytics data from a US dataset into the EU.
2. **Materialize**: executes a statement and writes the output to a table. Useful for materializing views/complex intermediate tables.

[![CircleCI](https://circleci.com/gh/uswitch/big-replicate.svg?style=svg)](https://circleci.com/gh/uswitch/big-replicate)

## Usage

### Synchronising Data

A dataset with the same name as the source must already exist in the destination project. The tool looks for any session tables that are missing from the destination dataset and replicates the `--number` most recent ones. We use the `--number` parameter to incrementally copy very large datasets over time.

```bash
export GCLOUD_PROJECT="source-project-id"
export GOOGLE_APPLICATION_CREDENTIALS="./service-account-key.json"

export JVM_OPTS="-Dlogback.configurationFile=./logback.example.xml"

java $JVM_OPTS -cp big-replicate-standalone.jar \
  uswitch.big_replicate.sync \
  --source-project source-project-id \
  --source-dataset 98909919 \
  --destination-project destination-project-id \
  --destination-dataset 98909919 \
  --table-filter "ga_sessions_\d+" \
  --google-cloud-bucket gs://staging-data-bucket \
  --number 30
```

Because only tables missing from the destination dataset are processed, existing tables will not be overwritten.

The example above is intended for replicating [Google Analytics BigQuery data](https://support.google.com/analytics/answer/3437618?hl=en) from one project to another. It works by:

* Specifying `--table-filter` to only replicate tables matching the expected `ga_sessions_\d+` filter. This can be any valid Java regular expression.
* Specifying `--number` to restrict the replication to 30 tables. Tables are sorted in reverse lexicographic order by the tool.

This ensures that the most recent 30 daily tables that exist in the source dataset but not in the `--destination-project` and `--destination-dataset` will be replicated.

`big-replicate` runs multiple extract and load jobs concurrently; this is currently set to the number of available processors (as reported by the JVM runtime). You can override it with the `--number-of-agents` flag. Since no processing is performed client-side (all operations are BigQuery jobs) it's safe to set this well above the processor count.
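The selection itself is simple; conceptually it reduces to the sketch below. This is an illustration with made-up table names rather than the tool's actual code (which works on full table references in `sync.clj`):

```clojure
;; Illustrative only: how the most recent missing tables are chosen.
(let [sources      ["ga_sessions_20160101" "ga_sessions_20160102" "ga_sessions_20160103"]
      destinations #{"ga_sessions_20160101"}
      number       2]
  (->> sources
       (remove destinations) ;; keep only tables missing from the destination
       (sort)                ;; YYYYMMDD suffixes sort lexicographically by date
       (reverse)             ;; most recent first
       (take number)))
;; => ("ga_sessions_20160103" "ga_sessions_20160102")
```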

### Materializing Data

We often use views to help break apart more complex queries, building join tables between datasets, etc. The `materialize` operation executes a statement and stores the output in a table.

```bash
export GCLOUD_PROJECT="source-project-id"
export GOOGLE_APPLICATION_CREDENTIALS="./service-account-key.json"

export JVM_OPTS="-Dlogback.configurationFile=./logback.example.xml"

echo "SELECT * FROM [dataset.sample_table]" | java $JVM_OPTS \
  -cp big-replicate-standalone.jar \
  uswitch.big_replicate.materialize \
  --project-id destination-project-id \
  --dataset-id destination-dataset-id \
  --table-id destination-table \
  --force
```
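Note that `--force` only changes the BigQuery write disposition: without it the query job is submitted with an `:empty` disposition and will fail rather than touch a destination table that already contains data; with it the table is truncated and overwritten. A rough sketch of the job options built in `materialize.clj` below (the `query-options` helper name is only for illustration; `destination` stands for the project/dataset/table id map):

```clojure
;; Sketch of the query-job options materialize.clj passes to BigQuery.
(defn query-options [destination force?]
  {:create-disposition :needed                      ;; create the table if it doesn't exist
   :write-disposition  (if force? :truncate :empty) ;; overwrite vs. require-empty
   :large-results?     true
   :use-cache?         false
   :priority           :interactive
   :destination-table  destination})
```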

## Releases

Binaries are built on [CircleCI](https://circleci.com/gh/uswitch/big-replicate) with artifacts pushed to [GitHub Releases](https://github.com/uswitch/big-replicate/releases). The published jar is suitable for running directly as above.

## Building

The tool is written in [Clojure](https://clojure.org) and requires [Leiningen](https://github.com/technomancy/leiningen).

```
$ make
```

## License

Copyright © 2016 uSwitch

Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version.

--------------------------------------------------------------------------------
/circle.yml:
--------------------------------------------------------------------------------
general:
  branches:
    only:
      - master
  artifacts:
    - "target/big-replicate-standalone.jar"
test:
  override:
    - make
deployment:
  release:
    branch: master
    commands:
      - go get github.com/uswitch/ghr
      - ghr -t $GITHUB_TOKEN -u $CIRCLE_PROJECT_USERNAME -r $CIRCLE_PROJECT_REPONAME --replace $CIRCLE_BUILD_NUM target/big-replicate-standalone.jar

--------------------------------------------------------------------------------
/logback.example.xml:
--------------------------------------------------------------------------------
<configuration>
  <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
    <encoder>
      <pattern>[%d] %-5level %logger{36} - %msg%n%ex{full}</pattern>
    </encoder>
  </appender>

  <root level="INFO">
    <appender-ref ref="STDOUT"/>
  </root>
</configuration>

--------------------------------------------------------------------------------
/project.clj:
--------------------------------------------------------------------------------
(defproject big-replicate "0.1.0"
  :description "Copies data between BigQuery projects"
  :url "https://github.com/uswitch/big-replicate"
  :license {:name "Eclipse Public License"
            :url "http://www.eclipse.org/legal/epl-v10.html"}
  :uberjar-name "big-replicate-standalone.jar"
  :dependencies [[org.clojure/clojure "1.8.0"]
                 [gclouj/bigquery "0.2.5" :exclusions [commons-logging]]
                 [gclouj/storage "0.2.5"]
                 [org.clojure/tools.cli "0.3.1"]
                 [org.clojure/tools.logging "0.3.1"]
                 [org.clojure/core.async "0.2.385"]
                 [org.slf4j/slf4j-api "1.7.21"]
                 [org.slf4j/jcl-over-slf4j "1.7.21"]]
  :profiles {:dev     {:dependencies [[org.slf4j/slf4j-simple "1.7.21"]]
                       :jvm-opts ["-Dorg.slf4j.simpleLogger.defaultLogLevel=debug"]}
             :uberjar {:dependencies [[ch.qos.logback/logback-classic "1.1.7"]]
                       :aot :all}})

--------------------------------------------------------------------------------
/src/uswitch/big_replicate/materialize.clj:
--------------------------------------------------------------------------------
(ns uswitch.big-replicate.materialize
  (:require [clojure.tools.cli :refer (parse-opts)]
            [clojure.tools.logging :refer (info error)]
            [gclouj.bigquery :as bq])
  (:gen-class))

(def cli [["-p" "--project-id PROJECT_ID" "Google Cloud Project ID"]
          ["-d" "--dataset-id DATASET_ID" "Output Dataset ID"]
          ["-t" "--table-id TABLE_ID" "Output Table ID"]
          ["-f" "--force" "Overwrite destination table contents" :default false]
          ["-h" "--help"]])

(defn waiting? [job]
  (let [state (get-in job [:status :state])]
    (or (= :running state)
        (= :pending state))))

(def finished? (complement waiting?))
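
;; Illustrative examples: the predicates above classify job maps as returned
;; by gclouj.bigquery, looking only at the [:status :state] path used in
;; `waiting?`.
(comment
  (waiting?  {:status {:state :pending}})  ;; => true
  (waiting?  {:status {:state :running}})  ;; => true
  (finished? {:status {:state :done}})     ;; => true
  )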

(defn wait-for-job! [bigquery job-id]
  (loop [job (bq/job bigquery job-id)]
    (if (finished? job)
      job
      (do (info "waiting for job:" job)
          (Thread/sleep 10000)
          (recur (bq/job bigquery job-id))))))

(defn errors [job]
  (get-in job [:status :errors]))

(defn materialize [query {:keys [project-id dataset-id table-id force]}]
  (let [bigquery    (bq/service)
        destination {:project-id project-id
                     :dataset-id dataset-id
                     :table-id   table-id}]
    (let [{:keys [job-id] :as job} (bq/query-job bigquery query {:create-disposition :needed
                                                                 :write-disposition  (if force :truncate :empty)
                                                                 :large-results?     true
                                                                 :destination-table  destination
                                                                 :use-cache?         false
                                                                 :priority           :interactive})]
      (info "started materialize job" job)
      (let [job (wait-for-job! bigquery job-id)
            es  (errors job)]
        (if (seq es)
          (do (error "failed:" es)
              (System/exit 1))
          (info "completed" job))))))

(defn -main [& args]
  (let [{:keys [options summary errors]} (parse-opts args cli)]
    (when errors
      (println errors)
      (System/exit 1))
    (when (:help options)
      (println summary)
      (System/exit 0))
    (materialize (slurp *in*) options)))

--------------------------------------------------------------------------------
/src/uswitch/big_replicate/sync.clj:
--------------------------------------------------------------------------------
(ns uswitch.big-replicate.sync
  (:require [gclouj.bigquery :as bq]
            [gclouj.storage :as cs]
            [clojure.string :as s]
            [clojure.tools.cli :refer (parse-opts)]
            [clojure.set :as se]
            [clojure.string :as st]
            [clojure.tools.logging :refer (info error debug)]
            [clojure.core.async :as a])
  (:import [com.google.api.client.googleapis.json GoogleJsonResponseException])
  (:gen-class))

(def cli [["-s" "--source-project PROJECT_ID" "Source Google Cloud Project"]
          ["-i" "--source-dataset DATASET_ID" "Source BigQuery dataset"]
          ["-p" "--destination-project PROJECT_ID" "Destination Google Cloud Project"]
          ["-d" "--destination-dataset DATASET_ID" "Destination BigQuery dataset"]
          ["-f" "--table-filter REGEXP" "Only tables matching this regexp will be processed"
           :default (re-pattern ".*") :parse-fn re-pattern]
          ["-g" "--google-cloud-bucket BUCKET" "Staging bucket to store exported data"]
          ["-n" "--number NUMBER" "Number of days to look back for missing tables"
           :default 7 :parse-fn #(Integer/parseInt %)]
          ["-a" "--number-of-agents NUMBER" "Number of concurrent replication agents to run."
           :default (.availableProcessors (Runtime/getRuntime)) :parse-fn #(Integer/parseInt %)]
          ["-h" "--help" "Display summary"]])

(defrecord TableReference [project-id dataset-id table-id])

(defn table-ref [{:keys [project-id dataset-id table-id]}]
  (TableReference. project-id dataset-id table-id))

(defn tables
  "Finds all tables in dataset to replicate"
  [project-id dataset pattern]
  (let [service (bq/service {:project-id project-id})]
    (->> (bq/tables service {:project-id project-id
                             :dataset-id dataset})
         (map :table-id)
         (filter (fn [table] (re-matches pattern (:table-id table))))
         (map table-ref))))

(defn staging-location [bucket {:keys [dataset-id table-id] :as table-reference}]
  (let [prefix (format "%s/%s" dataset-id table-id)]
    {:uri    (format "%s/%s/*" bucket prefix)
     :prefix prefix}))
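
;; staging-location only derives naming conventions; for example, with the
;; bucket and dataset from the README's sync example:
;;
;;   (staging-location "gs://staging-data-bucket"
;;                     (table-ref {:project-id "source-project-id"
;;                                 :dataset-id "98909919"
;;                                 :table-id   "ga_sessions_20160101"}))
;;   ;; => {:uri    "gs://staging-data-bucket/98909919/ga_sessions_20160101/*"
;;   ;;     :prefix "98909919/ga_sessions_20160101"}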

(defn extract-table [{:keys [source-table staging-bucket] :as current-state}]
  {:pre [(.startsWith staging-bucket "gs://")
         (not (.endsWith staging-bucket "/"))]}
  (let [{:keys [uri prefix]} (staging-location staging-bucket source-table)]
    (info "starting extract for" source-table "into" uri)
    (let [job (bq/extract-job (bq/service {:project-id (:project-id source-table)}) source-table uri)]
      (assoc current-state
             :state :wait-for-extract
             :extract-uri uri
             :staging-prefix prefix
             :job job))))

(defn pending? [job]
  (= :pending (get-in job [:status :state])))

(defn failed? [job]
  (and (= :done (get-in job [:status :state]))
       (not (empty? (get-in job [:status :errors])))))

(defn poll-job [job]
  (let [{:keys [job-id]} job]
    (let [job-state (bq/job (bq/service {:project-id (:project-id job-id)})
                            job-id)]
      (cond (pending? job-state)       [:pending job-state]
            (bq/running? job-state)    [:running job-state]
            (failed? job-state)        [:failed job-state]
            (bq/successful? job-state) [:successful job-state]))))

(defn wait-for-job [next-state {:keys [job] :as current-state}]
  (let [[status job] (poll-job job)]
    (cond (= :failed status)     (assoc current-state
                                        :state :failed
                                        :job job)
          (= :successful status) (assoc current-state
                                        :state next-state)
          :else (do (debug "waiting for job" job)
                    (Thread/sleep 30000)
                    (assoc current-state :job job)))))

(def wait-for-extract (partial wait-for-job :load))
(def wait-for-load    (partial wait-for-job :cleanup))

(defn load-table [{:keys [destination-table source-table extract-uri] :as current-state}]
  (let [table  (bq/table (bq/service {:project-id (:project-id source-table)}) source-table)
        schema (get-in table [:definition :schema])]
    (let [job (bq/load-job (bq/service {:project-id (:project-id destination-table)})
                           destination-table
                           {:create-disposition :needed
                            :write-disposition  :empty
                            :schema             schema} [extract-uri])]
      (info "starting load into" destination-table)
      (assoc current-state
             :state :wait-for-load
             :job job))))

(defn cleanup [{:keys [extract-uri staging-bucket staging-prefix] :as current-state}]
  (let [bucket  (st/replace staging-bucket "gs://" "")
        storage (cs/service)]
    (info "deleting staging location" extract-uri)
    (debug "finding objects in" bucket "with prefix" staging-prefix)
    (loop [objects (cs/blobs storage bucket staging-prefix)]
      (if-let [blobs (seq objects)]
        (let [blob (first blobs)]
          (debug "deleting" (:id blob))
          (let [deleted? (cs/delete-blob storage (:id blob))]
            (if deleted?
              (recur (rest objects))
              (assoc current-state
                     :state :failed
                     :cause (format "couldn't delete %s" (pr-str (:id blob)))))))
        (assoc current-state :state :completed)))))

(defn progress [current-state]
  (loop [current-state current-state]
    (let [state (:state current-state)]
      (if-let [op ({:extract          extract-table
                    :wait-for-extract wait-for-extract
                    :load             load-table
                    :wait-for-load    wait-for-load
                    :cleanup          cleanup} state)]
        (recur (try (op current-state)
                    (catch Exception ex
                      (assoc current-state :state :failed :exception ex))))
        current-state))))
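
;; Overview of the state machine that `progress` drives for each table:
;;
;;   :extract -> :wait-for-extract -> :load -> :wait-for-load -> :cleanup -> :completed
;;
;; extract-table exports the source table into the staging bucket, load-table
;; loads the extracted files into the destination table using the source
;; table's schema, and cleanup deletes the staged objects. The two wait states
;; poll the corresponding BigQuery job every 30 seconds; any step may instead
;; finish in :failed, carrying a :cause or :exception.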

(defn replicator-agent [in-ch completed-ch]
  (a/thread
    (loop [state (a/<!! in-ch)]
      (when state
        (let [final-state (progress state)]
          (a/>!! completed-ch final-state))
        (recur (a/<!! in-ch))))))

(defn missing-tables [sources destinations]
  (let [t (se/difference (set (map :table-id sources))
                         (set (map :table-id destinations)))]
    (->> sources
         (filter (fn [{:keys [table-id]}]
                   (some (set [table-id]) t))))))

(defn- override [m k overrides]
  (let [override (k overrides)]
    (update-in m [k] (fn [val] (or override val)))))

(defn destination-table
  [table overrides]
  (-> table
      (override :project-id overrides)
      (override :dataset-id overrides)))

(defn target-tables [{:keys [source-project source-dataset destination-project destination-dataset table-filter number] :as options}]
  (let [sources      (tables source-project source-dataset table-filter)
        destinations (tables destination-project
                             (or destination-dataset
                                 source-dataset)
                             table-filter)]
    (->> (missing-tables sources destinations)
         (sort-by :table-id)
         (reverse)
         (take number))))

(defn -main [& args]
  (let [{:keys [options summary errors]} (parse-opts args cli)]
    (when errors
      (println errors)
      (System/exit 1))
    (when (:help options)
      (println summary)
      (System/exit 0))
    (let [targets (target-tables options)]
      (when (empty? targets)
        (info "no tables to copy")
        (System/exit 0))
      (let [in-ch        (a/chan (:number options))
            completed-ch (a/chan)]
        (info "syncing" (count targets) "tables:\n" (st/join "\n" (map pr-str targets)))
        (let [{:keys [google-cloud-bucket destination-project destination-dataset]} options
              overrides {:project-id destination-project
                         :dataset-id destination-dataset}]
          (->> targets
               (map (fn [source-table]
                      {:source-table      source-table
                       :destination-table (destination-table source-table overrides)
                       :staging-bucket    google-cloud-bucket
                       :state             :extract}))
               (a/onto-chan in-ch)))
        (let [agents (:number-of-agents options)]
          (info "creating" agents "replicator agents")
          (dotimes [_ agents]
            (replicator-agent in-ch completed-ch)))
        (let [expected-count (count targets)]
          (loop [n 1
                 m (a/<!! completed-ch)]
            ...))))))

--------------------------------------------------------------------------------
/test/uswitch/big_replicate/sync_test.clj:
--------------------------------------------------------------------------------
(ns uswitch.big-replicate.sync-test
  (:require [clojure.test :refer :all]
            [uswitch.big-replicate.sync :as sync :refer (missing-tables destination-table)]))

(defn table [project dataset table]
  (sync/map->TableReference {:project-id project
                             :dataset-id dataset
                             :table-id   table}))

(deftest identify-target-tables
  (let [sources [(table "source-project" "source-dataset" "source-table1")]]
    (is (= 1 (count (missing-tables sources []))))
    (is (= (table "source-project" "source-dataset" "source-table1")
           (first (missing-tables sources []))))))

(deftest override-destination
  (is (= (table "source-project" "new-dataset" "table1")
         (destination-table (table "source-project"
                                   "source-destination"
                                   "table1")
                            {:dataset-id "new-dataset"}))))

--------------------------------------------------------------------------------