├── .github └── workflows │ ├── codeql-analysis.yml │ └── issues.yml ├── .gitignore ├── .travis.yml ├── Makefile ├── README.md ├── bin └── update-readme ├── cmds └── moresql │ └── main.go ├── config.go ├── config_test.go ├── db.go ├── docs ├── README.md ├── README.template.md ├── TODO.md ├── deploying.md ├── index.md └── performance.html ├── examples └── moresql.json ├── full_sync.go ├── full_sync_test.go ├── go.mod ├── go.sum ├── goreleaser.yml ├── mkdocs.yml ├── moresql.go ├── moresql_test.go ├── structs.go ├── structs_test.go ├── tail.go ├── tail_test.go ├── utils.go └── utils_test.go /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ master ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ master ] 20 | schedule: 21 | - cron: '22 16 * * 1' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'go' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://git.io/codeql-language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v2 42 | 43 | # Initializes the CodeQL tools for scanning. 
44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v1 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 52 | 53 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 54 | # If this step fails, then you should remove it and run the build manually (see below) 55 | - name: Autobuild 56 | uses: github/codeql-action/autobuild@v1 57 | 58 | # ℹ️ Command-line programs to run using the OS shell. 59 | # 📚 https://git.io/JvXDl 60 | 61 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 62 | # and modify them (or add more) to build your code if your project 63 | # uses a compiled language 64 | 65 | #- run: | 66 | # make bootstrap 67 | # make release 68 | 69 | - name: Perform CodeQL Analysis 70 | uses: github/codeql-action/analyze@v1 71 | -------------------------------------------------------------------------------- /.github/workflows/issues.yml: -------------------------------------------------------------------------------- 1 | name: "Close stale issues" 2 | on: 3 | schedule: 4 | - cron: "30 1 * * *" 5 | 6 | jobs: 7 | stale: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - uses: actions/stale@v3.0.14 11 | with: 12 | repo-token: ${{ secrets.GITHUB_TOKEN }} 13 | stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. 
Remove stale label or comment or this will be closed in 5 days' 14 | days-before-stale: 30 15 | days-before-close: 5 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debug 2 | bin/moresql 3 | tmp 4 | resources 5 | bin 6 | moresql.json 7 | moresql-* 8 | moresql 9 | dist/ 10 | site/ 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | 3 | go: 4 | - 1.6 5 | - 1.7 6 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Borrowed from: 2 | # https://github.com/silven/go-example/blob/master/Makefile 3 | # https://vic.demuzere.be/articles/golang-makefile-crosscompile/ 4 | # https://ariejan.net/2015/10/03/a-makefile-for-golang-cli-tools/ 5 | # https://marmelab.com/blog/2016/02/29/auto-documented-makefile.html 6 | 7 | SOURCEDIR=. 
8 | SOURCES := $(shell find $(SOURCEDIR) -name '*.go' -maxdepth 1 | grep -v main.go | grep -v _test.go) 9 | FILES = $(SOURCES) 10 | BINARY = moresql 11 | MAIN = cmds/moresql/main.go 12 | DATE_COMPILED = $(shell date -u +"%Y-%m-%dT%H:%M:%SZ") 13 | LDFLAGS_BASE = "-X main.version='$(shell git describe --abbrev=0 --tags --always)' -X main.BuildDate='$(DATE_COMPILED)' -X main.GitRef='$(shell git describe --tags --dirty --always)' -X main.GitSHA='$(shell git rev-parse --short HEAD)'" 14 | LDFLAGS = -ldflags $(LDFLAGS_BASE) 15 | # Symlink into GOPATH 16 | GITHUB_USERNAME=zph 17 | BUILD_DIR=${GOPATH}/src/github.com/${GITHUB_USERNAME}/${BINARY} 18 | CURRENT_DIR=$(shell pwd) 19 | BUILD_DIR_LINK=$(shell readlink ${BUILD_DIR}) 20 | GOARCH = amd64 21 | .DEFAULT_GOAL := help 22 | 23 | # Build the project 24 | all: clean fmt test_full linux build docs 25 | 26 | $(BINARY): $(FILES) $(MAIN) ## Build binary for current system architecture 27 | go build $(LDFLAGS) -o bin/$(BINARY) $(MAIN) 28 | 29 | build: $(BINARY) 30 | 31 | heroku: build ## Used by heroku build process 32 | 33 | flags: 34 | @echo "$(LDFLAGS_BASE)" 35 | 36 | test: ## Run tests 37 | go test -v 38 | 39 | test_full: ## Test with race and coverage 40 | go test -v -race -cover 41 | 42 | linux: 43 | GOOS=linux GOARCH=${GOARCH} go build $(LDFLAGS) -o bin/$(BINARY)-linux-${GOARCH} $(MAIN) 44 | 45 | # darwin: 46 | # cd ${BUILD_DIR}; \ 47 | # GOOS=darwin GOARCH=${GOARCH} go build ${LDFLAGS} -o bin/${BINARY}-darwin-${GOARCH} . ; \ 48 | # cd - >/dev/null 49 | 50 | fmt: ## Go fmt the code 51 | cd ${BUILD_DIR}; \ 52 | go fmt $$(go list ./... 
| grep -v /vendor/) ; \ 53 | cd - >/dev/null 54 | 55 | clean: ## Clean out the generated binaries 56 | -rm -f bin/${BINARY}-* 57 | -rm -f bin/${BINARY} 58 | 59 | docs: clean ## Regenerate README.md from template 60 | @./bin/update-readme 61 | @echo "If changes occured in README.md that you want in mkdocs run:" 62 | @echo "cp -f README.md docs/README.md" 63 | 64 | docs-deploy: 65 | @git diff-index --quiet HEAD -- || (echo "Only allowed with clean working directory" && exit 1) 66 | @mkdocs gh-deploy 67 | 68 | # Allows building whether in GOPATH or not 69 | # link: 70 | # BUILD_DIR=${BUILD_DIR}; \ 71 | # BUILD_DIR_LINK=${BUILD_DIR_LINK}; \ 72 | # CURRENT_DIR=${CURRENT_DIR}; \ 73 | # if [ "$${BUILD_DIR_LINK}" != "$${CURRENT_DIR}" ]; then \ 74 | # echo "Fixing symlinks for build"; \ 75 | # rm -f $${BUILD_DIR}; \ 76 | # ln -s $${CURRENT_DIR} $${BUILD_DIR}; \ 77 | # fi 78 | 79 | help: ## prints help 80 | @ cat $(MAKEFILE_LIST) | grep -e "^[a-zA-Z_\-]*: *.*## *" | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 81 | 82 | .PHONY: link linux darwin test fmt clean help 83 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MoreSQL 2 | 3 | [](NOTE: README.md is a generated FILE changes belong in docs/README.template.md. Update with make docs) 4 | [![Build Status](https://travis-ci.org/zph/moresql.svg?branch=master)](https://travis-ci.org/zph/moresql) 5 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 6 | 7 | ## Introduction 8 | 9 | MoreSQL streams changes occuring in Mongo database into a Postgres db. MoreSQL tails the oplog and generates appropriate actions against Postgres. MoreSQL has the ability to do full synchronizations using `UPSERTS`, with the benefit over `INSERTS` that this can be executed against tables with existing data. 
10 | 11 | MoreSQL gives you a chance to use more sql and less mongo query language. 12 | 13 | Project maturity: Moresql has 3 years of running in production for 2+ companies 14 | on their business critical workloads for core systems. It is a stable project 15 | with low maintenance and high performance. 16 | 17 | ## QuickStart 18 | ### Introduction 19 | 20 | * Create metadata table 21 | * Setup moresql.json (see Configuration) 22 | * Setup any recipient tables in postgres 23 | * Validate with `./moresql -validate` 24 | * Deploy moresql binary to server from Github Releases 25 | * Configure Environmental variables 26 | * Run `./moresql -tail` to start transmitting novelty 27 | * Run `./moresql -full-sync` to populate the database 28 | * Write more sql ;D 29 | 30 | # Usage 31 | 32 | ## Basic Use 33 | ### Configuration 34 | 35 | moresql.json configuration structure 36 | 37 | ``` 38 | { 39 | "DB_NAME": { 40 | "collections": { 41 | "COLLECTION_NAME": { 42 | "name": "COLLECTION_NAME", 43 | "pg_table": "PG_TABLE_NAME", 44 | "fields": { 45 | ... 46 | } 47 | }, 48 | "COLLECTION_NAME2": { 49 | "name": "COLLECTION_NAME2", 50 | "pg_table": "PG_TABLE_NAME2", 51 | "fields": { 52 | ... 53 | } 54 | } 55 | } 56 | } 57 | } 58 | ``` 59 | 60 | Field attributes have a simple and complex format. 61 | 62 | The simple format is where you want to use the mongo field name as the postgres column name. 63 | In this format fields take the mongo field name (as key) and its postgres type (as value). 64 | The value can be any valid postgres type (https://www.postgresql.org/docs/current/datatype.html) 65 | ``` 66 | "fields": { 67 | "_id": "TEXT", 68 | "addresses": "JSONB", 69 | } 70 | ``` 71 | 72 | The complex format allows for renaming fields, extracting nested keys and some advanced operations. 
73 | See gjson project for full syntax details of dot notation: https://github.com/tidwall/gjson#path-syntax 74 | ``` 75 | "fields": { 76 | "_id": "TEXT", 77 | // preferences is an field name that has an object in it. The object has a key of unsubscribe. 78 | // The following will promote this nested unsubscribe to a top level value as a postgres column 79 | "preferences.unsubscribe": { 80 | "Postgres": {"Name": "is_unsubscribed", "Type": "BOOLEAN"}, 81 | "Mongo": {"Name": "preferences.unsubscribe", "Type": "object"} 82 | }, 83 | // This extraction will look in books field with [{product_id: 1}, {product_id: 2}] 84 | // And for each object in there, fetch the product_id and combine it into an array of product ids [1, 2] 85 | // That value will be inserted into a postgres column called `book_ids`. 86 | "books.#.product_id": { 87 | "Postgres": { 88 | "Name": "book_ids", 89 | "Type": "JSONB" 90 | }, 91 | "Mongo": { 92 | "Name": "books.#.product_id", 93 | "Type": "object" 94 | } 95 | }, 96 | } 97 | ``` 98 | 99 | See `examples/moresql.json` for a full configuration 100 | 101 | ### Tail 102 | 103 | `./moresql -tail -config-file=moresql.json` 104 | 105 | Tail is the primary run mode for MoreSQL. When tailing, the oplog is observed for novely and each INSERT/UPDATE/DELETE is translated to its SQL equivalent, then executed against Postgres. 106 | 107 | Tail makes a best faith effort to do this and uses checkpoint markers to track last successfully applied Mongo Oplog event. 108 | 109 | Given that `tail` mode executes `UPSERTS` instead of `INSERT || UPDATE`, we expect MoreSQL to be roughly eventually consistent. We're chosing to prioritize speed of execution (multiple workers) in lieu of some consistency. This helps to keep low latency with larger workloads. We currently partition workload among multiple workers but ensure that each `collection.id` combination will be routed to same worker in correct oplog order. 
This avoids the circumstance where two operations against same `collection.id` are executed by different workers, out of order. 110 | 111 | ### Full Sync 112 | 113 | `./moresql -full-sync -config-file=moresql.json` 114 | 115 | Full sync is useful when first setting up a MoreSQL installation to port the existing Mongo data to Postgres. We recommend setting up a tailing instance first. Once that's running, do a full sync in different process. This should put the Mongo and Postgres into identical states. 116 | 117 | Given the nature of streaming replica data from Mongo -> Postgres, it's recommended to run full sync at intervals in order to offset losses that may have occured during network issues, system downtime, etc. 118 | 119 | ### Documentation 120 | 121 | https://zph.github.io/moresql/ 122 | 123 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 124 | 125 | ## Table Setup 126 | 127 | ```sql 128 | -- Execute the following SQL to setup table in Postgres. Replace $USERNAME with the moresql user. 129 | -- create the moresql_metadata table for checkpoint persistance 130 | CREATE TABLE public.moresql_metadata 131 | ( 132 | app_name TEXT NOT NULL, 133 | last_epoch INT NOT NULL, 134 | processed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL 135 | ); 136 | -- Setup mandatory unique index 137 | CREATE UNIQUE INDEX moresql_metadata_app_name_uindex ON public.moresql_metadata (app_name); 138 | 139 | -- Grant permissions to this user, replace $USERNAME with moresql's user 140 | GRANT SELECT, UPDATE, DELETE ON TABLE public.moresql_metadata TO $USERNAME; 141 | 142 | COMMENT ON COLUMN public.moresql_metadata.app_name IS 'Name of application. 
Used for circumstances where multiple apps stream to same PG instance.'; 143 | COMMENT ON COLUMN public.moresql_metadata.last_epoch IS 'Most recent epoch processed from Mongo'; 144 | COMMENT ON COLUMN public.moresql_metadata.processed_at IS 'Timestamp for when the last epoch was processed at'; 145 | COMMENT ON TABLE public.moresql_metadata IS 'Stores checkpoint data for MoreSQL (mongo->pg) streaming'; 146 | ``` 147 | 148 | ## Building Binary 149 | 150 | Compile binary using `make build` 151 | 152 | ## Commandline Arguments / Usage 153 | 154 | Execute `./moresql --help` 155 | 156 | ``` 157 | ./bin/moresql 158 | Repo https://github.com/zph/moresql 159 | Usage of ./bin/moresql: 160 | -allow-deletes 161 | Allow deletes to propagate from Mongo -> PG (default true) 162 | -app-name string 163 | AppName used in Checkpoint table (default "moresql") 164 | -checkpoint 165 | Store and restore from checkpoints in PG table: moresql_metadata 166 | -config-file string 167 | Configuration file to use (default "moresql.json") 168 | -create-table-sql 169 | Print out the necessary SQL for creating metadata table required for checkpointing 170 | -enable-monitor 171 | Run expvarmon endpoint 172 | -error-reporting string 173 | Error reporting tool to use (currently only supporting Rollbar) 174 | -full-sync 175 | Run full sync for each db.collection in config 176 | -memprofile string 177 | Profile memory usage. Supply filename for output of memory usage 178 | -mongo-url MONGO_URL 179 | MONGO_URL aka connection string 180 | -postgres-url POSTGRES_URL 181 | POSTGRES_URL aka connection string 182 | -replay-duration duration 183 | Last x to replay ie '1s', '5m', etc as parsed by Time.ParseDuration. Will be subtracted from time.Now() 184 | -replay-second int 185 | Replay a specific epoch second of the oplog and forward from there. 
186 | -ssl-cert string 187 | SSL PEM cert for Mongodb 188 | -ssl-insecure-skip-verify 189 | Skip verification of Mongo SSL certificate ala sslAllowInvalidCertificates 190 | -tail 191 | Tail mongodb for each db.collection in config 192 | -validate 193 | Validate the postgres table structures and exit 194 | ``` 195 | 196 | ### Validation of Configuration + Postgres Schema 197 | 198 | `./moresql -validate` 199 | 200 | This will report any issues related to the postgres schema being a mis-match for the fields and tables setup in configuration. 201 | 202 | # Requirements, Stability and Versioning 203 | 204 | MoreSQL is expected and built with Golang 1.6, 1.7 and master in mind. Broken tests on these versions indicates a bug. 205 | 206 | MoreSQL requires Postgres 9.5+ due to usage of UPSERTs. Using UPSERTs simplifies internal logic but also depends on UNIQUE indexes existing on each `_id` column in Postgres. See `moresql -validate` for advice. 207 | 208 | # Miscellanea 209 | 210 | ### Error Reporting 211 | 212 | Available through Rollbar. PRs welcome for other services. We currently use Rollus 213 | which reports errors synchronously. If this is a performance bottleneck please PR or issue. 214 | 215 | Enable this by two steps: 216 | 217 | ``` 218 | export ERROR_REPORTING_TOKEN=asdfasdfasdf 219 | export APP_ENV=[production, development, or staging] 220 | ``` 221 | 222 | And when running application use the following flag to enable reporting: 223 | 224 | `./moresql -tail -error-reporting "rollbar"` 225 | 226 | If these steps are not followed, errors will be reported out solely via logging. 227 | 228 | ### Environmental Variables used in Moresql 229 | 230 | ``` 231 | MONGO_URL 232 | POSTGRES_URL 233 | ERROR_REPORTING_TOKEN 234 | APP_ENV 235 | DYNO 236 | LOG_LEVEL 237 | ``` 238 | 239 | ### Mongo types 240 | 241 | We guard against a few of these for conversion into Postgres friendly types. 242 | 243 | Objects and Arrays do not behave properly when inserting into Postgres. 
These will be automatically converted into their JSON representation before inserting into Postgres. 244 | 245 | As of writing, any BsonID/ObjectId should be noted as `id` type in `Fields.Mongo.Type` to facilitate this. In the future we may assume that all fields ending in `_id` are Id based fields and require conversion. 246 | 247 | ## Converting from MoSQL 248 | 249 | Read through configuration structure and convert your collections.yml into the new json format. 250 | 251 | ## Unsupported Features 252 | 253 | These features are part of mosql but not implemented in MoreSQL. PRs welcome. 254 | 255 | * extra_props field for spare data 256 | * Automatic creation of tables/columns (we require explicit actions from users after providing guidance) 257 | 258 | ## Dot notation 259 | 260 | We support dot notation for extracting nested objects from Mongo. 261 | 262 | `user.address` will perform a `(get_in map [:user :address])` type nested fetch. 263 | 264 | See gjson project for full syntax details: https://github.com/tidwall/gjson#path-syntax 265 | ## Performance 266 | 267 | During benchmarking when moresql is asked to replay existing events from oplog we've seen the following performance with the following configurations: 268 | 269 | 5 workers per collection 270 | 500 generic workers 271 | On a Heroku 1X dyno 272 | 273 | ``` 274 | ~ $ ./moresql -tail -replay-duration "5000m" | grep "Rate of" 275 | {"level":"info","msg":"Rate of insert per min: 532","time":"2017-02-23T01:49:31Z"} 276 | {"level":"info","msg":"Rate of update per min: 44089","time":"2017-02-23T01:49:31Z"} 277 | {"level":"info","msg":"Rate of delete per min: 1","time":"2017-02-23T01:49:31Z"} 278 | {"level":"info","msg":"Rate of read per min: 91209","time":"2017-02-23T01:49:31Z"} 279 | {"level":"info","msg":"Rate of skipped per min: 46587","time":"2017-02-23T01:49:31Z"} 280 | ``` 281 | 282 | Approximately 700 updates/sec and 1500 reads/sec is our top observed throughput so far. 
Please submit PRs with further numbers using a similar command. 283 | 284 | We expect the following bottlenecks: connection count in Postgres, pg connection limitations in Moresql (for safety), network bandwidth, worker availability. 285 | 286 | At this level of throughput, Moresql uses ~90MB RAM. At low idle throughput of 10-20 req/sec it consumes ~30MB RAM. 287 | 288 | In another benchmark when updating 28k documents simultaneously, we observed mean lag of ~ 500ms and 95% of requests arrived in <= 1194ms between when the document was updated in Mongo and when it arrived in Postgres. 289 | 290 | See full [performance information](https://zph.github.io/moresql/performance/) 291 | 292 | For a general discussion of UPSERT performance in Postgres: https://mark.zealey.org/2016/01/08/how-we-tweaked-postgres-upsert-performance-to-be-2-3-faster-than-mongodb 293 | 294 | ## Binaries 295 | 296 | We release binaries for semvar tags on Github Releases page using `goreleaser` for the platforms listed in goreleaser.yml. 297 | 298 | # Credit and Prior Art 299 | 300 | * [MoSQL](https://github.com/stripe/mosql) - the project we used for 3 yrs at work and then retired with MoreSQL. Thanks Stripe! 301 | * [GTM](https://github.com/rwynn/gtm) - the go library that builds on mgo to wrap the tailing and oplog interface in a pleasant API. rwynn was a large help with improving GTM's performance with varying levels of consistency guarantees. 
302 | -------------------------------------------------------------------------------- /bin/update-readme: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Ensure current build 4 | make build 5 | 6 | # Requires envsubst, OSX instructions: 7 | # brew install gettext 8 | # brew link --force gettext 9 | export LOG_LEVEL=fatal 10 | MORESQL_USAGE="$(./bin/moresql 2>&1 | tr "\t" " " | sed '/^Version.*$/d')" 11 | SQL_OUTPUT="$(./bin/moresql -create-table-sql 2>&1 | tr "\t" " ")" 12 | GO_ENVS="$(grep os.Getenv *.go | grep -Eo '\(".*"\)' | tr -d '"()')" 13 | MORESQL_VERSION=$(git describe --abbrev=0 --tags --always) MORESQL_USAGE="$MORESQL_USAGE" SQL_OUTPUT="$SQL_OUTPUT" ENV_VARIABLES_FROM_GO="$GO_ENVS" envsubst < docs/README.template.md > README.md 14 | -------------------------------------------------------------------------------- /cmds/moresql/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | _ "expvar" 5 | "fmt" 6 | "os" 7 | 8 | "flag" 9 | 10 | _ "github.com/lib/pq" 11 | "github.com/zph/moresql" 12 | ) 13 | 14 | var GitRef, version, BuildDate, GitSHA string 15 | 16 | func usage() func() { 17 | return func() { 18 | fmt.Fprintf(os.Stderr, "%s\n", os.Args[0]) 19 | fmt.Fprintf(os.Stderr, "Version %s, Git %s, Git SHA %s, BuildDate %s\n", version, GitRef, GitSHA, BuildDate) 20 | fmt.Fprintln(os.Stderr, "Repo https://github.com/zph/moresql") 21 | fmt.Fprintf(os.Stderr, "Usage of %s:\n", os.Args[0]) 22 | flag.PrintDefaults() 23 | } 24 | } 25 | 26 | func main() { 27 | flag.Usage = usage() 28 | moresql.Run() 29 | } 30 | -------------------------------------------------------------------------------- /config.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "encoding/json" 5 | "errors" 6 | "fmt" 7 | "io/ioutil" 8 | "regexp" 9 | 10 | "strings" 11 | 12 | log 
"github.com/Sirupsen/logrus" 13 | ) 14 | 15 | func LoadConfigString(s string) (Config, error) { 16 | config := Config{} 17 | var configDelayed ConfigDelayed 18 | err := json.Unmarshal([]byte(s), &configDelayed) 19 | if err != nil { 20 | log.Fatalln(err) 21 | } 22 | for k, v := range configDelayed { 23 | db := DB{} 24 | collections := Collections{} 25 | db.Collections = collections 26 | for k, v := range v.Collections { 27 | coll := Collection{Name: v.Name, PgTable: v.PgTable} 28 | var fields Fields 29 | fields, err = JsonToFields(string(v.Fields)) 30 | if err != nil { 31 | log.Warnf("JSON Config decoding error: ", err) 32 | return nil, fmt.Errorf("Unable to decode %s", err) 33 | } 34 | coll.Fields = fields 35 | db.Collections[k] = coll 36 | } 37 | config[k] = db 38 | } 39 | return config, nil 40 | } 41 | 42 | func LoadConfig(path string) Config { 43 | b, err := ioutil.ReadFile(path) 44 | if err != nil { 45 | panic(err) 46 | } 47 | config, err := LoadConfigString(string(b)) 48 | if err != nil { 49 | panic(err) 50 | } 51 | return config 52 | } 53 | 54 | func mongoToPostgresTypeConversion(mongoType string) string { 55 | // Coerce "id" bsonId types into text since Postgres doesn't have type for BSONID 56 | switch strings.ToLower(mongoType) { 57 | case "id": 58 | return "text" 59 | } 60 | return mongoType 61 | } 62 | 63 | func normalizeDotNotationToPostgresNaming(key string) string { 64 | re := regexp.MustCompile("\\.") 65 | return re.ReplaceAllString(key, "_") 66 | } 67 | 68 | func JsonToFields(s string) (Fields, error) { 69 | var init FieldsWrapper 70 | var err error 71 | result := Fields{} 72 | err = json.Unmarshal([]byte(s), &init) 73 | for k, v := range init { 74 | field := Field{} 75 | str := "" 76 | if err := json.Unmarshal(v, &field); err == nil { 77 | result[k] = field 78 | } else if err := json.Unmarshal(v, &str); err == nil { 79 | // Convert shorthand to longhand Field 80 | f := Field{ 81 | Mongo{k, str}, 82 | Postgres{normalizeDotNotationToPostgresNaming(k), 
mongoToPostgresTypeConversion(str)}, 83 | } 84 | result[k] = f 85 | } else { 86 | errLong := json.Unmarshal(v, &field) 87 | errShort := json.Unmarshal(v, &str) 88 | err = errors.New(fmt.Sprintf("Could not decode Field. Long decoding %+v. Short decoding %+v", errLong, errShort)) 89 | return nil, err 90 | } 91 | 92 | } 93 | 94 | return result, err 95 | } 96 | -------------------------------------------------------------------------------- /config_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | m "github.com/zph/moresql" 5 | . "gopkg.in/check.v1" 6 | ) 7 | 8 | func BuildFieldFromId(str string) m.Field { 9 | field := m.Field{} 10 | field.Mongo.Name = str 11 | field.Mongo.Type = "id" 12 | field.Postgres.Name = str 13 | field.Postgres.Type = "text" 14 | return field 15 | } 16 | 17 | func BuildTextField(str string) m.Field { 18 | field := m.Field{} 19 | field.Mongo.Name = str 20 | field.Mongo.Type = "text" 21 | field.Postgres.Name = str 22 | field.Postgres.Type = "text" 23 | return field 24 | } 25 | 26 | func (s *MySuite) TestConfigParsingForFields(c *C) { 27 | // Fields struct 28 | ex1 := ` 29 | {"_id": { 30 | "mongo": { 31 | "name": "_id", 32 | "type": "id" 33 | }, 34 | "postgres": { 35 | "name": "_id", 36 | "type": "text" 37 | } 38 | }} 39 | ` 40 | field := BuildFieldFromId("_id") 41 | expected1 := m.Fields{"_id": field} 42 | 43 | ex2 := ` 44 | {"name": { 45 | "mongo": { 46 | "name": "name", 47 | "type": "text" 48 | }, 49 | "postgres": { 50 | "name": "name", 51 | "type": "text" 52 | } 53 | }} 54 | ` 55 | expected2 := m.Fields{"name": BuildTextField("name")} 56 | 57 | ex3 := `{"name": "text"}` 58 | expected3 := m.Fields{"name": BuildTextField("name")} 59 | 60 | ex4 := `{"age": "integer", "name": "text"}` 61 | exField4 := BuildTextField("age") 62 | exField4.Mongo.Type = "integer" 63 | exField4.Postgres.Type = "integer" 64 | expected4 := m.Fields{"name": BuildTextField("name"), "age": 
exField4} 65 | 66 | var table = []struct { 67 | js string 68 | expected m.Fields 69 | }{ 70 | {ex1, expected1}, 71 | {ex2, expected2}, 72 | {ex3, expected3}, 73 | {ex4, expected4}, 74 | } 75 | for _, t := range table { 76 | f, err := m.JsonToFields(t.js) 77 | c.Check(err, Equals, nil) 78 | c.Check(f, DeepEquals, t.expected) 79 | } 80 | } 81 | 82 | func (s *MySuite) TestConfigParsingFull(c *C) { 83 | // Fields struct 84 | ex1 := ` 85 | { 86 | "company-production": { 87 | "collections": { 88 | "accounts": { 89 | "name": "users", 90 | "pg_table": "users", 91 | "fields": { 92 | "_id": { 93 | "mongo": { 94 | "name": "_id", 95 | "type": "id" 96 | }, 97 | "postgres": { 98 | "name": "_id", 99 | "type": "text" 100 | } 101 | }, 102 | "bio": { 103 | "mongo": { 104 | "name": "bio", 105 | "type": "text" 106 | }, 107 | "postgres": { 108 | "name": "bio", 109 | "type": "text" 110 | } 111 | } 112 | } 113 | }, 114 | "campaigns": { 115 | "name": "campaigns", 116 | "pg_table": "campaigns", 117 | "fields": { 118 | "_id": "id", 119 | "created_at": "text" 120 | } 121 | } 122 | } 123 | } 124 | } 125 | ` 126 | 127 | expected1 := m.Config{"company-production": m.DB{Collections: m.Collections{"accounts": m.Collection{Name: "users", PgTable: "users", Fields: m.Fields{"_id": m.Field{Mongo: m.Mongo{Name: "_id", Type: "id"}, Postgres: m.Postgres{Name: "_id", Type: "text"}}, "bio": m.Field{Mongo: m.Mongo{Name: "bio", Type: "text"}, Postgres: m.Postgres{Name: "bio", Type: "text"}}}}, "campaigns": m.Collection{Name: "campaigns", PgTable: "campaigns", Fields: m.Fields{"_id": m.Field{Mongo: m.Mongo{Name: "_id", Type: "id"}, Postgres: m.Postgres{Name: "_id", Type: "text"}}, "created_at": m.Field{Mongo: m.Mongo{Name: "created_at", Type: "text"}, Postgres: m.Postgres{Name: "created_at", Type: "text"}}}}}}} 128 | 129 | shorthand := ` 130 | { 131 | "company-production": { 132 | "collections": { 133 | "accounts": { 134 | "name": "users", 135 | "pg_table": "users", 136 | "fields": { 137 | "_id": "id", 138 | 
"bio": "text" 139 | } 140 | }, 141 | "campaigns": { 142 | "name": "campaigns", 143 | "pg_table": "campaigns", 144 | "fields": { 145 | "_id": "id", 146 | "created_at": "text" 147 | } 148 | } 149 | } 150 | } 151 | } 152 | ` 153 | var table = []struct { 154 | js string 155 | expected m.Config 156 | }{ 157 | {ex1, expected1}, 158 | {shorthand, expected1}, 159 | } 160 | for _, t := range table { 161 | f, err := m.LoadConfigString(t.js) 162 | c.Check(f, DeepEquals, t.expected) 163 | c.Check(err, Equals, nil) 164 | } 165 | } 166 | -------------------------------------------------------------------------------- /db.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "bytes" 5 | "crypto/tls" 6 | "crypto/x509" 7 | "encoding/binary" 8 | "errors" 9 | "io/ioutil" 10 | "math" 11 | "net" 12 | "time" 13 | 14 | log "github.com/Sirupsen/logrus" 15 | "github.com/jmoiron/sqlx" 16 | _ "github.com/lib/pq" 17 | mgo "gopkg.in/mgo.v2" 18 | "gopkg.in/mgo.v2/bson" 19 | ) 20 | 21 | func GetMongoConnection(env Env) (session *mgo.Session) { 22 | if env.UseSSL() { 23 | var tlsConfig *tls.Config 24 | if env.SSLCert != "" { 25 | // Certificate Provided 26 | clientCert, err := ioutil.ReadFile(env.SSLCert) 27 | if err != nil { 28 | log.Fatalln("Unable to read ssl certificate") 29 | } 30 | roots := x509.NewCertPool() 31 | ok := roots.AppendCertsFromPEM([]byte(clientCert)) 32 | if !ok { 33 | log.Fatalln("failed to parse root certificate") 34 | } 35 | 36 | tlsConfig = &tls.Config{RootCAs: roots} 37 | } else if env.SSLInsecureSkipVerify { 38 | tlsConfig = &tls.Config{InsecureSkipVerify: true} 39 | } 40 | 41 | c, err := mgo.ParseURL(env.urls.mongo) 42 | if err != nil { 43 | log.Fatalf("Unable to parse mongo url") 44 | } 45 | dialInfo := &mgo.DialInfo{ 46 | Addrs: c.Addrs, 47 | Database: c.Database, 48 | Source: c.Source, 49 | Username: c.Username, 50 | Password: c.Password, 51 | DialServer: func(addr *mgo.ServerAddr) (net.Conn, 
error) { 52 | return tls.Dial("tcp", addr.String(), tlsConfig) 53 | }, 54 | Timeout: time.Second * 10, 55 | } 56 | session, err = mgo.DialWithInfo(dialInfo) 57 | if err != nil { 58 | log.Fatal(err) 59 | } 60 | session.SetMode(mgo.Primary, true) 61 | } else { 62 | var err error 63 | session, err = mgo.Dial(env.urls.mongo) 64 | if err != nil { 65 | log.Fatal(err) 66 | } 67 | session.SetMode(mgo.Primary, true) 68 | } 69 | return 70 | } 71 | 72 | func GetPostgresConnection(env Env) (pg *sqlx.DB) { 73 | var err error 74 | pg, err = sqlx.Connect("postgres", env.urls.postgres) 75 | if err != nil { 76 | log.Fatal(err) 77 | } 78 | setupPgDefaults(pg) 79 | return 80 | } 81 | 82 | // setupPgDefaults: Set safe cap so workers do not overwhelm server 83 | func setupPgDefaults(pg *sqlx.DB) { 84 | pg.SetMaxIdleConns(50) 85 | pg.SetMaxOpenConns(50) 86 | } 87 | 88 | // Credit: https://github.com/go-mgo/mgo/pull/202/files#diff-b47d6566744e81abad9312022bdc8896R374 89 | // From @mwmahlberg 90 | func NewMongoTimestamp(t time.Time, c uint32) (bson.MongoTimestamp, error) { 91 | var tv uint32 92 | u := t.Unix() 93 | if u < 0 || u > math.MaxUint32 { 94 | return -1, errors.New("invalid value for time") 95 | } 96 | tv = uint32(u) 97 | buf := bytes.Buffer{} 98 | binary.Write(&buf, binary.BigEndian, tv) 99 | binary.Write(&buf, binary.BigEndian, c) 100 | i := int64(binary.BigEndian.Uint64(buf.Bytes())) 101 | return bson.MongoTimestamp(i), nil 102 | } 103 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # MoreSQL 2 | 3 | [![Build Status](https://travis-ci.org/zph/moresql.svg?branch=master)](https://travis-ci.org/zph/moresql) 4 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 5 | 6 | ## Introduction 7 | 8 | MoreSQL streams changes occuring in Mongo database into a Postgres db. 
MoreSQL tails the oplog and generates appropriate actions against Postgres. MoreSQL has the ability to do full synchronizations using `UPSERTS`, with the benefit over `INSERTS` that this can be executed against tables with existing data. 9 | 10 | MoreSQL gives you a chance to use more sql and less mongo query language. 11 | 12 | # Usage 13 | 14 | ## Basic Use 15 | 16 | ### Tail 17 | 18 | `./moresql -tail -config-file=moresql.json` 19 | 20 | Tail is the primary run mode for MoreSQL. When tailing, the oplog is observed for novelty and each INSERT/UPDATE/DELETE is translated to its SQL equivalent, then executed against Postgres. 21 | 22 | Tail makes a best-faith effort to do this and does not use checkpoint markers to track position in the oplog. It may be introduced in later releases. Or we could introduce a way to split MoreSQL into a producer (oplog tail) that puts records onto a stream (Kinesis/Kafka/etc) and a consumer that reads from the stream. By doing so, we'd avoid re-implementing checkpoints in MoreSQL. 23 | 24 | Given that `tail` mode executes `UPSERTS` instead of `INSERT || UPDATE`, we expect MoreSQL to be roughly eventually consistent. We're choosing to prioritize speed of execution (multiple workers) in lieu of some consistency. This helps to keep low latency with larger workloads. 25 | 26 | ### Full Sync 27 | 28 | `./moresql -full-sync -config-file=moresql.json` 29 | 30 | Full sync is useful when first setting up a MoreSQL installation to port the existing Mongo data to Postgres. We recommend setting up a tailing instance first. Once that's running, do a full sync in a different process. This should put the Mongo and Postgres into identical states. 31 | 32 | Given the nature of streaming replica data from Mongo -> Postgres, it's recommended to run full sync at intervals in order to offset losses that may have occurred during network issues, system downtime, etc.
33 | 34 | ### Documentation 35 | 36 | https://zph.github.io/moresql/ 37 | 38 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 39 | 40 | ## QuickStart 41 | 42 | ### Introduction 43 | 44 | * Create metadata table 45 | * Setup moresql.json 46 | * Setup any recipient tables in postgres 47 | * Validate with `./moresql -validate` 48 | * Deploy binary to server 49 | * Configure Environmental variables 50 | * Run `./moresql -tail` to start transmitting novelty 51 | * Run `./moresql -full-sync` to populate the database 52 | 53 | ### Table Setup 54 | 55 | ```sql 56 | -- Execute the following SQL to setup table in Postgres. Replace $USERNAME with the moresql user. 57 | -- create the moresql_metadata table for checkpoint persistance 58 | CREATE TABLE public.moresql_metadata 59 | ( 60 | app_name TEXT NOT NULL, 61 | last_epoch INT NOT NULL, 62 | processed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL 63 | ); 64 | -- Setup mandatory unique index 65 | CREATE UNIQUE INDEX moresql_metadata_app_name_uindex ON public.moresql_metadata (app_name); 66 | 67 | -- Grant permissions to this user, replace $USERNAME with moresql's user 68 | GRANT SELECT, UPDATE, DELETE ON TABLE public.moresql_metadata TO $USERNAME; 69 | 70 | COMMENT ON COLUMN public.moresql_metadata.app_name IS 'Name of application. 
Used for circumstances where multiple apps stream to same PG instance.'; 71 | COMMENT ON COLUMN public.moresql_metadata.last_epoch IS 'Most recent epoch processed from Mongo'; 72 | COMMENT ON COLUMN public.moresql_metadata.processed_at IS 'Timestamp for when the last epoch was processed at'; 73 | COMMENT ON TABLE public.moresql_metadata IS 'Stores checkpoint data for MoreSQL (mongo->pg) streaming'; 74 | ``` 75 | 76 | ### Building Binary 77 | 78 | Compile binary using `make build` 79 | 80 | ### Commandline Arguments / Usage 81 | 82 | Execute `./moresql --help` 83 | 84 | ``` 85 | ./bin/moresql 86 | Repo https://github.com/zph/moresql 87 | Usage of ./bin/moresql: 88 | -allow-deletes 89 | Allow deletes to propagate from Mongo -> PG (default true) 90 | -app-name string 91 | AppName used in Checkpoint table (default "moresql") 92 | -checkpoint 93 | Store and restore from checkpoints in PG table: moresql_metadata 94 | -config-file string 95 | Configuration file to use (default "moresql.json") 96 | -create-table-sql 97 | Print out the necessary SQL for creating metadata table required for checkpointing 98 | -enable-monitor 99 | Run expvarmon endpoint 100 | -error-reporting string 101 | Error reporting tool to use (currently only supporting Rollbar) 102 | -full-sync 103 | Run full sync for each db.collection in config 104 | -memprofile string 105 | Profile memory usage. Supply filename for output of memory usage 106 | -mongo-url MONGO_URL 107 | MONGO_URL aka connection string 108 | -postgres-url POSTGRES_URL 109 | POSTGRES_URL aka connection string 110 | -replay-duration duration 111 | Last x to replay ie '1s', '5m', etc as parsed by Time.ParseDuration. Will be subtracted from time.Now() 112 | -replay-second int 113 | Replay a specific epoch second of the oplog and forward from there. 
114 | -ssl-cert string 115 | SSL PEM cert for Mongodb 116 | -tail 117 | Tail mongodb for each db.collection in config 118 | -validate 119 | Validate the postgres table structures and exit 120 | ``` 121 | 122 | ### Validation of Configuration + Postgres Schema 123 | 124 | `./moresql -validate` 125 | 126 | This will report any issues related to the postgres schema being a mis-match for the fields and tables setup in configuration. 127 | 128 | # Requirements, Stability and Versioning 129 | 130 | MoreSQL is expected and built with Golang 1.6, 1.7 and master in mind. Broken tests on these versions indicates a bug. 131 | 132 | MoreSQL requires Postgres 9.5+ due to usage of UPSERTs. Using UPSERTs simplifies internal logic but also depends on UNIQUE indexes existing on each `_id` column in Postgres. See `moresql -validate` for advice. 133 | 134 | # Miscellanea 135 | 136 | ### Error Reporting 137 | 138 | Available through Rollbar. PRs welcome for other services. We currently use Rollus 139 | which reports errors synchronously. If this is a performance bottleneck please PR or issue. 140 | 141 | Enable this by two steps: 142 | 143 | ``` 144 | export ERROR_REPORTING_TOKEN=asdfasdfasdf 145 | export APP_ENV=[production, development, or staging] 146 | ``` 147 | 148 | And when running application use the following flag to enable reporting: 149 | 150 | `./moresql -tail -error-reporting "rollbar"` 151 | 152 | If these steps are not followed, errors will be reported out solely via logging. 153 | 154 | ### Environmental Variables used in Moresql 155 | 156 | ``` 157 | MONGO_URL 158 | POSTGRES_URL 159 | ERROR_REPORTING_TOKEN 160 | APP_ENV 161 | DYNO 162 | LOG_LEVEL 163 | ``` 164 | 165 | ### Mongo types 166 | 167 | We guard against a few of these for conversion into Postgres friendly types. 168 | 169 | Objects and Arrays do not behave properly when inserting into Postgres. These will be automatically converted into their JSON representation before inserting into Postgres. 
170 | 171 | As of writing, any BsonID/ObjectId should be noted as `id` type in `Fields.Mongo.Type` to facilitate this. In the future we may assume that all fields ending in `_id` are Id based fields and require conversion. 172 | 173 | ## Converting from MoSQL 174 | 175 | Run the ./bin/convert_config_from_mosql_to_moresql script in a folder with `collections.yml` 176 | 177 | ``` 178 | ruby ./bin/convert_config_from_mosql_to_moresql collection.yml 179 | ``` 180 | 181 | The generated file `moresql.json` will be in place ready for use. 182 | 183 | ## Unsupported Features 184 | 185 | These features are part of mosql but not implemented in MoreSQL. PRs welcome. 186 | 187 | * Dot Notation for nested structures 188 | * extra_props field for spare data 189 | * Automatic creation of tables/columns 190 | 191 | ## Performance 192 | 193 | During benchmarking when moresql is asked to replay existing events from oplog we've seen the following performance with the following configurations: 194 | 195 | 5 workers per collection 196 | 500 generic workers 197 | On a Heroku 1X dyno 198 | 199 | ``` 200 | ~ $ ./moresql -tail -replay-duration "5000m" | grep "Rate of" 201 | {"level":"info","msg":"Rate of insert per min: 532","time":"2017-02-23T01:49:31Z"} 202 | {"level":"info","msg":"Rate of update per min: 44089","time":"2017-02-23T01:49:31Z"} 203 | {"level":"info","msg":"Rate of delete per min: 1","time":"2017-02-23T01:49:31Z"} 204 | {"level":"info","msg":"Rate of read per min: 91209","time":"2017-02-23T01:49:31Z"} 205 | {"level":"info","msg":"Rate of skipped per min: 46587","time":"2017-02-23T01:49:31Z"} 206 | ``` 207 | 208 | Approximately 700 updates/sec and 1500 reads/sec is our top observed throughput so far. Please submit PRs with further numbers using a similar command. 209 | 210 | We expect the following bottlenecks: connection count in Postgres, pg connection limitations in Moresql (for safety), network bandwidth, worker availability. 
211 | 212 | At this level of throughput, Moresql uses ~90MB RAM. At low idle throughput of 10-20 req/sec it consumes ~30MB RAM. 213 | 214 | In another benchmark when updating 28k documents simultaneously, we observed mean lag of ~ 500ms and 95% of requests arrived in <= 1194ms between when the document was updated in Mongo and when it arrived in Postgres. 215 | 216 | See full [performance information](https://zph.github.io/moresql/performance/) 217 | 218 | For a general discussion of UPSERT performance in Postgres: https://mark.zealey.org/2016/01/08/how-we-tweaked-postgres-upsert-performance-to-be-2-3-faster-than-mongodb 219 | 220 | ## Binaries 221 | 222 | We release binaries for semvar tags on Github Releases page using `goreleaser` for the platforms listed in goreleaser.yml. 223 | 224 | # Credit and Prior Art 225 | 226 | * [MoSQL](https://github.com/stripe/mosql) - the project we used for 3 yrs at work and then retired with MoreSQL. Thanks Stripe! 227 | * [GTM](https://github.com/rwynn/gtm) - the go library that builds on mgo to wrap the tailing and oplog interface in a pleasant API. rwynn was a large help with improving GTM's performance with varying levels of consistency guarantees. 228 | -------------------------------------------------------------------------------- /docs/README.template.md: -------------------------------------------------------------------------------- 1 | # MoreSQL 2 | 3 | [](NOTE: README.md is a generated FILE changes belong in docs/README.template.md. Update with make docs) 4 | [![Build Status](https://travis-ci.org/zph/moresql.svg?branch=master)](https://travis-ci.org/zph/moresql) 5 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 6 | 7 | ## Introduction 8 | 9 | MoreSQL streams changes occuring in Mongo database into a Postgres db. MoreSQL tails the oplog and generates appropriate actions against Postgres. 
MoreSQL has the ability to do full synchronizations using `UPSERTS`, with the benefit over `INSERTS` that this can be executed against tables with existing data. 10 | 11 | MoreSQL gives you a chance to use more sql and less mongo query language. 12 | 13 | Project maturity: Moresql has 3 years of running in production for 2+ companies 14 | on their business critical workloads for core systems. It is a stable project 15 | with low maintenance and high performance. 16 | 17 | ## QuickStart 18 | ### Introduction 19 | 20 | * Create metadata table 21 | * Setup moresql.json (see Configuration) 22 | * Setup any recipient tables in postgres 23 | * Validate with `./moresql -validate` 24 | * Deploy moresql binary to server from Github Releases 25 | * Configure Environmental variables 26 | * Run `./moresql -tail` to start transmitting novelty 27 | * Run `./moresql -full-sync` to populate the database 28 | * Write more sql ;D 29 | 30 | # Usage 31 | 32 | ## Basic Use 33 | ### Configuration 34 | 35 | moresql.json configuration structure 36 | 37 | ``` 38 | { 39 | "DB_NAME": { 40 | "collections": { 41 | "COLLECTION_NAME": { 42 | "name": "COLLECTION_NAME", 43 | "pg_table": "PG_TABLE_NAME", 44 | "fields": { 45 | ... 46 | } 47 | }, 48 | "COLLECTION_NAME2": { 49 | "name": "COLLECTION_NAME2", 50 | "pg_table": "PG_TABLE_NAME2", 51 | "fields": { 52 | ... 53 | } 54 | } 55 | } 56 | } 57 | } 58 | ``` 59 | 60 | Field attributes have a simple and complex format. 61 | 62 | The simple format is where you want to use the mongo field name as the postgres column name. 63 | In this format fields take the mongo field name (as key) and its postgres type (as value). 64 | The value can be any valid postgres type (https://www.postgresql.org/docs/current/datatype.html) 65 | ``` 66 | "fields": { 67 | "_id": "TEXT", 68 | "addresses": "JSONB", 69 | } 70 | ``` 71 | 72 | The complex format allows for renaming fields, extracting nested keys and some advanced operations. 
73 | See gjson project for full syntax details of dot notation: https://github.com/tidwall/gjson#path-syntax 74 | ``` 75 | "fields": { 76 | "_id": "TEXT", 77 | // preferences is an field name that has an object in it. The object has a key of unsubscribe. 78 | // The following will promote this nested unsubscribe to a top level value as a postgres column 79 | "preferences.unsubscribe": { 80 | "Postgres": {"Name": "is_unsubscribed", "Type": "BOOLEAN"}, 81 | "Mongo": {"Name": "preferences.unsubscribe", "Type": "object"} 82 | }, 83 | // This extraction will look in books field with [{product_id: 1}, {product_id: 2}] 84 | // And for each object in there, fetch the product_id and combine it into an array of product ids [1, 2] 85 | // That value will be inserted into a postgres column called `book_ids`. 86 | "books.#.product_id": { 87 | "Postgres": { 88 | "Name": "book_ids", 89 | "Type": "JSONB" 90 | }, 91 | "Mongo": { 92 | "Name": "books.#.product_id", 93 | "Type": "object" 94 | } 95 | }, 96 | } 97 | ``` 98 | 99 | See `examples/moresql.json` for a full configuration 100 | 101 | ### Tail 102 | 103 | `./moresql -tail -config-file=moresql.json` 104 | 105 | Tail is the primary run mode for MoreSQL. When tailing, the oplog is observed for novely and each INSERT/UPDATE/DELETE is translated to its SQL equivalent, then executed against Postgres. 106 | 107 | Tail makes a best faith effort to do this and uses checkpoint markers to track last successfully applied Mongo Oplog event. 108 | 109 | Given that `tail` mode executes `UPSERTS` instead of `INSERT || UPDATE`, we expect MoreSQL to be roughly eventually consistent. We're chosing to prioritize speed of execution (multiple workers) in lieu of some consistency. This helps to keep low latency with larger workloads. We currently partition workload among multiple workers but ensure that each `collection.id` combination will be routed to same worker in correct oplog order. 
This avoids the circumstance where two operations against same `collection.id` are executed by different workers, out of order. 110 | 111 | ### Full Sync 112 | 113 | `./moresql -full-sync -config-file=moresql.json` 114 | 115 | Full sync is useful when first setting up a MoreSQL installation to port the existing Mongo data to Postgres. We recommend setting up a tailing instance first. Once that's running, do a full sync in different process. This should put the Mongo and Postgres into identical states. 116 | 117 | Given the nature of streaming replica data from Mongo -> Postgres, it's recommended to run full sync at intervals in order to offset losses that may have occured during network issues, system downtime, etc. 118 | 119 | ### Documentation 120 | 121 | https://zph.github.io/moresql/ 122 | 123 | [![GoDoc](https://godoc.org/github.com/zph/moresql?status.svg)](https://godoc.org/github.com/zph/moresql) 124 | 125 | ## Table Setup 126 | 127 | ```sql 128 | $SQL_OUTPUT 129 | ``` 130 | 131 | ## Building Binary 132 | 133 | Compile binary using `make build` 134 | 135 | ## Commandline Arguments / Usage 136 | 137 | Execute `./moresql --help` 138 | 139 | ``` 140 | $MORESQL_USAGE 141 | ``` 142 | 143 | ### Validation of Configuration + Postgres Schema 144 | 145 | `./moresql -validate` 146 | 147 | This will report any issues related to the postgres schema being a mis-match for the fields and tables setup in configuration. 148 | 149 | # Requirements, Stability and Versioning 150 | 151 | MoreSQL is expected and built with Golang 1.6, 1.7 and master in mind. Broken tests on these versions indicates a bug. 152 | 153 | MoreSQL requires Postgres 9.5+ due to usage of UPSERTs. Using UPSERTs simplifies internal logic but also depends on UNIQUE indexes existing on each `_id` column in Postgres. See `moresql -validate` for advice. 154 | 155 | # Miscellanea 156 | 157 | ### Error Reporting 158 | 159 | Available through Rollbar. PRs welcome for other services. 
We currently use Rollus 160 | which reports errors synchronously. If this is a performance bottleneck please PR or issue. 161 | 162 | Enable this by two steps: 163 | 164 | ``` 165 | export ERROR_REPORTING_TOKEN=asdfasdfasdf 166 | export APP_ENV=[production, development, or staging] 167 | ``` 168 | 169 | And when running application use the following flag to enable reporting: 170 | 171 | `./moresql -tail -error-reporting "rollbar"` 172 | 173 | If these steps are not followed, errors will be reported out solely via logging. 174 | 175 | ### Environmental Variables used in Moresql 176 | 177 | ``` 178 | $ENV_VARIABLES_FROM_GO 179 | ``` 180 | 181 | ### Mongo types 182 | 183 | We guard against a few of these for conversion into Postgres friendly types. 184 | 185 | Objects and Arrays do not behave properly when inserting into Postgres. These will be automatically converted into their JSON representation before inserting into Postgres. 186 | 187 | As of writing, any BsonID/ObjectId should be noted as `id` type in `Fields.Mongo.Type` to facilitate this. In the future we may assume that all fields ending in `_id` are Id based fields and require conversion. 188 | 189 | ## Converting from MoSQL 190 | 191 | Read through configuration structure and convert your collections.yml into the new json format. 192 | 193 | ## Unsupported Features 194 | 195 | These features are part of mosql but not implemented in MoreSQL. PRs welcome. 196 | 197 | * extra_props field for spare data 198 | * Automatic creation of tables/columns (we require explicit actions from users after providing guidance) 199 | 200 | ## Dot notation 201 | 202 | We support dot notation for extracting nested objects from Mongo. 203 | 204 | `user.address` will perform a `(get_in map [:user :address])` type nested fetch. 
205 | 206 | See gjson project for full syntax details: https://github.com/tidwall/gjson#path-syntax 207 | ## Performance 208 | 209 | During benchmarking when moresql is asked to replay existing events from oplog we've seen the following performance with the following configurations: 210 | 211 | 5 workers per collection 212 | 500 generic workers 213 | On a Heroku 1X dyno 214 | 215 | ``` 216 | ~ $ ./moresql -tail -replay-duration "5000m" | grep "Rate of" 217 | {"level":"info","msg":"Rate of insert per min: 532","time":"2017-02-23T01:49:31Z"} 218 | {"level":"info","msg":"Rate of update per min: 44089","time":"2017-02-23T01:49:31Z"} 219 | {"level":"info","msg":"Rate of delete per min: 1","time":"2017-02-23T01:49:31Z"} 220 | {"level":"info","msg":"Rate of read per min: 91209","time":"2017-02-23T01:49:31Z"} 221 | {"level":"info","msg":"Rate of skipped per min: 46587","time":"2017-02-23T01:49:31Z"} 222 | ``` 223 | 224 | Approximately 700 updates/sec and 1500 reads/sec is our top observed throughput so far. Please submit PRs with further numbers using a similar command. 225 | 226 | We expect the following bottlenecks: connection count in Postgres, pg connection limitations in Moresql (for safety), network bandwidth, worker availability. 227 | 228 | At this level of throughput, Moresql uses ~90MB RAM. At low idle throughput of 10-20 req/sec it consumes ~30MB RAM. 229 | 230 | In another benchmark when updating 28k documents simultaneously, we observed mean lag of ~ 500ms and 95% of requests arrived in <= 1194ms between when the document was updated in Mongo and when it arrived in Postgres. 
231 | 232 | See full [performance information](https://zph.github.io/moresql/performance/) 233 | 234 | For a general discussion of UPSERT performance in Postgres: https://mark.zealey.org/2016/01/08/how-we-tweaked-postgres-upsert-performance-to-be-2-3-faster-than-mongodb 235 | 236 | ## Binaries 237 | 238 | We release binaries for semvar tags on Github Releases page using `goreleaser` for the platforms listed in goreleaser.yml. 239 | 240 | # Credit and Prior Art 241 | 242 | * [MoSQL](https://github.com/stripe/mosql) - the project we used for 3 yrs at work and then retired with MoreSQL. Thanks Stripe! 243 | * [GTM](https://github.com/rwynn/gtm) - the go library that builds on mgo to wrap the tailing and oplog interface in a pleasant API. rwynn was a large help with improving GTM's performance with varying levels of consistency guarantees. 244 | -------------------------------------------------------------------------------- /docs/TODO.md: -------------------------------------------------------------------------------- 1 | TODOs 2 | == 3 | 4 | TO DEPLOY and RELEASE 5 | === 6 | * [x] Add checkpointing in case of downtime 7 | * [x] Make it set on timer, ie every minute or configurable duration 8 | * [x] determine if we want to play catch up in oplog or not (get oldest db.oplog.rs.find().sort({ts:-1}).limit(1)) 9 | * [x] Make sure that multiple moresqls can insert their marker on same db meta table 10 | * [x] Add conversion from MoSQL to MoreSQL 11 | * [x] Add quickstart guide and gh pages or link to godoc 12 | * [x] Add starter SQL script for checkpointing script 13 | * [x] Add validation cmdline flag to compare configuration with PG tables/columns 14 | * [x] Add documentation for deploying on Heroku via null buildpack 15 | * [x] Verify that log statements are set at appropriate levels, ie warn/error/etc 16 | * [x] Test in staging, then production 17 | * [x] Bake binary in production 18 | * [ ] Release and announce project to the world 19 | 20 | 21 | DESIRED 22 | == 23 | 
* [x] Use dot notation for config description of nested maps, ie allow for `get_in("outerkey", "innerkey")` as `outerkey.innerkey` by using https://github.com/tidwall/gjson 24 | * [x] Refactor tail to have a producer/consumer as Read/Write 25 | * [x] Setup https://github.com/thejerf/suture wrappers on components 26 | * [x] Add testing and refactor to make each bit fairly decoupled 27 | * [x] Include docs and scripts to transition from mosql to moresql 28 | * [x] Setup formal worker pool along with overflow pool of workers 29 | * [x] add tracking mechanism for missing/broken tables beyond "log it into abyss". 30 | * [x] add error handling with rollbar/bugsnag/etc 31 | * [ ] Improve library testing (unit and integration/system). Potentially using docker for full trip integration tests. 32 | * [ ] Add validation for the moresql_metadata table 33 | * [ ] Add configuration option to use configurable schema for metadata table and I/U/D 34 | * [ ] Add `full-sync` option to only re-sync specific table 35 | * [ ] Fix logging to include TIMESTAMP when deployed outside Heroku 36 | 37 | 38 | SOMEDAYs 39 | == 40 | * [ ] Setup system tests (https://www.elastic.co/blog/code-coverage-for-your-golang-system-tests) 41 | * [ ] Add basic auth and SSL for endpoint of expvarmon 42 | * [ ] add signal handling for SIGTERM to flush existing content in buffers then exit 43 | * [ ] Add way to reload configuration without dropping events? 44 | * [ ] add expvar.Publish for backlog of all events waiting to process in `fan` 45 | * [ ] time operates on int64, suggest that gtm.ParseTimestamp do likewise for interop 46 | * [ ] Make library generic with regard to event destination. 
Could be expanded out as a bridge Mongo->{Kinesis,Kafka,Postgres,MySQL} 47 | * [ ] https://github.com/zph/moresql/blob/master/full_sync.go#L135 48 | * [ ] Make the writer function configurable with postgres as the default 49 | * [ ] Writers should fit the interface of accepting a pointer to tables struct and the channel of incoming operations 50 | * [ ] All of https://github.com/zph/moresql/blob/master/full_sync.go#L129-L136 should be inside the writer function as it will differ by output sink. 51 | * [ ] Add persistance for oplog if desired by user via commandline flag 52 | -------------------------------------------------------------------------------- /docs/deploying.md: -------------------------------------------------------------------------------- 1 | Deploying 2 | = 3 | 4 | On Server 5 | == 6 | 7 | * Download release or compile binary for platform 8 | * Follow [setup guide for MoreSQL](/README/#quickstart) 9 | * Set environmental variables 10 | * Run moresql under process manager 11 | 12 | 13 | On Heroku 14 | == 15 | 16 | * Follow [setup guide for MoreSQL](/README/#quickstart) 17 | 18 | * Create repository for deployment 19 | 20 | * Create Procfile 21 | 22 | Sample: 23 | ``` 24 | worker: ./moresql -tail -checkpoint -error-reporting "rollbar" 25 | ``` 26 | 27 | * Set the ENV variables according to README [section](/README/#environmental-variables-used-in-moresql) 28 | 29 | * Download latest stable release of moresql or build yourself for the linux amd64 platform using cross compilation 30 | 31 | * Commit that binary to deploy project 32 | 33 | * Add null buildpack for using a binary on heroku 34 | 35 | ``` 36 | heroku buildpacks:set -r REMOTE_NAME https://github.com/ryandotsmith/null-buildpack.git#72915d8b59f0f089931b4ed3b9c9b6f1750c331a 37 | ``` 38 | 39 | Note: we pin to specific version of buildpack so future upgrades aren't automatically applied. 
40 | 41 | * Deploy to heroku with a git push 42 | 43 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to Moresql Documentation 2 | 3 | For basic introduction: [README](/README/) 4 | 5 | [Github Repository](https://github.com/zph/moresql) 6 | 7 | 8 | MoreSQL streams changes occuring in Mongo database into a Postgres db. MoreSQL tails the oplog and generates appropriate actions against Postgres. MoreSQL has the ability to do full synchronizations using UPSERTS, with the benefit over INSERTS that this can be executed against tables with existing data. 9 | 10 | MoreSQL gives you a chance to use more sql and less mongo query language. 11 | 12 | ## Commands 13 | 14 | * `moresql -tail` - Start tailing the oplog from mongo and persist to Postgres. 15 | * `moresql -full-sync` - Conduct a full sync based on configuration file from mongo->pg. 16 | * `moresql -help` - Usage Instructions. 
17 | -------------------------------------------------------------------------------- /examples/moresql.json: -------------------------------------------------------------------------------- 1 | { 2 | "acme_project": { 3 | "collections": { 4 | "Authors": { 5 | "name": "Authors", 6 | "pg_table": "authors", 7 | "fields": { 8 | "_id": "TEXT", 9 | "addresses": "JSONB", 10 | "books": "JSONB", 11 | "books.#.product_id": { 12 | "Postgres": { 13 | "Name": "book_ids", 14 | "Type": "JSONB" 15 | }, 16 | "Mongo": { 17 | "Name": "books.#.product_id", 18 | "Type": "object" 19 | } 20 | } 21 | } 22 | }, 23 | "Users": { 24 | "name": "Users", 25 | "pg_table": "users", 26 | "fields": { 27 | "_id": "TEXT", 28 | "email": "TEXT", 29 | "name": "TEXT", 30 | "preferences": "JSONB", 31 | "preferences.unsubscribe": { 32 | "Postgres": {"Name": "is_unsubscribed", "Type": "BOOLEAN"}, 33 | "Mongo": {"Name": "preferences.unsubscribe", "Type": "object"} 34 | }, 35 | "createdAt": "TIMESTAMP WITH TIME ZONE" 36 | } 37 | } 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /full_sync.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "expvar" 5 | "fmt" 6 | "time" 7 | 8 | log "github.com/Sirupsen/logrus" 9 | "github.com/jmoiron/sqlx" 10 | "github.com/orcaman/concurrent-map" 11 | "github.com/paulbellamy/ratecounter" 12 | "github.com/rwynn/gtm" 13 | mgo "gopkg.in/mgo.v2" 14 | ) 15 | 16 | type Syncer interface { 17 | Read() func() 18 | Write() func() 19 | BuildOpFromMgo() func(o Statement, e DBResult, coll Collection) gtm.Op 20 | } 21 | 22 | type FullSyncer struct { 23 | Config Config 24 | Output *sqlx.DB 25 | Mongo *mgo.Session 26 | C chan DBResult 27 | done chan bool 28 | 29 | insertCounter *ratecounter.RateCounter 30 | readCounter *ratecounter.RateCounter 31 | } 32 | 33 | func (z *FullSyncer) Read() { 34 | for dbName, v := range z.Config { 35 | db := z.Mongo.DB(dbName) 36 | 
for name := range v.Collections { 37 | coll := db.C(name) 38 | iter := coll.Find(nil).Iter() 39 | var result map[string]interface{} 40 | for iter.Next(&result) { 41 | z.readCounter.Incr(1) 42 | z.C <- DBResult{dbName, name, result} 43 | // Clear out result data for next round 44 | result = make(map[string]interface{}) 45 | } 46 | if err := iter.Close(); err != nil { 47 | log.Error("Unable to close iterator: %s", err) 48 | } 49 | } 50 | } 51 | close(z.C) 52 | wg.Done() 53 | } 54 | 55 | func (z *FullSyncer) Write() { 56 | var workers [workerCountOverflow]int 57 | tables := z.buildTables() 58 | for _ = range workers { 59 | wg.Add(1) 60 | go z.writer(&tables) 61 | } 62 | wg.Done() 63 | } 64 | 65 | func BuildOpFromMgo(mongoFields []string, e DBResult, coll Collection) *gtm.Op { 66 | var op gtm.Op 67 | op.Data = e.Data 68 | opRef := EnsureOpHasAllFields(&op, mongoFields) 69 | opRef.Id = e.Data["_id"] 70 | // Set to I so we are consistent about these beings inserts 71 | // This avoids our guardclause in sanitize 72 | opRef.Operation = "i" 73 | data := SanitizeData(coll.Fields, opRef) 74 | opRef.Data = data 75 | return opRef 76 | } 77 | 78 | func (z *FullSyncer) writer(tables *cmap.ConcurrentMap) { 79 | ForStatement: 80 | for { 81 | select { 82 | case e, more := <-z.C: 83 | if !more { 84 | break ForStatement 85 | } 86 | key := createFanKey(e.MongoDB, e.Collection) 87 | v, ok := tables.Get(key) 88 | if ok && !v.(bool) { 89 | // Table doesn't exist, skip 90 | break 91 | } 92 | o, coll := z.statementFromDbCollection(e.MongoDB, e.Collection) 93 | op := BuildOpFromMgo(o.mongoFields(), e, coll) 94 | s := o.BuildUpsert() 95 | log.WithFields(log.Fields{ 96 | "collection": e.Collection, 97 | "id": op.Id, 98 | }).Info("Syncing record") 99 | log.Debug("SQL Command ", s) 100 | log.Debug("Data ", op.Data) 101 | log.Debug("Executing statement: ", s) 102 | _, err := z.Output.NamedExec(s, op.Data) 103 | log.Debug("Statement executed successfully") 104 | z.insertCounter.Incr(1) 105 | if 
err != nil { 106 | log.WithFields(log.Fields{ 107 | "description": err, 108 | }).Error("Error") 109 | if err.Error() == fmt.Sprintf(`pq: relation "%s" does not exist`, e.Collection) { 110 | tables.Set(key, false) 111 | } 112 | } 113 | } 114 | } 115 | wg.Done() 116 | } 117 | func (z *FullSyncer) statementFromDbCollection(db string, collectionName string) (Statement, Collection) { 118 | c := z.Config[db].Collections[collectionName] 119 | return Statement{c}, c 120 | } 121 | 122 | func (z *FullSyncer) buildTables() (tables cmap.ConcurrentMap) { 123 | tables = cmap.New() 124 | for dbName, db := range z.Config { 125 | for collectionName := range db.Collections { 126 | // Assume all tables are present 127 | tables.Set(createFanKey(dbName, collectionName), true) 128 | } 129 | } 130 | return 131 | } 132 | 133 | func NewSynchronizer(config Config, pg *sqlx.DB, mongo *mgo.Session) FullSyncer { 134 | c := make(chan DBResult) 135 | insertCounter := ratecounter.NewRateCounter(1 * time.Second) 136 | readCounter := ratecounter.NewRateCounter(1 * time.Second) 137 | expvar.Publish("insert/sec", insertCounter) 138 | expvar.Publish("read/sec", readCounter) 139 | done := make(chan bool, 2) 140 | sync := FullSyncer{config, pg, mongo, c, done, insertCounter, readCounter} 141 | return sync 142 | } 143 | 144 | func FullSync(config Config, pg *sqlx.DB, mongo *mgo.Session) { 145 | sync := NewSynchronizer(config, pg, mongo) 146 | wg.Add(2) 147 | log.Debug("Starting writer") 148 | go sync.Write() 149 | log.Debug("Starting reader") 150 | go sync.Read() 151 | 152 | wg.Wait() 153 | } 154 | -------------------------------------------------------------------------------- /full_sync_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | m "github.com/zph/moresql" 5 | . 
"gopkg.in/check.v1" 6 | "gopkg.in/mgo.v2/bson" 7 | ) 8 | 9 | func BuildFields(sx ...string) m.Fields { 10 | f := m.Fields{} 11 | for _, s := range sx { 12 | var mon string 13 | if s == "_id" { 14 | mon = "id" 15 | } else { 16 | mon = "string" 17 | } 18 | f[s] = m.Field{ 19 | m.Mongo{s, mon}, 20 | m.Postgres{s, "string"}, 21 | } 22 | } 23 | return f 24 | } 25 | 26 | func (s *MySuite) TestBuildOpFromMongo(c *C) { 27 | result := make(map[string]interface{}) 28 | id := bson.ObjectId("123") 29 | result["_id"] = id 30 | result["name"] = "Alice" 31 | db := m.DBResult{"user", "user", result} 32 | fields := BuildFields("_id", "name", "age") 33 | coll := m.Collection{"user", "user", fields} 34 | op := m.BuildOpFromMgo([]string{"_id", "name", "age"}, db, coll) 35 | 36 | c.Check(op.Id, Equals, id) 37 | c.Check(op.Operation, Equals, "i") 38 | c.Check(op.Data["name"], Equals, "Alice") 39 | if val, ok := op.Data["age"]; ok { 40 | c.Check(ok, Equals, true) 41 | c.Check(val, Equals, nil) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/zph/moresql 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/Sirupsen/logrus v0.10.1-0.20160601113210-f3cfb454f4c2 7 | github.com/heroku/rollrus v0.0.0-20160824233412-d20e35b8f913 8 | github.com/jmoiron/sqlx v0.0.0-20161209024531-cac998c4f095 9 | github.com/lib/pq v0.0.0-20160511035104-ee1442bda7bd 10 | github.com/orcaman/concurrent-map v0.0.0-20161205115927-cafb9879460a 11 | github.com/paulbellamy/ratecounter v0.1.1-0.20170206102657-348ad3bf08f0 12 | github.com/pkg/errors v0.8.1-0.20170227220037-bfd5150e4e41 13 | github.com/rwynn/gtm v0.0.0-20170315180800-22eec6961032 14 | github.com/serialx/hashring v0.0.0-20161115152012-8d1c83b82963 15 | github.com/stvp/roll v0.0.0-20170116223130-ca202b60b260 16 | github.com/thejerf/suture v2.0.0+incompatible 17 | github.com/tidwall/gjson 
v0.0.0-20170526023918-c784c417818f 18 | github.com/tidwall/match v1.0.1 // indirect 19 | golang.org/x/sys v0.0.0-20161214190518-d75a52659825 20 | gopkg.in/check.v1 v1.0.0-20161208181325-20d25e280405 21 | gopkg.in/mgo.v2 v2.0.0-20160818020120-3f83fa500528 22 | gopkg.in/yaml.v2 v2.0.0-20160928153709-a5b47d31c556 23 | ) 24 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/Sirupsen/logrus v0.10.1-0.20160601113210-f3cfb454f4c2 h1:3BYvDlSNPyoYk6lr17s9IueNAabOBur3f3uVULjbhTA= 2 | github.com/Sirupsen/logrus v0.10.1-0.20160601113210-f3cfb454f4c2/go.mod h1:rmk17hk6i8ZSAJkSDa7nOxamrG+SP4P0mm+DAvExv4U= 3 | github.com/heroku/rollrus v0.0.0-20160824233412-d20e35b8f913 h1:++PD3rZfQDOLZJOcsx7ZPbb+u48cIShBg6qXQzzaWSA= 4 | github.com/heroku/rollrus v0.0.0-20160824233412-d20e35b8f913/go.mod h1:BT+PgT529opmb6mcUY+Fg0IwVRRmwqFyavEMU17GnBg= 5 | github.com/jmoiron/sqlx v0.0.0-20161209024531-cac998c4f095 h1:6uwZHp3lyVH2mZxH/NLFbfBmbra2a2VDMSN2sp5NgGc= 6 | github.com/jmoiron/sqlx v0.0.0-20161209024531-cac998c4f095/go.mod h1:IiEW3SEiiErVyFdH8NTuWjSifiEQKUoyK3LNqr2kCHU= 7 | github.com/lib/pq v0.0.0-20160511035104-ee1442bda7bd h1:4boQFkBA2FViSz6B5dPK4yt+Ur9UKHvjtpoI4CrkrlM= 8 | github.com/lib/pq v0.0.0-20160511035104-ee1442bda7bd/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= 9 | github.com/orcaman/concurrent-map v0.0.0-20161205115927-cafb9879460a h1:8e7l2MMjamkluk1iCXKb2yPsstj1JcB1kENqiQnfaa8= 10 | github.com/orcaman/concurrent-map v0.0.0-20161205115927-cafb9879460a/go.mod h1:Lu3tH6HLW3feq74c2GC+jIMS/K2CFcDWnWD9XkenwhI= 11 | github.com/paulbellamy/ratecounter v0.1.1-0.20170206102657-348ad3bf08f0 h1:GXOaYcuObbI7QSQeS0UlLjDWBGaNpB3/bAKHoL/VX6s= 12 | github.com/paulbellamy/ratecounter v0.1.1-0.20170206102657-348ad3bf08f0/go.mod h1:Hfx1hDpSGoqxkVVpBi/IlYD7kChlfo5C6hzIHwPqfFE= 13 | github.com/pkg/errors v0.8.1-0.20170227220037-bfd5150e4e41 
h1:wkVNpTThLSDUAGkTWb2bywAaMxLFrM/Zh1FTsbDylGA= 14 | github.com/pkg/errors v0.8.1-0.20170227220037-bfd5150e4e41/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= 15 | github.com/rwynn/gtm v0.0.0-20170315180800-22eec6961032 h1:MR0QJW32lIcEYNMK5jOFOJ88CU8tQdl7lQZ2CVpCAp0= 16 | github.com/rwynn/gtm v0.0.0-20170315180800-22eec6961032/go.mod h1:LYXeTMjbA7l9k9oEM+NUBuu0BgvNrD5nQuo8seLsar0= 17 | github.com/serialx/hashring v0.0.0-20161115152012-8d1c83b82963 h1:MKdta9JJrO3SvBErqOGh2m05/+RUSjoM8Z106GlfIVM= 18 | github.com/serialx/hashring v0.0.0-20161115152012-8d1c83b82963/go.mod h1:/yeG0My1xr/u+HZrFQ1tOQQQQrOawfyMUH13ai5brBc= 19 | github.com/stvp/roll v0.0.0-20170116223130-ca202b60b260 h1:kE8rpBNGaiYM3LN/5Xh3OMPoc87eHzMeQ7Hh7iB6NJE= 20 | github.com/stvp/roll v0.0.0-20170116223130-ca202b60b260/go.mod h1:Ffmqrj3nXIMIjeA4uW3Qjj0Ud9eDoTG0fu4JxyAr/tE= 21 | github.com/thejerf/suture v2.0.0+incompatible h1:DkVN8UweV9td/cBMFtFMDVrcE3JJxqCb9BlE8tgnh+8= 22 | github.com/thejerf/suture v2.0.0+incompatible/go.mod h1:ibKwrVj+Uzf3XZdAiNWUouPaAbSoemxOHLmJmwheEMc= 23 | github.com/tidwall/gjson v0.0.0-20170526023918-c784c417818f h1:GjObbyNgWfIZWp9rpmvywhKg/n3JityOBny7GaqFzlg= 24 | github.com/tidwall/gjson v0.0.0-20170526023918-c784c417818f/go.mod h1:c/nTNbUr0E0OrXEhq1pwa8iEgc2DOt4ZZqAt1HtCkPA= 25 | github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc= 26 | github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E= 27 | golang.org/x/sys v0.0.0-20161214190518-d75a52659825/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 28 | gopkg.in/check.v1 v1.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 29 | gopkg.in/mgo.v2 v2.0.0-20160818020120-3f83fa500528 h1:/saqWwm73dLmuzbNhe92F0QsZ/KiFND+esHco2v1hiY= 30 | gopkg.in/mgo.v2 v2.0.0-20160818020120-3f83fa500528/go.mod h1:yeKp02qBN3iKW1OzL3MGk2IdtZzaj7SFntXj72NppTA= 31 | gopkg.in/yaml.v2 v2.0.0-20160928153709-a5b47d31c556/go.mod 
h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= 32 | -------------------------------------------------------------------------------- /goreleaser.yml: -------------------------------------------------------------------------------- 1 | # goreleaser.yml 2 | build: 3 | # Path to main.go file. 4 | # Default is `main.go` 5 | main: ./cmds/moresql/main.go 6 | 7 | # Name of the binary. Default is the name of the project directory. 8 | binary: moresql 9 | 10 | # Custom ldflags. 11 | # Default is `-s -w` 12 | ldflags: -s -w 13 | 14 | # GOOS list to build in. 15 | # For more info refer to https://golang.org/doc/install/source#environment 16 | # Defaults are darwin and linux 17 | goos: 18 | - darwin 19 | - linux 20 | 21 | archives: 22 | - id: "{{.BinaryName}}_{{.Os}}_{{.Arch}}" 23 | format: tar.gz 24 | 25 | # replacements: 26 | # darwin: darwin 27 | # linux: linux 28 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Moresql 2 | pages: 3 | - Home: index.md 4 | - Readme: README.md 5 | - Deploy: deploying.md 6 | - TODOs: TODO.md 7 | - Performance: performance.html 8 | -------------------------------------------------------------------------------- /moresql.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "flag" 5 | "net/http" 6 | "sync" 7 | "time" 8 | 9 | log "github.com/Sirupsen/logrus" 10 | ) 11 | 12 | // workerCount dedicated workers per collection 13 | const workerCount = 5 14 | 15 | // workerCountOverflow Threads in Golang 1.6+ are ~4kb to start 16 | // 500 * 4k = ~2MB ram usage due to heap of each routine 17 | const workerCountOverflow = 500 18 | 19 | // reportFrequency the timing for how often to report activity 20 | const reportFrequency = 60 // seconds 21 | 22 | // checkpointFrequency frequency at which checkpointing is saved to DB 23 | const 
checkpointFrequency = time.Duration(30) * time.Second 24 | 25 | var wg sync.WaitGroup 26 | 27 | // Run is the top-level entry point: load config, connect to both stores, 28 | // then dispatch to full sync or oplog tailing based on flags. 29 | func Run() { 30 | c := Commands{} 31 | env := FetchEnvsAndFlags() 32 | SetupLogger(env) 33 | ExitUnlessValidEnv(env) 34 | 35 | config := LoadConfig(env.configFile) 36 | pg := GetPostgresConnection(env) 37 | defer pg.Close() 38 | 39 | // TODO: should this run for each execution of application 40 | if env.validatePostgres { 41 | c.ValidateTablesAndColumns(config, pg) 42 | } 43 | 44 | session := GetMongoConnection(env) 45 | defer session.Close() 46 | log.Info("Connected to postgres") 47 | log.Info("Connected to mongo") 48 | 49 | if env.monitor { 50 | // BUG FIX: ListenAndServe always returns a non-nil error on exit; log it instead of discarding it 51 | go func() { log.Error(http.ListenAndServe(":1234", nil)) }() 52 | } 53 | 54 | switch { 55 | case env.sync: 56 | FullSync(config, pg, session) 57 | case env.tail: 58 | Tail(config, pg, session, env) 59 | default: 60 | flag.Usage() 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /moresql_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | "testing" 5 | 6 | _ "github.com/lib/pq" 7 | m "github.com/zph/moresql" 8 | . "gopkg.in/check.v1" 9 | ) 10 | 11 | // Hook up gocheck into the "go test" runner. 
12 | func Test(t *testing.T) { TestingT(t) } 13 | 14 | type MySuite struct{} 15 | 16 | var _ = Suite(&MySuite{}) 17 | 18 | func (s *MySuite) TestBuildUpsertStatement(c *C) { 19 | mongo := m.Mongo{"_id", "id"} 20 | p := m.Postgres{"id", "text"} 21 | f := m.Field{mongo, p} 22 | f2 := m.Field{m.Mongo{"count", "text"}, m.Postgres{"count", "text"}} 23 | fields := m.Fields{"_id": f, "count": f2} 24 | collection := m.Collection{ 25 | Name: "categories", 26 | PgTable: "categories_in_pg", 27 | Fields: fields} 28 | o := m.Statement{collection} 29 | 30 | sql := o.BuildUpsert() 31 | expected := `INSERT INTO "categories_in_pg" ("id", "count") 32 | VALUES (:id, :count) 33 | ON CONFLICT ("id") 34 | DO UPDATE SET "count" = :count;` 35 | c.Check(sql, Equals, expected) 36 | } 37 | 38 | func (s *MySuite) TestBuildInsertStatement(c *C) { 39 | mongo := m.Mongo{"_id", "id"} 40 | p := m.Postgres{"id", "text"} 41 | f := m.Field{mongo, p} 42 | f2 := m.Field{m.Mongo{"count", "text"}, m.Postgres{"count", "text"}} 43 | fields := m.Fields{"_id": f, "count": f2} 44 | collection := m.Collection{ 45 | Name: "categories", 46 | PgTable: "categories", 47 | Fields: fields} 48 | o := m.Statement{collection} 49 | 50 | sql := o.BuildInsert() 51 | expected := `INSERT INTO "categories" ("id", "count") 52 | VALUES (:id, :count)` 53 | c.Check(sql, Equals, expected) 54 | } 55 | 56 | func (s *MySuite) TestBuildUpdateStatement(c *C) { 57 | mongo := m.Mongo{"_id", "id"} 58 | p := m.Postgres{"id", "id"} 59 | f := m.Field{mongo, p} 60 | f2 := m.Field{m.Mongo{"count", "text"}, m.Postgres{"count", "text"}} 61 | f3 := m.Field{m.Mongo{"avg", "text"}, m.Postgres{"avg", "text"}} 62 | fields := m.Fields{"_id": f, "count": f2, "avg": f3} 63 | collection := m.Collection{ 64 | Name: "categories", 65 | PgTable: "categories", 66 | Fields: fields} 67 | o := m.Statement{collection} 68 | sql := o.BuildUpdate() 69 | expected := `UPDATE "categories" 70 | SET "avg" = :avg, "count" = :count 71 | WHERE "id" = :_id;` 72 | 
c.Check(sql, Equals, expected) 73 | } 74 | 75 | func (s *MySuite) TestBuildDeleteStatement(c *C) { 76 | mongo := m.Mongo{"_id", "id"} 77 | p := m.Postgres{"id", "id"} 78 | f := m.Field{mongo, p} 79 | f2 := m.Field{m.Mongo{"count", "text"}, m.Postgres{"count", "text"}} 80 | f3 := m.Field{m.Mongo{"avg", "text"}, m.Postgres{"avg", "text"}} 81 | fields := m.Fields{"_id": f, "count": f2, "avg": f3} 82 | collection := m.Collection{ 83 | Name: "categories", 84 | PgTable: "categories", 85 | Fields: fields} 86 | o := m.Statement{collection} 87 | sql := o.BuildDelete() 88 | expected := `DELETE FROM "categories" WHERE "id" = :_id;` 89 | c.Check(sql, Equals, expected) 90 | } 91 | -------------------------------------------------------------------------------- /structs.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "os" 7 | "sort" 8 | "strings" 9 | "time" 10 | 11 | log "github.com/Sirupsen/logrus" 12 | "github.com/jmoiron/sqlx" 13 | ) 14 | 15 | type DBResult struct { 16 | MongoDB string 17 | Collection string 18 | Data map[string]interface{} 19 | } 20 | 21 | type MongoResult struct { 22 | DB struct { 23 | Source string 24 | Destination string 25 | } 26 | Data map[string]interface{} 27 | } 28 | 29 | type urls struct { 30 | mongo string 31 | postgres string 32 | } 33 | 34 | type Env struct { 35 | urls urls 36 | sync bool 37 | tail bool 38 | SSLCert string 39 | SSLInsecureSkipVerify bool 40 | configFile string 41 | allowDeletes bool 42 | monitor bool 43 | replayOplog bool 44 | replayDuration time.Duration 45 | replaySecond int64 46 | checkpoint bool 47 | appName string 48 | createTableSQL bool 49 | validatePostgres bool 50 | reportingToken string 51 | appEnvironment string 52 | errorReporting string 53 | memprofile string 54 | } 55 | 56 | func (e *Env) UseSSL() (r bool) { 57 | r = false 58 | if e.SSLCert != "" || e.SSLInsecureSkipVerify { 59 | r = true 60 | } 61 | return 62 
| } 63 | 64 | // Queries contains the sql commands used by Moresql 65 | type Queries struct{} 66 | 67 | // GetMetadata fetches the most recent metadata row for this appname 68 | func (q *Queries) GetMetadata() string { 69 | return `SELECT * FROM moresql_metadata WHERE app_name=$1 ORDER BY last_epoch DESC LIMIT 1;` 70 | } 71 | 72 | // SaveMetadata performs an upsert using metadata with uniqueness constraint on app_name 73 | func (q *Queries) SaveMetadata() string { 74 | return `INSERT INTO "moresql_metadata" ("app_name", "last_epoch", "processed_at") 75 | VALUES (:app_name, :last_epoch, :processed_at) 76 | ON CONFLICT ("app_name") 77 | DO UPDATE SET "last_epoch" = :last_epoch, "processed_at" = :processed_at;` 78 | } 79 | 80 | // CreateMetadataTable provides the sql required to setup the metadata table 81 | func (q *Queries) CreateMetadataTable() string { 82 | return ` 83 | -- create the moresql_metadata table for checkpoint persistance 84 | CREATE TABLE public.moresql_metadata 85 | ( 86 | app_name TEXT NOT NULL, 87 | last_epoch INT NOT NULL, 88 | processed_at TIMESTAMP WITH TIME ZONE DEFAULT NOW() NOT NULL 89 | ); 90 | -- Setup mandatory unique index 91 | CREATE UNIQUE INDEX moresql_metadata_app_name_uindex ON public.moresql_metadata (app_name); 92 | 93 | -- Grant permissions to this user, replace $USERNAME with moresql's user 94 | GRANT SELECT, UPDATE, DELETE ON TABLE public.moresql_metadata TO $USERNAME; 95 | 96 | COMMENT ON COLUMN public.moresql_metadata.app_name IS 'Name of application. 
Used for circumstances where multiple apps stream to same PG instance.'; 97 | COMMENT ON COLUMN public.moresql_metadata.last_epoch IS 'Most recent epoch processed from Mongo'; 98 | COMMENT ON COLUMN public.moresql_metadata.processed_at IS 'Timestamp for when the last epoch was processed at'; 99 | COMMENT ON TABLE public.moresql_metadata IS 'Stores checkpoint data for MoreSQL (mongo->pg) streaming'; 100 | ` 101 | } 102 | 103 | func (q *Queries) GetColumnsFromTable() string { 104 | return ` 105 | SELECT column_name 106 | FROM information_schema.columns 107 | WHERE table_schema = :schema 108 | AND table_name = :table` 109 | } 110 | 111 | func (q *Queries) GetTableColumnIndexMetadata() string { 112 | return ` 113 | -- Get table, columns, and index metadata 114 | WITH tables_and_indexes AS ( 115 | -- CREDIT: http://stackoverflow.com/a/25596855 116 | SELECT 117 | c.relname AS table, 118 | f.attname AS column, 119 | pg_catalog.format_type(f.atttypid, f.atttypmod) AS type, 120 | f.attnotnull AS notnull, 121 | i.relname AS index_name, 122 | CASE 123 | WHEN i.oid <> 0 124 | THEN TRUE 125 | ELSE FALSE 126 | END AS is_index, 127 | CASE 128 | WHEN p.contype = 'p' 129 | THEN TRUE 130 | ELSE FALSE 131 | END AS primarykey, 132 | CASE 133 | WHEN p.contype = 'u' 134 | THEN TRUE 135 | WHEN p.contype = 'p' 136 | THEN TRUE 137 | ELSE FALSE 138 | END AS uniquekey, 139 | CASE 140 | WHEN f.atthasdef = 't' 141 | THEN d.adsrc 142 | END AS default 143 | FROM pg_attribute f 144 | JOIN pg_class c ON c.oid = f.attrelid 145 | JOIN pg_type t ON t.oid = f.atttypid 146 | LEFT JOIN pg_attrdef d ON d.adrelid = c.oid AND d.adnum = f.attnum 147 | LEFT JOIN pg_namespace n ON n.oid = c.relnamespace 148 | LEFT JOIN pg_constraint p ON p.conrelid = c.oid AND f.attnum = ANY (p.conkey) 149 | LEFT JOIN pg_class AS g ON p.confrelid = g.oid 150 | LEFT JOIN pg_index AS ix ON f.attnum = ANY (ix.indkey) AND c.oid = f.attrelid AND c.oid = ix.indrelid 151 | LEFT JOIN pg_class AS i ON ix.indexrelid = i.oid 152 | 153 | 
WHERE c.relkind = 'r' :: CHAR 154 | AND n.nspname = 'public' -- Replace with Schema name 155 | --AND c.relname = 'nodes' -- Replace with table name, or Comment this for get all tables 156 | AND f.attnum > 0 157 | ORDER BY c.relname, f.attname 158 | ) 159 | SELECT count(*) from tables_and_indexes 160 | WHERE "table" = $1 161 | AND "column" = $2 162 | AND is_index IS TRUE 163 | -- TODO: determine how to check if index is unique vs unique column 164 | -- AND uniquekey IS TRUE; 165 | ` 166 | } 167 | 168 | type Commands struct{} 169 | 170 | func (c *Commands) CreateTableSQL() { 171 | q := Queries{} 172 | fmt.Print("-- Execute the following SQL to setup table in Postgres. Replace $USERNAME with the moresql user.") 173 | fmt.Println(q.CreateMetadataTable()) 174 | os.Exit(0) 175 | } 176 | 177 | type ColumnResult struct { 178 | Name string `db:"column_name"` 179 | } 180 | 181 | type TableColumn struct { 182 | Schema string 183 | Table string 184 | Column string 185 | Type string 186 | Message string 187 | Solution string 188 | } 189 | 190 | func (t *TableColumn) uniqueIndex() string { 191 | return fmt.Sprintf("CREATE UNIQUE INDEX %s_service_uindex_on_%s ON %s.%s (%s);", t.Table, t.Column, t.Schema, t.Table, t.Column) 192 | } 193 | 194 | func (t *TableColumn) createColumn() string { 195 | return fmt.Sprintf(`ALTER TABLE %s.%s ADD %s %s NULL;`, t.Schema, t.Table, normalizeDotNotationToPostgresNaming(t.Column), t.Type) 196 | } 197 | 198 | type hasUniqueIndex struct { 199 | Value int `db:"count"` 200 | } 201 | 202 | func (h *hasUniqueIndex) isValid() bool { 203 | if h.Value > 0 { 204 | return true 205 | } 206 | return false 207 | } 208 | 209 | func (c *Commands) ValidateTablesAndColumns(config Config, pg *sqlx.DB) { 210 | q := Queries{} 211 | missingColumns := []TableColumn{} 212 | // Validates configuration of Postgres based on config file 213 | // Only validates SELECT and column existance 214 | for _, db := range config { 215 | for _, coll := range db.Collections { 216 | 
table := coll.PgTable 217 | // TODO: allow for non-public schema 218 | schema := "public" 219 | // Check that all columns are present 220 | rows, err := pg.NamedQuery(q.GetColumnsFromTable(), map[string]interface{}{"schema": schema, "table": table}) 221 | if err != nil { 222 | log.Error(err) 223 | } 224 | // TODO: add validation that column types equal the types present in config 225 | 226 | resultMap := make(map[string]string) 227 | for rows.Next() { 228 | var row ColumnResult 229 | err := rows.StructScan(&row) 230 | if err != nil { 231 | log.Fatalln(err) 232 | } 233 | resultMap[row.Name] = row.Name 234 | } 235 | // BUG FIX: release the result set's connection back to the pool before the next query 236 | rows.Close() 237 | 238 | for _, field := range coll.Fields { 239 | k := field.Postgres.Name 240 | _, ok := resultMap[k] 241 | if !ok { 242 | t := TableColumn{Schema: schema, Table: table, Column: k, Message: "Missing Column", Type: field.Postgres.Type} 243 | t.Solution = t.createColumn() 244 | missingColumns = append(missingColumns, t) 245 | } 246 | } 247 | 248 | // Check that each table has _id as in a unique index 249 | r := hasUniqueIndex{} 250 | err = pg.Get(&r, q.GetTableColumnIndexMetadata(), table, "_id") 251 | if err != nil { 252 | log.Error(err) 253 | } 254 | 255 | if !r.isValid() { 256 | t := TableColumn{Schema: schema, Table: table, Column: "_id", Message: "Missing Unique Index on Column", Type: ""} 257 | t.Solution = t.uniqueIndex() 258 | missingColumns = append(missingColumns, t) 259 | } 260 | 261 | } 262 | } 263 | if len(missingColumns) != 0 { 264 | log.Print("The following errors were reported:") 265 | tables := make(map[string]TableColumn) 266 | for _, v := range missingColumns { 267 | log.Printf("Table %s.%s Column: %s, Error: %s", v.Schema, v.Table, v.Column, v.Message) 268 | tables[v.Table] = v 269 | } 270 | log.Printf("SQL Output to assist with correcting table schema malformation:") 271 | 272 | // Table level advice 273 | // CREATE TABLE IF NOT EXISTS public.distributions(); 274 | for _, v := range tables { 275 | fmt.Printf("CREATE TABLE 
IF NOT EXISTS %s.%s();\n", v.Schema, v.Table) 274 | } 275 | 276 | // Column level advice 277 | for _, v := range missingColumns { 278 | fmt.Printf("%s\n", v.Solution) 279 | } 280 | os.Exit(1) 281 | } 282 | log.Printf("Validation succeeded. Postgres tables look good.") 283 | os.Exit(0) 284 | } 285 | 286 | type Mongo struct { 287 | Name string `json:"name"` 288 | Type string `json:"type"` 289 | } 290 | type Postgres struct { 291 | Name string `json:"name"` 292 | Type string `json:"type"` 293 | } 294 | 295 | // nameQuoted is required for postgres table names 296 | // and field names in case they conflict with SQL 297 | // builtin functions 298 | func (p Postgres) nameQuoted() string { 299 | return fmt.Sprintf(`"%s"`, p.Name) 300 | } 301 | 302 | type Field struct { 303 | Mongo Mongo `json:"mongo"` 304 | Postgres Postgres `json:"postgres"` 305 | } 306 | type Fields map[string]Field 307 | type FieldShorthand map[string]string 308 | type FieldsWrapper map[string]json.RawMessage 309 | 310 | type Collection struct { 311 | Name string `json:"name"` 312 | PgTable string `json:"pg_table"` 313 | Fields Fields `json:"fields"` 314 | } 315 | 316 | type CollectionDelayed struct { 317 | Name string `json:"name"` 318 | PgTable string `json:"pg_table"` 319 | Fields json.RawMessage `json:"fields"` 320 | } 321 | 322 | func (c Collection) pgTableQuoted() string { 323 | return fmt.Sprintf(`"%s"`, c.PgTable) 324 | } 325 | 326 | type DBDelayed struct { 327 | Collections CollectionsDelayed `json:"collections"` 328 | } 329 | type DB struct { 330 | Collections Collections `json:"collections"` 331 | } 332 | 333 | type Collections map[string]Collection 334 | type CollectionsDelayed map[string]CollectionDelayed 335 | 336 | // Config provides the core struct for 337 | // the ultimate unmarshalled moresql.json 338 | type Config map[string]DB 339 | 340 | // ConfigDelayed provides lazy config loading 341 | // to support shorthand and longhand variants 342 | type ConfigDelayed map[string]DBDelayed 343 
| 344 | // Statement provides functions for building up upsert/insert/update/allowDeletes 345 | // sql commands appropriate for a gtm.Op.Data 346 | type Statement struct { 347 | Collection Collection 348 | } 349 | 350 | func (o *Statement) prefixColon(s string) string { 351 | return fmt.Sprintf(":%s", s) 352 | } 353 | 354 | func (o *Statement) mongoFields() []string { 355 | var fields []string 356 | for _, k := range o.sortedKeys() { 357 | v := o.Collection.Fields[k] 358 | fields = append(fields, v.Mongo.Name) 359 | } 360 | return fields 361 | } 362 | 363 | func (o *Statement) postgresFields() []string { 364 | var fields []string 365 | for _, k := range o.sortedKeys() { 366 | v := o.Collection.Fields[k] 367 | fields = append(fields, v.Postgres.Name) 368 | } 369 | return fields 370 | } 371 | 372 | func (o *Statement) postgresFieldsQuoted() []string { 373 | var fields []string 374 | for _, k := range o.sortedKeys() { 375 | v := o.Collection.Fields[k] 376 | fields = append(fields, v.Postgres.nameQuoted()) 377 | } 378 | return fields 379 | } 380 | 381 | func (o *Statement) colonFields() []string { 382 | var withColons []string 383 | for _, f := range o.postgresFields() { 384 | withColons = append(withColons, o.prefixColon(f)) 385 | } 386 | return withColons 387 | } 388 | 389 | func (o *Statement) joinedPlaceholders() string { 390 | return strings.Join(o.colonFields(), ", ") 391 | } 392 | 393 | func (o *Statement) joinLines(sx ...string) string { 394 | return strings.Join(sx, "\n") 395 | } 396 | 397 | func (o *Statement) buildAssignment() string { 398 | set := []string{} 399 | for _, k := range o.sortedKeys() { 400 | v := o.Collection.Fields[k] 401 | if k != "_id" { 402 | // Accesses data that has already been sanitized into postgres naming 403 | set = append(set, fmt.Sprintf(`%s = :%s`, v.Postgres.nameQuoted(), v.Postgres.Name)) 404 | } 405 | } 406 | return strings.Join(set, ", ") 407 | } 408 | 409 | func (o *Statement) sortedKeys() []string { 410 | var keys []string 
411 | for k := range o.Collection.Fields { 412 | keys = append(keys, k) 413 | } 414 | sort.Strings(keys) 415 | return keys 416 | } 417 | 418 | func (o *Statement) id() Field { 419 | return o.Collection.Fields["_id"] 420 | } 421 | 422 | func (o *Statement) whereById() string { 423 | id := o.id() 424 | return fmt.Sprintf(`WHERE %s = :%s`, id.Postgres.nameQuoted(), id.Mongo.Name) 425 | } 426 | 427 | func (o *Statement) BuildUpsert() string { 428 | insert := o.BuildInsert() 429 | onConflict := fmt.Sprintf("ON CONFLICT (%s)", o.id().Postgres.nameQuoted()) 430 | doUpdate := fmt.Sprintf("DO UPDATE SET %s;", o.buildAssignment()) 431 | output := o.joinLines(insert, onConflict, doUpdate) 432 | return output 433 | } 434 | 435 | func (o *Statement) BuildInsert() string { 436 | insertInto := fmt.Sprintf("INSERT INTO %s (%s)", o.Collection.pgTableQuoted(), strings.Join(o.postgresFieldsQuoted(), ", ")) 437 | values := fmt.Sprintf("VALUES (%s)", o.joinedPlaceholders()) 438 | output := o.joinLines(insertInto, values) 439 | return output 440 | } 441 | 442 | func (o *Statement) BuildUpdate() string { 443 | update := fmt.Sprintf("UPDATE %s", o.Collection.pgTableQuoted()) 444 | set := fmt.Sprintf("SET %s", o.buildAssignment()) 445 | where := fmt.Sprintf("%s;", o.whereById()) 446 | return o.joinLines(update, set, where) 447 | } 448 | 449 | func (o *Statement) BuildDelete() string { 450 | return fmt.Sprintf("DELETE FROM %s %s;", o.Collection.pgTableQuoted(), o.whereById()) 451 | } 452 | -------------------------------------------------------------------------------- /structs_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | m "github.com/zph/moresql" 5 | . 
"gopkg.in/check.v1" 6 | ) 7 | 8 | func (s *MySuite) TestUseSSL(c *C) { 9 | var table = []struct { 10 | env m.Env 11 | result bool 12 | }{ 13 | {m.Env{SSLCert: ""}, false}, 14 | {m.Env{SSLCert: "cert.pem"}, true}, 15 | } 16 | for _, t := range table { 17 | actual := t.env.UseSSL() 18 | c.Check(actual, DeepEquals, t.result) 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tail.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "database/sql" 5 | "expvar" 6 | "fmt" 7 | "regexp" 8 | 9 | "time" 10 | 11 | log "github.com/Sirupsen/logrus" 12 | "github.com/jmoiron/sqlx" 13 | "github.com/orcaman/concurrent-map" 14 | "github.com/paulbellamy/ratecounter" 15 | "github.com/rwynn/gtm" 16 | "github.com/serialx/hashring" 17 | "github.com/thejerf/suture" 18 | 19 | "strconv" 20 | 21 | mgo "gopkg.in/mgo.v2" 22 | "gopkg.in/mgo.v2/bson" 23 | ) 24 | 25 | // Tailer is the core struct for performing 26 | // Mongo->Pg streaming. 
27 | type Tailer struct { 28 | config Config 29 | pg *sqlx.DB 30 | session *mgo.Session 31 | env Env 32 | counters counters 33 | stop chan bool 34 | fan map[string]gtm.OpChan 35 | checkpoint *cmap.ConcurrentMap 36 | } 37 | 38 | // Stop is the func necessary to terminate action 39 | // when using Suture library 40 | func (t *Tailer) Stop() { 41 | fmt.Println("Stopping service") 42 | t.stop <- true 43 | } 44 | 45 | // startOverflowConsumers spins up the shared pool that drains the overflow channel. 46 | func (t *Tailer) startOverflowConsumers(c <-chan *gtm.Op) { 47 | for i := 1; i <= workerCountOverflow; i++ { 48 | go t.consumer(strconv.Itoa(i), c, nil) 49 | } 50 | } 51 | 52 | type EpochTimestamp int64 53 | 54 | // BuildOptionAfterFromTimestamp chooses the oplog starting point: a stored epoch, 55 | // a replay window, or "now". The error result is always nil and kept for API stability. 56 | func BuildOptionAfterFromTimestamp(timestamp EpochTimestamp, replayDuration time.Duration) (func(*mgo.Session, *gtm.Options) bson.MongoTimestamp, error) { 57 | if timestamp != EpochTimestamp(0) && int64(timestamp) < time.Now().Unix() { 58 | // We have a starting oplog entry 59 | f := func() time.Time { return time.Unix(int64(timestamp), 0) } 60 | return OpTimestampWrapper(f, time.Duration(0)), nil 61 | } else if replayDuration != time.Duration(0) { 62 | return OpTimestampWrapper(bson.Now, replayDuration), nil 63 | } 64 | // FIX: removed unreachable `return nil, fmt.Errorf(...)` after an exhaustive if/else; default is "start from now" 65 | return OpTimestampWrapper(bson.Now, time.Duration(0)), nil 66 | } 67 | 68 | // NewOptions builds the gtm tailing options anchored at the resolved start timestamp. 69 | func (t *Tailer) NewOptions(timestamp EpochTimestamp, replayDuration time.Duration) (*gtm.Options, error) { 70 | options := gtm.DefaultOptions() 71 | after, err := BuildOptionAfterFromTimestamp(timestamp, replayDuration) 72 | if err != nil { 73 | return nil, err 74 | } 75 | epoch, _ := gtm.ParseTimestamp(after(nil, nil)) 76 | log.Infof("Starting from epoch: %+v", epoch) 77 | options.After = after 78 | options.BufferSize = 500 79 | options.BufferDuration = time.Duration(500 * time.Millisecond) 80 | options.Ordering = gtm.Document 81 | return options, nil 82 | } 83 | 84 | func (t *Tailer) NewFan() map[string]gtm.OpChan { 85 | fan := make(map[string]gtm.OpChan) 86 | // 
Register Channels 84 | for dbName, db := range t.config { 85 | for collectionName := range db.Collections { 86 | fan[createFanKey(dbName, collectionName)] = make(gtm.OpChan, 1000) 87 | } 88 | } 89 | return fan 90 | } 91 | 92 | func consistentBroker(in gtm.OpChan, ring *hashring.HashRing, workerPool map[string]gtm.OpChan) { 93 | for { 94 | select { 95 | case op := <-in: 96 | node, ok := ring.GetNode(fmt.Sprintf("%s", op.Id)) 97 | if !ok { 98 | log.Error("Failed at getting worker node from hashring") 99 | } else { 100 | out := workerPool[node] 101 | out <- op 102 | } 103 | } 104 | } 105 | } 106 | 107 | func (t *Tailer) startDedicatedConsumers(fan map[string]gtm.OpChan, overflow gtm.OpChan) { 108 | // Reserved workers for individual channels 109 | for k, c := range fan { 110 | workerPool := make(map[string]gtm.OpChan) 111 | var workers [workerCount]int 112 | for i := range workers { 113 | o := make(gtm.OpChan) 114 | workerPool[strconv.Itoa(i)] = o 115 | } 116 | keys := []string{} 117 | for k := range workerPool { 118 | keys = append(keys, k) 119 | } 120 | ring := hashring.New(keys) 121 | wg.Add(1) 122 | go consistentBroker(c, ring, workerPool) 123 | for k, workerChan := range workerPool { 124 | go t.consumer(k, workerChan, overflow) 125 | } 126 | log.WithFields(log.Fields{ 127 | "count": workerCount, 128 | "collection": k, 129 | }).Debug("Starting worker(s)") 130 | } 131 | } 132 | 133 | type MoresqlMetadata struct { 134 | AppName string `db:"app_name"` 135 | LastEpoch int64 `db:"last_epoch"` 136 | ProcessedAt time.Time `db:"processed_at"` 137 | } 138 | 139 | func NewTailer(config Config, pg *sqlx.DB, session *mgo.Session, env Env) *Tailer { 140 | checkpoint := cmap.New() 141 | return &Tailer{config: config, pg: pg, session: session, env: env, stop: make(chan bool), counters: buildCounters(), checkpoint: &checkpoint} 142 | } 143 | 144 | func FetchMetadata(checkpoint bool, pg *sqlx.DB, appName string) MoresqlMetadata { 145 | metadata := MoresqlMetadata{} 146 | if 
checkpoint { 147 | q := Queries{} 148 | err := pg.Get(&metadata, q.GetMetadata(), appName) 149 | // No rows means this is first time with table 150 | if err != nil && err != sql.ErrNoRows { 151 | log.Errorf("Error while reading moresql_metadata table %+v", err) 152 | c := Commands{} 153 | c.CreateTableSQL() 154 | } 155 | 156 | } else { 157 | metadata.LastEpoch = 0 158 | } 159 | return metadata 160 | } 161 | 162 | type gtmTail struct { 163 | ops gtm.OpChan 164 | errs chan error 165 | } 166 | 167 | func (t *Tailer) Read() { 168 | metadata := FetchMetadata(t.env.checkpoint, t.pg, t.env.appName) 169 | 170 | var lastEpoch int64 171 | if t.env.replaySecond != 0 { 172 | lastEpoch = int64(t.env.replaySecond) 173 | } else { 174 | lastEpoch = metadata.LastEpoch 175 | } 176 | options, err := t.NewOptions(EpochTimestamp(lastEpoch), t.env.replayDuration) 177 | if err != nil { 178 | log.Fatal(err.Error()) 179 | } 180 | ops, errs := gtm.Tail(t.session, options) 181 | g := gtmTail{ops, errs} 182 | log.Info("Tailing mongo oplog") 183 | go func() { 184 | for { 185 | select { 186 | case <-t.stop: 187 | return 188 | case err := <-g.errs: 189 | if matched, _ := regexp.MatchString("i/o timeout", err.Error()); matched { 190 | // Restart gtm.Tail 191 | // Close existing channels to not leak resources 192 | log.Errorf("Problem connecting to mongo initiating reconnection: %s", err.Error()) 193 | close(g.ops) 194 | close(g.errs) 195 | latest, ok := t.checkpoint.Get("latest") 196 | if ok && latest != nil { 197 | metadata = latest.(MoresqlMetadata) 198 | lastEpoch = metadata.LastEpoch 199 | options, err := t.NewOptions(EpochTimestamp(lastEpoch), t.env.replayDuration) 200 | if err != nil { 201 | log.Fatal(err.Error()) 202 | } 203 | ops, errs = gtm.Tail(t.session, options) 204 | g = gtmTail{ops, errs} 205 | } else { 206 | log.Fatalf("Exiting: Unable to recover from %s", err.Error()) 207 | } 208 | } else { 209 | log.Fatalf("Exiting: Mongo tailer returned error %s", err.Error()) 210 | } 211 | case 
op := <-g.ops: 212 | t.counters.read.Incr(1) 213 | log.WithFields(log.Fields{ 214 | "operation": op.Operation, 215 | "collection": op.GetCollection(), 216 | "id": op.Id, 217 | }).Debug("Received operation") 218 | // Check if we're watching for the collection 219 | db := op.GetDatabase() 220 | coll := op.GetCollection() 221 | key := createFanKey(db, coll) 222 | if c := t.fan[key]; c != nil { 223 | collection := t.config[db].Collections[coll] 224 | o := Statement{collection} 225 | c <- EnsureOpHasAllFields(op, o.mongoFields()) 226 | } else { 227 | t.counters.skipped.Incr(1) 228 | log.Debug("Missing channel for this collection") 229 | } 230 | for k, v := range t.fan { 231 | if len(v) > 0 { 232 | log.Debugf("Channel %s has %d", k, len(v)) 233 | } 234 | } 235 | } 236 | } 237 | }() 238 | } 239 | 240 | func (t *Tailer) Write() { 241 | t.fan = t.NewFan() 242 | log.WithField("struct", t.fan).Debug("Fan") 243 | overflow := make(gtm.OpChan) 244 | t.startDedicatedConsumers(t.fan, overflow) 245 | t.startOverflowConsumers(overflow) 246 | } 247 | 248 | func (t *Tailer) Report() { 249 | c := time.Tick(time.Duration(reportFrequency) * time.Second) 250 | go func() { 251 | for { 252 | select { 253 | case <-c: 254 | t.ReportCounters() 255 | } 256 | } 257 | }() 258 | 259 | } 260 | 261 | func (t *Tailer) SaveCheckpoint(m MoresqlMetadata) error { 262 | q := Queries{} 263 | result, err := t.pg.NamedExec(q.SaveMetadata(), m) 264 | if err != nil { 265 | log.Errorf("Unable to save into moresql_metadata: %+v, %+v", result, err.Error()) 266 | } 267 | return err 268 | } 269 | 270 | func (t *Tailer) Checkpoints() { 271 | go func() { 272 | timer := time.Tick(checkpointFrequency) 273 | for { 274 | select { 275 | case _ = <-timer: 276 | latest, ok := t.checkpoint.Get("latest") 277 | if ok && latest != nil { 278 | t.SaveCheckpoint(latest.(MoresqlMetadata)) 279 | log.Debug("Saved checkpointing %+v", latest.(MoresqlMetadata)) 280 | } 281 | } 282 | } 283 | }() 284 | } 285 | 286 | // Serve is the func 
necessary to start action 287 | // when using Suture library 288 | func (t *Tailer) Serve() { 289 | t.Write() 290 | t.Read() 291 | t.Report() 292 | if t.env.checkpoint { 293 | t.Checkpoints() 294 | } 295 | <-t.stop 296 | } 297 | 298 | type counters struct { 299 | insert *ratecounter.RateCounter 300 | update *ratecounter.RateCounter 301 | delete *ratecounter.RateCounter 302 | read *ratecounter.RateCounter 303 | skipped *ratecounter.RateCounter 304 | } 305 | 306 | func (c *counters) All() map[string]*ratecounter.RateCounter { 307 | cx := make(map[string]*ratecounter.RateCounter) 308 | cx["insert"] = c.insert 309 | cx["update"] = c.update 310 | cx["delete"] = c.delete 311 | cx["read"] = c.read 312 | cx["skipped"] = c.skipped 313 | return cx 314 | } 315 | 316 | func buildCounters() (c counters) { 317 | c = counters{ 318 | ratecounter.NewRateCounter(1 * time.Minute), 319 | ratecounter.NewRateCounter(1 * time.Minute), 320 | ratecounter.NewRateCounter(1 * time.Minute), 321 | ratecounter.NewRateCounter(1 * time.Minute), 322 | ratecounter.NewRateCounter(1 * time.Minute), 323 | } 324 | expvar.Publish("insert/min", c.insert) 325 | expvar.Publish("update/min", c.update) 326 | expvar.Publish("delete/min", c.delete) 327 | expvar.Publish("ops/min", c.read) 328 | expvar.Publish("skipped/min", c.skipped) 329 | return 330 | } 331 | 332 | func (t *Tailer) ReportCounters() { 333 | for i, counter := range t.counters.All() { 334 | log.Infof("Rate of %s per min: %d", i, counter.Rate()) 335 | } 336 | } 337 | 338 | func (t *Tailer) MsLag(epoch int32, nowFunc func() time.Time) int64 { 339 | // TODO: use time.Duration instead of this malarky 340 | ts := time.Unix(int64(epoch), 0) 341 | d := nowFunc().Sub(ts) 342 | nanoToMillisecond := func(t time.Duration) int64 { return t.Nanoseconds() / 1e6 } 343 | return nanoToMillisecond(d) 344 | } 345 | 346 | func (t *Tailer) consumer(id string, in <-chan *gtm.Op, overflow chan<- *gtm.Op) { 347 | var workerType string 348 | if overflow != nil { 349 | 
workerType = "Dedicated" 350 | } else { 351 | workerType = "Generic" 352 | } 353 | for { 354 | if overflow != nil && len(in) > workerCount { 355 | // Siphon off overflow 356 | select { 357 | case op := <-in: 358 | overflow <- op 359 | } 360 | continue 361 | } 362 | select { 363 | case op := <-in: 364 | t.processOp(op, workerType) 365 | if t.env.checkpoint { 366 | t.checkpoint.Set("latest", t.OpToMoresqlMetadata(op)) 367 | } 368 | } 369 | } 370 | } 371 | 372 | func (t *Tailer) OpToMoresqlMetadata(op *gtm.Op) MoresqlMetadata { 373 | ts, _ := gtm.ParseTimestamp(op.Timestamp) 374 | return MoresqlMetadata{AppName: t.env.appName, ProcessedAt: time.Now(), LastEpoch: int64(ts)} 375 | } 376 | 377 | func (t *Tailer) processOp(op *gtm.Op, workerType string) { 378 | collectionName := op.GetCollection() 379 | db := op.GetDatabase() 380 | st := FullSyncer{Config: t.config} 381 | o, c := st.statementFromDbCollection(db, collectionName) 382 | ts1, ts2 := gtm.ParseTimestamp(op.Timestamp) 383 | gtmLag := t.MsLag(ts1, time.Now) 384 | logFn := func(s sql.Result, e error) { 385 | log.WithFields(log.Fields{ 386 | "ts": ts1, 387 | "ts2": ts2, 388 | "msLag": gtmLag, 389 | "now": time.Now().Unix(), 390 | "action": op.Operation, 391 | "id": op.Id, 392 | "collection": op.GetCollection(), 393 | "error": e, 394 | }).Debug(fmt.Sprintf("%s worker processed", workerType)) 395 | } 396 | data := SanitizeData(c.Fields, op) 397 | switch { 398 | case op.IsInsert(): 399 | t.counters.insert.Incr(1) 400 | s, err := t.pg.NamedExec(o.BuildUpsert(), data) 401 | logFn(s, err) 402 | case op.IsUpdate(): 403 | t.counters.update.Incr(1) 404 | // Note we're using upsert here vs update 405 | // This imposes a performance penalty but is more robust 406 | // in circumstances where an update would fail due to 407 | // record missing in PG 408 | s, err := t.pg.NamedExec(o.BuildUpsert(), data) 409 | logFn(s, err) 410 | case op.IsDelete() && t.env.allowDeletes: 411 | t.counters.delete.Incr(1) 412 | deleteSQL := 
o.BuildDelete() 413 | s, err := t.pg.NamedExec(deleteSQL, data) 414 | logFn(s, err) 415 | } 416 | } 417 | 418 | func OpTimestampWrapper(f func() time.Time, ago time.Duration) func(*mgo.Session, *gtm.Options) bson.MongoTimestamp { 419 | return func(*mgo.Session, *gtm.Options) bson.MongoTimestamp { 420 | now := f() 421 | inPast := now.Add(-ago) 422 | var c uint32 = 1 423 | ts, err := NewMongoTimestamp(inPast, c) 424 | if err != nil { 425 | log.Error(err) 426 | } 427 | return ts 428 | } 429 | } 430 | 431 | func Tail(config Config, pg *sqlx.DB, session *mgo.Session, env Env) { 432 | supervisor := suture.NewSimple("Supervisor") 433 | service := NewTailer(config, pg, session, env) 434 | supervisor.Add(service) 435 | supervisor.ServeBackground() 436 | <-service.stop 437 | } 438 | -------------------------------------------------------------------------------- /tail_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | "time" 5 | 6 | m "github.com/zph/moresql" 7 | . 
"gopkg.in/check.v1" 8 | bson "gopkg.in/mgo.v2/bson" 9 | ) 10 | 11 | func (s *MySuite) TestTimestamp(c *C) { 12 | startTime := time.Duration(1485144398995 * time.Millisecond) 13 | f := func() time.Time { return time.Unix(0, startTime.Nanoseconds()) } 14 | t := m.OpTimestampWrapper(f, -1*time.Hour)(nil, nil) 15 | expected := bson.MongoTimestamp(6378662081129873409) 16 | c.Check(t, Equals, expected) 17 | } 18 | 19 | func (s *MySuite) TestNewOptionsWithEpoch(c *C) { 20 | tail := m.Tailer{} 21 | opts, err := tail.NewOptions(m.EpochTimestamp(1485144398), time.Duration(0)) 22 | actual := opts.After(nil, nil) 23 | expected := bson.MongoTimestamp(6378646619247607809) 24 | c.Check(actual, Equals, expected) 25 | c.Check(err, Equals, nil) 26 | } 27 | 28 | func (s *MySuite) TestMsLag(c *C) { 29 | var table = []struct { 30 | ts int32 31 | now int64 32 | nowNano int64 33 | out int64 34 | }{ 35 | {1486898048, 1486898048, 0, 0}, 36 | {1486898047, 1486898048, 0, 1000}, 37 | {1486898047, 0, 1486898048001000000, 1001}, 38 | } 39 | for _, tt := range table { 40 | ts := tt.ts 41 | f := func() time.Time { return time.Unix(tt.now, tt.nowNano) } 42 | t := m.Tailer{} 43 | actual := t.MsLag(ts, f) 44 | c.Check(actual, Equals, int64(tt.out)) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package moresql 2 | 3 | import ( 4 | "encoding/json" 5 | "flag" 6 | "os" 7 | "runtime/pprof" 8 | "strings" 9 | "time" 10 | 11 | log "github.com/Sirupsen/logrus" 12 | "github.com/tidwall/gjson" 13 | 14 | rollus "github.com/heroku/rollrus" 15 | "github.com/rwynn/gtm" 16 | ) 17 | 18 | func FetchEnvsAndFlags() (e Env) { 19 | e = Env{} 20 | e.urls.mongo = os.Getenv("MONGO_URL") 21 | e.urls.postgres = os.Getenv("POSTGRES_URL") 22 | var x = *flag.String("mongo-url", "", "`MONGO_URL` aka connection string") 23 | var p = *flag.String("postgres-url", "", "`POSTGRES_URL` 
aka connection string") 24 | flag.StringVar(&e.configFile, "config-file", "moresql.json", "Configuration file to use") 25 | flag.BoolVar(&e.sync, "full-sync", false, "Run full sync for each db.collection in config") 26 | flag.BoolVar(&e.allowDeletes, "allow-deletes", true, "Allow deletes to propagate from Mongo -> PG") 27 | flag.BoolVar(&e.tail, "tail", false, "Tail mongodb for each db.collection in config") 28 | flag.StringVar(&e.SSLCert, "ssl-cert", "", "SSL PEM cert for Mongodb") 29 | flag.StringVar(&e.appName, "app-name", "moresql", "AppName used in Checkpoint table") 30 | flag.BoolVar(&e.monitor, "enable-monitor", false, "Run expvarmon endpoint") 31 | flag.BoolVar(&e.checkpoint, "checkpoint", false, "Store and restore from checkpoints in PG table: moresql_metadata") 32 | flag.BoolVar(&e.createTableSQL, "create-table-sql", false, "Print out the necessary SQL for creating metadata table required for checkpointing") 33 | flag.BoolVar(&e.validatePostgres, "validate", false, "Validate the postgres table structures and exit") 34 | flag.StringVar(&e.errorReporting, "error-reporting", "", "Error reporting tool to use (currently only supporting Rollbar)") 35 | flag.StringVar(&e.memprofile, "memprofile", "", "Profile memory usage. Supply filename for output of memory usage") 36 | defaultDuration := time.Duration(0 * time.Second) 37 | flag.DurationVar(&e.replayDuration, "replay-duration", defaultDuration, "Last x to replay ie '1s', '5m', etc as parsed by Time.ParseDuration. 
Will be subtracted from time.Now()") 38 | flag.Int64Var(&e.replaySecond, "replay-second", 0, "Replay a specific epoch second of the oplog and forward from there.") 39 | flag.BoolVar(&e.SSLInsecureSkipVerify, "ssl-insecure-skip-verify", false, "Skip verification of Mongo SSL certificate ala sslAllowInvalidCertificates") 40 | flag.Parse() 41 | e.reportingToken = os.Getenv("ERROR_REPORTING_TOKEN") 42 | e.appEnvironment = os.Getenv("APP_ENV") 43 | if e.appEnvironment == "" { 44 | e.appEnvironment = "production" 45 | } 46 | if e.replayDuration != defaultDuration && e.replaySecond != 0 { 47 | e.replayOplog = true 48 | } else { 49 | e.replayOplog = false 50 | } 51 | if x != "" { 52 | e.urls.mongo = x 53 | } 54 | if p != "" { 55 | e.urls.postgres = p 56 | } 57 | log.Debugf("Configuration: %+v", e) 58 | if e.memprofile != "" { 59 | f, err := os.Create(e.memprofile) 60 | if err != nil { 61 | log.Fatal(err) 62 | } 63 | wg.Add(1) 64 | go func() { 65 | defer f.Close() 66 | tick := time.Tick(time.Duration(20) * time.Second) 67 | for { 68 | select { 69 | case <-tick: 70 | pprof.WriteHeapProfile(f) 71 | } 72 | } 73 | }() 74 | } 75 | return 76 | } 77 | 78 | func SetupLogger(env Env) { 79 | // Alter logging pattern for heroku 80 | log.SetOutput(os.Stdout) 81 | formatter := &log.TextFormatter{ 82 | FullTimestamp: true, 83 | } 84 | if os.Getenv("DYNO") != "" { 85 | formatter.FullTimestamp = false 86 | log.SetLevel(log.InfoLevel) 87 | log.SetFormatter(&log.JSONFormatter{}) 88 | } 89 | if v := os.Getenv("LOG_LEVEL"); v != "" { 90 | l, err := log.ParseLevel(v) 91 | if err != nil { 92 | log.WithField("level", v).Warn("LOG_LEVEL invalid, choose from debug, info, warn, fatal.") 93 | } else { 94 | log.SetLevel(l) 95 | } 96 | } 97 | switch env.errorReporting { 98 | case "rollbar": 99 | rollus.SetupLogging(env.reportingToken, env.appEnvironment) 100 | } 101 | 102 | log.WithField("logLevel", log.GetLevel()).Debug("Log Settings") 103 | } 104 | 105 | func IsInsertUpdateDelete(op *gtm.Op) bool { 
106 | return isActionableOperation(op.IsInsert, op.IsUpdate, op.IsDelete) 107 | } 108 | 109 | func isActionableOperation(filters ...func() bool) bool { 110 | for _, fn := range filters { 111 | if fn() { 112 | return true 113 | } 114 | } 115 | return false 116 | } 117 | 118 | // SanitizeData handles type inconsistency between mongo and pg 119 | // and flattens the data from a potentially nested data struct 120 | // into a flattened struct using gjson. 121 | func SanitizeData(pgFields Fields, op *gtm.Op) map[string]interface{} { 122 | if !IsInsertUpdateDelete(op) { 123 | return make(map[string]interface{}) 124 | } 125 | 126 | newData, err := json.Marshal(op.Data) 127 | parsed := gjson.ParseBytes(newData) 128 | output := make(map[string]interface{}) 129 | if err != nil { 130 | log.Errorf("Failed to marshal op.Data into json %s", err.Error()) 131 | } 132 | 133 | for k, v := range pgFields { 134 | // Dot notation extraction 135 | maybe := parsed.Get(k) 136 | if !maybe.Exists() { 137 | // Fill with nils to ensure that NamedExec works 138 | output[v.Postgres.Name] = nil 139 | } else { 140 | // Sanitize the Value field when it's a map 141 | value := maybe.Value() 142 | if _, ok := maybe.Value().(map[string]interface{}); ok { 143 | // Marshal Objects using JSON 144 | b, _ := json.Marshal(value) 145 | output[v.Postgres.Name] = string(b) 146 | } else if _, ok := maybe.Value().([]interface{}); ok { 147 | // Marshal Arrays using JSON 148 | b, _ := json.Marshal(value) 149 | output[v.Postgres.Name] = string(b) 150 | } else { 151 | output[v.Postgres.Name] = value 152 | } 153 | } 154 | } 155 | 156 | // Normalize data map to always include the Id with conversion 157 | // Required for delete actions that have a missing _id field in 158 | // op.Data. Must occur after the preceeding iterative block 159 | // in order to avoid being overwritten with nil. 
160 | if op.Id != nil { 161 | output["_id"] = op.Id 162 | } 163 | 164 | return output 165 | } 166 | 167 | func createFanKey(db string, collection string) string { 168 | return db + "." + collection 169 | } 170 | 171 | func splitFanKey(key string) (string, string) { 172 | s := strings.Split(key, ".") 173 | return s[0], s[1] 174 | } 175 | 176 | // EnsureOpHasAllFields: Ensure that required keys are present will null value 177 | func EnsureOpHasAllFields(op *gtm.Op, keysToEnsure []string) *gtm.Op { 178 | // Guard against assignment into nil map 179 | if op.Data == nil { 180 | op.Data = make(map[string]interface{}) 181 | } 182 | for _, k := range keysToEnsure { 183 | if _, ok := op.Data[k]; !ok { 184 | op.Data[k] = nil 185 | } 186 | } 187 | return op 188 | } 189 | 190 | func ExitUnlessValidEnv(e Env) { 191 | if e.validatePostgres { 192 | return 193 | } 194 | 195 | if e.createTableSQL { 196 | c := Commands{} 197 | c.CreateTableSQL() 198 | } 199 | if e.urls.mongo == "" || e.urls.postgres == "" { 200 | log.Warnf(`Missing required variable. Both MONGO_URL and POSTGRES_URL must be set. 201 | See the following usage instructions for setting those variables.`) 202 | flag.Usage() 203 | os.Exit(1) 204 | } 205 | if !(e.sync || e.tail) { 206 | flag.Usage() 207 | os.Exit(1) 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /utils_test.go: -------------------------------------------------------------------------------- 1 | package moresql_test 2 | 3 | import ( 4 | "github.com/rwynn/gtm" 5 | m "github.com/zph/moresql" 6 | . 
"gopkg.in/check.v1" 7 | "gopkg.in/mgo.v2/bson" 8 | ) 9 | 10 | func (s *MySuite) TestSanitizeData(c *C) { 11 | bsonId := bson.ObjectId("123") 12 | withBson := map[string]interface{}{"_id": bsonId} 13 | withBsonResult := map[string]interface{}{"name": interface{}(nil), "age": interface{}(nil), "location_id": interface{}(nil), "_id": "313233"} 14 | withSymbol := map[string]interface{}{"name": bson.Symbol("Alice")} 15 | withSymbolResult := map[string]interface{}{"age": interface{}(nil), "location_id": interface{}(nil), "_id": interface{}(nil), "name": "Alice"} 16 | withNonPrimaryKey := map[string]interface{}{"name": "Alice", "location_id": bsonId} 17 | withNonPrimaryKeyResult := map[string]interface{}{"_id": interface{}(nil), "name": "Alice", "age": interface{}(nil), "location_id": "313233"} 18 | var table = []struct { 19 | op *gtm.Op 20 | result map[string]interface{} 21 | }{ 22 | {>m.Op{Operation: "i", Data: withBson}, withBsonResult}, 23 | {>m.Op{Operation: "i", Data: withSymbol}, withSymbolResult}, 24 | {>m.Op{Operation: "i", Data: withNonPrimaryKey}, withNonPrimaryKeyResult}, 25 | } 26 | for _, t := range table { 27 | actual := m.SanitizeData(BuildFields("_id", "name", "age", "location_id"), t.op) 28 | c.Check(actual, DeepEquals, t.result) 29 | } 30 | 31 | // Test nested data structures 32 | test1Mongo := m.Mongo{} 33 | test1Postgres := m.Postgres{} 34 | test1Mongo.Name = "name.first" 35 | test1Postgres.Name = "name_first" 36 | field := m.Field{} 37 | field.Mongo = test1Mongo 38 | field.Postgres = test1Postgres 39 | nameFirst := m.Fields{"name.first": field} 40 | singleNested := map[string]interface{}{"name": map[string]interface{}{"first": "John", "last": "Doe"}} 41 | singleNestedResult := map[string]interface{}{"name_first": "John"} 42 | mResidential := m.Mongo{} 43 | pResidential := m.Postgres{} 44 | mResidential.Name = "address.home" 45 | pResidential.Name = "address_home" 46 | f := m.Field{} 47 | f.Mongo = mResidential 48 | f.Postgres = pResidential 49 | 
address := m.Fields{"address.home": f} 50 | stub := map[string]interface{}{"address": map[string]interface{}{"home": false}} 51 | result := map[string]interface{}{"address_home": false} 52 | var nested = []struct { 53 | op *gtm.Op 54 | fields m.Fields 55 | result map[string]interface{} 56 | }{ 57 | {>m.Op{Operation: "i", Data: singleNested}, nameFirst, singleNestedResult}, 58 | {>m.Op{Operation: "i", Data: stub}, address, result}, 59 | } 60 | for _, t := range nested { 61 | actual := m.SanitizeData(t.fields, t.op) 62 | c.Check(actual, DeepEquals, t.result) 63 | } 64 | } 65 | 66 | func (s *MySuite) TestEnsureOpHasAllFieldsWhenEmpty(c *C) { 67 | op := >m.Op{Operation: "i"} 68 | fields := []string{"_id", "name", "age"} 69 | actual := m.EnsureOpHasAllFields(op, fields) 70 | for _, f := range fields { 71 | val, ok := actual.Data[f] 72 | c.Check(ok, Equals, true) 73 | c.Check(val, Equals, nil) 74 | } 75 | c.Check(actual.Data, DeepEquals, map[string]interface{}{ 76 | "_id": interface{}(nil), 77 | "name": interface{}(nil), 78 | "age": interface{}(nil), 79 | }) 80 | } 81 | 82 | func (s *MySuite) TestEnsureOpHasAllFieldsWhenMissingField(c *C) { 83 | data := map[string]interface{}{ 84 | "_id": interface{}("123"), 85 | "name": interface{}("Alice"), 86 | } 87 | op := >m.Op{Operation: "i", Data: data} 88 | fields := []string{"_id", "name", "age"} 89 | actual := m.EnsureOpHasAllFields(op, fields) 90 | for _, f := range fields { 91 | _, ok := actual.Data[f] 92 | c.Check(ok, Equals, true) 93 | } 94 | c.Check(actual.Data, DeepEquals, map[string]interface{}{ 95 | "_id": interface{}("123"), 96 | "name": interface{}("Alice"), 97 | "age": interface{}(nil), 98 | }) 99 | } 100 | 101 | func (s *MySuite) TestIsInsertUpdateDelete(c *C) { 102 | var table = []struct { 103 | op *gtm.Op 104 | result bool 105 | }{ 106 | {>m.Op{Operation: "c"}, false}, 107 | {>m.Op{Operation: "i"}, true}, 108 | {>m.Op{Operation: "u"}, true}, 109 | {>m.Op{Operation: "d"}, true}, 110 | } 111 | for _, t := range 
table { 112 | actual := m.IsInsertUpdateDelete(t.op) 113 | c.Check(actual, Equals, t.result) 114 | } 115 | } 116 | 117 | // func (s *MySuite) TestCreateFanKey(c *C){ 118 | 119 | // } 120 | --------------------------------------------------------------------------------