├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── cmd
│   └── sql-importer
│       └── main.go
├── import.go
├── pg.go
├── profile
│   ├── csv
│   │   ├── csv.go
│   │   ├── csv_test.go
│   │   ├── parser.go
│   │   └── parser_test.go
│   ├── json
│   │   ├── json.go
│   │   └── json_test.go
│   ├── parse.go
│   ├── parse_test.go
│   ├── profile.go
│   ├── profile_test.go
│   ├── profiler.go
│   ├── types.go
│   └── types_test.go
└── reader
    ├── reader.go
    └── reader_test.go

/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !dist/linux-amd64
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | dist/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3.6
2 | 
3 | COPY ./dist/linux-amd64/sql-importer /
4 | 
5 | ENTRYPOINT ["/sql-importer"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 The Children's Hospital of Philadelphia and individual contributors.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | IMAGE_NAME := dbhi/sql-importer
2 | PROG_NAME := sql-importer
3 | CMD_PATH := "./cmd/sql-importer"
4 | 
5 | GIT_SHA := $(shell git log -1 --pretty=format:"%h" .)
6 | GIT_TAG := $(shell git describe --tags --exact-match . 2>/dev/null)
7 | GIT_BRANCH := $(shell git symbolic-ref -q --short HEAD)
8 | GIT_VERSION := $(shell git log -1 --pretty=format:"%h (%ci)" .)
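# Note: GIT_VERSION is baked into the binary via -ldflags in the build
# targets below; the SHA, branch, and tag become Docker image tags.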
9 | 10 | build: 11 | go build -ldflags "-X \"main.buildVersion=$(GIT_VERSION)\"" \ 12 | -o $(GOPATH)/bin/$(PROG_NAME) $(CMD_PATH) 13 | 14 | dist-build: 15 | mkdir -p dist 16 | 17 | gox -output="./dist/{{.OS}}-{{.Arch}}/$(PROG_NAME)" \ 18 | -ldflags "-X \"main.buildVersion=$(GIT_VERSION)\"" \ 19 | -os "windows linux darwin" \ 20 | -arch "amd64" $(CMD_PATH) > /dev/null 21 | 22 | dist-zip: 23 | cd dist && zip $(PROG_NAME)-darwin-amd64.zip darwin-amd64/* 24 | cd dist && zip $(PROG_NAME)-linux-amd64.zip linux-amd64/* 25 | cd dist && zip $(PROG_NAME)-windows-amd64.zip windows-amd64/* 26 | 27 | dist: dist-build dist-zip 28 | 29 | docker: 30 | docker build -t ${IMAGE_NAME}:${GIT_SHA} . 31 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:${GIT_BRANCH} 32 | 33 | if [ -n "${GIT_TAG}" ] ; then \ 34 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:${GIT_TAG} ; \ 35 | fi; 36 | 37 | if [ "${GIT_BRANCH}" == "master" ]; then \ 38 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:latest ; \ 39 | fi; 40 | 41 | docker-push: 42 | docker push ${IMAGE_NAME}:${GIT_SHA} 43 | docker push ${IMAGE_NAME}:${GIT_BRANCH} 44 | 45 | if [ -n "${GIT_TAG}" ]; then \ 46 | docker push ${IMAGE_NAME}:${GIT_TAG} ; \ 47 | fi; 48 | 49 | if [ "${GIT_BRANCH}" == "master" ]; then \ 50 | docker push ${IMAGE_NAME}:latest ; \ 51 | fi; 52 | 53 | .PHONY: build dist-build dist 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQL Importer 2 | 3 | Import a CSV file into Postgres with automatic column typing and table creation. 4 | 5 | Features: 6 | 7 | - Type inference for numbers, dates, datetimes, and booleans 8 | - Automatic table creation 9 | - Uniqueness and not null detection 10 | - Automatic decompressing of gzip and bzip2 files 11 | - Support for append instead of replace 12 | - Support for CSV files wider than 1600 columns (the Postgres limit) 13 | 14 | ## Install 15 | 16 | [Download a pre-built release](https://github.com/chop-dbhi/sql-importer/releases). 17 | 18 | 19 | Or install it from source (requires Go). 20 | 21 | ``` 22 | go get github.com/chop-dbhi/sql-importer/cmd/sql-importer 23 | ``` 24 | 25 | ## Usage 26 | 27 | Specify the database URL and a CSV file to import. The table name will be derived from the filename by default. 28 | 29 | ``` 30 | sql-importer -db postgres://127.0.0.1:5432/postgres data.csv 31 | ``` 32 | 33 | See other options by running `sql-importer -h`. 34 | 35 | ## Status 36 | 37 | Beta, works as expected. Command line options will likely change. 38 | 39 | ## License 40 | 41 | MIT 42 | -------------------------------------------------------------------------------- /cmd/sql-importer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "sync" 10 | 11 | "github.com/chop-dbhi/sql-importer" 12 | ) 13 | 14 | func main() { 15 | var ( 16 | dbUrl string 17 | schemaName string 18 | tableName string 19 | compressionType string 20 | 21 | csvType bool 22 | csvDelimiter string 23 | csvNoHeader bool 24 | 25 | useCstore bool 26 | appendTable bool 27 | ) 28 | 29 | flag.StringVar(&dbUrl, "db", "", "Database URL.") 30 | flag.StringVar(&schemaName, "schema", "public", "Schema name.") 31 | flag.StringVar(&tableName, "table", "", "Table name.") 32 | flag.BoolVar(&csvType, "csv", true, "CSV file. 
Required if using stdin.")
33 | 	flag.StringVar(&csvDelimiter, "csv.delim", ",", "CSV delimiter.")
34 | 	flag.BoolVar(&csvNoHeader, "csv.noheader", false, "No CSV header present.")
35 | 	flag.StringVar(&compressionType, "compression", "", "Compression used.")
36 | 	flag.BoolVar(&useCstore, "cstore", false, "Use cstore table.")
37 | 	flag.BoolVar(&appendTable, "append", false, "Append to table.")
38 | 
39 | 	flag.Parse()
40 | 	args := flag.Args()
41 | 
42 | 	if len(args) == 0 {
43 | 		log.Fatal("file name or directory required")
44 | 	}
45 | 
46 | 	inputName := args[0]
47 | 
48 | 	stat, err := os.Stat(inputName)
49 | 	if err != nil { log.Fatal(err) }
50 | 	if stat.IsDir() {
51 | 		loadDir(
52 | 			inputName,
53 | 			dbUrl,
54 | 			compressionType,
55 | 			csvDelimiter,
56 | 			appendTable,
57 | 			useCstore,
58 | 		)
59 | 	} else {
60 | 		loadFile(
61 | 			inputName,
62 | 			dbUrl,
63 | 			schemaName,
64 | 			tableName,
65 | 			compressionType,
66 | 			csvDelimiter,
67 | 			csvType,
68 | 			appendTable,
69 | 			useCstore,
70 | 			csvNoHeader,
71 | 		)
72 | 	}
73 | }
74 | 
75 | func loadFile(path, dbUrl, schemaName, tableName, compressionType, csvDelimiter string, csvType, appendTable, useCstore, csvNoHeader bool) {
76 | 	r := sqlimporter.Request{
77 | 		Path: path,
78 | 
79 | 		Database: dbUrl,
80 | 		Schema:   schemaName,
81 | 		Table:    tableName,
82 | 
83 | 		AppendTable: appendTable,
84 | 		CStore:      useCstore,
85 | 
86 | 		CSV:         csvType,
87 | 		Compression: compressionType,
88 | 
89 | 		Delimiter: csvDelimiter,
90 | 		Header:    !csvNoHeader,
91 | 	}
92 | 
93 | 	if err := sqlimporter.Import(&r); err != nil {
94 | 		log.Fatal(err)
95 | 	}
96 | }
97 | 
98 | func loadDir(rootDir, dbUrl, compressionType, csvDelimiter string, appendTable, useCstore bool) {
99 | 	wg := &sync.WaitGroup{}
100 | 
101 | 	filepath.Walk(rootDir, func(path string, info os.FileInfo, err error) error {
102 | 		if err != nil || info.IsDir() {
103 | 			return err
104 | 		}
105 | 
106 | 		rpath, _ := filepath.Rel(rootDir, path)
107 | 		dir, base := filepath.Split(rpath)
108 | 
109 | 		tableName := strings.Split(base, ".")[0]
110 | 		schemaName := strings.Replace(strings.Trim(dir, "/"), "/", "_", -1)
111 | 
112 | 		if schemaName == "" {
113 | 			schemaName = "public"
114 | 		}
115 | 
116 | 		r := sqlimporter.Request{
117 | 			Path: path,
118 | 
119 | 			Database: dbUrl,
120 | 			Schema:   schemaName,
121 | 			Table:    tableName,
122 | 
123 | 			AppendTable: appendTable,
124 | 			CStore:      useCstore,
125 | 
126 | 			CSV:         true,
127 | 			Compression: compressionType,
128 | 
129 | 			Delimiter: csvDelimiter,
130 | 			Header:    true,
131 | 		}
132 | 
133 | 		wg.Add(1)
134 | 
135 | 		go func() {
136 | 			defer wg.Done()
137 | 
138 | 			defer func() {
139 | 				if err := recover(); err != nil {
140 | 					log.Printf("error loading file: %s", rpath)
141 | 					log.Printf("%s", err)
142 | 				}
143 | 			}()
144 | 
145 | 			log.Printf(`loading file %s into table "%s"."%s"`, rpath, schemaName, tableName)
146 | 
147 | 			if err := sqlimporter.Import(&r); err != nil {
148 | 				log.Printf("error importing file: %s", err)
149 | 			}
150 | 		}()
151 | 
152 | 		return nil
153 | 	})
154 | 
155 | 	wg.Wait()
156 | 
157 | }
158 | 
--------------------------------------------------------------------------------
/import.go:
--------------------------------------------------------------------------------
1 | package sqlimporter
2 | 
3 | import (
4 | 	"database/sql"
5 | 	libcsv "encoding/csv"
6 | 	"fmt"
7 | 	"log"
8 | 	"path"
9 | 	"strings"
10 | 
11 | 	"github.com/chop-dbhi/sql-importer/profile/csv"
12 | 	"github.com/chop-dbhi/sql-importer/reader"
13 | )
14 | 
15 | type Request struct {
16 | 	// Input path.
17 | 	Path string
18 | 
19 | 	// Target database.
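	// Database is a Postgres connection string, e.g. the URL form shown
	// in the README: postgres://127.0.0.1:5432/postgres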
20 | 	Database string
21 | 	Schema   string
22 | 	Table    string
23 | 
24 | 	// Behavior
25 | 	AppendTable bool
26 | 	CStore      bool
27 | 
28 | 	// File specifics.
29 | 	CSV         bool
30 | 	Compression string
31 | 
32 | 	// CSV
33 | 	Delimiter string
34 | 	Header    bool
35 | }
36 | 
37 | func Import(r *Request) error {
38 | 	fileType, fileComp := reader.DetectType(r.Path)
39 | 
40 | 	if r.CSV || fileType == "csv" {
41 | 		r.CSV = true
42 | 	} else {
43 | 		return fmt.Errorf("file type not supported: %s", fileType)
44 | 	}
45 | 
46 | 	if r.Compression == "" {
47 | 		r.Compression = fileComp
48 | 	}
49 | 
50 | 	if r.Table == "" {
51 | 		_, base := path.Split(r.Path)
52 | 		r.Table = strings.Split(base, ".")[0]
53 | 	}
54 | 
55 | 	// Connect to database.
56 | 	db, err := sql.Open("postgres", r.Database)
57 | 	if err != nil {
58 | 		return fmt.Errorf("cannot open db connection: %s", err)
59 | 	}
60 | 	defer db.Close()
61 | 
62 | 	// Open the input stream.
63 | 	input, err := reader.Open(r.Path, r.Compression)
64 | 	if err != nil {
65 | 		return fmt.Errorf("cannot open input: %s", err)
66 | 	}
67 | 	defer input.Close()
68 | 
69 | 	cp := csv.NewProfiler(input)
70 | 	cp.Delimiter = r.Delimiter[0]
71 | 	cp.Header = r.Header
72 | 
73 | 	prof, err := cp.Profile()
74 | 	if err != nil {
75 | 		return fmt.Errorf("profile error: %s", err)
76 | 	}
77 | 
78 | 	log.Print("Done profiling")
79 | 
80 | 	input.Close()
81 | 	input, err = reader.Open(r.Path, r.Compression)
82 | 	if err != nil {
83 | 		return fmt.Errorf("cannot open input: %s", err)
84 | 	}
85 | 	defer input.Close()
86 | 
87 | 	schema := NewSchema(prof)
88 | 	if r.CStore {
89 | 		schema.Cstore = true
90 | 	}
91 | 
92 | 	// Load into the database.
93 | 	log.Printf(`Begin load into "%s"."%s"`, r.Schema, r.Table)
94 | 
95 | 	cr := libcsv.NewReader(input)
96 | 	cr.Comma = rune(r.Delimiter[0])
97 | 
98 | 	var n int64
99 | 	dbc := New(db)
100 | 	if r.AppendTable {
101 | 		n, err = dbc.Append(r.Schema, r.Table, schema, cr)
102 | 	} else {
103 | 		n, err = dbc.Replace(r.Schema, r.Table, schema, cr)
104 | 	}
105 | 	if err != nil {
106 | 		return fmt.Errorf("error loading: %s", err)
107 | 	}
108 | 
109 | 	log.Printf("Loaded %d records", n)
110 | 
111 | 	return nil
112 | }
--------------------------------------------------------------------------------
/pg.go:
--------------------------------------------------------------------------------
1 | package sqlimporter
2 | 
3 | import (
4 | 	"bytes"
5 | 	"database/sql"
6 | 	"encoding/csv"
7 | 	"errors"
8 | 	"fmt"
9 | 	"io"
10 | 	"regexp"
11 | 	"strings"
12 | 	"text/template"
13 | 
14 | 	"github.com/chop-dbhi/sql-importer/profile"
15 | 	"github.com/lib/pq"
16 | 	uuid "github.com/satori/go.uuid"
17 | )
18 | 
19 | const (
20 | 	rowIdColumn = "_row_id"
21 | 
22 | 	// Maximum number of entries in a "target list" (e.g. column list).
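	// Postgres rejects wider select lists with the error "target lists
	// can have at most 1664 entries", which bounds the stitching view
	// created for split tables below.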
23 | pgMaxTargetListSize = 1664 24 | ) 25 | 26 | var ( 27 | badChars *regexp.Regexp 28 | sepChars *regexp.Regexp 29 | 30 | sqlTmpl = template.New("sql") 31 | 32 | queryTmpls = map[string]string{ 33 | "createSchema": `create schema if not exists "{{.Schema}}"`, 34 | "createTable": `create table if not exists "{{.Schema}}"."{{.Table}}" ( {{.Columns}} )`, 35 | "createView": `create or replace view "{{.Schema}}"."{{.View}}" as select {{.Columns}} from "{{.Schema}}"."{{.Table}}" {{.Joins}}`, 36 | "createCstoreTable": `create foreign table if not exists "{{.Schema}}"."{{.Table}}" ( {{.Columns}} ) server cstore_server options (compression 'pglz')`, 37 | "dropTable": `drop table if exists "{{.Schema}}"."{{.Table}}"`, 38 | "dropView": `drop view if exists "{{.Schema}}"."{{.View}}"`, 39 | "renameTable": `alter table "{{.Schema}}"."{{.TempTable}}" rename to "{{.Table}}"`, 40 | "analyzeTable": `analyze "{{.Schema}}"."{{.Table}}"`, 41 | } 42 | 43 | // Map of profile types to SQL types. 44 | sqlTypeMap = map[profile.ValueType]string{ 45 | profile.UnknownType: "integer", 46 | profile.BoolType: "boolean", 47 | profile.StringType: "text", 48 | profile.IntType: "integer", 49 | profile.FloatType: "real", 50 | profile.DateType: "date", 51 | profile.DateTimeType: "timestamp", 52 | profile.NullType: "text", 53 | } 54 | ) 55 | 56 | func init() { 57 | // Initialize SQL statement templates. 58 | for name, tmpl := range queryTmpls { 59 | template.Must(sqlTmpl.New(name).Parse(tmpl)) 60 | } 61 | 62 | badChars = regexp.MustCompile(`[^a-z0-9_\-\.\+]+`) 63 | sepChars = regexp.MustCompile(`[\-\.\+]+`) 64 | } 65 | 66 | func splitN(l, n int) (int, int) { 67 | if n > l { 68 | return 1, 0 69 | } 70 | 71 | // Parts. 72 | p := l / n 73 | 74 | // Remainder. 75 | r := l % n 76 | 77 | return p, r 78 | } 79 | 80 | func splitColumns(columns []string, n int) [][]string { 81 | l := len(columns) 82 | if n >= l { 83 | return [][]string{columns} 84 | } 85 | 86 | // Split columns. 87 | p, r := splitN(l, n) 88 | 89 | var hi, low int 90 | var colparts [][]string 91 | 92 | for i := 0; i < p; i++ { 93 | low = i * n 94 | hi = low + n 95 | var cp []string 96 | cp = append(cp, columns[low:hi]...) 97 | colparts = append(colparts, cp) 98 | } 99 | 100 | // Remainder, add another part. 101 | if r > 0 { 102 | var cp []string 103 | cp = append(cp, columns[hi:]...) 104 | colparts = append(colparts, cp) 105 | } 106 | 107 | return colparts 108 | } 109 | 110 | type Schema struct { 111 | Cstore bool 112 | Fields []*Field 113 | } 114 | 115 | func NewSchema(p *profile.Profile) *Schema { 116 | fields := make([]*Field, len(p.Fields)) 117 | 118 | for n, f := range p.Fields { 119 | fields[f.Index] = &Field{ 120 | Name: n, 121 | Type: sqlTypeMap[f.Type], 122 | Unique: f.Unique, 123 | Nullable: f.Nullable || f.Missing, 124 | } 125 | } 126 | 127 | return &Schema{ 128 | Fields: fields, 129 | } 130 | } 131 | 132 | // Field is a data definition on a schema. 133 | type Field struct { 134 | Name string 135 | Type string 136 | Multiple bool 137 | Unique bool 138 | Nullable bool 139 | } 140 | 141 | type tableData struct { 142 | Schema string 143 | TempTable string 144 | Table string 145 | View string 146 | Columns string 147 | Joins string 148 | } 149 | 150 | // TODO: fuzz test this. 151 | func cleanFieldName(n string) string { 152 | n = strings.ToLower(n) 153 | n = badChars.ReplaceAllString(n, "_") 154 | return sepChars.ReplaceAllString(n, "_") 155 | } 156 | 157 | type Client struct { 158 | db *sql.DB 159 | } 160 | 161 | // execTx calls a function within a transaction. 
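// The transaction is committed if fn returns nil and rolled back otherwise.
// Illustrative use (not from the original source):
//
//	err := c.execTx(func(tx *sql.Tx) error {
//		_, err := tx.Exec(`analyze "public"."events"`)
//		return err
//	})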
162 | func (c *Client) execTx(fn func(tx *sql.Tx) error) error { 163 | tx, err := c.db.Begin() 164 | if err != nil { 165 | return err 166 | } 167 | 168 | if err := fn(tx); err != nil { 169 | tx.Rollback() 170 | return err 171 | } 172 | 173 | return tx.Commit() 174 | } 175 | 176 | func (c *Client) Replace(schemaName, tableName string, tableSchema *Schema, cr *csv.Reader) (int64, error) { 177 | tempTableNameUid, _ := uuid.NewV4() 178 | tempTableName := tempTableNameUid.String() 179 | defer c.dropTable(schemaName, tempTableName) 180 | 181 | if err := c.createSchema(schemaName); err != nil { 182 | return 0, err 183 | } 184 | 185 | splits, err := c.createTable(schemaName, tempTableName, tableSchema) 186 | if err != nil { 187 | return 0, err 188 | } 189 | 190 | n, err := c.copyData(schemaName, tempTableName, splits, cr) 191 | if err != nil { 192 | return 0, err 193 | } 194 | 195 | c.dropView(schemaName, tableName) 196 | c.dropTable(schemaName, tableName) 197 | 198 | if err := c.renameTable(schemaName, tempTableName, tableName, len(splits)); err != nil { 199 | return n, err 200 | } 201 | 202 | // Create a view if necessary and possible. 203 | if len(splits) > 1 && len(tableSchema.Fields)+len(splits) <= pgMaxTargetListSize { 204 | if err := c.createView(schemaName, tableName, tableName, splits); err != nil { 205 | return n, err 206 | } 207 | } 208 | 209 | return n, c.analyzeTable(schemaName, tableName, splits) 210 | } 211 | 212 | func (c *Client) Append(schemaName, tableName string, tableSchema *Schema, cr *csv.Reader) (int64, error) { 213 | if err := c.createSchema(schemaName); err != nil { 214 | return 0, err 215 | } 216 | 217 | splits, err := c.createTable(schemaName, tableName, tableSchema) 218 | if err != nil { 219 | return 0, err 220 | } 221 | 222 | n, err := c.copyData(schemaName, tableName, splits, cr) 223 | if err != nil { 224 | return 0, err 225 | } 226 | 227 | return n, c.analyzeTable(schemaName, tableName, splits) 228 | } 229 | 230 | func (c *Client) dropView(schemaName, viewName string) error { 231 | // Create the set of statements to 232 | data := &tableData{ 233 | Schema: schemaName, 234 | View: viewName, 235 | } 236 | 237 | var b bytes.Buffer 238 | if err := sqlTmpl.ExecuteTemplate(&b, "dropView", data); err != nil { 239 | return err 240 | } 241 | 242 | return c.execTx(func(tx *sql.Tx) error { 243 | sql := b.String() 244 | _, err := tx.Exec(sql) 245 | if err != nil { 246 | return fmt.Errorf("error dropping view: %s\n%s", err, sql) 247 | } 248 | 249 | return nil 250 | }) 251 | } 252 | 253 | func (c *Client) dropTable(schemaName, tableName string) error { 254 | // Create the set of statements to 255 | data := &tableData{ 256 | Schema: schemaName, 257 | Table: tableName, 258 | } 259 | 260 | var b bytes.Buffer 261 | if err := sqlTmpl.ExecuteTemplate(&b, "dropTable", data); err != nil { 262 | return err 263 | } 264 | 265 | return c.execTx(func(tx *sql.Tx) error { 266 | sql := b.String() 267 | _, err := tx.Exec(sql) 268 | if err != nil { 269 | return fmt.Errorf("error dropping table: %s\n%s", err, sql) 270 | } 271 | 272 | return nil 273 | }) 274 | } 275 | 276 | func (c *Client) createSchema(schemaName string) error { 277 | // Create the set of statements to 278 | data := &tableData{ 279 | Schema: schemaName, 280 | } 281 | 282 | var b bytes.Buffer 283 | if err := sqlTmpl.ExecuteTemplate(&b, "createSchema", data); err != nil { 284 | return err 285 | } 286 | 287 | return c.execTx(func(tx *sql.Tx) error { 288 | sql := b.String() 289 | _, err := tx.Exec(sql) 290 | if err != nil { 291 | return 
fmt.Errorf("error creating schema: %s\n%s", err, sql) 292 | } 293 | 294 | return nil 295 | }) 296 | } 297 | 298 | func (c *Client) createView(schemaName, viewName string, tableName string, tableColumns [][]string) error { 299 | var ( 300 | firstTable string 301 | rightTable string 302 | leftTable string 303 | selectColumns []string 304 | joins []string 305 | ) 306 | 307 | for i, cols := range tableColumns { 308 | rightTable = fmt.Sprintf("%s_%d", tableName, i) 309 | 310 | if firstTable == "" { 311 | firstTable = rightTable 312 | } 313 | 314 | // Add columns to select statement. 315 | for _, col := range cols { 316 | selectColumns = append(selectColumns, fmt.Sprintf(`"%s"."%s"."%s"`, schemaName, rightTable, col)) 317 | } 318 | 319 | if leftTable != "" { 320 | joins = append(joins, fmt.Sprintf(`inner join "%s"."%s" on ("%s"."%s"."%s" = "%s"."%s"."%s")`, schemaName, rightTable, schemaName, leftTable, rowIdColumn, schemaName, rightTable, rowIdColumn)) 321 | } 322 | 323 | leftTable = rightTable 324 | } 325 | 326 | data := &tableData{ 327 | Table: firstTable, 328 | View: viewName, 329 | Schema: schemaName, 330 | Columns: strings.Join(selectColumns, ", "), 331 | Joins: strings.Join(joins, " "), 332 | } 333 | 334 | var b bytes.Buffer 335 | if err := sqlTmpl.ExecuteTemplate(&b, "createView", data); err != nil { 336 | return err 337 | } 338 | 339 | return c.execTx(func(tx *sql.Tx) error { 340 | sql := b.String() 341 | _, err := tx.Exec(sql) 342 | if err != nil { 343 | return fmt.Errorf("error creating view: %s\n%s", err, sql) 344 | } 345 | 346 | return nil 347 | }) 348 | } 349 | 350 | func (c *Client) createTable(schemaName, tableName string, tableSchema *Schema) ([][]string, error) { 351 | var ( 352 | columns []string 353 | columnSchemas []string 354 | ) 355 | 356 | for _, f := range tableSchema.Fields { 357 | // Cleaned column name. 358 | name := cleanFieldName(f.Name) 359 | columns = append(columns, name) 360 | 361 | var col string 362 | 363 | // Create index. 364 | // TODO: long text values cannot be indexed. 365 | // https://dba.stackexchange.com/questions/25138/index-max-row-size-error. 366 | // Should this check the max value length? 367 | if f.Unique && f.Type != "text" { 368 | col = "%s %s unique" 369 | } else if !f.Nullable { 370 | col = "%s %s not null" 371 | } else { 372 | col = "%s %s" 373 | } 374 | 375 | columnSchemas = append(columnSchemas, fmt.Sprintf(col, pq.QuoteIdentifier(name), f.Type)) 376 | } 377 | 378 | // 250 - 1600 is max number of columns allowed per table, but this depends 379 | // on the data types used. this strategy simply attempts to create the widest 380 | // table it can. 381 | partSizes := []int{ 382 | 1299, 383 | 249, // max for certain types 384 | } 385 | 386 | for _, size := range partSizes { 387 | columnSplits := splitColumns(columns, size) 388 | columnSchemaSplits := splitColumns(columnSchemas, size) 389 | 390 | err := c.createTableSplits(schemaName, tableName, columnSchemaSplits, tableSchema.Cstore) 391 | 392 | // Success. 393 | if err == nil { 394 | return columnSplits, nil 395 | } 396 | 397 | if !strings.Contains(err.Error(), "pq: tables can have at most 1600 columns") { 398 | return nil, err 399 | } 400 | } 401 | 402 | return nil, errors.New("failed to partition columns") 403 | } 404 | 405 | func (c *Client) createTableSplits(schemaName, tableName string, splitColumns [][]string, cstore bool) error { 406 | // All columns fit in the table. 
407 | 	if len(splitColumns) == 1 {
408 | 		return c.execTx(func(tx *sql.Tx) error {
409 | 			return c.createSingleTable(tx, schemaName, tableName, splitColumns[0], cstore)
410 | 		})
411 | 	}
412 | 
413 | 	return c.execTx(func(tx *sql.Tx) error {
414 | 		var partTables []string
415 | 
416 | 		// Multiple tables, so we need to add the rowIdColumn.
417 | 		// A suffix is added to each table name. Then a view is created
418 | 		// to join the tables together.
419 | 		for i, cols := range splitColumns {
420 | 			partTableName := fmt.Sprintf("%s_%d", tableName, i)
421 | 
422 | 			ncols := []string{
423 | 				rowIdColumn + " integer not null unique",
424 | 			}
425 | 			ncols = append(ncols, cols...)
426 | 
427 | 			// TODO: clean up partially created tables?
428 | 			if err := c.createSingleTable(tx, schemaName, partTableName, ncols, cstore); err != nil {
429 | 				return err
430 | 			}
431 | 
432 | 			partTables = append(partTables, partTableName)
433 | 		}
434 | 
435 | 		return nil
436 | 	})
437 | }
438 | 
439 | func (c *Client) createSingleTable(tx *sql.Tx, schemaName, tableName string, columns []string, cstore bool) error {
440 | 	// Create the set of statements to
441 | 	data := &tableData{
442 | 		Schema:  schemaName,
443 | 		Table:   tableName,
444 | 		Columns: strings.Join(columns, ","),
445 | 	}
446 | 
447 | 	tmplName := "createTable"
448 | 	if cstore {
449 | 		tmplName = "createCstoreTable"
450 | 	}
451 | 
452 | 	var b bytes.Buffer
453 | 	if err := sqlTmpl.ExecuteTemplate(&b, tmplName, data); err != nil {
454 | 		return err
455 | 	}
456 | 
457 | 	sql := b.String()
458 | 	_, err := tx.Exec(sql)
459 | 	if err != nil {
460 | 		return fmt.Errorf("error creating table: %s\n%s", err, sql)
461 | 	}
462 | 	return err
463 | }
464 | 
465 | func (c *Client) renameSingleTable(tx *sql.Tx, schemaName, tempTableName, tableName string) error {
466 | 	var b bytes.Buffer
467 | 
468 | 	// Create the set of statements to
469 | 	data := &tableData{
470 | 		Schema:    schemaName,
471 | 		TempTable: tempTableName,
472 | 		Table:     tableName,
473 | 	}
474 | 
475 | 	tmpls := []string{
476 | 		"dropTable",
477 | 		"renameTable",
478 | 	}
479 | 
480 | 	for _, name := range tmpls {
481 | 		b.Reset()
482 | 		if err := sqlTmpl.ExecuteTemplate(&b, name, data); err != nil {
483 | 			return err
484 | 		}
485 | 
486 | 		if _, err := tx.Exec(b.String()); err != nil {
487 | 			return fmt.Errorf("error renaming table: %s", err)
488 | 		}
489 | 	}
490 | 
491 | 	return nil
492 | }
493 | 
494 | func (c *Client) renameTable(schemaName, tempTableName, tableName string, tableParts int) error {
495 | 	if tableParts == 1 {
496 | 		return c.execTx(func(tx *sql.Tx) error {
497 | 			return c.renameSingleTable(tx, schemaName, tempTableName, tableName)
498 | 		})
499 | 	}
500 | 
501 | 	return c.execTx(func(tx *sql.Tx) error {
502 | 		for i := 0; i < tableParts; i++ {
503 | 			if err := c.renameSingleTable(tx, schemaName, fmt.Sprintf("%s_%d", tempTableName, i), fmt.Sprintf("%s_%d", tableName, i)); err != nil {
504 | 				return err
505 | 			}
506 | 		}
507 | 		return nil
508 | 	})
509 | }
510 | 
511 | func (c *Client) analyzeTable(schemaName, tableName string, tableColumns [][]string) error {
512 | 	if len(tableColumns) == 1 {
513 | 		return c.execTx(func(tx *sql.Tx) error {
514 | 			return c.analyzeSingleTable(tx, schemaName, tableName)
515 | 		})
516 | 	}
517 | 
518 | 	return c.execTx(func(tx *sql.Tx) error {
519 | 		for i := range tableColumns {
520 | 			if err := c.analyzeSingleTable(tx, schemaName, fmt.Sprintf("%s_%d", tableName, i)); err != nil {
521 | 				return err
522 | 			}
523 | 		}
524 | 
525 | 		return nil
526 | 	})
527 | }
528 | 
529 | func (c *Client) analyzeSingleTable(tx *sql.Tx, schemaName, tableName string) error {
530 | 	// Create the set of statements to
531 | 	data := &tableData{
532 | 		Schema: schemaName,
533 | 		Table:  tableName,
534 | 	}
535 | 
536 | 	var b bytes.Buffer
537 | 	if err := sqlTmpl.ExecuteTemplate(&b, "analyzeTable", data); err != nil {
538 | 		return err
539 | 	}
540 | 
541 | 	sql := b.String()
542 | 	if _, err := tx.Exec(sql); err != nil {
543 | 		return fmt.Errorf("error analyzing table: %s\n%s", err, sql)
544 | 	}
545 | 
546 | 	return nil
547 | }
548 | 
549 | func (c *Client) copyData(schemaName, tableName string, tableColumns [][]string, cr *csv.Reader) (int64, error) {
550 | 	// Read and skip columns.
551 | 	_, err := cr.Read()
552 | 	if err != nil {
553 | 		return 0, err
554 | 	}
555 | 
556 | 	singleTable := len(tableColumns) == 1
557 | 	singleTableSize := len(tableColumns[0])
558 | 
559 | 	txs := make([]*sql.Tx, len(tableColumns))
560 | 	stmts := make([]*sql.Stmt, len(tableColumns))
561 | 
562 | 	defer func() {
563 | 		for i := range txs {
564 | 			stmts[i].Close()
565 | 			txs[i].Rollback()
566 | 		}
567 | 	}()
568 | 
569 | 	for i, cols := range tableColumns {
570 | 		tx, err := c.db.Begin()
571 | 		if err != nil {
572 | 			return 0, err
573 | 		}
574 | 
575 | 		txs[i] = tx
576 | 
577 | 		targetTable := tableName
578 | 		if !singleTable {
579 | 			cols = append([]string{rowIdColumn}, cols...)
580 | 			targetTable = fmt.Sprintf("%s_%d", tableName, i)
581 | 		}
582 | 
583 | 		stmt, err := tx.Prepare(pq.CopyInSchema(schemaName, targetTable, cols...))
584 | 		if err != nil {
585 | 			return 0, fmt.Errorf("error preparing copy: %s", err)
586 | 		}
587 | 
588 | 		stmts[i] = stmt
589 | 	}
590 | 
591 | 	// Allocate buffer. Max width + 1 for row id.
592 | 	// The actual bounds will need to be maintained.
593 | 	cargs := make([]interface{}, len(tableColumns[0])+1)
594 | 
595 | 	var (
596 | 		n     int64
597 | 		rowid int64
598 | 	)
599 | 
600 | 	// Buffer records for COPY statement.
601 | 	for {
602 | 		row, err := cr.Read()
603 | 		if err == io.EOF {
604 | 			break
605 | 		}
606 | 
607 | 		if err != nil {
608 | 			return 0, fmt.Errorf("error reading record: %s", err)
609 | 		}
610 | 
611 | 		rowid++
612 | 
613 | 		if singleTable {
614 | 			for i, v := range row {
615 | 				if v == "" {
616 | 					cargs[i] = nil
617 | 				} else {
618 | 					cargs[i] = v
619 | 				}
620 | 			}
621 | 
622 | 			_, err = stmts[0].Exec(cargs[:singleTableSize]...)
623 | 			if err != nil {
624 | 				return 0, fmt.Errorf("error sending row: %s", err)
625 | 			}
626 | 		} else {
627 | 			var low, hi int
628 | 
629 | 			for i, cols := range tableColumns {
630 | 				hi = low + len(cols)
631 | 
632 | 				cargs[0] = rowid
633 | 
634 | 				for j, v := range row[low:hi] {
635 | 					if v == "" {
636 | 						cargs[j+1] = nil
637 | 					} else {
638 | 						cargs[j+1] = v
639 | 					}
640 | 				}
641 | 
642 | 				low = hi
643 | 
644 | 				_, err = stmts[i].Exec(cargs[:len(cols)+1]...)
645 | 				if err != nil {
646 | 					return 0, fmt.Errorf("error sending row: %s: %v, %v", err, cols, cargs[:len(cols)+1])
647 | 				}
648 | 			}
649 | 		}
650 | 
651 | 		n++
652 | 	}
653 | 
654 | 	// Empty exec to flush the buffer.
655 | 	for _, stmt := range stmts {
656 | 		_, err = stmt.Exec()
657 | 		if err != nil {
658 | 			return 0, fmt.Errorf("error executing copy: %s", err)
659 | 		}
660 | 	}
661 | 
662 | 	if err != nil {
663 | 		return 0, err
664 | 	}
665 | 
666 | 	// Commit transactions.
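	// Each part table was loaded in its own transaction; committing only
	// after every COPY buffer has been flushed above means a failure leaves
	// no partially loaded parts (the deferred Rollback is then a no-op).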
667 | for _, tx := range txs { 668 | if err := tx.Commit(); err != nil { 669 | return 0, err 670 | } 671 | } 672 | 673 | return n, nil 674 | } 675 | 676 | func New(db *sql.DB) *Client { 677 | return &Client{ 678 | db: db, 679 | } 680 | } 681 | -------------------------------------------------------------------------------- /profile/csv/csv.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "strings" 7 | 8 | "github.com/chop-dbhi/sql-importer/profile" 9 | ) 10 | 11 | type Profiler struct { 12 | Config *profile.Config 13 | Delimiter byte 14 | Header bool 15 | 16 | in io.Reader 17 | } 18 | 19 | func (x *Profiler) Profile() (*profile.Profile, error) { 20 | p := profile.NewProfiler(x.Config) 21 | cr := NewCSVReader(x.in, x.Delimiter) 22 | 23 | // First record, may be the header. 24 | record, err := cr.Read() 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | header := make([]string, len(record)) 30 | if x.Header { 31 | for i, n := range record { 32 | header[i] = strings.ToLower(n) 33 | } 34 | } else { 35 | for i, _ := range record { 36 | header[i] = fmt.Sprintf("c%d", i) 37 | } 38 | } 39 | 40 | for _, c := range header { 41 | p.InitField(c) 42 | } 43 | 44 | // Profile first record. 45 | if !x.Header { 46 | for i, field := range header { 47 | val := record[i] 48 | 49 | // Treat empty strings as a null value. 50 | if val == "" { 51 | p.RecordType(field, nil, profile.NullType) 52 | } else { 53 | p.Record(field, val) 54 | } 55 | } 56 | 57 | p.Incr() 58 | } 59 | 60 | // Continue with remaining records. 61 | for { 62 | err := cr.ScanLine(record) 63 | if err == io.EOF { 64 | break 65 | } 66 | 67 | if err != nil { 68 | return nil, err 69 | } 70 | 71 | for i, field := range header { 72 | val := record[i] 73 | 74 | // Treat empty strings as a null value. 75 | if val == "" { 76 | p.RecordType(field, nil, profile.NullType) 77 | } else { 78 | p.Record(field, val) 79 | } 80 | } 81 | 82 | p.Incr() 83 | } 84 | 85 | pf := p.Profile() 86 | 87 | // Set the index of the field. 88 | for idx, name := range header { 89 | pf.Fields[name].Index = idx 90 | } 91 | 92 | return pf, nil 93 | } 94 | 95 | func NewProfiler(r io.Reader) *Profiler { 96 | return &Profiler{ 97 | Delimiter: ',', 98 | Header: true, 99 | in: r, 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /profile/csv/csv_test.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/chop-dbhi/sql-importer/profile" 8 | ) 9 | 10 | func TestProfiler(t *testing.T) { 11 | b := bytes.NewBufferString(`name,color,dob 12 | John,Blue,03/11/2013 13 | Jane,Red,2008-2-24 14 | Joe,,2010-02-11 15 | `) 16 | 17 | pr := NewProfiler(b) 18 | p, err := pr.Profile() 19 | if err != nil { 20 | t.Fatal(err) 21 | } 22 | 23 | if len(p.Fields) != 3 { 24 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 25 | } 26 | 27 | if p.Fields["dob"].Type != profile.DateType { 28 | t.Errorf("expected date type, got %s", p.Fields["dob"].Type) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /profile/csv/parser.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "io" 7 | ) 8 | 9 | const ( 10 | // 8 times default scanner buffer size. 
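	// bufio.Scanner's default maximum token size is 64 * 1024 bytes
	// (bufio.MaxScanTokenSize); lines longer than scanBufSize cause the
	// scanner to fail with bufio.ErrTooLong.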
11 | 	scanBufSize = 8 * 64 * 1024
12 | )
13 | 
14 | var (
15 | 	csvErrUnquotedField     = errors.New("unquoted field")
16 | 	csvErrUnescapedQuote    = errors.New("bare quote")
17 | 	csvErrUnterminatedField = errors.New("unterminated field")
18 | 	csvErrExtraColumns      = errors.New("extra columns")
19 | )
20 | 
21 | func clearRow(row []string) {
22 | 	for i, _ := range row {
23 | 		row[i] = ""
24 | 	}
25 | }
26 | 
27 | // CSVReader provides an interface for reading CSV data
28 | // (compatible with rfc4180 and extended with the option of having a separator other than ",").
29 | // Successive calls to the Scan method will step through the 'fields', skipping the separator/newline between the fields.
30 | // The EndOfRecord method tells when a field is terminated by a line break.
31 | type CSVReader struct {
32 | 	sc *bufio.Scanner
33 | 
34 | 	// If true, the scanner will continue scanning if field-level errors are
35 | 	// encountered. The error should be checked after each call to Scan to
36 | 	// handle the error.
37 | 	ContinueOnError bool
38 | 
39 | 	sep    byte // values separator
40 | 	eor    bool // true when the most recent field has been terminated by a newline (not a separator).
41 | 	lineno int  // current line number (not record number)
42 | 	column int  // current column index 1-based
43 | 
44 | 	eof bool
45 | 	// Error. Only set if an error occurred while scanning a field.
46 | 	err error
47 | 
48 | 	// Full line, last valid column value, remaining data in the line.
49 | 	line  string
50 | 	token []byte
51 | 	data  []byte
52 | 
53 | 	trail bool
54 | }
55 | 
56 | // DefaultCSVReader creates a "standard" CSV reader.
57 | func DefaultCSVReader(rd io.Reader) *CSVReader {
58 | 	return NewCSVReader(rd, ',')
59 | }
60 | 
61 | // NewCSVReader returns a new CSV scanner.
62 | func NewCSVReader(r io.Reader, sep byte) *CSVReader {
63 | 	s := &CSVReader{
64 | 		ContinueOnError: true,
65 | 
66 | 		// Defaults to splitting by line.
67 | 		sc:  bufio.NewScanner(r),
68 | 		sep: sep,
69 | 		eor: true,
70 | 	}
71 | 
72 | 	s.sc.Buffer(nil, scanBufSize)
73 | 
74 | 	return s
75 | }
76 | 
77 | // Line returns the current line as a string.
78 | func (s *CSVReader) Line() string {
79 | 	return s.line
80 | }
81 | 
82 | // Text returns the text of the current field.
83 | func (s *CSVReader) Text() string {
84 | 	return string(s.token)
85 | }
86 | 
87 | // LineNumber returns current line number.
88 | func (s *CSVReader) LineNumber() int {
89 | 	return s.lineno
90 | }
91 | 
92 | // ColumnNumber returns the column index of the current field.
93 | func (s *CSVReader) ColumnNumber() int {
94 | 	return s.column
95 | }
96 | 
97 | // EndOfRecord returns true when the most recent field has been terminated by a newline (not a separator).
98 | func (s *CSVReader) EndOfRecord() bool {
99 | 	return s.eor
100 | }
101 | 
102 | // Err returns an error if one occurred during scanning.
103 | func (s *CSVReader) Err() error {
104 | 	if err := s.sc.Err(); err != nil {
105 | 		return err
106 | 	}
107 | 
108 | 	if s.err != nil {
109 | 		return s.err
110 | 	}
111 | 
112 | 	if s.eof {
113 | 		return io.EOF
114 | 	}
115 | 
116 | 	return nil
117 | }
118 | 
119 | // Read scans all fields in one line and builds a slice of values.
120 | func (s *CSVReader) Read() ([]string, error) {
121 | 	var (
122 | 		err error
123 | 		r   []string
124 | 	)
125 | 
126 | 	for s.Scan() {
127 | 		if err = s.Err(); err != nil {
128 | 			return nil, err
129 | 		}
130 | 
131 | 		r = append(r, s.Text())
132 | 
133 | 		if s.EndOfRecord() {
134 | 			break
135 | 		}
136 | 	}
137 | 
138 | 	return r, s.Err()
139 | }
140 | 
141 | // ScanLine scans all fields in one line and puts the values in
142 | // the passed slice.
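// If the line contains more fields than len(r) allows, csvErrExtraColumns
// is returned. Illustrative use (not from the original source):
//
//	row := make([]string, 3)
//	for {
//		if err := cr.ScanLine(row); err == io.EOF {
//			break
//		} else if err != nil {
//			// handle the parse error for this line
//		}
//	}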
143 | func (s *CSVReader) ScanLine(r []string) error {
144 | 	var (
145 | 		err error
146 | 		max = len(r)
147 | 	)
148 | 
149 | 	for i := 0; s.Scan(); i++ {
150 | 		// More fields than the target slice can hold.
151 | 		if i == max {
152 | 			return csvErrExtraColumns
153 | 		}
154 | 
155 | 		if err = s.Err(); err != nil {
156 | 			clearRow(r[i:])
157 | 			return err
158 | 		}
159 | 
160 | 		r[i] = s.Text()
161 | 
162 | 		if s.EndOfRecord() {
163 | 			break
164 | 		}
165 | 	}
166 | 
167 | 	return s.Err()
168 | }
169 | 
170 | func (s *CSVReader) Scan() bool {
171 | 	// Error.
172 | 	if s.err != nil && !s.ContinueOnError {
173 | 		return false
174 | 	}
175 | 
176 | 	// EOF
177 | 	if s.eof && len(s.data) == 0 {
178 | 		return false
179 | 	}
180 | 
181 | 	// If the end of the record has been reached, scan for the next line.
182 | 	if s.eor {
183 | 		// Clear.
184 | 		s.line = ""
185 | 		s.data = nil
186 | 		s.token = nil
187 | 
188 | 		// Scan until there is a non-empty line to parse.
189 | 		for {
190 | 			if !s.sc.Scan() {
191 | 				// If there was an error, return. Otherwise mark as EOF.
192 | 				if err := s.sc.Err(); err != nil {
193 | 					return false
194 | 				}
195 | 
196 | 				s.eof = true
197 | 				break
198 | 			}
199 | 
200 | 			// Set the current line. Add the new line to parsing.
201 | 			s.line = s.sc.Text()
202 | 
203 | 			// Skip empty lines.
204 | 			if s.line != "" {
205 | 				s.data = s.sc.Bytes()
206 | 				break
207 | 			}
208 | 		}
209 | 	}
210 | 
211 | 	adv, token, trail, err := s.scanField(s.data)
212 | 
213 | 	// Advance the section of the line for the next field.
214 | 	s.data = s.data[adv:]
215 | 	s.err = err
216 | 
217 | 	if trail && len(s.data) == 0 {
218 | 		s.trail = trail
219 | 	}
220 | 
221 | 	// Set the token if no error occurred otherwise mark as the end of record.
222 | 	if err == nil {
223 | 		s.token = token
224 | 	} else {
225 | 		if s.ContinueOnError {
226 | 			s.token = s.data
227 | 			s.eor = true
228 | 		} else {
229 | 			return false
230 | 		}
231 | 	}
232 | 
233 | 	if !s.trail && s.eof && len(s.data) == 0 {
234 | 		return false
235 | 	}
236 | 
237 | 	return true
238 | }
239 | 
240 | func (s *CSVReader) scanField(data []byte) (int, []byte, bool, error) {
241 | 	// Special case.
242 | 	if s.trail {
243 | 		s.column++
244 | 		s.eor = true
245 | 		s.trail = false
246 | 		return 0, data, false, nil
247 | 	}
248 | 
249 | 	if len(data) == 0 {
250 | 		return 0, nil, false, nil
251 | 	}
252 | 
253 | 	// Previous iteration was the end of a record. Increment line and reset column.
254 | 	if s.eor {
255 | 		s.column = 0
256 | 		s.lineno++
257 | 	}
258 | 
259 | 	s.column++
260 | 	s.eor = false
261 | 
262 | 	// Quoted field.
263 | 	if data[0] == '"' {
264 | 		var (
265 | 			eq    int
266 | 			oq    bool
267 | 			c, pc byte
268 | 		)
269 | 
270 | 		// Scan until the end quote is found.
271 | 		for i := 1; i < len(data); i++ {
272 | 			c = data[i]
273 | 
274 | 			// Successive quotes denote an escaped quote. Clear the previous byte
275 | 			// so escaped quotes are not overlapped.
276 | 			if c == '"' {
277 | 				if pc == '"' {
278 | 					pc = 0
279 | 					oq = false
280 | 					eq++
281 | 					continue
282 | 				}
283 | 
284 | 				// Open quote.
285 | 				if oq {
286 | 					return 0, nil, false, csvErrUnescapedQuote
287 | 				}
288 | 
289 | 				oq = true
290 | 			}
291 | 
292 | 			// End of field with a trailing separator.
293 | 			if pc == '"' && c == s.sep {
294 | 				return i + 1, unescapeQuotes(data[1:i-1], eq), true, nil
295 | 			}
296 | 
297 | 			// Shift previous characters.
298 | 			pc = c
299 | 		}
300 | 
301 | 		// Ran out of bytes.
302 | 		s.eor = true
303 | 
304 | 		// Final character in the line is a quote of the last field.
305 | 		if c == '"' {
306 | 			return len(data), unescapeQuotes(data[1:len(data)-1], eq), false, nil
307 | 		}
308 | 
309 | 		// End of line without a terminated quote.
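		// Note: multi-line quoted fields are not supported, since the
		// underlying scanner splits the input on newlines first.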
310 | return 0, nil, false, csvErrUnterminatedField 311 | } 312 | 313 | // Unquoted fields. Only fail if a double quote is found. 314 | for i, c := range data { 315 | if c == s.sep { 316 | s.eor = false 317 | return i + 1, data[0:i], true, nil 318 | } 319 | 320 | // Unquoted field with quote. 321 | if c == '"' { 322 | return 0, nil, false, csvErrUnquotedField 323 | } 324 | } 325 | 326 | // Ran out of bytes. 327 | s.eor = true 328 | 329 | return len(data), data, false, nil 330 | } 331 | 332 | // Removes escaped quotes from the string. 333 | func unescapeQuotes(b []byte, count int) []byte { 334 | if count == 0 { 335 | return b 336 | } 337 | 338 | for i, j := 0, 0; i < len(b); i, j = i+1, j+1 { 339 | b[j] = b[i] 340 | 341 | if b[i] == '"' && (i < len(b)-1 && b[i+1] == '"') { 342 | i++ 343 | } 344 | } 345 | 346 | return b[:len(b)-count] 347 | } 348 | -------------------------------------------------------------------------------- /profile/csv/parser_test.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "testing" 9 | ) 10 | 11 | func compareRows(a, b []string) bool { 12 | if len(a) != len(b) { 13 | return false 14 | } 15 | 16 | for i, v := range a { 17 | if v != b[i] { 18 | return false 19 | } 20 | } 21 | 22 | return true 23 | } 24 | 25 | func tableToCSV(t [][]string) []byte { 26 | buf := bytes.NewBuffer(nil) 27 | sep := []byte{','} 28 | nl := []byte{'\n'} 29 | 30 | for _, r := range t { 31 | for i, c := range r { 32 | if i != 0 { 33 | buf.Write(sep) 34 | } 35 | if c != "" { 36 | buf.WriteString(fmt.Sprintf(`"%s"`, c)) 37 | } 38 | } 39 | 40 | buf.Write(nl) 41 | } 42 | 43 | return buf.Bytes() 44 | } 45 | 46 | func tableToToks(t [][]string) []string { 47 | var toks []string 48 | 49 | for _, r := range t { 50 | toks = append(toks, r...) 51 | } 52 | 53 | return toks 54 | } 55 | 56 | func TestCSVReader(t *testing.T) { 57 | table := [][]string{ 58 | {"name", "gender", "state"}, 59 | {"Joe", "M", "GA"}, 60 | {"Sue", "F", "NJ"}, 61 | {"Bob", "M", "NY"}, 62 | {"Bill", "M", ""}, // trailing comma 63 | } 64 | 65 | buf := bytes.NewBuffer(tableToCSV(table)) 66 | toks := tableToToks(table) 67 | 68 | cr := DefaultCSVReader(buf) 69 | 70 | var i, c, l int 71 | 72 | for i = 0; cr.Scan(); i++ { 73 | // Increment line and reset column every three tokens. 
74 | 		if i%3 == 0 {
75 | 			l++
76 | 			c = 1
77 | 		} else {
78 | 			c++
79 | 		}
80 | 
81 | 		if i == len(toks) {
82 | 			t.Errorf("scan exceeded %d tokens", i+1)
83 | 			break
84 | 		}
85 | 
86 | 		tok := cr.Text()
87 | 
88 | 		if tok != toks[i] {
89 | 			t.Errorf("line %d, column %d: expected %s, got %s", cr.LineNumber(), cr.ColumnNumber(), toks[i], tok)
90 | 		}
91 | 
92 | 		if cr.LineNumber() != l {
93 | 			t.Errorf("expected line %d, got %d for %s", l, cr.LineNumber(), tok)
94 | 		}
95 | 
96 | 		if cr.ColumnNumber() != c {
97 | 			t.Errorf("expected column %d, got %d for %s", c, cr.ColumnNumber(), tok)
98 | 		}
99 | 	}
100 | 
101 | 	if err := cr.Err(); err != io.EOF {
102 | 		t.Errorf("unexpected error: %s", err)
103 | 	}
104 | 
105 | 	if i != len(toks) {
106 | 		t.Errorf("expected %d, got %d", len(toks), i)
107 | 	}
108 | }
109 | 
110 | func TestCSVScanLine(t *testing.T) {
111 | 	table := [][]string{
112 | 		{"name", "gender", "state"},
113 | 		{"Joe", "M", "GA"},
114 | 		{"Sue", "F", "NJ"},
115 | 		{"Bob", "M", "NY"},
116 | 		{"Bill", "M", ""},
117 | 	}
118 | 
119 | 	buf := bytes.NewBuffer(tableToCSV(table))
120 | 
121 | 	cr := DefaultCSVReader(buf)
122 | 
123 | 	var (
124 | 		i   int
125 | 		err error
126 | 		row = make([]string, 3)
127 | 	)
128 | 
129 | 	for {
130 | 		err = cr.ScanLine(row)
131 | 
132 | 		if err == io.EOF {
133 | 			break
134 | 		}
135 | 
136 | 		if err != nil {
137 | 			t.Errorf("%d: unexpected error: %s", i, err)
138 | 		}
139 | 
140 | 		if cr.LineNumber() != i+1 {
141 | 			t.Errorf("%d: got wrong line number %d", i, cr.LineNumber())
142 | 		}
143 | 
144 | 		if !compareRows(table[i], row) {
145 | 			t.Errorf("%d: wrong row, got %v", i, row)
146 | 		}
147 | 
148 | 		i++
149 | 	}
150 | 
151 | 	if i != 5 {
152 | 		t.Errorf("scanned wrong number of lines %d", i)
153 | 	}
154 | }
155 | 
156 | func TestCSVInput(t *testing.T) {
157 | 	rows := []string{
158 | 		`"name","gender",state`,
159 | 		`Joe,"M",GA`,
160 | 		`"Sue","""F""",NJ`,
161 | 		`Bob,M,NY`,
162 | 	}
163 | 
164 | 	buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n")))
165 | 	cr := DefaultCSVReader(buf)
166 | 
167 | 	var (
168 | 		err error
169 | 		row = make([]string, 3)
170 | 	)
171 | 
172 | 	for {
173 | 		err = cr.ScanLine(row)
174 | 
175 | 		if err == io.EOF {
176 | 			break
177 | 		}
178 | 
179 | 		if err != nil {
180 | 			t.Errorf("%d: unexpected error: %s", cr.LineNumber(), err)
181 | 		}
182 | 	}
183 | }
184 | 
185 | func TestCSVScanLineBadInput(t *testing.T) {
186 | 	rows := []string{
187 | 		`"name", "gender",state`,
188 | 		`Joe,"M", "GA"`,
189 | 		`"Sue", "F", "NJ"`,
190 | 		`"Bob",M,NY"`,
191 | 	}
192 | 
193 | 	buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n")))
194 | 	cr := DefaultCSVReader(buf)
195 | 
196 | 	var (
197 | 		i   int
198 | 		err error
199 | 		row = make([]string, 3)
200 | 	)
201 | 
202 | 	for {
203 | 		err = cr.ScanLine(row)
204 | 
205 | 		if err == io.EOF {
206 | 			break
207 | 		}
208 | 
209 | 		if cr.Line() != rows[i] {
210 | 			t.Errorf("%d: bad line `%s`", i, cr.Line())
211 | 		}
212 | 
213 | 		if err == nil {
214 | 			t.Errorf("%d: expected error", i)
215 | 		} else if cr.LineNumber() != i+1 {
216 | 			t.Errorf("%d: got wrong line number %d", i, cr.LineNumber())
217 | 		}
218 | 
219 | 		i++
220 | 	}
221 | 
222 | 	if i != 4 {
223 | 		t.Errorf("scanned wrong number of lines %d", i)
224 | 	}
225 | }
226 | 
227 | func TestCSVReaderBadInput(t *testing.T) {
228 | 	rows := []string{
229 | 		`"name","gender", state`,
230 | 		`Joe,"M", "GA"`,
231 | 		`"Sue", "F", "NJ"`,
232 | 		`"Bob",M,NY"`,
233 | 	}
234 | 
235 | 	expectedToks := []struct {
236 | 		Token  string
237 | 		Error  bool
238 | 		Line   int
239 | 		Column int
240 | 	}{
241 | 		{"name", false, 1, 1},
242 | 		{"gender", false, 1, 2},
243 | 		{" state", false, 1, 3},
244 | 		{"Joe", false, 2,
1}, 245 | {"M", false, 2, 2}, 246 | {` "GA"`, true, 2, 3}, 247 | {"Sue", false, 3, 1}, 248 | {` "F", "NJ"`, true, 3, 2}, 249 | {"Bob", false, 4, 1}, 250 | {"M", false, 4, 2}, 251 | {`NY"`, true, 4, 3}, 252 | } 253 | 254 | buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n"))) 255 | cr := DefaultCSVReader(buf) 256 | 257 | var ( 258 | err error 259 | tok string 260 | ) 261 | 262 | for i := 0; cr.Scan(); i++ { 263 | tok = cr.Text() 264 | exp := expectedToks[i] 265 | 266 | if cr.LineNumber() != exp.Line { 267 | t.Errorf("%d: expected line %d, got %d", i, exp.Line, cr.LineNumber()) 268 | } 269 | 270 | if cr.ColumnNumber() != exp.Column { 271 | t.Errorf("%d: expected column %d, got %d", i, exp.Column, cr.ColumnNumber()) 272 | } 273 | 274 | if exp.Token != tok { 275 | t.Errorf("%d: expected token `%s`, got `%s`", i, exp.Token, tok) 276 | } 277 | 278 | err = cr.Err() 279 | 280 | if err == nil && exp.Error { 281 | t.Errorf("%d: expected error", i) 282 | } else if err != nil && !exp.Error { 283 | t.Errorf("%d: unexpected error: %s", i, err) 284 | } 285 | } 286 | } 287 | 288 | func TestCSVExtraColumns(t *testing.T) { 289 | buf := bytes.NewBufferString("one,two,three,four") 290 | cr := DefaultCSVReader(buf) 291 | 292 | // 3 columns expected. 293 | toks := make([]string, 3) 294 | err := cr.ScanLine(toks) 295 | 296 | if err == nil { 297 | t.Errorf("Expected error") 298 | } else if err != csvErrExtraColumns { 299 | t.Errorf("Expected extra columns error, got %s instead", err) 300 | } 301 | } 302 | 303 | var line = `"3","\PCORI\VITAL\TOBACCO\SMOKING\","Smoked Tobacco","N","FAE",,,,"concept_cd","CONCEPT_DIMENSION","concept_path","T","like","\PCORI\VITAL\TOBACCO\SMOKING\","CDMv2","This field is new to v3.0. Indicator for any form of tobacco that is smoked.Per Meaningful Use guidance, smoking status includes any form of tobacco that is smoked, but not all tobacco use. ""Light smoker"" is interpreted to mean less than 10 cigarettes per day, or an equivalent (but less concretely defined) quantity of cigar or pipe smoke. ""Heavy smoker"" is interpreted to mean greater than 10 cigarettes per day or an equivalent (but less concretely defined) quantity of cigar or pipe smoke. ","@","2015-08-20 312:14:14.0","2015-08-20 12:14:14.0","2015-08-20 12:14:14.0","PCORNET_CDM",,,"\PCORI\VITAL\TOBACCO\","SMOKING"` + "\n" 304 | 305 | func BenchmarkCSVReaderScan(b *testing.B) { 306 | cr := DefaultCSVReader(&bytes.Buffer{}) 307 | 308 | data := []byte(line) 309 | 310 | for i := 0; i < b.N; i++ { 311 | _, data, _, _ = cr.scanField(data) 312 | 313 | if len(data) == 0 { 314 | data = []byte(line) 315 | } 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /profile/json/json.go: -------------------------------------------------------------------------------- 1 | package json 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | 10 | "github.com/chop-dbhi/sql-importer/profile" 11 | ) 12 | 13 | type analyzer struct { 14 | p profile.Profiler 15 | } 16 | 17 | func (a *analyzer) parseField(path, field string, value interface{}) { 18 | fp := fmt.Sprintf("%s%s", path, field) 19 | 20 | switch x := value.(type) { 21 | case nil: 22 | a.p.RecordType(fp, nil, profile.NullType) 23 | 24 | // Nested object. 25 | case map[string]interface{}: 26 | a.parseMap(fp+"/", x) 27 | 28 | // Array. 
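	// Each element is profiled under the same field path, so an array of
	// scalars folds into a single field whose candidate types accumulate.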
29 | case []interface{}: 30 | for _, v := range x { 31 | a.parseField(path, field, v) 32 | } 33 | 34 | case bool: 35 | a.p.RecordType(fp, x, profile.BoolType) 36 | 37 | case string: 38 | var t profile.ValueType 39 | 40 | if _, ok := profile.ParseDate(x); ok { 41 | t = profile.DateType 42 | } else if _, ok := profile.ParseDateTime(x); ok { 43 | t = profile.DateTimeType 44 | } else { 45 | t = profile.StringType 46 | } 47 | 48 | a.p.RecordType(fp, x, t) 49 | 50 | case json.Number: 51 | if v, err := x.Int64(); err == nil { 52 | a.p.RecordType(fp, v, profile.IntType) 53 | } else if v, err := x.Float64(); err == nil { 54 | a.p.RecordType(fp, v, profile.FloatType) 55 | } else { 56 | panic("could not parse JSON number") 57 | } 58 | 59 | default: 60 | panic(fmt.Sprintf("unsupported type: %#T", value)) 61 | } 62 | } 63 | 64 | // types are identified relative to the path. 65 | func (a *analyzer) parseMap(path string, m map[string]interface{}) { 66 | for k, v := range m { 67 | a.parseField(path, k, v) 68 | } 69 | } 70 | 71 | func (a *analyzer) parseLDJSON(r io.Reader) error { 72 | s := bufio.NewScanner(r) 73 | 74 | // Initialize buffer and JSON decoder. 75 | var b bytes.Buffer 76 | dec := json.NewDecoder(&b) 77 | dec.UseNumber() 78 | 79 | for s.Scan() { 80 | line := bytes.TrimSpace(s.Bytes()) 81 | if len(line) == 0 { 82 | continue 83 | } 84 | 85 | b.Reset() 86 | b.Write(line) 87 | 88 | var m map[string]interface{} 89 | if err := dec.Decode(&m); err != nil { 90 | return err 91 | } 92 | 93 | a.parseMap("", m) 94 | } 95 | 96 | return s.Err() 97 | } 98 | 99 | func (a *analyzer) parseJSON(r io.Reader) error { 100 | dec := json.NewDecoder(r) 101 | dec.UseNumber() 102 | 103 | tok, err := dec.Token() 104 | if err != nil { 105 | return err 106 | } 107 | 108 | if tok != json.Delim('[') { 109 | return fmt.Errorf("expected array, got: %v", tok) 110 | } 111 | 112 | // More elements in the array. 
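	// Decoding element by element streams the array, so the whole
	// document never has to be held in memory at once.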
113 | for dec.More() { 114 | var m map[string]interface{} 115 | if err := dec.Decode(&m); err != nil { 116 | return err 117 | } 118 | 119 | a.parseMap("", m) 120 | } 121 | 122 | return nil 123 | } 124 | 125 | func Profile(config *profile.Config, in io.Reader, format string) (*profile.Profile, error) { 126 | p := profile.NewProfiler(config) 127 | 128 | a := analyzer{ 129 | p: p, 130 | } 131 | 132 | var err error 133 | 134 | switch format { 135 | case "ldjson": 136 | err = a.parseLDJSON(in) 137 | case "json": 138 | err = a.parseJSON(in) 139 | } 140 | 141 | if err != nil { 142 | return nil, err 143 | } 144 | 145 | return p.Profile(), nil 146 | } 147 | -------------------------------------------------------------------------------- /profile/json/json_test.go: -------------------------------------------------------------------------------- 1 | package json 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestProfileJSON(t *testing.T) { 9 | b := bytes.NewBufferString(`[ 10 | {"name": "John", "color": "Blue", "dob": "1985-03-10"}, 11 | {"name": "Jane", "color": "Red"} 12 | ]`) 13 | 14 | p, err := Profile(nil, b, "json") 15 | if err != nil { 16 | t.Fatal(err) 17 | } 18 | 19 | if len(p.Fields) != 3 { 20 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 21 | } 22 | } 23 | 24 | func TestProfileLDJSON(t *testing.T) { 25 | b := bytes.NewBufferString(` 26 | {"name": "John", "color": "Blue", "dob": "1985-03-10"} 27 | {"name": "Jane", "color": "Red"} 28 | `) 29 | 30 | p, err := Profile(nil, b, "ldjson") 31 | if err != nil { 32 | t.Fatal(err) 33 | } 34 | 35 | if len(p.Fields) != 3 { 36 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /profile/parse.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "strconv" 5 | "strings" 6 | "time" 7 | ) 8 | 9 | var ( 10 | dateFormats = []string{ 11 | "2006-01-02", 12 | "2006-1-02", 13 | "01-02-2006", 14 | "01-02-06", 15 | "01/02/2006", 16 | "01/02/06", 17 | "1/2/06", 18 | } 19 | 20 | dateTimeFormats = []string{ 21 | "2006-01-02 15:04", 22 | "2006-01-02 15:04:05", 23 | "2006-01-02T15:04:05", 24 | "2006-01-02T15:04:05Z", 25 | "2006-01-02T15:04:05Z07:00", 26 | } 27 | ) 28 | 29 | func ParseBool(s string) (bool, bool) { 30 | s = strings.TrimSpace(s) 31 | 32 | b, err := strconv.ParseBool(s) 33 | if err != nil { 34 | return false, false 35 | } 36 | 37 | return b, true 38 | } 39 | 40 | func ParseDate(s string) (time.Time, bool) { 41 | s = strings.TrimSpace(s) 42 | 43 | for _, layout := range dateFormats { 44 | if v, err := time.Parse(layout, s); err == nil { 45 | return v, true 46 | } 47 | } 48 | 49 | return time.Time{}, false 50 | } 51 | 52 | func ParseDateTime(s string) (time.Time, bool) { 53 | s = strings.TrimSpace(s) 54 | 55 | for _, layout := range dateTimeFormats { 56 | if v, err := time.Parse(layout, s); err == nil { 57 | return v, true 58 | } 59 | } 60 | 61 | return time.Time{}, false 62 | } 63 | 64 | func ParseFloat(s string) (float64, bool) { 65 | f, err := strconv.ParseFloat(s, 64) 66 | if err != nil { 67 | return 0, false 68 | } 69 | return f, true 70 | } 71 | 72 | func ParseInt(s string) (int64, bool) { 73 | i, err := strconv.ParseInt(s, 10, 64) 74 | if err != nil { 75 | return 0, false 76 | } 77 | return i, true 78 | } 79 | -------------------------------------------------------------------------------- /profile/parse_test.go: 
--------------------------------------------------------------------------------
1 | package profile
2 | 
3 | import "testing"
4 | 
5 | func BenchmarkParseDateValid(b *testing.B) {
6 | 	s := "1998-10-01"
7 | 	for i := 0; i < b.N; i++ {
8 | 		ParseDate(s)
9 | 	}
10 | }
11 | 
12 | func BenchmarkParseDateInvalid(b *testing.B) {
13 | 	s := "not a date"
14 | 	for i := 0; i < b.N; i++ {
15 | 		ParseDate(s)
16 | 	}
17 | }
18 | 
19 | func BenchmarkParseDateTimeValid(b *testing.B) {
20 | 	s := "1998-10-01 01:32:10"
21 | 	for i := 0; i < b.N; i++ {
22 | 		ParseDateTime(s)
23 | 	}
24 | }
25 | 
26 | func BenchmarkParseDateTimeInvalid(b *testing.B) {
27 | 	s := "not a date time"
28 | 	for i := 0; i < b.N; i++ {
29 | 		ParseDateTime(s)
30 | 	}
31 | }
32 | 
33 | func BenchmarkParseFloatValid(b *testing.B) {
34 | 	s := "32.10219"
35 | 	for i := 0; i < b.N; i++ {
36 | 		ParseFloat(s)
37 | 	}
38 | }
39 | 
40 | func BenchmarkParseFloatInvalid(b *testing.B) {
41 | 	s := "not a number"
42 | 	for i := 0; i < b.N; i++ {
43 | 		ParseFloat(s)
44 | 	}
45 | }
46 | 
47 | func BenchmarkParseIntValid(b *testing.B) {
48 | 	s := "3210219"
49 | 	for i := 0; i < b.N; i++ {
50 | 		ParseInt(s)
51 | 	}
52 | }
53 | 
54 | func BenchmarkParseIntInvalid(b *testing.B) {
55 | 	s := "not a number"
56 | 	for i := 0; i < b.N; i++ {
57 | 		ParseInt(s)
58 | 	}
59 | }
60 | 
61 | func BenchmarkParseBoolValid(b *testing.B) {
62 | 	s := "TRUE"
63 | 	for i := 0; i < b.N; i++ {
64 | 		ParseBool(s)
65 | 	}
66 | }
67 | 
68 | func BenchmarkParseBoolInvalid(b *testing.B) {
69 | 	s := "not a bool"
70 | 	for i := 0; i < b.N; i++ {
71 | 		ParseBool(s)
72 | 	}
73 | }
74 | 
--------------------------------------------------------------------------------
/profile/profile.go:
--------------------------------------------------------------------------------
1 | package profile
2 | 
3 | // Field stores aggregation information and statistics for a field.
4 | type Field struct {
5 | 	// Name of this field.
6 | 	Name string `json:"name"`
7 | 
8 | 	// Index of the field in tabular sources.
9 | 	Index int `json:"index"`
10 | 
11 | 	// Inferred type of the field. All candidate types are in the
12 | 	// type counts array.
13 | 	Type ValueType `json:"type"`
14 | 
15 | 	// True if the field contains null values.
16 | 	Nullable bool `json:"nullable"`
17 | 
18 | 	// True if the field contains empty strings.
19 | 	Missing bool `json:"missing"`
20 | 
21 | 	// True if all values are unique.
22 | 	Unique bool `json:"unique"`
23 | 
24 | 	// If true, at least one value has been detected to have a leading zero.
25 | 	LeadingZeros bool `json:"leading_zeros"`
26 | }
27 | 
28 | type Profile struct {
29 | 	// Total number of records processed.
30 | 	RecordCount int64 `json:"record_count"`
31 | 
32 | 	// Flat set of fields that were profiled.
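	// Keys are the lowercased field names (or, for nested JSON input,
	// slash-separated field paths).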
33 | Fields map[string]*Field `json:"fields"` 34 | } 35 | 36 | func NewProfile() *Profile { 37 | return &Profile{ 38 | Fields: make(map[string]*Field), 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /profile/profile_test.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestProfilerRecord(t *testing.T) { 9 | tests := map[string]struct { 10 | Raw string 11 | Type ValueType 12 | Val interface{} 13 | }{ 14 | "string": { 15 | "bar", 16 | StringType, 17 | "bar", 18 | }, 19 | "int": { 20 | "10", 21 | IntType, 22 | int64(10), 23 | }, 24 | "float": { 25 | "1.20", 26 | FloatType, 27 | float64(1.20), 28 | }, 29 | "bool": { 30 | "true", 31 | BoolType, 32 | true, 33 | }, 34 | "date-1": { 35 | "2014-02-01", 36 | DateType, 37 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 38 | }, 39 | "date-2": { 40 | "02/01/2014", 41 | DateType, 42 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 43 | }, 44 | "date-3": { 45 | "02/01/14", 46 | DateType, 47 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 48 | }, 49 | "date-4": { 50 | "2/1/14", 51 | DateType, 52 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 53 | }, 54 | "datetime": { 55 | "2014-02-01 10:00:00", 56 | DateTimeType, 57 | time.Date(2014, time.February, 1, 10, 0, 0, 0, time.UTC), 58 | }, 59 | } 60 | 61 | p := NewProfiler(nil) 62 | 63 | for name, test := range tests { 64 | t.Run(name, func(t *testing.T) { 65 | p.Record("test", test.Raw) 66 | }) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /profile/profiler.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import "strings" 4 | 5 | // hasLeadingZeros checks if a valid integer value contains leading zeros. 6 | // This is often an indicator that this is not an integer, but an identifier. 7 | func hasLeadingZeros(s string) bool { 8 | if s == "" { 9 | return false 10 | } 11 | 12 | return s[0] == '0' 13 | } 14 | 15 | type profiler struct { 16 | Config *Config 17 | Count int64 18 | Include map[string]struct{} 19 | Exclude map[string]struct{} 20 | Fields map[string]*profilerField 21 | } 22 | 23 | // Profiler is an interface for profiling data. 24 | type Profiler interface { 25 | // Increment the record count. 26 | Incr() 27 | 28 | InitField(name string) 29 | 30 | // Record records a field-value pair to the profile of an unknown type. 31 | // The value must be encoded as a string and will be parsed in a variety 32 | // of ways to detect the type. 33 | Record(field string, raw string) 34 | 35 | // RecordType records a field-value pair with a known type. 36 | RecordType(field string, value interface{}, typ ValueType) 37 | 38 | // Profile returns the profile. 39 | Profile() *Profile 40 | } 41 | 42 | type Config struct { 43 | // Include are the fields to explicitly include. 44 | Include []string 45 | 46 | // Exclude are the fields to explicitly exclude. 47 | Exclude []string 48 | } 49 | 50 | func (p *profiler) Incr() { 51 | p.Count++ 52 | } 53 | 54 | // field returns the field profile if it should be profiled.
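// Exclusions take precedence over inclusions and matching is case-insensitive:
// with Config{Include: []string{"name"}}, field("Name") yields a profile while
// field("color") reports false.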
55 | func (p *profiler) field(n string) (*profilerField, bool) { 56 | n = strings.ToLower(n) 57 | 58 | if _, ok := p.Exclude[n]; ok { 59 | return nil, false 60 | } 61 | 62 | if len(p.Include) > 0 { 63 | if _, ok := p.Include[n]; !ok { 64 | return nil, false 65 | } 66 | } 67 | 68 | // Initialize and get field profile. 69 | f, ok := p.Fields[n] 70 | if !ok { 71 | f = newProfilerField(n) 72 | p.Fields[n] = f 73 | } 74 | 75 | return f, true 76 | } 77 | 78 | func (p *profiler) InitField(name string) { 79 | p.field(name) 80 | } 81 | 82 | func (p *profiler) Profile() *Profile { 83 | r := NewProfile() 84 | r.RecordCount = p.Count 85 | 86 | for k, f := range p.Fields { 87 | r.Fields[k] = f.Field() 88 | } 89 | 90 | return r 91 | } 92 | 93 | func (p *profiler) Record(n string, v string) { 94 | f, ok := p.field(n) 95 | if !ok { 96 | return 97 | } 98 | 99 | // Still in the unique state. 100 | if f.Unique { 101 | // Duplicate value. 102 | if _, ok := f.Values[v]; ok { 103 | f.Unique = false 104 | f.Values = nil 105 | } else { 106 | f.Values[v] = struct{}{} 107 | } 108 | } 109 | 110 | // Short circuit. Already most general type. 111 | if _, ok := f.Types[StringType]; ok { 112 | return 113 | } 114 | 115 | if _, ok := ParseInt(v); ok { 116 | if !f.LeadingZeros && hasLeadingZeros(v) { 117 | f.LeadingZeros = true 118 | } 119 | 120 | f.Types[IntType] = struct{}{} 121 | return 122 | } 123 | 124 | if _, ok := ParseFloat(v); ok { 125 | f.Types[FloatType] = struct{}{} 126 | return 127 | } 128 | 129 | if _, ok := ParseBool(v); ok { 130 | f.Types[BoolType] = struct{}{} 131 | return 132 | } 133 | 134 | if _, ok := ParseDate(v); ok { 135 | f.Types[DateType] = struct{}{} 136 | return 137 | } 138 | 139 | if _, ok := ParseDateTime(v); ok { 140 | f.Types[DateTimeType] = struct{}{} 141 | return 142 | } 143 | 144 | f.Types[StringType] = struct{}{} 145 | } 146 | 147 | func (p *profiler) RecordType(n string, v interface{}, t ValueType) { 148 | f, ok := p.field(n) 149 | if !ok { 150 | return 151 | } 152 | 153 | f.Types[t] = struct{}{} 154 | } 155 | 156 | // profilerField stores in-progress aggregation state and statistics for a field. 157 | type profilerField struct { 158 | Name string 159 | Types map[ValueType]struct{} 160 | Values map[string]struct{} 161 | Unique bool 162 | LeadingZeros bool 163 | } 164 | 165 | func (p *profilerField) Field() *Field { 166 | _, nullable := p.Types[NullType] 167 | _, missing := p.Values[""] 168 | 169 | f := Field{ 170 | Name: p.Name, 171 | Type: p.Type(), 172 | Nullable: nullable, 173 | Missing: missing, 174 | Unique: p.Unique, 175 | LeadingZeros: p.LeadingZeros, 176 | } 177 | 178 | return &f 179 | } 180 | 181 | // Type returns the most specific type this field satisfies.
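// Candidate types are folded pairwise with GeneralizeType, so a field that
// saw "1" and "2.5" holds {integer, float} and resolves to float, while any
// value with a leading zero (e.g. "007") forces string.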
182 | func (f *profilerField) Type() ValueType { 183 | if f.LeadingZeros { 184 | return StringType 185 | } 186 | 187 | var g ValueType 188 | 189 | for t := range f.Types { 190 | if g == UnknownType { 191 | g = t 192 | } else { 193 | g = GeneralizeType(t, g) 194 | } 195 | } 196 | 197 | return g 198 | } 199 | 200 | func newProfilerField(name string) *profilerField { 201 | return &profilerField{ 202 | Name: name, 203 | Types: make(map[ValueType]struct{}), 204 | Values: make(map[string]struct{}), 205 | Unique: true, 206 | } 207 | } 208 | 209 | func NewProfiler(c *Config) Profiler { 210 | if c == nil { 211 | c = &Config{} 212 | } 213 | 214 | p := &profiler{ 215 | Config: c, 216 | Fields: make(map[string]*profilerField), 217 | } 218 | 219 | if len(p.Config.Exclude) > 0 { 220 | p.Exclude = make(map[string]struct{}) 221 | 222 | for _, f := range p.Config.Exclude { 223 | p.Exclude[strings.ToLower(f)] = struct{}{} 224 | } 225 | } 226 | 227 | if len(p.Config.Include) > 0 { 228 | p.Include = make(map[string]struct{}) 229 | 230 | for _, f := range p.Config.Include { 231 | p.Include[strings.ToLower(f)] = struct{}{} 232 | } 233 | } 234 | 235 | return p 236 | } 237 | -------------------------------------------------------------------------------- /profile/types.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "encoding/json" 5 | "strings" 6 | ) 7 | 8 | const ( 9 | UnknownType ValueType = iota 10 | NullType 11 | StringType 12 | BinaryType 13 | IntType 14 | FloatType 15 | BoolType 16 | DateType 17 | DateTimeType 18 | ObjectType 19 | ) 20 | 21 | // ValueType is a type of value. 22 | type ValueType uint8 23 | 24 | func (v ValueType) String() string { 25 | switch v { 26 | case NullType: 27 | return "null" 28 | case StringType: 29 | return "string" 30 | case BinaryType: 31 | return "binary" 32 | case IntType: 33 | return "integer" 34 | case FloatType: 35 | return "float" 36 | case BoolType: 37 | return "boolean" 38 | case DateType: 39 | return "date" 40 | case DateTimeType: 41 | return "datetime" 42 | case ObjectType: 43 | return "object" 44 | } 45 | 46 | return "" 47 | } 48 | 49 | func (v ValueType) MarshalJSON() ([]byte, error) { 50 | return json.Marshal(v.String()) 51 | } 52 | 53 | func (v *ValueType) UnmarshalJSON(b []byte) error { 54 | var s string 55 | if err := json.Unmarshal(b, &s); err != nil { 56 | return err 57 | } 58 | 59 | var t ValueType 60 | 61 | switch strings.ToLower(s) { 62 | case "string": 63 | t = StringType 64 | case "null": 65 | t = NullType 66 | case "binary": 67 | t = BinaryType 68 | case "integer": 69 | t = IntType 70 | case "float": 71 | t = FloatType 72 | case "boolean": 73 | t = BoolType 74 | case "date": 75 | t = DateType 76 | case "datetime": 77 | t = DateTimeType 78 | case "object": 79 | t = ObjectType 80 | } 81 | 82 | *v = t 83 | 84 | return nil 85 | } 86 | 87 | var typeGeneralizationMap = map[[2]ValueType]ValueType{ 88 | {BoolType, IntType}: IntType, 89 | {IntType, FloatType}: FloatType, 90 | {BoolType, FloatType}: FloatType, 91 | {DateTimeType, DateType}: DateTimeType, 92 | } 93 | 94 | // GeneralizeType takes two types and returns the more general of the 95 | // two. A null type generalizes to the other type, and unrelated pairs 96 | // fall back to string, the most general type. 97 | func GeneralizeType(t1, t2 ValueType) ValueType { 98 | // Same type.
99 | if t1 == t2 { 100 | return t1 101 | } 102 | 103 | if t1 == NullType { 104 | return t2 105 | } 106 | 107 | if t2 == NullType { 108 | return t1 109 | } 110 | 111 | key := [2]ValueType{t1, t2} 112 | 113 | t, ok := typeGeneralizationMap[key] 114 | if ok { 115 | return t 116 | } 117 | 118 | // Swap order. 119 | key[0], key[1] = key[1], key[0] 120 | 121 | t, ok = typeGeneralizationMap[key] 122 | if ok { 123 | return t 124 | } 125 | 126 | // Everything can be generalized to a string. 127 | return StringType 128 | } 129 | -------------------------------------------------------------------------------- /profile/types_test.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import "testing" 4 | 5 | func assertType(t *testing.T, a, e ValueType) { 6 | if a != e { 7 | t.Errorf("expected %s, got %s", e, a) 8 | } 9 | } 10 | 11 | func TestGeneralizeType(t *testing.T) { 12 | assertType(t, GeneralizeType(IntType, FloatType), FloatType) 13 | assertType(t, GeneralizeType(IntType, BoolType), IntType) 14 | assertType(t, GeneralizeType(StringType, BoolType), StringType) 15 | assertType(t, GeneralizeType(DateTimeType, DateType), DateTimeType) 16 | } 17 | -------------------------------------------------------------------------------- /reader/reader.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "compress/bzip2" 6 | "compress/gzip" 7 | "fmt" 8 | "io" 9 | "os" 10 | "path" 11 | "path/filepath" 12 | "strings" 13 | ) 14 | 15 | var bom = []byte{0xef, 0xbb, 0xbf} 16 | 17 | // UniversalReader wraps an io.Reader to strip a leading UTF-8 BOM and replace 18 | // carriage returns with newlines. This is used with the csv.Reader so it can properly delimit lines. 19 | type UniversalReader struct { 20 | r io.Reader 21 | } 22 | 23 | func (r *UniversalReader) Read(buf []byte) (int, error) { 24 | n, err := r.r.Read(buf) 25 | 26 | // Detect and remove BOM. 27 | if bytes.HasPrefix(buf, bom) { 28 | copy(buf, buf[len(bom):]) 29 | n -= len(bom) 30 | } 31 | 32 | // Replace carriage returns with newlines 33 | for i, b := range buf { 34 | if b == '\r' { 35 | buf[i] = '\n' 36 | } 37 | } 38 | 39 | return n, err 40 | } 41 | 42 | func (r *UniversalReader) Close() error { 43 | if rc, ok := r.r.(io.Closer); ok { 44 | return rc.Close() 45 | } 46 | return nil 47 | } 48 | 49 | func NewUniversalReader(r io.Reader) *UniversalReader { 50 | return &UniversalReader{r} 51 | } 52 | 53 | // Decompress takes a compression type and a reader and returns 54 | // a reader that will be decompressed if the type is supported. 55 | func Decompress(t string, r io.Reader) (io.Reader, error) { 56 | if t == "" { 57 | return r, nil 58 | } 59 | 60 | switch t { 61 | case "gzip", "gz": 62 | gr, err := gzip.NewReader(r) 63 | if err != nil { 64 | return nil, err 65 | } 66 | return gr, nil 67 | 68 | case "bz2", "bzip2": 69 | return bzip2.NewReader(r), nil 70 | } 71 | 72 | return nil, fmt.Errorf("compression type not supported: %s", t) 73 | } 74 | 75 | // DetectType attempts to detect the file format and compression types by looking at the 76 | // file path extensions. 77 | func DetectType(url string) (string, string) { 78 | _, name := path.Split(url) 79 | 80 | // Split up extensions.
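// For "dump.csv.gz" this yields ["csv", "gz"], so a single filename can
// carry both the format and the compression type.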
81 | exts := strings.Split(name, ".")[1:] 82 | 83 | var ( 84 | compression string 85 | format string 86 | ) 87 | 88 | for _, ext := range exts { 89 | switch ext { 90 | case "gz", "gzip": 91 | compression = "gzip" 92 | 93 | case "bz2", "bzip2": 94 | compression = "bzip2" 95 | 96 | case "json": 97 | format = "json" 98 | 99 | case "csv": 100 | format = "csv" 101 | 102 | case "ldjson": 103 | format = "ldjson" 104 | } 105 | } 106 | 107 | return format, compression 108 | } 109 | 110 | func detectCompression(name string) string { 111 | switch filepath.Ext(name) { 112 | case ".gzip", ".gz": 113 | return "gzip" 114 | case ".bzip2", ".bz2": 115 | return "bzip2" 116 | } 117 | 118 | return "" 119 | } 120 | 121 | // Reader encapsulates a file or stdin stream. 122 | type Reader struct { 123 | Name string 124 | Compression string 125 | 126 | reader io.Reader 127 | file *os.File 128 | } 129 | 130 | // Read implements the io.Reader interface. 131 | func (r *Reader) Read(buf []byte) (int, error) { 132 | return r.reader.Read(buf) 133 | } 134 | 135 | // Close closes the underlying file if one was opened. 136 | func (r *Reader) Close() { 137 | if r.file != nil { 138 | r.file.Close() 139 | } 140 | } 141 | 142 | // Open a reader by name with optional compression. If no name is specified, STDIN 143 | // is used. 144 | func Open(name, compr string) (*Reader, error) { 145 | r := new(Reader) 146 | 147 | if compr == "" { 148 | compr = detectCompression(name) 149 | } 150 | 151 | // Validate compression method before working with files. 152 | switch compr { 153 | case "bzip2", "gzip", "": 154 | default: 155 | return nil, fmt.Errorf("unknown compression type %s", compr) 156 | } 157 | 158 | if name == "" { 159 | r.reader = os.Stdin 160 | } else { 161 | file, err := os.Open(name) 162 | 163 | if err != nil { 164 | return nil, err 165 | } 166 | 167 | r.file = file 168 | r.reader = file 169 | } 170 | 171 | // Apply the compression decoder. 172 | switch compr { 173 | case "gzip": 174 | reader, err := gzip.NewReader(r.reader) 175 | 176 | if err != nil { 177 | r.Close() 178 | return nil, err 179 | } 180 | 181 | r.reader = reader 182 | case "bzip2": 183 | r.reader = bzip2.NewReader(r.reader) 184 | } 185 | 186 | r.Compression = compr 187 | 188 | r.reader = &UniversalReader{r.reader} 189 | 190 | return r, nil 191 | } 192 | -------------------------------------------------------------------------------- /reader/reader_test.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestUniversalReader(t *testing.T) { 9 | s := "\xef\xbb\xbfhello world!\r" 10 | 11 | r := bytes.NewBufferString(s) 12 | ur := &UniversalReader{r} 13 | 14 | buf := make([]byte, 20) 15 | n, err := ur.Read(buf) 16 | 17 | if err != nil { 18 | t.Fatalf("problem reading: %s", err) 19 | } 20 | 21 | if cap(buf) != 20 { 22 | t.Fatalf("expected 20 cap, got %d", cap(buf)) 23 | } 24 | 25 | if len(s)-3 != n { 26 | t.Errorf("expected %d bytes, got %d", len(s)-3, n) 27 | } 28 | 29 | exp := "hello world!\n" 30 | 31 | if string(buf[:n]) != exp { 32 | t.Errorf("expected '%v', got '%v'", exp, string(buf[:n])) 33 | } 34 | } 35 | --------------------------------------------------------------------------------
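Taken together, the reader and profile packages compose into a small profiling pipeline: open a possibly compressed input, then hand it to the profiler for its format. A minimal sketch of that composition, assuming this repository's import paths and a hypothetical line-delimited JSON file named data.ldjson (the pjson alias is only for readability):

```go
package main

import (
	"fmt"
	"log"

	pjson "github.com/chop-dbhi/sql-importer/profile/json"
	"github.com/chop-dbhi/sql-importer/reader"
)

func main() {
	// Open the input; with an empty compression argument, the type is
	// detected from the file extension (none here for .ldjson).
	r, err := reader.Open("data.ldjson", "")
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	// Profile every record using the default (nil) config.
	p, err := pjson.Profile(nil, r, "ldjson")
	if err != nil {
		log.Fatal(err)
	}

	// Report the inferred schema for each field.
	for name, f := range p.Fields {
		fmt.Printf("%s: %s (nullable=%v, unique=%v)\n", name, f.Type, f.Nullable, f.Unique)
	}
}
```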