├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── cmd
│   └── sql-importer
│       └── main.go
├── import.go
├── pg.go
├── profile
│   ├── csv
│   │   ├── csv.go
│   │   ├── csv_test.go
│   │   ├── parser.go
│   │   └── parser_test.go
│   ├── json
│   │   ├── json.go
│   │   └── json_test.go
│   ├── parse.go
│   ├── parse_test.go
│   ├── profile.go
│   ├── profile_test.go
│   ├── profiler.go
│   ├── types.go
│   └── types_test.go
└── reader
    ├── reader.go
    └── reader_test.go

/.dockerignore:
--------------------------------------------------------------------------------
1 | *
2 | !dist/linux-amd64
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw?
2 | dist/
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM alpine:3.6
2 | 
3 | COPY ./dist/linux-amd64/sql-importer /
4 | 
5 | ENTRYPOINT ["/sql-importer"]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 The Children's Hospital of Philadelphia and individual contributors.
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | IMAGE_NAME := dbhi/sql-importer
2 | PROG_NAME := sql-importer
3 | CMD_PATH := "./cmd/sql-importer"
4 | 
5 | GIT_SHA := $(shell git log -1 --pretty=format:"%h" .)
6 | GIT_TAG := $(shell git describe --tags --exact-match . 2>/dev/null)
7 | GIT_BRANCH := $(shell git symbolic-ref -q --short HEAD)
8 | GIT_VERSION := $(shell git log -1 --pretty=format:"%h (%ci)" .)
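# Note: GIT_VERSION is baked into the binary via -ldflags in the build
# targets below; the SHA, branch, and tag become Docker image tags.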
9 | 10 | build: 11 | go build -ldflags "-X \"main.buildVersion=$(GIT_VERSION)\"" \ 12 | -o $(GOPATH)/bin/$(PROG_NAME) $(CMD_PATH) 13 | 14 | dist-build: 15 | mkdir -p dist 16 | 17 | gox -output="./dist/{{.OS}}-{{.Arch}}/$(PROG_NAME)" \ 18 | -ldflags "-X \"main.buildVersion=$(GIT_VERSION)\"" \ 19 | -os "windows linux darwin" \ 20 | -arch "amd64" $(CMD_PATH) > /dev/null 21 | 22 | dist-zip: 23 | cd dist && zip $(PROG_NAME)-darwin-amd64.zip darwin-amd64/* 24 | cd dist && zip $(PROG_NAME)-linux-amd64.zip linux-amd64/* 25 | cd dist && zip $(PROG_NAME)-windows-amd64.zip windows-amd64/* 26 | 27 | dist: dist-build dist-zip 28 | 29 | docker: 30 | docker build -t ${IMAGE_NAME}:${GIT_SHA} . 31 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:${GIT_BRANCH} 32 | 33 | if [ -n "${GIT_TAG}" ] ; then \ 34 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:${GIT_TAG} ; \ 35 | fi; 36 | 37 | if [ "${GIT_BRANCH}" == "master" ]; then \ 38 | docker tag ${IMAGE_NAME}:${GIT_SHA} ${IMAGE_NAME}:latest ; \ 39 | fi; 40 | 41 | docker-push: 42 | docker push ${IMAGE_NAME}:${GIT_SHA} 43 | docker push ${IMAGE_NAME}:${GIT_BRANCH} 44 | 45 | if [ -n "${GIT_TAG}" ]; then \ 46 | docker push ${IMAGE_NAME}:${GIT_TAG} ; \ 47 | fi; 48 | 49 | if [ "${GIT_BRANCH}" == "master" ]; then \ 50 | docker push ${IMAGE_NAME}:latest ; \ 51 | fi; 52 | 53 | .PHONY: build dist-build dist 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQL Importer 2 | 3 | Import a CSV file into Postgres with automatic column typing and table creation. 4 | 5 | Features: 6 | 7 | - Type inference for numbers, dates, datetimes, and booleans 8 | - Automatic table creation 9 | - Uniqueness and not null detection 10 | - Automatic decompressing of gzip and bzip2 files 11 | - Support for append instead of replace 12 | - Support for CSV files wider than 1600 columns (the Postgres limit) 13 | 14 | ## Install 15 | 16 | [Download a pre-built release](https://github.com/chop-dbhi/sql-importer/releases). 17 | 18 | 19 | Or install it from source (requires Go). 20 | 21 | ``` 22 | go get github.com/chop-dbhi/sql-importer/cmd/sql-importer 23 | ``` 24 | 25 | ## Usage 26 | 27 | Specify the database URL and a CSV file to import. The table name will be derived from the filename by default. 28 | 29 | ``` 30 | sql-importer -db postgres://127.0.0.1:5432/postgres data.csv 31 | ``` 32 | 33 | See other options by running `sql-importer -h`. 34 | 35 | ## Status 36 | 37 | Beta, works as expected. Command line options will likely change. 38 | 39 | ## License 40 | 41 | MIT 42 | -------------------------------------------------------------------------------- /cmd/sql-importer/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "log" 6 | "os" 7 | "path/filepath" 8 | "strings" 9 | "sync" 10 | 11 | "github.com/chop-dbhi/sql-importer" 12 | ) 13 | 14 | func main() { 15 | var ( 16 | dbUrl string 17 | schemaName string 18 | tableName string 19 | compressionType string 20 | 21 | csvType bool 22 | csvDelimiter string 23 | csvNoHeader bool 24 | 25 | useCstore bool 26 | appendTable bool 27 | ) 28 | 29 | flag.StringVar(&dbUrl, "db", "", "Database URL.") 30 | flag.StringVar(&schemaName, "schema", "public", "Schema name.") 31 | flag.StringVar(&tableName, "table", "", "Table name.") 32 | flag.BoolVar(&csvType, "csv", true, "CSV file. 
Required if using stdin.")
33 | 	flag.StringVar(&csvDelimiter, "csv.delim", ",", "CSV delimiter.")
34 | 	flag.BoolVar(&csvNoHeader, "csv.noheader", false, "No CSV header present.")
35 | 	flag.StringVar(&compressionType, "compression", "", "Compression used.")
36 | 	flag.BoolVar(&useCstore, "cstore", false, "Use cstore table.")
37 | 	flag.BoolVar(&appendTable, "append", false, "Append to table.")
38 | 
39 | 	flag.Parse()
40 | 	args := flag.Args()
41 | 
42 | 	if len(args) == 0 {
43 | 		log.Fatal("file name or directory required")
44 | 	}
45 | 
46 | 	inputName := args[0]
47 | 
48 | 	stat, err := os.Stat(inputName)
49 | 	if err != nil { log.Fatal(err) }
50 | 	if stat.IsDir() {
51 | 		loadDir(
52 | 			inputName,
53 | 			dbUrl,
54 | 			compressionType,
55 | 			csvDelimiter,
56 | 			appendTable,
57 | 			useCstore,
58 | 		)
59 | 	} else {
60 | 		loadFile(
61 | 			inputName,
62 | 			dbUrl,
63 | 			schemaName,
64 | 			tableName,
65 | 			compressionType,
66 | 			csvDelimiter,
67 | 			csvType,
68 | 			appendTable,
69 | 			useCstore,
70 | 			csvNoHeader,
71 | 		)
72 | 	}
73 | }
74 | 
75 | func loadFile(path, dbUrl, schemaName, tableName, compressionType, csvDelimiter string, csvType, appendTable, useCstore, csvNoHeader bool) {
76 | 	r := sqlimporter.Request{
77 | 		Path: path,
78 | 
79 | 		Database: dbUrl,
80 | 		Schema:   schemaName,
81 | 		Table:    tableName,
82 | 
83 | 		AppendTable: appendTable,
84 | 		CStore:      useCstore,
85 | 
86 | 		CSV:         csvType,
87 | 		Compression: compressionType,
88 | 
89 | 		Delimiter: csvDelimiter,
90 | 		Header:    !csvNoHeader,
91 | 	}
92 | 
93 | 	if err := sqlimporter.Import(&r); err != nil {
94 | 		log.Fatal(err)
95 | 	}
96 | }
97 | 
98 | func loadDir(rootDir, dbUrl, compressionType, csvDelimiter string, appendTable, useCstore bool) {
99 | 	wg := &sync.WaitGroup{}
100 | 
101 | 	filepath.Walk(rootDir, func(path string, info os.FileInfo, err error) error {
102 | 		if err != nil || info.IsDir() {
103 | 			return err
104 | 		}
105 | 
106 | 		rpath, _ := filepath.Rel(rootDir, path)
107 | 		dir, base := filepath.Split(rpath)
108 | 
109 | 		tableName := strings.Split(base, ".")[0]
110 | 		schemaName := strings.Replace(strings.Trim(dir, "/"), "/", "_", -1)
111 | 
112 | 		if schemaName == "" {
113 | 			schemaName = "public"
114 | 		}
115 | 
116 | 		r := sqlimporter.Request{
117 | 			Path: path,
118 | 
119 | 			Database: dbUrl,
120 | 			Schema:   schemaName,
121 | 			Table:    tableName,
122 | 
123 | 			AppendTable: appendTable,
124 | 			CStore:      useCstore,
125 | 
126 | 			CSV:         true,
127 | 			Compression: compressionType,
128 | 
129 | 			Delimiter: csvDelimiter,
130 | 			Header:    true,
131 | 		}
132 | 
133 | 		wg.Add(1)
134 | 
135 | 		go func() {
136 | 			defer wg.Done()
137 | 
138 | 			defer func() {
139 | 				if err := recover(); err != nil {
140 | 					log.Printf("error loading file: %s", rpath)
141 | 					log.Printf("%s", err)
142 | 				}
143 | 			}()
144 | 
145 | 			log.Printf(`loading file %s into table "%s"."%s"`, rpath, schemaName, tableName)
146 | 
147 | 			if err := sqlimporter.Import(&r); err != nil {
148 | 				log.Printf("error importing file: %s", err)
149 | 			}
150 | 		}()
151 | 
152 | 		return nil
153 | 	})
154 | 
155 | 	wg.Wait()
156 | 
157 | }
158 | 
--------------------------------------------------------------------------------
/import.go:
--------------------------------------------------------------------------------
1 | package sqlimporter
2 | 
3 | import (
4 | 	"database/sql"
5 | 	libcsv "encoding/csv"
6 | 	"fmt"
7 | 	"log"
8 | 	"path"
9 | 	"strings"
10 | 
11 | 	"github.com/chop-dbhi/sql-importer/profile/csv"
12 | 	"github.com/chop-dbhi/sql-importer/reader"
13 | )
14 | 
15 | type Request struct {
16 | 	// Input path.
17 | 	Path string
18 | 
19 | 	// Target database.
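	// Database is a Postgres connection string, e.g. the URL form shown
	// in the README: postgres://127.0.0.1:5432/postgres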
20 | 	Database string
21 | 	Schema   string
22 | 	Table    string
23 | 
24 | 	// Behavior
25 | 	AppendTable bool
26 | 	CStore      bool
27 | 
28 | 	// File specifics.
29 | 	CSV         bool
30 | 	Compression string
31 | 
32 | 	// CSV
33 | 	Delimiter string
34 | 	Header    bool
35 | }
36 | 
37 | func Import(r *Request) error {
38 | 	fileType, fileComp := reader.DetectType(r.Path)
39 | 
40 | 	if r.CSV || fileType == "csv" {
41 | 		r.CSV = true
42 | 	} else {
43 | 		return fmt.Errorf("file type not supported: %s", fileType)
44 | 	}
45 | 
46 | 	if r.Compression == "" {
47 | 		r.Compression = fileComp
48 | 	}
49 | 
50 | 	if r.Table == "" {
51 | 		_, base := path.Split(r.Path)
52 | 		r.Table = strings.Split(base, ".")[0]
53 | 	}
54 | 
55 | 	// Connect to database.
56 | 	db, err := sql.Open("postgres", r.Database)
57 | 	if err != nil {
58 | 		return fmt.Errorf("cannot open db connection: %s", err)
59 | 	}
60 | 	defer db.Close()
61 | 
62 | 	// Open the input stream.
63 | 	input, err := reader.Open(r.Path, r.Compression)
64 | 	if err != nil {
65 | 		return fmt.Errorf("cannot open input: %s", err)
66 | 	}
67 | 	defer input.Close()
68 | 
69 | 	cp := csv.NewProfiler(input)
70 | 	cp.Delimiter = r.Delimiter[0]
71 | 	cp.Header = r.Header
72 | 
73 | 	prof, err := cp.Profile()
74 | 	if err != nil {
75 | 		return fmt.Errorf("profile error: %s", err)
76 | 	}
77 | 
78 | 	log.Print("Done profiling")
79 | 
80 | 	input.Close()
81 | 	input, err = reader.Open(r.Path, r.Compression)
82 | 	if err != nil {
83 | 		return fmt.Errorf("cannot open input: %s", err)
84 | 	}
85 | 	defer input.Close()
86 | 
87 | 	schema := NewSchema(prof)
88 | 	if r.CStore {
89 | 		schema.Cstore = true
90 | 	}
91 | 
92 | 	// Load into the database.
93 | 	log.Printf(`Begin load into "%s"."%s"`, r.Schema, r.Table)
94 | 
95 | 	cr := libcsv.NewReader(input)
96 | 	cr.Comma = rune(r.Delimiter[0])
97 | 
98 | 	var n int64
99 | 	dbc := New(db)
100 | 	if r.AppendTable {
101 | 		n, err = dbc.Append(r.Schema, r.Table, schema, cr)
102 | 	} else {
103 | 		n, err = dbc.Replace(r.Schema, r.Table, schema, cr)
104 | 	}
105 | 	if err != nil {
106 | 		return fmt.Errorf("error loading: %s", err)
107 | 	}
108 | 
109 | 	log.Printf("Loaded %d records", n)
110 | 
111 | 	return nil
112 | }
--------------------------------------------------------------------------------
/pg.go:
--------------------------------------------------------------------------------
1 | package sqlimporter
2 | 
3 | import (
4 | 	"bytes"
5 | 	"database/sql"
6 | 	"encoding/csv"
7 | 	"errors"
8 | 	"fmt"
9 | 	"io"
10 | 	"regexp"
11 | 	"strings"
12 | 	"text/template"
13 | 
14 | 	"github.com/chop-dbhi/sql-importer/profile"
15 | 	"github.com/lib/pq"
16 | 	uuid "github.com/satori/go.uuid"
17 | )
18 | 
19 | const (
20 | 	rowIdColumn = "_row_id"
21 | 
22 | 	// Maximum number of entries in a "target list" (e.g. column list).
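	// Postgres rejects wider select lists with the error "target lists
	// can have at most 1664 entries", which bounds the stitching view
	// created for split tables below.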
23 | pgMaxTargetListSize = 1664 24 | ) 25 | 26 | var ( 27 | badChars *regexp.Regexp 28 | sepChars *regexp.Regexp 29 | 30 | sqlTmpl = template.New("sql") 31 | 32 | queryTmpls = map[string]string{ 33 | "createSchema": `create schema if not exists "{{.Schema}}"`, 34 | "createTable": `create table if not exists "{{.Schema}}"."{{.Table}}" ( {{.Columns}} )`, 35 | "createView": `create or replace view "{{.Schema}}"."{{.View}}" as select {{.Columns}} from "{{.Schema}}"."{{.Table}}" {{.Joins}}`, 36 | "createCstoreTable": `create foreign table if not exists "{{.Schema}}"."{{.Table}}" ( {{.Columns}} ) server cstore_server options (compression 'pglz')`, 37 | "dropTable": `drop table if exists "{{.Schema}}"."{{.Table}}"`, 38 | "dropView": `drop view if exists "{{.Schema}}"."{{.View}}"`, 39 | "renameTable": `alter table "{{.Schema}}"."{{.TempTable}}" rename to "{{.Table}}"`, 40 | "analyzeTable": `analyze "{{.Schema}}"."{{.Table}}"`, 41 | } 42 | 43 | // Map of profile types to SQL types. 44 | sqlTypeMap = map[profile.ValueType]string{ 45 | profile.UnknownType: "integer", 46 | profile.BoolType: "boolean", 47 | profile.StringType: "text", 48 | profile.IntType: "integer", 49 | profile.FloatType: "real", 50 | profile.DateType: "date", 51 | profile.DateTimeType: "timestamp", 52 | profile.NullType: "text", 53 | } 54 | ) 55 | 56 | func init() { 57 | // Initialize SQL statement templates. 58 | for name, tmpl := range queryTmpls { 59 | template.Must(sqlTmpl.New(name).Parse(tmpl)) 60 | } 61 | 62 | badChars = regexp.MustCompile(`[^a-z0-9_\-\.\+]+`) 63 | sepChars = regexp.MustCompile(`[\-\.\+]+`) 64 | } 65 | 66 | func splitN(l, n int) (int, int) { 67 | if n > l { 68 | return 1, 0 69 | } 70 | 71 | // Parts. 72 | p := l / n 73 | 74 | // Remainder. 75 | r := l % n 76 | 77 | return p, r 78 | } 79 | 80 | func splitColumns(columns []string, n int) [][]string { 81 | l := len(columns) 82 | if n >= l { 83 | return [][]string{columns} 84 | } 85 | 86 | // Split columns. 87 | p, r := splitN(l, n) 88 | 89 | var hi, low int 90 | var colparts [][]string 91 | 92 | for i := 0; i < p; i++ { 93 | low = i * n 94 | hi = low + n 95 | var cp []string 96 | cp = append(cp, columns[low:hi]...) 97 | colparts = append(colparts, cp) 98 | } 99 | 100 | // Remainder, add another part. 101 | if r > 0 { 102 | var cp []string 103 | cp = append(cp, columns[hi:]...) 104 | colparts = append(colparts, cp) 105 | } 106 | 107 | return colparts 108 | } 109 | 110 | type Schema struct { 111 | Cstore bool 112 | Fields []*Field 113 | } 114 | 115 | func NewSchema(p *profile.Profile) *Schema { 116 | fields := make([]*Field, len(p.Fields)) 117 | 118 | for n, f := range p.Fields { 119 | fields[f.Index] = &Field{ 120 | Name: n, 121 | Type: sqlTypeMap[f.Type], 122 | Unique: f.Unique, 123 | Nullable: f.Nullable || f.Missing, 124 | } 125 | } 126 | 127 | return &Schema{ 128 | Fields: fields, 129 | } 130 | } 131 | 132 | // Field is a data definition on a schema. 133 | type Field struct { 134 | Name string 135 | Type string 136 | Multiple bool 137 | Unique bool 138 | Nullable bool 139 | } 140 | 141 | type tableData struct { 142 | Schema string 143 | TempTable string 144 | Table string 145 | View string 146 | Columns string 147 | Joins string 148 | } 149 | 150 | // TODO: fuzz test this. 151 | func cleanFieldName(n string) string { 152 | n = strings.ToLower(n) 153 | n = badChars.ReplaceAllString(n, "_") 154 | return sepChars.ReplaceAllString(n, "_") 155 | } 156 | 157 | type Client struct { 158 | db *sql.DB 159 | } 160 | 161 | // execTx calls a function within a transaction. 
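// The transaction is committed if fn returns nil and rolled back otherwise.
// Illustrative use (not from the original source):
//
//	err := c.execTx(func(tx *sql.Tx) error {
//		_, err := tx.Exec(`analyze "public"."events"`)
//		return err
//	})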
162 | func (c *Client) execTx(fn func(tx *sql.Tx) error) error { 163 | tx, err := c.db.Begin() 164 | if err != nil { 165 | return err 166 | } 167 | 168 | if err := fn(tx); err != nil { 169 | tx.Rollback() 170 | return err 171 | } 172 | 173 | return tx.Commit() 174 | } 175 | 176 | func (c *Client) Replace(schemaName, tableName string, tableSchema *Schema, cr *csv.Reader) (int64, error) { 177 | tempTableNameUid, _ := uuid.NewV4() 178 | tempTableName := tempTableNameUid.String() 179 | defer c.dropTable(schemaName, tempTableName) 180 | 181 | if err := c.createSchema(schemaName); err != nil { 182 | return 0, err 183 | } 184 | 185 | splits, err := c.createTable(schemaName, tempTableName, tableSchema) 186 | if err != nil { 187 | return 0, err 188 | } 189 | 190 | n, err := c.copyData(schemaName, tempTableName, splits, cr) 191 | if err != nil { 192 | return 0, err 193 | } 194 | 195 | c.dropView(schemaName, tableName) 196 | c.dropTable(schemaName, tableName) 197 | 198 | if err := c.renameTable(schemaName, tempTableName, tableName, len(splits)); err != nil { 199 | return n, err 200 | } 201 | 202 | // Create a view if necessary and possible. 203 | if len(splits) > 1 && len(tableSchema.Fields)+len(splits) <= pgMaxTargetListSize { 204 | if err := c.createView(schemaName, tableName, tableName, splits); err != nil { 205 | return n, err 206 | } 207 | } 208 | 209 | return n, c.analyzeTable(schemaName, tableName, splits) 210 | } 211 | 212 | func (c *Client) Append(schemaName, tableName string, tableSchema *Schema, cr *csv.Reader) (int64, error) { 213 | if err := c.createSchema(schemaName); err != nil { 214 | return 0, err 215 | } 216 | 217 | splits, err := c.createTable(schemaName, tableName, tableSchema) 218 | if err != nil { 219 | return 0, err 220 | } 221 | 222 | n, err := c.copyData(schemaName, tableName, splits, cr) 223 | if err != nil { 224 | return 0, err 225 | } 226 | 227 | return n, c.analyzeTable(schemaName, tableName, splits) 228 | } 229 | 230 | func (c *Client) dropView(schemaName, viewName string) error { 231 | // Create the set of statements to 232 | data := &tableData{ 233 | Schema: schemaName, 234 | View: viewName, 235 | } 236 | 237 | var b bytes.Buffer 238 | if err := sqlTmpl.ExecuteTemplate(&b, "dropView", data); err != nil { 239 | return err 240 | } 241 | 242 | return c.execTx(func(tx *sql.Tx) error { 243 | sql := b.String() 244 | _, err := tx.Exec(sql) 245 | if err != nil { 246 | return fmt.Errorf("error dropping view: %s\n%s", err, sql) 247 | } 248 | 249 | return nil 250 | }) 251 | } 252 | 253 | func (c *Client) dropTable(schemaName, tableName string) error { 254 | // Create the set of statements to 255 | data := &tableData{ 256 | Schema: schemaName, 257 | Table: tableName, 258 | } 259 | 260 | var b bytes.Buffer 261 | if err := sqlTmpl.ExecuteTemplate(&b, "dropTable", data); err != nil { 262 | return err 263 | } 264 | 265 | return c.execTx(func(tx *sql.Tx) error { 266 | sql := b.String() 267 | _, err := tx.Exec(sql) 268 | if err != nil { 269 | return fmt.Errorf("error dropping table: %s\n%s", err, sql) 270 | } 271 | 272 | return nil 273 | }) 274 | } 275 | 276 | func (c *Client) createSchema(schemaName string) error { 277 | // Create the set of statements to 278 | data := &tableData{ 279 | Schema: schemaName, 280 | } 281 | 282 | var b bytes.Buffer 283 | if err := sqlTmpl.ExecuteTemplate(&b, "createSchema", data); err != nil { 284 | return err 285 | } 286 | 287 | return c.execTx(func(tx *sql.Tx) error { 288 | sql := b.String() 289 | _, err := tx.Exec(sql) 290 | if err != nil { 291 | return 
fmt.Errorf("error creating schema: %s\n%s", err, sql) 292 | } 293 | 294 | return nil 295 | }) 296 | } 297 | 298 | func (c *Client) createView(schemaName, viewName string, tableName string, tableColumns [][]string) error { 299 | var ( 300 | firstTable string 301 | rightTable string 302 | leftTable string 303 | selectColumns []string 304 | joins []string 305 | ) 306 | 307 | for i, cols := range tableColumns { 308 | rightTable = fmt.Sprintf("%s_%d", tableName, i) 309 | 310 | if firstTable == "" { 311 | firstTable = rightTable 312 | } 313 | 314 | // Add columns to select statement. 315 | for _, col := range cols { 316 | selectColumns = append(selectColumns, fmt.Sprintf(`"%s"."%s"."%s"`, schemaName, rightTable, col)) 317 | } 318 | 319 | if leftTable != "" { 320 | joins = append(joins, fmt.Sprintf(`inner join "%s"."%s" on ("%s"."%s"."%s" = "%s"."%s"."%s")`, schemaName, rightTable, schemaName, leftTable, rowIdColumn, schemaName, rightTable, rowIdColumn)) 321 | } 322 | 323 | leftTable = rightTable 324 | } 325 | 326 | data := &tableData{ 327 | Table: firstTable, 328 | View: viewName, 329 | Schema: schemaName, 330 | Columns: strings.Join(selectColumns, ", "), 331 | Joins: strings.Join(joins, " "), 332 | } 333 | 334 | var b bytes.Buffer 335 | if err := sqlTmpl.ExecuteTemplate(&b, "createView", data); err != nil { 336 | return err 337 | } 338 | 339 | return c.execTx(func(tx *sql.Tx) error { 340 | sql := b.String() 341 | _, err := tx.Exec(sql) 342 | if err != nil { 343 | return fmt.Errorf("error creating view: %s\n%s", err, sql) 344 | } 345 | 346 | return nil 347 | }) 348 | } 349 | 350 | func (c *Client) createTable(schemaName, tableName string, tableSchema *Schema) ([][]string, error) { 351 | var ( 352 | columns []string 353 | columnSchemas []string 354 | ) 355 | 356 | for _, f := range tableSchema.Fields { 357 | // Cleaned column name. 358 | name := cleanFieldName(f.Name) 359 | columns = append(columns, name) 360 | 361 | var col string 362 | 363 | // Create index. 364 | // TODO: long text values cannot be indexed. 365 | // https://dba.stackexchange.com/questions/25138/index-max-row-size-error. 366 | // Should this check the max value length? 367 | if f.Unique && f.Type != "text" { 368 | col = "%s %s unique" 369 | } else if !f.Nullable { 370 | col = "%s %s not null" 371 | } else { 372 | col = "%s %s" 373 | } 374 | 375 | columnSchemas = append(columnSchemas, fmt.Sprintf(col, pq.QuoteIdentifier(name), f.Type)) 376 | } 377 | 378 | // 250 - 1600 is max number of columns allowed per table, but this depends 379 | // on the data types used. this strategy simply attempts to create the widest 380 | // table it can. 381 | partSizes := []int{ 382 | 1299, 383 | 249, // max for certain types 384 | } 385 | 386 | for _, size := range partSizes { 387 | columnSplits := splitColumns(columns, size) 388 | columnSchemaSplits := splitColumns(columnSchemas, size) 389 | 390 | err := c.createTableSplits(schemaName, tableName, columnSchemaSplits, tableSchema.Cstore) 391 | 392 | // Success. 393 | if err == nil { 394 | return columnSplits, nil 395 | } 396 | 397 | if !strings.Contains(err.Error(), "pq: tables can have at most 1600 columns") { 398 | return nil, err 399 | } 400 | } 401 | 402 | return nil, errors.New("failed to partition columns") 403 | } 404 | 405 | func (c *Client) createTableSplits(schemaName, tableName string, splitColumns [][]string, cstore bool) error { 406 | // All columns fit in the table. 
407 | 	if len(splitColumns) == 1 {
408 | 		return c.execTx(func(tx *sql.Tx) error {
409 | 			return c.createSingleTable(tx, schemaName, tableName, splitColumns[0], cstore)
410 | 		})
411 | 	}
412 | 
413 | 	return c.execTx(func(tx *sql.Tx) error {
414 | 		var partTables []string
415 | 
416 | 		// Multiple tables, so we need to add the rowIdColumn.
417 | 		// A suffix is added to each table name. Then a view is created
418 | 		// to join the tables together.
419 | 		for i, cols := range splitColumns {
420 | 			partTableName := fmt.Sprintf("%s_%d", tableName, i)
421 | 
422 | 			ncols := []string{
423 | 				rowIdColumn + " integer not null unique",
424 | 			}
425 | 			ncols = append(ncols, cols...)
426 | 
427 | 			// TODO: clean up partially created tables?
428 | 			if err := c.createSingleTable(tx, schemaName, partTableName, ncols, cstore); err != nil {
429 | 				return err
430 | 			}
431 | 
432 | 			partTables = append(partTables, partTableName)
433 | 		}
434 | 
435 | 		return nil
436 | 	})
437 | }
438 | 
439 | func (c *Client) createSingleTable(tx *sql.Tx, schemaName, tableName string, columns []string, cstore bool) error {
440 | 	// Create the set of statements to
441 | 	data := &tableData{
442 | 		Schema:  schemaName,
443 | 		Table:   tableName,
444 | 		Columns: strings.Join(columns, ","),
445 | 	}
446 | 
447 | 	tmplName := "createTable"
448 | 	if cstore {
449 | 		tmplName = "createCstoreTable"
450 | 	}
451 | 
452 | 	var b bytes.Buffer
453 | 	if err := sqlTmpl.ExecuteTemplate(&b, tmplName, data); err != nil {
454 | 		return err
455 | 	}
456 | 
457 | 	sql := b.String()
458 | 	_, err := tx.Exec(sql)
459 | 	if err != nil {
460 | 		return fmt.Errorf("error creating table: %s\n%s", err, sql)
461 | 	}
462 | 	return err
463 | }
464 | 
465 | func (c *Client) renameSingleTable(tx *sql.Tx, schemaName, tempTableName, tableName string) error {
466 | 	var b bytes.Buffer
467 | 
468 | 	// Create the set of statements to
469 | 	data := &tableData{
470 | 		Schema:    schemaName,
471 | 		TempTable: tempTableName,
472 | 		Table:     tableName,
473 | 	}
474 | 
475 | 	tmpls := []string{
476 | 		"dropTable",
477 | 		"renameTable",
478 | 	}
479 | 
480 | 	for _, name := range tmpls {
481 | 		b.Reset()
482 | 		if err := sqlTmpl.ExecuteTemplate(&b, name, data); err != nil {
483 | 			return err
484 | 		}
485 | 
486 | 		if _, err := tx.Exec(b.String()); err != nil {
487 | 			return fmt.Errorf("error renaming table: %s", err)
488 | 		}
489 | 	}
490 | 
491 | 	return nil
492 | }
493 | 
494 | func (c *Client) renameTable(schemaName, tempTableName, tableName string, tableParts int) error {
495 | 	if tableParts == 1 {
496 | 		return c.execTx(func(tx *sql.Tx) error {
497 | 			return c.renameSingleTable(tx, schemaName, tempTableName, tableName)
498 | 		})
499 | 	}
500 | 
501 | 	return c.execTx(func(tx *sql.Tx) error {
502 | 		for i := 0; i < tableParts; i++ {
503 | 			if err := c.renameSingleTable(tx, schemaName, fmt.Sprintf("%s_%d", tempTableName, i), fmt.Sprintf("%s_%d", tableName, i)); err != nil {
504 | 				return err
505 | 			}
506 | 		}
507 | 		return nil
508 | 	})
509 | }
510 | 
511 | func (c *Client) analyzeTable(schemaName, tableName string, tableColumns [][]string) error {
512 | 	if len(tableColumns) == 1 {
513 | 		return c.execTx(func(tx *sql.Tx) error {
514 | 			return c.analyzeSingleTable(tx, schemaName, tableName)
515 | 		})
516 | 	}
517 | 
518 | 	return c.execTx(func(tx *sql.Tx) error {
519 | 		for i := range tableColumns {
520 | 			if err := c.analyzeSingleTable(tx, schemaName, fmt.Sprintf("%s_%d", tableName, i)); err != nil {
521 | 				return err
522 | 			}
523 | 		}
524 | 
525 | 		return nil
526 | 	})
527 | }
528 | 
529 | func (c *Client) analyzeSingleTable(tx *sql.Tx, schemaName, tableName string) error {
530 | 	// Create the set of statements to
531 | 	data := &tableData{
532 | 		Schema: schemaName,
533 | 		Table:  tableName,
534 | 	}
535 | 
536 | 	var b bytes.Buffer
537 | 	if err := sqlTmpl.ExecuteTemplate(&b, "analyzeTable", data); err != nil {
538 | 		return err
539 | 	}
540 | 
541 | 	sql := b.String()
542 | 	if _, err := tx.Exec(sql); err != nil {
543 | 		return fmt.Errorf("error analyzing table: %s\n%s", err, sql)
544 | 	}
545 | 
546 | 	return nil
547 | }
548 | 
549 | func (c *Client) copyData(schemaName, tableName string, tableColumns [][]string, cr *csv.Reader) (int64, error) {
550 | 	// Read and skip columns.
551 | 	_, err := cr.Read()
552 | 	if err != nil {
553 | 		return 0, err
554 | 	}
555 | 
556 | 	singleTable := len(tableColumns) == 1
557 | 	singleTableSize := len(tableColumns[0])
558 | 
559 | 	txs := make([]*sql.Tx, len(tableColumns))
560 | 	stmts := make([]*sql.Stmt, len(tableColumns))
561 | 
562 | 	defer func() {
563 | 		for i := range txs {
564 | 			stmts[i].Close()
565 | 			txs[i].Rollback()
566 | 		}
567 | 	}()
568 | 
569 | 	for i, cols := range tableColumns {
570 | 		tx, err := c.db.Begin()
571 | 		if err != nil {
572 | 			return 0, err
573 | 		}
574 | 
575 | 		txs[i] = tx
576 | 
577 | 		targetTable := tableName
578 | 		if !singleTable {
579 | 			cols = append([]string{rowIdColumn}, cols...)
580 | 			targetTable = fmt.Sprintf("%s_%d", tableName, i)
581 | 		}
582 | 
583 | 		stmt, err := tx.Prepare(pq.CopyInSchema(schemaName, targetTable, cols...))
584 | 		if err != nil {
585 | 			return 0, fmt.Errorf("error preparing copy: %s", err)
586 | 		}
587 | 
588 | 		stmts[i] = stmt
589 | 	}
590 | 
591 | 	// Allocate buffer. Max width + 1 for row id.
592 | 	// The actual bounds will need to be maintained.
593 | 	cargs := make([]interface{}, len(tableColumns[0])+1)
594 | 
595 | 	var (
596 | 		n     int64
597 | 		rowid int64
598 | 	)
599 | 
600 | 	// Buffer records for COPY statement.
601 | 	for {
602 | 		row, err := cr.Read()
603 | 		if err == io.EOF {
604 | 			break
605 | 		}
606 | 
607 | 		if err != nil {
608 | 			return 0, fmt.Errorf("error reading record: %s", err)
609 | 		}
610 | 
611 | 		rowid++
612 | 
613 | 		if singleTable {
614 | 			for i, v := range row {
615 | 				if v == "" {
616 | 					cargs[i] = nil
617 | 				} else {
618 | 					cargs[i] = v
619 | 				}
620 | 			}
621 | 
622 | 			_, err = stmts[0].Exec(cargs[:singleTableSize]...)
623 | 			if err != nil {
624 | 				return 0, fmt.Errorf("error sending row: %s", err)
625 | 			}
626 | 		} else {
627 | 			var low, hi int
628 | 
629 | 			for i, cols := range tableColumns {
630 | 				hi = low + len(cols)
631 | 
632 | 				cargs[0] = rowid
633 | 
634 | 				for j, v := range row[low:hi] {
635 | 					if v == "" {
636 | 						cargs[j+1] = nil
637 | 					} else {
638 | 						cargs[j+1] = v
639 | 					}
640 | 				}
641 | 
642 | 				low = hi
643 | 
644 | 				_, err = stmts[i].Exec(cargs[:len(cols)+1]...)
645 | 				if err != nil {
646 | 					return 0, fmt.Errorf("error sending row: %s: %v, %v", err, cols, cargs[:len(cols)+1])
647 | 				}
648 | 			}
649 | 		}
650 | 
651 | 		n++
652 | 	}
653 | 
654 | 	// Empty exec to flush the buffer.
655 | 	for _, stmt := range stmts {
656 | 		_, err = stmt.Exec()
657 | 		if err != nil {
658 | 			return 0, fmt.Errorf("error executing copy: %s", err)
659 | 		}
660 | 	}
661 | 
662 | 	if err != nil {
663 | 		return 0, err
664 | 	}
665 | 
666 | 	// Commit transactions.
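	// Each part table was loaded in its own transaction; committing only
	// after every COPY buffer has been flushed above means a failure leaves
	// no partially loaded parts (the deferred Rollback is then a no-op).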
667 | for _, tx := range txs { 668 | if err := tx.Commit(); err != nil { 669 | return 0, err 670 | } 671 | } 672 | 673 | return n, nil 674 | } 675 | 676 | func New(db *sql.DB) *Client { 677 | return &Client{ 678 | db: db, 679 | } 680 | } 681 | -------------------------------------------------------------------------------- /profile/csv/csv.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "strings" 7 | 8 | "github.com/chop-dbhi/sql-importer/profile" 9 | ) 10 | 11 | type Profiler struct { 12 | Config *profile.Config 13 | Delimiter byte 14 | Header bool 15 | 16 | in io.Reader 17 | } 18 | 19 | func (x *Profiler) Profile() (*profile.Profile, error) { 20 | p := profile.NewProfiler(x.Config) 21 | cr := NewCSVReader(x.in, x.Delimiter) 22 | 23 | // First record, may be the header. 24 | record, err := cr.Read() 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | header := make([]string, len(record)) 30 | if x.Header { 31 | for i, n := range record { 32 | header[i] = strings.ToLower(n) 33 | } 34 | } else { 35 | for i, _ := range record { 36 | header[i] = fmt.Sprintf("c%d", i) 37 | } 38 | } 39 | 40 | for _, c := range header { 41 | p.InitField(c) 42 | } 43 | 44 | // Profile first record. 45 | if !x.Header { 46 | for i, field := range header { 47 | val := record[i] 48 | 49 | // Treat empty strings as a null value. 50 | if val == "" { 51 | p.RecordType(field, nil, profile.NullType) 52 | } else { 53 | p.Record(field, val) 54 | } 55 | } 56 | 57 | p.Incr() 58 | } 59 | 60 | // Continue with remaining records. 61 | for { 62 | err := cr.ScanLine(record) 63 | if err == io.EOF { 64 | break 65 | } 66 | 67 | if err != nil { 68 | return nil, err 69 | } 70 | 71 | for i, field := range header { 72 | val := record[i] 73 | 74 | // Treat empty strings as a null value. 75 | if val == "" { 76 | p.RecordType(field, nil, profile.NullType) 77 | } else { 78 | p.Record(field, val) 79 | } 80 | } 81 | 82 | p.Incr() 83 | } 84 | 85 | pf := p.Profile() 86 | 87 | // Set the index of the field. 88 | for idx, name := range header { 89 | pf.Fields[name].Index = idx 90 | } 91 | 92 | return pf, nil 93 | } 94 | 95 | func NewProfiler(r io.Reader) *Profiler { 96 | return &Profiler{ 97 | Delimiter: ',', 98 | Header: true, 99 | in: r, 100 | } 101 | } 102 | -------------------------------------------------------------------------------- /profile/csv/csv_test.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | 7 | "github.com/chop-dbhi/sql-importer/profile" 8 | ) 9 | 10 | func TestProfiler(t *testing.T) { 11 | b := bytes.NewBufferString(`name,color,dob 12 | John,Blue,03/11/2013 13 | Jane,Red,2008-2-24 14 | Joe,,2010-02-11 15 | `) 16 | 17 | pr := NewProfiler(b) 18 | p, err := pr.Profile() 19 | if err != nil { 20 | t.Fatal(err) 21 | } 22 | 23 | if len(p.Fields) != 3 { 24 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 25 | } 26 | 27 | if p.Fields["dob"].Type != profile.DateType { 28 | t.Errorf("expected date type, got %s", p.Fields["dob"].Type) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /profile/csv/parser.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bufio" 5 | "errors" 6 | "io" 7 | ) 8 | 9 | const ( 10 | // 8 times default scanner buffer size. 
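	// bufio.Scanner's default maximum token size is 64 * 1024 bytes
	// (bufio.MaxScanTokenSize); lines longer than scanBufSize cause the
	// scanner to fail with bufio.ErrTooLong.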
11 | 	scanBufSize = 8 * 64 * 1024
12 | )
13 | 
14 | var (
15 | 	csvErrUnquotedField     = errors.New("unquoted field")
16 | 	csvErrUnescapedQuote    = errors.New("bare quote")
17 | 	csvErrUnterminatedField = errors.New("unterminated field")
18 | 	csvErrExtraColumns      = errors.New("extra columns")
19 | )
20 | 
21 | func clearRow(row []string) {
22 | 	for i, _ := range row {
23 | 		row[i] = ""
24 | 	}
25 | }
26 | 
27 | // CSVReader provides an interface for reading CSV data
28 | // (compatible with rfc4180 and extended with the option of having a separator other than ",").
29 | // Successive calls to the Scan method will step through the 'fields', skipping the separator/newline between the fields.
30 | // The EndOfRecord method tells when a field is terminated by a line break.
31 | type CSVReader struct {
32 | 	sc *bufio.Scanner
33 | 
34 | 	// If true, the scanner will continue scanning if field-level errors are
35 | 	// encountered. The error should be checked after each call to Scan to
36 | 	// handle the error.
37 | 	ContinueOnError bool
38 | 
39 | 	sep    byte // values separator
40 | 	eor    bool // true when the most recent field has been terminated by a newline (not a separator).
41 | 	lineno int  // current line number (not record number)
42 | 	column int  // current column index 1-based
43 | 
44 | 	eof bool
45 | 	// Error. Only set if an error occurred while scanning a field.
46 | 	err error
47 | 
48 | 	// Full line, last valid column value, remaining data in the line.
49 | 	line  string
50 | 	token []byte
51 | 	data  []byte
52 | 
53 | 	trail bool
54 | }
55 | 
56 | // DefaultCSVReader creates a "standard" CSV reader.
57 | func DefaultCSVReader(rd io.Reader) *CSVReader {
58 | 	return NewCSVReader(rd, ',')
59 | }
60 | 
61 | // NewCSVReader returns a new CSV scanner.
62 | func NewCSVReader(r io.Reader, sep byte) *CSVReader {
63 | 	s := &CSVReader{
64 | 		ContinueOnError: true,
65 | 
66 | 		// Defaults to splitting by line.
67 | 		sc:  bufio.NewScanner(r),
68 | 		sep: sep,
69 | 		eor: true,
70 | 	}
71 | 
72 | 	s.sc.Buffer(nil, scanBufSize)
73 | 
74 | 	return s
75 | }
76 | 
77 | // Line returns the current line as a string.
78 | func (s *CSVReader) Line() string {
79 | 	return s.line
80 | }
81 | 
82 | // Text returns the text of the current field.
83 | func (s *CSVReader) Text() string {
84 | 	return string(s.token)
85 | }
86 | 
87 | // LineNumber returns current line number.
88 | func (s *CSVReader) LineNumber() int {
89 | 	return s.lineno
90 | }
91 | 
92 | // ColumnNumber returns the column index of the current field.
93 | func (s *CSVReader) ColumnNumber() int {
94 | 	return s.column
95 | }
96 | 
97 | // EndOfRecord returns true when the most recent field has been terminated by a newline (not a separator).
98 | func (s *CSVReader) EndOfRecord() bool {
99 | 	return s.eor
100 | }
101 | 
102 | // Err returns an error if one occurred during scanning.
103 | func (s *CSVReader) Err() error {
104 | 	if err := s.sc.Err(); err != nil {
105 | 		return err
106 | 	}
107 | 
108 | 	if s.err != nil {
109 | 		return s.err
110 | 	}
111 | 
112 | 	if s.eof {
113 | 		return io.EOF
114 | 	}
115 | 
116 | 	return nil
117 | }
118 | 
119 | // Read scans all fields in one line and builds a slice of values.
120 | func (s *CSVReader) Read() ([]string, error) {
121 | 	var (
122 | 		err error
123 | 		r   []string
124 | 	)
125 | 
126 | 	for s.Scan() {
127 | 		if err = s.Err(); err != nil {
128 | 			return nil, err
129 | 		}
130 | 
131 | 		r = append(r, s.Text())
132 | 
133 | 		if s.EndOfRecord() {
134 | 			break
135 | 		}
136 | 	}
137 | 
138 | 	return r, s.Err()
139 | }
140 | 
141 | // ScanLine scans all fields in one line and puts the values in
142 | // the passed slice.
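// If the line contains more fields than len(r) allows, csvErrExtraColumns
// is returned. Illustrative use (not from the original source):
//
//	row := make([]string, 3)
//	for {
//		if err := cr.ScanLine(row); err == io.EOF {
//			break
//		} else if err != nil {
//			// handle the parse error for this line
//		}
//	}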
143 | func (s *CSVReader) ScanLine(r []string) error {
144 | 	var (
145 | 		err error
146 | 		max = len(r)
147 | 	)
148 | 
149 | 	for i := 0; s.Scan(); i++ {
150 | 		// More fields than the target slice can hold.
151 | 		if i == max {
152 | 			return csvErrExtraColumns
153 | 		}
154 | 
155 | 		if err = s.Err(); err != nil {
156 | 			clearRow(r[i:])
157 | 			return err
158 | 		}
159 | 
160 | 		r[i] = s.Text()
161 | 
162 | 		if s.EndOfRecord() {
163 | 			break
164 | 		}
165 | 	}
166 | 
167 | 	return s.Err()
168 | }
169 | 
170 | func (s *CSVReader) Scan() bool {
171 | 	// Error.
172 | 	if s.err != nil && !s.ContinueOnError {
173 | 		return false
174 | 	}
175 | 
176 | 	// EOF
177 | 	if s.eof && len(s.data) == 0 {
178 | 		return false
179 | 	}
180 | 
181 | 	// If the end of the record has been reached, scan for the next line.
182 | 	if s.eor {
183 | 		// Clear.
184 | 		s.line = ""
185 | 		s.data = nil
186 | 		s.token = nil
187 | 
188 | 		// Scan until there is a non-empty line to parse.
189 | 		for {
190 | 			if !s.sc.Scan() {
191 | 				// If there was an error, return. Otherwise mark as EOF.
192 | 				if err := s.sc.Err(); err != nil {
193 | 					return false
194 | 				}
195 | 
196 | 				s.eof = true
197 | 				break
198 | 			}
199 | 
200 | 			// Set the current line. Add the new line to parsing.
201 | 			s.line = s.sc.Text()
202 | 
203 | 			// Skip empty lines.
204 | 			if s.line != "" {
205 | 				s.data = s.sc.Bytes()
206 | 				break
207 | 			}
208 | 		}
209 | 	}
210 | 
211 | 	adv, token, trail, err := s.scanField(s.data)
212 | 
213 | 	// Advance the section of the line for the next field.
214 | 	s.data = s.data[adv:]
215 | 	s.err = err
216 | 
217 | 	if trail && len(s.data) == 0 {
218 | 		s.trail = trail
219 | 	}
220 | 
221 | 	// Set the token if no error occurred otherwise mark as the end of record.
222 | 	if err == nil {
223 | 		s.token = token
224 | 	} else {
225 | 		if s.ContinueOnError {
226 | 			s.token = s.data
227 | 			s.eor = true
228 | 		} else {
229 | 			return false
230 | 		}
231 | 	}
232 | 
233 | 	if !s.trail && s.eof && len(s.data) == 0 {
234 | 		return false
235 | 	}
236 | 
237 | 	return true
238 | }
239 | 
240 | func (s *CSVReader) scanField(data []byte) (int, []byte, bool, error) {
241 | 	// Special case.
242 | 	if s.trail {
243 | 		s.column++
244 | 		s.eor = true
245 | 		s.trail = false
246 | 		return 0, data, false, nil
247 | 	}
248 | 
249 | 	if len(data) == 0 {
250 | 		return 0, nil, false, nil
251 | 	}
252 | 
253 | 	// Previous iteration was the end of a record. Increment line and reset column.
254 | 	if s.eor {
255 | 		s.column = 0
256 | 		s.lineno++
257 | 	}
258 | 
259 | 	s.column++
260 | 	s.eor = false
261 | 
262 | 	// Quoted field.
263 | 	if data[0] == '"' {
264 | 		var (
265 | 			eq    int
266 | 			oq    bool
267 | 			c, pc byte
268 | 		)
269 | 
270 | 		// Scan until the end quote is found.
271 | 		for i := 1; i < len(data); i++ {
272 | 			c = data[i]
273 | 
274 | 			// Successive quotes denote an escaped quote. Clear the previous byte
275 | 			// so escaped quotes are not overlapped.
276 | 			if c == '"' {
277 | 				if pc == '"' {
278 | 					pc = 0
279 | 					oq = false
280 | 					eq++
281 | 					continue
282 | 				}
283 | 
284 | 				// Open quote.
285 | 				if oq {
286 | 					return 0, nil, false, csvErrUnescapedQuote
287 | 				}
288 | 
289 | 				oq = true
290 | 			}
291 | 
292 | 			// End of field with a trailing separator.
293 | 			if pc == '"' && c == s.sep {
294 | 				return i + 1, unescapeQuotes(data[1:i-1], eq), true, nil
295 | 			}
296 | 
297 | 			// Shift previous characters.
298 | 			pc = c
299 | 		}
300 | 
301 | 		// Ran out of bytes.
302 | 		s.eor = true
303 | 
304 | 		// Final character in the line is a quote of the last field.
305 | 		if c == '"' {
306 | 			return len(data), unescapeQuotes(data[1:len(data)-1], eq), false, nil
307 | 		}
308 | 
309 | 		// End of line without a terminated quote.
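		// Note: multi-line quoted fields are not supported, since the
		// underlying scanner splits the input on newlines first.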
310 | return 0, nil, false, csvErrUnterminatedField 311 | } 312 | 313 | // Unquoted fields. Only fail if a double quote is found. 314 | for i, c := range data { 315 | if c == s.sep { 316 | s.eor = false 317 | return i + 1, data[0:i], true, nil 318 | } 319 | 320 | // Unquoted field with quote. 321 | if c == '"' { 322 | return 0, nil, false, csvErrUnquotedField 323 | } 324 | } 325 | 326 | // Ran out of bytes. 327 | s.eor = true 328 | 329 | return len(data), data, false, nil 330 | } 331 | 332 | // Removes escaped quotes from the string. 333 | func unescapeQuotes(b []byte, count int) []byte { 334 | if count == 0 { 335 | return b 336 | } 337 | 338 | for i, j := 0, 0; i < len(b); i, j = i+1, j+1 { 339 | b[j] = b[i] 340 | 341 | if b[i] == '"' && (i < len(b)-1 && b[i+1] == '"') { 342 | i++ 343 | } 344 | } 345 | 346 | return b[:len(b)-count] 347 | } 348 | -------------------------------------------------------------------------------- /profile/csv/parser_test.go: -------------------------------------------------------------------------------- 1 | package csv 2 | 3 | import ( 4 | "bytes" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "testing" 9 | ) 10 | 11 | func compareRows(a, b []string) bool { 12 | if len(a) != len(b) { 13 | return false 14 | } 15 | 16 | for i, v := range a { 17 | if v != b[i] { 18 | return false 19 | } 20 | } 21 | 22 | return true 23 | } 24 | 25 | func tableToCSV(t [][]string) []byte { 26 | buf := bytes.NewBuffer(nil) 27 | sep := []byte{','} 28 | nl := []byte{'\n'} 29 | 30 | for _, r := range t { 31 | for i, c := range r { 32 | if i != 0 { 33 | buf.Write(sep) 34 | } 35 | if c != "" { 36 | buf.WriteString(fmt.Sprintf(`"%s"`, c)) 37 | } 38 | } 39 | 40 | buf.Write(nl) 41 | } 42 | 43 | return buf.Bytes() 44 | } 45 | 46 | func tableToToks(t [][]string) []string { 47 | var toks []string 48 | 49 | for _, r := range t { 50 | toks = append(toks, r...) 51 | } 52 | 53 | return toks 54 | } 55 | 56 | func TestCSVReader(t *testing.T) { 57 | table := [][]string{ 58 | {"name", "gender", "state"}, 59 | {"Joe", "M", "GA"}, 60 | {"Sue", "F", "NJ"}, 61 | {"Bob", "M", "NY"}, 62 | {"Bill", "M", ""}, // trailing comma 63 | } 64 | 65 | buf := bytes.NewBuffer(tableToCSV(table)) 66 | toks := tableToToks(table) 67 | 68 | cr := DefaultCSVReader(buf) 69 | 70 | var i, c, l int 71 | 72 | for i = 0; cr.Scan(); i++ { 73 | // Increment line and reset column every three tokens. 
74 | 		if i%3 == 0 {
75 | 			l++
76 | 			c = 1
77 | 		} else {
78 | 			c++
79 | 		}
80 | 
81 | 		if i == len(toks) {
82 | 			t.Errorf("scan exceeded %d tokens", i+1)
83 | 			break
84 | 		}
85 | 
86 | 		tok := cr.Text()
87 | 
88 | 		if tok != toks[i] {
89 | 			t.Errorf("line %d, column %d: expected %s, got %s", cr.LineNumber(), cr.ColumnNumber(), toks[i], tok)
90 | 		}
91 | 
92 | 		if cr.LineNumber() != l {
93 | 			t.Errorf("expected line %d, got %d for %s", l, cr.LineNumber(), tok)
94 | 		}
95 | 
96 | 		if cr.ColumnNumber() != c {
97 | 			t.Errorf("expected column %d, got %d for %s", c, cr.ColumnNumber(), tok)
98 | 		}
99 | 	}
100 | 
101 | 	if err := cr.Err(); err != io.EOF {
102 | 		t.Errorf("unexpected error: %s", err)
103 | 	}
104 | 
105 | 	if i != len(toks) {
106 | 		t.Errorf("expected %d, got %d", len(toks), i)
107 | 	}
108 | }
109 | 
110 | func TestCSVScanLine(t *testing.T) {
111 | 	table := [][]string{
112 | 		{"name", "gender", "state"},
113 | 		{"Joe", "M", "GA"},
114 | 		{"Sue", "F", "NJ"},
115 | 		{"Bob", "M", "NY"},
116 | 		{"Bill", "M", ""},
117 | 	}
118 | 
119 | 	buf := bytes.NewBuffer(tableToCSV(table))
120 | 
121 | 	cr := DefaultCSVReader(buf)
122 | 
123 | 	var (
124 | 		i   int
125 | 		err error
126 | 		row = make([]string, 3)
127 | 	)
128 | 
129 | 	for {
130 | 		err = cr.ScanLine(row)
131 | 
132 | 		if err == io.EOF {
133 | 			break
134 | 		}
135 | 
136 | 		if err != nil {
137 | 			t.Errorf("%d: unexpected error: %s", i, err)
138 | 		}
139 | 
140 | 		if cr.LineNumber() != i+1 {
141 | 			t.Errorf("%d: got wrong line number %d", i, cr.LineNumber())
142 | 		}
143 | 
144 | 		if !compareRows(table[i], row) {
145 | 			t.Errorf("%d: wrong row, got %v", i, row)
146 | 		}
147 | 
148 | 		i++
149 | 	}
150 | 
151 | 	if i != 5 {
152 | 		t.Errorf("scanned wrong number of lines %d", i)
153 | 	}
154 | }
155 | 
156 | func TestCSVInput(t *testing.T) {
157 | 	rows := []string{
158 | 		`"name","gender",state`,
159 | 		`Joe,"M",GA`,
160 | 		`"Sue","""F""",NJ`,
161 | 		`Bob,M,NY`,
162 | 	}
163 | 
164 | 	buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n")))
165 | 	cr := DefaultCSVReader(buf)
166 | 
167 | 	var (
168 | 		err error
169 | 		row = make([]string, 3)
170 | 	)
171 | 
172 | 	for {
173 | 		err = cr.ScanLine(row)
174 | 
175 | 		if err == io.EOF {
176 | 			break
177 | 		}
178 | 
179 | 		if err != nil {
180 | 			t.Errorf("%d: unexpected error: %s", cr.LineNumber(), err)
181 | 		}
182 | 	}
183 | }
184 | 
185 | func TestCSVScanLineBadInput(t *testing.T) {
186 | 	rows := []string{
187 | 		`"name", "gender",state`,
188 | 		`Joe,"M", "GA"`,
189 | 		`"Sue", "F", "NJ"`,
190 | 		`"Bob",M,NY"`,
191 | 	}
192 | 
193 | 	buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n")))
194 | 	cr := DefaultCSVReader(buf)
195 | 
196 | 	var (
197 | 		i   int
198 | 		err error
199 | 		row = make([]string, 3)
200 | 	)
201 | 
202 | 	for {
203 | 		err = cr.ScanLine(row)
204 | 
205 | 		if err == io.EOF {
206 | 			break
207 | 		}
208 | 
209 | 		if cr.Line() != rows[i] {
210 | 			t.Errorf("%d: bad line `%s`", i, cr.Line())
211 | 		}
212 | 
213 | 		if err == nil {
214 | 			t.Errorf("%d: expected error", i)
215 | 		} else if cr.LineNumber() != i+1 {
216 | 			t.Errorf("%d: got wrong line number %d", i, cr.LineNumber())
217 | 		}
218 | 
219 | 		i++
220 | 	}
221 | 
222 | 	if i != 4 {
223 | 		t.Errorf("scanned wrong number of lines %d", i)
224 | 	}
225 | }
226 | 
227 | func TestCSVReaderBadInput(t *testing.T) {
228 | 	rows := []string{
229 | 		`"name","gender", state`,
230 | 		`Joe,"M", "GA"`,
231 | 		`"Sue", "F", "NJ"`,
232 | 		`"Bob",M,NY"`,
233 | 	}
234 | 
235 | 	expectedToks := []struct {
236 | 		Token  string
237 | 		Error  bool
238 | 		Line   int
239 | 		Column int
240 | 	}{
241 | 		{"name", false, 1, 1},
242 | 		{"gender", false, 1, 2},
243 | 		{" state", false, 1, 3},
244 | 		{"Joe", false, 2,
1}, 245 | {"M", false, 2, 2}, 246 | {` "GA"`, true, 2, 3}, 247 | {"Sue", false, 3, 1}, 248 | {` "F", "NJ"`, true, 3, 2}, 249 | {"Bob", false, 4, 1}, 250 | {"M", false, 4, 2}, 251 | {`NY"`, true, 4, 3}, 252 | } 253 | 254 | buf := bytes.NewBuffer([]byte(strings.Join(rows, "\n"))) 255 | cr := DefaultCSVReader(buf) 256 | 257 | var ( 258 | err error 259 | tok string 260 | ) 261 | 262 | for i := 0; cr.Scan(); i++ { 263 | tok = cr.Text() 264 | exp := expectedToks[i] 265 | 266 | if cr.LineNumber() != exp.Line { 267 | t.Errorf("%d: expected line %d, got %d", i, exp.Line, cr.LineNumber()) 268 | } 269 | 270 | if cr.ColumnNumber() != exp.Column { 271 | t.Errorf("%d: expected column %d, got %d", i, exp.Column, cr.ColumnNumber()) 272 | } 273 | 274 | if exp.Token != tok { 275 | t.Errorf("%d: expected token `%s`, got `%s`", i, exp.Token, tok) 276 | } 277 | 278 | err = cr.Err() 279 | 280 | if err == nil && exp.Error { 281 | t.Errorf("%d: expected error", i) 282 | } else if err != nil && !exp.Error { 283 | t.Errorf("%d: unexpected error: %s", i, err) 284 | } 285 | } 286 | } 287 | 288 | func TestCSVExtraColumns(t *testing.T) { 289 | buf := bytes.NewBufferString("one,two,three,four") 290 | cr := DefaultCSVReader(buf) 291 | 292 | // 3 columns expected. 293 | toks := make([]string, 3) 294 | err := cr.ScanLine(toks) 295 | 296 | if err == nil { 297 | t.Errorf("Expected error") 298 | } else if err != csvErrExtraColumns { 299 | t.Errorf("Expected extra columns error, got %s instead", err) 300 | } 301 | } 302 | 303 | var line = `"3","\PCORI\VITAL\TOBACCO\SMOKING\","Smoked Tobacco","N","FAE",,,,"concept_cd","CONCEPT_DIMENSION","concept_path","T","like","\PCORI\VITAL\TOBACCO\SMOKING\","CDMv2","This field is new to v3.0. Indicator for any form of tobacco that is smoked.Per Meaningful Use guidance, smoking status includes any form of tobacco that is smoked, but not all tobacco use. ""Light smoker"" is interpreted to mean less than 10 cigarettes per day, or an equivalent (but less concretely defined) quantity of cigar or pipe smoke. ""Heavy smoker"" is interpreted to mean greater than 10 cigarettes per day or an equivalent (but less concretely defined) quantity of cigar or pipe smoke. ","@","2015-08-20 312:14:14.0","2015-08-20 12:14:14.0","2015-08-20 12:14:14.0","PCORNET_CDM",,,"\PCORI\VITAL\TOBACCO\","SMOKING"` + "\n" 304 | 305 | func BenchmarkCSVReaderScan(b *testing.B) { 306 | cr := DefaultCSVReader(&bytes.Buffer{}) 307 | 308 | data := []byte(line) 309 | 310 | for i := 0; i < b.N; i++ { 311 | _, data, _, _ = cr.scanField(data) 312 | 313 | if len(data) == 0 { 314 | data = []byte(line) 315 | } 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /profile/json/json.go: -------------------------------------------------------------------------------- 1 | package json 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "encoding/json" 7 | "fmt" 8 | "io" 9 | 10 | "github.com/chop-dbhi/sql-importer/profile" 11 | ) 12 | 13 | type analyzer struct { 14 | p profile.Profiler 15 | } 16 | 17 | func (a *analyzer) parseField(path, field string, value interface{}) { 18 | fp := fmt.Sprintf("%s%s", path, field) 19 | 20 | switch x := value.(type) { 21 | case nil: 22 | a.p.RecordType(fp, nil, profile.NullType) 23 | 24 | // Nested object. 25 | case map[string]interface{}: 26 | a.parseMap(fp+"/", x) 27 | 28 | // Array. 
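	// Each element is profiled under the same field path, so an array of
	// scalars folds into a single field whose candidate types accumulate.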
29 | case []interface{}: 30 | for _, v := range x { 31 | a.parseField(path, field, v) 32 | } 33 | 34 | case bool: 35 | a.p.RecordType(fp, x, profile.BoolType) 36 | 37 | case string: 38 | var t profile.ValueType 39 | 40 | if _, ok := profile.ParseDate(x); ok { 41 | t = profile.DateType 42 | } else if _, ok := profile.ParseDateTime(x); ok { 43 | t = profile.DateTimeType 44 | } else { 45 | t = profile.StringType 46 | } 47 | 48 | a.p.RecordType(fp, x, t) 49 | 50 | case json.Number: 51 | if v, err := x.Int64(); err == nil { 52 | a.p.RecordType(fp, v, profile.IntType) 53 | } else if v, err := x.Float64(); err == nil { 54 | a.p.RecordType(fp, v, profile.FloatType) 55 | } else { 56 | panic("could not parse JSON number") 57 | } 58 | 59 | default: 60 | panic(fmt.Sprintf("unsupported type: %#T", value)) 61 | } 62 | } 63 | 64 | // types are identified relative to the path. 65 | func (a *analyzer) parseMap(path string, m map[string]interface{}) { 66 | for k, v := range m { 67 | a.parseField(path, k, v) 68 | } 69 | } 70 | 71 | func (a *analyzer) parseLDJSON(r io.Reader) error { 72 | s := bufio.NewScanner(r) 73 | 74 | // Initialize buffer and JSON decoder. 75 | var b bytes.Buffer 76 | dec := json.NewDecoder(&b) 77 | dec.UseNumber() 78 | 79 | for s.Scan() { 80 | line := bytes.TrimSpace(s.Bytes()) 81 | if len(line) == 0 { 82 | continue 83 | } 84 | 85 | b.Reset() 86 | b.Write(line) 87 | 88 | var m map[string]interface{} 89 | if err := dec.Decode(&m); err != nil { 90 | return err 91 | } 92 | 93 | a.parseMap("", m) 94 | } 95 | 96 | return s.Err() 97 | } 98 | 99 | func (a *analyzer) parseJSON(r io.Reader) error { 100 | dec := json.NewDecoder(r) 101 | dec.UseNumber() 102 | 103 | tok, err := dec.Token() 104 | if err != nil { 105 | return err 106 | } 107 | 108 | if tok != json.Delim('[') { 109 | return fmt.Errorf("expected array, got: %v", tok) 110 | } 111 | 112 | // More elements in the array. 
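	// Decoding element by element streams the array, so the whole
	// document never has to be held in memory at once.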
113 | for dec.More() { 114 | var m map[string]interface{} 115 | if err := dec.Decode(&m); err != nil { 116 | return err 117 | } 118 | 119 | a.parseMap("", m) 120 | } 121 | 122 | return nil 123 | } 124 | 125 | func Profile(config *profile.Config, in io.Reader, format string) (*profile.Profile, error) { 126 | p := profile.NewProfiler(config) 127 | 128 | a := analyzer{ 129 | p: p, 130 | } 131 | 132 | var err error 133 | 134 | switch format { 135 | case "ldjson": 136 | err = a.parseLDJSON(in) 137 | case "json": 138 | err = a.parseJSON(in) 139 | } 140 | 141 | if err != nil { 142 | return nil, err 143 | } 144 | 145 | return p.Profile(), nil 146 | } 147 | -------------------------------------------------------------------------------- /profile/json/json_test.go: -------------------------------------------------------------------------------- 1 | package json 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestProfileJSON(t *testing.T) { 9 | b := bytes.NewBufferString(`[ 10 | {"name": "John", "color": "Blue", "dob": "1985-03-10"}, 11 | {"name": "Jane", "color": "Red"} 12 | ]`) 13 | 14 | p, err := Profile(nil, b, "json") 15 | if err != nil { 16 | t.Fatal(err) 17 | } 18 | 19 | if len(p.Fields) != 3 { 20 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 21 | } 22 | } 23 | 24 | func TestProfileLDJSON(t *testing.T) { 25 | b := bytes.NewBufferString(` 26 | {"name": "John", "color": "Blue", "dob": "1985-03-10"} 27 | {"name": "Jane", "color": "Red"} 28 | `) 29 | 30 | p, err := Profile(nil, b, "ldjson") 31 | if err != nil { 32 | t.Fatal(err) 33 | } 34 | 35 | if len(p.Fields) != 3 { 36 | t.Errorf("expected 3 fields, got %d", len(p.Fields)) 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /profile/parse.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "strconv" 5 | "strings" 6 | "time" 7 | ) 8 | 9 | var ( 10 | dateFormats = []string{ 11 | "2006-01-02", 12 | "2006-1-02", 13 | "01-02-2006", 14 | "01-02-06", 15 | "01/02/2006", 16 | "01/02/06", 17 | "1/2/06", 18 | } 19 | 20 | dateTimeFormats = []string{ 21 | "2006-01-02 15:04", 22 | "2006-01-02 15:04:05", 23 | "2006-01-02T15:04:05", 24 | "2006-01-02T15:04:05Z", 25 | "2006-01-02T15:04:05Z07:00", 26 | } 27 | ) 28 | 29 | func ParseBool(s string) (bool, bool) { 30 | s = strings.TrimSpace(s) 31 | 32 | b, err := strconv.ParseBool(s) 33 | if err != nil { 34 | return false, false 35 | } 36 | 37 | return b, true 38 | } 39 | 40 | func ParseDate(s string) (time.Time, bool) { 41 | s = strings.TrimSpace(s) 42 | 43 | for _, layout := range dateFormats { 44 | if v, err := time.Parse(layout, s); err == nil { 45 | return v, true 46 | } 47 | } 48 | 49 | return time.Time{}, false 50 | } 51 | 52 | func ParseDateTime(s string) (time.Time, bool) { 53 | s = strings.TrimSpace(s) 54 | 55 | for _, layout := range dateTimeFormats { 56 | if v, err := time.Parse(layout, s); err == nil { 57 | return v, true 58 | } 59 | } 60 | 61 | return time.Time{}, false 62 | } 63 | 64 | func ParseFloat(s string) (float64, bool) { 65 | f, err := strconv.ParseFloat(s, 64) 66 | if err != nil { 67 | return 0, false 68 | } 69 | return f, true 70 | } 71 | 72 | func ParseInt(s string) (int64, bool) { 73 | i, err := strconv.ParseInt(s, 10, 64) 74 | if err != nil { 75 | return 0, false 76 | } 77 | return i, true 78 | } 79 | -------------------------------------------------------------------------------- /profile/parse_test.go: 
--------------------------------------------------------------------------------
1 | package profile
2 | 
3 | import "testing"
4 | 
5 | func BenchmarkParseDateValid(b *testing.B) {
6 | 	s := "1998-10-01"
7 | 	for i := 0; i < b.N; i++ {
8 | 		ParseDate(s)
9 | 	}
10 | }
11 | 
12 | func BenchmarkParseDateInvalid(b *testing.B) {
13 | 	s := "not a date"
14 | 	for i := 0; i < b.N; i++ {
15 | 		ParseDate(s)
16 | 	}
17 | }
18 | 
19 | func BenchmarkParseDateTimeValid(b *testing.B) {
20 | 	s := "1998-10-01 01:32:10"
21 | 	for i := 0; i < b.N; i++ {
22 | 		ParseDateTime(s)
23 | 	}
24 | }
25 | 
26 | func BenchmarkParseDateTimeInvalid(b *testing.B) {
27 | 	s := "not a date time"
28 | 	for i := 0; i < b.N; i++ {
29 | 		ParseDateTime(s)
30 | 	}
31 | }
32 | 
33 | func BenchmarkParseFloatValid(b *testing.B) {
34 | 	s := "32.10219"
35 | 	for i := 0; i < b.N; i++ {
36 | 		ParseFloat(s)
37 | 	}
38 | }
39 | 
40 | func BenchmarkParseFloatInvalid(b *testing.B) {
41 | 	s := "not a number"
42 | 	for i := 0; i < b.N; i++ {
43 | 		ParseFloat(s)
44 | 	}
45 | }
46 | 
47 | func BenchmarkParseIntValid(b *testing.B) {
48 | 	s := "3210219"
49 | 	for i := 0; i < b.N; i++ {
50 | 		ParseInt(s)
51 | 	}
52 | }
53 | 
54 | func BenchmarkParseIntInvalid(b *testing.B) {
55 | 	s := "not a number"
56 | 	for i := 0; i < b.N; i++ {
57 | 		ParseInt(s)
58 | 	}
59 | }
60 | 
61 | func BenchmarkParseBoolValid(b *testing.B) {
62 | 	s := "TRUE"
63 | 	for i := 0; i < b.N; i++ {
64 | 		ParseBool(s)
65 | 	}
66 | }
67 | 
68 | func BenchmarkParseBoolInvalid(b *testing.B) {
69 | 	s := "not a bool"
70 | 	for i := 0; i < b.N; i++ {
71 | 		ParseBool(s)
72 | 	}
73 | }
74 | 
--------------------------------------------------------------------------------
/profile/profile.go:
--------------------------------------------------------------------------------
1 | package profile
2 | 
3 | // Field stores aggregation information and statistics for a field.
4 | type Field struct {
5 | 	// Name of this field.
6 | 	Name string `json:"name"`
7 | 
8 | 	// Index of the field in tabular sources.
9 | 	Index int `json:"index"`
10 | 
11 | 	// Inferred type of the field. All candidate types are in the
12 | 	// type counts array.
13 | 	Type ValueType `json:"type"`
14 | 
15 | 	// True if the field contains null values.
16 | 	Nullable bool `json:"nullable"`
17 | 
18 | 	// True if the field contains empty strings.
19 | 	Missing bool `json:"missing"`
20 | 
21 | 	// True if all values are unique.
22 | 	Unique bool `json:"unique"`
23 | 
24 | 	// If true, at least one value has been detected to have a leading zero.
25 | 	LeadingZeros bool `json:"leading_zeros"`
26 | }
27 | 
28 | type Profile struct {
29 | 	// Total number of records processed.
30 | 	RecordCount int64 `json:"record_count"`
31 | 
32 | 	// Flat set of fields that were profiled.
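	// Keys are the lowercased field names (or, for nested JSON input,
	// slash-separated field paths).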
33 | Fields map[string]*Field `json:"fields"` 34 | } 35 | 36 | func NewProfile() *Profile { 37 | return &Profile{ 38 | Fields: make(map[string]*Field), 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /profile/profile_test.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "testing" 5 | "time" 6 | ) 7 | 8 | func TestProfilerRecord(t *testing.T) { 9 | tests := map[string]struct { 10 | Raw string 11 | Type ValueType 12 | Val interface{} 13 | }{ 14 | "string": { 15 | "bar", 16 | StringType, 17 | "bar", 18 | }, 19 | "int": { 20 | "10", 21 | IntType, 22 | int64(10), 23 | }, 24 | "float": { 25 | "1.20", 26 | FloatType, 27 | float64(1.20), 28 | }, 29 | "bool": { 30 | "true", 31 | BoolType, 32 | true, 33 | }, 34 | "date-1": { 35 | "2014-02-01", 36 | DateType, 37 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 38 | }, 39 | "date-2": { 40 | "02/01/2014", 41 | DateType, 42 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 43 | }, 44 | "date-3": { 45 | "02/01/14", 46 | DateType, 47 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 48 | }, 49 | "date-4": { 50 | "2/1/14", 51 | DateType, 52 | time.Date(2014, time.February, 1, 0, 0, 0, 0, time.UTC), 53 | }, 54 | "datetime": { 55 | "2014-02-01 10:00:00", 56 | DateTimeType, 57 | time.Date(2014, time.February, 1, 10, 0, 0, 0, time.UTC), 58 | }, 59 | } 60 | 61 | p := NewProfiler(nil) 62 | 63 | for name, test := range tests { 64 | t.Run(name, func(t *testing.T) { 65 | p.Record("test", test.Raw) 66 | }) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /profile/profiler.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import "strings" 4 | 5 | // hasLeadingZeros checks if a valid integer value contains leading zeros. 6 | // This is often an indicator that this is not an integer, but an identifier. 7 | func hasLeadingZeros(s string) bool { 8 | if s == "" { 9 | return false 10 | } 11 | 12 | return s[0] == '0' 13 | } 14 | 15 | type profiler struct { 16 | Config *Config 17 | Count int64 18 | Include map[string]struct{} 19 | Exclude map[string]struct{} 20 | Fields map[string]*profilerField 21 | } 22 | 23 | // Profiler is an interface for profiling data. 24 | type Profiler interface { 25 | // Increment the record count. 26 | Incr() 27 | 28 | InitField(name string) 29 | 30 | // Record records a field-value pair to the profile of an unknown type. 31 | // The value must be encoded as a string and will be parsed in a variety 32 | // of ways to detect the type. 33 | Record(field string, raw string) 34 | 35 | // RecordType records a field-value pair with a known type. 36 | RecordType(field string, value interface{}, typ ValueType) 37 | 38 | // Profile returns the profile. 39 | Profile() *Profile 40 | } 41 | 42 | type Config struct { 43 | // Include are the fields to explicitly include. 44 | Include []string 45 | 46 | // Exclude are the fields to explicitly exclude. 47 | Exclude []string 48 | } 49 | 50 | func (p *profiler) Incr() { 51 | p.Count++ 52 | } 53 | 54 | // field returns the field profile if it should be profiled.
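// Exclusions take precedence over inclusions and matching is case-insensitive:
// with Config{Include: []string{"name"}}, field("Name") yields a profile while
// field("color") reports false.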
55 | func (p *profiler) field(n string) (*profilerField, bool) { 56 | n = strings.ToLower(n) 57 | 58 | if _, ok := p.Exclude[n]; ok { 59 | return nil, false 60 | } 61 | 62 | if len(p.Include) > 0 { 63 | if _, ok := p.Include[n]; !ok { 64 | return nil, false 65 | } 66 | } 67 | 68 | // Initialize and get field profile. 69 | f, ok := p.Fields[n] 70 | if !ok { 71 | f = newProfilerField(n) 72 | p.Fields[n] = f 73 | } 74 | 75 | return f, true 76 | } 77 | 78 | func (p *profiler) InitField(name string) { 79 | p.field(name) 80 | } 81 | 82 | func (p *profiler) Profile() *Profile { 83 | r := NewProfile() 84 | r.RecordCount = p.Count 85 | 86 | for k, f := range p.Fields { 87 | r.Fields[k] = f.Field() 88 | } 89 | 90 | return r 91 | } 92 | 93 | func (p *profiler) Record(n string, v string) { 94 | f, ok := p.field(n) 95 | if !ok { 96 | return 97 | } 98 | 99 | // Still in the unique state. 100 | if f.Unique { 101 | // Duplicate value. 102 | if _, ok := f.Values[v]; ok { 103 | f.Unique = false 104 | f.Values = nil 105 | } else { 106 | f.Values[v] = struct{}{} 107 | } 108 | } 109 | 110 | // Short circuit. Already most general type. 111 | if _, ok := f.Types[StringType]; ok { 112 | return 113 | } 114 | 115 | if _, ok := ParseInt(v); ok { 116 | if !f.LeadingZeros && hasLeadingZeros(v) { 117 | f.LeadingZeros = true 118 | } 119 | 120 | f.Types[IntType] = struct{}{} 121 | return 122 | } 123 | 124 | if _, ok := ParseFloat(v); ok { 125 | f.Types[FloatType] = struct{}{} 126 | return 127 | } 128 | 129 | if _, ok := ParseBool(v); ok { 130 | f.Types[BoolType] = struct{}{} 131 | return 132 | } 133 | 134 | if _, ok := ParseDate(v); ok { 135 | f.Types[DateType] = struct{}{} 136 | return 137 | } 138 | 139 | if _, ok := ParseDateTime(v); ok { 140 | f.Types[DateTimeType] = struct{}{} 141 | return 142 | } 143 | 144 | f.Types[StringType] = struct{}{} 145 | } 146 | 147 | func (p *profiler) RecordType(n string, v interface{}, t ValueType) { 148 | f, ok := p.field(n) 149 | if !ok { 150 | return 151 | } 152 | 153 | f.Types[t] = struct{}{} 154 | } 155 | 156 | // profilerField stores in-progress aggregation state and statistics for a field. 157 | type profilerField struct { 158 | Name string 159 | Types map[ValueType]struct{} 160 | Values map[string]struct{} 161 | Unique bool 162 | LeadingZeros bool 163 | } 164 | 165 | func (p *profilerField) Field() *Field { 166 | _, nullable := p.Types[NullType] 167 | _, missing := p.Values[""] 168 | 169 | f := Field{ 170 | Name: p.Name, 171 | Type: p.Type(), 172 | Nullable: nullable, 173 | Missing: missing, 174 | Unique: p.Unique, 175 | LeadingZeros: p.LeadingZeros, 176 | } 177 | 178 | return &f 179 | } 180 | 181 | // Type returns the most specific type this field satisfies.
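// Candidate types are folded pairwise with GeneralizeType, so a field that
// saw "1" and "2.5" holds {integer, float} and resolves to float, while any
// value with a leading zero (e.g. "007") forces string.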
182 | func (f *profilerField) Type() ValueType { 183 | if f.LeadingZeros { 184 | return StringType 185 | } 186 | 187 | var g ValueType 188 | 189 | for t := range f.Types { 190 | if g == UnknownType { 191 | g = t 192 | } else { 193 | g = GeneralizeType(t, g) 194 | } 195 | } 196 | 197 | return g 198 | } 199 | 200 | func newProfilerField(name string) *profilerField { 201 | return &profilerField{ 202 | Name: name, 203 | Types: make(map[ValueType]struct{}), 204 | Values: make(map[string]struct{}), 205 | Unique: true, 206 | } 207 | } 208 | 209 | func NewProfiler(c *Config) Profiler { 210 | if c == nil { 211 | c = &Config{} 212 | } 213 | 214 | p := &profiler{ 215 | Config: c, 216 | Fields: make(map[string]*profilerField), 217 | } 218 | 219 | if len(p.Config.Exclude) > 0 { 220 | p.Exclude = make(map[string]struct{}) 221 | 222 | for _, f := range p.Config.Exclude { 223 | p.Exclude[strings.ToLower(f)] = struct{}{} 224 | } 225 | } 226 | 227 | if len(p.Config.Include) > 0 { 228 | p.Include = make(map[string]struct{}) 229 | 230 | for _, f := range p.Config.Include { 231 | p.Include[strings.ToLower(f)] = struct{}{} 232 | } 233 | } 234 | 235 | return p 236 | } 237 | -------------------------------------------------------------------------------- /profile/types.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import ( 4 | "encoding/json" 5 | "strings" 6 | ) 7 | 8 | const ( 9 | UnknownType ValueType = iota 10 | NullType 11 | StringType 12 | BinaryType 13 | IntType 14 | FloatType 15 | BoolType 16 | DateType 17 | DateTimeType 18 | ObjectType 19 | ) 20 | 21 | // ValueType is a type of value. 22 | type ValueType uint8 23 | 24 | func (v ValueType) String() string { 25 | switch v { 26 | case NullType: 27 | return "null" 28 | case StringType: 29 | return "string" 30 | case BinaryType: 31 | return "binary" 32 | case IntType: 33 | return "integer" 34 | case FloatType: 35 | return "float" 36 | case BoolType: 37 | return "boolean" 38 | case DateType: 39 | return "date" 40 | case DateTimeType: 41 | return "datetime" 42 | case ObjectType: 43 | return "object" 44 | } 45 | 46 | return "" 47 | } 48 | 49 | func (v ValueType) MarshalJSON() ([]byte, error) { 50 | return json.Marshal(v.String()) 51 | } 52 | 53 | func (v *ValueType) UnmarshalJSON(b []byte) error { 54 | var s string 55 | if err := json.Unmarshal(b, &s); err != nil { 56 | return err 57 | } 58 | 59 | var t ValueType 60 | 61 | switch strings.ToLower(s) { 62 | case "string": 63 | t = StringType 64 | case "null": 65 | t = NullType 66 | case "binary": 67 | t = BinaryType 68 | case "integer": 69 | t = IntType 70 | case "float": 71 | t = FloatType 72 | case "boolean": 73 | t = BoolType 74 | case "date": 75 | t = DateType 76 | case "datetime": 77 | t = DateTimeType 78 | case "object": 79 | t = ObjectType 80 | } 81 | 82 | *v = t 83 | 84 | return nil 85 | } 86 | 87 | var typeGeneralizationMap = map[[2]ValueType]ValueType{ 88 | {BoolType, IntType}: IntType, 89 | {IntType, FloatType}: FloatType, 90 | {BoolType, FloatType}: FloatType, 91 | {DateTimeType, DateType}: DateTimeType, 92 | } 93 | 94 | // GeneralizeType takes two types and returns the more general of the 95 | // two. A null type generalizes to the other type, and unrelated pairs 96 | // fall back to string, the most general type. 97 | func GeneralizeType(t1, t2 ValueType) ValueType { 98 | // Same type.
99 | if t1 == t2 { 100 | return t1 101 | } 102 | 103 | if t1 == NullType { 104 | return t2 105 | } 106 | 107 | if t2 == NullType { 108 | return t1 109 | } 110 | 111 | key := [2]ValueType{t1, t2} 112 | 113 | t, ok := typeGeneralizationMap[key] 114 | if ok { 115 | return t 116 | } 117 | 118 | // Swap order. 119 | key[0], key[1] = key[1], key[0] 120 | 121 | t, ok = typeGeneralizationMap[key] 122 | if ok { 123 | return t 124 | } 125 | 126 | // Everything can be generalized to a string. 127 | return StringType 128 | } 129 | -------------------------------------------------------------------------------- /profile/types_test.go: -------------------------------------------------------------------------------- 1 | package profile 2 | 3 | import "testing" 4 | 5 | func assertType(t *testing.T, a, e ValueType) { 6 | if a != e { 7 | t.Errorf("expected %s, got %s", e, a) 8 | } 9 | } 10 | 11 | func TestGeneralizeType(t *testing.T) { 12 | assertType(t, GeneralizeType(IntType, FloatType), FloatType) 13 | assertType(t, GeneralizeType(IntType, BoolType), IntType) 14 | assertType(t, GeneralizeType(StringType, BoolType), StringType) 15 | assertType(t, GeneralizeType(DateTimeType, DateType), DateTimeType) 16 | } 17 | -------------------------------------------------------------------------------- /reader/reader.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "compress/bzip2" 6 | "compress/gzip" 7 | "fmt" 8 | "io" 9 | "os" 10 | "path" 11 | "path/filepath" 12 | "strings" 13 | ) 14 | 15 | var bom = []byte{0xef, 0xbb, 0xbf} 16 | 17 | // UniversalReader wraps an io.Reader to strip a leading UTF-8 BOM and replace 18 | // carriage returns with newlines. This is used with the csv.Reader so it can properly delimit lines. 19 | type UniversalReader struct { 20 | r io.Reader 21 | } 22 | 23 | func (r *UniversalReader) Read(buf []byte) (int, error) { 24 | n, err := r.r.Read(buf) 25 | 26 | // Detect and remove BOM. 27 | if bytes.HasPrefix(buf, bom) { 28 | copy(buf, buf[len(bom):]) 29 | n -= len(bom) 30 | } 31 | 32 | // Replace carriage returns with newlines 33 | for i, b := range buf { 34 | if b == '\r' { 35 | buf[i] = '\n' 36 | } 37 | } 38 | 39 | return n, err 40 | } 41 | 42 | func (r *UniversalReader) Close() error { 43 | if rc, ok := r.r.(io.Closer); ok { 44 | return rc.Close() 45 | } 46 | return nil 47 | } 48 | 49 | func NewUniversalReader(r io.Reader) *UniversalReader { 50 | return &UniversalReader{r} 51 | } 52 | 53 | // Decompress takes a compression type and a reader and returns 54 | // a reader that will be decompressed if the type is supported. 55 | func Decompress(t string, r io.Reader) (io.Reader, error) { 56 | if t == "" { 57 | return r, nil 58 | } 59 | 60 | switch t { 61 | case "gzip", "gz": 62 | gr, err := gzip.NewReader(r) 63 | if err != nil { 64 | return nil, err 65 | } 66 | return gr, nil 67 | 68 | case "bz2", "bzip2": 69 | return bzip2.NewReader(r), nil 70 | } 71 | 72 | return nil, fmt.Errorf("compression type not supported: %s", t) 73 | } 74 | 75 | // DetectType attempts to detect the file format and compression types by looking at the 76 | // file path extensions. 77 | func DetectType(url string) (string, string) { 78 | _, name := path.Split(url) 79 | 80 | // Split up extensions.
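// For "dump.csv.gz" this yields ["csv", "gz"], so a single filename can
// carry both the format and the compression type.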
81 | exts := strings.Split(name, ".")[1:] 82 | 83 | var ( 84 | compression string 85 | format string 86 | ) 87 | 88 | for _, ext := range exts { 89 | switch ext { 90 | case "gz", "gzip": 91 | compression = "gzip" 92 | 93 | case "bz2", "bzip2": 94 | compression = "bzip2" 95 | 96 | case "json": 97 | format = "json" 98 | 99 | case "csv": 100 | format = "csv" 101 | 102 | case "ldjson": 103 | format = "ldjson" 104 | } 105 | } 106 | 107 | return format, compression 108 | } 109 | 110 | func detectCompression(name string) string { 111 | switch filepath.Ext(name) { 112 | case ".gzip", ".gz": 113 | return "gzip" 114 | case ".bzip2", ".bz2": 115 | return "bzip2" 116 | } 117 | 118 | return "" 119 | } 120 | 121 | // Reader encapsulates a file or stdin stream. 122 | type Reader struct { 123 | Name string 124 | Compression string 125 | 126 | reader io.Reader 127 | file *os.File 128 | } 129 | 130 | // Read implements the io.Reader interface. 131 | func (r *Reader) Read(buf []byte) (int, error) { 132 | return r.reader.Read(buf) 133 | } 134 | 135 | // Close closes the underlying file if one was opened. 136 | func (r *Reader) Close() { 137 | if r.file != nil { 138 | r.file.Close() 139 | } 140 | } 141 | 142 | // Open a reader by name with optional compression. If no name is specified, STDIN 143 | // is used. 144 | func Open(name, compr string) (*Reader, error) { 145 | r := new(Reader) 146 | 147 | if compr == "" { 148 | compr = detectCompression(name) 149 | } 150 | 151 | // Validate compression method before working with files. 152 | switch compr { 153 | case "bzip2", "gzip", "": 154 | default: 155 | return nil, fmt.Errorf("unknown compression type %s", compr) 156 | } 157 | 158 | if name == "" { 159 | r.reader = os.Stdin 160 | } else { 161 | file, err := os.Open(name) 162 | 163 | if err != nil { 164 | return nil, err 165 | } 166 | 167 | r.file = file 168 | r.reader = file 169 | } 170 | 171 | // Apply the compression decoder. 172 | switch compr { 173 | case "gzip": 174 | reader, err := gzip.NewReader(r.reader) 175 | 176 | if err != nil { 177 | r.Close() 178 | return nil, err 179 | } 180 | 181 | r.reader = reader 182 | case "bzip2": 183 | r.reader = bzip2.NewReader(r.reader) 184 | } 185 | 186 | r.Compression = compr 187 | 188 | r.reader = &UniversalReader{r.reader} 189 | 190 | return r, nil 191 | } 192 | -------------------------------------------------------------------------------- /reader/reader_test.go: -------------------------------------------------------------------------------- 1 | package reader 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestUniversalReader(t *testing.T) { 9 | s := "\xef\xbb\xbfhello world!\r" 10 | 11 | r := bytes.NewBufferString(s) 12 | ur := &UniversalReader{r} 13 | 14 | buf := make([]byte, 20) 15 | n, err := ur.Read(buf) 16 | 17 | if err != nil { 18 | t.Fatalf("problem reading: %s", err) 19 | } 20 | 21 | if cap(buf) != 20 { 22 | t.Fatalf("expected 20 cap, got %d", cap(buf)) 23 | } 24 | 25 | if len(s)-3 != n { 26 | t.Errorf("expected %d bytes, got %d", len(s)-3, n) 27 | } 28 | 29 | exp := "hello world!\n" 30 | 31 | if string(buf[:n]) != exp { 32 | t.Errorf("expected '%v', got '%v'", exp, string(buf[:n])) 33 | } 34 | } 35 | --------------------------------------------------------------------------------
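Taken together, the reader and profile packages compose into a small profiling pipeline: open a possibly compressed input, then hand it to the profiler for its format. A minimal sketch of that composition, assuming this repository's import paths and a hypothetical line-delimited JSON file named data.ldjson (the pjson alias is only for readability):

```go
package main

import (
	"fmt"
	"log"

	pjson "github.com/chop-dbhi/sql-importer/profile/json"
	"github.com/chop-dbhi/sql-importer/reader"
)

func main() {
	// Open the input; with an empty compression argument, the type is
	// detected from the file extension (none here for .ldjson).
	r, err := reader.Open("data.ldjson", "")
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	// Profile every record using the default (nil) config.
	p, err := pjson.Profile(nil, r, "ldjson")
	if err != nil {
		log.Fatal(err)
	}

	// Report the inferred schema for each field.
	for name, f := range p.Fields {
		fmt.Printf("%s: %s (nullable=%v, unique=%v)\n", name, f.Type, f.Nullable, f.Unique)
	}
}
```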