├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── anonymisations.go
├── anonymisations_test.go
├── config.go
├── config_defaults_test.json
├── config_invalid_test.json
├── config_test.go
├── config_test.json
├── icon.svg
├── main.go
└── main_test.go


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | anon
 3 | target
 4 | *.exe
 5 | *.exe~
 6 | *.dll
 7 | *.so
 8 | *.dylib
 9 | 
10 | # Test binary, build with `go test -c`
11 | *.test
12 | 
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Modified version of the Travis CI config for the go-cmp project:
 3 | # https://github.com/google/go-cmp/blob/master/.travis.yml
 4 | sudo: false
 5 | language: go
 6 | 
 7 | # Build for MacOS and Linux.
 8 | os:
 9 |   - linux
10 |   - osx
11 | 
12 | go: 1.10.x
13 | 
14 | script:
15 |   - diff -u <(echo -n) <(gofmt -d .) # Catch any gofmt errors.
16 |   - go build -o target/anon-$TRAVIS_OS_NAME # Catch any compile errors first.
17 |   - go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... # Run the tests with coverage.
18 | 
19 | after_success:
20 |   - bash <(curl -s https://codecov.io/bash)
21 | 
22 | # On a tag, we will publish the binary produced to the GitHub release.
23 | deploy:
24 |   provider: releases
25 |   api_key:
26 |     secure: "ezDOGZRKLTK4kbdm3cMJg/PbJw7Jx3XwtOwhppNEE5VR7mc3gAZVae3S6zydTnDICdpPJgCBzyMej2lmfLqCi6rfYIROlZw0IOKQm+V9E01/WTACrlVCmXuoeQArb1Q0KUUqr8buEaLgrau4fay/StVfgk2tAjrF02GWk8vNu4IglahUqR5oRqkBmnZrJD5i0Y9vZRuUa0y7YWVBQMknQHxrGTS3SzCfLAAuIggigQt8AfkfC3iWDllQnH4ElIKpc1qv8dfVu2qsqHxwlWrFHHGFVSDAgX1dvgtmR38NY9j+fXiJQ7gvFJzTcyemZuB1w7HYjr7Zk0+9SB5nJV5pZDTeGXLyO547HqAGDe/d4L1uEXBtpkImopz4qekKRCeG/jcUE6iTp9ZmZOGuLpEOOInDRj4pNLyY5RwPBxC7Cfk4J2Lo6/FOdpxv0O+4FsoKkr/+cU2Zm4uf2V8L0c6OwVNbglQe/lDJBLBrR1KUc5OYM07IVSuOJRyR77EUb6BXvK/qF2t7C8s/+n93KqRMcLNTezWy2QDb7LBJ9g1PNi6alF+CU//vtHROYPlxU+QFX/rH8HMD3aIx/bEdWdM06OnzbLsYwBw4tKb1huabJVjpvWANsPwhIk5xGOGhRqywY5aJS2lcHnFl5lvfXh1Szuk8ZMLix6pzm6Qip24QizU="
27 |   file: target/anon-$TRAVIS_OS_NAME
28 |   skip_cleanup: true
29 |   on:
30 |     tags: true
31 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | education, socio-economic status, nationality, personal appearance, race,
10 | religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at albert.pastrana@intenthq.com or
59 | nathan.kleyn@intenthq.com. All complaints will be reviewed and investigated and
60 | will result in a response that is deemed necessary and appropriate to the
61 | circumstances. The project team is obligated to maintain confidentiality with
62 | regard to the reporter of an incident. Further details of specific enforcement
63 | policies may be posted separately.
64 | 
65 | Project maintainers who do not follow or enforce the Code of Conduct in good
66 | faith may face temporary or permanent repercussions as determined by other
67 | members of the project's leadership.
68 | 
69 | ## Attribution
70 | 
71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
72 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
73 | 
74 | [homepage]: https://www.contributor-covenant.org
75 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing to Anon
 2 | 
 3 | ## Code of Conduct
 4 | 
 5 | This project and everyone participating in it is governed by the [Anon Code of Conduct](CODE_OF_CONDUCT.md).
 6 | By participating, you are expected to uphold this code.
 7 | 
 8 | ## How Can I Contribute
 9 | 
10 | Any contribution is welcome: report a bug (and fix it! :-)), request or add a new feature, add some documentation...
11 | Don't be shy about raising a pull request; anything on the following topics will be very welcome:
12 | - New actions to anonymise data
13 | - New input formats (JSON?)
14 | - Bug fixes
15 | 
16 | You can also take a look at the [issues](https://github.com/intenthq/anon/issues) and pick the one you like best.
17 | 
18 | If you are going to contribute, we ask you to do the following:
19 | - Use `gofmt` to format your code
20 | - Check your code with `go vet`, `gocyclo`, `golint`
21 | - Cover the logic with enough tests
22 | - Write decent commit messages
23 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 IntentHQ
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |   <img src="icon.svg" width="300" />
  3 | </p>
  4 | 
  5 | # Anon — A UNIX Command To Anonymise Data
  6 | [![Build Status](https://travis-ci.org/intenthq/anon.svg?branch=master)](https://travis-ci.org/intenthq/anon) <a href="https://codecov.io/gh/intenthq/anon">
  7 |   <img src="https://codecov.io/gh/intenthq/anon/branch/master/graph/badge.svg" />
  8 | </a> [![Go Report Card](https://goreportcard.com/badge/github.com/intenthq/anon)](https://goreportcard.com/report/github.com/intenthq/anon) [![License](https://img.shields.io/npm/l/express.svg)](https://github.com/intenthq/anon/blob/master/LICENSE)
  9 | ![GitHub release](https://img.shields.io/github/release/intenthq/anon.svg)
 10 | 
 11 | Anon is a tool for taking delimited files and anonymising or transforming columns until the output is useful for applications where sensitive information cannot be exposed.
 12 | 
 13 | ## Installation
 14 | 
 15 | Releases of Anon are available as pre-compiled static binaries [on the corresponding GitHub release](https://github.com/intenthq/anon/releases). Simply download the appropriate build for your machine and make sure it's in your `PATH` (or use it directly).
 16 | 
 17 | ## Usage
 18 | 
 19 | ```sh
 20 | anon [--config <path to config file, default is ./config.json>]
 21 |      [--output <path to output to, default is STDOUT>]
 22 | ```
 23 | 
 24 | Anon is designed to take input from `STDIN` and by default will output the anonymised file to `STDOUT`:
 25 | 
 26 | ```sh
 27 | anon < some_file.csv > some_file_anonymised.csv
 28 | ```
 29 | 
 30 | ### Configuration
 31 | 
 32 | In order to be useful, Anon needs to be told what you want to do to each column of the CSV. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
 33 | 
 34 | ```json5
 35 | {
 36 |   "csv": {
 37 |     "delimiter": ","
 38 |   },
 39 |   // Optionally define a number of rows to randomly sample down to.
 40 |   // To do it, it will hash (using FNV-1 32 bits) the column with the ID
 41 |   // in it and will mod the result by the value specified to decide if the
 42 |   // row is included or not -> include = hash(idColumn) % mod == 0
 43 |   "sampling": {
 44 |     // Number used to mod the hash of the id and determine if the row
 45 |     // has to be included in the sample or not
 46 |     "mod": 30000,
 47 |     // Specify the column in which a unique ID exists on which the sampling
 48 |     // can be performed. Indices are 0 based, so this would sample on the
 49 |     // first column.
 50 |     "idColumn": 0
 51 |   },
 52 |   // An array of actions to take on each column - indices are 0 based, so index
 53 |   // 0 in this array corresponds to column 1, and so on.
 54 |   //
 55 |   // There must be an action for every column in the CSV.
 56 |   "actions": [
 57 |     {
 58 |       // The no-op, leaves the input unchanged.
 59 |       "name": "nothing"
 60 |     },
 61 |     {
 62 |       // Takes a UK format postcode (eg. W1W 8BE) and just keeps the outcode
 63 |       // (eg. W1W).
 64 |       "name": "outcode"
 65 |     },
 66 |     {
 67 |       // Hash (SHA1) the input.
 68 |       "name": "hash",
 69 |       // Optional salt that will be appended to the input.
 70 |       // If not defined, a random salt will be generated
 71 |       "salt": "salt"
 72 |     },
 73 |     {
 74 |       // Given a date, just keep the year.
 75 |       "name": "year",
 76 |       "dateConfig": {
 77 |         // Define the format of the input date here, as a Go time layout.
 78 |         "format": "20060102"
 79 |       }
 80 |     },
 81 |     {
 82 |       // Summarise a range of values.
 83 |       "name": "ranges",
 84 |       // A list of ranges. A value is replaced by the output of the first
 85 |       // range that contains it.
 86 |       "rangeConfig": [
 87 |         // For example, this will take values between 0 and 100, and convert
 88 |         // them to the string "0-100".
 89 |         // You can use one of (gt, gte) and one of (lt, lte), but not both
 90 |         // members of the same pair at the same time.
 91 |         // You also need to define at least one of (gt, gte, lt, lte).
 92 |         {
 93 |           "gte": 0,
 94 |           "lt": 100,
 95 |           "output": "0-100"
 96 |         }
 97 |       ]
 98 |     }
 99 |   ]
100 | }
101 | ```
102 | 
103 | ## Contributing
104 | 
105 | Any contribution will be welcome, please refer to our [contributing guidelines](CONTRIBUTING.md) for more information.
106 | 
107 | ## License
108 | 
109 | This project is [licensed under the MIT license](LICENSE).
110 | 
111 | The icon is by [Pixel Perfect](https://www.flaticon.com/authors/pixel-perfect) from [Flaticon](https://www.flaticon.com/), and is licensed under a [Creative Commons 3.0 BY](http://creativecommons.org/licenses/by/3.0/) license.
112 | 


--------------------------------------------------------------------------------
/anonymisations.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"crypto/sha1"
  5 | 	"errors"
  6 | 	"fmt"
  7 | 	"io"
  8 | 	"math/rand"
  9 | 	"strconv"
 10 | 	"strings"
 11 | 	"time"
 12 | )
 13 | 
// Anonymisation is a function that transforms a string into another one.
// It returns the transformed value, or an error when the input cannot be
// processed.
type Anonymisation func(string) (string, error)
 16 | 
// DateConfig stores the format (layout) of an input date.
type DateConfig struct {
	// Format is handed to time.Parse by the "year" action, so it has to be
	// a Go reference layout such as "20060102".
	Format string
}
 21 | 
// RangeConfig stores configuration to define a range of values.
//
// Every field is a pointer so that a bound that was not configured can be
// told apart (nil) from a bound of zero.
type RangeConfig struct {
	Gt     *float64 // exclusive lower bound
	Gte    *float64 // inclusive lower bound
	Lt     *float64 // exclusive upper bound
	Lte    *float64 // inclusive upper bound
	Output *string  // value emitted for inputs that fall inside the range
}
 30 | 
// ActionConfig stores the config of an anonymisation action.
type ActionConfig struct {
	// Name selects the action: "nothing", "outcode", "hash", "year" or
	// "ranges".
	Name string
	// Salt is used by the "hash" action; when nil a random salt is generated.
	Salt *string
	// DateConfig is used by the "year" action.
	DateConfig DateConfig
	// RangeConfig is the list of ranges used by the "ranges" action.
	RangeConfig []RangeConfig
}
 38 | 
 39 | // Returns an array of anonymisations according to the config
 40 | func anonymisations(configs *[]ActionConfig) ([]Anonymisation, error) {
 41 | 	var err error
 42 | 	res := make([]Anonymisation, len(*configs))
 43 | 	for i, config := range *configs {
 44 | 		if res[i], err = config.create(); err != nil {
 45 | 			return nil, err
 46 | 		}
 47 | 	}
 48 | 	return res, nil
 49 | }
 50 | 
 51 | // Returns the configured salt or a random one
 52 | // if it's not set.
 53 | func (ac *ActionConfig) saltOrRandom() string {
 54 | 	if ac.Salt != nil {
 55 | 		return *ac.Salt
 56 | 	}
 57 | 	return strconv.Itoa(rand.Int())
 58 | }
 59 | 
 60 | func (ac *ActionConfig) create() (Anonymisation, error) {
 61 | 	switch ac.Name {
 62 | 	case "nothing":
 63 | 		return identity, nil
 64 | 	case "outcode":
 65 | 		return outcode, nil
 66 | 	case "hash":
 67 | 		return hash(ac.saltOrRandom()), nil
 68 | 	case "year":
 69 | 		return year(ac.DateConfig.Format)
 70 | 	case "ranges":
 71 | 		return ranges(ac.RangeConfig)
 72 | 	}
 73 | 	return nil, fmt.Errorf("can't create an action with name %s", ac.Name)
 74 | }
 75 | 
 76 | // The no-op, returns the input unchanged.
 77 | func identity(s string) (string, error) {
 78 | 	return s, nil
 79 | }
 80 | 
 81 | // Hashes (SHA1) the input.
 82 | func hash(salt string) Anonymisation {
 83 | 	return func(s string) (string, error) {
 84 | 		h := sha1.New()
 85 | 		io.WriteString(h, s)
 86 | 		io.WriteString(h, salt)
 87 | 		return fmt.Sprintf("%x", h.Sum(nil)), nil
 88 | 	}
 89 | }
 90 | 
 91 | // Takes a UK format postcode (eg. W1W 8BE) and just keeps
 92 | // the outcode (eg. W1W).
 93 | // i.e. returns the prefix of the input until it finds a space
 94 | func outcode(s string) (string, error) {
 95 | 	return strings.Split(s, " ")[0], nil
 96 | }
 97 | 
 98 | // Given a date format/layout, it returns a function that
 99 | // given a date in that format, just keeps the year.
100 | // If either the format is invalid or the year doesn't
101 | // match that format, it will return an error and
102 | // the input unchanged
103 | func year(format string) (Anonymisation, error) {
104 | 	if _, err := time.Parse(format, format); err != nil {
105 | 		return nil, err
106 | 	}
107 | 	return func(s string) (string, error) {
108 | 		t, err := time.Parse(format, s)
109 | 		if err != nil {
110 | 			return s, err
111 | 		}
112 | 		return strconv.Itoa(t.Year()), nil
113 | 	}, nil
114 | }
115 | 
116 | // Given a list of ranges, it will summarise numeric
117 | // values into groups of values, each group defined
118 | // by a range and an output
119 | func ranges(ranges []RangeConfig) (Anonymisation, error) {
120 | 	for _, rc := range ranges {
121 | 		if rc.Gt != nil && rc.Gte != nil || rc.Lt != nil && rc.Lte != nil {
122 | 			return nil, errors.New("you can only specify one of (gt, gte) and (lt, lte)")
123 | 		} else if rc.Gt == nil && rc.Gte == nil && rc.Lt == nil && rc.Lte == nil {
124 | 			return nil, errors.New("you need to specify at least one of gt, gte, lt, lte")
125 | 		} else if rc.Output == nil {
126 | 			return nil, errors.New("you need to specify the output for a range")
127 | 		}
128 | 	}
129 | 	return func(s string) (string, error) {
130 | 		v, err := strconv.ParseFloat(s, 64)
131 | 		if err != nil {
132 | 			return s, err
133 | 		}
134 | 		for _, rang := range ranges {
135 | 			if rang.contains(v) {
136 | 				return *rang.Output, nil
137 | 			}
138 | 		}
139 | 		return s, errors.New("No range defined for value")
140 | 	}, nil
141 | }
142 | 
143 | func (r *RangeConfig) contains(v float64) bool {
144 | 	return (r.Gt == nil && r.Gte == nil || r.Gt != nil && *r.Gt < v || r.Gte != nil && *r.Gte <= v) &&
145 | 		(r.Lt == nil && r.Lte == nil || r.Lt != nil && *r.Lt > v || r.Lte != nil && *r.Lte >= v)
146 | }
147 | 


--------------------------------------------------------------------------------
/anonymisations_test.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"math/rand"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/leanovate/gopter"
  8 | 	"github.com/leanovate/gopter/gen"
  9 | 	"github.com/leanovate/gopter/prop"
 10 | 	"github.com/stretchr/testify/assert"
 11 | 	"github.com/stretchr/testify/require"
 12 | )
 13 | 
// salt is the fixed salt shared by the tests. It is a variable rather than a
// constant because the tests take its address.
var salt = "jump"

// seed is passed to rand.Seed so that the generated salt is reproducible.
const seed = int64(1)

// firstSalt is the first random salt generated with the seed above.
const firstSalt = "5577006791947779410"
 20 | 
 21 | // can't test that the functions are equal because of https://github.com/stretchr/testify/issues/182
 22 | // and https://github.com/stretchr/testify/issues/159#issuecomment-99557398
 23 | // will have to test that the functions return the same
 24 | func assertAnonymisationFunction(t *testing.T, expected Anonymisation, actual Anonymisation, value string) {
 25 | 	require.NotNil(t, expected)
 26 | 	require.NotNil(t, actual)
 27 | 	expectedRes, expectedErr := expected(value)
 28 | 	actualRes, actualErr := actual(value)
 29 | 	assert.Equal(t, expectedRes, actualRes)
 30 | 	assert.Equal(t, expectedErr, actualErr)
 31 | }
 32 | 
 33 | func TestAnonymisations(t *testing.T) {
 34 | 	t.Run("a valid configuration", func(t *testing.T) {
 35 | 		conf := &[]ActionConfig{
 36 | 			ActionConfig{
 37 | 				Name: "nothing",
 38 | 			},
 39 | 			ActionConfig{
 40 | 				Name: "hash",
 41 | 				Salt: &salt,
 42 | 			},
 43 | 		}
 44 | 		anons, err := anonymisations(conf)
 45 | 		assert.NoError(t, err)
 46 | 		assertAnonymisationFunction(t, identity, anons[0], "a")
 47 | 		assertAnonymisationFunction(t, hash(salt), anons[1], "a")
 48 | 	})
 49 | 	t.Run("an invalid configuration", func(t *testing.T) {
 50 | 		conf := &[]ActionConfig{ActionConfig{Name: "year", DateConfig: DateConfig{Format: "3333"}}}
 51 | 		anons, err := anonymisations(conf)
 52 | 		assert.Error(t, err, "should return an error")
 53 | 		assert.Nil(t, anons)
 54 | 	})
 55 | }
 56 | 
 57 | func TestActionConfigSaltOrRandom(t *testing.T) {
 58 | 	t.Run("if salt is not specified", func(t *testing.T) {
 59 | 		rand.Seed(seed)
 60 | 		acNoSalt := ActionConfig{Name: "hash"}
 61 | 		assert.Equal(t, firstSalt, acNoSalt.saltOrRandom(), "should return a random salt")
 62 | 	})
 63 | 	t.Run("if salt is specified", func(t *testing.T) {
 64 | 		emptySalt := ""
 65 | 		acEmptySalt := ActionConfig{Name: "hash", Salt: &emptySalt}
 66 | 		assert.Empty(t, acEmptySalt.saltOrRandom(), "should return the empty salt if empty")
 67 | 
 68 | 		acSalt := ActionConfig{Name: "hash", Salt: &salt}
 69 | 		assert.Equal(t, "jump", acSalt.saltOrRandom(), "should return the salt")
 70 | 	})
 71 | }
 72 | 
// TestActionConfigCreate checks that create returns the right action for
// every supported name and an error for unknown names or invalid
// action-specific configuration.
func TestActionConfigCreate(t *testing.T) {
	t.Run("invalid name", func(t *testing.T) {
		ac := ActionConfig{Name: "invalid name"}
		res, err := ac.create()
		assert.Error(t, err)
		assert.Nil(t, res)
	})
	t.Run("identity", func(t *testing.T) {
		ac := ActionConfig{Name: "nothing"}
		res, err := ac.create()
		assert.NoError(t, err)
		assertAnonymisationFunction(t, identity, res, "a")
	})
	t.Run("outcode", func(t *testing.T) {
		ac := ActionConfig{Name: "outcode"}
		res, err := ac.create()
		assert.NoError(t, err)
		assertAnonymisationFunction(t, outcode, res, "a")
	})
	t.Run("hash", func(t *testing.T) {
		t.Run("if salt is not specified uses a random salt", func(t *testing.T) {
			// Seed before create so the generated salt equals firstSalt.
			rand.Seed(1)
			ac := ActionConfig{Name: "hash"}
			res, err := ac.create()
			assert.NoError(t, err)
			assertAnonymisationFunction(t, hash(firstSalt), res, "a")
		})
		t.Run("if salt is specified uses it", func(t *testing.T) {
			ac := ActionConfig{Name: "hash", Salt: &salt}
			res, err := ac.create()
			assert.NoError(t, err)
			assertAnonymisationFunction(t, hash(salt), res, "a")
		})
	})
	t.Run("year", func(t *testing.T) {
		t.Run("with an invalid format", func(t *testing.T) {
			ac := ActionConfig{Name: "year", DateConfig: DateConfig{Format: "11112233"}}
			res, err := ac.create()
			assert.Error(t, err, "should fail")
			assert.Nil(t, res)
		})
		t.Run("with a valid format", func(t *testing.T) {
			ac := ActionConfig{Name: "year", DateConfig: DateConfig{Format: "20060102"}}
			res, err := ac.create()
			assert.NoError(t, err, "should not fail")
			y, err := year("20060102")
			assert.NoError(t, err)
			assertAnonymisationFunction(t, y, res, "21121212")
		})
	})
	t.Run("ranges", func(t *testing.T) {
		// num and output are shared by the range subtests below.
		num := 2.0
		output := "0-100"
		t.Run("range has at least one of lt, lte, gt, gte", func(t *testing.T) {
			ac := ActionConfig{
				Name:        "ranges",
				RangeConfig: []RangeConfig{RangeConfig{Output: &output}},
			}
			r, err := ac.create()
			assert.Error(t, err, "if not should return an error")
			assert.Nil(t, r)
		})
		t.Run("range contains both lt and lte", func(t *testing.T) {
			ac := ActionConfig{
				Name:        "ranges",
				RangeConfig: []RangeConfig{RangeConfig{Lt: &num, Lte: &num, Output: &output}},
			}
			r, err := ac.create()
			assert.Error(t, err, "if not should return an error")
			assert.Nil(t, r)
		})
		t.Run("range contains both gt and gte", func(t *testing.T) {
			ac := ActionConfig{
				Name:        "ranges",
				RangeConfig: []RangeConfig{RangeConfig{Gt: &num, Gte: &num, Output: &output}},
			}
			r, err := ac.create()
			assert.Error(t, err, "if not should return an error")
			assert.Nil(t, r)
		})
		t.Run("range without output defined", func(t *testing.T) {
			ac := ActionConfig{
				Name:        "ranges",
				RangeConfig: []RangeConfig{RangeConfig{Lt: &num, Gte: &num}},
			}
			r, err := ac.create()
			assert.Error(t, err, "if not should return an error")
			assert.Nil(t, r)
		})
		t.Run("valid range", func(t *testing.T) {
			rangeConfigs := []RangeConfig{RangeConfig{Lte: &num, Gte: &num, Output: &output}}
			ac := ActionConfig{
				Name:        "ranges",
				RangeConfig: rangeConfigs,
			}
			r, err := ac.create()
			// The reference action is built from the same configuration.
			expected, _ := ranges(rangeConfigs)
			assert.NoError(t, err)
			assertAnonymisationFunction(t, expected, r, "2")
		})
	})
}
175 | 
176 | func TestIdentity(t *testing.T) {
177 | 	properties := gopter.NewProperties(nil)
178 | 
179 | 	properties.Property("Same output as input", prop.ForAll(
180 | 		func(v string) bool {
181 | 			res, err := identity(v)
182 | 			return assert.NoError(t, err) && assert.Equal(t, v, res)
183 | 		},
184 | 		gen.AnyString(),
185 | 	))
186 | 
187 | 	properties.TestingRun(t)
188 | }
189 | 
190 | func TestHash(t *testing.T) {
191 | 	t.Run("should hash the values using sha1 without a salt", func(t *testing.T) {
192 | 		unsaltedHash := hash("")
193 | 		res, err := unsaltedHash("")
194 | 		assert.NoError(t, err)
195 | 		assert.Equal(t, "da39a3ee5e6b4b0d3255bfef95601890afd80709", res)
196 | 		res, err = unsaltedHash("hasselhoff")
197 | 		assert.Equal(t, "ffe3294fad149c2dd3579cb864a1aebb2201f38d", res)
198 | 	})
199 | 	t.Run("should use the salt if provided", func(t *testing.T) {
200 | 		properties := gopter.NewProperties(nil)
201 | 
202 | 		properties.Property("hash(salt)(s) == hash(s+salt)", prop.ForAll(
203 | 			func(salt string, s string) bool {
204 | 				res1, err1 := hash(salt)(s)
205 | 				res2, err2 := hash("")(s + salt)
206 | 				return assert.NoError(t, err1) && assert.NoError(t, err2) && assert.Equal(t, res1, res2)
207 | 			},
208 | 			gen.AlphaString(),
209 | 			gen.AlphaString(),
210 | 		))
211 | 	})
212 | }
213 | 
214 | func TestOutcode(t *testing.T) {
215 | 	properties := gopter.NewProperties(nil)
216 | 
217 | 	properties.Property("Same output as input", prop.ForAll(
218 | 		func(v1 string, v2 string) bool {
219 | 			res, err := outcode(v1 + " " + v2)
220 | 			return assert.NoError(t, err) && assert.Equal(t, v1, res)
221 | 		},
222 | 		gen.AlphaString(),
223 | 		gen.AlphaString(),
224 | 	))
225 | 
226 | 	properties.TestingRun(t)
227 | }
228 | 
229 | func TestYear(t *testing.T) {
230 | 	f, _ := year("20060102")
231 | 	t.Run("if the date can be parsed", func(t *testing.T) {
232 | 		res, err := f("20120102")
233 | 		assert.NoError(t, err, "should return no error")
234 | 		assert.Equal(t, "2012", res, "should return the year")
235 | 	})
236 | 	t.Run("if the date cannot be parsed", func(t *testing.T) {
237 | 		res, err := f("input")
238 | 		assert.Error(t, err, "should return an error")
239 | 		assert.Equal(t, "input", res, "should return the input unchanged")
240 | 	})
241 | }
242 | func TestRanges(t *testing.T) {
243 | 	min := 0.0
244 | 	max := 100.0
245 | 	output := "0-100"
246 | 	f, _ := ranges([]RangeConfig{RangeConfig{Gt: &min, Lte: &max, Output: &output}})
247 | 	t.Run("if the value is not a float", func(t *testing.T) {
248 | 		res, err := f("input")
249 | 		assert.Error(t, err, "should return an error")
250 | 		assert.Equal(t, "input", res, "should return the input unchanged")
251 | 	})
252 | 	t.Run("if the value is a float", func(t *testing.T) {
253 | 		t.Run("not in any range", func(t *testing.T) {
254 | 			res, err := f("2000")
255 | 			assert.Error(t, err, "should return an error")
256 | 			assert.Equal(t, "2000", res, "should return the input unchanged")
257 | 		})
258 | 		t.Run("inside a range", func(t *testing.T) {
259 | 			res, err := f("10")
260 | 			assert.NoError(t, err, "should return no error")
261 | 			assert.Equal(t, output, res, "should return the output")
262 | 		})
263 | 	})
264 | }
265 | 
// TestRangeConfigContains checks contains for every supported combination of
// bounds, probing just below, at and just above each configured bound.
func TestRangeConfigContains(t *testing.T) {
	min := 0.0
	max := 100.0
	t.Run("range containing only lt", func(t *testing.T) {
		conf := RangeConfig{Lt: &max}
		assert.True(t, conf.contains(max-1))
		assert.False(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
	t.Run("range containing only lte", func(t *testing.T) {
		conf := RangeConfig{Lte: &max}
		assert.True(t, conf.contains(max-1))
		assert.True(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
	t.Run("range containing only gt", func(t *testing.T) {
		conf := RangeConfig{Gt: &min}
		assert.False(t, conf.contains(min-1))
		assert.False(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
	})
	t.Run("range containing only gte", func(t *testing.T) {
		conf := RangeConfig{Gte: &min}
		assert.False(t, conf.contains(min-1))
		assert.True(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
	})
	t.Run("range containing gt and lt", func(t *testing.T) {
		conf := RangeConfig{Gt: &min, Lt: &max}
		assert.False(t, conf.contains(min-1))
		assert.False(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
		assert.False(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
	t.Run("range containing gt and lte", func(t *testing.T) {
		conf := RangeConfig{Gt: &min, Lte: &max}
		assert.False(t, conf.contains(min-1))
		assert.False(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
		assert.True(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
	t.Run("range containing gte and lt", func(t *testing.T) {
		conf := RangeConfig{Gte: &min, Lt: &max}
		assert.False(t, conf.contains(min-1))
		assert.True(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
		assert.False(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
	t.Run("range containing gte and lte", func(t *testing.T) {
		conf := RangeConfig{Gte: &min, Lte: &max}
		assert.False(t, conf.contains(min-1))
		assert.True(t, conf.contains(min))
		assert.True(t, conf.contains(min+1))
		assert.True(t, conf.contains(max))
		assert.False(t, conf.contains(max+1))
	})
}
326 | 


--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"encoding/json"
 5 | 	"os"
 6 | )
 7 | 
// CsvConfig stores the config to read and write the csv file.
type CsvConfig struct {
	// Delimiter is the field separator; it defaults to "," (see
	// defaultCsvConfig).
	Delimiter string
}
12 | 
// SamplingConfig stores the config to know how to sample the file.
//
// Per the README, a row is included when hash(idColumn) % Mod == 0.
type SamplingConfig struct {
	// Mod is the number used to mod the hash of the ID.
	Mod uint32
	// IDColumn is the 0-based index of the column holding the ID.
	IDColumn uint32
}
18 | 
// Config stores all the configuration, as decoded from the JSON config file
// by loadConfig.
type Config struct {
	Csv      CsvConfig      // how to read and write the csv
	Sampling SamplingConfig // how to sample rows
	Actions  []ActionConfig // actions to apply to the columns
}
25 | 
// defaultCsvConfig is the csv configuration used when the config file does
// not provide one.
var defaultCsvConfig = CsvConfig{
	Delimiter: ",",
}

// defaultSamplingConfig is the sampling configuration used when the config
// file does not provide one.
var defaultSamplingConfig = SamplingConfig{
	Mod:      1,
	IDColumn: 0,
}

// defaultActionsConfig is the (empty) list of actions used when the config
// file does not provide one.
var defaultActionsConfig = []ActionConfig{}
36 | 
37 | func loadConfig(filename string) (*Config, error) {
38 | 	file, err := os.Open(filename)
39 | 	defer file.Close()
40 | 	if err != nil {
41 | 		return nil, err
42 | 	}
43 | 	decoder := json.NewDecoder(file)
44 | 	conf := Config{
45 | 		Csv:      defaultCsvConfig,
46 | 		Sampling: defaultSamplingConfig,
47 | 		Actions:  defaultActionsConfig,
48 | 	}
49 | 	err = decoder.Decode(&conf)
50 | 	if err != nil {
51 | 		return nil, err
52 | 	}
53 | 	return &conf, err
54 | }
55 | 


--------------------------------------------------------------------------------
/config_defaults_test.json:
--------------------------------------------------------------------------------
1 | {}
2 | 


--------------------------------------------------------------------------------
/config_invalid_test.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "csv": {
 3 |     "delimiter": ","
 4 |   },
 5 |   "sampling": {
 6 |     "mod": "not a number",
 7 |     "idColumn": 0
 8 |   },
 9 |   "actions": [
10 |     {
11 |       "name": "hash"
12 |     },
13 |     {
14 |       "name": "outcode"
15 |     },
16 |     {
17 |       "name": "year",
18 |       "dateConfig": {
19 |         "format": "20060102"
20 |       }
21 |     },
22 |     {
23 |       "name": "nothing"
24 |     }
25 |   ]
26 | }
27 | 


--------------------------------------------------------------------------------
/config_test.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/stretchr/testify/assert"
 7 | 	"github.com/stretchr/testify/require"
 8 | )
 9 | 
10 | func TestLoadConfig(t *testing.T) {
11 | 	t.Run("if the file doesn't exist", func(t *testing.T) {
12 | 		conf, err := loadConfig("non-existing-file")
13 | 		assert.Nil(t, conf, "should return nil if the file doesn't exist")
14 | 		assert.Error(t, err, "should return the error if the file doesn't exist")
15 | 	})
16 | 	t.Run("if the json can't be decoded", func(t *testing.T) {
17 | 		conf, err := loadConfig("config_invalid_test.json")
18 | 		assert.Nil(t, conf, "should return nil if the json can't be decoded")
19 | 		assert.Error(t, err, "should return the error if the json can't be decoded")
20 | 	})
21 | 	t.Run("default config values", func(t *testing.T) {
22 | 		conf, err := loadConfig("config_defaults_test.json")
23 | 		require.NoError(t, err, "should return no error if the config can be loaded")
24 | 		assert.Equal(t, Config{
25 | 			Csv: CsvConfig{
26 | 				Delimiter: ",",
27 | 			},
28 | 			Sampling: SamplingConfig{
29 | 				Mod:      1,
30 | 				IDColumn: 0,
31 | 			},
32 | 			Actions: []ActionConfig{},
33 | 		}, *conf, "should fill the config with the default values")
34 | 	})
35 | 	t.Run("if the config can be loaded", func(t *testing.T) {
36 | 		gte := 0.0
37 | 		lt := 100.0
38 | 		output := "0-100"
39 | 		conf, err := loadConfig("config_test.json")
40 | 		require.NoError(t, err, "should return no error if the config can be loaded")
41 | 		assert.Equal(t, Config{
42 | 			Csv: CsvConfig{
43 | 				Delimiter: "|",
44 | 			},
45 | 			Sampling: SamplingConfig{
46 | 				Mod:      77,
47 | 				IDColumn: 84,
48 | 			},
49 | 			Actions: []ActionConfig{
50 | 				ActionConfig{
51 | 					Name: "hash",
52 | 				},
53 | 				ActionConfig{
54 | 					Name: "outcode",
55 | 				},
56 | 				ActionConfig{
57 | 					Name: "year",
58 | 					DateConfig: DateConfig{
59 | 						Format: "20060102",
60 | 					},
61 | 				},
62 | 				ActionConfig{
63 | 					Name: "ranges",
64 | 					RangeConfig: []RangeConfig{
65 | 						RangeConfig{
66 | 							Gte:    &gte,
67 | 							Lt:     &lt,
68 | 							Output: &output,
69 | 						},
70 | 					},
71 | 				},
72 | 				ActionConfig{
73 | 					Name: "nothing",
74 | 				},
75 | 			},
76 | 		}, *conf, "should return the config properly decoded")
77 | 	})
78 | }
79 | 


--------------------------------------------------------------------------------
/config_test.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "csv": {
 3 |     "delimiter": "|"
 4 |   },
 5 |   "sampling": {
 6 |     "mod": 77,
 7 |     "idColumn": 84
 8 |   },
 9 |   "actions": [
10 |     {
11 |       "name": "hash"
12 |     },
13 |     {
14 |       "name": "outcode"
15 |     },
16 |     {
17 |       "name": "year",
18 |       "dateConfig": {
19 |         "format": "20060102"
20 |       }
21 |     },
22 |     {
23 |       "name": "ranges",
24 |       "rangeConfig": [
25 |         {
26 |           "gte": 0,
27 |           "lt": 100,
28 |           "output": "0-100"
29 |         }
30 |       ]
31 |     },
32 |     {
33 |       "name": "nothing"
34 |     }
35 |   ]
36 | }
37 | 


--------------------------------------------------------------------------------
/icon.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="iso-8859-1"?>
 2 | <!-- Generator: Adobe Illustrator 19.0.0, SVG Export Plug-In . SVG Version: 6.00 Build 0)  -->
 3 | <svg version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px"
 4 | 	 viewBox="0 0 512.024 512.024" style="enable-background:new 0 0 512.024 512.024;" xml:space="preserve">
 5 | <path style="fill:#607D8B;" d="M261.921,492.459l-91.243-60.843V416c-0.003-4.589-2.942-8.662-7.296-10.112l-29.611-9.877
 6 | 	l-19.563-19.563c-2.554-2.561-6.233-3.653-9.771-2.901l-11.307,2.496c-55.12,16.876-92.853,67.646-93.12,125.291
 7 | 	c0,5.891,4.776,10.667,10.667,10.667h245.333c5.891-0.005,10.663-4.784,10.658-10.675
 8 | 	C266.668,497.762,264.886,494.435,261.921,492.459z"/>
 9 | <path style="fill:#FFE082;" d="M373.345,170.667c-0.651,0.015-1.3,0.079-1.941,0.192H140.641c-0.648-0.114-1.304-0.179-1.963-0.192
10 | 	c-18.24,0-32,22.933-32,53.333c0,29.675,13.099,52.224,30.677,53.333C156.748,341.589,203.489,384,256.012,384
11 | 	s99.264-42.453,118.656-106.667c17.579-1.067,30.677-23.616,30.677-53.333C405.345,193.6,391.585,170.667,373.345,170.667z"/>
12 | <path style="fill:#607D8B;" d="M417.591,375.701l-10.027-2.133c-3.538-0.752-7.216,0.34-9.771,2.901l-19.563,19.563l-29.611,9.877
13 | 	c-4.338,1.455-7.265,5.515-7.275,10.091v15.616l-91.243,60.843c-4.901,3.268-6.225,9.891-2.957,14.792
14 | 	c1.977,2.965,5.303,4.746,8.866,4.749h245.333c5.891,0,10.667-4.776,10.667-10.667C511.602,443.245,473.275,392.247,417.591,375.701
15 | 	z"/>
16 | <g>
17 | 	<path style="fill:#455A64;" d="M259.98,182.272c-7.808-8.128-21.739-11.605-46.635-11.605c-25.877,0-39.595,3.371-47.339,11.605
18 | 		c-5.657,7.043-7.935,16.216-6.229,25.088c0.107,1.92,0.256,3.925,0.256,5.952c-2.001,21.563,13.856,40.665,35.419,42.667
19 | 		c2.411,0.224,4.837,0.224,7.248,0c26.731,0,64-16.235,64-42.667c0-1.771,0-3.52,0.149-5.227
20 | 		C268.389,198.893,265.885,189.483,259.98,182.272z"/>
21 | 	<path style="fill:#455A64;" d="M346.316,182.272c-7.744-8.235-21.461-11.605-47.339-11.605c-24.896,0-38.827,3.477-46.635,11.605
22 | 		c-5.903,7.22-8.4,16.638-6.848,25.835c0,1.707,0.149,3.435,0.149,5.227c0,26.432,37.099,42.667,63.701,42.667
23 | 		c21.859,1.857,41.086-14.358,42.943-36.217c0.182-2.146,0.19-4.303,0.022-6.449c0-2.133,0.128-4.032,0.256-5.952
24 | 		C354.273,198.5,351.987,189.317,346.316,182.272z"/>
25 | </g>
26 | <path style="fill:#607D8B;" d="M375.137,96.149c-5.802-0.992-11.31,2.903-12.309,8.704c-0.053,0.596-0.053,1.196,0,1.792H149.132
27 | 	c0.053-0.596,0.053-1.196,0-1.792c-0.999-5.801-6.508-9.696-12.309-8.704c-82.731,14.208-94.144,28.821-94.144,42.517
28 | 	C42.679,180.779,176.78,192,256.012,192s213.333-11.221,213.333-53.333C469.345,124.971,457.932,110.357,375.137,96.149z"/>
29 | <path style="fill:#455A64;" d="M383.799,125.867c0.824,4.307-1.029,8.686-4.693,11.093c-37.712,20.468-80.21,30.485-123.093,29.013
30 | 	c-42.883,1.471-85.381-8.545-123.093-29.013c-3.665-2.408-5.517-6.787-4.693-11.093c2.477-11.801,5.756-23.419,9.813-34.773
31 | 	c2.599-7.756,5.589-15.374,8.96-22.827C164.279,30.08,188.812,0,213.345,0c15.801,0.228,30.975,6.222,42.667,16.853
32 | 	C267.704,6.222,282.877,0.228,298.679,0c24.533,0,49.067,30.08,66.347,68.267c3.371,7.452,6.361,15.071,8.96,22.827
33 | 	C378.042,102.448,381.321,114.066,383.799,125.867z"/>
34 | <path style="fill:#607D8B;" d="M373.985,91.093C359.905,105.387,326.412,128,256.012,128s-103.893-22.613-117.973-36.907
35 | 	c2.599-7.756,5.589-15.374,8.96-22.827c0.424,0.445,0.783,0.948,1.067,1.493c0.64,0.853,22.187,36.907,107.947,36.907
36 | 	c86.613,0,107.733-36.693,107.947-36.907c0.283-0.546,0.642-1.048,1.067-1.493C368.396,75.719,371.387,83.338,373.985,91.093z"/>
37 | <g>
38 | 	<path style="fill:#455A64;" d="M264.801,495.296L147.468,324.629c-2.693-3.915-7.652-5.577-12.16-4.075l-64,21.333
39 | 		c-5.602,1.822-8.667,7.84-6.846,13.442c0.517,1.59,1.4,3.036,2.579,4.222l53.333,53.333c1.198,1.185,2.66,2.07,4.267,2.581
40 | 		l24.704,8.213v13.653c0.002,3.568,1.787,6.898,4.757,8.875l96,64c4.9,3.27,11.523,1.949,14.794-2.951
41 | 		c2.427-3.636,2.389-8.385-0.095-11.983V495.296z"/>
42 | 	<path style="fill:#455A64;" d="M440.716,341.888l-64-21.333c-4.508-1.502-9.467,0.159-12.16,4.075L247.223,495.296
43 | 		c-3.348,4.848-2.132,11.491,2.716,14.838c3.597,2.484,8.346,2.522,11.983,0.095l96-64c2.976-1.981,4.763-5.321,4.757-8.896V423.68
44 | 		l24.704-8.235c1.606-0.511,3.068-1.396,4.267-2.581l53.333-53.333c4.153-4.178,4.133-10.932-0.045-15.085
45 | 		c-1.186-1.179-2.632-2.062-4.222-2.579V341.888z"/>
46 | </g>
47 | <g>
48 | </g>
49 | <g>
50 | </g>
51 | <g>
52 | </g>
53 | <g>
54 | </g>
55 | <g>
56 | </g>
57 | <g>
58 | </g>
59 | <g>
60 | </g>
61 | <g>
62 | </g>
63 | <g>
64 | </g>
65 | <g>
66 | </g>
67 | <g>
68 | </g>
69 | <g>
70 | </g>
71 | <g>
72 | </g>
73 | <g>
74 | </g>
75 | <g>
76 | </g>
77 | </svg>
78 | 


--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"encoding/csv"
  5 | 	"flag"
  6 | 	"fmt"
  7 | 	"hash/fnv"
  8 | 	"io"
  9 | 	"log"
 10 | 	"math/rand"
 11 | 	"os"
 12 | 	"time"
 13 | )
 14 | 
 15 | func main() {
 16 | 	rand.Seed(time.Now().UTC().UnixNano())
 17 | 	//TODO move args parsing to a function
 18 | 	configFile := flag.String("config", "config.json", "Configuration of the data to be anonymised. Default is 'config.json'")
 19 | 	outputFile := flag.String("output", "", "Output file. Default is stdout.")
 20 | 	flag.Parse()
 21 | 	log.Printf("Using configuration in file %s\n", *configFile)
 22 | 	conf, err := loadConfig(*configFile)
 23 | 	if err != nil {
 24 | 		log.Fatal(err)
 25 | 	}
 26 | 	r := initReader(flag.Arg(0), conf.Csv)
 27 | 	w := initWriter(*outputFile, conf.Csv)
 28 | 	anons, err := anonymisations(&conf.Actions)
 29 | 	if err != nil {
 30 | 		log.Fatal(err)
 31 | 	}
 32 | 
 33 | 	if err := process(r, w, conf, &anons); err != nil {
 34 | 		log.Fatal(err)
 35 | 	}
 36 | }
 37 | 
 38 | func process(r *csv.Reader, w *csv.Writer, conf *Config, anons *[]Anonymisation) error {
 39 | 	i := 0
 40 | 
 41 | 	for {
 42 | 		record, err := r.Read()
 43 | 		if err == io.EOF {
 44 | 			break
 45 | 		} else if pe, ok := err.(*csv.ParseError); ok && pe.Err == csv.ErrFieldCount {
 46 | 			// we just print the error and skip the record
 47 | 			log.Print(err)
 48 | 		} else if err != nil {
 49 | 			return err
 50 | 		} else if int64(conf.Sampling.IDColumn) >= int64(len(record)) {
 51 | 			return fmt.Errorf("id column (%d) out of range, record has %d columns", conf.Sampling.IDColumn, len(record))
 52 | 		} else if sample(record[conf.Sampling.IDColumn], conf.Sampling) {
 53 | 			anonymised, err := anonymise(record, *anons)
 54 | 			if err != nil {
 55 | 				// we just print the error and skip the record
 56 | 				log.Print(err)
 57 | 			} else {
 58 | 				w.Write(anonymised)
 59 | 			}
 60 | 			//TODO decide how often do we want to flush
 61 | 			if i%100 == 0 {
 62 | 				w.Flush()
 63 | 			}
 64 | 		}
 65 | 		i++
 66 | 	}
 67 | 	w.Flush()
 68 | 	return nil
 69 | }
 70 | 
 71 | func sample(s string, conf SamplingConfig) bool {
 72 | 	h := fnv.New32a()
 73 | 	h.Write([]byte(s))
 74 | 	return h.Sum32()%conf.Mod == 0
 75 | }
 76 | 
 77 | func initReader(filename string, conf CsvConfig) *csv.Reader {
 78 | 	reader := csv.NewReader(fileOr(filename, os.Stdin, os.Open))
 79 | 	reader.Comma = []rune(conf.Delimiter)[0]
 80 | 	return reader
 81 | }
 82 | 
 83 | func initWriter(filename string, conf CsvConfig) *csv.Writer {
 84 | 	writer := csv.NewWriter(fileOr(filename, os.Stdout, os.Create))
 85 | 	writer.Comma = []rune(conf.Delimiter)[0]
 86 | 	return writer
 87 | }
 88 | 
 89 | // If filename is empty, will return `def`, if it's not, will return the
 90 | // result of the function `action` after passing `filename` ot it.
 91 | func fileOr(filename string, def *os.File, action func(string) (*os.File, error)) *os.File {
 92 | 	if filename == "" {
 93 | 		return def
 94 | 	}
 95 | 	f, err := action(filename)
 96 | 	if err != nil {
 97 | 		log.Fatal(err)
 98 | 	}
 99 | 	return f
100 | }
101 | 
102 | func anonymise(record []string, anons []Anonymisation) ([]string, error) {
103 | 	var err error
104 | 	for i := range record {
105 | 		// TODO decide if we fail if not enough anonmisations are defined
106 | 		// or we just skip the column (i.e. we apply identity)
107 | 		if i < len(anons) {
108 | 			if record[i], err = anons[i](record[i]); err != nil {
109 | 				return nil, err
110 | 			}
111 | 		}
112 | 	}
113 | 	return record, nil
114 | }
115 | 


--------------------------------------------------------------------------------
/main_test.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/csv"
  6 | 	"io/ioutil"
  7 | 	"log"
  8 | 	"os"
  9 | 	"strings"
 10 | 	"testing"
 11 | 
 12 | 	"github.com/stretchr/testify/assert"
 13 | )
 14 | 
 15 | func TestInitReader(t *testing.T) {
 16 | 	t.Run("with an empty filename", func(t *testing.T) {
 17 | 		tmpfile := tmpFile("content")
 18 | 		defer os.Remove(tmpfile.Name()) // clean up
 19 | 
 20 | 		oldStdin := os.Stdin
 21 | 		defer func() { os.Stdin = oldStdin }() // Restore original Stdin
 22 | 		os.Stdin = tmpfile
 23 | 
 24 | 		r := initReader("", defaultCsvConfig)
 25 | 		record, err := r.Read()
 26 | 
 27 | 		assert.NoError(t, err, "should return no error")
 28 | 		assert.Equal(t, []string{"content"}, record, "should return a csv reader that reads from stdin")
 29 | 	})
 30 | 	t.Run("with a valid filename", func(t *testing.T) {
 31 | 		tmpfile := tmpFile("content")
 32 | 		defer os.Remove(tmpfile.Name()) // clean up
 33 | 
 34 | 		r := initReader(tmpfile.Name(), defaultCsvConfig)
 35 | 		record, err := r.Read()
 36 | 
 37 | 		assert.NoError(t, err, "should return no error")
 38 | 		assert.Equal(t, []string{"content"}, record, "should return a csv reader that reads from the file")
 39 | 	})
 40 | }
 41 | 
 42 | func tmpFile(content string) *os.File {
 43 | 	tmpfile, err := ioutil.TempFile("", "anon-test")
 44 | 	if err != nil {
 45 | 		log.Fatal(err)
 46 | 	}
 47 | 	ioutil.WriteFile(tmpfile.Name(), []byte("content"), os.ModePerm)
 48 | 	return tmpfile
 49 | }
 50 | 
 51 | func TestInitWriter(t *testing.T) {
 52 | 	t.Run("with an empty filename", func(t *testing.T) {
 53 | 		tmpfile := tmpFile("")
 54 | 		defer os.Remove(tmpfile.Name()) // clean up
 55 | 
 56 | 		oldStdout := os.Stdout
 57 | 		defer func() { os.Stdout = oldStdout }() // Restore original Stdout
 58 | 		os.Stdout = tmpfile
 59 | 
 60 | 		w := initWriter("", defaultCsvConfig)
 61 | 		err := w.Write([]string{"csv", "content"})
 62 | 		w.Flush()
 63 | 
 64 | 		content, _ := ioutil.ReadFile(tmpfile.Name())
 65 | 		assert.NoError(t, err, "should return no error")
 66 | 		assert.Equal(t, "csv,content\n", string(content), "should return a csv writer that writes to stdout")
 67 | 	})
 68 | 	t.Run("with a valid filename", func(t *testing.T) {
 69 | 		tmpfile := tmpFile("")
 70 | 		defer os.Remove(tmpfile.Name()) // clean up
 71 | 
 72 | 		w := initWriter(tmpfile.Name(), defaultCsvConfig)
 73 | 		err := w.Write([]string{"csv", "content"})
 74 | 		w.Flush()
 75 | 
 76 | 		content, _ := ioutil.ReadFile(tmpfile.Name())
 77 | 		assert.NoError(t, err, "should return no error")
 78 | 		assert.Equal(t, "csv,content\n", string(content), "should return a csv writer that writes to stdout")
 79 | 	})
 80 | }
 81 | func TestFileOr(t *testing.T) {
 82 | 	assert.Equal(t, fileOr("", os.Stdin, stdOutOk), os.Stdin, "with an empty filename returns the default value")
 83 | 	assert.Equal(t, fileOr("something", os.Stdin, stdOutOk), os.Stdout, "with non empty filename returns the value returned by the action")
 84 | }
 85 | 
 86 | func stdOutOk(s string) (*os.File, error) {
 87 | 	return os.Stdout, nil
 88 | }
 89 | 
 90 | func TestAnonymise(t *testing.T) {
 91 | 	record := []string{"a", "b", "c"}
 92 | 	actions := []Anonymisation{identity, hash(""), identity}
 93 | 	output := []string{"a", "e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98", "c"}
 94 | 	res, err := anonymise(record, actions)
 95 | 	assert.NoError(t, err)
 96 | 	assert.Equal(t, output, res, "should apply anonymisation functions to each column in the record")
 97 | }
 98 | 
 99 | func TestSample(t *testing.T) {
100 | 	conf := SamplingConfig{
101 | 		Mod: 2,
102 | 	}
103 | 	assert.True(t, sample("a", conf))
104 | 	assert.False(t, sample("b", conf))
105 | }
106 | 
107 | func TestProcess(t *testing.T) {
108 | 	config := func(mod uint32, idColumn uint32) *Config {
109 | 		return &Config{Sampling: SamplingConfig{Mod: mod, IDColumn: idColumn}}
110 | 	}
111 | 	anons := &[]Anonymisation{identity, outcode}
112 | 	createReaderAndWriter := func(in string) (*csv.Reader, *csv.Writer, *bytes.Buffer) {
113 | 		var out bytes.Buffer
114 | 		r := csv.NewReader(strings.NewReader(in))
115 | 
116 | 		w := csv.NewWriter(&out)
117 | 		return r, w, &out
118 | 	}
119 | 	t.Run("when the id column is out of range", func(t *testing.T) {
120 | 		r, w, out := createReaderAndWriter("a,b c\nd,e f\n")
121 | 
122 | 		err := process(r, w, config(1, 100), anons)
123 | 		assert.Error(t, err, "should return an error")
124 | 		assert.Equal(t, "", out.String(), "shouldn't write any output")
125 | 	})
126 | 	t.Run("when there is an error writing the output", func(t *testing.T) {
127 | 		var out bytes.Buffer
128 | 		f, _ := os.Open("non existing file")
129 | 		r := csv.NewReader(f)
130 | 
131 | 		w := csv.NewWriter(&out)
132 | 		err := process(r, w, config(1, 0), anons)
133 | 		assert.Error(t, err, "should return an error")
134 | 	})
135 | 	t.Run("when there is an error processing one of the rows", func(t *testing.T) {
136 | 		r, w, out := createReaderAndWriter("20020202\nfail\n10010101")
137 | 
138 | 		y, _ := year("20060102")
139 | 		err := process(r, w, config(1, 0), &[]Anonymisation{y})
140 | 		assert.NoError(t, err, "should not return an error")
141 | 		assert.Equal(t, "2002\n1001\n", out.String(), "should skip that row")
142 | 	})
143 | 	t.Run("when sampling is defined", func(t *testing.T) {
144 | 		r, w, out := createReaderAndWriter("a,b c\nd,e f\ng,h i\nj,k l\n")
145 | 
146 | 		err := process(r, w, config(2, 0), anons)
147 | 		assert.NoError(t, err, "should return no error")
148 | 		assert.Equal(t, "a,b\ng,h\n", out.String(), "should process some rows")
149 | 	})
150 | 	t.Run("when all the rows are valid", func(t *testing.T) {
151 | 		r, w, out := createReaderAndWriter("a,b c\nd,e f\n")
152 | 
153 | 		err := process(r, w, config(1, 0), anons)
154 | 		assert.NoError(t, err, "should return no error")
155 | 		assert.Equal(t, "a,b\nd,e\n", out.String(), "should process all rows")
156 | 	})
157 | }
158 | 


--------------------------------------------------------------------------------