├── .gitignore
├── .travis.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── anonymisations.go
├── anonymisations_test.go
├── config.go
├── config_defaults_test.json
├── config_invalid_test.json
├── config_test.go
├── config_test.json
├── icon.svg
├── main.go
└── main_test.go
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | anon
3 | target
4 | *.exe
5 | *.exe~
6 | *.dll
7 | *.so
8 | *.dylib
9 |
10 | # Test binary, build with `go test -c`
11 | *.test
12 |
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | ---
2 | # Modified version of the Travis CI config for the go-cmp project:
3 | # https://github.com/google/go-cmp/blob/master/.travis.yml
4 | sudo: false
5 | language: go
6 |
7 | # Build for MacOS and Linux.
8 | os:
9 | - linux
10 | - osx
11 |
12 | go: 1.10.x
13 |
14 | script:
15 | - diff -u <(echo -n) <(gofmt -d .) # Catch any gofmt errors.
16 | - go build -o target/anon-$TRAVIS_OS_NAME # Catch any compile errors first.
17 | - go test -v -race -coverprofile=coverage.txt -covermode=atomic ./... # Run the tests with coverage.
18 |
19 | after_success:
20 | - bash <(curl -s https://codecov.io/bash)
21 |
22 | # On a tag, we will publish the binary produced to the GitHub release.
23 | deploy:
24 | provider: releases
25 | api_key:
26 | secure: "ezDOGZRKLTK4kbdm3cMJg/PbJw7Jx3XwtOwhppNEE5VR7mc3gAZVae3S6zydTnDICdpPJgCBzyMej2lmfLqCi6rfYIROlZw0IOKQm+V9E01/WTACrlVCmXuoeQArb1Q0KUUqr8buEaLgrau4fay/StVfgk2tAjrF02GWk8vNu4IglahUqR5oRqkBmnZrJD5i0Y9vZRuUa0y7YWVBQMknQHxrGTS3SzCfLAAuIggigQt8AfkfC3iWDllQnH4ElIKpc1qv8dfVu2qsqHxwlWrFHHGFVSDAgX1dvgtmR38NY9j+fXiJQ7gvFJzTcyemZuB1w7HYjr7Zk0+9SB5nJV5pZDTeGXLyO547HqAGDe/d4L1uEXBtpkImopz4qekKRCeG/jcUE6iTp9ZmZOGuLpEOOInDRj4pNLyY5RwPBxC7Cfk4J2Lo6/FOdpxv0O+4FsoKkr/+cU2Zm4uf2V8L0c6OwVNbglQe/lDJBLBrR1KUc5OYM07IVSuOJRyR77EUb6BXvK/qF2t7C8s/+n93KqRMcLNTezWy2QDb7LBJ9g1PNi6alF+CU//vtHROYPlxU+QFX/rH8HMD3aIx/bEdWdM06OnzbLsYwBw4tKb1huabJVjpvWANsPwhIk5xGOGhRqywY5aJS2lcHnFl5lvfXh1Szuk8ZMLix6pzm6Qip24QizU="
27 | file: target/anon-$TRAVIS_OS_NAME
28 | skip_cleanup: true
29 | on:
30 | tags: true
31 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as
6 | contributors and maintainers pledge to making participation in our project and
7 | our community a harassment-free experience for everyone, regardless of age, body
8 | size, disability, ethnicity, gender identity and expression, level of experience,
9 | education, socio-economic status, nationality, personal appearance, race,
10 | religion, or sexual identity and orientation.
11 |
12 | ## Our Standards
13 |
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 |
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 | advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 | address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 | professional setting
33 |
34 | ## Our Responsibilities
35 |
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 |
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 |
46 | ## Scope
47 |
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 |
55 | ## Enforcement
56 |
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at albert.pastrana@intenthq.com or
59 | nathan.kleyn@intenthq.com. All complaints will be reviewed and investigated and
60 | will result in a response that is deemed necessary and appropriate to the
61 | circumstances. The project team is obligated to maintain confidentiality with
62 | regard to the reporter of an incident. Further details of specific enforcement
63 | policies may be posted separately.
64 |
65 | Project maintainers who do not follow or enforce the Code of Conduct in good
66 | faith may face temporary or permanent repercussions as determined by other
67 | members of the project's leadership.
68 |
69 | ## Attribution
70 |
71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
72 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
73 |
74 | [homepage]: https://www.contributor-covenant.org
75 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Anon
2 |
3 | ## Code of Conduct
4 |
5 | This project and everyone participating in it is governed by the [Anon Code of Conduct](CODE_OF_CONDUCT.md).
6 | By participating, you are expected to uphold this code.
7 |
8 | ## How Can I Contribute
9 |
10 | Any contribution is welcome: report a bug (and fix it! :-)), request or add a new feature, improve the documentation...
11 | Don't be shy about raising a pull request; anything on the following topics will be very welcome:
12 | - New actions to anonymise data
13 | - New input formats (JSON?)
14 | - Bug fixes
15 |
16 | You can also take a look at the [issues](https://github.com/intenthq/anon/issues) and pick the one you like best.
17 |
18 | If you are going to contribute, we ask you to do the following:
19 | - Use `gofmt` to format your code
20 | - Check your code with `go vet`, `gocyclo`, `golint`
21 | - Cover the logic with enough tests
22 | - Write decent commit messages
23 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 IntentHQ
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | # Anon — A UNIX Command To Anonymise Data
6 | [Build Status](https://travis-ci.org/intenthq/anon)
7 |
8 | [Go Report Card](https://goreportcard.com/report/github.com/intenthq/anon) [License](https://github.com/intenthq/anon/LICENSE)
9 | 
10 |
11 | Anon is a tool for taking delimited files and anonymising or transforming columns until the output is useful for applications where sensitive information cannot be exposed.
12 |
13 | ## Installation
14 |
15 | Releases of Anon are available as pre-compiled static binaries [on the corresponding GitHub release](https://github.com/intenthq/anon/releases). Simply download the appropriate build for your machine and make sure it's in your `PATH` (or use it directly).
16 |
17 | ## Usage
18 |
19 | ```sh
20 | anon [--config <path to config file>]
21 | [--output <path to output file>]
22 | ```
23 |
24 | Anon is designed to take input from `STDIN` and by default will output the anonymised file to `STDOUT`:
25 |
26 | ```sh
27 | anon < some_file.csv > some_file_anonymised.csv
28 | ```
29 |
30 | ### Configuration
31 |
32 | In order to be useful, Anon needs to be told what you want to do to each column of the CSV. The config is defined as a JSON file (defaults to a file called `config.json` in the current directory):
33 |
34 | ```json5
35 | {
36 | "csv": {
37 | "delimiter": ","
38 | },
39 | // Optionally define a number of rows to randomly sample down to.
40 | // To do it, it will hash (using FNV-1 32 bits) the column with the ID
41 | // in it and will mod the result by the value specified to decide if the
42 | // row is included or not -> include = hash(idColumn) % mod == 0
43 | "sampling": {
44 | // Number used to mod the hash of the id and determine if the row
45 | // has to be included in the sample or not
46 | "mod": 30000,
47 | // Specify in which column a unique ID exists on which the sampling can
48 | // be performed. Indices are 0 based, so this would sample on the first
49 | // column.
50 | "idColumn": 0
51 | },
52 | // An array of actions to take on each column - indices are 0 based, so index
53 | // 0 in this array corresponds to column 1, and so on.
54 | //
55 | // There must be an action for every column in the CSV.
56 | "actions": [
57 | {
58 | // The no-op, leaves the input unchanged.
59 | "name": "nothing"
60 | },
61 | {
62 | // Takes a UK format postcode (eg. W1W 8BE) and just keeps the outcode
63 | // (eg. W1W).
64 | "name": "outcode"
65 | },
66 | {
67 | // Hash (SHA1) the input.
68 | "name": "hash",
69 | // Optional salt that will be appended to the input.
70 | // If not defined, a random salt will be generated
71 | "salt": "salt"
72 | },
73 | {
74 | // Given a date, just keep the year.
75 | "name": "year",
76 | "dateConfig": {
77 | // Define the format of the input date here, as a Go time layout (reference date 2006-01-02).
78 | "format": "20060102"
79 | }
80 | },
81 | {
82 | // Summarise a range of values.
83 | "name": "ranges",
84 | // An array of range definitions. The output of the first range that
85 | // contains the value is used.
86 | "rangeConfig": [
87 | // For example, this will take values between 0 and 100, and convert
88 | // them to the string "0-100".
89 | // You can use one of (gt, gte) and (lt, lte) but not both at the
90 | // same time.
91 | // You also need to define at least one of (gt, gte, lt, lte).
92 | {
93 | "gte": 0,
94 | "lt": 100,
95 | "output": "0-100"
96 | }
97 | ]
98 | }
99 | ]
100 | }
101 | ```
102 |
103 | ## Contributing
104 |
105 | Any contribution will be welcome, please refer to our [contributing guidelines](CONTRIBUTING.md) for more information.
106 |
107 | ## License
108 |
109 | This project is [licensed under the MIT license](LICENSE).
110 |
111 | The icon is by [Pixel Perfect](https://www.flaticon.com/authors/pixel-perfect) from [Flaticon](https://www.flaticon.com/), and is licensed under a [Creative Commons 3.0 BY](http://creativecommons.org/licenses/by/3.0/) license.
112 |
--------------------------------------------------------------------------------
/anonymisations.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "crypto/sha1"
5 | "errors"
6 | "fmt"
7 | "io"
8 | "math/rand"
9 | "strconv"
10 | "strings"
11 | "time"
12 | )
13 |
// Anonymisation is a function that transforms a string into another one.
// It returns the transformed value, or an error when the input cannot be
// processed; the anonymisations built in this file hand back the input
// unchanged alongside the error in that case.
type Anonymisation func(string) (string, error)
16 |
// DateConfig stores the format (layout) of an input date.
type DateConfig struct {
	// Format is the layout handed to time.Parse by the "year" action, so it
	// has to be written in terms of Go's reference time (e.g. "20060102").
	Format string
}
21 |
// RangeConfig describes one bucket of the "ranges" action: the bounds a
// numeric value has to satisfy and the text that replaces it when it does.
// Bounds are pointers so that "not set" can be told apart from zero.
type RangeConfig struct {
	Gt, Gte *float64 // lower bound: exclusive (Gt) or inclusive (Gte)
	Lt, Lte *float64 // upper bound: exclusive (Lt) or inclusive (Lte)
	Output  *string  // replacement emitted for values inside the range
}
30 |
// ActionConfig stores the config of an anonymisation action.
type ActionConfig struct {
	// Name selects the action; create accepts "nothing", "outcode", "hash",
	// "year" and "ranges".
	Name string
	// Salt is the optional salt of the "hash" action. When it is nil a
	// random one is generated (see saltOrRandom).
	Salt *string
	// DateConfig carries the date layout used by the "year" action.
	DateConfig DateConfig
	// RangeConfig lists the buckets used by the "ranges" action.
	RangeConfig []RangeConfig
}
38 |
39 | // Returns an array of anonymisations according to the config
40 | func anonymisations(configs *[]ActionConfig) ([]Anonymisation, error) {
41 | var err error
42 | res := make([]Anonymisation, len(*configs))
43 | for i, config := range *configs {
44 | if res[i], err = config.create(); err != nil {
45 | return nil, err
46 | }
47 | }
48 | return res, nil
49 | }
50 |
51 | // Returns the configured salt or a random one
52 | // if it's not set.
53 | func (ac *ActionConfig) saltOrRandom() string {
54 | if ac.Salt != nil {
55 | return *ac.Salt
56 | }
57 | return strconv.Itoa(rand.Int())
58 | }
59 |
60 | func (ac *ActionConfig) create() (Anonymisation, error) {
61 | switch ac.Name {
62 | case "nothing":
63 | return identity, nil
64 | case "outcode":
65 | return outcode, nil
66 | case "hash":
67 | return hash(ac.saltOrRandom()), nil
68 | case "year":
69 | return year(ac.DateConfig.Format)
70 | case "ranges":
71 | return ranges(ac.RangeConfig)
72 | }
73 | return nil, fmt.Errorf("can't create an action with name %s", ac.Name)
74 | }
75 |
// identity is the no-op anonymisation: it returns the input unchanged and
// never fails. It is the function behind the "nothing" action.
func identity(s string) (string, error) {
	return s, nil
}
80 |
81 | // Hashes (SHA1) the input.
82 | func hash(salt string) Anonymisation {
83 | return func(s string) (string, error) {
84 | h := sha1.New()
85 | io.WriteString(h, s)
86 | io.WriteString(h, salt)
87 | return fmt.Sprintf("%x", h.Sum(nil)), nil
88 | }
89 | }
90 |
// outcode keeps the outcode of a UK format postcode, e.g. "W1W 8BE" becomes
// "W1W". Concretely it returns everything before the first space, or the
// whole input when there is no space. It never fails.
func outcode(s string) (string, error) {
	if cut := strings.Index(s, " "); cut >= 0 {
		return s[:cut], nil
	}
	return s, nil
}
97 |
98 | // Given a date format/layout, it returns a function that
99 | // given a date in that format, just keeps the year.
100 | // If either the format is invalid or the year doesn't
101 | // match that format, it will return an error and
102 | // the input unchanged
103 | func year(format string) (Anonymisation, error) {
104 | if _, err := time.Parse(format, format); err != nil {
105 | return nil, err
106 | }
107 | return func(s string) (string, error) {
108 | t, err := time.Parse(format, s)
109 | if err != nil {
110 | return s, err
111 | }
112 | return strconv.Itoa(t.Year()), nil
113 | }, nil
114 | }
115 |
116 | // Given a list of ranges, it will summarise numeric
117 | // values into groups of values, each group defined
118 | // by a range and an output
119 | func ranges(ranges []RangeConfig) (Anonymisation, error) {
120 | for _, rc := range ranges {
121 | if rc.Gt != nil && rc.Gte != nil || rc.Lt != nil && rc.Lte != nil {
122 | return nil, errors.New("you can only specify one of (gt, gte) and (lt, lte)")
123 | } else if rc.Gt == nil && rc.Gte == nil && rc.Lt == nil && rc.Lte == nil {
124 | return nil, errors.New("you need to specify at least one of gt, gte, lt, lte")
125 | } else if rc.Output == nil {
126 | return nil, errors.New("you need to specify the output for a range")
127 | }
128 | }
129 | return func(s string) (string, error) {
130 | v, err := strconv.ParseFloat(s, 64)
131 | if err != nil {
132 | return s, err
133 | }
134 | for _, rang := range ranges {
135 | if rang.contains(v) {
136 | return *rang.Output, nil
137 | }
138 | }
139 | return s, errors.New("No range defined for value")
140 | }, nil
141 | }
142 |
143 | func (r *RangeConfig) contains(v float64) bool {
144 | return (r.Gt == nil && r.Gte == nil || r.Gt != nil && *r.Gt < v || r.Gte != nil && *r.Gte <= v) &&
145 | (r.Lt == nil && r.Lte == nil || r.Lt != nil && *r.Lt > v || r.Lte != nil && *r.Lte >= v)
146 | }
147 |
--------------------------------------------------------------------------------
/anonymisations_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "math/rand"
5 | "testing"
6 |
7 | "github.com/leanovate/gopter"
8 | "github.com/leanovate/gopter/gen"
9 | "github.com/leanovate/gopter/prop"
10 | "github.com/stretchr/testify/assert"
11 | "github.com/stretchr/testify/require"
12 | )
13 |
// salt is the fixed salt shared by the hash-related tests.
var salt = "jump"

// seed is passed to rand.Seed so that the "random" salt is reproducible.
const seed = int64(1)

// firstSalt is the first random salt produced after seeding with the seed
// above.
const firstSalt = "5577006791947779410"
20 |
21 | // can't test that the functions are equal because of https://github.com/stretchr/testify/issues/182
22 | // and https://github.com/stretchr/testify/issues/159#issuecomment-99557398
23 | // will have to test that the functions return the same
24 | func assertAnonymisationFunction(t *testing.T, expected Anonymisation, actual Anonymisation, value string) {
25 | require.NotNil(t, expected)
26 | require.NotNil(t, actual)
27 | expectedRes, expectedErr := expected(value)
28 | actualRes, actualErr := actual(value)
29 | assert.Equal(t, expectedRes, actualRes)
30 | assert.Equal(t, expectedErr, actualErr)
31 | }
32 |
33 | func TestAnonymisations(t *testing.T) {
34 | t.Run("a valid configuration", func(t *testing.T) {
35 | conf := &[]ActionConfig{
36 | ActionConfig{
37 | Name: "nothing",
38 | },
39 | ActionConfig{
40 | Name: "hash",
41 | Salt: &salt,
42 | },
43 | }
44 | anons, err := anonymisations(conf)
45 | assert.NoError(t, err)
46 | assertAnonymisationFunction(t, identity, anons[0], "a")
47 | assertAnonymisationFunction(t, hash(salt), anons[1], "a")
48 | })
49 | t.Run("an invalid configuration", func(t *testing.T) {
50 | conf := &[]ActionConfig{ActionConfig{Name: "year", DateConfig: DateConfig{Format: "3333"}}}
51 | anons, err := anonymisations(conf)
52 | assert.Error(t, err, "should return an error")
53 | assert.Nil(t, anons)
54 | })
55 | }
56 |
57 | func TestActionConfigSaltOrRandom(t *testing.T) {
58 | t.Run("if salt is not specified", func(t *testing.T) {
59 | rand.Seed(seed)
60 | acNoSalt := ActionConfig{Name: "hash"}
61 | assert.Equal(t, firstSalt, acNoSalt.saltOrRandom(), "should return a random salt")
62 | })
63 | t.Run("if salt is specified", func(t *testing.T) {
64 | emptySalt := ""
65 | acEmptySalt := ActionConfig{Name: "hash", Salt: &emptySalt}
66 | assert.Empty(t, acEmptySalt.saltOrRandom(), "should return the empty salt if empty")
67 |
68 | acSalt := ActionConfig{Name: "hash", Salt: &salt}
69 | assert.Equal(t, "jump", acSalt.saltOrRandom(), "should return the salt")
70 | })
71 | }
72 |
73 | func TestActionConfigCreate(t *testing.T) {
74 | t.Run("invalid name", func(t *testing.T) {
75 | ac := ActionConfig{Name: "invalid name"}
76 | res, err := ac.create()
77 | assert.Error(t, err)
78 | assert.Nil(t, res)
79 | })
80 | t.Run("identity", func(t *testing.T) {
81 | ac := ActionConfig{Name: "nothing"}
82 | res, err := ac.create()
83 | assert.NoError(t, err)
84 | assertAnonymisationFunction(t, identity, res, "a")
85 | })
86 | t.Run("outcode", func(t *testing.T) {
87 | ac := ActionConfig{Name: "outcode"}
88 | res, err := ac.create()
89 | assert.NoError(t, err)
90 | assertAnonymisationFunction(t, outcode, res, "a")
91 | })
92 | t.Run("hash", func(t *testing.T) {
93 | t.Run("if salt is not specified uses a random salt", func(t *testing.T) {
94 | rand.Seed(1)
95 | ac := ActionConfig{Name: "hash"}
96 | res, err := ac.create()
97 | assert.NoError(t, err)
98 | assertAnonymisationFunction(t, hash(firstSalt), res, "a")
99 | })
100 | t.Run("if salt is specified uses it", func(t *testing.T) {
101 | ac := ActionConfig{Name: "hash", Salt: &salt}
102 | res, err := ac.create()
103 | assert.NoError(t, err)
104 | assertAnonymisationFunction(t, hash(salt), res, "a")
105 | })
106 | })
107 | t.Run("year", func(t *testing.T) {
108 | t.Run("with an invalid format", func(t *testing.T) {
109 | ac := ActionConfig{Name: "year", DateConfig: DateConfig{Format: "11112233"}}
110 | res, err := ac.create()
111 | assert.Error(t, err, "should fail")
112 | assert.Nil(t, res)
113 | })
114 | t.Run("with a valid format", func(t *testing.T) {
115 | ac := ActionConfig{Name: "year", DateConfig: DateConfig{Format: "20060102"}}
116 | res, err := ac.create()
117 | assert.NoError(t, err, "should not fail")
118 | y, err := year("20060102")
119 | assert.NoError(t, err)
120 | assertAnonymisationFunction(t, y, res, "21121212")
121 | })
122 | })
123 | t.Run("ranges", func(t *testing.T) {
124 | num := 2.0
125 | output := "0-100"
126 | t.Run("range has at least one of lt, lte, gt, gte", func(t *testing.T) {
127 | ac := ActionConfig{
128 | Name: "ranges",
129 | RangeConfig: []RangeConfig{RangeConfig{Output: &output}},
130 | }
131 | r, err := ac.create()
132 | assert.Error(t, err, "if not should return an error")
133 | assert.Nil(t, r)
134 | })
135 | t.Run("range contains both lt and lte", func(t *testing.T) {
136 | ac := ActionConfig{
137 | Name: "ranges",
138 | RangeConfig: []RangeConfig{RangeConfig{Lt: &num, Lte: &num, Output: &output}},
139 | }
140 | r, err := ac.create()
141 | assert.Error(t, err, "if not should return an error")
142 | assert.Nil(t, r)
143 | })
144 | t.Run("range contains both gt and gte", func(t *testing.T) {
145 | ac := ActionConfig{
146 | Name: "ranges",
147 | RangeConfig: []RangeConfig{RangeConfig{Gt: &num, Gte: &num, Output: &output}},
148 | }
149 | r, err := ac.create()
150 | assert.Error(t, err, "if not should return an error")
151 | assert.Nil(t, r)
152 | })
153 | t.Run("range without output defined", func(t *testing.T) {
154 | ac := ActionConfig{
155 | Name: "ranges",
156 | RangeConfig: []RangeConfig{RangeConfig{Lt: &num, Gte: &num}},
157 | }
158 | r, err := ac.create()
159 | assert.Error(t, err, "if not should return an error")
160 | assert.Nil(t, r)
161 | })
162 | t.Run("valid range", func(t *testing.T) {
163 | rangeConfigs := []RangeConfig{RangeConfig{Lte: &num, Gte: &num, Output: &output}}
164 | ac := ActionConfig{
165 | Name: "ranges",
166 | RangeConfig: rangeConfigs,
167 | }
168 | r, err := ac.create()
169 | expected, _ := ranges(rangeConfigs)
170 | assert.NoError(t, err)
171 | assertAnonymisationFunction(t, expected, r, "2")
172 | })
173 | })
174 | }
175 |
176 | func TestIdentity(t *testing.T) {
177 | properties := gopter.NewProperties(nil)
178 |
179 | properties.Property("Same output as input", prop.ForAll(
180 | func(v string) bool {
181 | res, err := identity(v)
182 | return assert.NoError(t, err) && assert.Equal(t, v, res)
183 | },
184 | gen.AnyString(),
185 | ))
186 |
187 | properties.TestingRun(t)
188 | }
189 |
190 | func TestHash(t *testing.T) {
191 | t.Run("should hash the values using sha1 without a salt", func(t *testing.T) {
192 | unsaltedHash := hash("")
193 | res, err := unsaltedHash("")
194 | assert.NoError(t, err)
195 | assert.Equal(t, "da39a3ee5e6b4b0d3255bfef95601890afd80709", res)
196 | res, err = unsaltedHash("hasselhoff")
197 | assert.Equal(t, "ffe3294fad149c2dd3579cb864a1aebb2201f38d", res)
198 | })
199 | t.Run("should use the salt if provided", func(t *testing.T) {
200 | properties := gopter.NewProperties(nil)
201 |
202 | properties.Property("hash(salt)(s) == hash(s+salt)", prop.ForAll(
203 | func(salt string, s string) bool {
204 | res1, err1 := hash(salt)(s)
205 | res2, err2 := hash("")(s + salt)
206 | return assert.NoError(t, err1) && assert.NoError(t, err2) && assert.Equal(t, res1, res2)
207 | },
208 | gen.AlphaString(),
209 | gen.AlphaString(),
210 | ))
211 | })
212 | }
213 |
214 | func TestOutcode(t *testing.T) {
215 | properties := gopter.NewProperties(nil)
216 |
217 | properties.Property("Same output as input", prop.ForAll(
218 | func(v1 string, v2 string) bool {
219 | res, err := outcode(v1 + " " + v2)
220 | return assert.NoError(t, err) && assert.Equal(t, v1, res)
221 | },
222 | gen.AlphaString(),
223 | gen.AlphaString(),
224 | ))
225 |
226 | properties.TestingRun(t)
227 | }
228 |
229 | func TestYear(t *testing.T) {
230 | f, _ := year("20060102")
231 | t.Run("if the date can be parsed", func(t *testing.T) {
232 | res, err := f("20120102")
233 | assert.NoError(t, err, "should return no error")
234 | assert.Equal(t, "2012", res, "should return the year")
235 | })
236 | t.Run("if the date cannot be parsed", func(t *testing.T) {
237 | res, err := f("input")
238 | assert.Error(t, err, "should return an error")
239 | assert.Equal(t, "input", res, "should return the input unchanged")
240 | })
241 | }
242 | func TestRanges(t *testing.T) {
243 | min := 0.0
244 | max := 100.0
245 | output := "0-100"
246 | f, _ := ranges([]RangeConfig{RangeConfig{Gt: &min, Lte: &max, Output: &output}})
247 | t.Run("if the value is not a float", func(t *testing.T) {
248 | res, err := f("input")
249 | assert.Error(t, err, "should return an error")
250 | assert.Equal(t, "input", res, "should return the input unchanged")
251 | })
252 | t.Run("if the value is a float", func(t *testing.T) {
253 | t.Run("not in any range", func(t *testing.T) {
254 | res, err := f("2000")
255 | assert.Error(t, err, "should return an error")
256 | assert.Equal(t, "2000", res, "should return the input unchanged")
257 | })
258 | t.Run("inside a range", func(t *testing.T) {
259 | res, err := f("10")
260 | assert.NoError(t, err, "should return no error")
261 | assert.Equal(t, output, res, "should return the output")
262 | })
263 | })
264 | }
265 |
266 | func TestRangeConfigContains(t *testing.T) {
267 | min := 0.0
268 | max := 100.0
269 | t.Run("range containing only lt", func(t *testing.T) {
270 | conf := RangeConfig{Lt: &max}
271 | assert.True(t, conf.contains(max-1))
272 | assert.False(t, conf.contains(max))
273 | assert.False(t, conf.contains(max+1))
274 | })
275 | t.Run("range containing only lte", func(t *testing.T) {
276 | conf := RangeConfig{Lte: &max}
277 | assert.True(t, conf.contains(max-1))
278 | assert.True(t, conf.contains(max))
279 | assert.False(t, conf.contains(max+1))
280 | })
281 | t.Run("range containing only gt", func(t *testing.T) {
282 | conf := RangeConfig{Gt: &min}
283 | assert.False(t, conf.contains(min-1))
284 | assert.False(t, conf.contains(min))
285 | assert.True(t, conf.contains(min+1))
286 | })
287 | t.Run("range containing only gte", func(t *testing.T) {
288 | conf := RangeConfig{Gte: &min}
289 | assert.False(t, conf.contains(min-1))
290 | assert.True(t, conf.contains(min))
291 | assert.True(t, conf.contains(min+1))
292 | })
293 | t.Run("range containing gt and lt", func(t *testing.T) {
294 | conf := RangeConfig{Gt: &min, Lt: &max}
295 | assert.False(t, conf.contains(min-1))
296 | assert.False(t, conf.contains(min))
297 | assert.True(t, conf.contains(min+1))
298 | assert.False(t, conf.contains(max))
299 | assert.False(t, conf.contains(max+1))
300 | })
301 | t.Run("range containing gt and lte", func(t *testing.T) {
302 | conf := RangeConfig{Gt: &min, Lte: &max}
303 | assert.False(t, conf.contains(min-1))
304 | assert.False(t, conf.contains(min))
305 | assert.True(t, conf.contains(min+1))
306 | assert.True(t, conf.contains(max))
307 | assert.False(t, conf.contains(max+1))
308 | })
309 | t.Run("range containing gte and lt", func(t *testing.T) {
310 | conf := RangeConfig{Gte: &min, Lt: &max}
311 | assert.False(t, conf.contains(min-1))
312 | assert.True(t, conf.contains(min))
313 | assert.True(t, conf.contains(min+1))
314 | assert.False(t, conf.contains(max))
315 | assert.False(t, conf.contains(max+1))
316 | })
317 | t.Run("range containing gte and lte", func(t *testing.T) {
318 | conf := RangeConfig{Gte: &min, Lte: &max}
319 | assert.False(t, conf.contains(min-1))
320 | assert.True(t, conf.contains(min))
321 | assert.True(t, conf.contains(min+1))
322 | assert.True(t, conf.contains(max))
323 | assert.False(t, conf.contains(max+1))
324 | })
325 | }
326 |
--------------------------------------------------------------------------------
/config.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/json"
5 | "os"
6 | )
7 |
// CsvConfig stores the config to read and write the csv file.
type CsvConfig struct {
	// Delimiter is the field separator. It defaults to "," (see
	// defaultCsvConfig).
	Delimiter string
}
12 |
// SamplingConfig stores the config to know how to sample the file.
// Per the README, a row is kept when hash(idColumn) % mod == 0.
type SamplingConfig struct {
	// Mod is the number the hash of the ID is reduced by. It defaults to 1
	// (see defaultSamplingConfig).
	Mod uint32
	// IDColumn is the 0-based index of the column holding the ID. It
	// defaults to 0.
	IDColumn uint32
}
18 |
// Config stores all the configuration. loadConfig fills it from a JSON
// file, starting from the package defaults for any section left out.
type Config struct {
	// Csv configures how the csv is read and written.
	Csv CsvConfig
	// Sampling configures which rows are kept.
	Sampling SamplingConfig
	// Actions lists the anonymisation actions, one ActionConfig each.
	Actions []ActionConfig
}
25 |
26 | var defaultCsvConfig = CsvConfig{
27 | Delimiter: ",",
28 | }
29 |
30 | var defaultSamplingConfig = SamplingConfig{
31 | Mod: 1,
32 | IDColumn: 0,
33 | }
34 |
35 | var defaultActionsConfig = []ActionConfig{}
36 |
37 | func loadConfig(filename string) (*Config, error) {
38 | file, err := os.Open(filename)
39 | defer file.Close()
40 | if err != nil {
41 | return nil, err
42 | }
43 | decoder := json.NewDecoder(file)
44 | conf := Config{
45 | Csv: defaultCsvConfig,
46 | Sampling: defaultSamplingConfig,
47 | Actions: defaultActionsConfig,
48 | }
49 | err = decoder.Decode(&conf)
50 | if err != nil {
51 | return nil, err
52 | }
53 | return &conf, err
54 | }
55 |
--------------------------------------------------------------------------------
/config_defaults_test.json:
--------------------------------------------------------------------------------
1 | {}
2 |
--------------------------------------------------------------------------------
/config_invalid_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "csv": {
3 | "delimiter": ","
4 | },
5 | "sampling": {
6 | "mod": "not a number",
7 | "idColumn": 0
8 | },
9 | "actions": [
10 | {
11 | "name": "hash"
12 | },
13 | {
14 | "name": "outcode"
15 | },
16 | {
17 | "name": "year",
18 | "dateConfig": {
19 | "format": "20060102"
20 | }
21 | },
22 | {
23 | "name": "nothing"
24 | }
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/config_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/stretchr/testify/assert"
7 | "github.com/stretchr/testify/require"
8 | )
9 |
10 | func TestLoadConfig(t *testing.T) {
11 | t.Run("if the file doesn't exist", func(t *testing.T) {
12 | conf, err := loadConfig("non-existing-file")
13 | assert.Nil(t, conf, "should return nil if the file doesn't exist")
14 | assert.Error(t, err, "should return the error if the file doesn't exist")
15 | })
16 | t.Run("if the json can't be decoded", func(t *testing.T) {
17 | conf, err := loadConfig("config_invalid_test.json")
18 | assert.Nil(t, conf, "should return nil if the json can't be decoded")
19 | assert.Error(t, err, "should return the error if the json can't be decoded")
20 | })
21 | t.Run("default config values", func(t *testing.T) {
22 | conf, err := loadConfig("config_defaults_test.json")
23 | require.NoError(t, err, "should return no error if the config can be loaded")
24 | assert.Equal(t, Config{
25 | Csv: CsvConfig{
26 | Delimiter: ",",
27 | },
28 | Sampling: SamplingConfig{
29 | Mod: 1,
30 | IDColumn: 0,
31 | },
32 | Actions: []ActionConfig{},
33 | }, *conf, "should fill the config with the default values")
34 | })
35 | t.Run("if the config can be loaded", func(t *testing.T) {
36 | gte := 0.0
37 | lt := 100.0
38 | output := "0-100"
39 | conf, err := loadConfig("config_test.json")
40 | require.NoError(t, err, "should return no error if the config can be loaded")
41 | assert.Equal(t, Config{
42 | Csv: CsvConfig{
43 | Delimiter: "|",
44 | },
45 | Sampling: SamplingConfig{
46 | Mod: 77,
47 | IDColumn: 84,
48 | },
49 | Actions: []ActionConfig{
50 | ActionConfig{
51 | Name: "hash",
52 | },
53 | ActionConfig{
54 | Name: "outcode",
55 | },
56 | ActionConfig{
57 | Name: "year",
58 | DateConfig: DateConfig{
59 | Format: "20060102",
60 | },
61 | },
62 | ActionConfig{
63 | Name: "ranges",
64 | RangeConfig: []RangeConfig{
65 | RangeConfig{
66 | Gte: >e,
67 | Lt: <,
68 | Output: &output,
69 | },
70 | },
71 | },
72 | ActionConfig{
73 | Name: "nothing",
74 | },
75 | },
76 | }, *conf, "should return the config properly decoded")
77 | })
78 | }
79 |
--------------------------------------------------------------------------------
/config_test.json:
--------------------------------------------------------------------------------
1 | {
2 | "csv": {
3 | "delimiter": "|"
4 | },
5 | "sampling": {
6 | "mod": 77,
7 | "idColumn": 84
8 | },
9 | "actions": [
10 | {
11 | "name": "hash"
12 | },
13 | {
14 | "name": "outcode"
15 | },
16 | {
17 | "name": "year",
18 | "dateConfig": {
19 | "format": "20060102"
20 | }
21 | },
22 | {
23 | "name": "ranges",
24 | "rangeConfig": [
25 | {
26 | "gte": 0,
27 | "lt": 100,
28 | "output": "0-100"
29 | }
30 | ]
31 | },
32 | {
33 | "name": "nothing"
34 | }
35 | ]
36 | }
37 |
--------------------------------------------------------------------------------
/icon.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
78 |
--------------------------------------------------------------------------------
/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "encoding/csv"
5 | "flag"
6 | "fmt"
7 | "hash/fnv"
8 | "io"
9 | "log"
10 | "math/rand"
11 | "os"
12 | "time"
13 | )
14 |
15 | func main() {
16 | rand.Seed(time.Now().UTC().UnixNano())
17 | //TODO move args parsing to a function
18 | configFile := flag.String("config", "config.json", "Configuration of the data to be anonymised. Default is 'config.json'")
19 | outputFile := flag.String("output", "", "Output file. Default is stdout.")
20 | flag.Parse()
21 | log.Printf("Using configuration in file %s\n", *configFile)
22 | conf, err := loadConfig(*configFile)
23 | if err != nil {
24 | log.Fatal(err)
25 | }
26 | r := initReader(flag.Arg(0), conf.Csv)
27 | w := initWriter(*outputFile, conf.Csv)
28 | anons, err := anonymisations(&conf.Actions)
29 | if err != nil {
30 | log.Fatal(err)
31 | }
32 |
33 | if err := process(r, w, conf, &anons); err != nil {
34 | log.Fatal(err)
35 | }
36 | }
37 |
38 | func process(r *csv.Reader, w *csv.Writer, conf *Config, anons *[]Anonymisation) error {
39 | i := 0
40 |
41 | for {
42 | record, err := r.Read()
43 | if err == io.EOF {
44 | break
45 | } else if pe, ok := err.(*csv.ParseError); ok && pe.Err == csv.ErrFieldCount {
46 | // we just print the error and skip the record
47 | log.Print(err)
48 | } else if err != nil {
49 | return err
50 | } else if int64(conf.Sampling.IDColumn) >= int64(len(record)) {
51 | return fmt.Errorf("id column (%d) out of range, record has %d columns", conf.Sampling.IDColumn, len(record))
52 | } else if sample(record[conf.Sampling.IDColumn], conf.Sampling) {
53 | anonymised, err := anonymise(record, *anons)
54 | if err != nil {
55 | // we just print the error and skip the record
56 | log.Print(err)
57 | } else {
58 | w.Write(anonymised)
59 | }
60 | //TODO decide how often do we want to flush
61 | if i%100 == 0 {
62 | w.Flush()
63 | }
64 | }
65 | i++
66 | }
67 | w.Flush()
68 | return nil
69 | }
70 |
71 | func sample(s string, conf SamplingConfig) bool {
72 | h := fnv.New32a()
73 | h.Write([]byte(s))
74 | return h.Sum32()%conf.Mod == 0
75 | }
76 |
77 | func initReader(filename string, conf CsvConfig) *csv.Reader {
78 | reader := csv.NewReader(fileOr(filename, os.Stdin, os.Open))
79 | reader.Comma = []rune(conf.Delimiter)[0]
80 | return reader
81 | }
82 |
83 | func initWriter(filename string, conf CsvConfig) *csv.Writer {
84 | writer := csv.NewWriter(fileOr(filename, os.Stdout, os.Create))
85 | writer.Comma = []rune(conf.Delimiter)[0]
86 | return writer
87 | }
88 |
// fileOr picks the file to work with. An empty filename selects the fallback
// `def`; any other filename is handed to `action` and the file it returns is
// used. If `action` reports an error the program is terminated via log.Fatal.
func fileOr(filename string, def *os.File, action func(string) (*os.File, error)) *os.File {
	if filename != "" {
		opened, err := action(filename)
		if err != nil {
			log.Fatal(err)
		}
		return opened
	}
	return def
}
101 |
102 | func anonymise(record []string, anons []Anonymisation) ([]string, error) {
103 | var err error
104 | for i := range record {
105 | // TODO decide if we fail if not enough anonmisations are defined
106 | // or we just skip the column (i.e. we apply identity)
107 | if i < len(anons) {
108 | if record[i], err = anons[i](record[i]); err != nil {
109 | return nil, err
110 | }
111 | }
112 | }
113 | return record, nil
114 | }
115 |
--------------------------------------------------------------------------------
/main_test.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "bytes"
5 | "encoding/csv"
6 | "io/ioutil"
7 | "log"
8 | "os"
9 | "strings"
10 | "testing"
11 |
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
15 | func TestInitReader(t *testing.T) {
16 | t.Run("with an empty filename", func(t *testing.T) {
17 | tmpfile := tmpFile("content")
18 | defer os.Remove(tmpfile.Name()) // clean up
19 |
20 | oldStdin := os.Stdin
21 | defer func() { os.Stdin = oldStdin }() // Restore original Stdin
22 | os.Stdin = tmpfile
23 |
24 | r := initReader("", defaultCsvConfig)
25 | record, err := r.Read()
26 |
27 | assert.NoError(t, err, "should return no error")
28 | assert.Equal(t, []string{"content"}, record, "should return a csv reader that reads from stdin")
29 | })
30 | t.Run("with a valid filename", func(t *testing.T) {
31 | tmpfile := tmpFile("content")
32 | defer os.Remove(tmpfile.Name()) // clean up
33 |
34 | r := initReader(tmpfile.Name(), defaultCsvConfig)
35 | record, err := r.Read()
36 |
37 | assert.NoError(t, err, "should return no error")
38 | assert.Equal(t, []string{"content"}, record, "should return a csv reader that reads from the file")
39 | })
40 | }
41 |
// tmpFile creates a temporary file holding content and returns its open
// handle. The caller is responsible for removing the file. Any failure
// aborts the test binary through log.Fatal.
func tmpFile(content string) *os.File {
	tmpfile, err := ioutil.TempFile("", "anon-test")
	if err != nil {
		log.Fatal(err)
	}
	// Write through the path rather than the handle so the returned file's
	// offset stays at the beginning and it can be read from directly.
	if err := ioutil.WriteFile(tmpfile.Name(), []byte(content), os.ModePerm); err != nil {
		log.Fatal(err)
	}
	return tmpfile
}
50 |
51 | func TestInitWriter(t *testing.T) {
52 | t.Run("with an empty filename", func(t *testing.T) {
53 | tmpfile := tmpFile("")
54 | defer os.Remove(tmpfile.Name()) // clean up
55 |
56 | oldStdout := os.Stdout
57 | defer func() { os.Stdout = oldStdout }() // Restore original Stdout
58 | os.Stdout = tmpfile
59 |
60 | w := initWriter("", defaultCsvConfig)
61 | err := w.Write([]string{"csv", "content"})
62 | w.Flush()
63 |
64 | content, _ := ioutil.ReadFile(tmpfile.Name())
65 | assert.NoError(t, err, "should return no error")
66 | assert.Equal(t, "csv,content\n", string(content), "should return a csv writer that writes to stdout")
67 | })
68 | t.Run("with a valid filename", func(t *testing.T) {
69 | tmpfile := tmpFile("")
70 | defer os.Remove(tmpfile.Name()) // clean up
71 |
72 | w := initWriter(tmpfile.Name(), defaultCsvConfig)
73 | err := w.Write([]string{"csv", "content"})
74 | w.Flush()
75 |
76 | content, _ := ioutil.ReadFile(tmpfile.Name())
77 | assert.NoError(t, err, "should return no error")
78 | assert.Equal(t, "csv,content\n", string(content), "should return a csv writer that writes to stdout")
79 | })
80 | }
81 | func TestFileOr(t *testing.T) {
82 | assert.Equal(t, fileOr("", os.Stdin, stdOutOk), os.Stdin, "with an empty filename returns the default value")
83 | assert.Equal(t, fileOr("something", os.Stdin, stdOutOk), os.Stdout, "with non empty filename returns the value returned by the action")
84 | }
85 |
// stdOutOk is a stand-in for the `action` argument of fileOr: whatever name
// it receives, it succeeds and hands back os.Stdout.
func stdOutOk(s string) (*os.File, error) {
	out := os.Stdout
	return out, nil
}
89 |
90 | func TestAnonymise(t *testing.T) {
91 | record := []string{"a", "b", "c"}
92 | actions := []Anonymisation{identity, hash(""), identity}
93 | output := []string{"a", "e9d71f5ee7c92d6dc9e92ffdad17b8bd49418f98", "c"}
94 | res, err := anonymise(record, actions)
95 | assert.NoError(t, err)
96 | assert.Equal(t, output, res, "should apply anonymisation functions to each column in the record")
97 | }
98 |
99 | func TestSample(t *testing.T) {
100 | conf := SamplingConfig{
101 | Mod: 2,
102 | }
103 | assert.True(t, sample("a", conf))
104 | assert.False(t, sample("b", conf))
105 | }
106 |
107 | func TestProcess(t *testing.T) {
108 | config := func(mod uint32, idColumn uint32) *Config {
109 | return &Config{Sampling: SamplingConfig{Mod: mod, IDColumn: idColumn}}
110 | }
111 | anons := &[]Anonymisation{identity, outcode}
112 | createReaderAndWriter := func(in string) (*csv.Reader, *csv.Writer, *bytes.Buffer) {
113 | var out bytes.Buffer
114 | r := csv.NewReader(strings.NewReader(in))
115 |
116 | w := csv.NewWriter(&out)
117 | return r, w, &out
118 | }
119 | t.Run("when the id column is out of range", func(t *testing.T) {
120 | r, w, out := createReaderAndWriter("a,b c\nd,e f\n")
121 |
122 | err := process(r, w, config(1, 100), anons)
123 | assert.Error(t, err, "should return an error")
124 | assert.Equal(t, "", out.String(), "shouldn't write any output")
125 | })
126 | t.Run("when there is an error writing the output", func(t *testing.T) {
127 | var out bytes.Buffer
128 | f, _ := os.Open("non existing file")
129 | r := csv.NewReader(f)
130 |
131 | w := csv.NewWriter(&out)
132 | err := process(r, w, config(1, 0), anons)
133 | assert.Error(t, err, "should return an error")
134 | })
135 | t.Run("when there is an error processing one of the rows", func(t *testing.T) {
136 | r, w, out := createReaderAndWriter("20020202\nfail\n10010101")
137 |
138 | y, _ := year("20060102")
139 | err := process(r, w, config(1, 0), &[]Anonymisation{y})
140 | assert.NoError(t, err, "should not return an error")
141 | assert.Equal(t, "2002\n1001\n", out.String(), "should skip that row")
142 | })
143 | t.Run("when sampling is defined", func(t *testing.T) {
144 | r, w, out := createReaderAndWriter("a,b c\nd,e f\ng,h i\nj,k l\n")
145 |
146 | err := process(r, w, config(2, 0), anons)
147 | assert.NoError(t, err, "should return no error")
148 | assert.Equal(t, "a,b\ng,h\n", out.String(), "should process some rows")
149 | })
150 | t.Run("when all the rows are valid", func(t *testing.T) {
151 | r, w, out := createReaderAndWriter("a,b c\nd,e f\n")
152 |
153 | err := process(r, w, config(1, 0), anons)
154 | assert.NoError(t, err, "should return no error")
155 | assert.Equal(t, "a,b\nd,e\n", out.String(), "should process all rows")
156 | })
157 | }
158 |
--------------------------------------------------------------------------------