├── tests
│   ├── params.csv
│   ├── uvals.csv
│   ├── setup.py
│   ├── rappor_sim_test.py
│   ├── fastrand.py
│   ├── gen_true_values_test.R
│   ├── compare_dist_test.R
│   ├── fastrand_test.py
│   ├── gen_true_values.R
│   ├── regtest.html
│   ├── _fastrand.c
│   ├── regtest_spec.py
│   ├── gen_counts_test.R
│   ├── user_spec.py
│   └── analyze_assoc.R
├── .gitignore
├── apps
│   ├── rappor-analysis
│   │   ├── params.csv
│   │   ├── run_app.sh
│   │   └── test.csv
│   ├── rappor-sim
│   │   ├── run_app.sh
│   │   └── server.R
│   └── README.md
├── gh-pages
│   ├── doc
│   │   ├── data-flow.png
│   │   └── randomness.html
│   ├── examples
│   │   ├── exp_report
│   │   │   └── dist.png
│   │   ├── gauss_report
│   │   │   └── dist.png
│   │   ├── unif_report
│   │   │   └── dist.png
│   │   └── report.html
│   └── index.html
├── analysis
│   ├── cpp
│   │   ├── testdata
│   │   │   ├── graph3.txt
│   │   │   └── graph1.txt
│   │   ├── README.md
│   │   └── run.sh
│   ├── tensorflow
│   │   ├── README.md
│   │   ├── fast_em.sh
│   │   └── fast_em.py
│   └── R
│       ├── util.R
│       ├── run_tests.R
│       ├── alternative.R
│       ├── fast_em.R
│       ├── encode.R
│       ├── read_input.R
│       └── unknowns_test.R
├── ui
│   ├── README.md
│   ├── home.html
│   ├── table-sort.css
│   ├── ui.css
│   ├── histograms.html
│   ├── assoc-overview.html
│   ├── assoc-day.html
│   ├── assoc-metric.html
│   ├── day.html
│   ├── assoc-pair.html
│   ├── overview.html
│   └── metric.html
├── pipeline
│   ├── util.py
│   ├── csv_to_html_test.py
│   ├── combine_results_test.py
│   ├── csv-to-html-test.sh
│   ├── combine_status_test.py
│   ├── tools-lib.sh
│   ├── task_spec_test.py
│   ├── README.md
│   ├── alarm-lib.sh
│   ├── combine_results.py
│   ├── dist.sh
│   ├── regtest.sh
│   ├── cook.sh
│   └── assoc.sh
├── bin
│   ├── sum-bits
│   ├── hash-candidates
│   ├── decode-assoc
│   ├── decode-dist
│   ├── hash_candidates_test.py
│   ├── README.md
│   ├── sum_bits_test.py
│   ├── hash_candidates.py
│   ├── sum_bits.py
│   └── decode_dist.R
├── util.sh
├── docs.sh
├── client
│   ├── cpp
│   │   ├── dotd.sh
│   │   ├── libc_rand_impl.h
│   │   ├── openssl_hash_impl.h
│   │   ├── unix_kernel_rand_impl.h
│   │   ├── unix_kernel_rand_impl.cc
│   │   ├── libc_rand_impl.cc
│   │   ├── run.sh
│   │   ├── encoder_demo.cc
│   │   ├── rappor_deps.h
│   │   ├── openssl_hash_impl.cc
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── encoder.h
│   │   └── openssl_hash_impl_unittest.cc
│   ├── README.md
│   └── python
│       └── rappor_test.py
├── doc
│   ├── randomness.md
│   └── data-flow.dot
├── demo.sh
├── setup.sh
├── test.sh
└── README.md

/tests/params.csv:
--------------------------------------------------------------------------------
k, h, m, p, q, f
16, 2, 4, 0.1, 0.9, 0.2
--------------------------------------------------------------------------------
/tests/uvals.csv:
--------------------------------------------------------------------------------
google.com,intel.com,yahoo.com
ssl,nossl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.swp
_tmp
tests/_fastrand.so
tests/build/
--------------------------------------------------------------------------------
/apps/rappor-analysis/params.csv:
--------------------------------------------------------------------------------
"k","h","m","p","q","f"
128,2,8,0.5,0.75,0
--------------------------------------------------------------------------------
/gh-pages/doc/data-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/doc/data-flow.png
--------------------------------------------------------------------------------
/gh-pages/examples/exp_report/dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/exp_report/dist.png -------------------------------------------------------------------------------- /gh-pages/examples/gauss_report/dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/gauss_report/dist.png -------------------------------------------------------------------------------- /gh-pages/examples/unif_report/dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/unif_report/dist.png -------------------------------------------------------------------------------- /analysis/cpp/testdata/graph3.txt: -------------------------------------------------------------------------------- 1 | num_partitions 3 2 | ngram_size 2 3 | edge 0.ab 1.cd 4 | edge 1.cd 2.ef 5 | edge 0.ab 2.ef 6 | edge 0.AB 1.CD 7 | edge 1.CD 2.EF 8 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | ui 2 | == 3 | 4 | This directory contains static HTML, CSS, and JavaScript for the RAPPOR 5 | dashboard. See the `pipeline/` directory for more details. 6 | 7 | -------------------------------------------------------------------------------- /pipeline/util.py: -------------------------------------------------------------------------------- 1 | """Common functions.""" 2 | 3 | import sys 4 | 5 | 6 | def log(msg, *args): 7 | if args: 8 | msg = msg % args 9 | print >>sys.stderr, msg 10 | -------------------------------------------------------------------------------- /bin/sum-bits: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Shell wrapper around sum_bits.py. 4 | 5 | readonly THIS_DIR=$(dirname $0) 6 | 7 | PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/sum_bits.py "$@" 8 | -------------------------------------------------------------------------------- /bin/hash-candidates: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Shell wrapper around hash_candidates.py. 4 | 5 | readonly THIS_DIR=$(dirname $0) 6 | 7 | PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/hash_candidates.py "$@" 8 | -------------------------------------------------------------------------------- /util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Utility functions, used by demo.sh and regtest.sh. 4 | 5 | banner() { 6 | echo 7 | echo "----- $@" 8 | echo 9 | } 10 | 11 | log() { 12 | echo 1>&2 "$@" 13 | } 14 | 15 | die() { 16 | log "$0: $@" 17 | exit 1 18 | } 19 | 20 | -------------------------------------------------------------------------------- /analysis/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR in TensorFlow 2 | ==================== 3 | 4 | This directory contains an experimental implementation of the EM algorithm in 5 | [TensorFlow](http://tensorflow.org). 6 | 7 | Currently the C++ implementation in `analysis/cpp` is faster and can be used 8 | in production. 
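The iteration that `fast_em.py` speeds up can be pictured with a small NumPy
sketch (a minimal illustration only -- the array names, shapes, and
convergence test below are assumptions, not the actual interface of
`fast_em.py`):

    # Illustrative EM iteration for RAPPOR decoding; not fast_em.py's API.
    # cond_prob[i, j] is assumed to hold P(report i | candidate string j).
    import numpy as np

    def run_em(cond_prob, max_iters=1000, tol=1e-6):
      num_reports, num_candidates = cond_prob.shape
      pi = np.ones(num_candidates) / num_candidates  # start uniform
      for _ in xrange(max_iters):
        # E-step: posterior distribution over candidates for each report.
        post = cond_prob * pi
        post /= post.sum(axis=1, keepdims=True)
        # M-step: the new estimate is the average posterior.
        new_pi = post.mean(axis=0)
        if np.abs(new_pi - pi).max() < tol:  # converged
          return new_pi
        pi = new_pi
      return pi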
9 | 10 | 11 | -------------------------------------------------------------------------------- /analysis/cpp/README.md: -------------------------------------------------------------------------------- 1 | find_cliques 2 | ============ 3 | 4 | This tool does part of the analysis for unknown dictionaries. To run it: 5 | 6 | $ ./run.sh demo 7 | 8 | This compiles and runs it on files in the testdata/ directory. 9 | 10 | See comments in find_cliques.cc for information on how it works. 11 | 12 | 13 | -------------------------------------------------------------------------------- /apps/rappor-analysis/run_app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run the Shiny app in this directory. 4 | # 5 | # Usage: 6 | # ./run_app.sh [port] 7 | 8 | app_dir=$(dirname $0) 9 | port=${1:-6789} 10 | 11 | # host= makes it serve to other machines, not just localhost. 12 | exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)" 13 | -------------------------------------------------------------------------------- /gh-pages/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | RAPPOR Github Pages 5 | 6 | 7 | 8 |
RAPPOR Github Pages

examples/report.html
doc/data-flow.html
13 | 14 | 15 | -------------------------------------------------------------------------------- /apps/rappor-sim/run_app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run the Shiny app in this directory. 4 | # 5 | # Usage: 6 | # ./run_app.sh [port] 7 | 8 | app_dir=$(dirname $0) 9 | port=${1:-6788} 10 | 11 | # Needed by source.rappor in analysis/R/*.R 12 | export RAPPOR_REPO=../../ 13 | 14 | # host= makes it serve to other machines, not just localhost. 15 | exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)" 16 | -------------------------------------------------------------------------------- /analysis/R/util.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | # 3 | # Common utility library for all R scripts. 4 | 5 | # Log message with timing. Example: 6 | # 7 | # _____ 1.301 My message 8 | # 9 | # The prefix makes it stand out (vs R's print()), and the number is the time so 10 | # far. 11 | # 12 | # NOTE: The shell script log uses hyphens. 13 | 14 | Log <- function(...) { 15 | cat(sprintf('_____ %.3f ', proc.time()[['elapsed']])) 16 | cat(sprintf(...)) 17 | cat('\n') 18 | } 19 | -------------------------------------------------------------------------------- /analysis/cpp/testdata/graph1.txt: -------------------------------------------------------------------------------- 1 | num_partitions 4 2 | ngram_size 2 3 | edge 0.ab 1.cd 4 | edge 0.xx 1.cd 5 | edge 0.ij 1.kl 6 | edge 0.qr 1.st 7 | edge 0.ab 1.le 8 | edge 0.qr 2.uv 9 | edge 0.ab 2.ef 10 | edge 0.ij 2.mn 11 | edge 0.ij 3.op 12 | edge 0.qr 3.wx 13 | edge 0.ab 3.gh 14 | edge 1.cd 2.ef 15 | edge 1.kl 2.mn 16 | edge 1.st 2.uv 17 | edge 1.kl 3.op 18 | edge 1.cd 3.gh 19 | edge 1.st 3.wx 20 | edge 2.uv 3.wx 21 | edge 2.ef 3.gh 22 | edge 2.ef 3.zz 23 | edge 2.mn 3.op 24 | -------------------------------------------------------------------------------- /docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o nounset 4 | set -o pipefail 5 | set -o errexit 6 | 7 | 8 | build() { 9 | ./build.sh doc 10 | } 11 | 12 | copy() { 13 | cp -a ./_tmp/doc/* ./gh-pages/doc/ 14 | echo "After commiting changes, you can publish them by running: ./docs.sh publish" 15 | } 16 | 17 | publish() { 18 | git subtree push --prefix gh-pages origin gh-pages 19 | } 20 | 21 | if test $# -eq 0 ; then 22 | build 23 | copy 24 | else 25 | "$@" 26 | fi 27 | 28 | 29 | -------------------------------------------------------------------------------- /ui/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Rappor HOME 5 | 6 | 8 | 9 | 10 | 11 | 12 |
Redirecting to https://github.com/google/rappor
15 | 16 | 17 | -------------------------------------------------------------------------------- /analysis/tensorflow/fast_em.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Wrapper to run fast_em.py using TensorFlow configured for a GPU. CUDA 4 | # environment variables must be set. 5 | # 6 | # Usage: 7 | # ./fast_em.sh 8 | 9 | set -o nounset 10 | set -o pipefail 11 | set -o errexit 12 | 13 | readonly THIS_DIR=$(dirname $0) 14 | 15 | fast-em() { 16 | # Never returns 17 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 \ 18 | CUDA_HOME=/usr/local/cuda-7.0 \ 19 | exec $THIS_DIR/fast_em.py "$@" 20 | } 21 | 22 | fast-em "$@" 23 | -------------------------------------------------------------------------------- /pipeline/csv_to_html_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | csv_to_html_test.py: Tests for csv_to_html.py 4 | """ 5 | 6 | import unittest 7 | 8 | import csv_to_html # module under test 9 | 10 | 11 | class CsvToHtmlTest(unittest.TestCase): 12 | 13 | def testParseSpec(self): 14 | self.assertEqual( 15 | {'foo': 'bar', 'spam': 'eggs'}, 16 | csv_to_html.ParseSpec(['foo bar', 'spam eggs'])) 17 | 18 | self.assertEqual( 19 | {}, 20 | csv_to_html.ParseSpec([])) 21 | 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /bin/decode-assoc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Decode multidimensional reports. 4 | # 5 | # This is a tiny shell wrapper around R. 6 | 7 | readonly THIS_DIR=$(dirname $0) 8 | 9 | # NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string 10 | # concatenation to form the absolute path. (file.path() in R doesn't do what 11 | # we want.) 12 | 13 | readonly RAPPOR_REPO=$THIS_DIR/../ 14 | 15 | # RAPPOR_REPO is used by source() statements to find .R files. 16 | export RAPPOR_REPO 17 | 18 | # Make sure to reuse the same process so it can be killed easily. 19 | exec $THIS_DIR/decode_assoc.R "$@" 20 | -------------------------------------------------------------------------------- /bin/decode-dist: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Decode a distribution from summed RAPPOR reports. 4 | # 5 | # This is a tiny shell wrapper around R. 6 | 7 | readonly THIS_DIR=$(dirname $0) 8 | 9 | # NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string 10 | # concatenation to form the absolute path. (file.path() in R doesn't do what 11 | # we want.) 12 | 13 | readonly RAPPOR_REPO=$THIS_DIR/../ 14 | 15 | # RAPPOR_REPO is used by source() statements to find .R files. 16 | export RAPPOR_REPO 17 | 18 | # Make sure to reuse the same process so it can be killed easily. 19 | exec $THIS_DIR/decode_dist.R "$@" 20 | -------------------------------------------------------------------------------- /ui/table-sort.css: -------------------------------------------------------------------------------- 1 | /* sort indicator in column headings */ 2 | .sortArrow { 3 | color: grey; 4 | } 5 | 6 | thead { 7 | font-weight: bold; 8 | text-align: center; 9 | } 10 | 11 | table { 12 | padding: 10px; /* Padding makes it look nicer. */ 13 | margin: 0 auto; /* center table on the page */ 14 | border-collapse: collapse; /* this is like old cellpadding */ 15 | } 16 | 17 | /* like cellspacing? 
*/ 18 | td { 19 | padding: 5px; 20 | } 21 | 22 | /* Built-in support for R NA values */ 23 | .na { 24 | color: darkred; 25 | } 26 | 27 | /* Numbers aligned on the right, like Excel */ 28 | .num { 29 | text-align: right; 30 | } 31 | 32 | .highlight { 33 | background-color: #f0f0f0; 34 | } 35 | 36 | tbody tr:hover { 37 | background-color: lightcyan; 38 | } 39 | 40 | -------------------------------------------------------------------------------- /ui/ui.css: -------------------------------------------------------------------------------- 1 | /* Center the plots */ 2 | .dy { 3 | margin: 0 auto; 4 | width: 50em; 5 | } 6 | 7 | /* main metric */ 8 | #proportionsDy { 9 | width: 1000px; 10 | height: 600px; 11 | } 12 | 13 | #num-reports-dy { 14 | width: 1000px; 15 | height: 300px; 16 | } 17 | 18 | #mass-dy { 19 | width: 1000px; 20 | height: 300px; 21 | } 22 | 23 | #metricDesc { 24 | font-style: italic; 25 | } 26 | 27 | body { 28 | /*margin: 0 auto;*/ 29 | /*text-align: left;*/ 30 | } 31 | 32 | h1 { 33 | text-align: center; 34 | } 35 | 36 | h2 { 37 | text-align: center; 38 | } 39 | 40 | p { 41 | text-align: center; 42 | } 43 | 44 | /* R NA values */ 45 | .na { 46 | color: darkred; 47 | } 48 | 49 | #status { 50 | text-align: center; 51 | font-size: x-large; 52 | color: darkred; 53 | } 54 | -------------------------------------------------------------------------------- /apps/rappor-analysis/test.csv: -------------------------------------------------------------------------------- 1 | [1] String Estimate St.Dev P.value Proportion SNR 2 | <0 rows> (or 0-length row.names) 3 | SUMMARY 4 | parameters values 5 | 1 Candidate strings 300.000 6 | 2 Detected strings 0.000 7 | 3 Discovered Prop (out of N) 0.000 8 | 4 Explained Variance 0.000 9 | 5 Missing Variance 0.988 10 | 6 Noise Variance 0.012 11 | 7 Theoretical Noise Std. Dev. 2236.068 12 | PRIVACY 13 | parameters values 14 | 1 Effective p 0.500000000 15 | 2 Effective q 0.750000000 16 | 3 exp(e_1) 9.000000000 17 | 4 e_1 2.197224577 18 | 5 exp(e_inf) Inf 19 | 6 e_inf Inf 20 | 7 Detection frequency 0.001040297 21 | -------------------------------------------------------------------------------- /tests/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | from distutils.core import setup, Extension 19 | 20 | module = Extension('_fastrand', 21 | sources = ['_fastrand.c']) 22 | 23 | setup(name = '_fastrand', 24 | version = '1.0', 25 | description = 'Module to speed up RAPPOR simulation', 26 | ext_modules = [module]) 27 | -------------------------------------------------------------------------------- /tests/rappor_sim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | rappor_sim_test.py: Tests for rappor_sim.py 19 | """ 20 | 21 | import unittest 22 | 23 | import rappor_sim # module under test 24 | 25 | 26 | class RapporSimTest(unittest.TestCase): 27 | 28 | def testFoo(self): 29 | pass 30 | 31 | 32 | if __name__ == "__main__": 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /client/cpp/dotd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # dotd.sh 4 | # 5 | # Generate .d Makefile fragments, so we can use #include statements in source 6 | # for dependency info. Adapted from the GNU make manual: 7 | # 8 | # http://www.gnu.org/software/make/manual/html_node/Automatic-Prerequisites.html 9 | # 10 | # We are putting this in shell, so we just have 'sed in bash'. Not an unholy 11 | # mix of 'sed in bash in Make'. 12 | 13 | set -o nounset 14 | set -o pipefail 15 | set -o errexit 16 | 17 | # Munge gcc -MM output into .d files. 18 | main() { 19 | if [ ! -d _tmp ]; then mkdir _tmp; fi 20 | local basename=$1 21 | local dotd=$2 # .d output name 22 | shift 2 # rest of args are gcc invocation 23 | 24 | rm --verbose -f $dotd # in case of failure? 25 | 26 | # Execute the gcc -MM invocation. 27 | # 28 | # Change 29 | # rappor_sim.o: rappor.sim.cc 30 | # to 31 | # _tmp/rappor_sim.o _tmp/rappor_sim.d : rappor.sim.cc 32 | "$@" | sed "s|\($basename\).o|_tmp/\1.o _tmp/\1.d |" > $dotd 33 | } 34 | 35 | main "$@" 36 | -------------------------------------------------------------------------------- /pipeline/combine_results_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | combine_results_test.py: Tests for combine_results.py 4 | """ 5 | 6 | import csv 7 | import cStringIO 8 | import unittest 9 | 10 | import combine_results # module under test 11 | 12 | 13 | # TODO: Make these test more the header row. They rely heavily on the file 14 | # system! 15 | 16 | class CombineResultsTest(unittest.TestCase): 17 | 18 | def testCombineDistResults(self): 19 | stdin = cStringIO.StringIO('') 20 | out = cStringIO.StringIO() 21 | c_out = csv.writer(out) 22 | 23 | combine_results.CombineDistResults(stdin, c_out, 10) 24 | actual = out.getvalue() 25 | self.assert_(actual.startswith('date'), actual) 26 | 27 | def testCombineAssocResults(self): 28 | stdin = cStringIO.StringIO('') 29 | out = cStringIO.StringIO() 30 | c_out = csv.writer(out) 31 | 32 | combine_results.CombineAssocResults(stdin, c_out, 10) 33 | actual = out.getvalue() 34 | self.assert_(actual.startswith('dummy'), actual) 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /pipeline/csv-to-html-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Test for csv_to_html.py. 
4 | # 5 | # Usage: 6 | # ./csv-to-html-test.sh 7 | 8 | set -o nounset 9 | set -o pipefail 10 | set -o errexit 11 | 12 | test-basic() { 13 | ./csv_to_html.py <{b}' <{v}' < 2 | 3 | 4 | RAPPOR Task Histograms 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
Home / Overview / Histograms

RAPPOR Task Histograms

Each task's input is a (metric, day), i.e. it runs on the summed reports
for a single metric received in a single day.
46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /client/cpp/libc_rand_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // A RAPPOR random implementation using libc's rand(). 16 | // 17 | // IMPORTANT: This is for demo /simulation purposes only. Use a better random 18 | // function in production applications. 19 | 20 | #ifndef LIBC_RAND_IMPL_H_ 21 | #define LIBC_RAND_IMPL_H_ 22 | 23 | #include "rappor_deps.h" 24 | 25 | namespace rappor { 26 | 27 | class LibcRand : public IrrRandInterface { 28 | public: 29 | virtual ~LibcRand() {} 30 | 31 | virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const; 32 | }; 33 | 34 | } // namespace rappor 35 | 36 | #endif // LIBC_RAND_IMPL_H_ 37 | -------------------------------------------------------------------------------- /client/cpp/openssl_hash_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // OpenSSL implementation of RAPPOR dependencies. 16 | 17 | #ifndef OPENSSL_IMPL_H_ 18 | #define OPENSSL_IMPL_H_ 19 | 20 | #include "rappor_deps.h" 21 | 22 | namespace rappor { 23 | 24 | bool HmacSha256(const std::string& key, const std::string& value, 25 | std::vector* output); 26 | // Pass output vector of desired length. 27 | bool HmacDrbg(const std::string& key, const std::string& value, 28 | std::vector* output); 29 | bool Md5(const std::string& value, std::vector* output); 30 | 31 | } // namespace rappor 32 | 33 | #endif // OPENSSL_IMPL_H_ 34 | -------------------------------------------------------------------------------- /client/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR Clients 2 | ============== 3 | 4 | This directory contains RAPPOR client implementations in various languages. 5 | 6 | The privacy of RAPPOR is based on the client "lying" about the true values -- 7 | that is, not sending them over the network. 8 | 9 | The clients are typically small in terms of code size because the RAPPOR 10 | client algorithm is simple. See the README.md in each subdirectory for details 11 | on how to use the library. 
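To make the preceding concrete, here is a simplified sketch of the encoding
steps (Bloom filter, then PRR, then IRR). It is an illustration of the
algorithm only, not the API of any client in this directory; the hashing and
PRR details of real clients differ (e.g. the PRR is memoized per value so
repeated reports are stable):

    # Simplified RAPPOR encoding steps (illustration, not a library API).
    import hashlib
    import random

    def encode(value, cohort, k=16, h=2, f=0.5, p=0.5, q=0.75):
      # 1. Bloom filter: set h bits derived from a hash of (cohort, value).
      digest = hashlib.md5('%d:%s' % (cohort, value)).digest()
      bloom = 0
      for i in xrange(h):
        bloom |= 1 << (ord(digest[i]) % k)
      # 2. PRR: with probability f, replace each bit with a fair coin flip.
      prr = 0
      for i in xrange(k):
        if random.random() < f:
          bit = random.random() < 0.5
        else:
          bit = (bloom >> i) & 1
        prr |= int(bit) << i
      # 3. IRR: report 1 with probability q if the PRR bit is 1, else p.
      irr = 0
      for i in xrange(k):
        prob = q if (prr >> i) & 1 else p
        irr |= int(random.random() < prob) << i
      return irr  # only the IRR goes over the network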
Common Test Protocol
--------------------

When implementing a new RAPPOR client, you get end-to-end testing for free!

The `regtest.sh` script in the root of this repository does the following:

1. Creates test input data and feeds it into your client as a CSV file
2. Preprocesses your client's output (also CSV)
3. Runs the RAPPOR analysis, learning aggregate statistics from encoded values
4. Compares the analysis to the true client values, with metrics and plots

To have your client tested, you need a small executable wrapper, which reads
and writes CSV files in a specified format.

Then add it to the `_run-one-instance` function in `regtest.sh`.
--------------------------------------------------------------------------------
/tests/fastrand.py:
--------------------------------------------------------------------------------
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""fastrand.py - Python wrapper for _fastrand."""

# NOTE: We could retire this module in favor of the C++ client?  One reason to
# keep it is if it supports a wider range of params (e.g. more than 32 or 64
# bits.)

import random

import _fastrand


class FastIrrRand(object):
  """Fast insecure version of rappor.SecureIrrRand."""

  def __init__(self, params):
    randbits = _fastrand.randbits  # accelerated function
    num_bits = params.num_bloombits

    # IRR probabilities
    self.p_gen = lambda: randbits(params.prob_p, num_bits)
    self.q_gen = lambda: randbits(params.prob_q, num_bits)
--------------------------------------------------------------------------------
/ui/assoc-overview.html:
--------------------------------------------------------------------------------
Single variable analysis (latest)

Home / Association Overview

RAPPOR Association Analysis Overview

Underlying data: overview.csv
34 | 35 | 36 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /client/cpp/unix_kernel_rand_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // A RAPPOR random implementation using bytes from a file like /dev/urandom or 16 | // /dev/random. 17 | 18 | #ifndef UNIX_KERNEL_RAND_IMPL_H_ 19 | #define UNIX_KERNEL_RAND_IMPL_H_ 20 | 21 | #include // uint8_t 22 | #include // FILE* 23 | 24 | #include "rappor_deps.h" 25 | 26 | namespace rappor { 27 | 28 | class UnixKernelRand : public IrrRandInterface { 29 | public: 30 | explicit UnixKernelRand(FILE* fp) 31 | : fp_(fp) { 32 | } 33 | virtual ~UnixKernelRand() {} 34 | 35 | virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const; 36 | 37 | private: 38 | FILE* fp_; // open device, e.g. /dev/urandom 39 | }; 40 | 41 | } // namespace rappor 42 | 43 | #endif // UNIX_KERNEL_RAND_IMPL_H_ 44 | -------------------------------------------------------------------------------- /ui/assoc-day.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Single Day Association Results 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Home / Association Overview

Underlying data: assoc-results.csv
35 | 36 | 37 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /client/cpp/unix_kernel_rand_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "unix_kernel_rand_impl.h" 16 | 17 | #include // uint64_t 18 | 19 | namespace rappor { 20 | 21 | const int kMaxBitWidth = 32; // also in encoder.cc 22 | 23 | bool UnixKernelRand::GetMask(float prob, int num_bits, Bits* mask_out) const { 24 | uint8_t rand_buf[kMaxBitWidth]; 25 | size_t num_elems = fread(&rand_buf, sizeof(uint8_t), num_bits, fp_); 26 | if (num_elems != static_cast(num_bits)) { // fread error 27 | return false; 28 | } 29 | uint8_t threshold_256 = static_cast(prob * 256); 30 | 31 | Bits mask = 0; 32 | for (int i = 0; i < num_bits; ++i) { 33 | uint8_t bit = (rand_buf[i] < threshold_256); 34 | mask |= (bit << i); 35 | } 36 | *mask_out = mask; 37 | return true; 38 | } 39 | 40 | } // namespace rappor 41 | -------------------------------------------------------------------------------- /tests/gen_true_values_test.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | # 3 | # gen_reports_test.R 4 | 5 | source('analysis/R/util.R') # Log() 6 | 7 | source('tests/gen_true_values.R') # module under test 8 | 9 | library(RUnit) 10 | 11 | TestGenerateTrueValues = function() { 12 | num_clients <- 10 13 | reports_per_client <- 2 14 | num_cohorts <- 4 15 | reports <- GenerateTrueValues('exp', 10, num_clients, reports_per_client, 16 | num_cohorts) 17 | print(reports) 18 | 19 | # 10 clients, 2 reports per client 20 | checkEquals(20, nrow(reports)) 21 | 22 | # 10 unique clients 23 | checkEquals(10, length(unique(reports$client))) 24 | 25 | # Whether a given client reports different values 26 | reports_different_values <- rep(FALSE, num_clients) 27 | 28 | for (c in 1:num_clients) { 29 | my_reports <- reports[reports$client == c, ] 30 | #Log("CLIENT %d", c) 31 | #print(my_reports) 32 | 33 | # If every report for this client isn't same, make note of it 34 | if (length(unique(my_reports$value)) != 1) { 35 | reports_different_values[[c]] <- TRUE 36 | } 37 | } 38 | 39 | # At least one client should report different values. (Technically this 40 | # could fail, but is unlikely with 10 clients). 41 | checkTrue(any(reports_different_values)) 42 | 43 | checkEquals(num_cohorts, length(unique(reports$cohort))) 44 | } 45 | 46 | TestAll <- function(){ 47 | TestGenerateTrueValues() 48 | } 49 | 50 | TestAll() 51 | -------------------------------------------------------------------------------- /tests/compare_dist_test.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | library(RUnit) 18 | 19 | source('tests/compare_dist.R') 20 | 21 | TestProcessAll <- function() { 22 | ctx <- new.env() 23 | ctx$actual <- data.frame(string = c('v1', 'v2', 'v3'), proportion = c(0.2, 0.3, 0.5), 24 | count = c(2, 3, 5)) 25 | ctx$rappor <- data.frame(strings = c('v2', 'v3', 'v4'), proportion = c(0.1, 0.2, 0.3)) 26 | 27 | metrics <- CompareRapporVsActual(ctx)$metrics 28 | str(metrics) 29 | 30 | # sum of rappor proportions 31 | checkEqualsNumeric(0.6, metrics$sum_proportion) 32 | 33 | # v1 v2 v3 v4 34 | # 0.2 0.3 0.5 0.0 35 | # 0.0 0.1 0.2 0.3 36 | 37 | # (0.2 + 0.2 + 0.3 + 0.3) / 2 38 | checkEqualsNumeric(0.5, metrics$total_variation) 39 | 40 | print(metrics$total_variation) 41 | } 42 | 43 | TestProcessAll() 44 | -------------------------------------------------------------------------------- /pipeline/tools-lib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Library used to refer to open source tools. 4 | 5 | set -o nounset 6 | set -o pipefail 7 | set -o errexit 8 | 9 | # NOTE: RAPPOR_SRC defined by the module that sources (cook.sh or ui.sh) 10 | 11 | # Caller can override shebang line by setting $DEP_PYTHON. 12 | readonly PYTHON=${DEP_PYTHON:-} 13 | 14 | readonly METRIC_STATUS=${DEP_METRIC_STATUS:-} 15 | 16 | 17 | # These 3 used by cook.sh. 18 | 19 | TOOLS-combine-status() { 20 | if test -n "$PYTHON"; then 21 | $PYTHON $RAPPOR_SRC/pipeline/combine_status.py "$@" 22 | else 23 | $RAPPOR_SRC/pipeline/combine_status.py "$@" 24 | fi 25 | } 26 | 27 | TOOLS-combine-results() { 28 | if test -n "$PYTHON"; then 29 | $PYTHON $RAPPOR_SRC/pipeline/combine_results.py "$@" 30 | else 31 | $RAPPOR_SRC/pipeline/combine_results.py "$@" 32 | fi 33 | } 34 | 35 | TOOLS-metric-status() { 36 | if test -n "$METRIC_STATUS"; then 37 | $METRIC_STATUS "$@" 38 | else 39 | $RAPPOR_SRC/pipeline/metric_status.R "$@" 40 | fi 41 | } 42 | 43 | # Used by ui.sh. 44 | 45 | TOOLS-csv-to-html() { 46 | if test -n "$PYTHON"; then 47 | $PYTHON $RAPPOR_SRC/pipeline/csv_to_html.py "$@" 48 | else 49 | $RAPPOR_SRC/pipeline/csv_to_html.py "$@" 50 | fi 51 | } 52 | 53 | # 54 | # Higher level scripts 55 | # 56 | 57 | TOOLS-cook() { 58 | $RAPPOR_SRC/pipeline/cook.sh "$@" 59 | } 60 | 61 | # TODO: Rename gen-ui.sh. 62 | TOOLS-gen-ui() { 63 | $RAPPOR_SRC/pipeline/ui.sh "$@" 64 | } 65 | -------------------------------------------------------------------------------- /client/cpp/libc_rand_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // IMPORTANT: This is for demo /simulation purposes only. Use a better random 16 | // function in production applications. 17 | 18 | #include "libc_rand_impl.h" 19 | 20 | #include 21 | #include // uint64_t 22 | #include // srand 23 | 24 | namespace rappor { 25 | 26 | // 27 | // LibcRand 28 | // 29 | 30 | // Similar to client/python/fastrand.c 31 | bool LibcRand::GetMask(float prob, int num_bits, Bits* mask_out) const { 32 | int rand_threshold = static_cast(prob * RAND_MAX); 33 | Bits mask = 0; 34 | 35 | for (int i = 0; i < num_bits; ++i) { 36 | // NOTE: could use rand_r(), which is more thread-safe 37 | Bits bit = (rand() < rand_threshold); 38 | mask |= (bit << i); 39 | } 40 | *mask_out = mask; 41 | return true; // no possible failure 42 | } 43 | 44 | } // namespace rappor 45 | -------------------------------------------------------------------------------- /ui/assoc-metric.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 |
Home / Association Overview

Underlying data:
35 | 36 | 37 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /ui/day.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Single Day Results 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Home / Overview / Histograms

Residuals

Underlying data: results.csv
40 | 41 | 42 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /pipeline/task_spec_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | task_spec_test.py: Tests for task_spec.py 4 | """ 5 | 6 | import cStringIO 7 | import unittest 8 | 9 | import task_spec # module under test 10 | 11 | 12 | class TaskSpecTest(unittest.TestCase): 13 | 14 | def testCountReports(self): 15 | f = cStringIO.StringIO("""\ 16 | 1,2 17 | 3,4 18 | 5,6 19 | """) 20 | c = task_spec.CountReports(f) 21 | self.assertEqual(9, c) 22 | 23 | def testDist(self): 24 | # NOTE: These files are opened, in order to count the reports. Maybe skip 25 | # that step. 26 | f = cStringIO.StringIO("""\ 27 | _tmp/counts/2015-12-01/exp_counts.csv 28 | _tmp/counts/2015-12-01/gauss_counts.csv 29 | _tmp/counts/2015-12-02/exp_counts.csv 30 | _tmp/counts/2015-12-02/gauss_counts.csv 31 | """) 32 | input_iter = task_spec.DistInputIter(f) 33 | #for row in input_iter: 34 | # print row 35 | 36 | field_id_lookup = {} 37 | 38 | # var name -> map filename 39 | f = cStringIO.StringIO("""\ 40 | var,map_filename 41 | exp,map.csv 42 | unif,map.csv 43 | gauss,map.csv 44 | """) 45 | dist_maps = task_spec.DistMapLookup(f, '_tmp/maps') 46 | 47 | f2 = cStringIO.StringIO("""\ 48 | metric,var,var_type,params 49 | exp,,string,params 50 | unif,,string,params 51 | gauss,,string,params 52 | """) 53 | var_schema = task_spec.VarSchema(f2, '_tmp/config') 54 | 55 | for row in task_spec.DistTaskSpec( 56 | input_iter, field_id_lookup, var_schema, dist_maps, None): 57 | print row 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /ui/assoc-pair.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 |
Home / Association Overview

Task Status

Underlying data:
37 | 38 | 39 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /analysis/R/run_tests.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # 18 | # Run unit tests for RAPPOR R code. 19 | 20 | library(RUnit) 21 | 22 | run_tests <- function() { 23 | dirs <- "analysis/R" # Run from root 24 | test_suite <- defineTestSuite("rappor", dirs, testFileRegexp = "_test.R$", 25 | testFuncRegexp = "^Test") 26 | stopifnot(isValidTestSuite(test_suite)) 27 | 28 | test_result <- runTestSuite(test_suite) 29 | 30 | printTextProtocol(test_result) # print to stdout 31 | 32 | result <- test_result[[1]] # Result for our only suite 33 | 34 | # Sanity check: fail if there were no tests found. 35 | if (result$nTestFunc == 0) { 36 | cat("No tests found.\n") 37 | return(FALSE) 38 | } 39 | if (result$nFail != 0 || result$nErr != 0) { 40 | cat("Some tests failed.\n") 41 | return(FALSE) 42 | } 43 | return(TRUE) 44 | } 45 | 46 | if (!run_tests()) { 47 | quit(status = 1) 48 | } 49 | -------------------------------------------------------------------------------- /analysis/cpp/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./run.sh 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | # Call gcc with the flags we like. 11 | # NOTE: -O3 does a lot for fast_em. (More than 5x speedup over unoptimized) 12 | 13 | cpp-compiler() { 14 | g++ -Wall -Wextra -O3 "$@" 15 | #clang++ -Wall -Wextra -O3 "$@" 16 | } 17 | 18 | build-find-cliques() { 19 | mkdir -p _tmp 20 | # C++ 11 for unordered_{map,set} 21 | cpp-compiler -std=c++0x -o _tmp/find_cliques find_cliques.cc 22 | } 23 | 24 | find-cliques() { 25 | _tmp/find_cliques "$@" 26 | } 27 | 28 | test-bad-edge() { 29 | # Edge should go from lesser partition number to greater 30 | find-cliques < 15 | 16 | 17 | For now, we have collected some useful links. 18 | 19 | Linux 20 | ----- 21 | 22 | * [Myths about /dev/urandom](http://www.2uo.de/myths-about-urandom/) -- Nice 23 | article explaining implementation aspects of `/dev/urandom` and `/dev/random` 24 | on Linux. (Summary: just use `/dev/urandom`, with caveats explained) 25 | 26 | * [LWN on getrandom](http://lwn.net/Articles/606141/) 27 | ([patch](http://lwn.net/Articles/605828/)) -- A very recent addition to the 28 | Linux kernel. As of this writing (11/2014), it's safe to say that very few 29 | applications use it. The relevant change, involving an issue mentioned in 30 | the first link, involves the situation at system boot, when there is little 31 | entropy available. 
32 | 33 | 34 | 36 | 37 | 39 | -------------------------------------------------------------------------------- /bin/hash_candidates_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | hash_candidates_test.py: Tests for hash_candidates.py 19 | """ 20 | 21 | import cStringIO 22 | import unittest 23 | 24 | import rappor 25 | import hash_candidates # module under test 26 | 27 | 28 | STDIN = """\ 29 | apple 30 | banana 31 | carrot 32 | """ 33 | 34 | EXPECTED_CSV_OUT = """\ 35 | apple,5,1,26,26,38,34,63,62\r 36 | banana,12,14,28,24,37,34,62,49\r 37 | carrot,4,12,25,21,48,38,61,54\r 38 | """ 39 | 40 | 41 | class HashCandidatesTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | self.params = rappor.Params() 45 | self.params.num_bloombits = 16 46 | self.params.num_cohorts = 4 47 | self.params.num_hashes = 2 48 | 49 | def testHash(self): 50 | stdin = cStringIO.StringIO(STDIN) 51 | stdout = cStringIO.StringIO() 52 | 53 | hash_candidates.HashCandidates(self.params, stdin, stdout) 54 | 55 | self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue()) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | Command Line Tools 2 | ================== 3 | 4 | This directory contains command line tools for RAPPOR analysis. 5 | 6 | Analysis Tools 7 | -------------- 8 | 9 | ### decode-dist 10 | 11 | Decode a distribution -- requires a "counts" file (summed bits from reports), 12 | map file, and a params file. See `test.sh decode-dist` in this dir for an 13 | example. 14 | 15 | ### decode-assoc 16 | 17 | Decode a joint distribution between 2 variables ("association analysis"). See 18 | `test.sh decode-assoc-R` or `test.sh decode-assoc-cpp` in this dir for an 19 | example. 20 | 21 | Currently it only supports associating strings vs. booleans. 22 | 23 | ### Setup 24 | 25 | Both of these tools are written in R, and require several R libraries to be 26 | installed (see `../setup.sh r-packages`). 27 | 28 | `decode-assoc` also shells out to a native binary written in C++ if 29 | `--em-executable` is passed. This requires a C++ compiler (see 30 | `analysis/cpp/run.sh`). You can run `test.sh decode-assoc-cpp` to test it. 31 | 32 | 33 | Helper Tools 34 | ------------ 35 | 36 | These are simple Python implementations of tools needed for analysis. At 37 | Google, Chrome uses alternative C++/Go implementations of these tools. 38 | 39 | ### sum-bits 40 | 41 | Given a CSV file with RAPPOR reports (IRRs), produce a "counts" CSV file on 42 | stdout. This is the `m x (k+1)` matrix that is used in the R analysis (where m 43 | = #cohorts and k = report width in bits). 
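As a sketch of that layout (illustrative Python, not `sum_bits.py` itself; it
assumes reports arrive as (cohort, bit-string) pairs):

    # counts[cohort] = [number of reports, sum of bit 1, ..., sum of bit k].
    # Note: per sum_bits_test.py, bit order in the report string is reversed.
    def sum_bits(reports, num_cohorts, num_bloombits):
      counts = [[0] * (num_bloombits + 1) for _ in xrange(num_cohorts)]
      for cohort, irr in reports:  # irr is a k-character bit string
        counts[cohort][0] += 1
        for i, bit in enumerate(reversed(irr)):
          counts[cohort][1 + i] += int(bit)
      return counts

    # e.g. sum_bits([(1, '0000111100001111')], 2, 16)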
44 | 45 | ### hash-candidates 46 | 47 | Given a list of candidates on stdin, produce a CSV file of hashes (the "map 48 | file"). Each row has `m x h` cells (where m = #cohorts and h = #hashes) 49 | 50 | See the `regtest.sh` script for examples of how these tools are invoked. 51 | 52 | -------------------------------------------------------------------------------- /ui/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | RAPPOR Results Overview 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Association analysis (latest)

Home / Overview / Histograms

RAPPOR Results Overview

Underlying data: overview.csv

Metric Descriptions

Metric Name | Owners | Description
--------------------------------------------------------------------------------
/apps/README.md:
--------------------------------------------------------------------------------
RAPPOR Shiny Apps
=================

This directory contains web apps written using the [Shiny][shiny] web framework
from [RStudio][rstudio].

To run them, first install Shiny:

    $ R
    ...
    > install.packages('shiny')
    ...

(You can view Shiny's platform requirements in
[CRAN](http://cran.r-project.org/web/packages/shiny/index.html).)

Then change to the app directory, and execute the `run_app.sh` script:

    $ cd rappor/apps/rappor-analysis
    $ ./run_app.sh
    ...
    Listening on http://0.0.0.0:6789

Visit http://localhost:6789/ in your browser.

This code has been tested on Ubuntu Linux, but should work on other platforms
that Shiny supports.

Both of these apps use the underlying analysis code in `analysis/R`, just like
the command line demo `demo.sh` does.

rappor-analysis
---------------

This app "decodes" a RAPPOR data set. In other words, you can upload the
`params`, `counts`, and `map` files, and view the inferred distribution, as
well as debug info.

These files are discussed in the RAPPOR [Data Flow][data-flow] doc.

rappor-sim
----------

This app lets you simulate RAPPOR runs with different populations and
parameters. This can help you choose collection parameters for a given
situation / variable.

Help
----

If you need help with these apps, please send a message to
[rappor-discuss][group].


[shiny]: http://shiny.rstudio.com/
[rstudio]: http://rstudio.com/
[data-flow]: http://google.github.io/rappor/doc/data-flow.html
[group]: https://groups.google.com/forum/#!forum/rappor-discuss
--------------------------------------------------------------------------------
/gh-pages/examples/report.html:
--------------------------------------------------------------------------------
RAPPOR Demo

Simulation Input

    Number of clients                     100,000
    Total values reported / obfuscated    700,000
    Unique values reported / obfuscated   50

RAPPOR Parameters

    k   Size of Bloom filter in bits      16
    h   Hash functions in Bloom filter    2
    m   Number of Cohorts                 64
    p   Probability p                     0.5
    q   Probability q                     0.75
    f   Probability f                     0.5
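For reference, the privacy numbers reported alongside these parameters follow
from f, p, q, and h. A sketch of the standard formulas (assumed from the
RAPPOR paper, not code from this repo; the f=0 case matches
apps/rappor-analysis/test.csv, where exp(e_1) = 9):

    import math

    def effective_probs(f, p, q):
      # IRR probabilities after averaging over the PRR noise.
      p_star = 0.5 * f * (p + q) + (1 - f) * p
      q_star = 0.5 * f * (p + q) + (1 - f) * q
      return p_star, q_star

    def epsilon_one(f, p, q, h):
      p_star, q_star = effective_probs(f, p, q)
      odds = (q_star * (1 - p_star)) / (p_star * (1 - q_star))
      return h * math.log(odds)

    # With f=0, p=0.5, q=0.75, h=2:
    # exp(epsilon_one) = ((0.75 * 0.5) / (0.5 * 0.25)) ** 2 = 9.0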
69 | 70 | exponential distribution 71 | gauss distribution 72 | uniform distribution 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /tests/fastrand_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | fastrand_test.py: Tests for _fastrand extension module. 19 | """ 20 | import unittest 21 | 22 | import _fastrand # module under test 23 | 24 | 25 | BIT_WIDTHS = [8, 16, 32, 64] 26 | 27 | 28 | class FastRandTest(unittest.TestCase): 29 | 30 | def testRandbits64(self): 31 | for n in BIT_WIDTHS: 32 | #print '== %d' % n 33 | for p1 in [0.1, 0.5, 0.9]: 34 | #print '-- %f' % p1 35 | for i in xrange(5): 36 | r = _fastrand.randbits(p1, n) 37 | # Rough sanity check 38 | self.assertLess(r, 2 ** n) 39 | 40 | # Visual check 41 | #b = bin(r) 42 | #print b 43 | #print b.count('1') 44 | 45 | 46 | def testRandbits64_EdgeCases(self): 47 | for n in BIT_WIDTHS: 48 | r = _fastrand.randbits(0.0, n) 49 | self.assertEqual(0, r) 50 | 51 | for n in BIT_WIDTHS: 52 | r = _fastrand.randbits(1.0, n) 53 | self.assertEqual(2 ** n - 1, r) 54 | 55 | def testRandbitsError(self): 56 | r = _fastrand.randbits(-1, 64) 57 | # TODO: Should probably raise exceptions 58 | self.assertEqual(None, r) 59 | 60 | r = _fastrand.randbits(0.0, 65) 61 | self.assertEqual(None, r) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | pipeline 2 | ======== 3 | 4 | This directory contains tools and scripts for running a cron job that does 5 | RAPPOR analysis and generates an HTML dashboard. 6 | 7 | It works like this: 8 | 9 | 1. `task_spec.py` generates a text file where each line corresponds to a process 10 | to be run (a "task"). The process is `bin/decode-dist` or 11 | `bin/decode-assoc`. The line contains the task parameters. 12 | 13 | 2. `xargs -P` is used to run processes in parallel. Our analysis is generally 14 | single-threaded (i.e. because R is single-threaded), so this helps utilize 15 | the machine fully. Each task places its output in a different subdirectory. 16 | 17 | 3. `cook.sh` calls `combine_results.py` to combine analysis results into a time 18 | series. It also calls `combine_status.py` to keep track of task data for 19 | "meta-analysis". `metric_status.R` generates more summary CSV files. 20 | 21 | 4. `ui.sh` calls `csv_to_html.py` to generate an HTML fragments from the CSV 22 | files. 23 | 24 | 5. The JavaScript in `ui/ui.js` is loaded from static HTML, and makes AJAX calls 25 | to retrieve the HTML fragments. The page is made interactive with 26 | `ui/table-lib.js`. 27 | 28 | `dist.sh` and `assoc.sh` contain functions which coordinate this process. 
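A small Python sketch of the fan-out in step 2 (illustrative only -- the real
pipeline shells out to `xargs -P`, and the one-task-per-line spec format here
is simplified):

    # Run one analysis process per task-spec line, in parallel.
    import multiprocessing
    import subprocess

    def run_task(line):
      args = line.split()  # simplified: one task's parameters per line
      return subprocess.call(['bin/decode-dist'] + args)

    def run_all(spec_file, num_procs=8):
      with open(spec_file) as f:
        tasks = [line.strip() for line in f if line.strip()]
      pool = multiprocessing.Pool(num_procs)
      return pool.map(run_task, tasks)  # parallel, like xargs -P 8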
29 | 30 | `alarm-lib.sh` is used to kill processes that have been running for too long. 31 | 32 | Testing 33 | ------- 34 | 35 | `pipeline/regtest.sh` contains end-to-end demos of this process. Right now it 36 | depends on testdata from elsewhere in the tree: 37 | 38 | 39 | rappor$ ./demo.sh run # prepare dist testdata 40 | rappor$ cd bin 41 | 42 | bin$ ./test.sh write-assoc-testdata # prepare assoc testdata 43 | bin$ cd ../pipeline 44 | 45 | pipeline$ ./regtest.sh dist 46 | pipeline$ ./regtest.sh assoc 47 | 48 | pipeline$ python -m SimpleHTTPServer # start a static web server 49 | 50 | http://localhost:8000/_tmp/ 51 | 52 | 53 | -------------------------------------------------------------------------------- /bin/sum_bits_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | sum_bits_test.py: Tests for sum_bits.py 19 | """ 20 | 21 | import cStringIO 22 | import unittest 23 | 24 | import rappor 25 | import sum_bits # module under test 26 | 27 | 28 | CSV_IN = """\ 29 | user_id,cohort,bloom,prr,rappor 30 | 5,1,dummy,dummy,0000111100001111 31 | 5,1,dummy,dummy,0000000000111100 32 | """ 33 | 34 | # NOTE: bit order is reversed. 35 | EXPECTED_CSV_OUT = """\ 36 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r 37 | 2,1,1,2,2,1,1,0,0,1,1,1,1,0,0,0,0\r 38 | """ 39 | 40 | TOO_MANY_COLUMNS = """\ 41 | user_id,cohort,rappor 42 | 5,1,0000111100001111,extra 43 | """ 44 | 45 | 46 | class SumBitsTest(unittest.TestCase): 47 | 48 | def setUp(self): 49 | self.params = rappor.Params() 50 | self.params.num_bloombits = 16 51 | self.params.num_cohorts = 2 52 | 53 | def testSum(self): 54 | stdin = cStringIO.StringIO(CSV_IN) 55 | stdout = cStringIO.StringIO() 56 | 57 | sum_bits.SumBits(self.params, stdin, stdout) 58 | 59 | self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue()) 60 | 61 | def testErrors(self): 62 | stdin = cStringIO.StringIO(TOO_MANY_COLUMNS) 63 | stdout = cStringIO.StringIO() 64 | 65 | self.assertRaises( 66 | RuntimeError, sum_bits.SumBits, self.params, stdin, stdout) 67 | 68 | 69 | if __name__ == '__main__': 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /bin/hash_candidates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | Given a list of candidates on stdin, produce a file of hashes ("map file").
19 | """
20 |
21 | import csv
22 | import sys
23 |
24 | import rappor
25 |
26 |
27 | def HashCandidates(params, stdin, stdout):
28 |   num_bloombits = params.num_bloombits
29 |   csv_out = csv.writer(stdout)
30 |
31 |   for line in stdin:
32 |     word = line.strip()
33 |     row = [word]
34 |     for cohort in xrange(params.num_cohorts):
35 |       bloom_bits = rappor.get_bloom_bits(word, cohort, params.num_hashes,
36 |                                          num_bloombits)
37 |       for bit_to_set in bloom_bits:
38 |         # Bits are indexed from 1.  Add a fixed offset for each cohort.
39 |         # NOTE: This detail could be omitted from the map file format, and done
40 |         # in R.
41 |         row.append(cohort * num_bloombits + (bit_to_set + 1))
42 |     csv_out.writerow(row)
43 |
44 |
45 | def main(argv):
46 |   try:
47 |     filename = argv[1]
48 |   except IndexError:
49 |     raise RuntimeError('Usage: hash_candidates.py <params file>')
50 |   with open(filename) as f:
51 |     try:
52 |       params = rappor.Params.from_csv(f)
53 |     except rappor.Error as e:
54 |       raise RuntimeError(e)
55 |
56 |   HashCandidates(params, sys.stdin, sys.stdout)
57 |
58 |
59 | if __name__ == '__main__':
60 |   try:
61 |     main(sys.argv)
62 |   except RuntimeError, e:
63 |     print >>sys.stderr, e.args[0]
64 |     sys.exit(1)
65 |
--------------------------------------------------------------------------------
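To make the map-file row format concrete: with k = 16 bits per cohort
(num_bloombits) and h = 2 hashes, suppose a candidate "foo.com" hashes to
bits (3, 7) in cohort 0 and bits (2, 12) in cohort 1 (these hash outputs are
made up for illustration).  Using the 1-based indexing and per-cohort offset
above, the emitted CSV row would be:

    foo.com,4,8,19,29

since 0*16 + (3+1) = 4, 0*16 + (7+1) = 8, 1*16 + (2+1) = 19, and
1*16 + (12+1) = 29.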

/gh-pages/doc/randomness.html: --------------------------------------------------------------------------------

Generating Random Bits for RAPPOR

To ensure privacy, an application using RAPPOR must generate random bits in an
unpredictable manner.  In other words, an adversary that can predict the
sequence of random bits used can determine the true values being reported.

Generating random numbers is highly platform-specific -- even
language-specific.  So, libraries implementing RAPPOR should be parameterized
by an interface to generate random bits.  (This can be thought of as
"dependency injection".)

For now, we have collected some useful links.

Linux

  • Myths about /dev/urandom -- Nice article explaining implementation
    aspects of /dev/urandom and /dev/random on Linux.  (Summary: just use
    /dev/urandom, with caveats explained)

  • LWN on getrandom (patch) -- A very recent addition to the Linux kernel.
    As of this writing (11/2014), it's safe to say that very few applications
    use it.  The relevant change, involving an issue mentioned in the first
    link, involves the situation at system boot, when there is little entropy
    available.
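As an illustration of such an injected interface, here is a minimal Python
sketch (an assumption for illustration, not an API from this repository) that
draws each IRR bit as Bernoulli(prob) from the kernel's CSPRNG:

    import os
    import struct

    class UrandomIrrRand(object):
        """Sketch: IRR bitmask from os.urandom, one Bernoulli(prob) per bit."""

        def get_mask(self, prob, num_bits):
            mask = 0
            for i in xrange(num_bits):
                # 4 random bytes -> uniform 32-bit integer -> threshold test,
                # the same trick used by tests/_fastrand.c below.
                r = struct.unpack('I', os.urandom(4))[0]
                bit = r < prob * 2 ** 32
                mask |= (bit << i)
            return mask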
-------------------------------------------------------------------------------- /client/cpp/run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Usage:
4 | #   ./run.sh
5 |
6 | set -o nounset
7 | set -o pipefail
8 | set -o errexit
9 |
10 | setup() {
11 |   # need libprotobuf-dev for headers to compile against.
12 |   sudo apt-get install protobuf-compiler libprotobuf-dev
13 |
14 |   # OpenSSL dev headers
15 |   sudo apt-get install libssl-dev
16 | }
17 |
18 | init() {
19 |   mkdir --verbose -p _tmp
20 | }
21 |
22 | rappor-sim() {
23 |   make _tmp/rappor_sim
24 |   _tmp/rappor_sim "$@"
25 | }
26 |
27 | protobuf-encoder-demo() {
28 |   make _tmp/protobuf_encoder_demo
29 |   _tmp/protobuf_encoder_demo "$@"
30 | }
31 |
32 | rappor-sim-demo() {
33 |   rappor-sim 16 2 128 0.25 0.75 0.5 <
-------------------------------------------------------------------------------- /client/cpp/encoder_demo.cc: --------------------------------------------------------------------------------
20 | #include <cassert>  // assert
21 |
22 | #include "encoder.h"
23 | #include "openssl_hash_impl.h"
24 | #include "unix_kernel_rand_impl.h"
25 |
26 | int main(int argc, char** argv) {
27 |   // Suppress unused variable warnings
28 |   (void) argc;
29 |   (void) argv;
30 |
31 |   FILE* fp = fopen("/dev/urandom", "r");
32 |   rappor::UnixKernelRand irr_rand(fp);
33 |
34 |   rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256,
35 |                     irr_rand);
36 |   rappor::Params params(32,    // num_bits (k)
37 |                         2,     // num_hashes (h)
38 |                         128,   // num_cohorts (m)
39 |                         0.25,  // probability f for PRR
40 |                         0.75,  // probability p for IRR
41 |                         0.5);  // probability q for IRR
42 |
43 |   const char* encoder_id = "metric-name";
44 |   rappor::Encoder encoder(encoder_id, params, deps);
45 |
46 |   // Now use it to encode values.  The 'out' value can be sent over the
47 |   // network.
48 |   rappor::Bits out;
49 |   assert(encoder.EncodeString("foo", &out));  // returns false on error
50 |   printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
51 |
52 |   // Raw bits
53 |   assert(encoder.EncodeBits(0x123, &out));  // returns false on error
54 |   printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
55 | }
56 |
-------------------------------------------------------------------------------- /doc/data-flow.dot: --------------------------------------------------------------------------------
1 | // Based on http://graphviz.org/content/cluster
2 |
3 | // Node types:
4 | //   Rectangle: process
5 | //   Oval: data
6 | //   Diamond: debug/simulation data
7 |
8 | digraph G {
9 |   //rankdir="LR";  // left to right layout
10 |
11 |   // http://www.graphviz.org/content/color-names
12 |   colorscheme=pastel13;
13 |
14 |   subgraph cluster_0 {
15 |     graph [ fontsize=24 ];
16 |     label = "Reporting";
17 |     style=filled;
18 |     color=2;
19 |
20 |     node [style=filled, color=white, fontsize=12];
21 |
22 |     gen_sim_input -> dist_csv -> rappor_sim;
23 |
24 |     rappor_sim -> out;
25 |     rappor_sim -> params;
26 |     rappor_sim -> hist;
27 |     rappor_sim -> true_inputs;
28 |
29 |     // Process
30 |     rappor_sim [label="rappor_sim"];
31 |
32 |     // Data
33 |     dist_csv [shape=box, label="dist.csv"];
34 |     out [shape=box, label="dist_out.csv"];
35 |     params [shape=box, label="dist_params.csv"];
36 |
37 |     // simulation data
38 |     hist [shape=box, style=dotted, color=black, label="dist_hist.csv"];
39 |     true_inputs [shape=box, style=dotted, color=black, label="dist_true_inputs.txt"];
40 |   }
41 |
42 |   subgraph cluster_1 {
43 |     graph [ fontsize=24 ];
44 |     label = "Analysis";
45 |     style = filled;
46 |     color=3;
47 |
48 |     node [style=filled, color=white, fontsize=12];
49 |
50 |     sum_bits -> counts;
51 |
52 |     // sum_bits needs the params to construct the matrix.  Technically it could
53 |     // infer it, but this is simple.
54 |     params -> sum_bits;
55 |
56 |     // only in the simulation
57 |     true_inputs -> demo_sh -> candidates [style=dotted];
58 |
59 |     candidates -> hash_candidates -> map;
60 |     params -> hash_candidates;
61 |
62 |     params -> analyze;
63 |     map -> analyze;
64 |     counts -> analyze;
65 |     hist -> analyze [style=dotted];  // only for comparison
66 |
67 |     analyze -> plot_png;
68 |
69 |     // Processes
70 |     analyze [label="analyze.R"];
71 |     demo_sh [label="demo.sh", style=dotted, color=black];
72 |
73 |     // Data
74 |     counts [shape=box, label="dist_count.csv"];
75 |     candidates [shape=box, label="dist_candidates.txt"];
76 |     map [shape=box, label="dist_map.csv"];
77 |
78 |     plot_png [shape=box, label="dist.png"];
79 |
80 |   }
81 |
82 |   out -> sum_bits;
83 | }
84 |
-------------------------------------------------------------------------------- /demo.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Demo of RAPPOR, automating the Python and R scripts.  See README.
4 | #
5 | # Usage:
6 | #   ./demo.sh [function name]
7 | #
8 | # End-to-end demo of RAPPOR.  Notable functions include:
9 | #   quick-python: Runs a demo using the Python client
10 | #   quick-cpp: Runs a demo using the C++ client
11 | # If no function is specified, the above two will be run consecutively.
12 | #
13 | # This takes a minute or so.  It runs a subset of tests from regtest.sh and
14 | # writes an HTML summary.
15 |
16 | set -o nounset
17 | set -o pipefail
18 | set -o errexit
19 |
20 | . util.sh
21 |
22 | readonly THIS_DIR=$(dirname $0)
23 | readonly REPO_ROOT=$THIS_DIR
24 | readonly CLIENT_DIR=$REPO_ROOT/client/python
25 |
26 | # All the Python tools need this
27 | export PYTHONPATH=$CLIENT_DIR
28 |
29 | #
30 | # Semi-automated demos
31 | #
32 |
33 | # Run rappor-sim through the Python profiler.
34 | rappor-sim-profile() {
35 |   local dist=$1
36 |   shift
37 |
38 |   # For now, just dump it to a text file.  Sort by cumulative time.
39 |   time python -m cProfile -s cumulative \
40 |     tests/rappor_sim.py \
41 |     -i _tmp/$dist.csv \
42 |     "$@" \
43 |     | tee _tmp/profile.txt
44 | }
45 |
46 | quick-python() {
47 |   ./regtest.sh run-seq '^demo3' python
48 | }
49 |
50 | quick-cpp() {
51 |   # For now we build it first.  Don't want to build it in parallel.
52 |   ./build.sh cpp-client
53 |
54 |   ./regtest.sh run-seq '^demo3' cpp
55 | }
56 |
57 | quick() {
58 |   quick-python
59 |   quick-cpp
60 | }
61 |
62 | # TODO: Port these old bad cases to regtest_spec.py.
63 |
64 | # Running the demo of the exponential distribution with 10000 reports (x7,
65 | # which is 70000 values).
66 | #
67 | # - There are 50 real values, but we add 1000 more candidates, to get 1050 candidates.
68 | # - And then we remove the two most common strings, v1 and v2.
69 | # - With the current analysis, we are getting sum(proportion) = 1.1 to 1.7
70 |
71 | # TODO: Make this sharper by including only one real value?
72 |
73 | bad-case() {
74 |   local num_additional=${1:-1000}
75 |   run-dist exp 10000 $num_additional 'v1|v2'
76 | }
77 |
78 | # Force it to be less than 1
79 | pcls-test() {
80 |   USE_PCLS=1 bad-case
81 | }
82 |
83 | # Only add 10 more candidates.  Then we properly get the 0.48 proportion.
84 | ok-case() {
85 |   run-dist exp 10000 10 'v1|v2'
86 | }
87 |
88 | if test $# -eq 0 ; then
89 |   quick
90 | else
91 |   "$@"
92 | fi
93 |
-------------------------------------------------------------------------------- /ui/metric.html: --------------------------------------------------------------------------------

Metric Results

Home / Overview / Histograms

Estimated Proportions

NOTE: Only the top 5 values for each day are shown

Underlying data: dist.csv

Number of Reports

Unallocated Mass

Plot Help: Drag horizontally to zoom to selection.  Double click to zoom out.
Shift + drag to pan.

Task Status

Underlying data: status.csv
-------------------------------------------------------------------------------- /bin/sum_bits.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom
19 | filter by cohort.  This can then be analyzed by R.
20 | """
21 |
22 | import csv
23 | import sys
24 |
25 | import rappor
26 |
27 |
28 | def SumBits(params, stdin, stdout):
29 |   csv_in = csv.reader(stdin)
30 |   csv_out = csv.writer(stdout)
31 |
32 |   num_cohorts = params.num_cohorts
33 |   num_bloombits = params.num_bloombits
34 |
35 |   sums = [[0] * num_bloombits for _ in xrange(num_cohorts)]
36 |   num_reports = [0] * num_cohorts
37 |
38 |   for i, row in enumerate(csv_in):
39 |     try:
40 |       (user_id, cohort, unused_bloom, unused_prr, irr) = row
41 |     except ValueError:
42 |       raise RuntimeError('Error parsing row %r' % row)
43 |
44 |     if i == 0:
45 |       continue  # skip header
46 |
47 |     cohort = int(cohort)
48 |     num_reports[cohort] += 1
49 |
50 |     if not len(irr) == params.num_bloombits:
51 |       raise RuntimeError(
52 |           "Expected %d bits, got %r" % (params.num_bloombits, len(irr)))
53 |     for j, c in enumerate(irr):
54 |       bit_num = num_bloombits - j - 1  # e.g. char 0 = bit 15, char 15 = bit 0
55 |       if c == '1':
56 |         sums[cohort][bit_num] += 1
57 |       else:
58 |         if c != '0':
59 |           raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
60 |
61 |   for cohort in xrange(num_cohorts):
62 |     # First column is the total number of reports in the cohort.
63 |     row = [num_reports[cohort]] + sums[cohort]
64 |     csv_out.writerow(row)
65 |
66 |
67 | def main(argv):
68 |   try:
69 |     filename = argv[1]
70 |   except IndexError:
71 |     raise RuntimeError('Usage: sum_bits.py <params file>')
72 |   with open(filename) as f:
73 |     try:
74 |       params = rappor.Params.from_csv(f)
75 |     except rappor.Error as e:
76 |       raise RuntimeError(e)
77 |
78 |   SumBits(params, sys.stdin, sys.stdout)
79 |
80 |
81 | if __name__ == '__main__':
82 |   try:
83 |     main(sys.argv)
84 |   except RuntimeError, e:
85 |     print >>sys.stderr, e.args[0]
86 |     sys.exit(1)
87 |
--------------------------------------------------------------------------------
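The reversed bit order is worth a worked example: the IRR string is written
most-significant bit first, so character 0 corresponds to bit k-1 and the
last character to bit 0.  A quick check in Python, using the first report
from sum_bits_test.py (k = 16):

    irr = '0000111100001111'
    k = len(irr)
    set_bits = [k - j - 1 for j, c in enumerate(irr) if c == '1']
    print sorted(set_bits)   # => [0, 1, 2, 3, 8, 9, 10, 11]

so this report increments the sums for bits 0-3 and 8-11, which matches the
low-order columns in EXPECTED_CSV_OUT above.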
-------------------------------------------------------------------------------- /client/cpp/rappor_deps.h: --------------------------------------------------------------------------------
1 | // Copyright 2015 Google Inc. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // This header declares the dependencies that the application must provide to
16 | // the RAPPOR encoder.
17 |
18 | #ifndef RAPPOR_DEPS_H_
19 | #define RAPPOR_DEPS_H_
20 |
21 | #include <stdint.h>  // for uint32_t
22 | #include <string>
23 | #include <vector>
24 |
25 | namespace rappor {
26 |
27 | // rappor::Bits type is used for Bloom Filter, PRR, and IRR
28 | typedef uint32_t Bits;
29 |
30 | // rappor::Encoder needs a hash function for the bloom filter, and an HMAC
31 | // function for the PRR.
32 |
33 | typedef bool HashFunc(const std::string& value, std::vector<uint8_t>* output);
34 | typedef bool HmacFunc(const std::string& key, const std::string& value,
35 |                       std::vector<uint8_t>* output);
36 |
37 | // Interface that the encoder uses to generate randomness for the IRR.
38 | // Applications should implement this based on their platform and requirements.
39 | class IrrRandInterface {
40 |  public:
41 |   virtual ~IrrRandInterface() {}
42 |   // Compute a bitmask with each bit set to 1 with probability 'prob'.
43 |   // Returns false if there is an error.
44 |   virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const = 0;
45 | };
46 |
47 | // Dependencies
48 | //   - hash_func: hash function for the Bloom Filter client step
49 | //   - client_secret: key for deterministic randomness in the PRR
50 | //   - hmac_func: function for deterministic randomness in the PRR
51 | //   - irr_rand: randomness for the IRR
52 |
53 | class Deps {
54 |  public:
55 |   Deps(HashFunc* const hash_func, const std::string& client_secret,
56 |        HmacFunc* const hmac_func, const IrrRandInterface& irr_rand)
57 |       : hash_func_(hash_func),
58 |         client_secret_(client_secret),
59 |         hmac_func_(hmac_func),
60 |         irr_rand_(irr_rand) {
61 |   }
62 |
63 |  private:
64 |   friend class Encoder;
65 |
66 |   HashFunc* hash_func_;  // for bloom filter
67 |   const std::string client_secret_;  // for PRR; copy of constructor param
68 |   HmacFunc* hmac_func_;  // PRR
69 |   const IrrRandInterface& irr_rand_;  // IRR
70 | };
71 |
72 | }  // namespace rappor
73 |
74 | #endif  // RAPPOR_DEPS_H_
75 |
76 |
-------------------------------------------------------------------------------- /setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Setup RAPPOR analysis on Ubuntu Trusty (Google Cloud or otherwise).
4 | #
5 | # For the apps/api server, you need 'install-minimal'.  For the regtest, and
6 | # Shiny apps, we need a few more R packages (ggplot2, data.table, etc.).  They
7 | # cause versioning problems, so we keep them separate.
8 | #
9 | # Usage:
10 | #   ./setup.sh [function name]
11 | # If run without specifying any function it will run: install-most
12 | # which should cover all the packages needed to run the demo.
13 |
14 | set -o nounset
15 | set -o pipefail
16 | set -o errexit
17 |
18 | native-packages() {
19 |   sudo apt-get update
20 |   # - build-essential for gcc compilers, invoked while installing R packages.
21 |   # - gfortran Fortran compiler needed for glmnet.
22 |   # - libblas-dev needed for limSolve.
23 |   # - python-dev is for building the fastrand extension
24 |   #
25 |   # NOTE: we get R 3.0.2 on Trusty.
26 |   sudo apt-get install build-essential gfortran libblas-dev r-base python-dev graphviz
27 | }
28 |
29 | r-packages() {
30 |   # Install as root so you can write to /usr/local/lib/R.
31 |
32 |   # glmnet, limSolve: solvers for decode.R
33 |   # RJSONIO, optparse: for decode_dist.R
34 |   # RUnit: for unit tests
35 |   # abind: for decode_test only
36 |   sudo R -e \
37 |     'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
38 | }
39 |
40 | # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
41 | install-plyr-with-friends() {
42 |   mkdir -p _tmp
43 |   wget --directory _tmp \
44 |     http://cran.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.11.4.tar.gz
45 |   wget --directory _tmp \
46 |     http://cran.r-project.org/src/contrib/Archive/plyr/plyr_1.8.1.tar.gz
47 |   sudo R CMD INSTALL _tmp/Rcpp_0.11.4.tar.gz
48 |   sudo R CMD INSTALL _tmp/plyr_1.8.1.tar.gz
49 |   sudo R -e \
50 |     'install.packages(c("reshape2", "ggplot2", "data.table"), repos="http://cran.rstudio.com/")'
51 | }
52 |
53 | # Keep Shiny separate, since it seems to install a lot of dependencies.
54 | shiny() {
55 |   sudo R -e \
56 |     'install.packages(c("shiny"), repos="http://cran.rstudio.com/")'
57 | }
58 |
59 | #
60 | # Batch
61 | #
62 |
63 | install-minimal() {
64 |   native-packages
65 |   r-packages
66 | }
67 |
68 | # NOTE: hasn't yet been tested on a clean machine.
69 | install-most() {
70 |   install-minimal
71 |   install-plyr-with-friends
72 | }
73 |
74 | #
75 | # Shiny Apps / API Server
76 | #
77 |
78 | # After running one of the run_app.sh scripts, see if the app returns a page.
79 | shiny-smoke-test() {
80 |   curl http://localhost:6789/
81 | }
82 |
83 | # Then set up a "firewall rule" in console.developers.google.com to open up
84 | # "tcp:6789".  Test it from the outside.
85 |
86 | if test $# -eq 0 ; then
87 |   install-most
88 | else
89 |   "$@"
90 | fi
91 |
-------------------------------------------------------------------------------- /tests/gen_true_values.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | #
3 | # Copyright 2015 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | source('tests/gen_counts.R')
18 |
19 | # Usage:
20 | #   $ ./gen_true_values.R exp 100 10000 1 8 foo.csv
21 | #
22 | # Inputs:
23 | #   distribution name
24 | #   size of the distribution's support
25 | #   number of clients
26 | #   reports per client
27 | #   number of cohorts
28 | #   name of the output file
29 | # Output:
30 | #   csv file with reports sampled according to the specified distribution.
31 |
32 | GenerateTrueValues <- function(distr, distr_range, num_clients,
33 |                                reports_per_client, num_cohorts) {
34 |
35 |   # Sums to 1.0, e.g. [0.2 0.2 0.2 0.2 0.2] for uniform distribution of 5.
36 |   pdf <- ComputePdf(distr, distr_range)
37 |
38 |   num_reports <- num_clients * reports_per_client
39 |
40 |   # Computes the number of reports for each value, where the numbers are
41 |   # sampled according to pdf (sums to num_reports).
42 |   partition <- RandomPartition(num_reports, pdf)
43 |
44 |   value_ints <- rep(1:distr_range, partition)  # expand partition
45 |
46 |   stopifnot(length(value_ints) == num_reports)
47 |
48 |   # Shuffle values randomly (may take a few sec for > 10^8 inputs)
49 |   value_ints <- sample(value_ints)
50 |
51 |   # Reported values are strings, so prefix integers with "v".  Even slower than
52 |   # shuffling.
53 |   values <- sprintf("v%d", value_ints)
54 |
55 |   # e.g. [1 1 2 2 3 3] if num_clients is 3 and reports_per_client is 2
56 |   client_ints <- rep(1:num_clients, each = reports_per_client)
57 |
58 |   # Cohorts are assigned to clients.  Cohorts are 0-based.
59 |   cohorts <- client_ints %% num_cohorts  # %% is integer modulus
60 |
61 |   clients <- sprintf("c%d", client_ints)
62 |
63 |   data.frame(client = clients, cohort = cohorts, value = values)
64 | }
65 |
66 | main <- function(argv) {
67 |   distr <- argv[[1]]
68 |   distr_range <- as.integer(argv[[2]])
69 |   num_clients <- as.integer(argv[[3]])
70 |   reports_per_client <- as.integer(argv[[4]])
71 |   num_cohorts <- as.integer(argv[[5]])
72 |   out_file <- argv[[6]]
73 |
74 |   reports <- GenerateTrueValues(distr, distr_range, num_clients,
75 |                                 reports_per_client, num_cohorts)
76 |
77 |   write.csv(reports, file = out_file, row.names = FALSE, quote = FALSE)
78 | }
79 |
80 | if (length(sys.frames()) == 0) {
81 |   main(commandArgs(TRUE))
82 | }
83 |
--------------------------------------------------------------------------------
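For readers more comfortable in Python, here is the same generation scheme as
a short sketch.  It is an approximation for illustration only: RandomPartition
itself lives in tests/gen_counts.R, and is replaced here by independent
weighted draws per report:

    import random

    def generate_true_values(pdf, num_clients, reports_per_client, num_cohorts):
        """pdf: list of probabilities, one per value v1..vN (sums to 1.0)."""
        num_reports = num_clients * reports_per_client
        support = range(1, len(pdf) + 1)
        value_ints = [weighted_choice(support, pdf) for _ in xrange(num_reports)]
        random.shuffle(value_ints)
        rows = []
        for i in xrange(num_reports):
            client = i // reports_per_client + 1   # 1-based client id
            cohort = client % num_cohorts          # 0-based cohort, as in the R
            rows.append(('c%d' % client, cohort, 'v%d' % value_ints[i]))
        return rows

    def weighted_choice(values, weights):
        r = random.random()
        cum = 0.0
        for v, w in zip(values, weights):
            cum += w
            if r < cum:
                return v
        return values[-1]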
-------------------------------------------------------------------------------- /pipeline/alarm-lib.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Alarm tool.
4 | #
5 | # Usage:
6 | #   ./alarm-lib.sh
7 | #
8 | # You can source this file and use the alarm-status function.
9 |
10 | set -o nounset
11 | set -o pipefail
12 | set -o errexit
13 |
14 | # Run a command with a timeout, and print its status to a directory.
15 | #
16 | # Usage:
17 | #   alarm-status job_dir/STATUS 10 \
18 | #     flaky_command ...
19 |
20 | alarm-status() {
21 |   set +o errexit
22 |   local status_file=$1
23 |   shift  # everything except the status file goes to perl
24 |
25 |   # NOTE: It would be nice to setpgrp() before exec?  And then can the signal
26 |   # be delivered to the entire group, like kill -SIGALRM -PID?
27 |
28 |   # NOTE: If we did this in Python, the error message would also be clearer.
29 |   perl -e 'alarm shift; exec @ARGV or die "ERROR: after exec @ARGV"' "$@"
30 |   local exit_code=$?
31 |
32 |   set -o errexit
33 |
34 |   local result=''
35 |   case $exit_code in
36 |     0)
37 |       # Would be nice to show elapsed time?
38 |       result='OK'
39 |       ;;
40 |     9)
41 |       # decode_assoc.R will exit 9 if there are no reports AFTER
42 |       # --remove-bad-rows.  A task can also be marked SKIPPED before running
43 |       # the child process (see backfill.sh).
44 |       result='SKIPPED by child process'
45 |       ;;
46 |     # exit code 142 means SIGALRM.  128 + 14 = 142.  See 'kill -l'.
47 |     142)
48 |       local seconds=$1
49 |       result="TIMEOUT after $seconds seconds"
50 |       ;;
51 |     *)
52 |       result="FAIL with status $exit_code"
53 |       ;;
54 |   esac
55 |   echo "$result"
56 |   echo "$result" > $status_file
57 | }
58 |
59 | _work() {
60 |   local n=10  # 2 seconds
61 |   for i in $(seq $n); do
62 |     echo $i - "$@"
63 |     sleep 0.2
64 |   done
65 | }
66 |
67 | _succeed() {
68 |   _work "$@"
69 |   exit 0
70 | }
71 |
72 | _fail() {
73 |   _work "$@"
74 |   exit 1
75 | }
76 |
77 | _skip() {
78 |   exit 9
79 | }
80 |
81 | # http://perldoc.perl.org/functions/alarm.html
82 | #
83 | # Delivers alarm.  But how to get the process to have a distinct exit code?
84 |
85 | demo() {
86 |   mkdir -p _tmp
87 |
88 |   # timeout
89 |   alarm-status _tmp/A 1 $0 _succeed foo
90 |   echo
91 |
92 |   # ok
93 |   alarm-status _tmp/B 3 $0 _succeed bar
94 |   echo
95 |
96 |   # fail
97 |   alarm-status _tmp/C 3 $0 _fail baz
98 |   echo
99 |
100 |   # skip
101 |   alarm-status _tmp/D 3 $0 _skip baz
102 |   echo
103 |
104 |   head _tmp/{A,B,C,D}
105 | }
106 |
107 | test-simple() {
108 |   alarm-status _tmp/status.txt 1 sleep 2
109 | }
110 |
111 | test-bad-command() {
112 |   alarm-status _tmp/status.txt 1 nonexistent_sleep 2
113 | }
114 |
115 | # BUG
116 | test-perl() {
117 |   set +o errexit
118 |   perl -e 'alarm shift; exec @ARGV or die "ERROR after exec @ARGV"' 1 _sleep 2
119 |   echo $?
120 | }
121 |
122 | if test $(basename $0) = 'alarm-lib.sh'; then
123 |   "$@"
124 | fi
125 |
-------------------------------------------------------------------------------- /analysis/R/alternative.R: --------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(limSolve)
16 | library(Matrix)
17 |
18 | # The next two functions create a matrix (G) and a vector (H) encoding
19 | # linear inequality constraints that a solution vector (x) must satisfy:
20 | #     G * x >= H
21 |
22 | # They currently represent three sets of constraints on the solution vector:
23 | #   - all solution coefficients are nonnegative
24 | #   - the sum total of all solution coefficients is no more than 1
25 | #   - in each of the coordinates of the target vector (estimated Bloom filter)
26 | #     we don't overshoot by more than three standard deviations.
27 | MakeG <- function(n, X) {
28 |   d <- Diagonal(n)
29 |   last <- rep(-1, n)
30 |   rbind2(rbind2(d, last), -X)
31 | }
32 |
33 | MakeH <- function(n, Y, stds) {
34 |   # set the floor at 0.01 to avoid degenerate cases
35 |   YY <- apply(Y + 3 * stds,  # in each bin don't overshoot by more than 3 stds
36 |               1:2,
37 |               function(x) min(1, max(0.01, x)))  # clamp the bound to [0.01,1]
38 |
39 |   c(rep(0, n),  # non-negativity condition
40 |     -1,  # coefficients sum up to no more than 1
41 |     -as.vector(t(YY))  # t is important!
42 |   )
43 | }
44 |
45 | MakeLseiModel <- function(X, Y, stds) {
46 |   m <- dim(X)[1]
47 |   n <- dim(X)[2]
48 |
49 |   # no slack variables for now
50 |   # slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE)
51 |   # colnames(slack) <- 1:m
52 |   # diag(slack) <- TRUE
53 |   #
54 |   # G <- MakeG(n + m)
55 |   # H <- MakeH(n + m)
56 |   #
57 |   # G[n+m+1,n:(n+m)] <- -0.1
58 |   # A = cbind2(X, slack)
59 |
60 |   w <- as.vector(t(1 / stds))
61 |   w_median <- median(w[!is.infinite(w)])
62 |   if (is.na(w_median))  # all w are infinite
63 |     w_median <- 1
64 |   w[w > w_median * 2] <- w_median * 2
65 |   w <- w / mean(w)
66 |
67 |   list(# coerce sparse Boolean matrix X to sparse numeric matrix
68 |        A = Diagonal(x = w) %*% (X + 0),
69 |        B = as.vector(t(Y)) * w,  # transform to vector in the row-first order
70 |        G = MakeG(n, X),
71 |        H = MakeH(n, Y, stds),
72 |        type = 2)  # Since there are no equality constraints, lsei defaults to
73 |                   # solve.QP anyway, but outputs a warning unless type == 2.
74 | }
75 |
76 | # CustomLM(X, Y)
77 | ConstrainedLinModel <- function(X, Y) {
78 |   model <- MakeLseiModel(X, Y$estimates, Y$stds)
79 |   coefs <- do.call(lsei, model)$X
80 |   names(coefs) <- colnames(X)
81 |
82 |   coefs
83 | }
--------------------------------------------------------------------------------
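Written out, the constraint system that MakeG and MakeH build is (with n
candidates, target vector Y flattened row-first as in the code, and per-bin
standard deviations s):

    \[
    G = \begin{pmatrix} I_n \\ -\mathbf{1}^\top \\ -X \end{pmatrix},
    \qquad
    H = \begin{pmatrix} \mathbf{0}_n \\ -1 \\
        -\,\operatorname{clamp}(Y + 3s,\ 0.01,\ 1) \end{pmatrix},
    \]

so that \( Gx \ge H \) unpacks to the three constraint sets from the comments:
\( x \ge 0 \), \( \sum_i x_i \le 1 \), and
\( Xx \le \operatorname{clamp}(Y + 3s,\ 0.01,\ 1) \).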

/tests/regtest.html: --------------------------------------------------------------------------------

RAPPOR regtest.sh (_IMPL_)

Parent
-------------------------------------------------------------------------------- /tests/_fastrand.c: --------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Google Inc. All rights reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | /*
18 |  * _fastrand.c -- Python extension module to generate random bit vectors
19 |  * quickly.
20 |  *
21 |  * IMPORTANT: This module does not use cryptographically strong randomness.
22 |  * It should ONLY be used to speed up the simulation.  Don't use it in
23 |  * production.
24 |  *
25 |  * If an adversary can predict which random bits are flipped, then RAPPOR's
26 |  * privacy is compromised.
27 |  *
28 |  */
29 |
30 | #include <stdint.h>  // uint64_t
31 | #include <stdio.h>  // printf
32 | #include <stdlib.h>  // srand
33 | #include <time.h>  // time
34 |
35 | #include <Python.h>
36 |
37 | uint64_t randbits(float p1, int num_bits) {
38 |   uint64_t result = 0;
39 |   // RAND_MAX is the maximum int returned by rand().
40 |   //
41 |   // When p1 == 1.0, we want to guarantee that all bits are 1.  The threshold
42 |   // will be RAND_MAX + 1.  In the rare case that rand() returns RAND_MAX, the
43 |   // "<" test succeeds, so we get 1.
44 |   //
45 |   // When p1 == 0.0, we want to guarantee that all bits are 0.  The threshold
46 |   // will be 0.  In the rare case that rand() returns 0, the "<" test fails, so
47 |   // we get 0.
48 |
49 |   // NOTE: cast is necessary to do unsigned arithmetic rather than signed.
50 |   // RAND_MAX is an int so adding 1 won't overflow a uint64_t.
51 |   uint64_t max = (uint64_t)RAND_MAX + 1u;
52 |   uint64_t threshold = p1 * max;
53 |   int i;
54 |   for (i = 0; i < num_bits; ++i) {
55 |     // NOTE: The comparison is strict (<) against threshold = p1 * (RAND_MAX + 1),
56 |     // so p1 = 1.0 implies that the bit is ALWAYS set, and p1 = 0.0 that it never is.
57 |     uint64_t bit = (rand() < threshold);
58 |     result |= (bit << i);
59 |   }
60 |   return result;
61 | }
62 |
63 | static PyObject *
64 | func_randbits(PyObject *self, PyObject *args) {
65 |   float p1;
66 |   int num_bits;
67 |
68 |   if (!PyArg_ParseTuple(args, "fi", &p1, &num_bits)) {
69 |     return NULL;
70 |   }
71 |   if (p1 < 0.0 || p1 > 1.0) {
72 |     printf("p1 must be between 0.0 and 1.0\n");
73 |     // return None for now; easier than raising ValueError
74 |     Py_INCREF(Py_None);
75 |     return Py_None;
76 |   }
77 |   if (num_bits < 0 || num_bits > 64) {
78 |     printf("num_bits must be 64 or less\n");
79 |     // return None for now; easier than raising ValueError
80 |     Py_INCREF(Py_None);
81 |     return Py_None;
82 |   }
83 |
84 |   //printf("p: %f\n", p);
85 |   uint64_t r = randbits(p1, num_bits);
86 |   return PyLong_FromUnsignedLongLong(r);
87 | }
88 |
89 | PyMethodDef methods[] = {
90 |   {"randbits", func_randbits, METH_VARARGS,
91 |    "Return a number with N bits, where each bit is 1 with probability p."},
92 |   {NULL, NULL},
93 | };
94 |
95 | void init_fastrand(void) {
96 |   Py_InitModule("_fastrand", methods);
97 |
98 |   // Just seed it here; we don't give the application any control.
99 |   int seed = time(NULL);
100 |   srand(seed);
101 | }
102 |
--------------------------------------------------------------------------------
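The threshold trick is easier to see in a pure-Python rendering of the same
function (for clarity only; the C extension exists because this loop is hot
in the simulation):

    import random

    def randbits(p1, num_bits):
        # Set each of num_bits bits independently with probability p1.
        result = 0
        for i in xrange(num_bits):
            bit = random.random() < p1  # like rand() < p1 * (RAND_MAX + 1)
            result |= (bit << i)
        return result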
/tests/regtest_spec.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """Print a test spec on stdout.
3 |
4 | Each line has parameters for a test case.  The regtest.sh shell script reads
5 | these lines and runs parallel processes.
6 |
7 | We use Python data structures so the test cases are easier to read and edit.
8 | """
9 |
10 | import optparse
11 | import sys
12 |
13 | #
14 | # TEST CONFIGURATION
15 | #
16 |
17 | DEMO = (
18 |     # (case_name distr num_unique_values num_clients values_per_client)
19 |     # (num_bits num_hashes num_cohorts)
20 |     # (p q f) (num_additional regexp_to_remove)
21 |     ('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
22 |     ('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
23 |     ('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
24 |     ('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
25 |     ('demo5 zipf1.5 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
26 | )
27 |
28 | DISTRIBUTIONS = (
29 |     'unif',
30 |     'exp',
31 |     'gauss',
32 |     'zipf1',
33 |     'zipf1.5',
34 | )
35 |
36 | DISTRIBUTION_PARAMS = (
37 |     # name, num unique values, num clients, values per client
38 |     ('tiny', 100, 1000, 1),  # test for insufficient data
39 |     ('small', 100, 1000000, 1),
40 |     ('medium', 1000, 10000000, 1),
41 |     ('large', 10000, 100000000, 1),
42 | )
43 |
44 | # 'k, h, m' as in params file.
45 | BLOOMFILTER_PARAMS = {
46 |     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
47 |     '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
48 |     '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
49 |     '128x128': (128, 2, 128),  # 128 cohorts, 128 bits each, 2 bits set in each
50 | }
51 |
52 | # 'p, q, f' as in params file.
53 | PRIVACY_PARAMS = {
54 |     'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5
55 |     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
56 | }
57 |
58 | # For deriving candidates from true inputs.
59 | MAP_REGEX_MISSING = { 60 | 'sharp': 'NONE', # Categorical data 61 | '10%': 'v[0-9]*9$', # missing every 10th string 62 | } 63 | 64 | # test configuration -> 65 | # (name modifier, Bloom filter, privacy params, fraction of extra, 66 | # regex missing) 67 | TEST_CONFIGS = [ 68 | ('typical', '8x128', 'eps_1_1', .2, '10%'), 69 | ('sharp', '8x128', 'eps_1_1', .0, 'sharp'), # no extra candidates 70 | ('loose', '8x128', 'eps_1_5', .2, '10%'), # loose privacy 71 | ('over_x2', '8x128', 'eps_1_1', 2.0, '10%'), # overshoot by x2 72 | ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10 73 | ] 74 | 75 | # 76 | # END TEST CONFIGURATION 77 | # 78 | 79 | 80 | def main(argv): 81 | rows = [] 82 | 83 | test_case = [] 84 | for (distr_params, num_values, num_clients, 85 | num_reports_per_client) in DISTRIBUTION_PARAMS: 86 | for distribution in DISTRIBUTIONS: 87 | for (config_name, bloom_name, privacy_params, fr_extra, 88 | regex_missing) in TEST_CONFIGS: 89 | test_name = 'r-{}-{}-{}'.format(distribution, distr_params, 90 | config_name) 91 | 92 | params = (BLOOMFILTER_PARAMS[bloom_name] 93 | + PRIVACY_PARAMS[privacy_params] 94 | + tuple([int(num_values * fr_extra)]) 95 | + tuple([MAP_REGEX_MISSING[regex_missing]])) 96 | 97 | test_case = (test_name, distribution, num_values, num_clients, 98 | num_reports_per_client) + params 99 | row_str = [str(element) for element in test_case] 100 | rows.append(row_str) 101 | 102 | for params in DEMO: 103 | rows.append(params) 104 | 105 | for row in rows: 106 | print ' '.join(row) 107 | 108 | if __name__ == '__main__': 109 | try: 110 | main(sys.argv) 111 | except RuntimeError, e: 112 | print >>sys.stderr, 'FATAL: %s' % e 113 | sys.exit(1) 114 | -------------------------------------------------------------------------------- /pipeline/combine_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """Combines results from multiple days of a single metric. 3 | 4 | Feed it the STATUS.txt files on stdin. It then finds the corresponding 5 | results.csv, and takes the top N items. 6 | 7 | Example: 8 | 9 | Date, "google.com,", yahoo.com 10 | 2015-03-01, 0.0, 0.9 11 | 2015-03-02, 0.1, 0.8 12 | 13 | Dygraphs can load this CSV file directly. 14 | 15 | TODO: Use different dygraph API? 16 | 17 | Also we need error bars. 18 | 19 | new Dygraph(document.getElementById("graphdiv2"), 20 | [ 21 | [1,10,100], 22 | [2,20,80], 23 | [3,50,60], 24 | [4,70,80] 25 | ], 26 | { 27 | labels: [ "Date", "failure", "timeout", "google.com" ] 28 | }); 29 | """ 30 | 31 | import collections 32 | import csv 33 | import json 34 | import os 35 | import sys 36 | 37 | import util 38 | 39 | 40 | def CombineDistResults(stdin, c_out, num_top): 41 | dates = [] 42 | var_cols = collections.defaultdict(dict) # {name: {date: value}} 43 | 44 | seen_dates = set() 45 | 46 | for line in stdin: 47 | status_path = line.strip() 48 | 49 | # Assume it looks like .../2015-03-01/STATUS.txt 50 | task_dir = os.path.dirname(status_path) 51 | date = os.path.basename(task_dir) 52 | 53 | # Get rid of duplicate dates. These could be caused by retries. 
54 |     if date in seen_dates:
55 |       continue
56 |
57 |     seen_dates.add(date)
58 |
59 |     with open(status_path) as f:
60 |       status = f.readline().split()[0]  # OK, FAIL, TIMEOUT, SKIPPED
61 |
62 |     dates.append(date)
63 |
64 |     if status != 'OK':
65 |       continue  # won't have results.csv
66 |
67 |     results_path = os.path.join(task_dir, 'results.csv')
68 |     with open(results_path) as f:
69 |       c = csv.reader(f)
70 |       unused_header = c.next()  # header row
71 |
72 |       # they are sorted by decreasing "estimate", which is what we want
73 |       for i in xrange(0, num_top):
74 |         try:
75 |           row = c.next()
76 |         except StopIteration:
77 |           # It's OK if it doesn't have enough
78 |           util.log('Stopping early.  Fewer than %d results to render.', num_top)
79 |           break
80 |
81 |         string, _, _, proportion, _, prop_low, prop_high = row
82 |
83 |         # dygraphs has a weird format with semicolons:
84 |         # value;lower;upper,value;lower;upper.
85 |
86 |         # http://dygraphs.com/data.html#csv
87 |
88 |         # Arbitrarily use 4 digits after decimal point (for dygraphs, not
89 |         # directly displayed)
90 |         dygraph_triple = '%.4f;%.4f;%.4f' % (
91 |             float(prop_low), float(proportion), float(prop_high))
92 |
93 |         var_cols[string][date] = dygraph_triple
94 |
95 |   # Now print CSV on stdout.
96 |   cols = sorted(var_cols.keys())  # sort columns alphabetically
97 |   c_out.writerow(['date'] + cols)
98 |
99 |   dates.sort()
100 |
101 |   for date in dates:
102 |     row = [date]
103 |     for col in cols:
104 |       cell = var_cols[col].get(date)  # None means there is no row
105 |       row.append(cell)
106 |     c_out.writerow(row)
107 |
108 |   #util.log("Number of dynamic cols: %d", len(var_cols))
109 |
110 |
111 | def CombineAssocResults(stdin, c_out, num_top):
112 |   header = ('dummy',)
113 |   c_out.writerow(header)
114 |
115 |
116 | def main(argv):
117 |   action = argv[1]
118 |
119 |   if action == 'dist':
120 |     num_top = int(argv[2])  # number of values to keep
121 |     c_out = csv.writer(sys.stdout)
122 |     CombineDistResults(sys.stdin, c_out, num_top)
123 |
124 |   elif action == 'assoc':
125 |     num_top = int(argv[2])  # number of values to keep
126 |     c_out = csv.writer(sys.stdout)
127 |     CombineAssocResults(sys.stdin, c_out, num_top)
128 |
129 |   else:
130 |     raise RuntimeError('Invalid action %r' % action)
131 |
132 |
133 | if __name__ == '__main__':
134 |   try:
135 |     main(sys.argv)
136 |   except RuntimeError, e:
137 |     print >>sys.stderr, 'FATAL: %s' % e
138 |     sys.exit(1)
139 |
-------------------------------------------------------------------------------- /tests/gen_counts_test.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 17 | library(RUnit) 18 | library(Matrix) # for sparse matrices 19 | 20 | source('tests/gen_counts.R') 21 | 22 | TestGenerateCounts <- function() { 23 | report_params <- list(k = 4, m = 2) # 2 cohorts, 4 bits each 24 | map <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 25 | map[1,] <- c(1, 0, 0) 26 | map[2,] <- c(0, 1, 0) 27 | map[3,] <- c(0, 0, 1) 28 | map[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 29 | map[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 30 | 31 | colnames(map) <- c('v1', 'v2', 'v3') 32 | 33 | partition <- c(3, 2, 1) * 10000 34 | v <- 100 # reports per client 35 | 36 | noise0 <- list(p = 0, q = 1, f = 0) # no noise at all 37 | counts0 <- GenerateCounts(c(report_params, noise0), map, partition, v) 38 | 39 | checkEqualsNumeric(sum(counts0[1,2:4]), counts0[1,1]) 40 | checkEqualsNumeric(counts0[1,5], counts0[1,1]) 41 | checkEqualsNumeric(partition[3] * v, counts0[1,4] + counts0[2,2]) 42 | checkEqualsNumeric(sum(partition) * v, counts0[1,1] + counts0[2,1]) 43 | 44 | pvalues <- chisq.test(counts0[,1] / v, p = c(.5, .5))$p.value 45 | for(i in 2:4) 46 | pvalues <- c(pvalues, 47 | chisq.test( 48 | c(counts0[1,i] / v, partition[i - 1] - counts0[1,i] / v), 49 | p = c(.5, .5))$p.value) 50 | 51 | noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs 52 | counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v) 53 | 54 | for(i in 2:5) 55 | for(j in 1:2) 56 | pvalues <- c(pvalues, 57 | chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]), 58 | p = c(.5, .5))$p.value) 59 | 60 | noise2 <- list(p = 0, q = 1, f = 1.0) # truly random PRRs 61 | counts2 <- GenerateCounts(c(report_params, noise2), map, partition, v) 62 | 63 | checkEqualsNumeric(0, max(counts2 %% v)) # all entries must be divisible by v 64 | 65 | counts2 <- counts2 / v 66 | 67 | for(i in 2:5) 68 | for(j in 1:2) 69 | pvalues <- c(pvalues, 70 | chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]), 71 | p = c(.5, .5))$p.value) 72 | 73 | checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed") 74 | } 75 | 76 | TestRandomPartition <- function() { 77 | 78 | p1 <- RandomPartition(total = 100, dgeom(0:999, prob = .1)) 79 | p2 <- RandomPartition(total = 1000, dnorm(1:1000, mean = 500, sd = 1000 / 6)) 80 | p3 <- RandomPartition(total = 10000, dunif(1:1000)) 81 | 82 | # Totals must check out. 83 | checkEqualsNumeric(100, sum(p1)) 84 | checkEqualsNumeric(1000, sum(p2)) 85 | checkEqualsNumeric(10000, sum(p3)) 86 | 87 | # Initialize the weights vector to 1 0 1 0 1 0 ... 88 | weights <- rep(c(1, 0), 100) 89 | 90 | p4 <- RandomPartition(total = 10000, weights) 91 | 92 | # Check that all mass is allocated to non-zero weights. 93 | checkEqualsNumeric(10000, sum(p4[weights == 1])) 94 | checkTrue(all(p4[weights == 0] == 0)) 95 | 96 | p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4)) 97 | p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value 98 | 99 | # Apply the chi squared test and fail if p.value is too high or too low. 100 | # Probability of failure is 2 * 1E-9, which should never happen. 101 | checkTrue(p.value > 1E-9) 102 | } 103 | 104 | TestAll <- function(){ 105 | TestRandomPartition() 106 | TestGenerateCounts() 107 | } 108 | 109 | TestAll() -------------------------------------------------------------------------------- /client/cpp/openssl_hash_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "openssl_hash_impl.h"
16 |
17 | #include <string>
18 | #include <vector>
19 |
20 | #include <openssl/evp.h>  // EVP_sha256
21 | #include <openssl/hmac.h>  // HMAC
22 | #include <openssl/md5.h>  // MD5
23 | #include <openssl/sha.h>  // SHA256_DIGEST_LENGTH
24 |
25 | namespace rappor {
26 |
27 | // of type HmacFunc in rappor_deps.h
28 | bool HmacSha256(const std::string& key, const std::string& value,
29 |                 std::vector<uint8_t>* output) {
30 |   output->resize(SHA256_DIGEST_LENGTH, 0);
31 |
32 |   // Returns a pointer on success, or NULL on failure.
33 |   unsigned char* result = HMAC(
34 |       EVP_sha256(), key.c_str(), key.size(),
35 |       // std::string has 'char', OpenSSL wants unsigned char.
36 |       reinterpret_cast<const unsigned char*>(value.c_str()),
37 |       value.size(),
38 |       output->data(),
39 |       NULL);
40 |
41 |   return (result != NULL);
42 | }
43 |
44 | // Of type HmacFunc in rappor_deps.h
45 | //
46 | // The length of the passed-in output vector determines how many
47 | // bytes are returned.
48 | //
49 | // No reseed operation, but recommended reseed_interval <= 2^48 updates.
50 | // Since we're seeding for each value and typically don't need
51 | // so many bytes, we should be OK.
52 | bool HmacDrbg(const std::string& key, const std::string& value,
53 |               std::vector<uint8_t>* output) {
54 |   const unsigned char k_array[] = {
55 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
56 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
57 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
58 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
59 |   };
60 |   std::string v;
61 |   std::vector<uint8_t> temp_output;
62 |   int num_bytes = output->size();
63 |   if (num_bytes == 0) {
64 |     // By default return 32 bytes for Uint32 applications.
65 |     num_bytes = 32;
66 |   }
67 |
68 |   v.append(32u, 0x01);
69 |   temp_output.resize(32, 0);
70 |
71 |   std::string temp_str(v);
72 |   temp_str.append(std::string("\0", 1));
73 |   // provided_data is key|value.
74 |   temp_str.append(key);
75 |   temp_str.append(value);
76 |
77 |   output->resize(0);
78 |
79 |   // Instantiate.
80 |   if (!HmacSha256(std::string(k_array, k_array + 32), temp_str, &temp_output)) {
81 |     return false;
82 |   }
83 |   std::string k(temp_output.begin(), temp_output.end());
84 |   if (!HmacSha256(k, v, &temp_output)) {
85 |     return false;
86 |   }
87 |   v = std::string(temp_output.begin(), temp_output.end());
88 |   if (!HmacSha256(k, v + std::string("\1", 1) + key + value, &temp_output)) {
89 |     return false;
90 |   }
91 |   k = std::string(temp_output.begin(), temp_output.end());
92 |   if (!HmacSha256(k, v, &temp_output)) {
93 |     return false;
94 |   }
95 |   v = std::string(temp_output.begin(), temp_output.end());
96 |
97 |   while (output->size() < num_bytes) {
98 |     // Generate.
99 |     if (!HmacSha256(k, v, &temp_output)) {
100 |       return false;
101 |     }
102 |     v = std::string(temp_output.begin(), temp_output.end());
103 |     output->insert(output->end(), temp_output.begin(), temp_output.end());
104 |   }
105 |   output->resize(num_bytes);
106 |   return true;
107 | }
108 |
109 | // of type HashFunc in rappor_deps.h
110 | bool Md5(const std::string& value, std::vector<uint8_t>* output) {
111 |   output->resize(MD5_DIGEST_LENGTH, 0);
112 |
113 |   // std::string has 'char', OpenSSL wants unsigned char.
114 |   MD5(reinterpret_cast<const unsigned char*>(value.c_str()),
115 |       value.size(), output->data());
116 |   return true;  // OpenSSL MD5 doesn't return an error code
117 | }
118 |
119 | }  // namespace rappor
120 |
-------------------------------------------------------------------------------- /client/python/rappor_test.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | rappor_test.py: Tests for rappor.py
19 | """
20 | import cStringIO
21 | import copy
22 | import math
23 | import random
24 | import unittest
25 |
26 | import rappor  # module under test
27 |
28 |
29 | class RapporParamsTest(unittest.TestCase):
30 |
31 |   def setUp(self):
32 |     self.typical_instance = rappor.Params()
33 |     ti = self.typical_instance  # For convenience
34 |     ti.num_cohorts = 64  # Number of cohorts
35 |     ti.num_hashes = 2  # Number of bloom filter hashes
36 |     ti.num_bloombits = 16  # Number of bloom filter bits
37 |     ti.prob_p = 0.40  # Probability p
38 |     ti.prob_q = 0.70  # Probability q
39 |     ti.prob_f = 0.30  # Probability f
40 |
41 |   def testFromCsv(self):
42 |     f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\n')
43 |     params = rappor.Params.from_csv(f)
44 |     self.assertEqual(32, params.num_bloombits)
45 |     self.assertEqual(64, params.num_cohorts)
46 |
47 |     # Malformed header
48 |     f = cStringIO.StringIO('k,h,m,p,q\n32,2,64,0.5,0.75,0.6\n')
49 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
50 |
51 |     # Missing second row
52 |     f = cStringIO.StringIO('k,h,m,p,q,f\n')
53 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
54 |
55 |     # Too many rows
56 |     f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\nextra')
57 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
58 |
59 |   def testGetBloomBits(self):
60 |     for cohort in xrange(0, 64):
61 |       b = rappor.get_bloom_bits('foo', cohort, 2, 16)
62 |       #print 'cohort', cohort, 'bloom', b
63 |
64 |   def testGetPrr(self):
65 |     bloom = 1
66 |     num_bits = 8
67 |     for word in ('v1', 'v2', 'v3'):
68 |       masks = rappor.get_prr_masks('secret', word, 0.5, num_bits)
69 |       print 'masks', masks
70 |
71 |   def testToBigEndian(self):
72 |     b = rappor.to_big_endian(1)
73 |     print repr(b)
74 |     self.assertEqual(4, len(b))
75 |
76 |   def testEncoder(self):
77 |     # Test encoder with deterministic random function.
78 |     params = copy.copy(self.typical_instance)
79 |     params.prob_f = 0.5
80 |     params.prob_p = 0.5
81 |     params.prob_q = 0.75
82 |
83 |     # return these 3 probabilities in sequence.
84 |     rand = MockRandom([0.0, 0.6, 0.0], params)
85 |
86 |     e = rappor.Encoder(params, 0, 'secret', rand)
87 |
88 |     irr = e.encode("abc")
89 |
90 |     self.assertEquals(64493, irr)  # given MockRandom, this is what we get
91 |
92 |
93 | class MockRandom(object):
94 |   """Returns one of three random values in a cyclic manner.
95 |
96 |   Mock random function that involves *some* state, as needed for tests that
97 |   call randomness several times.  This makes it difficult to deal exclusively
98 |   with stubs for testing purposes.
99 |   """
100 |
101 |   def __init__(self, cycle, params):
102 |     self.p_gen = MockRandomCall(params.prob_p, cycle, params.num_bloombits)
103 |     self.q_gen = MockRandomCall(params.prob_q, cycle, params.num_bloombits)
104 |
105 | class MockRandomCall:
106 |   def __init__(self, prob, cycle, num_bits):
107 |     self.cycle = cycle
108 |     self.n = len(self.cycle)
109 |     self.prob = prob
110 |     self.num_bits = num_bits
111 |
112 |   def __call__(self):
113 |     counter = 0
114 |     r = 0
115 |     for i in xrange(0, self.num_bits):
116 |       rand_val = self.cycle[counter]
117 |       counter += 1
118 |       counter %= self.n  # wrap around
119 |       r |= ((rand_val < self.prob) << i)
120 |     return r
121 |
122 |
123 | if __name__ == "__main__":
124 |   unittest.main()
125 |
-------------------------------------------------------------------------------- /tests/user_spec.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """Print a test spec on stdout.
3 |
4 | Each line has parameters for a test case.  The regtest.sh shell script reads
5 | these lines and runs parallel processes.
6 |
7 | We use Python data structures so the test cases are easier to read and edit.
8 | """
9 |
10 | import sys
11 |
12 | #
13 | # TEST CONFIGURATION
14 | #
15 |
16 | # For gen_sim_input.py
17 | INPUT_PARAMS = {
18 |     # distribution, num unique values, num clients, values per client
19 |     'exp-100k': ('exp', 100, 100000, 1),
20 |     'exp-1m': ('exp', 100, 1000000, 1),
21 | }
22 |
23 | # For rappor_sim.py
24 | # 'k, h, m, p, q, f' as in params file.
25 | RAPPOR_PARAMS = {
26 |     # Initial chrome params from 2014.
27 |     # NOTE: fastrand simulation only supports 64 bits!  Make sure to use the
28 |     # 'fast_counts' code path.
29 |     'chrome128': (128, 2, 128, 0.25, 0.75, 0.50),
30 |
31 |     # Chrome params from early 2015 -- changed to 8 bit reports.
32 |     'chrome8': (8, 2, 128, 0.25, 0.75, 0.50),
33 |
34 |     # Original demo params
35 |     'demo': (16, 2, 64, 0.5, 0.75, 0.5),
36 | }
37 |
38 | # For deriving candidates from true inputs.
39 | MAP_PARAMS = {
40 |     # 1. Number of extra candidates to add.
41 |     # 2. Candidate strings to remove from the map.  This FORCES false
42 |     #    negatives, e.g. for common strings, since a string has to be in the map
43 |     #    for RAPPOR to choose it.
44 |     'add-100': (100, []),
45 |     'add-1000': (1000, []),
46 |     'add-2000': (2000, []),
47 |     # also thrashes on 128 bits
48 |     'add-3000': (3000, []),
49 |     'add-10000': (10000, []),
50 |     'add-15000': (15000, []),  # approx number of candidates for eTLD+1
51 |     'add-100000': (100000, []),
52 |     'remove-top-2': (20, ['v1', 'v2']),
53 | }
54 |
55 | # test case name -> (input params name, RAPPOR params name, map params name)
56 | TEST_CASES = [
57 |     ('chrome128-100k-100', 'exp-100k', 'chrome128', 'add-100'),
58 |     ('chrome128-100k-1000', 'exp-100k', 'chrome128', 'add-1000'),
59 |     ('chrome128-100k-2000', 'exp-100k', 'chrome128', 'add-2000'),
60 |     ('chrome128-100k-3000', 'exp-100k', 'chrome128', 'add-3000'),
61 |     # 128 bits and 15k candidates fails on a machine with 8 GB memory.
62 |     # Lasso finishes with 7508 non-zero coefficients, and then allocation
63 |     # fails.  TODO: just take the highest ones?
64 |     #('chrome128-100k-15000', 'exp-100k', 'chrome128', 'add-15000'),
65 |     #('chrome128-100k-100000', 'exp-100k', 'chrome128', 'add-100000'),
66 |
67 |     # NOTE: Adding more candidates exercises LASSO
68 |     ('chrome8-100k-100', 'exp-100k', 'chrome8', 'add-100'),
69 |     ('chrome8-100k-1000', 'exp-100k', 'chrome8', 'add-1000'),
70 |     ('chrome8-100k-2000', 'exp-100k', 'chrome8', 'add-2000'),
71 |     ('chrome8-100k-3000', 'exp-100k', 'chrome8', 'add-3000'),
72 |     ('chrome8-100k-15000', 'exp-100k', 'chrome8', 'add-15000'),
73 |
74 |     # NOTE: This one takes too much memory!  More than 4 GB.  This is because
75 |     # Lasso gets a huge matrix (100,000).  We got 1564 non-zero coefficients.
76 |     ('chrome8-100k-100000', 'exp-100k', 'chrome8', 'add-100000'),
77 |
78 |     # What happens when the candidates are missing top values?
79 |     ('chrome8-badcand', 'exp-100k', 'chrome8', 'remove-top-2'),
80 |
81 |     # TODO: Use chrome params with real map from Alexa 1M ?
82 | ]
83 |
84 | #
85 | # END TEST CONFIGURATION
86 | #
87 |
88 |
89 | def main(argv):
90 |   rows = []
91 |   for test_case, input_name, rappor_name, map_name in TEST_CASES:
92 |     input_params = INPUT_PARAMS[input_name]
93 |     rappor_params = RAPPOR_PARAMS[rappor_name]
94 |     map_params = MAP_PARAMS[map_name]
95 |     row = tuple([test_case]) + input_params + rappor_params + map_params
96 |     rows.append(row)
97 |
98 |   for row in rows:
99 |     for cell in row:
100 |       if isinstance(cell, list):
101 |         if cell:
102 |           cell_str = '|'.join(cell)
103 |         else:
104 |           cell_str = 'NONE'  # we don't want an empty string
105 |       else:
106 |         cell_str = cell
107 |       print cell_str,  # print it with a space after it
108 |     print  # new line after row
109 |
110 |
111 | if __name__ == '__main__':
112 |   try:
113 |     main(sys.argv)
114 |   except RuntimeError, e:
115 |     print >>sys.stderr, 'FATAL: %s' % e
116 |     sys.exit(1)
117 |
-------------------------------------------------------------------------------- /analysis/R/fast_em.R: --------------------------------------------------------------------------------
1 | # fast_em.R: Wrapper around analysis/cpp/fast_em.cc.
2 | #
3 | # This serializes the input, shells out, and deserializes the output.
4 |
5 | .Flatten <- function(list_of_matrices) {
6 |   list_of_vectors <- lapply(list_of_matrices, as.vector)
7 |   #print(list_of_vectors)
8 |
9 |   # unlist takes list to vector.
10 |   unlist(list_of_vectors)
11 | }
12 |
13 | .WriteListOfMatrices <- function(list_of_matrices, f) {
14 |   flattened <- .Flatten(list_of_matrices)
15 |
16 |   # NOTE: UpdateJointConditional does outer product of dimensions!
17 | 18 | # 3 letter strings are null terminated 19 | writeBin('ne ', con = f) 20 | num_entries <- length(list_of_matrices) 21 | writeBin(num_entries, con = f) 22 | 23 | Log('Wrote num_entries = %d', num_entries) 24 | 25 | # For 2x3, this is 6 26 | writeBin('es ', con = f) 27 | 28 | entry_size <- as.integer(prod(dim(list_of_matrices[[1]]))) 29 | writeBin(entry_size, con = f) 30 | 31 | Log('Wrote entry_size = %d', entry_size) 32 | 33 | # now write the data 34 | writeBin('dat', con = f) 35 | writeBin(flattened, con = f) 36 | } 37 | 38 | .ExpectTag <- function(f, tag) { 39 | # Read a single NUL-terminated character string. 40 | actual <- readBin(con = f, what = "char", n = 1) 41 | 42 | # Assert that we got what was expected. 43 | if (length(actual) != 1) { 44 | stop(sprintf("Failed to read a tag '%s'", tag)) 45 | } 46 | if (actual != tag) { 47 | stop(sprintf("Expected '%s', got '%s'", tag, actual)) 48 | } 49 | } 50 | 51 | .ReadResult <- function (f, entry_size, matrix_dims) { 52 | .ExpectTag(f, "emi") 53 | # NOTE: assuming R integers are 4 bytes (uint32_t) 54 | num_em_iters <- readBin(con = f, what = "int", n = 1) 55 | 56 | .ExpectTag(f, "pij") 57 | pij <- readBin(con = f, what = "double", n = entry_size) 58 | 59 | # Adjust dimensions 60 | dim(pij) <- matrix_dims 61 | 62 | Log("Number of EM iterations: %d", num_em_iters) 63 | Log("PIJ read from external implementation:") 64 | print(pij) 65 | 66 | # est, sd, var_cov, hist 67 | list(est = pij, num_em_iters = num_em_iters) 68 | } 69 | 70 | .SanityChecks <- function(joint_conditional) { 71 | # Display some stats before sending it over to C++. 72 | 73 | inf_counts <- lapply(joint_conditional, function(m) { 74 | sum(m == Inf) 75 | }) 76 | total_inf <- sum(as.numeric(inf_counts)) 77 | 78 | nan_counts <- lapply(joint_conditional, function(m) { 79 | sum(is.nan(m)) 80 | }) 81 | total_nan <- sum(as.numeric(nan_counts)) 82 | 83 | zero_counts <- lapply(joint_conditional, function(m) { 84 | sum(m == 0.0) 85 | }) 86 | total_zero <- sum(as.numeric(zero_counts)) 87 | 88 | #sum(joint_conditional[joint_conditional == Inf, ]) 89 | Log('total inf: %s', total_inf) 90 | Log('total nan: %s', total_nan) 91 | Log('total zero: %s', total_zero) 92 | } 93 | 94 | ConstructFastEM <- function(em_executable, tmp_dir) { 95 | 96 | return(function(joint_conditional, max_em_iters = 1000, 97 | epsilon = 10 ^ -6, verbose = FALSE, 98 | estimate_var = FALSE) { 99 | matrix_dims <- dim(joint_conditional[[1]]) 100 | # Check that number of dimensions is 2. 
101 | if (length(matrix_dims) != 2) { 102 | Log('FATAL: Expected 2 dimensions, got %d', length(matrix_dims)) 103 | stop() 104 | } 105 | 106 | entry_size <- prod(matrix_dims) 107 | Log('entry size: %d', entry_size) 108 | 109 | .SanityChecks(joint_conditional) 110 | 111 | input_path <- file.path(tmp_dir, 'list_of_matrices.bin') 112 | Log("Writing flattened list of matrices to %s", input_path) 113 | f <- file(input_path, 'wb') # binary file 114 | .WriteListOfMatrices(joint_conditional, f) 115 | close(f) 116 | Log("Done writing %s", input_path) 117 | 118 | output_path <- file.path(tmp_dir, 'pij.bin') 119 | 120 | cmd <- sprintf("%s %s %s %s", em_executable, input_path, output_path, 121 | max_em_iters) 122 | 123 | Log("Shell command: %s", cmd) 124 | exit_code <- system(cmd) 125 | 126 | Log("Done running shell command") 127 | if (exit_code != 0) { 128 | stop(sprintf("Command failed with code %d", exit_code)) 129 | } 130 | 131 | f <- file(output_path, 'rb') 132 | result <- .ReadResult(f, entry_size, matrix_dims) 133 | close(f) 134 | 135 | result 136 | }) 137 | } 138 | -------------------------------------------------------------------------------- /pipeline/dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./dist.sh 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | readonly THIS_DIR=$(dirname $0) 11 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 12 | 13 | source $RAPPOR_SRC/util.sh # log, banner 14 | source $RAPPOR_SRC/pipeline/tools-lib.sh 15 | source $RAPPOR_SRC/pipeline/alarm-lib.sh 16 | 17 | readonly DECODE_DIST=${DEP_DECODE_DIST:-$RAPPOR_SRC/bin/decode-dist} 18 | 19 | readonly NUM_ARGS=7 # used for xargs 20 | 21 | decode-dist-one() { 22 | # Job constants 23 | local rappor_src=$1 24 | local timeout_secs=$2 25 | local min_reports=$3 26 | shift 3 # job constants do not vary per task and are not part of the spec 27 | 28 | # 7 spec variables 29 | local num_reports=$1 # unused, only for filtering 30 | local metric_name=$2 31 | local date=$3 32 | local counts=$4 33 | local params=$5 34 | local map=$6 35 | local results_dir=$7 36 | 37 | local task_dir=$results_dir/$metric_name/$date 38 | mkdir --verbose -p $task_dir 39 | 40 | local log_file=$task_dir/log.txt 41 | local status_file=$task_dir/STATUS.txt 42 | 43 | # Record the spec so we know params, counts, etc. 44 | echo "$@" > $task_dir/spec.txt 45 | 46 | if test $num_reports -lt $min_reports; then 47 | local msg="SKIPPED because $num_reports reports is less than $min_reports" 48 | # Duplicate this message 49 | echo "$msg" > $status_file 50 | echo "$msg" > $log_file 51 | return 52 | fi 53 | 54 | # Run it with a timeout, and record status in the task dir. 55 | { time \ 56 | alarm-status $status_file $timeout_secs \ 57 | $DECODE_DIST \ 58 | --counts $counts \ 59 | --params $params \ 60 | --map $map \ 61 | --output-dir $task_dir \ 62 | --adjust-counts-hack 63 | } >$log_file 2>&1 64 | 65 | # TODO: Don't pass --adjust-counts-hack unless the user asks for it. 66 | } 67 | 68 | # Print the number of processes to use. 69 | # NOTE: This is copied from google/rappor regtest.sh. 70 | # It also doesn't take into account the fact that we are memory-bound. 71 | # 72 | # 128 GiB / 4GiB would also imply about 32 processes though. 
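# num-processes is only a CPU-count heuristic (cores minus one); callers of
# decode-dist-many can still cap parallelism explicitly via its max_procs
# argument below.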
73 | num-processes() { 74 | local processors=$(grep -c ^processor /proc/cpuinfo || echo 4) 75 | if test $processors -gt 1; then # leave one CPU for the OS 76 | processors=$(expr $processors - 1) 77 | fi 78 | echo $processors 79 | } 80 | 81 | #readonly DEFAULT_MAX_PROCS=6 # for andychu2.hot, to avoid locking up UI 82 | #readonly DEFAULT_MAX_PROCS=16 # for rappor-ac.hot, to avoid thrashing 83 | readonly DEFAULT_MAX_PROCS=$(num-processes) 84 | 85 | #readonly DEFAULT_MAX_TASKS=12 86 | readonly DEFAULT_MAX_TASKS=10000 # more than the max 87 | 88 | # NOTE: Since we have 125 GB RAM, and processes can take up to 12 gigs of RAM, 89 | # only use parallelism of 10, even though we have 31 cores. 90 | 91 | readonly DEFAULT_MIN_REPORTS=5000 92 | 93 | 94 | decode-dist-many() { 95 | local job_dir=$1 96 | local spec_list=$2 97 | local timeout_secs=${3:-1200} # default timeout 98 | local max_procs=${4:-$DEFAULT_MAX_PROCS} 99 | local rappor_src=${5:-$RAPPOR_SRC} 100 | local min_reports=${6:-$DEFAULT_MIN_REPORTS} 101 | 102 | local interval_secs=5 103 | local pid_dir="$job_dir/pids" 104 | local sys_mem="$job_dir/system-mem.csv" 105 | mkdir --verbose -p $pid_dir 106 | 107 | time cat $spec_list \ 108 | | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \ 109 | $0 decode-dist-one $rappor_src $timeout_secs $min_reports 110 | } 111 | 112 | # Combine/summarize results and task metadata from the parallel decode-dist 113 | # processes. Render them as HTML. 114 | combine-and-render-html() { 115 | local jobs_base_dir=$1 116 | local job_dir=$2 117 | 118 | banner "Combining dist task status" 119 | TOOLS-cook combine-dist-task-status $jobs_base_dir $job_dir 120 | 121 | banner "Combining dist results" 122 | TOOLS-cook combine-dist-results $jobs_base_dir $job_dir 123 | 124 | banner "Splitting out status per metric, and writing overview" 125 | TOOLS-cook dist-metric-status $job_dir 126 | 127 | # The task-status.csv file should have a JOB ID. 128 | banner "Building overview.html and per-metric HTML" 129 | TOOLS-gen-ui build-html1 $job_dir 130 | 131 | banner "Building individual results.html (for ONE day)" 132 | TOOLS-gen-ui results-html $job_dir 133 | } 134 | 135 | "$@" 136 | -------------------------------------------------------------------------------- /client/cpp/Makefile: -------------------------------------------------------------------------------- 1 | # Build RAPPOR C++ code. 2 | 3 | default : \ 4 | _tmp/rappor_sim \ 5 | _tmp/encoder_demo \ 6 | _tmp/protobuf_encoder_demo \ 7 | _tmp/openssl_hash_impl_test 8 | 9 | # All intermediate files live in _tmp/ 10 | clean : 11 | rm -f --verbose _tmp/* 12 | 13 | # Use protobuf compiler to generate .cc and .h files. The .o and the .d depend 14 | # on .cc, so that is the target of this rule. 15 | 16 | _tmp/%.pb.cc : ../proto/%.proto 17 | protoc --cpp_out _tmp --proto_path=../proto $< 18 | 19 | # 20 | # Generate .d Makefile fragments. 21 | # 22 | 23 | # CXX flags: 24 | # -MM: exclude system headers 25 | # -I _tmp: so that generated protobuf headers are found 26 | # 27 | # Makefile stuff: 28 | # $*: the part that matched the wildcard, e.g. 'rappor_sim' for '%.cc' 29 | # matching 'rappor_sim.cc' 30 | # 31 | # We use $< (first prereq) to generate .d and .o files from .cc, because 32 | # it only needs the .cc file. We use $^ (all prereqs) to pass ALL the .o 33 | # files to the link step. 34 | 35 | _tmp/%.d : %.cc 36 | ./dotd.sh $* $@ \ 37 | $(CXX) -I _tmp/ -MM $(CPPFLAGS) $< 38 | 39 | # Special case for .d file of generated source.
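# (protoc, via the _tmp/%.pb.cc rule above, writes the generated .cc into
# _tmp/, so its .d must be derived from _tmp/%.pb.cc rather than from a
# source-tree .cc file.)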
40 | _tmp/%.pb.d : _tmp/%.pb.cc 41 | ./dotd.sh $*.pb $@ \ 42 | $(CXX) -I _tmp/ -MM $(CPPFLAGS) $< 43 | 44 | # 45 | # Include the Makefile fragments we generated, so that changes to headers will 46 | # rebuild both .d files and .o files. ('-include' suppresses the error if they 47 | # don't exist.) 48 | # 49 | # NOTE: We have to list them explicitly. Every time you add a source file, add 50 | # the corresponding .d file here. 51 | # 52 | 53 | -include \ 54 | _tmp/encoder.d \ 55 | _tmp/libc_rand_impl.d \ 56 | _tmp/openssl_hash_impl.d \ 57 | _tmp/openssl_hash_impl_test.d \ 58 | _tmp/protobuf_encoder.d \ 59 | _tmp/protobuf_encoder_demo.d \ 60 | _tmp/rappor_sim.d \ 61 | _tmp/unix_kernel_rand_impl.d \ 62 | _tmp/rappor.pb.d \ 63 | _tmp/example_app.pb.d 64 | 65 | # For example, -Wextra warns about unused params, but -Wall doesn't. 66 | CXXFLAGS = -Wall -Wextra #-Wpedantic 67 | 68 | # 69 | # Build object files (-c: compile only) 70 | # 71 | 72 | # NOTE: More prerequisites to _tmp/%.o (header files) are added by the .d 73 | # files, so we need $<. 74 | _tmp/%.o : %.cc 75 | $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $< 76 | 77 | _tmp/%.pb.o : _tmp/%.pb.cc 78 | $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $< 79 | 80 | # 81 | # Build executables 82 | # 83 | 84 | # CXX flag notes: 85 | # -lcrypto from openssl 86 | # -g for debug info 87 | # 88 | # You can add -std=c++0x for std::array, etc. 89 | 90 | # $^ : all prerequisites 91 | _tmp/rappor_sim : \ 92 | _tmp/encoder.o \ 93 | _tmp/libc_rand_impl.o \ 94 | _tmp/unix_kernel_rand_impl.o \ 95 | _tmp/openssl_hash_impl.o \ 96 | _tmp/rappor_sim.o 97 | $(CXX) \ 98 | $(CXXFLAGS) \ 99 | -o $@ \ 100 | $^ \ 101 | -lcrypto \ 102 | -g 103 | 104 | # $^ : all prerequisites 105 | _tmp/encoder_demo: \ 106 | _tmp/encoder.o \ 107 | _tmp/unix_kernel_rand_impl.o \ 108 | _tmp/openssl_hash_impl.o \ 109 | _tmp/encoder_demo.o 110 | $(CXX) \ 111 | $(CXXFLAGS) \ 112 | -o $@ \ 113 | $^ \ 114 | -lcrypto \ 115 | -g 116 | 117 | # -I _tmp for protobuf headers 118 | _tmp/protobuf_encoder_demo : \ 119 | _tmp/encoder.o \ 120 | _tmp/libc_rand_impl.o \ 121 | _tmp/unix_kernel_rand_impl.o \ 122 | _tmp/openssl_hash_impl.o \ 123 | _tmp/protobuf_encoder.o \ 124 | _tmp/protobuf_encoder_demo.o \ 125 | _tmp/example_app.pb.o \ 126 | _tmp/rappor.pb.o 127 | $(CXX) \ 128 | $(CXXFLAGS) \ 129 | -I _tmp \ 130 | -o $@ \ 131 | $^ \ 132 | -lprotobuf \ 133 | -lcrypto \ 134 | -g 135 | 136 | _tmp/openssl_hash_impl_test : \ 137 | _tmp/openssl_hash_impl.o \ 138 | _tmp/openssl_hash_impl_test.o 139 | $(CXX) \ 140 | $(CXXFLAGS) \ 141 | -o $@ \ 142 | $^ \ 143 | -lcrypto \ 144 | -g 145 | 146 | # Unittests are currently run manually, and require the Google gtest 147 | # framework version 1.7.0 or greater, found at 148 | # https://github.com/google/googletest/releases 149 | # TODO(mdeshon-google): Installer script 150 | unittest: _tmp/openssl_hash_impl_unittest _tmp/encoder_unittest 151 | _tmp/openssl_hash_impl_unittest 152 | _tmp/encoder_unittest 153 | 154 | _tmp/openssl_hash_impl_unittest: openssl_hash_impl_unittest.cc openssl_hash_impl.cc 155 | $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest 156 | 157 | _tmp/encoder_unittest: encoder_unittest.cc encoder.cc unix_kernel_rand_impl.cc openssl_hash_impl.cc 158 | $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest 159 | -------------------------------------------------------------------------------- /pipeline/regtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # End-to-end tests for the dashboard. 
4 | # 5 | # Usage: 6 | # ./regtest.sh 7 | # 8 | # NOTE: Must be run in this directory (rappor/pipeline). 9 | 10 | set -o nounset 11 | set -o pipefail 12 | set -o errexit 13 | 14 | # Create schema and params. 15 | create-metadata() { 16 | mkdir -p _tmp/metadata 17 | echo 'Hello from regtest.sh' 18 | 19 | local params_path=_tmp/metadata/regtest_params.csv 20 | 21 | # Relying on $RAPPOR_SRC/regtest.sh 22 | cp --verbose ../_tmp/python/demo1/case_params.csv $params_path 23 | 24 | # For now, use the same map everywhere. 25 | cat >_tmp/metadata/dist-analysis.csv <<EOF cat >_tmp/metadata/rappor-vars.csv <<EOF 109 | which pep8 >/dev/null || die "pep8 not installed ('sudo apt-get install pep8' on Ubuntu)" 110 | 111 | # - Skip _tmp dir, because we are downloading cpplint.py there, and it has 112 | # pep8 lint errors 113 | # - Exclude setup.py, because it's a config file and uses "invalid" 'name = 114 | # 1' style (spaces around =). 115 | find $REPO_ROOT \ 116 | \( -name _tmp -a -prune \) -o \ 117 | \( -name \*.py -a -print \) \ 118 | | grep -v /setup.py \ 119 | | xargs --verbose -- $0 python-lint 120 | } 121 | 122 | r-unit() { 123 | set -o xtrace # show tests we're running 124 | 125 | # This one needs to be run from the root dir 126 | tests/compare_dist_test.R 127 | 128 | tests/gen_counts_test.R 129 | 130 | tests/gen_true_values_test.R 131 | 132 | analysis/R/decode_test.R 133 | 134 | analysis/test/run_tests.R 135 | } 136 | 137 | doc-lint() { 138 | which tidy >/dev/null || die "tidy not found" 139 | for doc in _tmp/report.html _tmp/doc/*.html; do 140 | echo $doc 141 | # -e: show only errors and warnings 142 | # -q: quiet 143 | tidy -e -q $doc || true 144 | done 145 | } 146 | 147 | # This isn't a strict check, but can help. 148 | # TODO: Add words to whitelist. 149 | spell-all() { 150 | which spell >/dev/null || die "spell not found" 151 | spell README.md doc/*.md | sort | uniq 152 | } 153 | 154 | # 155 | # Smoke Tests. These can be manually run. 156 | # 157 | 158 | gen-true-values() { 159 | local num_unique_values=10 160 | local num_clients=10 161 | local values_per_client=2 162 | local num_cohorts=4 163 | local out=_tmp/reports.csv 164 | 165 | tests/gen_true_values.R \ 166 | exp $num_unique_values $num_clients $values_per_client $num_cohorts $out 167 | wc -l $out 168 | cat $out 169 | } 170 | 171 | if test $# -eq 0 ; then 172 | all 173 | else 174 | "$@" 175 | fi 176 | -------------------------------------------------------------------------------- /pipeline/cook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Take the raw data from the analysis and massage it into various formats 4 | # suitable for display. 5 | # 6 | # Usage: 7 | # ./cook.sh <function name> 8 | 9 | set -o nounset 10 | set -o pipefail 11 | set -o errexit 12 | 13 | readonly THIS_DIR=$(dirname $0) 14 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 15 | 16 | source $RAPPOR_SRC/pipeline/tools-lib.sh 17 | 18 | 19 | status-files() { 20 | local dir=$1 21 | find $dir -name STATUS.txt 22 | } 23 | 24 | results-files() { 25 | local dir=$1 26 | find $dir -name results.csv 27 | } 28 | 29 | count-results() { 30 | # first field of each line is one of {OK, TIMEOUT, FAIL, SKIPPED} 31 | status-files "$@" \ 32 | | xargs cat \ 33 | | cut -d ' ' -f 1 \ 34 | | sort | uniq -c | sort -n -r 35 | } 36 | 37 | # 38 | # For dist cron job 39 | # 40 | 41 | # Combine status of tasks over multiple jobs. Each row is a task (decode-dist 42 | # invocation). This has the number of reports.
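# (The STATUS.txt files gathered here are written per task by the
# alarm-status wrapper in dist.sh.)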
43 | combine-dist-task-status() { 44 | local base_dir=${1:-~/rappor/cron} 45 | local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01} 46 | 47 | local out=$job_dir/task-status.csv 48 | 49 | # Ignore memory for now. 50 | time status-files $base_dir | TOOLS-combine-status dist > $out 51 | echo "Wrote $out" 52 | } 53 | 54 | # Create a single dist.csv time series for a GIVEN metric. 55 | combine-dist-results-one() { 56 | local base_dir=$1 57 | local job_dir=$2 58 | local metric_name=$3 59 | #echo FOO $base_dir $metric_name 60 | 61 | local out_dir=$job_dir/cooked/$metric_name 62 | mkdir -p $out_dir 63 | 64 | # Glob to capture this specific metric name over ALL job IDs. 65 | find $base_dir/*/raw/$metric_name -name STATUS.txt \ 66 | | TOOLS-combine-results dist 5 \ 67 | > $out_dir/dist.csv 68 | } 69 | 70 | # Creates a dist.csv file for EACH metric. TODO: Rename one/many 71 | combine-dist-results() { 72 | local base_dir=${1:-~/rappor/cron} 73 | local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01} 74 | 75 | # Direct subdirs of 'raw' are metrics. Just print filename. 76 | find $base_dir/*/raw -mindepth 1 -maxdepth 1 -type d -a -printf '%f\n' \ 77 | | sort | uniq \ 78 | | xargs --verbose -n1 -- \ 79 | $0 combine-dist-results-one $base_dir $job_dir 80 | } 81 | 82 | # Take the task-status.csv file, which has row key (metric, date). Writes 83 | # num_reports.csv and status.csv per metric, and a single overview.csv for all 84 | # metrics. 85 | dist-metric-status() { 86 | local job_dir=${1:-_tmp/results-10} 87 | local out_dir=$job_dir/cooked 88 | 89 | TOOLS-metric-status dist $job_dir/task-status.csv $out_dir 90 | } 91 | 92 | # 93 | # For association analysis cron job 94 | # 95 | 96 | combine-assoc-task-status() { 97 | local base_dir=${1:-~/rappor/chrome-assoc-smoke} 98 | local job_dir=${2:-$base_dir/smoke1} 99 | 100 | local out=$job_dir/assoc-task-status.csv 101 | 102 | time find $base_dir -name assoc-status.txt \ 103 | | TOOLS-combine-status assoc \ 104 | > $out 105 | 106 | echo "Wrote $out" 107 | } 108 | 109 | # Create a single assoc.csv time series for a GIVEN (var1, var2) pair. 110 | combine-assoc-results-one() { 111 | local base_dir=$1 112 | local job_dir=$2 113 | local metric_pair_rel_path=$3 114 | 115 | local out_dir=$job_dir/cooked/$metric_pair_rel_path 116 | mkdir -p $out_dir 117 | 118 | # Glob to capture this specific metric name over ALL job IDs. 119 | find $base_dir/*/raw/$metric_pair_rel_path -name assoc-status.txt \ 120 | | TOOLS-combine-results assoc 5 \ 121 | > $out_dir/assoc-results-series.csv 122 | } 123 | 124 | # Creates an assoc-results-series.csv file for EACH metric pair. TODO: Rename one/many 125 | combine-assoc-results() { 126 | local base_dir=${1:-~/rappor/chrome-assoc-smoke} 127 | local job_dir=${2:-$base_dir/smoke3} 128 | 129 | # Direct subdirs of 'raw' are metrics, and subdirs of that are variable 130 | # pairs. Print "$metric_name/$pair_name". 131 | find $base_dir/*/raw -mindepth 2 -maxdepth 2 -type d -a -printf '%P\n' \ 132 | | sort | uniq \ 133 | | xargs --verbose -n1 -- \ 134 | $0 combine-assoc-results-one $base_dir $job_dir 135 | } 136 | 137 | # Take the assoc-task-status.csv file, which has row key (metric, date). Writes 138 | # num_reports.csv and status.csv per metric, and a single overview.csv for all 139 | # metrics.
140 | assoc-metric-status() { 141 | local job_dir=${1:-~/rappor/chrome-assoc-smoke/smoke3} 142 | local out_dir=$job_dir/cooked 143 | 144 | TOOLS-metric-status assoc $job_dir/assoc-task-status.csv $out_dir 145 | } 146 | 147 | "$@" 148 | -------------------------------------------------------------------------------- /bin/decode_dist.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Command line tool to decode a RAPPOR data set. It is a simple wrapper for 4 | # Decode() in decode.R. 5 | 6 | library(optparse) 7 | 8 | # 9 | # Command line parsing. Do this first before loading libraries to catch errors 10 | # quickly. Loading libraries in R is slow. 11 | # 12 | 13 | # For command line error checking. 14 | UsageError <- function(...) { 15 | cat(sprintf(...)) 16 | cat('\n') 17 | quit(status = 1) 18 | } 19 | 20 | option_list <- list( 21 | # Inputs 22 | make_option("--map", default="", help="Map file (required)"), 23 | make_option("--counts", default="", help="Counts file (required)"), 24 | make_option("--params", default="", help="Params file (required)"), 25 | make_option("--output-dir", dest="output_dir", default=".", 26 | help="Output directory (default .)"), 27 | 28 | make_option("--correction", default="FDR", help="Correction method"), 29 | make_option("--alpha", default=.05, help="Alpha level"), 30 | 31 | make_option("--adjust-counts-hack", dest="adjust_counts_hack", 32 | default=FALSE, action="store_true", 33 | help="Allow the counts file to have more rows than cohorts. 34 | Most users should not use this.") 35 | ) 36 | 37 | ParseOptions <- function() { 38 | # NOTE: This API is bad; if you add positional_arguments, the return value 39 | # changes! 40 | parser <- OptionParser(option_list = option_list) 41 | opts <- parse_args(parser) 42 | 43 | if (opts$map == "") { 44 | UsageError("--map is required.") 45 | } 46 | if (opts$counts == "") { 47 | UsageError("--counts is required.") 48 | } 49 | if (opts$params == "") { 50 | UsageError("--params is required.") 51 | } 52 | return(opts) 53 | } 54 | 55 | if (!interactive()) { 56 | opts <- ParseOptions() 57 | } 58 | 59 | # 60 | # Load libraries and source our own code. 61 | # 62 | 63 | library(RJSONIO) 64 | 65 | # So we don't have to change pwd 66 | source.rappor <- function(rel_path) { 67 | abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 68 | source(abs_path) 69 | } 70 | 71 | source.rappor("analysis/R/read_input.R") 72 | source.rappor("analysis/R/decode.R") 73 | source.rappor("analysis/R/util.R") 74 | 75 | source.rappor("analysis/R/alternative.R") 76 | 77 | options(stringsAsFactors = FALSE) 78 | 79 | 80 | main <- function(opts) { 81 | Log("decode-dist") 82 | Log("argv:") 83 | print(commandArgs(TRUE)) 84 | 85 | Log("Loading inputs") 86 | 87 | # Run a single model if all inputs are specified. 88 | params <- ReadParameterFile(opts$params) 89 | counts <- ReadCountsFile(opts$counts, params, adjust_counts = opts$adjust_counts_hack) 90 | counts <- AdjustCounts(counts, params) 91 | 92 | 93 | # The left-most column has totals. 94 | num_reports <- sum(counts[, 1]) 95 | 96 | map <- LoadMapFile(opts$map, params) 97 | 98 | Log("Decoding %d reports", num_reports) 99 | res <- Decode(counts, map$map, params, correction = opts$correction, 100 | alpha = opts$alpha) 101 | Log("Done decoding") 102 | 103 | if (nrow(res$fit) == 0) { 104 | Log("FATAL: Analysis returned no strings.") 105 | quit(status = 1) 106 | } 107 | 108 | # Write analysis results as CSV.
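  # (res$fit is the per-candidate estimate table produced by Decode() above.)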
109 | results_csv_path <- file.path(opts$output_dir, 'results.csv') 110 | write.csv(res$fit, file = results_csv_path, row.names = FALSE) 111 | 112 | # Write residual histogram as a png. 113 | results_png_path <- file.path(opts$output_dir, 'residual.png') 114 | png(results_png_path) 115 | breaks <- pretty(res$residual, n = 200) 116 | histogram <- hist(res$residual, breaks, plot = FALSE) 117 | histogram$counts <- histogram$counts / sum(histogram$counts) # convert the histogram to frequencies 118 | plot(histogram, main = "Histogram of the residual", 119 | xlab = sprintf("Residual (observed - explained, %d x %d values)", params$m, params$k)) 120 | dev.off() 121 | 122 | res$metrics$total_elapsed_time <- proc.time()[['elapsed']] 123 | 124 | # Write summary as JSON (scalar values). 125 | metrics_json_path <- file.path(opts$output_dir, 'metrics.json') 126 | m <- toJSON(res$metrics) 127 | writeLines(m, con = metrics_json_path) 128 | Log("Wrote %s, %s, and %s", results_csv_path, results_png_path, metrics_json_path) 129 | 130 | # TODO: 131 | # - These are in a 2-column 'parameters' and 'values' format. Should these 132 | # just be a plain list? 133 | # - Should any of these privacy params be in metrics.json? 134 | 135 | Log("Privacy summary:") 136 | print(res$privacy) 137 | cat("\n") 138 | 139 | Log('DONE') 140 | } 141 | 142 | if (!interactive()) { 143 | main(opts) 144 | } 145 | -------------------------------------------------------------------------------- /client/cpp/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR C++ Client 2 | ================= 3 | 4 | We provide both a low-level and a high-level client API. The low-level API 5 | implements just the RAPPOR encoding algorithm on strings, with few 6 | dependencies. 7 | 8 | The high-level API provides wrappers that bundle encoded values into Protocol 9 | Buffer messages. 10 | 11 | Build Instructions 12 | ------------------ 13 | 14 | You'll need a C++ compiler, the protobuf compiler, and a library that 15 | implements common hash functions (e.g. OpenSSL). 16 | 17 | On Ubuntu or Debian, the protobuf compiler and header files can be installed 18 | with: 19 | 20 | sudo apt-get install protobuf-compiler libprotobuf-dev 21 | 22 | OpenSSL can be installed with: 23 | 24 | sudo apt-get install libssl-dev 25 | 26 | Test 27 | ---- 28 | 29 | After installing dependencies, you can test it out easily on your machine: 30 | 31 | ./demo.sh quick-cpp 32 | 33 | This builds the test harness using a Makefile, and then runs the regtest.sh 34 | simulation. The last few lines of output will look like this: 35 | 36 | Done running all test instances 37 | Instances succeeded: 1 failed: 0 running: 0 total: 1 38 | Wrote _tmp/cpp/results.html 39 | URL: file:///usr/local/google/home/andychu/git/rappor/_tmp/cpp/results.html 40 | 41 | Open the HTML file to see a plot and stats. 42 | 43 | 44 | Encoder 45 | ------- 46 | 47 | The low-level API is `Encoder`. You instantiate it with RAPPOR encoding 48 | parameters and application dependencies. It has a method `EncodeString()` that 49 | takes an input string (no other types), sets an output parameter of type 50 | `rappor::Bits`, and returns success or failure.
51 | 52 | ```cpp 53 | #include <cassert> 54 | #include <cstdio> 55 | #include "encoder.h" 56 | #include "openssl_hash_impl.h" 57 | #include "unix_kernel_rand_impl.h" 58 | 59 | int main(int argc, char** argv) { 60 | FILE* fp = fopen("/dev/urandom", "r"); 61 | rappor::UnixKernelRand irr_rand(fp); 62 | 63 | rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256, 64 | irr_rand); 65 | rappor::Params params(32, // num_bits (k) 66 | 2, // num_hashes (h) 67 | 128, // num_cohorts (m) 68 | 0.25, // probability f for PRR 69 | 0.75, // probability p for IRR 70 | 0.5); // probability q for IRR 71 | 72 | const char* encoder_id = "metric-name"; 73 | rappor::Encoder encoder(encoder_id, params, deps); 74 | 75 | // Now use it to encode values. The 'out' value can be sent over the 76 | // network. 77 | rappor::Bits out; 78 | assert(encoder.EncodeString("foo", &out)); // returns false on error 79 | printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort()); 80 | 81 | // Raw bits 82 | assert(encoder.EncodeBits(0x123, &out)); // returns false on error 83 | printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort()); 84 | } 85 | ``` 86 | 87 | Dependencies 88 | ------------ 89 | 90 | `rappor::Deps` is a struct-like object that holds the dependencies needed by 91 | the API. 92 | 93 | The application must provide the following values: 94 | 95 | - cohort: An integer between 0 and `num_cohorts - 1`. Each value is assigned 96 | with equal probability to a client process. 97 | - client_secret: A persistent client secret (used for deterministic randomness 98 | in the PRR, i.e. "memoization" requirement). 99 | - hash_func - string hash function implementation (e.g. MD5) 100 | - hmac_func - HMAC-SHA256 implementation 101 | - irr_rand - randomness for the IRR 102 | 103 | We provide implementations of `hash_func` and `hmac_func` using OpenSSL. 104 | If your application already has a different implementation of these functions, 105 | you can implement the `HashFunc` and `HmacFunc` interfaces. 106 | 107 | We provide two example implementations of `irr_rand`: one based on libc 108 | `rand()` (insecure, for demo only), and one based on Unix `/dev/urandom`. 109 | 110 | Error Handling 111 | -------------- 112 | 113 | Note that incorrect usage of the `SimpleEncoder` and `Protobuf` constructors 114 | may cause *runtime assertions* (using `assert()`). For example, if 115 | Params.num\_bits is more than 32, the process will crash. 116 | 117 | Encoders should be initialized at application startup, with constant 118 | parameters, so this type of error should be seen early. 119 | 120 | The various `Encode()` members do *not* raise assertions. If those are used 121 | incorrectly, then the return value will be `false` to indicate an error. These 122 | failures should be handled by the application. 123 | 124 | Memory Management 125 | ----------------- 126 | 127 | The `Encoder` instances contain pointers to `Params` and `Deps` instances, but 128 | don't own them. In the examples, all instances live on the stack of `main()`, so 129 | you don't have to worry about them being destroyed. 130 | -------------------------------------------------------------------------------- /analysis/R/encode.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | Encode <- function(value, map, strs, params, N, id = NULL, 16 | cohort = NULL, B = NULL, BP = NULL) { 17 | # Encode value to RAPPOR and return a report. 18 | # 19 | # Input: 20 | # value: value to be encoded 21 | # map: a mapping matrix describing where each element of strs maps in 22 | # each cohort 23 | # strs: a vector of possible values with value being one of them 24 | # params: a list of RAPPOR parameters described in decode.R 25 | # N: sample size 26 | # Optional parameters: 27 | # id: user ID (smaller than N) 28 | # cohort: specifies cohort number (smaller than m) 29 | # B: input Bloom filter itself, in which case value is ignored 30 | # BP: input Permanent Randomized Response (memoized for multiple collections 31 | # from the same user) 32 | 33 | k <- params$k 34 | p <- params$p 35 | q <- params$q 36 | f <- params$f 37 | h <- params$h 38 | m <- params$m 39 | if (is.null(cohort)) { 40 | cohort <- sample(1:m, 1) 41 | } 42 | 43 | if (is.null(id)) { 44 | id <- sample(N, 1) 45 | } 46 | 47 | ind <- which(value == strs) 48 | 49 | if (is.null(B)) { 50 | B <- as.numeric(map[[cohort]][, ind]) 51 | } 52 | 53 | if (is.null(BP)) { 54 | BP <- sapply(B, function(x) sample(c(0, 1, x), 1, 55 | prob = c(0.5 * f, 0.5 * f, 1 - f))) 56 | } 57 | rappor <- sapply(BP, function(x) rbinom(1, 1, ifelse(x == 1, q, p))) 58 | 59 | list(value = value, rappor = rappor, B = B, BP = BP, cohort = cohort, id = id) 60 | } 61 | 62 | ExamplePlot <- function(res, k, ebs = 1, title = "", title_cex = 4, 63 | voff = .17, acex = 1.5, posa = 2, ymin = 1, 64 | horiz = FALSE) { 65 | PC <- function(k, report) { 66 | char <- as.character(report) 67 | if (k > 128) { 68 | char[char != ""] <- "|" 69 | } 70 | char 71 | } 72 | 73 | # Annotation settings 74 | anc <- "darkorange2" 75 | colors <- c("lavenderblush3", "maroon4") 76 | 77 | par(omi = c(0, .55, 0, 0)) 78 | # Set up plotting. 79 | plot(1:k, rep(1, k), ylim = c(ymin, 4), type = "n", 80 | xlab = "Bloom filter bits", 81 | yaxt = "n", ylab = "", xlim = c(0, k), bty = "n", xaxt = "n") 82 | mtext(paste0("Participant ", res$id, " in cohort ", res$cohort), 3, 2, 83 | adj = 1, col = anc, cex = acex) 84 | axis(1, 2^(0:15), 2^(0:15)) 85 | abline(v = which(res$B == 1), lty = 2, col = "grey") 86 | 87 | # First row with the true value. 88 | text(k / 2, 4, paste0('"', paste0(title, as.character(res$value)), '"'), 89 | cex = title_cex, col = colors[2], xpd = NA) 90 | 91 | # Second row with BF: B. 92 | points(1:k, rep(3, k), pch = PC(k, res$B), col = colors[res$B + 1], 93 | cex = res$B + 1) 94 | text(k, 3 + voff, paste0(sum(res$B), " signal bits"), cex = acex, 95 | col = anc, pos = posa) 96 | 97 | # Third row: B'. 98 | points(1:k, rep(2, k), pch = PC(k, res$BP), col = colors[res$BP + 1], 99 | cex = res$BP + 1) 100 | text(k, 2 + voff, paste0(sum(res$BP), " bits on"), 101 | cex = acex, col = anc, pos = posa) 102 | 103 | # Row 4: actual RAPPOR report.
104 | report <- res$rappor 105 | points(1:k, rep(1, k), pch = PC(k, as.character(report)), 106 | col = colors[report + 1], cex = report + 1) 107 | text(k, 1 + voff, paste0(sum(res$rappor), " bits on"), cex = acex, 108 | col = anc, pos = posa) 109 | 110 | mtext(c("True value:", "Bloom filter (B):", 111 | "Fake Bloom \n filter (B'):", "Report sent\n to server:"), 112 | 2, 1, at = 4:1, las = 2) 113 | legend("topright", legend = c("0", "1"), fill = colors, bty = "n", 114 | cex = 1.5, horiz = horiz) 115 | legend("topleft", legend = ebs, plot = FALSE) 116 | } 117 | 118 | PlotPopulation <- function(probs, detected, detection_frequency) { 119 | cc <- c("gray80", "darkred") 120 | color <- rep(cc[1], length(probs)) 121 | color[detected] <- cc[2] 122 | bp <- barplot(probs, col = color, border = color) 123 | inds <- c(1, c(max(which(probs > 0)), length(probs))) 124 | axis(1, bp[inds], inds) 125 | legend("topright", legend = c("Detected", "Not-detected"), 126 | fill = rev(cc), bty = "n") 127 | abline(h = detection_frequency, lty = 2, col = "grey") 128 | } 129 | -------------------------------------------------------------------------------- /client/cpp/encoder.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // RAPPOR encoder. 16 | // 17 | // See README.md and encoder_demo.cc for an example. 18 | 19 | #ifndef RAPPOR_H_ 20 | #define RAPPOR_H_ 21 | 22 | #include <string> 23 | #include <vector> 24 | #include "rappor_deps.h" // for dependency injection 25 | 26 | namespace rappor { 27 | 28 | // For debug logging 29 | void log(const char* fmt, ...); 30 | 31 | // RAPPOR encoding parameters. 32 | class Params { 33 | public: 34 | Params(int num_bits, int num_hashes, int num_cohorts, 35 | float prob_f, float prob_p, float prob_q) 36 | : num_bits_(num_bits), 37 | num_hashes_(num_hashes), 38 | num_cohorts_(num_cohorts), 39 | prob_f_(prob_f), 40 | prob_p_(prob_p), 41 | prob_q_(prob_q) { 42 | } 43 | 44 | // Accessors 45 | int num_bits() { return num_bits_; } 46 | int num_hashes() { return num_hashes_; } 47 | int num_cohorts() { return num_cohorts_; } 48 | float prob_f() { return prob_f_; } 49 | float prob_p() { return prob_p_; } 50 | float prob_q() { return prob_q_; } 51 | 52 | private: 53 | friend class Encoder; 54 | 55 | // k: size of bloom filter, PRR, and IRR. 0 < k <= 32. 56 | int num_bits_; 57 | 58 | // number of bits set in the Bloom filter ("h") 59 | int num_hashes_; 60 | 61 | // Total number of cohorts ("m"). Note that the cohort assignment is what 62 | // is used in the client, not m. We include it here for documentation (it 63 | // can be unset, unlike the other params).
64 | int num_cohorts_; 65 | 66 | float prob_f_; // noise probability for PRR, quantized to 1/128 67 | 68 | float prob_p_; // noise probability for IRR, quantized to 1/128 69 | float prob_q_; // noise probability for IRR, quantized to 1/128 70 | }; 71 | 72 | // Encoder: take client values and transform them with the RAPPOR privacy 73 | // algorithm. 74 | class Encoder { 75 | public: 76 | // Note that invalid parameters cause runtime assertions in the constructor. 77 | // Encoders are intended to be created at application startup with constant 78 | // arguments, so errors should be caught early. 79 | 80 | // encoder_id: A unique ID for this encoder -- typically the name of the 81 | // metric being encoded, so that different metrics have different PRR 82 | // mappings. 83 | // params: RAPPOR encoding parameters, which affect privacy and decoding. 84 | // (held by reference; it must outlive the Encoder) 85 | // deps: application-supplied dependencies. 86 | // (held by reference; it must outlive the Encoder) 87 | Encoder(const std::string& encoder_id, const Params& params, 88 | const Deps& deps); 89 | 90 | // Encode raw bits (represented as an integer), setting output parameter 91 | // irr_out. Only valid when the return value is 'true' (success). 92 | bool EncodeBits(const Bits bits, Bits* irr_out) const; 93 | 94 | // Encode a string, setting output parameter irr_out. Only valid when the 95 | // return value is 'true' (success). 96 | bool EncodeString(const std::string& value, Bits* irr_out) const; 97 | // For use with HmacDrbg hash function and any num_bits divisible by 8. 98 | bool EncodeString(const std::string& value, 99 | std::vector<uint8_t>* irr_out) const; 100 | 101 | // For testing/simulation use only. 102 | bool _EncodeBitsInternal(const Bits bits, Bits* prr_out, Bits* irr_out) 103 | const; 104 | bool _EncodeStringInternal(const std::string& value, Bits* bloom_out, 105 | Bits* prr_out, Bits* irr_out) const; 106 | 107 | // Accessor for the assigned cohort. 108 | uint32_t cohort() { return cohort_; } 109 | // Set a cohort manually, if previously generated.
110 | void set_cohort(uint32_t cohort); 111 | 112 | private: 113 | bool MakeBloomFilter(const std::string& value, Bits* bloom_out) const; 114 | bool MakeBloomFilter(const std::string& value, 115 | std::vector<uint8_t>* bloom_out) const; 116 | bool GetPrrMasks(const Bits bits, Bits* uniform, Bits* f_mask) const; 117 | 118 | // static helper function for initialization 119 | static uint32_t AssignCohort(const Deps& deps, int num_cohorts); 120 | 121 | const std::string encoder_id_; 122 | const Params& params_; 123 | const Deps& deps_; 124 | uint32_t cohort_; 125 | std::string cohort_str_; 126 | }; 127 | 128 | } // namespace rappor 129 | 130 | #endif // RAPPOR_H_ 131 | -------------------------------------------------------------------------------- /apps/rappor-sim/server.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | source("../../analysis/R/decode.R") 3 | source("../../analysis/R/simulation.R") 4 | source("../../analysis/R/encode.R") 5 | 6 | Plot <- function(x, color = "grey") { 7 | n <- nrow(x) 8 | if (n < 16) { 9 | par(mfrow = c(n, 1), mai = c(0, .5, .5, 0)) 10 | } else if (n < 64) { 11 | par(mfrow = c(n / 2, 2), mai = c(0, .5, .5, 0)) 12 | } else { 13 | par(mfrow = c(n / 4, 4), mai = c(0, .5, .5, 0)) 14 | } 15 | for (i in 1:nrow(x)) { 16 | barplot(x[i, ], main = paste0("Cohort ", i), col = color, border = color) 17 | } 18 | } 19 | 20 | shinyServer(function(input, output) { 21 | # Example state global variable. 22 | es <- list() 23 | 24 | # Example button states. 25 | ebs <- rep(0, 3) 26 | 27 | Params <- reactive({ 28 | list(k = as.numeric(input$size), 29 | h = as.numeric(input$hashes), 30 | m = as.numeric(input$instances), 31 | p = as.numeric(input$p), 32 | q = as.numeric(input$q), 33 | f = as.numeric(input$f)) 34 | }) 35 | 36 | PopParams <- reactive({ 37 | list(as.numeric(input$nstrs), 38 | as.numeric(input$nonzero), 39 | input$decay, 40 | as.numeric(input$expo), 41 | as.numeric(input$background) 42 | ) 43 | }) 44 | 45 | DecodingParams <- reactive({ 46 | list(as.numeric(input$alpha), 47 | input$correction) 48 | }) 49 | 50 | Sample <- reactive({ 51 | input$sample 52 | N <- input$N 53 | params <- Params() 54 | pop_params <- PopParams() 55 | decoding_params <- DecodingParams() 56 | prop_missing <- input$missing 57 | fit <- GenerateSamples(N, params, pop_params, 58 | alpha = decoding_params[[1]], 59 | correction = decoding_params[[2]], 60 | prop_missing = prop_missing) 61 | fit 62 | }) 63 | 64 | # Results summary. 65 | output$pr <- renderTable({ 66 | Sample()$summary 67 | }, 68 | include.rownames = FALSE, include.colnames = FALSE) 69 | 70 | # Results table. 71 | output$tab <- renderDataTable({ 72 | Sample()$fit 73 | }, 74 | options = list(iDisplayLength = 100)) 75 | 76 | # Epsilon. 77 | output$epsilon <- renderTable({ 78 | Sample()$privacy 79 | }, 80 | include.rownames = FALSE, include.colnames = FALSE, digits = 4) 81 | 82 | # True distribution. 83 | output$probs <- renderPlot({ 84 | samp <- Sample() 85 | probs <- samp$probs 86 | detected <- match(samp$fit[, 1], samp$strs) 87 | detection_frequency <- samp$privacy[7, 2] 88 | PlotPopulation(probs, detected, detection_frequency) 89 | }) 90 | 91 | # True bit patterns. 92 | output$truth <- renderPlot({ 93 | truth <- Sample()$truth 94 | Plot(truth[, -1, drop = FALSE], color = "darkblue") 95 | }) 96 | 97 | # Lasso plot.
98 | output$lasso <- renderPlot({ 99 | fit <- Sample()$lasso 100 | if (!is.null(fit)) { 101 | plot(fit) 102 | } 103 | }) 104 | 105 | output$resid <- renderPlot({ 106 | resid <- Sample()$residual 107 | params <- Params() 108 | plot(resid, xlab = "Bloom filter bits", ylab = "Residuals") 109 | abline(h = c(-1.96, 1.96), lty = 2, col = 2) 110 | sq <- qnorm(.025 / length(resid)) 111 | abline(h = c(sq, -sq), lty = 2, col = 3, lwd = 2) 112 | abline(h = c(-3, 3), lty = 2, col = 4, lwd = 2) 113 | abline(v = params$k * (0:params$m), lty = 2, col = "blue") 114 | legend("topright", legend = paste0("SD = ", round(sd(resid), 2)), bty = "n") 115 | }) 116 | 117 | # Estimated bit patterns. 118 | output$ests <- renderPlot({ 119 | ests <- Sample()$ests 120 | Plot(ests, color = "darkred") 121 | }) 122 | 123 | # Estimated vs truth. 124 | output$ests_truth <- renderPlot({ 125 | plot(unlist(Sample()$ests), unlist(Sample()$truth[, -1]), 126 | xlab = "Estimates", ylab = "Truth", pch = 19) 127 | abline(0, 1, lwd = 4, col = "darkred") 128 | }) 129 | 130 | output$example <- renderPlot({ 131 | params <- Params() 132 | strs <- Sample()$strs 133 | map <- Sample()$map 134 | samp <- Sample() 135 | 136 | # First run on app start. 137 | value <- sample(strs, 1) 138 | res <- Encode(value, map, strs, params, N = input$N) 139 | 140 | if (input$new_user > ebs[1]) { 141 | res <- Encode(es$value, map, strs, params, N = input$N) 142 | ebs[1] <<- input$new_user 143 | } else if (input$new_value > ebs[2]) { 144 | res <- Encode(value, map, strs, params, cohort = es$cohort, id = es$id, 145 | N = input$N) 146 | ebs[2] <<- input$new_value 147 | } else if (input$new_report > ebs[3]) { 148 | res <- Encode(es$value, map, strs, params, B = es$B, 149 | BP = es$BP, cohort = es$cohort, id = es$id, N = input$N) 150 | ebs[3] <<- input$new_report 151 | } 152 | es <<- res 153 | ExamplePlot(res, params$k, c(ebs, input$new_user, input$new_value, input$new_report)) 154 | }) 155 | 156 | }) 157 | -------------------------------------------------------------------------------- /pipeline/assoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./assoc.sh <function name> 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | readonly THIS_DIR=$(dirname $0) 11 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 12 | 13 | source $RAPPOR_SRC/util.sh # log, banner 14 | source $RAPPOR_SRC/pipeline/tools-lib.sh 15 | source $RAPPOR_SRC/pipeline/alarm-lib.sh 16 | 17 | # Change the default location of these tools by setting DEP_* 18 | readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc} 19 | readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em} 20 | 21 | # Run a single decode-assoc process, to analyze one variable pair for one 22 | # metric. The arguments to this function are one row of the task spec.
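# Each spec row supplies the final 8 arguments (NUM_ARGS below); decode-many
# prepends the five job constants. In order, the row fields are:
#   num_reports metric_name date reports_file var1 var2 map1 output_dir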
23 | decode-one() { 24 | # Job constants, from decode-many 25 | local rappor_src=$1 26 | local timeout_secs=$2 27 | local min_reports=$3 28 | local job_dir=$4 29 | local sample_size=$5 30 | 31 | # Task spec variables, from task_spec.py 32 | local num_reports=$6 33 | local metric_name=$7 34 | local date=$8 # for output naming only 35 | local reports=$9 # file with reports 36 | local var1=${10} 37 | local var2=${11} 38 | local map1=${12} 39 | local output_dir=${13} 40 | 41 | local log_file=$output_dir/assoc-log.txt 42 | local status_file=$output_dir/assoc-status.txt 43 | mkdir --verbose -p $output_dir 44 | 45 | # Flags derived from job constants 46 | local schema=$job_dir/config/rappor-vars.csv 47 | local params_dir=$job_dir/config 48 | local em_executable=$FAST_EM 49 | 50 | # TODO: 51 | # - Skip jobs with few reports, like ./backfill.sh analyze-one. 52 | 53 | # Output the spec for combine_status.py. 54 | echo "$@" > $output_dir/assoc-spec.txt 55 | 56 | # NOTE: Not passing --num-cores since we're parallelizing already. 57 | 58 | # NOTE: --tmp-dir is the output dir. Then we just delete all the .bin files 59 | # afterward so we don't copy them to x20 (they are big). 60 | 61 | { time \ 62 | alarm-status $status_file $timeout_secs \ 63 | $DECODE_ASSOC \ 64 | --create-bool-map \ 65 | --remove-bad-rows \ 66 | --em-executable $em_executable \ 67 | --schema $schema \ 68 | --params-dir $params_dir \ 69 | --metric-name $metric_name \ 70 | --reports $reports \ 71 | --var1 $var1 \ 72 | --var2 $var2 \ 73 | --map1 $map1 \ 74 | --reports-sample-size $sample_size \ 75 | --tmp-dir $output_dir \ 76 | --output-dir $output_dir 77 | } >$log_file 2>&1 78 | } 79 | 80 | test-decode-one() { 81 | decode-one $RAPPOR_SRC 82 | } 83 | 84 | readonly DEFAULT_MIN_REPORTS=5000 85 | 86 | #readonly DEFAULT_TIMEOUT_SECONDS=300 # 5 minutes as a quick test. 87 | readonly DEFAULT_TIMEOUT_SECONDS=3600 # 1 hour 88 | 89 | readonly DEFAULT_MAX_PROCS=6 # TODO: Share with backfill.sh 90 | 91 | # Limit to 1M for now. Raise it when we have a full run. 92 | readonly DEFAULT_SAMPLE_SIZE=1000000 93 | 94 | readonly NUM_ARGS=8 # number of tokens in the task spec, used for xargs 95 | 96 | # Run many decode-assoc processes in parallel. 97 | decode-many() { 98 | local job_dir=$1 99 | local spec_list=$2 100 | 101 | # These 3 params affect speed 102 | local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS} 103 | local sample_size=${4:-$DEFAULT_SAMPLE_SIZE} 104 | local max_procs=${5:-$DEFAULT_MAX_PROCS} 105 | 106 | local rappor_src=${6:-$RAPPOR_SRC} 107 | local min_reports=${7:-$DEFAULT_MIN_REPORTS} 108 | 109 | time cat $spec_list \ 110 | | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \ 111 | $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size 112 | } 113 | 114 | # Combine assoc results and render HTML.
115 | 116 | combine-and-render-html() { 117 | local jobs_base_dir=$1 118 | local job_dir=$2 119 | 120 | banner "Combining assoc task status" 121 | TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir 122 | 123 | banner "Combining assoc results" 124 | TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir 125 | 126 | banner "Splitting out status per metric, and writing overview" 127 | TOOLS-cook assoc-metric-status $job_dir 128 | 129 | TOOLS-gen-ui symlink-static assoc $job_dir 130 | 131 | banner "Building overview .part.html from CSV" 132 | TOOLS-gen-ui assoc-overview-part-html $job_dir 133 | 134 | banner "Building metric .part.html from CSV" 135 | TOOLS-gen-ui assoc-metric-part-html $job_dir 136 | 137 | banner "Building pair .part.html from CSV" 138 | TOOLS-gen-ui assoc-pair-part-html $job_dir 139 | 140 | banner "Building day .part.html from CSV" 141 | TOOLS-gen-ui assoc-day-part-html $job_dir 142 | } 143 | 144 | # Temp files left over by the fast_em R <-> C++ data exchange. 145 | list-and-remove-bin() { 146 | local job_dir=$1 147 | # If everything failed, we might not have anything to list/delete. 148 | find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si 149 | find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose 150 | } 151 | 152 | "$@" 153 | -------------------------------------------------------------------------------- /tests/analyze_assoc.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2015 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Reads map files, report files, and RAPPOR parameters to run 18 | # an EM algorithm to estimate a joint distribution over two or more variables 19 | # 20 | # Usage: 21 | # $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ 22 | # -reports reports.csv -params params.csv 23 | # Inputs: map1, map2, reports, params 24 | # see how options are parsed below for more information 25 | # Outputs: 26 | # prints a table with estimated joint probability masses 27 | # over candidate strings 28 | # Ex.
29 | # ssl nossl 30 | # intel 0.1 0.3 31 | # google 0.5 0.1 32 | 33 | library("optparse") 34 | 35 | options(stringsAsFactors = FALSE) 36 | 37 | if(!interactive()) { 38 | option_list <- list( 39 | # Flags 40 | make_option(c("--map1", "-m1"), default = "map_1.csv", 41 | help = "Hashed candidates for 1st variable"), 42 | make_option(c("--map2", "-m2"), default = "map_2.csv", 43 | help = "Hashed candidates for 2nd variable"), 44 | make_option(c("--reports", "-r"), default = "reports.csv", 45 | help = "File with raw reports as <cohort, rappor bitstring 1, rappor bitstring 2>"), 46 | make_option(c("--params", "-p"), default = "params.csv", 47 | help = "Filename for RAPPOR parameters") 48 | ) 49 | opts <- parse_args(OptionParser(option_list = option_list)) 50 | } 51 | 52 | source("../analysis/R/encode.R") 53 | source("../analysis/R/decode.R") 54 | source("../analysis/R/simulation.R") 55 | source("../analysis/R/read_input.R") 56 | source("../analysis/R/association.R") 57 | 58 | # This function processes the maps loaded using ReadMapFile 59 | # Association analysis requires a map object with a map 60 | # field that has the map split into cohorts and an rmap field 61 | # that has all the cohorts combined 62 | # Arguments: 63 | # map = map object with cohorts as sparse matrix in 64 | # object map$map 65 | # This is the expected object from ReadMapFile 66 | # params = data field with parameters 67 | # TODO(pseudorandom): move this functionality to ReadMapFile 68 | ProcessMap <- function(map, params) { 69 | map$rmap <- map$map 70 | split_map <- function(i, map_struct) { 71 | numbits <- params$k 72 | indices <- which(as.matrix( 73 | map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE, 74 | arr.ind = TRUE) 75 | sparseMatrix(indices[, "row"], indices[, "col"], 76 | dims = c(numbits, max(indices[, "col"]))) 77 | } 78 | map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap)) 79 | map 80 | } 81 | 82 | main <- function(opts) { 83 | ptm <- proc.time() 84 | 85 | params <- ReadParameterFile(opts$params) 86 | opts_map <- list(opts$map1, opts$map2) 87 | map <- lapply(opts_map, function(o) 88 | ProcessMap(ReadMapFile(o, params = params), 89 | params = params)) 90 | # Reports must be of the format 91 | # cohort no, rappor bitstring 1, rappor bitstring 2 92 | reportsObj <- read.csv(opts$reports, 93 | colClasses = c("integer", "character", "character"), 94 | header = FALSE) 95 | 96 | # Parsing reportsObj 97 | # ComputeDistributionEM allows for different sets of cohorts 98 | # for each variable.
Here, both sets of cohorts are identical 99 | co <- as.list(reportsObj[1])[[1]] 100 | cohorts <- list(co, co) 101 | # Parse reports from reportObj cols 2 and 3 102 | reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) 103 | 104 | # Split strings into bit arrays (as required by assoc analysis) 105 | reports <- lapply(1:2, function(i) { 106 | # apply the following function to each of reports[[1]] and reports[[2]] 107 | lapply(reports[[i]][[1]], function(x) { 108 | # function splits strings and converts them to numeric values 109 | as.numeric(strsplit(x, split = "")[[1]]) 110 | }) 111 | }) 112 | 113 | joint_dist <- ComputeDistributionEM(reports, cohorts, map, 114 | ignore_other = TRUE, 115 | params, marginals = NULL, 116 | estimate_var = FALSE) 117 | # TODO(pseudorandom): Export the results to a file for further analysis 118 | print("JOINT_DIST$FIT") 119 | print(joint_dist$fit) 120 | print("PROC.TIME") 121 | print(proc.time() - ptm) 122 | } 123 | 124 | if(!interactive()) { 125 | main(opts) 126 | } -------------------------------------------------------------------------------- /analysis/R/read_input.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # 16 | # Read parameter, counts and map files. 17 | 18 | library(Matrix) 19 | 20 | source.rappor <- function(rel_path) { 21 | abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 22 | source(abs_path) 23 | } 24 | 25 | source.rappor("analysis/R/util.R") # for Log 26 | 27 | 28 | ReadParameterFile <- function(params_file) { 29 | # Read parameter file. Format: 30 | # k, h, m, p, q, f 31 | # 128, 2, 8, 0.5, 0.75, 0.75 32 | 33 | params <- as.list(read.csv(params_file)) 34 | if (length(params) != 6) { 35 | stop("There should be exactly 6 columns in the parameter file.") 36 | } 37 | if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) { 38 | stop("Parameter names must be k,h,m,p,q,f.") 39 | } 40 | params 41 | } 42 | 43 | # Handle the case of redundant cohorts, i.e. the counts file needs to be 44 | # further aggregated to obtain counts for the number of cohorts specified in 45 | # the params file. 46 | # 47 | # NOTE: Why is this happening? 48 | AdjustCounts <- function(counts, params) { 49 | apply(counts, 2, function(x) { 50 | tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) 51 | }) 52 | } 53 | 54 | ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { 55 | # Read in the counts file. 
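  # Expected shape, checked below: m rows (one per cohort) and k + 1 columns,
  # where the first column is the per-cohort report total and the remaining k
  # columns are per-bit sums.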
56 | if (!file.exists(counts_file)) { 57 | return(NULL) 58 | } 59 | counts <- read.csv(counts_file, header = FALSE) 60 | 61 | if (adjust_counts) { 62 | counts <- AdjustCounts(counts, params) 63 | } 64 | 65 | if (nrow(counts) != params$m) { 66 | stop(sprintf("Got %d rows in the counts file, expected m = %d", 67 | nrow(counts), params$m)) 68 | } 69 | 70 | if ((ncol(counts) - 1) != params$k) { 71 | stop(paste0("Counts file: number of columns should equal k + 1: ", 72 | ncol(counts))) 73 | } 74 | 75 | if (any(counts < 0)) { 76 | stop("Counts file: all counts must be non-negative.") 77 | } 78 | 79 | # Turn counts from a data frame into a matrix. (In R a data frame and matrix 80 | # are sometimes interchangeable, but sometimes we need it to be matrix.) 81 | as.matrix(counts) 82 | } 83 | 84 | ReadMapFile <- function(map_file, params) { 85 | # Read in the map file which is in the following format (two hash functions): 86 | # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ... 87 | # str2, ... 88 | # Output: 89 | # map: a sparse representation of set bits for each candidate string. 90 | # strs: a vector of all candidate strings. 91 | 92 | Log("Parsing %s", map_file) 93 | 94 | map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE) 95 | strs <- map_pos[, 1] 96 | strs[strs == ""] <- "Empty" 97 | 98 | # Remove duplicated strings. 99 | ind <- which(!duplicated(strs)) 100 | strs <- strs[ind] 101 | map_pos <- map_pos[ind, ] 102 | 103 | n <- ncol(map_pos) - 1 104 | if (n != (params$h * params$m)) { 105 | stop(paste0("Map file: number of columns should equal hm + 1:", 106 | n, "_", params$h * params$m)) 107 | } 108 | 109 | row_pos <- unlist(map_pos[, -1], use.names = FALSE) 110 | col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1) 111 | 112 | # TODO: When would this ever happen? 113 | removed <- which(is.na(row_pos)) 114 | if (length(removed) > 0) { 115 | Log("Removed %d entries", length(removed)) 116 | row_pos <- row_pos[-removed] 117 | col_pos <- col_pos[-removed] 118 | } 119 | 120 | map <- sparseMatrix(row_pos, col_pos, 121 | dims = c(params$m * params$k, length(strs))) 122 | 123 | colnames(map) <- strs 124 | list(map = map, strs = strs, map_pos = map_pos) 125 | } 126 | 127 | LoadMapFile <- function(map_file, params) { 128 | # Reads the map file, caching an .rda (R binary data) version of it to speed 129 | # up future loads. 130 | 131 | rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE) 132 | # This must be unique per process, so concurrent processes don't try to 133 | # write the same file. 134 | tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid()) 135 | 136 | # First save to a temp file, and then atomically rename to the destination. 137 | if (file.exists(rda_path)) { 138 | Log("Loading %s", rda_path) 139 | load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env 140 | } else { 141 | map <- ReadMapFile(map_file, params) 142 | 143 | Log("Saving %s as an rda file for faster access", map_file) 144 | tryCatch({ 145 | save(map, file = tmp_path) 146 | file.rename(tmp_path, rda_path) 147 | }, warning = function(w) { 148 | Log("WARNING: %s", w) 149 | }, error = function(e) { 150 | Log("ERROR: %s", e) 151 | }) 152 | } 153 | return(map) 154 | } 155 | -------------------------------------------------------------------------------- /analysis/R/unknowns_test.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Author: fanti@google.com (Giulia Fanti)
16 | #
17 | # Tests the unknown unknowns dictionary estimation functions.
18 | # There are two main components involved in estimating this unknown
19 | # distribution:
20 | #   a) Find the pairwise ngrams that co-occur often.
21 | #   b) Determine which full strings are consistent with all pairwise
22 | #      relations.
23 | #
24 | # TestEstimateDictionary() tests the full pipeline, including parts (a)
25 | # and (b).
26 | # TestFindFeasibleStrings() tests only part (b).
27 | # Both tests generate their own data.
28 |
29 | library(parallel)
30 | source("analysis/R/encode.R")
31 | source("analysis/R/decode.R")
32 | source("analysis/R/simulation.R")
33 | source("analysis/R/association.R")
34 | source("analysis/R/decode_ngrams.R")
35 | source("analysis/R/ngrams_simulation.R")
36 | alphabet <- letters
37 | options(warn = -1)
38 |
39 | GeneratePopulation <- function(N, num_strs, str_len = 10,
40 |                                distribution = NULL) {
41 |   # Generates a /deterministic/ string for each individual in the
42 |   # population from distribution.
43 |   #
44 |   # Args:
45 |   #   N: Number of individuals in the population
46 |   #   num_strs: Number of strings from which to draw strings
47 |   #   str_len: Length of each string
48 |   #   distribution: Just here for compatibility with the original
49 |   #     GeneratePopulation function in ngrams_simulation.R
50 |   #
51 |   # Returns:
52 |   #   Vector of strings for each individual in the population
53 |
54 |   strs <- sapply(1:num_strs, function(i) {
55 |     paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
56 |   })
57 |
58 |   # Uniform distribution
59 |   prob <- rep(1 / num_strs, num_strs)
60 |   sample(strs, N, replace = TRUE, prob = prob)
61 | }
62 |
63 | TestEstimateDictionary <- function() {
64 |   # Tests that the algorithm without noise recovers a uniform
65 |   # string population correctly.
66 |
67 |   # Compute the strings from measuring only 2 ngrams
68 |   N <- 100
69 |   str_len <- 6
70 |   ngram_size <- 2
71 |   num_ngrams <- str_len / ngram_size
72 |   num_strs <- 1
73 |
74 |   params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
75 |
76 |   ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
77 |                        num_ngrams_collected = 2)
78 |
79 |   sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
80 |                         alphabet, params, distribution = 3)
81 |
82 |   res <- EstimateDictionary(sim, N, ngram_params, params)
83 |
84 |   # Check that the correct strings are found
85 |   if (num_strs == 1) {
86 |     checkTrue(res$found_candidates == sort(unique(sim$strs)))
87 |   } else {
88 |     checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
89 |   }
90 | }
91 |
92 | TestFindFeasibleStrings <- function() {
93 |   # Tests that FindPairwiseCandidates weeds out false positives.
94 |   # We test this by adding false positives to the pairwise estimates.
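  # (Concretely: a bogus pairwise candidate c("ab", "le") is appended via
  # rbind() below; FindFeasibleStrings() should discard it, leaving only the
  # true strings.)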
95 |   N <- 100
96 |   str_len <- 6
97 |   ngram_size <- 2
98 |   num_ngrams <- str_len / ngram_size
99 |   num_strs <- 2
100 |
101 |   params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
102 |
103 |   ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
104 |                        num_ngrams_collected = 2)
105 |
106 |   sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
107 |                         alphabet, params)
108 |
109 |   pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
110 |                                                 params)$candidate_strs
111 |   cat("Found the pairwise candidates.\n")
112 |
113 |   if (is.null(pairwise_candidates)) {
114 |     return(FALSE)
115 |   }
116 |
117 |   # Inject a false positive that FindFeasibleStrings should weed out.
118 |   pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
119 |   conn <- file('graph.txt', 'w+')
120 |   WriteKPartiteGraph(conn,
121 |                      pairwise_candidates,
122 |                      sim$pairings,
123 |                      ngram_params$num_ngrams,
124 |                      ngram_params$ngram_size)
125 |
126 |   close(conn)
127 |   cat("Wrote graph.txt\n")
128 |
129 |   found_candidates <- FindFeasibleStrings(pairwise_candidates,
130 |                                           sim$pairings,
131 |                                           ngram_params$num_ngrams,
132 |                                           ngram_params$ngram_size)
133 |   # Check that the correct strings are found
134 |   if (num_strs == 1) {
135 |     checkTrue(found_candidates == sort(unique(sim$strs)))
136 |   } else {
137 |     checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
138 |   }
139 | }
--------------------------------------------------------------------------------
/analysis/tensorflow/fast_em.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """
3 | fast_em.py: TensorFlow implementation of expectation maximization for RAPPOR
4 | association analysis.
5 |
6 | TODO:
7 | - Use TensorFlow ops for reading input (so that reading input can be
8 |   distributed)
9 | - Reduce the number of ops (currently proportional to the number of reports).
10 |   May require new TensorFlow ops.
11 | - Fix performance bug (v_split is probably being recomputed on every
12 |   iteration):
13 |   bin$ ./test.sh decode-assoc-cpp - 1.1 seconds (single-threaded C++)
14 |   bin$ ./test.sh decode-assoc-tensorflow - 226 seconds on GPU
15 | """
16 |
17 | import sys
18 |
19 | import numpy as np
20 | import tensorflow as tf
21 |
22 |
23 | def log(msg, *args):
24 |   if args:
25 |     msg = msg % args
26 |   print >>sys.stderr, msg
27 |
28 |
29 | def ExpectTag(f, expected):
30 |   """Read and consume a 4 byte tag from the given file."""
31 |   b = f.read(4)
32 |   if b != expected:
33 |     raise RuntimeError('Expected %r, got %r' % (expected, b))
34 |
35 |
36 | def ReadListOfMatrices(f):
37 |   """
38 |   Read a big list of conditional probability matrices from a binary file.
39 |   """
40 |   ExpectTag(f, 'ne \0')
41 |   num_entries = np.fromfile(f, np.uint32, count=1)[0]
42 |   log('Number of entries: %d', num_entries)
43 |
44 |   ExpectTag(f, 'es \0')
45 |   entry_size = np.fromfile(f, np.uint32, count=1)[0]
46 |   log('Entry size: %d', entry_size)
47 |
48 |   ExpectTag(f, 'dat\0')
49 |   vec_length = num_entries * entry_size
50 |   v = np.fromfile(f, np.float64, count=vec_length)
51 |
52 |   log('Values read: %d', len(v))
53 |   log('v: %s', v[:10])
54 |   #print 'SUM', sum(v)
55 |
56 |   # NOTE: We're not reshaping because we're using one TensorFlow tensor object
57 |   # per matrix, since it makes the algorithm expressible with current
58 |   # TensorFlow ops.
59 |   #v = v.reshape((num_entries, entry_size))
60 |
61 |   return num_entries, entry_size, v
62 |
63 |
64 | def WriteTag(f, tag):
65 |   if len(tag) != 3:
66 |     raise AssertionError("Tags should be 3 bytes. Got %r" % tag)
67 |   f.write(tag + '\0')  # NUL terminated
68 |
69 |
70 | def WriteResult(f, num_em_iters, pij):
71 |   WriteTag(f, 'emi')
72 |   emi = np.array([num_em_iters], np.uint32)
73 |   emi.tofile(f)
74 |
75 |   WriteTag(f, 'pij')
76 |   pij.tofile(f)
77 |
78 |
79 | def DebugSum(num_entries, entry_size, v):
80 |   """Sum the entries as a sanity check."""
81 |   cond_prob = tf.placeholder(tf.float64, shape=(num_entries * entry_size,))
82 |   debug_sum = tf.reduce_sum(cond_prob)
83 |   with tf.Session() as sess:
84 |     s = sess.run(debug_sum, feed_dict={cond_prob: v})
85 |   log('Debug sum: %f', s)
86 |
87 |
88 | def BuildEmIter(num_entries, entry_size, v):
89 |   # Placeholder for the value from the previous iteration.
90 |   pij_in = tf.placeholder(tf.float64, shape=(entry_size,))
91 |
92 |   # split along dimension 0
93 |   # TODO:
94 |   # - make sure this doesn't get run for every EM iteration
95 |   # - investigate using tf.tile() instead? (this may cost more memory)
96 |   v_split = tf.split(0, num_entries, v)
97 |
98 |   z_numerator = [report * pij_in for report in v_split]
99 |   sum_z = [tf.reduce_sum(report) for report in z_numerator]
100 |   z = [z_numerator[i] / sum_z[i] for i in xrange(num_entries)]
101 |
102 |   # Concat per-report tensors and reshape. This is probably inefficient?
103 |   z_concat = tf.concat(0, z)
104 |   z_concat = tf.reshape(z_concat, [num_entries, entry_size])
105 |
106 |   # This whole expression represents an EM iteration. Bind the pij_in
107 |   # placeholder, and get a new estimation of Pij.
108 |   em_iter_expr = tf.reduce_sum(z_concat, 0) / num_entries
109 |
110 |   return pij_in, em_iter_expr
111 |
112 |
113 | def RunEm(pij_in, entry_size, em_iter_expr, max_em_iters, epsilon=1e-6):
114 |   """Run the iterative EM algorithm (using the TensorFlow API).
115 |
116 |   Args:
117 |     pij_in: placeholder for the Pij estimate from the previous iteration
118 |     entry_size: total number of cells in each matrix
119 |     em_iter_expr: TensorFlow expression for one EM iteration (from BuildEmIter)
120 |     max_em_iters: maximum number of EM iterations
121 |
122 |   Returns:
123 |     num_em_iters, pij: iteration count and final estimate (numpy.ndarray)
124 |   """
125 |   # Initial value is the uniform distribution
126 |   pij = np.ones(entry_size) / entry_size
127 |
128 |   i = 0  # visible outside loop
129 |
130 |   # Do EM iterations.
131 |   with tf.Session() as sess:
132 |     for i in xrange(max_em_iters):
133 |       print 'PIJ', pij
134 |       new_pij = sess.run(em_iter_expr, feed_dict={pij_in: pij})
135 |       dif = max(abs(new_pij - pij))
136 |       log('EM iteration %d, dif = %e', i, dif)
137 |       pij = new_pij
138 |
139 |       if dif < epsilon:
140 |         log('Early EM termination: %e < %e', dif, epsilon)
141 |         break
142 |
143 |   # If i == 9, then we did 10 iterations.
144 |   return i + 1, pij
145 |
146 |
147 | def sep():
148 |   print '-' * 80
149 |
150 |
151 | def main(argv):
152 |   input_path = argv[1]
153 |   output_path = argv[2]
154 |   max_em_iters = int(argv[3])
155 |
156 |   sep()
157 |   with open(input_path) as f:
158 |     num_entries, entry_size, cond_prob = ReadListOfMatrices(f)
159 |
160 |   sep()
161 |   DebugSum(num_entries, entry_size, cond_prob)
162 |
163 |   sep()
164 |   pij_in, em_iter_expr = BuildEmIter(num_entries, entry_size, cond_prob)
165 |   num_em_iters, pij = RunEm(pij_in, entry_size, em_iter_expr, max_em_iters)
166 |
167 |   sep()
168 |   log('Final Pij: %s', pij)
169 |
170 |   with open(output_path, 'wb') as f:
171 |     WriteResult(f, num_em_iters, pij)
172 |   log('Wrote %s', output_path)
173 |
174 |
175 | if __name__ == '__main__':
176 |   try:
177 |     main(sys.argv)
178 |   except RuntimeError, e:
179 |     print >>sys.stderr, 'FATAL: %s' % e
180 |     sys.exit(1)
181 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RAPPOR
2 | ======
3 |
4 | RAPPOR is a novel privacy technology that allows inferring statistics about
5 | populations while preserving the privacy of individual users.
6 |
7 | This repository contains simulation and analysis code in Python and R.
8 |
9 | For a detailed description of the algorithms, see the
10 | [paper](http://arxiv.org/abs/1407.6981) and links below.
11 |
12 | Feel free to send feedback to
13 | [rappor-discuss@googlegroups.com][group].
14 |
15 | Running the Demo
16 | ----------------
17 |
18 | Although the Python and R libraries should be portable to any platform, our
19 | end-to-end demo has only been tested on Linux.
20 |
21 | If you don't have a Linux box handy, you can [view the generated
22 | output](http://google.github.io/rappor/examples/report.html).
23 |
24 | To set up your environment, install the required packages and R dependencies with the setup script:
25 |     $ ./setup.sh
26 | Then build the native components:
27 |     $ ./build.sh
28 | This compiles and tests the `fastrand` C extension module for Python, which
29 | speeds up the simulation.
30 |
31 | Finally, run the demo:
32 |     $ ./demo.sh
33 |
34 | The demo strings together the Python and R code. It:
35 |
36 | 1. Generates simulated input data with different distributions
37 | 2. Runs it through the RAPPOR privacy-preserving reporting mechanisms
38 | 3. Analyzes and plots the aggregated reports against the true input
39 |
40 | The output is written to `_tmp/regtest/results.html`, and can be opened with a
41 | browser.
42 |
43 | Dependencies
44 | ------------
45 |
46 | [R](http://r-project.org) analysis (`analysis/R`):
47 |
48 | - [glmnet](http://cran.r-project.org/web/packages/glmnet/index.html)
49 | - [limSolve](https://cran.r-project.org/web/packages/limSolve/index.html)
50 |
51 | Demo dependencies (`demo.sh`):
52 |
53 | These are necessary if you want to test changes to the code.
54 |
55 | - R libraries
56 |   - [ggplot2](http://cran.r-project.org/web/packages/ggplot2/index.html)
57 |   - [optparse](http://cran.r-project.org/web/packages/optparse/index.html)
58 | - bash shell / coreutils: to run tests
59 |
60 | Python client (`client/python`):
61 |
62 | - None. You should be able to just import the `rappor.py` file.
63 |
64 | Platform:
65 |
66 | - R: tested on R 3.0.
67 | - Python: tested on Python 2.7.
68 | - OS: the shell script tests have been tested on Linux, but may work on
69 |   Mac/Cygwin. The R and Python code should work on any OS.
70 |
71 | Development
72 | -----------
73 |
74 | To run tests:
75 |
76 |     $ ./test.sh
77 |
78 | This currently runs Python unit tests, lints Python source files, and runs R
79 | unit tests.
80 |
81 | API
82 | ---
83 |
84 | `rappor.py` is a tiny standalone Python file, and you can easily copy it into a
85 | Python program.
86 |
87 | NOTE: Its interface is subject to change. We are in the demo stage now, but if
88 | there's demand, we will document and publish the interface.
89 |
90 | The R interface is also subject to change.
91 |
92 |
93 |
94 | The `fastrand` C module is optional. It's likely only useful for simulation of
95 | thousands of clients. It doesn't use cryptographically strong randomness, and
96 | thus should **not** be used in production.
97 |
98 | Directory Structure
99 | -------------------
100 |
101 |     analysis/
102 |       R/                 # R code for analysis
103 |       cpp/               # Fast reimplementations of certain analysis
104 |                          #   algorithms
105 |     apps/                # Web apps to help you use RAPPOR (using Shiny)
106 |     bin/                 # Command line tools for analysis.
107 |     client/              # Client libraries
108 |       python/            # Python client library
109 |         rappor.py
110 |         ...
111 |       cpp/               # C++ client library
112 |         encoder.cc
113 |         ...
114 |     doc/                 # Documentation
115 |     tests/               # Tools for regression tests
116 |       compare_dist.R     # Test helper for single variable analysis
117 |       gen_true_values.R  # Generate test input
118 |       make_summary.py    # Generate an HTML report for the regtest
119 |       rappor_sim.py      # RAPPOR client simulation
120 |       regtest_spec.py    # Specification of test cases
121 |       ...
122 |     build.sh             # Build scripts (docs, C extension, etc.)
123 |     demo.sh              # Quick demonstration
124 |     docs.sh              # Generate docs from the markdown in doc/
125 |     gh-pages/            # Where generated docs go. (A subtree of the branch gh-pages)
126 |     pipeline/            # Analysis pipeline code.
127 |     regtest.sh           # End-to-end regression tests, including client
128 |                          #   libraries and analysis
129 |     setup.sh             # Install dependencies (for Linux)
130 |     test.sh              # Test runner
131 |
132 | Documentation
133 | -------------
134 |
135 | - [RAPPOR Data Flow](http://google.github.io/rappor/doc/data-flow.html)
136 |
137 | Publications
138 | ------------
139 |
140 | - [RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response](http://arxiv.org/abs/1407.6981)
141 | - [Building a RAPPOR with the Unknown: Privacy-Preserving Learning of Associations and Data Dictionaries](http://arxiv.org/abs/1503.01214)
142 |
143 | Links
144 | -----
145 |
146 | - [Google Blog Post about RAPPOR](http://googleresearch.blogspot.com/2014/10/learning-statistics-with-privacy-aided.html)
147 | - [RAPPOR implementation in Chrome](http://www.chromium.org/developers/design-documents/rappor)
148 |   - This is a production quality C++ implementation, but it's somewhat tied to
149 |     Chrome, and doesn't support all privacy parameters (e.g. only a few values
150 |     of p and q). On the other hand, the code in this repo is not yet
151 |     production quality, but supports experimentation with different parameters
152 |     and data sets. Of course, anyone is free to implement RAPPOR independently
153 |     as well.
154 | - Mailing list: [rappor-discuss@googlegroups.com][group]
155 |
156 | [group]: https://groups.google.com/forum/#!forum/rappor-discuss
157 |
--------------------------------------------------------------------------------
/client/cpp/openssl_hash_impl_unittest.cc:
--------------------------------------------------------------------------------
1 | #include <gtest/gtest.h>
2 |
3 | #include "openssl_hash_impl.h"
4 |
5 |
6 | TEST(OpensslHashImplTest, Md5) {
7 |   std::vector<uint8_t> output;
8 |   rappor::Md5("test", &output);
9 |   static const uint8_t ex[] = {
10 |     0x09, 0x8f, 0x6b, 0xcd, 0x46, 0x21, 0xd3, 0x73,
11 |     0xca, 0xde, 0x4e, 0x83, 0x26, 0x27, 0xb4, 0xf6
12 |   };
13 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
14 |   ASSERT_EQ(expected, output);
15 | }
16 |
17 | TEST(OpensslHashImplTest, HmacSha256) {
18 |   std::vector<uint8_t> output;
19 |   rappor::HmacSha256("key", "value", &output);
20 |   static const uint8_t ex[] = {
21 |     0x90, 0xfb, 0xfc, 0xf1, 0x5e, 0x74, 0xa3, 0x6b,
22 |     0x89, 0xdb, 0xdb, 0x2a, 0x72, 0x1d, 0x9a, 0xec,
23 |     0xff, 0xdf, 0xdd, 0xdc, 0x5c, 0x83, 0xe2, 0x7f,
24 |     0x75, 0x92, 0x59, 0x4f, 0x71, 0x93, 0x24, 0x81, };
25 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
26 |   ASSERT_EQ(expected, output);
27 |
28 |   // Make sure nulls are handled properly.
29 |   //
30 |   // An empty value with key "key"
31 |   // $ echo -n -e "" | openssl dgst -hmac "key" -sha256 -binary | xxd
32 |   // 00000000: 5d5d 1395 63c9 5b59 67b9 bd9a 8c9b 233a  ]]..c.[Yg.....#:
33 |   // 00000010: 9ded b450 7279 4cd2 32dc 1b74 8326 07d0  ...PryL.2..t.&..
34 |   rappor::HmacSha256("key", "", &output);
35 |   static const uint8_t exempty[] = {
36 |     0x5d, 0x5d, 0x13, 0x95, 0x63, 0xc9, 0x5b, 0x59,
37 |     0x67, 0xb9, 0xbd, 0x9a, 0x8c, 0x9b, 0x23, 0x3a,
38 |     0x9d, 0xed, 0xb4, 0x50, 0x72, 0x79, 0x4c, 0xd2,
39 |     0x32, 0xdc, 0x1b, 0x74, 0x83, 0x26, 0x07, 0xd0
40 |   };
41 |   std::vector<uint8_t> expected_empty(exempty, exempty + sizeof(exempty));
42 |   ASSERT_EQ(expected_empty, output);
43 |
44 |   // A single null value with key "key"
45 |   // $ echo -n -e "\x00" | openssl dgst -hmac "key" -sha256 -binary | xxd
46 |   // 00000000: 8a8d fb96 56dc cf21 b7ea 5269 1124 3b75  ....V..!..Ri.$;u
47 |   // 00000010: 68f4 3281 5f1c d43a 4277 1f2d b4aa a525  h.2._..:Bw.-...%
48 |   rappor::HmacSha256("key", std::string("\0", 1), &output);
49 |   static const uint8_t exnull[] = {
50 |     0x8a, 0x8d, 0xfb, 0x96, 0x56, 0xdc, 0xcf, 0x21,
51 |     0xb7, 0xea, 0x52, 0x69, 0x11, 0x24, 0x3b, 0x75,
52 |     0x68, 0xf4, 0x32, 0x81, 0x5f, 0x1c, 0xd4, 0x3a,
53 |     0x42, 0x77, 0x1f, 0x2d, 0xb4, 0xaa, 0xa5, 0x25
54 |   };
55 |   std::vector<uint8_t> expected_null(exnull, exnull + sizeof(exnull));
56 |   ASSERT_EQ(expected_null, output);
57 |
58 |   // A null value with something after it, with key "key"
59 |   // $ echo -n -e "\x00a" | openssl dgst -hmac "key" -sha256 -binary | xxd
60 |   // 00000000: 5787 df47 c2c4 8664 5a6a f898 44c3 4636  W..G...dZj..D.F6
61 |   // 00000010: fc5b b78b 1b87 29a0 6ca8 7556 7b75 c05a  .[....).l.uV{u.Z
62 |   rappor::HmacSha256("key", std::string("\0a", 2), &output);
63 |   static const uint8_t exnulltrail[] = {
64 |     0x57, 0x87, 0xdf, 0x47, 0xc2, 0xc4, 0x86, 0x64,
65 |     0x5a, 0x6a, 0xf8, 0x98, 0x44, 0xc3, 0x46, 0x36,
66 |     0xfc, 0x5b, 0xb7, 0x8b, 0x1b, 0x87, 0x29, 0xa0,
67 |     0x6c, 0xa8, 0x75, 0x56, 0x7b, 0x75, 0xc0, 0x5a
68 |   };
69 |   std::vector<uint8_t> expected_null_trailing(
70 |       exnulltrail, exnulltrail + sizeof(exnulltrail));
71 |   ASSERT_EQ(expected_null_trailing, output);
72 |   std::string s = std::string("\0a", 2);
73 |   rappor::HmacSha256("key", s, &output);
74 |   ASSERT_EQ(expected_null_trailing, output);
75 | }
76 |
77 | TEST(OpensslHashImplTest, HmacDrbgNist) {
78 |   std::vector<uint8_t> output;
79 |   // Expected output for NIST tests.
80 |   static const uint8_t exnist[] = {
81 |     0xD6, 0x7B, 0x8C, 0x17, 0x34, 0xF4, 0x6F, 0xA3,
82 |     0xF7, 0x63, 0xCF, 0x57, 0xC6, 0xF9, 0xF4, 0xF2,
83 |     0xDC, 0x10, 0x89, 0xBD, 0x8B, 0xC1, 0xF6, 0xF0,
84 |     0x23, 0x95, 0x0B, 0xFC, 0x56, 0x17, 0x63, 0x52,
85 |     0x08, 0xC8, 0x50, 0x12, 0x38, 0xAD, 0x7A, 0x44,
86 |     0x00, 0xDE, 0xFE, 0xE4, 0x6C, 0x64, 0x0B, 0x61,
87 |     0xAF, 0x77, 0xC2, 0xD1, 0xA3, 0xBF, 0xAA, 0x90,
88 |     0xED, 0xE5, 0xD2, 0x07, 0x40, 0x6E, 0x54, 0x03
89 |   };
90 |   std::vector<uint8_t> expected_nist(
91 |       exnist, exnist + sizeof(exnist));
92 |
93 |   // NIST test data, from
94 |   // http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/HMAC_DRBG.pdf
95 |   // p.148, requested security strength 128, Requested hash algorithm SHA-256
96 |   output.resize(64);
97 |   rappor::HmacDrbg(
98 |       std::string(
99 |           "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
100 |           "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
101 |           "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
102 |           "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27"
103 |           "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
104 |           "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
105 |           "\x25\x26\x27", 63),  // provided_data
106 |       "", &output);
107 |   ASSERT_EQ(expected_nist, output);
108 |
109 |   // Since in our use case we concatenate the key and value
110 |   // to produce the provided_data portion of the DRBG, let's
111 |   // split the above key into key|value as an additional
112 |   // test case.
113 |   output.resize(64);
114 |   rappor::HmacDrbg(
115 |       std::string(
116 |           "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
117 |           "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
118 |           "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
119 |           "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27", 40),
120 |       std::string(
121 |           "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
122 |           "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
123 |           "\x25\x26\x27", 23),  // provided_data
124 |       &output);
125 |   ASSERT_EQ(expected_nist, output);
126 | }
127 |
128 | TEST(OpensslHashImplTest, HmacDrbgTextStrings) {
129 |   std::vector<uint8_t> output;
130 |   output.resize(30);
131 |   rappor::HmacDrbg("key", "value", &output);  // Truncated to 30 bytes.
132 |   static const uint8_t ex[] = {
133 |     0x89, 0xD7, 0x1B, 0xB8, 0xA3, 0x7D, 0x80, 0xC2,
134 |     0x6E, 0x63, 0x9C, 0xBD, 0x68, 0xF3, 0x60, 0x7A,
135 |     0xA9, 0x4D, 0xEE, 0xF4, 0x25, 0xA7, 0xAF, 0xBB,
136 |     0xF8, 0xD0, 0x09, 0x92, 0xAF, 0x92
137 |   };
138 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
139 |   ASSERT_EQ(expected, output);
140 | }
141 |
142 | int main(int argc, char **argv) {
143 |   ::testing::InitGoogleTest(&argc, argv);
144 |   return RUN_ALL_TESTS();
145 | }
146 |
--------------------------------------------------------------------------------
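As a cross-check of the HmacSha256 test vector above, the same bytes can be
reproduced outside the C++ code with Python's standard hmac module (a minimal
sketch in the repo's Python 2 style; not part of the repo itself):

    import hashlib
    import hmac

    # HMAC-SHA256("key", "value"); the hex should start 90fbfcf15e74a36b...,
    # matching the 'ex' array in the HmacSha256 test.
    digest = hmac.new('key', 'value', hashlib.sha256).digest()
    print digest.encode('hex')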
/tests/regtest.html (table-header fragment):
--------------------------------------------------------------------------------
Column groups: Test Case | Input Params | RAPPOR Params | Map Params | Result Metrics

Legend:
  d: distribution drawn from
  u: total unique values
  c: clients
  v: values per client
  k: report bits
  h: hashes
  m: cohorts
  p, q, f: probabilities
  +: num additional candidates
  -: regex for true values removed
  a: actual values
  r: values RAPPOR detected
  fp: false positive rate
  fn: false negative rate
  tv: total variation distance
  am: allocated mass
  time: time in seconds

Per-test columns: d, u, c, v, k, h, m, p, q, f, +, -, a, r, fp, fn, tv, am, time
--------------------------------------------------------------------------------
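For reference, the "tv" metric in the legend above is the total variation
distance between the actual and RAPPOR-estimated distributions: half the L1
distance between the two probability vectors. An illustrative sketch (Python 2
style; not the repo's implementation):

    def total_variation(p, q):
      # p and q are aligned probability vectors that each sum to 1.
      return 0.5 * sum(abs(a - b) for a, b in zip(p, q))

    print total_variation([0.5, 0.3, 0.2], [0.45, 0.35, 0.2])  # 0.05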