├── tests
│   ├── params.csv
│   ├── uvals.csv
│   ├── setup.py
│   ├── rappor_sim_test.py
│   ├── fastrand.py
│   ├── gen_true_values_test.R
│   ├── compare_dist_test.R
│   ├── fastrand_test.py
│   ├── gen_true_values.R
│   ├── regtest.html
│   ├── _fastrand.c
│   ├── regtest_spec.py
│   ├── gen_counts_test.R
│   ├── user_spec.py
│   └── analyze_assoc.R
├── .gitignore
├── apps
│   ├── rappor-analysis
│   │   ├── params.csv
│   │   ├── run_app.sh
│   │   └── test.csv
│   ├── rappor-sim
│   │   ├── run_app.sh
│   │   └── server.R
│   └── README.md
├── gh-pages
│   ├── doc
│   │   ├── data-flow.png
│   │   └── randomness.html
│   ├── examples
│   │   ├── exp_report
│   │   │   └── dist.png
│   │   ├── gauss_report
│   │   │   └── dist.png
│   │   ├── unif_report
│   │   │   └── dist.png
│   │   └── report.html
│   └── index.html
├── analysis
│   ├── cpp
│   │   ├── testdata
│   │   │   ├── graph3.txt
│   │   │   └── graph1.txt
│   │   ├── README.md
│   │   └── run.sh
│   ├── tensorflow
│   │   ├── README.md
│   │   ├── fast_em.sh
│   │   └── fast_em.py
│   └── R
│       ├── util.R
│       ├── run_tests.R
│       ├── alternative.R
│       ├── fast_em.R
│       ├── encode.R
│       ├── read_input.R
│       └── unknowns_test.R
├── ui
│   ├── README.md
│   ├── home.html
│   ├── table-sort.css
│   ├── ui.css
│   ├── histograms.html
│   ├── assoc-overview.html
│   ├── assoc-day.html
│   ├── assoc-metric.html
│   ├── day.html
│   ├── assoc-pair.html
│   ├── overview.html
│   └── metric.html
├── pipeline
│   ├── util.py
│   ├── csv_to_html_test.py
│   ├── combine_results_test.py
│   ├── csv-to-html-test.sh
│   ├── combine_status_test.py
│   ├── tools-lib.sh
│   ├── task_spec_test.py
│   ├── README.md
│   ├── alarm-lib.sh
│   ├── combine_results.py
│   ├── dist.sh
│   ├── regtest.sh
│   ├── cook.sh
│   └── assoc.sh
├── bin
│   ├── sum-bits
│   ├── hash-candidates
│   ├── decode-assoc
│   ├── decode-dist
│   ├── hash_candidates_test.py
│   ├── README.md
│   ├── sum_bits_test.py
│   ├── hash_candidates.py
│   ├── sum_bits.py
│   └── decode_dist.R
├── util.sh
├── docs.sh
├── client
│   ├── cpp
│   │   ├── dotd.sh
│   │   ├── libc_rand_impl.h
│   │   ├── openssl_hash_impl.h
│   │   ├── unix_kernel_rand_impl.h
│   │   ├── unix_kernel_rand_impl.cc
│   │   ├── libc_rand_impl.cc
│   │   ├── run.sh
│   │   ├── encoder_demo.cc
│   │   ├── rappor_deps.h
│   │   ├── openssl_hash_impl.cc
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── encoder.h
│   │   └── openssl_hash_impl_unittest.cc
│   ├── README.md
│   └── python
│       └── rappor_test.py
├── doc
│   ├── randomness.md
│   └── data-flow.dot
├── demo.sh
├── setup.sh
├── test.sh
└── README.md

/tests/params.csv:
--------------------------------------------------------------------------------
k, h, m, p, q, f
16, 2, 4, 0.1, 0.9, 0.2
--------------------------------------------------------------------------------
/tests/uvals.csv:
--------------------------------------------------------------------------------
google.com,intel.com,yahoo.com
ssl,nossl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.swp
_tmp
tests/_fastrand.so
tests/build/
--------------------------------------------------------------------------------
/apps/rappor-analysis/params.csv:
--------------------------------------------------------------------------------
"k","h","m","p","q","f"
128,2,8,0.5,0.75,0
--------------------------------------------------------------------------------
/gh-pages/doc/data-flow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/doc/data-flow.png
--------------------------------------------------------------------------------
/gh-pages/examples/exp_report/dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/exp_report/dist.png -------------------------------------------------------------------------------- /gh-pages/examples/gauss_report/dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/gauss_report/dist.png -------------------------------------------------------------------------------- /gh-pages/examples/unif_report/dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/rappor/HEAD/gh-pages/examples/unif_report/dist.png -------------------------------------------------------------------------------- /analysis/cpp/testdata/graph3.txt: -------------------------------------------------------------------------------- 1 | num_partitions 3 2 | ngram_size 2 3 | edge 0.ab 1.cd 4 | edge 1.cd 2.ef 5 | edge 0.ab 2.ef 6 | edge 0.AB 1.CD 7 | edge 1.CD 2.EF 8 | -------------------------------------------------------------------------------- /ui/README.md: -------------------------------------------------------------------------------- 1 | ui 2 | == 3 | 4 | This directory contains static HTML, CSS, and JavaScript for the RAPPOR 5 | dashboard. See the `pipeline/` directory for more details. 6 | 7 | -------------------------------------------------------------------------------- /pipeline/util.py: -------------------------------------------------------------------------------- 1 | """Common functions.""" 2 | 3 | import sys 4 | 5 | 6 | def log(msg, *args): 7 | if args: 8 | msg = msg % args 9 | print >>sys.stderr, msg 10 | -------------------------------------------------------------------------------- /bin/sum-bits: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Shell wrapper around sum_bits.py. 4 | 5 | readonly THIS_DIR=$(dirname $0) 6 | 7 | PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/sum_bits.py "$@" 8 | -------------------------------------------------------------------------------- /bin/hash-candidates: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Shell wrapper around hash_candidates.py. 4 | 5 | readonly THIS_DIR=$(dirname $0) 6 | 7 | PYTHONPATH=$THIS_DIR/../client/python $THIS_DIR/hash_candidates.py "$@" 8 | -------------------------------------------------------------------------------- /util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Utility functions, used by demo.sh and regtest.sh. 4 | 5 | banner() { 6 | echo 7 | echo "----- $@" 8 | echo 9 | } 10 | 11 | log() { 12 | echo 1>&2 "$@" 13 | } 14 | 15 | die() { 16 | log "$0: $@" 17 | exit 1 18 | } 19 | 20 | -------------------------------------------------------------------------------- /analysis/tensorflow/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR in TensorFlow 2 | ==================== 3 | 4 | This directory contains an experimental implementation of the EM algorithm in 5 | [TensorFlow](http://tensorflow.org). 6 | 7 | Currently the C++ implementation in `analysis/cpp` is faster and can be used 8 | in production. 
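The iteration that `fast_em.py` speeds up can be pictured with a small NumPy
sketch (a minimal illustration only -- the array names, shapes, and
convergence test below are assumptions, not the actual interface of
`fast_em.py`):

    # Illustrative EM iteration for RAPPOR decoding; not fast_em.py's API.
    # cond_prob[i, j] is assumed to hold P(report i | candidate string j).
    import numpy as np

    def run_em(cond_prob, max_iters=1000, tol=1e-6):
      num_reports, num_candidates = cond_prob.shape
      pi = np.ones(num_candidates) / num_candidates  # start uniform
      for _ in xrange(max_iters):
        # E-step: posterior distribution over candidates for each report.
        post = cond_prob * pi
        post /= post.sum(axis=1, keepdims=True)
        # M-step: the new estimate is the average posterior.
        new_pi = post.mean(axis=0)
        if np.abs(new_pi - pi).max() < tol:  # converged
          return new_pi
        pi = new_pi
      return pi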
9 | 10 | 11 | -------------------------------------------------------------------------------- /analysis/cpp/README.md: -------------------------------------------------------------------------------- 1 | find_cliques 2 | ============ 3 | 4 | This tool does part of the analysis for unknown dictionaries. To run it: 5 | 6 | $ ./run.sh demo 7 | 8 | This compiles and runs it on files in the testdata/ directory. 9 | 10 | See comments in find_cliques.cc for information on how it works. 11 | 12 | 13 | -------------------------------------------------------------------------------- /apps/rappor-analysis/run_app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run the Shiny app in this directory. 4 | # 5 | # Usage: 6 | # ./run_app.sh [port] 7 | 8 | app_dir=$(dirname $0) 9 | port=${1:-6789} 10 | 11 | # host= makes it serve to other machines, not just localhost. 12 | exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)" 13 | -------------------------------------------------------------------------------- /gh-pages/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | RAPPOR Github Pages 5 | 6 | 7 | 8 |
RAPPOR Github Pages

examples/report.html
doc/data-flow.html
13 | 14 | 15 | -------------------------------------------------------------------------------- /apps/rappor-sim/run_app.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # 3 | # Run the Shiny app in this directory. 4 | # 5 | # Usage: 6 | # ./run_app.sh [port] 7 | 8 | app_dir=$(dirname $0) 9 | port=${1:-6788} 10 | 11 | # Needed by source.rappor in analysis/R/*.R 12 | export RAPPOR_REPO=../../ 13 | 14 | # host= makes it serve to other machines, not just localhost. 15 | exec R --vanilla --slave -e "shiny::runApp('$app_dir', host='0.0.0.0', port=$port)" 16 | -------------------------------------------------------------------------------- /analysis/R/util.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | # 3 | # Common utility library for all R scripts. 4 | 5 | # Log message with timing. Example: 6 | # 7 | # _____ 1.301 My message 8 | # 9 | # The prefix makes it stand out (vs R's print()), and the number is the time so 10 | # far. 11 | # 12 | # NOTE: The shell script log uses hyphens. 13 | 14 | Log <- function(...) { 15 | cat(sprintf('_____ %.3f ', proc.time()[['elapsed']])) 16 | cat(sprintf(...)) 17 | cat('\n') 18 | } 19 | -------------------------------------------------------------------------------- /analysis/cpp/testdata/graph1.txt: -------------------------------------------------------------------------------- 1 | num_partitions 4 2 | ngram_size 2 3 | edge 0.ab 1.cd 4 | edge 0.xx 1.cd 5 | edge 0.ij 1.kl 6 | edge 0.qr 1.st 7 | edge 0.ab 1.le 8 | edge 0.qr 2.uv 9 | edge 0.ab 2.ef 10 | edge 0.ij 2.mn 11 | edge 0.ij 3.op 12 | edge 0.qr 3.wx 13 | edge 0.ab 3.gh 14 | edge 1.cd 2.ef 15 | edge 1.kl 2.mn 16 | edge 1.st 2.uv 17 | edge 1.kl 3.op 18 | edge 1.cd 3.gh 19 | edge 1.st 3.wx 20 | edge 2.uv 3.wx 21 | edge 2.ef 3.gh 22 | edge 2.ef 3.zz 23 | edge 2.mn 3.op 24 | -------------------------------------------------------------------------------- /docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o nounset 4 | set -o pipefail 5 | set -o errexit 6 | 7 | 8 | build() { 9 | ./build.sh doc 10 | } 11 | 12 | copy() { 13 | cp -a ./_tmp/doc/* ./gh-pages/doc/ 14 | echo "After commiting changes, you can publish them by running: ./docs.sh publish" 15 | } 16 | 17 | publish() { 18 | git subtree push --prefix gh-pages origin gh-pages 19 | } 20 | 21 | if test $# -eq 0 ; then 22 | build 23 | copy 24 | else 25 | "$@" 26 | fi 27 | 28 | 29 | -------------------------------------------------------------------------------- /ui/home.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Rappor HOME 5 | 6 | 8 | 9 | 10 | 11 | 12 |
Redirecting to https://github.com/google/rappor
15 | 16 | 17 | -------------------------------------------------------------------------------- /analysis/tensorflow/fast_em.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Wrapper to run fast_em.py using TensorFlow configured for a GPU. CUDA 4 | # environment variables must be set. 5 | # 6 | # Usage: 7 | # ./fast_em.sh 8 | 9 | set -o nounset 10 | set -o pipefail 11 | set -o errexit 12 | 13 | readonly THIS_DIR=$(dirname $0) 14 | 15 | fast-em() { 16 | # Never returns 17 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 \ 18 | CUDA_HOME=/usr/local/cuda-7.0 \ 19 | exec $THIS_DIR/fast_em.py "$@" 20 | } 21 | 22 | fast-em "$@" 23 | -------------------------------------------------------------------------------- /pipeline/csv_to_html_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | csv_to_html_test.py: Tests for csv_to_html.py 4 | """ 5 | 6 | import unittest 7 | 8 | import csv_to_html # module under test 9 | 10 | 11 | class CsvToHtmlTest(unittest.TestCase): 12 | 13 | def testParseSpec(self): 14 | self.assertEqual( 15 | {'foo': 'bar', 'spam': 'eggs'}, 16 | csv_to_html.ParseSpec(['foo bar', 'spam eggs'])) 17 | 18 | self.assertEqual( 19 | {}, 20 | csv_to_html.ParseSpec([])) 21 | 22 | 23 | if __name__ == '__main__': 24 | unittest.main() 25 | -------------------------------------------------------------------------------- /bin/decode-assoc: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Decode multidimensional reports. 4 | # 5 | # This is a tiny shell wrapper around R. 6 | 7 | readonly THIS_DIR=$(dirname $0) 8 | 9 | # NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string 10 | # concatenation to form the absolute path. (file.path() in R doesn't do what 11 | # we want.) 12 | 13 | readonly RAPPOR_REPO=$THIS_DIR/../ 14 | 15 | # RAPPOR_REPO is used by source() statements to find .R files. 16 | export RAPPOR_REPO 17 | 18 | # Make sure to reuse the same process so it can be killed easily. 19 | exec $THIS_DIR/decode_assoc.R "$@" 20 | -------------------------------------------------------------------------------- /bin/decode-dist: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Decode a distribution from summed RAPPOR reports. 4 | # 5 | # This is a tiny shell wrapper around R. 6 | 7 | readonly THIS_DIR=$(dirname $0) 8 | 9 | # NOTE: A trailing / is *required* on RAPPOR_REPO, because we use string 10 | # concatenation to form the absolute path. (file.path() in R doesn't do what 11 | # we want.) 12 | 13 | readonly RAPPOR_REPO=$THIS_DIR/../ 14 | 15 | # RAPPOR_REPO is used by source() statements to find .R files. 16 | export RAPPOR_REPO 17 | 18 | # Make sure to reuse the same process so it can be killed easily. 19 | exec $THIS_DIR/decode_dist.R "$@" 20 | -------------------------------------------------------------------------------- /ui/table-sort.css: -------------------------------------------------------------------------------- 1 | /* sort indicator in column headings */ 2 | .sortArrow { 3 | color: grey; 4 | } 5 | 6 | thead { 7 | font-weight: bold; 8 | text-align: center; 9 | } 10 | 11 | table { 12 | padding: 10px; /* Padding makes it look nicer. */ 13 | margin: 0 auto; /* center table on the page */ 14 | border-collapse: collapse; /* this is like old cellpadding */ 15 | } 16 | 17 | /* like cellspacing? 
*/ 18 | td { 19 | padding: 5px; 20 | } 21 | 22 | /* Built-in support for R NA values */ 23 | .na { 24 | color: darkred; 25 | } 26 | 27 | /* Numbers aligned on the right, like Excel */ 28 | .num { 29 | text-align: right; 30 | } 31 | 32 | .highlight { 33 | background-color: #f0f0f0; 34 | } 35 | 36 | tbody tr:hover { 37 | background-color: lightcyan; 38 | } 39 | 40 | -------------------------------------------------------------------------------- /ui/ui.css: -------------------------------------------------------------------------------- 1 | /* Center the plots */ 2 | .dy { 3 | margin: 0 auto; 4 | width: 50em; 5 | } 6 | 7 | /* main metric */ 8 | #proportionsDy { 9 | width: 1000px; 10 | height: 600px; 11 | } 12 | 13 | #num-reports-dy { 14 | width: 1000px; 15 | height: 300px; 16 | } 17 | 18 | #mass-dy { 19 | width: 1000px; 20 | height: 300px; 21 | } 22 | 23 | #metricDesc { 24 | font-style: italic; 25 | } 26 | 27 | body { 28 | /*margin: 0 auto;*/ 29 | /*text-align: left;*/ 30 | } 31 | 32 | h1 { 33 | text-align: center; 34 | } 35 | 36 | h2 { 37 | text-align: center; 38 | } 39 | 40 | p { 41 | text-align: center; 42 | } 43 | 44 | /* R NA values */ 45 | .na { 46 | color: darkred; 47 | } 48 | 49 | #status { 50 | text-align: center; 51 | font-size: x-large; 52 | color: darkred; 53 | } 54 | -------------------------------------------------------------------------------- /apps/rappor-analysis/test.csv: -------------------------------------------------------------------------------- 1 | [1] String Estimate St.Dev P.value Proportion SNR 2 | <0 rows> (or 0-length row.names) 3 | SUMMARY 4 | parameters values 5 | 1 Candidate strings 300.000 6 | 2 Detected strings 0.000 7 | 3 Discovered Prop (out of N) 0.000 8 | 4 Explained Variance 0.000 9 | 5 Missing Variance 0.988 10 | 6 Noise Variance 0.012 11 | 7 Theoretical Noise Std. Dev. 2236.068 12 | PRIVACY 13 | parameters values 14 | 1 Effective p 0.500000000 15 | 2 Effective q 0.750000000 16 | 3 exp(e_1) 9.000000000 17 | 4 e_1 2.197224577 18 | 5 exp(e_inf) Inf 19 | 6 e_inf Inf 20 | 7 Detection frequency 0.001040297 21 | -------------------------------------------------------------------------------- /tests/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | 18 | from distutils.core import setup, Extension 19 | 20 | module = Extension('_fastrand', 21 | sources = ['_fastrand.c']) 22 | 23 | setup(name = '_fastrand', 24 | version = '1.0', 25 | description = 'Module to speed up RAPPOR simulation', 26 | ext_modules = [module]) 27 | -------------------------------------------------------------------------------- /tests/rappor_sim_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | rappor_sim_test.py: Tests for rappor_sim.py 19 | """ 20 | 21 | import unittest 22 | 23 | import rappor_sim # module under test 24 | 25 | 26 | class RapporSimTest(unittest.TestCase): 27 | 28 | def testFoo(self): 29 | pass 30 | 31 | 32 | if __name__ == "__main__": 33 | unittest.main() 34 | -------------------------------------------------------------------------------- /client/cpp/dotd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # dotd.sh 4 | # 5 | # Generate .d Makefile fragments, so we can use #include statements in source 6 | # for dependency info. Adapted from the GNU make manual: 7 | # 8 | # http://www.gnu.org/software/make/manual/html_node/Automatic-Prerequisites.html 9 | # 10 | # We are putting this in shell, so we just have 'sed in bash'. Not an unholy 11 | # mix of 'sed in bash in Make'. 12 | 13 | set -o nounset 14 | set -o pipefail 15 | set -o errexit 16 | 17 | # Munge gcc -MM output into .d files. 18 | main() { 19 | if [ ! -d _tmp ]; then mkdir _tmp; fi 20 | local basename=$1 21 | local dotd=$2 # .d output name 22 | shift 2 # rest of args are gcc invocation 23 | 24 | rm --verbose -f $dotd # in case of failure? 25 | 26 | # Execute the gcc -MM invocation. 27 | # 28 | # Change 29 | # rappor_sim.o: rappor.sim.cc 30 | # to 31 | # _tmp/rappor_sim.o _tmp/rappor_sim.d : rappor.sim.cc 32 | "$@" | sed "s|\($basename\).o|_tmp/\1.o _tmp/\1.d |" > $dotd 33 | } 34 | 35 | main "$@" 36 | -------------------------------------------------------------------------------- /pipeline/combine_results_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | combine_results_test.py: Tests for combine_results.py 4 | """ 5 | 6 | import csv 7 | import cStringIO 8 | import unittest 9 | 10 | import combine_results # module under test 11 | 12 | 13 | # TODO: Make these test more the header row. They rely heavily on the file 14 | # system! 15 | 16 | class CombineResultsTest(unittest.TestCase): 17 | 18 | def testCombineDistResults(self): 19 | stdin = cStringIO.StringIO('') 20 | out = cStringIO.StringIO() 21 | c_out = csv.writer(out) 22 | 23 | combine_results.CombineDistResults(stdin, c_out, 10) 24 | actual = out.getvalue() 25 | self.assert_(actual.startswith('date'), actual) 26 | 27 | def testCombineAssocResults(self): 28 | stdin = cStringIO.StringIO('') 29 | out = cStringIO.StringIO() 30 | c_out = csv.writer(out) 31 | 32 | combine_results.CombineAssocResults(stdin, c_out, 10) 33 | actual = out.getvalue() 34 | self.assert_(actual.startswith('dummy'), actual) 35 | 36 | 37 | if __name__ == '__main__': 38 | unittest.main() 39 | -------------------------------------------------------------------------------- /pipeline/csv-to-html-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Test for csv_to_html.py. 
4 | # 5 | # Usage: 6 | # ./csv-to-html-test.sh 7 | 8 | set -o nounset 9 | set -o pipefail 10 | set -o errexit 11 | 12 | test-basic() { 13 | ./csv_to_html.py <{b}' <{v}' < 2 | 3 | 4 | RAPPOR Task Histograms 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
Home / Overview / Histograms

RAPPOR Task Histograms

Each task's input is a (metric, day), i.e. it runs on the summed reports
for a single metric received in a single day.
46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /client/cpp/libc_rand_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // A RAPPOR random implementation using libc's rand(). 16 | // 17 | // IMPORTANT: This is for demo /simulation purposes only. Use a better random 18 | // function in production applications. 19 | 20 | #ifndef LIBC_RAND_IMPL_H_ 21 | #define LIBC_RAND_IMPL_H_ 22 | 23 | #include "rappor_deps.h" 24 | 25 | namespace rappor { 26 | 27 | class LibcRand : public IrrRandInterface { 28 | public: 29 | virtual ~LibcRand() {} 30 | 31 | virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const; 32 | }; 33 | 34 | } // namespace rappor 35 | 36 | #endif // LIBC_RAND_IMPL_H_ 37 | -------------------------------------------------------------------------------- /client/cpp/openssl_hash_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // OpenSSL implementation of RAPPOR dependencies. 16 | 17 | #ifndef OPENSSL_IMPL_H_ 18 | #define OPENSSL_IMPL_H_ 19 | 20 | #include "rappor_deps.h" 21 | 22 | namespace rappor { 23 | 24 | bool HmacSha256(const std::string& key, const std::string& value, 25 | std::vector* output); 26 | // Pass output vector of desired length. 27 | bool HmacDrbg(const std::string& key, const std::string& value, 28 | std::vector* output); 29 | bool Md5(const std::string& value, std::vector* output); 30 | 31 | } // namespace rappor 32 | 33 | #endif // OPENSSL_IMPL_H_ 34 | -------------------------------------------------------------------------------- /client/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR Clients 2 | ============== 3 | 4 | This directory contains RAPPOR client implementations in various languages. 5 | 6 | The privacy of RAPPOR is based on the client "lying" about the true values -- 7 | that is, not sending them over the network. 8 | 9 | The clients are typically small in terms of code size because the RAPPOR 10 | client algorithm is simple. See the README.md in each subdirectory for details 11 | on how to use the library. 
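To make the preceding concrete, here is a simplified sketch of the encoding
steps (Bloom filter, then PRR, then IRR). It is an illustration of the
algorithm only, not the API of any client in this directory; the hashing and
PRR details of real clients differ (e.g. the PRR is memoized per value so
repeated reports are stable):

    # Simplified RAPPOR encoding steps (illustration, not a library API).
    import hashlib
    import random

    def encode(value, cohort, k=16, h=2, f=0.5, p=0.5, q=0.75):
      # 1. Bloom filter: set h bits derived from a hash of (cohort, value).
      digest = hashlib.md5('%d:%s' % (cohort, value)).digest()
      bloom = 0
      for i in xrange(h):
        bloom |= 1 << (ord(digest[i]) % k)
      # 2. PRR: with probability f, replace each bit with a fair coin flip.
      prr = 0
      for i in xrange(k):
        if random.random() < f:
          bit = random.random() < 0.5
        else:
          bit = (bloom >> i) & 1
        prr |= int(bit) << i
      # 3. IRR: report 1 with probability q if the PRR bit is 1, else p.
      irr = 0
      for i in xrange(k):
        prob = q if (prr >> i) & 1 else p
        irr |= int(random.random() < prob) << i
      return irr  # only the IRR goes over the network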
Common Test Protocol
--------------------

When implementing a new RAPPOR client, you get end-to-end testing for free!

The `regtest.sh` script in the root of this repository does the following:

1. Creates test input data and feeds it into your client as a CSV file
2. Preprocesses your client's output (also CSV)
3. Runs the RAPPOR analysis, learning aggregate statistics from encoded values
4. Compares the analysis to the true client values, with metrics and plots

To have your client tested, you need a small executable wrapper, which reads
and writes CSV files in a specified format.

Then add it to the `_run-one-instance` function in `regtest.sh`.
--------------------------------------------------------------------------------
/tests/fastrand.py:
--------------------------------------------------------------------------------
# Copyright 2014 Google Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""fastrand.py - Python wrapper for _fastrand."""

# NOTE: We could retire this module in favor of the C++ client?  One reason to
# keep it is if it supports a wider range of params (e.g. more than 32 or 64
# bits.)

import random

import _fastrand


class FastIrrRand(object):
  """Fast insecure version of rappor.SecureIrrRand."""

  def __init__(self, params):
    randbits = _fastrand.randbits  # accelerated function
    num_bits = params.num_bloombits

    # IRR probabilities
    self.p_gen = lambda: randbits(params.prob_p, num_bits)
    self.q_gen = lambda: randbits(params.prob_q, num_bits)
--------------------------------------------------------------------------------
/ui/assoc-overview.html:
--------------------------------------------------------------------------------
Single variable analysis (latest)

Home / Association Overview

RAPPOR Association Analysis Overview

Underlying data: overview.csv
34 | 35 | 36 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /client/cpp/unix_kernel_rand_impl.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // A RAPPOR random implementation using bytes from a file like /dev/urandom or 16 | // /dev/random. 17 | 18 | #ifndef UNIX_KERNEL_RAND_IMPL_H_ 19 | #define UNIX_KERNEL_RAND_IMPL_H_ 20 | 21 | #include // uint8_t 22 | #include // FILE* 23 | 24 | #include "rappor_deps.h" 25 | 26 | namespace rappor { 27 | 28 | class UnixKernelRand : public IrrRandInterface { 29 | public: 30 | explicit UnixKernelRand(FILE* fp) 31 | : fp_(fp) { 32 | } 33 | virtual ~UnixKernelRand() {} 34 | 35 | virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const; 36 | 37 | private: 38 | FILE* fp_; // open device, e.g. /dev/urandom 39 | }; 40 | 41 | } // namespace rappor 42 | 43 | #endif // UNIX_KERNEL_RAND_IMPL_H_ 44 | -------------------------------------------------------------------------------- /ui/assoc-day.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Single Day Association Results 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Home / Association Overview

Underlying data: assoc-results.csv
35 | 36 | 37 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /client/cpp/unix_kernel_rand_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include "unix_kernel_rand_impl.h" 16 | 17 | #include // uint64_t 18 | 19 | namespace rappor { 20 | 21 | const int kMaxBitWidth = 32; // also in encoder.cc 22 | 23 | bool UnixKernelRand::GetMask(float prob, int num_bits, Bits* mask_out) const { 24 | uint8_t rand_buf[kMaxBitWidth]; 25 | size_t num_elems = fread(&rand_buf, sizeof(uint8_t), num_bits, fp_); 26 | if (num_elems != static_cast(num_bits)) { // fread error 27 | return false; 28 | } 29 | uint8_t threshold_256 = static_cast(prob * 256); 30 | 31 | Bits mask = 0; 32 | for (int i = 0; i < num_bits; ++i) { 33 | uint8_t bit = (rand_buf[i] < threshold_256); 34 | mask |= (bit << i); 35 | } 36 | *mask_out = mask; 37 | return true; 38 | } 39 | 40 | } // namespace rappor 41 | -------------------------------------------------------------------------------- /tests/gen_true_values_test.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/Rscript 2 | # 3 | # gen_reports_test.R 4 | 5 | source('analysis/R/util.R') # Log() 6 | 7 | source('tests/gen_true_values.R') # module under test 8 | 9 | library(RUnit) 10 | 11 | TestGenerateTrueValues = function() { 12 | num_clients <- 10 13 | reports_per_client <- 2 14 | num_cohorts <- 4 15 | reports <- GenerateTrueValues('exp', 10, num_clients, reports_per_client, 16 | num_cohorts) 17 | print(reports) 18 | 19 | # 10 clients, 2 reports per client 20 | checkEquals(20, nrow(reports)) 21 | 22 | # 10 unique clients 23 | checkEquals(10, length(unique(reports$client))) 24 | 25 | # Whether a given client reports different values 26 | reports_different_values <- rep(FALSE, num_clients) 27 | 28 | for (c in 1:num_clients) { 29 | my_reports <- reports[reports$client == c, ] 30 | #Log("CLIENT %d", c) 31 | #print(my_reports) 32 | 33 | # If every report for this client isn't same, make note of it 34 | if (length(unique(my_reports$value)) != 1) { 35 | reports_different_values[[c]] <- TRUE 36 | } 37 | } 38 | 39 | # At least one client should report different values. (Technically this 40 | # could fail, but is unlikely with 10 clients). 41 | checkTrue(any(reports_different_values)) 42 | 43 | checkEquals(num_cohorts, length(unique(reports$cohort))) 44 | } 45 | 46 | TestAll <- function(){ 47 | TestGenerateTrueValues() 48 | } 49 | 50 | TestAll() 51 | -------------------------------------------------------------------------------- /tests/compare_dist_test.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 
4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | library(RUnit) 18 | 19 | source('tests/compare_dist.R') 20 | 21 | TestProcessAll <- function() { 22 | ctx <- new.env() 23 | ctx$actual <- data.frame(string = c('v1', 'v2', 'v3'), proportion = c(0.2, 0.3, 0.5), 24 | count = c(2, 3, 5)) 25 | ctx$rappor <- data.frame(strings = c('v2', 'v3', 'v4'), proportion = c(0.1, 0.2, 0.3)) 26 | 27 | metrics <- CompareRapporVsActual(ctx)$metrics 28 | str(metrics) 29 | 30 | # sum of rappor proportions 31 | checkEqualsNumeric(0.6, metrics$sum_proportion) 32 | 33 | # v1 v2 v3 v4 34 | # 0.2 0.3 0.5 0.0 35 | # 0.0 0.1 0.2 0.3 36 | 37 | # (0.2 + 0.2 + 0.3 + 0.3) / 2 38 | checkEqualsNumeric(0.5, metrics$total_variation) 39 | 40 | print(metrics$total_variation) 41 | } 42 | 43 | TestProcessAll() 44 | -------------------------------------------------------------------------------- /pipeline/tools-lib.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Library used to refer to open source tools. 4 | 5 | set -o nounset 6 | set -o pipefail 7 | set -o errexit 8 | 9 | # NOTE: RAPPOR_SRC defined by the module that sources (cook.sh or ui.sh) 10 | 11 | # Caller can override shebang line by setting $DEP_PYTHON. 12 | readonly PYTHON=${DEP_PYTHON:-} 13 | 14 | readonly METRIC_STATUS=${DEP_METRIC_STATUS:-} 15 | 16 | 17 | # These 3 used by cook.sh. 18 | 19 | TOOLS-combine-status() { 20 | if test -n "$PYTHON"; then 21 | $PYTHON $RAPPOR_SRC/pipeline/combine_status.py "$@" 22 | else 23 | $RAPPOR_SRC/pipeline/combine_status.py "$@" 24 | fi 25 | } 26 | 27 | TOOLS-combine-results() { 28 | if test -n "$PYTHON"; then 29 | $PYTHON $RAPPOR_SRC/pipeline/combine_results.py "$@" 30 | else 31 | $RAPPOR_SRC/pipeline/combine_results.py "$@" 32 | fi 33 | } 34 | 35 | TOOLS-metric-status() { 36 | if test -n "$METRIC_STATUS"; then 37 | $METRIC_STATUS "$@" 38 | else 39 | $RAPPOR_SRC/pipeline/metric_status.R "$@" 40 | fi 41 | } 42 | 43 | # Used by ui.sh. 44 | 45 | TOOLS-csv-to-html() { 46 | if test -n "$PYTHON"; then 47 | $PYTHON $RAPPOR_SRC/pipeline/csv_to_html.py "$@" 48 | else 49 | $RAPPOR_SRC/pipeline/csv_to_html.py "$@" 50 | fi 51 | } 52 | 53 | # 54 | # Higher level scripts 55 | # 56 | 57 | TOOLS-cook() { 58 | $RAPPOR_SRC/pipeline/cook.sh "$@" 59 | } 60 | 61 | # TODO: Rename gen-ui.sh. 62 | TOOLS-gen-ui() { 63 | $RAPPOR_SRC/pipeline/ui.sh "$@" 64 | } 65 | -------------------------------------------------------------------------------- /client/cpp/libc_rand_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // IMPORTANT: This is for demo /simulation purposes only. Use a better random 16 | // function in production applications. 17 | 18 | #include "libc_rand_impl.h" 19 | 20 | #include 21 | #include // uint64_t 22 | #include // srand 23 | 24 | namespace rappor { 25 | 26 | // 27 | // LibcRand 28 | // 29 | 30 | // Similar to client/python/fastrand.c 31 | bool LibcRand::GetMask(float prob, int num_bits, Bits* mask_out) const { 32 | int rand_threshold = static_cast(prob * RAND_MAX); 33 | Bits mask = 0; 34 | 35 | for (int i = 0; i < num_bits; ++i) { 36 | // NOTE: could use rand_r(), which is more thread-safe 37 | Bits bit = (rand() < rand_threshold); 38 | mask |= (bit << i); 39 | } 40 | *mask_out = mask; 41 | return true; // no possible failure 42 | } 43 | 44 | } // namespace rappor 45 | -------------------------------------------------------------------------------- /ui/assoc-metric.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 |
Home / Association Overview

Underlying data:
35 | 36 | 37 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /ui/day.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Single Day Results 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Home / Overview / Histograms

Residuals

Underlying data: results.csv
40 | 41 | 42 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /pipeline/task_spec_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | """ 3 | task_spec_test.py: Tests for task_spec.py 4 | """ 5 | 6 | import cStringIO 7 | import unittest 8 | 9 | import task_spec # module under test 10 | 11 | 12 | class TaskSpecTest(unittest.TestCase): 13 | 14 | def testCountReports(self): 15 | f = cStringIO.StringIO("""\ 16 | 1,2 17 | 3,4 18 | 5,6 19 | """) 20 | c = task_spec.CountReports(f) 21 | self.assertEqual(9, c) 22 | 23 | def testDist(self): 24 | # NOTE: These files are opened, in order to count the reports. Maybe skip 25 | # that step. 26 | f = cStringIO.StringIO("""\ 27 | _tmp/counts/2015-12-01/exp_counts.csv 28 | _tmp/counts/2015-12-01/gauss_counts.csv 29 | _tmp/counts/2015-12-02/exp_counts.csv 30 | _tmp/counts/2015-12-02/gauss_counts.csv 31 | """) 32 | input_iter = task_spec.DistInputIter(f) 33 | #for row in input_iter: 34 | # print row 35 | 36 | field_id_lookup = {} 37 | 38 | # var name -> map filename 39 | f = cStringIO.StringIO("""\ 40 | var,map_filename 41 | exp,map.csv 42 | unif,map.csv 43 | gauss,map.csv 44 | """) 45 | dist_maps = task_spec.DistMapLookup(f, '_tmp/maps') 46 | 47 | f2 = cStringIO.StringIO("""\ 48 | metric,var,var_type,params 49 | exp,,string,params 50 | unif,,string,params 51 | gauss,,string,params 52 | """) 53 | var_schema = task_spec.VarSchema(f2, '_tmp/config') 54 | 55 | for row in task_spec.DistTaskSpec( 56 | input_iter, field_id_lookup, var_schema, dist_maps, None): 57 | print row 58 | 59 | 60 | if __name__ == '__main__': 61 | unittest.main() 62 | -------------------------------------------------------------------------------- /ui/assoc-pair.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 17 |
Home / Association Overview

Task Status

Underlying data:
37 | 38 | 39 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /analysis/R/run_tests.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # 18 | # Run unit tests for RAPPOR R code. 19 | 20 | library(RUnit) 21 | 22 | run_tests <- function() { 23 | dirs <- "analysis/R" # Run from root 24 | test_suite <- defineTestSuite("rappor", dirs, testFileRegexp = "_test.R$", 25 | testFuncRegexp = "^Test") 26 | stopifnot(isValidTestSuite(test_suite)) 27 | 28 | test_result <- runTestSuite(test_suite) 29 | 30 | printTextProtocol(test_result) # print to stdout 31 | 32 | result <- test_result[[1]] # Result for our only suite 33 | 34 | # Sanity check: fail if there were no tests found. 35 | if (result$nTestFunc == 0) { 36 | cat("No tests found.\n") 37 | return(FALSE) 38 | } 39 | if (result$nFail != 0 || result$nErr != 0) { 40 | cat("Some tests failed.\n") 41 | return(FALSE) 42 | } 43 | return(TRUE) 44 | } 45 | 46 | if (!run_tests()) { 47 | quit(status = 1) 48 | } 49 | -------------------------------------------------------------------------------- /analysis/cpp/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./run.sh 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | # Call gcc with the flags we like. 11 | # NOTE: -O3 does a lot for fast_em. (More than 5x speedup over unoptimized) 12 | 13 | cpp-compiler() { 14 | g++ -Wall -Wextra -O3 "$@" 15 | #clang++ -Wall -Wextra -O3 "$@" 16 | } 17 | 18 | build-find-cliques() { 19 | mkdir -p _tmp 20 | # C++ 11 for unordered_{map,set} 21 | cpp-compiler -std=c++0x -o _tmp/find_cliques find_cliques.cc 22 | } 23 | 24 | find-cliques() { 25 | _tmp/find_cliques "$@" 26 | } 27 | 28 | test-bad-edge() { 29 | # Edge should go from lesser partition number to greater 30 | find-cliques < 15 | 16 | 17 | For now, we have collected some useful links. 18 | 19 | Linux 20 | ----- 21 | 22 | * [Myths about /dev/urandom](http://www.2uo.de/myths-about-urandom/) -- Nice 23 | article explaining implementation aspects of `/dev/urandom` and `/dev/random` 24 | on Linux. (Summary: just use `/dev/urandom`, with caveats explained) 25 | 26 | * [LWN on getrandom](http://lwn.net/Articles/606141/) 27 | ([patch](http://lwn.net/Articles/605828/)) -- A very recent addition to the 28 | Linux kernel. As of this writing (11/2014), it's safe to say that very few 29 | applications use it. The relevant change, involving an issue mentioned in 30 | the first link, involves the situation at system boot, when there is little 31 | entropy available. 
32 | 33 | 34 | 36 | 37 | 39 | -------------------------------------------------------------------------------- /bin/hash_candidates_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | hash_candidates_test.py: Tests for hash_candidates.py 19 | """ 20 | 21 | import cStringIO 22 | import unittest 23 | 24 | import rappor 25 | import hash_candidates # module under test 26 | 27 | 28 | STDIN = """\ 29 | apple 30 | banana 31 | carrot 32 | """ 33 | 34 | EXPECTED_CSV_OUT = """\ 35 | apple,5,1,26,26,38,34,63,62\r 36 | banana,12,14,28,24,37,34,62,49\r 37 | carrot,4,12,25,21,48,38,61,54\r 38 | """ 39 | 40 | 41 | class HashCandidatesTest(unittest.TestCase): 42 | 43 | def setUp(self): 44 | self.params = rappor.Params() 45 | self.params.num_bloombits = 16 46 | self.params.num_cohorts = 4 47 | self.params.num_hashes = 2 48 | 49 | def testHash(self): 50 | stdin = cStringIO.StringIO(STDIN) 51 | stdout = cStringIO.StringIO() 52 | 53 | hash_candidates.HashCandidates(self.params, stdin, stdout) 54 | 55 | self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue()) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /bin/README.md: -------------------------------------------------------------------------------- 1 | Command Line Tools 2 | ================== 3 | 4 | This directory contains command line tools for RAPPOR analysis. 5 | 6 | Analysis Tools 7 | -------------- 8 | 9 | ### decode-dist 10 | 11 | Decode a distribution -- requires a "counts" file (summed bits from reports), 12 | map file, and a params file. See `test.sh decode-dist` in this dir for an 13 | example. 14 | 15 | ### decode-assoc 16 | 17 | Decode a joint distribution between 2 variables ("association analysis"). See 18 | `test.sh decode-assoc-R` or `test.sh decode-assoc-cpp` in this dir for an 19 | example. 20 | 21 | Currently it only supports associating strings vs. booleans. 22 | 23 | ### Setup 24 | 25 | Both of these tools are written in R, and require several R libraries to be 26 | installed (see `../setup.sh r-packages`). 27 | 28 | `decode-assoc` also shells out to a native binary written in C++ if 29 | `--em-executable` is passed. This requires a C++ compiler (see 30 | `analysis/cpp/run.sh`). You can run `test.sh decode-assoc-cpp` to test it. 31 | 32 | 33 | Helper Tools 34 | ------------ 35 | 36 | These are simple Python implementations of tools needed for analysis. At 37 | Google, Chrome uses alternative C++/Go implementations of these tools. 38 | 39 | ### sum-bits 40 | 41 | Given a CSV file with RAPPOR reports (IRRs), produce a "counts" CSV file on 42 | stdout. This is the `m x (k+1)` matrix that is used in the R analysis (where m 43 | = #cohorts and k = report width in bits). 
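As a sketch of that layout (illustrative Python, not `sum_bits.py` itself; it
assumes reports arrive as (cohort, bit-string) pairs):

    # counts[cohort] = [number of reports, sum of bit 1, ..., sum of bit k].
    # Note: per sum_bits_test.py, bit order in the report string is reversed.
    def sum_bits(reports, num_cohorts, num_bloombits):
      counts = [[0] * (num_bloombits + 1) for _ in xrange(num_cohorts)]
      for cohort, irr in reports:  # irr is a k-character bit string
        counts[cohort][0] += 1
        for i, bit in enumerate(reversed(irr)):
          counts[cohort][1 + i] += int(bit)
      return counts

    # e.g. sum_bits([(1, '0000111100001111')], 2, 16)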
44 | 45 | ### hash-candidates 46 | 47 | Given a list of candidates on stdin, produce a CSV file of hashes (the "map 48 | file"). Each row has `m x h` cells (where m = #cohorts and h = #hashes) 49 | 50 | See the `regtest.sh` script for examples of how these tools are invoked. 51 | 52 | -------------------------------------------------------------------------------- /ui/overview.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | RAPPOR Results Overview 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 15 |
Association analysis (latest)

Home / Overview / Histograms

RAPPOR Results Overview

Underlying data: overview.csv

Metric Descriptions

Metric Name | Owners | Description
--------------------------------------------------------------------------------
/apps/README.md:
--------------------------------------------------------------------------------
RAPPOR Shiny Apps
=================

This directory contains web apps written using the [Shiny][shiny] web framework
from [RStudio][rstudio].

To run them, first install Shiny:

    $ R
    ...
    > install.packages('shiny')
    ...

(You can view Shiny's platform requirements in
[CRAN](http://cran.r-project.org/web/packages/shiny/index.html).)

Then change to the app directory, and execute the `run_app.sh` script:

    $ cd rappor/apps/rappor-analysis
    $ ./run_app.sh
    ...
    Listening on http://0.0.0.0:6789

Visit http://localhost:6789/ in your browser.

This code has been tested on Ubuntu Linux, but should work on other platforms
that Shiny supports.

Both of these apps use the underlying analysis code in `analysis/R`, just like
the command line demo `demo.sh` does.

rappor-analysis
---------------

This app "decodes" a RAPPOR data set. In other words, you can upload the
`params`, `counts`, and `map` files, and view the inferred distribution, as
well as debug info.

These files are discussed in the RAPPOR [Data Flow][data-flow] doc.

rappor-sim
----------

This app lets you simulate RAPPOR runs with different populations and
parameters. This can help you choose collection parameters for a given
situation / variable.

Help
----

If you need help with these apps, please send a message to
[rappor-discuss][group].


[shiny]: http://shiny.rstudio.com/
[rstudio]: http://rstudio.com/
[data-flow]: http://google.github.io/rappor/doc/data-flow.html
[group]: https://groups.google.com/forum/#!forum/rappor-discuss
--------------------------------------------------------------------------------
/gh-pages/examples/report.html:
--------------------------------------------------------------------------------
RAPPOR Demo

Simulation Input

    Number of clients                     100,000
    Total values reported / obfuscated    700,000
    Unique values reported / obfuscated   50

RAPPOR Parameters

    k   Size of Bloom filter in bits      16
    h   Hash functions in Bloom filter    2
    m   Number of Cohorts                 64
    p   Probability p                     0.5
    q   Probability q                     0.75
    f   Probability f                     0.5
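For reference, the privacy numbers reported alongside these parameters follow
from f, p, q, and h. A sketch of the standard formulas (assumed from the
RAPPOR paper, not code from this repo; the f=0 case matches
apps/rappor-analysis/test.csv, where exp(e_1) = 9):

    import math

    def effective_probs(f, p, q):
      # IRR probabilities after averaging over the PRR noise.
      p_star = 0.5 * f * (p + q) + (1 - f) * p
      q_star = 0.5 * f * (p + q) + (1 - f) * q
      return p_star, q_star

    def epsilon_one(f, p, q, h):
      p_star, q_star = effective_probs(f, p, q)
      odds = (q_star * (1 - p_star)) / (p_star * (1 - q_star))
      return h * math.log(odds)

    # With f=0, p=0.5, q=0.75, h=2:
    # exp(epsilon_one) = ((0.75 * 0.5) / (0.5 * 0.25)) ** 2 = 9.0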
69 | 70 | exponential distribution 71 | gauss distribution 72 | uniform distribution 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /tests/fastrand_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | fastrand_test.py: Tests for _fastrand extension module. 19 | """ 20 | import unittest 21 | 22 | import _fastrand # module under test 23 | 24 | 25 | BIT_WIDTHS = [8, 16, 32, 64] 26 | 27 | 28 | class FastRandTest(unittest.TestCase): 29 | 30 | def testRandbits64(self): 31 | for n in BIT_WIDTHS: 32 | #print '== %d' % n 33 | for p1 in [0.1, 0.5, 0.9]: 34 | #print '-- %f' % p1 35 | for i in xrange(5): 36 | r = _fastrand.randbits(p1, n) 37 | # Rough sanity check 38 | self.assertLess(r, 2 ** n) 39 | 40 | # Visual check 41 | #b = bin(r) 42 | #print b 43 | #print b.count('1') 44 | 45 | 46 | def testRandbits64_EdgeCases(self): 47 | for n in BIT_WIDTHS: 48 | r = _fastrand.randbits(0.0, n) 49 | self.assertEqual(0, r) 50 | 51 | for n in BIT_WIDTHS: 52 | r = _fastrand.randbits(1.0, n) 53 | self.assertEqual(2 ** n - 1, r) 54 | 55 | def testRandbitsError(self): 56 | r = _fastrand.randbits(-1, 64) 57 | # TODO: Should probably raise exceptions 58 | self.assertEqual(None, r) 59 | 60 | r = _fastrand.randbits(0.0, 65) 61 | self.assertEqual(None, r) 62 | 63 | 64 | if __name__ == '__main__': 65 | unittest.main() 66 | -------------------------------------------------------------------------------- /pipeline/README.md: -------------------------------------------------------------------------------- 1 | pipeline 2 | ======== 3 | 4 | This directory contains tools and scripts for running a cron job that does 5 | RAPPOR analysis and generates an HTML dashboard. 6 | 7 | It works like this: 8 | 9 | 1. `task_spec.py` generates a text file where each line corresponds to a process 10 | to be run (a "task"). The process is `bin/decode-dist` or 11 | `bin/decode-assoc`. The line contains the task parameters. 12 | 13 | 2. `xargs -P` is used to run processes in parallel. Our analysis is generally 14 | single-threaded (i.e. because R is single-threaded), so this helps utilize 15 | the machine fully. Each task places its output in a different subdirectory. 16 | 17 | 3. `cook.sh` calls `combine_results.py` to combine analysis results into a time 18 | series. It also calls `combine_status.py` to keep track of task data for 19 | "meta-analysis". `metric_status.R` generates more summary CSV files. 20 | 21 | 4. `ui.sh` calls `csv_to_html.py` to generate an HTML fragments from the CSV 22 | files. 23 | 24 | 5. The JavaScript in `ui/ui.js` is loaded from static HTML, and makes AJAX calls 25 | to retrieve the HTML fragments. The page is made interactive with 26 | `ui/table-lib.js`. 27 | 28 | `dist.sh` and `assoc.sh` contain functions which coordinate this process. 
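A small Python sketch of the fan-out in step 2 (illustrative only -- the real
pipeline shells out to `xargs -P`, and the one-task-per-line spec format here
is simplified):

    # Run one analysis process per task-spec line, in parallel.
    import multiprocessing
    import subprocess

    def run_task(line):
      args = line.split()  # simplified: one task's parameters per line
      return subprocess.call(['bin/decode-dist'] + args)

    def run_all(spec_file, num_procs=8):
      with open(spec_file) as f:
        tasks = [line.strip() for line in f if line.strip()]
      pool = multiprocessing.Pool(num_procs)
      return pool.map(run_task, tasks)  # parallel, like xargs -P 8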
29 | 30 | `alarm-lib.sh` is used to kill processes that have been running for too long. 31 | 32 | Testing 33 | ------- 34 | 35 | `pipeline/regtest.sh` contains end-to-end demos of this process. Right now it 36 | depends on testdata from elsewhere in the tree: 37 | 38 | 39 | rappor$ ./demo.sh run # prepare dist testdata 40 | rappor$ cd bin 41 | 42 | bin$ ./test.sh write-assoc-testdata # prepare assoc testdata 43 | bin$ cd ../pipeline 44 | 45 | pipeline$ ./regtest.sh dist 46 | pipeline$ ./regtest.sh assoc 47 | 48 | pipeline$ python -m SimpleHTTPServer # start a static web server 49 | 50 | http://localhost:8000/_tmp/ 51 | 52 | 53 | -------------------------------------------------------------------------------- /bin/sum_bits_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python -S 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | """ 18 | sum_bits_test.py: Tests for sum_bits.py 19 | """ 20 | 21 | import cStringIO 22 | import unittest 23 | 24 | import rappor 25 | import sum_bits # module under test 26 | 27 | 28 | CSV_IN = """\ 29 | user_id,cohort,bloom,prr,rappor 30 | 5,1,dummy,dummy,0000111100001111 31 | 5,1,dummy,dummy,0000000000111100 32 | """ 33 | 34 | # NOTE: bit order is reversed. 35 | EXPECTED_CSV_OUT = """\ 36 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0\r 37 | 2,1,1,2,2,1,1,0,0,1,1,1,1,0,0,0,0\r 38 | """ 39 | 40 | TOO_MANY_COLUMNS = """\ 41 | user_id,cohort,rappor 42 | 5,1,0000111100001111,extra 43 | """ 44 | 45 | 46 | class SumBitsTest(unittest.TestCase): 47 | 48 | def setUp(self): 49 | self.params = rappor.Params() 50 | self.params.num_bloombits = 16 51 | self.params.num_cohorts = 2 52 | 53 | def testSum(self): 54 | stdin = cStringIO.StringIO(CSV_IN) 55 | stdout = cStringIO.StringIO() 56 | 57 | sum_bits.SumBits(self.params, stdin, stdout) 58 | 59 | self.assertMultiLineEqual(EXPECTED_CSV_OUT, stdout.getvalue()) 60 | 61 | def testErrors(self): 62 | stdin = cStringIO.StringIO(TOO_MANY_COLUMNS) 63 | stdout = cStringIO.StringIO() 64 | 65 | self.assertRaises( 66 | RuntimeError, sum_bits.SumBits, self.params, stdin, stdout) 67 | 68 | 69 | if __name__ == '__main__': 70 | unittest.main() 71 | -------------------------------------------------------------------------------- /bin/hash_candidates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # 3 | # Copyright 2014 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | Given a list of candidates on stdin, produce a file of hashes ("map file").
19 | """
20 |
21 | import csv
22 | import sys
23 |
24 | import rappor
25 |
26 |
27 | def HashCandidates(params, stdin, stdout):
28 |   num_bloombits = params.num_bloombits
29 |   csv_out = csv.writer(stdout)
30 |
31 |   for line in stdin:
32 |     word = line.strip()
33 |     row = [word]
34 |     for cohort in xrange(params.num_cohorts):
35 |       bloom_bits = rappor.get_bloom_bits(word, cohort, params.num_hashes,
36 |                                          num_bloombits)
37 |       for bit_to_set in bloom_bits:
38 |         # Bits are indexed from 1.  Add a fixed offset for each cohort.
39 |         # NOTE: This detail could be omitted from the map file format, and done
40 |         # in R.
41 |         row.append(cohort * num_bloombits + (bit_to_set + 1))
42 |     csv_out.writerow(row)
43 |
44 |
45 | def main(argv):
46 |   try:
47 |     filename = argv[1]
48 |   except IndexError:
49 |     raise RuntimeError('Usage: hash_candidates.py <params file>')
50 |   with open(filename) as f:
51 |     try:
52 |       params = rappor.Params.from_csv(f)
53 |     except rappor.Error as e:
54 |       raise RuntimeError(e)
55 |
56 |   HashCandidates(params, sys.stdin, sys.stdout)
57 |
58 |
59 | if __name__ == '__main__':
60 |   try:
61 |     main(sys.argv)
62 |   except RuntimeError, e:
63 |     print >>sys.stderr, e.args[0]
64 |     sys.exit(1)
65 |
--------------------------------------------------------------------------------
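To make the map-file row format concrete: with k = 16 bits per cohort
(num_bloombits) and h = 2 hashes, suppose a candidate "foo.com" hashes to
bits (3, 7) in cohort 0 and bits (2, 12) in cohort 1 (these hash outputs are
made up for illustration).  Using the 1-based indexing and per-cohort offset
above, the emitted CSV row would be:

    foo.com,4,8,19,29

since 0*16 + (3+1) = 4, 0*16 + (7+1) = 8, 1*16 + (2+1) = 19, and
1*16 + (12+1) = 29.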

/gh-pages/doc/randomness.html: --------------------------------------------------------------------------------

Generating Random Bits for RAPPOR

To ensure privacy, an application using RAPPOR must generate random bits in an
unpredictable manner.  In other words, an adversary that can predict the
sequence of random bits used can determine the true values being reported.

Generating random numbers is highly platform-specific -- even
language-specific.  So, libraries implementing RAPPOR should be parameterized
by an interface to generate random bits.  (This can be thought of as
"dependency injection".)

For now, we have collected some useful links.

Linux

  • Myths about /dev/urandom -- Nice article explaining implementation
    aspects of /dev/urandom and /dev/random on Linux.  (Summary: just use
    /dev/urandom, with caveats explained)

  • LWN on getrandom (patch) -- A very recent addition to the Linux kernel.
    As of this writing (11/2014), it's safe to say that very few applications
    use it.  The relevant change, involving an issue mentioned in the first
    link, involves the situation at system boot, when there is little entropy
    available.
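As an illustration of such an injected interface, here is a minimal Python
sketch (an assumption for illustration, not an API from this repository) that
draws each IRR bit as Bernoulli(prob) from the kernel's CSPRNG:

    import os
    import struct

    class UrandomIrrRand(object):
        """Sketch: IRR bitmask from os.urandom, one Bernoulli(prob) per bit."""

        def get_mask(self, prob, num_bits):
            mask = 0
            for i in xrange(num_bits):
                # 4 random bytes -> uniform 32-bit integer -> threshold test,
                # the same trick used by tests/_fastrand.c below.
                r = struct.unpack('I', os.urandom(4))[0]
                bit = r < prob * 2 ** 32
                mask |= (bit << i)
            return mask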
-------------------------------------------------------------------------------- /client/cpp/run.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Usage:
4 | #   ./run.sh
5 |
6 | set -o nounset
7 | set -o pipefail
8 | set -o errexit
9 |
10 | setup() {
11 |   # need libprotobuf-dev for headers to compile against.
12 |   sudo apt-get install protobuf-compiler libprotobuf-dev
13 |
14 |   # OpenSSL dev headers
15 |   sudo apt-get install libssl-dev
16 | }
17 |
18 | init() {
19 |   mkdir --verbose -p _tmp
20 | }
21 |
22 | rappor-sim() {
23 |   make _tmp/rappor_sim
24 |   _tmp/rappor_sim "$@"
25 | }
26 |
27 | protobuf-encoder-demo() {
28 |   make _tmp/protobuf_encoder_demo
29 |   _tmp/protobuf_encoder_demo "$@"
30 | }
31 |
32 | rappor-sim-demo() {
33 |   rappor-sim 16 2 128 0.25 0.75 0.5 <
-------------------------------------------------------------------------------- /client/cpp/encoder_demo.cc: --------------------------------------------------------------------------------
20 | #include <cassert>  // assert
21 |
22 | #include "encoder.h"
23 | #include "openssl_hash_impl.h"
24 | #include "unix_kernel_rand_impl.h"
25 |
26 | int main(int argc, char** argv) {
27 |   // Suppress unused variable warnings
28 |   (void) argc;
29 |   (void) argv;
30 |
31 |   FILE* fp = fopen("/dev/urandom", "r");
32 |   rappor::UnixKernelRand irr_rand(fp);
33 |
34 |   rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256,
35 |                     irr_rand);
36 |   rappor::Params params(32,    // num_bits (k)
37 |                         2,     // num_hashes (h)
38 |                         128,   // num_cohorts (m)
39 |                         0.25,  // probability f for PRR
40 |                         0.75,  // probability p for IRR
41 |                         0.5);  // probability q for IRR
42 |
43 |   const char* encoder_id = "metric-name";
44 |   rappor::Encoder encoder(encoder_id, params, deps);
45 |
46 |   // Now use it to encode values.  The 'out' value can be sent over the
47 |   // network.
48 |   rappor::Bits out;
49 |   assert(encoder.EncodeString("foo", &out));  // returns false on error
50 |   printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
51 |
52 |   // Raw bits
53 |   assert(encoder.EncodeBits(0x123, &out));  // returns false on error
54 |   printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort());
55 | }
56 |
-------------------------------------------------------------------------------- /doc/data-flow.dot: --------------------------------------------------------------------------------
1 | // Based on http://graphviz.org/content/cluster
2 |
3 | // Node types:
4 | //   Rectangle: process
5 | //   Oval: data
6 | //   Diamond: debug/simulation data
7 |
8 | digraph G {
9 |   //rankdir="LR";  // left to right layout
10 |
11 |   // http://www.graphviz.org/content/color-names
12 |   colorscheme=pastel13;
13 |
14 |   subgraph cluster_0 {
15 |     graph [ fontsize=24 ];
16 |     label = "Reporting";
17 |     style=filled;
18 |     color=2;
19 |
20 |     node [style=filled, color=white, fontsize=12];
21 |
22 |     gen_sim_input -> dist_csv -> rappor_sim;
23 |
24 |     rappor_sim -> out;
25 |     rappor_sim -> params;
26 |     rappor_sim -> hist;
27 |     rappor_sim -> true_inputs;
28 |
29 |     // Process
30 |     rappor_sim [label="rappor_sim"];
31 |
32 |     // Data
33 |     dist_csv [shape=box, label="dist.csv"];
34 |     out [shape=box, label="dist_out.csv"];
35 |     params [shape=box, label="dist_params.csv"];
36 |
37 |     // simulation data
38 |     hist [shape=box, style=dotted, color=black, label="dist_hist.csv"];
39 |     true_inputs [shape=box, style=dotted, color=black, label="dist_true_inputs.txt"];
40 |   }
41 |
42 |   subgraph cluster_1 {
43 |     graph [ fontsize=24 ];
44 |     label = "Analysis";
45 |     style = filled;
46 |     color=3;
47 |
48 |     node [style=filled, color=white, fontsize=12];
49 |
50 |     sum_bits -> counts;
51 |
52 |     // sum_bits needs the params to construct the matrix.  Technically it could
53 |     // infer it, but this is simple.
54 |     params -> sum_bits;
55 |
56 |     // only in the simulation
57 |     true_inputs -> demo_sh -> candidates [style=dotted];
58 |
59 |     candidates -> hash_candidates -> map;
60 |     params -> hash_candidates;
61 |
62 |     params -> analyze;
63 |     map -> analyze;
64 |     counts -> analyze;
65 |     hist -> analyze [style=dotted];  // only for comparison
66 |
67 |     analyze -> plot_png;
68 |
69 |     // Processes
70 |     analyze [label="analyze.R"];
71 |     demo_sh [label="demo.sh", style=dotted, color=black];
72 |
73 |     // Data
74 |     counts [shape=box, label="dist_count.csv"];
75 |     candidates [shape=box, label="dist_candidates.txt"];
76 |     map [shape=box, label="dist_map.csv"];
77 |
78 |     plot_png [shape=box, label="dist.png"];
79 |
80 |   }
81 |
82 |   out -> sum_bits;
83 | }
84 |
-------------------------------------------------------------------------------- /demo.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Demo of RAPPOR, automating the Python and R scripts.  See README.
4 | #
5 | # Usage:
6 | #   ./demo.sh [function name]
7 | #
8 | # End-to-end demo of RAPPOR.  Notable functions include:
9 | #   quick-python: Runs a demo using the Python client
10 | #   quick-cpp: Runs a demo using the C++ client
11 | # If no function is specified, the above two will be run consecutively.
12 | #
13 | # This takes a minute or so.  It runs a subset of tests from regtest.sh and
14 | # writes an HTML summary.
15 |
16 | set -o nounset
17 | set -o pipefail
18 | set -o errexit
19 |
20 | . util.sh
21 |
22 | readonly THIS_DIR=$(dirname $0)
23 | readonly REPO_ROOT=$THIS_DIR
24 | readonly CLIENT_DIR=$REPO_ROOT/client/python
25 |
26 | # All the Python tools need this
27 | export PYTHONPATH=$CLIENT_DIR
28 |
29 | #
30 | # Semi-automated demos
31 | #
32 |
33 | # Run rappor-sim through the Python profiler.
34 | rappor-sim-profile() {
35 |   local dist=$1
36 |   shift
37 |
38 |   # For now, just dump it to a text file.  Sort by cumulative time.
39 |   time python -m cProfile -s cumulative \
40 |     tests/rappor_sim.py \
41 |     -i _tmp/$dist.csv \
42 |     "$@" \
43 |     | tee _tmp/profile.txt
44 | }
45 |
46 | quick-python() {
47 |   ./regtest.sh run-seq '^demo3' python
48 | }
49 |
50 | quick-cpp() {
51 |   # For now we build it first.  Don't want to build it in parallel.
52 |   ./build.sh cpp-client
53 |
54 |   ./regtest.sh run-seq '^demo3' cpp
55 | }
56 |
57 | quick() {
58 |   quick-python
59 |   quick-cpp
60 | }
61 |
62 | # TODO: Port these old bad cases to regtest_spec.py.
63 |
64 | # Running the demo of the exponential distribution with 10000 reports (x7,
65 | # which is 70000 values).
66 | #
67 | # - There are 50 real values, but we add 1000 more candidates, to get 1050 candidates.
68 | # - And then we remove the two most common strings, v1 and v2.
69 | # - With the current analysis, we are getting sum(proportion) = 1.1 to 1.7
70 |
71 | # TODO: Make this sharper by including only one real value?
72 |
73 | bad-case() {
74 |   local num_additional=${1:-1000}
75 |   run-dist exp 10000 $num_additional 'v1|v2'
76 | }
77 |
78 | # Force it to be less than 1
79 | pcls-test() {
80 |   USE_PCLS=1 bad-case
81 | }
82 |
83 | # Only add 10 more candidates.  Then we properly get the 0.48 proportion.
84 | ok-case() {
85 |   run-dist exp 10000 10 'v1|v2'
86 | }
87 |
88 | if test $# -eq 0 ; then
89 |   quick
90 | else
91 |   "$@"
92 | fi
93 |
-------------------------------------------------------------------------------- /ui/metric.html: --------------------------------------------------------------------------------

Metric Results

Home / Overview / Histograms

Estimated Proportions

NOTE: Only the top 5 values for each day are shown

Underlying data: dist.csv

Number of Reports

Unallocated Mass

Plot Help: Drag horizontally to zoom to selection.  Double click to zoom out.
Shift + drag to pan.

Task Status

Underlying data: status.csv
-------------------------------------------------------------------------------- /bin/sum_bits.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | Read the RAPPOR'd values on stdin, and sum the bits to produce a Counting Bloom
19 | filter by cohort.  This can then be analyzed by R.
20 | """
21 |
22 | import csv
23 | import sys
24 |
25 | import rappor
26 |
27 |
28 | def SumBits(params, stdin, stdout):
29 |   csv_in = csv.reader(stdin)
30 |   csv_out = csv.writer(stdout)
31 |
32 |   num_cohorts = params.num_cohorts
33 |   num_bloombits = params.num_bloombits
34 |
35 |   sums = [[0] * num_bloombits for _ in xrange(num_cohorts)]
36 |   num_reports = [0] * num_cohorts
37 |
38 |   for i, row in enumerate(csv_in):
39 |     try:
40 |       (user_id, cohort, unused_bloom, unused_prr, irr) = row
41 |     except ValueError:
42 |       raise RuntimeError('Error parsing row %r' % row)
43 |
44 |     if i == 0:
45 |       continue  # skip header
46 |
47 |     cohort = int(cohort)
48 |     num_reports[cohort] += 1
49 |
50 |     if not len(irr) == params.num_bloombits:
51 |       raise RuntimeError(
52 |           "Expected %d bits, got %r" % (params.num_bloombits, len(irr)))
53 |     for j, c in enumerate(irr):
54 |       bit_num = num_bloombits - j - 1  # e.g. char 0 = bit 15, char 15 = bit 0
55 |       if c == '1':
56 |         sums[cohort][bit_num] += 1
57 |       else:
58 |         if c != '0':
59 |           raise RuntimeError('Invalid IRR -- digits should be 0 or 1')
60 |
61 |   for cohort in xrange(num_cohorts):
62 |     # First column is the total number of reports in the cohort.
63 |     row = [num_reports[cohort]] + sums[cohort]
64 |     csv_out.writerow(row)
65 |
66 |
67 | def main(argv):
68 |   try:
69 |     filename = argv[1]
70 |   except IndexError:
71 |     raise RuntimeError('Usage: sum_bits.py <params file>')
72 |   with open(filename) as f:
73 |     try:
74 |       params = rappor.Params.from_csv(f)
75 |     except rappor.Error as e:
76 |       raise RuntimeError(e)
77 |
78 |   SumBits(params, sys.stdin, sys.stdout)
79 |
80 |
81 | if __name__ == '__main__':
82 |   try:
83 |     main(sys.argv)
84 |   except RuntimeError, e:
85 |     print >>sys.stderr, e.args[0]
86 |     sys.exit(1)
87 |
--------------------------------------------------------------------------------
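The reversed bit order is worth a worked example: the IRR string is written
most-significant bit first, so character 0 corresponds to bit k-1 and the
last character to bit 0.  A quick check in Python, using the first report
from sum_bits_test.py (k = 16):

    irr = '0000111100001111'
    k = len(irr)
    set_bits = [k - j - 1 for j, c in enumerate(irr) if c == '1']
    print sorted(set_bits)   # => [0, 1, 2, 3, 8, 9, 10, 11]

so this report increments the sums for bits 0-3 and 8-11, which matches the
low-order columns in EXPECTED_CSV_OUT above.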
-------------------------------------------------------------------------------- /client/cpp/rappor_deps.h: --------------------------------------------------------------------------------
1 | // Copyright 2015 Google Inc. All rights reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // This header declares the dependencies that the application must provide to
16 | // the RAPPOR encoder.
17 |
18 | #ifndef RAPPOR_DEPS_H_
19 | #define RAPPOR_DEPS_H_
20 |
21 | #include <stdint.h>  // for uint32_t
22 | #include <string>
23 | #include <vector>
24 |
25 | namespace rappor {
26 |
27 | // rappor::Bits type is used for Bloom Filter, PRR, and IRR
28 | typedef uint32_t Bits;
29 |
30 | // rappor::Encoder needs a hash function for the bloom filter, and an HMAC
31 | // function for the PRR.
32 |
33 | typedef bool HashFunc(const std::string& value, std::vector<uint8_t>* output);
34 | typedef bool HmacFunc(const std::string& key, const std::string& value,
35 |                       std::vector<uint8_t>* output);
36 |
37 | // Interface that the encoder uses to generate randomness for the IRR.
38 | // Applications should implement this based on their platform and requirements.
39 | class IrrRandInterface {
40 |  public:
41 |   virtual ~IrrRandInterface() {}
42 |   // Compute a bitmask with each bit set to 1 with probability 'prob'.
43 |   // Returns false if there is an error.
44 |   virtual bool GetMask(float prob, int num_bits, Bits* mask_out) const = 0;
45 | };
46 |
47 | // Dependencies
48 | //   - hash_func: hash function for the Bloom Filter client step
49 | //   - client_secret: key for deterministic randomness in the PRR
50 | //   - hmac_func: function for deterministic randomness in the PRR
51 | //   - irr_rand: randomness for the IRR
52 |
53 | class Deps {
54 |  public:
55 |   Deps(HashFunc* const hash_func, const std::string& client_secret,
56 |        HmacFunc* const hmac_func, const IrrRandInterface& irr_rand)
57 |       : hash_func_(hash_func),
58 |         client_secret_(client_secret),
59 |         hmac_func_(hmac_func),
60 |         irr_rand_(irr_rand) {
61 |   }
62 |
63 |  private:
64 |   friend class Encoder;
65 |
66 |   HashFunc* hash_func_;  // for bloom filter
67 |   const std::string client_secret_;  // for PRR; copy of constructor param
68 |   HmacFunc* hmac_func_;  // PRR
69 |   const IrrRandInterface& irr_rand_;  // IRR
70 | };
71 |
72 | }  // namespace rappor
73 |
74 | #endif  // RAPPOR_DEPS_H_
75 |
76 |
-------------------------------------------------------------------------------- /setup.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Setup RAPPOR analysis on Ubuntu Trusty (Google Cloud or otherwise).
4 | #
5 | # For the apps/api server, you need 'install-minimal'.  For the regtest, and
6 | # Shiny apps, we need a few more R packages (ggplot2, data.table, etc.).  They
7 | # cause versioning problems, so we keep them separate.
8 | #
9 | # Usage:
10 | #   ./setup.sh [function name]
11 | # If run without specifying any function it will run: install-most
12 | # which should cover all the packages needed to run the demo.
13 |
14 | set -o nounset
15 | set -o pipefail
16 | set -o errexit
17 |
18 | native-packages() {
19 |   sudo apt-get update
20 |   # - build-essential for gcc compilers, invoked while installing R packages.
21 |   # - gfortran Fortran compiler needed for glmnet.
22 |   # - libblas-dev needed for limSolve.
23 |   # - python-dev is for building the fastrand extension
24 |   #
25 |   # NOTE: we get R 3.0.2 on Trusty.
26 |   sudo apt-get install build-essential gfortran libblas-dev r-base python-dev graphviz
27 | }
28 |
29 | r-packages() {
30 |   # Install as root so you can write to /usr/local/lib/R.
31 |
32 |   # glmnet, limSolve: solvers for decode.R
33 |   # RJSONIO, optparse: for decode_dist.R
34 |   # RUnit: for unit tests
35 |   # abind: for decode_test only
36 |   sudo R -e \
37 |     'install.packages(c("glmnet", "optparse", "limSolve", "RUnit", "abind", "RJSONIO"), repos="http://cran.rstudio.com/")'
38 | }
39 |
40 | # R 3.0.2 on Trusty is out of date with CRAN, so we need this workaround.
41 | install-plyr-with-friends() {
42 |   mkdir -p _tmp
43 |   wget --directory _tmp \
44 |     http://cran.r-project.org/src/contrib/Archive/Rcpp/Rcpp_0.11.4.tar.gz
45 |   wget --directory _tmp \
46 |     http://cran.r-project.org/src/contrib/Archive/plyr/plyr_1.8.1.tar.gz
47 |   sudo R CMD INSTALL _tmp/Rcpp_0.11.4.tar.gz
48 |   sudo R CMD INSTALL _tmp/plyr_1.8.1.tar.gz
49 |   sudo R -e \
50 |     'install.packages(c("reshape2", "ggplot2", "data.table"), repos="http://cran.rstudio.com/")'
51 | }
52 |
53 | # Keep Shiny separate, since it seems to install a lot of dependencies.
54 | shiny() {
55 |   sudo R -e \
56 |     'install.packages(c("shiny"), repos="http://cran.rstudio.com/")'
57 | }
58 |
59 | #
60 | # Batch
61 | #
62 |
63 | install-minimal() {
64 |   native-packages
65 |   r-packages
66 | }
67 |
68 | # NOTE: hasn't yet been tested on a clean machine.
69 | install-most() {
70 |   install-minimal
71 |   install-plyr-with-friends
72 | }
73 |
74 | #
75 | # Shiny Apps / API Server
76 | #
77 |
78 | # After running one of the run_app.sh scripts, see if the app returns a page.
79 | shiny-smoke-test() {
80 |   curl http://localhost:6789/
81 | }
82 |
83 | # Then set up a "firewall rule" in console.developers.google.com to open up
84 | # "tcp:6789".  Test it from the outside.
85 |
86 | if test $# -eq 0 ; then
87 |   install-most
88 | else
89 |   "$@"
90 | fi
91 |
-------------------------------------------------------------------------------- /tests/gen_true_values.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | #
3 | # Copyright 2015 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | source('tests/gen_counts.R')
18 |
19 | # Usage:
20 | #   $ ./gen_true_values.R exp 100 10000 1 8 foo.csv
21 | #
22 | # Inputs:
23 | #   distribution name
24 | #   size of the distribution's support
25 | #   number of clients
26 | #   reports per client
27 | #   number of cohorts
28 | #   name of the output file
29 | # Output:
30 | #   csv file with reports sampled according to the specified distribution.
31 |
32 | GenerateTrueValues <- function(distr, distr_range, num_clients,
33 |                                reports_per_client, num_cohorts) {
34 |
35 |   # Sums to 1.0, e.g. [0.2 0.2 0.2 0.2 0.2] for uniform distribution of 5.
36 |   pdf <- ComputePdf(distr, distr_range)
37 |
38 |   num_reports <- num_clients * reports_per_client
39 |
40 |   # Computes the number of reports for each value, where the numbers are
41 |   # sampled according to pdf (sums to num_reports).
42 |   partition <- RandomPartition(num_reports, pdf)
43 |
44 |   value_ints <- rep(1:distr_range, partition)  # expand partition
45 |
46 |   stopifnot(length(value_ints) == num_reports)
47 |
48 |   # Shuffle values randomly (may take a few sec for > 10^8 inputs)
49 |   value_ints <- sample(value_ints)
50 |
51 |   # Reported values are strings, so prefix integers with "v".  Even slower than
52 |   # shuffling.
53 |   values <- sprintf("v%d", value_ints)
54 |
55 |   # e.g. [1 1 2 2 3 3] if num_clients is 3 and reports_per_client is 2
56 |   client_ints <- rep(1:num_clients, each = reports_per_client)
57 |
58 |   # Cohorts are assigned to clients.  Cohorts are 0-based.
59 |   cohorts <- client_ints %% num_cohorts  # %% is integer modulus
60 |
61 |   clients <- sprintf("c%d", client_ints)
62 |
63 |   data.frame(client = clients, cohort = cohorts, value = values)
64 | }
65 |
66 | main <- function(argv) {
67 |   distr <- argv[[1]]
68 |   distr_range <- as.integer(argv[[2]])
69 |   num_clients <- as.integer(argv[[3]])
70 |   reports_per_client <- as.integer(argv[[4]])
71 |   num_cohorts <- as.integer(argv[[5]])
72 |   out_file <- argv[[6]]
73 |
74 |   reports <- GenerateTrueValues(distr, distr_range, num_clients,
75 |                                 reports_per_client, num_cohorts)
76 |
77 |   write.csv(reports, file = out_file, row.names = FALSE, quote = FALSE)
78 | }
79 |
80 | if (length(sys.frames()) == 0) {
81 |   main(commandArgs(TRUE))
82 | }
83 |
--------------------------------------------------------------------------------
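For readers more comfortable in Python, here is the same generation scheme as
a short sketch.  It is an approximation for illustration only: RandomPartition
itself lives in tests/gen_counts.R, and is replaced here by independent
weighted draws per report:

    import random

    def generate_true_values(pdf, num_clients, reports_per_client, num_cohorts):
        """pdf: list of probabilities, one per value v1..vN (sums to 1.0)."""
        num_reports = num_clients * reports_per_client
        support = range(1, len(pdf) + 1)
        value_ints = [weighted_choice(support, pdf) for _ in xrange(num_reports)]
        random.shuffle(value_ints)
        rows = []
        for i in xrange(num_reports):
            client = i // reports_per_client + 1   # 1-based client id
            cohort = client % num_cohorts          # 0-based cohort, as in the R
            rows.append(('c%d' % client, cohort, 'v%d' % value_ints[i]))
        return rows

    def weighted_choice(values, weights):
        r = random.random()
        cum = 0.0
        for v, w in zip(values, weights):
            cum += w
            if r < cum:
                return v
        return values[-1]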
-------------------------------------------------------------------------------- /pipeline/alarm-lib.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Alarm tool.
4 | #
5 | # Usage:
6 | #   ./alarm-lib.sh
7 | #
8 | # You can source this file and use the alarm-status function.
9 |
10 | set -o nounset
11 | set -o pipefail
12 | set -o errexit
13 |
14 | # Run a command with a timeout, and print its status to a directory.
15 | #
16 | # Usage:
17 | #   alarm-status job_dir/STATUS 10 \
18 | #     flaky_command ...
19 |
20 | alarm-status() {
21 |   set +o errexit
22 |   local status_file=$1
23 |   shift  # everything except the status file goes to perl
24 |
25 |   # NOTE: It would be nice to setpgrp() before exec?  And then can the signal
26 |   # be delivered to the entire group, like kill -SIGALRM -PID?
27 |
28 |   # NOTE: If we did this in Python, the error message would also be clearer.
29 |   perl -e 'alarm shift; exec @ARGV or die "ERROR: after exec @ARGV"' "$@"
30 |   local exit_code=$?
31 |
32 |   set -o errexit
33 |
34 |   local result=''
35 |   case $exit_code in
36 |     0)
37 |       # Would be nice to show elapsed time?
38 |       result='OK'
39 |       ;;
40 |     9)
41 |       # decode_assoc.R will exit 9 if there are no reports AFTER
42 |       # --remove-bad-rows.  A task can also be marked SKIPPED before running
43 |       # the child process (see backfill.sh).
44 |       result='SKIPPED by child process'
45 |       ;;
46 |     # exit code 142 means SIGALRM.  128 + 14 = 142.  See 'kill -l'.
47 |     142)
48 |       local seconds=$1
49 |       result="TIMEOUT after $seconds seconds"
50 |       ;;
51 |     *)
52 |       result="FAIL with status $exit_code"
53 |       ;;
54 |   esac
55 |   echo "$result"
56 |   echo "$result" > $status_file
57 | }
58 |
59 | _work() {
60 |   local n=10  # 2 seconds
61 |   for i in $(seq $n); do
62 |     echo $i - "$@"
63 |     sleep 0.2
64 |   done
65 | }
66 |
67 | _succeed() {
68 |   _work "$@"
69 |   exit 0
70 | }
71 |
72 | _fail() {
73 |   _work "$@"
74 |   exit 1
75 | }
76 |
77 | _skip() {
78 |   exit 9
79 | }
80 |
81 | # http://perldoc.perl.org/functions/alarm.html
82 | #
83 | # Delivers alarm.  But how to get the process to have a distinct exit code?
84 |
85 | demo() {
86 |   mkdir -p _tmp
87 |
88 |   # timeout
89 |   alarm-status _tmp/A 1 $0 _succeed foo
90 |   echo
91 |
92 |   # ok
93 |   alarm-status _tmp/B 3 $0 _succeed bar
94 |   echo
95 |
96 |   # fail
97 |   alarm-status _tmp/C 3 $0 _fail baz
98 |   echo
99 |
100 |   # skip
101 |   alarm-status _tmp/D 3 $0 _skip baz
102 |   echo
103 |
104 |   head _tmp/{A,B,C,D}
105 | }
106 |
107 | test-simple() {
108 |   alarm-status _tmp/status.txt 1 sleep 2
109 | }
110 |
111 | test-bad-command() {
112 |   alarm-status _tmp/status.txt 1 nonexistent_sleep 2
113 | }
114 |
115 | # BUG
116 | test-perl() {
117 |   set +o errexit
118 |   perl -e 'alarm shift; exec @ARGV or die "ERROR after exec @ARGV"' 1 _sleep 2
119 |   echo $?
120 | }
121 |
122 | if test $(basename $0) = 'alarm-lib.sh'; then
123 |   "$@"
124 | fi
125 |
-------------------------------------------------------------------------------- /analysis/R/alternative.R: --------------------------------------------------------------------------------
1 | # Copyright 2014 Google Inc. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | library(limSolve)
16 | library(Matrix)
17 |
18 | # The next two functions create a matrix (G) and a vector (H) encoding
19 | # linear inequality constraints that a solution vector (x) must satisfy:
20 | #     G * x >= H
21 |
22 | # They currently represent three sets of constraints on the solution vector:
23 | #   - all solution coefficients are nonnegative
24 | #   - the sum total of all solution coefficients is no more than 1
25 | #   - in each of the coordinates of the target vector (estimated Bloom filter)
26 | #     we don't overshoot by more than three standard deviations.
27 | MakeG <- function(n, X) {
28 |   d <- Diagonal(n)
29 |   last <- rep(-1, n)
30 |   rbind2(rbind2(d, last), -X)
31 | }
32 |
33 | MakeH <- function(n, Y, stds) {
34 |   # set the floor at 0.01 to avoid degenerate cases
35 |   YY <- apply(Y + 3 * stds,  # in each bin don't overshoot by more than 3 stds
36 |               1:2,
37 |               function(x) min(1, max(0.01, x)))  # clamp the bound to [0.01,1]
38 |
39 |   c(rep(0, n),  # non-negativity condition
40 |     -1,  # coefficients sum up to no more than 1
41 |     -as.vector(t(YY))  # t is important!
42 |   )
43 | }
44 |
45 | MakeLseiModel <- function(X, Y, stds) {
46 |   m <- dim(X)[1]
47 |   n <- dim(X)[2]
48 |
49 |   # no slack variables for now
50 |   # slack <- Matrix(FALSE, nrow = m, ncol = m, sparse = TRUE)
51 |   # colnames(slack) <- 1:m
52 |   # diag(slack) <- TRUE
53 |   #
54 |   # G <- MakeG(n + m)
55 |   # H <- MakeH(n + m)
56 |   #
57 |   # G[n+m+1,n:(n+m)] <- -0.1
58 |   # A = cbind2(X, slack)
59 |
60 |   w <- as.vector(t(1 / stds))
61 |   w_median <- median(w[!is.infinite(w)])
62 |   if (is.na(w_median))  # all w are infinite
63 |     w_median <- 1
64 |   w[w > w_median * 2] <- w_median * 2
65 |   w <- w / mean(w)
66 |
67 |   list(# coerce sparse Boolean matrix X to sparse numeric matrix
68 |        A = Diagonal(x = w) %*% (X + 0),
69 |        B = as.vector(t(Y)) * w,  # transform to vector in the row-first order
70 |        G = MakeG(n, X),
71 |        H = MakeH(n, Y, stds),
72 |        type = 2)  # Since there are no equality constraints, lsei defaults to
73 |                   # solve.QP anyway, but outputs a warning unless type == 2.
74 | }
75 |
76 | # CustomLM(X, Y)
77 | ConstrainedLinModel <- function(X, Y) {
78 |   model <- MakeLseiModel(X, Y$estimates, Y$stds)
79 |   coefs <- do.call(lsei, model)$X
80 |   names(coefs) <- colnames(X)
81 |
82 |   coefs
83 | }
--------------------------------------------------------------------------------
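Written out, the constraint system that MakeG and MakeH build is (with n
candidates, target vector Y flattened row-first as in the code, and per-bin
standard deviations s):

    \[
    G = \begin{pmatrix} I_n \\ -\mathbf{1}^\top \\ -X \end{pmatrix},
    \qquad
    H = \begin{pmatrix} \mathbf{0}_n \\ -1 \\
        -\,\operatorname{clamp}(Y + 3s,\ 0.01,\ 1) \end{pmatrix},
    \]

so that \( Gx \ge H \) unpacks to the three constraint sets from the comments:
\( x \ge 0 \), \( \sum_i x_i \le 1 \), and
\( Xx \le \operatorname{clamp}(Y + 3s,\ 0.01,\ 1) \).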

/tests/regtest.html: --------------------------------------------------------------------------------

RAPPOR regtest.sh (_IMPL_)

Parent
-------------------------------------------------------------------------------- /tests/_fastrand.c: --------------------------------------------------------------------------------
1 | /*
2 | Copyright 2014 Google Inc. All rights reserved.
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 |     http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | /*
18 |  * _fastrand.c -- Python extension module to generate random bit vectors
19 |  * quickly.
20 |  *
21 |  * IMPORTANT: This module does not use cryptographically strong randomness.
22 |  * It should ONLY be used to speed up the simulation.  Don't use it in
23 |  * production.
24 |  *
25 |  * If an adversary can predict which random bits are flipped, then RAPPOR's
26 |  * privacy is compromised.
27 |  *
28 |  */
29 |
30 | #include <stdint.h>  // uint64_t
31 | #include <stdio.h>  // printf
32 | #include <stdlib.h>  // srand
33 | #include <time.h>  // time
34 |
35 | #include <Python.h>
36 |
37 | uint64_t randbits(float p1, int num_bits) {
38 |   uint64_t result = 0;
39 |   // RAND_MAX is the maximum int returned by rand().
40 |   //
41 |   // When p1 == 1.0, we want to guarantee that all bits are 1.  The threshold
42 |   // will be RAND_MAX + 1.  In the rare case that rand() returns RAND_MAX, the
43 |   // "<" test succeeds, so we get 1.
44 |   //
45 |   // When p1 == 0.0, we want to guarantee that all bits are 0.  The threshold
46 |   // will be 0.  In the rare case that rand() returns 0, the "<" test fails, so
47 |   // we get 0.
48 |
49 |   // NOTE: cast is necessary to do unsigned arithmetic rather than signed.
50 |   // RAND_MAX is an int so adding 1 won't overflow a uint64_t.
51 |   uint64_t max = (uint64_t)RAND_MAX + 1u;
52 |   uint64_t threshold = p1 * max;
53 |   int i;
54 |   for (i = 0; i < num_bits; ++i) {
55 |     // NOTE: The comparison is strict (<) against threshold = p1 * (RAND_MAX + 1),
56 |     // so p1 = 1.0 implies that the bit is ALWAYS set, and p1 = 0.0 that it never is.
57 |     uint64_t bit = (rand() < threshold);
58 |     result |= (bit << i);
59 |   }
60 |   return result;
61 | }
62 |
63 | static PyObject *
64 | func_randbits(PyObject *self, PyObject *args) {
65 |   float p1;
66 |   int num_bits;
67 |
68 |   if (!PyArg_ParseTuple(args, "fi", &p1, &num_bits)) {
69 |     return NULL;
70 |   }
71 |   if (p1 < 0.0 || p1 > 1.0) {
72 |     printf("p1 must be between 0.0 and 1.0\n");
73 |     // return None for now; easier than raising ValueError
74 |     Py_INCREF(Py_None);
75 |     return Py_None;
76 |   }
77 |   if (num_bits < 0 || num_bits > 64) {
78 |     printf("num_bits must be 64 or less\n");
79 |     // return None for now; easier than raising ValueError
80 |     Py_INCREF(Py_None);
81 |     return Py_None;
82 |   }
83 |
84 |   //printf("p: %f\n", p);
85 |   uint64_t r = randbits(p1, num_bits);
86 |   return PyLong_FromUnsignedLongLong(r);
87 | }
88 |
89 | PyMethodDef methods[] = {
90 |   {"randbits", func_randbits, METH_VARARGS,
91 |    "Return a number with N bits, where each bit is 1 with probability p."},
92 |   {NULL, NULL},
93 | };
94 |
95 | void init_fastrand(void) {
96 |   Py_InitModule("_fastrand", methods);
97 |
98 |   // Just seed it here; we don't give the application any control.
99 |   int seed = time(NULL);
100 |   srand(seed);
101 | }
102 |
--------------------------------------------------------------------------------
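The threshold trick is easier to see in a pure-Python rendering of the same
function (for clarity only; the C extension exists because this loop is hot
in the simulation):

    import random

    def randbits(p1, num_bits):
        # Set each of num_bits bits independently with probability p1.
        result = 0
        for i in xrange(num_bits):
            bit = random.random() < p1  # like rand() < p1 * (RAND_MAX + 1)
            result |= (bit << i)
        return result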
/tests/regtest_spec.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """Print a test spec on stdout.
3 |
4 | Each line has parameters for a test case.  The regtest.sh shell script reads
5 | these lines and runs parallel processes.
6 |
7 | We use Python data structures so the test cases are easier to read and edit.
8 | """
9 |
10 | import optparse
11 | import sys
12 |
13 | #
14 | # TEST CONFIGURATION
15 | #
16 |
17 | DEMO = (
18 |     # (case_name distr num_unique_values num_clients values_per_client)
19 |     # (num_bits num_hashes num_cohorts)
20 |     # (p q f) (num_additional regexp_to_remove)
21 |     ('demo1 unif 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
22 |     ('demo2 gauss 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
23 |     ('demo3 exp 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
24 |     ('demo4 zipf1 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
25 |     ('demo5 zipf1.5 100 100000 10', '32 1 64', '0.25 0.75 0.5', '100 v[0-9]*9$'),
26 | )
27 |
28 | DISTRIBUTIONS = (
29 |     'unif',
30 |     'exp',
31 |     'gauss',
32 |     'zipf1',
33 |     'zipf1.5',
34 | )
35 |
36 | DISTRIBUTION_PARAMS = (
37 |     # name, num unique values, num clients, values per client
38 |     ('tiny', 100, 1000, 1),  # test for insufficient data
39 |     ('small', 100, 1000000, 1),
40 |     ('medium', 1000, 10000000, 1),
41 |     ('large', 10000, 100000000, 1),
42 | )
43 |
44 | # 'k, h, m' as in params file.
45 | BLOOMFILTER_PARAMS = {
46 |     '8x16': (8, 2, 16),  # 16 cohorts, 8 bits each, 2 bits set in each
47 |     '8x32': (8, 2, 32),  # 32 cohorts, 8 bits each, 2 bits set in each
48 |     '8x128': (8, 2, 128),  # 128 cohorts, 8 bits each, 2 bits set in each
49 |     '128x128': (128, 2, 128),  # 128 cohorts, 128 bits each, 2 bits set in each
50 | }
51 |
52 | # 'p, q, f' as in params file.
53 | PRIVACY_PARAMS = {
54 |     'eps_1_1': (0.39, 0.61, 0.45),  # eps_1 = 1, eps_inf = 5
55 |     'eps_1_5': (0.225, 0.775, 0.0),  # eps_1 = 5, no eps_inf
56 | }
57 |
58 | # For deriving candidates from true inputs.
59 | MAP_REGEX_MISSING = { 60 | 'sharp': 'NONE', # Categorical data 61 | '10%': 'v[0-9]*9$', # missing every 10th string 62 | } 63 | 64 | # test configuration -> 65 | # (name modifier, Bloom filter, privacy params, fraction of extra, 66 | # regex missing) 67 | TEST_CONFIGS = [ 68 | ('typical', '8x128', 'eps_1_1', .2, '10%'), 69 | ('sharp', '8x128', 'eps_1_1', .0, 'sharp'), # no extra candidates 70 | ('loose', '8x128', 'eps_1_5', .2, '10%'), # loose privacy 71 | ('over_x2', '8x128', 'eps_1_1', 2.0, '10%'), # overshoot by x2 72 | ('over_x10', '8x128', 'eps_1_1', 10.0, '10%'), # overshoot by x10 73 | ] 74 | 75 | # 76 | # END TEST CONFIGURATION 77 | # 78 | 79 | 80 | def main(argv): 81 | rows = [] 82 | 83 | test_case = [] 84 | for (distr_params, num_values, num_clients, 85 | num_reports_per_client) in DISTRIBUTION_PARAMS: 86 | for distribution in DISTRIBUTIONS: 87 | for (config_name, bloom_name, privacy_params, fr_extra, 88 | regex_missing) in TEST_CONFIGS: 89 | test_name = 'r-{}-{}-{}'.format(distribution, distr_params, 90 | config_name) 91 | 92 | params = (BLOOMFILTER_PARAMS[bloom_name] 93 | + PRIVACY_PARAMS[privacy_params] 94 | + tuple([int(num_values * fr_extra)]) 95 | + tuple([MAP_REGEX_MISSING[regex_missing]])) 96 | 97 | test_case = (test_name, distribution, num_values, num_clients, 98 | num_reports_per_client) + params 99 | row_str = [str(element) for element in test_case] 100 | rows.append(row_str) 101 | 102 | for params in DEMO: 103 | rows.append(params) 104 | 105 | for row in rows: 106 | print ' '.join(row) 107 | 108 | if __name__ == '__main__': 109 | try: 110 | main(sys.argv) 111 | except RuntimeError, e: 112 | print >>sys.stderr, 'FATAL: %s' % e 113 | sys.exit(1) 114 | -------------------------------------------------------------------------------- /pipeline/combine_results.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """Combines results from multiple days of a single metric. 3 | 4 | Feed it the STATUS.txt files on stdin. It then finds the corresponding 5 | results.csv, and takes the top N items. 6 | 7 | Example: 8 | 9 | Date, "google.com,", yahoo.com 10 | 2015-03-01, 0.0, 0.9 11 | 2015-03-02, 0.1, 0.8 12 | 13 | Dygraphs can load this CSV file directly. 14 | 15 | TODO: Use different dygraph API? 16 | 17 | Also we need error bars. 18 | 19 | new Dygraph(document.getElementById("graphdiv2"), 20 | [ 21 | [1,10,100], 22 | [2,20,80], 23 | [3,50,60], 24 | [4,70,80] 25 | ], 26 | { 27 | labels: [ "Date", "failure", "timeout", "google.com" ] 28 | }); 29 | """ 30 | 31 | import collections 32 | import csv 33 | import json 34 | import os 35 | import sys 36 | 37 | import util 38 | 39 | 40 | def CombineDistResults(stdin, c_out, num_top): 41 | dates = [] 42 | var_cols = collections.defaultdict(dict) # {name: {date: value}} 43 | 44 | seen_dates = set() 45 | 46 | for line in stdin: 47 | status_path = line.strip() 48 | 49 | # Assume it looks like .../2015-03-01/STATUS.txt 50 | task_dir = os.path.dirname(status_path) 51 | date = os.path.basename(task_dir) 52 | 53 | # Get rid of duplicate dates. These could be caused by retries. 
54 |     if date in seen_dates:
55 |       continue
56 |
57 |     seen_dates.add(date)
58 |
59 |     with open(status_path) as f:
60 |       status = f.readline().split()[0]  # OK, FAIL, TIMEOUT, SKIPPED
61 |
62 |     dates.append(date)
63 |
64 |     if status != 'OK':
65 |       continue  # won't have results.csv
66 |
67 |     results_path = os.path.join(task_dir, 'results.csv')
68 |     with open(results_path) as f:
69 |       c = csv.reader(f)
70 |       unused_header = c.next()  # header row
71 |
72 |       # they are sorted by decreasing "estimate", which is what we want
73 |       for i in xrange(0, num_top):
74 |         try:
75 |           row = c.next()
76 |         except StopIteration:
77 |           # It's OK if it doesn't have enough
78 |           util.log('Stopping early.  Fewer than %d results to render.', num_top)
79 |           break
80 |
81 |         string, _, _, proportion, _, prop_low, prop_high = row
82 |
83 |         # dygraphs has a weird format with semicolons:
84 |         # value;lower;upper,value;lower;upper.
85 |
86 |         # http://dygraphs.com/data.html#csv
87 |
88 |         # Arbitrarily use 4 digits after decimal point (for dygraphs, not
89 |         # directly displayed)
90 |         dygraph_triple = '%.4f;%.4f;%.4f' % (
91 |             float(prop_low), float(proportion), float(prop_high))
92 |
93 |         var_cols[string][date] = dygraph_triple
94 |
95 |   # Now print CSV on stdout.
96 |   cols = sorted(var_cols.keys())  # sort columns alphabetically
97 |   c_out.writerow(['date'] + cols)
98 |
99 |   dates.sort()
100 |
101 |   for date in dates:
102 |     row = [date]
103 |     for col in cols:
104 |       cell = var_cols[col].get(date)  # None means there is no row
105 |       row.append(cell)
106 |     c_out.writerow(row)
107 |
108 |   #util.log("Number of dynamic cols: %d", len(var_cols))
109 |
110 |
111 | def CombineAssocResults(stdin, c_out, num_top):
112 |   header = ('dummy',)
113 |   c_out.writerow(header)
114 |
115 |
116 | def main(argv):
117 |   action = argv[1]
118 |
119 |   if action == 'dist':
120 |     num_top = int(argv[2])  # number of values to keep
121 |     c_out = csv.writer(sys.stdout)
122 |     CombineDistResults(sys.stdin, c_out, num_top)
123 |
124 |   elif action == 'assoc':
125 |     num_top = int(argv[2])  # number of values to keep
126 |     c_out = csv.writer(sys.stdout)
127 |     CombineAssocResults(sys.stdin, c_out, num_top)
128 |
129 |   else:
130 |     raise RuntimeError('Invalid action %r' % action)
131 |
132 |
133 | if __name__ == '__main__':
134 |   try:
135 |     main(sys.argv)
136 |   except RuntimeError, e:
137 |     print >>sys.stderr, 'FATAL: %s' % e
138 |     sys.exit(1)
139 |
-------------------------------------------------------------------------------- /tests/gen_counts_test.R: --------------------------------------------------------------------------------
1 | #!/usr/bin/env Rscript
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | 17 | library(RUnit) 18 | library(Matrix) # for sparse matrices 19 | 20 | source('tests/gen_counts.R') 21 | 22 | TestGenerateCounts <- function() { 23 | report_params <- list(k = 4, m = 2) # 2 cohorts, 4 bits each 24 | map <- Matrix(0, nrow = 8, ncol = 3, sparse = TRUE) # 3 possible values 25 | map[1,] <- c(1, 0, 0) 26 | map[2,] <- c(0, 1, 0) 27 | map[3,] <- c(0, 0, 1) 28 | map[4,] <- c(1, 1, 1) # 4th bit of the first cohort gets signal from all 29 | map[5,] <- c(0, 0, 1) # 1st bit of the second cohort gets signal from v3 30 | 31 | colnames(map) <- c('v1', 'v2', 'v3') 32 | 33 | partition <- c(3, 2, 1) * 10000 34 | v <- 100 # reports per client 35 | 36 | noise0 <- list(p = 0, q = 1, f = 0) # no noise at all 37 | counts0 <- GenerateCounts(c(report_params, noise0), map, partition, v) 38 | 39 | checkEqualsNumeric(sum(counts0[1,2:4]), counts0[1,1]) 40 | checkEqualsNumeric(counts0[1,5], counts0[1,1]) 41 | checkEqualsNumeric(partition[3] * v, counts0[1,4] + counts0[2,2]) 42 | checkEqualsNumeric(sum(partition) * v, counts0[1,1] + counts0[2,1]) 43 | 44 | pvalues <- chisq.test(counts0[,1] / v, p = c(.5, .5))$p.value 45 | for(i in 2:4) 46 | pvalues <- c(pvalues, 47 | chisq.test( 48 | c(counts0[1,i] / v, partition[i - 1] - counts0[1,i] / v), 49 | p = c(.5, .5))$p.value) 50 | 51 | noise1 <- list(p = .5, q = .5, f = 0) # truly random IRRs 52 | counts1 <- GenerateCounts(c(report_params, noise1), map, partition, v) 53 | 54 | for(i in 2:5) 55 | for(j in 1:2) 56 | pvalues <- c(pvalues, 57 | chisq.test(c(counts1[j,1] - counts1[j,i], counts1[j,i]), 58 | p = c(.5, .5))$p.value) 59 | 60 | noise2 <- list(p = 0, q = 1, f = 1.0) # truly random PRRs 61 | counts2 <- GenerateCounts(c(report_params, noise2), map, partition, v) 62 | 63 | checkEqualsNumeric(0, max(counts2 %% v)) # all entries must be divisible by v 64 | 65 | counts2 <- counts2 / v 66 | 67 | for(i in 2:5) 68 | for(j in 1:2) 69 | pvalues <- c(pvalues, 70 | chisq.test(c(counts2[j,1] - counts2[j,i], counts2[j,i]), 71 | p = c(.5, .5))$p.value) 72 | 73 | checkTrue(min(pvalues) > 1E-9, "Chi-squared test failed") 74 | } 75 | 76 | TestRandomPartition <- function() { 77 | 78 | p1 <- RandomPartition(total = 100, dgeom(0:999, prob = .1)) 79 | p2 <- RandomPartition(total = 1000, dnorm(1:1000, mean = 500, sd = 1000 / 6)) 80 | p3 <- RandomPartition(total = 10000, dunif(1:1000)) 81 | 82 | # Totals must check out. 83 | checkEqualsNumeric(100, sum(p1)) 84 | checkEqualsNumeric(1000, sum(p2)) 85 | checkEqualsNumeric(10000, sum(p3)) 86 | 87 | # Initialize the weights vector to 1 0 1 0 1 0 ... 88 | weights <- rep(c(1, 0), 100) 89 | 90 | p4 <- RandomPartition(total = 10000, weights) 91 | 92 | # Check that all mass is allocated to non-zero weights. 93 | checkEqualsNumeric(10000, sum(p4[weights == 1])) 94 | checkTrue(all(p4[weights == 0] == 0)) 95 | 96 | p5 <- RandomPartition(total = 1000000, c(1, 2, 3, 4)) 97 | p.value <- chisq.test(p5, p = c(.1, .2, .3, .4))$p.value 98 | 99 | # Apply the chi squared test and fail if p.value is too high or too low. 100 | # Probability of failure is 2 * 1E-9, which should never happen. 101 | checkTrue(p.value > 1E-9) 102 | } 103 | 104 | TestAll <- function(){ 105 | TestRandomPartition() 106 | TestGenerateCounts() 107 | } 108 | 109 | TestAll() -------------------------------------------------------------------------------- /client/cpp/openssl_hash_impl.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "openssl_hash_impl.h"
16 |
17 | #include <string>
18 | #include <vector>
19 |
20 | #include <openssl/evp.h>  // EVP_sha256
21 | #include <openssl/hmac.h>  // HMAC
22 | #include <openssl/md5.h>  // MD5
23 | #include <openssl/sha.h>  // SHA256_DIGEST_LENGTH
24 |
25 | namespace rappor {
26 |
27 | // of type HmacFunc in rappor_deps.h
28 | bool HmacSha256(const std::string& key, const std::string& value,
29 |                 std::vector<uint8_t>* output) {
30 |   output->resize(SHA256_DIGEST_LENGTH, 0);
31 |
32 |   // Returns a pointer on success, or NULL on failure.
33 |   unsigned char* result = HMAC(
34 |       EVP_sha256(), key.c_str(), key.size(),
35 |       // std::string has 'char', OpenSSL wants unsigned char.
36 |       reinterpret_cast<const unsigned char*>(value.c_str()),
37 |       value.size(),
38 |       output->data(),
39 |       NULL);
40 |
41 |   return (result != NULL);
42 | }
43 |
44 | // Of type HmacFunc in rappor_deps.h
45 | //
46 | // The length of the passed-in output vector determines how many
47 | // bytes are returned.
48 | //
49 | // No reseed operation, but recommended reseed_interval <= 2^48 updates.
50 | // Since we're seeding for each value and typically don't need
51 | // so many bytes, we should be OK.
52 | bool HmacDrbg(const std::string& key, const std::string& value,
53 |               std::vector<uint8_t>* output) {
54 |   const unsigned char k_array[] = {
55 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
56 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
57 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
58 |       0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
59 |   };
60 |   std::string v;
61 |   std::vector<uint8_t> temp_output;
62 |   int num_bytes = output->size();
63 |   if (num_bytes == 0) {
64 |     // By default return 32 bytes for Uint32 applications.
65 |     num_bytes = 32;
66 |   }
67 |
68 |   v.append(32u, 0x01);
69 |   temp_output.resize(32, 0);
70 |
71 |   std::string temp_str(v);
72 |   temp_str.append(std::string("\0", 1));
73 |   // provided_data is key|value.
74 |   temp_str.append(key);
75 |   temp_str.append(value);
76 |
77 |   output->resize(0);
78 |
79 |   // Instantiate.
80 |   if (!HmacSha256(std::string(k_array, k_array + 32), temp_str, &temp_output)) {
81 |     return false;
82 |   }
83 |   std::string k(temp_output.begin(), temp_output.end());
84 |   if (!HmacSha256(k, v, &temp_output)) {
85 |     return false;
86 |   }
87 |   v = std::string(temp_output.begin(), temp_output.end());
88 |   if (!HmacSha256(k, v + std::string("\1", 1) + key + value, &temp_output)) {
89 |     return false;
90 |   }
91 |   k = std::string(temp_output.begin(), temp_output.end());
92 |   if (!HmacSha256(k, v, &temp_output)) {
93 |     return false;
94 |   }
95 |   v = std::string(temp_output.begin(), temp_output.end());
96 |
97 |   while (output->size() < num_bytes) {
98 |     // Generate.
99 |     if (!HmacSha256(k, v, &temp_output)) {
100 |       return false;
101 |     }
102 |     v = std::string(temp_output.begin(), temp_output.end());
103 |     output->insert(output->end(), temp_output.begin(), temp_output.end());
104 |   }
105 |   output->resize(num_bytes);
106 |   return true;
107 | }
108 |
109 | // of type HashFunc in rappor_deps.h
110 | bool Md5(const std::string& value, std::vector<uint8_t>* output) {
111 |   output->resize(MD5_DIGEST_LENGTH, 0);
112 |
113 |   // std::string has 'char', OpenSSL wants unsigned char.
114 |   MD5(reinterpret_cast<const unsigned char*>(value.c_str()),
115 |       value.size(), output->data());
116 |   return true;  // OpenSSL MD5 doesn't return an error code
117 | }
118 |
119 | }  // namespace rappor
120 |
-------------------------------------------------------------------------------- /client/python/rappor_test.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #
3 | # Copyright 2014 Google Inc. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | """
18 | rappor_test.py: Tests for rappor.py
19 | """
20 | import cStringIO
21 | import copy
22 | import math
23 | import random
24 | import unittest
25 |
26 | import rappor  # module under test
27 |
28 |
29 | class RapporParamsTest(unittest.TestCase):
30 |
31 |   def setUp(self):
32 |     self.typical_instance = rappor.Params()
33 |     ti = self.typical_instance  # For convenience
34 |     ti.num_cohorts = 64  # Number of cohorts
35 |     ti.num_hashes = 2  # Number of bloom filter hashes
36 |     ti.num_bloombits = 16  # Number of bloom filter bits
37 |     ti.prob_p = 0.40  # Probability p
38 |     ti.prob_q = 0.70  # Probability q
39 |     ti.prob_f = 0.30  # Probability f
40 |
41 |   def testFromCsv(self):
42 |     f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\n')
43 |     params = rappor.Params.from_csv(f)
44 |     self.assertEqual(32, params.num_bloombits)
45 |     self.assertEqual(64, params.num_cohorts)
46 |
47 |     # Malformed header
48 |     f = cStringIO.StringIO('k,h,m,p,q\n32,2,64,0.5,0.75,0.6\n')
49 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
50 |
51 |     # Missing second row
52 |     f = cStringIO.StringIO('k,h,m,p,q,f\n')
53 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
54 |
55 |     # Too many rows
56 |     f = cStringIO.StringIO('k,h,m,p,q,f\n32,2,64,0.5,0.75,0.6\nextra')
57 |     self.assertRaises(rappor.Error, rappor.Params.from_csv, f)
58 |
59 |   def testGetBloomBits(self):
60 |     for cohort in xrange(0, 64):
61 |       b = rappor.get_bloom_bits('foo', cohort, 2, 16)
62 |       #print 'cohort', cohort, 'bloom', b
63 |
64 |   def testGetPrr(self):
65 |     bloom = 1
66 |     num_bits = 8
67 |     for word in ('v1', 'v2', 'v3'):
68 |       masks = rappor.get_prr_masks('secret', word, 0.5, num_bits)
69 |       print 'masks', masks
70 |
71 |   def testToBigEndian(self):
72 |     b = rappor.to_big_endian(1)
73 |     print repr(b)
74 |     self.assertEqual(4, len(b))
75 |
76 |   def testEncoder(self):
77 |     # Test encoder with deterministic random function.
78 |     params = copy.copy(self.typical_instance)
79 |     params.prob_f = 0.5
80 |     params.prob_p = 0.5
81 |     params.prob_q = 0.75
82 |
83 |     # return these 3 probabilities in sequence.
84 |     rand = MockRandom([0.0, 0.6, 0.0], params)
85 |
86 |     e = rappor.Encoder(params, 0, 'secret', rand)
87 |
88 |     irr = e.encode("abc")
89 |
90 |     self.assertEquals(64493, irr)  # given MockRandom, this is what we get
91 |
92 |
93 | class MockRandom(object):
94 |   """Returns one of three random values in a cyclic manner.
95 |
96 |   Mock random function that involves *some* state, as needed for tests that
97 |   call randomness several times.  This makes it difficult to deal exclusively
98 |   with stubs for testing purposes.
99 |   """
100 |
101 |   def __init__(self, cycle, params):
102 |     self.p_gen = MockRandomCall(params.prob_p, cycle, params.num_bloombits)
103 |     self.q_gen = MockRandomCall(params.prob_q, cycle, params.num_bloombits)
104 |
105 | class MockRandomCall:
106 |   def __init__(self, prob, cycle, num_bits):
107 |     self.cycle = cycle
108 |     self.n = len(self.cycle)
109 |     self.prob = prob
110 |     self.num_bits = num_bits
111 |
112 |   def __call__(self):
113 |     counter = 0
114 |     r = 0
115 |     for i in xrange(0, self.num_bits):
116 |       rand_val = self.cycle[counter]
117 |       counter += 1
118 |       counter %= self.n  # wrap around
119 |       r |= ((rand_val < self.prob) << i)
120 |     return r
121 |
122 |
123 | if __name__ == "__main__":
124 |   unittest.main()
125 |
-------------------------------------------------------------------------------- /tests/user_spec.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """Print a test spec on stdout.
3 |
4 | Each line has parameters for a test case.  The regtest.sh shell script reads
5 | these lines and runs parallel processes.
6 |
7 | We use Python data structures so the test cases are easier to read and edit.
8 | """
9 |
10 | import sys
11 |
12 | #
13 | # TEST CONFIGURATION
14 | #
15 |
16 | # For gen_sim_input.py
17 | INPUT_PARAMS = {
18 |     # distribution, num unique values, num clients, values per client
19 |     'exp-100k': ('exp', 100, 100000, 1),
20 |     'exp-1m': ('exp', 100, 1000000, 1),
21 | }
22 |
23 | # For rappor_sim.py
24 | # 'k, h, m, p, q, f' as in params file.
25 | RAPPOR_PARAMS = {
26 |     # Initial chrome params from 2014.
27 |     # NOTE: fastrand simulation only supports 64 bits!  Make sure to use the
28 |     # 'fast_counts' code path.
29 |     'chrome128': (128, 2, 128, 0.25, 0.75, 0.50),
30 |
31 |     # Chrome params from early 2015 -- changed to 8 bit reports.
32 |     'chrome8': (8, 2, 128, 0.25, 0.75, 0.50),
33 |
34 |     # Original demo params
35 |     'demo': (16, 2, 64, 0.5, 0.75, 0.5),
36 | }
37 |
38 | # For deriving candidates from true inputs.
39 | MAP_PARAMS = {
40 |     # 1. Number of extra candidates to add.
41 |     # 2. Candidate strings to remove from the map.  This FORCES false
42 |     #    negatives, e.g. for common strings, since a string has to be in the map
43 |     #    for RAPPOR to choose it.
44 |     'add-100': (100, []),
45 |     'add-1000': (1000, []),
46 |     'add-2000': (2000, []),
47 |     # also thrashes on 128 bits
48 |     'add-3000': (3000, []),
49 |     'add-10000': (10000, []),
50 |     'add-15000': (15000, []),  # approx number of candidates for eTLD+1
51 |     'add-100000': (100000, []),
52 |     'remove-top-2': (20, ['v1', 'v2']),
53 | }
54 |
55 | # test case name -> (input params name, RAPPOR params name, map params name)
56 | TEST_CASES = [
57 |     ('chrome128-100k-100', 'exp-100k', 'chrome128', 'add-100'),
58 |     ('chrome128-100k-1000', 'exp-100k', 'chrome128', 'add-1000'),
59 |     ('chrome128-100k-2000', 'exp-100k', 'chrome128', 'add-2000'),
60 |     ('chrome128-100k-3000', 'exp-100k', 'chrome128', 'add-3000'),
61 |     # 128 bits and 15k candidates fails on a machine with 8 GB memory.
62 |     # Lasso finishes with 7508 non-zero coefficients, and then allocation
63 |     # fails.  TODO: just take the highest ones?
64 |     #('chrome128-100k-15000', 'exp-100k', 'chrome128', 'add-15000'),
65 |     #('chrome128-100k-100000', 'exp-100k', 'chrome128', 'add-100000'),
66 |
67 |     # NOTE: Adding more candidates exercises LASSO
68 |     ('chrome8-100k-100', 'exp-100k', 'chrome8', 'add-100'),
69 |     ('chrome8-100k-1000', 'exp-100k', 'chrome8', 'add-1000'),
70 |     ('chrome8-100k-2000', 'exp-100k', 'chrome8', 'add-2000'),
71 |     ('chrome8-100k-3000', 'exp-100k', 'chrome8', 'add-3000'),
72 |     ('chrome8-100k-15000', 'exp-100k', 'chrome8', 'add-15000'),
73 |
74 |     # NOTE: This one takes too much memory!  More than 4 GB.  This is because
75 |     # Lasso gets a huge matrix (100,000).  We got 1564 non-zero coefficients.
76 |     ('chrome8-100k-100000', 'exp-100k', 'chrome8', 'add-100000'),
77 |
78 |     # What happens when the candidates are missing top values?
79 |     ('chrome8-badcand', 'exp-100k', 'chrome8', 'remove-top-2'),
80 |
81 |     # TODO: Use chrome params with real map from Alexa 1M ?
82 | ]
83 |
84 | #
85 | # END TEST CONFIGURATION
86 | #
87 |
88 |
89 | def main(argv):
90 |   rows = []
91 |   for test_case, input_name, rappor_name, map_name in TEST_CASES:
92 |     input_params = INPUT_PARAMS[input_name]
93 |     rappor_params = RAPPOR_PARAMS[rappor_name]
94 |     map_params = MAP_PARAMS[map_name]
95 |     row = tuple([test_case]) + input_params + rappor_params + map_params
96 |     rows.append(row)
97 |
98 |   for row in rows:
99 |     for cell in row:
100 |       if isinstance(cell, list):
101 |         if cell:
102 |           cell_str = '|'.join(cell)
103 |         else:
104 |           cell_str = 'NONE'  # we don't want an empty string
105 |       else:
106 |         cell_str = cell
107 |       print cell_str,  # print it with a space after it
108 |     print  # new line after row
109 |
110 |
111 | if __name__ == '__main__':
112 |   try:
113 |     main(sys.argv)
114 |   except RuntimeError, e:
115 |     print >>sys.stderr, 'FATAL: %s' % e
116 |     sys.exit(1)
117 |
-------------------------------------------------------------------------------- /analysis/R/fast_em.R: --------------------------------------------------------------------------------
1 | # fast_em.R: Wrapper around analysis/cpp/fast_em.cc.
2 | #
3 | # This serializes the input, shells out, and deserializes the output.
4 |
5 | .Flatten <- function(list_of_matrices) {
6 |   list_of_vectors <- lapply(list_of_matrices, as.vector)
7 |   #print(list_of_vectors)
8 |
9 |   # unlist takes list to vector.
10 |   unlist(list_of_vectors)
11 | }
12 |
13 | .WriteListOfMatrices <- function(list_of_matrices, f) {
14 |   flattened <- .Flatten(list_of_matrices)
15 |
16 |   # NOTE: UpdateJointConditional does outer product of dimensions!
17 | 18 | # 3 letter strings are null terminated 19 | writeBin('ne ', con = f) 20 | num_entries <- length(list_of_matrices) 21 | writeBin(num_entries, con = f) 22 | 23 | Log('Wrote num_entries = %d', num_entries) 24 | 25 | # For 2x3, this is 6 26 | writeBin('es ', con = f) 27 | 28 | entry_size <- as.integer(prod(dim(list_of_matrices[[1]]))) 29 | writeBin(entry_size, con = f) 30 | 31 | Log('Wrote entry_size = %d', entry_size) 32 | 33 | # now write the data 34 | writeBin('dat', con = f) 35 | writeBin(flattened, con = f) 36 | } 37 | 38 | .ExpectTag <- function(f, tag) { 39 | # Read a single NUL-terminated character string. 40 | actual <- readBin(con = f, what = "char", n = 1) 41 | 42 | # Assert that we got what was expected. 43 | if (length(actual) != 1) { 44 | stop(sprintf("Failed to read a tag '%s'", tag)) 45 | } 46 | if (actual != tag) { 47 | stop(sprintf("Expected '%s', got '%s'", tag, actual)) 48 | } 49 | } 50 | 51 | .ReadResult <- function (f, entry_size, matrix_dims) { 52 | .ExpectTag(f, "emi") 53 | # NOTE: assuming R integers are 4 bytes (uint32_t) 54 | num_em_iters <- readBin(con = f, what = "int", n = 1) 55 | 56 | .ExpectTag(f, "pij") 57 | pij <- readBin(con = f, what = "double", n = entry_size) 58 | 59 | # Adjust dimensions 60 | dim(pij) <- matrix_dims 61 | 62 | Log("Number of EM iterations: %d", num_em_iters) 63 | Log("PIJ read from external implementation:") 64 | print(pij) 65 | 66 | # est, sd, var_cov, hist 67 | list(est = pij, num_em_iters = num_em_iters) 68 | } 69 | 70 | .SanityChecks <- function(joint_conditional) { 71 | # Display some stats before sending it over to C++. 72 | 73 | inf_counts <- lapply(joint_conditional, function(m) { 74 | sum(m == Inf) 75 | }) 76 | total_inf <- sum(as.numeric(inf_counts)) 77 | 78 | nan_counts <- lapply(joint_conditional, function(m) { 79 | sum(is.nan(m)) 80 | }) 81 | total_nan <- sum(as.numeric(nan_counts)) 82 | 83 | zero_counts <- lapply(joint_conditional, function(m) { 84 | sum(m == 0.0) 85 | }) 86 | total_zero <- sum(as.numeric(zero_counts)) 87 | 88 | #sum(joint_conditional[joint_conditional == Inf, ]) 89 | Log('total inf: %s', total_inf) 90 | Log('total nan: %s', total_nan) 91 | Log('total zero: %s', total_zero) 92 | } 93 | 94 | ConstructFastEM <- function(em_executable, tmp_dir) { 95 | 96 | return(function(joint_conditional, max_em_iters = 1000, 97 | epsilon = 10 ^ -6, verbose = FALSE, 98 | estimate_var = FALSE) { 99 | matrix_dims <- dim(joint_conditional[[1]]) 100 | # Check that number of dimensions is 2. 
101 | if (length(matrix_dims) != 2) { 102 | Log('FATAL: Expected 2 dimensions, got %d', length(matrix_dims)) 103 | stop() 104 | } 105 | 106 | entry_size <- prod(matrix_dims) 107 | Log('entry size: %d', entry_size) 108 | 109 | .SanityChecks(joint_conditional) 110 | 111 | input_path <- file.path(tmp_dir, 'list_of_matrices.bin') 112 | Log("Writing flattened list of matrices to %s", input_path) 113 | f <- file(input_path, 'wb') # binary file 114 | .WriteListOfMatrices(joint_conditional, f) 115 | close(f) 116 | Log("Done writing %s", input_path) 117 | 118 | output_path <- file.path(tmp_dir, 'pij.bin') 119 | 120 | cmd <- sprintf("%s %s %s %s", em_executable, input_path, output_path, 121 | max_em_iters) 122 | 123 | Log("Shell command: %s", cmd) 124 | exit_code <- system(cmd) 125 | 126 | Log("Done running shell command") 127 | if (exit_code != 0) { 128 | stop(sprintf("Command failed with code %d", exit_code)) 129 | } 130 | 131 | f <- file(output_path, 'rb') 132 | result <- .ReadResult(f, entry_size, matrix_dims) 133 | close(f) 134 | 135 | result 136 | }) 137 | } 138 | -------------------------------------------------------------------------------- /pipeline/dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./dist.sh 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | readonly THIS_DIR=$(dirname $0) 11 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 12 | 13 | source $RAPPOR_SRC/util.sh # log, banner 14 | source $RAPPOR_SRC/pipeline/tools-lib.sh 15 | source $RAPPOR_SRC/pipeline/alarm-lib.sh 16 | 17 | readonly DECODE_DIST=${DEP_DECODE_DIST:-$RAPPOR_SRC/bin/decode-dist} 18 | 19 | readonly NUM_ARGS=7 # used for xargs 20 | 21 | decode-dist-one() { 22 | # Job constants 23 | local rappor_src=$1 24 | local timeout_secs=$2 25 | local min_reports=$3 26 | shift 3 # job constants do not vary per task and are not part of the spec 27 | 28 | # 7 spec variables 29 | local num_reports=$1 # unused, only for filtering 30 | local metric_name=$2 31 | local date=$3 32 | local counts=$4 33 | local params=$5 34 | local map=$6 35 | local results_dir=$7 36 | 37 | local task_dir=$results_dir/$metric_name/$date 38 | mkdir --verbose -p $task_dir 39 | 40 | local log_file=$task_dir/log.txt 41 | local status_file=$task_dir/STATUS.txt 42 | 43 | # Record the spec so we know params, counts, etc. 44 | echo "$@" > $task_dir/spec.txt 45 | 46 | if test $num_reports -lt $min_reports; then 47 | local msg="SKIPPED because $num_reports reports is less than $min_reports" 48 | # Duplicate this message 49 | echo "$msg" > $status_file 50 | echo "$msg" > $log_file 51 | return 52 | fi 53 | 54 | # Run it with a timeout, and record status in the task dir. 55 | { time \ 56 | alarm-status $status_file $timeout_secs \ 57 | $DECODE_DIST \ 58 | --counts $counts \ 59 | --params $params \ 60 | --map $map \ 61 | --output-dir $task_dir \ 62 | --adjust-counts-hack 63 | } >$log_file 2>&1 64 | 65 | # TODO: Don't pass --adjust-counts-hack unless the user asks for it. 66 | } 67 | 68 | # Print the number of processes to use. 69 | # NOTE: This is copied from google/rappor regtest.sh. 70 | # It also doesn't take into account the fact that we are memory-bound. 71 | # 72 | # 128 GiB / 4GiB would also imply about 32 processes though. 
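# num-processes is only a CPU-count heuristic (cores minus one); callers of
# decode-dist-many can still cap parallelism explicitly via its max_procs
# argument below.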
73 | num-processes() { 74 | local processors=$(grep -c ^processor /proc/cpuinfo || echo 4) 75 | if test $processors -gt 1; then # leave one CPU for the OS 76 | processors=$(expr $processors - 1) 77 | fi 78 | echo $processors 79 | } 80 | 81 | #readonly DEFAULT_MAX_PROCS=6 # for andychu2.hot, to avoid locking up UI 82 | #readonly DEFAULT_MAX_PROCS=16 # for rappor-ac.hot, to avoid thrashing 83 | readonly DEFAULT_MAX_PROCS=$(num-processes) 84 | 85 | #readonly DEFAULT_MAX_TASKS=12 86 | readonly DEFAULT_MAX_TASKS=10000 # more than the max 87 | 88 | # NOTE: Since we have 125 GB RAM, and processes can take up to 12 gigs of RAM, 89 | # only use parallelism of 10, even though we have 31 cores. 90 | 91 | readonly DEFAULT_MIN_REPORTS=5000 92 | 93 | 94 | decode-dist-many() { 95 | local job_dir=$1 96 | local spec_list=$2 97 | local timeout_secs=${3:-1200} # default timeout 98 | local max_procs=${4:-$DEFAULT_MAX_PROCS} 99 | local rappor_src=${5:-$RAPPOR_SRC} 100 | local min_reports=${6:-$DEFAULT_MIN_REPORTS} 101 | 102 | local interval_secs=5 103 | local pid_dir="$job_dir/pids" 104 | local sys_mem="$job_dir/system-mem.csv" 105 | mkdir --verbose -p $pid_dir 106 | 107 | time cat $spec_list \ 108 | | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \ 109 | $0 decode-dist-one $rappor_src $timeout_secs $min_reports 110 | } 111 | 112 | # Combine/summarize results and task metadata from the parallel decode-dist 113 | # processes. Render them as HTML. 114 | combine-and-render-html() { 115 | local jobs_base_dir=$1 116 | local job_dir=$2 117 | 118 | banner "Combining dist task status" 119 | TOOLS-cook combine-dist-task-status $jobs_base_dir $job_dir 120 | 121 | banner "Combining dist results" 122 | TOOLS-cook combine-dist-results $jobs_base_dir $job_dir 123 | 124 | banner "Splitting out status per metric, and writing overview" 125 | TOOLS-cook dist-metric-status $job_dir 126 | 127 | # The task-status.csv file should have a JOB ID. 128 | banner "Building overview.html and per-metric HTML" 129 | TOOLS-gen-ui build-html1 $job_dir 130 | 131 | banner "Building individual results.html (for ONE day)" 132 | TOOLS-gen-ui results-html $job_dir 133 | } 134 | 135 | "$@" 136 | -------------------------------------------------------------------------------- /client/cpp/Makefile: -------------------------------------------------------------------------------- 1 | # Build RAPPOR C++ code. 2 | 3 | default : \ 4 | _tmp/rappor_sim \ 5 | _tmp/encoder_demo \ 6 | _tmp/protobuf_encoder_demo \ 7 | _tmp/openssl_hash_impl_test 8 | 9 | # All intermediate files live in _tmp/ 10 | clean : 11 | rm -f --verbose _tmp/* 12 | 13 | # Use protobuf compiler to generate .cc and .h files. The .o and the .d depend 14 | # on .cc, so that is the target of this rule. 15 | 16 | _tmp/%.pb.cc : ../proto/%.proto 17 | protoc --cpp_out _tmp --proto_path=../proto $< 18 | 19 | # 20 | # Generate .d Makefile fragments. 21 | # 22 | 23 | # CXX flags: 24 | # -MM: exclude system headers 25 | # -I _tmp: so that generated protobuf headers are found 26 | # 27 | # Makefile stuff: 28 | # $*: the part that matched the wildcard, e.g. 'rappor_sim' for '%.cc' 29 | # matching 'rappor_sim.cc' 30 | # 31 | # We use $< (first prereq) to generate .d and .o files from .cc, because 32 | # it only needs the .cc file. We use $^ (all prereqs) to pass ALL the .o 33 | # files to the link step. 34 | 35 | _tmp/%.d : %.cc 36 | ./dotd.sh $* $@ \ 37 | $(CXX) -I _tmp/ -MM $(CPPFLAGS) $< 38 | 39 | # Special case for .d file of generated source.
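# (protoc, via the _tmp/%.pb.cc rule above, writes the generated .cc into
# _tmp/, so its .d must be derived from _tmp/%.pb.cc rather than from a
# source-tree .cc file.)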
40 | _tmp/%.pb.d : _tmp/%.pb.cc 41 | ./dotd.sh $*.pb $@ \ 42 | $(CXX) -I _tmp/ -MM $(CPPFLAGS) $< 43 | 44 | # 45 | # Include the Makefile fragments we generated, so that changes to headers will 46 | # rebuild both .d files and .o files. ('-include' suppresses the error if they 47 | # don't exist.) 48 | # 49 | # NOTE: We have to list them explicitly. Every time you add a source file, add 50 | # the corresponding .d file here. 51 | # 52 | 53 | -include \ 54 | _tmp/encoder.d \ 55 | _tmp/libc_rand_impl.d \ 56 | _tmp/openssl_hash_impl.d \ 57 | _tmp/openssl_hash_impl_test.d \ 58 | _tmp/protobuf_encoder.d \ 59 | _tmp/protobuf_encoder_demo.d \ 60 | _tmp/rappor_sim.d \ 61 | _tmp/unix_kernel_rand_impl.d \ 62 | _tmp/rappor.pb.d \ 63 | _tmp/example_app.pb.d 64 | 65 | # For example, -Wextra warns about unused params, but -Wall doesn't. 66 | CXXFLAGS = -Wall -Wextra #-Wpedantic 67 | 68 | # 69 | # Build object files (-c: compile only) 70 | # 71 | 72 | # NOTE: More prerequisites to _tmp/%.o (header files) are added by the .d 73 | # files, so we need $<. 74 | _tmp/%.o : %.cc 75 | $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $< 76 | 77 | _tmp/%.pb.o : _tmp/%.pb.cc 78 | $(CXX) $(CXXFLAGS) -I _tmp/ -c -o $@ $< 79 | 80 | # 81 | # Build executables 82 | # 83 | 84 | # CXX flag notes: 85 | # -lcrypto from openssl 86 | # -g for debug info 87 | # 88 | # You can add -std=c++0x for std::array, etc. 89 | 90 | # $^ : all prerequisites 91 | _tmp/rappor_sim : \ 92 | _tmp/encoder.o \ 93 | _tmp/libc_rand_impl.o \ 94 | _tmp/unix_kernel_rand_impl.o \ 95 | _tmp/openssl_hash_impl.o \ 96 | _tmp/rappor_sim.o 97 | $(CXX) \ 98 | $(CXXFLAGS) \ 99 | -o $@ \ 100 | $^ \ 101 | -lcrypto \ 102 | -g 103 | 104 | # $^ : all prerequisites 105 | _tmp/encoder_demo: \ 106 | _tmp/encoder.o \ 107 | _tmp/unix_kernel_rand_impl.o \ 108 | _tmp/openssl_hash_impl.o \ 109 | _tmp/encoder_demo.o 110 | $(CXX) \ 111 | $(CXXFLAGS) \ 112 | -o $@ \ 113 | $^ \ 114 | -lcrypto \ 115 | -g 116 | 117 | # -I _tmp for protobuf headers 118 | _tmp/protobuf_encoder_demo : \ 119 | _tmp/encoder.o \ 120 | _tmp/libc_rand_impl.o \ 121 | _tmp/unix_kernel_rand_impl.o \ 122 | _tmp/openssl_hash_impl.o \ 123 | _tmp/protobuf_encoder.o \ 124 | _tmp/protobuf_encoder_demo.o \ 125 | _tmp/example_app.pb.o \ 126 | _tmp/rappor.pb.o 127 | $(CXX) \ 128 | $(CXXFLAGS) \ 129 | -I _tmp \ 130 | -o $@ \ 131 | $^ \ 132 | -lprotobuf \ 133 | -lcrypto \ 134 | -g 135 | 136 | _tmp/openssl_hash_impl_test : \ 137 | _tmp/openssl_hash_impl.o \ 138 | _tmp/openssl_hash_impl_test.o 139 | $(CXX) \ 140 | $(CXXFLAGS) \ 141 | -o $@ \ 142 | $^ \ 143 | -lcrypto \ 144 | -g 145 | 146 | # Unittests are currently run manually, and require the Google gtest 147 | # framework version 1.7.0 or greater, found at 148 | # https://github.com/google/googletest/releases 149 | # TODO(mdeshon-google): Installer script 150 | unittest: _tmp/openssl_hash_impl_unittest _tmp/encoder_unittest 151 | _tmp/openssl_hash_impl_unittest 152 | _tmp/encoder_unittest 153 | 154 | _tmp/openssl_hash_impl_unittest: openssl_hash_impl_unittest.cc openssl_hash_impl.cc 155 | $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest 156 | 157 | _tmp/encoder_unittest: encoder_unittest.cc encoder.cc unix_kernel_rand_impl.cc openssl_hash_impl.cc 158 | $(CXX) -g -o $@ $^ -lssl -lcrypto -lgtest 159 | -------------------------------------------------------------------------------- /pipeline/regtest.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # End-to-end tests for the dashboard. 
4 | # 5 | # Usage: 6 | # ./regtest.sh 7 | # 8 | # NOTE: Must be run in this directory (rappor/pipeline). 9 | 10 | set -o nounset 11 | set -o pipefail 12 | set -o errexit 13 | 14 | # Create schema and params. 15 | create-metadata() { 16 | mkdir -p _tmp/metadata 17 | echo 'Hello from regtest.sh' 18 | 19 | local params_path=_tmp/metadata/regtest_params.csv 20 | 21 | # Relying on $RAPPOR_SRC/regtest.sh 22 | cp --verbose ../_tmp/python/demo1/case_params.csv $params_path 23 | 24 | # For now, use the same map everywhere. 25 | cat >_tmp/metadata/dist-analysis.csv <<EOF cat >_tmp/metadata/rappor-vars.csv <<EOF 109 | which pep8 >/dev/null || die "pep8 not installed ('sudo apt-get install pep8' on Ubuntu)" 110 | 111 | # - Skip _tmp dir, because we are downloading cpplint.py there, and it has 112 | # pep8 lint errors 113 | # - Exclude setup.py, because it's a config file and uses "invalid" 'name = 114 | # 1' style (spaces around =). 115 | find $REPO_ROOT \ 116 | \( -name _tmp -a -prune \) -o \ 117 | \( -name \*.py -a -print \) \ 118 | | grep -v /setup.py \ 119 | | xargs --verbose -- $0 python-lint 120 | } 121 | 122 | r-unit() { 123 | set -o xtrace # show tests we're running 124 | 125 | # This one needs to be run from the root dir 126 | tests/compare_dist_test.R 127 | 128 | tests/gen_counts_test.R 129 | 130 | tests/gen_true_values_test.R 131 | 132 | analysis/R/decode_test.R 133 | 134 | analysis/test/run_tests.R 135 | } 136 | 137 | doc-lint() { 138 | which tidy >/dev/null || die "tidy not found" 139 | for doc in _tmp/report.html _tmp/doc/*.html; do 140 | echo $doc 141 | # -e: show only errors and warnings 142 | # -q: quiet 143 | tidy -e -q $doc || true 144 | done 145 | } 146 | 147 | # This isn't a strict check, but can help. 148 | # TODO: Add words to whitelist. 149 | spell-all() { 150 | which spell >/dev/null || die "spell not found" 151 | spell README.md doc/*.md | sort | uniq 152 | } 153 | 154 | # 155 | # Smoke Tests. These can be manually run. 156 | # 157 | 158 | gen-true-values() { 159 | local num_unique_values=10 160 | local num_clients=10 161 | local values_per_client=2 162 | local num_cohorts=4 163 | local out=_tmp/reports.csv 164 | 165 | tests/gen_true_values.R \ 166 | exp $num_unique_values $num_clients $values_per_client $num_cohorts $out 167 | wc -l $out 168 | cat $out 169 | } 170 | 171 | if test $# -eq 0 ; then 172 | all 173 | else 174 | "$@" 175 | fi 176 | -------------------------------------------------------------------------------- /pipeline/cook.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Take the raw data from the analysis and massage it into various formats 4 | # suitable for display. 5 | # 6 | # Usage: 7 | # ./cook.sh <function name> 8 | 9 | set -o nounset 10 | set -o pipefail 11 | set -o errexit 12 | 13 | readonly THIS_DIR=$(dirname $0) 14 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 15 | 16 | source $RAPPOR_SRC/pipeline/tools-lib.sh 17 | 18 | 19 | status-files() { 20 | local dir=$1 21 | find $dir -name STATUS.txt 22 | } 23 | 24 | results-files() { 25 | local dir=$1 26 | find $dir -name results.csv 27 | } 28 | 29 | count-results() { 30 | # first field of each line is one of {OK, TIMEOUT, FAIL, SKIPPED} 31 | status-files "$@" \ 32 | | xargs cat \ 33 | | cut -d ' ' -f 1 \ 34 | | sort | uniq -c | sort -n -r 35 | } 36 | 37 | # 38 | # For dist cron job 39 | # 40 | 41 | # Combine status of tasks over multiple jobs. Each row is a task (decode-dist 42 | # invocation). This has the number of reports.
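# (The STATUS.txt files gathered here are written per task by the
# alarm-status wrapper in dist.sh.)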
43 | combine-dist-task-status() { 44 | local base_dir=${1:-~/rappor/cron} 45 | local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01} 46 | 47 | local out=$job_dir/task-status.csv 48 | 49 | # Ignore memory for now. 50 | time status-files $base_dir | TOOLS-combine-status dist > $out 51 | echo "Wrote $out" 52 | } 53 | 54 | # Create a single dist.csv time series for a GIVEN metric. 55 | combine-dist-results-one() { 56 | local base_dir=$1 57 | local job_dir=$2 58 | local metric_name=$3 59 | #echo FOO $base_dir $metric_name 60 | 61 | local out_dir=$job_dir/cooked/$metric_name 62 | mkdir -p $out_dir 63 | 64 | # Glob to capture this specific metric name over ALL job IDs. 65 | find $base_dir/*/raw/$metric_name -name STATUS.txt \ 66 | | TOOLS-combine-results dist 5 \ 67 | > $out_dir/dist.csv 68 | } 69 | 70 | # Creates a dist.csv file for EACH metric. TODO: Rename one/many 71 | combine-dist-results() { 72 | local base_dir=${1:-~/rappor/cron} 73 | local job_dir=${2:-~/rappor/cron/2015-05-22__05-58-01} 74 | 75 | # Direct subdirs of 'raw' are metrics. Just print filename. 76 | find $base_dir/*/raw -mindepth 1 -maxdepth 1 -type d -a -printf '%f\n' \ 77 | | sort | uniq \ 78 | | xargs --verbose -n1 -- \ 79 | $0 combine-dist-results-one $base_dir $job_dir 80 | } 81 | 82 | # Take the task-status.csv file, which has row key (metric, date). Writes 83 | # num_reports.csv and status.csv per metric, and a single overview.csv for all 84 | # metrics. 85 | dist-metric-status() { 86 | local job_dir=${1:-_tmp/results-10} 87 | local out_dir=$job_dir/cooked 88 | 89 | TOOLS-metric-status dist $job_dir/task-status.csv $out_dir 90 | } 91 | 92 | # 93 | # For association analysis cron job 94 | # 95 | 96 | combine-assoc-task-status() { 97 | local base_dir=${1:-~/rappor/chrome-assoc-smoke} 98 | local job_dir=${2:-$base_dir/smoke1} 99 | 100 | local out=$job_dir/assoc-task-status.csv 101 | 102 | time find $base_dir -name assoc-status.txt \ 103 | | TOOLS-combine-status assoc \ 104 | > $out 105 | 106 | echo "Wrote $out" 107 | } 108 | 109 | # Create a single assoc.csv time series for a GIVEN (var1, var2) pair. 110 | combine-assoc-results-one() { 111 | local base_dir=$1 112 | local job_dir=$2 113 | local metric_pair_rel_path=$3 114 | 115 | local out_dir=$job_dir/cooked/$metric_pair_rel_path 116 | mkdir -p $out_dir 117 | 118 | # Glob to capture this specific metric name over ALL job IDs. 119 | find $base_dir/*/raw/$metric_pair_rel_path -name assoc-status.txt \ 120 | | TOOLS-combine-results assoc 5 \ 121 | > $out_dir/assoc-results-series.csv 122 | } 123 | 124 | # Creates an assoc-results-series.csv file for EACH metric pair. TODO: Rename one/many 125 | combine-assoc-results() { 126 | local base_dir=${1:-~/rappor/chrome-assoc-smoke} 127 | local job_dir=${2:-$base_dir/smoke3} 128 | 129 | # Direct subdirs of 'raw' are metrics, and subdirs of that are variable 130 | # pairs. Print "$metric_name/$pair_name". 131 | find $base_dir/*/raw -mindepth 2 -maxdepth 2 -type d -a -printf '%P\n' \ 132 | | sort | uniq \ 133 | | xargs --verbose -n1 -- \ 134 | $0 combine-assoc-results-one $base_dir $job_dir 135 | } 136 | 137 | # Take the assoc-task-status.csv file, which has row key (metric, date). Writes 138 | # num_reports.csv and status.csv per metric, and a single overview.csv for all 139 | # metrics.
140 | assoc-metric-status() { 141 | local job_dir=${1:-~/rappor/chrome-assoc-smoke/smoke3} 142 | local out_dir=$job_dir/cooked 143 | 144 | TOOLS-metric-status assoc $job_dir/assoc-task-status.csv $out_dir 145 | } 146 | 147 | "$@" 148 | -------------------------------------------------------------------------------- /bin/decode_dist.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Command line tool to decode a RAPPOR data set. It is a simple wrapper for 4 | # Decode() in decode.R. 5 | 6 | library(optparse) 7 | 8 | # 9 | # Command line parsing. Do this first before loading libraries to catch errors 10 | # quickly. Loading libraries in R is slow. 11 | # 12 | 13 | # For command line error checking. 14 | UsageError <- function(...) { 15 | cat(sprintf(...)) 16 | cat('\n') 17 | quit(status = 1) 18 | } 19 | 20 | option_list <- list( 21 | # Inputs 22 | make_option("--map", default="", help="Map file (required)"), 23 | make_option("--counts", default="", help="Counts file (required)"), 24 | make_option("--params", default="", help="Params file (required)"), 25 | make_option("--output-dir", dest="output_dir", default=".", 26 | help="Output directory (default .)"), 27 | 28 | make_option("--correction", default="FDR", help="Correction method"), 29 | make_option("--alpha", default=.05, help="Alpha level"), 30 | 31 | make_option("--adjust-counts-hack", dest="adjust_counts_hack", 32 | default=FALSE, action="store_true", 33 | help="Allow the counts file to have more rows than cohorts. 34 | Most users should not use this.") 35 | ) 36 | 37 | ParseOptions <- function() { 38 | # NOTE: This API is bad; if you add positional_arguments, the return value 39 | # changes! 40 | parser <- OptionParser(option_list = option_list) 41 | opts <- parse_args(parser) 42 | 43 | if (opts$map == "") { 44 | UsageError("--map is required.") 45 | } 46 | if (opts$counts == "") { 47 | UsageError("--counts is required.") 48 | } 49 | if (opts$params == "") { 50 | UsageError("--params is required.") 51 | } 52 | return(opts) 53 | } 54 | 55 | if (!interactive()) { 56 | opts <- ParseOptions() 57 | } 58 | 59 | # 60 | # Load libraries and source our own code. 61 | # 62 | 63 | library(RJSONIO) 64 | 65 | # So we don't have to change pwd 66 | source.rappor <- function(rel_path) { 67 | abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 68 | source(abs_path) 69 | } 70 | 71 | source.rappor("analysis/R/read_input.R") 72 | source.rappor("analysis/R/decode.R") 73 | source.rappor("analysis/R/util.R") 74 | 75 | source.rappor("analysis/R/alternative.R") 76 | 77 | options(stringsAsFactors = FALSE) 78 | 79 | 80 | main <- function(opts) { 81 | Log("decode-dist") 82 | Log("argv:") 83 | print(commandArgs(TRUE)) 84 | 85 | Log("Loading inputs") 86 | 87 | # Run a single model if all inputs are specified. 88 | params <- ReadParameterFile(opts$params) 89 | counts <- ReadCountsFile(opts$counts, params, adjust_counts = opts$adjust_counts_hack) 90 | counts <- AdjustCounts(counts, params) 91 | 92 | 93 | # The left-most column has totals. 94 | num_reports <- sum(counts[, 1]) 95 | 96 | map <- LoadMapFile(opts$map, params) 97 | 98 | Log("Decoding %d reports", num_reports) 99 | res <- Decode(counts, map$map, params, correction = opts$correction, 100 | alpha = opts$alpha) 101 | Log("Done decoding") 102 | 103 | if (nrow(res$fit) == 0) { 104 | Log("FATAL: Analysis returned no strings.") 105 | quit(status = 1) 106 | } 107 | 108 | # Write analysis results as CSV.
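  # (res$fit is the per-candidate estimate table produced by Decode() above.)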
109 | results_csv_path <- file.path(opts$output_dir, 'results.csv') 110 | write.csv(res$fit, file = results_csv_path, row.names = FALSE) 111 | 112 | # Write residual histogram as a png. 113 | results_png_path <- file.path(opts$output_dir, 'residual.png') 114 | png(results_png_path) 115 | breaks <- pretty(res$residual, n = 200) 116 | histogram <- hist(res$residual, breaks, plot = FALSE) 117 | histogram$counts <- histogram$counts / sum(histogram$counts) # convert the histogram to frequencies 118 | plot(histogram, main = "Histogram of the residual", 119 | xlab = sprintf("Residual (observed - explained, %d x %d values)", params$m, params$k)) 120 | dev.off() 121 | 122 | res$metrics$total_elapsed_time <- proc.time()[['elapsed']] 123 | 124 | # Write summary as JSON (scalar values). 125 | metrics_json_path <- file.path(opts$output_dir, 'metrics.json') 126 | m <- toJSON(res$metrics) 127 | writeLines(m, con = metrics_json_path) 128 | Log("Wrote %s, %s, and %s", results_csv_path, results_png_path, metrics_json_path) 129 | 130 | # TODO: 131 | # - These are in a 2-column 'parameters' and 'values' format. Should these 132 | # just be a plain list? 133 | # - Should any of these privacy params be in metrics.json? 134 | 135 | Log("Privacy summary:") 136 | print(res$privacy) 137 | cat("\n") 138 | 139 | Log('DONE') 140 | } 141 | 142 | if (!interactive()) { 143 | main(opts) 144 | } 145 | -------------------------------------------------------------------------------- /client/cpp/README.md: -------------------------------------------------------------------------------- 1 | RAPPOR C++ Client 2 | ================= 3 | 4 | We provide both a low-level and a high-level client API. The low-level API 5 | implements just the RAPPOR encoding algorithm on strings, with few 6 | dependencies. 7 | 8 | The high-level API provides wrappers that bundle encoded values into Protocol 9 | Buffer messages. 10 | 11 | Build Instructions 12 | ------------------ 13 | 14 | You'll need a C++ compiler, the protobuf compiler, and a library that 15 | implements common hash functions (e.g. OpenSSL). 16 | 17 | On Ubuntu or Debian, the protobuf compiler and header files can be installed 18 | with: 19 | 20 | sudo apt-get install protobuf-compiler libprotobuf-dev 21 | 22 | OpenSSL can be installed with: 23 | 24 | sudo apt-get install libssl-dev 25 | 26 | Test 27 | ---- 28 | 29 | After installing dependencies, you can test it out easily on your machine: 30 | 31 | ./demo.sh quick-cpp 32 | 33 | This builds the test harness using a Makefile, and then runs the regtest.sh 34 | simulation. The last few lines of output will look like this: 35 | 36 | Done running all test instances 37 | Instances succeeded: 1 failed: 0 running: 0 total: 1 38 | Wrote _tmp/cpp/results.html 39 | URL: file:///usr/local/google/home/andychu/git/rappor/_tmp/cpp/results.html 40 | 41 | Open the HTML file to see a plot and stats. 42 | 43 | 44 | Encoder 45 | ------- 46 | 47 | The low-level API is `Encoder`. You instantiate it with RAPPOR encoding 48 | parameters and application dependencies. It has a method `EncodeString()` that 49 | takes an input string (no other types), sets an output parameter of type 50 | `rappor::Bits`, and returns success or failure.
51 | 52 | ```cpp 53 | #include <cassert> 54 | #include <cstdio> 55 | #include "encoder.h" 56 | #include "openssl_hash_impl.h" 57 | #include "unix_kernel_rand_impl.h" 58 | 59 | int main(int argc, char** argv) { 60 | FILE* fp = fopen("/dev/urandom", "r"); 61 | rappor::UnixKernelRand irr_rand(fp); 62 | 63 | rappor::Deps deps(rappor::Md5, "client-secret", rappor::HmacSha256, 64 | irr_rand); 65 | rappor::Params params(32, // num_bits (k) 66 | 2, // num_hashes (h) 67 | 128, // num_cohorts (m) 68 | 0.25, // probability f for PRR 69 | 0.75, // probability p for IRR 70 | 0.5); // probability q for IRR 71 | 72 | const char* encoder_id = "metric-name"; 73 | rappor::Encoder encoder(encoder_id, params, deps); 74 | 75 | // Now use it to encode values. The 'out' value can be sent over the 76 | // network. 77 | rappor::Bits out; 78 | assert(encoder.EncodeString("foo", &out)); // returns false on error 79 | printf("'foo' encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort()); 80 | 81 | // Raw bits 82 | assert(encoder.EncodeBits(0x123, &out)); // returns false on error 83 | printf("0x123 encoded with RAPPOR: %0x, cohort %d\n", out, encoder.cohort()); 84 | } 85 | ``` 86 | 87 | Dependencies 88 | ------------ 89 | 90 | `rappor::Deps` is a struct-like object that holds the dependencies needed by 91 | the API. 92 | 93 | The application must provide the following values: 94 | 95 | - cohort: An integer between 0 and `num_cohorts - 1`. Each value is assigned 96 | with equal probability to a client process. 97 | - client_secret: A persistent client secret (used for deterministic randomness 98 | in the PRR, i.e. "memoization" requirement). 99 | - hash_func - string hash function implementation (e.g. MD5) 100 | - hmac_func - HMAC-SHA256 implementation 101 | - irr_rand - randomness for the IRR 102 | 103 | We provide implementations of `hash_func` and `hmac_func` using OpenSSL. 104 | If your application already has a different implementation of these functions, 105 | you can implement the `HashFunc` and `HmacFunc` interfaces. 106 | 107 | We provide two example implementations of `irr_rand`: one based on libc 108 | `rand()` (insecure, for demo only), and one based on Unix `/dev/urandom`. 109 | 110 | Error Handling 111 | -------------- 112 | 113 | Note that incorrect usage of the `SimpleEncoder` and `Protobuf` constructors 114 | may cause *runtime assertions* (using `assert()`). For example, if 115 | Params.num\_bits is more than 32, the process will crash. 116 | 117 | Encoders should be initialized at application startup, with constant 118 | parameters, so this type of error should be seen early. 119 | 120 | The various `Encode()` members do *not* raise assertions. If those are used 121 | incorrectly, then the return value will be `false` to indicate an error. These 122 | failures should be handled by the application. 123 | 124 | Memory Management 125 | ----------------- 126 | 127 | The `Encoder` instances contain pointers to `Params` and `Deps` instances, but 128 | don't own them. In the examples, all instances live on the stack of `main()`, so 129 | you don't have to worry about them being destroyed. 130 | -------------------------------------------------------------------------------- /analysis/R/encode.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | Encode <- function(value, map, strs, params, N, id = NULL, 16 | cohort = NULL, B = NULL, BP = NULL) { 17 | # Encode value to RAPPOR and return a report. 18 | # 19 | # Input: 20 | # value: value to be encoded 21 | # map: a mapping matrix describing where each element of strs maps in 22 | # each cohort 23 | # strs: a vector of possible values with value being one of them 24 | # params: a list of RAPPOR parameters described in decode.R 25 | # N: sample size 26 | # Optional parameters: 27 | # id: user ID (smaller than N) 28 | # cohort: specifies cohort number (smaller than m) 29 | # B: input Bloom filter itself, in which case value is ignored 30 | # BP: input Permanent Randomized Response (memoized for multiple collections 31 | # from the same user) 32 | 33 | k <- params$k 34 | p <- params$p 35 | q <- params$q 36 | f <- params$f 37 | h <- params$h 38 | m <- params$m 39 | if (is.null(cohort)) { 40 | cohort <- sample(1:m, 1) 41 | } 42 | 43 | if (is.null(id)) { 44 | id <- sample(N, 1) 45 | } 46 | 47 | ind <- which(value == strs) 48 | 49 | if (is.null(B)) { 50 | B <- as.numeric(map[[cohort]][, ind]) 51 | } 52 | 53 | if (is.null(BP)) { 54 | BP <- sapply(B, function(x) sample(c(0, 1, x), 1, 55 | prob = c(0.5 * f, 0.5 * f, 1 - f))) 56 | } 57 | rappor <- sapply(BP, function(x) rbinom(1, 1, ifelse(x == 1, q, p))) 58 | 59 | list(value = value, rappor = rappor, B = B, BP = BP, cohort = cohort, id = id) 60 | } 61 | 62 | ExamplePlot <- function(res, k, ebs = 1, title = "", title_cex = 4, 63 | voff = .17, acex = 1.5, posa = 2, ymin = 1, 64 | horiz = FALSE) { 65 | PC <- function(k, report) { 66 | char <- as.character(report) 67 | if (k > 128) { 68 | char[char != ""] <- "|" 69 | } 70 | char 71 | } 72 | 73 | # Annotation settings 74 | anc <- "darkorange2" 75 | colors <- c("lavenderblush3", "maroon4") 76 | 77 | par(omi = c(0, .55, 0, 0)) 78 | # Set up plotting. 79 | plot(1:k, rep(1, k), ylim = c(ymin, 4), type = "n", 80 | xlab = "Bloom filter bits", 81 | yaxt = "n", ylab = "", xlim = c(0, k), bty = "n", xaxt = "n") 82 | mtext(paste0("Participant ", res$id, " in cohort ", res$cohort), 3, 2, 83 | adj = 1, col = anc, cex = acex) 84 | axis(1, 2^(0:15), 2^(0:15)) 85 | abline(v = which(res$B == 1), lty = 2, col = "grey") 86 | 87 | # First row with the true value. 88 | text(k / 2, 4, paste0('"', paste0(title, as.character(res$value)), '"'), 89 | cex = title_cex, col = colors[2], xpd = NA) 90 | 91 | # Second row with BF: B. 92 | points(1:k, rep(3, k), pch = PC(k, res$B), col = colors[res$B + 1], 93 | cex = res$B + 1) 94 | text(k, 3 + voff, paste0(sum(res$B), " signal bits"), cex = acex, 95 | col = anc, pos = posa) 96 | 97 | # Third row: B'. 98 | points(1:k, rep(2, k), pch = PC(k, res$BP), col = colors[res$BP + 1], 99 | cex = res$BP + 1) 100 | text(k, 2 + voff, paste0(sum(res$BP), " bits on"), 101 | cex = acex, col = anc, pos = posa) 102 | 103 | # Row 4: actual RAPPOR report.
104 | report <- res$rappor 105 | points(1:k, rep(1, k), pch = PC(k, as.character(report)), 106 | col = colors[report + 1], cex = report + 1) 107 | text(k, 1 + voff, paste0(sum(res$rappor), " bits on"), cex = acex, 108 | col = anc, pos = posa) 109 | 110 | mtext(c("True value:", "Bloom filter (B):", 111 | "Fake Bloom \n filter (B'):", "Report sent\n to server:"), 112 | 2, 1, at = 4:1, las = 2) 113 | legend("topright", legend = c("0", "1"), fill = colors, bty = "n", 114 | cex = 1.5, horiz = horiz) 115 | legend("topleft", legend = ebs, plot = FALSE) 116 | } 117 | 118 | PlotPopulation <- function(probs, detected, detection_frequency) { 119 | cc <- c("gray80", "darkred") 120 | color <- rep(cc[1], length(probs)) 121 | color[detected] <- cc[2] 122 | bp <- barplot(probs, col = color, border = color) 123 | inds <- c(1, c(max(which(probs > 0)), length(probs))) 124 | axis(1, bp[inds], inds) 125 | legend("topright", legend = c("Detected", "Not-detected"), 126 | fill = rev(cc), bty = "n") 127 | abline(h = detection_frequency, lty = 2, col = "grey") 128 | } 129 | -------------------------------------------------------------------------------- /client/cpp/encoder.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All rights reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // RAPPOR encoder. 16 | // 17 | // See README.md and encoder_demo.cc for an example. 18 | 19 | #ifndef RAPPOR_H_ 20 | #define RAPPOR_H_ 21 | 22 | #include <string> 23 | #include <vector> 24 | #include "rappor_deps.h" // for dependency injection 25 | 26 | namespace rappor { 27 | 28 | // For debug logging 29 | void log(const char* fmt, ...); 30 | 31 | // RAPPOR encoding parameters. 32 | class Params { 33 | public: 34 | Params(int num_bits, int num_hashes, int num_cohorts, 35 | float prob_f, float prob_p, float prob_q) 36 | : num_bits_(num_bits), 37 | num_hashes_(num_hashes), 38 | num_cohorts_(num_cohorts), 39 | prob_f_(prob_f), 40 | prob_p_(prob_p), 41 | prob_q_(prob_q) { 42 | } 43 | 44 | // Accessors 45 | int num_bits() { return num_bits_; } 46 | int num_hashes() { return num_hashes_; } 47 | int num_cohorts() { return num_cohorts_; } 48 | float prob_f() { return prob_f_; } 49 | float prob_p() { return prob_p_; } 50 | float prob_q() { return prob_q_; } 51 | 52 | private: 53 | friend class Encoder; 54 | 55 | // k: size of bloom filter, PRR, and IRR. 0 < k <= 32. 56 | int num_bits_; 57 | 58 | // number of bits set in the Bloom filter ("h") 59 | int num_hashes_; 60 | 61 | // Total number of cohorts ("m"). Note that the cohort assignment is what 62 | // is used in the client, not m. We include it here for documentation (it 63 | // can be unset, unlike the other params).
64 | int num_cohorts_; 65 | 66 | float prob_f_; // noise probability for PRR, quantized to 1/128 67 | 68 | float prob_p_; // noise probability for IRR, quantized to 1/128 69 | float prob_q_; // noise probability for IRR, quantized to 1/128 70 | }; 71 | 72 | // Encoder: take client values and transform them with the RAPPOR privacy 73 | // algorithm. 74 | class Encoder { 75 | public: 76 | // Note that invalid parameters cause runtime assertions in the constructor. 77 | // Encoders are intended to be created at application startup with constant 78 | // arguments, so errors should be caught early. 79 | 80 | // encoder_id: A unique ID for this encoder -- typically the name of the 81 | // metric being encoded, so that different metrics have different PRR 82 | // mappings. 83 | // params: RAPPOR encoding parameters, which affect privacy and decoding. 84 | // (held by reference; it must outlive the Encoder) 85 | // deps: application-supplied dependencies. 86 | // (held by reference; it must outlive the Encoder) 87 | Encoder(const std::string& encoder_id, const Params& params, 88 | const Deps& deps); 89 | 90 | // Encode raw bits (represented as an integer), setting output parameter 91 | // irr_out. Only valid when the return value is 'true' (success). 92 | bool EncodeBits(const Bits bits, Bits* irr_out) const; 93 | 94 | // Encode a string, setting output parameter irr_out. Only valid when the 95 | // return value is 'true' (success). 96 | bool EncodeString(const std::string& value, Bits* irr_out) const; 97 | // For use with HmacDrbg hash function and any num_bits divisible by 8. 98 | bool EncodeString(const std::string& value, 99 | std::vector<uint8_t>* irr_out) const; 100 | 101 | // For testing/simulation use only. 102 | bool _EncodeBitsInternal(const Bits bits, Bits* prr_out, Bits* irr_out) 103 | const; 104 | bool _EncodeStringInternal(const std::string& value, Bits* bloom_out, 105 | Bits* prr_out, Bits* irr_out) const; 106 | 107 | // Accessor for the assigned cohort. 108 | uint32_t cohort() { return cohort_; } 109 | // Set a cohort manually, if previously generated.
110 | void set_cohort(uint32_t cohort); 111 | 112 | private: 113 | bool MakeBloomFilter(const std::string& value, Bits* bloom_out) const; 114 | bool MakeBloomFilter(const std::string& value, 115 | std::vector<uint8_t>* bloom_out) const; 116 | bool GetPrrMasks(const Bits bits, Bits* uniform, Bits* f_mask) const; 117 | 118 | // static helper function for initialization 119 | static uint32_t AssignCohort(const Deps& deps, int num_cohorts); 120 | 121 | const std::string encoder_id_; 122 | const Params& params_; 123 | const Deps& deps_; 124 | uint32_t cohort_; 125 | std::string cohort_str_; 126 | }; 127 | 128 | } // namespace rappor 129 | 130 | #endif // RAPPOR_H_ 131 | -------------------------------------------------------------------------------- /apps/rappor-sim/server.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | source("../../analysis/R/decode.R") 3 | source("../../analysis/R/simulation.R") 4 | source("../../analysis/R/encode.R") 5 | 6 | Plot <- function(x, color = "grey") { 7 | n <- nrow(x) 8 | if (n < 16) { 9 | par(mfrow = c(n, 1), mai = c(0, .5, .5, 0)) 10 | } else if (n < 64) { 11 | par(mfrow = c(n / 2, 2), mai = c(0, .5, .5, 0)) 12 | } else { 13 | par(mfrow = c(n / 4, 4), mai = c(0, .5, .5, 0)) 14 | } 15 | for (i in 1:nrow(x)) { 16 | barplot(x[i, ], main = paste0("Cohort ", i), col = color, border = color) 17 | } 18 | } 19 | 20 | shinyServer(function(input, output) { 21 | # Example state global variable. 22 | es <- list() 23 | 24 | # Example button states. 25 | ebs <- rep(0, 3) 26 | 27 | Params <- reactive({ 28 | list(k = as.numeric(input$size), 29 | h = as.numeric(input$hashes), 30 | m = as.numeric(input$instances), 31 | p = as.numeric(input$p), 32 | q = as.numeric(input$q), 33 | f = as.numeric(input$f)) 34 | }) 35 | 36 | PopParams <- reactive({ 37 | list(as.numeric(input$nstrs), 38 | as.numeric(input$nonzero), 39 | input$decay, 40 | as.numeric(input$expo), 41 | as.numeric(input$background) 42 | ) 43 | }) 44 | 45 | DecodingParams <- reactive({ 46 | list(as.numeric(input$alpha), 47 | input$correction) 48 | }) 49 | 50 | Sample <- reactive({ 51 | input$sample 52 | N <- input$N 53 | params <- Params() 54 | pop_params <- PopParams() 55 | decoding_params <- DecodingParams() 56 | prop_missing <- input$missing 57 | fit <- GenerateSamples(N, params, pop_params, 58 | alpha = decoding_params[[1]], 59 | correction = decoding_params[[2]], 60 | prop_missing = prop_missing) 61 | fit 62 | }) 63 | 64 | # Results summary. 65 | output$pr <- renderTable({ 66 | Sample()$summary 67 | }, 68 | include.rownames = FALSE, include.colnames = FALSE) 69 | 70 | # Results table. 71 | output$tab <- renderDataTable({ 72 | Sample()$fit 73 | }, 74 | options = list(iDisplayLength = 100)) 75 | 76 | # Epsilon. 77 | output$epsilon <- renderTable({ 78 | Sample()$privacy 79 | }, 80 | include.rownames = FALSE, include.colnames = FALSE, digits = 4) 81 | 82 | # True distribution. 83 | output$probs <- renderPlot({ 84 | samp <- Sample() 85 | probs <- samp$probs 86 | detected <- match(samp$fit[, 1], samp$strs) 87 | detection_frequency <- samp$privacy[7, 2] 88 | PlotPopulation(probs, detected, detection_frequency) 89 | }) 90 | 91 | # True bit patterns. 92 | output$truth <- renderPlot({ 93 | truth <- Sample()$truth 94 | Plot(truth[, -1, drop = FALSE], color = "darkblue") 95 | }) 96 | 97 | # Lasso plot.
98 | output$lasso <- renderPlot({ 99 | fit <- Sample()$lasso 100 | if (!is.null(fit)) { 101 | plot(fit) 102 | } 103 | }) 104 | 105 | output$resid <- renderPlot({ 106 | resid <- Sample()$residual 107 | params <- Params() 108 | plot(resid, xlab = "Bloom filter bits", ylab = "Residuals") 109 | abline(h = c(-1.96, 1.96), lty = 2, col = 2) 110 | sq <- qnorm(.025 / length(resid)) 111 | abline(h = c(sq, -sq), lty = 2, col = 3, lwd = 2) 112 | abline(h = c(-3, 3), lty = 2, col = 4, lwd = 2) 113 | abline(v = params$k * (0:params$m), lty = 2, col = "blue") 114 | legend("topright", legend = paste0("SD = ", round(sd(resid), 2)), bty = "n") 115 | }) 116 | 117 | # Estimated bit patterns. 118 | output$ests <- renderPlot({ 119 | ests <- Sample()$ests 120 | Plot(ests, color = "darkred") 121 | }) 122 | 123 | # Estimated vs truth. 124 | output$ests_truth <- renderPlot({ 125 | plot(unlist(Sample()$ests), unlist(Sample()$truth[, -1]), 126 | xlab = "Estimates", ylab = "Truth", pch = 19) 127 | abline(0, 1, lwd = 4, col = "darkred") 128 | }) 129 | 130 | output$example <- renderPlot({ 131 | params <- Params() 132 | strs <- Sample()$strs 133 | map <- Sample()$map 134 | samp <- Sample() 135 | 136 | # First run on app start. 137 | value <- sample(strs, 1) 138 | res <- Encode(value, map, strs, params, N = input$N) 139 | 140 | if (input$new_user > ebs[1]) { 141 | res <- Encode(es$value, map, strs, params, N = input$N) 142 | ebs[1] <<- input$new_user 143 | } else if (input$new_value > ebs[2]) { 144 | res <- Encode(value, map, strs, params, cohort = es$cohort, id = es$id, 145 | N = input$N) 146 | ebs[2] <<- input$new_value 147 | } else if (input$new_report > ebs[3]) { 148 | res <- Encode(es$value, map, strs, params, B = es$B, 149 | BP = es$BP, cohort = es$cohort, id = es$id, N = input$N) 150 | ebs[3] <<- input$new_report 151 | } 152 | es <<- res 153 | ExamplePlot(res, params$k, c(ebs, input$new_user, input$new_value, input$new_report)) 154 | }) 155 | 156 | }) 157 | -------------------------------------------------------------------------------- /pipeline/assoc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Usage: 4 | # ./assoc.sh <function name> 5 | 6 | set -o nounset 7 | set -o pipefail 8 | set -o errexit 9 | 10 | readonly THIS_DIR=$(dirname $0) 11 | readonly RAPPOR_SRC=$(cd $THIS_DIR/.. && pwd) 12 | 13 | source $RAPPOR_SRC/util.sh # log, banner 14 | source $RAPPOR_SRC/pipeline/tools-lib.sh 15 | source $RAPPOR_SRC/pipeline/alarm-lib.sh 16 | 17 | # Change the default location of these tools by setting DEP_* 18 | readonly DECODE_ASSOC=${DEP_DECODE_ASSOC:-$RAPPOR_SRC/bin/decode-assoc} 19 | readonly FAST_EM=${DEP_FAST_EM:-$RAPPOR_SRC/analysis/cpp/_tmp/fast_em} 20 | 21 | # Run a single decode-assoc process, to analyze one variable pair for one 22 | # metric. The arguments to this function are one row of the task spec.
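# Each spec row supplies the final 8 arguments (NUM_ARGS below); decode-many
# prepends the five job constants. In order, the row fields are:
#   num_reports metric_name date reports_file var1 var2 map1 output_dir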
23 | decode-one() { 24 | # Job constants, from decode-many 25 | local rappor_src=$1 26 | local timeout_secs=$2 27 | local min_reports=$3 28 | local job_dir=$4 29 | local sample_size=$5 30 | 31 | # Task spec variables, from task_spec.py 32 | local num_reports=$6 33 | local metric_name=$7 34 | local date=$8 # for output naming only 35 | local reports=$9 # file with reports 36 | local var1=${10} 37 | local var2=${11} 38 | local map1=${12} 39 | local output_dir=${13} 40 | 41 | local log_file=$output_dir/assoc-log.txt 42 | local status_file=$output_dir/assoc-status.txt 43 | mkdir --verbose -p $output_dir 44 | 45 | # Flags derived from job constants 46 | local schema=$job_dir/config/rappor-vars.csv 47 | local params_dir=$job_dir/config 48 | local em_executable=$FAST_EM 49 | 50 | # TODO: 51 | # - Skip jobs with few reports, like ./backfill.sh analyze-one. 52 | 53 | # Output the spec for combine_status.py. 54 | echo "$@" > $output_dir/assoc-spec.txt 55 | 56 | # NOTE: Not passing --num-cores since we're parallelizing already. 57 | 58 | # NOTE: --tmp-dir is the output dir. Then we just delete all the .bin files 59 | # afterward so we don't copy them to x20 (they are big). 60 | 61 | { time \ 62 | alarm-status $status_file $timeout_secs \ 63 | $DECODE_ASSOC \ 64 | --create-bool-map \ 65 | --remove-bad-rows \ 66 | --em-executable $em_executable \ 67 | --schema $schema \ 68 | --params-dir $params_dir \ 69 | --metric-name $metric_name \ 70 | --reports $reports \ 71 | --var1 $var1 \ 72 | --var2 $var2 \ 73 | --map1 $map1 \ 74 | --reports-sample-size $sample_size \ 75 | --tmp-dir $output_dir \ 76 | --output-dir $output_dir 77 | } >$log_file 2>&1 78 | } 79 | 80 | test-decode-one() { 81 | decode-one $RAPPOR_SRC 82 | } 83 | 84 | readonly DEFAULT_MIN_REPORTS=5000 85 | 86 | #readonly DEFAULT_TIMEOUT_SECONDS=300 # 5 minutes as a quick test. 87 | readonly DEFAULT_TIMEOUT_SECONDS=3600 # 1 hour 88 | 89 | readonly DEFAULT_MAX_PROCS=6 # TODO: Share with backfill.sh 90 | 91 | # Limit to 1M for now. Raise it when we have a full run. 92 | readonly DEFAULT_SAMPLE_SIZE=1000000 93 | 94 | readonly NUM_ARGS=8 # number of tokens in the task spec, used for xargs 95 | 96 | # Run many decode-assoc processes in parallel. 97 | decode-many() { 98 | local job_dir=$1 99 | local spec_list=$2 100 | 101 | # These 3 params affect speed 102 | local timeout_secs=${3:-$DEFAULT_TIMEOUT_SECONDS} 103 | local sample_size=${4:-$DEFAULT_SAMPLE_SIZE} 104 | local max_procs=${5:-$DEFAULT_MAX_PROCS} 105 | 106 | local rappor_src=${6:-$RAPPOR_SRC} 107 | local min_reports=${7:-$DEFAULT_MIN_REPORTS} 108 | 109 | time cat $spec_list \ 110 | | xargs --verbose -n $NUM_ARGS -P $max_procs --no-run-if-empty -- \ 111 | $0 decode-one $rappor_src $timeout_secs $min_reports $job_dir $sample_size 112 | } 113 | 114 | # Combine assoc results and render HTML.
115 | 116 | combine-and-render-html() { 117 | local jobs_base_dir=$1 118 | local job_dir=$2 119 | 120 | banner "Combining assoc task status" 121 | TOOLS-cook combine-assoc-task-status $jobs_base_dir $job_dir 122 | 123 | banner "Combining assoc results" 124 | TOOLS-cook combine-assoc-results $jobs_base_dir $job_dir 125 | 126 | banner "Splitting out status per metric, and writing overview" 127 | TOOLS-cook assoc-metric-status $job_dir 128 | 129 | TOOLS-gen-ui symlink-static assoc $job_dir 130 | 131 | banner "Building overview .part.html from CSV" 132 | TOOLS-gen-ui assoc-overview-part-html $job_dir 133 | 134 | banner "Building metric .part.html from CSV" 135 | TOOLS-gen-ui assoc-metric-part-html $job_dir 136 | 137 | banner "Building pair .part.html from CSV" 138 | TOOLS-gen-ui assoc-pair-part-html $job_dir 139 | 140 | banner "Building day .part.html from CSV" 141 | TOOLS-gen-ui assoc-day-part-html $job_dir 142 | } 143 | 144 | # Temp files left over by the fast_em R <-> C++ data exchange. 145 | list-and-remove-bin() { 146 | local job_dir=$1 147 | # If everything failed, we might not have anything to list/delete. 148 | find $job_dir -name \*.bin | xargs --no-run-if-empty -- ls -l --si 149 | find $job_dir -name \*.bin | xargs --no-run-if-empty -- rm -f --verbose 150 | } 151 | 152 | "$@" 153 | -------------------------------------------------------------------------------- /tests/analyze_assoc.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | # 3 | # Copyright 2015 Google Inc. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # Reads map files, report files, and RAPPOR parameters to run 18 | # an EM algorithm to estimate a joint distribution over two or more variables 19 | # 20 | # Usage: 21 | # $ ./analyze_assoc.R -map1 map_1.csv -map2 map_2.csv \ 22 | # -reports reports.csv -params params.csv 23 | # Inputs: map1, map2, reports, params 24 | # see how options are parsed below for more information 25 | # Outputs: 26 | # prints a table with estimated joint probability masses 27 | # over candidate strings 28 | # Ex.
29 | # ssl nossl 30 | # intel 0.1 0.3 31 | # google 0.5 0.1 32 | 33 | library("optparse") 34 | 35 | options(stringsAsFactors = FALSE) 36 | 37 | if(!interactive()) { 38 | option_list <- list( 39 | # Flags 40 | make_option(c("--map1", "-m1"), default = "map_1.csv", 41 | help = "Hashed candidates for 1st variable"), 42 | make_option(c("--map2", "-m2"), default = "map_2.csv", 43 | help = "Hashed candidates for 2nd variable"), 44 | make_option(c("--reports", "-r"), default = "reports.csv", 45 | help = "File with raw reports as <cohort, rappor bitstring 1, rappor bitstring 2>"), 46 | make_option(c("--params", "-p"), default = "params.csv", 47 | help = "Filename for RAPPOR parameters") 48 | ) 49 | opts <- parse_args(OptionParser(option_list = option_list)) 50 | } 51 | 52 | source("../analysis/R/encode.R") 53 | source("../analysis/R/decode.R") 54 | source("../analysis/R/simulation.R") 55 | source("../analysis/R/read_input.R") 56 | source("../analysis/R/association.R") 57 | 58 | # This function processes the maps loaded using ReadMapFile 59 | # Association analysis requires a map object with a map 60 | # field that has the map split into cohorts and an rmap field 61 | # that has all the cohorts combined 62 | # Arguments: 63 | # map = map object with cohorts as sparse matrix in 64 | # object map$map 65 | # This is the expected object from ReadMapFile 66 | # params = data field with parameters 67 | # TODO(pseudorandom): move this functionality to ReadMapFile 68 | ProcessMap <- function(map, params) { 69 | map$rmap <- map$map 70 | split_map <- function(i, map_struct) { 71 | numbits <- params$k 72 | indices <- which(as.matrix( 73 | map_struct[((i - 1) * numbits + 1):(i * numbits),]) == TRUE, 74 | arr.ind = TRUE) 75 | sparseMatrix(indices[, "row"], indices[, "col"], 76 | dims = c(numbits, max(indices[, "col"]))) 77 | } 78 | map$map <- lapply(1:params$m, function(i) split_map(i, map$rmap)) 79 | map 80 | } 81 | 82 | main <- function(opts) { 83 | ptm <- proc.time() 84 | 85 | params <- ReadParameterFile(opts$params) 86 | opts_map <- list(opts$map1, opts$map2) 87 | map <- lapply(opts_map, function(o) 88 | ProcessMap(ReadMapFile(o, params = params), 89 | params = params)) 90 | # Reports must be of the format 91 | # cohort no, rappor bitstring 1, rappor bitstring 2 92 | reportsObj <- read.csv(opts$reports, 93 | colClasses = c("integer", "character", "character"), 94 | header = FALSE) 95 | 96 | # Parsing reportsObj 97 | # ComputeDistributionEM allows for different sets of cohorts 98 | # for each variable.
Here, both sets of cohorts are identical 99 | co <- as.list(reportsObj[1])[[1]] 100 | cohorts <- list(co, co) 101 | # Parse reports from reportObj cols 2 and 3 102 | reports <- lapply(1:2, function(x) as.list(reportsObj[x + 1])) 103 | 104 | # Split strings into bit arrays (as required by assoc analysis) 105 | reports <- lapply(1:2, function(i) { 106 | # apply the following function to each of reports[[1]] and reports[[2]] 107 | lapply(reports[[i]][[1]], function(x) { 108 | # function splits strings and converts them to numeric values 109 | as.numeric(strsplit(x, split = "")[[1]]) 110 | }) 111 | }) 112 | 113 | joint_dist <- ComputeDistributionEM(reports, cohorts, map, 114 | ignore_other = TRUE, 115 | params, marginals = NULL, 116 | estimate_var = FALSE) 117 | # TODO(pseudorandom): Export the results to a file for further analysis 118 | print("JOINT_DIST$FIT") 119 | print(joint_dist$fit) 120 | print("PROC.TIME") 121 | print(proc.time() - ptm) 122 | } 123 | 124 | if(!interactive()) { 125 | main(opts) 126 | } -------------------------------------------------------------------------------- /analysis/R/read_input.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # 16 | # Read parameter, counts and map files. 17 | 18 | library(Matrix) 19 | 20 | source.rappor <- function(rel_path) { 21 | abs_path <- paste0(Sys.getenv("RAPPOR_REPO", ""), rel_path) 22 | source(abs_path) 23 | } 24 | 25 | source.rappor("analysis/R/util.R") # for Log 26 | 27 | 28 | ReadParameterFile <- function(params_file) { 29 | # Read parameter file. Format: 30 | # k, h, m, p, q, f 31 | # 128, 2, 8, 0.5, 0.75, 0.75 32 | 33 | params <- as.list(read.csv(params_file)) 34 | if (length(params) != 6) { 35 | stop("There should be exactly 6 columns in the parameter file.") 36 | } 37 | if (any(names(params) != c("k", "h", "m", "p", "q", "f"))) { 38 | stop("Parameter names must be k,h,m,p,q,f.") 39 | } 40 | params 41 | } 42 | 43 | # Handle the case of redundant cohorts, i.e. the counts file needs to be 44 | # further aggregated to obtain counts for the number of cohorts specified in 45 | # the params file. 46 | # 47 | # NOTE: Why is this happening? 48 | AdjustCounts <- function(counts, params) { 49 | apply(counts, 2, function(x) { 50 | tapply(x, rep(1:params$m, nrow(counts) / params$m), sum) 51 | }) 52 | } 53 | 54 | ReadCountsFile <- function(counts_file, params, adjust_counts = FALSE) { 55 | # Read in the counts file. 
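  # Expected shape, checked below: m rows (one per cohort) and k + 1 columns,
  # where the first column is the per-cohort report total and the remaining k
  # columns are per-bit sums.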
56 | if (!file.exists(counts_file)) { 57 | return(NULL) 58 | } 59 | counts <- read.csv(counts_file, header = FALSE) 60 | 61 | if (adjust_counts) { 62 | counts <- AdjustCounts(counts, params) 63 | } 64 | 65 | if (nrow(counts) != params$m) { 66 | stop(sprintf("Got %d rows in the counts file, expected m = %d", 67 | nrow(counts), params$m)) 68 | } 69 | 70 | if ((ncol(counts) - 1) != params$k) { 71 | stop(paste0("Counts file: number of columns should equal k + 1: ", 72 | ncol(counts))) 73 | } 74 | 75 | if (any(counts < 0)) { 76 | stop("Counts file: all counts must be non-negative.") 77 | } 78 | 79 | # Turn counts from a data frame into a matrix. (In R a data frame and matrix 80 | # are sometimes interchangeable, but sometimes we need it to be matrix.) 81 | as.matrix(counts) 82 | } 83 | 84 | ReadMapFile <- function(map_file, params) { 85 | # Read in the map file which is in the following format (two hash functions): 86 | # str1, h11, h12, h21 + k, h22 + k, h31 + 2k, h32 + 2k ... 87 | # str2, ... 88 | # Output: 89 | # map: a sparse representation of set bits for each candidate string. 90 | # strs: a vector of all candidate strings. 91 | 92 | Log("Parsing %s", map_file) 93 | 94 | map_pos <- read.csv(map_file, header = FALSE, as.is = TRUE) 95 | strs <- map_pos[, 1] 96 | strs[strs == ""] <- "Empty" 97 | 98 | # Remove duplicated strings. 99 | ind <- which(!duplicated(strs)) 100 | strs <- strs[ind] 101 | map_pos <- map_pos[ind, ] 102 | 103 | n <- ncol(map_pos) - 1 104 | if (n != (params$h * params$m)) { 105 | stop(paste0("Map file: number of columns should equal hm + 1:", 106 | n, "_", params$h * params$m)) 107 | } 108 | 109 | row_pos <- unlist(map_pos[, -1], use.names = FALSE) 110 | col_pos <- rep(1:nrow(map_pos), times = ncol(map_pos) - 1) 111 | 112 | # TODO: When would this ever happen? 113 | removed <- which(is.na(row_pos)) 114 | if (length(removed) > 0) { 115 | Log("Removed %d entries", length(removed)) 116 | row_pos <- row_pos[-removed] 117 | col_pos <- col_pos[-removed] 118 | } 119 | 120 | map <- sparseMatrix(row_pos, col_pos, 121 | dims = c(params$m * params$k, length(strs))) 122 | 123 | colnames(map) <- strs 124 | list(map = map, strs = strs, map_pos = map_pos) 125 | } 126 | 127 | LoadMapFile <- function(map_file, params) { 128 | # Reads the map file, caching an .rda (R binary data) version of it to speed 129 | # up future loads. 130 | 131 | rda_path <- sub(".csv", ".rda", map_file, fixed = TRUE) 132 | # This must be unique per process, so concurrent processes don't try to 133 | # write the same file. 134 | tmp_path <- sprintf("%s.%d", rda_path, Sys.getpid()) 135 | 136 | # First save to a temp file, and then atomically rename to the destination. 137 | if (file.exists(rda_path)) { 138 | Log("Loading %s", rda_path) 139 | load(rda_path, .GlobalEnv) # creates the 'map' variable in the global env 140 | } else { 141 | map <- ReadMapFile(map_file, params) 142 | 143 | Log("Saving %s as an rda file for faster access", map_file) 144 | tryCatch({ 145 | save(map, file = tmp_path) 146 | file.rename(tmp_path, rda_path) 147 | }, warning = function(w) { 148 | Log("WARNING: %s", w) 149 | }, error = function(e) { 150 | Log("ERROR: %s", e) 151 | }) 152 | } 153 | return(map) 154 | } 155 | -------------------------------------------------------------------------------- /analysis/R/unknowns_test.R: -------------------------------------------------------------------------------- 1 | # Copyright 2014 Google Inc. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Author: fanti@google.com (Giulia Fanti)
16 | #
17 | # Tests the unknown unknowns dictionary estimation functions.
18 | # There are two main components involved in estimating this unknown
19 | # distribution:
20 | #   a) Find the pairwise ngrams that co-occur often.
21 | #   b) Determine which full strings are consistent with all pairwise
22 | #      relations.
23 | #
24 | # TestEstimateDictionary() tests the full pipeline, including parts (a)
25 | # and (b).
26 | # TestFindFeasibleStrings() tests only part (b).
27 | # Both tests generate their own data.
28 |
29 | library(parallel)
30 | source("analysis/R/encode.R")
31 | source("analysis/R/decode.R")
32 | source("analysis/R/simulation.R")
33 | source("analysis/R/association.R")
34 | source("analysis/R/decode_ngrams.R")
35 | source("analysis/R/ngrams_simulation.R")
36 | alphabet <- letters
37 | options(warn = -1)
38 |
39 | GeneratePopulation <- function(N, num_strs, str_len = 10,
40 |                                distribution = NULL) {
41 |   # Generates a /deterministic/ string for each individual in the
42 |   # population from distribution.
43 |   #
44 |   # Args:
45 |   #   N: Number of individuals in the population
46 |   #   num_strs: Number of strings from which to draw strings
47 |   #   str_len: Length of each string
48 |   #   distribution: Just here for compatibility with the original
49 |   #     GeneratePopulation function in ngrams_simulation.R
50 |   #
51 |   # Returns:
52 |   #   Vector of strings for each individual in the population
53 |
54 |   strs <- sapply(1:num_strs, function(i) {
55 |     paste0(alphabet[(str_len * (i - 1) + 1):(str_len * i)], collapse = "")
56 |   })
57 |
58 |   # Uniform distribution
59 |   prob <- rep(1 / num_strs, num_strs)
60 |   sample(strs, N, replace = TRUE, prob = prob)
61 | }
62 |
63 | TestEstimateDictionary <- function() {
64 |   # Tests that the algorithm without noise recovers a uniform
65 |   # string population correctly.
66 |
67 |   # Compute the strings from measuring only 2 ngrams
68 |   N <- 100
69 |   str_len <- 6
70 |   ngram_size <- 2
71 |   num_ngrams <- str_len / ngram_size
72 |   num_strs <- 1
73 |
74 |   params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
75 |
76 |   ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
77 |                        num_ngrams_collected = 2)
78 |
79 |   sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
80 |                         alphabet, params, distribution = 3)
81 |
82 |   res <- EstimateDictionary(sim, N, ngram_params, params)
83 |
84 |   # Check that the correct strings are found
85 |   if (num_strs == 1) {
86 |     checkTrue(res$found_candidates == sort(unique(sim$strs)))
87 |   } else {
88 |     checkTrue(all.equal(res$found_candidates, sort(unique(sim$strs))))
89 |   }
90 | }
91 |
92 | TestFindFeasibleStrings <- function() {
93 |   # Tests that FindPairwiseCandidates weeds out false positives.
94 |   # We test this by adding false positives to the pairwise estimates.
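  # (Concretely: a bogus pairwise candidate c("ab", "le") is appended via
  # rbind() below; FindFeasibleStrings() should discard it, leaving only the
  # true strings.)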
95 |   N <- 100
96 |   str_len <- 6
97 |   ngram_size <- 2
98 |   num_ngrams <- str_len / ngram_size
99 |   num_strs <- 2
100 |
101 |   params <- list(k = 128, h = 4, m = 2, p = 0, q = 1, f = 0)
102 |
103 |   ngram_params <- list(ngram_size = ngram_size, num_ngrams = num_ngrams,
104 |                        num_ngrams_collected = 2)
105 |
106 |   sim <- SimulateNGrams(N, ngram_params, str_len, num_strs = num_strs,
107 |                         alphabet, params)
108 |
109 |   pairwise_candidates <- FindPairwiseCandidates(sim, N, ngram_params,
110 |                                                 params)$candidate_strs
111 |   cat("Found the pairwise candidates.\n")
112 |
113 |   if (is.null(pairwise_candidates)) {
114 |     return(FALSE)
115 |   }
116 |
117 |   # Inject a false positive that FindFeasibleStrings should weed out.
118 |   pairwise_candidates[[1]] <- rbind(pairwise_candidates[[1]], c("ab", "le"))
119 |   conn <- file('graph.txt', 'w+')
120 |   WriteKPartiteGraph(conn,
121 |                      pairwise_candidates,
122 |                      sim$pairings,
123 |                      ngram_params$num_ngrams,
124 |                      ngram_params$ngram_size)
125 |
126 |   close(conn)
127 |   cat("Wrote graph.txt\n")
128 |
129 |   found_candidates <- FindFeasibleStrings(pairwise_candidates,
130 |                                           sim$pairings,
131 |                                           ngram_params$num_ngrams,
132 |                                           ngram_params$ngram_size)
133 |   # Check that the correct strings are found
134 |   if (num_strs == 1) {
135 |     checkTrue(found_candidates == sort(unique(sim$strs)))
136 |   } else {
137 |     checkTrue(all.equal(found_candidates, sort(unique(sim$strs))))
138 |   }
139 | }
--------------------------------------------------------------------------------
/analysis/tensorflow/fast_em.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | """
3 | fast_em.py: TensorFlow implementation of expectation maximization for RAPPOR
4 | association analysis.
5 |
6 | TODO:
7 | - Use TensorFlow ops for reading input (so that reading input can be
8 |   distributed)
9 | - Reduce the number of ops (currently proportional to the number of reports).
10 |   May require new TensorFlow ops.
11 | - Fix performance bug (v_split is probably being recomputed on every
12 |   iteration):
13 |   bin$ ./test.sh decode-assoc-cpp - 1.1 seconds (single-threaded C++)
14 |   bin$ ./test.sh decode-assoc-tensorflow - 226 seconds on GPU
15 | """
16 |
17 | import sys
18 |
19 | import numpy as np
20 | import tensorflow as tf
21 |
22 |
23 | def log(msg, *args):
24 |   if args:
25 |     msg = msg % args
26 |   print >>sys.stderr, msg
27 |
28 |
29 | def ExpectTag(f, expected):
30 |   """Read and consume a 4 byte tag from the given file."""
31 |   b = f.read(4)
32 |   if b != expected:
33 |     raise RuntimeError('Expected %r, got %r' % (expected, b))
34 |
35 |
36 | def ReadListOfMatrices(f):
37 |   """
38 |   Read a big list of conditional probability matrices from a binary file.
39 |   """
40 |   ExpectTag(f, 'ne \0')
41 |   num_entries = np.fromfile(f, np.uint32, count=1)[0]
42 |   log('Number of entries: %d', num_entries)
43 |
44 |   ExpectTag(f, 'es \0')
45 |   entry_size = np.fromfile(f, np.uint32, count=1)[0]
46 |   log('Entry size: %d', entry_size)
47 |
48 |   ExpectTag(f, 'dat\0')
49 |   vec_length = num_entries * entry_size
50 |   v = np.fromfile(f, np.float64, count=vec_length)
51 |
52 |   log('Values read: %d', len(v))
53 |   log('v: %s', v[:10])
54 |   #print 'SUM', sum(v)
55 |
56 |   # NOTE: We're not reshaping because we're using one TensorFlow tensor object
57 |   # per matrix, since it makes the algorithm expressible with current
58 |   # TensorFlow ops.
59 |   #v = v.reshape((num_entries, entry_size))
60 |
61 |   return num_entries, entry_size, v
62 |
63 |
64 | def WriteTag(f, tag):
65 |   if len(tag) != 3:
66 |     raise AssertionError("Tags should be 3 bytes. Got %r" % tag)
67 |   f.write(tag + '\0')  # NUL terminated
68 |
69 |
70 | def WriteResult(f, num_em_iters, pij):
71 |   WriteTag(f, 'emi')
72 |   emi = np.array([num_em_iters], np.uint32)
73 |   emi.tofile(f)
74 |
75 |   WriteTag(f, 'pij')
76 |   pij.tofile(f)
77 |
78 |
79 | def DebugSum(num_entries, entry_size, v):
80 |   """Sum the entries as a sanity check."""
81 |   cond_prob = tf.placeholder(tf.float64, shape=(num_entries * entry_size,))
82 |   debug_sum = tf.reduce_sum(cond_prob)
83 |   with tf.Session() as sess:
84 |     s = sess.run(debug_sum, feed_dict={cond_prob: v})
85 |   log('Debug sum: %f', s)
86 |
87 |
88 | def BuildEmIter(num_entries, entry_size, v):
89 |   # Placeholder for the value from the previous iteration.
90 |   pij_in = tf.placeholder(tf.float64, shape=(entry_size,))
91 |
92 |   # split along dimension 0
93 |   # TODO:
94 |   # - make sure this doesn't get run for every EM iteration
95 |   # - investigate using tf.tile() instead? (this may cost more memory)
96 |   v_split = tf.split(0, num_entries, v)
97 |
98 |   z_numerator = [report * pij_in for report in v_split]
99 |   sum_z = [tf.reduce_sum(report) for report in z_numerator]
100 |   z = [z_numerator[i] / sum_z[i] for i in xrange(num_entries)]
101 |
102 |   # Concat per-report tensors and reshape. This is probably inefficient?
103 |   z_concat = tf.concat(0, z)
104 |   z_concat = tf.reshape(z_concat, [num_entries, entry_size])
105 |
106 |   # This whole expression represents an EM iteration. Bind the pij_in
107 |   # placeholder, and get a new estimation of Pij.
108 |   em_iter_expr = tf.reduce_sum(z_concat, 0) / num_entries
109 |
110 |   return pij_in, em_iter_expr
111 |
112 |
113 | def RunEm(pij_in, entry_size, em_iter_expr, max_em_iters, epsilon=1e-6):
114 |   """Run the iterative EM algorithm (using the TensorFlow API).
115 |
116 |   Args:
117 |     pij_in: placeholder for the Pij estimate from the previous iteration
118 |     entry_size: total number of cells in each matrix
119 |     em_iter_expr: TensorFlow expression for one EM iteration (from BuildEmIter)
120 |     max_em_iters: maximum number of EM iterations
121 |
122 |   Returns:
123 |     num_em_iters, pij: iteration count and final estimate (numpy.ndarray)
124 |   """
125 |   # Initial value is the uniform distribution
126 |   pij = np.ones(entry_size) / entry_size
127 |
128 |   i = 0  # visible outside loop
129 |
130 |   # Do EM iterations.
131 |   with tf.Session() as sess:
132 |     for i in xrange(max_em_iters):
133 |       print 'PIJ', pij
134 |       new_pij = sess.run(em_iter_expr, feed_dict={pij_in: pij})
135 |       dif = max(abs(new_pij - pij))
136 |       log('EM iteration %d, dif = %e', i, dif)
137 |       pij = new_pij
138 |
139 |       if dif < epsilon:
140 |         log('Early EM termination: %e < %e', dif, epsilon)
141 |         break
142 |
143 |   # If i == 9, then we did 10 iterations.
144 |   return i + 1, pij
145 |
146 |
147 | def sep():
148 |   print '-' * 80
149 |
150 |
151 | def main(argv):
152 |   input_path = argv[1]
153 |   output_path = argv[2]
154 |   max_em_iters = int(argv[3])
155 |
156 |   sep()
157 |   with open(input_path) as f:
158 |     num_entries, entry_size, cond_prob = ReadListOfMatrices(f)
159 |
160 |   sep()
161 |   DebugSum(num_entries, entry_size, cond_prob)
162 |
163 |   sep()
164 |   pij_in, em_iter_expr = BuildEmIter(num_entries, entry_size, cond_prob)
165 |   num_em_iters, pij = RunEm(pij_in, entry_size, em_iter_expr, max_em_iters)
166 |
167 |   sep()
168 |   log('Final Pij: %s', pij)
169 |
170 |   with open(output_path, 'wb') as f:
171 |     WriteResult(f, num_em_iters, pij)
172 |   log('Wrote %s', output_path)
173 |
174 |
175 | if __name__ == '__main__':
176 |   try:
177 |     main(sys.argv)
178 |   except RuntimeError, e:
179 |     print >>sys.stderr, 'FATAL: %s' % e
180 |     sys.exit(1)
181 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | RAPPOR
2 | ======
3 |
4 | RAPPOR is a novel privacy technology that allows inferring statistics about
5 | populations while preserving the privacy of individual users.
6 |
7 | This repository contains simulation and analysis code in Python and R.
8 |
9 | For a detailed description of the algorithms, see the
10 | [paper](http://arxiv.org/abs/1407.6981) and links below.
11 |
12 | Feel free to send feedback to
13 | [rappor-discuss@googlegroups.com][group].
14 |
15 | Running the Demo
16 | ----------------
17 |
18 | Although the Python and R libraries should be portable to any platform, our
19 | end-to-end demo has only been tested on Linux.
20 |
21 | If you don't have a Linux box handy, you can [view the generated
22 | output](http://google.github.io/rappor/examples/report.html).
23 |
24 | To set up your environment, install the required packages and R dependencies with the setup script:
25 |     $ ./setup.sh
26 | Then build the native components:
27 |     $ ./build.sh
28 | This compiles and tests the `fastrand` C extension module for Python, which
29 | speeds up the simulation.
30 |
31 | Finally, run the demo:
32 |     $ ./demo.sh
33 |
34 | The demo strings together the Python and R code. It:
35 |
36 | 1. Generates simulated input data with different distributions
37 | 2. Runs it through the RAPPOR privacy-preserving reporting mechanisms
38 | 3. Analyzes and plots the aggregated reports against the true input
39 |
40 | The output is written to `_tmp/regtest/results.html`, and can be opened with a
41 | browser.
42 |
43 | Dependencies
44 | ------------
45 |
46 | [R](http://r-project.org) analysis (`analysis/R`):
47 |
48 | - [glmnet](http://cran.r-project.org/web/packages/glmnet/index.html)
49 | - [limSolve](https://cran.r-project.org/web/packages/limSolve/index.html)
50 |
51 | Demo dependencies (`demo.sh`):
52 |
53 | These are necessary if you want to test changes to the code.
54 |
55 | - R libraries
56 |   - [ggplot2](http://cran.r-project.org/web/packages/ggplot2/index.html)
57 |   - [optparse](http://cran.r-project.org/web/packages/optparse/index.html)
58 | - bash shell / coreutils: to run tests
59 |
60 | Python client (`client/python`):
61 |
62 | - None. You should be able to just import the `rappor.py` file.
63 |
64 | Platform:
65 |
66 | - R: tested on R 3.0.
67 | - Python: tested on Python 2.7.
68 | - OS: the shell script tests have been tested on Linux, but may work on
69 |   Mac/Cygwin. The R and Python code should work on any OS.
70 |
71 | Development
72 | -----------
73 |
74 | To run tests:
75 |
76 |     $ ./test.sh
77 |
78 | This currently runs Python unit tests, lints Python source files, and runs R
79 | unit tests.
80 |
81 | API
82 | ---
83 |
84 | `rappor.py` is a tiny standalone Python file, and you can easily copy it into a
85 | Python program.
86 |
87 | NOTE: Its interface is subject to change. We are in the demo stage now, but if
88 | there's demand, we will document and publish the interface.
89 |
90 | The R interface is also subject to change.
91 |
92 |
93 |
94 | The `fastrand` C module is optional. It's likely only useful for simulation of
95 | thousands of clients. It doesn't use cryptographically strong randomness, and
96 | thus should **not** be used in production.
97 |
98 | Directory Structure
99 | -------------------
100 |
101 |     analysis/
102 |       R/                 # R code for analysis
103 |       cpp/               # Fast reimplementations of certain analysis
104 |                          #   algorithms
105 |     apps/                # Web apps to help you use RAPPOR (using Shiny)
106 |     bin/                 # Command line tools for analysis.
107 |     client/              # Client libraries
108 |       python/            # Python client library
109 |         rappor.py
110 |         ...
111 |       cpp/               # C++ client library
112 |         encoder.cc
113 |         ...
114 |     doc/                 # Documentation
115 |     tests/               # Tools for regression tests
116 |       compare_dist.R     # Test helper for single variable analysis
117 |       gen_true_values.R  # Generate test input
118 |       make_summary.py    # Generate an HTML report for the regtest
119 |       rappor_sim.py      # RAPPOR client simulation
120 |       regtest_spec.py    # Specification of test cases
121 |       ...
122 |     build.sh             # Build scripts (docs, C extension, etc.)
123 |     demo.sh              # Quick demonstration
124 |     docs.sh              # Generate docs from the markdown in doc/
125 |     gh-pages/            # Where generated docs go. (A subtree of the branch gh-pages)
126 |     pipeline/            # Analysis pipeline code.
127 |     regtest.sh           # End-to-end regression tests, including client
128 |                          #   libraries and analysis
129 |     setup.sh             # Install dependencies (for Linux)
130 |     test.sh              # Test runner
131 |
132 | Documentation
133 | -------------
134 |
135 | - [RAPPOR Data Flow](http://google.github.io/rappor/doc/data-flow.html)
136 |
137 | Publications
138 | ------------
139 |
140 | - [RAPPOR: Randomized Aggregatable Privacy-Preserving Ordinal Response](http://arxiv.org/abs/1407.6981)
141 | - [Building a RAPPOR with the Unknown: Privacy-Preserving Learning of Associations and Data Dictionaries](http://arxiv.org/abs/1503.01214)
142 |
143 | Links
144 | -----
145 |
146 | - [Google Blog Post about RAPPOR](http://googleresearch.blogspot.com/2014/10/learning-statistics-with-privacy-aided.html)
147 | - [RAPPOR implementation in Chrome](http://www.chromium.org/developers/design-documents/rappor)
148 |   - This is a production quality C++ implementation, but it's somewhat tied to
149 |     Chrome, and doesn't support all privacy parameters (e.g. only a few values
150 |     of p and q). On the other hand, the code in this repo is not yet
151 |     production quality, but supports experimentation with different parameters
152 |     and data sets. Of course, anyone is free to implement RAPPOR independently
153 |     as well.
154 | - Mailing list: [rappor-discuss@googlegroups.com][group]
155 |
156 | [group]: https://groups.google.com/forum/#!forum/rappor-discuss
157 |
--------------------------------------------------------------------------------
/client/cpp/openssl_hash_impl_unittest.cc:
--------------------------------------------------------------------------------
1 | #include <gtest/gtest.h>
2 |
3 | #include "openssl_hash_impl.h"
4 |
5 |
6 | TEST(OpensslHashImplTest, Md5) {
7 |   std::vector<uint8_t> output;
8 |   rappor::Md5("test", &output);
9 |   static const uint8_t ex[] = {
10 |     0x09, 0x8f, 0x6b, 0xcd, 0x46, 0x21, 0xd3, 0x73,
11 |     0xca, 0xde, 0x4e, 0x83, 0x26, 0x27, 0xb4, 0xf6
12 |   };
13 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
14 |   ASSERT_EQ(expected, output);
15 | }
16 |
17 | TEST(OpensslHashImplTest, HmacSha256) {
18 |   std::vector<uint8_t> output;
19 |   rappor::HmacSha256("key", "value", &output);
20 |   static const uint8_t ex[] = {
21 |     0x90, 0xfb, 0xfc, 0xf1, 0x5e, 0x74, 0xa3, 0x6b,
22 |     0x89, 0xdb, 0xdb, 0x2a, 0x72, 0x1d, 0x9a, 0xec,
23 |     0xff, 0xdf, 0xdd, 0xdc, 0x5c, 0x83, 0xe2, 0x7f,
24 |     0x75, 0x92, 0x59, 0x4f, 0x71, 0x93, 0x24, 0x81, };
25 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
26 |   ASSERT_EQ(expected, output);
27 |
28 |   // Make sure nulls are handled properly.
29 |   //
30 |   // An empty value with key "key"
31 |   // $ echo -n -e "" | openssl dgst -hmac "key" -sha256 -binary | xxd
32 |   // 00000000: 5d5d 1395 63c9 5b59 67b9 bd9a 8c9b 233a  ]]..c.[Yg.....#:
33 |   // 00000010: 9ded b450 7279 4cd2 32dc 1b74 8326 07d0  ...PryL.2..t.&..
34 |   rappor::HmacSha256("key", "", &output);
35 |   static const uint8_t exempty[] = {
36 |     0x5d, 0x5d, 0x13, 0x95, 0x63, 0xc9, 0x5b, 0x59,
37 |     0x67, 0xb9, 0xbd, 0x9a, 0x8c, 0x9b, 0x23, 0x3a,
38 |     0x9d, 0xed, 0xb4, 0x50, 0x72, 0x79, 0x4c, 0xd2,
39 |     0x32, 0xdc, 0x1b, 0x74, 0x83, 0x26, 0x07, 0xd0
40 |   };
41 |   std::vector<uint8_t> expected_empty(exempty, exempty + sizeof(exempty));
42 |   ASSERT_EQ(expected_empty, output);
43 |
44 |   // A single null value with key "key"
45 |   // $ echo -n -e "\x00" | openssl dgst -hmac "key" -sha256 -binary | xxd
46 |   // 00000000: 8a8d fb96 56dc cf21 b7ea 5269 1124 3b75  ....V..!..Ri.$;u
47 |   // 00000010: 68f4 3281 5f1c d43a 4277 1f2d b4aa a525  h.2._..:Bw.-...%
48 |   rappor::HmacSha256("key", std::string("\0", 1), &output);
49 |   static const uint8_t exnull[] = {
50 |     0x8a, 0x8d, 0xfb, 0x96, 0x56, 0xdc, 0xcf, 0x21,
51 |     0xb7, 0xea, 0x52, 0x69, 0x11, 0x24, 0x3b, 0x75,
52 |     0x68, 0xf4, 0x32, 0x81, 0x5f, 0x1c, 0xd4, 0x3a,
53 |     0x42, 0x77, 0x1f, 0x2d, 0xb4, 0xaa, 0xa5, 0x25
54 |   };
55 |   std::vector<uint8_t> expected_null(exnull, exnull + sizeof(exnull));
56 |   ASSERT_EQ(expected_null, output);
57 |
58 |   // A null value with something after it, with key "key"
59 |   // $ echo -n -e "\x00a" | openssl dgst -hmac "key" -sha256 -binary | xxd
60 |   // 00000000: 5787 df47 c2c4 8664 5a6a f898 44c3 4636  W..G...dZj..D.F6
61 |   // 00000010: fc5b b78b 1b87 29a0 6ca8 7556 7b75 c05a  .[....).l.uV{u.Z
62 |   rappor::HmacSha256("key", std::string("\0a", 2), &output);
63 |   static const uint8_t exnulltrail[] = {
64 |     0x57, 0x87, 0xdf, 0x47, 0xc2, 0xc4, 0x86, 0x64,
65 |     0x5a, 0x6a, 0xf8, 0x98, 0x44, 0xc3, 0x46, 0x36,
66 |     0xfc, 0x5b, 0xb7, 0x8b, 0x1b, 0x87, 0x29, 0xa0,
67 |     0x6c, 0xa8, 0x75, 0x56, 0x7b, 0x75, 0xc0, 0x5a
68 |   };
69 |   std::vector<uint8_t> expected_null_trailing(
70 |       exnulltrail, exnulltrail + sizeof(exnulltrail));
71 |   ASSERT_EQ(expected_null_trailing, output);
72 |   std::string s = std::string("\0a", 2);
73 |   rappor::HmacSha256("key", s, &output);
74 |   ASSERT_EQ(expected_null_trailing, output);
75 | }
76 |
77 | TEST(OpensslHashImplTest, HmacDrbgNist) {
78 |   std::vector<uint8_t> output;
79 |   // Expected output for NIST tests.
80 |   static const uint8_t exnist[] = {
81 |     0xD6, 0x7B, 0x8C, 0x17, 0x34, 0xF4, 0x6F, 0xA3,
82 |     0xF7, 0x63, 0xCF, 0x57, 0xC6, 0xF9, 0xF4, 0xF2,
83 |     0xDC, 0x10, 0x89, 0xBD, 0x8B, 0xC1, 0xF6, 0xF0,
84 |     0x23, 0x95, 0x0B, 0xFC, 0x56, 0x17, 0x63, 0x52,
85 |     0x08, 0xC8, 0x50, 0x12, 0x38, 0xAD, 0x7A, 0x44,
86 |     0x00, 0xDE, 0xFE, 0xE4, 0x6C, 0x64, 0x0B, 0x61,
87 |     0xAF, 0x77, 0xC2, 0xD1, 0xA3, 0xBF, 0xAA, 0x90,
88 |     0xED, 0xE5, 0xD2, 0x07, 0x40, 0x6E, 0x54, 0x03
89 |   };
90 |   std::vector<uint8_t> expected_nist(
91 |       exnist, exnist + sizeof(exnist));
92 |
93 |   // NIST test data, from
94 |   // http://csrc.nist.gov/groups/ST/toolkit/documents/Examples/HMAC_DRBG.pdf
95 |   // p.148, requested security strength 128, Requested hash algorithm SHA-256
96 |   output.resize(64);
97 |   rappor::HmacDrbg(
98 |       std::string(
99 |           "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
100 |           "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
101 |           "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
102 |           "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27"
103 |           "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
104 |           "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
105 |           "\x25\x26\x27", 63),  // provided_data
106 |       "", &output);
107 |   ASSERT_EQ(expected_nist, output);
108 |
109 |   // Since in our use case we concatenate the key and value
110 |   // to produce the provided_data portion of the DRBG, let's
111 |   // split the above key into key|value as an additional
112 |   // test case.
113 |   output.resize(64);
114 |   rappor::HmacDrbg(
115 |       std::string(
116 |           "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09"
117 |           "\x0A\x0B\x0C\x0D\x0E\x0F\x10\x11\x12\x13"
118 |           "\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D"
119 |           "\x1E\x1F\x20\x21\x22\x23\x24\x25\x26\x27", 40),
120 |       std::string(
121 |           "\x28\x29\x2A\x2B\x2C\x2D\x2E\x2F\x30\x31"
122 |           "\x32\x33\x34\x35\x36\x20\x21\x22\x23\x24"
123 |           "\x25\x26\x27", 23),  // provided_data
124 |       &output);
125 |   ASSERT_EQ(expected_nist, output);
126 | }
127 |
128 | TEST(OpensslHashImplTest, HmacDrbgTextStrings) {
129 |   std::vector<uint8_t> output;
130 |   output.resize(30);
131 |   rappor::HmacDrbg("key", "value", &output);  // Truncated to 30 bytes.
132 |   static const uint8_t ex[] = {
133 |     0x89, 0xD7, 0x1B, 0xB8, 0xA3, 0x7D, 0x80, 0xC2,
134 |     0x6E, 0x63, 0x9C, 0xBD, 0x68, 0xF3, 0x60, 0x7A,
135 |     0xA9, 0x4D, 0xEE, 0xF4, 0x25, 0xA7, 0xAF, 0xBB,
136 |     0xF8, 0xD0, 0x09, 0x92, 0xAF, 0x92
137 |   };
138 |   std::vector<uint8_t> expected(ex, ex + sizeof(ex));
139 |   ASSERT_EQ(expected, output);
140 | }
141 |
142 | int main(int argc, char **argv) {
143 |   ::testing::InitGoogleTest(&argc, argv);
144 |   return RUN_ALL_TESTS();
145 | }
146 |
--------------------------------------------------------------------------------
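As a cross-check of the HmacSha256 test vector above, the same bytes can be
reproduced outside the C++ code with Python's standard hmac module (a minimal
sketch in the repo's Python 2 style; not part of the repo itself):

    import hashlib
    import hmac

    # HMAC-SHA256("key", "value"); the hex should start 90fbfcf15e74a36b...,
    # matching the 'ex' array in the HmacSha256 test.
    digest = hmac.new('key', 'value', hashlib.sha256).digest()
    print digest.encode('hex')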
/tests/regtest.html (table-header fragment):
--------------------------------------------------------------------------------
Column groups: Test Case | Input Params | RAPPOR Params | Map Params | Result Metrics

Legend:
  d: distribution drawn from
  u: total unique values
  c: clients
  v: values per client
  k: report bits
  h: hashes
  m: cohorts
  p, q, f: probabilities
  +: num additional candidates
  -: regex for true values removed
  a: actual values
  r: values RAPPOR detected
  fp: false positive rate
  fn: false negative rate
  tv: total variation distance
  am: allocated mass
  time: time in seconds

Per-test columns: d, u, c, v, k, h, m, p, q, f, +, -, a, r, fp, fn, tv, am, time
--------------------------------------------------------------------------------
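For reference, the "tv" metric in the legend above is the total variation
distance between the actual and RAPPOR-estimated distributions: half the L1
distance between the two probability vectors. An illustrative sketch (Python 2
style; not the repo's implementation):

    def total_variation(p, q):
      # p and q are aligned probability vectors that each sum to 1.
      return 0.5 * sum(abs(a - b) for a, b in zip(p, q))

    print total_variation([0.5, 0.3, 0.2], [0.45, 0.35, 0.2])  # 0.05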