├── dataset_tests └── src │ ├── __init__.py │ ├── drugsens_code │ ├── __init__.py │ ├── .gitignore │ ├── run_tensor_tests.py │ ├── clippingomega.py │ ├── tensorresults.py │ ├── plot_tensor_results.py │ ├── tensor.py │ └── diffpri.py │ ├── .gitignore │ ├── data_reader.py │ ├── setup_handler.py │ ├── linear_regression_master.py │ ├── pos_def_matrices.py │ ├── README.md │ ├── calculate_pred_errors.py │ ├── estimate_vars.py │ ├── UCI_data_getter.py │ ├── sufficient_stats.py │ ├── combine_pred_errors.py │ ├── suff_stats_master.py │ └── eps_data_test.py ├── probic-decrypt-server ├── project │ ├── build.properties │ └── plugins.sbt ├── gen10.sh ├── test-scripts │ ├── test-client.sh │ ├── test-data-writer.sh │ └── test-data-server.sh ├── run-scripts │ ├── gen-testdata-10.sh │ ├── gen-test-data-matrix-given.sh │ ├── start-servers.sh │ └── start-servers-eps.sh ├── gen-keys.sh ├── build.sbt ├── .gitignore ├── src │ └── main │ │ └── scala │ │ └── fi │ │ └── helsinki │ │ └── cs │ │ └── probic │ │ ├── test │ │ ├── TestClient.scala │ │ ├── TestDataWriter.scala │ │ └── TestDataServer.scala │ │ ├── crypto │ │ └── PkCrypto.scala │ │ ├── data │ │ └── GenerateTestDataMatrix.scala │ │ └── server │ │ └── Server.scala └── README.md ├── spark-streaming-aggregator ├── project │ ├── build.properties │ └── plugins.sbt ├── results │ ├── 100-100-5.started.txt │ ├── result-10-100-10.txt │ ├── result-10-100-5.txt │ ├── result-10-1000-10.txt │ ├── result-10-1000-5.txt │ ├── result-100-100-10.txt │ ├── result-100-100-5.txt │ ├── result-100-1000-10.txt │ ├── result-100-1000-5.txt │ ├── result-1000-100-10.txt │ └── restable.sh ├── README.md ├── result-10-100-5.txt ├── result-100-100-5.txt ├── result-100-100-1s-notimeout-newrsa-5.txt ├── getresults.sh ├── run-spark-aggregator.sh ├── run-spark-aggregator-eps-data.sh ├── run-spark.sh ├── .gitignore ├── build.sbt ├── results-agg2.txt ├── src │ └── main │ │ └── scala │ │ └── fi │ │ └── helsinki │ │ └── cs │ │ ├── nodes │ │ └── util │ │ │ └── Spark2Main.scala │ │ └── probic │ │ └── streaming │ │ └── SparkDataAggregator.scala ├── results-agg-rerun.txt ├── results-agg4-serverupdated.txt └── results-agg4.txt ├── README.md ├── LICENSE └── .gitignore /dataset_tests/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /probic-decrypt-server/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.0.3 2 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.0.3 2 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.png 3 | *.sh 4 | tmp/* 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/100-100-5.started.txt: -------------------------------------------------------------------------------- 1 | 2017-05-12 19:33:09.422899064+03:00 2 | 
-------------------------------------------------------------------------------- /spark-streaming-aggregator/README.md: -------------------------------------------------------------------------------- 1 | Please see the readme file in the probic-decrypt-server folder. 2 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-10-100-5.txt: -------------------------------------------------------------------------------- 1 | 1494601455.941897498 2 | 1494601642.811982119 3 | 186.87 s 4 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-100-100-5.txt: -------------------------------------------------------------------------------- 1 | 1494601958.178226584 2 | 1494602312.466400278 3 | 354.288 s 4 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-100-100-1s-notimeout-newrsa-5.txt: -------------------------------------------------------------------------------- 1 | 1494604731.460029461 2 | 1494606406.548135978 3 | 1675.09 s 4 | -------------------------------------------------------------------------------- /probic-decrypt-server/gen10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./gen-keys.sh 4 | 5 | for i in $( seq 1 10 ) 6 | do 7 | genkey "$i" 8 | done 9 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494658315.655119783 3 | End ms: 1494658422159 4 | 106.504 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-100-5.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494657542.603279687 3 | End ms: 1494657642661 4 | 100.058 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-1000-10.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494636005.148012318 3 | End ms: 1494636528633 4 | 523.485 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-1000-5.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494637919.631427449 3 | End ms: 1494638421267 4 | 501.636 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494608097.867109462 3 | End ms: 1494609063387 4 | 965.52 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-100-5.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494606789.422899064 3 | End ms: 1494607721492 4 | 932.069 s 5 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-client.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | java -cp target/scala-2.11/probic-server.jar \ 4 | fi.helsinki.cs.probic.test.TestClient 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-1000-10.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494819075.667365251 3 | End ms: 1494820536327 4 | 1460.66 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-1000-5.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494639257.208596029 3 | End ms: 1494644109327 4 | 4852.12 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-1000-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 1000 2 | Start seconds: 1494609175.899942210 3 | End ms: 1494618765997 4 | 9590.1 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/getresults.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | awk 'NR % 2 == 1 {t=$0} NR % 2 == 0 {a[t]+=$(NF-1); c[t]+=1 } END{for (t in a) { print t, a[t]/c[t]/1000}}' $1 | sort -n 3 | -------------------------------------------------------------------------------- /probic-decrypt-server/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.3") 2 | 3 | //addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.3") 2 | 3 | //addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 5 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/gen-testdata-10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ns="100 1000 10000 100000" 3 | ds="10 100 1000 10000" 4 | for N in $ns 5 | do 6 | for d in $ds 7 | do 8 | run-scripts/gen-test-data-matrix-given.sh $d $N 9 --zip 9 | done 10 | done 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dca-nips2017 2 | Differentially private learning on distributed data (NIPS 2017) 3 | 4 | ## Instructions 5 | 6 | Please see the [readme at the `probic-decrypt-server` folder](probic-decrypt-server) and the [readme at the `dataset_tests/src` folder](dataset_tests/src). 
7 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-data-writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp target/scala-2.11/probic-server.jar \ 3 | fi.helsinki.cs.probic.data.TestDataWriter \ 4 | --certs probic-1,probic-2,probic-3,probic-4,probic-5 \ 5 | --masters localhost:8080,localhost:8081,localhost:8082,localhost:8083,localhost:8084 \ 6 | --clients 10 \ 7 | --input test-data-matrix.csv \ 8 | --output test-data-matrix-crypt.csv 9 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark-aggregator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | res=$1 3 | shift 4 | 5 | rm -rf temp 6 | 7 | class=fi.helsinki.cs.probic.streaming.SparkDataAggregator 8 | ./run-spark.sh $class \ 9 | --input file://$PWD/../probic-decrypt-server/test-data-matrix \ 10 | --output file://$PWD/temp/sum-data-matrix \ 11 | --noise 9 $* 12 | echo "9 $*" >> $res 13 | tail -n 1 ${class}.log >> $res 14 | -------------------------------------------------------------------------------- /dataset_tests/src/.gitignore: -------------------------------------------------------------------------------- 1 | onlineldavb* 2 | tmp/* 3 | bump_check_tmp/* 4 | bump_test.py 5 | data/* 6 | plots/* 7 | pert_data/* 8 | .DS_Store 9 | __py_cache__* 10 | user_list* 11 | res/* 12 | *.cpp 13 | *.h 14 | *.o 15 | client 16 | compute 17 | server 18 | Make* 19 | sample_user_list 20 | enc/* 21 | profiler_dump/* 22 | total_*.txt 23 | test_results/* 24 | *.pickle 25 | scratch/* 26 | *.py_bck 27 | 28 | 29 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/gen-test-data-matrix-given.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$3" ]; then echo "Usage: $0 d N noise"; exit 1; fi 4 | 5 | d=$1 6 | shift 7 | N=$1 8 | shift 9 | noise=$1 10 | shift 11 | 12 | java -Xmx1500g -cp target/scala-2.11/probic-server.jar \ 13 | fi.helsinki.cs.probic.data.GenerateTestDataMatrix \ 14 | --dimension $d \ 15 | --clients $N \ 16 | --noise $noise \ 17 | --output test-data-matrix-$d-$N-$noise.csv \ 18 | $* 19 | 20 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark-aggregator-eps-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | res=results-eps-abalone-3000.txt 3 | shift 4 | 5 | rm -rf temp 6 | 7 | fun(){ 8 | class=fi.helsinki.cs.probic.streaming.SparkDataAggregator 9 | ./run-spark.sh $class \ 10 | --input file://$PWD/../dataset_tests/src/sparkfile.txt \ 11 | --output file://$PWD/temp/eps-sum-data \ 12 | --noise 9 $* 13 | echo "9 $*" >> $res 14 | tail -n 1 ${class}.log >> $res 15 | } 16 | 17 | fun --clients 3000 --d 8 --useDouble 18 | 19 | -------------------------------------------------------------------------------- /dataset_tests/src/data_reader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Function for reading data from a given file and returning it as a list. 
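Illustrative usage (the path below is only an example; see UCI_data_getter.py
for the paths actually used in the tests):

    import data_reader
    rows = data_reader.read_data('data/abalone/abalone.data')
    # rows is a list of rows, each row a list of string fields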
8 | ''' 9 | 10 | import numpy as np 11 | import csv 12 | 13 | def read_data(filename): 14 | with open(filename,newline='',encoding='utf-8') as f: 15 | reader = csv.reader(f, delimiter=',') 16 | data = list() 17 | for row in reader: 18 | data.append(row) 19 | return data 20 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/start-servers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | startserver(){ 4 | echo "Starting probic-server: $*" 5 | #xterm -e "java -jar target/scala-2.11/probic-server.jar $*" & 6 | screen -d -m -S probic-server -- nice -n 20 java -Xmx10g -jar target/scala-2.11/probic-server.jar $* & 7 | } 8 | 9 | if [ -z "$1" ]; then c=5; else c=$1; fi 10 | if [ -z "$2" ]; then msg=100; else msg=$2; fi 11 | 12 | for i in $( seq 1 $c ) 13 | do 14 | let p=8080+$i 15 | let p-- 16 | startserver --port $p --cert probic-$i --messages $msg 17 | done 18 | 19 | -------------------------------------------------------------------------------- /dataset_tests/src/setup_handler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Setup script handler: pickle setup parameters & read them back. 8 | ''' 9 | 10 | import numpy as np 11 | import pickle 12 | 13 | def get_setup(saved_setup): 14 | with open(saved_setup + '.pickle', 'rb') as f: 15 | apu = pickle.load(f) 16 | return apu 17 | 18 | def write_setup(saved_setup, pars): 19 | with open(saved_setup + '.pickle', 'wb') as f: 20 | pickle.dump(pars, f, pickle.HIGHEST_PROTOCOL) 21 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/start-servers-eps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | startserver(){ 4 | echo "Starting probic-server: $*" 5 | #xterm -e "java -jar target/scala-2.11/probic-server.jar $*" & 6 | screen -d -m -S probic-server -- nice -n 20 java -Xmx10g -jar target/scala-2.11/probic-server.jar $* & 7 | } 8 | 9 | if [ -z "$1" ]; then c=10; else c=$1; fi 10 | if [ -z "$2" ]; then msg=3000; else msg=$2; fi 11 | 12 | for i in $( seq 1 $c ) 13 | do 14 | let p=8080+$i 15 | let p-- 16 | startserver --port $p --cert probic-$i --messages $msg --useDouble 17 | done 18 | 19 | -------------------------------------------------------------------------------- /probic-decrypt-server/gen-keys.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! 
-f "secret.txt" ]; then echo "Please create secret.txt with a single line containing the desired private key password."; exit 1; fi 4 | # Store passwords in a separate file 5 | pass=$( cat secret.txt ) 6 | 7 | res=$PWD 8 | ks=$res/keystore.jks 9 | 10 | genkey(){ 11 | str="PROBIC-${1}\nDepartment of Computer Science\nUniversity of Helsinki\nHelsinki\nUusimaa\nFI\nyes" 12 | echo -e $str | keytool -genkey -alias "probic-${1}" -keyalg RSA -keystore $ks -keysize 4096 -storepass $pass -keypass $pass -validity 360 13 | } 14 | 15 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK=$HOME/work/spark-2.1.0-bin-hadoop2.7 4 | 5 | if [ -n "$1" ] 6 | then 7 | class=$1 8 | shift 9 | else 10 | echo "Usage: $0 classname [args...]" 11 | exit 1 12 | fi 13 | 14 | export SPARK_LOCAL_IP="127.0.0.1" 15 | export SPARK_LOCAL_DIRS="/run/user/$( id -u $USER )/spark" 16 | echo SPARK_LOCAL_DIRS=$SPARK_LOCAL_DIRS 17 | echo "args: $class $*" 18 | 19 | $SPARK/bin/spark-submit --driver-memory 1500g --master "local[45]" \ 20 | --class $class $PWD/target/scala-2.11/probic-streaming-aggregator.jar $* 1> "${class}.log" 2> "${class}.err" 21 | 22 | -------------------------------------------------------------------------------- /probic-decrypt-server/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Probic Private Data Aggregation Node" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | libraryDependencies += "fi.helsinki.cs.nodes" % "getopt-scala" % "1.1.0" 8 | 9 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0" 10 | 11 | // https://mvnrepository.com/artifact/commons-codec/commons-codec 12 | libraryDependencies += "commons-codec" % "commons-codec" % "1.10" 13 | 14 | libraryDependencies += "org.slf4j" % "slf4j-simple" % "1.7.25" 15 | 16 | mainClass in assembly := Some("fi.helsinki.cs.probic.server.Server") 17 | 18 | assemblyJarName in assembly := "probic-server.jar" 19 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-data-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | genargs() { 4 | certs="" 5 | masters="" 6 | for i in $( seq 1 $1 ) 7 | do 8 | let p=8080+$i 9 | let p-=1 10 | if [ -z "$certs" ]; then 11 | certs="probic-$i" 12 | masters="localhost:$p" 13 | else 14 | certs="$certs,probic-$i" 15 | masters="$masters,localhost:$p" 16 | fi 17 | done 18 | } 19 | 20 | if [ -n "$1" ] 21 | then 22 | genargs $1 23 | shift 24 | else 25 | genargs 5 26 | fi 27 | 28 | java -cp target/scala-2.11/probic-server.jar \ 29 | fi.helsinki.cs.probic.test.TestDataServer \ 30 | --certs ${certs} \ 31 | --masters ${masters} \ 32 | $* 33 | 34 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/restable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ns="100 1000 10000 100000" 3 | ds="10 100 1000 10000" 4 | 5 | mktable(){ 6 | echo '\begin{table}' 7 | echo '\begin{tabular}[]{c c c c c}' 8 | echo "M=$M & N=100 & N=1000 & N=10000 & N=100000 \\\\" 9 | for d in $ds 10 | do 11 | row="d=$d" 12 | for N in $ns 13 | do 14 | f=result-$d-$N-$M.txt 15 | if [ ! 
-f $f ]; then cell=NA; else 16 | cell=$( tail -n 1 $f | awk '{print $1}' ) 17 | fi 18 | if [ -z "$row" ]; then row=$cell; else row="$row & $cell"; fi 19 | done 20 | echo "$row \\\\" 21 | done 22 | echo '\end{tabular}' 23 | echo '\end{table}' 24 | } 25 | 26 | M=5 27 | mktable 28 | echo "" 29 | 30 | M=10 31 | mktable 32 | 33 | -------------------------------------------------------------------------------- /probic-decrypt-server/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | bin 4 | *.class 5 | db 6 | dist 7 | dist/* 8 | eclipse 9 | *.eml 10 | *.iml 11 | # except for .gitignore 12 | # Extracted from https://github.com/ulrich/macaron-factory/blob/master/.gitignore 13 | # Ignore all dotfiles... 14 | # Ignore Play! working directory # 15 | lib 16 | lib_managed/ 17 | log 18 | *.log 19 | logs 20 | modules 21 | /out 22 | precompiled 23 | project/boot/ 24 | project/plugins/project/ 25 | project/project 26 | /project/*-shim.sbt 27 | project/target 28 | # sbt specific 29 | # Scala-IDE specific 30 | server.pid 31 | src_managed/ 32 | target 33 | test-result 34 | tmp 35 | .history 36 | dist 37 | /.idea 38 | /*.iml 39 | /out 40 | /.idea_modules 41 | /.classpath 42 | /.project 43 | /.settings 44 | /bin/ 45 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | bin 4 | *.class 5 | db 6 | dist 7 | dist/* 8 | eclipse 9 | *.eml 10 | *.iml 11 | # except for .gitignore 12 | # Extracted from https://github.com/ulrich/macaron-factory/blob/master/.gitignore 13 | # Ignore all dotfiles... 14 | # Ignore Play! working directory # 15 | lib 16 | lib_managed/ 17 | log 18 | *.log 19 | logs 20 | modules 21 | /out 22 | precompiled 23 | project/boot/ 24 | project/plugins/project/ 25 | project/project 26 | /project/*-shim.sbt 27 | project/target 28 | # sbt specific 29 | # Scala-IDE specific 30 | server.pid 31 | src_managed/ 32 | target 33 | test-result 34 | tmp 35 | temp/* 36 | .history 37 | dist 38 | /.idea 39 | /*.iml 40 | /out 41 | /.idea_modules 42 | /.classpath 43 | /.project 44 | /.settings 45 | /bin/ 46 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Probic Spark Streaming Private Data Aggregator" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.1" % "provided" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.1.1" % "provided" 10 | 11 | libraryDependencies += "fi.helsinki.cs.nodes" % "getopt-scala" % "1.1.0" 12 | 13 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0" 14 | 15 | // https://mvnrepository.com/artifact/commons-codec/commons-codec 16 | libraryDependencies += "commons-codec" % "commons-codec" % "1.10" 17 | 18 | libraryDependencies += "org.slf4j" % "slf4j-simple" % "1.7.25" 19 | 20 | assemblyJarName in assembly := "probic-streaming-aggregator.jar" 21 | 22 | mainClass in assembly := Some("fi.helsinki.cs.probic.streaming.Aggregator") 23 | 24 | -------------------------------------------------------------------------------- /dataset_tests/src/linear_regression_master.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for calculating Bayesian linear regression from sufficient stats 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | 13 | def get_regression_est(suff_stats, pars): 14 | #assume suff_stats is a dictionary containing suff stats as [X'X, X'y] 15 | #return dict with [prec, mean] 16 | 17 | #prior precisions 18 | l = 1 19 | l0 = 1 20 | 21 | palautettava = {} 22 | for k_stats in suff_stats.keys(): 23 | apu = {} 24 | try: 25 | apu['prec'] = l*(suff_stats[k_stats][0]) + l0*np.identity(pars['dim']) 26 | apu['mean'] = np.linalg.solve(apu['prec'],l*(suff_stats[k_stats][1])) 27 | except: 28 | apu['prec'] = None 29 | apu['mean'] = None 30 | palautettava[k_stats] = apu 31 | return palautettava -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/run_tensor_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | GDSC/drug sensitivity data 6 | 7 | Script for running tensor.py for a collection of drugs and CVs. 8 | 9 | clippingomega.py should be run before this. 10 | 11 | Run: python3 run_tensor_tests.py 12 | ''' 13 | 14 | import subprocess 15 | import sys 16 | 17 | import numpy as np 18 | 19 | n_drugs = 264 # 264 in the paper 20 | n_cv = 25 # 25 in the paper 21 | drugs_to_run = np.linspace(0,n_drugs,n_drugs+1,dtype='int') 22 | seeds_for_cv = np.linspace(0,n_cv,n_cv+1,dtype='int') 23 | 24 | #args to tensor.py: drug_id, seed 25 | for drug in drugs_to_run: 26 | print('Starting drug ' + str(drug)) 27 | for seed in seeds_for_cv: 28 | testi = subprocess.run(args=['python','tensor.py',str(drug),str(seed)], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | print('stdout:\n' + testi.stdout.decode('utf-8')) 30 | print('stderr:\n' + testi.stderr.decode('utf-8')) 31 | 32 | print('All tensor tests done!') -------------------------------------------------------------------------------- /dataset_tests/src/pos_def_matrices.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for checking & fixing matrix positive definiteness. Works by eigendecomposing, and re-composing with absolute values of the original eigenvalues. 
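Illustrative NumPy sketch of the correction applied below (a toy 2x2 example,
not part of the module API):

    import numpy as np
    A = np.array([[1., 0.], [0., -2.]])   # not positive definite
    D, V = np.linalg.eig(A)
    A_fixed = np.dot(np.dot(V, np.diag(np.absolute(D))), np.linalg.inv(V))
    # A_fixed == [[1., 0.], [0., 2.]], i.e. V diag(|D|) V^-1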
8 | ''' 9 | 10 | import numpy as np 11 | 12 | def check(suff_stats, pars): 13 | 14 | if pars['enforce_pos_def'] == False: 15 | #simply flag non-pos.def matrices, no correction 16 | if pars['feedback'] > 0: 17 | for m in suff_stats: 18 | D, V = np.linalg.eig(suff_stats[m][0]) 19 | if np.sum(D < 0) > 0: 20 | print('Non-positive definite Cov matrix for {}'.format(m)) 21 | return suff_stats 22 | 23 | else: 24 | #eigendecompose, set eigenvalues to their absolute values & multiply back 25 | for m in suff_stats: 26 | apu = suff_stats[m][0] 27 | D, V = np.linalg.eig(apu) 28 | D = np.absolute(D) 29 | suff_stats[m][0] = np.dot( np.dot(V,np.diag(D)) ,np.linalg.inv(V)) 30 | return suff_stats 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 DPBayes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dataset_tests/src/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private Bayesian learning on distributed data 2 | 3 | Code for running the tests in the paper "Differentially private Bayesian learning on distributed data" (arXiv:1703.01106). 4 | 5 | 6 | ## Requirements 7 | 8 | The code uses Python3 with Numpy (tested with 1.11.1), Scipy (0.17.1), and Matplotlib (1.5.3). 9 | 10 | 11 | ## Running the tests 12 | 13 | To run the tests using UCI data, get the Abalone and Wine Quality datasets (https://archive.ics.uci.edu/ml/datasets.html), 14 | set the data location in UCI_data_getter.py and 15 | use eps_data_test.py. The results can be plotted using combine_prediction_erros.py. 16 | 17 | For the GDSC data, set the options in tensor.py and use clippingomega.py followed by run_tensor_tests.py in the drugsens_code-folder. To plot the results, run tensorresults.py followed by plot_tensor_results.py. 18 | 19 | See the paper "Efficient differentially private learning improves drug sensitivity prediction" (arXiv:1606.02109) for more information on the GDSC data pre-processing. 20 | 21 | For running the Spark tests, see the separate [readme at the `probic-decrypt-server` folder](../../probic-decrypt-server). 
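As a rough sketch, the two workflows above boil down to the following command
sequences (run from `dataset_tests/src`; set the paths and options in the
scripts first, as described above):

```bash
# UCI data (Abalone / Wine Quality)
python3 eps_data_test.py
python3 combine_pred_errors.py

# GDSC drug sensitivity data (run inside drugsens_code/)
python3 clippingomega.py
python3 run_tensor_tests.py
python3 tensorresults.py
python3 plot_tensor_results.py
```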
22 | -------------------------------------------------------------------------------- /dataset_tests/src/calculate_pred_errors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Function for calculating predictive errors. 8 | ''' 9 | 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | 13 | import data_reader 14 | 15 | def calculate_errors(data, dim, filename_data, model_coeff): 16 | regr_coeff_mu = model_coeff['mean'] 17 | regr_coeff_std = model_coeff['prec'] 18 | if regr_coeff_mu is None: 19 | return None, None, None, None, None 20 | 21 | #read data to numpy array (where target = last column) 22 | if filename_data is not '': 23 | data = np.zeros((data[0], dim+1)) 24 | apu = dataReader.read_data(filename_data) 25 | for i in range(len(apu)): 26 | data[i,:] = apu[i] 27 | #center data 28 | data = data - np.mean(data, axis = 0) 29 | 30 | #calculate predictions (MAP) 31 | preds = np.dot(regr_coeff_mu, np.transpose(data[:,:-1]) ) 32 | 33 | #calculate errors 34 | MAE = np.mean( np.absolute(data[:,-1] - preds) ) 35 | MSE = np.mean( (data[:,-1] - preds)**2 ) 36 | 37 | return MAE, MSE, np.mean(preds), np.std(preds), np.amax(preds)-np.amin(preds) -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg2.txt: -------------------------------------------------------------------------------- 1 | 9 --d 10 --clients 100 2 | Total time: 3275 ms. 3 | 9 --d 10 --clients 100 4 | Total time: 2588 ms. 5 | 9 --d 10 --clients 100 6 | Total time: 3355 ms. 7 | 9 --d 10 --clients 100 8 | Total time: 3572 ms. 9 | 9 --d 10 --clients 100 10 | Total time: 4258 ms. 11 | 9 --d 10 --clients 100 12 | Total time: 4032 ms. 13 | 9 --d 10 --clients 100 14 | Total time: 3889 ms. 15 | 9 --d 10 --clients 100 16 | Total time: 5156 ms. 17 | 9 --d 10 --clients 100 18 | Total time: 3567 ms. 19 | 9 --d 100 --clients 100 20 | Total time: 6668 ms. 21 | 9 --d 100 --clients 100 22 | Total time: 6512 ms. 23 | 9 --d 100 --clients 100 24 | Total time: 6262 ms. 25 | 9 --d 100 --clients 100 26 | Total time: 6397 ms. 27 | 9 --d 100 --clients 100 28 | Total time: 6079 ms. 29 | 9 --d 100 --clients 100 30 | Total time: 6247 ms. 31 | 9 --d 1000 --clients 100 32 | Total time: 39086 ms. 33 | 9 --d 1000 --clients 100 34 | Total time: 37841 ms. 35 | 9 --d 1000 --clients 100 36 | Total time: 39017 ms. 37 | 9 --d 1000 --clients 100 38 | Total time: 38494 ms. 39 | 9 --d 1000 --clients 100 40 | Total time: 38236 ms. 41 | 9 --d 1000 --clients 100 42 | Total time: 38246 ms. 43 | 9 --d 1000 --clients 100 44 | Total time: 37514 ms. 45 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestClient.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.Socket 6 | import java.io.DataInputStream 7 | import java.io.DataOutputStream 8 | import fi.helsinki.cs.probic.crypto.PkCrypto 9 | import scala.collection.Seq 10 | 11 | /** 12 | * Test the server by sending it encrypted messages forever. 
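 *
 * Typical invocation, as in test-scripts/test-client.sh (the --port option is
 * optional and defaults to 8080):
 *
 * {{{
 * java -cp target/scala-2.11/probic-server.jar fi.helsinki.cs.probic.test.TestClient --port 8080
 * }}}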
13 | */ 14 | object TestClient extends OptMain with LazyLogging { 15 | 16 | val DEFAULT_PORT = "8080" 17 | 18 | val longOptions = Seq("port=") 19 | 20 | val shortOptions = "" 21 | 22 | def optMain() { 23 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 24 | 25 | val crypto = new PkCrypto("probic") 26 | val encrypt = crypto.getEncrypter("probic") 27 | for (i <- 0 until 1000) { 28 | val plainText = s"Test Number $i" 29 | logger.info(plainText) 30 | val cryptoText = encrypt(plainText) 31 | val sock = new Socket("localhost", port) 32 | val out = new DataOutputStream(sock.getOutputStream) 33 | out.writeInt(cryptoText.length) 34 | out.write(cryptoText) 35 | val in = new DataInputStream(sock.getInputStream) 36 | val returned = in.readUTF() 37 | sock.close 38 | logger.info(s"Server returned: $returned") 39 | assert(plainText == returned) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /dataset_tests/src/estimate_vars.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for estimating data & target marginal variances for optimal clipping when not assuming an auxiliary open dataset. 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | 13 | import sufficient_stats 14 | 15 | def get_estimates(data, pars, small_pos=.5): 16 | #Note: uses 1 clip for data and 1 for target; both scaled according to individual dim std 17 | 18 | N_train = data.shape[0] 19 | dim = pars['dim'] 20 | 21 | #clip data to the assumed data range 22 | data[:,0:-1] = np.sign(data[:,0:-1]) * np.minimum( np.absolute(data[:,0:-1]), pars['assumed_data_range'][0] ) 23 | data[:,-1] = np.sign(data[:,-1]) * np.minimum( np.absolute(data[:,-1]), pars['assumed_data_range'][1] ) 24 | 25 | 26 | eps=pars['privacy_for_marg_var']*pars['epsilon'] 27 | delta=pars['privacy_for_marg_var']*pars['delta'] 28 | 29 | sigma = np.sqrt( 1/(N_train-1) * 2*np.log(1.25/delta)) * (np.sqrt(dim*(pars['assumed_data_range'][0]**2)+pars['assumed_data_range'][1]**2) / eps) 30 | 31 | #add noise 32 | products = np.add(data**2, np.random.normal(0,sigma,[N_train,dim+1]) ) 33 | 34 | vars = np.nansum(products,0)/N_train 35 | ind = vars <= 0 36 | 37 | #set vars to small positive numbers if negative 38 | if sum(ind) > 0: 39 | vars[ind] = small_pos 40 | return vars 41 | 42 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/src/main/scala/fi/helsinki/cs/nodes/util/Spark2Main.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.nodes.util 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * 7 | * @author Eemil Lagerspetz 8 | */ 9 | trait Spark2Main extends OptMain { 10 | /** 11 | * Whether to compress Spark outputs. Required. 12 | */ 13 | val sparkOutputCompression: Boolean 14 | 15 | /** 16 | * Main entry point. Configures Spark and parses args for options specified in `shortOptSpec` and `longOptSpec` (see getopt-scala docs). 17 | */ 18 | def sparkMain(spark: SparkSession) 19 | 20 | /** 21 | * Main entry point. Configures Spark and parses args, then passes control to [[fi.helsinki.cs.nodes.carat.util.SparkMain#sparkMain]] . 
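 *
 * Minimal sketch of a job built on this trait (illustrative only: the object
 * name and the "input" option are made up for the example):
 *
 * {{{
 * object LineCount extends Spark2Main {
 *   val sparkOutputCompression = false
 *   val longOptions = Seq("input=")
 *   val shortOptions = ""
 *
 *   def sparkMain(spark: SparkSession) {
 *     // Count the lines of the text file given with --input and print the result.
 *     val lines = spark.read.textFile(mandatoryOption("input"))
 *     println(s"Line count: ${lines.count()}")
 *   }
 * }
 * }}}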
22 | */ 23 | def optMain() { 24 | val sb = SparkSession 25 | .builder() 26 | .appName(getClass.getName.replaceAll("$", "")) 27 | 28 | val spark = { 29 | if (sparkOutputCompression) 30 | enableCompression(sb).getOrCreate() 31 | else 32 | sb.getOrCreate() 33 | } 34 | 35 | sparkMain(spark) 36 | } 37 | 38 | private def enableCompression(sb: SparkSession.Builder) = { 39 | sb.config("spark.hadoop.mapred.output.compress", true) 40 | .config("spark.hadoop.mapred.output.compression.codec", true) 41 | .config("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec") 42 | .config("spark.hadoop.mapred.output.compression.type", "BLOCK") 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/clippingomega.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | Choose parameters for clipping using auxiliary data. 
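The resulting thresholds are written to <csvpath>C-WX.csv and <csvpath>C-WY.csv,
with one row per pv_size value and one column per epsilon value.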
12 | 13 | Run: python3 clippingomega.py 14 | ''' 15 | 16 | import sys 17 | import os 18 | 19 | import diffpri as dp 20 | import numpy as np 21 | import csv 22 | 23 | # average number of non-missing data ~ 400 24 | pv_size = [400] # 400 in the paper 25 | 26 | ## NOTE: set these to match the values in tensor.py 27 | #privacy budget: lists of similar length 28 | eps = [1.0,3.0,5.0,7.5,10.0] 29 | delta_list = np.zeros(shape=len(eps))+10e-4 30 | np.random.seed(1) 31 | ny = len(pv_size) 32 | csvpath = '' # path for output csv files 33 | privacy_for_marg_var = .3 # .3 in the paper 34 | 35 | nx = len(eps) 36 | WX = np.zeros((ny,nx),dtype=np.float) 37 | WY = np.zeros((ny,nx),dtype=np.float) 38 | print('Finding optimal projection threshold...') 39 | for i in range(len(pv_size)): 40 | for j in range(len(eps)): 41 | n= pv_size[i] 42 | d = 10 43 | 44 | e = eps[j]*(1-privacy_for_marg_var) 45 | delta = delta_list[j]*(1-privacy_for_marg_var) 46 | 47 | w_x,w_y = dp.omega(n,d,e,delta,method='corr',ln=10) 48 | WX[i,j] = w_x 49 | WY[i,j] = w_y 50 | 51 | print('WX:\n'+str(WX)) 52 | print('WY:\n'+str(WY)) 53 | print('done!') 54 | np.savetxt(csvpath+'C-WX.csv',WX,delimiter=',') 55 | np.savetxt(csvpath+'C-WY.csv',WY,delimiter=',') 56 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg-rerun.txt: -------------------------------------------------------------------------------- 1 | 9 --d 100 --clients 100 2 | Total time: 6275 ms. 3 | 9 --d 100 --clients 100 4 | Total time: 5792 ms. 5 | 9 --d 100 --clients 100 6 | Total time: 5972 ms. 7 | 9 --d 100 --clients 100 8 | Total time: 6125 ms. 9 | 9 --d 100 --clients 100 10 | Total time: 6163 ms. 11 | 9 --d 10 --clients 100 12 | Total time: 3139 ms. 13 | 9 --d 10 --clients 1000 14 | Total time: 7360 ms. 15 | 9 --d 10 --clients 1000 16 | Total time: 3691 ms. 17 | 9 --d 10 --clients 1000 18 | Total time: 3637 ms. 19 | 9 --d 10 --clients 1000 20 | Total time: 3217 ms. 21 | 9 --d 10 --clients 1000 22 | Total time: 3424 ms. 23 | 9 --d 100 --clients 1000 24 | Total time: 9037 ms. 25 | 9 --d 100 --clients 1000 26 | Total time: 9286 ms. 27 | 9 --d 100 --clients 1000 28 | Total time: 9072 ms. 29 | 9 --d 100 --clients 1000 30 | Total time: 8949 ms. 31 | 9 --d 100 --clients 1000 32 | Total time: 8879 ms. 33 | 9 --d 1000 --clients 1000 34 | Total time: 68311 ms. 35 | 9 --d 10 --clients 10000 36 | Total time: 9974 ms. 37 | 9 --d 10 --clients 10000 38 | Total time: 6604 ms. 39 | 9 --d 10 --clients 10000 40 | Total time: 6188 ms. 41 | 9 --d 10 --clients 10000 42 | Total time: 6042 ms. 43 | 9 --d 10 --clients 10000 44 | Total time: 6263 ms. 45 | 9 --d 100 --clients 10000 46 | Total time: 40170 ms. 47 | 9 --d 100 --clients 10000 48 | Total time: 35866 ms. 49 | 9 --d 100 --clients 10000 50 | Total time: 34550 ms. 51 | 9 --d 100 --clients 10000 52 | Total time: 35123 ms. 53 | 9 --d 100 --clients 10000 54 | Total time: 33611 ms. 55 | 9 --d 1000 --clients 10000 56 | Total time: 316652 ms. 57 | 9 --d 1000 --clients 10000 58 | Total time: 315078 ms. 59 | 9 --d 1000 --clients 10000 60 | Total time: 309505 ms. 61 | 9 --d 1000 --clients 10000 62 | Total time: 310065 ms. 63 | 9 --d 1000 --clients 10000 64 | Total time: 306264 ms. 
65 | -------------------------------------------------------------------------------- /dataset_tests/src/UCI_data_getter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for reading UCI datasets. Returns a dataset with target as the last col. 8 | ''' 9 | 10 | import numpy as np 11 | 12 | import data_reader 13 | 14 | data_folder = 'data/' 15 | 16 | #abalone dataset, predict abalone age 17 | def get_abalone(): 18 | filename = data_folder + 'abalone/abalone.data' 19 | apu = data_reader.read_data(filename) 20 | data = np.zeros( (len(apu),len(apu[0]))) 21 | for k_row in range(data.shape[0]): 22 | data[k_row,1:] = apu[k_row][1:] 23 | #code categorical sex as 0=male, 1=female 24 | if apu[k_row][0] == 'M': 25 | data[k_row,0] = 0 26 | else: 27 | data[k_row,0] = 1 28 | return data 29 | 30 | #predict concrete compressive strength 31 | def get_concrete(): 32 | #8 mittausta ja target 33 | filename = data_folder + 'concrete/Concrete_Data.txt' 34 | apu = data_reader.read_data(filename) 35 | data = np.zeros( (len(apu)-1,len(apu[0]))) 36 | #0s rivi=nimet 37 | for k_row in range(1,data.shape[0]): 38 | data[k_row,:] = apu[k_row] 39 | return data 40 | 41 | #predict wine quality, red & white separately 42 | def get_red_wine(): 43 | filename = data_folder + 'wine/winequality-red.csv' 44 | apu = data_reader.read_data(filename) 45 | data = np.zeros((len(apu)-1,len(apu[0][0].split(";") ))) 46 | for k_row in range(1, data.shape[0]): 47 | data[k_row-1,:] = apu[k_row][0].split(";") 48 | #0th row=names 49 | return data 50 | def get_white_wine(): 51 | filename = data_folder + 'wine/winequality-white.csv' 52 | apu = data_reader.read_data(filename) 53 | data = np.zeros((len(apu)-1,len(apu[0][0].split(";") ))) 54 | for k_row in range(1, data.shape[0]): 55 | data[k_row-1,:] = apu[k_row][0].split(";") 56 | #0th row=names 57 | return data 58 | 59 | if __name__=='__main__': 60 | get_white_wine() -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/tensorresults.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Script for combining results produced by tensor.py. 14 | 15 | run_tensor_tests.py / tensor.py should be run before this. 
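Reads the per-drug, per-fold result pickles from `inpath` and writes the
combined results to <outpath>drugsens_test.pickle, which plot_tensor_results.py
then reads.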
16 | 17 | Run: python3 tensorresults 18 | ''' 19 | 20 | import numpy as np 21 | import csv 22 | import os.path 23 | import pickle 24 | import sys 25 | from collections import OrderedDict 26 | 27 | drug_nbo = 264 # 264 in the paper 28 | cv_rounds = 25 # 25 in the paper 29 | 30 | pv_size = [840] # [840] in the paper 31 | 32 | #privacy budget as lists of same length 33 | eps = [1.0,3.0,5.0,7.5,10.0] 34 | delta_list = np.zeros(shape=len(eps))+10e-4 35 | 36 | # Set folders 37 | inpath = 'res/' # set path for individual files from different drugs and folds 38 | outpath = 'resultsdata/' # set path for computed final results 39 | inprefix = 'cliptest-drugsens-' # set input file prefix 40 | outprefix = 'tensor-' # set output file prefix 41 | 42 | indatapath = inpath+inprefix 43 | outdatapath = outpath+outprefix 44 | datapath = indatapath 45 | 46 | all_means = OrderedDict() 47 | methods = ['true', 'clipped','noisy','cl_noisy','noisy_ind','cl_noisy_ind','scaling','cl_scaling','cl_true_TA','cl_true_TA','cl_true_TA_DP'] 48 | 49 | for m in methods: 50 | all_means[m] = OrderedDict() 51 | 52 | means = np.zeros((cv_rounds, drug_nbo, len(eps))) 53 | print('array shape (cv, drugs, eps): '+str(means.shape)) 54 | 55 | for k_cv in range(cv_rounds): 56 | for k_drug in range(drug_nbo): 57 | filename = datapath+str(k_drug)+'-'+str(k_cv)+'.pickle' 58 | with open(filename, 'rb') as f: 59 | apu = pickle.load(f) 60 | 61 | means[k_cv, k_drug, :] = apu[m] 62 | 63 | all_means[m]['mean'] = np.mean(means,1) 64 | all_means[m]['std'] = np.std(means,1) 65 | 66 | # Save data 67 | with open(outpath+'drugsens_test.pickle', 'wb') as f: 68 | pickle.dump(all_means,f,pickle.HIGHEST_PROTOCOL) 69 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestDataWriter.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import org.apache.commons.codec.binary.Base64 6 | import fi.helsinki.cs.probic.crypto.PkCrypto 7 | import fi.helsinki.cs.probic.data.GenerateTestDataMatrix 8 | import scala.collection.Seq 9 | 10 | /** 11 | * Mandatory options: 12 | * --intype hdfs or --intype socket 13 | * --input hdfs://path/to/input/folder 14 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 15 | */ 16 | object TestDataWriter extends OptMain with LazyLogging { 17 | 18 | val longOptions = Seq("port=", "masters=", "certs=", "input=", "clients=", "output=") 19 | 20 | val shortOptions = "" 21 | 22 | def optMain() { 23 | val output = mandatoryOption("output") 24 | val certs = optional("certs").getOrElse("probic").split(",") 25 | val masters = optional("masters").getOrElse("localhost:8080").split(",") 26 | val clients = mandatoryOption("clients").toInt 27 | 28 | val input = mandatoryOption("input") 29 | 30 | val crypto = new PkCrypto("probic") // Private test key, not relevant in this program 31 | 32 | val servers = for (i <- 0 until certs.length) yield { 33 | masters(i) -> crypto.getEncrypter(certs(i)) 34 | } 35 | val inputLines = io.Source.fromFile(input).getLines().toSeq 36 | val outputFile = inputLines.flatMap(line => getOutputLines(servers, line, clients).flatten) 37 | GenerateTestDataMatrix.toFile(output, outputFile) 38 | } 39 | 40 | def getOutputLines(servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 41 | val items = 
line.split(";") 42 | val itemsPerClient = items.length / clients 43 | val clientItems = items.grouped(itemsPerClient).toSeq 44 | for (client <- 0 until clients) yield { 45 | val myItems = clientItems(client) 46 | for (item <- 0 until myItems.length) yield { 47 | val (master, encrypt) = servers(item % servers.length) 48 | val data = myItems(item) 49 | val cryptoText = encrypt(data + "") 50 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}" 51 | msg 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/crypto/PkCrypto.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.crypto 2 | 3 | import java.security.KeyStore 4 | import javax.crypto.Cipher 5 | import org.apache.commons.codec.binary.Base64 6 | import javax.security.cert.X509Certificate 7 | 8 | /** 9 | * Mandatory options: 10 | * --intype hdfs or --intype socket 11 | * --input hdfs://path/to/input/folder 12 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 13 | */ 14 | class PkCrypto(cert: String) { 15 | 16 | val keystorePath = "keystore.jks" 17 | 18 | lazy val password = scala.io.Source.fromFile("secret.txt", "UTF-8").getLines().toSeq.head.toCharArray 19 | 20 | lazy val decrypter = getDecrypt() 21 | 22 | lazy val key = getKey() 23 | 24 | def getKey() = { 25 | val ks = KeyStore.getInstance(KeyStore.getDefaultType()) 26 | 27 | var fis: java.io.FileInputStream = null 28 | try { 29 | fis = new java.io.FileInputStream(keystorePath) 30 | ks.load(fis, password) 31 | } finally { 32 | if (fis != null) { 33 | fis.close(); 34 | } 35 | } 36 | 37 | val k = ks.getKey(cert, password) 38 | k 39 | } 40 | 41 | /** 42 | * Method for testing only 43 | */ 44 | def getCert(id: String) = { 45 | val ks = KeyStore.getInstance(KeyStore.getDefaultType()) 46 | 47 | var fis: java.io.FileInputStream = null 48 | try { 49 | fis = new java.io.FileInputStream(keystorePath) 50 | ks.load(fis, password) 51 | } finally { 52 | if (fis != null) { 53 | fis.close(); 54 | } 55 | } 56 | 57 | val k = ks.getCertificate(id) 58 | k 59 | } 60 | 61 | def getDecrypt() = { 62 | val rsa = Cipher.getInstance("RSA") 63 | rsa.init(Cipher.DECRYPT_MODE, key) 64 | rsa 65 | } 66 | 67 | def decrypt(cryptoText: Array[Byte]) = synchronized { 68 | new String(decrypter.doFinal(cryptoText)) 69 | } 70 | 71 | def getRsa(pk: String) = { 72 | val publicBytes = Base64.decodeBase64(pk) 73 | val keySpec = X509Certificate.getInstance(publicBytes) 74 | val pubKey = keySpec.getPublicKey 75 | val rsa = Cipher.getInstance("RSA") 76 | rsa.init(Cipher.ENCRYPT_MODE, pubKey) 77 | rsa 78 | } 79 | 80 | def pkEncrypt(rsa: Cipher)(clearText: String) = { 81 | rsa.doFinal(clearText.getBytes) 82 | } 83 | 84 | /** 85 | * Prepare public key encryption engine for faster use. Used by test data generation and the clients sending the data. 
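 *
 * Example, mirroring the usage in TestClient and TestDataWriter (the aliases
 * must exist in keystore.jks):
 *
 * {{{
 * val crypto = new PkCrypto("probic")             // alias of our own private key
 * val encrypt = crypto.getEncrypter("probic-1")   // alias of the recipient's certificate
 * val cipherText: Array[Byte] = encrypt("some plain text")
 * }}}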
86 | */ 87 | def getEncrypter(id: String) = { 88 | val k = getCert(id).getEncoded 89 | val b64 = Base64.encodeBase64(k) 90 | val rsa = getRsa(new String(b64)) 91 | pkEncrypt(rsa) _ 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /dataset_tests/src/sufficient_stats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for calculating sufficient statistics with perturbations added by individual clients. 8 | ''' 9 | 10 | import csv 11 | import numpy as np 12 | import sys 13 | 14 | def ss_individually(data, add_noise=False, sigma=None, use_spark = False, filename = None, n_spark_messages=None, spark_noise_range=None, fixed_point_int=None): 15 | 16 | k_clients = data.shape[0] 17 | dim = data.shape[1]-1 #dimensions without target 18 | 19 | added_noise = np.zeros((k_clients, dim*(dim+1)//2+dim)) 20 | 21 | #construct the products in X'X and X'y individually for each client, with or without added noise 22 | products = np.zeros([k_clients, dim*(dim+1)//2 + dim] ) 23 | ind = 0 24 | added_noise = np.zeros((k_clients, dim*(dim+1)//2+dim)) 25 | added_noise = np.random.normal(0,sigma, (k_clients, dim*(dim+1)//2+dim)) 26 | 27 | #suff_stat1 28 | for i in range(dim): 29 | for ii in range(i+1): 30 | products[:,ind] = (data[0:k_clients,i] * data[0:k_clients,ii]) 31 | ind += 1 32 | #suff_stat2 33 | for i in range(dim): 34 | products[:,ind] = (data[0:k_clients,i] * data[0:k_clients,-1]) 35 | ind += 1 36 | 37 | if not add_noise: 38 | added_noise = 0 39 | 40 | products += added_noise 41 | 42 | # save test data for Spark 43 | if use_spark and filename is not None: 44 | # use fixed-point representation in the noisy messages 45 | products = np.floor(products*fixed_point_int).astype('int64') 46 | # save as a matrix with n_clients rows and n_messages*suff_stats-dim columns, s.t. 
first n_messages cols correspond to the first element in the sufficient stats 47 | noisy_matrix = np.zeros((products.shape[0],products.shape[1]*n_spark_messages),dtype='int64') 48 | for i in range(products.shape[1]): 49 | noise = np.random.randint(-spark_noise_range,spark_noise_range, (products.shape[0],n_spark_messages-1) ,dtype='int64') 50 | noisy_matrix[:,n_spark_messages*i] = products[:,i] 51 | noisy_matrix[:,n_spark_messages*i:(n_spark_messages*(i+1)-1)] += noise 52 | noisy_matrix[:,n_spark_messages*(i+1)-1] = -np.sum(noise,1) 53 | np.savetxt(filename, noisy_matrix, delimiter=';') 54 | print('Saved sufficient statistics to file for Spark:\n {}'.format(filename)) 55 | sys.exit() 56 | 57 | noisy_sum = np.sum(products, axis=0) 58 | #suff stats for X'X 59 | suff_stat1 = np.zeros([dim,dim]) 60 | suff_stat1[np.tril_indices(dim,0)] = noisy_sum[0:dim*(dim+1)//2] 61 | suff_stat1 = suff_stat1 + np.triu(np.transpose(suff_stat1),k=1) 62 | #suff stat for X'y 63 | suff_stat2 = np.zeros([dim,1]) 64 | suff_stat2 = noisy_sum[(dim*(dim+1)//2):] 65 | 66 | return suff_stat1, suff_stat2, added_noise 67 | 68 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/plot_tensor_results.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Script for plotting tensor test results from pickle files. 14 | 15 | tensorresults.py should be run before this. 
16 | 17 | Run: python3 plot_tensor_results.py 18 | ''' 19 | 20 | import pickle 21 | import sys 22 | 23 | import numpy as np 24 | from matplotlib import pyplot as plt 25 | 26 | 27 | # PLOTTING CONFIGURATIONS 28 | # set these to match tensorresults.py 29 | 30 | # Result filename 31 | filename = 'resultsdata/drugsens_test.pickle' 32 | #filename = 'resultsdata/NIPS_camera_ready/data_bounds/drugsens_test.pickle' 33 | #filename = 'resultsdata/NIPS_camera_ready/fixed_bounds/drugsens_test.pickle' 34 | 35 | # Save figure to file 36 | save_to_file = False 37 | fig_name = 'NIPS_camera_ready_plots/GDSC_drugsens_NIPS_final_all.pdf' 38 | #fig_name = 'NIPS_camera_ready_plots/GDSC_drugsens_NIPS_final_selected.pdf' 39 | 40 | # Methods to plot 41 | # plot all methods 42 | no_plotting = ['cl_scaling','cl_noisy', 'cl_true_TA'] 43 | # plot selected methods 44 | #no_plotting = ['clipped','cl_scaling','noisy','cl_noisy', 'cl_true_TA','cl_true_TA_DP'] 45 | 46 | ############################################################################### 47 | metodit = ['true', 'clipped', 'noisy', 'cl_noisy', 'noisy_ind', 'cl_noisy_ind', 'scaling','cl_scaling','cl_true_TA','cl_true_TA_DP'] 48 | 49 | nimet_dict = {'true':'NP', 'clipped':'proj NP','noisy':'TA', 'cl_noisy':'proj TA (noise)', 'noisy_ind':'DDP', 'cl_noisy_ind':'proj DDP', 'scaling':'input\nperturbed','cl_scaling':'proj scaling','cl_true_TA':'proj TA (not DP)', 'cl_true_TA_DP':'proj TA' } 50 | 51 | #use same colors for corresponding non-clipped & clipped methods except for DDP 52 | col_dict = {'true':'blue', 'clipped':'gray','noisy':'lightseagreen', 'cl_noisy':'green', 'noisy_ind':'red', 'cl_noisy_ind':'magenta', 'scaling':'orange','cl_scaling':'orange','cl_true_TA':'brown', 'cl_true_TA_DP':'darkgreen'} 53 | 54 | with open(filename, 'rb') as f: 55 | res_all = pickle.load(f) 56 | 57 | # parameters 58 | eps = [1.0,3.0,5.0,7.5,10.0] 59 | n_train = 840 60 | # Note: some drugs might not have the same number of training data 61 | n_test = 100 62 | 63 | x = np.linspace(1,len(eps),num=len(eps)) 64 | y_err_lower = None 65 | y_lower = {} 66 | y_upper = {} 67 | k_col = 0 68 | plt.figure() 69 | ax = plt.gca() 70 | offset = -3 71 | offset_factor = 0.05 72 | for m in metodit: 73 | k_col = k_col + 1 #? 74 | if m not in no_plotting: 75 | y_lower[m] = np.zeros(len(eps)) 76 | y_upper[m] = np.zeros(len(eps)) 77 | 78 | #plot non-private with dashed line 79 | if m in ['true','clipped']: 80 | linetype = '--' 81 | else: 82 | linetype = '-' 83 | 84 | ax.errorbar(x+offset*offset_factor,np.mean(res_all[m]['mean'],0), 85 | yerr=np.std(res_all[m]['mean'],0),ls=linetype,marker='',linewidth=2,label=m ,color=col_dict[m], elinewidth=1.5) 86 | plt.axis([.79,5.21,-.03,.27]) 87 | 88 | offset += 1 89 | 90 | plt.tight_layout(pad=7) 91 | nimet = [] 92 | for m in metodit: 93 | if m not in no_plotting: 94 | nimet.append(nimet_dict[m]) 95 | 96 | plt.xticks(x,eps) 97 | plt.ylabel('Predictive accuracy') 98 | plt.xlabel('epsilon') 99 | plt.suptitle('d=10, sample size=840, CV=25, $\delta$=0.0001', y=.12, fontsize=13) 100 | 101 | #legend on top 102 | plt.legend(nimet,bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.) 
103 | 104 | if save_to_file: 105 | plt.savefig(fig_name, bbox_inches='tight') 106 | else: 107 | plt.show() 108 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg4-serverupdated.txt: -------------------------------------------------------------------------------- 1 | 9 --d 10 --clients 100 2 | Total time: 1788 ms. 3 | 9 --d 10 --clients 100 4 | Total time: 1705 ms. 5 | 9 --d 10 --clients 100 6 | Total time: 1726 ms. 7 | 9 --d 10 --clients 100 8 | Total time: 1660 ms. 9 | 9 --d 10 --clients 100 10 | Total time: 1728 ms. 11 | 9 --d 100 --clients 100 12 | Total time: 2018 ms. 13 | 9 --d 100 --clients 100 14 | Total time: 2015 ms. 15 | 9 --d 100 --clients 100 16 | Total time: 2050 ms. 17 | 9 --d 100 --clients 100 18 | Total time: 2096 ms. 19 | 9 --d 100 --clients 100 20 | Total time: 1977 ms. 21 | 9 --d 1000 --clients 100 22 | Total time: 3435 ms. 23 | 9 --d 1000 --clients 100 24 | Total time: 3456 ms. 25 | 9 --d 1000 --clients 100 26 | Total time: 3441 ms. 27 | 9 --d 1000 --clients 100 28 | Total time: 3549 ms. 29 | 9 --d 1000 --clients 100 30 | Total time: 3277 ms. 31 | 9 --d 10000 --clients 100 32 | Total time: 15203 ms. 33 | 9 --d 10000 --clients 100 34 | Total time: 15828 ms. 35 | 9 --d 10000 --clients 100 36 | Total time: 15766 ms. 37 | 9 --d 10000 --clients 100 38 | Total time: 15040 ms. 39 | 9 --d 10000 --clients 100 40 | Total time: 14686 ms. 41 | 9 --d 10 --clients 1000 42 | Total time: 1892 ms. 43 | 9 --d 10 --clients 1000 44 | Total time: 1843 ms. 45 | 9 --d 10 --clients 1000 46 | Total time: 1934 ms. 47 | 9 --d 10 --clients 1000 48 | Total time: 1885 ms. 49 | 9 --d 10 --clients 1000 50 | Total time: 1872 ms. 51 | 9 --d 100 --clients 1000 52 | Total time: 2862 ms. 53 | 9 --d 100 --clients 1000 54 | Total time: 2798 ms. 55 | 9 --d 100 --clients 1000 56 | Total time: 2817 ms. 57 | 9 --d 100 --clients 1000 58 | Total time: 2828 ms. 59 | 9 --d 100 --clients 1000 60 | Total time: 2978 ms. 61 | 9 --d 1000 --clients 1000 62 | Total time: 10173 ms. 63 | 9 --d 1000 --clients 1000 64 | Total time: 10678 ms. 65 | 9 --d 1000 --clients 1000 66 | Total time: 11008 ms. 67 | 9 --d 1000 --clients 1000 68 | Total time: 10178 ms. 69 | 9 --d 1000 --clients 1000 70 | Total time: 10786 ms. 71 | 9 --d 10000 --clients 1000 72 | Total time: 85990 ms. 73 | 9 --d 10000 --clients 1000 74 | Total time: 83952 ms. 75 | 9 --d 10000 --clients 1000 76 | Total time: 86265 ms. 77 | 9 --d 10000 --clients 1000 78 | Total time: 84180 ms. 79 | 9 --d 10000 --clients 1000 80 | Total time: 84358 ms. 81 | 9 --d 10 --clients 10000 82 | Total time: 2913 ms. 83 | 9 --d 10 --clients 10000 84 | Total time: 2917 ms. 85 | 9 --d 10 --clients 10000 86 | Total time: 2973 ms. 87 | 9 --d 10 --clients 10000 88 | Total time: 3081 ms. 89 | 9 --d 10 --clients 10000 90 | Total time: 3065 ms. 91 | 9 --d 100 --clients 10000 92 | Total time: 12495 ms. 93 | 9 --d 100 --clients 10000 94 | Total time: 12247 ms. 95 | 9 --d 100 --clients 10000 96 | Total time: 12843 ms. 97 | 9 --d 100 --clients 10000 98 | Total time: 11950 ms. 99 | 9 --d 100 --clients 10000 100 | Total time: 12268 ms. 101 | 9 --d 1000 --clients 10000 102 | Total time: 103702 ms. 103 | 9 --d 1000 --clients 10000 104 | Total time: 101318 ms. 105 | 9 --d 1000 --clients 10000 106 | Total time: 100846 ms. 107 | 9 --d 1000 --clients 10000 108 | Total time: 99636 ms. 109 | 9 --d 1000 --clients 10000 110 | Total time: 100498 ms. 111 | 9 --d 10000 --clients 10000 112 | Total time: 1003056 ms. 
113 | 9 --d 10000 --clients 10000 114 | Total time: 979955 ms. 115 | 9 --d 10000 --clients 10000 116 | Total time: 1006304 ms. 117 | 9 --d 10000 --clients 10000 118 | Total time: 986876 ms. 119 | 9 --d 10000 --clients 10000 120 | Total time: 998624 ms. 121 | 9 --d 10 --clients 100000 122 | Total time: 8420 ms. 123 | 9 --d 10 --clients 100000 124 | Total time: 8568 ms. 125 | 9 --d 10 --clients 100000 126 | Total time: 8723 ms. 127 | 9 --d 10 --clients 100000 128 | Total time: 8678 ms. 129 | 9 --d 10 --clients 100000 130 | Total time: 8502 ms. 131 | 9 --d 100 --clients 100000 132 | Total time: 67553 ms. 133 | 9 --d 100 --clients 100000 134 | Total time: 66513 ms. 135 | 9 --d 100 --clients 100000 136 | Total time: 64927 ms. 137 | 9 --d 100 --clients 100000 138 | Total time: 64764 ms. 139 | 9 --d 100 --clients 100000 140 | Total time: 64463 ms. 141 | 9 --d 1000 --clients 100000 142 | Total time: 636229 ms. 143 | 9 --d 1000 --clients 100000 144 | Total time: 601287 ms. 145 | 9 --d 1000 --clients 100000 146 | Total time: 600590 ms. 147 | 9 --d 1000 --clients 100000 148 | Total time: 614189 ms. 149 | 9 --d 1000 --clients 100000 150 | Total time: 600433 ms. 151 | 9 --d 1000 --clients 100000 --repeats 10 152 | Total time: 1634766 ms. 153 | 9 --d 1000 --clients 100000 --repeats 10 154 | Total time: 1581326 ms. 155 | 9 --d 1000 --clients 100000 --repeats 10 156 | Total time: 1561852 ms. 157 | 9 --d 1000 --clients 100000 --repeats 10 158 | Total time: 1614572 ms. 159 | 9 --d 1000 --clients 100000 --repeats 10 160 | Total time: 1568930 ms. 161 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/src/main/scala/fi/helsinki/cs/probic/streaming/SparkDataAggregator.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.streaming 2 | 3 | import fi.helsinki.cs.nodes.util.Spark2Main 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.streaming.dstream.DStream 9 | import com.typesafe.scalalogging.LazyLogging 10 | import java.net.URL 11 | import java.net.Socket 12 | import java.io.DataOutputStream 13 | import org.apache.commons.codec.binary.Base64 14 | import java.io.DataInputStream 15 | import scala.io.Codec 16 | import org.apache.commons.io.FileUtils 17 | import java.io.File 18 | 19 | /** 20 | * Assumes data is already in correct order, e.g. data for first server first, then the data for 2nd server, etc. 
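 * (Layout illustration, inferred from the code below and not part of the original comment: without
 * --useDouble the input has d lines, each holding clients*(noise+1) share values grouped server-first,
 * e.g. for 2 servers and 3 clients: s0c0;s0c1;s0c2;s1c0;s1c1;s1c2. With --useDouble it has one line per
 * client, with the (noise+1) shares of each of the d statistics stored consecutively, the j-th share of
 * a group going to server j.)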
21 | * 22 | * Mandatory options: 23 | * --intype hdfs or --intype socket 24 | * --input hdfs://path/to/input/folder 25 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 26 | */ 27 | object SparkDataAggregator extends Spark2Main with LazyLogging { 28 | val longOptions = Seq("clients=", "input=", "output=", "noise=", "d=", "repeats=", "useDouble") 29 | 30 | val shortOptions = "" 31 | 32 | val sparkOutputCompression = true 33 | 34 | def sparkMain(spark: SparkSession) { 35 | // test-data-matrix 36 | val input = mandatoryOption("input") 37 | val output = mandatoryOption("output") 38 | val d = mandatoryOption("d").toInt 39 | val clients = mandatoryOption("clients").toInt 40 | val noise = mandatoryOption("noise").toInt 41 | val useDouble = optionSet("useDouble") 42 | val k = noise + 1 43 | 44 | var timeAcc = 0L 45 | val repeats = optional("repeats").getOrElse("1").toInt 46 | val out = s"$output-$d-$clients-$noise.csv.gz" 47 | import sys.process._ 48 | val result = "rm -rf temp" ! 49 | val start = System.currentTimeMillis 50 | val in = spark.sparkContext.textFile(s"$input-$d-$clients-$noise.csv.gz", d).repartition(d).zipWithIndex 51 | 52 | val resultStream = { 53 | if (useDouble) { 54 | // Matrix is N lines, each line has (noise+1)*D messages 55 | // So we need to gather each group of (noise+1) items on each line to produce full batches of messages. 56 | val dGroupedLines = in.flatMap { 57 | case (line, clientId) => 58 | val dGroups = line.split(";").map(_.toDouble).grouped(k).toSeq.zipWithIndex 59 | dGroups.map{case (kItems, dValue) => 60 | dValue -> kItems.zipWithIndex.map{case (item, serverId) => (serverId, clientId, item)} 61 | } 62 | } 63 | dGroupedLines.reduceByKey(_++_).flatMap{case (dValue, batch) => 64 | val byServer = batch.toSeq.groupBy(_._1).map{x => x._2.map(_._3).toArray -> x._1} 65 | val outputs = (0 until repeats).toSeq.par.map{ repeatId => 66 | val output = byServer.toSeq.par.map(sendReceive(useDouble)).reduce(_+_) 67 | val adjustedLineId = (dValue * 10 + repeatId) 68 | s"$adjustedLineId;$output" 69 | } 70 | outputs.seq 71 | } 72 | } else { 73 | // Matrix is D lines, each line has (noise+1)*N messages 74 | in.flatMap { 75 | case (line, lineNum) => 76 | // One line is a whole batch of messages, send to servers for sum. 
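            // Each line holds clients*(noise+1) share values in server order, so grouped(clients) below
            // yields one group of `clients` shares per decryption server; zipWithIndex attaches the
            // server index, sendReceive ships each group to port 8080 + serverId and returns that
            // server's partial sum, and reduce(_ + _) combines the partial sums for this dimension.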
77 | val outputs = (0 until repeats).toSeq.par.map { repeatId => 78 | val output = line.split(";").map { x => 79 | if (useDouble) 80 | x.toDouble 81 | else 82 | x.toLong 83 | }.grouped(clients).toSeq.zipWithIndex.par.map(sendReceive(useDouble)) 84 | .reduce(_ + _) 85 | val adjustedLineId = (lineNum * 10 + repeatId) 86 | s"$adjustedLineId;$output" 87 | } 88 | outputs.seq 89 | //lineNum -> output 90 | } 91 | } 92 | } 93 | 94 | //resultStream.map { case (k, value) => k + ";" + value } 95 | resultStream.saveAsTextFile(out) 96 | val end = System.currentTimeMillis() 97 | timeAcc += end - start 98 | 99 | println(s"Total time: $timeAcc ms.") 100 | } 101 | 102 | def sendReceive(useDouble: Boolean)(valuesForServer: (Array[Double], Int)) = { 103 | val (values, srvId) = valuesForServer 104 | val sock = new Socket("127.0.0.1", 8080 + srvId) 105 | val out = new DataOutputStream(sock.getOutputStream) 106 | if (useDouble) 107 | values.foreach(out.writeDouble) 108 | else 109 | values.map(_.toLong).foreach(out.writeLong) 110 | val in = new DataInputStream(sock.getInputStream) 111 | 112 | val returned = { 113 | if (useDouble) 114 | in.readDouble 115 | else 116 | in.readLong 117 | } 118 | sock.close 119 | returned 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg4.txt: -------------------------------------------------------------------------------- 1 | 9 --d 1000 --clients 10000 2 | Total time: 105394 ms. 3 | 9 --d 1000 --clients 10000 4 | Total time: 109267 ms. 5 | 9 --d 1000 --clients 10000 6 | Total time: 110730 ms. 7 | 9 --d 1000 --clients 10000 8 | Total time: 111144 ms. 9 | 9 --d 1000 --clients 10000 10 | Total time: 110378 ms. 11 | 9 --d 100 --clients 10000 12 | Total time: 12343 ms. 13 | 9 --d 100 --clients 10000 14 | Total time: 12948 ms. 15 | 9 --d 100 --clients 10000 16 | Total time: 12439 ms. 17 | 9 --d 100 --clients 10000 18 | Total time: 12913 ms. 19 | 9 --d 100 --clients 10000 20 | Total time: 13148 ms. 21 | 9 --d 10 --clients 10000 22 | Total time: 3063 ms. 23 | 9 --d 10 --clients 10000 24 | Total time: 2818 ms. 25 | 9 --d 10 --clients 10000 26 | Total time: 3067 ms. 27 | 9 --d 10 --clients 10000 28 | Total time: 3065 ms. 29 | 9 --d 10 --clients 10000 30 | Total time: 3096 ms. 31 | 9 --d 10 --clients 1000 32 | Total time: 2483 ms. 33 | 9 --d 10 --clients 1000 34 | Total time: 2046 ms. 35 | 9 --d 10 --clients 1000 36 | Total time: 2082 ms. 37 | 9 --d 10 --clients 1000 38 | Total time: 1978 ms. 39 | 9 --d 10 --clients 1000 40 | Total time: 2056 ms. 41 | 9 --d 100 --clients 1000 42 | Total time: 3046 ms. 43 | 9 --d 100 --clients 1000 44 | Total time: 3193 ms. 45 | 9 --d 100 --clients 1000 46 | Total time: 2860 ms. 47 | 9 --d 100 --clients 1000 48 | Total time: 2890 ms. 49 | 9 --d 100 --clients 1000 50 | Total time: 3116 ms. 51 | 9 --d 1000 --clients 1000 52 | Total time: 12203 ms. 53 | 9 --d 1000 --clients 1000 54 | Total time: 12163 ms. 55 | 9 --d 1000 --clients 1000 56 | Total time: 11263 ms. 57 | 9 --d 1000 --clients 1000 58 | Total time: 11680 ms. 59 | 9 --d 1000 --clients 1000 60 | Total time: 11522 ms. 61 | 9 --d 10000 --clients 1000 62 | Total time: 94400 ms. 63 | 9 --d 10000 --clients 1000 64 | Total time: 92984 ms. 65 | 9 --d 10000 --clients 1000 66 | Total time: 92690 ms. 67 | 9 --d 10000 --clients 1000 68 | Total time: 94449 ms. 69 | 9 --d 10000 --clients 1000 70 | Total time: 93135 ms. 71 | 9 --d 10 --clients 100 72 | Total time: 2363 ms. 73 | 9 --d 10 --clients 100 74 | Total time: 1803 ms. 
75 | 9 --d 10 --clients 100 76 | Total time: 1729 ms. 77 | 9 --d 10 --clients 100 78 | Total time: 1711 ms. 79 | 9 --d 10 --clients 100 80 | Total time: 1732 ms. 81 | 9 --d 100 --clients 100 82 | Total time: 2225 ms. 83 | 9 --d 100 --clients 100 84 | Total time: 2163 ms. 85 | 9 --d 100 --clients 100 86 | Total time: 2057 ms. 87 | 9 --d 100 --clients 100 88 | Total time: 2155 ms. 89 | 9 --d 100 --clients 100 90 | Total time: 2152 ms. 91 | 9 --d 1000 --clients 100 92 | Total time: 3880 ms. 93 | 9 --d 1000 --clients 100 94 | Total time: 3945 ms. 95 | 9 --d 1000 --clients 100 96 | Total time: 3922 ms. 97 | 9 --d 1000 --clients 100 98 | Total time: 3863 ms. 99 | 9 --d 1000 --clients 100 100 | Total time: 3826 ms. 101 | 9 --d 10000 --clients 100 102 | Total time: 19815 ms. 103 | 9 --d 10000 --clients 100 104 | Total time: 19339 ms. 105 | 9 --d 10000 --clients 100 106 | Total time: 19497 ms. 107 | 9 --d 10000 --clients 100 108 | Total time: 19323 ms. 109 | 9 --d 10000 --clients 100 110 | Total time: 21497 ms. 111 | 9 --d 10 --clients 100000 112 | Total time: 9172 ms. 113 | 9 --d 10 --clients 100000 114 | Total time: 8434 ms. 115 | 9 --d 10 --clients 100000 116 | Total time: 8647 ms. 117 | 9 --d 10 --clients 100000 118 | Total time: 8741 ms. 119 | 9 --d 10 --clients 100000 120 | Total time: 8807 ms. 121 | 9 --d 10 --clients 100000 122 | Total time: 8518 ms. 123 | 9 --d 100 --clients 100000 124 | Total time: 70875 ms. 125 | 9 --d 100 --clients 100000 126 | Total time: 70965 ms. 127 | 9 --d 100 --clients 100000 128 | Total time: 69856 ms. 129 | 9 --d 100 --clients 100000 130 | Total time: 71048 ms. 131 | 9 --d 100 --clients 100000 132 | Total time: 69147 ms. 133 | 9 --d 1000 --clients 100000 134 | Total time: 662427 ms. 135 | 9 --d 1000 --clients 100000 136 | Total time: 649283 ms. 137 | 9 --d 1000 --clients 100000 138 | Total time: 670965 ms. 139 | 9 --d 1000 --clients 100000 140 | Total time: 670152 ms. 141 | 9 --d 1000 --clients 100000 142 | Total time: 679691 ms. 143 | 9 --d 10000 --clients 10000 144 | Total time: 1042829 ms. 145 | 9 --d 10000 --clients 10000 146 | Total time: 1132176 ms. 147 | 9 --d 10000 --clients 10000 148 | Total time: 1138083 ms. 149 | 9 --d 10000 --clients 10000 150 | Total time: 1101404 ms. 151 | 9 --d 10000 --clients 10000 152 | Total time: 1102854 ms. 153 | 9 --d 1000 --clients 100000 --repeats 10 154 | 9 --d 1000 --clients 100000 --repeats 10 155 | 9 --d 1000 --clients 100000 --repeats 10 156 | 9 --d 1000 --clients 100000 --repeats 10 157 | 9 --d 1000 --clients 100000 --repeats 10 158 | 9 --d 1000 --clients 100000 --repeats 10 159 | Total time: 6684956 ms. 160 | 9 --d 1000 --clients 100000 --repeats 10 161 | Total time: 6649201 ms. 162 | 9 --d 1000 --clients 100000 --repeats 10 163 | Total time: 6662739 ms. 164 | 9 --d 1000 --clients 100000 --repeats 10 165 | Total time: 6690525 ms. 166 | 9 --d 1000 --clients 100000 --repeats 10 167 | Total time: 6720634 ms. 
168 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/tensor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | clippingomega.py should be run before this code. 14 | 15 | Run: python3 tensor.py drugid seed 16 | where 17 | - drugid is an integer in [0,1,...,264] (specifies drug) 18 | - seed is an integer (specifies cv fold) 19 | This program does 1-fold cv for given drug for one test tensor. 20 | The cv split is defined by the given random seed. 21 | run_tensor_tests.py is a helper script for running several drugs and CVs as in the paper. 22 | ''' 23 | 24 | import sys 25 | import os 26 | 27 | import diffpri as dp 28 | import numpy as np 29 | import pickle 30 | import csv 31 | from collections import OrderedDict 32 | 33 | # Import data 34 | datapath = '' # add path for input and output data files 35 | f = open(datapath+'GeneExpressionReducted.csv','rt') 36 | reader = csv.reader(f,delimiter=',') 37 | x = np.array(list(reader)).astype(float) 38 | f.close() 39 | f = open(datapath+'DrugResponse.csv','rt') 40 | reader = csv.reader(f,delimiter=',') 41 | y = np.array(list(reader)).astype(float) 42 | f.close() 43 | # For more information on the data pre-processing, see the paper "Efficient differentially private learning improves drug sensitivity prediction" (arXiv:1606.02109). 44 | 45 | if len(sys.argv) > 1: 46 | drugid = int(sys.argv[1]) 47 | seed = int(sys.argv[2]) 48 | else: 49 | drugid = 226 50 | seed = 0 51 | 52 | # Number of samples to use 53 | pv_size = [840] # [840] in the paper 54 | pv_max = max(pv_size) 55 | 56 | #privacy budget as lists of same length 57 | eps = [1.0,3.0,5.0,7.5,10.0] 58 | delta_list = np.zeros(shape=len(eps))+10e-4 59 | 60 | #test set size 61 | n_test = 100 # 100 in the paper 62 | 63 | print('Running tensor test: drugid='+str(drugid)+', seed='+str(seed)) 64 | 65 | # Setup some parameters; see eps_data_test.py for more info 66 | pars = {'assumed_data_range' : [1,7.5], #[1,7.5] in the paper 67 | #'feedback' : 0, 68 | 'dim': 10, # 10 in the paper 69 | 'tmp_folder' : 'tmp/', 70 | 'add_noise' : 3, 71 | 'scaling_comparison' : 0, 72 | 'enforce_pos_def' : True, 73 | 'privacy_for_marg_var' : .3, # NOTE: this should match the value in clippingomega.py; .3 in the paper 74 | 'small_const_for_std' : .5, # .5 in the paper 75 | 'drugsens_data' : True, 76 | 'use_spark' : False, 77 | # Note: Spark version not tested with drugsens data 78 | 'spark_filename' : 'tmp/sparktest.csv', 79 | 'n_spark_messages' : 10, 80 | 'spark_noise_range' : 10e13, 81 | 'fixed_point_int' : 10e6 82 | } 83 | 84 | csvpath = '' 85 | # Fetch clipping threshold 86 | f = open(csvpath+'C-WX.csv','rt') 87 | reader = csv.reader(f,delimiter=',') 88 | WX = np.array(list(reader)).astype(float) 89 | f.close() 90 | f = open(csvpath+'C-WY.csv','rt') 91 | reader = csv.reader(f,delimiter=',') 92 | WY = np.array(list(reader)).astype(float) 93 | f.close() 94 | 95 | #check number of missing values 96 | inds = ~np.isnan(y[:,drugid]) 97 | n_data = np.sum(inds) 98 | print('drugid '+str(drugid)+', has ' +str(n_data) +' target values (out of 
'+str(y.shape[0])+')') 99 | y = y[inds,:] 100 | x = x[inds,:] 101 | 102 | res_all = OrderedDict() 103 | models = ['true', 'clipped','noisy','cl_noisy','noisy_ind','cl_noisy_ind','scaling','cl_scaling','cl_true_TA','cl_true_TA_DP'] 104 | for m in models: 105 | res_all[m] = np.zeros((len(pv_size),len(eps)),dtype=np.float64) 106 | 107 | for i in range(len(pv_size)): 108 | 109 | n_pv = pv_size[i] 110 | d = pars['dim'] 111 | for j in range(len(eps)): 112 | pars['epsilon'] = eps[j] 113 | pars['delta'] = delta_list[j] 114 | 115 | w_x = WX[i,j] 116 | w_y = WY[i,j] 117 | 118 | # check amount of data, use maximum amount if too few samples 119 | if n_data < n_pv+n_test: #n_npv+n_test: 120 | print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(n_data-n_test)) 121 | n_pv = n_data-n_test 122 | 123 | # Process data 124 | suff_stats_all,sigma_all,added_noise_dict,x_test,y_test,B_x,B_y,n_train = dp.processData(x,y,d,n_test,n_pv,pv_max,w_x,w_y,drugid,seed, pars) 125 | 126 | # calculate predictions 127 | for m in suff_stats_all: 128 | pred = dp.predictL(suff_stats_all[m][0],suff_stats_all[m][1],x_test) 129 | res_all[m][i,j] = dp.precision(pred,y_test) 130 | 131 | 132 | with open('res/cliptest-drugsens-'+str(drugid)+'-'+str(seed)+'.pickle', 'wb') as f: 133 | pickle.dump(res_all, f, pickle.HIGHEST_PROTOCOL) 134 | 135 | print('Done.') 136 | -------------------------------------------------------------------------------- /probic-decrypt-server/README.md: -------------------------------------------------------------------------------- 1 | # Introduction: Spark Streaming Aggregator Subprojects 2 | 3 | The Probic Spark Streaming Data Aggregator consists of two projects, spark-streaming-aggregator that routes 4 | data to the correct node for decryption, and probic-decrypt-server that represents one such decryption node. 5 | The node then decrypts the data and returns the result to Spark which aggregates results from all such nodes 6 | and produces the sum until input is exhausted (or forever, this can be adjusted). 7 | 8 | ## Requirements 9 | 10 | Both projects use sbt as the build tool. Usage: 11 | 12 | 1. get sbt from http://www.scala-sbt.org/download.html , then extract it, and put its bin folder into your path. 13 | 14 | 2. `cd spark-streaming-aggregator` or `cd probic-decrypt-server` 15 | 16 | 3. `sbt eclipse` creates an Eclpse project file that allows you to import the spark-streaming-aggregator directory as a scala-ide project: http://scala-ide.org/ 17 | 18 | 5. `sbt assembly` creates a so called fat jar that can be copied to any machine with Java and Spark installed and run as a Spark Streaming program. 19 | 20 | 6. Spark 2.1.0 or newer prebuilt for Hadoop 2.7 is needed by the tests. We assume Spark is downloaded from http://spark.apache.org/ , extracted, and placed at `$HOME/work/spark-2.1.0-bin-hadoop2.7` . If Spark is placed elsewhere, please adjust the file `spark-streaming-aggregator/run-spark.sh` accordingly. 21 | 22 | 23 | # Running the experiment in the paper 24 | 25 | After compiling, the sections below explains how to run the DCA experiment used to generate the results in Table 1 in the NIPS 2017 paper Differentially private Bayesian learning on distributed data. 26 | 27 | The process includes some preparation and data generation steps followed by starting the decryption servers, and finally the top-level Spark-based data aggregator. 28 | 29 | ## Create keys if not yet done 30 | 31 | This requires java's keytool to be installed. 32 | 33 | 1. 
Make a file called `secret.txt` containing a one-line password that will be used for the server keys. Currently the same password is used for all of them. 34 | 2. Run `./gen10.sh` to generate 10 public/private key pairs. 35 | 36 | ## Generate testing data file 37 | Run `run-scripts/gen-testdata-10.sh` to generate a test data file for 10 decryption servers, a 10:1 noise-to-real-data message ratio, N=100 to 100,000 and d=10 to 10,000. 38 | 39 | ## Start the aggregators 40 | Run `run-scripts/start-servers.sh n C`, where n is the number of servers (equal to the amount of noise; 10 above) and C is the number of clients. This will start the decryption servers. They will wait for the Spark process, which acts as the clients and schedules the data processing for them. 41 | 42 | ## Start the Spark aggregator 43 | `cd` to `../spark-streaming-aggregator` and run: 44 | 45 | ``` 46 | sbt assembly 47 | for d in 10; do for k in $( seq 1 5 ); do ./run-spark-aggregator4.sh results-agg4.txt --d $d --clients 100 --repeats 10; done ; done 48 | ``` 49 | Note: To run the whole experiment, you then need to kill the aggregator server processes and restart with --clients 1000, then 10,000, and so on, until the whole table of results has been generated. 50 | 51 | # Spark Non-Streaming Aggregator Final Results 52 | 53 | You can obtain these 5-run averages using `./getresults.sh results-agg4.txt` in the `spark-streaming-aggregator` folder. 54 | The output should look like this: 55 | ``` 56 | 9 --d 10000 --clients 10000 1103.47 57 | 9 --d 10000 --clients 1000 93.5316 58 | 9 --d 10000 --clients 100 19.8942 59 | 9 --d 1000 --clients 100000 666.504 60 | 9 --d 1000 --clients 10000 109.383 61 | 9 --d 1000 --clients 1000 11.7662 62 | 9 --d 1000 --clients 100 3.8872 63 | 9 --d 100 --clients 100000 70.3782 64 | 9 --d 100 --clients 10000 12.7582 65 | 9 --d 100 --clients 1000 3.021 66 | 9 --d 100 --clients 100 2.1504 67 | 9 --d 10 --clients 100000 8.71983 68 | 9 --d 10 --clients 10000 3.0218 69 | 9 --d 10 --clients 1000 2.129 70 | 9 --d 10 --clients 100 1.8676 71 | ``` 72 | 73 | # Running an experiment with a real-dataset-based model 74 | 75 | ## Generate sufficient statistics + noise 76 | ```sh 77 | cd ../dataset_tests/src 78 | python3 eps_data_test.py -s sparkfile.txt-8-3000-9.csv -c 10 79 | ``` 80 | And compress it for Spark: 81 | ``` 82 | gzip sparkfile.txt-8-3000-9.csv 83 | ``` 84 | 85 | ## Run decryption servers 86 | First, compile the project with `sbt assembly`. Then run: 87 | 88 | ```sh 89 | cd ../../probic-decrypt-server 90 | run-scripts/start-servers-eps.sh 91 | ``` 92 | 93 | ## Run Spark aggregator 94 | First, compile the project with `sbt assembly`. Then run: 95 | 96 | ``` 97 | cd ../spark-streaming-aggregator 98 | ./run-spark-aggregator-eps-data.sh 99 | ``` 100 | 101 | Results will be produced in the `spark-streaming-aggregator` folder in the file `results-eps-abalone-3000.txt`. 102 | Run as many repeats as you wish. 103 | You can get the average runtime by running: 104 | ```sh 105 | ./getresults.sh results-eps-abalone-3000.txt 106 | ``` 107 | The results may look something like: 108 | ```sh 109 | 9 --clients 3000 --d 8 --useDouble 6.3494 110 | ``` 111 | In this case the experiment took 6.35 seconds to complete on average.
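As background for the `-s` option above: the generated `sparkfile...csv` matrix stores every sufficient statistic as additive shares, one share per decryption server (noise shares plus one balancing share), so the shares of each value sum back to the original while no single server sees it. The snippet below is a minimal, standalone sketch of that splitting step in the spirit of `sufficient_stats.py`; the function name `split_value` and the example numbers are illustrative only and not part of the repository code.

```python
import numpy as np

def split_value(value, k, noise_range=10**13, rng=None):
    """Split one integer into k additive shares that sum back to `value`."""
    rng = rng or np.random.default_rng()
    noise = rng.integers(-noise_range, noise_range, size=k - 1, dtype=np.int64)
    shares = np.empty(k, dtype=np.int64)
    shares[:-1] = noise           # k-1 random "noise" shares
    shares[0] += value            # the real value is hidden inside the first share
    shares[-1] = -noise.sum()     # the last share cancels the noise
    return shares

shares = split_value(42, k=10)    # one share per decryption server
assert shares.sum() == 42         # only the sum of all shares is meaningful
```

Each decryption server sums the shares it receives from the clients, and the Spark aggregator adds up the per-server partial sums, which reproduces the exact unsplit total.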
112 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestDataServer.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import org.apache.commons.codec.binary.Base64 8 | import java.io.DataOutputStream 9 | import fi.helsinki.cs.probic.crypto.PkCrypto 10 | import java.util.zip.GZIPInputStream 11 | import java.io.FileInputStream 12 | import java.io.File 13 | import scala.collection.Seq 14 | 15 | /** 16 | * Mandatory options: 17 | * --intype hdfs or --intype socket 18 | * --input hdfs://path/to/input/folder 19 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 20 | */ 21 | object TestDataServer extends OptMain with LazyLogging { 22 | 23 | val DEFAULT_PORT = "8090" 24 | 25 | val longOptions = Seq("port=", "masters=", "certs=", "input=", "clients=", "zip", "batchLength=", "sleep=") 26 | 27 | val shortOptions = "" 28 | 29 | def optMain() { 30 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 31 | val certs = optional("certs").getOrElse("probic").split(",") 32 | val masters = optional("masters").getOrElse("localhost:8080").split(",") 33 | val clients = mandatoryOption("clients").toInt 34 | val batchLength = optional("batchLength").getOrElse("5000").toLong 35 | val sleep = optional("sleep").getOrElse("500").toLong 36 | 37 | val input = mandatoryOption("input") 38 | val zip = optionSet("zip") 39 | 40 | val server = new ServerSocket(port) 41 | val crypto = new PkCrypto("probic") // Private test key, not relevant in this program 42 | 43 | val servers = for (i <- 0 until certs.length) yield { 44 | masters(i) -> crypto.getEncrypter(certs(i)) 45 | } 46 | 47 | val handler = handleRequest(servers) _ 48 | //val handler = handleRequestLine(servers) _ 49 | 50 | val inputLines = { 51 | if (zip) { 52 | io.Source.fromInputStream(new GZIPInputStream(new FileInputStream(new File(input)))).getLines() 53 | } else 54 | io.Source.fromFile(input).getLines() 55 | } 56 | 57 | logger.info(s"Starting ${getClass.getName} at $port") 58 | handler(server.accept, inputLines, clients, batchLength, sleep) 59 | /*for (line <- inputLines) 60 | handler(server.accept, line, clients)*/ 61 | } 62 | 63 | def handleRequest(servers: Seq[(String, String => Array[Byte])])(sock: Socket, inputLines: Iterator[String], clients: Int, batchLength: Long, sleep: Long) { 64 | val t0 = System.currentTimeMillis() 65 | var batchId = 0 66 | val out = new DataOutputStream(sock.getOutputStream) 67 | for (line <- inputLines) { 68 | timedWriteout(out, servers, line, clients) 69 | var diff = System.currentTimeMillis() - t0 70 | logger.info(s"Total elapsed ${diff} ms.") 71 | diff -= batchId * batchLength 72 | val slp = batchLength - diff + sleep // 500 to make sure 73 | logger.info(s"Sleeping $slp ms to compensate") 74 | Thread.sleep(slp) 75 | batchId += 1 76 | } 77 | out.close 78 | } 79 | 80 | def handleRequestLine(servers: Seq[(String, String => Array[Byte])])(sock: Socket, line: String, clients: Int) { 81 | val out = new DataOutputStream(sock.getOutputStream) 82 | parWriteout(out, servers, line, clients) 83 | out.close 84 | } 85 | 86 | def timedWriteout(out: DataOutputStream, servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 87 | val t1 = 
System.currentTimeMillis() 88 | val items = line.split(";") 89 | val itemsPerClient = items.length / clients 90 | val clientItems = items.grouped(itemsPerClient).toSeq 91 | for (client <- 0 until clients) { 92 | val myItems = clientItems(client) 93 | for (item <- 0 until myItems.length) { 94 | val (master, encrypt) = servers(item % servers.length) 95 | val data = myItems(item) 96 | val cryptoText = encrypt(data + "") 97 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}" 98 | out.write((msg + "\n").getBytes) 99 | } 100 | } 101 | logger.info(s"Sent data of $clients clients with $itemsPerClient items per client in ${System.currentTimeMillis() - t1} ms.") 102 | } 103 | 104 | def parWriteout(out: DataOutputStream, servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 105 | val t1 = System.currentTimeMillis() 106 | val items = line.split(";") 107 | val itemsPerClient = items.length / clients 108 | val groupsOfServers = items.grouped(itemsPerClient).zipWithIndex.toSeq 109 | val encrypted = servers.zipWithIndex.par.flatMap { 110 | case ((master, encrypt), sindex) => 111 | groupsOfServers.flatMap { 112 | case (group, client) => 113 | //println(s"sindex $sindex grouplen ${group.length} client $client") 114 | val data = group(sindex) 115 | val cryptoText = encrypt(data + "") 116 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}\n" 117 | msg.getBytes 118 | } 119 | } 120 | encrypted.seq.foreach { msg => 121 | out.write(msg) 122 | } 123 | logger.info(s"Sent data of $clients clients with $itemsPerClient items per client in ${System.currentTimeMillis() - t1} ms.") 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/data/GenerateTestDataMatrix.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.data 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import scala.concurrent.ExecutionContext 8 | import org.apache.commons.codec.binary.Base64 9 | import java.io.DataOutputStream 10 | import fi.helsinki.cs.probic.crypto.PkCrypto 11 | import java.util.zip.GZIPOutputStream 12 | import scala.util.Random 13 | 14 | /** 15 | * Mandatory options: 16 | * --intype hdfs or --intype socket 17 | * --input hdfs://path/to/input/folder 18 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 19 | */ 20 | object GenerateTestDataMatrix extends OptMain with LazyLogging { 21 | 22 | val longOptions = Seq("dimension=", "clients=", "noise=", "output=", "zip") 23 | 24 | val shortOptions = "" 25 | 26 | def optMain() { 27 | val clients = mandatoryOption("clients").toInt 28 | val d = mandatoryOption("dimension").toInt 29 | val noise = mandatoryOption("noise").toInt 30 | val zip = optionSet("zip") 31 | 32 | val output = mandatoryOption("output") 33 | 34 | logger.info(s"Generating test data matrix of size d=$d x N=$clients x k=${noise + 1}") 35 | //generateAllData(output, d, clients, noise) 36 | generateAllDataLive(output, d, clients, noise, zip) 37 | } 38 | 39 | def generateAllData(output: String, d: Int, clients: Int, noise: Int) = { 40 | val (realData, confusedData) = generateData(d, clients, noise) 41 | toFile(output, confusedData.seq.map(_.mkString(";"))) 42 | // Save also real data. 
43 | toFile(s"$output-realdata.csv", realData.seq.map(_.mkString(";"))) 44 | // Save sums for checking. 45 | toFile(s"$output-sums.csv", confusedData.seq.map(_.sum.toString)) 46 | // Save sums for checking. 47 | toFile(s"$output-realdata-sums.csv", realData.seq.map(_.sum.toString)) 48 | } 49 | 50 | def generateAllDataLive(output: String, d: Int, clients: Int, noise: Int, zip: Boolean) { 51 | // test data writer 52 | val writer = fileWriter(output, zip) 53 | // Realdata Sums writer 54 | val realWriter = fileWriter(s"$output-realdata-sums.csv", zip) 55 | val allData = generateDataLive(d, clients, noise) 56 | allData.map { line => 57 | val (realData, confusedData) = line.unzip 58 | writer.write(confusedData.flatten.mkString(";") + "\n") 59 | realWriter.write(realData.sum + "\n") 60 | }.force 61 | writer.close 62 | realWriter.close 63 | } 64 | 65 | def generateData(d: Int, clients: Int, noise: Int) = { 66 | val lines = 0 until d 67 | val outputs = lines.par.map { l => 68 | val allData = (0 until clients).map { client => 69 | clientData(client, noise) 70 | } 71 | val (realData, confusedData) = allData.unzip 72 | val wholeLine = confusedData.flatten 73 | 74 | /*val rs = realData.sum 75 | val ws = wholeLine.sum 76 | assert(doubleEquals(rs, ws), s"$rs did not equal $ws. The sum of real data of the line should equal the sum of the confused data.")*/ 77 | realData -> wholeLine 78 | } 79 | outputs.unzip 80 | } 81 | 82 | def generateDataLive(d: Int, clients: Int, noise: Int) = { 83 | val lines = 0 until d 84 | val outputs = lines.view.map { l => 85 | val allData = (0 until clients).par.map { client => 86 | clientData(client, noise) 87 | } 88 | allData 89 | } 90 | outputs 91 | } 92 | 93 | def clientData(clientId: Int, noise: Int) = { 94 | val rnd = new Random() 95 | def rlong() = { 96 | val lon = rnd.nextInt().toLong << 32 97 | lon + rnd.nextInt() 98 | } 99 | val plainText = rlong 100 | val noises = Seq.fill(noise)(rlong) 101 | 102 | val confusedRealData = plainText + noises.sum 103 | 104 | val clientData = (0 until noise + 1).view.map { j => 105 | if (j == 0) { // "real" data 106 | confusedRealData 107 | } else 108 | noises(j - 1) 109 | } 110 | 111 | //assert(doubleEquals(plainText, clientData.sum), s"$plainText did not equal ${clientData.sum} for client $client. Real data should equal the sum of the confused data for each data item.") 112 | plainText -> clientData 113 | } 114 | 115 | /** 116 | * Store `lines` as a series of lines in a local file called `fileName`. 117 | */ 118 | def toFile(fileName: String, lines: Iterable[String]) { 119 | toFile(fileName, lines, false) 120 | } 121 | 122 | /** 123 | * Store `lines` as a series of lines in a local file called `fileName`. 124 | */ 125 | def toFile(fileNameBase: String, lines: Iterable[String], zip: Boolean = false) { 126 | val pw = fileWriter(fileNameBase, zip) 127 | 128 | lines.foreach(line => { pw.write(line + "\n") }) 129 | pw.close() 130 | } 131 | 132 | /** 133 | * Store `lines` as a series of lines in a local file called `fileName`. 
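 * (More precisely: this method only opens and returns the PrintWriter for `fileNameBase`; when `zip`
 * is set, the file name gets a `.gz` suffix and the writer is wrapped in a GZIPOutputStream.)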
134 | */ 135 | def fileWriter(fileNameBase: String, zip: Boolean) = { 136 | import java.io._ 137 | val fileName = { 138 | if (zip) { 139 | s"${fileNameBase}.gz" 140 | } else 141 | s"${fileNameBase}" 142 | } 143 | 144 | val f = new File(fileName) 145 | val pw = { 146 | if (zip) 147 | new PrintWriter(new GZIPOutputStream(new FileOutputStream(f, false))) 148 | else 149 | new PrintWriter(f) 150 | } 151 | pw 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/server/Server.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.server 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import scala.concurrent.ExecutionContext 8 | import scala.concurrent.Future 9 | import java.io.ByteArrayInputStream 10 | import java.io.InputStreamReader 11 | import java.io.ByteArrayOutputStream 12 | import java.security.KeyStore 13 | import javax.crypto.Cipher 14 | import java.security.spec.X509EncodedKeySpec 15 | import java.security.KeyFactory 16 | import java.security.PublicKey 17 | import org.apache.commons.codec.binary.Base64 18 | import java.io.DataOutputStream 19 | import java.io.DataInputStream 20 | import fi.helsinki.cs.probic.crypto.PkCrypto 21 | import scala.concurrent.forkjoin.ForkJoinPool 22 | import sun.misc.VM 23 | import scala.collection.parallel.ForkJoinTaskSupport 24 | 25 | /** 26 | * Mandatory options: 27 | * --intype hdfs or --intype socket 28 | * --input hdfs://path/to/input/folder 29 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 30 | */ 31 | object Server extends OptMain with LazyLogging { 32 | 33 | val DEFAULT_PORT = "8080" 34 | 35 | val longOptions = Seq("port=", "cert=", "messages=", "useDouble") 36 | 37 | val shortOptions = "" 38 | 39 | def optMain() { 40 | 41 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 42 | val cert = optional("cert").getOrElse("probic") 43 | // How many messages to receive before decrypting and returning a result. 44 | val messages = mandatoryOption("messages").toInt 45 | 46 | val useDouble = optionSet("useDouble") 47 | 48 | val server = new ServerSocket(port) 49 | //val pk = new PkCrypto(cert) 50 | 51 | val handler = handleRequestStreaming(messages, useDouble) _ 52 | logger.info(s"Starting Probic Data Aggregation Server at $port") 53 | //logger.info("Available processors: " + Runtime.getRuntime.availableProcessors() + ", using only 5") 54 | var running = true 55 | while (running) { 56 | handler(server.accept) 57 | } 58 | } 59 | 60 | var decryptedMessages = 0 61 | var decryptedSum = 0.0 62 | 63 | def handleRequest(messages: Int)(sock: Socket) { 64 | implicit val ec = ExecutionContext.global 65 | val answer = Future { 66 | val src = new DataInputStream(sock.getInputStream) 67 | // Read and decrypt a total of `messages` messages. 
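        // Note: the RSA decryption below is commented out in this version, so the handler simply
        // reads `messages` plain long values from the socket, sums them, and writes the sum back.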
68 | 69 | // Sequentially read messages: 70 | val msgSeq = (0 until messages).map { msgId => 71 | /*val len = src.readInt() 72 | val cryptoText = new Array[Byte](len) 73 | src.read(cryptoText) 74 | //logger.info(s"Received msg id $msgId") 75 | cryptoText*/ 76 | src.readLong 77 | }.toSeq 78 | 79 | // Decrypt them in parallel using 5 threads 80 | /*val outValue = msgSeq.grouped(messages / 10).toSeq.par.flatMap { group => 81 | /*val rsa = pk.getDecrypt() 82 | group.map { 83 | cryptoText => 84 | // This is thread safe 85 | val msg = new String(rsa.doFinal(cryptoText)) 86 | msg.toDouble 87 | }*/ 88 | }.reduce(_ + _)*/ 89 | 90 | val outValue = msgSeq.par.reduce(_ + _) 91 | 92 | logger.info(s"Decrypted $messages messages, returning $outValue") 93 | val out = new DataOutputStream(sock.getOutputStream) 94 | out.writeLong(outValue) 95 | sock.close() 96 | outValue 97 | } 98 | } 99 | 100 | def handleRequestStreaming(messages: Int, useDouble: Boolean)(sock: Socket) { 101 | implicit val ec = ExecutionContext.global 102 | val answer = Future { 103 | val src = new DataInputStream(sock.getInputStream) 104 | // Read and decrypt a total of `messages` messages. 105 | val outValue = { 106 | if (useDouble) { 107 | var result = 0.0 108 | // Sequentially read messages: 109 | val msgSeq = (0 until messages).foreach { msgId => 110 | result += src.readDouble 111 | } 112 | result 113 | } else { 114 | var result = 0L 115 | // Sequentially read messages: 116 | val msgSeq = (0 until messages).foreach { msgId => 117 | result += src.readLong 118 | } 119 | result 120 | } 121 | } 122 | 123 | logger.info(s"Decrypted $messages messages, returning $outValue") 124 | val out = new DataOutputStream(sock.getOutputStream) 125 | if (useDouble) 126 | out.writeDouble(outValue) 127 | else // Possible loss of precision. 128 | out.writeLong(outValue.toLong) 129 | sock.close() 130 | outValue 131 | } 132 | } 133 | 134 | def handleRequestStreamingDouble(messages: Int)(sock: Socket) = { 135 | implicit val ec = ExecutionContext.global 136 | Future { 137 | val src = new DataInputStream(sock.getInputStream) 138 | // Read and decrypt a total of `messages` messages. 139 | var result = 0.0 140 | // Sequentially read messages: 141 | val msgSeq = (0 until messages).foreach { msgId => 142 | result += src.readDouble 143 | } 144 | val outValue = result 145 | 146 | logger.info(s"Decrypted $messages messages, returning $outValue") 147 | val out = new DataOutputStream(sock.getOutputStream) 148 | out.writeDouble(outValue) 149 | sock.close() 150 | outValue 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /dataset_tests/src/combine_pred_errors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for combining prediction error results from individual pickled files produced by eps_data_test.py. 8 | 9 | eps_data_test.py should be run before this. 
10 | 11 | Run: python3 combine_pred_errors.py 12 | ''' 13 | 14 | import sys 15 | 16 | import numpy as np 17 | from matplotlib import pyplot as plt 18 | 19 | ################################################################################ 20 | # SETUP 21 | ################################################################################ 22 | 23 | # Plot settings 24 | pars_filename = 'test_results/NIPS_camera_ready/pars_test_red_wine_1.pickle' 25 | #pars_filename = 'test_results/NIPS_camera_ready/pars_test_white_wine_1.pickle' 26 | #pars_filename = 'test_results/NIPS_camera_ready/pars_test_abalone_1.pickle' 27 | # Note: set this to match the settings in eps_data_test.py 28 | 29 | # for reproducing the figures in the paper 30 | figure_bounds = 'red_wine' 31 | #figure_bounds = 'white_wine' 32 | #figure_bounds = 'abalone' 33 | 34 | #save figure 35 | save_to_file = False 36 | fig_name = 'plots/UCI_redwine_NIPS_final.pdf' 37 | #fig_name = 'plots/UCI_whitewine_NIPS_final.pdf' 38 | #fig_name = 'plots/UCI_abalone_NIPS_final.pdf' 39 | 40 | #PLOTTING CONFIGURATIONS & COLORS 41 | no_plotting = ['cl_scaling', 'cl_noisy','cl_true_TA'] 42 | 43 | nimet_dict = {'true':'NP', 'clipped':'proj NP','noisy':'TA', 'cl_noisy':'proj TA', 'noisy_ind':'DDP', 'cl_noisy_ind':'proj DDP', 'scaling':'input\nperturbed','cl_scaling':'proj scaling','cl_true_TA':'proj TA (non DP)' ,'cl_true_TA_DP': 'proj TA'} 44 | 45 | # colors 46 | col_dict = {'true':'blue', 'clipped':'gray','noisy':'lightseagreen', 'cl_noisy':'green', 'noisy_ind':'red', 'cl_noisy_ind':'magenta', 'scaling':'orange','cl_scaling':'orange', 'cl_true_TA': 'black','cl_true_TA_DP':'green'} 47 | 48 | ################################################################################ 49 | # END OF SETUP 50 | ################################################################################ 51 | metodit = ['true', 'clipped', 'noisy', 'cl_noisy', 'noisy_ind', 'cl_noisy_ind', 'scaling','cl_scaling', 'cl_true_TA','cl_true_TA_DP'] 52 | 53 | #load parameters used 54 | pars = np.load(pars_filename) 55 | print('Parameters read from ' + str(pars_filename)) 56 | 57 | #lists with one element for each clipping rate 58 | abs_error_list = list() 59 | sq_error_list = list() 60 | 61 | #create names list 62 | nimet = [] 63 | for m in metodit: 64 | if m not in no_plotting: 65 | nimet.append(nimet_dict[m]) 66 | 67 | for k_test in pars['all_file_ids']: 68 | abs_err = {} 69 | sq_err = {} 70 | for m in metodit: 71 | abs_err[m] = np.zeros((len(pars['n_clients']), pars['n_repeats'])) 72 | sq_err[m] = np.zeros((len(pars['n_clients']), pars['n_repeats'])) 73 | 74 | filename = pars['output_folder'] + 'pred_errors_test' + str(k_test) + '.pickle' 75 | apu = np.load(filename) 76 | i = 0 77 | for k_clients in range(len(pars['n_clients'])): 78 | for k_repeat in range(pars['n_repeats']): 79 | for m in metodit: 80 | #MAE 81 | abs_err[m][k_clients,k_repeat] = apu[i][m][0] 82 | #MSE 83 | sq_err[m][k_clients,k_repeat] = apu[i][m][1] 84 | i = i+1 85 | 86 | abs_error_list.append(abs_err) 87 | sq_error_list.append(sq_err) 88 | 89 | ############################################################################### 90 | #simple plotting function 91 | def plotter(x,y,metodit, bounds, x_label, y_label, subtitle, x_ticks, add_noise_mean, y_err_lower=None, y_err_upper=None, y_all_clip_means=None,y_true_clip_means=None): 92 | round = -3 93 | k_col = 0 94 | for m in metodit: 95 | k_col = k_col + 1 96 | if m not in no_plotting: #skip non-used 97 | #plot non-private with dashed line 98 | if m in ['true','clipped']: 99 | 
linetype = '--' 100 | else: 101 | linetype = '-' 102 | 103 | if y_err_lower == None: 104 | #line, = plt.plot(x,y[m], '*-', linewidth=2,label=m) 105 | plt.plot(x,y[m], '*-', linewidth=2.5,label=m,linestyle=linetype) 106 | else: 107 | #with errorbars 108 | plt.errorbar(x+round*.05, y[m],linewidth=2.2, yerr=[y_err_lower[m],y_err_upper[m] ], linestyle=linetype, color=col_dict[m],label=m ) 109 | round = round + 1 110 | 111 | #add clipping thresholds if applicable 112 | if y_all_clip_means != None: 113 | plt.plot(x,y_all_clip_means,label=m ) 114 | if y_true_clip_means != None: 115 | plt.plot(x,y_true_clip_means,label=m ) 116 | 117 | #add line for unclipped noise mean 118 | if add_noise_mean: 119 | plt.plot(x,np.repeat(np.mean(y['noisy']),len(x)), '--', linewidth=1,label='noise mean' ) 120 | 121 | #define custom bounds for result figures 122 | if figure_bounds == 'abalone': 123 | bounds[2:] = [.55,2.5] 124 | elif figure_bounds == 'red_wine': 125 | bounds[2:] = [.59,4.0] 126 | elif figure_bounds == 'white_wine': 127 | bounds[2:] = [.63,2.5] 128 | 129 | plt.axis(bounds) 130 | plt.tight_layout(pad=7) 131 | plt.legend(nimet,bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.) 132 | plt.xlabel(x_label) 133 | plt.ylabel(y_label) 134 | plt.suptitle(subtitle, y=.12, fontsize=13) 135 | plt.xticks(x_ticks[0],x_ticks[1]) 136 | if save_to_file: 137 | plt.savefig(fig_name, bbox_inches='tight') 138 | else: 139 | plt.show() 140 | 141 | 142 | ############################################################################### 143 | for sample_size in range(len(pars['n_clients'])): 144 | x = np.linspace(1,len(sq_error_list),num=len(sq_error_list)) 145 | y_mae = {} 146 | y_mse = {} 147 | y_mae_lower = {} 148 | y_mae_upper = {} 149 | y_mae_err = {} 150 | for m in metodit: 151 | y_mae[m] = np.zeros(len(sq_error_list)) 152 | y_mse[m] = np.zeros(len(sq_error_list)) 153 | y_mae_err[m] = np.zeros(len(sq_error_list)) 154 | y_mae_lower[m] = np.zeros(len(sq_error_list)) 155 | y_mae_upper[m] = np.zeros(len(sq_error_list)) 156 | for k_priv in range( len(pars['epsilon_tot']) ): 157 | y_mae[m][k_priv] = np.median(abs_error_list[k_priv][m][sample_size, :] ) 158 | #calculate .25 and .75 quantiles for errorbars 159 | apu = np.sort(abs_error_list[k_priv][m][sample_size, :]) 160 | y_mae_lower[m][k_priv] = np.absolute( apu[ int(np.floor(.25*len(apu))) ] - y_mae[m][k_priv] ) 161 | y_mae_upper[m][k_priv] = np.absolute( apu[ int(np.ceil(.75*len(apu))) ] - y_mae[m][k_priv] ) 162 | 163 | y_mse[m][k_priv] = np.mean(sq_error_list[k_priv][m][sample_size, :] ) 164 | y_mae_err = None#obsolete 165 | 166 | for sample_size in range(len(pars['n_clients'])): 167 | if len(x) < 10: 168 | x_ticks = [x, np.round(pars['epsilon_tot'],2)] 169 | else: 170 | x_ticks = [x[0::3], np.round(pars['epsilon_tot'][0::3], 2)] 171 | #mae 172 | if not pars['do_optimal_clip']: 173 | plotter(x, y_mae, metodit, [0,len(x)+1,0,1], 'epsilon', 'MAE', 'clipping: '+str(pars['all_clips']) + ', sample size=' + str(pars['n_clients'][sample_size]) + ', delta=' + str(pars['delta_tot'][0]), x_ticks, False, y_mae_lower,y_mae_upper) 174 | else: 175 | plotter(x, y_mae, metodit, [0,len(x)+1,0,1], 'epsilon', 'MAE', 'd=' + str(pars['dim']) + ', sample size=' + str(pars['n_clients'][sample_size]) + ', repeats=' + str(pars['n_repeats']) + ', $\delta=$' + str(pars['delta_tot'][0]), x_ticks, False, y_mae_lower,y_mae_upper) 176 | 177 | 178 | -------------------------------------------------------------------------------- /dataset_tests/src/suff_stats_master.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Script for calculating std:s for noise for various models. 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | from collections import OrderedDict 13 | 14 | import sufficient_stats 15 | 16 | def get_suff_stats(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true=None, clip_threshold_true=None, data_clipped_true_DP=None, clip_threshold_true_DP=None): 17 | 18 | dim = pars['dim'] 19 | ############################################################ 20 | added_noise_dict = OrderedDict() 21 | 22 | #use fixed sensitivities for UCI data; allow drugsens baseline methods to cheat a bit by using a bound calculated from the data 23 | if not pars['drugsens_data']: 24 | data_sensitivity = np.zeros(data.shape[1]-1) + pars['scale_to_range'] 25 | target_sensitivity = pars['scale_to_range'] 26 | else: 27 | data_sensitivity = np.zeros(dim) + pars['assumed_data_range'][0] 28 | target_sensitivity = np.ceil(np.amax(np.abs(data[:,-1]))) 29 | 30 | #calculate clip products & range products between dimensions, includes factor of 2 for x_i*x_j sensitivities when i != j 31 | clip_prods = np.zeros((dim*(dim+1)//2) + dim) 32 | range_prods = np.zeros(len(clip_prods)) 33 | if data_clipped_true is not None: 34 | clip_prods_true = np.zeros(len(clip_prods)) 35 | if data_clipped_true_DP is not None: 36 | clip_prods_true_DP = np.zeros(len(clip_prods)) 37 | ind = 0 38 | #for suff_stats X'X 39 | for i in range(dim): 40 | for ii in range(i+1): 41 | clip_prods[ind] = clip_threshold[i] * clip_threshold[ii] 42 | range_prods[ind] = data_sensitivity[i] * data_sensitivity[ii] 43 | if data_clipped_true is not None: 44 | clip_prods_true[ind] = clip_threshold_true[i] * clip_threshold_true[ii] 45 | if data_clipped_true_DP is not None: 46 | clip_prods_true_DP[ind] = clip_threshold_true_DP[i] * clip_threshold_true_DP[ii] 47 | #include factor of 2 from sensitivity for non-diagonal terms 48 | if i != ii: 49 | clip_prods[ind] *= 2 50 | range_prods[ind] *= 2 51 | if data_clipped_true is not None: 52 | clip_prods_true[ind] *= 2 53 | if data_clipped_true_DP is not None: 54 | clip_prods_true_DP[ind] *= 2 55 | ind = ind + 1 56 | #for suff stats X'y 57 | for i in range(dim): 58 | clip_prods[ind] = 2*clip_threshold[i] * clip_threshold[-1] 59 | range_prods[ind] = 2*data_sensitivity[i] * target_sensitivity 60 | if data_clipped_true is not None: 61 | clip_prods_true[ind] = 2*clip_threshold_true[i] * clip_threshold_true[-1] 62 | if data_clipped_true_DP is not None: 63 | clip_prods_true_DP[ind] = 2*clip_threshold_true_DP[i] * clip_threshold_true_DP[-1] 64 | ind = ind + 1 65 | 66 | #total l2-sensitivities for noise std calculations 67 | clip_sensitivity = np.sqrt( np.sum(clip_prods[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods[(dim*(dim+1)//2):]**2) ) 68 | 69 | range_sensitivity = np.sqrt( np.sum(range_prods[0:(dim*(dim+1)//2)]**2) + np.sum(range_prods[(dim*(dim+1)//2):]**2) ) 70 | 71 | if data_clipped_true is not None: 72 | clip_sensitivity_true = np.sqrt( np.sum(clip_prods_true[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods_true[(dim*(dim+1)//2):]**2) ) 73 | 74 | if data_clipped_true_DP is not None: 75 | clip_sensitivity_true_DP = np.sqrt( np.sum(clip_prods_true_DP[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods_true_DP[(dim*(dim+1)//2):]**2) ) 76 | 77 | sigma_all = OrderedDict() 78 | suff_stats_all = 
OrderedDict() 79 | 80 | eps=(1-pars['privacy_for_marg_var'])*pars['epsilon'] 81 | delta=(1-pars['privacy_for_marg_var'])*pars['delta'] 82 | eps_no_clip = pars['epsilon'] 83 | delta_no_clip = pars['delta'] 84 | 85 | if pars['add_noise'] in [1,3]: 86 | #trusted aggregator noise 87 | ############################################################ 88 | 89 | #clipped 90 | sigma_all['cl_noisy'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 91 | #clipped true TA (non DP, i.e., doesn't spend privacy on clipping bounds) 92 | if data_clipped_true is not None: 93 | sigma_all['cl_true_TA'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta_no_clip) ) * (clip_sensitivity_true/eps_no_clip) 94 | #clipped true TA (DP) 95 | if data_clipped_true_DP is not None: 96 | sigma_all['cl_true_TA_DP'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta) ) * (clip_sensitivity_true_DP/eps) 97 | #no clipping 98 | sigma_all['noisy'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 99 | 100 | #calculate sufficient stats for clipped & unclipped data 101 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 102 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=True, sigma=sigma_all['noisy'], use_spark=False) 103 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=True, sigma=sigma_all['cl_noisy'], use_spark=False) 104 | 105 | #cl true TA (not DP) 106 | if data_clipped_true is not None: 107 | ss_cl_true1, ss_cl_true2, noise_cl_true = sufficient_stats.ss_individually(data_clipped_true, add_noise=True, sigma=sigma_all['cl_true_TA'], use_spark=False) 108 | #cl true TA (DP) 109 | if data_clipped_true_DP is not None: 110 | ss_cl_true_DP1, ss_cl_true_DP2, noise_cl_true_DP = sufficient_stats.ss_individually(data_clipped_true_DP, add_noise=True, sigma=sigma_all['cl_true_TA_DP'], use_spark=False) 111 | 112 | suff_stats_all['noisy'] = [ss1, ss2] 113 | suff_stats_all['cl_noisy'] = [ss_cl1, ss_cl2] 114 | added_noise_dict['noisy'] = noise 115 | added_noise_dict['cl_noisy'] = noise_cl 116 | #cl true TA (not DP) 117 | if data_clipped_true is not None: 118 | suff_stats_all['cl_true_TA'] = [ss_cl_true1, ss_cl_true2] 119 | added_noise_dict['cl_true_TA'] = noise_cl_true 120 | #cl true TA (DP) 121 | if data_clipped_true_DP is not None: 122 | suff_stats_all['cl_true_TA_DP'] = [ss_cl_true_DP1, ss_cl_true_DP2] 123 | added_noise_dict['cl_true_TA_DP'] = noise_cl_true_DP 124 | 125 | #with extra scaling factor for percentage honest clients 126 | ############################################################ 127 | #calculate scaling factor 128 | if pars['scaling_comparison'] == 0: 129 | scaling = 1 130 | else: 131 | scaling = 1/(np.ceil( pars['scaling_comparison']*n_train) ) 132 | #add noise in pieces separately by each client 133 | #noise std for X'X 134 | sigma_all['cl_scaling'] = np.sqrt( scaling * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 135 | #noise std for X'y 136 | sigma_all['scaling'] = np.sqrt( scaling * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 137 | 138 | #calculate sufficient stats for clipped & unclipped data with extra scaling 139 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 140 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=True, sigma=sigma_all['scaling'], use_spark=False) 141 | 142 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=True, sigma=sigma_all['cl_scaling'], use_spark=False) 143 | 144 
| 145 | suff_stats_all['scaling'] = [ss1, ss2] 146 | suff_stats_all['cl_scaling'] = [ss_cl1, ss_cl2] 147 | added_noise_dict['scaling'] = noise 148 | added_noise_dict['cl_scaling'] = noise_cl 149 | 150 | #individual noise i.e. n/(n-1) factor with clipped and unclipped data 151 | ############################################################ 152 | if pars['add_noise'] in [2,3]: 153 | #clipped data 154 | sigma_all['cl_noisy_ind'] = np.sqrt( 1/(n_train-1) * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 155 | 156 | #unclipped data 157 | sigma_all['noisy_ind'] = np.sqrt( 1/(n_train-1) * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 158 | 159 | #calculate sufficient stats for clipped & unclipped data 160 | # Note: unclipped used for Spark testing 161 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 162 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=pars['add_noise'] > 0, sigma=sigma_all['noisy_ind'], use_spark=pars['use_spark'], filename=pars['spark_filename'], n_spark_messages=pars['n_spark_messages'], spark_noise_range=pars['spark_noise_range'], fixed_point_int=pars['fixed_point_int']) 163 | 164 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=pars['add_noise'] > 0, sigma=sigma_all['cl_noisy_ind'], use_spark=False) 165 | 166 | suff_stats_all['noisy_ind'] = [ss1, ss2] 167 | suff_stats_all['cl_noisy_ind'] = [ss_cl1, ss_cl2] 168 | added_noise_dict['noisy_ind'] = noise 169 | added_noise_dict['cl_noisy_ind'] = noise_cl 170 | 171 | ############################################################ 172 | #calculate noiseless sufficient statistics for comparison 173 | #X'X 174 | suff_stats_all['true'] = list() 175 | suff_stats_all['true'].append(np.dot(np.transpose(data[:,0:-1]), data[:,0:-1])) 176 | #X'y 177 | suff_stats_all['true'].append(np.dot(np.transpose(data[:,0:-1]), data[:,-1])) 178 | 179 | #suff.stats for the noiseless clipped data 180 | suff_stats_all['clipped'] = list() 181 | #X'X 182 | suff_stats_all['clipped'].append(np.dot(np.transpose(data_clipped[:,0:-1]), data_clipped[:,0:-1])) 183 | #X'y 184 | suff_stats_all['clipped'].append(np.dot(np.transpose(data_clipped[:,0:-1]), data_clipped[:,-1])) 185 | 186 | return suff_stats_all, sigma_all, added_noise_dict 187 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/diffpri.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Various functions and data processing steps used in the tests. 
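Main entry points: omega() searches for clipping-threshold multipliers on synthetic data,
processData() builds the train/test split and the sufficient-statistics variants used by tensor.py,
and predictL() computes predictions from the (possibly perturbed) sufficient statistics.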
14 | ''' 15 | 16 | import sys, os, copy 17 | import numpy as np 18 | from scipy.stats import spearmanr 19 | import warnings 20 | 21 | # NOTE on normalisation in distributed setting: 22 | # assume centered data (so remove column means) 23 | # row-wise L2-normalization is ok, since doesn't depend on other rows 24 | 25 | # Centers and L2-normalises x-data (removes columnwise mean, normalises rows to norm 1) 26 | def xnormalise(x): 27 | n = x.shape[0] 28 | d = x.shape[1] 29 | if n == 0: 30 | return x 31 | else: 32 | z = x-np.dot(np.ones((n,1),dtype=np.float),np.nanmean(x,0).reshape(1,d)) 33 | return np.divide(z,np.dot(np.sqrt(np.nansum(np.power(z,2.0),1)).reshape(n,1),np.ones((1,d),dtype=np.float))) 34 | 35 | 36 | # Centers y-data (removes columnwise mean, except for columns where all samples have / all but one sample has missing drug response(s)) 37 | def ynormalise(y): 38 | n = y.shape[0] 39 | d = y.shape[1] 40 | if n == 0: 41 | return y 42 | else: 43 | with warnings.catch_warnings(): 44 | warnings.simplefilter("ignore", category=RuntimeWarning) 45 | m = np.nanmean(y,0) 46 | ind = np.where(np.sum(~np.isnan(y),0)<=1)[0] 47 | m[ind] = 0.0 # don't center samples of size <= 1 48 | return y-np.dot(np.ones((n,1),dtype=np.float),m.reshape(1,d)) 49 | 50 | 51 | # Clip data 52 | def clip(x,y,B_x,B_y): 53 | C = np.multiply(np.sign(x),np.minimum(np.abs(x),B_x)) 54 | with np.errstate(invalid='ignore'): 55 | D = np.multiply(np.sign(y),np.minimum(np.abs(y),B_y)) 56 | return C,D 57 | 58 | 59 | # Selects drug based on drugid, removes cell lines with missing drug response 60 | def ignoreNaN(xx,yy,drugid): 61 | ind = np.where(np.isnan(yy[:,drugid])) 62 | y = np.delete(yy[:,drugid],ind,axis=0) 63 | x = np.delete(xx,ind,axis=0) 64 | return x,y 65 | 66 | 67 | # Non-private sufficient statistics 68 | def nxx(x): 69 | return np.dot(x.T,x) 70 | def nxy(x,y): 71 | return np.dot(x.T,y) 72 | def nyy(y): 73 | return np.dot(y.T,y) 74 | 75 | 76 | # Precision measure: Spearman's rank correlation coefficient 77 | def precision(y_pred,y_real): 78 | r = spearmanr(y_pred,y_real)[0] 79 | if np.isnan(r): 80 | return 0.0 81 | else: 82 | return r 83 | 84 | 85 | # Prediction errors (MAE, MSE) helper script 86 | def pred_errors(pred, y, method): 87 | if method == 'mae': 88 | return np.mean(np.absolute(pred-y)) 89 | elif method =='mse': 90 | return np.mean((pred-y)**2) 91 | 92 | 93 | # Choose optimal w_x,w_y for clipping thresholds 94 | def omega(n,d,eps,delta, method='corr',ln=20): 95 | 96 | # Precision parameters (correspond to the means of the gamma hyperpriors) 97 | l = 1.0 98 | l0 = 1.0 99 | 100 | l1 = ln 101 | l2 = ln 102 | 103 | st = np.arange(0.1,2.1,0.1) 104 | lenC1 = len(st) 105 | lenC2 = lenC1 106 | err = np.zeros((lenC1,lenC2),dtype=np.float64) 107 | 108 | for i in range(l1): 109 | 110 | # Create synthetic data 111 | x = np.random.normal(0.0,1.0,(n,d)) 112 | x = xnormalise(x) 113 | sx = np.std(x,ddof=1) 114 | b = np.random.normal(0.0,1.0/np.sqrt(l0),d) 115 | y = np.random.normal(np.dot(x,b),1.0/np.sqrt(l)).reshape(n,1) 116 | y = ynormalise(y) 117 | sy = np.std(y,ddof=1) 118 | 119 | # Thresholds to be tested 120 | cs1 = st*sx 121 | cs2 = st*sy 122 | 123 | for j in range(l2): 124 | 125 | apu2 = np.random.normal(loc=0, 126 | scale=np.sqrt(n/(n-1)*2*np.log(1.25/delta)) * 1/eps, 127 | size=d*(d+1)//2+d) 128 | 129 | U = np.zeros((d,d)) 130 | U[np.tril_indices(d,0)] = apu2[:d*(d+1)//2] 131 | U = U + np.triu(np.transpose(U),k=1) 132 | V = apu2[d*(d+1)//2:].reshape((d,1)) 133 | 134 | for ci1 in range(lenC1): 135 | c1 = cs1[ci1] 
136 | for ci2 in range(lenC2): 137 | c2 = cs2[ci2] 138 | 139 | # Clip data 140 | xc,yc = clip(x,y,c1,c2) 141 | sensitivity = d*c1**4 + d*(d-1)*2*c1**4 + d*(2*c1*c2)**2 142 | 143 | # Perturbed suff.stats 144 | xx = nxx(xc) + U*(sensitivity**2) 145 | xy = nxy(xc,yc) + V*(sensitivity**2) 146 | 147 | # Prediction 148 | prec = l0*np.identity(d) + l*xx 149 | mean = np.linalg.solve(prec,l*xy) 150 | pred = np.dot(x,mean) 151 | 152 | # Errors 153 | if method == 'corr': 154 | rho = precision(pred,y) 155 | err[ci1,ci2] = err[ci1,ci2] + rho 156 | elif method == 'mae': 157 | MAE = pred_errors(pred,y,'mae') 158 | err[ci1,ci2] = err[ci1,ci2] - MAE 159 | elif method == 'mse': 160 | MSE = pred_errors(pred,y,'mse') 161 | err[ci1,ci2] = err[ci1,ci2] - MSE 162 | else: 163 | print('Unknown method in optimal clip!') 164 | sys.exit() 165 | 166 | # Average 167 | err = err/float(l1*l2) 168 | # Choose best 169 | ind = np.unravel_index(err.argmax(),err.shape) 170 | w_x = st[ind[0]] 171 | w_y = st[ind[1]] 172 | 173 | return w_x,w_y 174 | 175 | 176 | # Prediction on test data 177 | def predictL(nxx_pv,nxy_pv,x_test): 178 | l = 1.0 179 | l0 = 1.0 180 | d = nxx_pv.shape[0] 181 | # Posterior for Gaussian 182 | prec = l*(nxx_pv) + l0*np.identity(d) 183 | mean = np.linalg.solve(prec,l*(nxy_pv)) 184 | # Compute prediction 185 | return np.dot(x_test,mean) 186 | 187 | 188 | def estimate_stds(data,pars): 189 | PACKAGE_PARENT = '..' 190 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 191 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 192 | from estimate_vars import get_estimates 193 | return np.sqrt(get_estimates(data, pars, pars['small_const_for_std'])) 194 | 195 | def suff_stats_crypto(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true=None, clip_threshold_true=None, data_clipped_true_DP=None, clip_threshold_true_DP=None): 196 | from suff_stats_master import get_suff_stats 197 | return get_suff_stats(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true, clip_threshold_true, data_clipped_true_DP, clip_threshold_true_DP) 198 | 199 | def enforce_pos_def(suff_stats, pars): 200 | from pos_def_matrices import check 201 | return check(suff_stats, pars) 202 | 203 | 204 | def get_TA_std_estimates(data, pars): 205 | palautettava = np.var(data, 0) 206 | #use Gaussian mechanism for DP 207 | eps = pars['privacy_for_marg_var']*pars['epsilon'] 208 | delta = pars['privacy_for_marg_var']*pars['delta'] 209 | n = data.shape[0] 210 | dim = data.shape[1] - 1 211 | if pars['drugsens_data']: 212 | data_bound = np.ceil(np.amax(np.abs(data),0)) 213 | sigma = np.sqrt( 2*np.log(1.25/delta)) * 1/n * np.sqrt((dim*(pars['assumed_data_range'][0]**2) + data_bound[-1]**2))/eps 214 | else: 215 | sigma = np.sqrt( 2*np.log(1.25/delta)) * 1/n *np.sqrt( (dim+1)*(pars['scale_to_range']**2))/eps 216 | 217 | #add noise 218 | palautettava = palautettava + np.random.normal(0, sigma, [data.shape[1]]) 219 | #constrain stds to be positive 220 | inds = palautettava <= 0 221 | if len(inds) > 0: 222 | palautettava[inds] = pars['small_const_for_std'] #set non-positive std to small arbitrary constant 223 | return np.sqrt(palautettava) 224 | 225 | 226 | # Process drugsens data 227 | def processData(x,y,d,n_test,n_pv,pv_max,w_x,w_y,drugid,seed, pars): 228 | 229 | n_train = n_pv 230 | 231 | # Set rng seed 232 | np.random.seed(seed) 233 | 234 | # Test/training split + dimensionality reduction 235 | ind = np.random.permutation(x.shape[0]) 
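# The first n_test permuted rows form the test set and the remaining rows the training pool;
# only the first d input dimensions are kept.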
236 | x_test = x[ind[0:n_test],0:d] 237 | y_test = y[ind[0:n_test],:] 238 | x_train = x[ind[n_test:],0:d] 239 | y_train = y[ind[n_test:],:] 240 | 241 | # Training data 242 | x_pv = x_train[0:n_pv,:] 243 | y_pv = y_train[0:n_pv,:] 244 | 245 | # Normalise x-data (remove mean and L2-normalize) 246 | x_test = xnormalise(x_test) 247 | x_pv = xnormalise(x_pv) 248 | 249 | # Normalise y-data (remove mean) 250 | y_test = ynormalise(y_test) 251 | y_pv = ynormalise(y_pv) 252 | 253 | 254 | # get marginal std estimates for clipping 255 | data = np.copy(np.hstack( (x_pv, y_pv[:,drugid].reshape(y_pv.shape[0],1)) )) 256 | 257 | stds = estimate_stds(np.copy(data), pars) 258 | 259 | #true std for comparison 260 | stds_true = np.std(data, 0) 261 | 262 | #DP std estimates for TA 263 | stds_TA = get_TA_std_estimates(np.copy(data), pars) 264 | 265 | # Clip data 266 | n = np.sum(~np.isnan(y_pv[:,drugid])) 267 | 268 | x_pv_orig = np.copy(x_pv) 269 | y_pv_orig = np.copy(y_pv) 270 | 271 | if n == 1: 272 | B_x = np.max(np.abs(x_pv)) 273 | B_y = np.nanmax(np.abs(y_pv)) 274 | x_pv,y_pv = clip(x_pv,y_pv,B_x,B_y) 275 | print('\nn==1!\n') 276 | 277 | elif n > 1: 278 | B_x = w_x * stds[0:-1] 279 | B_y = w_y * stds[-1] 280 | 281 | B_x_true = w_x * stds_true[0:-1] 282 | B_y_true = w_y * stds_true[-1] 283 | 284 | B_x_true_DP = w_x * stds_TA[0:-1] 285 | B_y_true_DP = w_y * stds_TA[-1] 286 | x_pv,y_pv = clip(x_pv,y_pv,B_x,B_y) 287 | 288 | x_pv_true,y_pv_true = clip(np.copy(x_pv_orig),np.copy(y_pv_orig),B_x_true,B_y_true) 289 | 290 | x_pv_true_DP,y_pv_true_DP = clip(np.copy(x_pv_orig),np.copy(y_pv_orig),B_x_true_DP,B_y_true_DP) 291 | 292 | else: 293 | B_x = 0.0 294 | B_y = 0.0 295 | 296 | # Select drug and drop cell lines with missing response 297 | x_pv,y_pv = ignoreNaN(x_pv,y_pv,drugid) 298 | x_test,y_test = ignoreNaN(x_test,y_test,drugid) 299 | n_train = x_pv.shape[0] 300 | x_pv_true,y_pv_true = ignoreNaN(x_pv_true,y_pv_true,drugid) 301 | x_pv_true_DP,y_pv_true_DP = ignoreNaN(x_pv_true_DP,y_pv_true_DP,drugid) 302 | 303 | # Compute suff.stats 304 | data_clipped = np.hstack( (x_pv, y_pv.reshape(y_pv.shape[0],1)) ) 305 | data_clipped_true = np.hstack( (x_pv_true, y_pv_true.reshape(y_pv_true.shape[0],1)) ) 306 | data_clipped_true_DP = np.hstack( (x_pv_true_DP, y_pv_true_DP.reshape(y_pv_true_DP.shape[0],1)) ) 307 | 308 | 309 | suff_stats_all, sigma_all, added_noise_dict = suff_stats_crypto(data, data_clipped, n_train, 0, np.hstack((B_x,B_y)), pars, data_clipped_true, np.hstack((B_x_true,B_y_true)), data_clipped_true_DP, np.hstack((B_x_true_DP,B_y_true_DP)) ) 310 | 311 | #enforce pos.def. Cov matrices 312 | suff_stats_all = enforce_pos_def(suff_stats_all, pars) 313 | 314 | return suff_stats_all, sigma_all, added_noise_dict, x_test, y_test, B_x, B_y, n_train 315 | 316 | 317 | 318 | 319 | 320 | -------------------------------------------------------------------------------- /dataset_tests/src/eps_data_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data (abalone, red wine, white wine) 6 | 7 | Script for testing distributed Bayesian learning on UCI datasets. 
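For each privacy parameter pair, the script estimates clipping thresholds, computes (perturbed) sufficient statistics, fits the Bayesian linear regression and records prediction errors on held-out test data.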
8 | 9 | Run: python3 eps_data_test.py 10 | ''' 11 | 12 | import getopt 13 | import numpy as np 14 | import pickle 15 | import os 16 | import re 17 | import sys 18 | from time import sleep 19 | 20 | import calculate_pred_errors 21 | import estimate_vars 22 | import linear_regression_master 23 | import pos_def_matrices 24 | import setup_handler 25 | import suff_stats_master 26 | import UCI_data_getter 27 | from drugsens_code import diffpri as dp 28 | 29 | pars = {} 30 | ################################################################################ 31 | # SETUP 32 | ################################################################################ 33 | # Use setup-script 34 | # Note: overrides all the options in this script if used! 35 | # 0=False, 1=use given setup-file, 2=write current setup to given file, 3=print given setup file and quit 36 | use_saved_setup = 0 37 | setup_filename = 'test_setups/abalone_setup' 38 | 39 | # Check for positive definite Cov matrices 40 | pars['enforce_pos_def'] = 1 41 | # 0 = only flag non-pos.def matrices 42 | # 1 = ensure pos.def. Cov 43 | # 1 in the paper 44 | 45 | pars['random_seed'] = 1 46 | # 1 in the paper 47 | 48 | # Number of cross-validation runs for each fixed sample size 49 | pars['n_repeats'] = 25 50 | # 25 in the paper 51 | 52 | # Number of repeats for finding optimal clipping threshold 53 | pars['opt_clip_repeats'] = 20 54 | # 20 in the paper 55 | 56 | # Possible datasets: red_wine, white_wine, abalone; uncomment the selected data 57 | pars['dataset_name'] = 'abalone' 58 | #pars['dataset_name'] = 'white_wine' 59 | #pars['dataset_name'] = 'red_wine' 60 | 61 | # Number of observations (= clients) to be used 62 | # Note: each element in the list is looped over n_repeats times (CV); the elements are picked at random 63 | pars['n_clients'] = [3000] 64 | pars['n_test'] = [1000] 65 | # Note: n_clients & n_test need to have the same length; if n_test = 0, uses all the data left after splitting off the training set for testing 66 | # number of clients in datasets: 67 | # red wine 1599 68 | # white wine 4898 69 | # abalone 4177 70 | # In the paper the following sizes are used: 71 | # red wine: n_clients=1000, n_test=500 72 | # white wine: n_clients=3000, n_test=1000 73 | # abalone: n_clients=3000, n_test=1000 74 | 75 | # Use selected data dimensions; uncomment according to the data used 76 | #pars['sel_dims'] = [0,1,2,3,4,5,6,7,8,9,10] #wines 77 | pars['sel_dims'] = [0,1,2,3,4,5,6,7] #abalone 78 | # Note: for UCI red wine max dim = 11 79 | # white wine = 11 80 | # abalone = 8 81 | 82 | # Percentage of privacy used for estimating std.
Used in both distributed and TA settings 83 | pars['privacy_for_marg_var'] = .3 84 | # .3 in the paper 85 | 86 | #use clipping trick 87 | #pars['do_clipping'] = True 88 | #list of clipping thresholds, for each clipping is [-c,c] 89 | #Note: number of file ids need to match the number of clipping thresholds 90 | #empty list = use estimated optimal clipping 91 | pars['all_clips'] = [] 92 | # empty list in the paper 93 | 94 | 95 | # Scale data to specific range 96 | # Note: range is scaled to be of length (2*given value) with mean 0 97 | # the distributions are NOT enforced to be symmetric around the mean though 98 | pars['scale_to_range'] = 5 99 | # 5 in the paper 100 | # Assumed data and target ranges, each is interpreted as [-c,c] 101 | pars['assumed_data_range'] = [7.5,7.5] 102 | # [7.5,7.5] in the paper 103 | 104 | # Folder for tmp files & output 105 | pars['tmp_folder'] = 'tmp/' 106 | pars['output_folder'] = 'test_results/' 107 | 108 | # Add DP noise to suff. stats 109 | # 0=no noise; 1=DP noise to suff stats; 2=noise addition by individuals, 3=both for comparison 110 | pars['add_noise'] = 3 111 | # 3 in the paper 112 | 113 | # Privacy parameters 114 | # Note: need to be equal length lists 115 | pars['epsilon_tot'] = np.power(10,[0,.25,.5,.75,1,1.5]) 116 | pars['delta_tot'] = np.zeros(len(pars['epsilon_tot'])) + 10**(-4) 117 | 118 | # File ids; each privacy par pair generates separate output files 119 | # Note: needs to match the length of privacy par lists. 1.id is also used as a general label (e.g. for saving results & settings used; this needs to match the settings in combine_pred_errors.py for plotting) 120 | pars['all_file_ids'] = ['_abalone_'+str(int(i)) for i in np.linspace(1,len(pars['epsilon_tot']),len(pars['epsilon_tot']))] 121 | 122 | #comparison with T honest clients, T = ceil(scale*clients) 123 | pars['scaling_comparison'] = 0 124 | # Note: set to 0 to get standard input perturbation, 1=trusted aggregator 125 | # 0 in the paper 126 | 127 | # Comparisons to trusted aggregator DP 128 | pars['compare_to_std_DP'] = True 129 | # True=unclipped noise var is calculated as in standard DP, False=use n/(n-1) factor for the noise as in crypto (for checking) 130 | # True in the paper 131 | 132 | # Small constant to use if marg. 
std estimate <= 0 133 | pars['small_const_for_std'] = .5 134 | # .5 in the paper 135 | 136 | # Extra options 137 | pars['drugsens_data'] = False 138 | 139 | pars['spark_filename'] = 'tmp/sparktest.csv' 140 | # Note: this can be overwritten by command line options 141 | 142 | ################################################################################ 143 | # END OF SETUP 144 | ################################################################################ 145 | 146 | #check for needed folders 147 | all_folders = [pars['output_folder'],pars['tmp_folder']] 148 | m = re.split(r'/',setup_filename) 149 | if m is not None and len(m) > 1: 150 | setup_folder = '' 151 | for k in range(len(m)-1): 152 | setup_folder += str(m[k]) + '/' 153 | all_folders.append(setup_folder) 154 | for folder in all_folders: 155 | if not os.path.exists(folder): 156 | print('\nCreating folder ' + str(folder)) 157 | os.makedirs(folder) 158 | 159 | # Spark 160 | pars['use_spark'] = False 161 | pars['n_spark_messages'] = 10 162 | pars['spark_noise_range'] = 10e13 163 | pars['fixed_point_int'] = 10e6 164 | # uses numpy randint [-given val,given val] 165 | # Note: this shouldn't be considered a cryptographically safe implementation 166 | if len(sys.argv) > 1: 167 | try: 168 | opts, args = getopt.getopt(sys.argv[1:], "c:hs:f:n:", ["compute=","help", "spark=","fixed_point=","noise="]) 169 | except getopt.GetoptError as err: 170 | print(str(err) + '. Use -h for help.') 171 | sys.exit(2) 172 | for o, a in opts: 173 | if o in ("-h", "--help"): 174 | print('Options:\n-s or --spark [filename] run a test using Spark. When using Spark, consider also setting the other options.\n-c or --compute [number of messages] sets the total number of messages used for Spark (default=10).\n-f or --fixed_point [fixed-point integer] defines the integer used for fixed-point arithmetic (default=10e6).\n-n or --noise sets the noise range used for Spark messages (default=10e14).') 175 | sys.exit() 176 | elif o in ("-s", "--spark"): 177 | pars['use_spark'] = True 178 | # Note: if use_spark = True, saves the individual contributions to the distributed non-projected model sufficient statistics to file on first round and terminates the run 179 | if a is not '': 180 | pars['spark_filename'] = a 181 | print('Running Spark test, saving to file \'{}\'.'.format(pars['spark_filename'])) 182 | pars['n_repeats'] = 1 183 | pars['n_clients'] = [pars['n_clients'][0]] 184 | pars['epsilon_tot'] = [pars['epsilon_tot'][0]] 185 | pars['delta_tot'] = [pars['delta_tot'][0]] 186 | elif o in ["-c","--compute"]: 187 | if a is not '': 188 | pars['n_spark_messages'] = int(a) 189 | print('Using {} messages for each data point for Spark.'.format(a)) 190 | else: 191 | print('Number of messages for Spark should be an int.') 192 | elif o in ['-f','--fixed_point']: 193 | pars['fixed_point_int'] = int(float(a)) 194 | elif o in ['-n','--noise']: 195 | pars['spark_noise_range'] = int(float(a)) 196 | else: 197 | assert False, "unhandled option" 198 | 199 | 200 | pars['dim'] = len(pars['sel_dims']) 201 | 202 | #check for optimal clipping 203 | if len(pars['all_clips']) == 0: 204 | pars['do_optimal_clip'] = True 205 | else: 206 | pars['do_optimal_clip'] = False 207 | 208 | 209 | #setup-script use 210 | #0=False, 1=use given setup-file, 2=write current setup to given file, 3=print given setup and quit 211 | if use_saved_setup is 1: 212 | print('Reading setup from\n' + setup_filename + ', press y to continue..') 213 | apu = sys.stdin.read(1) 214 | if apu[0] is not 'y': 215 | 
print('Aborted') 216 | sys.exit() 217 | pars = setup_handler.get_setup(setup_filename) 218 | 219 | #write current setup to file 220 | elif use_saved_setup is 2: 221 | print('Saving setup to\n' + setup_filename + ', press y to continue..') 222 | apu = sys.stdin.read(1) 223 | if apu[0] is not 'y': 224 | print('Aborted') 225 | sys.exit() 226 | setup_handler.write_setup(setup_filename, pars) 227 | print('setup written, exiting..') 228 | sys.exit() 229 | #read & print the given pars 230 | elif use_saved_setup is 3: 231 | print('Reading setup from\n' + setup_filename + '\n') 232 | apu = setup_handler.get_setup(setup_filename) 233 | for i in apu.items(): 234 | print(str(i[0]) + ': ' + str(i[1])) 235 | sys.exit() 236 | 237 | if not pars['do_optimal_clip']: 238 | clip_threshold = np.zeros((pars['dim'] + 1)) + pars['all_clips'] 239 | 240 | np.random.seed(pars['random_seed']) 241 | 242 | print('Selected dims: {}'.format(pars['sel_dims'])) 243 | #get data 244 | exec('data_master = UCI_data_getter.get_' + pars['dataset_name'] + '()') 245 | 246 | #check that target is not selected as predictor 247 | if data_master.shape[1]-1 in pars['sel_dims']: 248 | print('Target dim selected as predictor! Aborted.') 249 | sys.exit() 250 | 251 | #drop unused dims 252 | data_master = np.hstack((data_master[:,pars['sel_dims']],np.reshape(data_master[:,-1],(data_master.shape[0],1)) )) 253 | 254 | #center data 255 | data_master = data_master - np.mean(data_master, axis = 0) 256 | 257 | #scale data to assumed range 258 | data_master = np.multiply(data_master, 1/np.ptp(data_master,0)) * 2*pars['scale_to_range'] 259 | print('Data range lengths scaled to ' +str(2*pars['scale_to_range'])) 260 | 261 | #generate fixed train-test splits for each repeat that are used with all privacy pars and sample sizes 262 | filename = pars['tmp_folder'] + 'permu_' 263 | for k_file in range(pars['n_repeats']): 264 | if 0 in pars['n_test']: 265 | all_inds = np.random.permutation(data_master.shape[0]) 266 | else: 267 | all_inds = np.random.choice( np.arange(data_master.shape[0]), np.amax(pars['n_clients'])+np.amax(pars['n_test']),False) 268 | with open(filename+str(k_file)+'.pickle', 'wb') as f: 269 | pickle.dump(all_inds, f, pickle.HIGHEST_PROTOCOL) 270 | 271 | 272 | #loop over privacy pars 273 | for k_privacy_par in range(len(pars['epsilon_tot'])): 274 | 275 | print('\nStarting iteration ' + str(k_privacy_par+1) +'/' + str(len(pars['epsilon_tot'])) + '...\n') 276 | sleep(.5) 277 | 278 | pars['epsilon'] = pars['epsilon_tot'][k_privacy_par] 279 | pars['delta'] = pars['delta_tot'][k_privacy_par] 280 | 281 | file_id = pars['all_file_ids'][k_privacy_par] 282 | pred_errors_filename = pars['output_folder'] + 'pred_errors_test' + file_id + '.pickle' 283 | 284 | pred_errors = list() 285 | 286 | client_round = -1 287 | 288 | for k_client in pars['n_clients']: 289 | print('\nNumber of clients: ' + str(k_client) + ' ('+str(client_round+2) +'/'+str(len(pars['n_clients']))+')') 290 | client_round = client_round + 1 291 | k_test = pars['n_test'][client_round] 292 | 293 | pred_errors_client_loop = list() 294 | 295 | if pars['do_optimal_clip']: 296 | clipping_array = np.zeros((pars['n_repeats'],pars['dim']+1)) 297 | 298 | for k_repeat in range(pars['n_repeats']): 299 | print('\nStarting repeat ' + str(k_repeat + 1) + '/'+str(pars['n_repeats'])+'...\n') 300 | 301 | data = np.copy(data_master) 302 | #load fixed train-test split 303 | filename = pars['tmp_folder'] + 'permu_' 304 | permu = np.load(filename + str(k_repeat) + '.pickle') 305 | train_ind = 
permu[0:k_client] 306 | if k_test == 0: #use all elements not in training set 307 | test_ind = permu[k_client:] 308 | else: 309 | test_ind = permu[-k_test:] 310 | 311 | data_test = data[test_ind,:] 312 | data = data[train_ind,:] 313 | 314 | ################################################ 315 | # FIND OPTIMAL CLIPPING RATE 316 | 317 | if pars['do_optimal_clip']: 318 | 319 | print('Finding optimal clipping thresholds..\n') 320 | optimal_clip_values = np.zeros(2) 321 | 322 | optimal_clip_values[0], optimal_clip_values[1] = dp.omega(k_client, pars['dim'], pars['epsilon'], pars['delta'], 'mae', pars['opt_clip_repeats']) 323 | 324 | clip_threshold = np.zeros((pars['dim']+1)) 325 | #estimate marginal std for each dimension & use for clipping 326 | stds = np.zeros(pars['dim']+1) 327 | pars['marginal_vars'] = estimate_vars.get_estimates(np.copy(data), pars=pars, small_pos = pars['small_const_for_std']) 328 | stds = np.sqrt( pars['marginal_vars']) 329 | 330 | stds_true = np.std(data,0) 331 | stds_TA_DP = dp.get_TA_std_estimates(np.copy(data),pars) 332 | 333 | #optimal clipping 334 | clip_threshold[0:-1] = stds[0:-1] * optimal_clip_values[0] 335 | clip_threshold[-1] = stds[-1] * optimal_clip_values[1] 336 | 337 | clip_threshold_true = np.zeros((pars['dim']+1)) 338 | clip_threshold_TA_DP = np.zeros((pars['dim']+1)) 339 | 340 | clip_threshold_true[0:-1] = stds_true[0:-1] * optimal_clip_values[0] 341 | clip_threshold_true[-1] = stds_true[-1] * optimal_clip_values[1] 342 | 343 | clip_threshold_TA_DP[0:-1] = stds_TA_DP[0:-1] * optimal_clip_values[0] 344 | clip_threshold_TA_DP[-1] = stds_TA_DP[-1] * optimal_clip_values[1] 345 | 346 | #check that clipping threshold is not greater than the assumed data range 347 | for k_dim in range(pars['dim']): 348 | clip_threshold[k_dim] = np.minimum(clip_threshold[k_dim],pars['assumed_data_range'][0]) 349 | 350 | clip_threshold[-1] = np.minimum(clip_threshold[-1],pars['assumed_data_range'][-1]) 351 | 352 | 353 | ################################################ 354 | #CLIPPING 355 | 356 | data_clipped = np.multiply( np.sign(data), np.minimum(clip_threshold,np.absolute(data) ) ) 357 | 358 | data_clipped_true = np.multiply( np.sign(data), np.minimum(clip_threshold_true,np.absolute(data) ) ) 359 | data_clipped_TA_DP = np.multiply( np.sign(data), np.minimum(clip_threshold_TA_DP,np.absolute(data) ) ) 360 | 361 | ################################################ 362 | #CALCULATE (PERTURBED) SUFFICIENT STATS 363 | 364 | suff_stats, sigma_all, added_noise_dict = suff_stats_master.get_suff_stats(np.copy(data), np.copy(data_clipped), k_client, k_repeat, clip_threshold, pars, data_clipped_true, clip_threshold_true, data_clipped_TA_DP, clip_threshold_TA_DP) 365 | 366 | 367 | ################################################ 368 | #CHECK POSITIVE DEFINITENESS 369 | suff_stats = pos_def_matrices.check(suff_stats, pars) 370 | 371 | ################################################ 372 | #LINEAR REGRESSION 373 | model_coeffs = linear_regression_master.get_regression_est(suff_stats, pars) 374 | 375 | ################################################ 376 | #CALCULATE PREDICTION ERRORS 377 | 378 | pred_errors_client_loop = {} 379 | 380 | for k_model in model_coeffs: 381 | MAE, MSE, E_pred, std_pred, range_pred = calculate_pred_errors.calculate_errors(data=np.copy(data_test), dim=pars['dim'], filename_data='', model_coeff = model_coeffs[k_model]) 382 | pred_errors_client_loop[k_model] = [MAE,MSE, E_pred, std_pred, range_pred] 383 | 384 | pred_errors.append(pred_errors_client_loop) 385 | 386 | 
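# Each repeat stores, per model variant (e.g. 'noisy', 'cl_noisy', 'noisy_ind', 'true'),
# the list [MAE, MSE, E_pred, std_pred, range_pred] evaluated on the held-out test data.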
################################################ 387 | #end of loop over n_repeat 388 | 389 | ################################################ 390 | #end of loop over n_clients 391 | 392 | #pickle prediction errors 393 | with open(pred_errors_filename, 'wb') as f: 394 | pickle.dump(pred_errors, f, pickle.HIGHEST_PROTOCOL) 395 | 396 | ################################################ 397 | #end of loop over privacy pars 398 | 399 | with open(pars['output_folder'] + 'pars_test' + pars['all_file_ids'][0] + '.pickle', 'wb') as f: 400 | pickle.dump(pars, f, pickle.HIGHEST_PROTOCOL) 401 | 402 | print('\nAll done.') --------------------------------------------------------------------------------
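A minimal standalone sketch (not part of the repository) of the Gaussian-mechanism calibration used throughout suff_stats_master.py and diffpri.py, i.e. sigma = sqrt(1/n * 2*log(1.25/delta)) * sensitivity/eps, applied to the sufficient statistics X'X and X'y; the helper name perturb_suff_stats is an assumption, and the symmetric noise matrix for X'X mirrors the construction in omega():

import numpy as np

def perturb_suff_stats(data, sensitivity, eps, delta, rng=None):
    # data: n x (d+1) array whose last column is the target y (illustrative helper only)
    # sensitivity: sensitivity constant, e.g. clip_sensitivity / range_sensitivity above
    rng = np.random.default_rng() if rng is None else rng
    x, y = data[:, :-1], data[:, -1]
    n, d = x.shape
    # Same calibration as the trusted-aggregator case in suff_stats_master.py;
    # the per-client variant replaces 1/n by 1/(n-1).
    sigma = np.sqrt(1.0/n * 2.0*np.log(1.25/delta)) * sensitivity / eps
    # Symmetric Gaussian noise for X'X (as in omega()), a Gaussian vector for X'y
    u = rng.normal(0.0, sigma, size=(d, d))
    xx = np.dot(x.T, x) + np.tril(u) + np.tril(u, -1).T
    xy = np.dot(x.T, y) + rng.normal(0.0, sigma, size=d)
    return xx, xy, sigma

# Example use with synthetic data:
# xx, xy, sigma = perturb_suff_stats(np.random.randn(1000, 9), sensitivity=1.0, eps=1.0, delta=1e-4)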