├── dataset_tests └── src │ ├── __init__.py │ ├── drugsens_code │ ├── __init__.py │ ├── .gitignore │ ├── run_tensor_tests.py │ ├── clippingomega.py │ ├── tensorresults.py │ ├── plot_tensor_results.py │ ├── tensor.py │ └── diffpri.py │ ├── .gitignore │ ├── data_reader.py │ ├── setup_handler.py │ ├── linear_regression_master.py │ ├── pos_def_matrices.py │ ├── README.md │ ├── calculate_pred_errors.py │ ├── estimate_vars.py │ ├── UCI_data_getter.py │ ├── sufficient_stats.py │ ├── combine_pred_errors.py │ ├── suff_stats_master.py │ └── eps_data_test.py ├── probic-decrypt-server ├── project │ ├── build.properties │ └── plugins.sbt ├── gen10.sh ├── test-scripts │ ├── test-client.sh │ ├── test-data-writer.sh │ └── test-data-server.sh ├── run-scripts │ ├── gen-testdata-10.sh │ ├── gen-test-data-matrix-given.sh │ ├── start-servers.sh │ └── start-servers-eps.sh ├── gen-keys.sh ├── build.sbt ├── .gitignore ├── src │ └── main │ │ └── scala │ │ └── fi │ │ └── helsinki │ │ └── cs │ │ └── probic │ │ ├── test │ │ ├── TestClient.scala │ │ ├── TestDataWriter.scala │ │ └── TestDataServer.scala │ │ ├── crypto │ │ └── PkCrypto.scala │ │ ├── data │ │ └── GenerateTestDataMatrix.scala │ │ └── server │ │ └── Server.scala └── README.md ├── spark-streaming-aggregator ├── project │ ├── build.properties │ └── plugins.sbt ├── results │ ├── 100-100-5.started.txt │ ├── result-10-100-10.txt │ ├── result-10-100-5.txt │ ├── result-10-1000-10.txt │ ├── result-10-1000-5.txt │ ├── result-100-100-10.txt │ ├── result-100-100-5.txt │ ├── result-100-1000-10.txt │ ├── result-100-1000-5.txt │ ├── result-1000-100-10.txt │ └── restable.sh ├── README.md ├── result-10-100-5.txt ├── result-100-100-5.txt ├── result-100-100-1s-notimeout-newrsa-5.txt ├── getresults.sh ├── run-spark-aggregator.sh ├── run-spark-aggregator-eps-data.sh ├── run-spark.sh ├── .gitignore ├── build.sbt ├── results-agg2.txt ├── src │ └── main │ │ └── scala │ │ └── fi │ │ └── helsinki │ │ └── cs │ │ ├── nodes │ │ └── util │ │ │ └── Spark2Main.scala │ │ └── probic │ │ └── streaming │ │ └── SparkDataAggregator.scala ├── results-agg-rerun.txt ├── results-agg4-serverupdated.txt └── results-agg4.txt ├── README.md ├── LICENSE └── .gitignore /dataset_tests/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /probic-decrypt-server/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.0.3 2 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.0.3 2 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.png 3 | *.sh 4 | tmp/* 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/100-100-5.started.txt: -------------------------------------------------------------------------------- 1 | 2017-05-12 19:33:09.422899064+03:00 2 | 
-------------------------------------------------------------------------------- /spark-streaming-aggregator/README.md: -------------------------------------------------------------------------------- 1 | Please see the readme file in the probic-decrypt-server folder. 2 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-10-100-5.txt: -------------------------------------------------------------------------------- 1 | 1494601455.941897498 2 | 1494601642.811982119 3 | 186.87 s 4 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-100-100-5.txt: -------------------------------------------------------------------------------- 1 | 1494601958.178226584 2 | 1494602312.466400278 3 | 354.288 s 4 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/result-100-100-1s-notimeout-newrsa-5.txt: -------------------------------------------------------------------------------- 1 | 1494604731.460029461 2 | 1494606406.548135978 3 | 1675.09 s 4 | -------------------------------------------------------------------------------- /probic-decrypt-server/gen10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . ./gen-keys.sh 4 | 5 | for i in $( seq 1 10 ) 6 | do 7 | genkey "$i" 8 | done 9 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494658315.655119783 3 | End ms: 1494658422159 4 | 106.504 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-100-5.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494657542.603279687 3 | End ms: 1494657642661 4 | 100.058 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-1000-10.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494636005.148012318 3 | End ms: 1494636528633 4 | 523.485 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-10-1000-5.txt: -------------------------------------------------------------------------------- 1 | output values: 10 2 | Start seconds: 1494637919.631427449 3 | End ms: 1494638421267 4 | 501.636 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494608097.867109462 3 | End ms: 1494609063387 4 | 965.52 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-100-5.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494606789.422899064 3 | End ms: 1494607721492 4 | 932.069 s 5 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-client.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | java -cp target/scala-2.11/probic-server.jar \ 4 | fi.helsinki.cs.probic.test.TestClient 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-1000-10.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494819075.667365251 3 | End ms: 1494820536327 4 | 1460.66 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-100-1000-5.txt: -------------------------------------------------------------------------------- 1 | output values: 100 2 | Start seconds: 1494639257.208596029 3 | End ms: 1494644109327 4 | 4852.12 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/result-1000-100-10.txt: -------------------------------------------------------------------------------- 1 | output values: 1000 2 | Start seconds: 1494609175.899942210 3 | End ms: 1494618765997 4 | 9590.1 s 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/getresults.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | awk 'NR % 2 == 1 {t=$0} NR % 2 == 0 {a[t]+=$(NF-1); c[t]+=1 } END{for (t in a) { print t, a[t]/c[t]/1000}}' $1 | sort -n 3 | -------------------------------------------------------------------------------- /probic-decrypt-server/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.3") 2 | 3 | //addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 5 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("com.typesafe.sbteclipse" % "sbteclipse-plugin" % "5.2.3") 2 | 3 | //addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.4") 4 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") 5 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/gen-testdata-10.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ns="100 1000 10000 100000" 3 | ds="10 100 1000 10000" 4 | for N in $ns 5 | do 6 | for d in $ds 7 | do 8 | run-scripts/gen-test-data-matrix-given.sh $d $N 9 --zip 9 | done 10 | done 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dca-nips2017 2 | Differentially private learning on distributed data (NIPS 2017) 3 | 4 | ## Instructions 5 | 6 | Please see the [readme at the `probic-decrypt-server` folder](probic-decrypt-server) and the [readme at the `dataset_tests/src` folder](dataset_tests/src). 
7 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-data-writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | java -cp target/scala-2.11/probic-server.jar \ 3 | fi.helsinki.cs.probic.data.TestDataWriter \ 4 | --certs probic-1,probic-2,probic-3,probic-4,probic-5 \ 5 | --masters localhost:8080,localhost:8081,localhost:8082,localhost:8083,localhost:8084 \ 6 | --clients 10 \ 7 | --input test-data-matrix.csv \ 8 | --output test-data-matrix-crypt.csv 9 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark-aggregator.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | res=$1 3 | shift 4 | 5 | rm -rf temp 6 | 7 | class=fi.helsinki.cs.probic.streaming.SparkDataAggregator 8 | ./run-spark.sh $class \ 9 | --input file://$PWD/../probic-decrypt-server/test-data-matrix \ 10 | --output file://$PWD/temp/sum-data-matrix \ 11 | --noise 9 $* 12 | echo "9 $*" >> $res 13 | tail -n 1 ${class}.log >> $res 14 | -------------------------------------------------------------------------------- /dataset_tests/src/.gitignore: -------------------------------------------------------------------------------- 1 | onlineldavb* 2 | tmp/* 3 | bump_check_tmp/* 4 | bump_test.py 5 | data/* 6 | plots/* 7 | pert_data/* 8 | .DS_Store 9 | __py_cache__* 10 | user_list* 11 | res/* 12 | *.cpp 13 | *.h 14 | *.o 15 | client 16 | compute 17 | server 18 | Make* 19 | sample_user_list 20 | enc/* 21 | profiler_dump/* 22 | total_*.txt 23 | test_results/* 24 | *.pickle 25 | scratch/* 26 | *.py_bck 27 | 28 | 29 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/gen-test-data-matrix-given.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ -z "$3" ]; then echo "Usage: $0 d N noise"; exit 1; fi 4 | 5 | d=$1 6 | shift 7 | N=$1 8 | shift 9 | noise=$1 10 | shift 11 | 12 | java -Xmx1500g -cp target/scala-2.11/probic-server.jar \ 13 | fi.helsinki.cs.probic.data.GenerateTestDataMatrix \ 14 | --dimension $d \ 15 | --clients $N \ 16 | --noise $noise \ 17 | --output test-data-matrix-$d-$N-$noise.csv \ 18 | $* 19 | 20 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark-aggregator-eps-data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | res=results-eps-abalone-3000.txt 3 | shift 4 | 5 | rm -rf temp 6 | 7 | fun(){ 8 | class=fi.helsinki.cs.probic.streaming.SparkDataAggregator 9 | ./run-spark.sh $class \ 10 | --input file://$PWD/../dataset_tests/src/sparkfile.txt \ 11 | --output file://$PWD/temp/eps-sum-data \ 12 | --noise 9 $* 13 | echo "9 $*" >> $res 14 | tail -n 1 ${class}.log >> $res 15 | } 16 | 17 | fun --clients 3000 --d 8 --useDouble 18 | 19 | -------------------------------------------------------------------------------- /dataset_tests/src/data_reader.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Function for reading data from a given file and returning it as a list. 
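Illustrative usage (the path below is only an example; see UCI_data_getter.py
for the paths actually used in the tests):

    import data_reader
    rows = data_reader.read_data('data/abalone/abalone.data')
    # rows is a list of rows, each row a list of string fields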
8 | ''' 9 | 10 | import numpy as np 11 | import csv 12 | 13 | def read_data(filename): 14 | with open(filename,newline='',encoding='utf-8') as f: 15 | reader = csv.reader(f, delimiter=',') 16 | data = list() 17 | for row in reader: 18 | data.append(row) 19 | return data 20 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/start-servers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | startserver(){ 4 | echo "Starting probic-server: $*" 5 | #xterm -e "java -jar target/scala-2.11/probic-server.jar $*" & 6 | screen -d -m -S probic-server -- nice -n 20 java -Xmx10g -jar target/scala-2.11/probic-server.jar $* & 7 | } 8 | 9 | if [ -z "$1" ]; then c=5; else c=$1; fi 10 | if [ -z "$2" ]; then msg=100; else msg=$2; fi 11 | 12 | for i in $( seq 1 $c ) 13 | do 14 | let p=8080+$i 15 | let p-- 16 | startserver --port $p --cert probic-$i --messages $msg 17 | done 18 | 19 | -------------------------------------------------------------------------------- /dataset_tests/src/setup_handler.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Setup script handler: pickle setup parameters & read them back. 8 | ''' 9 | 10 | import numpy as np 11 | import pickle 12 | 13 | def get_setup(saved_setup): 14 | with open(saved_setup + '.pickle', 'rb') as f: 15 | apu = pickle.load(f) 16 | return apu 17 | 18 | def write_setup(saved_setup, pars): 19 | with open(saved_setup + '.pickle', 'wb') as f: 20 | pickle.dump(pars, f, pickle.HIGHEST_PROTOCOL) 21 | -------------------------------------------------------------------------------- /probic-decrypt-server/run-scripts/start-servers-eps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | startserver(){ 4 | echo "Starting probic-server: $*" 5 | #xterm -e "java -jar target/scala-2.11/probic-server.jar $*" & 6 | screen -d -m -S probic-server -- nice -n 20 java -Xmx10g -jar target/scala-2.11/probic-server.jar $* & 7 | } 8 | 9 | if [ -z "$1" ]; then c=10; else c=$1; fi 10 | if [ -z "$2" ]; then msg=3000; else msg=$2; fi 11 | 12 | for i in $( seq 1 $c ) 13 | do 14 | let p=8080+$i 15 | let p-- 16 | startserver --port $p --cert probic-$i --messages $msg --useDouble 17 | done 18 | 19 | -------------------------------------------------------------------------------- /probic-decrypt-server/gen-keys.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ ! 
-f "secret.txt" ]; then echo "Please create secret.txt with a single line containing the desired private key password."; exit 1; fi 4 | # Store passwords in a separate file 5 | pass=$( cat secret.txt ) 6 | 7 | res=$PWD 8 | ks=$res/keystore.jks 9 | 10 | genkey(){ 11 | str="PROBIC-${1}\nDepartment of Computer Science\nUniversity of Helsinki\nHelsinki\nUusimaa\nFI\nyes" 12 | echo -e $str | keytool -genkey -alias "probic-${1}" -keyalg RSA -keystore $ks -keysize 4096 -storepass $pass -keypass $pass -validity 360 13 | } 14 | 15 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/run-spark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK=$HOME/work/spark-2.1.0-bin-hadoop2.7 4 | 5 | if [ -n "$1" ] 6 | then 7 | class=$1 8 | shift 9 | else 10 | echo "Usage: $0 classname [args...]" 11 | exit 1 12 | fi 13 | 14 | export SPARK_LOCAL_IP="127.0.0.1" 15 | export SPARK_LOCAL_DIRS="/run/user/$( id -u $USER )/spark" 16 | echo SPARK_LOCAL_DIRS=$SPARK_LOCAL_DIRS 17 | echo "args: $class $*" 18 | 19 | $SPARK/bin/spark-submit --driver-memory 1500g --master "local[45]" \ 20 | --class $class $PWD/target/scala-2.11/probic-streaming-aggregator.jar $* 1> "${class}.log" 2> "${class}.err" 21 | 22 | -------------------------------------------------------------------------------- /probic-decrypt-server/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Probic Private Data Aggregation Node" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | libraryDependencies += "fi.helsinki.cs.nodes" % "getopt-scala" % "1.1.0" 8 | 9 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0" 10 | 11 | // https://mvnrepository.com/artifact/commons-codec/commons-codec 12 | libraryDependencies += "commons-codec" % "commons-codec" % "1.10" 13 | 14 | libraryDependencies += "org.slf4j" % "slf4j-simple" % "1.7.25" 15 | 16 | mainClass in assembly := Some("fi.helsinki.cs.probic.server.Server") 17 | 18 | assemblyJarName in assembly := "probic-server.jar" 19 | -------------------------------------------------------------------------------- /probic-decrypt-server/test-scripts/test-data-server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | genargs() { 4 | certs="" 5 | masters="" 6 | for i in $( seq 1 $1 ) 7 | do 8 | let p=8080+$i 9 | let p-=1 10 | if [ -z "$certs" ]; then 11 | certs="probic-$i" 12 | masters="localhost:$p" 13 | else 14 | certs="$certs,probic-$i" 15 | masters="$masters,localhost:$p" 16 | fi 17 | done 18 | } 19 | 20 | if [ -n "$1" ] 21 | then 22 | genargs $1 23 | shift 24 | else 25 | genargs 5 26 | fi 27 | 28 | java -cp target/scala-2.11/probic-server.jar \ 29 | fi.helsinki.cs.probic.test.TestDataServer \ 30 | --certs ${certs} \ 31 | --masters ${masters} \ 32 | $* 33 | 34 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results/restable.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ns="100 1000 10000 100000" 3 | ds="10 100 1000 10000" 4 | 5 | mktable(){ 6 | echo '\begin{table}' 7 | echo '\begin{tabular}[]{c c c c c}' 8 | echo "M=$M & N=100 & N=1000 & N=10000 & N=100000 \\\\" 9 | for d in $ds 10 | do 11 | row="d=$d" 12 | for N in $ns 13 | do 14 | f=result-$d-$N-$M.txt 15 | if [ ! 
-f $f ]; then cell=NA; else 16 | cell=$( tail -n 1 $f | awk '{print $1}' ) 17 | fi 18 | if [ -z "$row" ]; then row=$cell; else row="$row & $cell"; fi 19 | done 20 | echo "$row \\\\" 21 | done 22 | echo '\end{tabular}' 23 | echo '\end{table}' 24 | } 25 | 26 | M=5 27 | mktable 28 | echo "" 29 | 30 | M=10 31 | mktable 32 | 33 | -------------------------------------------------------------------------------- /probic-decrypt-server/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | bin 4 | *.class 5 | db 6 | dist 7 | dist/* 8 | eclipse 9 | *.eml 10 | *.iml 11 | # except for .gitignore 12 | # Extracted from https://github.com/ulrich/macaron-factory/blob/master/.gitignore 13 | # Ignore all dotfiles... 14 | # Ignore Play! working directory # 15 | lib 16 | lib_managed/ 17 | log 18 | *.log 19 | logs 20 | modules 21 | /out 22 | precompiled 23 | project/boot/ 24 | project/plugins/project/ 25 | project/project 26 | /project/*-shim.sbt 27 | project/target 28 | # sbt specific 29 | # Scala-IDE specific 30 | server.pid 31 | src_managed/ 32 | target 33 | test-result 34 | tmp 35 | .history 36 | dist 37 | /.idea 38 | /*.iml 39 | /out 40 | /.idea_modules 41 | /.classpath 42 | /.project 43 | /.settings 44 | /bin/ 45 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/.gitignore: -------------------------------------------------------------------------------- 1 | .* 2 | !.gitignore 3 | bin 4 | *.class 5 | db 6 | dist 7 | dist/* 8 | eclipse 9 | *.eml 10 | *.iml 11 | # except for .gitignore 12 | # Extracted from https://github.com/ulrich/macaron-factory/blob/master/.gitignore 13 | # Ignore all dotfiles... 14 | # Ignore Play! working directory # 15 | lib 16 | lib_managed/ 17 | log 18 | *.log 19 | logs 20 | modules 21 | /out 22 | precompiled 23 | project/boot/ 24 | project/plugins/project/ 25 | project/project 26 | /project/*-shim.sbt 27 | project/target 28 | # sbt specific 29 | # Scala-IDE specific 30 | server.pid 31 | src_managed/ 32 | target 33 | test-result 34 | tmp 35 | temp/* 36 | .history 37 | dist 38 | /.idea 39 | /*.iml 40 | /out 41 | /.idea_modules 42 | /.classpath 43 | /.project 44 | /.settings 45 | /bin/ 46 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/build.sbt: -------------------------------------------------------------------------------- 1 | name := "Probic Spark Streaming Private Data Aggregator" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.11.8" 6 | 7 | libraryDependencies += "org.apache.spark" %% "spark-sql" % "2.1.1" % "provided" 8 | 9 | libraryDependencies += "org.apache.spark" %% "spark-streaming" % "2.1.1" % "provided" 10 | 11 | libraryDependencies += "fi.helsinki.cs.nodes" % "getopt-scala" % "1.1.0" 12 | 13 | libraryDependencies += "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0" 14 | 15 | // https://mvnrepository.com/artifact/commons-codec/commons-codec 16 | libraryDependencies += "commons-codec" % "commons-codec" % "1.10" 17 | 18 | libraryDependencies += "org.slf4j" % "slf4j-simple" % "1.7.25" 19 | 20 | assemblyJarName in assembly := "probic-streaming-aggregator.jar" 21 | 22 | mainClass in assembly := Some("fi.helsinki.cs.probic.streaming.Aggregator") 23 | 24 | -------------------------------------------------------------------------------- /dataset_tests/src/linear_regression_master.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 
Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for calculating Bayesian linear regression from sufficient stats 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | 13 | def get_regression_est(suff_stats, pars): 14 | #assume suff_stats is a dictionary containing suff stats as [X'X, X'y] 15 | #return dict with [prec, mean] 16 | 17 | #prior precisions 18 | l = 1 19 | l0 = 1 20 | 21 | palautettava = {} 22 | for k_stats in suff_stats.keys(): 23 | apu = {} 24 | try: 25 | apu['prec'] = l*(suff_stats[k_stats][0]) + l0*np.identity(pars['dim']) 26 | apu['mean'] = np.linalg.solve(apu['prec'],l*(suff_stats[k_stats][1])) 27 | except: 28 | apu['prec'] = None 29 | apu['mean'] = None 30 | palautettava[k_stats] = apu 31 | return palautettava -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/run_tensor_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | GDSC/drug sensitivity data 6 | 7 | Script for running tensor.py for a collection of drugs and CVs. 8 | 9 | clippingomega.py should be run before this. 10 | 11 | Run: python3 run_tensor_tests.py 12 | ''' 13 | 14 | import subprocess 15 | import sys 16 | 17 | import numpy as np 18 | 19 | n_drugs = 264 # 264 in the paper 20 | n_cv = 25 # 25 in the paper 21 | drugs_to_run = np.linspace(0,n_drugs,n_drugs+1,dtype='int') 22 | seeds_for_cv = np.linspace(0,n_cv,n_cv+1,dtype='int') 23 | 24 | #args to tensor.py: drug_id, seed 25 | for drug in drugs_to_run: 26 | print('Starting drug ' + str(drug)) 27 | for seed in seeds_for_cv: 28 | testi = subprocess.run(args=['python','tensor.py',str(drug),str(seed)], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 29 | print('stdout:\n' + testi.stdout.decode('utf-8')) 30 | print('stderr:\n' + testi.stderr.decode('utf-8')) 31 | 32 | print('All tensor tests done!') -------------------------------------------------------------------------------- /dataset_tests/src/pos_def_matrices.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for checking & fixing matrix positive definiteness. Works by eigendecomposing, and re-composing with absolute values of the original eigenvalues. 
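Illustrative NumPy sketch of the correction applied below (a toy 2x2 example,
not part of the module API):

    import numpy as np
    A = np.array([[1., 0.], [0., -2.]])   # not positive definite
    D, V = np.linalg.eig(A)
    A_fixed = np.dot(np.dot(V, np.diag(np.absolute(D))), np.linalg.inv(V))
    # A_fixed == [[1., 0.], [0., 2.]], i.e. V diag(|D|) V^-1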
8 | ''' 9 | 10 | import numpy as np 11 | 12 | def check(suff_stats, pars): 13 | 14 | if pars['enforce_pos_def'] == False: 15 | #simply flag non-pos.def matrices, no correction 16 | if pars['feedback'] > 0: 17 | for m in suff_stats: 18 | D, V = np.linalg.eig(suff_stats[m][0]) 19 | if np.sum(D < 0) > 0: 20 | print('Non-positive definite Cov matrix for {}'.format(m)) 21 | return suff_stats 22 | 23 | else: 24 | #eigendecompose, set eigenvalues to their absolute values & multiply back 25 | for m in suff_stats: 26 | apu = suff_stats[m][0] 27 | D, V = np.linalg.eig(apu) 28 | D = np.absolute(D) 29 | suff_stats[m][0] = np.dot( np.dot(V,np.diag(D)) ,np.linalg.inv(V)) 30 | return suff_stats 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 DPBayes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /dataset_tests/src/README.md: -------------------------------------------------------------------------------- 1 | # Differentially private Bayesian learning on distributed data 2 | 3 | Code for running the tests in the paper "Differentially private Bayesian learning on distributed data" (arXiv:1703.01106). 4 | 5 | 6 | ## Requirements 7 | 8 | The code uses Python3 with Numpy (tested with 1.11.1), Scipy (0.17.1), and Matplotlib (1.5.3). 9 | 10 | 11 | ## Running the tests 12 | 13 | To run the tests using UCI data, get the Abalone and Wine Quality datasets (https://archive.ics.uci.edu/ml/datasets.html), 14 | set the data location in UCI_data_getter.py and 15 | use eps_data_test.py. The results can be plotted using combine_prediction_erros.py. 16 | 17 | For the GDSC data, set the options in tensor.py and use clippingomega.py followed by run_tensor_tests.py in the drugsens_code-folder. To plot the results, run tensorresults.py followed by plot_tensor_results.py. 18 | 19 | See the paper "Efficient differentially private learning improves drug sensitivity prediction" (arXiv:1606.02109) for more information on the GDSC data pre-processing. 20 | 21 | For running the Spark tests, see the separate [readme at the `probic-decrypt-server` folder](../../probic-decrypt-server). 
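As a rough sketch, the two workflows above boil down to the following command
sequences (run from `dataset_tests/src`; set the paths and options in the
scripts first, as described above):

```bash
# UCI data (Abalone / Wine Quality)
python3 eps_data_test.py
python3 combine_pred_errors.py

# GDSC drug sensitivity data (run inside drugsens_code/)
python3 clippingomega.py
python3 run_tensor_tests.py
python3 tensorresults.py
python3 plot_tensor_results.py
```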
22 | -------------------------------------------------------------------------------- /dataset_tests/src/calculate_pred_errors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Function for calculating predictive errors. 8 | ''' 9 | 10 | import numpy as np 11 | from matplotlib import pyplot as plt 12 | 13 | import data_reader 14 | 15 | def calculate_errors(data, dim, filename_data, model_coeff): 16 | regr_coeff_mu = model_coeff['mean'] 17 | regr_coeff_std = model_coeff['prec'] 18 | if regr_coeff_mu is None: 19 | return None, None, None, None, None 20 | 21 | #read data to numpy array (where target = last column) 22 | if filename_data is not '': 23 | data = np.zeros((data[0], dim+1)) 24 | apu = dataReader.read_data(filename_data) 25 | for i in range(len(apu)): 26 | data[i,:] = apu[i] 27 | #center data 28 | data = data - np.mean(data, axis = 0) 29 | 30 | #calculate predictions (MAP) 31 | preds = np.dot(regr_coeff_mu, np.transpose(data[:,:-1]) ) 32 | 33 | #calculate errors 34 | MAE = np.mean( np.absolute(data[:,-1] - preds) ) 35 | MSE = np.mean( (data[:,-1] - preds)**2 ) 36 | 37 | return MAE, MSE, np.mean(preds), np.std(preds), np.amax(preds)-np.amin(preds) -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg2.txt: -------------------------------------------------------------------------------- 1 | 9 --d 10 --clients 100 2 | Total time: 3275 ms. 3 | 9 --d 10 --clients 100 4 | Total time: 2588 ms. 5 | 9 --d 10 --clients 100 6 | Total time: 3355 ms. 7 | 9 --d 10 --clients 100 8 | Total time: 3572 ms. 9 | 9 --d 10 --clients 100 10 | Total time: 4258 ms. 11 | 9 --d 10 --clients 100 12 | Total time: 4032 ms. 13 | 9 --d 10 --clients 100 14 | Total time: 3889 ms. 15 | 9 --d 10 --clients 100 16 | Total time: 5156 ms. 17 | 9 --d 10 --clients 100 18 | Total time: 3567 ms. 19 | 9 --d 100 --clients 100 20 | Total time: 6668 ms. 21 | 9 --d 100 --clients 100 22 | Total time: 6512 ms. 23 | 9 --d 100 --clients 100 24 | Total time: 6262 ms. 25 | 9 --d 100 --clients 100 26 | Total time: 6397 ms. 27 | 9 --d 100 --clients 100 28 | Total time: 6079 ms. 29 | 9 --d 100 --clients 100 30 | Total time: 6247 ms. 31 | 9 --d 1000 --clients 100 32 | Total time: 39086 ms. 33 | 9 --d 1000 --clients 100 34 | Total time: 37841 ms. 35 | 9 --d 1000 --clients 100 36 | Total time: 39017 ms. 37 | 9 --d 1000 --clients 100 38 | Total time: 38494 ms. 39 | 9 --d 1000 --clients 100 40 | Total time: 38236 ms. 41 | 9 --d 1000 --clients 100 42 | Total time: 38246 ms. 43 | 9 --d 1000 --clients 100 44 | Total time: 37514 ms. 45 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestClient.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.Socket 6 | import java.io.DataInputStream 7 | import java.io.DataOutputStream 8 | import fi.helsinki.cs.probic.crypto.PkCrypto 9 | import scala.collection.Seq 10 | 11 | /** 12 | * Test the server by sending it encrypted messages forever. 
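 *
 * Typical invocation, as in test-scripts/test-client.sh (the --port option is
 * optional and defaults to 8080):
 *
 * {{{
 * java -cp target/scala-2.11/probic-server.jar fi.helsinki.cs.probic.test.TestClient --port 8080
 * }}}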
13 | */ 14 | object TestClient extends OptMain with LazyLogging { 15 | 16 | val DEFAULT_PORT = "8080" 17 | 18 | val longOptions = Seq("port=") 19 | 20 | val shortOptions = "" 21 | 22 | def optMain() { 23 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 24 | 25 | val crypto = new PkCrypto("probic") 26 | val encrypt = crypto.getEncrypter("probic") 27 | for (i <- 0 until 1000) { 28 | val plainText = s"Test Number $i" 29 | logger.info(plainText) 30 | val cryptoText = encrypt(plainText) 31 | val sock = new Socket("localhost", port) 32 | val out = new DataOutputStream(sock.getOutputStream) 33 | out.writeInt(cryptoText.length) 34 | out.write(cryptoText) 35 | val in = new DataInputStream(sock.getInputStream) 36 | val returned = in.readUTF() 37 | sock.close 38 | logger.info(s"Server returned: $returned") 39 | assert(plainText == returned) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /dataset_tests/src/estimate_vars.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for estimating data & target marginal variances for optimal clipping when not assuming an auxiliary open dataset. 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | 13 | import sufficient_stats 14 | 15 | def get_estimates(data, pars, small_pos=.5): 16 | #Note: uses 1 clip for data and 1 for target; both scaled according to individual dim std 17 | 18 | N_train = data.shape[0] 19 | dim = pars['dim'] 20 | 21 | #clip data to the assumed data range 22 | data[:,0:-1] = np.sign(data[:,0:-1]) * np.minimum( np.absolute(data[:,0:-1]), pars['assumed_data_range'][0] ) 23 | data[:,-1] = np.sign(data[:,-1]) * np.minimum( np.absolute(data[:,-1]), pars['assumed_data_range'][1] ) 24 | 25 | 26 | eps=pars['privacy_for_marg_var']*pars['epsilon'] 27 | delta=pars['privacy_for_marg_var']*pars['delta'] 28 | 29 | sigma = np.sqrt( 1/(N_train-1) * 2*np.log(1.25/delta)) * (np.sqrt(dim*(pars['assumed_data_range'][0]**2)+pars['assumed_data_range'][1]**2) / eps) 30 | 31 | #add noise 32 | products = np.add(data**2, np.random.normal(0,sigma,[N_train,dim+1]) ) 33 | 34 | vars = np.nansum(products,0)/N_train 35 | ind = vars <= 0 36 | 37 | #set vars to small positive numbers if negative 38 | if sum(ind) > 0: 39 | vars[ind] = small_pos 40 | return vars 41 | 42 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/src/main/scala/fi/helsinki/cs/nodes/util/Spark2Main.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.nodes.util 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * 7 | * @author Eemil Lagerspetz 8 | */ 9 | trait Spark2Main extends OptMain { 10 | /** 11 | * Whether to compress Spark outputs. Required. 12 | */ 13 | val sparkOutputCompression: Boolean 14 | 15 | /** 16 | * Main entry point. Configures Spark and parses args for options specified in `shortOptSpec` and `longOptSpec` (see getopt-scala docs). 17 | */ 18 | def sparkMain(spark: SparkSession) 19 | 20 | /** 21 | * Main entry point. Configures Spark and parses args, then passes control to [[fi.helsinki.cs.nodes.carat.util.SparkMain#sparkMain]] . 
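 *
 * Minimal sketch of a job built on this trait (illustrative only: the object
 * name and the "input" option are made up for the example):
 *
 * {{{
 * object LineCount extends Spark2Main {
 *   val sparkOutputCompression = false
 *   val longOptions = Seq("input=")
 *   val shortOptions = ""
 *
 *   def sparkMain(spark: SparkSession) {
 *     // Count the lines of the text file given with --input and print the result.
 *     val lines = spark.read.textFile(mandatoryOption("input"))
 *     println(s"Line count: ${lines.count()}")
 *   }
 * }
 * }}}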
22 | */ 23 | def optMain() { 24 | val sb = SparkSession 25 | .builder() 26 | .appName(getClass.getName.replaceAll("$", "")) 27 | 28 | val spark = { 29 | if (sparkOutputCompression) 30 | enableCompression(sb).getOrCreate() 31 | else 32 | sb.getOrCreate() 33 | } 34 | 35 | sparkMain(spark) 36 | } 37 | 38 | private def enableCompression(sb: SparkSession.Builder) = { 39 | sb.config("spark.hadoop.mapred.output.compress", true) 40 | .config("spark.hadoop.mapred.output.compression.codec", true) 41 | .config("spark.hadoop.mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec") 42 | .config("spark.hadoop.mapred.output.compression.type", "BLOCK") 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/clippingomega.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | Choose parameters for clipping using auxiliary data. 
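The resulting thresholds are written to <csvpath>C-WX.csv and <csvpath>C-WY.csv,
with one row per pv_size value and one column per epsilon value.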
12 | 13 | Run: python3 clippingomega.py 14 | ''' 15 | 16 | import sys 17 | import os 18 | 19 | import diffpri as dp 20 | import numpy as np 21 | import csv 22 | 23 | # average number of non-missing data ~ 400 24 | pv_size = [400] # 400 in the paper 25 | 26 | ## NOTE: set these to match the values in tensor.py 27 | #privacy budget: lists of similar length 28 | eps = [1.0,3.0,5.0,7.5,10.0] 29 | delta_list = np.zeros(shape=len(eps))+10e-4 30 | np.random.seed(1) 31 | ny = len(pv_size) 32 | csvpath = '' # path for output csv files 33 | privacy_for_marg_var = .3 # .3 in the paper 34 | 35 | nx = len(eps) 36 | WX = np.zeros((ny,nx),dtype=np.float) 37 | WY = np.zeros((ny,nx),dtype=np.float) 38 | print('Finding optimal projection threshold...') 39 | for i in range(len(pv_size)): 40 | for j in range(len(eps)): 41 | n= pv_size[i] 42 | d = 10 43 | 44 | e = eps[j]*(1-privacy_for_marg_var) 45 | delta = delta_list[j]*(1-privacy_for_marg_var) 46 | 47 | w_x,w_y = dp.omega(n,d,e,delta,method='corr',ln=10) 48 | WX[i,j] = w_x 49 | WY[i,j] = w_y 50 | 51 | print('WX:\n'+str(WX)) 52 | print('WY:\n'+str(WY)) 53 | print('done!') 54 | np.savetxt(csvpath+'C-WX.csv',WX,delimiter=',') 55 | np.savetxt(csvpath+'C-WY.csv',WY,delimiter=',') 56 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg-rerun.txt: -------------------------------------------------------------------------------- 1 | 9 --d 100 --clients 100 2 | Total time: 6275 ms. 3 | 9 --d 100 --clients 100 4 | Total time: 5792 ms. 5 | 9 --d 100 --clients 100 6 | Total time: 5972 ms. 7 | 9 --d 100 --clients 100 8 | Total time: 6125 ms. 9 | 9 --d 100 --clients 100 10 | Total time: 6163 ms. 11 | 9 --d 10 --clients 100 12 | Total time: 3139 ms. 13 | 9 --d 10 --clients 1000 14 | Total time: 7360 ms. 15 | 9 --d 10 --clients 1000 16 | Total time: 3691 ms. 17 | 9 --d 10 --clients 1000 18 | Total time: 3637 ms. 19 | 9 --d 10 --clients 1000 20 | Total time: 3217 ms. 21 | 9 --d 10 --clients 1000 22 | Total time: 3424 ms. 23 | 9 --d 100 --clients 1000 24 | Total time: 9037 ms. 25 | 9 --d 100 --clients 1000 26 | Total time: 9286 ms. 27 | 9 --d 100 --clients 1000 28 | Total time: 9072 ms. 29 | 9 --d 100 --clients 1000 30 | Total time: 8949 ms. 31 | 9 --d 100 --clients 1000 32 | Total time: 8879 ms. 33 | 9 --d 1000 --clients 1000 34 | Total time: 68311 ms. 35 | 9 --d 10 --clients 10000 36 | Total time: 9974 ms. 37 | 9 --d 10 --clients 10000 38 | Total time: 6604 ms. 39 | 9 --d 10 --clients 10000 40 | Total time: 6188 ms. 41 | 9 --d 10 --clients 10000 42 | Total time: 6042 ms. 43 | 9 --d 10 --clients 10000 44 | Total time: 6263 ms. 45 | 9 --d 100 --clients 10000 46 | Total time: 40170 ms. 47 | 9 --d 100 --clients 10000 48 | Total time: 35866 ms. 49 | 9 --d 100 --clients 10000 50 | Total time: 34550 ms. 51 | 9 --d 100 --clients 10000 52 | Total time: 35123 ms. 53 | 9 --d 100 --clients 10000 54 | Total time: 33611 ms. 55 | 9 --d 1000 --clients 10000 56 | Total time: 316652 ms. 57 | 9 --d 1000 --clients 10000 58 | Total time: 315078 ms. 59 | 9 --d 1000 --clients 10000 60 | Total time: 309505 ms. 61 | 9 --d 1000 --clients 10000 62 | Total time: 310065 ms. 63 | 9 --d 1000 --clients 10000 64 | Total time: 306264 ms. 
65 | -------------------------------------------------------------------------------- /dataset_tests/src/UCI_data_getter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for reading UCI datasets. Returns a dataset with target as the last col. 8 | ''' 9 | 10 | import numpy as np 11 | 12 | import data_reader 13 | 14 | data_folder = 'data/' 15 | 16 | #abalone dataset, predict abalone age 17 | def get_abalone(): 18 | filename = data_folder + 'abalone/abalone.data' 19 | apu = data_reader.read_data(filename) 20 | data = np.zeros( (len(apu),len(apu[0]))) 21 | for k_row in range(data.shape[0]): 22 | data[k_row,1:] = apu[k_row][1:] 23 | #code categorical sex as 0=male, 1=female 24 | if apu[k_row][0] == 'M': 25 | data[k_row,0] = 0 26 | else: 27 | data[k_row,0] = 1 28 | return data 29 | 30 | #predict concrete compressive strength 31 | def get_concrete(): 32 | #8 mittausta ja target 33 | filename = data_folder + 'concrete/Concrete_Data.txt' 34 | apu = data_reader.read_data(filename) 35 | data = np.zeros( (len(apu)-1,len(apu[0]))) 36 | #0s rivi=nimet 37 | for k_row in range(1,data.shape[0]): 38 | data[k_row,:] = apu[k_row] 39 | return data 40 | 41 | #predict wine quality, red & white separately 42 | def get_red_wine(): 43 | filename = data_folder + 'wine/winequality-red.csv' 44 | apu = data_reader.read_data(filename) 45 | data = np.zeros((len(apu)-1,len(apu[0][0].split(";") ))) 46 | for k_row in range(1, data.shape[0]): 47 | data[k_row-1,:] = apu[k_row][0].split(";") 48 | #0th row=names 49 | return data 50 | def get_white_wine(): 51 | filename = data_folder + 'wine/winequality-white.csv' 52 | apu = data_reader.read_data(filename) 53 | data = np.zeros((len(apu)-1,len(apu[0][0].split(";") ))) 54 | for k_row in range(1, data.shape[0]): 55 | data[k_row-1,:] = apu[k_row][0].split(";") 56 | #0th row=names 57 | return data 58 | 59 | if __name__=='__main__': 60 | get_white_wine() -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/tensorresults.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Script for combining results produced by tensor.py. 14 | 15 | run_tensor_tests.py / tensor.py should be run before this. 
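Reads the per-drug, per-fold result pickles from `inpath` and writes the
combined results to <outpath>drugsens_test.pickle, which plot_tensor_results.py
then reads.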
16 | 17 | Run: python3 tensorresults 18 | ''' 19 | 20 | import numpy as np 21 | import csv 22 | import os.path 23 | import pickle 24 | import sys 25 | from collections import OrderedDict 26 | 27 | drug_nbo = 264 # 264 in the paper 28 | cv_rounds = 25 # 25 in the paper 29 | 30 | pv_size = [840] # [840] in the paper 31 | 32 | #privacy budget as lists of same length 33 | eps = [1.0,3.0,5.0,7.5,10.0] 34 | delta_list = np.zeros(shape=len(eps))+10e-4 35 | 36 | # Set folders 37 | inpath = 'res/' # set path for individual files from different drugs and folds 38 | outpath = 'resultsdata/' # set path for computed final results 39 | inprefix = 'cliptest-drugsens-' # set input file prefix 40 | outprefix = 'tensor-' # set output file prefix 41 | 42 | indatapath = inpath+inprefix 43 | outdatapath = outpath+outprefix 44 | datapath = indatapath 45 | 46 | all_means = OrderedDict() 47 | methods = ['true', 'clipped','noisy','cl_noisy','noisy_ind','cl_noisy_ind','scaling','cl_scaling','cl_true_TA','cl_true_TA','cl_true_TA_DP'] 48 | 49 | for m in methods: 50 | all_means[m] = OrderedDict() 51 | 52 | means = np.zeros((cv_rounds, drug_nbo, len(eps))) 53 | print('array shape (cv, drugs, eps): '+str(means.shape)) 54 | 55 | for k_cv in range(cv_rounds): 56 | for k_drug in range(drug_nbo): 57 | filename = datapath+str(k_drug)+'-'+str(k_cv)+'.pickle' 58 | with open(filename, 'rb') as f: 59 | apu = pickle.load(f) 60 | 61 | means[k_cv, k_drug, :] = apu[m] 62 | 63 | all_means[m]['mean'] = np.mean(means,1) 64 | all_means[m]['std'] = np.std(means,1) 65 | 66 | # Save data 67 | with open(outpath+'drugsens_test.pickle', 'wb') as f: 68 | pickle.dump(all_means,f,pickle.HIGHEST_PROTOCOL) 69 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestDataWriter.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import org.apache.commons.codec.binary.Base64 6 | import fi.helsinki.cs.probic.crypto.PkCrypto 7 | import fi.helsinki.cs.probic.data.GenerateTestDataMatrix 8 | import scala.collection.Seq 9 | 10 | /** 11 | * Mandatory options: 12 | * --intype hdfs or --intype socket 13 | * --input hdfs://path/to/input/folder 14 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 15 | */ 16 | object TestDataWriter extends OptMain with LazyLogging { 17 | 18 | val longOptions = Seq("port=", "masters=", "certs=", "input=", "clients=", "output=") 19 | 20 | val shortOptions = "" 21 | 22 | def optMain() { 23 | val output = mandatoryOption("output") 24 | val certs = optional("certs").getOrElse("probic").split(",") 25 | val masters = optional("masters").getOrElse("localhost:8080").split(",") 26 | val clients = mandatoryOption("clients").toInt 27 | 28 | val input = mandatoryOption("input") 29 | 30 | val crypto = new PkCrypto("probic") // Private test key, not relevant in this program 31 | 32 | val servers = for (i <- 0 until certs.length) yield { 33 | masters(i) -> crypto.getEncrypter(certs(i)) 34 | } 35 | val inputLines = io.Source.fromFile(input).getLines().toSeq 36 | val outputFile = inputLines.flatMap(line => getOutputLines(servers, line, clients).flatten) 37 | GenerateTestDataMatrix.toFile(output, outputFile) 38 | } 39 | 40 | def getOutputLines(servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 41 | val items = 
line.split(";") 42 | val itemsPerClient = items.length / clients 43 | val clientItems = items.grouped(itemsPerClient).toSeq 44 | for (client <- 0 until clients) yield { 45 | val myItems = clientItems(client) 46 | for (item <- 0 until myItems.length) yield { 47 | val (master, encrypt) = servers(item % servers.length) 48 | val data = myItems(item) 49 | val cryptoText = encrypt(data + "") 50 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}" 51 | msg 52 | } 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/crypto/PkCrypto.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.crypto 2 | 3 | import java.security.KeyStore 4 | import javax.crypto.Cipher 5 | import org.apache.commons.codec.binary.Base64 6 | import javax.security.cert.X509Certificate 7 | 8 | /** 9 | * Mandatory options: 10 | * --intype hdfs or --intype socket 11 | * --input hdfs://path/to/input/folder 12 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 13 | */ 14 | class PkCrypto(cert: String) { 15 | 16 | val keystorePath = "keystore.jks" 17 | 18 | lazy val password = scala.io.Source.fromFile("secret.txt", "UTF-8").getLines().toSeq.head.toCharArray 19 | 20 | lazy val decrypter = getDecrypt() 21 | 22 | lazy val key = getKey() 23 | 24 | def getKey() = { 25 | val ks = KeyStore.getInstance(KeyStore.getDefaultType()) 26 | 27 | var fis: java.io.FileInputStream = null 28 | try { 29 | fis = new java.io.FileInputStream(keystorePath) 30 | ks.load(fis, password) 31 | } finally { 32 | if (fis != null) { 33 | fis.close(); 34 | } 35 | } 36 | 37 | val k = ks.getKey(cert, password) 38 | k 39 | } 40 | 41 | /** 42 | * Method for testing only 43 | */ 44 | def getCert(id: String) = { 45 | val ks = KeyStore.getInstance(KeyStore.getDefaultType()) 46 | 47 | var fis: java.io.FileInputStream = null 48 | try { 49 | fis = new java.io.FileInputStream(keystorePath) 50 | ks.load(fis, password) 51 | } finally { 52 | if (fis != null) { 53 | fis.close(); 54 | } 55 | } 56 | 57 | val k = ks.getCertificate(id) 58 | k 59 | } 60 | 61 | def getDecrypt() = { 62 | val rsa = Cipher.getInstance("RSA") 63 | rsa.init(Cipher.DECRYPT_MODE, key) 64 | rsa 65 | } 66 | 67 | def decrypt(cryptoText: Array[Byte]) = synchronized { 68 | new String(decrypter.doFinal(cryptoText)) 69 | } 70 | 71 | def getRsa(pk: String) = { 72 | val publicBytes = Base64.decodeBase64(pk) 73 | val keySpec = X509Certificate.getInstance(publicBytes) 74 | val pubKey = keySpec.getPublicKey 75 | val rsa = Cipher.getInstance("RSA") 76 | rsa.init(Cipher.ENCRYPT_MODE, pubKey) 77 | rsa 78 | } 79 | 80 | def pkEncrypt(rsa: Cipher)(clearText: String) = { 81 | rsa.doFinal(clearText.getBytes) 82 | } 83 | 84 | /** 85 | * Prepare public key encryption engine for faster use. Used by test data generation and the clients sending the data. 
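 *
 * Example, mirroring the usage in TestClient and TestDataWriter (the aliases
 * must exist in keystore.jks):
 *
 * {{{
 * val crypto = new PkCrypto("probic")             // alias of our own private key
 * val encrypt = crypto.getEncrypter("probic-1")   // alias of the recipient's certificate
 * val cipherText: Array[Byte] = encrypt("some plain text")
 * }}}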
86 | */ 87 | def getEncrypter(id: String) = { 88 | val k = getCert(id).getEncoded 89 | val b64 = Base64.encodeBase64(k) 90 | val rsa = getRsa(new String(b64)) 91 | pkEncrypt(rsa) _ 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /dataset_tests/src/sufficient_stats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Function for calculating sufficient statistics with perturbations added by individual clients. 8 | ''' 9 | 10 | import csv 11 | import numpy as np 12 | import sys 13 | 14 | def ss_individually(data, add_noise=False, sigma=None, use_spark = False, filename = None, n_spark_messages=None, spark_noise_range=None, fixed_point_int=None): 15 | 16 | k_clients = data.shape[0] 17 | dim = data.shape[1]-1 #dimensions without target 18 | 19 | added_noise = np.zeros((k_clients, dim*(dim+1)//2+dim)) 20 | 21 | #construct the products in X'X and X'y individually for each client, with or without added noise 22 | products = np.zeros([k_clients, dim*(dim+1)//2 + dim] ) 23 | ind = 0 24 | added_noise = np.zeros((k_clients, dim*(dim+1)//2+dim)) 25 | added_noise = np.random.normal(0,sigma, (k_clients, dim*(dim+1)//2+dim)) 26 | 27 | #suff_stat1 28 | for i in range(dim): 29 | for ii in range(i+1): 30 | products[:,ind] = (data[0:k_clients,i] * data[0:k_clients,ii]) 31 | ind += 1 32 | #suff_stat2 33 | for i in range(dim): 34 | products[:,ind] = (data[0:k_clients,i] * data[0:k_clients,-1]) 35 | ind += 1 36 | 37 | if not add_noise: 38 | added_noise = 0 39 | 40 | products += added_noise 41 | 42 | # save test data for Spark 43 | if use_spark and filename is not None: 44 | # use fixed-point representation in the noisy messages 45 | products = np.floor(products*fixed_point_int).astype('int64') 46 | # save as a matrix with n_clients rows and n_messages*suff_stats-dim columns, s.t. 
first n_messages cols correspond to the first element in the sufficient stats 47 | noisy_matrix = np.zeros((products.shape[0],products.shape[1]*n_spark_messages),dtype='int64') 48 | for i in range(products.shape[1]): 49 | noise = np.random.randint(-spark_noise_range,spark_noise_range, (products.shape[0],n_spark_messages-1) ,dtype='int64') 50 | noisy_matrix[:,n_spark_messages*i] = products[:,i] 51 | noisy_matrix[:,n_spark_messages*i:(n_spark_messages*(i+1)-1)] += noise 52 | noisy_matrix[:,n_spark_messages*(i+1)-1] = -np.sum(noise,1) 53 | np.savetxt(filename, noisy_matrix, delimiter=';') 54 | print('Saved sufficient statistics to file for Spark:\n {}'.format(filename)) 55 | sys.exit() 56 | 57 | noisy_sum = np.sum(products, axis=0) 58 | #suff stats for X'X 59 | suff_stat1 = np.zeros([dim,dim]) 60 | suff_stat1[np.tril_indices(dim,0)] = noisy_sum[0:dim*(dim+1)//2] 61 | suff_stat1 = suff_stat1 + np.triu(np.transpose(suff_stat1),k=1) 62 | #suff stat for X'y 63 | suff_stat2 = np.zeros([dim,1]) 64 | suff_stat2 = noisy_sum[(dim*(dim+1)//2):] 65 | 66 | return suff_stat1, suff_stat2, added_noise 67 | 68 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/plot_tensor_results.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Script for plotting tensor test results from pickle files. 14 | 15 | tensorresults.py should be run before this. 
16 | 17 | Run: python3 plot_tensor_results.py 18 | ''' 19 | 20 | import pickle 21 | import sys 22 | 23 | import numpy as np 24 | from matplotlib import pyplot as plt 25 | 26 | 27 | # PLOTTING CONFIGURATIONS 28 | # set these to match tensorresults.py 29 | 30 | # Result filename 31 | filename = 'resultsdata/drugsens_test.pickle' 32 | #filename = 'resultsdata/NIPS_camera_ready/data_bounds/drugsens_test.pickle' 33 | #filename = 'resultsdata/NIPS_camera_ready/fixed_bounds/drugsens_test.pickle' 34 | 35 | # Save figure to file 36 | save_to_file = False 37 | fig_name = 'NIPS_camera_ready_plots/GDSC_drugsens_NIPS_final_all.pdf' 38 | #fig_name = 'NIPS_camera_ready_plots/GDSC_drugsens_NIPS_final_selected.pdf' 39 | 40 | # Methods to plot 41 | # plot all methods 42 | no_plotting = ['cl_scaling','cl_noisy', 'cl_true_TA'] 43 | # plot selected methods 44 | #no_plotting = ['clipped','cl_scaling','noisy','cl_noisy', 'cl_true_TA','cl_true_TA_DP'] 45 | 46 | ############################################################################### 47 | metodit = ['true', 'clipped', 'noisy', 'cl_noisy', 'noisy_ind', 'cl_noisy_ind', 'scaling','cl_scaling','cl_true_TA','cl_true_TA_DP'] 48 | 49 | nimet_dict = {'true':'NP', 'clipped':'proj NP','noisy':'TA', 'cl_noisy':'proj TA (noise)', 'noisy_ind':'DDP', 'cl_noisy_ind':'proj DDP', 'scaling':'input\nperturbed','cl_scaling':'proj scaling','cl_true_TA':'proj TA (not DP)', 'cl_true_TA_DP':'proj TA' } 50 | 51 | #use same colors for corresponding non-clipped & clipped methods except for DDP 52 | col_dict = {'true':'blue', 'clipped':'gray','noisy':'lightseagreen', 'cl_noisy':'green', 'noisy_ind':'red', 'cl_noisy_ind':'magenta', 'scaling':'orange','cl_scaling':'orange','cl_true_TA':'brown', 'cl_true_TA_DP':'darkgreen'} 53 | 54 | with open(filename, 'rb') as f: 55 | res_all = pickle.load(f) 56 | 57 | # parameters 58 | eps = [1.0,3.0,5.0,7.5,10.0] 59 | n_train = 840 60 | # Note: some drugs might not have the same number of training data 61 | n_test = 100 62 | 63 | x = np.linspace(1,len(eps),num=len(eps)) 64 | y_err_lower = None 65 | y_lower = {} 66 | y_upper = {} 67 | k_col = 0 68 | plt.figure() 69 | ax = plt.gca() 70 | offset = -3 71 | offset_factor = 0.05 72 | for m in metodit: 73 | k_col = k_col + 1 #? 74 | if m not in no_plotting: 75 | y_lower[m] = np.zeros(len(eps)) 76 | y_upper[m] = np.zeros(len(eps)) 77 | 78 | #plot non-private with dashed line 79 | if m in ['true','clipped']: 80 | linetype = '--' 81 | else: 82 | linetype = '-' 83 | 84 | ax.errorbar(x+offset*offset_factor,np.mean(res_all[m]['mean'],0), 85 | yerr=np.std(res_all[m]['mean'],0),ls=linetype,marker='',linewidth=2,label=m ,color=col_dict[m], elinewidth=1.5) 86 | plt.axis([.79,5.21,-.03,.27]) 87 | 88 | offset += 1 89 | 90 | plt.tight_layout(pad=7) 91 | nimet = [] 92 | for m in metodit: 93 | if m not in no_plotting: 94 | nimet.append(nimet_dict[m]) 95 | 96 | plt.xticks(x,eps) 97 | plt.ylabel('Predictive accuracy') 98 | plt.xlabel('epsilon') 99 | plt.suptitle('d=10, sample size=840, CV=25, $\delta$=0.0001', y=.12, fontsize=13) 100 | 101 | #legend on top 102 | plt.legend(nimet,bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.) 
103 | 104 | if save_to_file: 105 | plt.savefig(fig_name, bbox_inches='tight') 106 | else: 107 | plt.show() 108 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg4-serverupdated.txt: -------------------------------------------------------------------------------- 1 | 9 --d 10 --clients 100 2 | Total time: 1788 ms. 3 | 9 --d 10 --clients 100 4 | Total time: 1705 ms. 5 | 9 --d 10 --clients 100 6 | Total time: 1726 ms. 7 | 9 --d 10 --clients 100 8 | Total time: 1660 ms. 9 | 9 --d 10 --clients 100 10 | Total time: 1728 ms. 11 | 9 --d 100 --clients 100 12 | Total time: 2018 ms. 13 | 9 --d 100 --clients 100 14 | Total time: 2015 ms. 15 | 9 --d 100 --clients 100 16 | Total time: 2050 ms. 17 | 9 --d 100 --clients 100 18 | Total time: 2096 ms. 19 | 9 --d 100 --clients 100 20 | Total time: 1977 ms. 21 | 9 --d 1000 --clients 100 22 | Total time: 3435 ms. 23 | 9 --d 1000 --clients 100 24 | Total time: 3456 ms. 25 | 9 --d 1000 --clients 100 26 | Total time: 3441 ms. 27 | 9 --d 1000 --clients 100 28 | Total time: 3549 ms. 29 | 9 --d 1000 --clients 100 30 | Total time: 3277 ms. 31 | 9 --d 10000 --clients 100 32 | Total time: 15203 ms. 33 | 9 --d 10000 --clients 100 34 | Total time: 15828 ms. 35 | 9 --d 10000 --clients 100 36 | Total time: 15766 ms. 37 | 9 --d 10000 --clients 100 38 | Total time: 15040 ms. 39 | 9 --d 10000 --clients 100 40 | Total time: 14686 ms. 41 | 9 --d 10 --clients 1000 42 | Total time: 1892 ms. 43 | 9 --d 10 --clients 1000 44 | Total time: 1843 ms. 45 | 9 --d 10 --clients 1000 46 | Total time: 1934 ms. 47 | 9 --d 10 --clients 1000 48 | Total time: 1885 ms. 49 | 9 --d 10 --clients 1000 50 | Total time: 1872 ms. 51 | 9 --d 100 --clients 1000 52 | Total time: 2862 ms. 53 | 9 --d 100 --clients 1000 54 | Total time: 2798 ms. 55 | 9 --d 100 --clients 1000 56 | Total time: 2817 ms. 57 | 9 --d 100 --clients 1000 58 | Total time: 2828 ms. 59 | 9 --d 100 --clients 1000 60 | Total time: 2978 ms. 61 | 9 --d 1000 --clients 1000 62 | Total time: 10173 ms. 63 | 9 --d 1000 --clients 1000 64 | Total time: 10678 ms. 65 | 9 --d 1000 --clients 1000 66 | Total time: 11008 ms. 67 | 9 --d 1000 --clients 1000 68 | Total time: 10178 ms. 69 | 9 --d 1000 --clients 1000 70 | Total time: 10786 ms. 71 | 9 --d 10000 --clients 1000 72 | Total time: 85990 ms. 73 | 9 --d 10000 --clients 1000 74 | Total time: 83952 ms. 75 | 9 --d 10000 --clients 1000 76 | Total time: 86265 ms. 77 | 9 --d 10000 --clients 1000 78 | Total time: 84180 ms. 79 | 9 --d 10000 --clients 1000 80 | Total time: 84358 ms. 81 | 9 --d 10 --clients 10000 82 | Total time: 2913 ms. 83 | 9 --d 10 --clients 10000 84 | Total time: 2917 ms. 85 | 9 --d 10 --clients 10000 86 | Total time: 2973 ms. 87 | 9 --d 10 --clients 10000 88 | Total time: 3081 ms. 89 | 9 --d 10 --clients 10000 90 | Total time: 3065 ms. 91 | 9 --d 100 --clients 10000 92 | Total time: 12495 ms. 93 | 9 --d 100 --clients 10000 94 | Total time: 12247 ms. 95 | 9 --d 100 --clients 10000 96 | Total time: 12843 ms. 97 | 9 --d 100 --clients 10000 98 | Total time: 11950 ms. 99 | 9 --d 100 --clients 10000 100 | Total time: 12268 ms. 101 | 9 --d 1000 --clients 10000 102 | Total time: 103702 ms. 103 | 9 --d 1000 --clients 10000 104 | Total time: 101318 ms. 105 | 9 --d 1000 --clients 10000 106 | Total time: 100846 ms. 107 | 9 --d 1000 --clients 10000 108 | Total time: 99636 ms. 109 | 9 --d 1000 --clients 10000 110 | Total time: 100498 ms. 111 | 9 --d 10000 --clients 10000 112 | Total time: 1003056 ms. 
113 | 9 --d 10000 --clients 10000 114 | Total time: 979955 ms. 115 | 9 --d 10000 --clients 10000 116 | Total time: 1006304 ms. 117 | 9 --d 10000 --clients 10000 118 | Total time: 986876 ms. 119 | 9 --d 10000 --clients 10000 120 | Total time: 998624 ms. 121 | 9 --d 10 --clients 100000 122 | Total time: 8420 ms. 123 | 9 --d 10 --clients 100000 124 | Total time: 8568 ms. 125 | 9 --d 10 --clients 100000 126 | Total time: 8723 ms. 127 | 9 --d 10 --clients 100000 128 | Total time: 8678 ms. 129 | 9 --d 10 --clients 100000 130 | Total time: 8502 ms. 131 | 9 --d 100 --clients 100000 132 | Total time: 67553 ms. 133 | 9 --d 100 --clients 100000 134 | Total time: 66513 ms. 135 | 9 --d 100 --clients 100000 136 | Total time: 64927 ms. 137 | 9 --d 100 --clients 100000 138 | Total time: 64764 ms. 139 | 9 --d 100 --clients 100000 140 | Total time: 64463 ms. 141 | 9 --d 1000 --clients 100000 142 | Total time: 636229 ms. 143 | 9 --d 1000 --clients 100000 144 | Total time: 601287 ms. 145 | 9 --d 1000 --clients 100000 146 | Total time: 600590 ms. 147 | 9 --d 1000 --clients 100000 148 | Total time: 614189 ms. 149 | 9 --d 1000 --clients 100000 150 | Total time: 600433 ms. 151 | 9 --d 1000 --clients 100000 --repeats 10 152 | Total time: 1634766 ms. 153 | 9 --d 1000 --clients 100000 --repeats 10 154 | Total time: 1581326 ms. 155 | 9 --d 1000 --clients 100000 --repeats 10 156 | Total time: 1561852 ms. 157 | 9 --d 1000 --clients 100000 --repeats 10 158 | Total time: 1614572 ms. 159 | 9 --d 1000 --clients 100000 --repeats 10 160 | Total time: 1568930 ms. 161 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/src/main/scala/fi/helsinki/cs/probic/streaming/SparkDataAggregator.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.streaming 2 | 3 | import fi.helsinki.cs.nodes.util.Spark2Main 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.streaming.StreamingContext 6 | import org.apache.spark.streaming.Seconds 7 | import org.apache.spark.storage.StorageLevel 8 | import org.apache.spark.streaming.dstream.DStream 9 | import com.typesafe.scalalogging.LazyLogging 10 | import java.net.URL 11 | import java.net.Socket 12 | import java.io.DataOutputStream 13 | import org.apache.commons.codec.binary.Base64 14 | import java.io.DataInputStream 15 | import scala.io.Codec 16 | import org.apache.commons.io.FileUtils 17 | import java.io.File 18 | 19 | /** 20 | * Assumes data is already in correct order, e.g. data for first server first, then the data for 2nd server, etc. 
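 * (Layout illustration, inferred from the code below and not part of the original comment: without
 * --useDouble the input has d lines, each holding clients*(noise+1) share values grouped server-first,
 * e.g. for 2 servers and 3 clients: s0c0;s0c1;s0c2;s1c0;s1c1;s1c2. With --useDouble it has one line per
 * client, with the (noise+1) shares of each of the d statistics stored consecutively, the j-th share of
 * a group going to server j.)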
21 | * 22 | * Mandatory options: 23 | * --intype hdfs or --intype socket 24 | * --input hdfs://path/to/input/folder 25 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 26 | */ 27 | object SparkDataAggregator extends Spark2Main with LazyLogging { 28 | val longOptions = Seq("clients=", "input=", "output=", "noise=", "d=", "repeats=", "useDouble") 29 | 30 | val shortOptions = "" 31 | 32 | val sparkOutputCompression = true 33 | 34 | def sparkMain(spark: SparkSession) { 35 | // test-data-matrix 36 | val input = mandatoryOption("input") 37 | val output = mandatoryOption("output") 38 | val d = mandatoryOption("d").toInt 39 | val clients = mandatoryOption("clients").toInt 40 | val noise = mandatoryOption("noise").toInt 41 | val useDouble = optionSet("useDouble") 42 | val k = noise + 1 43 | 44 | var timeAcc = 0L 45 | val repeats = optional("repeats").getOrElse("1").toInt 46 | val out = s"$output-$d-$clients-$noise.csv.gz" 47 | import sys.process._ 48 | val result = "rm -rf temp" ! 49 | val start = System.currentTimeMillis 50 | val in = spark.sparkContext.textFile(s"$input-$d-$clients-$noise.csv.gz", d).repartition(d).zipWithIndex 51 | 52 | val resultStream = { 53 | if (useDouble) { 54 | // Matrix is N lines, each line has (noise+1)*D messages 55 | // So we need to gather each group of (noise+1) items on each line to produce full batches of messages. 56 | val dGroupedLines = in.flatMap { 57 | case (line, clientId) => 58 | val dGroups = line.split(";").map(_.toDouble).grouped(k).toSeq.zipWithIndex 59 | dGroups.map{case (kItems, dValue) => 60 | dValue -> kItems.zipWithIndex.map{case (item, serverId) => (serverId, clientId, item)} 61 | } 62 | } 63 | dGroupedLines.reduceByKey(_++_).flatMap{case (dValue, batch) => 64 | val byServer = batch.toSeq.groupBy(_._1).map{x => x._2.map(_._3).toArray -> x._1} 65 | val outputs = (0 until repeats).toSeq.par.map{ repeatId => 66 | val output = byServer.toSeq.par.map(sendReceive(useDouble)).reduce(_+_) 67 | val adjustedLineId = (dValue * 10 + repeatId) 68 | s"$adjustedLineId;$output" 69 | } 70 | outputs.seq 71 | } 72 | } else { 73 | // Matrix is D lines, each line has (noise+1)*N messages 74 | in.flatMap { 75 | case (line, lineNum) => 76 | // One line is a whole batch of messages, send to servers for sum. 
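            // Each line holds clients*(noise+1) share values in server order, so grouped(clients) below
            // yields one group of `clients` shares per decryption server; zipWithIndex attaches the
            // server index, sendReceive ships each group to port 8080 + serverId and returns that
            // server's partial sum, and reduce(_ + _) combines the partial sums for this dimension.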
77 | val outputs = (0 until repeats).toSeq.par.map { repeatId => 78 | val output = line.split(";").map { x => 79 | if (useDouble) 80 | x.toDouble 81 | else 82 | x.toLong 83 | }.grouped(clients).toSeq.zipWithIndex.par.map(sendReceive(useDouble)) 84 | .reduce(_ + _) 85 | val adjustedLineId = (lineNum * 10 + repeatId) 86 | s"$adjustedLineId;$output" 87 | } 88 | outputs.seq 89 | //lineNum -> output 90 | } 91 | } 92 | } 93 | 94 | //resultStream.map { case (k, value) => k + ";" + value } 95 | resultStream.saveAsTextFile(out) 96 | val end = System.currentTimeMillis() 97 | timeAcc += end - start 98 | 99 | println(s"Total time: $timeAcc ms.") 100 | } 101 | 102 | def sendReceive(useDouble: Boolean)(valuesForServer: (Array[Double], Int)) = { 103 | val (values, srvId) = valuesForServer 104 | val sock = new Socket("127.0.0.1", 8080 + srvId) 105 | val out = new DataOutputStream(sock.getOutputStream) 106 | if (useDouble) 107 | values.foreach(out.writeDouble) 108 | else 109 | values.map(_.toLong).foreach(out.writeLong) 110 | val in = new DataInputStream(sock.getInputStream) 111 | 112 | val returned = { 113 | if (useDouble) 114 | in.readDouble 115 | else 116 | in.readLong 117 | } 118 | sock.close 119 | returned 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /spark-streaming-aggregator/results-agg4.txt: -------------------------------------------------------------------------------- 1 | 9 --d 1000 --clients 10000 2 | Total time: 105394 ms. 3 | 9 --d 1000 --clients 10000 4 | Total time: 109267 ms. 5 | 9 --d 1000 --clients 10000 6 | Total time: 110730 ms. 7 | 9 --d 1000 --clients 10000 8 | Total time: 111144 ms. 9 | 9 --d 1000 --clients 10000 10 | Total time: 110378 ms. 11 | 9 --d 100 --clients 10000 12 | Total time: 12343 ms. 13 | 9 --d 100 --clients 10000 14 | Total time: 12948 ms. 15 | 9 --d 100 --clients 10000 16 | Total time: 12439 ms. 17 | 9 --d 100 --clients 10000 18 | Total time: 12913 ms. 19 | 9 --d 100 --clients 10000 20 | Total time: 13148 ms. 21 | 9 --d 10 --clients 10000 22 | Total time: 3063 ms. 23 | 9 --d 10 --clients 10000 24 | Total time: 2818 ms. 25 | 9 --d 10 --clients 10000 26 | Total time: 3067 ms. 27 | 9 --d 10 --clients 10000 28 | Total time: 3065 ms. 29 | 9 --d 10 --clients 10000 30 | Total time: 3096 ms. 31 | 9 --d 10 --clients 1000 32 | Total time: 2483 ms. 33 | 9 --d 10 --clients 1000 34 | Total time: 2046 ms. 35 | 9 --d 10 --clients 1000 36 | Total time: 2082 ms. 37 | 9 --d 10 --clients 1000 38 | Total time: 1978 ms. 39 | 9 --d 10 --clients 1000 40 | Total time: 2056 ms. 41 | 9 --d 100 --clients 1000 42 | Total time: 3046 ms. 43 | 9 --d 100 --clients 1000 44 | Total time: 3193 ms. 45 | 9 --d 100 --clients 1000 46 | Total time: 2860 ms. 47 | 9 --d 100 --clients 1000 48 | Total time: 2890 ms. 49 | 9 --d 100 --clients 1000 50 | Total time: 3116 ms. 51 | 9 --d 1000 --clients 1000 52 | Total time: 12203 ms. 53 | 9 --d 1000 --clients 1000 54 | Total time: 12163 ms. 55 | 9 --d 1000 --clients 1000 56 | Total time: 11263 ms. 57 | 9 --d 1000 --clients 1000 58 | Total time: 11680 ms. 59 | 9 --d 1000 --clients 1000 60 | Total time: 11522 ms. 61 | 9 --d 10000 --clients 1000 62 | Total time: 94400 ms. 63 | 9 --d 10000 --clients 1000 64 | Total time: 92984 ms. 65 | 9 --d 10000 --clients 1000 66 | Total time: 92690 ms. 67 | 9 --d 10000 --clients 1000 68 | Total time: 94449 ms. 69 | 9 --d 10000 --clients 1000 70 | Total time: 93135 ms. 71 | 9 --d 10 --clients 100 72 | Total time: 2363 ms. 73 | 9 --d 10 --clients 100 74 | Total time: 1803 ms. 
75 | 9 --d 10 --clients 100 76 | Total time: 1729 ms. 77 | 9 --d 10 --clients 100 78 | Total time: 1711 ms. 79 | 9 --d 10 --clients 100 80 | Total time: 1732 ms. 81 | 9 --d 100 --clients 100 82 | Total time: 2225 ms. 83 | 9 --d 100 --clients 100 84 | Total time: 2163 ms. 85 | 9 --d 100 --clients 100 86 | Total time: 2057 ms. 87 | 9 --d 100 --clients 100 88 | Total time: 2155 ms. 89 | 9 --d 100 --clients 100 90 | Total time: 2152 ms. 91 | 9 --d 1000 --clients 100 92 | Total time: 3880 ms. 93 | 9 --d 1000 --clients 100 94 | Total time: 3945 ms. 95 | 9 --d 1000 --clients 100 96 | Total time: 3922 ms. 97 | 9 --d 1000 --clients 100 98 | Total time: 3863 ms. 99 | 9 --d 1000 --clients 100 100 | Total time: 3826 ms. 101 | 9 --d 10000 --clients 100 102 | Total time: 19815 ms. 103 | 9 --d 10000 --clients 100 104 | Total time: 19339 ms. 105 | 9 --d 10000 --clients 100 106 | Total time: 19497 ms. 107 | 9 --d 10000 --clients 100 108 | Total time: 19323 ms. 109 | 9 --d 10000 --clients 100 110 | Total time: 21497 ms. 111 | 9 --d 10 --clients 100000 112 | Total time: 9172 ms. 113 | 9 --d 10 --clients 100000 114 | Total time: 8434 ms. 115 | 9 --d 10 --clients 100000 116 | Total time: 8647 ms. 117 | 9 --d 10 --clients 100000 118 | Total time: 8741 ms. 119 | 9 --d 10 --clients 100000 120 | Total time: 8807 ms. 121 | 9 --d 10 --clients 100000 122 | Total time: 8518 ms. 123 | 9 --d 100 --clients 100000 124 | Total time: 70875 ms. 125 | 9 --d 100 --clients 100000 126 | Total time: 70965 ms. 127 | 9 --d 100 --clients 100000 128 | Total time: 69856 ms. 129 | 9 --d 100 --clients 100000 130 | Total time: 71048 ms. 131 | 9 --d 100 --clients 100000 132 | Total time: 69147 ms. 133 | 9 --d 1000 --clients 100000 134 | Total time: 662427 ms. 135 | 9 --d 1000 --clients 100000 136 | Total time: 649283 ms. 137 | 9 --d 1000 --clients 100000 138 | Total time: 670965 ms. 139 | 9 --d 1000 --clients 100000 140 | Total time: 670152 ms. 141 | 9 --d 1000 --clients 100000 142 | Total time: 679691 ms. 143 | 9 --d 10000 --clients 10000 144 | Total time: 1042829 ms. 145 | 9 --d 10000 --clients 10000 146 | Total time: 1132176 ms. 147 | 9 --d 10000 --clients 10000 148 | Total time: 1138083 ms. 149 | 9 --d 10000 --clients 10000 150 | Total time: 1101404 ms. 151 | 9 --d 10000 --clients 10000 152 | Total time: 1102854 ms. 153 | 9 --d 1000 --clients 100000 --repeats 10 154 | 9 --d 1000 --clients 100000 --repeats 10 155 | 9 --d 1000 --clients 100000 --repeats 10 156 | 9 --d 1000 --clients 100000 --repeats 10 157 | 9 --d 1000 --clients 100000 --repeats 10 158 | 9 --d 1000 --clients 100000 --repeats 10 159 | Total time: 6684956 ms. 160 | 9 --d 1000 --clients 100000 --repeats 10 161 | Total time: 6649201 ms. 162 | 9 --d 1000 --clients 100000 --repeats 10 163 | Total time: 6662739 ms. 164 | 9 --d 1000 --clients 100000 --repeats 10 165 | Total time: 6690525 ms. 166 | 9 --d 1000 --clients 100000 --repeats 10 167 | Total time: 6720634 ms. 
168 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/tensor.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | clippingomega.py should be run before this code. 14 | 15 | Run: python3 tensor.py drugid seed 16 | where 17 | - drugid is an integer in [0,1,...,264] (specifies drug) 18 | - seed is an integer (specifies cv fold) 19 | This program does 1-fold cv for given drug for one test tensor. 20 | The cv split is defined by the given random seed. 21 | run_tensor_tests.py is a helper script for running several drugs and CVs as in the paper. 22 | ''' 23 | 24 | import sys 25 | import os 26 | 27 | import diffpri as dp 28 | import numpy as np 29 | import pickle 30 | import csv 31 | from collections import OrderedDict 32 | 33 | # Import data 34 | datapath = '' # add path for input and output data files 35 | f = open(datapath+'GeneExpressionReducted.csv','rt') 36 | reader = csv.reader(f,delimiter=',') 37 | x = np.array(list(reader)).astype(float) 38 | f.close() 39 | f = open(datapath+'DrugResponse.csv','rt') 40 | reader = csv.reader(f,delimiter=',') 41 | y = np.array(list(reader)).astype(float) 42 | f.close() 43 | # For more information on the data pre-processing, see the paper "Efficient differentially private learning improves drug sensitivity prediction" (arXiv:1606.02109). 44 | 45 | if len(sys.argv) > 1: 46 | drugid = int(sys.argv[1]) 47 | seed = int(sys.argv[2]) 48 | else: 49 | drugid = 226 50 | seed = 0 51 | 52 | # Number of samples to use 53 | pv_size = [840] # [840] in the paper 54 | pv_max = max(pv_size) 55 | 56 | #privacy budget as lists of same length 57 | eps = [1.0,3.0,5.0,7.5,10.0] 58 | delta_list = np.zeros(shape=len(eps))+10e-4 59 | 60 | #test set size 61 | n_test = 100 # 100 in the paper 62 | 63 | print('Running tensor test: drugid='+str(drugid)+', seed='+str(seed)) 64 | 65 | # Setup some parameters; see eps_data_test.py for more info 66 | pars = {'assumed_data_range' : [1,7.5], #[1,7.5] in the paper 67 | #'feedback' : 0, 68 | 'dim': 10, # 10 in the paper 69 | 'tmp_folder' : 'tmp/', 70 | 'add_noise' : 3, 71 | 'scaling_comparison' : 0, 72 | 'enforce_pos_def' : True, 73 | 'privacy_for_marg_var' : .3, # NOTE: this should match the value in clippingomega.py; .3 in the paper 74 | 'small_const_for_std' : .5, # .5 in the paper 75 | 'drugsens_data' : True, 76 | 'use_spark' : False, 77 | # Note: Spark version not tested with drugsens data 78 | 'spark_filename' : 'tmp/sparktest.csv', 79 | 'n_spark_messages' : 10, 80 | 'spark_noise_range' : 10e13, 81 | 'fixed_point_int' : 10e6 82 | } 83 | 84 | csvpath = '' 85 | # Fetch clipping threshold 86 | f = open(csvpath+'C-WX.csv','rt') 87 | reader = csv.reader(f,delimiter=',') 88 | WX = np.array(list(reader)).astype(float) 89 | f.close() 90 | f = open(csvpath+'C-WY.csv','rt') 91 | reader = csv.reader(f,delimiter=',') 92 | WY = np.array(list(reader)).astype(float) 93 | f.close() 94 | 95 | #check number of missing values 96 | inds = ~np.isnan(y[:,drugid]) 97 | n_data = np.sum(inds) 98 | print('drugid '+str(drugid)+', has ' +str(n_data) +' target values (out of 
'+str(y.shape[0])+')') 99 | y = y[inds,:] 100 | x = x[inds,:] 101 | 102 | res_all = OrderedDict() 103 | models = ['true', 'clipped','noisy','cl_noisy','noisy_ind','cl_noisy_ind','scaling','cl_scaling','cl_true_TA','cl_true_TA_DP'] 104 | for m in models: 105 | res_all[m] = np.zeros((len(pv_size),len(eps)),dtype=np.float64) 106 | 107 | for i in range(len(pv_size)): 108 | 109 | n_pv = pv_size[i] 110 | d = pars['dim'] 111 | for j in range(len(eps)): 112 | pars['epsilon'] = eps[j] 113 | pars['delta'] = delta_list[j] 114 | 115 | w_x = WX[i,j] 116 | w_y = WY[i,j] 117 | 118 | # check amount of data, use maximum amount if too few samples 119 | if n_data < n_pv+n_test: #n_npv+n_test: 120 | print('Not enough non-missing data! Continuing with maximum amount of private data: ' + str(n_data-n_test)) 121 | n_pv = n_data-n_test 122 | 123 | # Process data 124 | suff_stats_all,sigma_all,added_noise_dict,x_test,y_test,B_x,B_y,n_train = dp.processData(x,y,d,n_test,n_pv,pv_max,w_x,w_y,drugid,seed, pars) 125 | 126 | # calculate predictions 127 | for m in suff_stats_all: 128 | pred = dp.predictL(suff_stats_all[m][0],suff_stats_all[m][1],x_test) 129 | res_all[m][i,j] = dp.precision(pred,y_test) 130 | 131 | 132 | with open('res/cliptest-drugsens-'+str(drugid)+'-'+str(seed)+'.pickle', 'wb') as f: 133 | pickle.dump(res_all, f, pickle.HIGHEST_PROTOCOL) 134 | 135 | print('Done.') 136 | -------------------------------------------------------------------------------- /probic-decrypt-server/README.md: -------------------------------------------------------------------------------- 1 | # Introduction: Spark Streaming Aggregator Subprojects 2 | 3 | The Probic Spark Streaming Data Aggregator consists of two projects, spark-streaming-aggregator that routes 4 | data to the correct node for decryption, and probic-decrypt-server that represents one such decryption node. 5 | The node then decrypts the data and returns the result to Spark which aggregates results from all such nodes 6 | and produces the sum until input is exhausted (or forever, this can be adjusted). 7 | 8 | ## Requirements 9 | 10 | Both projects use sbt as the build tool. Usage: 11 | 12 | 1. get sbt from http://www.scala-sbt.org/download.html , then extract it, and put its bin folder into your path. 13 | 14 | 2. `cd spark-streaming-aggregator` or `cd probic-decrypt-server` 15 | 16 | 3. `sbt eclipse` creates an Eclpse project file that allows you to import the spark-streaming-aggregator directory as a scala-ide project: http://scala-ide.org/ 17 | 18 | 5. `sbt assembly` creates a so called fat jar that can be copied to any machine with Java and Spark installed and run as a Spark Streaming program. 19 | 20 | 6. Spark 2.1.0 or newer prebuilt for Hadoop 2.7 is needed by the tests. We assume Spark is downloaded from http://spark.apache.org/ , extracted, and placed at `$HOME/work/spark-2.1.0-bin-hadoop2.7` . If Spark is placed elsewhere, please adjust the file `spark-streaming-aggregator/run-spark.sh` accordingly. 21 | 22 | 23 | # Running the experiment in the paper 24 | 25 | After compiling, the sections below explains how to run the DCA experiment used to generate the results in Table 1 in the NIPS 2017 paper Differentially private Bayesian learning on distributed data. 26 | 27 | The process includes some preparation and data generation steps followed by starting the decryption servers, and finally the top-level Spark-based data aggregator. 28 | 29 | ## Create keys if not yet done 30 | 31 | This requires java's keytool to be installed. 32 | 33 | 1. 
Make a file called `secret.txt` containing a one-line password that will be used for the server keys. Currently the same password is used for all of them. 34 | 2. Run `./gen10.sh` to generate 10 public/private key pairs. 35 | 36 | ## Generate testing data file 37 | Run `run-scripts/gen-testdata-10.sh` to generate a test data file for 10 decryption servers, a 10:1 noise-to-real-data message ratio, N=100 to 100,000 and d=10 to 10,000. 38 | 39 | ## Start the aggregators 40 | Run `run-scripts/start-servers.sh n C`, where n is the number of servers (equal to the amount of noise; 10 above) and C is the number of clients. This will start the decryption servers. They will wait for the Spark process, which acts as the clients and schedules the data processing for them. 41 | 42 | ## Start the Spark aggregator 43 | `cd` to `../spark-streaming-aggregator` and run: 44 | 45 | ``` 46 | sbt assembly 47 | for d in 10; do for k in $( seq 1 5 ); do ./run-spark-aggregator4.sh results-agg4.txt --d $d --clients 100 --repeats 10; done ; done 48 | ``` 49 | Note: To run the whole experiment, you then need to kill the aggregator server processes and restart with --clients 1000, then 10,000, and so on, until the whole table of results has been generated. 50 | 51 | # Spark Non-Streaming Aggregator Final Results 52 | 53 | You can obtain these 5-run averages using `./getresults.sh results-agg4.txt` in the `spark-streaming-aggregator` folder. 54 | The output should look like this: 55 | ``` 56 | 9 --d 10000 --clients 10000 1103.47 57 | 9 --d 10000 --clients 1000 93.5316 58 | 9 --d 10000 --clients 100 19.8942 59 | 9 --d 1000 --clients 100000 666.504 60 | 9 --d 1000 --clients 10000 109.383 61 | 9 --d 1000 --clients 1000 11.7662 62 | 9 --d 1000 --clients 100 3.8872 63 | 9 --d 100 --clients 100000 70.3782 64 | 9 --d 100 --clients 10000 12.7582 65 | 9 --d 100 --clients 1000 3.021 66 | 9 --d 100 --clients 100 2.1504 67 | 9 --d 10 --clients 100000 8.71983 68 | 9 --d 10 --clients 10000 3.0218 69 | 9 --d 10 --clients 1000 2.129 70 | 9 --d 10 --clients 100 1.8676 71 | ``` 72 | 73 | # Running an experiment with a real-dataset-based model 74 | 75 | ## Generate sufficient statistics + noise 76 | ```sh 77 | cd ../dataset_tests/src 78 | python3 eps_data_test.py -s sparkfile.txt-8-3000-9.csv -c 10 79 | ``` 80 | And compress it for Spark: 81 | ``` 82 | gzip sparkfile.txt-8-3000-9.csv 83 | ``` 84 | 85 | ## Run decryption servers 86 | First, compile the project with `sbt assembly`. Then run: 87 | 88 | ```sh 89 | cd ../../probic-decrypt-server 90 | run-scripts/start-servers-eps.sh 91 | ``` 92 | 93 | ## Run Spark aggregator 94 | First, compile the project with `sbt assembly`. Then run: 95 | 96 | ``` 97 | cd ../spark-streaming-aggregator 98 | ./run-spark-aggregator-eps-data.sh 99 | ``` 100 | 101 | Results will be produced in the `spark-streaming-aggregator` folder in the file `results-eps-abalone-3000.txt`. 102 | Run as many repeats as you wish. 103 | You can get the average runtime by running: 104 | ```sh 105 | ./getresults.sh results-eps-abalone-3000.txt 106 | ``` 107 | The results may look something like: 108 | ```sh 109 | 9 --clients 3000 --d 8 --useDouble 6.3494 110 | ``` 111 | In this case the experiment took 6.35 seconds to complete on average.
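As background for the `-s` option above: the generated `sparkfile...csv` matrix stores every sufficient statistic as additive shares, one share per decryption server (noise shares plus one balancing share), so the shares of each value sum back to the original while no single server sees it. The snippet below is a minimal, standalone sketch of that splitting step in the spirit of `sufficient_stats.py`; the function name `split_value` and the example numbers are illustrative only and not part of the repository code.

```python
import numpy as np

def split_value(value, k, noise_range=10**13, rng=None):
    """Split one integer into k additive shares that sum back to `value`."""
    rng = rng or np.random.default_rng()
    noise = rng.integers(-noise_range, noise_range, size=k - 1, dtype=np.int64)
    shares = np.empty(k, dtype=np.int64)
    shares[:-1] = noise           # k-1 random "noise" shares
    shares[0] += value            # the real value is hidden inside the first share
    shares[-1] = -noise.sum()     # the last share cancels the noise
    return shares

shares = split_value(42, k=10)    # one share per decryption server
assert shares.sum() == 42         # only the sum of all shares is meaningful
```

Each decryption server sums the shares it receives from the clients, and the Spark aggregator adds up the per-server partial sums, which reproduces the exact unsplit total.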
112 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/test/TestDataServer.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.test 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import org.apache.commons.codec.binary.Base64 8 | import java.io.DataOutputStream 9 | import fi.helsinki.cs.probic.crypto.PkCrypto 10 | import java.util.zip.GZIPInputStream 11 | import java.io.FileInputStream 12 | import java.io.File 13 | import scala.collection.Seq 14 | 15 | /** 16 | * Mandatory options: 17 | * --intype hdfs or --intype socket 18 | * --input hdfs://path/to/input/folder 19 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 20 | */ 21 | object TestDataServer extends OptMain with LazyLogging { 22 | 23 | val DEFAULT_PORT = "8090" 24 | 25 | val longOptions = Seq("port=", "masters=", "certs=", "input=", "clients=", "zip", "batchLength=", "sleep=") 26 | 27 | val shortOptions = "" 28 | 29 | def optMain() { 30 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 31 | val certs = optional("certs").getOrElse("probic").split(",") 32 | val masters = optional("masters").getOrElse("localhost:8080").split(",") 33 | val clients = mandatoryOption("clients").toInt 34 | val batchLength = optional("batchLength").getOrElse("5000").toLong 35 | val sleep = optional("sleep").getOrElse("500").toLong 36 | 37 | val input = mandatoryOption("input") 38 | val zip = optionSet("zip") 39 | 40 | val server = new ServerSocket(port) 41 | val crypto = new PkCrypto("probic") // Private test key, not relevant in this program 42 | 43 | val servers = for (i <- 0 until certs.length) yield { 44 | masters(i) -> crypto.getEncrypter(certs(i)) 45 | } 46 | 47 | val handler = handleRequest(servers) _ 48 | //val handler = handleRequestLine(servers) _ 49 | 50 | val inputLines = { 51 | if (zip) { 52 | io.Source.fromInputStream(new GZIPInputStream(new FileInputStream(new File(input)))).getLines() 53 | } else 54 | io.Source.fromFile(input).getLines() 55 | } 56 | 57 | logger.info(s"Starting ${getClass.getName} at $port") 58 | handler(server.accept, inputLines, clients, batchLength, sleep) 59 | /*for (line <- inputLines) 60 | handler(server.accept, line, clients)*/ 61 | } 62 | 63 | def handleRequest(servers: Seq[(String, String => Array[Byte])])(sock: Socket, inputLines: Iterator[String], clients: Int, batchLength: Long, sleep: Long) { 64 | val t0 = System.currentTimeMillis() 65 | var batchId = 0 66 | val out = new DataOutputStream(sock.getOutputStream) 67 | for (line <- inputLines) { 68 | timedWriteout(out, servers, line, clients) 69 | var diff = System.currentTimeMillis() - t0 70 | logger.info(s"Total elapsed ${diff} ms.") 71 | diff -= batchId * batchLength 72 | val slp = batchLength - diff + sleep // 500 to make sure 73 | logger.info(s"Sleeping $slp ms to compensate") 74 | Thread.sleep(slp) 75 | batchId += 1 76 | } 77 | out.close 78 | } 79 | 80 | def handleRequestLine(servers: Seq[(String, String => Array[Byte])])(sock: Socket, line: String, clients: Int) { 81 | val out = new DataOutputStream(sock.getOutputStream) 82 | parWriteout(out, servers, line, clients) 83 | out.close 84 | } 85 | 86 | def timedWriteout(out: DataOutputStream, servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 87 | val t1 = 
System.currentTimeMillis() 88 | val items = line.split(";") 89 | val itemsPerClient = items.length / clients 90 | val clientItems = items.grouped(itemsPerClient).toSeq 91 | for (client <- 0 until clients) { 92 | val myItems = clientItems(client) 93 | for (item <- 0 until myItems.length) { 94 | val (master, encrypt) = servers(item % servers.length) 95 | val data = myItems(item) 96 | val cryptoText = encrypt(data + "") 97 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}" 98 | out.write((msg + "\n").getBytes) 99 | } 100 | } 101 | logger.info(s"Sent data of $clients clients with $itemsPerClient items per client in ${System.currentTimeMillis() - t1} ms.") 102 | } 103 | 104 | def parWriteout(out: DataOutputStream, servers: Seq[(String, String => Array[Byte])], line: String, clients: Int) = { 105 | val t1 = System.currentTimeMillis() 106 | val items = line.split(";") 107 | val itemsPerClient = items.length / clients 108 | val groupsOfServers = items.grouped(itemsPerClient).zipWithIndex.toSeq 109 | val encrypted = servers.zipWithIndex.par.flatMap { 110 | case ((master, encrypt), sindex) => 111 | groupsOfServers.flatMap { 112 | case (group, client) => 113 | //println(s"sindex $sindex grouplen ${group.length} client $client") 114 | val data = group(sindex) 115 | val cryptoText = encrypt(data + "") 116 | val msg = s"$master;$client;${new String(Base64.encodeBase64(cryptoText))}\n" 117 | msg.getBytes 118 | } 119 | } 120 | encrypted.seq.foreach { msg => 121 | out.write(msg) 122 | } 123 | logger.info(s"Sent data of $clients clients with $itemsPerClient items per client in ${System.currentTimeMillis() - t1} ms.") 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/data/GenerateTestDataMatrix.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.data 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import scala.concurrent.ExecutionContext 8 | import org.apache.commons.codec.binary.Base64 9 | import java.io.DataOutputStream 10 | import fi.helsinki.cs.probic.crypto.PkCrypto 11 | import java.util.zip.GZIPOutputStream 12 | import scala.util.Random 13 | 14 | /** 15 | * Mandatory options: 16 | * --intype hdfs or --intype socket 17 | * --input hdfs://path/to/input/folder 18 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 19 | */ 20 | object GenerateTestDataMatrix extends OptMain with LazyLogging { 21 | 22 | val longOptions = Seq("dimension=", "clients=", "noise=", "output=", "zip") 23 | 24 | val shortOptions = "" 25 | 26 | def optMain() { 27 | val clients = mandatoryOption("clients").toInt 28 | val d = mandatoryOption("dimension").toInt 29 | val noise = mandatoryOption("noise").toInt 30 | val zip = optionSet("zip") 31 | 32 | val output = mandatoryOption("output") 33 | 34 | logger.info(s"Generating test data matrix of size d=$d x N=$clients x k=${noise + 1}") 35 | //generateAllData(output, d, clients, noise) 36 | generateAllDataLive(output, d, clients, noise, zip) 37 | } 38 | 39 | def generateAllData(output: String, d: Int, clients: Int, noise: Int) = { 40 | val (realData, confusedData) = generateData(d, clients, noise) 41 | toFile(output, confusedData.seq.map(_.mkString(";"))) 42 | // Save also real data. 
43 | toFile(s"$output-realdata.csv", realData.seq.map(_.mkString(";"))) 44 | // Save sums for checking. 45 | toFile(s"$output-sums.csv", confusedData.seq.map(_.sum.toString)) 46 | // Save sums for checking. 47 | toFile(s"$output-realdata-sums.csv", realData.seq.map(_.sum.toString)) 48 | } 49 | 50 | def generateAllDataLive(output: String, d: Int, clients: Int, noise: Int, zip: Boolean) { 51 | // test data writer 52 | val writer = fileWriter(output, zip) 53 | // Realdata Sums writer 54 | val realWriter = fileWriter(s"$output-realdata-sums.csv", zip) 55 | val allData = generateDataLive(d, clients, noise) 56 | allData.map { line => 57 | val (realData, confusedData) = line.unzip 58 | writer.write(confusedData.flatten.mkString(";") + "\n") 59 | realWriter.write(realData.sum + "\n") 60 | }.force 61 | writer.close 62 | realWriter.close 63 | } 64 | 65 | def generateData(d: Int, clients: Int, noise: Int) = { 66 | val lines = 0 until d 67 | val outputs = lines.par.map { l => 68 | val allData = (0 until clients).map { client => 69 | clientData(client, noise) 70 | } 71 | val (realData, confusedData) = allData.unzip 72 | val wholeLine = confusedData.flatten 73 | 74 | /*val rs = realData.sum 75 | val ws = wholeLine.sum 76 | assert(doubleEquals(rs, ws), s"$rs did not equal $ws. The sum of real data of the line should equal the sum of the confused data.")*/ 77 | realData -> wholeLine 78 | } 79 | outputs.unzip 80 | } 81 | 82 | def generateDataLive(d: Int, clients: Int, noise: Int) = { 83 | val lines = 0 until d 84 | val outputs = lines.view.map { l => 85 | val allData = (0 until clients).par.map { client => 86 | clientData(client, noise) 87 | } 88 | allData 89 | } 90 | outputs 91 | } 92 | 93 | def clientData(clientId: Int, noise: Int) = { 94 | val rnd = new Random() 95 | def rlong() = { 96 | val lon = rnd.nextInt().toLong << 32 97 | lon + rnd.nextInt() 98 | } 99 | val plainText = rlong 100 | val noises = Seq.fill(noise)(rlong) 101 | 102 | val confusedRealData = plainText + noises.sum 103 | 104 | val clientData = (0 until noise + 1).view.map { j => 105 | if (j == 0) { // "real" data 106 | confusedRealData 107 | } else 108 | noises(j - 1) 109 | } 110 | 111 | //assert(doubleEquals(plainText, clientData.sum), s"$plainText did not equal ${clientData.sum} for client $client. Real data should equal the sum of the confused data for each data item.") 112 | plainText -> clientData 113 | } 114 | 115 | /** 116 | * Store `lines` as a series of lines in a local file called `fileName`. 117 | */ 118 | def toFile(fileName: String, lines: Iterable[String]) { 119 | toFile(fileName, lines, false) 120 | } 121 | 122 | /** 123 | * Store `lines` as a series of lines in a local file called `fileName`. 124 | */ 125 | def toFile(fileNameBase: String, lines: Iterable[String], zip: Boolean = false) { 126 | val pw = fileWriter(fileNameBase, zip) 127 | 128 | lines.foreach(line => { pw.write(line + "\n") }) 129 | pw.close() 130 | } 131 | 132 | /** 133 | * Store `lines` as a series of lines in a local file called `fileName`. 
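 * (More precisely: this method only opens and returns the PrintWriter for `fileNameBase`; when `zip`
 * is set, the file name gets a `.gz` suffix and the writer is wrapped in a GZIPOutputStream.)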
134 | */ 135 | def fileWriter(fileNameBase: String, zip: Boolean) = { 136 | import java.io._ 137 | val fileName = { 138 | if (zip) { 139 | s"${fileNameBase}.gz" 140 | } else 141 | s"${fileNameBase}" 142 | } 143 | 144 | val f = new File(fileName) 145 | val pw = { 146 | if (zip) 147 | new PrintWriter(new GZIPOutputStream(new FileOutputStream(f, false))) 148 | else 149 | new PrintWriter(f) 150 | } 151 | pw 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /probic-decrypt-server/src/main/scala/fi/helsinki/cs/probic/server/Server.scala: -------------------------------------------------------------------------------- 1 | package fi.helsinki.cs.probic.server 2 | 3 | import fi.helsinki.cs.nodes.util.OptMain 4 | import com.typesafe.scalalogging.LazyLogging 5 | import java.net.ServerSocket 6 | import java.net.Socket 7 | import scala.concurrent.ExecutionContext 8 | import scala.concurrent.Future 9 | import java.io.ByteArrayInputStream 10 | import java.io.InputStreamReader 11 | import java.io.ByteArrayOutputStream 12 | import java.security.KeyStore 13 | import javax.crypto.Cipher 14 | import java.security.spec.X509EncodedKeySpec 15 | import java.security.KeyFactory 16 | import java.security.PublicKey 17 | import org.apache.commons.codec.binary.Base64 18 | import java.io.DataOutputStream 19 | import java.io.DataInputStream 20 | import fi.helsinki.cs.probic.crypto.PkCrypto 21 | import scala.concurrent.forkjoin.ForkJoinPool 22 | import sun.misc.VM 23 | import scala.collection.parallel.ForkJoinTaskSupport 24 | 25 | /** 26 | * Mandatory options: 27 | * --intype hdfs or --intype socket 28 | * --input hdfs://path/to/input/folder 29 | * --output hdfs://path/to/output/foldersrc/main/scala/fi/helsinki/cs/probic/ 30 | */ 31 | object Server extends OptMain with LazyLogging { 32 | 33 | val DEFAULT_PORT = "8080" 34 | 35 | val longOptions = Seq("port=", "cert=", "messages=", "useDouble") 36 | 37 | val shortOptions = "" 38 | 39 | def optMain() { 40 | 41 | val port = optional("port").getOrElse(DEFAULT_PORT).toInt 42 | val cert = optional("cert").getOrElse("probic") 43 | // How many messages to receive before decrypting and returning a result. 44 | val messages = mandatoryOption("messages").toInt 45 | 46 | val useDouble = optionSet("useDouble") 47 | 48 | val server = new ServerSocket(port) 49 | //val pk = new PkCrypto(cert) 50 | 51 | val handler = handleRequestStreaming(messages, useDouble) _ 52 | logger.info(s"Starting Probic Data Aggregation Server at $port") 53 | //logger.info("Available processors: " + Runtime.getRuntime.availableProcessors() + ", using only 5") 54 | var running = true 55 | while (running) { 56 | handler(server.accept) 57 | } 58 | } 59 | 60 | var decryptedMessages = 0 61 | var decryptedSum = 0.0 62 | 63 | def handleRequest(messages: Int)(sock: Socket) { 64 | implicit val ec = ExecutionContext.global 65 | val answer = Future { 66 | val src = new DataInputStream(sock.getInputStream) 67 | // Read and decrypt a total of `messages` messages. 
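        // Note: the RSA decryption below is commented out in this version, so the handler simply
        // reads `messages` plain long values from the socket, sums them, and writes the sum back.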
68 | 69 | // Sequentially read messages: 70 | val msgSeq = (0 until messages).map { msgId => 71 | /*val len = src.readInt() 72 | val cryptoText = new Array[Byte](len) 73 | src.read(cryptoText) 74 | //logger.info(s"Received msg id $msgId") 75 | cryptoText*/ 76 | src.readLong 77 | }.toSeq 78 | 79 | // Decrypt them in parallel using 5 threads 80 | /*val outValue = msgSeq.grouped(messages / 10).toSeq.par.flatMap { group => 81 | /*val rsa = pk.getDecrypt() 82 | group.map { 83 | cryptoText => 84 | // This is thread safe 85 | val msg = new String(rsa.doFinal(cryptoText)) 86 | msg.toDouble 87 | }*/ 88 | }.reduce(_ + _)*/ 89 | 90 | val outValue = msgSeq.par.reduce(_ + _) 91 | 92 | logger.info(s"Decrypted $messages messages, returning $outValue") 93 | val out = new DataOutputStream(sock.getOutputStream) 94 | out.writeLong(outValue) 95 | sock.close() 96 | outValue 97 | } 98 | } 99 | 100 | def handleRequestStreaming(messages: Int, useDouble: Boolean)(sock: Socket) { 101 | implicit val ec = ExecutionContext.global 102 | val answer = Future { 103 | val src = new DataInputStream(sock.getInputStream) 104 | // Read and decrypt a total of `messages` messages. 105 | val outValue = { 106 | if (useDouble) { 107 | var result = 0.0 108 | // Sequentially read messages: 109 | val msgSeq = (0 until messages).foreach { msgId => 110 | result += src.readDouble 111 | } 112 | result 113 | } else { 114 | var result = 0L 115 | // Sequentially read messages: 116 | val msgSeq = (0 until messages).foreach { msgId => 117 | result += src.readLong 118 | } 119 | result 120 | } 121 | } 122 | 123 | logger.info(s"Decrypted $messages messages, returning $outValue") 124 | val out = new DataOutputStream(sock.getOutputStream) 125 | if (useDouble) 126 | out.writeDouble(outValue) 127 | else // Possible loss of precision. 128 | out.writeLong(outValue.toLong) 129 | sock.close() 130 | outValue 131 | } 132 | } 133 | 134 | def handleRequestStreamingDouble(messages: Int)(sock: Socket) = { 135 | implicit val ec = ExecutionContext.global 136 | Future { 137 | val src = new DataInputStream(sock.getInputStream) 138 | // Read and decrypt a total of `messages` messages. 139 | var result = 0.0 140 | // Sequentially read messages: 141 | val msgSeq = (0 until messages).foreach { msgId => 142 | result += src.readDouble 143 | } 144 | val outValue = result 145 | 146 | logger.info(s"Decrypted $messages messages, returning $outValue") 147 | val out = new DataOutputStream(sock.getOutputStream) 148 | out.writeDouble(outValue) 149 | sock.close() 150 | outValue 151 | } 152 | } 153 | } 154 | -------------------------------------------------------------------------------- /dataset_tests/src/combine_pred_errors.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data 6 | 7 | Script for combining prediction error results from individual pickled files produced by eps_data_test.py. 8 | 9 | eps_data_test.py should be run before this. 
10 | 11 | Run: python3 combine_pred_errors.py 12 | ''' 13 | 14 | import sys 15 | 16 | import numpy as np 17 | from matplotlib import pyplot as plt 18 | 19 | ################################################################################ 20 | # SETUP 21 | ################################################################################ 22 | 23 | # Plot settings 24 | pars_filename = 'test_results/NIPS_camera_ready/pars_test_red_wine_1.pickle' 25 | #pars_filename = 'test_results/NIPS_camera_ready/pars_test_white_wine_1.pickle' 26 | #pars_filename = 'test_results/NIPS_camera_ready/pars_test_abalone_1.pickle' 27 | # Note: set this to match the settings in eps_data_test.py 28 | 29 | # for reproducing the figures in the paper 30 | figure_bounds = 'red_wine' 31 | #figure_bounds = 'white_wine' 32 | #figure_bounds = 'abalone' 33 | 34 | #save figure 35 | save_to_file = False 36 | fig_name = 'plots/UCI_redwine_NIPS_final.pdf' 37 | #fig_name = 'plots/UCI_whitewine_NIPS_final.pdf' 38 | #fig_name = 'plots/UCI_abalone_NIPS_final.pdf' 39 | 40 | #PLOTTING CONFIGURATIONS & COLORS 41 | no_plotting = ['cl_scaling', 'cl_noisy','cl_true_TA'] 42 | 43 | nimet_dict = {'true':'NP', 'clipped':'proj NP','noisy':'TA', 'cl_noisy':'proj TA', 'noisy_ind':'DDP', 'cl_noisy_ind':'proj DDP', 'scaling':'input\nperturbed','cl_scaling':'proj scaling','cl_true_TA':'proj TA (non DP)' ,'cl_true_TA_DP': 'proj TA'} 44 | 45 | # colors 46 | col_dict = {'true':'blue', 'clipped':'gray','noisy':'lightseagreen', 'cl_noisy':'green', 'noisy_ind':'red', 'cl_noisy_ind':'magenta', 'scaling':'orange','cl_scaling':'orange', 'cl_true_TA': 'black','cl_true_TA_DP':'green'} 47 | 48 | ################################################################################ 49 | # END OF SETUP 50 | ################################################################################ 51 | metodit = ['true', 'clipped', 'noisy', 'cl_noisy', 'noisy_ind', 'cl_noisy_ind', 'scaling','cl_scaling', 'cl_true_TA','cl_true_TA_DP'] 52 | 53 | #load parameters used 54 | pars = np.load(pars_filename) 55 | print('Parameters read from ' + str(pars_filename)) 56 | 57 | #lists with one element for each clipping rate 58 | abs_error_list = list() 59 | sq_error_list = list() 60 | 61 | #create names list 62 | nimet = [] 63 | for m in metodit: 64 | if m not in no_plotting: 65 | nimet.append(nimet_dict[m]) 66 | 67 | for k_test in pars['all_file_ids']: 68 | abs_err = {} 69 | sq_err = {} 70 | for m in metodit: 71 | abs_err[m] = np.zeros((len(pars['n_clients']), pars['n_repeats'])) 72 | sq_err[m] = np.zeros((len(pars['n_clients']), pars['n_repeats'])) 73 | 74 | filename = pars['output_folder'] + 'pred_errors_test' + str(k_test) + '.pickle' 75 | apu = np.load(filename) 76 | i = 0 77 | for k_clients in range(len(pars['n_clients'])): 78 | for k_repeat in range(pars['n_repeats']): 79 | for m in metodit: 80 | #MAE 81 | abs_err[m][k_clients,k_repeat] = apu[i][m][0] 82 | #MSE 83 | sq_err[m][k_clients,k_repeat] = apu[i][m][1] 84 | i = i+1 85 | 86 | abs_error_list.append(abs_err) 87 | sq_error_list.append(sq_err) 88 | 89 | ############################################################################### 90 | #simple plotting function 91 | def plotter(x,y,metodit, bounds, x_label, y_label, subtitle, x_ticks, add_noise_mean, y_err_lower=None, y_err_upper=None, y_all_clip_means=None,y_true_clip_means=None): 92 | round = -3 93 | k_col = 0 94 | for m in metodit: 95 | k_col = k_col + 1 96 | if m not in no_plotting: #skip non-used 97 | #plot non-private with dashed line 98 | if m in ['true','clipped']: 99 | 
linetype = '--' 100 | else: 101 | linetype = '-' 102 | 103 | if y_err_lower == None: 104 | #line, = plt.plot(x,y[m], '*-', linewidth=2,label=m) 105 | plt.plot(x,y[m], '*-', linewidth=2.5,label=m,linestyle=linetype) 106 | else: 107 | #with errorbars 108 | plt.errorbar(x+round*.05, y[m],linewidth=2.2, yerr=[y_err_lower[m],y_err_upper[m] ], linestyle=linetype, color=col_dict[m],label=m ) 109 | round = round + 1 110 | 111 | #add clipping thresholds if applicable 112 | if y_all_clip_means != None: 113 | plt.plot(x,y_all_clip_means,label=m ) 114 | if y_true_clip_means != None: 115 | plt.plot(x,y_true_clip_means,label=m ) 116 | 117 | #add line for unclipped noise mean 118 | if add_noise_mean: 119 | plt.plot(x,np.repeat(np.mean(y['noisy']),len(x)), '--', linewidth=1,label='noise mean' ) 120 | 121 | #define custom bounds for result figures 122 | if figure_bounds == 'abalone': 123 | bounds[2:] = [.55,2.5] 124 | elif figure_bounds == 'red_wine': 125 | bounds[2:] = [.59,4.0] 126 | elif figure_bounds == 'white_wine': 127 | bounds[2:] = [.63,2.5] 128 | 129 | plt.axis(bounds) 130 | plt.tight_layout(pad=7) 131 | plt.legend(nimet,bbox_to_anchor=(0., 1.02, 1., .102), loc=3, ncol=3, mode="expand", borderaxespad=0.) 132 | plt.xlabel(x_label) 133 | plt.ylabel(y_label) 134 | plt.suptitle(subtitle, y=.12, fontsize=13) 135 | plt.xticks(x_ticks[0],x_ticks[1]) 136 | if save_to_file: 137 | plt.savefig(fig_name, bbox_inches='tight') 138 | else: 139 | plt.show() 140 | 141 | 142 | ############################################################################### 143 | for sample_size in range(len(pars['n_clients'])): 144 | x = np.linspace(1,len(sq_error_list),num=len(sq_error_list)) 145 | y_mae = {} 146 | y_mse = {} 147 | y_mae_lower = {} 148 | y_mae_upper = {} 149 | y_mae_err = {} 150 | for m in metodit: 151 | y_mae[m] = np.zeros(len(sq_error_list)) 152 | y_mse[m] = np.zeros(len(sq_error_list)) 153 | y_mae_err[m] = np.zeros(len(sq_error_list)) 154 | y_mae_lower[m] = np.zeros(len(sq_error_list)) 155 | y_mae_upper[m] = np.zeros(len(sq_error_list)) 156 | for k_priv in range( len(pars['epsilon_tot']) ): 157 | y_mae[m][k_priv] = np.median(abs_error_list[k_priv][m][sample_size, :] ) 158 | #calculate .25 and .75 quantiles for errorbars 159 | apu = np.sort(abs_error_list[k_priv][m][sample_size, :]) 160 | y_mae_lower[m][k_priv] = np.absolute( apu[ int(np.floor(.25*len(apu))) ] - y_mae[m][k_priv] ) 161 | y_mae_upper[m][k_priv] = np.absolute( apu[ int(np.ceil(.75*len(apu))) ] - y_mae[m][k_priv] ) 162 | 163 | y_mse[m][k_priv] = np.mean(sq_error_list[k_priv][m][sample_size, :] ) 164 | y_mae_err = None#obsolete 165 | 166 | for sample_size in range(len(pars['n_clients'])): 167 | if len(x) < 10: 168 | x_ticks = [x, np.round(pars['epsilon_tot'],2)] 169 | else: 170 | x_ticks = [x[0::3], np.round(pars['epsilon_tot'][0::3], 2)] 171 | #mae 172 | if not pars['do_optimal_clip']: 173 | plotter(x, y_mae, metodit, [0,len(x)+1,0,1], 'epsilon', 'MAE', 'clipping: '+str(pars['all_clips']) + ', sample size=' + str(pars['n_clients'][sample_size]) + ', delta=' + str(pars['delta_tot'][0]), x_ticks, False, y_mae_lower,y_mae_upper) 174 | else: 175 | plotter(x, y_mae, metodit, [0,len(x)+1,0,1], 'epsilon', 'MAE', 'd=' + str(pars['dim']) + ', sample size=' + str(pars['n_clients'][sample_size]) + ', repeats=' + str(pars['n_repeats']) + ', $\delta=$' + str(pars['delta_tot'][0]), x_ticks, False, y_mae_lower,y_mae_upper) 176 | 177 | 178 | -------------------------------------------------------------------------------- /dataset_tests/src/suff_stats_master.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data, GDSC/drug sensitivity data 6 | 7 | Script for calculating std:s for noise for various models. 8 | ''' 9 | 10 | import numpy as np 11 | import sys 12 | from collections import OrderedDict 13 | 14 | import sufficient_stats 15 | 16 | def get_suff_stats(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true=None, clip_threshold_true=None, data_clipped_true_DP=None, clip_threshold_true_DP=None): 17 | 18 | dim = pars['dim'] 19 | ############################################################ 20 | added_noise_dict = OrderedDict() 21 | 22 | #use fixed sensitivities for UCI data; allow drugsens baseline methods to cheat a bit by using a bound calculated from the data 23 | if not pars['drugsens_data']: 24 | data_sensitivity = np.zeros(data.shape[1]-1) + pars['scale_to_range'] 25 | target_sensitivity = pars['scale_to_range'] 26 | else: 27 | data_sensitivity = np.zeros(dim) + pars['assumed_data_range'][0] 28 | target_sensitivity = np.ceil(np.amax(np.abs(data[:,-1]))) 29 | 30 | #calculate clip products & range products between dimensions, includes factor of 2 for x_i*x_j sensitivities when i != j 31 | clip_prods = np.zeros((dim*(dim+1)//2) + dim) 32 | range_prods = np.zeros(len(clip_prods)) 33 | if data_clipped_true is not None: 34 | clip_prods_true = np.zeros(len(clip_prods)) 35 | if data_clipped_true_DP is not None: 36 | clip_prods_true_DP = np.zeros(len(clip_prods)) 37 | ind = 0 38 | #for suff_stats X'X 39 | for i in range(dim): 40 | for ii in range(i+1): 41 | clip_prods[ind] = clip_threshold[i] * clip_threshold[ii] 42 | range_prods[ind] = data_sensitivity[i] * data_sensitivity[ii] 43 | if data_clipped_true is not None: 44 | clip_prods_true[ind] = clip_threshold_true[i] * clip_threshold_true[ii] 45 | if data_clipped_true_DP is not None: 46 | clip_prods_true_DP[ind] = clip_threshold_true_DP[i] * clip_threshold_true_DP[ii] 47 | #include factor of 2 from sensitivity for non-diagonal terms 48 | if i != ii: 49 | clip_prods[ind] *= 2 50 | range_prods[ind] *= 2 51 | if data_clipped_true is not None: 52 | clip_prods_true[ind] *= 2 53 | if data_clipped_true_DP is not None: 54 | clip_prods_true_DP[ind] *= 2 55 | ind = ind + 1 56 | #for suff stats X'y 57 | for i in range(dim): 58 | clip_prods[ind] = 2*clip_threshold[i] * clip_threshold[-1] 59 | range_prods[ind] = 2*data_sensitivity[i] * target_sensitivity 60 | if data_clipped_true is not None: 61 | clip_prods_true[ind] = 2*clip_threshold_true[i] * clip_threshold_true[-1] 62 | if data_clipped_true_DP is not None: 63 | clip_prods_true_DP[ind] = 2*clip_threshold_true_DP[i] * clip_threshold_true_DP[-1] 64 | ind = ind + 1 65 | 66 | #total l2-sensitivities for noise std calculations 67 | clip_sensitivity = np.sqrt( np.sum(clip_prods[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods[(dim*(dim+1)//2):]**2) ) 68 | 69 | range_sensitivity = np.sqrt( np.sum(range_prods[0:(dim*(dim+1)//2)]**2) + np.sum(range_prods[(dim*(dim+1)//2):]**2) ) 70 | 71 | if data_clipped_true is not None: 72 | clip_sensitivity_true = np.sqrt( np.sum(clip_prods_true[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods_true[(dim*(dim+1)//2):]**2) ) 73 | 74 | if data_clipped_true_DP is not None: 75 | clip_sensitivity_true_DP = np.sqrt( np.sum(clip_prods_true_DP[0:(dim*(dim+1)//2)]**2) + np.sum(clip_prods_true_DP[(dim*(dim+1)//2):]**2) ) 76 | 77 | sigma_all = OrderedDict() 78 | suff_stats_all = 
OrderedDict() 79 | 80 | eps=(1-pars['privacy_for_marg_var'])*pars['epsilon'] 81 | delta=(1-pars['privacy_for_marg_var'])*pars['delta'] 82 | eps_no_clip = pars['epsilon'] 83 | delta_no_clip = pars['delta'] 84 | 85 | if pars['add_noise'] in [1,3]: 86 | #trusted aggregator noise 87 | ############################################################ 88 | 89 | #clipped 90 | sigma_all['cl_noisy'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 91 | #clipped true TA (non DP, i.e., doesn't spend privacy on clipping bounds) 92 | if data_clipped_true is not None: 93 | sigma_all['cl_true_TA'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta_no_clip) ) * (clip_sensitivity_true/eps_no_clip) 94 | #clipped true TA (DP) 95 | if data_clipped_true_DP is not None: 96 | sigma_all['cl_true_TA_DP'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta) ) * (clip_sensitivity_true_DP/eps) 97 | #no clipping 98 | sigma_all['noisy'] = np.sqrt( 1/n_train * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 99 | 100 | #calculate sufficient stats for clipped & unclipped data 101 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 102 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=True, sigma=sigma_all['noisy'], use_spark=False) 103 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=True, sigma=sigma_all['cl_noisy'], use_spark=False) 104 | 105 | #cl true TA (not DP) 106 | if data_clipped_true is not None: 107 | ss_cl_true1, ss_cl_true2, noise_cl_true = sufficient_stats.ss_individually(data_clipped_true, add_noise=True, sigma=sigma_all['cl_true_TA'], use_spark=False) 108 | #cl true TA (DP) 109 | if data_clipped_true_DP is not None: 110 | ss_cl_true_DP1, ss_cl_true_DP2, noise_cl_true_DP = sufficient_stats.ss_individually(data_clipped_true_DP, add_noise=True, sigma=sigma_all['cl_true_TA_DP'], use_spark=False) 111 | 112 | suff_stats_all['noisy'] = [ss1, ss2] 113 | suff_stats_all['cl_noisy'] = [ss_cl1, ss_cl2] 114 | added_noise_dict['noisy'] = noise 115 | added_noise_dict['cl_noisy'] = noise_cl 116 | #cl true TA (not DP) 117 | if data_clipped_true is not None: 118 | suff_stats_all['cl_true_TA'] = [ss_cl_true1, ss_cl_true2] 119 | added_noise_dict['cl_true_TA'] = noise_cl_true 120 | #cl true TA (DP) 121 | if data_clipped_true_DP is not None: 122 | suff_stats_all['cl_true_TA_DP'] = [ss_cl_true_DP1, ss_cl_true_DP2] 123 | added_noise_dict['cl_true_TA_DP'] = noise_cl_true_DP 124 | 125 | #with extra scaling factor for percentage honest clients 126 | ############################################################ 127 | #calculate scaling factor 128 | if pars['scaling_comparison'] == 0: 129 | scaling = 1 130 | else: 131 | scaling = 1/(np.ceil( pars['scaling_comparison']*n_train) ) 132 | #add noise in pieces separately by each client 133 | #noise std for X'X 134 | sigma_all['cl_scaling'] = np.sqrt( scaling * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 135 | #noise std for X'y 136 | sigma_all['scaling'] = np.sqrt( scaling * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 137 | 138 | #calculate sufficient stats for clipped & unclipped data with extra scaling 139 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 140 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=True, sigma=sigma_all['scaling'], use_spark=False) 141 | 142 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=True, sigma=sigma_all['cl_scaling'], use_spark=False) 143 | 144 
| 145 | suff_stats_all['scaling'] = [ss1, ss2] 146 | suff_stats_all['cl_scaling'] = [ss_cl1, ss_cl2] 147 | added_noise_dict['scaling'] = noise 148 | added_noise_dict['cl_scaling'] = noise_cl 149 | 150 | #individual noise i.e. n/(n-1) factor with clipped and unclipped data 151 | ############################################################ 152 | if pars['add_noise'] in [2,3]: 153 | #clipped data 154 | sigma_all['cl_noisy_ind'] = np.sqrt( 1/(n_train-1) * 2*np.log(1.25/delta) ) * (clip_sensitivity/eps) 155 | 156 | #unclipped data 157 | sigma_all['noisy_ind'] = np.sqrt( 1/(n_train-1) * 2*np.log(1.25/delta_no_clip) ) * (range_sensitivity/eps_no_clip) 158 | 159 | #calculate sufficient stats for clipped & unclipped data 160 | # Note: unclipped used for Spark testing 161 | ss1, ss2, ss_cl1, ss_cl2, noise, noise_cl = None, None, None, None, None, None 162 | ss1, ss2, noise = sufficient_stats.ss_individually(data, add_noise=pars['add_noise'] > 0, sigma=sigma_all['noisy_ind'], use_spark=pars['use_spark'], filename=pars['spark_filename'], n_spark_messages=pars['n_spark_messages'], spark_noise_range=pars['spark_noise_range'], fixed_point_int=pars['fixed_point_int']) 163 | 164 | ss_cl1, ss_cl2, noise_cl = sufficient_stats.ss_individually(data_clipped, add_noise=pars['add_noise'] > 0, sigma=sigma_all['cl_noisy_ind'], use_spark=False) 165 | 166 | suff_stats_all['noisy_ind'] = [ss1, ss2] 167 | suff_stats_all['cl_noisy_ind'] = [ss_cl1, ss_cl2] 168 | added_noise_dict['noisy_ind'] = noise 169 | added_noise_dict['cl_noisy_ind'] = noise_cl 170 | 171 | ############################################################ 172 | #calculate noiseless sufficient statistics for comparison 173 | #X'X 174 | suff_stats_all['true'] = list() 175 | suff_stats_all['true'].append(np.dot(np.transpose(data[:,0:-1]), data[:,0:-1])) 176 | #X'y 177 | suff_stats_all['true'].append(np.dot(np.transpose(data[:,0:-1]), data[:,-1])) 178 | 179 | #suff.stats for the noiseless clipped data 180 | suff_stats_all['clipped'] = list() 181 | #X'X 182 | suff_stats_all['clipped'].append(np.dot(np.transpose(data_clipped[:,0:-1]), data_clipped[:,0:-1])) 183 | #X'y 184 | suff_stats_all['clipped'].append(np.dot(np.transpose(data_clipped[:,0:-1]), data_clipped[:,-1])) 185 | 186 | return suff_stats_all, sigma_all, added_noise_dict 187 | -------------------------------------------------------------------------------- /dataset_tests/src/drugsens_code/diffpri.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | Modified from the original code: 6 | Differentially private Bayesian linear regression 7 | Arttu Nieminen 2016-2017 8 | University of Helsinki Department of Computer Science 9 | Helsinki Institute of Information Technology HIIT 10 | 11 | GDSC/drug sensitivity data 12 | 13 | Various functions and data processing steps used in the tests. 
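Main entry points: omega() searches for clipping-threshold multipliers on synthetic data,
processData() builds the train/test split and the sufficient-statistics variants used by tensor.py,
and predictL() computes predictions from the (possibly perturbed) sufficient statistics.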
14 | ''' 15 | 16 | import sys, os, copy 17 | import numpy as np 18 | from scipy.stats import spearmanr 19 | import warnings 20 | 21 | # NOTE on normalisation in distributed setting: 22 | # assume centered data (so remove column means) 23 | # row-wise L2-normalization is ok, since doesn't depend on other rows 24 | 25 | # Centers and L2-normalises x-data (removes columnwise mean, normalises rows to norm 1) 26 | def xnormalise(x): 27 | n = x.shape[0] 28 | d = x.shape[1] 29 | if n == 0: 30 | return x 31 | else: 32 | z = x-np.dot(np.ones((n,1),dtype=np.float),np.nanmean(x,0).reshape(1,d)) 33 | return np.divide(z,np.dot(np.sqrt(np.nansum(np.power(z,2.0),1)).reshape(n,1),np.ones((1,d),dtype=np.float))) 34 | 35 | 36 | # Centers y-data (removes columnwise mean, except for columns where all samples have / all but one sample has missing drug response(s)) 37 | def ynormalise(y): 38 | n = y.shape[0] 39 | d = y.shape[1] 40 | if n == 0: 41 | return y 42 | else: 43 | with warnings.catch_warnings(): 44 | warnings.simplefilter("ignore", category=RuntimeWarning) 45 | m = np.nanmean(y,0) 46 | ind = np.where(np.sum(~np.isnan(y),0)<=1)[0] 47 | m[ind] = 0.0 # don't center samples of size <= 1 48 | return y-np.dot(np.ones((n,1),dtype=np.float),m.reshape(1,d)) 49 | 50 | 51 | # Clip data 52 | def clip(x,y,B_x,B_y): 53 | C = np.multiply(np.sign(x),np.minimum(np.abs(x),B_x)) 54 | with np.errstate(invalid='ignore'): 55 | D = np.multiply(np.sign(y),np.minimum(np.abs(y),B_y)) 56 | return C,D 57 | 58 | 59 | # Selects drug based on drugid, removes cell lines with missing drug response 60 | def ignoreNaN(xx,yy,drugid): 61 | ind = np.where(np.isnan(yy[:,drugid])) 62 | y = np.delete(yy[:,drugid],ind,axis=0) 63 | x = np.delete(xx,ind,axis=0) 64 | return x,y 65 | 66 | 67 | # Non-private sufficient statistics 68 | def nxx(x): 69 | return np.dot(x.T,x) 70 | def nxy(x,y): 71 | return np.dot(x.T,y) 72 | def nyy(y): 73 | return np.dot(y.T,y) 74 | 75 | 76 | # Precision measure: Spearman's rank correlation coefficient 77 | def precision(y_pred,y_real): 78 | r = spearmanr(y_pred,y_real)[0] 79 | if np.isnan(r): 80 | return 0.0 81 | else: 82 | return r 83 | 84 | 85 | # Prediction errors (MAE, MSE) helper script 86 | def pred_errors(pred, y, method): 87 | if method == 'mae': 88 | return np.mean(np.absolute(pred-y)) 89 | elif method =='mse': 90 | return np.mean((pred-y)**2) 91 | 92 | 93 | # Choose optimal w_x,w_y for clipping thresholds 94 | def omega(n,d,eps,delta, method='corr',ln=20): 95 | 96 | # Precision parameters (correspond to the means of the gamma hyperpriors) 97 | l = 1.0 98 | l0 = 1.0 99 | 100 | l1 = ln 101 | l2 = ln 102 | 103 | st = np.arange(0.1,2.1,0.1) 104 | lenC1 = len(st) 105 | lenC2 = lenC1 106 | err = np.zeros((lenC1,lenC2),dtype=np.float64) 107 | 108 | for i in range(l1): 109 | 110 | # Create synthetic data 111 | x = np.random.normal(0.0,1.0,(n,d)) 112 | x = xnormalise(x) 113 | sx = np.std(x,ddof=1) 114 | b = np.random.normal(0.0,1.0/np.sqrt(l0),d) 115 | y = np.random.normal(np.dot(x,b),1.0/np.sqrt(l)).reshape(n,1) 116 | y = ynormalise(y) 117 | sy = np.std(y,ddof=1) 118 | 119 | # Thresholds to be tested 120 | cs1 = st*sx 121 | cs2 = st*sy 122 | 123 | for j in range(l2): 124 | 125 | apu2 = np.random.normal(loc=0, 126 | scale=np.sqrt(n/(n-1)*2*np.log(1.25/delta)) * 1/eps, 127 | size=d*(d+1)//2+d) 128 | 129 | U = np.zeros((d,d)) 130 | U[np.tril_indices(d,0)] = apu2[:d*(d+1)//2] 131 | U = U + np.triu(np.transpose(U),k=1) 132 | V = apu2[d*(d+1)//2:].reshape((d,1)) 133 | 134 | for ci1 in range(lenC1): 135 | c1 = cs1[ci1] 
136 | for ci2 in range(lenC2): 137 | c2 = cs2[ci2] 138 | 139 | # Clip data 140 | xc,yc = clip(x,y,c1,c2) 141 | sensitivity = d*c1**4 + d*(d-1)*2*c1**4 + d*(2*c1*c2)**2 142 | 143 | # Perturbed suff.stats 144 | xx = nxx(xc) + U*(sensitivity**2) 145 | xy = nxy(xc,yc) + V*(sensitivity**2) 146 | 147 | # Prediction 148 | prec = l0*np.identity(d) + l*xx 149 | mean = np.linalg.solve(prec,l*xy) 150 | pred = np.dot(x,mean) 151 | 152 | # Errors 153 | if method == 'corr': 154 | rho = precision(pred,y) 155 | err[ci1,ci2] = err[ci1,ci2] + rho 156 | elif method == 'mae': 157 | MAE = pred_errors(pred,y,'mae') 158 | err[ci1,ci2] = err[ci1,ci2] - MAE 159 | elif method == 'mse': 160 | MSE = pred_errors(pred,y,'mse') 161 | err[ci1,ci2] = err[ci1,ci2] - MSE 162 | else: 163 | print('Unknown method in optimal clip!') 164 | sys.exit() 165 | 166 | # Average 167 | err = err/float(l1*l2) 168 | # Choose best 169 | ind = np.unravel_index(err.argmax(),err.shape) 170 | w_x = st[ind[0]] 171 | w_y = st[ind[1]] 172 | 173 | return w_x,w_y 174 | 175 | 176 | # Prediction on test data 177 | def predictL(nxx_pv,nxy_pv,x_test): 178 | l = 1.0 179 | l0 = 1.0 180 | d = nxx_pv.shape[0] 181 | # Posterior for Gaussian 182 | prec = l*(nxx_pv) + l0*np.identity(d) 183 | mean = np.linalg.solve(prec,l*(nxy_pv)) 184 | # Compute prediction 185 | return np.dot(x_test,mean) 186 | 187 | 188 | def estimate_stds(data,pars): 189 | PACKAGE_PARENT = '..' 190 | SCRIPT_DIR = os.path.dirname(os.path.realpath(os.path.join(os.getcwd(), os.path.expanduser(__file__)))) 191 | sys.path.append(os.path.normpath(os.path.join(SCRIPT_DIR, PACKAGE_PARENT))) 192 | from estimate_vars import get_estimates 193 | return np.sqrt(get_estimates(data, pars, pars['small_const_for_std'])) 194 | 195 | def suff_stats_crypto(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true=None, clip_threshold_true=None, data_clipped_true_DP=None, clip_threshold_true_DP=None): 196 | from suff_stats_master import get_suff_stats 197 | return get_suff_stats(data, data_clipped, n_train, k_repeat, clip_threshold, pars, data_clipped_true, clip_threshold_true, data_clipped_true_DP, clip_threshold_true_DP) 198 | 199 | def enforce_pos_def(suff_stats, pars): 200 | from pos_def_matrices import check 201 | return check(suff_stats, pars) 202 | 203 | 204 | def get_TA_std_estimates(data, pars): 205 | palautettava = np.var(data, 0) 206 | #use Gaussian mechanism for DP 207 | eps = pars['privacy_for_marg_var']*pars['epsilon'] 208 | delta = pars['privacy_for_marg_var']*pars['delta'] 209 | n = data.shape[0] 210 | dim = data.shape[1] - 1 211 | if pars['drugsens_data']: 212 | data_bound = np.ceil(np.amax(np.abs(data),0)) 213 | sigma = np.sqrt( 2*np.log(1.25/delta)) * 1/n * np.sqrt((dim*(pars['assumed_data_range'][0]**2) + data_bound[-1]**2))/eps 214 | else: 215 | sigma = np.sqrt( 2*np.log(1.25/delta)) * 1/n *np.sqrt( (dim+1)*(pars['scale_to_range']**2))/eps 216 | 217 | #add noise 218 | palautettava = palautettava + np.random.normal(0, sigma, [data.shape[1]]) 219 | #constrain stds to be positive 220 | inds = palautettava <= 0 221 | if len(inds) > 0: 222 | palautettava[inds] = pars['small_const_for_std'] #set non-positive std to small arbitrary constant 223 | return np.sqrt(palautettava) 224 | 225 | 226 | # Process drugsens data 227 | def processData(x,y,d,n_test,n_pv,pv_max,w_x,w_y,drugid,seed, pars): 228 | 229 | n_train = n_pv 230 | 231 | # Set rng seed 232 | np.random.seed(seed) 233 | 234 | # Test/training split + dimensionality reduction 235 | ind = np.random.permutation(x.shape[0]) 
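# The first n_test permuted rows form the test set and the remaining rows the training pool;
# only the first d input dimensions are kept.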
236 | x_test = x[ind[0:n_test],0:d] 237 | y_test = y[ind[0:n_test],:] 238 | x_train = x[ind[n_test:],0:d] 239 | y_train = y[ind[n_test:],:] 240 | 241 | # Training data 242 | x_pv = x_train[0:n_pv,:] 243 | y_pv = y_train[0:n_pv,:] 244 | 245 | # Normalise x-data (remove mean and L2-normalize) 246 | x_test = xnormalise(x_test) 247 | x_pv = xnormalise(x_pv) 248 | 249 | # Normalise y-data (remove mean) 250 | y_test = ynormalise(y_test) 251 | y_pv = ynormalise(y_pv) 252 | 253 | 254 | # get marginal std estimates for clipping 255 | data = np.copy(np.hstack( (x_pv, y_pv[:,drugid].reshape(y_pv.shape[0],1)) )) 256 | 257 | stds = estimate_stds(np.copy(data), pars) 258 | 259 | #true std for comparison 260 | stds_true = np.std(data, 0) 261 | 262 | #DP std estimates for TA 263 | stds_TA = get_TA_std_estimates(np.copy(data), pars) 264 | 265 | # Clip data 266 | n = np.sum(~np.isnan(y_pv[:,drugid])) 267 | 268 | x_pv_orig = np.copy(x_pv) 269 | y_pv_orig = np.copy(y_pv) 270 | 271 | if n == 1: 272 | B_x = np.max(np.abs(x_pv)) 273 | B_y = np.nanmax(np.abs(y_pv)) 274 | x_pv,y_pv = clip(x_pv,y_pv,B_x,B_y) 275 | print('\nn==1!\n') 276 | 277 | elif n > 1: 278 | B_x = w_x * stds[0:-1] 279 | B_y = w_y * stds[-1] 280 | 281 | B_x_true = w_x * stds_true[0:-1] 282 | B_y_true = w_y * stds_true[-1] 283 | 284 | B_x_true_DP = w_x * stds_TA[0:-1] 285 | B_y_true_DP = w_y * stds_TA[-1] 286 | x_pv,y_pv = clip(x_pv,y_pv,B_x,B_y) 287 | 288 | x_pv_true,y_pv_true = clip(np.copy(x_pv_orig),np.copy(y_pv_orig),B_x_true,B_y_true) 289 | 290 | x_pv_true_DP,y_pv_true_DP = clip(np.copy(x_pv_orig),np.copy(y_pv_orig),B_x_true_DP,B_y_true_DP) 291 | 292 | else: 293 | B_x = 0.0 294 | B_y = 0.0 295 | 296 | # Select drug and drop cell lines with missing response 297 | x_pv,y_pv = ignoreNaN(x_pv,y_pv,drugid) 298 | x_test,y_test = ignoreNaN(x_test,y_test,drugid) 299 | n_train = x_pv.shape[0] 300 | x_pv_true,y_pv_true = ignoreNaN(x_pv_true,y_pv_true,drugid) 301 | x_pv_true_DP,y_pv_true_DP = ignoreNaN(x_pv_true_DP,y_pv_true_DP,drugid) 302 | 303 | # Compute suff.stats 304 | data_clipped = np.hstack( (x_pv, y_pv.reshape(y_pv.shape[0],1)) ) 305 | data_clipped_true = np.hstack( (x_pv_true, y_pv_true.reshape(y_pv_true.shape[0],1)) ) 306 | data_clipped_true_DP = np.hstack( (x_pv_true_DP, y_pv_true_DP.reshape(y_pv_true_DP.shape[0],1)) ) 307 | 308 | 309 | suff_stats_all, sigma_all, added_noise_dict = suff_stats_crypto(data, data_clipped, n_train, 0, np.hstack((B_x,B_y)), pars, data_clipped_true, np.hstack((B_x_true,B_y_true)), data_clipped_true_DP, np.hstack((B_x_true_DP,B_y_true_DP)) ) 310 | 311 | #enforce pos.def. Cov matrices 312 | suff_stats_all = enforce_pos_def(suff_stats_all, pars) 313 | 314 | return suff_stats_all, sigma_all, added_noise_dict, x_test, y_test, B_x, B_y, n_train 315 | 316 | 317 | 318 | 319 | 320 | -------------------------------------------------------------------------------- /dataset_tests/src/eps_data_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Differentially private Bayesian learning on distributed data 3 | Mikko Heikkilä 2016-17 4 | 5 | UCI data (abalone, red wine, white wine) 6 | 7 | Script for testing distributed Bayesian learning on UCI datasets. 
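For each privacy parameter pair, the script estimates clipping thresholds, computes (perturbed) sufficient statistics, fits the Bayesian linear regression and records prediction errors on held-out test data.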
8 | 9 | Run: python3 eps_data_test.py 10 | ''' 11 | 12 | import getopt 13 | import numpy as np 14 | import pickle 15 | import os 16 | import re 17 | import sys 18 | from time import sleep 19 | 20 | import calculate_pred_errors 21 | import estimate_vars 22 | import linear_regression_master 23 | import pos_def_matrices 24 | import setup_handler 25 | import suff_stats_master 26 | import UCI_data_getter 27 | from drugsens_code import diffpri as dp 28 | 29 | pars = {} 30 | ################################################################################ 31 | # SETUP 32 | ################################################################################ 33 | # Use setup-script 34 | # Note: overrides all the options in this script if used! 35 | # 0=False, 1=use given setup-file, 2=write current setup to given file, 3=print given setup file and quit 36 | use_saved_setup = 0 37 | setup_filename = 'test_setups/abalone_setup' 38 | 39 | # Check for positive definite Cov matrices 40 | pars['enforce_pos_def'] = 1 41 | # 0 = only flag non-pos.def matrices 42 | # 1 = ensure pos.def. Cov 43 | # 1 in the paper 44 | 45 | pars['random_seed'] = 1 46 | # 1 in the paper 47 | 48 | # Number of cross-validation runs for each fixed sample size 49 | pars['n_repeats'] = 25 50 | # 25 in the paper 51 | 52 | # Number of repeats for finding optimal clipping threshold 53 | pars['opt_clip_repeats'] = 20 54 | # 20 in the paper 55 | 56 | # Possible datasets: red_wine, white_wine, abalone; uncomment the selected data 57 | pars['dataset_name'] = 'abalone' 58 | #pars['dataset_name'] = 'white_wine' 59 | #pars['dataset_name'] = 'red_wine' 60 | 61 | # Number of observations (= clients) to be used 62 | # Note: each element in the list is looped over n_repeats times (CV); the elements are picked at random 63 | pars['n_clients'] = [3000] 64 | pars['n_test'] = [1000] 65 | # Note: n_clients & n_test need to have the same length; if n_test = 0, uses all the data left after splitting off the training set for testing 66 | # number of clients in datasets: 67 | # red wine 1599 68 | # white wine 4898 69 | # abalone 4177 70 | # In the paper the following sizes are used: 71 | # red wine: n_clients=1000, n_test=500 72 | # white wine: n_clients=3000, n_test=1000 73 | # abalone: n_clients=3000, n_test=1000 74 | 75 | # Use selected data dimensions; uncomment according to the data used 76 | #pars['sel_dims'] = [0,1,2,3,4,5,6,7,8,9,10] #wines 77 | pars['sel_dims'] = [0,1,2,3,4,5,6,7] #abalone 78 | # Note: for UCI red wine max dim = 11 79 | # white wine = 11 80 | # abalone = 8 81 | 82 | # Percentage of privacy used for estimating std.
Used in both distributed and TA settings 83 | pars['privacy_for_marg_var'] = .3 84 | # .3 in the paper 85 | 86 | #use clipping trick 87 | #pars['do_clipping'] = True 88 | #list of clipping thresholds, for each clipping is [-c,c] 89 | #Note: number of file ids need to match the number of clipping thresholds 90 | #empty list = use estimated optimal clipping 91 | pars['all_clips'] = [] 92 | # empty list in the paper 93 | 94 | 95 | # Scale data to specific range 96 | # Note: range is scaled to be of length (2*given value) with mean 0 97 | # the distributions are NOT enforced to be symmetric around the mean though 98 | pars['scale_to_range'] = 5 99 | # 5 in the paper 100 | # Assumed data and target ranges, each is interpreted as [-c,c] 101 | pars['assumed_data_range'] = [7.5,7.5] 102 | # [7.5,7.5] in the paper 103 | 104 | # Folder for tmp files & output 105 | pars['tmp_folder'] = 'tmp/' 106 | pars['output_folder'] = 'test_results/' 107 | 108 | # Add DP noise to suff. stats 109 | # 0=no noise; 1=DP noise to suff stats; 2=noise addition by individuals, 3=both for comparison 110 | pars['add_noise'] = 3 111 | # 3 in the paper 112 | 113 | # Privacy parameters 114 | # Note: need to be equal length lists 115 | pars['epsilon_tot'] = np.power(10,[0,.25,.5,.75,1,1.5]) 116 | pars['delta_tot'] = np.zeros(len(pars['epsilon_tot'])) + 10**(-4) 117 | 118 | # File ids; each privacy par pair generates separate output files 119 | # Note: needs to match the length of privacy par lists. 1.id is also used as a general label (e.g. for saving results & settings used; this needs to match the settings in combine_pred_errors.py for plotting) 120 | pars['all_file_ids'] = ['_abalone_'+str(int(i)) for i in np.linspace(1,len(pars['epsilon_tot']),len(pars['epsilon_tot']))] 121 | 122 | #comparison with T honest clients, T = ceil(scale*clients) 123 | pars['scaling_comparison'] = 0 124 | # Note: set to 0 to get standard input perturbation, 1=trusted aggregator 125 | # 0 in the paper 126 | 127 | # Comparisons to trusted aggregator DP 128 | pars['compare_to_std_DP'] = True 129 | # True=unclipped noise var is calculated as in standard DP, False=use n/(n-1) factor for the noise as in crypto (for checking) 130 | # True in the paper 131 | 132 | # Small constant to use if marg. 
std estimate <= 0 133 | pars['small_const_for_std'] = .5 134 | # .5 in the paper 135 | 136 | # Extra options 137 | pars['drugsens_data'] = False 138 | 139 | pars['spark_filename'] = 'tmp/sparktest.csv' 140 | # Note: this can be overwritten by command line options 141 | 142 | ################################################################################ 143 | # END OF SETUP 144 | ################################################################################ 145 | 146 | #check for needed folders 147 | all_folders = [pars['output_folder'],pars['tmp_folder']] 148 | m = re.split(r'/',setup_filename) 149 | if m is not None and len(m) > 1: 150 | setup_folder = '' 151 | for k in range(len(m)-1): 152 | setup_folder += str(m[k]) + '/' 153 | all_folders.append(setup_folder) 154 | for folder in all_folders: 155 | if not os.path.exists(folder): 156 | print('\nCreating folder ' + str(folder)) 157 | os.makedirs(folder) 158 | 159 | # Spark 160 | pars['use_spark'] = False 161 | pars['n_spark_messages'] = 10 162 | pars['spark_noise_range'] = 10e13 163 | pars['fixed_point_int'] = 10e6 164 | # uses numpy randint [-given val,given val] 165 | # Note: this shouldn't be considered a cryptographically safe implementation 166 | if len(sys.argv) > 1: 167 | try: 168 | opts, args = getopt.getopt(sys.argv[1:], "c:hs:f:n:", ["compute=","help", "spark=","fixed_point=","noise="]) 169 | except getopt.GetoptError as err: 170 | print(str(err) + '. Use -h for help.') 171 | sys.exit(2) 172 | for o, a in opts: 173 | if o in ("-h", "--help"): 174 | print('Options:\n-s or --spark [filename] run a test using Spark. When using Spark, consider also setting the other options.\n-c or --compute [number of messages] sets the total number of messages used for Spark (default=10).\n-f or --fixed_point [fixed-point integer] defines the integer used for fixed-point arithmetic (default=10e6).\n-n or --noise sets the noise range used for Spark messages (default=10e14).') 175 | sys.exit() 176 | elif o in ("-s", "--spark"): 177 | pars['use_spark'] = True 178 | # Note: if use_spark = True, saves the individual contributions to the distributed non-projected model sufficient statistics to file on first round and terminates the run 179 | if a is not '': 180 | pars['spark_filename'] = a 181 | print('Running Spark test, saving to file \'{}\'.'.format(pars['spark_filename'])) 182 | pars['n_repeats'] = 1 183 | pars['n_clients'] = [pars['n_clients'][0]] 184 | pars['epsilon_tot'] = [pars['epsilon_tot'][0]] 185 | pars['delta_tot'] = [pars['delta_tot'][0]] 186 | elif o in ["-c","--compute"]: 187 | if a is not '': 188 | pars['n_spark_messages'] = int(a) 189 | print('Using {} messages for each data point for Spark.'.format(a)) 190 | else: 191 | print('Number of messages for Spark should be an int.') 192 | elif o in ['-f','--fixed_point']: 193 | pars['fixed_point_int'] = int(float(a)) 194 | elif o in ['-n','--noise']: 195 | pars['spark_noise_range'] = int(float(a)) 196 | else: 197 | assert False, "unhandled option" 198 | 199 | 200 | pars['dim'] = len(pars['sel_dims']) 201 | 202 | #check for optimal clipping 203 | if len(pars['all_clips']) == 0: 204 | pars['do_optimal_clip'] = True 205 | else: 206 | pars['do_optimal_clip'] = False 207 | 208 | 209 | #setup-script use 210 | #0=False, 1=use given setup-file, 2=write current setup to given file, 3=print given setup and quit 211 | if use_saved_setup is 1: 212 | print('Reading setup from\n' + setup_filename + ', press y to continue..') 213 | apu = sys.stdin.read(1) 214 | if apu[0] is not 'y': 215 | 
print('Aborted') 216 | sys.exit() 217 | pars = setup_handler.get_setup(setup_filename) 218 | 219 | #write current setup to file 220 | elif use_saved_setup is 2: 221 | print('Saving setup to\n' + setup_filename + ', press y to continue..') 222 | apu = sys.stdin.read(1) 223 | if apu[0] is not 'y': 224 | print('Aborted') 225 | sys.exit() 226 | setup_handler.write_setup(setup_filename, pars) 227 | print('setup written, exiting..') 228 | sys.exit() 229 | #read & print the given pars 230 | elif use_saved_setup is 3: 231 | print('Reading setup from\n' + setup_filename + '\n') 232 | apu = setup_handler.get_setup(setup_filename) 233 | for i in apu.items(): 234 | print(str(i[0]) + ': ' + str(i[1])) 235 | sys.exit() 236 | 237 | if not pars['do_optimal_clip']: 238 | clip_threshold = np.zeros((pars['dim'] + 1)) + pars['all_clips'] 239 | 240 | np.random.seed(pars['random_seed']) 241 | 242 | print('Selected dims: {}'.format(pars['sel_dims'])) 243 | #get data 244 | exec('data_master = UCI_data_getter.get_' + pars['dataset_name'] + '()') 245 | 246 | #check that target is not selected as predictor 247 | if data_master.shape[1]-1 in pars['sel_dims']: 248 | print('Target dim selected as predictor! Aborted.') 249 | sys.exit() 250 | 251 | #drop unused dims 252 | data_master = np.hstack((data_master[:,pars['sel_dims']],np.reshape(data_master[:,-1],(data_master.shape[0],1)) )) 253 | 254 | #center data 255 | data_master = data_master - np.mean(data_master, axis = 0) 256 | 257 | #scale data to assumed range 258 | data_master = np.multiply(data_master, 1/np.ptp(data_master,0)) * 2*pars['scale_to_range'] 259 | print('Data range lengths scaled to ' +str(2*pars['scale_to_range'])) 260 | 261 | #generate fixed train-test splits for each repeat that are used with all privacy pars and sample sizes 262 | filename = pars['tmp_folder'] + 'permu_' 263 | for k_file in range(pars['n_repeats']): 264 | if 0 in pars['n_test']: 265 | all_inds = np.random.permutation(data_master.shape[0]) 266 | else: 267 | all_inds = np.random.choice( np.arange(data_master.shape[0]), np.amax(pars['n_clients'])+np.amax(pars['n_test']),False) 268 | with open(filename+str(k_file)+'.pickle', 'wb') as f: 269 | pickle.dump(all_inds, f, pickle.HIGHEST_PROTOCOL) 270 | 271 | 272 | #loop over privacy pars 273 | for k_privacy_par in range(len(pars['epsilon_tot'])): 274 | 275 | print('\nStarting iteration ' + str(k_privacy_par+1) +'/' + str(len(pars['epsilon_tot'])) + '...\n') 276 | sleep(.5) 277 | 278 | pars['epsilon'] = pars['epsilon_tot'][k_privacy_par] 279 | pars['delta'] = pars['delta_tot'][k_privacy_par] 280 | 281 | file_id = pars['all_file_ids'][k_privacy_par] 282 | pred_errors_filename = pars['output_folder'] + 'pred_errors_test' + file_id + '.pickle' 283 | 284 | pred_errors = list() 285 | 286 | client_round = -1 287 | 288 | for k_client in pars['n_clients']: 289 | print('\nNumber of clients: ' + str(k_client) + ' ('+str(client_round+2) +'/'+str(len(pars['n_clients']))+')') 290 | client_round = client_round + 1 291 | k_test = pars['n_test'][client_round] 292 | 293 | pred_errors_client_loop = list() 294 | 295 | if pars['do_optimal_clip']: 296 | clipping_array = np.zeros((pars['n_repeats'],pars['dim']+1)) 297 | 298 | for k_repeat in range(pars['n_repeats']): 299 | print('\nStarting repeat ' + str(k_repeat + 1) + '/'+str(pars['n_repeats'])+'...\n') 300 | 301 | data = np.copy(data_master) 302 | #load fixed train-test split 303 | filename = pars['tmp_folder'] + 'permu_' 304 | permu = np.load(filename + str(k_repeat) + '.pickle') 305 | train_ind = 
permu[0:k_client] 306 | if k_test == 0: #use all elements not in training set 307 | test_ind = permu[k_client:] 308 | else: 309 | test_ind = permu[-k_test:] 310 | 311 | data_test = data[test_ind,:] 312 | data = data[train_ind,:] 313 | 314 | ################################################ 315 | # FIND OPTIMAL CLIPPING RATE 316 | 317 | if pars['do_optimal_clip']: 318 | 319 | print('Finding optimal clipping thresholds..\n') 320 | optimal_clip_values = np.zeros(2) 321 | 322 | optimal_clip_values[0], optimal_clip_values[1] = dp.omega(k_client, pars['dim'], pars['epsilon'], pars['delta'], 'mae', pars['opt_clip_repeats']) 323 | 324 | clip_threshold = np.zeros((pars['dim']+1)) 325 | #estimate marginal std for each dimension & use for clipping 326 | stds = np.zeros(pars['dim']+1) 327 | pars['marginal_vars'] = estimate_vars.get_estimates(np.copy(data), pars=pars, small_pos = pars['small_const_for_std']) 328 | stds = np.sqrt( pars['marginal_vars']) 329 | 330 | stds_true = np.std(data,0) 331 | stds_TA_DP = dp.get_TA_std_estimates(np.copy(data),pars) 332 | 333 | #optimal clipping 334 | clip_threshold[0:-1] = stds[0:-1] * optimal_clip_values[0] 335 | clip_threshold[-1] = stds[-1] * optimal_clip_values[1] 336 | 337 | clip_threshold_true = np.zeros((pars['dim']+1)) 338 | clip_threshold_TA_DP = np.zeros((pars['dim']+1)) 339 | 340 | clip_threshold_true[0:-1] = stds_true[0:-1] * optimal_clip_values[0] 341 | clip_threshold_true[-1] = stds_true[-1] * optimal_clip_values[1] 342 | 343 | clip_threshold_TA_DP[0:-1] = stds_TA_DP[0:-1] * optimal_clip_values[0] 344 | clip_threshold_TA_DP[-1] = stds_TA_DP[-1] * optimal_clip_values[1] 345 | 346 | #check that clipping threshold is not greater than the assumed data range 347 | for k_dim in range(pars['dim']): 348 | clip_threshold[k_dim] = np.minimum(clip_threshold[k_dim],pars['assumed_data_range'][0]) 349 | 350 | clip_threshold[-1] = np.minimum(clip_threshold[-1],pars['assumed_data_range'][-1]) 351 | 352 | 353 | ################################################ 354 | #CLIPPING 355 | 356 | data_clipped = np.multiply( np.sign(data), np.minimum(clip_threshold,np.absolute(data) ) ) 357 | 358 | data_clipped_true = np.multiply( np.sign(data), np.minimum(clip_threshold_true,np.absolute(data) ) ) 359 | data_clipped_TA_DP = np.multiply( np.sign(data), np.minimum(clip_threshold_TA_DP,np.absolute(data) ) ) 360 | 361 | ################################################ 362 | #CALCULATE (PERTURBED) SUFFICIENT STATS 363 | 364 | suff_stats, sigma_all, added_noise_dict = suff_stats_master.get_suff_stats(np.copy(data), np.copy(data_clipped), k_client, k_repeat, clip_threshold, pars, data_clipped_true, clip_threshold_true, data_clipped_TA_DP, clip_threshold_TA_DP) 365 | 366 | 367 | ################################################ 368 | #CHECK POSITIVE DEFINITENESS 369 | suff_stats = pos_def_matrices.check(suff_stats, pars) 370 | 371 | ################################################ 372 | #LINEAR REGRESSION 373 | model_coeffs = linear_regression_master.get_regression_est(suff_stats, pars) 374 | 375 | ################################################ 376 | #CALCULATE PREDICTION ERRORS 377 | 378 | pred_errors_client_loop = {} 379 | 380 | for k_model in model_coeffs: 381 | MAE, MSE, E_pred, std_pred, range_pred = calculate_pred_errors.calculate_errors(data=np.copy(data_test), dim=pars['dim'], filename_data='', model_coeff = model_coeffs[k_model]) 382 | pred_errors_client_loop[k_model] = [MAE,MSE, E_pred, std_pred, range_pred] 383 | 384 | pred_errors.append(pred_errors_client_loop) 385 | 386 | 
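# Each repeat stores, per model variant (e.g. 'noisy', 'cl_noisy', 'noisy_ind', 'true'),
# the list [MAE, MSE, E_pred, std_pred, range_pred] evaluated on the held-out test data.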
################################################ 387 | #end of loop over n_repeat 388 | 389 | ################################################ 390 | #end of loop over n_clients 391 | 392 | #pickle prediction errors 393 | with open(pred_errors_filename, 'wb') as f: 394 | pickle.dump(pred_errors, f, pickle.HIGHEST_PROTOCOL) 395 | 396 | ################################################ 397 | #end of loop over privacy pars 398 | 399 | with open(pars['output_folder'] + 'pars_test' + pars['all_file_ids'][0] + '.pickle', 'wb') as f: 400 | pickle.dump(pars, f, pickle.HIGHEST_PROTOCOL) 401 | 402 | print('\nAll done.') --------------------------------------------------------------------------------
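A minimal standalone sketch (not part of the repository) of the Gaussian-mechanism calibration used throughout suff_stats_master.py and diffpri.py, i.e. sigma = sqrt(1/n * 2*log(1.25/delta)) * sensitivity/eps, applied to the sufficient statistics X'X and X'y; the helper name perturb_suff_stats is an assumption, and the symmetric noise matrix for X'X mirrors the construction in omega():

import numpy as np

def perturb_suff_stats(data, sensitivity, eps, delta, rng=None):
    # data: n x (d+1) array whose last column is the target y (illustrative helper only)
    # sensitivity: sensitivity constant, e.g. clip_sensitivity / range_sensitivity above
    rng = np.random.default_rng() if rng is None else rng
    x, y = data[:, :-1], data[:, -1]
    n, d = x.shape
    # Same calibration as the trusted-aggregator case in suff_stats_master.py;
    # the per-client variant replaces 1/n by 1/(n-1).
    sigma = np.sqrt(1.0/n * 2.0*np.log(1.25/delta)) * sensitivity / eps
    # Symmetric Gaussian noise for X'X (as in omega()), a Gaussian vector for X'y
    u = rng.normal(0.0, sigma, size=(d, d))
    xx = np.dot(x.T, x) + np.tril(u) + np.tril(u, -1).T
    xy = np.dot(x.T, y) + rng.normal(0.0, sigma, size=d)
    return xx, xy, sigma

# Example use with synthetic data:
# xx, xy, sigma = perturb_suff_stats(np.random.randn(1000, 9), sensitivity=1.0, eps=1.0, delta=1e-4)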