├── .gitignore ├── ARX_work ├── arx_accuracy.png ├── basic_queries.ipynb ├── dp_example.java └── intro.ipynb ├── LDP ├── Direct_Encoding.py ├── Distance_Sensitive_Encoding.py ├── Histogram_Encoding.py ├── LDP_Frequency_Estimator.py ├── RAPPOR.py ├── Random_Matrix.py ├── Unary_Encoding.py ├── ldp.py ├── local_DP_intro.ipynb ├── random_response.py └── res.csv ├── LICENSE ├── README.md ├── ibm_lib_work ├── basic_queries.ipynb ├── epsilon_measurements.ipynb ├── epsilon_measurements.png ├── hist_metrics_euclidean.png ├── hist_metrics_kantorovich.png ├── histograms.ipynb ├── increasing_ds_size.png └── simple_hists.png ├── images ├── D.E. Idea.png ├── Figure_1.png ├── Our Idea.png ├── arx_accuracy.png ├── arx_tool.png ├── emd.png ├── epsilon_intro_graph.png ├── epsilon_measurements.png ├── epsilon_others_kant.png ├── epsilon_others_l1.png ├── epsilon_our_kant.png ├── hierarchies.png ├── hist_metrics_euclidean.png ├── hist_metrics_kantorovich.png ├── increasing_ds_size.png ├── local_vs_global.png ├── nusers_others_kant.png ├── nusers_others_l1.png ├── rr_results.png ├── simple_hists.png ├── true_answers_ldp.png ├── users_our_kant.png └── users_our_l1.png ├── papers_used ├── 10_sec17-wang-tianhao.pdf ├── 11_dpmetrics.pdf ├── 12_LATENT_localDP.pdf ├── 13_jcp-01-00004.pdf ├── 14_Random_Matrix.pdf ├── 15_RAPPOR.pdf ├── 1_privacybook.pdf ├── 2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf ├── 3_ibm_diffprivlib.pdf ├── 4_k_anon+dp.pdf ├── 5_arx_dp.pdf ├── 6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf ├── 7_chatziko_locationguard_paper_1.pdf ├── 8_Differential_privacy_its_technological_prescriptiv.pdf └── 9_localDP_Tutorial.pdf └── thesis_paper ├── GDP ├── ARX.tex ├── DP_definition.tex ├── IBM.tex └── Intro.tex ├── LDP ├── intro.tex ├── other_protocols.tex └── our_protocol.tex ├── dependencies └── arial │ └── fonts │ ├── Arial Bold Italic.ttf │ ├── Arial Bold.ttf │ ├── Arial Italic.ttf │ └── Arial.ttf ├── dithesis.cls ├── emblems ├── athena-black.pdf ├── 
athena-blue.pdf ├── athena-red.pdf └── athena_black.jpeg ├── images ├── D.E. Idea.png ├── Figure_1.png ├── Our Idea.png ├── arx_accuracy.png ├── arx_tool.png ├── emd.png ├── epsilon_intro_graph.png ├── epsilon_measurements.png ├── epsilon_others_kant.png ├── epsilon_others_l1.png ├── epsilon_our_kant.png ├── hierarchies.png ├── hist_metrics_euclidean.png ├── hist_metrics_kantorovich.png ├── increasing_ds_size.png ├── local_vs_global.png ├── nusers_others_kant.png ├── nusers_others_l1.png ├── rr_results.png ├── simple_hists.png ├── true_answers_ldp.png ├── users_our_kant.png └── users_our_l1.png ├── latexmkrc ├── outerjoin10.mf ├── outerjoin10.pk ├── outerjoin10.tfm ├── references.bib ├── refs.tex └── thesis.tex /.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | .ipynb_checkpoints 3 | .vscode/settings.json 4 | __pycache__ 5 | !LDP/res.csv -------------------------------------------------------------------------------- /ARX_work/arx_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ARX_work/arx_accuracy.png -------------------------------------------------------------------------------- /ARX_work/dp_example.java: -------------------------------------------------------------------------------- 1 | /* 2 | * ARX: Powerful Data Anonymization 3 | * Copyright 2012 - 2020 Fabian Prasser and contributors 4 | * 5 | * Licensed under the Apache License, Version 2.0 (the "License"); 6 | * you may not use this file except in compliance with the License. 
7 | * You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package org.deidentifier.arx.examples; 19 | import java.util.Iterator; 20 | 21 | import java.io.IOException; 22 | 23 | import java.nio.charset.Charset; 24 | 25 | 26 | import org.deidentifier.arx.ARXAnonymizer; 27 | import org.deidentifier.arx.ARXConfiguration; 28 | import org.deidentifier.arx.ARXResult; 29 | import org.deidentifier.arx.AttributeType; 30 | import org.deidentifier.arx.AttributeType.Hierarchy; 31 | import org.deidentifier.arx.Data; 32 | import org.deidentifier.arx.Data.DefaultData; 33 | import org.deidentifier.arx.DataGeneralizationScheme; 34 | import org.deidentifier.arx.DataGeneralizationScheme.GeneralizationDegree; 35 | import org.deidentifier.arx.DataHandle; 36 | import org.deidentifier.arx.DataType; 37 | import org.deidentifier.arx.criteria.EDDifferentialPrivacy; 38 | import java.nio.charset.StandardCharsets; 39 | 40 | /** 41 | * This class implements an example on how to use the API by directly providing 42 | * the input datasets. 43 | * 44 | * @author Fabian Prasser 45 | * @author Florian Kohlmayer 46 | */ 47 | public class dp_example extends Example { 48 | 49 | /** 50 | * Entry point. 
51 | * 52 | * @param args 53 | * the arguments 54 | * @throws IOException 55 | */ 56 | 57 | protected static double run_query(ARXResult data, int targetColumn) { 58 | // iterator that we are going to use to access the data 59 | final Iterator itHandle = data.getOutput().iterator(); 60 | 61 | // result of the query 62 | double result = 0d; 63 | // length of the dataset 64 | int totalRecords = 0; 65 | 66 | // get the first element of the column, thus the name of it, and ignore it 67 | String[] name = itHandle.next(); 68 | if (name.length <= targetColumn) { 69 | System.out.println("Target column out of bounds\n"); 70 | return 0d; 71 | } 72 | 73 | // iterate through all the values in the dataset 74 | while(itHandle.hasNext()) { 75 | String[] next = itHandle.next(); 76 | // check that our target position is legal 77 | String string = next[targetColumn]; 78 | if (!string.equals("*")) { 79 | result += Integer.parseInt(string); 80 | totalRecords++; 81 | } 82 | } 83 | // System.out.println(result); 84 | return result / totalRecords; 85 | } 86 | 87 | public static void main(String[] args) throws IOException { 88 | 89 | // import the data 90 | Data data = Data.create("data/nba/new_salaries.csv", StandardCharsets.UTF_8, ','); 91 | 92 | // set the hierarchies for each column 93 | Hierarchy position = Hierarchy.create("data/nba/position_hierarchy.csv", StandardCharsets.UTF_8, ','); 94 | Hierarchy year = Hierarchy.create("data/nba/year_hierarchy.csv", StandardCharsets.UTF_8, ','); 95 | Hierarchy age = Hierarchy.create("data/nba/age_hierarchy.csv", StandardCharsets.UTF_8, ','); 96 | Hierarchy team = Hierarchy.create("data/nba/team_hierarchy.csv", StandardCharsets.UTF_8, ';'); 97 | Hierarchy salary = Hierarchy.create("data/nba/salaries_hierarchy.csv", StandardCharsets.UTF_8, ','); 98 | 99 | 100 | data.getDefinition().setAttributeType("Pos", AttributeType.INSENSITIVE_ATTRIBUTE); 101 | data.getDefinition().setAttributeType("Year", AttributeType.INSENSITIVE_ATTRIBUTE); 102 | 
data.getDefinition().setAttributeType("Age", AttributeType.INSENSITIVE_ATTRIBUTE); 103 | data.getDefinition().setAttributeType("Tm", AttributeType.QUASI_IDENTIFYING_ATTRIBUTE); 104 | data.getDefinition().setAttributeType("Salary", AttributeType.QUASI_IDENTIFYING_ATTRIBUTE);// AttributeType.IDENTIFYING_ATTRIBUTE); 105 | 106 | data.getDefinition().setHierarchy("Pos", position); 107 | data.getDefinition().setHierarchy("Year", year); 108 | data.getDefinition().setHierarchy("Age", age); 109 | data.getDefinition().setHierarchy("Tm", team); 110 | data.getDefinition().setHierarchy("Salary", salary); 111 | 112 | // Create an instance of the anonymizer 113 | 114 | // Create a differential privacy criterion 115 | // we want (1,0) - DP 116 | // delta is suggested to be 1/#records 117 | double total_res = 0d; 118 | int solved = 0; 119 | for (int i = 0; i < 500; i++) { 120 | data.getHandle().release(); 121 | 122 | System.out.println(i); 123 | 124 | ARXAnonymizer anonymizer = new ARXAnonymizer(); 125 | 126 | EDDifferentialPrivacy criterion = new EDDifferentialPrivacy(1.7d, 1d / data.getHandle().getNumRows()); 127 | 128 | ARXConfiguration config = ARXConfiguration.create(); 129 | config.addPrivacyModel(criterion); 130 | config.setSuppressionLimit(1d); 131 | config.setHeuristicSearchStepLimit(100); 132 | ARXResult result = anonymizer.anonymize(data, config); 133 | 134 | double res = run_query(result, 4); 135 | // printResult(result, data); 136 | 137 | if (res > 0) { 138 | // System.out.print("--------------------------------" + res); 139 | 140 | total_res += res; 141 | solved++; 142 | } 143 | } 144 | total_res /= solved; 145 | System.out.print("Total result " + total_res); 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /ARX_work/intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# ARX Data 
Anonymization Tool" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "ARX is a tool for data anonymization, that in general, takes a dataset as an input, applies different privacy models, and produces an anonymized version of this dataset, thus offering privacy to its members.\n", 15 | "\n", 16 | "At its core, ARX uses a highly efficient globally-optimal search algorithm for transforming data with full-domain generalization and record suppression. The transformation of attribute values is implemented through domain generalization hierarchies, which represent valid transformations that can be applied to individual-level values." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Classic Privacy Models" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "The ARX tool offers standard privacy models that are tested in theory and are widely use to ensure anonymity given a plain dataset." 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "### k-Anonymity" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "This well-known privacy model that aims at protecting datasets from re-identification in the prosecutor model. A dataset is $k$-anonymous if each record cannot be distinguished from at least $k-1$ other records regarding the quasi-identifiers. Each group of indistinguishable records forms a so-called equivalence class. " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Average risk\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "This privacy model can be used for protecting datasets from re-identification in the marketer model by enforcing a threshold on the average re-identification risk of the records. 
By combining the model with k-anonymity, a privacy model called strict-average risk can be constructed." 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### ℓ-Diversity" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "This privacy model can be used to protect data against attribute disclosure by ensuring that each sensitive attribute has at least $ℓ$ \"well represented\" values in each equivalence class. Different variants, which implement different measures of diversity, have been proposed." 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Differential Privacy" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Given the strict definition of DP, we know that we must access the dataset through various queries, given a privacy budget that we must not exceed. The ARX team, proposes a quite different application of DP in their tool, where privacy protection is not considered a property of a dataset, but a property of a data processing method.\n", 87 | "\n", 88 | "DP guarantees that the probability of any possible output of the anonymization process does not change \"by much\" if data of an individual is added to or removed from input data.\n", 89 | "\n", 90 | "In order to implement Differential Privacy, ARX uses the __SafePub algorithm__" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Concepts used" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "__Random Sampling__: A probability sampling method is any method of sampling that utilizes some form of random selection. In order to have a random selection method, you must set up some process or procedure that assures that the different units in your population have equal probabilities of being chosen. 
In SafePub, such sampling happens with probability $\\beta$\n", 105 | "\n", 106 | "__Attribute Generalization__: In SafePub, generalization is achieved through user-defined hierarchies, which describe rules for replacing values with more general but semantically consistent values on increasing levels of generalization. \n", 107 | "\n", 108 | "__Record Suppression__: Deletion of a specific row on the input dataset." 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "### Theorem" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "__Random sampling with probability $\\beta$ followed by attribute generalization and the suppression of\n", 123 | "every record which appears less than k times__ satisfies $(\\epsilon, \\delta)$ differential privacy for every $\\epsilon \\geq -ln(1-\\beta)$ with \n", 124 | "$$\\delta = \\max_{n:n \\geq n_m} \\sum_{j>\\gamma_n}^{n}f(j;n,\\beta)$$\n", 125 | "\n", 126 | "where $n_m = \\frac{k}{\\gamma} - 1$, $\\gamma = \\frac{e^\\epsilon-1+\\beta}{e^\\epsilon}$ and $f(j;n,\\beta) = {n \\choose j} \\beta^j(1-\\beta)^{n-j}$" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Techniques" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "In order to achieve attribute generalization, ARX uses the so called __hierarchies__. They are either imported from a csv, or being hard-coded into the API, and they are used in order to generalize a sensitive field. An example is given below. The subject to generalize is the age of a person. Let's see the values as they proceed through generalization." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "1st level | 2nd level | 3rd level | 4th level | 5th level\n", 148 | "--- | ----- | ------ |----- | --\n", 149 | "1 |\t0-4 | 0-9| 0-19\t|*\n", 150 | "2 |\t0-4 | 0-9| \t0-19|\t*\n", 151 | "3 |\t0-4 |\t0-9|\t0-19|\t*\n", 152 | "4 |\t0-4 |\t0-9|\t0-19|\t*\n", 153 | "5 |\t0-4 |\t0-9|\t0-19|\t*\n", 154 | "6 |\t5-9 |\t0-9|\t0-19|\t*\n", 155 | "7 |\t5-9 |\t0-9|\t0-19|\t*\n", 156 | "8 |\t5-9 |\t0-9|\t0-19|\t*\n", 157 | "9 |\t5-9 |\t0-9|\t0-19|\t*\n", 158 | "10| 5-9\t| 0-9|\t0-19\t|*\n", 159 | "11|\t10-14 |\t10-19|\t0-19|\t*\n", 160 | "12|\t10-14 |\t10-19|\t0-19|\t*\n", 161 | "13|\t10-14 |\t10-19|\t0-19|\t*\n", 162 | "14|\t10-14 |\t10-19|\t0-19|\t*\n", 163 | "15|\t10-14 |\t10-19|\t0-19|\t*\n", 164 | "16|\t15-19 |\t10-19|\t0-19|\t*\n", 165 | "17|\t15-19 |\t10-19|\t0-19|\t*\n", 166 | "18|\t15-19 |\t10-19|\t0-19|\t*\n", 167 | "19|\t15-19 |\t10-19|\t0-19|\t*\n", 168 | "20|\t15-19 |\t10-19|\t0-19|\t*\n", 169 | "21|\t20-24 |\t20-29|\t20-39|\t*" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "## Testings" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "In order to test the accuracy of the models used by ARX, we are going to run simple np python queries, on the datasets produced by the anonymization process. We want to eliminate the probability of extremely high noise generation, thus we are going to run the anonymization tool multiple times, and the output dataset will be constructed by the mean values of the fields." 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Problems we faced" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "As show on the above matrix, ARX hierarchies tend to treat every type of value as a string, in order to replace it with a interval. 
This is not desirable when applying the testings we mentioned. Thus, we had to come up with a better solution of defining hierarchies. The ARX GUI provides a wizard that gives a variety of choices so the user can easily create a hierarchy for plenty data types.\n", 198 | "\n", 199 | "Another challenge is the number of layers that we are going to use, meaning how far our anonymization will proceed. In each layer, the number of same records increase exponentially, thus we do not want to apply many layers, in order for our results to be accurate, and the output dataset to be readable.\n" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": {}, 205 | "source": [ 206 | "## Solution to the construction of Hierarchies" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "Given the help from Fabian Prasser, we opted to treat the integer values as numbers, and in each level:\n", 214 | " - Group the rows by 2\n", 215 | " - Apply a function according to the query we want to ask.\n", 216 | " \n", 217 | "For example, if we want a counting query, the best option would be to apply an __arithmetic mean__ function to the group, thus the sum, the mean, the variance etc will be the same. The way that ARX preserves DP with those settings, is by record suppression. If that was not the case, the results would be identical to the input dataset. However, now, the output dataset will differ because of its lack of some rows of the input.\n", 218 | " \n", 219 | "Regarding the layers problem, we opted to use 4 layers of anonymization, the last of whom will be the `*` value, meaning that every record is inseparable. We do not want this to happen early in our anonymization, but we do not want it to never happen either, because then we would have a privacy leak, if the dataset was too small." 
220 | ] 221 | } 222 | ], 223 | "metadata": { 224 | "kernelspec": { 225 | "display_name": "Python 3", 226 | "language": "python", 227 | "name": "python3" 228 | }, 229 | "language_info": { 230 | "codemirror_mode": { 231 | "name": "ipython", 232 | "version": 3 233 | }, 234 | "file_extension": ".py", 235 | "mimetype": "text/x-python", 236 | "name": "python", 237 | "nbconvert_exporter": "python", 238 | "pygments_lexer": "ipython3", 239 | "version": "3.8.10" 240 | } 241 | }, 242 | "nbformat": 4, 243 | "nbformat_minor": 4 244 | } 245 | -------------------------------------------------------------------------------- /LDP/Direct_Encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | import numbers 5 | 6 | class Direct_Encoding_client(): 7 | def __init__(self, e, d): 8 | # initialization of the protocol's constants 9 | self.e = e 10 | self.d = d 11 | # p and q are fixed, depending on the domain size and the epsilon value 12 | self.p = math.exp(self.e) / (math.exp(self.e) + self.d - 1) 13 | self.q = 1 / (math.exp(self.e) + self.d - 1) 14 | 15 | # encoding: simply return the value itself 16 | def encode(self, v): 17 | return v 18 | 19 | def perturbe(self, ret): 20 | x = ret 21 | # generate a random number in the range (0,1) 22 | res = random.random() 23 | 24 | # if it is less than p, report the real value 25 | if (res < self.p): 26 | pert = x 27 | else: 28 | # else chose one of the other values of the domain 29 | false_xs = [i for i in range(self.d) if i != x] 30 | 31 | pert = random.choice(false_xs) 32 | 33 | return pert 34 | 35 | # randomization consists of perturbing the encoded value 36 | def randomize(self, v): 37 | return self.perturbe(self.encode(v)) 38 | 39 | 40 | class Direct_Encoding_aggregator(): 41 | def __init__(self, e, d): 42 | # initialization of the protocol's constants 43 | self.e = e 44 | self.d = d 45 | # p and q are fixed, depending on the domain size and 
the epsilon value 46 | self.p = math.exp(self.e) / (math.exp(self.e) + self.d - 1) 47 | self.q = 1 / (math.exp(self.e) + self.d - 1) 48 | 49 | def aggregate(self, config): 50 | # define the needed variables from the configuration dict provided 51 | reported_values = config['reported_values'] 52 | e = config['epsilon'] 53 | d = config['d'] 54 | 55 | # array to store the results 56 | results = np.zeros(d) 57 | n = len(reported_values) 58 | 59 | # compute p and q based on the espilon value and the domain size 60 | p = math.exp(e) / (math.exp(e) + d - 1) 61 | q = 1 / (math.exp(e) + d - 1) 62 | 63 | # compute the estimation for each value of the domain 64 | for i in range(d): 65 | sum_v = 0 66 | for j in reported_values: 67 | # Support(i) = {i}, thus the protocol supports only the values equal to 68 | # the current value 69 | if j == i: 70 | sum_v += 1 71 | # normalize the sum by trying to extract the noise 72 | results[i] = ((sum_v) - n * q) / (p - q) 73 | # if a negative sum is generated by the normalization, convert it to zero 74 | if (results[i] < 0): 75 | results[i] = 0 76 | 77 | return results -------------------------------------------------------------------------------- /LDP/Distance_Sensitive_Encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | import numbers 5 | 6 | 7 | class Distance_Sensitive_Encoding_client(): 8 | def __init__(self, e, d): 9 | # initialization of the protocol's constants 10 | self.e = e 11 | self.d = d 12 | self.theta = math.floor((math.sqrt(4 * math.exp(self.e) + 1) - 1) / 2) 13 | self.a = (self.theta * (self.theta + 1)) / (3 * self.theta ** 2 - self.theta + d - 1) 14 | self.p = self.a 15 | self.probs = [self.a / (i * (i + 1)) for i in range(1, self.theta)] 16 | self.q = self.a / (self.theta * (self.theta + 1)) 17 | 18 | # encoding: simply return the value itself 19 | def encode(self, v): 20 | return v 21 | 22 | # perturbation: choose a 
random value, with fixed probabilities depending on the distance from the truth 23 | def perturbe(self, ret): 24 | x = ret 25 | # create an array of probabilities for each element of the domain 26 | probabilities = np.zeros(self.d) 27 | # extreme cases: x-theta outside domain boundaries 28 | if (x - self.theta < 0): 29 | m = sum([self.a / (abs(i - x) * (abs(i - x) + 1)) - self.a / (self.theta * (self.theta + 1)) for i in range(x - self.theta, 0)]) 30 | elif (x + self.theta > self.d): 31 | m = sum([self.a / (abs(i - x) * (abs(i - x) + 1)) - self.a / (self.theta * (self.theta + 1)) for i in range(self.d, x + self.theta)]) 32 | else: 33 | m = 0 34 | 35 | for i in range(self.d): 36 | # probablitiy of choosing the truth, fixed by the user 37 | if i == x: 38 | probabilities[i] = 100 * self.p 39 | # probability of being within the area 40 | elif abs(i - x) < self.theta: 41 | # probability of lying, depending on the distance of the false value from the true one 42 | probabilities[i] = 100 * self.probs[abs(i - x) - 1] + m / (self.d - 1) 43 | # probability of being outside the area 44 | else: 45 | probabilities[i] = 100 * self.q + m / (self.d - 1) 46 | # list of all the possible options of values 47 | options = [i for i in range(self.d)] 48 | # choose a value given the probabilities for each one 49 | pert = random.choices(options, probabilities)[0] 50 | return pert 51 | 52 | # randomization consists of perturbing the encoded value 53 | def randomize(self, v): 54 | return self.perturbe(self.encode(v)) 55 | 56 | 57 | class Distance_Sensitive_Encoding_aggregator(): 58 | def __init__(self, e, d): 59 | # initialization of the protocol's constants 60 | self.e = e 61 | self.d = d 62 | # initialization of the protocol's constants 63 | self.e = e 64 | self.d = d 65 | self.theta = math.floor((math.sqrt(4 * math.exp(self.e) + 1) - 1) / 2) 66 | self.a = (self.theta * (self.theta + 1)) / (3 * self.theta ** 2 - self.theta + d - 1) 67 | 68 | self.p = self.a 69 | self.p_star = 2 * 
sum([self.a / (i * (i + 1)) for i in range(1, self.theta)]) + self.p 70 | self.probs = [self.a / (i * (i + 1)) for i in range(1, self.theta)] 71 | self.q = self.a / (self.theta * (self.theta + 1)) 72 | 73 | 74 | def aggregate(self, config): 75 | # define the needed variables from the configuration dict provided 76 | reported_values = config['reported_values'] 77 | e = config['epsilon'] 78 | d = config['d'] 79 | 80 | # array to store the results 81 | results = np.zeros(d) 82 | n = len(reported_values) 83 | 84 | # compute the estimation for each value of the domain 85 | for i in range(d): 86 | sum_v = 0 87 | for j in reported_values: 88 | # Support(i) = {i}, thus the protocol supports only the values equal to 89 | # the current value 90 | if i == j: 91 | sum_v += 1 92 | # normalize the sum by trying to extract the noise 93 | results[i] = ((sum_v) - n * self.q) / (self.p_star - self.q) 94 | # if a negative sum is generated by the normalization, convert it to zero 95 | if (results[i] < 0): 96 | results[i] = 0 97 | 98 | return results -------------------------------------------------------------------------------- /LDP/Histogram_Encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import random 3 | import math 4 | import numbers 5 | 6 | class Histogram_Encoding_client(): 7 | def __init__(self, e, d): 8 | # initialization of the protocol's constants 9 | self.e = e 10 | self.d = d 11 | 12 | # encoding consists of creating a d-bit vetor, where 13 | # only the v-th element is 1, and every other equal to 0 14 | def encode(self, v): 15 | assert(v < self.d) 16 | B = np.zeros(self.d) 17 | B[v] = 1 18 | return B 19 | 20 | # perturbation consists of adding noise generated 21 | # from the laplace distribution to each element 22 | def perturb(self, ret): 23 | B = ret 24 | for i in range(len(B)): 25 | B[i] += np.random.laplace(scale = (2/self.e)) 26 | 27 | return B 28 | 29 | # randomization consists of perturbing 
the encoded value 30 | def randomize(self, v): 31 | return self.perturb(self.encode(v)) 32 | 33 | class Histogram_Encoding_aggregator(): 34 | def __init__(self, e, d): 35 | # initialization of the protocol's constants 36 | self.e = e 37 | self.d = d 38 | 39 | def aggregate(self, config): 40 | 41 | # define the needed variables from the configuration dict provided 42 | reported_values = config['reported_values'] 43 | e = self.e 44 | d = self.d 45 | 46 | threshold = config['threshold'] 47 | method = config['method'] 48 | 49 | # array to store the results 50 | results = np.zeros(d) 51 | # Summation with Histogram Encoding method 52 | if method == 'SHE': 53 | for i in range(d): 54 | sum_v = 0 55 | # just sum all the 1s from the v-th elements of the results 56 | for j in reported_values: 57 | sum_v += j[i] 58 | 59 | results[i] = sum_v 60 | 61 | return results 62 | else: 63 | # Thresholding with Histogram Encoding method 64 | 65 | # count of the reported values 66 | n = len(reported_values) 67 | 68 | # p and q according to the theory 69 | p = 1 - (1/2) * math.exp((e/2) * (threshold - 1)) 70 | q = (1/2) * math.exp(-(e/2) * threshold) 71 | 72 | for i in range(d): 73 | sum_v = 0 74 | # Support(B) = {v | B[v] > threshold} 75 | # thus, each reported value grater than the threshold is supported 76 | for j in reported_values: 77 | if j[i] > threshold: 78 | sum_v += 1 79 | 80 | # normalize the sum by trying to extract the noise 81 | results[i] = ((sum_v) - n * q) / (p - q) 82 | # if a negative sum is generated by the normalization, convert it to zero 83 | if (results[i] < 0): 84 | results[i] = 0 85 | 86 | return results 87 | 88 | -------------------------------------------------------------------------------- /LDP/LDP_Frequency_Estimator.py: -------------------------------------------------------------------------------- 1 | from RAPPOR import * 2 | from Random_Matrix import * 3 | from Direct_Encoding import * 4 | from Distance_Sensitive_Encoding import * 5 | from Unary_Encoding 
from RAPPOR import *
from Random_Matrix import *
from Direct_Encoding import *
from Distance_Sensitive_Encoding import *
from Unary_Encoding import *
from Histogram_Encoding import *

import pandas as pd
import numpy as np
import random
import math
import numbers
import copy
import tqdm as tq
import qif


def manhattan_distance(a, b):
    """Return the L1 (Manhattan) distance between two numpy arrays."""
    return np.abs(a - b).sum()


# Base class for the frequency estimator
"""
Mandatory Arguments:

- domain_size: the number of values that the user might answer to the question posed
- method: the protocol that the user wants to use. Possible answers:
    -> 'RAPPOR'
    -> 'Random_Matrix'
    -> 'Direct_Encoding'
    -> 'Distance_Sensitive_Encoding'
    -> 'Histogram_Encoding'
    -> 'Unary_Encoding'

Optional Arguments (depending on the protocol used):

- epsilon: the privacy budget for LDP (usually in the range (0, 5])
- p, q: probability values used by pure protocols; some protocols do not need them
- public_matrix: the matrix previously generated for the Random Matrix protocol
- m: the number of rows used when initializing the public matrix in the R.M. protocol
- f: the frequency setting used by the RAPPOR protocol
- unary_optimized: use Optimized Unary Encoding (OUE); True by default
- threshold: threshold used for the aggregation during Histogram Encoding
- aggr_method: aggregation method used by Histogram Encoding ('SHE' or 'THE')
"""
class Frequency_Estimator():

    def __init__(self, domain_size, method = 'Direct_Encoding', epsilon = 1,
                 p = 0.75, q = 0.25, public_matrix = None, m = 10, n_users = 1,
                 f = 0.25, unary_optimized = True, threshold = 0.67,
                 aggr_method = 'THE'):
        # keep the initialization values of the class
        self.domain_size = domain_size
        self.n_users = n_users
        self.method = method
        self.epsilon = epsilon
        self.p = p
        self.q = q
        self.m = m
        self.f = f
        self.threshold = threshold
        self.public_matrix = public_matrix
        # BUGFIX: aggr_method was accepted but never stored, so the Histogram
        # Encoding aggregator (which reads config['method'] for 'SHE'/'THE')
        # could never be switched to 'SHE'
        self.aggr_method = aggr_method
        # according to the method, initialize the proper class with the mandatory arguments
        if method == 'RAPPOR':
            self.user_protocol_class = RAPPOR_client(f, domain_size, p, q)
            self.aggregator_protocol_class = RAPPOR_aggregator(f, domain_size, p, q)
        elif method == 'Random_Matrix':
            # special case: Random Matrix requires a public matrix shared by
            # clients and aggregator; create it on the fly if not provided
            if public_matrix is None:
                self.public_matrix = generate_matrix(m, domain_size)
            self.user_protocol_class = Random_Matrix_client(self.public_matrix, m, domain_size, epsilon)
            # BUGFIX: the aggregator previously received the raw (possibly
            # None) `public_matrix` argument instead of the generated matrix
            self.aggregator_protocol_class = Random_Matrix_aggregator(self.public_matrix, m, domain_size, epsilon)
        elif method == 'Direct_Encoding':
            self.user_protocol_class = Direct_Encoding_client(epsilon, domain_size)
            self.aggregator_protocol_class = Direct_Encoding_aggregator(epsilon, domain_size)
        elif method == 'Distance_Sensitive_Encoding':
            self.user_protocol_class = Distance_Sensitive_Encoding_client(epsilon, domain_size)
            self.aggregator_protocol_class = Distance_Sensitive_Encoding_aggregator(epsilon, domain_size)
        elif method == 'Histogram_Encoding':
            self.user_protocol_class = Histogram_Encoding_client(epsilon, domain_size)
            self.aggregator_protocol_class = Histogram_Encoding_aggregator(epsilon, domain_size)
        elif method == 'Unary_Encoding':
            self.user_protocol_class = Unary_Encoding_client(epsilon, domain_size, unary_optimized, p, q)
            self.aggregator_protocol_class = Unary_Encoding_aggregator(epsilon, domain_size, unary_optimized, p, q)
        else:
            raise ValueError('Method not recognized. Choose one of the default ones')

        # create a list containing one protocol instance for each user, so
        # that per-user state (e.g. RAPPOR's permanent responses) is separate
        self.users = []
        for _ in range(self.n_users):
            self.users.append(copy.copy(self.user_protocol_class))


    """
    Randomization: The user provides a value v in the range (0, d-1), and according to the protocol chosen,
    the system can return either a single value, or a vector containing the randomized values

    The return value of this function makes no sense to someone that views _only_ one user's data. It is
    meaningful only to the aggregator of the data.
    """
    def randomize_value(self, v):
        # just call the randomization function of the relevant protocol
        return self.user_protocol_class.randomize(v)


    """
    Aggregation: Used by the aggregator in order to combine all the users' data in order to produce the final
    frequency vector, for each value in the domain.

    The reported_values argument is a vector containing each noisy value reported.
    """
    def aggregate(self, reported_values):
        # create a dict with all the settings of a protocol, and pass it to the
        # aggregator, who picks the entries it needs.
        # NOTE: 'method' carries the Histogram Encoding aggregation mode
        # ('SHE'/'THE'); previously it carried the protocol name, which the
        # HE aggregator could never match against 'SHE'
        config = {'reported_values': reported_values, 'f': self.f, 'd': self.domain_size,
                  'public_matrix': self.public_matrix, 'epsilon': self.epsilon, 'threshold': self.threshold,
                  'method': self.aggr_method, 'p': self.p, 'q': self.q}
        # call the aggregation function of the relevant protocol
        return self.aggregator_protocol_class.aggregate(config)


    """
    Test a previously initialized protocol. The function returns the true and the randomized
    stats in 2 np vectors. The caller is then free to compare the vectors using the
    necessary metrics.

    The input is a _csv_ file with two columns: user id and reported value.
    All the other settings of the protocol are already defined at construction time.
    """
    def test_protocol(self, count, input_file=None, values=None):
        if input_file is None and values is None:
            raise ValueError('An input file or a value vector must be given')
        if input_file is not None:
            # parse the file, and store its values
            df = pd.read_csv(input_file)
            values = df.to_numpy()

            # determine the number of users from the first (user id) column
            user_count = max(df.iloc[:, 0]) + 1

            # NOTE(review): count == -1 slices off the final row instead of
            # keeping every sample — confirm the callers' intent
            df = df[:count]
            # check that the users match the way the class was initialized
            if user_count != self.n_users:
                print("---" + str(user_count) + " " + str(self.n_users))
                raise ValueError('Incorrect amount of users during initialization')
        # NOTE(review): the `values`-only path never builds `df`, which the
        # loop below requires — that path looks broken; confirm before use

        # vector to store the true sums for each value in the domain
        true_results = np.zeros(self.domain_size)

        # list to store the results of each randomization, fed to the aggregator
        reported_values = []

        for i in range(len(df)):
            # get the true (user, value) pair
            user = int(df.iloc[i, 0])
            value = int(df.iloc[i, 1])

            # the true sum of this value is increased
            true_results[value] += 1
            # obtain the randomized report from this user's own instance
            randomised_result = self.users[user].randomize(value)
            # and append it to the appropriate list
            reported_values.append(randomised_result)

        # feed the reported values to the aggregator, who returns an np vector
        # with the randomized sums
        randomised_results = self.aggregate(reported_values)

        # return the tuple of the 2 vectors: the real sums, and the predicted,
        # randomized sums
        return (true_results.astype(int), randomised_results.astype(int))


import matplotlib.pyplot as plt
# Interactive benchmark driver: 'i' plots one run of two protocols,
# 'u' sweeps the number of samples, anything else sweeps epsilon.
res = input("Method: [i]: instance | [e]: epsilon measuerements | [u]: increasing users\n\n")
max_samples = -1
e = np.log(12)

if res == 'i':
    # single instance: run Direct Encoding and our protocol once and plot both
    estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)

    res = estimator.test_protocol(max_samples, input_file='res.csv')

    print(res[0])
    print(res[1])

    print("\nsums\n\n", np.sum(res[0]), np.sum(res[1]), "\n\n")

    estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)

    res1 = estimator.test_protocol(max_samples, input_file='res.csv')

    print(res1[0])
    print(res1[1])

    print("\nsums\n\n", np.sum(res1[0]), np.sum(res1[1]), "\n\n")

    xs = [i for i in range(50)]
    fig, axs = plt.subplots(3)
    fig.suptitle('Vertically stacked subplots')
    axs[0].bar(xs, res[0])
    axs[1].bar(xs, res[1])
    axs[2].bar(xs, res1[1])

    # axs[0].set_ylim((0, 20))
    axs[1].set_ylim((0, max(res[0])))
    axs[2].set_ylim((0, max(res[0])))

    axs[0].title.set_text('True Data')
    axs[1].title.set_text('Perturbed Data produced by the Direct Encoding Protocol')
    axs[2].title.set_text('Perturbed Data produced by our Protocol')

    def euclid(x, y):  # ground distance
        return abs(x - y)

    kant = qif.metric.kantorovich(euclid)  # distance on distributions

    print("\n\n\n\nDirect:", kant(res[0], res[1]))
    print("\n\n\n\nDistance_Sensitive:", kant(res1[0], res1[1]))

    plt.show()

elif res == 'u':
    # sweep the number of samples fed to each protocol
    direct = []
    dist_sens = []
    dist_hist = []
    unary = []
    randmatr = []
    size = 2000

    max_samples = 1200
    d = 50

    x = [i for i in range(10, max_samples, 20)]

    def euclid(x, y):  # ground distance
        return abs(x - y)

    kant = qif.metric.kantorovich(euclid)  # distance on distributions

    e = np.log(12)
    for i in tq.tqdm(x, position=0, leave=True):

        estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)
        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(i, input_file='res.csv')
            reses.append(kant(a[0], a[1]))
        # BUGFIX: average over the 10 repetitions; this branch divided by
        # `i` and multiplied by 10, unlike the epsilon sweep below
        res = sum(reses) / 10

        direct.append(res)

        estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(i, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res1 = sum(reses) / 10

        # BUGFIX: was appended to dist_hist, leaving dist_sens empty and
        # making dist_hist twice as long as x (which breaks plt.plot)
        dist_sens.append(res1)

        estimator = Frequency_Estimator(50, method='Histogram_Encoding', epsilon=e, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(i, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res2 = sum(reses) / 10

        dist_hist.append(res2)

        q = 1 / (math.exp(e) + 1)

        estimator = Frequency_Estimator(50, method='Unary_Encoding', epsilon=e, p=1/2, q=q, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(i, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res3 = sum(reses) / 10

        unary.append(res3)

        estimator = Frequency_Estimator(50, method='Random_Matrix', n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(i, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res4 = sum(reses) / 10

        # NOTE(review): randmatr is collected but never plotted below
        randmatr.append(res4)

    plt.plot(x, direct, 'r')
    plt.plot(x, dist_hist, 'g')
    plt.plot(x, unary, 'm')
    # BUGFIX: the fourth series plotted dist_hist a second time instead of
    # the Distance Sensitive results named in the legend
    plt.plot(x, dist_sens, 'y')
    plt.xlabel("Number of Users")
    plt.ylabel("Accuracy Error")

    plt.legend(["Direct Encoding", "Histogram Encoding", "Unary Encoding", "Distance Sensitive Encoding"])
    plt.savefig('../misc/latest_plot.png')
    plt.show()

else:
    # sweep epsilon at a fixed number of samples
    direct = []
    dist_sens = []
    dist_hist = []
    unary = []
    randmatr = []
    size = 2000

    epsilon = [round(i/3 + 0.8, 2) for i in range(0, 12)]
    nusers = 10000
    d = 50

    def euclid(x, y):  # ground distance
        return abs(x - y)

    kant = qif.metric.kantorovich(euclid)  # distance on distributions

    for e in tq.tqdm(epsilon, position=0, leave=True):

        estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)
        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(nusers, input_file='res.csv')
            reses.append(kant(a[0], a[1]))
        res = sum(reses) / 10

        direct.append(res)

        estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(nusers, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res1 = sum(reses) / 10

        # BUGFIX: was appended to dist_hist (see the 'u' branch)
        dist_sens.append(res1)

        estimator = Frequency_Estimator(50, method='Histogram_Encoding', epsilon=e, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(nusers, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res2 = sum(reses) / 10

        dist_hist.append(res2)

        q = 1 / (math.exp(e) + 1)

        estimator = Frequency_Estimator(50, method='Unary_Encoding', epsilon=e, p=1/2, q=q, n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(nusers, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res3 = sum(reses) / 10

        unary.append(res3)

        estimator = Frequency_Estimator(50, method='Random_Matrix', n_users=1000)

        reses = []
        for j in range(0, 10):
            a = estimator.test_protocol(nusers, input_file='res.csv')
            reses.append(kant(a[0], a[1]))

        res4 = sum(reses) / 10

        # NOTE(review): randmatr is collected but never plotted below
        randmatr.append(res4)

    plt.plot(epsilon, direct, 'r')
    plt.plot(epsilon, dist_hist, 'g')
    plt.plot(epsilon, unary, 'm')
    # BUGFIX: plot the Distance Sensitive series, not dist_hist twice
    plt.plot(epsilon, dist_sens, 'y')
    plt.xlabel("Epsilon")
    plt.ylabel("Accuracy Error")

    plt.legend(["Direct Encoding", "Histogram Encoding", "Unary Encoding", "Distance Sensitive Encoding"])
    plt.savefig('../misc/latest_plot.png')
    plt.show()
import numpy as np
import random

class RAPPOR_client():
    """Client side of the RAPPOR protocol (Erlingsson et al., CCS 2014).

    f is the longitudinal-privacy parameter, d the domain size, and (p, q)
    the probabilities of the instantaneous randomized response step.
    """

    def __init__(self, f, d, p, q):
        self.f = f
        self.d = d
        self.p = p
        self.q = q

        # cache of permanent randomized responses, keyed by true value, so a
        # repeated report of the same value reuses the same permanent noise
        self.perma_B = {}

    def encode(self, v):
        """One-hot encode v into a d-length vector; also return v itself so
        perturb() can key the permanent-response cache."""
        B = np.zeros(self.d)
        # only its v-th element is 1
        B[v] = 1

        return v, B

    def perturb(self, res):
        """Two-step RAPPOR perturbation of an encoded value.

        Step 1 (permanent randomized response) is computed once per value and
        memoised; step 2 (instantaneous randomized response) is fresh on
        every report.
        """
        v, B = res
        # __step 1__: permanent randomized response
        if v in self.perma_B:
            new_B = self.perma_B[v]
        else:
            # permanent response missing: create it bit by bit
            new_B = np.zeros(self.d)
            for i in range(self.d):
                # keep-probability depends on the original bit
                if B[i] == 1:
                    pr = 1 - 0.5 * self.f
                else:
                    pr = 0.5 * self.f
                if random.random() < pr:
                    new_B[i] = 1
                else:
                    new_B[i] = 0
            # memoise so the same value always produces the same step-1 output
            self.perma_B[v] = new_B

        # __step 2__: instantaneous randomized response
        final_B = np.zeros(self.d)
        for i in range(self.d):
            pr = self.p if new_B[i] == 1 else self.q
            if random.random() < pr:
                final_B[i] = 1
            else:
                final_B[i] = 0

        return final_B

    def randomize(self, v):
        """Full client pipeline: encode, then perturb."""
        return self.perturb(self.encode(v))



class RAPPOR_aggregator():
    """Server side of RAPPOR: de-biases the collected bit vectors."""

    def __init__(self, f, d, p, q):
        self.f = f
        self.d = d
        self.p = p
        self.q = q

    def aggregate(self, config):
        """Estimate the per-value counts from config['reported_values'].

        Returns a length-d numpy vector of de-biased counts.
        """
        reported_values = config['reported_values']
        f = self.f
        d = self.d
        n = len(reported_values)
        results = np.zeros(d)
        for i in range(d):
            # count the reports whose i-th bit is set
            sum_v = sum(1 for report in reported_values if report[i] == 1)
            # remove the expected contribution of the permanent noise
            results[i] = (sum_v - 0.5 * f * n) / (1 - f)

        return results

    def compute_metrics(self, true, randomized):
        """Return a dict of accuracy metrics between the true and the
        estimated count vectors.

        BUGFIX: the dict was previously built and then discarded (no return).
        The (misspelled) key is kept for backward compatibility.
        """
        metrics_dict = {}
        metrics_dict['eucledian_distance'] = np.linalg.norm(true - randomized)
        return metrics_dict
import numpy as np
import random
import math
import numbers

def generate_matrix(m, d):
    """Generate the m x d public matrix for the Random Matrix protocol.

    Each entry is drawn uniformly from [-1/sqrt(m), 1/sqrt(m)].
    """
    F = np.zeros((m, d))

    # BUGFIX: bound was negative, which silently swapped the endpoints
    # passed to random.uniform (the range was the same only by accident)
    bound = 1 / math.sqrt(m)
    for i in range(m):
        for j in range(d):
            F[i][j] = random.uniform(-bound, bound)

    return F
# BUGFIX: removed a stray module-level `generate_matrix(5, 10)` call whose
# result was discarded (it only burned RNG state on import)

class Random_Matrix_client():
    """Client side of the Random Matrix projection protocol."""

    def __init__(self, F, m, d, e):
        # initialization of the protocol's constants: the public matrix F,
        # its row count m, the domain size d, and the privacy budget e
        self.F = F
        self.m = m
        self.d = d
        self.e = e

    def encode(self, v):
        """Pick a random row r and report the matrix entry for value v."""
        r = random.randint(0, self.m - 1)
        x = self.F[r][v]

        return (r, x)

    def perturbe(self, ret):
        """Randomize the encoded entry: keep its sign with probability
        e^eps/(e^eps+1), then rescale so the estimate is unbiased."""
        r, x = ret

        pr = math.exp(self.e) / (math.exp(self.e) + 1)
        b = 1 if random.random() < pr else -1

        # unbiasing constant from the protocol's analysis
        c = (math.exp(self.e) + 1) / (math.exp(self.e) - 1)

        return (r, b * c * self.m * x)

    def randomize(self, v):
        """Full client pipeline: encode, then perturb."""
        return self.perturbe(self.encode(v))



class Random_Matrix_aggregator():
    """Server side of the Random Matrix protocol."""

    def __init__(self, F, m, d, e):
        # initialization of the protocol's constants
        self.F = F
        self.m = m
        self.d = d
        self.e = e

    def aggregate(self, config):
        """Estimate per-value counts by correlating each report (row, value)
        with the public matrix column of every candidate value.

        NOTE(review): this reads the matrix from config['public_matrix']
        rather than self.F — confirm the two are always the same object.
        """
        reported_values = config['reported_values']
        public_matrix = config['public_matrix']
        d = self.d

        results = np.zeros(d)
        for i in range(d):
            sum_v = 0
            for report in reported_values:
                sum_v += report[1] * public_matrix[report[0]][i]

            results[i] = sum_v
        return results
import numpy as np
import random
import math
import numbers

class Unary_Encoding_client():
    """Client side of (Symmetric / Optimized) Unary Encoding.

    With optimized=True the OUE parameters p = 1/2, q = 1/(e^eps + 1) are
    used; otherwise the caller-supplied p and q are kept.
    """

    def __init__(self, e, d, optimized=True, p=0, q=0):
        # initialization of the protocol's constants
        self.d = d
        self.p = p
        self.q = q
        self.e = e
        self.optimized = optimized
        # if the user wants OUE, we initialize p and q
        # according to the theory, and based on epsilon
        if self.optimized:
            self.p = 1 / 2
            self.q = 1 / (math.exp(self.e) + 1)

    def encode(self, v):
        """One-hot encode v: a d-bit vector with only the v-th element 1."""
        assert(v < self.d)
        B = np.zeros(self.d)
        B[v] = 1
        return B

    def perturb(self, ret):
        """Flip each bit of the encoded vector: a 1-bit stays 1 with
        probability p, a 0-bit becomes 1 with probability q.

        BUGFIX: previously `new_B = B` aliased the input, so perturb mutated
        the caller's array in place; it now works on a copy.
        """
        B = ret
        new_B = np.array(B)
        # for each bit of the binary array
        for i in range(len(B)):
            # keep/flip probability depends on the original bit
            pr = self.p if B[i] == 1 else self.q
            # draw once per bit to decide the output bit
            if random.random() < pr:
                new_B[i] = 1
            else:
                new_B[i] = 0

        return new_B

    def randomize(self, v):
        """Full client pipeline: perturb the encoded value."""
        return self.perturb(self.encode(v))

    def aggregate(self, config):
        """De-bias reported bit vectors into per-value count estimates.

        NOTE(review): this duplicates Unary_Encoding_aggregator.aggregate
        (plus clamping negatives to zero); kept for backward compatibility.
        """
        reported_values = config['reported_values']
        d = config['d']

        p = self.p
        q = self.q

        results = np.zeros(d)
        n = len(reported_values)

        # compute the estimation for each value of the domain
        for i in range(d):
            # Support(i) = {i | B[i] == 1}: count reports with the i-th bit set
            sum_v = sum(1 for report in reported_values if report[i] == 1)
            # normalize the sum by trying to extract the noise
            results[i] = (sum_v - n * q) / (p - q)
            # a negative estimate produced by the normalization becomes zero
            if results[i] < 0:
                results[i] = 0

        return results

class Unary_Encoding_aggregator():
    """Server side of (Symmetric / Optimized) Unary Encoding."""

    def __init__(self, e, d, optimized=True, p=0, q=0):
        # initialization of the protocol's constants
        self.d = d
        self.p = p
        self.q = q
        self.e = e
        self.optimized = optimized
        # if the user wants OUE, we initialize p and q
        # according to the theory, and based on epsilon
        if self.optimized:
            self.p = 1 / 2
            self.q = 1 / (math.exp(self.e) + 1)

    def aggregate(self, config):
        """De-bias the reported bit vectors into per-value count estimates.

        Uses config['reported_values'] and config['d']; returns a length-d
        numpy vector. Unlike the client-side copy, negative estimates are
        NOT clamped (behavior preserved).
        """
        reported_values = config['reported_values']
        d = config['d']

        p = self.p
        q = self.q

        results = np.zeros(d)
        n = len(reported_values)

        # compute the estimation for each value of the domain
        for i in range(d):
            # Support(i) = {i | B[i] == 1}: count reports with the i-th bit set
            sum_v = sum(1 for report in reported_values if report[i] == 1)
            # normalize the sum by trying to extract the noise
            results[i] = (sum_v - n * q) / (p - q)

        return results
# %%
# Scratch analysis of a distance-sensitive LDP mechanism: compares the
# probability distribution it assigns around a true value against plain
# randomized response, for one fixed epsilon and domain size.
import numpy as np
import matplotlib.pyplot as plt
import qif

eps = np.log(20)
d = 89 # domain size
trv = 3 # true value
# theta: radius of the window around the true value that receives boosted mass
theta_f = (np.sqrt( 4 * np.exp(eps) + 1) - 1) / 2
theta = int(np.floor( theta_f ))
# a: probability mass assigned to the true value itself
a = theta * (theta + 1) / (3 * theta**2 - theta + d - 1)
print()

# m: window mass that would fall below index 0 for this true value,
# redistributed uniformly over the rest of the domain
m = sum([a / (abs(i - trv) * (abs(i - trv) + 1)) - a / (theta * (theta + 1)) for i in range(trv - theta + 1, 0)])
values = [a / i - m / (d - 1) for i in range(1, theta)]

print(values)
print(values[0])

def prob(i, x):
    # Probability that the mechanism reports i when the true value is x.
    if i == x:
        return a
    # left boundary: part of the theta-window around x falls below index 0
    if (x - theta < 0):
        # NOTE(review): the range uses the global `trv` although the branch
        # condition is on the parameter `x` — looks like a copy/paste
        # leftover from the module-level computation above; confirm.
        # (The comprehension's `i` shadows the parameter only inside the
        # comprehension — Python 3 gives it its own scope.)
        m = sum([a / (abs(i - x) * (abs(i - x) + 1)) - a / (theta * (theta + 1)) for i in range(trv - theta + 1, 0)])
        c = min( abs(i - x), theta)

        return a / ( c * (c+1)) + m / (d - 1)

    # right boundary: part of the theta-window falls beyond index d-1
    if (x + theta > (d - 1)):
        m = sum([a / (abs(i - x) * (abs(i - x) + 1)) - a / (theta * (theta + 1)) for i in range(d, x + theta)])
        c = min( abs(i - x), theta)

        return a / ( c * (c+1)) + m / (d - 1)

    # interior case: mass decays with distance, capped at radius theta
    c = min( abs(i - x), theta)
    return a / ( c * (c+1))

def rr(i, x):
    # Plain randomized response over a domain of size d, for comparison.
    t = 1 / (d - 1 + np.exp(eps))
    if i == x:
        return np.exp(eps) * t
    else:
        return t


# total mass inside the theta-window around the true value
area = sum([prob(trv - j, trv) for j in range(-theta, theta + 1)])


print("eps:", eps)
print("theta_f:", theta_f)
print("theta:", theta)
print("a:", a)
print()
print("prob(true):", prob(trv,trv))
print("prob(area):", area)
print("prob(others):", prob(0,trv))
print()
print("rr(true):", rr(trv,trv))
print("rr(other):", rr(0,trv))
print()

# both ratios should be bounded by e^eps for eps-LDP
print("ratio:", prob(trv, trv) / prob(0,trv), np.exp(eps))
print("ratio rr:", rr(trv, trv) / rr(0,trv), np.exp(eps))

# full output distributions; their sums should be (close to) 1
dist = np.array([prob(i, trv) for i in range(0,d)])
dist_rr = np.array([rr(i, trv) for i in range(0,d)])
print("sum(dist):", sum(dist))
print("sum(dist_rr):", sum(dist_rr))


# def euclid(x, y): # ground distance
#     return abs(x-y)
# kant = qif.metric.kantorovich(euclid) # distance on distributions

# d1 = np.array([1,0,0])
# d2 = np.array([0,0.5,0.5])
# print(kant(d1, d2))


#   a
# ---- c = min { |i-x|, theta }
# c(c-1)

# %%
import random
import numpy as np
import pandas as pd


def randomized_response(true_value):
    """Warner's randomized response for a binary value.

    With probability 1/2 report the truth; otherwise flip a fresh coin and
    report that instead, so overall P[report == truth] = 3/4.
    """
    if random.randint(0, 1) == 1:
        return true_value
    # lying branch: report a uniformly random bit
    if random.randint(0, 1) == 1:
        return 0
    return 1


def main():
    """Plot the relative count error of randomized response vs user count."""
    # imported lazily so that importing this module has no heavy side effects
    import matplotlib.pyplot as plt

    users = [i for i in range(100, 10000, 100)]
    diffs = []
    print(users)

    for n_users in users:
        diff = 0
        # average the absolute count error over 100 independent trials
        for _ in range(100):
            true_values = np.array([random.randint(0, 1) for _ in range(n_users)])
            new_values = np.array([randomized_response(v) for v in true_values])
            diff += abs(sum(new_values) - sum(true_values))

        # BUGFIX: the trial loop runs 100 times but the total was divided by
        # 10, inflating the reported error tenfold
        diff /= 100
        # normalize by the population size to get a relative error
        diffs.append(diff / n_users)

    plt.plot(users, diffs)
    plt.xlabel("Number of Users")
    plt.ylabel("Accuracy Error")
    plt.show()


if __name__ == "__main__":
    main()
for n_users in users: 25 | 26 | diff = 0 27 | for _ in range(100): 28 | 29 | true_values = np.array([random.randint(0,1) for i in range(n_users)]) 30 | 31 | 32 | new_values = np.array([randomized_response(i) for i in true_values]) 33 | 34 | 35 | diff += abs(sum(new_values) - sum(true_values)) 36 | 37 | diff /= 10 38 | diffs.append(diff / n_users) 39 | 40 | plt.plot(users, diffs) 41 | plt.xlabel("Number of Users") 42 | plt.ylabel("Accuracy Error") 43 | plt.show() -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Protection of Sensitive Data: Creating, Analyzing and Testing Protocols of Differential Privacy 2 | 3 | The full paper of the thesis is available [here](https://pergamos.lib.uoa.gr/uoa/dl/frontend/en/browse/2958792) 4 | 5 | The problem of preserving privacy while extracting information during data analysis, has been an everlasting one. Specifically, during the big­data era, user details can be easily compromised by a malicious handler, something considered both as a security, and as a privacy issue. 6 | 7 | The optimal fix to the subject, is Differential Privacy, which is actually a promise, made by the data handler to the user, that they will not be affected, by allowing their data to be used in any analysis, no matter what other stud­ies/databases/info resources are available. Meanwhile, the output data statistics should be accurate enough for any researcher to extract useful information from them. 
8 | 9 | The goal of this thesis, is to examine and compare previously created mechanisms for D.P., while also creating our own mechanism, that serves to the purpose of achieving Local D.P., a form of Differential Privacy that is nowadays widely used in machine learning algorithms, aiming to protect the individuals that send their personal data for analysis. We will do so, by creating a library that is easy to use, and applies to all the rules of data privacy, and then extract conclusions from its use. 10 | 11 | ## Analyzing and Testing of existing protocols 12 | 13 | The first two chapters of the thesis are dedicated to testing existing libraries, like the **IBM diffprivlib** and the **ARX Tool**. The directory `ibm_lib_work` contains notebooks for testing the IBM library, and the directory `ARX_work`, contains Java code created in order to test the ARX API. 14 | 15 | ## Creating an LDP protocol 16 | 17 | Local Differential Privacy (LDP), is a modern form of DP used in many real-world applications. The main downside of most LDP protocols, is their lack of efficiency when a small number of users contribute to the protocol. During this thesis, we aim to create a protocol to fix this problem, and we are introducing the **Distance Sensitive** protocol, which fulfils exactly that promise. We conduct tests and comparisons with other LDP protocols, which were implemented using Python. All our LDP work can be found in the directory `LDP`. 
18 | -------------------------------------------------------------------------------- /ibm_lib_work/epsilon_measurements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/epsilon_measurements.png -------------------------------------------------------------------------------- /ibm_lib_work/hist_metrics_euclidean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/hist_metrics_euclidean.png -------------------------------------------------------------------------------- /ibm_lib_work/hist_metrics_kantorovich.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/hist_metrics_kantorovich.png -------------------------------------------------------------------------------- /ibm_lib_work/increasing_ds_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/increasing_ds_size.png -------------------------------------------------------------------------------- /ibm_lib_work/simple_hists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/simple_hists.png -------------------------------------------------------------------------------- /images/D.E. 
Idea.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/D.E. Idea.png -------------------------------------------------------------------------------- /images/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/Figure_1.png -------------------------------------------------------------------------------- /images/Our Idea.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/Our Idea.png -------------------------------------------------------------------------------- /images/arx_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/arx_accuracy.png -------------------------------------------------------------------------------- /images/arx_tool.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/arx_tool.png -------------------------------------------------------------------------------- /images/emd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/emd.png -------------------------------------------------------------------------------- /images/epsilon_intro_graph.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_intro_graph.png -------------------------------------------------------------------------------- /images/epsilon_measurements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_measurements.png -------------------------------------------------------------------------------- /images/epsilon_others_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_others_kant.png -------------------------------------------------------------------------------- /images/epsilon_others_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_others_l1.png -------------------------------------------------------------------------------- /images/epsilon_our_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_our_kant.png -------------------------------------------------------------------------------- /images/hierarchies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hierarchies.png -------------------------------------------------------------------------------- /images/hist_metrics_euclidean.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hist_metrics_euclidean.png -------------------------------------------------------------------------------- /images/hist_metrics_kantorovich.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hist_metrics_kantorovich.png -------------------------------------------------------------------------------- /images/increasing_ds_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/increasing_ds_size.png -------------------------------------------------------------------------------- /images/local_vs_global.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/local_vs_global.png -------------------------------------------------------------------------------- /images/nusers_others_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/nusers_others_kant.png -------------------------------------------------------------------------------- /images/nusers_others_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/nusers_others_l1.png -------------------------------------------------------------------------------- /images/rr_results.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/rr_results.png -------------------------------------------------------------------------------- /images/simple_hists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/simple_hists.png -------------------------------------------------------------------------------- /images/true_answers_ldp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/true_answers_ldp.png -------------------------------------------------------------------------------- /images/users_our_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/users_our_kant.png -------------------------------------------------------------------------------- /images/users_our_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/users_our_l1.png -------------------------------------------------------------------------------- /papers_used/10_sec17-wang-tianhao.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/10_sec17-wang-tianhao.pdf -------------------------------------------------------------------------------- /papers_used/11_dpmetrics.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/11_dpmetrics.pdf -------------------------------------------------------------------------------- /papers_used/12_LATENT_localDP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/12_LATENT_localDP.pdf -------------------------------------------------------------------------------- /papers_used/13_jcp-01-00004.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/13_jcp-01-00004.pdf -------------------------------------------------------------------------------- /papers_used/14_Random_Matrix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/14_Random_Matrix.pdf -------------------------------------------------------------------------------- /papers_used/15_RAPPOR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/15_RAPPOR.pdf -------------------------------------------------------------------------------- /papers_used/1_privacybook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/1_privacybook.pdf -------------------------------------------------------------------------------- /papers_used/2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf -------------------------------------------------------------------------------- /papers_used/3_ibm_diffprivlib.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/3_ibm_diffprivlib.pdf -------------------------------------------------------------------------------- /papers_used/4_k_anon+dp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/4_k_anon+dp.pdf -------------------------------------------------------------------------------- /papers_used/5_arx_dp.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/5_arx_dp.pdf -------------------------------------------------------------------------------- /papers_used/6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf -------------------------------------------------------------------------------- /papers_used/7_chatziko_locationguard_paper_1.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/7_chatziko_locationguard_paper_1.pdf -------------------------------------------------------------------------------- /papers_used/8_Differential_privacy_its_technological_prescriptiv.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/8_Differential_privacy_its_technological_prescriptiv.pdf -------------------------------------------------------------------------------- /papers_used/9_localDP_Tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/9_localDP_Tutorial.pdf -------------------------------------------------------------------------------- /thesis_paper/GDP/ARX.tex: -------------------------------------------------------------------------------- 1 | \section{Anonymized Dataset Producing Libraries} 2 | 3 | As mentioned earlier, an other possible output of a mechanism that adds D.P. to a dataset can be the dataset itself, after being anonymized using certain algorithms that meet the criteria of D.P. 4 | 5 | This technique does not yet have many different implementations, mostly due to the success of the previous model shown, as well as the difficulty, the computer power needed and the poor quality of the result being produced. 6 | 7 | Producing an anonymized dataset is the way to go if someone is using earlier forms of data privacy, such as \emph{k-anonymity}, \emph{l-diversity} etc, which we are going to analyze moving forward. However, in order to cover the needs of D.P., several adjustments have to be made. 
The main idea behind all those libraries lies in a theorem, presented in [4], that mixes the use of those previously mentioned techniques with D.P. 8 | 9 | In this Thesis, we are going to examine the \emph{ARX tool}, a tool for data anonymization, that supports the method that we are trying to implement. We are going to analyze this tool, and perform similar tests as with IBM. 10 | 11 | ARX is a tool for data anonymization, that in general, takes a dataset as an input, applies privacy models, and produces an anonymized version of this dataset, thus offering protection to its members. The Menu of the ARX tool can be seen in the following \textbf{Figure 3.7.} 12 | 13 | \begin{figure}[!htb]\centering 14 | \includegraphics[width=0.9\textwidth]{images/arx_tool.png} 15 | \caption{The ARX GUI tool} 16 | \end{figure} 17 | 18 | At its core, ARX uses a highly efficient globally-optimal search algorithm for transforming data with full-domain generalization and record suppression. The transformation of attribute values is implemented through domain generalization hierarchies, which represent valid transformations that can be applied to individual-level values. 19 | 20 | \subsection{Classic Privacy Models} 21 | 22 | The ARX tool offers standard privacy models that are tested in theory and are widely used to ensure anonymity given a plain dataset. Those consist of the implementation of the following protocols: 23 | 24 | \begin{itemize} 25 | \item \textbf{K-anonymity}: A well-known privacy model that aims to protect datasets from re-identification in the prosecutor model. A dataset is $k$-anonymous if\emph{ each record cannot be distinguished from at least $k-1$ other records regarding the quasi-identifiers.} Each group of indistinguishable records forms a so-called equivalence class. 
26 | \item \textbf{l-diversity}: This privacy model can be used to protect data against attribute disclosure by ensuring that each sensitive attribute \emph{has at least $l$ "well represented" values in each equivalence class}. Different variants, which implement different measures of diversity, have been proposed. 27 | \end{itemize} 28 | 29 | Moreover, the tool uses some simple concepts of processing a dataset: 30 | 31 | \begin{itemize} 32 | \item \textbf{Random Sampling}: A method of sampling that utilizes some form of random selection. In order to have a random selection method, we must set up some process or procedure that assures that the different units in the population have equal probabilities of being chosen. 33 | \item \textbf{Attribute Generalization}: Generalizing a column of the dataset, based on its values. The applications of attribute generalization depend on the type of records (eg. integers, ranges etc). 34 | \item \textbf{Record Suppression}: Deletion of a specific row on the input dataset. 35 | \end{itemize} 36 | 37 | Those are some techniques that are not going to be analyzed and tested in this thesis, however, if combined with D.P. can produce interesting results. Specifically, according to [4], the following theorem applies: 38 | 39 | \emph{Random sampling} with probability $\beta$ followed by \emph{attribute generalization} and the \emph{suppression} of 40 | every record which appears less than k times \emph{satisfies $(\epsilon, \delta)$ differential privacy} for every $\epsilon \geq -ln(1-\beta)$ with 41 | $$\delta = \max_{n:n \geq n_m} \sum_{j>\gamma_n}^{n}f(j;n,\beta)$$ 42 | 43 | where $n_m = \frac{k}{\gamma} - 1$, $\gamma = \frac{e^\epsilon-1+\beta}{e^\epsilon}$ and $f(j;n,\beta) = {n \choose j} \beta^j(1-\beta)^{n-j}$. 44 | 45 | In order to achieve attribute generalization, ARX uses the so called \emph{hierarchies}. 
They are either imported from a csv file, or hard-coded into the API, and they are used in order to generalize a sensitive field. An example is given in \emph{Table 3.4}. The subject to generalize is the age of a person. Let's see the values as they proceed through generalization. 46 | 47 | \begin{table}[!htb] 48 | \centering 49 | 50 | \caption{Generalization of data using hierarchies} 51 | \label{numbers} 52 | 53 | \begin{tabular}{| c | c | c | c |} 54 | \hline 55 | $1^{st}$ level & $2^{nd}$ level & $3^{rd}$ level & $4^{th}$ level\\ 56 | \hline 57 | 1 & 0-4 & 0-9 & *\\ 58 | \hline 59 | 3 & 0-4 & 0-9 & *\\ 60 | \hline 61 | 5 & 5-9 & 0-9 & * \\ 62 | \hline 63 | 10 & 10-14 & 10-19 & *\\ 64 | \hline 65 | 18 & 15-20 & 10-19 & *\\ 66 | \hline 67 | 68 | \end{tabular} 69 | \end{table} 70 | 71 | \subsection{Conducting D.P. Testings} 72 | 73 | ARX provides a cross-platform graphical tool, that supports many different ways of anonymizing data, as well as an API that delivers those data anonymization capabilities to Java programs. We are going to use the latter, in order to create our own scripts for testing the tool and its accuracy. 74 | 75 | In order to test the accuracy of the models used by ARX, we are going to run simple queries, on the datasets produced by the anonymization process. We want to eliminate the probability of extremely high noise generation, thus we are going to run the anonymization tool multiple times, and the output dataset will be constructed by the mean values of the fields. 76 | 77 | As shown in the above table, ARX hierarchies tend to replace every type of value with an interval. This is not desirable when applying the tests we mentioned. Thus, we had to come up with a better way of defining hierarchies. The ARX GUI provides a wizard that gives a variety of choices so the user can easily create a hierarchy for many data types. 
78 | 79 | Another challenge is the number of layers that we are going to use, meaning how far our anonymization will proceed. In each layer, the number of same records increases exponentially, thus we do not want to apply many layers, in order for our results to be accurate, and the output dataset to be readable. 80 | 81 | Given the help from Dr. Fabian Prasser, one of ARX's creators, we opted to treat the integer values as numbers, and in each level: 82 | \begin{itemize} 83 | \item Group the rows by 2 84 | \item Apply a function according to the query we want to ask. 85 | \end{itemize} 86 | 87 | For example, if we want a counting query, the best option would be to apply an \emph{arithmetic mean} function to the group, thus the sum, the mean, the variance etc will be the same. The way that ARX preserves D.P. with those settings, is by record suppression. If that was not the case, the results would be identical to the input dataset. However, now, the output dataset will differ because of its lack of some rows of the input. 88 | 89 | Regarding the layers problem, we opted to use 4 layers of anonymization, the last of which will be the * value, meaning that every record is inseparable. We do not want this to happen early in our anonymization, but we do not want it to never happen either, because then we would have a privacy leak, if the dataset was too small. 90 | 91 | The creation of the hierarchies for the salary column, can be shown in \textbf{Figure 3.8}, taken from the ARX GUI Hierarchy Creation Wizard. 92 | 93 | \begin{figure}[!htb]\centering 94 | \includegraphics[width=0.8\textwidth]{images/hierarchies.png} 95 | \caption{Creating a Hierarchy using ARX GUI} 96 | \end{figure} 97 | 98 | 99 | \subsection{Metrics Used} 100 | We are going to test the applicability of the already given ARX mechanisms on a numerical dataset. Our goal is to run basic queries, such as mean value on the dataset's records. 
We are going to do that first by applying no DP at all, and then by using the API that is presented by ARX, helped by a simple java script that was built for this purpose. 101 | 102 | 103 | \subsection{The identity of the testing Dataset} 104 | The dataset that we are going to be looking at, contains sensitive data regarding NBA players' \emph{salaries} from the year 1990 until today. It also states other info about them, such as their \emph{age}, their\emph{ current team }and their \emph{position}. This particular data is not considered sensitive, as those numbers are widely available, however, when it comes down to certain people's salaries, applying D.P. in order to preserve their privacy is crucial. 105 | 106 | \subsection{Process of running the queries} 107 | As we have earlier noted, the application of D.P. in ARX is rather complicated, specifically for the use that we are interested in: We want the output dataset to have numerical values in the earnings' column, in order to apply queries. 108 | 109 | For each column of the dataset, we have defined our own hierarchies. For every column except the `Salaries` one, this hierarchy is semantic, like the ones presented in our intro. 110 | 111 | For the salaries column, with it being our goal to analyze, we opt to use the construction mentioned in our solution in the intro. We created 7 layers, in order to give the algorithm the ability to anonymize the dataset without the values being converted to `*`. 112 | 113 | A sample of the result of the creation of the Salaries hierarchy is presented in \textbf{Table 3.5}, while the whole file is available in the GitHub distribution of the results of this Thesis. 114 | 115 | \begin{table}[!htb] 116 | \centering 117 | 118 | \caption{Hierarchy Levels created} 119 | \label{numbers} 120 | 121 | \begin{tabular}{| c | c | c | c | c|} 122 | \hline 123 | $1^{st}$ level & $2^{nd}$ level & $3^{rd}$ level & ... & $7^{th}$ level \\ 124 | \hline 125 | 79.568 & 291.029 & 500.776 & ... 
& *\\ 126 | \hline 127 | 502.491 & 291.029 & 500.776 &... & *\\ 128 | \hline 129 | 522.738 & 710.524 & 500.776 &... & *\\ 130 | \hline 131 | 898.310 & 710.524 & 500.776 &... & *\\ 132 | \hline 133 | 1.000.000 & 1.114.013 & 1.220.739 &...& *\\ 134 | \hline 135 | 1.228.026 & 1.114.013 & 1.220.739 &... & *\\ 136 | \hline 137 | \end{tabular} 138 | \end{table} 139 | 140 | Next up, we are going to set up the use of the ARX API, which requires us to specify some variables in order to run Differential Privacy. Those variables are defined in the following Java code, and are those that were used in the actual testings. 141 | 142 | \bigskip 143 | \bigskip 144 | \bigskip 145 | \bigskip 146 | \begin{lstlisting}[ 147 | basicstyle= \footnotesize, 148 | language=Java] 149 | EDDifferentialPrivacy criterion = new EDDifferentialPrivacy(2d, 1d / Rows); 150 | 151 | ARXConfiguration config = ARXConfiguration.create(); 152 | config.addPrivacyModel(criterion); 153 | config.setSuppressionLimit(1d); 154 | config.setHeuristicSearchStepLimit(100); 155 | ARXResult result = anonymizer.anonymize(data, config); 156 | \end{lstlisting} 157 | \bigskip 158 | 159 | 160 | The basic principles that were followed for the above definitions are based on the following instructions and guidance by the ARX Tool documentations: 161 | \begin{itemize} 162 | \item The delta value should not be 0, but is suggested to be set lower than or equal to the reciprocal of the number of records. 163 | \item A suppression limit should be set, preferably to 1. 164 | \item In order to improve the quality of the data produced, a heuristic search step limit should be set, in order to tweak the ARX search algorithm that handles data suppression. 165 | \end{itemize} 166 | 167 | Additionally, following the same principles as with the IBM library, we are going to run the D.P. query multiple times before reporting its value.
We are going to do so, because the amount of noise generated can be extreme, and because of the low bounds of the heuristic search that we have set. We chose to run each query 1000 times, and then report the mean value of those runs as the result produced by the mechanism. 168 | 169 | Because of the structure of the result of the ARX mechanism (a dataset containing numerical values), we can only run queries like \emph{sum} and \emph{mean}. There is no point in running a min or max query: we already know that the result will not be accurate. Thus, we are going to try to run a\emph{ mean value} numerical query in the anonymized dataset. The function we are using in order to run this typed of queries is the following: 170 | \bigskip 171 | \clearpage 172 | \begin{lstlisting}[ 173 | basicstyle= \footnotesize, 174 | language=Java] 175 | protected static double run_query(ARXResult data, int targetColumn) { 176 | // iterator that we are going to use to access the data 177 | final Iterator itHandle = data.getOutput().iterator(); 178 | 179 | // result of the query 180 | double result = 0d; 181 | // length of the dataset 182 | int totalRecords = 0; 183 | 184 | // ignore the name of the column 185 | String[] name = itHandle.next(); 186 | if (name.length <= targetColumn) { 187 | System.out.println("Target column out of bounds\n"); 188 | return 0d; 189 | } 190 | 191 | // iterate through all the values in the dataset 192 | while(itHandle.hasNext()) { 193 | String[] next = itHandle.next(); 194 | // check that our target position is legal 195 | String string = next[targetColumn]; 196 | if (!string.equals("*")) { 197 | result += Integer.parseInt(string); 198 | totalRecords++; 199 | } 200 | } 201 | // return the __mean__ of the dataset 202 | return result / totalRecords; 203 | } 204 | 205 | \end{lstlisting} 206 | \bigskip 207 | 208 | Finally, before running the queries, we must mention that while using the ARX Tool the dataset size should be significant. 
In our case, it contains almost 13 thousand rows. The dataset size is a critical parameter when applying D.P., while being even more essential during the use of the ARX tool. 209 | 210 | \subsection{Statistical Queries} 211 | 212 | As shown by the above Java function, our testings can support every type of statistical queries, such as \emph{mean value}, \emph{sum}, and \emph{average}. However, due to the computer power required and the similarity of the results that those queries produce, we will focus our testings solely on the mean value query. 213 | 214 | Given the dataset previously analyzed, the true mean value of the salaries column of the dataset is $\$2.868.981,32$. This will be the value that we are going to use in order to examine the accuracy of the D.P. results. 215 | 216 | \subsubsection{Running with fixed parameters} 217 | 218 | We are going to conduct our first test by anonymizing our dataset using the default parameters, as we set ε $ = 1$ and δ $ = \frac{1}{12377}$. The results are shown in the following table. 219 | 220 | 221 | \begin{table}[!htb] 222 | \centering 223 | 224 | \caption{Mean value query in ARX with default parameters} 225 | \label{numbers} 226 | 227 | \begin{tabular}{| c | c |} 228 | \hline 229 | Non-DP result & DP Result \\ 230 | \hline 231 | $\$2.868.981,32$ & $\$2.860.215,6$\\ 232 | \hline 233 | \end{tabular} 234 | \end{table} 235 | 236 | We observe that the query results are somewhat close: We are in the range of millions of dollars, and the ARX mechanism only fails to approach the result by 8 thousand. This is not of course close to what the IBM library computed, but it is still a reliable result, given all the downsides of this type of anonymization. 237 | 238 | 239 | \subsubsection{Running with different epsilon values} 240 | 241 | Next up, in order to determine if ARX follows the rules of D.P., we should try anonymizing the dataset for different values of epsilon, just like we did with the IBM library.
242 | 243 | We observed during our initial runs that if the epsilon value rises above 2, the algorithm faces certain problems, that will be analyzed moving forward. With that being the case, the epsilon values chosen to conduct the measurements are in the range $[0.2, 1.6]$. The results from our testings are shown in the above \textbf{Figure 3.8.} 244 | 245 | \begin{figure}[!htb]\centering 246 | \includegraphics[width=1\textwidth]{images/arx_accuracy.png} 247 | \caption{Accuracy Error results for increasing values of epsilon} 248 | \end{figure} 249 | 250 | As we can see, the plot produces a linear type curve when the epsilon is below 0.8, and then stabilizes, unlike the Laplace distribution mechanisms, where the error curve is in logarithmic shape. We can not directly compare the results with the Laplace noise distribution, due to the different datasets used. 251 | 252 | We generate the answers given by asking the query in the output dataset. Without its records being suppressed, the result dataset would have been perfect, because of the transformation of the data. However, when suppressing many records (nearly 10\% each time), the result could be severely altered, and thus the error plot, as we saw, is quite unpredictable. 253 | 254 | 255 | \subsection{Observations regarding the Algorithm} 256 | 257 | During our testings in the dataset using the ARX mechanism, we observe the following regarding its behavior in the DP queries: 258 | 259 | \begin{itemize} 260 | \item The epsilon variable if raised above 2,5, makes the algorithm \emph{extremely slow}, to the point that it does not respond after minutes of execution. This makes sense, if we take into consideration that when epsilon increases, the accuracy gets better. Thus, the algorithm performs extreme searching techniques in order to find which records to suppress, resulting into slow execution. 
261 | \item In order for the algorithm not to produce only *(the last level in our hierarchies) in our target column, we set each of the other columns as \emph{non sensitive} in their definition. 262 | \item As the epsilon values rise, \emph{the accuracy gets better}, as it is supposed to be, according to the DP principles. 263 | \item While the dataset has multiple columns, the algorithm usually fails to present all of them with anonymized values, and just reports * in each row. This could have been a result of the high \emph{Heuristic Search Step Limit}, which was by default set to maximum. Despite us lowering its value, the phenomenon persists. 264 | \end{itemize} 265 | 266 | \subsection{Conclusion} 267 | 268 | While researching the ARX mechanism we came to the conclusion that it is for sure a whole different approach in Differential Privacy compared to the other libraries that we studied. With that being the case, it has some advantages and some disadvantages. Its main advantages are the following: 269 | 270 | \begin{itemize} 271 | \item The result of the mechanism is a \emph{handy dataset} that the user can handle in multiple ways and gain more information than just the result of a query. 272 | \item The result \emph{can be iterated}, thus giving the option to the user to run the query in a smaller subset of the rows, while it being differential private. 273 | 274 | \end{itemize} 275 | 276 | On the other hand, the main disadvantages are: 277 | \begin{itemize} 278 | \item The result can be misleading, because of the \emph{big accuracy error produced}. 279 | \item The algorithm \emph{requires a rather big dataset} in order to run properly, while other libraries perform just fine with smaller datasets. 280 | \item The algorithm is difficult to implement, as you have to create a self-made function for every query, and moreover tune many parameters if you want to run differential privacy. 
281 | 282 | \end{itemize} 283 | -------------------------------------------------------------------------------- /thesis_paper/GDP/DP_definition.tex: -------------------------------------------------------------------------------- 1 | \chapter{PRINCIPLES OF DIFFERENTIAL PRIVACY} 2 | 3 | During this chapter we are going to introduce the term of D.P., and its definition, alongside with the principles that need to be followed while applying it. 4 | 5 | \section{Promise of Differential Privacy} 6 | 7 | \par Differential Privacy is actually a promise made by the data handlers, to the participants of a study: "You will not be affected, adversely or otherwise, by allowing your data to be used in any study or analysis, no matter what other studies/ datasets/ info resources are available". 8 | \par The goal is to make the data widely available for analysis, while protecting the users. However, is it possible to learn nothing about an individual, while gathering useful information about a population? This is actually what D.P. is trying to achieve. 9 | 10 | 11 | \section{Definition of Differential Privacy} 12 | Before defining D.P., we must analyze some of the basic components of its definition. 13 | 14 | \subsection{Randomized Response} 15 | Randomized response is one of the earliest privacy mechanisms, that is used to conduct surveys where taboo behaviour is studied. The participants in those surveys are asked to answer truthfully, while they do not want to be stigmatized. There is a micro-world of what we are trying to achieve, thus we are going to give the algorithm of the randomized response in order to answer a binary (yes/no) question. 16 | 17 | \begin{itemize} 18 | \item Flip a coin. 
19 | \item If it lands on heads, answer truthfully 20 | \item If it lands on tails, flip another one 21 | \item If it lands on heads, answer no, else, answer yes 22 | \end{itemize} 23 | 24 | We are going to analyze this algorithm and its success in later chapters, but for now, it is enough to know that there exists a simple mechanism that adds noise, and is rather accurate for large samples. 25 | 26 | Before giving the definition of D.P., we must define its components. 27 | 28 | \begin{itemize} 29 | \item \textbf{Probability Simplex}, given a discrete set $B$, is denoted as $\Delta(B)$ and is defined to be: 30 | \begin{align*} 31 | \Delta(B) = \{ x\in R^{|B|}: x_i \geq 0 \text { } \forall i \text{ and } \sum_{i=1}^{|B|} x_i = 1\} 32 | \end{align*} 33 | \item A \textbf {Randomized algorithm} $M$ with domain $A$ and discrete range of results $B$, is associated with the mapping $M: A\rightarrow\Delta(B)$. 34 | \item \textbf{Distance between Databases:} The $l_1$ norm of a database x is denoted $||x||_1$ and it is defined to be: $||x||_1 = \sum_{i = 1}^{|x|} |x_i|$. Thus, the $l_1$ distance between 2 databases $x$ and $y$, is $||x-y||_1$, and the size of a database $x$ is $||x||_1$. 35 | 36 | \end{itemize} 37 | 38 | \subsection{Definition} 39 | Differential Privacy is defined as follows: 40 | \\ 41 | \\ 42 | A randomized algorithm $M$ with domain $N^{|x|}$ is (ε,δ)-differentially private, if for all $S \subseteq Range(M)$ and for all $x,y \in N^{|x|}$ s.t. $||x - y||_1 \leq 1$ 43 | $$ Pr[M(x) \in S] \leq e^\epsilon Pr[M(y) \in S] + \delta$$ 44 | 45 | where the probability space is over the coin flips of the mechanism $M$. If $\delta = 0$, we say that $M$ is ε-differentially private. 46 | 47 | \\ 48 | It should be noted that D.P. is rather a definition than a strict algorithm. While relying on the definition of D.P., we can create different algorithms, which will all ensure that the result will be differentially private.
This allows us to create different forms of D.P., that will be analyzed later on this thesis. 49 | 50 | The whole point of Differential Privacy, is that the output of a D.P. mechanism, should by \emph{independent} of whether or not an individual is present in the domain $N$. The "ability" of the adversary to recognise the existence of a column in the dataset, is regularized by epsilon. 51 | 52 | \section{The meaning of epsilon} 53 | It is made clear from the above definition, that if we have a computational task, we might find different algorithms for applying D.P., but the result will always be of the same form: each user of the dataset, will get ε-D.P.. But what does the epsilon parameter actually mean? 54 | 55 | By reading the mathematical equation, we observe that the higher the value of epsilon, the bigger the difference between the two probabilities (minimum and maximum). Thus, we extract the following statement about the value of epsilon during the application of Differential Privacy: 56 | 57 | \begin{itemize} 58 | \item The \emph{lower the epsilon} value, the \emph{higher the privacy} guarantees for the users of the dataset. 59 | \item The \emph{higher the epsilon} value, the \emph{more accurate the results} produced. 60 | \end{itemize} 61 | 62 | In practice, epsilon values vary in the range $(0,5]$, as lower values are prohibited, and higher values are considered extreme cases. However, as mentioned in [1], when epsilon is small, failing to be (ε,0)-differentially private is not necessarily alarming, if our algorithm is linearly increasing with ε (ex (2ε,0)-D.P). This happens because of the nature of the epsilon parameter, which guarantees very strict boundaries between databases. However, when ε increases by a lot, users' privacy suffers. 63 | 64 | In \textbf{Figure 2.1}, we can see in general terms, the function between the epsilon and the accuracy error, as well as the protection guaranteed. 
We will discuss in later sections the details on how these graphs are created, but now is a good time to get an overall picture of the accuracy error produced when applying D.P. 65 | \bigskip 66 | \bigskip\bigskip 67 | 68 | \begin{figure}[!htb]\centering 69 | \includegraphics[width=0.6\textwidth]{images/epsilon_intro_graph.png} 70 | \caption{Accuracy Error as a function of epsilon} 71 | \end{figure} 72 | 73 | \section{Different forms of Differential Privacy} 74 | 75 | As mentioned during the definition, due to the room that is left for its interpretation, there can be many forms of Differential Privacy. There are two major fields recognized, the \emph{Global D.P.} and the \emph{Local D.P.}. 76 | 77 | Their major difference is the curator of the data. In the Global model, the curator must be trusted, as he collects the non-private data and has to pass them through a D.P. algorithm. 78 | 79 | On the other hand, in the Local model, the curator may as well be untrusted, since the users perturb their data on their own, using a specific protocol. The key differences of the two forms are shown in the \emph{Figure 2.2} below. 80 | 81 | An other difference between the two models, is the amount of noise added. With the absence of a trusted curator, the users themselves must add a significant amount of noise into their data, in order to preserve their privacy. This of course results into a need of many users (several thousands), in order for the L.D.P. protocols to function correctly and accurately. 82 | 83 | 84 | \begin{figure}[!htb]\centering 85 | \includegraphics[width=0.3\textwidth]{images/local_vs_global.png} 86 | \caption{Differences between LDP and GDP} 87 | \end{figure} 88 | 89 | In this thesis, we are going to examine both models, by quoting their definitions, observing already-existing algorithms, and creating our own L.D.P. protocol. 
90 | 91 | \section{Existing Problems of D.P.} 92 | 93 | As every new step in Computer Science, Differential Privacy has some issues that are yet to be solved, and some others not covered by its definition. 94 | 95 | One major problem is the behaviour of the protocols \emph{when the number of users is limited}. The definition of D.P. is based on the alteration of the data in order not to reveal sensitive information. Thus, if a small amount of users are involved in those protocols, the accuracy of the results might be way off the standards that we set, in order to satisfy the epsilon requirements of the user. 96 | 97 | Another (unsolvable) issue, that mainly lies on the basis of surveys, is \emph{the possibility that conclusions drawn from a survey may reflect statistical information about an individual}. 98 | 99 | For example, if a survey about the correlation of smoking and dental problems is conducted, someone that has specific dental problems might be deemed as a smoker, despite keeping his privacy about the fact that he is smoking, during the survey. That is something that D.P. does not promise: unconditional freedom from distinguishing. This is not however a violation of the definition of D.P., as the survey teaches us that specific private attributes correlate with public observable attributes, since this correlation would be observed independent of the presence or absence of the individual in the survey. 100 | 101 | There are several more issues as the ones covered above, however we are not going to focus on those, rather on the advantages of D.P. -------------------------------------------------------------------------------- /thesis_paper/GDP/Intro.tex: -------------------------------------------------------------------------------- 1 | \chapter{INTRODUCTION} 2 | 3 | \section{Need for Privacy} 4 | 5 | 6 | \par In our days, data is everywhere, including our smartphones, our computers our TVs, even our watches. 
Every device and nearly every website track down data, in order to provide more personalized services. This, of course, is desired by the users, as they are more likely to see relevant advertisements, and in general, have a more unique experience while they are using their devices. 7 | 8 | \par At the same time, the services that track down the data are also benefited, because of the way that science evolves: Experiments need to be made, thus the more available data in order to conduct them, the better. As an example, we might think of the medical community: when someone logs-in to the hospital, it is beneficial for the doctors to gather his data, in order to study his disease, and his potential recovery, not only for the sake of the patient, but also for the further study of the disease. 9 | 10 | \par While providing data may seem inevitable and yet beneficial for all parties, there is always a risk that this data will be used in order to compromise the user's privacy. When the information lands in the wrong hands, it can expose some characteristics of the user that he does not want to be shared. In our medical example, let's now consider a patient with a rare disease, who logs-in to a local hospital. He might consent to share his personal data (name, age etc.), but only for the doctors to use it. What will happen when the doctors give the data of the whole hospital for analysis? This patient, considering he is one of the few that has this illness, may be stigmatized, when the data analysts find out his condition. Wouldn't it be better for him, if, let's say, his name was not exposed? We will see later on, why this approach, is found to be successful, but not enough, for extreme cases.
11 | 12 | \section{Definition of the problem of privacy} 13 | \par In general, when we consider the \emph{problem of privacy}, we refer to the protection of the disclosure of sensitive information of individuals, when a collection of data about these individuals (dataset) is made publicly available. 14 | 15 | \subsection{Achieving Privacy via Anonymization} 16 | 17 | \par One of the first, and rather successful attempts for preserving privacy, was anonymization, meaning removing all personal identifiers from the dataset. This technique is further developed, using famous algorithms like k-anonymity, l-diversity etc. However, there are several problems with this approach. Firstly, they are very computational heavy, as their complexity rises up to an exponential one, making the anonymization of a large dataset very slow. Also, the anonymization does not guarantee that the user will remain private, if other datasets are not anonymized. Let's once again consider our example. Suppose our patient goes to two separate hospitals for his treatment, and one of them uses the best anonymization techniques, while the other one provides the data without any form of privacy. Our patient is on both of the datasets, thus the techniques adopted by the first hospital are now useless. This expands to the real world, because, no matter how careful you (and the services that you use) are, a single data breach is enough for you to be compromised. 18 | 19 | \par So, right now, things seem a bit pessimistic, supposing that anonymization, no matter how well performed, can not fix our problem. Another successful technique, that is used on many fields, is the addition of noise. During a later section, we are going to examine in which ways it can benefit us while trying to solve our problem. 
20 | 21 | \subsection{Achieving Privacy via Randomization} 22 | 23 | Randomization can be applied to the data of the users in two different forms: 24 | \begin{itemize} 25 | \item Apply random noise \emph{directly to the data}. This will result in altered data, which will then be processed, so that the adversary will not be able to individualize the entries in the dataset. 26 | \item Apply random noise to \emph{queries asked to the dataset}. In that case, the dataset is not directly available to the analysts. Instead, they are allowed to ask questions to the dataset, and the answers are then being randomized, and returned. 27 | \end{itemize} 28 | 29 | Both the above approaches are utilized, but the second one is widely preferred. During our analysis of data privacy, we are going to dive into both of those techniques, as well as the libraries that they are used in. 30 | 31 | \par As we can see, the randomization method looks good in theory, but we must answer several questions before implementing it, such as: 32 | 33 | \begin{itemize} 34 | \item How can we define privacy for noisy queries? 35 | \item What type of noise do we need? 36 | \item What should we do in the case of extreme amount of noise added? 37 | \end{itemize} 38 | 39 | We are going to answer those questions later on, during our next chapters. 40 | 41 | \section{Goal of this thesis} 42 | As discussed in the introduction, the most effective up-to-date method for applying privacy into a dataset, is via randomization. The method used, is called \emph{Differential Privacy}, and is based on injecting noise into the users' data. 43 | 44 | The theory behind this method includes many mathematical theorems, however, it can be easily explained. We will proceed by taking a look on those principles, and analysing the theory behind this form of data privacy. Then, we will proceed by examining some existing applications of D.P., especially some libraries that help us to apply this technique in a dataset.
Finally, we are going to create our own library in order to apply Local D.P., a form of privacy that we will discuss in the next chapter. 45 | 46 | This library will allow a user to fully anonymize a dataset, and afterwards create histogram and counting queries for this dataset. During the implementation of this library, a new protocol will be introduced, which follows the rules of D.P., and produces better results than many already-existing protocols. 47 | -------------------------------------------------------------------------------- /thesis_paper/LDP/intro.tex: -------------------------------------------------------------------------------- 1 | \chapter{A LIBRARY FOR LOCAL DIFFERENTIAL PRIVACY} 2 | 3 | \section{Introduction in Local DP} 4 | 5 | As we mentioned in previous chapters, there are two major forms of Differential Privacy. Having analyzed and tested the first one, \emph{Global D.P.}, it is now time to examine \emph{Local D.P.}, by explaining some possible protocols, as well as building our own. 6 | 7 | 8 | In Local D.P., there is a significant difference compared to Global DP: there is \emph{no trusted curator} between the data and the users, as they just want to send their data, while already being anonymized. Thus, an algorithm must perturb the data before sending it to the untrusted curator, who will then transmit it to the analysts. 9 | 10 | In order to achieve that goal, the user must randomize the value before making it public (i.e. sending it to the untrusted curator). Then, the curator which collects the data (we will reference to him as aggregator moving forward), collects the data and tries to retrieve their original values, with a goal of producing the most accurate results possible. 
11 | 12 | Thus, each LDP algorithm has the following steps: 13 | 14 | \begin{itemize} 15 | \item Each user encodes, and then perturbs the private value that he wants to make public 16 | \item Each user sends out the result of the perturbation process, with that being only the final value, as they keep the intermediate results for themselves 17 | \item The untrusted data curator collects each user's value, and implements some kind of aggregation in order to retrieve the stats that he wants from the data given to him. 18 | \end{itemize} 19 | 20 | In comparison with Global D.P., the Local model has advantages, as well as disadvantages. 21 | Its main advantages are: 22 | \begin{itemize} 23 | \item The user is not forced to trust the data curator, as only the perturbed value is reported 24 | \item Simpler implementation of the algorithms, due to the distinct steps taken by both sides. 25 | \end{itemize} 26 | 27 | while the main disadvantages are the following: 28 | 29 | \begin{itemize} 30 | \item The noise added should be larger than in the Global model, in order to satisfy the definition, thus the number of people in the dataset should be significant for accurate results to be produced. 31 | \item Because this is not always possible, many real-world applications use extremely high values of epsilon compared to what we got used to during our testing in the Global models. 32 | \end{itemize} 33 | 34 | During this Thesis, concern was raised for the main disadvantage of L.D.P., and thus\emph{ we will present a new protocol aiming to reduce the need for many users, while still covering the definition.} However, the definition for L.D.P. is quite different from the Global model's. 35 | 36 | \section{Definition of Local DP} 37 | 38 | Having a general idea of how Local D.P. functions, it is now time to give a strict definition that we are going to base our work on moving forward.
39 | 40 | We can say that an algorithm $A$ satisfies ε-Local Differential Privacy, if and only if for any input $v_1$, $v_2$, we have 41 | 42 | $$ \forall y \in Range(A):\ Pr[A(v_1) = y] \leq e^{\epsilon} * Pr[A(v_2) = y] $$ 43 | 44 | where $Range(A)$ denotes the set of all possible outputs of the algorithm $A$. 45 | 46 | As mentioned in Chapter 2, this definition can have many interpretations by different algorithms or protocols, but each one must produce a probabilistic space whose elements must satisfy the above equation. 47 | 48 | 49 | \section{Simple Application of LDP} 50 | 51 | The simplest of L.D.P. protocols is already mentioned in this Thesis, and is none other than the \emph{Randomized Response} protocol. This algorithm implements the three steps mentioned in the introduction, as the user chooses a value (Yes or No), perturbs it (by the flipping of the coins), reports the perturbed value, with the sole job of the aggregator being to collect, normalize and report the values provided. It meets the definition of L.D.P., as the fraction of a pair of probabilities in the space of possible outputs (Yes, No) is always bounded by a real number. 52 | 53 | Our goal is to now find this bound, and thus denote the level of privacy that randomized response offers. In order to do this, we are going to select the probability of the user having chosen the answer "Yes". A simple case analysis shows that $Pr[Yes | Truth] = \frac{3}{4}$, and of course $Pr[Yes | False] = \frac{1}{4}$. Thus, by the definition of L.D.P., we have 54 | 55 | \begin{align*} 56 | \frac{Pr[Yes | Truth]}{Pr[Yes | False]} = \frac{\frac{3}{4}}{\frac{1}{4}} = 3 = e^\epsilon \Longleftrightarrow \epsilon = ln(3) 57 | \end{align*} 58 | 59 | Thus, R.R. offers $ln(3)$-differential privacy to its users. This is quite a good setting, but the restriction is that the user can only report 2 values, something not suitable for modern problems and surveys.
60 | 61 | 62 | 63 | In R.R., we care about the total true answers of the users, and not the individual responses. Thus the metric we are going to use is the \emph{absolute difference of the sum of the 2 vectors: the one with the truthful answers, and the one with the reported answers}. We are going to divide this result by the number of the users, in order to get the scale of the error depending on the size of the vector that was reported. The metric is expressed by the following function: 64 | 65 | \begin{align*} 66 | \text{Error} = \frac{|\sum \text{true\_values} - \sum \text{reported\_values}| }{\text{number\ of \ users}} 67 | \end{align*} 68 | 69 | As always, during the creation of probabilistic distributions, one run is not enough, because of the extreme amount of noise that can occur. Thus, for each number of users we are going to run the R.R. protocol 100 times, and the final accuracy error will be produced by the mean value of those runs. 70 | 71 | Having implemented R.R. in Python, we can now display the accuracy error of R.R. as the number of users rises. The results of the testings are shown below in \textbf{Figure 4.1}. 72 | 73 | \begin{figure}[!htb]\centering 74 | \includegraphics[width=0.8\textwidth]{images/rr_results.png} 75 | \caption{Accuracy Error in R.R for increasing values of epsilon} 76 | \end{figure} 77 | 78 | 79 | We observe that the plot behaves as expected: the protocol produces a logarithmic curve for the accuracy error, while for a large number of users (over 3000), the error stabilizes below $0.1$. -------------------------------------------------------------------------------- /thesis_paper/LDP/other_protocols.tex: -------------------------------------------------------------------------------- 1 | 2 | 3 | \section{Existing Protocols for Local DP} 4 | 5 | Apart from R.R., several L.D.P. protocols have been implemented during the years, with many of them being widely used by companies in order to protect users' data.
One of the most famous protocols is \emph{RAPPOR}([15]), created by Google, and being currently used in the Chrome browser for the company to provide useful info to its users without compromising their privacy. Also, Apple has created ts own protocol of L.D.P., and utilize it in their products. 6 | 7 | However, we are not going to focus on those protocols moving forward, than the ones presented in [10], a paper which introduces many algorithms for L.D.P., each one with different perturbation techniques and suitable for different circumstances. 8 | 9 | During this chapter we are going to give a definition of each algorithm, implement it using Python, and compare the accuracy results produced by those protocols, just like during our testings of the G.D.P. models. Each protocol has two parts: the \emph{users} and the \emph{aggregator}. For the users we must each time define the following functions: 10 | 11 | \begin{itemize} 12 | \item $Encode()$: Encodes the true value that the user wants to report 13 | \item $Perturb()$: Perturbs the encoded value, in order to produce the random value that will be reported 14 | \end{itemize} 15 | 16 | For the aggregator we must each time define the $Aggregate()$ function, that collects the reported random values of the users, and produces the results according to the model. 17 | 18 | \subsection{Basic RAPPOR} 19 | As mentioned earlier, RAPPOR is a protocol created by Google. Its simpler form, Basic RAPPOR is used in Chrome, where it collects answers to questions such as the user's home page. The protocol's functions are the following: 20 | 21 | \textbf{Encoding:} $Encode(v) = A_0$, where $A_0$ is a d-bit vector, such that: $A_0[v] = 1$ and $A_0[i] = 0$ for every $i \neq v$. 22 | 23 | \textbf{Perturbation:} The perturbation consists of 2 steps: the permanent and the instantaneous. 
The permanent one is carried out only one time, and is the following: 24 | 25 | \begin{equation*} 26 | Pr[A_1[i] = 1] = 27 | \begin{cases} 28 | 1 - \frac{1}{2}f & \mbox{if } A_0[i]=1 \\ 29 | \frac{1}{2}f & \mbox{ otherwise} 30 | \end{cases} 31 | \end{equation*} 32 | 33 | The instantaneous step is carried out every time a user reports a value, and is defined as: 34 | 35 | \begin{equation*} 36 | Pr[A_2[i] = 1] = 37 | \begin{cases} 38 | p & \mbox{if } A_1[i]=1 \\ 39 | q & \mbox{ otherwise} 40 | \end{cases} 41 | \end{equation*} 42 | 43 | We observe from the above functions, that the user must define the $f, p $ and $q$ parameters. Google suggests that we set $f = \frac{1}{2}$ or $\frac{1}{4}$, and $p = 0.75$, thus $q = 0.25$. During our testings, those exact parameters were used. 44 | 45 | 46 | \subsection{Random Matrix Projection} 47 | In [14], a protocol with a random matrix projection is proposed, introducing an additional setup step. 48 | 49 | \textbf{Setup:} A random and uniform matrix is generated before any encoding, with it being public and drawn as: $\Phi \in \{-\frac{1}{m}, \frac{1}{m}\}^{m \times d}$, where $m$ and $d$ are user defined. In our testings, we opt to set $m = 5$ and $d = 10$. 50 | 51 | \textbf{Encoding:} When it comes down to encoding, the function used is the following: $Encoding = (r,x)$, where $r$ is uniformly randomly selected from the range of m, and $x$ is the v-th element of the r-row of the random matrix. 
52 | 53 | \textbf{Perturbation}: The perturbation function is defined as following: 54 | \begin{align*} 55 | Perturb(r,x) = (r, b\cdot c \cdot m \cdot x) 56 | \end{align*} 57 | where 58 | \begin{equation*} 59 | b = 60 | \begin{cases} 61 | 1 & \mbox{with } p = \frac{e^\epsilon}{e^\epsilon + 1} \\ 62 | -1 & \mbox{with } q = \frac{1}{e^\epsilon + 1} 63 | \end{cases} 64 | \end{equation*} 65 | and $c = \frac{e^\epsilon +1}{e^\epsilon -1}$ 66 | 67 | \textbf{Aggregation:} Given all the tuples reported by $j$ users in the form $(r, y)$, the estimation for the i-th value of the dataset, is produced by 68 | \begin{align*} 69 | \sum_{j} y^j \cdot \Phi[r^j,i] 70 | \end{align*} 71 | 72 | \subsection{Pure Protocols} 73 | 74 | The following protocols are presented in [10], and are called "pure" protocols, because of the way they aggregate the data produced by the user. For each one of them, we should define a $Support()$ function, that indicates for each value of the possible outcomes, the reported values that are supported. Thus, with the notation $\sum_{j} Support(y^j)$, we mean the sum of all the supported values of the y-th element of the dataset. 75 | 76 | Also, for a protocol to be pure, two probabilities must be defined, $p^*$ and $q^*$, where the first notes the probability that the true value is supported by an element $y$, and the second one the probability of another value is supported by the element $y$. The protocol is pure if and only if $p^* > q^*$. 77 | 78 | If a protocol is pure, the estimation of the total reported values for an element of the dataset $i$, is the following: 79 | 80 | \begin{align} 81 | \text{Estimation} = \frac{\sum_{j} 1_{support(y^j)}(i) - nq*}{p^* - q^*} 82 | \end{align} 83 | where $j$ denotes the j-th user reporting their value, and $n$ the total size of the vector of the reported values. 
84 | 85 | 86 | \subsubsection{Direct Encoding} 87 | 88 | This protocol is the natural method of extending the Randomized Response, without the limitation of 2 possible answers. 89 | 90 | \textbf{Encoding:} The protocol does not feature an encoding procedure, thus 91 | 92 | \begin{align*} 93 | Encode(v) = v 94 | \end{align*} 95 | 96 | \textbf{Perturbation:} The perturbation is based on the epsilon setting given by the user, and its function is defined as following: 97 | 98 | \begin{equation*} 99 | Pr[Perturb(x) = i] = 100 | \begin{cases} 101 | p = \frac{e^\epsilon}{e^\epsilon + d - 1} & \mbox{if } i = x \\ 102 | q = \frac{1}{e^\epsilon + d - 1} & \mbox{if } i \neq x 103 | \end{cases} 104 | \end{equation*} 105 | 106 | where $d$ the size of the dataset of the possible answers, $x$ the true value and $i$ the value selected. 107 | 108 | \textbf{Aggregation:} The protocol is pure with $p^* = p$, $q^* = q$ and $Support(i) = i$, thus the predicted results for each of the dataset's values can be calculated from the Equation 4.1. 109 | 110 | We observe that this protocol strongly depends on the size of the dataset of the possible answers, thus when the dataset size increases, the protocol becomes less accurate, due to the decreased probability of selecting the truth. Moreover, for the D.E. protocol, all the false values have the same probability to get chosen, a rather disturbing detail for a query such as a person's age. We will return to these thoughts on later sections. 111 | 112 | \subsubsection{Histogram Encoding} 113 | 114 | An other protocol presented is Histogram Encoding, where an input when having $d$ options is encoded as a $d$-length vector. 115 | 116 | \textbf{Encoding:} The encoding function is for the protocol is 117 | 118 | \begin{align*} 119 | Encoding(v) = [0, 0, \dots, 1, \dots, 0] 120 | \end{align*} 121 | 122 | where only the v-th element of the vector is equal to 1. 
123 | 124 | \textbf{Perturbation:} The result of perturbing the encoded vector, is a new vector $B'$, s.t.: 125 | 126 | \begin{align*} 127 | B'[i] = B[i] + Lap(\frac{2}{\epsilon}) 128 | \end{align*} 129 | where $Lap()$ denotes the noise drawn from the Laplace distribution, where 130 | 131 | 132 | \begin{align*} 133 | Pr[Lap(\beta) = x] = \frac{1}{2\beta}e^{\frac{-|x|}{\beta}} 134 | \end{align*} 135 | 136 | \textbf{Aggregation:} Several methods are proposed for aggregating the results created by the H.E. protocol, but as mentioned by the authors, the best one is called \emph{Thresholding with H.E.}, where a threshold value is introduced in order to decide what to keep from the reported values. The support function is altered as following: 137 | 138 | \begin{align*} 139 | Support(B) = \{v | B[v] > \theta\} 140 | \end{align*} 141 | thus, if a noisy output is grater than theta, is set to support the corresponding value. According to the authors, the optimal value for θ is in the range of $(\frac{1}{2}, 1)$. During the testings that are going to be conducted, we are going to use a threshold of $\frac{2}{3}$. 142 | 143 | 144 | Comparing this protocol to D.E., we observe that is solves the problem of the dependence of the noise drawn by the number of options to choose from. In H.E., no matter how large the domain size is, the noise solely depends on the epsilon value chosen by the user. Thus, when having a large domain size, it is clear that we should prefer the H.E. protocol over D.E. 145 | 146 | \subsection{Unary Encoding} 147 | 148 | The last protocol that is going to take part in the accuracy testings, is the Unary Encoding method, a further exploration of the Basic RAPPOR. It is a unique protocol, as the user does not set the level of privacy using epsilon, but by giving two probabilities, $p$ and $q$ and the epsilon value is computed using those two parameters. 149 | 150 | \textbf{Encoding:} Exactly like in the H.E. 
method: 151 | \begin{align*} 152 | Encoding(v) = [0, 0, \dots, 1, \dots, 0] 153 | \end{align*} 154 | where only the v-th element of the vector is equal to 1. 155 | 156 | \textbf{Perturbation:} This step is different than those that we already saw, and is carried out using the following function: 157 | 158 | 159 | \begin{equation*} 160 | Pr[B'[i] = i] = 161 | \begin{cases} 162 | p & \mbox{if } B[i] = 1 \\ 163 | q & \mbox{if } B[i] = 0 164 | \end{cases} 165 | \end{equation*} 166 | 167 | The epsilon value is decided given $p$ and $q$, and is defined as following: 168 | 169 | \begin{align*} 170 | \epsilon = ln(\frac{p\cdot(1-q)}{(1-p)\cdot q}) 171 | \end{align*} 172 | 173 | \textbf{Aggregation:} The Support function is once again altered, as in the U.E. protocol is defined as following: 174 | 175 | \begin{align*} 176 | Support(B) = \{i | B[i] = 1\} 177 | \end{align*} 178 | 179 | and of course, $p^* = p$ and $q^* = q$, in order to make the protocol pure. As for the choice of $p$ and $q$, we opt to choose $p = \frac{1}{2}$, and $q = \frac{1}{e^\epsilon + 1}$. 180 | 181 | \section{Testings} 182 | 183 | \subsection{Setup} 184 | 185 | Now that all those protocols where introduced, we are going to compare them in order to decide which is better to use when wanting to apply L.D.P. in a dataset. We are going to use a dataset that was created using random values, but corresponds to the age of a group of people. The distribution of the values of the dataset is shown in the histogram in \textbf{Figure 4.2. } 186 | 187 | \begin{figure}[!htb]\centering 188 | \includegraphics[width=1\textwidth]{images/true_answers_ldp.png} 189 | \caption{True Answers for the Dataset of LDP} 190 | \end{figure} 191 | 192 | Each user will report one of these 50 values, and the aggregator of each protocol will gather the data given, and try to re-create this histogram in the best manner possible. 
193 | 194 | \subsection{Goal} 195 | 196 | We want to decide which protocols behaves better, thus a number of different metrics will be used. The main focus of our testings will be the vectors that the aggregators provide, which we will compare with each other, as well as with the vector containing the true answers. In a similar way as our G.D.P. testings, we are going to run the protocols for different values of epsilon, and different number of users used. The second one is extremely important in L.D.P., as we mentioned earlier that many protocols struggle with a small number of input, as the noise drawn is significant. 197 | 198 | With respect to the choice of metrics, we are going to use the \emph{Manhattan Distance}, known as the $l1$-norm, as well as the \emph{Kantorovic Distance}, explained in 3.2.5.1. 199 | 200 | \subsection{Epsilon Measurements} 201 | 202 | The first comparison between the protocols will be with a changing epsilon value, in order to observe how they behave for lower and higher values of the privacy setting. During these testings, all of the users of the dataset were used (approximately 20 thousand), and each run of each protocol was carried out 10 times, just like in other testings, in order to eliminate the danger of drawing extreme values of noise. 203 | 204 | First, we are going to run all the protocols and compare them using the Manhattan Distance. 
The results are shown in \textbf{Figure 4.3.} 205 | 206 | 207 | \begin{figure}[!htb]\centering 208 | \includegraphics[width=1\textwidth]{images/epsilon_others_l1.png} 209 | \caption{Epsilon Measurements compared by Manhattan Distance} 210 | \end{figure} 211 | 212 | 213 | We gather many useful observations from the graph: 214 | 215 | \begin{itemize} 216 | \item The Random Matrix protocol does not function as expected, as \emph{its accuracy does not follow the logarithmic curve we are used to when epsilon increases.} However, for small values of ε, its results are acceptable, and some times even better than the pure protocols. 217 | \item \emph{The pure protocols behave in a similar way}, with the error stabilizing when epsilon gets higher than 2.5. 218 | \item \emph{The Direct Encoding protocol was the worst behaviour among the pure ones}, with its error being extremely high for epsilon values lower than 1. This is mainly due to the fact that when ε gets too small, the probability of telling the truth gets significantly low, thus creating a big error in accuracy. 219 | \item \emph{The optimized U.E. protocol has the best behaviour} in comparison to the other protocols tested. 220 | \end{itemize} 221 | 222 | Next up, we are going to run the same testings, but this time using the Kantorovich metric. We expect the protocols to behave even worse, because of the identity of the metric: the Kant. metric pays attention to the distance of the reported answer from the true one. The current protocols do not take into account the distance of the two answers, thus the metric will probably report a higher error. The results of the runs are shown in \textbf{Figure 4.4.} 223 | 224 | \begin{figure}[!htb]\centering 225 | \includegraphics[width=1\textwidth]{images/epsilon_others_kant.png} 226 | \caption{Epsilon Measurements compared by Kantorovic Distance} 227 | \end{figure} 228 | 229 | 230 | As we expected, the protocols produce a higher error, with the D.E. 
being the worst among them, especially for lower values of epsilon. This is a rather alarming notice, in which we will come back in Section 4.6. 231 | 232 | 233 | \subsection{Increasing number of users} 234 | 235 | 236 | The second experiment that we will conduct is the accuracy error depending on the number of users used during the survey covered by the protocol. In the definition of L.D.P. the observation of the need of lots of users was made, and it is now time that we examine it. We are going to use a \emph{fixed epsilon value}, one that our protocols behave similarly for (at least the pure ones, in which we will focus our research moving forward). Our epsilon value that we are going ot used will be fixed and equal to 1.5. 237 | 238 | We are going to run the protocols and compare them using the Manhattan Distance. Additionally, we are each time going to divide the result of the metric with the number of users participated, as the simple error is going to increase when the users increase. Hence, this division is going to give us the error depending on the size of our domain. The results are shown in \textbf{Figure 4.5.} 239 | 240 | \begin{figure}[!htb]\centering 241 | \includegraphics[width=1\textwidth]{images/nusers_others_l1.png} 242 | \caption{Increasing n. of users compared by Manhattan Distance} 243 | \end{figure} 244 | 245 | The results confirm the allegations made after explaining the definition of L.D.P.: When the number of participants in a survey is low, the error produced is very high. Every protocol has similar behaviour, as we can see that for fewer than 1000 users the relative error is even 6 times larger than for more than 1000 users. Actually, as we can see from the graph, the turning point is around 2000 users: the relative error drops and stabilizes after this number of participants. 
Another observation made is that
However, when the number of users is limited (e.g. under 1000)
24 | 25 | \subsection{Mathematical Background} 26 | 27 | In order to achieve our goal, when in the selected area, we should include in the denominator the quantity 28 | $$ 29 | |x - i| 30 | $$ 31 | where x is the true value, and the i the false one that we are looking at in order to report it, in order for the probability to depend on the distance of the reported values. 32 | 33 | When we are out of this area, the denominator will have a constant value, proportional to the boundaries of the selected area. 34 | 35 | \emph{Example}: Let's suppose that we have a domain size of 100, and our $\theta$ value is 4. All the probabilities outside the area, should be in reverse proportion of $\theta$, thus 4. 36 | 37 | Like all probabilistic algorithms though, the sum of the probabilities for all the items in the domain size, must be 1. Thus if we chose the probability to be in the shape of $\frac{a}{|x-i|}$, the partial sum of the series will not give as simple results. 38 | 39 | It is known that the series in the form of $a_n = \frac{1}{n}$ does not converge, and its partial sums can only be computed using a complex approximation formula. These characteristics make this type of series very hard to use, and so we have to think of a more handy type. 40 | 41 | 42 | A type of series that is known to have easy to compute partial sums, are the telescopic series, such as $b_n = \frac{1}{n (n+1)}$. It is known that 43 | $$ 44 | \sum_{n = 1}^{n = k} b_n = 1 - \frac{1}{k + 1} 45 | $$ 46 | something that will prove extremely useful moving forward. 
47 | 48 | So, taking into consideration the quantity $|x - i|$ and the telescopic series $b_n$, we conclude that the probability of each non-true element in our selected area will be of the shape: 49 | 50 | \begin{align} 51 | \mathbf{q = \frac{a}{|x-i|(|x-i| + 1)}} 52 | \end{align} 53 | and outside of that area: 54 | 55 | \begin{align} 56 | \mathbf{s = \frac{a}{\theta \cdot (\theta + 1)}} 57 | \end{align} 58 | 59 | The probability $p$ of selecting the true value will have to meet specific criteria that we are going to define later on. 60 | 61 | \subsection{Building the Protocol} 62 | Ww are now going to find what the alpha parameter will be, as it is not constant, but clearly it depends on the domain size and the probability $p$. In order to find it, we must keep in mind that all the probabilities of selecting an item from our domain, must add up to $1$. 63 | 64 | In order to find out the $\alpha$ value, we must solve the following equation: 65 | 66 | \begin{align*} 67 | p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 68 | \end{align*} 69 | 70 | At this point, we must note that $\alpha$ although not a constant, can be held out of the sums, because it is obviously independent from the $i$ variable, that is the variable parsing through the domain in order to retrieve the false elements' probabilities. 
Thus, we have: 71 | 72 | \begin{align*} 73 | p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 \Longleftrightarrow \\ 74 | \sum_{i = x - \theta}^{i = x + \theta} \frac{a}{|x-i|(|x-i| + 1)} + \sum_{i = 1}^{i = x - \theta -1} \frac{a}{\theta(\theta+1)} + \sum_{i = x + \theta + 1}^{i = d} \frac{a}{\theta(\theta+1)} = 1 - p \Longleftrightarrow \\ \dots \Longleftrightarrow \\ 75 | a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1} 76 | \end{align*} 77 | 78 | The proof for the mathematical equations leading to the extraction of the alpha value, can be found in the First Appendix of the Thesis. 79 | 80 | \subsection{Epsilon Requirements} 81 | 82 | The epsilon value is the most essential in these protocols, as it determines the privacy level that the protocol yields. 83 | 84 | Recalling the definition of LDP, we must follow the following rule: 85 | 86 | \begin{center} 87 | An algorithm $A$ satisfies \espilon-LDP \textit{iff} for any input $v_1$ and $v_2$, we have 88 | \begin{align*} 89 | \forall y \in Range(A): \frac{Pr[A(v_1) = y]}{Pr[A(v_2) = y]} \leq e^{\epsilon} 90 | \end{align*} 91 | 92 | \end{center} 93 | 94 | Thus, in order to determine the epsilon value for our algorithm, it must satisfy even the worst case of this equation. The fraction gets bigger, if we put on the biggest probability on the numerator, and the smallest probability of all in the denominator. 95 | 96 | The numerator must have the probability $p$, and the denominator $s$, the probability of all of the elements outside of the $\theta$ area. 97 | 98 | We want for $p$ to be the biggest probability among all, but not extremely high, in order to be able to restrict the growth of epsilon. Hence, we are going to set it as double of the probabilities of its exact neighbours. 
The 2 neighbours have $|i-x| = 1$, thus $q = \frac{a}{2}$, so we are going to set $\mathbf{p = 2 \cdot \frac{a}{2} = a}$, where $a$ is the quantity defined above, depending on the domain size, and the $\theta$ value. 99 | 100 | Now, if we set $p = a$, then the $a$ equation changes, and now our aplha parameter only depends on the domain size and the $\theta$ selected. So, we proceed as following: 101 | \begin{align*} 102 | a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1} \Longleftrightarrow{(p = a)}\\ 103 | a = \frac{\theta(\theta + 1) (1 - a)}{2\theta^2 - 2\theta + d - 1} \Longleftrightarrow \\ 104 | a (2\theta^2 - 2\theta + d - 1) = \theta(\theta + 1) - (\theta^2 + \theta)a \Longleftrightarrow\\ 105 | a (3\theta^2 - \theta + d - 1) = \theta(\theta + 1) \Longleftrightarrow \\ 106 | \mathbf{a = \frac{\theta(\theta + 1)}{3\theta^2 - \theta + d - 1}} 107 | \end{align*} 108 | 109 | Obviously, we observe the constraint that $\theta > 0$, and this is a special case of our protocol, that can be represented by the direct encoding protocol. 110 | 111 | We now have: 112 | 113 | \begin{align*} 114 | \frac{Pr[A(v_1) = y]}{Pr[A(v_2) = y]} = e^{\espilon} \Longleftrightarrow 115 | q e^{\epsilon} = p \Longleftrightarrow \\ 116 | \frac{a}{\theta(\theta + 1)} \cdot e^{\epsilon} = a \Longleftrightarrow \\ 117 | \theta(\theta + 1) = e^{\epsilon} \Longrightarrow \\ 118 | \theta^2 + \theta - e^\epsilon = 0 119 | \end{align*} 120 | 121 | If we solve the quadratic equation, and reject its one (illegal) solution, we get that: 122 | 123 | \begin{align} 124 | \theta = \lfloor \frac{\sqrt{4e^{\epsilon} + 1} - 1}{2}\rfloor 125 | \end{align} 126 | 127 | So, to conclude, in order for the protocol to function, the user must provide just the epsilon setting, from which the $\theta$ constant is computed, and so the probabilities for each item of the domain will be selected. 
128 | 129 | 130 | \subsection{Protocol Definition} 131 | 132 | We are now ready to define our protocol, by determining the 3 basic operations for an LDP protocol, the \emph{encoding}, the \emph{perturbation} and the \emph{aggregation methods}. 133 | 134 | 135 | We are going to use the following symbols: 136 | \begin{itemize} 137 | \item $D$: The protocol's domain. In this set, we have each $i$ for $1 \leq i \leq |D|$ as each item in the domain $D$ 138 | \item $a$: The quantity that we computed in the previous section 139 | \item $\theta$: The constant used in the previous section, which denotes the area around the true value that the probabilities will be higher than others. 140 | 141 | \end{itemize} 142 | 143 | \textbf{Encoding:} The encoding procedure is trivial. Just like the Wang paper, we are just going to set: 144 | \begin{align*} 145 | Encode(v) = v 146 | \end{align*} 147 | 148 | for each value $v$ of the domain. The values are going to be randomized during the perturbation step. 149 | 150 | \textbf{Perturbation:} Given the previous section, the randomization during the perturbation step is define as following: 151 | 152 | \begin{equation*} 153 | Pr[Perturb(x) = i] = 154 | \begin{cases} 155 | p = a & \mbox{if } i = x \\ 156 | q = \frac{a}{|c|(|c| + 1)} & c = \min{(\theta, |i-x|)} \mbox{, otherwise} \end{cases} 157 | \end{equation*} 158 | 159 | where $i$ is the value selected each time, and $x$ our initial selection. 160 | 161 | \textbf{Aggregation:} The aggregation step was the most tricky during the building of the protocol. A similar approach to the aggregation of pure protocols was chosen, but with a few changes. After several different tries, the optimal aggregation found, was the following: the protocol supports only the reported values corresponding to the true one, thus $Support(v) = v$. 
However, the $p^*$ quantity is the sum of all the probabilities inside the area: 162 | 163 | \begin{align*} 164 | p^* = \sum_{x\in (-\theta, \theta)} p(x) 165 | \end{align*} 166 | 167 | Finally, the $q*$ quantity is the probability of choosing an element from outside the θ area, thus equal to $s$. Hence, the estimation generated for a value $v$ of the possible answers in the domain is defined as following: 168 | 169 | \begin{align*} 170 | \text{Estimation} = \frac{\sum_{j} 1_{support(v^j)}(i) - nq^*}{p^* - q^*} 171 | \end{align*} 172 | 173 | \subsection{Extreme Cases} 174 | 175 | The downside of a complicated protocol, are of course some extreme cases for the $x, \theta \text{ and } i$ values, all of which we are going to examine in this chapter. The definition of the protocol is going to be altered, and the constraints increased, in order to support those extreme cases. 176 | 177 | \textbf{Extreme theta cases:} Of course, we have the constraint that $0 < \theta \leq d$, but what happens when its value is equal to one of the bounds? 178 | \begin{itemize} 179 | \item When $\theta \leq 0$, our protocol can not function, as this assignment will result in $a = 0$, something that is prohibited, because the probabilities will not sum to 1. In order to ensure that $\theta$ is at least 1, the user must provide at least an $\epsilon = ln(2)$. 180 | 181 | \item When $\theta = 1$, we can see that the third case in the perturbation step does not exist, thus we have only the first 2 cases. There, $p = a = \frac{2}{d+1}$, and for every other $i$, $q = \frac{a}{2}$, something that is similar with the Direct Encoding protocol. 182 | 183 | \item When $\theta = d$, which realistically can only happen when d is extremely small, then our protocol functions as designed, and has its best behavior. However, if the selection of epsilon results in such big a theta, then the user does not have extreme privacy demands. 
184 | \end{itemize} 185 | 186 | \textbf{Extreme x values:} Even when the epsilon value and the domain size are normal, in some cases we might face a certain difficulty: if $x - \theta < 0$ or $x + \theta > d$, some of the items in our area are actually outside of our domain boundaries. This results in the sum of the items in the probabilistic distribution to be below 1, something not acceptable. 187 | 188 | In order to fix it, we are going to "transfer" those probabilities inside the boundaries of our domain, while not messing with the highest probability, as this would result in problems with the definition of D.P., an thus the value of theta. 189 | 190 | The idea is to increase the other selections' probabilities by a bit, in order to fill the gap created, while leaving the maximum as initially created. We are going to boost all of the domain's items, by a portion of $\frac{m}{d - 1}$ (as we are altering $d-1$ elements), where $m$ is the sum of the probabilities of the items outside the bounds of our domain. 191 | 192 | However, we are not interested in transferring the whole $Pr[Perturb(x) = i]$, but only its difference from the item with the lowest probability, which is $s = \frac{a}{\theta(\theta+1)}$. Thus, the $m$ values are defined as following: 193 | \begin{align*} 194 | m = \sum_{i < 0 \bigcup |i-x|<\theta} Pr[Perturb(x) = i] - \frac{a}{\theta(\theta+1)} 195 | \end{align*} 196 | 197 | for the case of $x - \theta < 0$, and as 198 | 199 | \begin{align*} 200 | m = \sum_{i \geq d \bigcup |i-x|<\theta} Pr[Perturb(x) = i] - \frac{a}{\theta(\theta+1)} 201 | \end{align*} 202 | 203 | for the second one. 
204 | \\\bigskip 205 | Now, the probabilistic distribution can be altered as following: 206 | 207 | 208 | \begin{equation*} 209 | Pr[Perturb_{DS}(x) = i] = 210 | \begin{cases} 211 | p = a & \mbox{if } i = x \\ 212 | q = \frac{a}{|c|(|c| + 1)} + \frac{m}{d - 1} & c = \min{(\theta, |i-x|)} \mbox{, otherwise} \end{cases} 213 | \end{equation*} 214 | 215 | 216 | The definition of D.P. is not altered, because again in the best case we have a probability of $p = a$, and in the worst case $s = \frac{a}{\theta(\theta+1)}$. 217 | 218 | \subsection{Implementation} 219 | 220 | The most difficult part of the implementation of our protocol consists of creating the probabilistic distribution for each element of the domain, depending on the true value. This can prove to be costly, if we have a large domain or if we are in the case of the extreme x values. 221 | 222 | However, we do not need to compute every single probability, as it is clear from the definition that they are independent from the true value: they only depend on $a$ (and on the domain size in case of an extreme x value). The quantity $|i - x|$ can only take values in the range of $[1,\theta]$, thus constant for every possible true value. Moreover, for the domain values outside of the area, the probability is fixed and equal to $\frac{a}{\theta(\theta + 1)}$. Hence, the probabilities can be computed in advance, either by each user, or given to the protocol by the aggregator. 223 | 224 | The protocol has been implemented using Python, and can be found in the GitHub repository of this Thesis. Moving forward, we are going to use this implementation in order to conduct some testings to ensure the protocol's functionality. 225 | 226 | \subsection{Experiments} 227 | First up, we are going to perform the epsilon measurements that we did for the other protocols, this time excluding the Random Matrix approach, and including the D.S. protocol. 
The results, when running with the Kantorovich metric, are the following: 228 | 229 | \begin{figure}[!htb]\centering 230 | \includegraphics[width=1\textwidth]{images/epsilon_our_kant.png} 231 | \caption{Epsilon measurements for D.S. protocol compared by Kantorovich Distance} 232 | \end{figure} 233 | 234 | The first observation is the \emph{strange form of the curve of our protocol}. This can be easily explained: the theta value used to determine the area around the true value depends on epsilon, but has a floor function applied to it. Thus, for a specific range of ε, the protocol produces the same results. 235 | 236 | Another observation is that \emph{our protocol lacks efficiency for low values of epsilon}, which is natural, since small θ values do not help our idea at all. However, when epsilon gets higher than 1.5 (and thus theta rises above 2), the results are more than satisfying: \emph{our protocol has the best behaviour for epsilon in the range of $\mathbf{(2, 2.5)}$.} 237 | 238 | The real test though, is how our protocol behaves for an increasing number of users: we must check if it produces better accuracy error than the competitors. This is our next test, where we are going to set $\epsilon = \ln(20)$, in order for the conditions to be favorable for each one of the protocols. The results are shown in the \textbf{Figure 4.10}. 239 | 240 | 241 | \begin{figure}[!htb]\centering 242 | \includegraphics[width=1\textwidth]{images/users_our_kant.png} 243 | \caption{Increasing users measurements for D.S. protocol compared by Kantorovich Distance} 244 | \end{figure} 245 | 246 | For the specific epsilon setting, \emph{our protocol produces extremely good accuracy error for a small number of users}, beating by a lot the U.E. protocol. The comparisons have been made using the Kantorovich metric, the most characteristic of them all, as it takes into account the distance between the real answers and the projections, exactly what our protocol is designed to do. 
However, we are also going to perform the same tests using the Manhattan metric. The results are shown in the \textbf{Figure 4.11}. 247 | 248 | \begin{figure}[!htb]\centering 249 | \includegraphics[width=1\textwidth]{images/users_our_l1.png} 250 | \caption{Increasing users measurements for D.S. protocol compared by Manhattan Distance} 251 | \end{figure} 252 | 253 | The results of the Manhattan-driven tests are similar to the ones made with the Kant. metric. However, we observe that \emph{when the number of users rises, our protocol has worse behaviour in comparison to the other pure ones}. This happens mainly because of the other protocols, which, by the law of large numbers, have good accuracy because of the higher probability of choosing the true answer. On the other hand, in our protocol, this probability is reduced and shared with the other elements in the area covered by theta. 254 | 255 | \subsection{Conclusions} 256 | In general, \emph{the D.S. protocol succeeds when the number of the participants in a survey is extremely low}, and functions similarly to the other protocols for an increasing number of users. The downside is that it does not always take full advantage of the epsilon setting, as explained in a previous section. However, the results are more than satisfying. Hence, \emph{this is a fully functioning protocol that can be used for the application of L.D.P., especially in a situation when few people take part in the survey.} The protocol will be further tested in more extreme cases, but this is beyond the scope of this Thesis. 
257 | -------------------------------------------------------------------------------- /thesis_paper/dependencies/arial/fonts/Arial Bold Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Bold Italic.ttf -------------------------------------------------------------------------------- /thesis_paper/dependencies/arial/fonts/Arial Bold.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Bold.ttf -------------------------------------------------------------------------------- /thesis_paper/dependencies/arial/fonts/Arial Italic.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Italic.ttf -------------------------------------------------------------------------------- /thesis_paper/dependencies/arial/fonts/Arial.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial.ttf -------------------------------------------------------------------------------- /thesis_paper/dithesis.cls: -------------------------------------------------------------------------------- 1 | % dithesis.cls 2 | % 3 | % A LaTeX2e document class for writing MSc theses in English for the Department 4 | % of Informatics and Telecommunications (DI&T) of the National and Kapodistrian 5 | % University of Athens (NKUA). 6 | % 7 | % Copyright (c) 2014, 2015 Charalampos S. 
Nikolaou 8 | 2017 Ergys Dona 9 | 2020 Giorgos Katsogiannis 10 | % 11 | % This work may be distributed and/or modified under the conditions of the 12 | % LaTeX Project Public License. The latest version of this license is in 13 | % http://www.latex-project.org/lppl.txt. 14 | % 15 | % This work consists of the following files: 16 | % dithesis.cls 17 | % This document class file 18 | % 19 | % sample.tex 20 | % A sample document demonstrating the use of this class file 21 | % 22 | % emblems/ 23 | % A directory containing three colored versions of the official emblem of 24 | % the National and Kapodistrian University of Athens. 25 | % 26 | % fonts/ 27 | % A directory containing the Arial family TrueType fonts. The directory 28 | % also includes a README file with instructions on installing the fonts 29 | % in your system (assuming Unix-based). 30 | 31 | % Document Class Options 32 | % 33 | % inscr 34 | % If present, then a page with the inscription provided via the command 35 | % \inscription{} is printed. 36 | % 37 | % ack 38 | % If present, then a page with the acknowledgements provided via the 39 | % command \acksEn{} is printed. 40 | % 41 | % preface 42 | % If present, then a preface page is included just before the 43 | % introductory chapter. The content of this page is controlled via the 44 | % command \preface{}. 45 | % 46 | % lop 47 | % If present, then a page with the list of publications will be included. 48 | % 49 | 50 | % Document Versions 51 | % 52 | % Version 1.3, 2020/10/02 53 | % Added some corrections to match the department's official formatting requirements. 54 | % Changes were applied to the acronyms table, the bibliography size, the table of 55 | % contents, the captions of figures and tables and the abstract page's keywords. 56 | % 57 | % Version 1.2, 2017/02/26 58 | % Refactored the document to allow it to be used to write BSc Theses. 
59 | % 60 | % Version 1.1, 2015/11/06 61 | % Updated document class so as to be compatible with the template file 62 | % regarding the appearance of headers/footers and appendix names. Require 63 | % also package `longtable' for being used for abbreviations/acronyms. 64 | % 65 | % Version 1.0, 2014/12/08 66 | % Initial attempt at creating the first class file for LaTeX Theses in 67 | % DI&T, NKUA. 68 | 69 | % Known Issues 70 | % 71 | % * Chapter titles are not appearing in capital letters in the ToC, although 72 | % in other places this has been taken care of for good. Thus, one is enforced 73 | % to type chapter titles in capital letters, so as to be compatible with the 74 | % requirements of the department. 75 | % 76 | 77 | \NeedsTeXFormat{LaTeX2e} 78 | \ProvidesClass{dithesis}[2017/02/26 v1.2 LaTeX class for BSc Theses 79 | submitted to the Department of Informatics and Telecommunications of the 80 | National and Kapodistrian University of Athens] 81 | 82 | % 83 | % Declare and initialize global ifs 84 | % (set by passing options to this document class) 85 | % 86 | 87 | % controls the inclusion of the inscription page 88 | \newif\ifinscriptionpage 89 | \inscriptionpagefalse 90 | 91 | % controls inclusion of acknowledgement page 92 | \newif\ifackpage 93 | \ackpagefalse 94 | 95 | % controls inclusion of the list of publications page 96 | \newif\ifloppage 97 | \loppagefalse 98 | 99 | % controls inclusion of the preface page 100 | \newif\ifprefacepage 101 | \prefacepagefalse 102 | 103 | % 104 | % Declare and initialize internal ifs (not set by document class options) 105 | % 106 | \newif\ifenglishfrontpage 107 | \englishfrontpagefalse 108 | 109 | \newif\ifenglishcommitteepage 110 | \englishcommitteepagefalse 111 | 112 | % marks whether the appendices have started 113 | % (used later for setting an appropriate naming scheme for the appendix title) 114 | \newif\ifappendixstarted 115 | \appendixstartedfalse 116 | 117 | % controls whether we want to print dual-page or 
single page 118 | % when enabled, the page numbering will occur in the centre of the page 119 | % otherwise, it will occur at the right 120 | \newif\ifdualpage 121 | \dualpagefalse 122 | 123 | % 124 | % Check passed options 125 | % 126 | \DeclareOption{inscr}{ 127 | \inscriptionpagetrue 128 | } 129 | \DeclareOption{ack}{ 130 | \ackpagetrue 131 | } 132 | \DeclareOption{preface}{ 133 | \prefacepagetrue 134 | } 135 | \DeclareOption{lop}{ 136 | \loppagetrue 137 | } 138 | \DeclareOption{dualpage}{ 139 | \dualpagetrue 140 | } 141 | 142 | 143 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{book}} 144 | \ProcessOptions\relax 145 | 146 | \LoadClass[12pt,oneside]{book} 147 | 148 | % 149 | % Required packages and configuration 150 | % 151 | \RequirePackage[a4paper, 152 | top=2cm,bottom=2cm,bindingoffset=0.5cm,left=2cm,right=2cm, 153 | headsep=0.5cm,footskip=0.75cm, 154 | ]{geometry} % add option showframe=true for debugging 155 | \RequirePackage{fancyhdr} 156 | \RequirePackage[final]{graphicx} 157 | \RequirePackage{sectsty} % needed for formatting chapter headings 158 | \RequirePackage{textcase} % needed for forcing capital letters (in chapters) 159 | \RequirePackage[resetlabels]{multibib} 160 | %\usepackage[notlof,notlot,nottoc,notlop]{tocbibind} 161 | %\RequirePackage{tocloft} % needed for making chapter titles upper case 162 | \RequirePackage{etoolbox} % needed for patchcmd 163 | \RequirePackage{tabularx} % needed for tabular* environment 164 | \RequirePackage{longtable} % needed for splitting big tables across pages 165 | \RequirePackage{xifthen} 166 | \RequirePackage[caption=false]{subfig} 167 | \RequirePackage[% 168 | font={footnotesize, bf}, 169 | justification=centering, 170 | labelsep=colon, 171 | figureposition=bottom, 172 | tableposition=top]{caption} % captions 173 | 174 | %(e.g., abbreviations) 175 | %\RequirePackage[toc,page,titletoc]{appendix} % needed for configuring 176 | %%appendices 177 | 178 | % 179 | % Add Greek support 180 | % 181 | 
\usepackage{fontspec} 182 | \usepackage{xunicode} 183 | \usepackage{xltxtra} 184 | \usepackage{polyglossia} 185 | \newfontfamily\greekfont[Script=Greek]{Arial} 186 | \newfontfamily\greekfontsf[Script=Greek]{Arial} 187 | \newfontfamily\greekfonttt[Script=Greek]{Arial} 188 | \setdefaultlanguage[variant=uk]{english} 189 | \setotherlanguage{greek} 190 | 191 | % 192 | % Set font family Arial 193 | % 194 | \setmainfont[Ligatures={Common,TeX}]{Arial} % the passed option was needed for 195 | % correctly rendering double quotes 196 | \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase} 197 | \setsansfont[Mapping=tex-text,Scale=MatchLowercase]{Arial} 198 | \setmathsf{CMU Bright} 199 | \setmathrm{CMU Bright} 200 | 201 | %\ifglossaryInPreamble 202 | %\RequirePackage[toc,section=chapter,numberedsection=false,nonumberlist]{glossaries} 203 | %\else 204 | % \RequirePackage[toc,section,numberedsection=nolabel,nonumberlist]{glossaries} 205 | %\fi 206 | %\makeglossaries 207 | 208 | \providecommand{\grnumm}[1]{#1\textsuperscript{ο}} 209 | \providecommand{\grnumf}[1]{#1\textsuperscript{η}} 210 | 211 | % 212 | % Commands for first, middle, and last name (greek and english versions) 213 | % 214 | \providecommand{\authorFirstGr}[1]{\gdef\@authorFirstGr{#1}} 215 | \providecommand{\authorFirstAbrGr}[1]{\gdef\@authorFirstAbrGr{#1}} 216 | \providecommand{\authorMiddleGr}[1]{\gdef\@authorMiddleGr{#1}} 217 | \providecommand{\authorLastGr}[1]{\gdef\@authorLastGr{#1}} 218 | \providecommand{\authorFirstEn}[1]{\gdef\@authorFirstEn{#1}} 219 | \providecommand{\authorFirstAbrEn}[1]{\gdef\@authorFirstAbrEn{#1}} 220 | \providecommand{\authorMiddleEn}[1]{\gdef\@authorMiddleEn{#1}} 221 | \providecommand{\authorLastEn}[1]{\gdef\@authorLastEn{#1}} 222 | \providecommand{\authorSn}[1]{\gdef\@authorSn{#1}} 223 | 224 | % 225 | % Commands for the title of the thesis (greek and english versions) 226 | % 227 | \providecommand{\titleGr}[1]{\gdef\@titleGr{#1}} 228 | 
\providecommand{\titleEn}[1]{\gdef\@titleEn{#1}} 229 | 230 | % 231 | % Commands for the date of the writing of the thesis (Month followed by Year) 232 | % [provide greek and english versions] 233 | % 234 | \providecommand{\dateGr}[1]{\gdef\@dateGr{#1}} 235 | \providecommand{\dateEn}[1]{\gdef\@dateEn{#1}} 236 | 237 | 238 | % 239 | % Commands for supervisor(s) 240 | % If more than one supervisor is declared, the class takes care to show 241 | % "Supervisors" instead of "Supervisor" and 242 | % "Επιβλέποντες" instead of "Επιβλέπων". 243 | % 244 | \gdef\@supervisorLabelSuffixGr{ων} 245 | \gdef\@supervisorLabelSuffixEn{} 246 | 247 | \providecommand{\supervisorGr}[2]{% 248 | \ifthenelse{\isundefined{\@thesupervisorslistGr}}{% 249 | \def\@thesupervisorslistGr{\textbf{#1,} #2} 250 | }{% 251 | \g@addto@macro\@thesupervisorslistGr{\\&\textbf{#1,} #2} 252 | \def\@supervisorLabelSuffixGr{οντες} 253 | } 254 | } 255 | 256 | \providecommand{\supervisorEn}[2]{% 257 | \ifthenelse{\isundefined{\@thesupervisorslistEn}}{% 258 | \def\@thesupervisorslistEn{\textbf{#1,} #2} 259 | }{% 260 | \g@addto@macro\@thesupervisorslistEn{\\&\textbf{#1,} #2} 261 | \def\@supervisorLabelSuffixEn{s} 262 | } 263 | } 264 | 265 | % 266 | % Commands for setting up abstract (greek and english versions), ack (english 267 | % version), synopsis (greek version only), incsriptionEn (english version 268 | %only), and prefaceEn (english version only) pages 269 | % 270 | \providecommand{\abstractEn}[1]{\gdef\@abstractEn{#1}} 271 | \providecommand{\abstractGr}[1]{\gdef\@abstractGr{#1}} 272 | \providecommand{\synopsisGr}[1]{\gdef\@synopsisGr{#1}} 273 | \providecommand{\acksEn}[1]{\gdef\@acksEn{#1}} 274 | \providecommand{\inscriptionEn}[1]{\gdef\@inscriptionEn{% 275 | \vspace*{0.2\textheight} 276 | \begin{flushright} 277 | #1 278 | \end{flushright} 279 | }} 280 | \providecommand{\prefaceEn}[1]{\gdef\@prefaceEn{#1}} 281 | 282 | % 283 | % Commands for Subject Area and Keywords fields 284 | % (greek and english versions) 
285 | % 286 | \providecommand{\subjectAreaGr}[1]{\gdef\@subjectAreaGr{#1}} 287 | \providecommand{\subjectAreaEn}[1]{\gdef\@subjectAreaEn{#1}} 288 | \providecommand{\keywordsGr}[1]{\gdef\@keywordsGr{#1}} 289 | \providecommand{\keywordsEn}[1]{\gdef\@keywordsEn{#1}} 290 | 291 | % 292 | % Command for specifiying the file containing the publications 293 | % in the context of the PhD - NOT USED FOR BSC THESES. 294 | % 295 | \providecommand{\lopfile}[1]{\newcites{lop}{List of 296 | publications}\gdef\@lopfileinternal{#1}} 297 | 298 | % 299 | % Command for starting a new unumbered chapter (ToC'ed) for 300 | % holding the table of abbreviations and acronyms. The table 301 | % should be set by the user. 302 | % 303 | \providecommand{\abbreviations}{ 304 | \chapter*{ABBREVIATIONS - ACRONYMS} 305 | \addcontentsline{toc}{chapter}{ABBREVIATIONS - ACRONYMS} 306 | } 307 | 308 | % 309 | % Front page (greek and english versions) 310 | % 311 | \def\@frontpage{ 312 | \begin{center} 313 | \includegraphics[scale=0.85]{emblems/athena-black} 314 | \end{center} 315 | \begin{minipage}[t]{\textwidth} 316 | \begin{center} 317 | {\large \bfseries 318 | \ifenglishfrontpage 319 | NATIONAL AND KAPODISTRIAN UNIVERSITY OF ATHENS 320 | \else 321 | ΕΘΝΙΚΟ ΚΑΙ ΚΑΠΟΔΙΣΤΡΙΑΚΟ ΠΑΝΕΠΙΣΤΗΜΙΟ ΑΘΗΝΩΝ 322 | \fi 323 | } 324 | \linebreak 325 | 326 | {\bfseries 327 | \ifenglishfrontpage 328 | SCHOOL OF SCIENCES \\ DEPARTMENT OF INFORMATICS AND TELECOMMUNICATIONS 329 | \else 330 | ΣΧΟΛΗ ΘΕΤΙΚΩΝ ΕΠΙΣΤΗΜΩΝ \\ ΤΜΗΜΑ ΠΛΗΡΟΦΟΡΙΚΗΣ ΚΑΙ ΤΗΛΕΠΙΚΟΙΝΩΝΙΩΝ 331 | \fi 332 | } 333 | \linebreak\linebreak\linebreak\linebreak\linebreak 334 | 335 | {\bfseries 336 | \ifenglishfrontpage 337 | BSc THESIS 338 | \else 339 | ΠΤΥΧΙΑΚΗ ΕΡΓΑΣΙΑ 340 | \fi} 341 | \linebreak\linebreak 342 | 343 | {\Large \bfseries 344 | \ifenglishfrontpage 345 | \@titleEn 346 | \else 347 | \@titleGr 348 | \fi} 349 | \linebreak\linebreak\linebreak 350 | 351 | {\bfseries 352 | \ifenglishfrontpage 353 | \@authorFirstEn{} \@authorMiddleEn{} \@authorLastEn 354 | 
\else 355 | \@authorFirstGr{} \@authorMiddleGr{} \@authorLastGr 356 | \fi} 357 | \linebreak\linebreak\linebreak 358 | \linebreak\linebreak\linebreak 359 | \linebreak\linebreak 360 | \end{center} 361 | { 362 | \ifenglishfrontpage 363 | \begin{tabular}{l l} 364 | \textbf{Supervisor\@supervisorLabelSuffixEn:} & \@thesupervisorslistEn \\ 365 | \end{tabular} 366 | \else 367 | \begin{tabular}{l l} 368 | \textbf{Επιβλέπ\@supervisorLabelSuffixGr:} & \@thesupervisorslistGr \\ 369 | \end{tabular} 370 | \fi 371 | } 372 | \end{minipage} 373 | \vfill 374 | \begin{center} 375 | {\bfseries 376 | \ifenglishfrontpage 377 | ATHENS 378 | \else 379 | ΑΘΗΝΑ 380 | \fi} 381 | \\\vspace*{4mm} 382 | {\bfseries 383 | \ifenglishfrontpage 384 | \@dateEn 385 | \else 386 | \@dateGr 387 | \fi} 388 | \end{center} 389 | \clearpage 390 | } 391 | 392 | % 393 | % Committee page (greek and english versions) 394 | % 395 | \def\@committeepage{ 396 | \begin{center} 397 | \vspace*{1.5cm} 398 | {\bfseries 399 | \ifenglishcommitteepage 400 | BSc THESIS 401 | \else 402 | ΠΤΥΧΙΑΚΗ ΕΡΓΑΣΙΑ 403 | \fi 404 | } 405 | \linebreak 406 | 407 | { 408 | \ifenglishcommitteepage 409 | \@titleEn 410 | \else 411 | \@titleGr 412 | \fi 413 | } 414 | \linebreak\linebreak\linebreak 415 | 416 | {\bfseries 417 | \ifenglishcommitteepage 418 | \@authorFirstEn{} \@authorMiddleEn{} \@authorLastEn 419 | \else 420 | \@authorFirstGr{} \@authorMiddleGr{} \@authorLastGr 421 | \fi 422 | \vspace{2mm} 423 | } 424 | \linebreak 425 | { 426 | \ifenglishcommitteepage 427 | {\bfseries S.N.:} \@authorSn 428 | \else 429 | {\bfseries Α.Μ.:} \@authorSn 430 | \fi 431 | } 432 | \linebreak\linebreak\linebreak 433 | \linebreak\linebreak\linebreak 434 | \linebreak\linebreak\linebreak 435 | \linebreak\linebreak\linebreak 436 | \linebreak\linebreak 437 | \end{center} 438 | 439 | { 440 | \ifenglishcommitteepage 441 | \begin{tabular}{l l} 442 | \textbf{SUPERVISOR\MakeUppercase{\@supervisorLabelSuffixEn:}} & \@thesupervisorslistEn \\ 443 | \end{tabular} 444 | 
% \@supervisorlabelEn 445 | \else 446 | \begin{tabular}{l l} 447 | \textbf{ΕΠΙΒΛΕΠ\MakeUppercase{\@supervisorLabelSuffixGr:}} & \@thesupervisorslistGr \\ 448 | \end{tabular} 449 | % \@supervisorlabelGr 450 | \fi 451 | } 452 | \clearpage 453 | } 454 | 455 | % 456 | % Abstract (english version) 457 | % 458 | \def\@absEn{ 459 | \chapter*{Abstract} 460 | \thispagestyle{empty} 461 | \@abstractEn{} 462 | \vfill 463 | 464 | \begin{tabularx}{\textwidth}{l X} 465 | {\bfseries SUBJECT AREA:} & \@subjectAreaEn 466 | \end{tabularx} 467 | 468 | \begin{tabularx}{\textwidth}{l X} 469 | {\bfseries KEYWORDS:} & \@keywordsEn 470 | \end{tabularx} 471 | } 472 | 473 | % 474 | % Abstract (greek version) 475 | % 476 | \def\@absGr{ 477 | \chapter*{ΠΕΡΙΛΗΨΗ} 478 | \thispagestyle{empty} 479 | \begin{greek} 480 | \@abstractGr{} 481 | \end{greek} 482 | \vfill 483 | 484 | \begin{tabularx}{\textwidth}{l X} 485 | {\bfseries ΘΕΜΑΤΙΚΗ ΠΕΡΙΟΧΗ:} & \@subjectAreaGr 486 | \end{tabularx} 487 | 488 | \begin{tabularx}{\textwidth}{l X} 489 | {\bfseries ΛΕΞΕΙΣ ΚΛΕΙΔΙΑ:} & \@keywordsGr 490 | \end{tabularx} 491 | } 492 | 493 | % 494 | % Greek synopsis of the thesis 495 | % 496 | \def\@synopsis{ 497 | \chapter*{ΣΥΝΟΠΤΙΚΗ ΠΑΡΟΥΣΙΑΣΗ ΤΗΣ ΔΙΔΑΚΤΟΡΙΚΗΣ ΔΙΑΤΡΙΒΗΣ} 498 | \thispagestyle{empty} 499 | \@synopsisGr{} 500 | \clearpage 501 | } 502 | 503 | % 504 | % Inscription page (optional) 505 | % 506 | % Enable it by passing option ``inscr'' to the document class. 507 | % 508 | \def\@inscr{ 509 | \cleardoublepage 510 | \@inscriptionEn{} 511 | \clearpage 512 | } 513 | 514 | % 515 | % Acknowledgements (optional) 516 | % 517 | % Enable it by passing option ``ack'' to the document class. 518 | % 519 | \def\@acks{ 520 | \chapter*{Acknowledgements} 521 | \thispagestyle{empty} 522 | \@acksEn{} 523 | \clearpage 524 | } 525 | 526 | % 527 | % Preface page (optional) 528 | % 529 | % Enable it by passing option ``preface'' to the document class. 
530 | % 531 | % Check the following page that gives a definition among the uses of Prologue, 532 | % Foreword, and Preface: 533 | %http://iankingsleyauthor.blogspot.de/2013/03/defined-prologue-epilogue-foreword.html. 534 | % 535 | \def\@preface{ 536 | \chapter*{Preface} 537 | \thispagestyle{empty} 538 | \@prefaceEn{} 539 | \clearpage 540 | } 541 | 542 | % 543 | % List of publications (optional) 544 | % 545 | % Enable it by passing option ``lop'' to the document class. 546 | % 547 | \def\@listofpubs{ 548 | \bibliographystylelop{unsrt} 549 | \bibliographylop{\@lopfileinternal} 550 | \thispagestyle{empty} 551 | \clearpage 552 | } 553 | 554 | 555 | %% 556 | %% Format of Table of Contents, List of Figures and List of Tables. 557 | %% (Adapted from dithesis.cls made by Yannis Mantzouratos ) 558 | %% 559 | \RequirePackage{titletoc} 560 | \RequirePackage[subfigure,titles]{tocloft} 561 | 562 | % Table of Contents 563 | \let\oldtableofcontents\tableofcontents 564 | \DeclareRobustCommand{\tableofcontents}{ 565 | \newpage 566 | \pagestyle{empty} 567 | \oldtableofcontents 568 | \clearpage 569 | \pagestyle{fancy} 570 | } 571 | 572 | % place dots between each section and the respective page number 573 | \renewcommand{\cftsecleader}{\bfseries\cftdotfill{\cftdotsep}} 574 | % place dots between each subsection and the respective page number 575 | \renewcommand{\cftsubsecleader}{\bfseries\cftdotfill{\cftdotsep}} 576 | 577 | % place a dot after each chapter number 578 | \renewcommand{\cftchapaftersnum}{.} 579 | 580 | % section entries should be 10 pt and bold 581 | \renewcommand{\cftsecfont}{\fontsize{10pt}{12pt}\selectfont\bfseries} 582 | % subsection entries should be 10 pt 583 | \renewcommand{\cftsubsecfont}{\fontsize{10pt}{12pt}\selectfont} 584 | % subsubsection entries should be 10 pt 585 | \renewcommand{\cftsubsubsecfont}{\fontsize{10pt}{12pt}\selectfont} 586 | 587 | % sections should not be indented, whereas subsections and subsubsections should 588 | 
\setlength\cftsubsubsecindent\cftsubsecindent 589 | \setlength\cftsubsecindent\cftsecindent 590 | \setlength{\cftsecindent}{0pt} 591 | 592 | % sections should have the same vertical space with chapters 593 | \setlength\cftbeforesecskip\cftbeforechapskip 594 | 595 | % space between chapters and numbering in case of double digit numbers 596 | \newlength{\tocbinnumwidth} 597 | \settowidth{\tocbinnumwidth}{9} 598 | \addtolength{\cftchapnumwidth}{\tocbinnumwidth} 599 | 600 | 601 | % 602 | % Configure the frontmatter page 603 | % 604 | \renewcommand{\frontmatter}{ 605 | \pagestyle{empty} 606 | % frontpage - english version 607 | \englishfrontpagetrue 608 | \@frontpage 609 | 610 | % frontpage - greek version 611 | \englishfrontpagefalse 612 | \@frontpage 613 | 614 | % examination committe page - english version 615 | \englishcommitteepagetrue 616 | \@committeepage 617 | 618 | % examination committe page - greek version 619 | \englishcommitteepagefalse 620 | \@committeepage 621 | 622 | % abstract (english and greek version) 623 | \@absEn 624 | \@absGr 625 | 626 | % inscription 627 | \ifinscriptionpage 628 | \@inscr 629 | \fi 630 | 631 | % acknowledgements (english version only, OPTIONAL) 632 | \ifackpage 633 | \@acks 634 | \fi 635 | 636 | % table of contents 637 | % add TOC as bookmark 638 | \addtocontents{toc}{\protect{\pdfbookmark[0]{CONTENTS}{toc}}} 639 | \tableofcontents 640 | 641 | % list of figures 642 | %\listoffigures 643 | %\thispagestyle{empty} 644 | \cleardoublepage 645 | \begingroup 646 | \makeatletter 647 | \let\ps@plain\ps@empty 648 | \makeatother 649 | 650 | \pagestyle{empty} 651 | \listoffigures 652 | \cleardoublepage 653 | \endgroup 654 | 655 | % list of tables 656 | \listoftables 657 | \thispagestyle{empty} 658 | 659 | % preface page (english version only, OPTIONAL) 660 | \ifprefacepage 661 | \@preface 662 | \fi 663 | 664 | % Prepare things for the rest of the document 665 | \clearpage 666 | \thispagestyle{empty} 667 | } 668 | 669 | % 670 | % Configure 
the mainmatter and backmatter pages 671 | % 672 | \renewcommand{\mainmatter}{ 673 | % just set the style of the pages to be fancy 674 | \pagestyle{fancy} 675 | } 676 | \renewcommand{\backmatter}{ 677 | % just set the style of the pages to be fancy 678 | \pagestyle{fancy} 679 | 680 | % and also, make the empty page style permanent, since 681 | % when a \chapter command is invoked, the command 682 | % \thispagestyle{plain} is invoked 683 | % (thanks to http://tex.stackexchange.com/a/19741) 684 | %\patchcmd{\chapter}{plain}{empty}{}{} 685 | } 686 | 687 | % 688 | % Configure chapter printing and alignment (centered) 689 | % 690 | \def\@makechapterhead#1{% 691 | %\vspace*{50\p@}% 692 | {\parindent \z@ \raggedright \normalfont 693 | \interlinepenalty\@M 694 | \ifappendixstarted 695 | \large \centering \bfseries APPENDIX \thechapter. \MakeTextUppercase{#1} 696 | \else 697 | \large \centering \bfseries \thechapter. \MakeTextUppercase{#1} 698 | \fi 699 | \par\nobreak 700 | \vskip 20\p@ 701 | }} 702 | 703 | \chaptertitlefont{\vspace*{-2.38cm} \large \centering \MakeTextUppercase} 704 | \sectionfont{\normalsize} 705 | \subsectionfont{\normalsize} 706 | \subsubsectionfont{\normalsize} 707 | \paragraphfont{\normalsize} 708 | \subparagraphfont{\normalsize} 709 | 710 | % use capital letters for chapters in the ToC as well 711 | % TODO 712 | 713 | % 714 | % Rename Bibliography to References 715 | % 716 | \renewcommand{\bibname}{REFERENCES} 717 | 718 | % 719 | % Set ToC depth 720 | % 721 | \setcounter{tocdepth}{4} 722 | \setcounter{secnumdepth}{3} 723 | 724 | % 725 | % No indentation for paragraphs 726 | % 727 | \setlength{\parindent}{0pt} 728 | 729 | % 730 | % Paragraph spacing should be 731 | % 732 | \setlength{\parskip}{6pt} 733 | 734 | % 735 | % Line spacing should be one line 736 | % 737 | \linespread{1} 738 | 739 | % 740 | % Configure header and footer 741 | % 742 | %\pagestyle{fancy} 743 | \fancyhf{} 744 | \fancyhead[LEO]{{\scriptsize \@titleEn{}}} 745 | \ifdualpage 746 | 
\fancyfoot[C]{{\fontsize{10pt}{10pt}\selectfont \thepage}} 747 | \else 748 | \fancyfoot[R]{{\fontsize{10pt}{10pt}\selectfont \thepage}} 749 | \fi 750 | \fancyfoot[LE,LO]{{\scriptsize \@authorFirstAbrEn{} \@authorLastEn{}}} 751 | 752 | % needed redefinition, because commands like \chapter call 753 | % \thispagestyle{plain} automatically 754 | \fancypagestyle{plain} { 755 | \fancyhf{} 756 | \fancyhead[LO]{{\scriptsize \@titleEn{}}} 757 | \ifdualpage 758 | \fancyfoot[C]{{\fontsize{10pt}{10pt}\selectfont \thepage}} 759 | \else 760 | \fancyfoot[R]{{\fontsize{10pt}{10pt}\selectfont \thepage}} 761 | \fi 762 | \fancyfoot[LE,LO]{{\scriptsize \@authorFirstAbrEn{} \@authorLastEn{}}} 763 | } 764 | \renewcommand{\headrulewidth}{0pt} 765 | 766 | % 767 | % Configure the first page of ToC to have an empty pagestyle 768 | % (thanks to http://tex.stackexchange.com/a/5789) 769 | % 770 | \AtBeginDocument{\addtocontents{toc}{\protect\thispagestyle{empty}}} 771 | -------------------------------------------------------------------------------- /thesis_paper/emblems/athena-black.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-black.pdf -------------------------------------------------------------------------------- /thesis_paper/emblems/athena-blue.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-blue.pdf -------------------------------------------------------------------------------- /thesis_paper/emblems/athena-red.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-red.pdf 
-------------------------------------------------------------------------------- /thesis_paper/emblems/athena_black.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena_black.jpeg -------------------------------------------------------------------------------- /thesis_paper/images/D.E. Idea.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/D.E. Idea.png -------------------------------------------------------------------------------- /thesis_paper/images/Figure_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/Figure_1.png -------------------------------------------------------------------------------- /thesis_paper/images/Our Idea.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/Our Idea.png -------------------------------------------------------------------------------- /thesis_paper/images/arx_accuracy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/arx_accuracy.png -------------------------------------------------------------------------------- /thesis_paper/images/arx_tool.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/arx_tool.png -------------------------------------------------------------------------------- /thesis_paper/images/emd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/emd.png -------------------------------------------------------------------------------- /thesis_paper/images/epsilon_intro_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_intro_graph.png -------------------------------------------------------------------------------- /thesis_paper/images/epsilon_measurements.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_measurements.png -------------------------------------------------------------------------------- /thesis_paper/images/epsilon_others_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_others_kant.png -------------------------------------------------------------------------------- /thesis_paper/images/epsilon_others_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_others_l1.png -------------------------------------------------------------------------------- 
/thesis_paper/images/epsilon_our_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_our_kant.png -------------------------------------------------------------------------------- /thesis_paper/images/hierarchies.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hierarchies.png -------------------------------------------------------------------------------- /thesis_paper/images/hist_metrics_euclidean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hist_metrics_euclidean.png -------------------------------------------------------------------------------- /thesis_paper/images/hist_metrics_kantorovich.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hist_metrics_kantorovich.png -------------------------------------------------------------------------------- /thesis_paper/images/increasing_ds_size.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/increasing_ds_size.png -------------------------------------------------------------------------------- /thesis_paper/images/local_vs_global.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/local_vs_global.png -------------------------------------------------------------------------------- /thesis_paper/images/nusers_others_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/nusers_others_kant.png -------------------------------------------------------------------------------- /thesis_paper/images/nusers_others_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/nusers_others_l1.png -------------------------------------------------------------------------------- /thesis_paper/images/rr_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/rr_results.png -------------------------------------------------------------------------------- /thesis_paper/images/simple_hists.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/simple_hists.png -------------------------------------------------------------------------------- /thesis_paper/images/true_answers_ldp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/true_answers_ldp.png -------------------------------------------------------------------------------- 
/thesis_paper/images/users_our_kant.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/users_our_kant.png -------------------------------------------------------------------------------- /thesis_paper/images/users_our_l1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/users_our_l1.png -------------------------------------------------------------------------------- /thesis_paper/latexmkrc: -------------------------------------------------------------------------------- 1 | $out_dir = "./build"; 2 | $pdflatex = "xelatex %O %S"; 3 | $pdf_mode = 1; $postscript_mode = $dvi_mode = 0; 4 | 5 | -------------------------------------------------------------------------------- /thesis_paper/outerjoin10.mf: -------------------------------------------------------------------------------- 1 | % This file was largely modified from lasy.mf and lasy10.mf 2 | % by Scott Pakin and Anonymous. 
3 | 4 | if unknown cmbase: input cmbase fi 5 | def generate suffix t= enddef; 6 | input cmsy10; 7 | 8 | font_identifier:="OUTERJOIN"; 9 | font_coding_scheme:="Outer-join symbols"; 10 | 11 | % Here we steal a bit from mathsy 12 | 13 | mode_setup; font_setup; 14 | autorounding:=0; 15 | 16 | font_slant slant; font_x_height x_height#; 17 | font_quad 18u# if not monospace:+4letter_fit# fi; 18 | slant:=mono_charic#:=0; % the remaining characters will not be slanted 19 | currenttransform:=identity yscaled aspect_ratio scaled granularity; 20 | 21 | cmchar "Left outer-join operator"; 22 | beginchar(oct"061",18u#,v_center(7u#)); 23 | italcorr math_axis#*slant; 24 | adjust_fit(0,0); pickup rule.nib; 25 | numeric a; a=round(1.1*math_axis); 26 | x1=x4=good.x 6u; x2=x3=x1+10u; 27 | x5=x6=x1-5u; 28 | y1=y2=y5=good.y(.5(cap_height-2a)); y3=y4=y6=y1+2a; 29 | draw z1--z3--z2--z4--cycle; 30 | draw z1--z5; 31 | draw z4--z6; 32 | labels(1,2,3,4,5,6); endchar; 33 | 34 | cmchar "Right outer-join operator"; 35 | beginchar(oct"062",18u#,v_center(7u#)); 36 | italcorr math_axis#*slant; 37 | adjust_fit(0,0); pickup rule.nib; 38 | numeric a; a=round(1.1*math_axis); 39 | x1=x4=good.x 1.5u; x2=x3=x1+10u; 40 | x5=x6=x2+5u; 41 | y1=y2=y5=good.y(.5(cap_height-2a)); y3=y4=y6=y1+2a; 42 | draw z1--z3--z2--z4--cycle; 43 | draw z2--z5; 44 | draw z3--z6; 45 | labels(1,2,3,4,5,6); endchar; 46 | 47 | cmchar "Full outer-join operator"; 48 | beginchar(oct"063",23u#,v_center(7u#)); 49 | italcorr math_axis#*slant; 50 | adjust_fit(0,0); pickup rule.nib; 51 | numeric a; a=round(1.1*math_axis); 52 | x1=x4=good.x 6u; x2=x3=x1+10u; 53 | x5=x6=x1-5u; x7=x8=x2+5u; 54 | y1=y2=y5=y7=good.y(.5(cap_height-2a)); y3=y4=y6=y8=y1+2a; 55 | draw z1--z3--z2--z4--cycle; 56 | draw z1--z5; 57 | draw z4--z6; 58 | draw z2--z7; 59 | draw z3--z8; 60 | labels(1,2,3,4,5,6,7,8); endchar; 61 | 62 | cmchar "CSQL intersection operator"; 63 | beginchar(oct"064",23u#,v_center(7u#)); 64 | italcorr math_axis#*slant; 65 | adjust_fit(0,0); pickup 
rule.nib; 66 | numeric a; a=round(1.1*math_axis); 67 | x1=x4=good.x 6u; x2=x3=x1+10u; 68 | x5=x6=x1-5u; x7=x8=x2+5u; 69 | y1=y2=y5=y7=good.y(.5(cap_height-2a)); y3=y4=y6=y8=y1+2a; 70 | draw z1--z3--z2--z4--cycle; 71 | draw z1--z5; 72 | draw z4--z6; 73 | draw z2--z7; 74 | draw z3--z8; 75 | labels(1,2,3,4,5,6,7,8); endchar; 76 | 77 | 78 | bye. -------------------------------------------------------------------------------- /thesis_paper/outerjoin10.pk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/outerjoin10.pk -------------------------------------------------------------------------------- /thesis_paper/outerjoin10.tfm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/outerjoin10.tfm -------------------------------------------------------------------------------- /thesis_paper/references.bib: -------------------------------------------------------------------------------- 1 | @book{dwork, 2 | author = {Dwork, C., & Roth, A.}, 3 | title = {The algorithmic foundations of differential privacy.}, 4 | booktitle = {now Publishers Inc.}, 5 | year = {2014}, 6 | url = {https://www.tau.ac.il/~saharon/BigData2018/privacybook.pdf} 7 | } 8 | 9 | 10 | 11 | 12 | % @inproceedings{dwork:glove, 13 | % author = {Dwork, C., McSherry, F., Nissim, K., & Smith, A.}, 14 | % title = {Calibrating Noise to Sensitivity in Private Data Analysis.}, 15 | % booktitle = {Theory of Cryptography,}, 16 | % year = {2006}, 17 | % pages = {265–-284}, 18 | % url = {https://link.springer.com/content/pdf/10.1007/11681878_14.pdf}, 19 | % } 20 | 21 | 22 | % @inproceedings{holohan:glove, 23 | % author = {Holohan, N., Braghin, S., Mac Aonghusa, P., & Levacher, K.}, 24 | % title = {Diffprivlib: The IBM 
Differential Privacy Library.}, 25 | % year = {2019}, 26 | % url = {https://arxiv.org/pdf/1907.02444.pdf}, 27 | % } 28 | 29 | % @inproceedings{li:glove, 30 | % author = {Li, N., Qardaji, W., & Su, D.}, 31 | % title = {On sampling, anonymization, and differential privacy or,k-anonymization meets differential privacy.}, 32 | % booktitle = {Proceedings of the 7th ACM Symposium on Information, Computer and Communications Security - ASIACCS '12}, 33 | % year = {2012}, 34 | % url = {https://arxiv.org/pdf/1101.2604.pdf}, 35 | % } 36 | -------------------------------------------------------------------------------- /thesis_paper/refs.tex: -------------------------------------------------------------------------------- 1 | \chapter{BIBLIOGRAPHY} 2 | 3 | asddfds -------------------------------------------------------------------------------- /thesis_paper/thesis.tex: -------------------------------------------------------------------------------- 1 | % demo.tex 2 | % 3 | % Enjoy, evolve, and share! 4 | % 5 | % Compile it as follows: 6 | % latexmk 7 | % 8 | % Check file `dithesis.cls' for other configuration options. 
9 | % 10 | \documentclass[inscr]{dithesis} 11 | 12 | %\usepackage{graphicx} 13 | 14 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 15 | %%%%%%%%%%%%%%%%%%%% User-specific package inclusions %%%%%%%%%%%%%%%%%%%%%%%%% 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 17 | \usepackage{booktabs} 18 | \usepackage{hyperref} 19 | \usepackage{lipsum} 20 | \usepackage{enumerate} 21 | \usepackage{amsmath} 22 | \usepackage{amssymb} 23 | \usepackage{listings} 24 | 25 | \hypersetup{ 26 | unicode=true, % non-Latin characters in bookmarks 27 | pdffitwindow=true, % page fit to window when opened 28 | pdfnewwindow=true, % links in new window 29 | pdfkeywords={}, % list of keywords 30 | colorlinks=true, % false: boxed links; true: colored links 31 | linkcolor=black, % color of internal links 32 | citecolor=black, % color of links to bibliography 33 | filecolor=black, % color of file links 34 | urlcolor=black, % color of external links 35 | pdftitle={}, % title 36 | pdfauthor={}, % author 37 | pdfsubject={} % subject of the document 38 | } 39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 40 | %%%%%%%%%%%%%%%%%%%% User-specific package inclusions %%%%%%%%%%%%%%%%%%%%%%%%% 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 42 | 43 | 44 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 45 | %%%%%%%%%%%%%%%%%%%%%% User-specific configuration %%%%%%%%%%%%%%%%%%%%%%%%%%%% 46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 48 | %%%%%%%%%%%%%%%%%%%%%% User-specific configuration %%%%%%%%%%%%%%%%%%%%%%%%%%%% 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 50 | 51 | 52 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 53 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%% Required Metadata %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 55 | % 56 | % First name, last name 57 | % 58 | \authorFirstGr{Νικόλαος} 59 | \authorFirstAbrGr{Ν.} % abbreviation of first name 60 | \authorMiddleGr{Γ.} % abbreviation of father's first name 61 | \authorLastGr{Γαλάνης} 62 | \authorFirstEn{Nikolaos} 63 | \authorFirstAbrEn{N.} 64 | \authorMiddleEn{G.} 65 | \authorLastEn{Galanis} 66 | \authorSn{1115201700019} 67 | 68 | % 69 | % The title of the thesis 70 | % 71 | \titleEn{Protection of Sensitive Data: Creating, Analyzing and Testing Protocols of Differential Privacy} 72 | \titleGr{Προστασία Ευαίσθητων Δεδομένων: Δημιουργία, Ανάλυση και Δοκιμή Πρωτοκόλλων Διαφορικής Ιδιωτικότητας} 73 | 74 | % 75 | % Month followed by Year 76 | % 77 | \dateGr{ΙΟΥΛΙΟΣ 2021} 78 | \dateEn{JULY 2021} 79 | 80 | % 81 | % Supervisor(s) info 82 | % 83 | \supervisorGr{Κωνσταντίνος Χατζηκοκολάκης}{Αναπληρωτής Καθηγητής} 84 | \supervisorEn{Konstantinos Chatzikokolakis}{Associate Professor} 85 | 86 | % 87 | % Abstract, synopsis, inscription, ack, and preface pages. 88 | % 89 | % \setlength\parindent{24pt} 90 | 91 | \abstractEn{ 92 | \par The problem of preserving privacy while extracting information during data analysis, has been an everlasting one. Specifically, during the big-data era, user details can be easily compromised by a malicious handler, something considered both as a security, and as a privacy issue. 93 | \par With that being the case, there is a simple solution of denying the access to user data, thus making the mining of useful information about a plethora of subjects impossible. On the other hand, a successful mechanism would be for the data to be flowing without control, something that would be beneficial for the advance of sciences (because of the huge amount of information that would be available), but a significant compromise of the individuals' privacy.
\par 94 | However, none of these two solutions are applicable and helpful for solving our problem. The answer is finding a balance, that would benefit both parties: the users and their privacy, as well as the researchers. The optimal fix to the subject, is Differential Privacy, which is actually a promise, made by the data handler to the user, that they will not be affected, by allowing their data to be used in any analysis, no matter what other studies/databases/info resources are available. Meanwhile, the output data statistics should be accurate enough for any researcher to extract useful information from them.\par 95 | This is a promise that in the first sight, seems rather hard to be achieved. Despite that, during this thesis, we will look closely into the theory which makes this form of privacy possible, by the addition of random noise to the user data. Differential Privacy is based on probabilistic theories, well known from the $20^{th}$ century, however, it is a rather new technique, which has yet to be fully implemented in a handy way for all data-miners to use. 96 | \par The goal of this thesis, is to examine and compare previously created mechanisms for D.P., while also creating our own mechanism, that serves to the purpose of achieving Local D.P., a form of Differential Privacy that is nowadays widely used in machine learning algorithms, aiming to protect the individuals that send their personal data for analysis. We will do so, by creating a library that is easy to use, and applies to all the rules of data privacy, and then extract conclusions from its use. 97 | 98 | During this thesis, a lot of testings will be made, in order to convince for the usability and the efficiency of Differential Privacy. 99 | } 100 | \abstractGr{ 101 | Το πρόβλημα της διατήρησης της ιδιωτικότητας κατά την ανάλυση δεδομένων, υφίσταται για πολύ καιρό. 
Συγκεκριμένα, στην εποχή των big-data, λεπτομέρειες των χρηστών μπορούν εύκολα να παραβιαστούν από κακόβουλους χειριστές των δεδομένων, γεγονός που θεωρείται ζήτημα τόσο όσον αφορά την ασφάλεια, όσο και την προστασία της ιδιωτικότητας του ατόμου.\par 102 | Mε την υπάρχουσα κατάσταση, υπάρχει η απλή λύση της άρνησης της πρόσβασης σε δεδομένα χρηστών, στον βωμό της προστασίας τους, κάτι που καθιστά την εξαγωγή συμπερασμάτων για ποικίλα θέματα αδύνατη. Από την άλλη, ένας επιτυχημένος μηχανισμός θα ήταν η ελεύθερη διακίνηση των δεδομένων, χωρίς φιλτράρισμά τους, γεγονός που θα ήταν ωφέλιμο για την πρόοδο των επιστημών (λόγω του μεγάλου όγκου δεδομένων που θα ήταν διαθέσιμος), αλλά μία μεγάλη παραβίαση της ιδιωτικότητας των ατόμων. 103 | \par 104 | Ωστόσο, καμία από τις δύο αυτές λύσεις δεν μπορεί να εφαρμοστεί και να μας βοηθήσει στην επίλυση τους προβλήματός μας. Η απάντηση είναι η εύρεση μίας ισορροπίας, η οποία ευνοεί και τα δύο μέρη: τους χρήστες και την ιδιωτικότητά τους, όπως και τους ερευνητές. Η βέλτιστη επίλυση του θέματος, είναι η Διαφορική Ιδιωτικότητα, που στην πραγματικότητα πρόκειται για μία υπόσχεση από τον χειριστή των δεδομένων προς τον χρήστη, πως ο χρήστης δεν θα επηρεαστεί αν επιτρέψει τη χρήση των δεδομένων του σε κάποια ανάλυση, χωρίς περιορισμούς όπως η παράλληλη ύπαρξη άλλων μελετών/βάσεων δεδομένων/πληροφοριών που υπάρχουν για αυτόν. Παράλληλα, τα στατιστικά του αποτελέσματος της ανάλυσης, πρέπει να είναι αρκετά ακριβή, ώστε ο ερευνητής να μπορεί να εξάγει χρήσιμη πληροφορία από αυτά. \par 105 | Η υπόσχεση αυτή, δείχνει δύσκολα υλοποιήσιμη με την πρώτη ματιά. Παρόλα αυτά, σε αυτήν την πτυχιακή εργασία, θα ερευνήσουμε με λεπτομέρεια τη θεωρία που καθιστά εφικτή αυτή τη μορφή ιδιωτικότητας, με την προσθήκη τυχαίου θορύβου στα δεδομένα. 
Η Διαφορική Ιδιωτικότητα βασίζεται σε πιθανοτικές κατανομές, γνωστές ήδη από τον $20^o$ αιώνα, όμως παραμένει μία νέα τεχνική, η οποία δεν έχει πλήρως υλοποιηθεί με τρόπο τέτοιον ώστε να μπορεί να χρησιμοποιηθεί από πολλούς ανθρώπους που είναι υπεύθυνοι για την εξαγωγή δεδομένων. 106 | \par Σκοπός αυτής της πτυχιακής εργασίας, είναι να μελετήσουμε και να συγκρίνουμε ήδη υλοποιημένους μηχανισμούς πανω στην Δ.Ι., ενώ παράλληλα θα δημιουργήσουμε τον δικό μας μηχανισμό, ο οποίος χρησιμοποιείται για τους σκοπούς της Τοπικής Διαφορικής Ιδιωτικότητας που συναντάται την σήμερον ημέραν σε αλγορίθμους μηχανικής μάθησης, με στόχο να προστατέψει τα δεδομένα που αποστέλλουν για εκμάθηση οι χρήστες. Θα το κατορθώσουμε αυτό δημιουργώντας μία προγραμματιστική βιβλιοθήκη η οποία είναι εύκολη στη χρήση, ικανοποιώνατας παράλληλα τους κανόνες της προστασίας δεδομένων, και τέλος θα εξάγουμε συμπεράσματα από τη χρήση της βιβλιοθήκης αυτής. 107 | 108 | Κατά την διάρκεια αυτής της εργασίας, θα πραγματοποιηθούν πολλές μετρήσεις, με στόχο να γίνει πειστική η χρησιμότητα και η αποτελεσματικότητα της Διαφορικής Ιδιωτικότητας. 109 | 110 | } 111 | 112 | \inscriptionEn{\emph{}} 113 | 114 | % 115 | % Subject area and keywords 116 | % 117 | \subjectAreaGr{Προστασία και Ιδιωτικότητα Δεδομένων} 118 | \subjectAreaEn{Data Privacy} 119 | \keywordsGr{Διαφορική Ιδιωτικότητα, Ασφάλεια, Δεδομένα Χρηστών, Προστασία Δεδομένων, Θόρυβος σε Δεδομένα, Συλλογή Δεδομένων} 120 | \keywordsEn{Differential Privacy, Security, User data, Data Privacy, Noisy Data, Aggregation of Data} 121 | 122 | % 123 | % Set the .bib file containing your paper publications (leave the extension out) 124 | % 125 | % This is optional, but it should be specified when option 'lop' is passed to 126 | % the document class. 127 | % 128 | % Then, inside the document environment, you may use the command '\nocitelop' to 129 | % site your papers, as you would traditionally do with the commands '\cite' or 130 | % '\nocite'. 
131 | % 132 | % The papers are printed in reverse chronological order. 133 | % 134 | %\lopfile{mypapers/pubs} 135 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 136 | %%%%%%%%%%%%%%%%%%%%%%%%%%% Required Metadata %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 137 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 138 | 139 | \begin{document} 140 | 141 | \frontmatter 142 | 143 | \mainmatter 144 | 145 | \include{GDP/Intro} 146 | 147 | \include{GDP/DP_definition} 148 | 149 | \include{GDP/IBM} 150 | 151 | \include{GDP/ARX} 152 | 153 | \include{LDP/intro} 154 | 155 | \include{LDP/other_protocols} 156 | 157 | \include{LDP/our_protocol} 158 | 159 | \chapter{CONCLUSIONS AND FUTURE WORK} 160 | 161 | The goal of this thesis was to analyze the importance of protecting sensitive data, and doing so in an efficient way. After its elaboration, it is clear that Differential Privacy is a secure and efficient way for data anonymization. Having two forms, the Global and the Local, it can cover many different scenarios, including Machine Learning applications. 162 | 163 | Differential Privacy is the future of Data Protection and Anonymization, as its results can not be compromised, due to the random noise that the algorithms introduce. Unlike previous methods, such as k-anonymity, there is not yet an attack that can reduce the privacy created by D.P. algorithms, which makes this technique ideal. 164 | 165 | Despite the use of random noise the data is still useful, as the mathematical ideas behind the aggregation were built with the mindset of eliminating this noise using data normalization. 166 | 167 | Having explored many different applications, algorithms and protocols we can safely say that when it comes down to Global D.P., IBM's diffprivlib is a state of the art library that produces extremely good results. Its use is quite simple as a Python API is provided, thus can be safely added to any numerical dataset. 
168 | 169 | When someone wants to apply L.D.P. during a survey, the pure protocols analyzed and tested are suitable for high efficiency combined with good protections of the members. With simple algorithms, they do not require a trusted curator in order to perform, hence users can perturb their data, and then safely report it. However, when the number of users is small, \emph{the Distance Sensitive Protocol created for the needs of this Thesis is the best option}, as the other protocols produce extreme noise in order to maintain the privacy levels. On the contrary, the D.S. protocol takes into account the distance between the true value and the one being reported when creating its probabilistic space, thus lowering the error produced. 170 | 171 | Our plans for future work are centered around the D.S. protocol. We would like to perfect its aggregation method, as it may produce satisfying results, but with a different approach it can maybe become even better. Moreover, we would like to perform more demanding experiments for extreme cases of dataset sizes, domain sizes and theta values. 172 | 173 | Finally, similar testings like the ones introduced in this Thesis can be performed in other D.P. libraries, as the accuracy measurements is a good indicator if someone wants to rank those libraries. 174 | 175 | 176 | \backmatter 177 | 178 | % abbreviations table 179 | \abbreviations 180 | \begin{center} 181 | \renewcommand{\arraystretch}{1.5} 182 | \begin{longtable}{| l | @{\qquad} l |} 183 | \hline 184 | EMD & Earth Mover's Distance \\ 185 | \hline 186 | QIF & Quantitative Information Flow \\ 187 | \hline 188 | DP & Differential Privacy\\ 189 | \hline 190 | Kant. 
& Kantorovich \\ 191 | \hline 192 | LDP & Local Differential Privacy\\ 193 | \hline 194 | GDP & Global Differential Privacy\\ 195 | \hline 196 | CSV & Comma Separated Values\\ 197 | \hline 198 | GUI & Graphical User Interface\\ 199 | \hline 200 | RR & Randomized Response\\ 201 | \hline 202 | DE & Direct Encoding\\ 203 | \hline 204 | HE & Histogram Encoding\\ 205 | \hline 206 | UE & Unary Encoding\\ 207 | \hline 208 | DS & Distance Sensitive\\ 209 | \hline 210 | \end{longtable} 211 | \end{center} 212 | 213 | % appendix 214 | \begin{appendix} 215 | % mark the beginning of the appendix 216 | \appendixstartedtrue 217 | 218 | % add appendix line to ToC 219 | \phantomsection 220 | \addcontentsline{toc}{chapter}{APPENDICES} 221 | 222 | \chapter{MATHEMATICAL PROOF OF THE D.S. PROTOCOL} 223 | During this Appendix, a mathematical explanation for the $a$ variable in the D.S. protocol will be given. 224 | 225 | In order to find out the $\alpha$ value, we must solve the following equation: 226 | 227 | \begin{align*} 228 | p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 229 | \end{align*} 230 | 231 | At this point, we must note that $\alpha$, although not a constant, can be held out of the sums, because it is independent of the $i$ variable, that is, the variable ranging over the domain in order to retrieve the false elements' probabilities.
Thus, we have: 232 | 233 | \begin{align*} 234 | p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 \Longleftrightarrow \\ 235 | \sum_{i = x - \theta}^{i = x + \theta} \frac{a}{|x-i|(|x-i| + 1)} + \sum_{i = 1}^{i = x - \theta -1} \frac{a}{\theta(\theta+1)} + \sum_{i = x + \theta + 1}^{i = d} \frac{a}{\theta(\theta+1)} = 1 - p \Longleftrightarrow \\ 236 | \sum_{i = x - \theta}^{i = x - 1} \frac{a}{(x - i)(x - i + 1)} + \sum_{i = x + 1}^{i = x + \theta} \frac{a}{(i - x)(i - x + 1)} +\\+ \frac{a}{\theta(\theta+1)} \cdot (x - \theta - 1 + d - x - \theta) = 1 - p \Longleftrightarrow \\ 237 | \end{align*} 238 | 239 | For the first sum, we set $u = x - i$ and for the second one $u = i - x$, and we have: 240 | 241 | \begin{align*} 242 | \sum_{u = 1}^{u = \theta} \frac{a}{u (u + 1)} + \sum_{u = 1}^{u = \theta} \frac{a}{u (u + 1)} + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\ 243 | 2 \cdot a (1 - \frac{1}{\theta + 1}) + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\ 244 | 2 \cdot a \frac{\theta}{\theta + 1} + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\ 245 | \frac{a}{\theta + 1}(2 \theta + \frac{d - 2\theta - 1}{\theta}) = 1 - p \Longleftrightarrow \\ 246 | \frac{a}{\theta + 1}(\frac{2\theta^2 - 2\theta + d - 1}{\theta}) = 1 - p \Longleftrightarrow \\ 247 | \mathbf{a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1}} 248 | \end{align*} 249 | 250 | \chapter{REPOSITORY OF THE THESIS} 251 | The implementation of all the testings, the libraries and the protocols can be found in the GitHub repository of this thesis, in the link: \url{https://github.com/nikosgalanis/bsc-thesis}. 252 | 253 | In the directory \textit{ibm\_lib\_work}, all the notebooks with the measurements made for the IBM library are included. 
254 | 255 | In the directory \textit{ARX\_work}, the Java code for the measurements in ARX is included, as well as the datasets and the hierarchies used in order to test the protocol. 256 | 257 | In the directory \textit{LDP}, the LDP library is implemented, using the already-known protocols from the Wang et al. paper. Additionally, our own protocol created for the needs of this Thesis is included, alongside a Python file responsible for creating all the testings that were carried out. 258 | 259 | Finally, in the directory \textit{papers\_used}, all of the papers referenced in this Thesis can be found. 260 | 261 | 262 | More information about the repository and its contents can be found in the README file included. 263 | 264 | \end{appendix} 265 | 266 | % % manually include the bibliography 267 | \bibliographystyle{plain} 268 | 269 | 270 | {\huge \bibliography{references}} 271 | 272 | % % include it also in ToC (do sth on your own) 273 | \addcontentsline{toc}{chapter}{REFERENCES} 274 | 275 | 276 | \fontsize{10}{14}\selectfont 277 | \setmainfont{Arial} 278 | 279 | [1]\hspace{1cm}Dwork, C., \& Roth, A. (2014). The algorithmic foundations of differential privacy. now Publishers Inc. 280 | 281 | [2]\hspace{1cm}Dwork, C., McSherry, F., Nissim, K., \& Smith, A. (2006). Calibrating Noise to Sensitivity in Private Data Analysis. Theory of Cryptography, 265–284. 282 | 283 | [3]\hspace{1cm}Holohan, N., Braghin, S., Mac Aonghusa, P., \& Levacher, K. (2019, July 4). Diffprivlib: The IBM Differential Privacy Library. arXiv.org. 284 | 285 | [4]\hspace{1cm}Li, N., Qardaji, W., \& Su, D. (2012). On sampling, anonymization, and differential privacy or, k-anonymization meets differential privacy. Proceedings of the 7th ACM Symposium on Information, Computer and Communications Security - ASIACCS '12. 286 | 287 | [5]\hspace{1cm}Bild, R., Kuhn, K. A., \& Prasser, F. (2018). SafePub: A Truthful Data Anonymization Algorithm With Strong Privacy Guarantees.
Proceedings on Privacy Enhancing Technologies, 2018(1), 67–87. 288 | 289 | [6]\hspace{1cm}Christofides, T. C. (2003). A generalized randomized response technique. Metrika, 57(2), 195–200. 290 | 291 | [7]\hspace{1cm}Chatzikokolakis, K., Palamidessi, C., \& Stronati, M. (2015). Location privacy via geo-indistinguishability. ACM SIGLOG News, 2(3), 46–69. 292 | 293 | [8]\hspace{1cm}Jain, P., Gyanchandani, M., \& Khare, N. (2018). Differential privacy: its technological prescriptive using big data. Journal of Big Data, 5(1). 294 | 295 | [9]\hspace{1cm}Bebensee, B. (2019, July 27). Local Differential Privacy: a tutorial. arXiv.org. 296 | 297 | [10]\hspace{1cm}Tianhao Wang, Jeremiah Blocki, Ninghui Li, and Somesh Jha. 2017. Locally differentially private protocols for frequency estimation. In Proceedings of the 26th USENIX Conference on Security Symposium (SEC'17). USENIX Association, USA, 729–745. 298 | 299 | [11]\hspace{1cm}Chatzikokolakis, K., Andrés, M. E., Bordenabe, N. E., \& Palamidessi, C. (2013). Broadening the Scope of Differential Privacy Using Metrics. Privacy Enhancing Technologies, 82–102. 300 | 301 | [12]\hspace{1cm}Chamikara, M.A.P. \& Bertok, P. \& Khalil, Ibrahim \& Liu, D. \& Camtepe, Seyit. (2019). Local Differential Privacy for Deep Learning. 302 | 303 | [13]\hspace{1cm}Chatzikokolakis, K., Fernandes, N., \& Palamidessi, C. (2020). Refinement Orders for Quantitative Information Flow and Differential Privacy. Journal of Cybersecurity and Privacy, 1(1), 40–77. 304 | 305 | [14]\hspace{1cm}Bassily, R., \& Smith, A. (2015). Local, Private, Efficient Protocols for Succinct Histograms. Proceedings of the Forty-Seventh Annual ACM Symposium on Theory of Computing. 306 | 307 | [15]\hspace{1cm}Erlingsson, Ú., Pihur, V., \& Korolova, A. (2014). RAPPOR. Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security. 308 | 309 | [16]\hspace{1cm} ``Surgery Charges Across the U.S.'', \url{https://data.world/dmikebishop/surgery-charges-across-the-u-s}.
310 | 311 | [17]\hspace{1cm} ``NBA Salaries'', \url{https://data.world/datadavis/nba-salaries} 312 | 313 | \end{document} 314 | --------------------------------------------------------------------------------