├── .gitignore
├── ARX_work
    ├── arx_accuracy.png
    ├── basic_queries.ipynb
    ├── dp_example.java
    └── intro.ipynb
├── LDP
    ├── Direct_Encoding.py
    ├── Distance_Sensitive_Encoding.py
    ├── Histogram_Encoding.py
    ├── LDP_Frequency_Estimator.py
    ├── RAPPOR.py
    ├── Random_Matrix.py
    ├── Unary_Encoding.py
    ├── ldp.py
    ├── local_DP_intro.ipynb
    ├── random_response.py
    └── res.csv
├── LICENSE
├── README.md
├── ibm_lib_work
    ├── basic_queries.ipynb
    ├── epsilon_measurements.ipynb
    ├── epsilon_measurements.png
    ├── hist_metrics_euclidean.png
    ├── hist_metrics_kantorovich.png
    ├── histograms.ipynb
    ├── increasing_ds_size.png
    └── simple_hists.png
├── images
    ├── D.E. Idea.png
    ├── Figure_1.png
    ├── Our Idea.png
    ├── arx_accuracy.png
    ├── arx_tool.png
    ├── emd.png
    ├── epsilon_intro_graph.png
    ├── epsilon_measurements.png
    ├── epsilon_others_kant.png
    ├── epsilon_others_l1.png
    ├── epsilon_our_kant.png
    ├── hierarchies.png
    ├── hist_metrics_euclidean.png
    ├── hist_metrics_kantorovich.png
    ├── increasing_ds_size.png
    ├── local_vs_global.png
    ├── nusers_others_kant.png
    ├── nusers_others_l1.png
    ├── rr_results.png
    ├── simple_hists.png
    ├── true_answers_ldp.png
    ├── users_our_kant.png
    └── users_our_l1.png
├── papers_used
    ├── 10_sec17-wang-tianhao.pdf
    ├── 11_dpmetrics.pdf
    ├── 12_LATENT_localDP.pdf
    ├── 13_jcp-01-00004.pdf
    ├── 14_Random_Matrix.pdf
    ├── 15_RAPPOR.pdf
    ├── 1_privacybook.pdf
    ├── 2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf
    ├── 3_ibm_diffprivlib.pdf
    ├── 4_k_anon+dp.pdf
    ├── 5_arx_dp.pdf
    ├── 6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf
    ├── 7_chatziko_locationguard_paper_1.pdf
    ├── 8_Differential_privacy_its_technological_prescriptiv.pdf
    └── 9_localDP_Tutorial.pdf
└── thesis_paper
    ├── GDP
        ├── ARX.tex
        ├── DP_definition.tex
        ├── IBM.tex
        └── Intro.tex
    ├── LDP
        ├── intro.tex
        ├── other_protocols.tex
        └── our_protocol.tex
    ├── dependencies
        └── arial
        │   └── fonts
        │       ├── Arial Bold Italic.ttf
        │       ├── Arial Bold.ttf
        │       ├── Arial Italic.ttf
        │       └── Arial.ttf
    ├── dithesis.cls
    ├── emblems
        ├── athena-black.pdf
        ├── athena-blue.pdf
        ├── athena-red.pdf
        └── athena_black.jpeg
    ├── images
        ├── D.E. Idea.png
        ├── Figure_1.png
        ├── Our Idea.png
        ├── arx_accuracy.png
        ├── arx_tool.png
        ├── emd.png
        ├── epsilon_intro_graph.png
        ├── epsilon_measurements.png
        ├── epsilon_others_kant.png
        ├── epsilon_others_l1.png
        ├── epsilon_our_kant.png
        ├── hierarchies.png
        ├── hist_metrics_euclidean.png
        ├── hist_metrics_kantorovich.png
        ├── increasing_ds_size.png
        ├── local_vs_global.png
        ├── nusers_others_kant.png
        ├── nusers_others_l1.png
        ├── rr_results.png
        ├── simple_hists.png
        ├── true_answers_ldp.png
        ├── users_our_kant.png
        └── users_our_l1.png
    ├── latexmkrc
    ├── outerjoin10.mf
    ├── outerjoin10.pk
    ├── outerjoin10.tfm
    ├── references.bib
    ├── refs.tex
    └── thesis.tex


/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 | .ipynb_checkpoints
3 | .vscode/settings.json
4 | __pycache__
5 | !LDP/res.csv


--------------------------------------------------------------------------------
/ARX_work/arx_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ARX_work/arx_accuracy.png


--------------------------------------------------------------------------------
/ARX_work/dp_example.java:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * ARX: Powerful Data Anonymization
  3 |  * Copyright 2012 - 2020 Fabian Prasser and contributors
  4 |  * 
  5 |  * Licensed under the Apache License, Version 2.0 (the "License");
  6 |  * you may not use this file except in compliance with the License.
  7 |  * You may obtain a copy of the License at
  8 |  * 
  9 |  * http://www.apache.org/licenses/LICENSE-2.0
 10 |  * 
 11 |  * Unless required by applicable law or agreed to in writing, software
 12 |  * distributed under the License is distributed on an "AS IS" BASIS,
 13 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 14 |  * See the License for the specific language governing permissions and
 15 |  * limitations under the License.
 16 |  */
 17 | 
 18 | package org.deidentifier.arx.examples;
 19 | import java.util.Iterator;
 20 | 
 21 | import java.io.IOException;
 22 | 
 23 | import java.nio.charset.Charset;
 24 | 
 25 | 
 26 | import org.deidentifier.arx.ARXAnonymizer;
 27 | import org.deidentifier.arx.ARXConfiguration;
 28 | import org.deidentifier.arx.ARXResult;
 29 | import org.deidentifier.arx.AttributeType;
 30 | import org.deidentifier.arx.AttributeType.Hierarchy;
 31 | import org.deidentifier.arx.Data;
 32 | import org.deidentifier.arx.Data.DefaultData;
 33 | import org.deidentifier.arx.DataGeneralizationScheme;
 34 | import org.deidentifier.arx.DataGeneralizationScheme.GeneralizationDegree;
 35 | import org.deidentifier.arx.DataHandle;
 36 | import org.deidentifier.arx.DataType;
 37 | import org.deidentifier.arx.criteria.EDDifferentialPrivacy;
 38 | import java.nio.charset.StandardCharsets;
 39 | 
 40 | /**
 41 |  * This class implements an example on how to use the API by directly providing
 42 |  * the input datasets.
 43 |  *
 44 |  * @author Fabian Prasser
 45 |  * @author Florian Kohlmayer
 46 |  */
 47 | public class dp_example extends Example {
 48 | 
 49 |     /**
 50 |      * Entry point.
 51 |      * 
 52 |      * @param args
 53 |      *            the arguments
 54 |      * @throws IOException 
 55 |      */
 56 | 	
 57 | 	protected static double run_query(ARXResult data, int targetColumn) {
 58 | 		// iterator that we are going to use to access the data
 59 | 		final Iterator<String[]> itHandle = data.getOutput().iterator();
 60 | 		
 61 | 		// result of the query
 62 | 		double result = 0d;
 63 | 		// length of the dataset
 64 | 		int totalRecords = 0;
 65 | 		
 66 | 		// get the first element of the column, thus the name of it, and ignore it
 67 | 		String[] name = itHandle.next();
 68 | 		if (name.length <= targetColumn) {
 69 | 			System.out.println("Target column out of bounds\n");
 70 | 			return 0d;
 71 | 		}
 72 | 
 73 | 		// iterate through all the values in the dataset
 74 | 		while(itHandle.hasNext()) {
 75 | 			String[] next = itHandle.next();
 76 | 			// check that our target position is legal
 77 | 			String string = next[targetColumn];
 78 | 			if (!string.equals("*")) {
 79 | 				result += Integer.parseInt(string);		
 80 | 				totalRecords++;				
 81 | 			}
 82 | 		}
 83 | //    	System.out.println(result);
 84 | 		return result / totalRecords;
 85 | 	}
 86 | 	
 87 |     public static void main(String[] args) throws IOException {
 88 | 
 89 |         // import the data
 90 |         Data data = Data.create("data/nba/new_salaries.csv", StandardCharsets.UTF_8, ',');
 91 | 
 92 |         // set the hierarchies for each column 
 93 |         Hierarchy position = Hierarchy.create("data/nba/position_hierarchy.csv", StandardCharsets.UTF_8, ',');
 94 |         Hierarchy year = Hierarchy.create("data/nba/year_hierarchy.csv", StandardCharsets.UTF_8, ',');
 95 |         Hierarchy age = Hierarchy.create("data/nba/age_hierarchy.csv", StandardCharsets.UTF_8, ',');
 96 |         Hierarchy team = Hierarchy.create("data/nba/team_hierarchy.csv", StandardCharsets.UTF_8, ';');
 97 |         Hierarchy salary = Hierarchy.create("data/nba/salaries_hierarchy.csv", StandardCharsets.UTF_8, ',');
 98 | 
 99 |         
100 |         data.getDefinition().setAttributeType("Pos", AttributeType.INSENSITIVE_ATTRIBUTE);
101 |         data.getDefinition().setAttributeType("Year", AttributeType.INSENSITIVE_ATTRIBUTE);
102 |         data.getDefinition().setAttributeType("Age", AttributeType.INSENSITIVE_ATTRIBUTE);
103 |         data.getDefinition().setAttributeType("Tm", AttributeType.QUASI_IDENTIFYING_ATTRIBUTE);
104 |         data.getDefinition().setAttributeType("Salary", AttributeType.QUASI_IDENTIFYING_ATTRIBUTE);// AttributeType.IDENTIFYING_ATTRIBUTE);
105 |         
106 |         data.getDefinition().setHierarchy("Pos", position);
107 |         data.getDefinition().setHierarchy("Year", year);
108 |         data.getDefinition().setHierarchy("Age", age);
109 |         data.getDefinition().setHierarchy("Tm", team);
110 |         data.getDefinition().setHierarchy("Salary", salary);
111 |         
112 |          // Create an instance of the anonymizer
113 | 
114 |         // Create a differential privacy criterion
115 |         // we want (1,0) - DP
116 |         // delta is suggested to be 1/#records
117 |         double total_res = 0d;
118 |         int solved = 0;
119 |         for (int i = 0; i < 500; i++) {
120 |         	data.getHandle().release();
121 |         	
122 |         	System.out.println(i);
123 | 
124 |         	ARXAnonymizer anonymizer = new ARXAnonymizer();
125 |         	
126 |         	EDDifferentialPrivacy criterion = new EDDifferentialPrivacy(1.7d, 1d / data.getHandle().getNumRows());
127 |         	
128 |         	ARXConfiguration config = ARXConfiguration.create();
129 |         	config.addPrivacyModel(criterion);
130 |         	config.setSuppressionLimit(1d);
131 |             config.setHeuristicSearchStepLimit(100);
132 |             ARXResult result = anonymizer.anonymize(data, config);
133 |         	     
134 |         	double res = run_query(result, 4);
135 | //            printResult(result, data);
136 | 
137 |         	if (res > 0) {
138 | //            	System.out.print("--------------------------------" + res);
139 | 
140 |         		total_res += res;
141 |         		solved++;
142 |         	}
143 |         }
144 |         total_res /= solved;
145 |     	System.out.print("Total result " + total_res);
146 |     }
147 | }
148 | 


--------------------------------------------------------------------------------
/ARX_work/intro.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# ARX Data Anonymization Tool"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "ARX is a tool for data anonymization, that in general, takes a dataset as an input, applies different privacy models, and produces an anonymized version of this dataset, thus offering privacy to its members.\n",
 15 |     "\n",
 16 |     "At its core, ARX uses a highly efficient globally-optimal search algorithm for transforming data with full-domain generalization and record suppression. The transformation of attribute values is implemented through domain generalization hierarchies, which represent valid transformations that can be applied to individual-level values."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Classic Privacy Models"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "The ARX tool offers standard privacy models that are tested in theory and are widely use to ensure anonymity given a plain dataset."
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "markdown",
 35 |    "metadata": {},
 36 |    "source": [
 37 |     "### k-Anonymity"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "This well-known privacy model that aims at protecting datasets from re-identification in the prosecutor model. A dataset is $k$-anonymous if each record cannot be distinguished from at least $k-1$ other records regarding the quasi-identifiers. Each group of indistinguishable records forms a so-called equivalence class. "
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "### Average risk\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "This privacy model can be used for protecting datasets from re-identification in the marketer model by enforcing a threshold on the average re-identification risk of the records. By combining the model with k-anonymity, a privacy model called strict-average risk can be constructed."
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "### ℓ-Diversity"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "This privacy model can be used to protect data against attribute disclosure by ensuring that each sensitive attribute has at least $ℓ$ \"well represented\" values in each equivalence class. Different variants, which implement different measures of diversity, have been proposed."
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## Differential Privacy"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "Given the strict definition of DP, we know that we must access the dataset through various queries, given a privacy budget that we must not exceed. The ARX team, proposes a quite different application of DP in their tool, where privacy protection is not considered a property of a dataset, but a property of a data processing method.\n",
 87 |     "\n",
 88 |     "DP guarantees that the probability of any possible output of the anonymization process does not change \"by much\" if data of an individual is added to or removed from input data.\n",
 89 |     "\n",
 90 |     "In order to implement Differential Privacy, ARX uses the __SafePub algorithm__"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "markdown",
 95 |    "metadata": {},
 96 |    "source": [
 97 |     "### Concepts used"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "markdown",
102 |    "metadata": {},
103 |    "source": [
104 |     "__Random Sampling__: A probability sampling method is any method of sampling that utilizes some form of random selection. In order to have a random selection method, you must set up some process or procedure that assures that the different units in your population have equal probabilities of being chosen. In SafePub, such sampling happens with probability $\\beta$\n",
105 |     "\n",
106 |     "__Attribute Generalization__: In SafePub, generalization is achieved through user-defined hierarchies, which describe rules for replacing values with more general but semantically consistent values on increasing levels of generalization. \n",
107 |     "\n",
108 |     "__Record Suppression__: Deletion of a specific row on the input dataset."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "### Theorem"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "__Random sampling with probability $\\beta$ followed by attribute generalization and the suppression of\n",
123 |     "every record which appears less than k times__ satisfies $(\\epsilon, \\delta)$ differential privacy for every $\\epsilon \\geq -ln(1-\\beta)$ with \n",
124 |     "$$\\delta = \\max_{n:n \\geq n_m} \\sum_{j>\\gamma_n}^{n}f(j;n,\\beta)$$\n",
125 |     "\n",
126 |     "where $n_m = \\frac{k}{\\gamma} - 1$, $\\gamma = \\frac{e^\\epsilon-1+\\beta}{e^\\epsilon}$ and $f(j;n,\\beta) = {n \\choose  j} \\beta^j(1-\\beta)^{n-j}$"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "markdown",
131 |    "metadata": {},
132 |    "source": [
133 |     "## Techniques"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "In order to achieve attribute generalization, ARX uses the so called __hierarchies__. They are either imported from a csv, or being hard-coded into the API, and they are used in order to generalize a sensitive field. An example is given below. The subject to generalize is the age of a person. Let's see the values as they proceed through generalization."
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "1<sup>st</sup> level | 2<sup>nd</sup> level | 3<sup>rd</sup> level | 4<sup>th</sup> level | 5<sup>th</sup> level\n",
148 |     "--- | ----- | ------ |----- | --\n",
149 |     "1 |\t0-4 | 0-9| 0-19\t|*\n",
150 |     "2 |\t0-4 | 0-9| \t0-19|\t*\n",
151 |     "3 |\t0-4 |\t0-9|\t0-19|\t*\n",
152 |     "4 |\t0-4 |\t0-9|\t0-19|\t*\n",
153 |     "5 |\t0-4 |\t0-9|\t0-19|\t*\n",
154 |     "6 |\t5-9 |\t0-9|\t0-19|\t*\n",
155 |     "7 |\t5-9 |\t0-9|\t0-19|\t*\n",
156 |     "8 |\t5-9 |\t0-9|\t0-19|\t*\n",
157 |     "9 |\t5-9 |\t0-9|\t0-19|\t*\n",
158 |     "10| 5-9\t| 0-9|\t0-19\t|*\n",
159 |     "11|\t10-14 |\t10-19|\t0-19|\t*\n",
160 |     "12|\t10-14 |\t10-19|\t0-19|\t*\n",
161 |     "13|\t10-14 |\t10-19|\t0-19|\t*\n",
162 |     "14|\t10-14 |\t10-19|\t0-19|\t*\n",
163 |     "15|\t10-14 |\t10-19|\t0-19|\t*\n",
164 |     "16|\t15-19 |\t10-19|\t0-19|\t*\n",
165 |     "17|\t15-19 |\t10-19|\t0-19|\t*\n",
166 |     "18|\t15-19 |\t10-19|\t0-19|\t*\n",
167 |     "19|\t15-19 |\t10-19|\t0-19|\t*\n",
168 |     "20|\t15-19 |\t10-19|\t0-19|\t*\n",
169 |     "21|\t20-24 |\t20-29|\t20-39|\t*"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "markdown",
174 |    "metadata": {},
175 |    "source": [
176 |     "## Testings"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "In order to test the accuracy of the models used by ARX, we are going to run simple np python queries, on the datasets produced by the anonymization process. We want to eliminate the probability of extremely high noise generation, thus we are going to run the anonymization tool multiple times, and the output dataset will be constructed by the mean values of the fields."
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {},
189 |    "source": [
190 |     "### Problems we faced"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "As show on the above matrix, ARX hierarchies tend to treat every type of value as a string, in order to replace it with a interval. This is not  desirable when applying the testings we mentioned. Thus, we had to come up with a better solution of defining hierarchies. The ARX GUI provides a wizard that gives a variety of choices so the user can easily create a hierarchy for plenty data types.\n",
198 |     "\n",
199 |     "Another challenge is the number of layers that we are going to use, meaning how far our anonymization will proceed. In each layer, the number of same records increase exponentially, thus we do not want to apply many layers, in order for our results to be accurate, and the output dataset to be readable.\n"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "markdown",
204 |    "metadata": {},
205 |    "source": [
206 |     "## Solution to the construction of Hierarchies"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {},
212 |    "source": [
213 |     "Given the help from Fabian Prasser, we opted to treat the integer values as numbers, and in each level:\n",
214 |     " - Group the rows by 2\n",
215 |     " - Apply a function according to the query we want to ask.\n",
216 |     " \n",
217 |     "For example, if we want a counting query, the best option would be to apply an __arithmetic mean__ function to the group, thus the sum, the mean, the variance etc will be the same. The way that ARX preserves DP with those settings, is by record suppression. If that was not the case, the results would be identical to the input dataset. However, now, the output dataset will differ because of its lack of some rows of the input.\n",
218 |     " \n",
219 |     "Regarding the layers problem, we opted to use 4 layers of anonymization, the last of whom will be the `*` value, meaning that every record is inseparable. We do not want this to happen early in our anonymization, but we do not want it to never happen either, because then we would have a privacy leak, if the dataset was too small."
220 |    ]
221 |   }
222 |  ],
223 |  "metadata": {
224 |   "kernelspec": {
225 |    "display_name": "Python 3",
226 |    "language": "python",
227 |    "name": "python3"
228 |   },
229 |   "language_info": {
230 |    "codemirror_mode": {
231 |     "name": "ipython",
232 |     "version": 3
233 |    },
234 |    "file_extension": ".py",
235 |    "mimetype": "text/x-python",
236 |    "name": "python",
237 |    "nbconvert_exporter": "python",
238 |    "pygments_lexer": "ipython3",
239 |    "version": "3.8.10"
240 |   }
241 |  },
242 |  "nbformat": 4,
243 |  "nbformat_minor": 4
244 | }
245 | 


--------------------------------------------------------------------------------
/LDP/Direct_Encoding.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | import math
 4 | import numbers
 5 | 
 6 | class Direct_Encoding_client():
 7 | 	def __init__(self, e, d):
 8 | 		# initialization of the protocol's constants
 9 | 		self.e = e
10 | 		self.d = d
11 | 		# p and q are fixed, depending on the domain size and the epsilon value
12 | 		self.p = math.exp(self.e) / (math.exp(self.e) + self.d - 1)
13 | 		self.q = 1 / (math.exp(self.e) + self.d - 1)
14 | 
15 | 	# encoding: simply return the value itself
16 | 	def encode(self, v):
17 | 		return v
18 | 
19 | 	def perturbe(self, ret):
20 | 		x = ret
21 | 		# generate a random number in the range (0,1)
22 | 		res = random.random()
23 | 		
24 | 		# if it is less than p, report the real value
25 | 		if (res < self.p):
26 | 			pert = x
27 | 		else:
28 | 			# else chose one of the other values of the domain
29 | 			false_xs = [i for i in range(self.d) if i != x]		
30 | 
31 | 			pert = random.choice(false_xs)
32 | 		
33 | 		return pert
34 | 
35 | 	# randomization consists of perturbing the encoded value
36 | 	def randomize(self, v):
37 | 		return self.perturbe(self.encode(v))
38 | 
39 | 
40 | class Direct_Encoding_aggregator():
41 | 	def __init__(self, e, d):
42 | 		# initialization of the protocol's constants
43 | 		self.e = e
44 | 		self.d = d
45 | 		# p and q are fixed, depending on the domain size and the epsilon value
46 | 		self.p = math.exp(self.e) / (math.exp(self.e) + self.d - 1)
47 | 		self.q = 1 / (math.exp(self.e) + self.d - 1)
48 | 
49 | 	def aggregate(self, config):
50 |     		# define the needed variables from the configuration dict provided
51 | 		reported_values = config['reported_values']
52 | 		e = config['epsilon']
53 | 		d = config['d']
54 | 
55 | 		# array to store the results
56 | 		results = np.zeros(d)
57 | 		n = len(reported_values)
58 | 		
59 | 		# compute p and q based on the espilon value and the domain size
60 | 		p = math.exp(e) / (math.exp(e) + d - 1)
61 | 		q = 1 / (math.exp(e) + d - 1)
62 | 
63 | 		# compute the estimation for each value of the domain
64 | 		for i in range(d):
65 | 			sum_v = 0
66 | 			for j in reported_values:
67 |     			# Support(i) = {i}, thus the protocol supports only the values equal to 
68 | 				# the current value
69 | 				if j == i:
70 | 					sum_v += 1
71 | 			# normalize the sum by trying to extract the noise
72 | 			results[i] = ((sum_v) - n * q) / (p - q)
73 | 			# if a negative sum is generated by the normalization, convert it to zero
74 | 			if (results[i] < 0):
75 | 				results[i] = 0
76 | 
77 | 		return results


--------------------------------------------------------------------------------
/LDP/Distance_Sensitive_Encoding.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | import math
 4 | import numbers
 5 | 
 6 | 
 7 | class Distance_Sensitive_Encoding_client():
 8 | 	def __init__(self, e, d):
 9 | 		# initialization of the protocol's constants
10 | 		self.e = e
11 | 		self.d = d
12 | 		self.theta = math.floor((math.sqrt(4 * math.exp(self.e) + 1) - 1) / 2)
13 | 		self.a = (self.theta * (self.theta + 1)) / (3 * self.theta ** 2 - self.theta + d - 1)
14 | 		self.p = self.a
15 | 		self.probs = [self.a / (i * (i + 1)) for i in range(1, self.theta)]
16 | 		self.q = self.a / (self.theta * (self.theta + 1))
17 | 
18 | 	# encoding: simply return the value itself
19 | 	def encode(self, v):
20 | 		return v
21 | 
22 | 	# perturbation: choose a random value, with fixed probabilities depending on the distance from the truth
23 | 	def perturbe(self, ret):
24 | 		x = ret
25 | 		# create an array of probabilities for each element of the domain
26 | 		probabilities = np.zeros(self.d)
27 | 		# extreme cases: x-theta outside domain boundaries
28 | 		if (x - self.theta < 0):
29 | 			m = sum([self.a / (abs(i - x) * (abs(i - x) + 1)) - self.a / (self.theta * (self.theta + 1)) for i in range(x - self.theta, 0)])
30 | 		elif (x + self.theta > self.d):
31 | 			m = sum([self.a / (abs(i - x) * (abs(i - x) + 1)) - self.a / (self.theta * (self.theta + 1)) for i in range(self.d, x + self.theta)])
32 | 		else:
33 | 			m = 0
34 | 
35 | 		for i in range(self.d):
36 | 			# probablitiy of choosing the truth, fixed by the user
37 | 			if i == x:
38 | 				probabilities[i] = 100 * self.p
39 | 			# probability of being within the area
40 | 			elif abs(i - x) < self.theta:
41 |     			# probability of lying, depending on the distance of the false value from the true one
42 | 				probabilities[i] = 100 * self.probs[abs(i - x) - 1] + m / (self.d - 1)
43 | 			# probability of being outside the area
44 | 			else:
45 | 				probabilities[i] = 100 * self.q + m / (self.d - 1)
46 | 		# list of all the possible options of values
47 | 		options = [i for i in range(self.d)]
48 | 		# choose a value given the probabilities for each one
49 | 		pert = random.choices(options, probabilities)[0]
50 | 		return pert
51 | 
52 | 	# randomization consists of perturbing the encoded value
53 | 	def randomize(self, v):
54 | 		return self.perturbe(self.encode(v))
55 | 
56 | 
57 | class Distance_Sensitive_Encoding_aggregator():
58 | 	def __init__(self, e, d):
59 |     		# initialization of the protocol's constants
60 | 		self.e = e
61 | 		self.d = d
62 | 		# initialization of the protocol's constants
63 | 		self.e = e
64 | 		self.d = d
65 | 		self.theta = math.floor((math.sqrt(4 * math.exp(self.e) + 1) - 1) / 2)
66 | 		self.a = (self.theta * (self.theta + 1)) / (3 * self.theta ** 2 - self.theta + d - 1)
67 | 		
68 | 		self.p = self.a
69 | 		self.p_star = 2 * sum([self.a / (i * (i + 1)) for i in range(1, self.theta)]) + self.p 
70 | 		self.probs = [self.a / (i * (i + 1)) for i in range(1, self.theta)]
71 | 		self.q = self.a / (self.theta * (self.theta + 1))
72 | 
73 | 
74 | 	def aggregate(self, config):
75 | 		# define the needed variables from the configuration dict provided
76 | 		reported_values = config['reported_values']
77 | 		e = config['epsilon']
78 | 		d = config['d']
79 | 
80 | 		# array to store the results
81 | 		results = np.zeros(d)
82 | 		n = len(reported_values)
83 | 		
84 | 		# compute the estimation for each value of the domain
85 | 		for i in range(d):
86 | 			sum_v = 0
87 | 			for j in reported_values:
88 |     			# Support(i) = {i}, thus the protocol supports only the values equal to 
89 | 				# the current value
90 | 				if i == j:
91 | 					sum_v += 1
92 | 			# normalize the sum by trying to extract the noise
93 | 			results[i] = ((sum_v) - n * self.q) / (self.p_star - self.q)
94 | 			# if a negative sum is generated by the normalization, convert it to zero
95 | 			if (results[i] < 0):
96 | 				results[i] = 0
97 | 
98 | 		return results


--------------------------------------------------------------------------------
/LDP/Histogram_Encoding.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | import math
 4 | import numbers
 5 | 
 6 | class Histogram_Encoding_client():
 7 | 	def __init__(self, e, d):
 8 | 		# initialization of the protocol's constants
 9 | 		self.e = e
10 | 		self.d = d
11 | 
12 | 	# encoding consists of creating a d-bit vetor, where
13 | 	# only the v-th element is 1, and every other equal to 0
14 | 	def encode(self, v):
15 | 		assert(v < self.d)
16 | 		B = np.zeros(self.d)
17 | 		B[v] = 1
18 | 		return B
19 | 	
20 | 	# perturbation consists of adding noise generated
21 | 	# from the laplace distribution to each element
22 | 	def perturb(self, ret):
23 | 		B = ret
24 | 		for i in range(len(B)):
25 | 			B[i] += np.random.laplace(scale = (2/self.e))
26 | 		
27 | 		return B
28 | 	
29 | 	# randomization consists of perturbing the encoded value
30 | 	def randomize(self, v):
31 | 		return self.perturb(self.encode(v))
32 | 
33 | class Histogram_Encoding_aggregator():
34 | 	def __init__(self, e, d):
35 | 		# initialization of the protocol's constants
36 | 		self.e = e
37 | 		self.d = d
38 | 			
39 | 	def aggregate(self, config):
40 | 
41 | 		# define the needed variables from the configuration dict provided
42 | 		reported_values = config['reported_values']
43 | 		e = self.e
44 | 		d = self.d
45 | 		
46 | 		threshold = config['threshold']
47 | 		method = config['method']
48 | 
49 | 		# array to store the results
50 | 		results = np.zeros(d)
51 | 		# Summation with Histogram Encoding method
52 | 		if method == 'SHE':
53 | 			for i in range(d):
54 | 				sum_v = 0
55 | 				# just sum all the 1s from the v-th elements of the results
56 | 				for j in reported_values:
57 | 					sum_v += j[i]
58 | 			
59 | 				results[i] = sum_v
60 | 
61 | 			return results
62 | 		else:
63 | 			# Thresholding with Histogram Encoding method
64 | 
65 | 			# count of the reported values
66 | 			n = len(reported_values)
67 | 
68 | 			# p and q according to the theory
69 | 			p = 1 - (1/2) * math.exp((e/2) * (threshold - 1))
70 | 			q = (1/2) * math.exp(-(e/2) * threshold)
71 | 
72 | 			for i in range(d):
73 | 				sum_v = 0
74 | 				# Support(B) = {v | B[v] > threshold}
75 | 				# thus, each reported value grater than the threshold is supported
76 | 				for j in reported_values:
77 | 					if j[i] > threshold:
78 | 						sum_v += 1
79 | 
80 | 				# normalize the sum by trying to extract the noise
81 | 				results[i] = ((sum_v) - n * q) / (p - q)
82 | 				# if a negative sum is generated by the normalization, convert it to zero
83 | 				if (results[i] < 0):
84 | 					results[i] = 0
85 | 			
86 | 			return results
87 | 
88 | 


--------------------------------------------------------------------------------
/LDP/LDP_Frequency_Estimator.py:
--------------------------------------------------------------------------------
  1 | from RAPPOR import *
  2 | from Random_Matrix import *
  3 | from Direct_Encoding import *
  4 | from Distance_Sensitive_Encoding import *
  5 | from Unary_Encoding import *
  6 | from Histogram_Encoding import *
  7 | 
  8 | import pandas as pd
  9 | import numpy as np
 10 | import random
 11 | import math
 12 | import numbers
 13 | import copy
 14 | import tqdm as tq
 15 | import qif
 16 | 
 17 | def manhattan_distance(a, b):
 18 |     return np.abs(a - b).sum()
 19 | 
 20 | # Base class for the frequency estimator
 21 | """
 22 | Mandatory Arguments:
 23 | 
 24 |  - domain_size: the number of values that the user might answer to the question posed
 25 |  - method: the protocol that the user wants to use. Possible answers:
 26 |  		-> 'RAPPOR'
 27 | 		-> 'Random_Matrix'
 28 | 		-> 'Direct_Encoding'
 29 | 		-> 'Histogram_Encoding'
 30 | 		-> 'Unary_Encoding'
 31 | 
 32 | Optional Arguments (depending on the protocol used):
 33 | 
 34 |  - epsilon: The epsilon value, as a setting for LDP (Usually in the range (0, 5]))
 35 |  - p, q: Probability values used by pure protocols. In some cases the do not need to be included
 36 |  - Public matrix: The matrix that was previously generated in the Random Matrix protocol
 37 |  - m: The m value used for the initialization of the public matrix in the R.M. protocol
 38 |  - f: The frequency setting used by the RAPPOR protocol
 39 |  - unary_optimized: setting for optimizing the Unary Encoding protocol; by default set to True
 40 |  - threshold: threshold used for the aggregation during the histogram encoding
 41 |  - aggregation_method: aggr. method used my the histogram encoding, in order to obtain the randomized values. ('SHE' or 'THE')
 42 | """
 43 | class Frequency_Estimator():
 44 | 	
 45 | 	def __init__(self, domain_size, method = 'Direct_Encoding', epsilon = 1,
 46 | 				 p = 0.75, q = 0.25, public_matrix = None, m = 10, n_users = 1,
 47 | 				 f = 0.25, unary_optimized = True, threshold = 0.67, 
 48 | 				 aggr_method = 'THE'):
 49 | 		# keep the initialization values of the class
 50 | 		self.domain_size = domain_size
 51 | 		self.n_users = n_users
 52 | 		self.method = method
 53 | 		self.epsilon = epsilon
 54 | 		self.p = p
 55 | 		self.q = q
 56 | 		self.m = m
 57 | 		self.f = f
 58 | 		self.threshold = threshold
 59 | 		self.public_matrix = public_matrix
 60 | 		# according to the method, initialize the proper class with the mandatory arguments
 61 | 		if method == 'RAPPOR':
 62 | 			self.user_protocol_class = RAPPOR_client(f, domain_size, p ,q)
 63 | 			self.aggregator_protocol_class = RAPPOR_aggregator(f, domain_size, p ,q)
 64 | 		elif method == 'Random_Matrix':
 65 |     		# spectial case: if we are using random matrix we must provide a public matrix
 66 | 			# if it is not provided, create it on the fly using the appropriate function
 67 | 			if public_matrix == None:
 68 | 				self.public_matrix = generate_matrix(m, domain_size)
 69 | 			self.user_protocol_class = Random_Matrix_client(self.public_matrix, m, domain_size, epsilon)
 70 | 			self.aggregator_protocol_class = Random_Matrix_aggregator(public_matrix, m, domain_size, epsilon)
 71 | 		elif method == 'Direct_Encoding':
 72 | 			self.user_protocol_class = Direct_Encoding_client(epsilon, domain_size)
 73 | 			self.aggregator_protocol_class = Direct_Encoding_aggregator(epsilon, domain_size)
 74 | 		elif method == 'Distance_Sensitive_Encoding':
 75 | 			self.user_protocol_class = Distance_Sensitive_Encoding_client(epsilon, domain_size)
 76 | 			self.aggregator_protocol_class = Distance_Sensitive_Encoding_aggregator(epsilon, domain_size)
 77 | 		
 78 | 		elif method == 'Histogram_Encoding':
 79 | 			self.user_protocol_class = Histogram_Encoding_client(epsilon, domain_size)
 80 | 			self.aggregator_protocol_class = Histogram_Encoding_aggregator(epsilon, domain_size)
 81 | 		elif method == 'Unary_Encoding':
 82 | 			self.user_protocol_class = Unary_Encoding_client(epsilon, domain_size, unary_optimized, p, q)
 83 | 			self.aggregator_protocol_class = Unary_Encoding_aggregator(epsilon, domain_size, unary_optimized, p, q)
 84 | 		else:
 85 | 			raise ValueError('Method not recognized. Choose one of the default ones')
 86 | 		
 87 | 		# create a list containing one instance for each user
 88 | 		self.users = []
 89 | 		for _ in range(self.n_users):
 90 | 			self.users.append(copy.copy(self.user_protocol_class))
 91 | 	
 92 | 	
 93 | 	"""
 94 | 	Randomization: The user provides a value v in the range (0, d-1), and according to the protocol chosen,
 95 | 	the system can return either a single value, or a vector containing the randomized values
 96 | 
 97 | 	The return value of this function makes no sense to someone that views _only_ one user's data. It is used
 98 | 	by another function, but only to the aggregator of the data
 99 | 	"""
100 | 	def randomize_value(self, v):
101 | 		# just call the randomization function of the relevant protocol
102 | 		return self.user_protocol_class.randomize(v)
103 | 
104 | 	
105 | 	
106 | 	"""
107 | 	Aggregation: Used by the aggregator in order to combine all the users' data in order to produce the final
108 | 	frequency vector, for each value in the domain.
109 | 
110 | 	The reported_values argument is a vector containing each noisy value reported.
111 | 	"""
112 | 	def aggregate(self, reported_values):
113 |     	# create a dict with all the settings of a protocol, and pass it to the aggregator
114 | 		# who chooses the ones that he wants	
115 | 		config = {'reported_values': reported_values, 'f': self.f, 'd':self.domain_size,
116 | 				  'public_matrix': self.public_matrix, 'epsilon': self.epsilon, 'threshold':self.threshold,
117 | 			      'method': self.method, 'p': self.p, 'q': self.q}
118 | 		# call the aggregation function of the relevant protocol
119 | 		return self.aggregator_protocol_class.aggregate(config)
120 | 
121 | 
122 | 	
123 | 	"""
124 | 	Test an previously initialized protol. The function returns the true, and the randomizes
125 | 	stats produced in 2 np vectors. Then, the user is free to compare the vectors using the
126 | 	neccessary metrics.
127 | 
128 | 	The arguments are: a _csv_ file, with only one column, the one that we are interested in.
129 | 	All the other settings for the protocol are already defined, and thus this function is
130 | 	able to use them.
131 | 	"""
132 | 	def test_protocol(self, count, input_file=None, values=None):
133 | 		if input_file is None and values is None:
134 | 			raise ValueError('An input file or a value vector must by given')
135 | 		if input_file is not None:
136 | 			# parse the file, and store its values 
137 | 			df = pd.read_csv(input_file)
138 | 			values = df.to_numpy()
139 | 
140 | 		# determine the number of users and values, based on the values' vector
141 | 		user_count = max(df.iloc[:,0]) + 1
142 | 		total_values = len(df.iloc[:,1]) + 1
143 | 
144 | 		df = df[:count]
145 | 		# check that the users are the same with the way that we initialized the class
146 | 		if user_count != self.n_users:
147 |     			print("---" + str(user_count) + " " + str(self.n_users))
148 |     			raise ValueError('Incorrect amount of users during initialization')
149 | 
150 | 		# vector to store the true sums for each value in the domain
151 | 		true_results = np.zeros(self.domain_size)
152 | 		
153 | 		# list to store the results of each randomization, and to be fed to the aggregator
154 | 		reported_values = []
155 | 
156 | 		# for i in tq.tqdm(range(len(df)), position=0, leave=True):
157 | 		for i in range(len(df)):
158 | 			# get the true value
159 | 			user = int(df.iloc[i, 0])
160 | 			value = int(df.iloc[i, 1])
161 | 
162 | 			# value = int(values[user])
163 | 			# the true sum of this value is increased
164 | 			true_results[value] += 1
165 | 			# call the randomization function in order to obtain the randomized value
166 | 			randomised_result = self.users[user].randomize(value)
167 | 			# and append it to the appropriate list
168 | 			reported_values.append(randomised_result)
169 | 
170 | 		# feed the reported values to the aggregator, who returns an np vector with
171 | 		# the randomized sums	
172 | 		randomised_results = self.aggregate(reported_values)
173 | 
174 | 		# return the tuple of the 2 vectors: the real sums, and the predicted, randomized sums
175 | 		return (true_results.astype(int), randomised_results.astype(int))
176 | 
177 | 
178 | import matplotlib.pyplot as plt
179 | 
180 | 
181 | res = input("Method: [i]: instance | [e]: epsilon measuerements | [u]: increasing users\n\n")
182 | max_samples = -1
183 | e = np.log(12)
184 | 
185 | if res == 'i':
186 | 	estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)
187 | 
188 | 	res = estimator.test_protocol(max_samples, input_file='res.csv')
189 | 
190 | 	print(res[0])
191 | 	print(res[1])
192 | 
193 | 	print("\nsums\n\n", np.sum(res[0]), np.sum(res[1]), "\n\n")
194 | 
195 | 	estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)
196 | 
197 | 	res1 = estimator.test_protocol(max_samples, input_file='res.csv')
198 | 
199 | 	print(res1[0])
200 | 	print(res1[1])
201 | 
202 | 	print("\nsums\n\n", np.sum(res1[0]), np.sum(res1[1]), "\n\n")
203 | 
204 | 	xs = [i for i in range(50)]
205 | 	fig, axs = plt.subplots(3)
206 | 	fig.suptitle('Vertically stacked subplots')
207 | 	axs[0].bar(xs, res[0])
208 | 	axs[1].bar(xs, res[1])
209 | 	axs[2].bar(xs, res1[1])
210 | 
211 | 	# axs[0].set_ylim((0, 20))
212 | 	axs[1].set_ylim((0, max(res[0])))
213 | 	axs[2].set_ylim((0, max(res[0])))
214 | 
215 | 	axs[0].title.set_text('True Data')
216 | 	axs[1].title.set_text('Perturbed Data produced by the Direct Encoding Protocol')
217 | 	axs[2].title.set_text('Perturbed Data produced by our Protocol')
218 | 
219 | 	def euclid(x, y):                       # ground distance
220 | 		return abs(x-y)
221 | 
222 | 	kant = qif.metric.kantorovich(euclid)   # distance on distributions
223 | 
224 | 
225 | 	print("\n\n\n\nDirect:", kant(res[0], res[1]))
226 | 	print("\n\n\n\nDistance_Sensitive:", kant(res1[0], res1[1]))
227 | 
228 | 	plt.show()
229 | 
230 | elif res == 'u':
231 | 
232 | 	direct = []
233 | 	dist_sens = []
234 | 	dist_hist = []
235 | 	unary = []
236 | 	randmatr = []
237 | 	size = 2000
238 | 
239 | 	max_samples = 1200
240 | 	d = 50
241 | 	
242 | 	x = [i for i in range(10, max_samples, 20)]
243 | 
244 | 
245 | 
246 | 	def euclid(x, y):                       # ground distance
247 | 		return abs(x-y)
248 | 
249 | 	kant = qif.metric.kantorovich(euclid)   # distance on distributions
250 | 
251 | 	e = np.log(12)
252 | 	for i in tq.tqdm(x, position=0, leave=True):
253 | 
254 | 		estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)
255 | 		reses = []
256 | 		for j in range(0, 10):
257 | 			a = estimator.test_protocol(i, input_file='res.csv')
258 | 			reses.append(kant(a[0], a[1]))
259 | 		res = sum(reses) / i * 10  
260 | 
261 | 		direct.append(res)
262 | 
263 | 		estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)
264 | 
265 | 		reses = []
266 | 		for j in range(0, 10):
267 | 			a = estimator.test_protocol(i, input_file='res.csv')
268 | 			reses.append(kant(a[0], a[1]))
269 | 
270 | 		res1 = sum(reses) / i *  10
271 | 
272 | 		dist_hist.append(res1)
273 | 
274 | 		estimator = Frequency_Estimator(50, method='Histogram_Encoding', epsilon=e, n_users=1000)
275 | 
276 | 		reses = []
277 | 		for j in range(0, 10):
278 | 			a = estimator.test_protocol(i, input_file='res.csv')
279 | 			reses.append(kant(a[0], a[1]))
280 | 
281 | 		res2 = sum(reses) / i * 10
282 | 
283 | 		dist_hist.append(res2)
284 | 
285 | 		q = 1 / (math.exp(e) + 1)
286 | 		
287 | 		
288 | 		estimator = Frequency_Estimator(50, method='Unary_Encoding', epsilon=e, p=1/2, q=q, n_users=1000)
289 | 	
290 | 		reses = []
291 | 		for j in range(0, 10):
292 | 			a = estimator.test_protocol(i, input_file='res.csv')
293 | 			reses.append(kant(a[0], a[1]))
294 | 
295 | 		res3 = sum(reses) / i *  10
296 | 
297 | 		unary.append(res3)
298 | 
299 | 		estimator = Frequency_Estimator(50, method='Random_Matrix', n_users=1000)
300 | 	
301 | 		reses = []
302 | 		for j in range(0, 10):
303 | 			a = estimator.test_protocol(i, input_file='res.csv')
304 | 			reses.append(kant(a[0], a[1]))
305 | 
306 | 		res4 = sum(reses) / i *  10
307 | 
308 | 		randmatr.append(res4)
309 | 
310 | 	plt.plot(x, direct, 'r')
311 | 	plt.plot(x, dist_hist, 'g')
312 | 	plt.plot(x, unary, 'm')
313 | 	plt.plot(x, dist_hist, 'y')
314 | 	plt.xlabel("Number of Users")
315 | 	plt.ylabel("Accuracy Error")
316 | 
317 | 	plt.legend(["Direct Encoding", "Histogram Encoding", "Unary Encoding", "Distance Sensitive Encoding"])
318 | 	plt.savefig('../misc/latest_plot.png')
319 | 	plt.show()
320 | 
321 | else:
322 |     
323 | 	direct = []
324 | 	dist_sens = []
325 | 	dist_hist = []
326 | 	unary = []
327 | 	randmatr = []
328 | 	size = 2000
329 | 
330 | 	epsilon = [round(i/3 + 0.8, 2) for i in range (0,12)]
331 | 	nusers = 10000
332 | 	d = 50
333 | 
334 | 
335 | 
336 | 	def euclid(x, y):                       # ground distance
337 | 		return abs(x-y)
338 | 
339 | 	kant = qif.metric.kantorovich(euclid)   # distance on distributions
340 | 
341 | 	for e in tq.tqdm(epsilon, position=0, leave=True):
342 | 
343 | 		estimator = Frequency_Estimator(50, method='Direct_Encoding', epsilon=e, n_users=1000)
344 | 		reses = []
345 | 		for j in range(0, 10):
346 | 			a = estimator.test_protocol(nusers, input_file='res.csv')
347 | 			reses.append(kant(a[0], a[1]))
348 | 		res = sum(reses) / 10  
349 | 
350 | 		direct.append(res)
351 | 
352 | 		estimator = Frequency_Estimator(50, method='Distance_Sensitive_Encoding', epsilon=e, n_users=1000)
353 | 
354 | 		reses = []
355 | 		for j in range(0, 10):
356 | 			a = estimator.test_protocol(nusers, input_file='res.csv')
357 | 			reses.append(kant(a[0], a[1]))
358 | 
359 | 		res1 = sum(reses) / 10
360 | 
361 | 		dist_hist.append(res1)
362 | 
363 | 		estimator = Frequency_Estimator(50, method='Histogram_Encoding', epsilon=e, n_users=1000)
364 | 
365 | 		reses = []
366 | 		for j in range(0, 10):
367 | 			a = estimator.test_protocol(nusers, input_file='res.csv')
368 | 			reses.append(kant(a[0], a[1]))
369 | 
370 | 		res2 = sum(reses) / 10
371 | 
372 | 		dist_hist.append(res2)
373 | 
374 | 		q = 1 / (math.exp(e) + 1)
375 | 		
376 | 		
377 | 		estimator = Frequency_Estimator(50, method='Unary_Encoding', epsilon=e, p=1/2, q=q, n_users=1000)
378 | 	
379 | 		reses = []
380 | 		for j in range(0, 10):
381 | 			a = estimator.test_protocol(nusers, input_file='res.csv')
382 | 			reses.append(kant(a[0], a[1]))
383 | 
384 | 		res3 = sum(reses) / 10
385 | 
386 | 		unary.append(res3)
387 | 
388 | 		estimator = Frequency_Estimator(50, method='Random_Matrix', n_users=1000)
389 | 	
390 | 		reses = []
391 | 		for j in range(0, 10):
392 | 			a = estimator.test_protocol(nusers, input_file='res.csv')
393 | 			reses.append(kant(a[0], a[1]))
394 | 
395 | 		res4 = sum(reses) / 10
396 | 
397 | 		randmatr.append(res4)
398 | 
399 | 	plt.plot(epsilon, direct, 'r')
400 | 	plt.plot(epsilon, dist_hist, 'g')
401 | 	plt.plot(epsilon, unary, 'm')
402 | 	plt.plot(epsilon, dist_hist, 'y')
403 | 	plt.xlabel("Epsilon")
404 | 	plt.ylabel("Accuracy Error")
405 | 
406 | 	plt.legend(["Direct Encoding", "Histogram Encoding", "Unary Encoding", "Distance Sensitive Encoding"])
407 | 	plt.savefig('../misc/latest_plot.png')
408 | 	plt.show()


--------------------------------------------------------------------------------
/LDP/RAPPOR.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import random
  3 | 
  4 | class RAPPOR_client():
  5 | 	# initialize the class with the necessary arguments: f and d(domain size)
  6 | 	def __init__(self, f, d, p, q):
  7 | 		self.f = f
  8 | 		self.d = d
  9 | 		self.p = p
 10 | 		self.q = q
 11 | 	
 12 | 		# store matrices of permanent perturbation
 13 | 		self.perma_B = {}
 14 | 		
 15 | 	# encode by creating a d-lengthed vector
 16 | 	def encode(self, v):
 17 | 		B = np.zeros(self.d)
 18 | 		# only it's v-th element is 1
 19 | 		B[v] = 1
 20 | 
 21 | 		return v, B
 22 | 
 23 | 	# perturbe the value in order to report it
 24 | 	def perturb(self, res):
 25 | 		v, B = res
 26 | 		# __step 1__: permanent randomized response
 27 | 		# check if the permanent B exists for the value B
 28 | 		if v in self.perma_B:
 29 | 			new_B = self.perma_B[v]
 30 | 		else:
 31 | 			# if it does not exist, we must create it
 32 | 			new_B = np.zeros(self.d)
 33 | 			# for each item, we must fix it according to its value in the original matrix
 34 | 			for i, b in enumerate(new_B):
 35 | 				# if the element is 1
 36 | 				if B[i] == 1:
 37 | 					pr = 1 - 0.5 * self.f
 38 | 				# if it is 0
 39 | 				else:
 40 | 					pr = 0.5 * self.f
 41 | 				# generate a random number
 42 | 				res = random.random()
 43 | 				# and compute the element in the new matrix
 44 | 				if (res < pr):
 45 | 					new_B[i] = 1
 46 | 				else:
 47 | 					new_B[i] = 0
 48 | 			# save it to the dictionary so we do not have to compute it again
 49 | 			self.perma_B[v] = new_B
 50 | 
 51 | 		# __step 2__: instantaneous randomized response       
 52 | 		final_B = np.zeros(self.d)
 53 | 		for i, b in enumerate(final_B):
 54 | 			if new_B[i] == 1:
 55 | 				pr = self.p
 56 | 			else:
 57 | 				pr = self.q
 58 | 			res = random.random()
 59 | 			if (res < pr):
 60 | 				final_B[i] = 1
 61 | 			else:
 62 | 				final_B[i] = 0
 63 | 
 64 | 		return final_B
 65 | 
 66 | 	# randomization contists of the PE() opperation
 67 | 	def randomize(self, v):
 68 | 		return self.perturb(self.encode(v))
 69 | 
 70 | 
 71 | 
 72 | class RAPPOR_aggregator():
 73 |     	# initialize the class with the necessary arguments: f and d(domain size)
 74 | 	def __init__(self, f, d, p, q):
 75 | 		self.f = f
 76 | 		self.d = d
 77 | 		self.p = p
 78 | 		self.q = q
 79 | 		
 80 | 	def aggregate(self, config):
 81 |     
 82 | 		reported_values = config['reported_values']	
 83 | 		f = self.f	
 84 | 		d = self.d
 85 | 		n = len(reported_values)
 86 | 		results = np.zeros(d)
 87 | 		for i in range(d):
 88 | 			sum_v = 0
 89 | 			for j in reported_values:
 90 | 				if j[i] == 1:
 91 | 					sum_v += j[i]
 92 | 
 93 | 			results[i] = (sum_v - 0.5 * f * n) / (1 - f)
 94 | 		
 95 | 		return results
 96 | 	
 97 | 	def compute_metrics(self, true, randomized):
 98 | 		metrics_dict = {}
 99 | 		metrics_dict['eucledian_distance'] = np.linalg.norm(true - randomized)
100 |     


--------------------------------------------------------------------------------
/LDP/Random_Matrix.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import random
 3 | import math
 4 | import numbers
 5 | 
 6 | def generate_matrix(m, d):
 7 | 	F = np.zeros((m,d))
 8 | 	
 9 | 	bound = -1 / math.sqrt(m)
10 | 	for i in range(m):
11 | 		for j in range(d):
12 | 			F[i][j] = random.uniform(-bound, bound)
13 | 
14 | 	return F
15 | generate_matrix(5, 10)
16 | 
17 | class Random_Matrix_client():
18 | 	def __init__(self, F, m, d, e):
19 | 		# initialization of the protocol's constants
20 | 		self.F = F
21 | 		self.m = m
22 | 		self.d = d
23 | 		self.e = e
24 | 	
25 | 	def encode(self, v):
26 | 		r = random.randint(0, self.m - 1)
27 | 		x = self.F[r][v]
28 | 
29 | 		return (r, x)
30 | 
31 | 	def perturbe(self, ret):
32 | 		r, x = ret
33 | 
34 | 		pr = math.exp(self.e) / (math.exp(self.e) + 1)
35 | 		res = random.random()
36 | 		if (res < pr):
37 | 			b = 1
38 | 		else:
39 | 			b = -1
40 | 		
41 | 		c = (math.exp(self.e) + 1) / (math.exp(self.e) - 1)
42 | 
43 | 		return (r, b * c * self.m * x)
44 | 
45 | 	def randomize(self, v):
46 | 		return self.perturbe(self.encode(v))
47 | 
48 | 
49 | 
50 | class Random_Matrix_aggregator():
51 | 	def __init__(self, F, m, d, e):
52 | 		# initialization of the protocol's constants
53 | 		self.F = F
54 | 		self.m = m
55 | 		self.d = d
56 | 		self.e = e
57 | 
58 | 	def aggregate(self, config):
59 | 		
60 | 		reported_values = config['reported_values']
61 | 		public_matrix = config['public_matrix']
62 | 		d = self.d
63 | 
64 | 		results = np.zeros(d)
65 | 		for i in range(d):
66 | 			sum_v = 0
67 | 			for j in reported_values:
68 | 				sum_v += j[1] * public_matrix[j[0]][i]
69 | 			
70 | 			results[i] = sum_v
71 | 		return results
72 | 


--------------------------------------------------------------------------------
/LDP/Unary_Encoding.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import random
  3 | import math
  4 | import numbers
  5 | 
  6 | class Unary_Encoding_client():
  7 | 	def __init__(self, e, d, optimized=True, p=0, q=0):
  8 | 		# initialization of the protocol's constants
  9 | 		self.d = d
 10 | 		self.p = p
 11 | 		self.q = q
 12 | 		self.e = e
 13 | 		self.optimized = optimized
 14 | 		# if the user wants OUE, we initialize p and q 
 15 | 		# according to the theory, and based on epsilon
 16 | 		if self.optimized:
 17 | 			self.p = 1 / 2 
 18 | 			self.q = 1 / (math.exp(self.e) + 1)
 19 | 		
 20 | 	# encoding consists of creating a d-bit vetor, where
 21 | 	# only the v-th element is 1, and every other equal to 0
 22 | 	def encode(self, v):
 23 | 		assert(v < self.d)
 24 | 		B = np.zeros(self.d)
 25 | 		B[v] = 1
 26 | 		return B
 27 | 	
 28 | 	# perturbation consists of setting each bit of the array
 29 | 	# to 0 or 1, according to the probabilities defined as 
 30 | 	# features of the protocol
 31 | 	def perturb(self, ret):
 32 | 		B = ret
 33 | 
 34 | 		new_B = B
 35 | 		# for each bit of the binary array
 36 | 		for i in range(len(B)):
 37 |     		# depending on wether it is 0 or 1
 38 | 			if B[i] == 1:
 39 | 				pr = self.p
 40 | 			else: 
 41 | 				pr = self.q
 42 | 			# generate a random number
 43 | 			res = random.random()
 44 | 			# in order to determine the bit in the new array
 45 | 			if res < pr:
 46 | 				new_B[i] = 1
 47 | 			else:
 48 | 				new_B[i] = 0
 49 | 		
 50 | 		return new_B
 51 | 
 52 | 	# randomization consists of perturbing the encoded value
 53 | 	def randomize(self, v):
 54 | 		return self.perturb(self.encode(v))
 55 | 	
 56 | 	def aggregate(self, config):
 57 | 		# define the needed variables from the configuration dict provided
 58 | 		reported_values = config['reported_values']
 59 | 		d = config['d']
 60 | 
 61 | 		p = self.p
 62 | 		q = self.q
 63 | 
 64 | 		# array to store the results
 65 | 		results = np.zeros(d)
 66 | 		n = len(reported_values)
 67 | 		
 68 | 
 69 | 		# compute the estimation for each value of the domain
 70 | 		for i in range(d):
 71 | 			sum_v = 0
 72 | 			for j in reported_values:
 73 |     			# Support(i) = {i | B[i] == 1}, thus the protocol supports only 
 74 | 				# the values that have 1 on the v-th bit of their matrix
 75 | 				if j[i] == 1:
 76 | 					sum_v += 1
 77 | 			# normalize the sum by trying to extract the noise
 78 | 			results[i] = ((sum_v) - n * q) / (p - q)
 79 | 			# if a negative sum is generated by the normalization, convert it to zero
 80 | 			if (results[i] < 0):
 81 | 				results[i] = 0
 82 | 
 83 | 		return results
 84 | 
 85 | class Unary_Encoding_aggregator():
 86 | 	def __init__(self, e, d, optimized=True, p=0, q=0):
 87 | 		# initialization of the protocol's constants
 88 | 		self.d = d
 89 | 		self.p = p
 90 | 		self.q = q
 91 | 		self.e = e
 92 | 		self.optimized = optimized
 93 | 		# if the user wants OUE, we initialize p and q 
 94 | 		# according to the theory, and based on epsilon
 95 | 		if self.optimized:
 96 | 			self.p = 1 / 2 
 97 | 			self.q = 1 / (math.exp(self.e) + 1)
 98 | 	
 99 | 	def aggregate(self, config):
100 | 		# define the needed variables from the configuration dict provided
101 | 		reported_values = config['reported_values']
102 | 		d = config['d']
103 | 
104 | 		p = self.p
105 | 		q = self.q
106 | 
107 | 		# array to store the results
108 | 		results = np.zeros(d)
109 | 		n = len(reported_values)
110 | 		
111 | 
112 | 		# compute the estimation for each value of the domain
113 | 		for i in range(d):
114 | 			sum_v = 0
115 | 			for j in reported_values:
116 |     			# Support(i) = {i | B[i] == 1}, thus the protocol supports only 
117 | 				# the values that have 1 on the v-th bit of their matrix
118 | 				if j[i] == 1:
119 | 					sum_v += 1
120 | 			# normalize the sum by trying to extract the noise
121 | 			results[i] = ((sum_v) - n * q) / (p - q)
122 | 
123 | 		return results


--------------------------------------------------------------------------------
/LDP/ldp.py:
--------------------------------------------------------------------------------
 1 | # %%
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import qif
 5 | 
 6 | eps = np.log(20)
 7 | d = 89               # domain size
 8 | trv = 3              # true value
 9 | theta_f = (np.sqrt( 4 * np.exp(eps) + 1) - 1) / 2
10 | theta = int(np.floor( theta_f ))
11 | a = theta * (theta + 1) / (3 * theta**2 - theta + d - 1)
12 | print()
13 | 
14 | m = sum([a / (abs(i - trv) * (abs(i - trv) + 1)) - a / (theta * (theta + 1)) for i in range(trv - theta + 1, 0)])
15 | values = [a / i - m / (d - 1) for i in range(1, theta)]
16 | 
17 | print(values)
18 | print(values[0])
19 | 
20 | def prob(i, x):
21 | 	if i == x:
22 | 		return a
23 | 	if (x - theta < 0):
24 | 		m = sum([a / (abs(i - x) * (abs(i - x) + 1)) - a / (theta * (theta + 1)) for i in range(trv - theta + 1, 0)])
25 | 		c = min( abs(i - x), theta)
26 | 
27 | 		return a / ( c * (c+1)) + m / (d - 1)
28 | 	
29 | 	if (x + theta > (d - 1)):
30 | 		m = sum([a / (abs(i - x) * (abs(i - x) + 1)) - a / (theta * (theta + 1)) for i in range(d, x + theta)])
31 | 		c = min( abs(i - x), theta)
32 | 		
33 | 		return a / ( c * (c+1)) + m / (d - 1)
34 | 
35 | 	c = min( abs(i - x), theta)
36 | 	return a / ( c * (c+1))
37 | 
38 | def rr(i, x):
39 | 	t = 1 / (d - 1 + np.exp(eps))
40 | 	if i == x:
41 | 		return np.exp(eps) * t
42 | 	else:
43 | 		return t
44 | 
45 | 
46 | area = sum([prob(trv - j, trv) for j in range(-theta, theta + 1)])
47 | 
48 | 
49 | print("eps:", eps)
50 | print("theta_f:", theta_f)
51 | print("theta:", theta)
52 | print("a:", a)
53 | print()
54 | print("prob(true):", prob(trv,trv))
55 | print("prob(area):", area)
56 | print("prob(others):", prob(0,trv))
57 | print()
58 | print("rr(true):", rr(trv,trv))
59 | print("rr(other):", rr(0,trv))
60 | print()
61 | 
62 | print("ratio:", prob(trv, trv) / prob(0,trv), np.exp(eps))
63 | print("ratio rr:", rr(trv, trv) / rr(0,trv), np.exp(eps))
64 | 
65 | dist = np.array([prob(i, trv) for i in range(0,d)])
66 | dist_rr = np.array([rr(i, trv) for i in range(0,d)])
67 | print("sum(dist):", sum(dist))
68 | print("sum(dist_rr):", sum(dist_rr))
69 | 
70 | 
71 | # def euclid(x, y):                       # ground distance
72 | # 	return abs(x-y)
73 | # kant = qif.metric.kantorovich(euclid)   # distance on distributions
74 | 
75 | # d1 = np.array([1,0,0])
76 | # d2 = np.array([0,0.5,0.5])
77 | # print(kant(d1, d2))
78 | 
79 | 
80 | #    a
81 | #   ----        c = min { |i-x|, theta }
82 | #   c(c-1)  
83 | 
84 | # %%
85 | 


--------------------------------------------------------------------------------
/LDP/random_response.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import pandas as pd
 5 | 
 6 | def randomized_response(true_value):
 7 | 	res1 = random.randint(0,1)
 8 | 	if (res1 == 1):
 9 | 		return true_value
10 | 	else:
11 | 		res2 = random.randint(0,1)
12 | 		if (res2 == 1):
13 | 			return 0
14 | 		else:
15 | 			return 1
16 | 
17 | 
18 | n_users = 30
19 | 
20 | users = [i for i in range(100, 10000, 100)]
21 | diffs = []
22 | print(users)
23 | 
24 | for n_users in users:
25 |     
26 | 	diff = 0
27 | 	for _ in range(100):
28 | 
29 | 		true_values = np.array([random.randint(0,1) for i in range(n_users)])
30 | 
31 | 
32 | 		new_values = np.array([randomized_response(i) for i in true_values])
33 | 
34 | 
35 | 		diff += abs(sum(new_values) - sum(true_values))
36 | 
37 | 	diff /= 10
38 | 	diffs.append(diff / n_users)
39 | 
40 | plt.plot(users, diffs)
41 | plt.xlabel("Number of Users")
42 | plt.ylabel("Accuracy Error")
43 | plt.show()


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Protection of Sensitive Data: Creating, Analyzing and Testing Protocols of Differential Privacy
 2 | 
 3 | The full paper of the thesis is available [here](https://pergamos.lib.uoa.gr/uoa/dl/frontend/en/browse/2958792)
 4 | 
 5 | The problem of preserving privacy while extracting information during data analysis, has been an everlasting one. Specifically, during the big­data era, user details can be easily compromised by a malicious handler, something considered both as a security, and as a privacy issue.
 6 | 
 7 | The optimal fix to the subject, is Differential Privacy, which is actually a promise, made by the data handler to the user, that they will not be affected, by allowing their data to be used in any analysis, no matter what other stud­ies/databases/info resources are available. Meanwhile, the output data statistics should be accurate enough for any researcher to extract useful information from them.
 8 | 
 9 | The goal of this thesis, is to examine and compare previously created mechanisms for D.P., while also creating our own mechanism, that serves to the purpose of achieving Local D.P., a form of Differential Privacy that is nowadays widely used in machine learning algorithms, aiming to protect the individuals that send their personal data for analysis. We will do so, by creating a library that is easy to use, and applies to all the rules of data privacy, and then extract conclusions from its use.
10 | 
11 | ## Analyzing and Testing of existing protocols
12 | 
13 | The first two chapters of the thesis are dedicated in testing libraries created, like the **IBM diffprivlib** and the **ARX Tool**. The directory  `ibm_lib_work` contains notebooks for testing the IBM library, and the directory `ARX_work`, contains Java code created in order to test the ARX API. 
14 | 
15 | ## Creating an LDP protocol
16 | 
17 | Local Differential Privacy (LDP), is a modern form of DP used in many real world application. The main downside of most LDP protocols, is their lack of efficiency when a small number of users contribute in the protocol. During this thesis, we aim to create a protocol to fix this probem, and we are introducing the **Distance Sensitive** protocol, which fufils exactly that promise. We conduct testings, and comparisons with other LDP protocols, which were implemented using Python. All our LDP work can be found in the directory `LDP`.
18 | 


--------------------------------------------------------------------------------
/ibm_lib_work/epsilon_measurements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/epsilon_measurements.png


--------------------------------------------------------------------------------
/ibm_lib_work/hist_metrics_euclidean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/hist_metrics_euclidean.png


--------------------------------------------------------------------------------
/ibm_lib_work/hist_metrics_kantorovich.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/hist_metrics_kantorovich.png


--------------------------------------------------------------------------------
/ibm_lib_work/increasing_ds_size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/increasing_ds_size.png


--------------------------------------------------------------------------------
/ibm_lib_work/simple_hists.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/ibm_lib_work/simple_hists.png


--------------------------------------------------------------------------------
/images/D.E. Idea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/D.E. Idea.png


--------------------------------------------------------------------------------
/images/Figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/Figure_1.png


--------------------------------------------------------------------------------
/images/Our Idea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/Our Idea.png


--------------------------------------------------------------------------------
/images/arx_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/arx_accuracy.png


--------------------------------------------------------------------------------
/images/arx_tool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/arx_tool.png


--------------------------------------------------------------------------------
/images/emd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/emd.png


--------------------------------------------------------------------------------
/images/epsilon_intro_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_intro_graph.png


--------------------------------------------------------------------------------
/images/epsilon_measurements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_measurements.png


--------------------------------------------------------------------------------
/images/epsilon_others_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_others_kant.png


--------------------------------------------------------------------------------
/images/epsilon_others_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_others_l1.png


--------------------------------------------------------------------------------
/images/epsilon_our_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/epsilon_our_kant.png


--------------------------------------------------------------------------------
/images/hierarchies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hierarchies.png


--------------------------------------------------------------------------------
/images/hist_metrics_euclidean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hist_metrics_euclidean.png


--------------------------------------------------------------------------------
/images/hist_metrics_kantorovich.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/hist_metrics_kantorovich.png


--------------------------------------------------------------------------------
/images/increasing_ds_size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/increasing_ds_size.png


--------------------------------------------------------------------------------
/images/local_vs_global.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/local_vs_global.png


--------------------------------------------------------------------------------
/images/nusers_others_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/nusers_others_kant.png


--------------------------------------------------------------------------------
/images/nusers_others_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/nusers_others_l1.png


--------------------------------------------------------------------------------
/images/rr_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/rr_results.png


--------------------------------------------------------------------------------
/images/simple_hists.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/simple_hists.png


--------------------------------------------------------------------------------
/images/true_answers_ldp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/true_answers_ldp.png


--------------------------------------------------------------------------------
/images/users_our_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/users_our_kant.png


--------------------------------------------------------------------------------
/images/users_our_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/images/users_our_l1.png


--------------------------------------------------------------------------------
/papers_used/10_sec17-wang-tianhao.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/10_sec17-wang-tianhao.pdf


--------------------------------------------------------------------------------
/papers_used/11_dpmetrics.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/11_dpmetrics.pdf


--------------------------------------------------------------------------------
/papers_used/12_LATENT_localDP.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/12_LATENT_localDP.pdf


--------------------------------------------------------------------------------
/papers_used/13_jcp-01-00004.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/13_jcp-01-00004.pdf


--------------------------------------------------------------------------------
/papers_used/14_Random_Matrix.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/14_Random_Matrix.pdf


--------------------------------------------------------------------------------
/papers_used/15_RAPPOR.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/15_RAPPOR.pdf


--------------------------------------------------------------------------------
/papers_used/1_privacybook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/1_privacybook.pdf


--------------------------------------------------------------------------------
/papers_used/2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/2_Dwork2006_Chapter_CalibratingNoiseToSensitivityI.pdf


--------------------------------------------------------------------------------
/papers_used/3_ibm_diffprivlib.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/3_ibm_diffprivlib.pdf


--------------------------------------------------------------------------------
/papers_used/4_k_anon+dp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/4_k_anon+dp.pdf


--------------------------------------------------------------------------------
/papers_used/5_arx_dp.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/5_arx_dp.pdf


--------------------------------------------------------------------------------
/papers_used/6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/6_Christofides2003_Article_AGeneralizedRandomizedResponse.pdf


--------------------------------------------------------------------------------
/papers_used/7_chatziko_locationguard_paper_1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/7_chatziko_locationguard_paper_1.pdf


--------------------------------------------------------------------------------
/papers_used/8_Differential_privacy_its_technological_prescriptiv.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/8_Differential_privacy_its_technological_prescriptiv.pdf


--------------------------------------------------------------------------------
/papers_used/9_localDP_Tutorial.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/papers_used/9_localDP_Tutorial.pdf


--------------------------------------------------------------------------------
/thesis_paper/GDP/ARX.tex:
--------------------------------------------------------------------------------
  1 | \section{Anonymized Dataset Producing Libraries}
  2 | 
  3 | As mentioned earlier, an other possible output of a mechanism that adds D.P. to a dataset can be the dataset itself, after being anonymized using certain algorithms that meet the criteria of D.P.
  4 | 
  5 | This technique does not yet have many different implementations, mostly due to the success of the previous model shown, as well as the difficulty, the computer power needed and the poor quality of the result being produced.
  6 | 
  7 | Producing an anonymized dataset is the way to go if someone is using earlier forms of data privacy, such as \emph{k-anonymity}, \emph{l-diversity} etc, which we are going to analyze moving forward. However, in order to cover the needs of D.P., several adjustments have to be made. The main idea behind all those libraries, lay behind a theorem, presented in [4], that mixes the use of those previously mentioned techniques with D.P.
  8 | 
  9 | In this Thesis, we are going to examine the \emph{ARX tool}, a tool for data anonymization, that supports the method that we are trying to implement. We are going to analyze this tool, and perform similar testings as in IBM. 
 10 | 
 11 | ARX is a tool for data anonymization, that in general, takes a dataset as an input, applies  privacy models, and produces an anonymized version of this dataset, thus offering protection to its members. The Menu of the ARX tool can be seen in the following \textbf{Figure 3.7.}
 12 | 
 13 | \begin{figure}[!htb]\centering
 14 |     \includegraphics[width=0.9\textwidth]{images/arx_tool.png}
 15 |     \caption{The ARX GUI tool}
 16 | \end{figure}
 17 | 
 18 | At its core, ARX uses a highly efficient globally-optimal search algorithm for transforming data with full-domain generalization and record suppression. The transformation of attribute values is implemented through domain generalization hierarchies, which represent valid transformations that can be applied to individual-level values.
 19 | 
 20 | \subsection{Classic Privacy Models}
 21 | 
 22 | The ARX tool offers standard privacy models that are tested in theory and are widely used to ensure anonymity given a plain dataset. Those consist of the implementation of the following protocols:
 23 | 
 24 | \begin{itemize}
 25 |     \item \textbf{K-anonymity}: A well-known privacy model that aims to protect datasets from re-identification in the prosecutor model. A dataset is $k$-anonymous if\emph{ each record cannot be distinguished from at least $k-1$ other records regarding the quasi-identifiers.} Each group of indistinguishable records forms a so-called equivalence class. 
 26 |     \item \textbf{l-diversity}: This privacy model can be used to protect data against attribute disclosure by ensuring that each sensitive attribute \emph{has at least $l$ "well represented" values in each equivalence class}. Different variants, which implement different measures of diversity, have been proposed.
 27 | \end{itemize}
 28 | 
 29 | Moreover, the tool uses some simple concepts of processing a dataset:
 30 | 
 31 | \begin{itemize}
 32 |     \item \textbf{Random Sampling}: A method of sampling that utilizes some form of random selection. In order to have a random selection method, we must set up some process or procedure that assures that the different units in the population have equal probabilities of being chosen.
 33 |     \item \textbf{Attribute Generalization}: Generalizing a column of the dataset, based on its values. The applications of attribute generalization depend on the type of records (eg. integers, ranges etc).
 34 |     \item \textbf{Record Suppression}: Deletion of a specific row on the input dataset.
 35 | \end{itemize}
 36 | 
 37 | Those are some techniques that are not going to be analyzed and tested in this thesis, however, if combined with D.P. can produce interesting results. Specifically, according to [4], the following theorem applies:
 38 | 
 39 | \emph{Random sampling} with probability $\beta$ followed by \emph{attribute generalization} and the \emph{suppression} of
 40 | every record which appears less than k times \emph{satisfies $(\epsilon, \delta)$ differential privacy} for every $\epsilon \geq -ln(1-\beta)$ with 
 41 | $$\delta = \max_{n:n \geq n_m} \sum_{j>\gamma_n}^{n}f(j;n,\beta)$$
 42 | 
 43 | where $n_m = \frac{k}{\gamma} - 1$, $\gamma = \frac{e^\epsilon-1+\beta}{e^\epsilon}$ and $f(j;n,\beta) = {n \choose  j} \beta^j(1-\beta)^{n-j}$.
 44 | 
 45 | In order to achieve attribute generalization, ARX uses the so called \emph{hierarchies}. They are either imported from a csv file, or being hard-coded into the API, and they are used in order to generalize a sensitive field. An example is given in \emph{Table 3.4}. The subject to generalize is the age of a person. Let's see the values as they proceed through generalization.
 46 | 
 47 | \begin{table}[!htb]
 48 |     \centering
 49 | 
 50 |     \caption{Generalization of data using hierarchies}
 51 |     \label{numbers}
 52 | 
 53 |     \begin{tabular}{| c | c | c | c |}
 54 |       \hline 
 55 |       $1^{st}$ level & $2^{nd}$ level & $3^{rd}$ level & $4^{th}$ level\\
 56 |       \hline
 57 |       1 & 0-4 & 0-9 & *\\
 58 |       \hline
 59 |       3 & 0-4 & 0-9 & *\\
 60 |       \hline
 61 |       5 & 5-9 & 0-9 & * \\
 62 |       \hline
 63 |       10 & 10-14 & 10-19 & *\\
 64 |       \hline
 65 |       18 & 15-20 & 10-19 & *\\
 66 |       \hline
 67 |     
 68 |     \end{tabular}
 69 | \end{table}
 70 | 
 71 | \subsection{Conducting D.P. Testings}
 72 | 
 73 | ARX provides a cross-platform graphical tool, that supports many different ways of anonymizing data, as well as an API that delivers those data anonymization capabilities to Java programs. We are going to use the latter, in order to create our own scripts for testing the tool and its accuracy.
 74 | 
 75 | In order to test the accuracy of the models used by ARX, we are going to run simple queries, on the datasets produced by the anonymization process. We want to eliminate the probability of extremely high noise generation, thus we are going to run the anonymization tool multiple times, and the output dataset will be constructed by the mean values of the fields.
 76 | 
 77 | As show on the above matrix, ARX hierarchies tend to replace every type of value with an interval. This is not desirable when applying the testings we mentioned. Thus, we had to come up with a better solution of defining hierarchies. The ARX GUI provides a wizard that gives a variety of choices so the user can easily create a hierarchy for many data types.
 78 | 
 79 | Another challenge is the number of layers that we are going to use, meaning how far our anonymization will proceed. In each layer, the number of same records increase exponentially, thus we do not want to apply many layers, in order for our results to be accurate, and the output dataset to be readable.
 80 | 
 81 | Given the help from Dr. Fabian Prasser, one of ARX's creators, we opted to treat the integer values as numbers, and in each level:
 82 | \begin{itemize}
 83 |     \item Group the rows by 2
 84 |     \item Apply a function according to the query we want to ask.
 85 | \end{itemize}
 86 |  
 87 | For example, if we want a counting query, the best option would be to apply an \emph{arithmetic mean} function to the group, thus the sum, the mean, the variance etc will be the same. The way that ARX preserves D.P. with those settings, is by record suppression. If that was not the case, the results would be identical to the input dataset. However, now, the output dataset will differ because of its lack of some rows of the input.
 88 |  
 89 | Regarding the layers problem, we opted to use 4 layers of anonymization, the last of whom will be the * value, meaning that every record is inseparable. We do not want this to happen early in our anonymization, but we do not want it to never happen either, because then we would have a privacy leak, if the dataset was too small.
 90 | 
 91 | The creation of the hierarchies for the salary column, can be shown in \textbf{Figure 3.8}, taken from the ARX GUI Hierarchy Creation Wizard.
 92 | 
 93 | \begin{figure}[!htb]\centering
 94 |     \includegraphics[width=0.8\textwidth]{images/hierarchies.png}
 95 |     \caption{Creating an Hierarchy using ARX GUI}
 96 | \end{figure}
 97 | 
 98 | 
 99 | \subsection{Metrics Used}
100 | We are going to test the applicability of the already given ARX mechanisms on a numerical dataset. Our goal is to run basic queries, such as mean value on the dataset's records. We are going to do that first by applying no DP at all, and then by using the API that is presented by ARX, helped by a simple java script that was built for this purpose.
101 | 
102 | 
103 | \subsection{The identity of the testing Dataset}
104 | The dataset that we are going to be looking at, contains sensitive data regarding NBA players' \emph{salaries} from the year 1990 until today. It also states other info about them, such as their \emph{age}, their\emph{ current team }and their \emph{position}. This particular data is not considered sensitive, as those numbers are widely available, however, when it comes down to certain people's salaries, applying D.P. in order to preserve their privacy is crucial.
105 | 
106 | \subsection{Process of running the queries}
107 | As we have earlier noted, the application of D.P. in ARX is rather complicated, specifically for the use that we are interested in: We want the output dataset to have numerical values in the earnings' column, in order to apply queries. 
108 | 
109 | For each column of the dataset, we have defined our own hierarchies. For every column except the `Salaries` one, this hierarchy is semantic, like the ones presented in our intro.
110 | 
111 | For the salaries column, with it being our goal to analyze, we opt to use the construction mentioned in our solution in the intro. We created 7 layers, in order to give the algorithm the ability to anonymize the dataset without the values being converted to `*`.
112 | 
113 | A sample of the result of the creation of the Salaries hierarchy is presented in \textbf{Table 3.5}, while the whole file is available in the GitHub distribution of the results of this Thesis.
114 | 
115 | \begin{table}[!htb]
116 |     \centering
117 | 
118 |     \caption{Hierarchy Levels created}
119 |     \label{numbers}
120 | 
121 |     \begin{tabular}{| c | c | c | c | c|}
122 |       \hline 
123 |       $1^{st}$ level & $2^{nd}$ level & $3^{rd}$ level & ... & $7^{th}$ level \\
124 |       \hline
125 |         79.568  &	291.029 & 500.776 &   	... &	*\\
126 |               \hline
127 |         502.491 &	291.029 &	500.776 &... &	*\\
128 |               \hline
129 |         522.738 &	710.524 &	500.776 &...	&   *\\
130 |               \hline
131 |         898.310 &	710.524 &	500.776 &... &	*\\
132 |               \hline
133 |         1.000.000 &	1.114.013 &   1.220.739 &...&	*\\
134 |               \hline
135 |         1.228.026 &	1.114.013 &   1.220.739 &... &	*\\
136 |               \hline
137 |     \end{tabular}
138 | \end{table}
139 | 
140 | Next up, we are going to setup the use the ARX API, which requires us to specify some variables in order to run Differential Privacy. Those variables are defined in the above Java code, and are those that were use in the actual testings.
141 | 
142 | \bigskip
143 | \bigskip
144 | \bigskip
145 | \bigskip
146 | \begin{lstlisting}[
147 | basicstyle= \footnotesize,
148 | language=Java]
149 |     EDDifferentialPrivacy criterion = new EDDifferentialPrivacy(2d, 1d / Rows);
150 | 
151 |     ARXConfiguration config = ARXConfiguration.create();
152 |     config.addPrivacyModel(criterion);
153 |     config.setSuppressionLimit(1d);
154 |     config.setHeuristicSearchStepLimit(100);
155 |     ARXResult result = anonymizer.anonymize(data, config);
156 | \end{lstlisting}
157 | \bigskip
158 | 
159 | 
160 | The basic principles that were followed for the above definitions are based on the following instructions and guidance by the ARX Tool documentations:
161 | \begin{itemize}
162 |     \item The delta value should not be 0, but is suggested to be set lower or equal than the reciprocal of the number of records.
163 |     \item A suppression limit should be set, preferably to 1.
164 |     \item In order to improve the quality of the data produced, a heuristic search step limit should be set, in order to tweak the ARX search algorithm that handles data suppression.
165 | \end{itemize}
166 | 
167 | Additionally, following the same principles as with the IBM library, we are going to run the D.P. query multiple times before reporting its value. We are going to do so, because the amount of noise generated can be extreme, and because of the low bounds of the heuristic search that we have set. We chose to run each query 1000 times, and then report the mean value of those runs as the result produced by the mechanism.
168 | 
169 | Because of the structure of the result of the ARX mechanism (a dataset containing numerical values), we can only run queries like \emph{sum} and \emph{mean}. There is no point in running a min or max query: we already know that the result will not be accurate. Thus, we are going to try to run a\emph{ mean value} numerical query in the anonymized dataset. The function we are using in order to run this typed of queries is the following:
170 | \bigskip
171 | \clearpage
172 | \begin{lstlisting}[
173 | basicstyle= \footnotesize,
174 | language=Java]
175 | protected static double run_query(ARXResult data, int targetColumn) {
176 | 	// iterator that we are going to use to access the data
177 | 	final Iterator<String[]> itHandle = data.getOutput().iterator();
178 | 	
179 | 	// result of the query
180 | 	double result = 0d;
181 | 	// length of the dataset
182 | 	int totalRecords = 0;
183 | 	
184 | 	// ignore the name of the column
185 | 	String[] name = itHandle.next();
186 | 	if (name.length <= targetColumn) {
187 | 		System.out.println("Target column out of bounds\n");
188 | 		return 0d;
189 | 	}
190 | 
191 | 	// iterate through all the values in the dataset
192 | 	while(itHandle.hasNext()) {
193 | 		String[] next = itHandle.next();
194 | 		// check that our target position is legal
195 | 		String string = next[targetColumn];
196 | 		if (!string.equals("*")) {
197 | 			result += Integer.parseInt(string);		
198 | 			totalRecords++;				
199 | 		}
200 | 	}
201 |     // return the __mean__ of the dataset
202 |     return result / totalRecords;
203 | }
204 | 	
205 | \end{lstlisting}
206 | \bigskip
207 | 
208 | Finally, before running the queries, we must mention that while using the ARX Tool the dataset size should be significant. In our case, it contains almost 13 contains thousands of rows. The dataset size is a critical parameter when applying D.P., while being even more essential during the use of the ARX tool.
209 | 
210 | \subsection{Statistical Queries}
211 | 
212 | As shown by the above Java function, our testings can support every type of statistical queries, such as \emph{mean value}, \emph{sum}, and \emph{average}. However, due to the computer power required and the similarity of the results that those queries produce, we will focus our testings solely to the mean value query.
213 | 
214 | Given the dataset previously analyzed, the true mean value of the salaries column of the dataset is $\$2.868.981,32$. This will be the value that we are going to use in order to examine the accuracy of the D.P. results.
215 | 
216 | \subsubsection{Running with fixed parameters}
217 | 
218 | We are going to conduct our first test by anonymizing our dataset using the default parameters, as we set ε $ = 1$ and δ $ = \frac{1}{12377}$.  The results are shown in the following table.
219 | 
220 | 
221 | \begin{table}[!htb]
222 |     \centering
223 | 
224 |     \caption{Mean value query in ARX with default parameters}
225 |     \label{numbers}
226 | 
227 |     \begin{tabular}{| c | c |}
228 |       \hline 
229 |         Non-DP result & DP Result \\
230 |       \hline
231 |         $\$2.868.981,32$ & $\$2.860.215,6$\\
232 |       \hline
233 |     \end{tabular}
234 | \end{table}
235 | 
236 | We observe that the query results, are somewhat close: We are in the range of millions of dollars, and the ARX mechanism only fails to approach the result by 8 thousand. This is not of course close to what the IBM library computed, but it is still a reliable result, given all the downsides of this type of anonymization.
237 | 
238 | 
239 | \subsubsection{Running with different epsilon values}
240 | 
241 | Next up, in order to determine if ARX follows the rules of D.P., we should try anonymizing the dataset for different values of epsilon, just like we did with the IBM library.
242 | 
243 | We observed during our initial runs that if the epsilon value rises above 2, the algorithm faces certain problems, that will be analyzed moving forward. With that being the case, the epsilon values chosen to conduct the measurements are in the range $[0.2, 1.6]$. The results from our testings are shown in the above \textbf{Figure 3.8.}
244 | 
245 | \begin{figure}[!htb]\centering
246 |     \includegraphics[width=1\textwidth]{images/arx_accuracy.png}
247 |     \caption{Accuracy Error results for increasing values of epsilon}
248 | \end{figure}
249 | 
250 | As we can see, the plot produces a linear type curve when the epsilon is below 0.8, and then stabilizes, unlike the Laplace distribution mechanisms, where the error curve is in logarithmic shape. We can not directly compare the results with the Laplace noise distribution, due to the different datasets used.
251 | 
252 | We generate the answers given by asking the query in the output dataset. Without its records being suppressed, the result dataset would have been perfect, because of the transformation of the data. However, when suppressing many records (nearly 10\% each time), the result could be severely altered, and thus the error plot, as we saw, is quite unpredictable.
253 | 
254 | 
255 | \subsection{Observations regarding the Algorithm}
256 | 
257 | During our testings in the dataset using the ARX mechanism, we observe the following regarding its behavior in the DP queries:
258 | 
259 | \begin{itemize}
260 |     \item The epsilon variable if raised above 2,5, makes the algorithm \emph{extremely slow}, to the point that it does not respond after minutes of execution. This makes sense, if we take into consideration that when epsilon increases, the accuracy gets better. Thus, the algorithm performs extreme searching techniques in order to find which records to suppress, resulting into slow execution.
261 |     \item In order for the algorithm not to produce only *(the last level in our hierarchies) in our target column, we set each of the other columns as \emph{non sensitive} in their definition.
262 |     \item As the epsilon values rise, \emph{the accuracy gets better}, as it is supposed to be, according to the DP principles.
263 |     \item While the dataset has multiple columns, the algorithm usually fails to present all of them with anonymized values, and just reports * in each row. This could have been a result of the high \emph{Heuristic Search Step Limit}, which was by default set to maximum. Despite us lowering its value, the phenomenon persists.
264 | \end{itemize}
265 | 
266 | \subsection{Conclusion}
267 | 
268 | While researching the ARX mechanism we came to the conclusion that it is for sure a whole different approach in Differential Privacy compared to the other libraries that we studied. With that being the case, it has some advantages and some disadvantages. Its main advantages are the following:
269 | 
270 | \begin{itemize}
271 |     \item The result of the mechanism is a \emph{handy dataset} that the user can handle in multiple ways and gain more information than just the result of a query.
272 |     \item The result \emph{can be iterated}, thus giving the option to the user to run the query in a smaller subset of the rows, while it being differential private.
273 | 
274 | \end{itemize}
275 | 
276 | On the other hand, the main disadvantages are:
277 | \begin{itemize}
278 |     \item The result can be misleading, because of the \emph{big accuracy error produced}.
279 |     \item The algorithm \emph{requires a rather big dataset} in order to run properly, while other libraries perform just fine with smaller datasets.
280 |     \item The algorithm is difficult to implement, as you have to create a self-made function for every query, and moreover tune many parameters if you want to run differential privacy.
281 | 
282 | \end{itemize}
283 | 


--------------------------------------------------------------------------------
/thesis_paper/GDP/DP_definition.tex:
--------------------------------------------------------------------------------
  1 | \chapter{PRINCIPLES OF DIFFERENTIAL PRIVACY}
  2 | 
  3 | During this chapter we are going to introduce the term of D.P., and its definition, alongside with the principles that need to be followed while applying it.
  4 | 
  5 | \section{Promise of Differential Privacy}
  6 | 
  7 | \par Differential Privacy is actually a promise made by the data handlers, to the participants of a study: "You will not be affected, adversely or otherwise, by allowing your data to be used in any study or analysis, no matter what other studies/ datasets/ info resources are available". 
  8 | \par The goal is to make the data widely available for analysis, while protecting the users. However, is it possible to learn nothing about an individual, while gathering useful information about a population? This is actually what D.P. is trying to achieve.
  9 | 
 10 | 
 11 | \section{Definition of Differential Privacy}
 12 | Before defining D.P., we must analyze some of the basic components of its definition.
 13 | 
 14 | \subsection{Randomized Response}
 15 | Randomized response is one of the earliest privacy mechanisms, that is used to conduct surveys where taboo behaviour is studied. The participants in those surveys are asked to answer truthfully, while they do not want to be stigmatized. There is a micro-world of what we are trying to achieve, thus we are going to give the algorithm of the randomized response in order to answer a binary (yes/no) question.
 16 | 
 17 | \begin{itemize}
 18 |     \item Flip a coin.
 19 |     \item If it lands on heads, answer truthfully
 20 |     \item If it lands on tails, flip another one
 21 |     \item If it lands on heads, answer no, else, answer yes
 22 | \end{itemize}
 23 | 
 24 | We are going to analyze this algorithm and its success in later chapters, but for now, it is enough to know that there exists a simple mechanism that adds noise, and is rather accurate for large samples.
 25 | 
 26 | Before giving the definition of D.P., we must define its components. 
 27 | 
 28 | \begin{itemize}
 29 |     \item \textbf{Probability Simplex}, given a discrete set $B$, is denoted as $\Delta(B)$ and is defined to be:
 30 |     \begin{align*}
 31 |         \Delta(B) = \{ x\in R^{|B|}: x_i \geq 0 \text { } \forall i \text{ and } \sum_{i=1}^{|B|} x_i = 1\}
 32 |     \end{align*}
 33 |     \item A \textbf {Randomized algorithm} $M$ with domain $A$ and discrete range of results $B$, is associated with the mapping $M: A\rightarrow\Delta(B)$. 
 34 |     \item \textbf{Distance between Databases:} The $l_1$ norm of a database x is denoted $||x||_1$ and it is defined to be: $||x||_1 = \sum_{i = 1}^{|x|} |x_i|$. Thus, the $l_1$ distance between 2 databases $x$ and $y$, is $||x-y||_1$, and the size of a database $x$ os $||x||_1$.
 35 |     
 36 | \end{itemize}
 37 | 
 38 | \subsection{Definition}
 39 | Differential Privacy is defined as following:
 40 | \\
 41 | \\
 42 | A randomized algorithm $M$ with domain $N^{|x|}$ is (ε,δ)-differentially private, if for all $S \in Range(n)$ and for all $x,y \in N^{|x|}$ s.t. $||x - y||_1 \leq 1$
 43 | $$ Pr[M(x) \in S] \leq e^\epsilon Pr[M(y) \in S] + \delta$$
 44 | 
 45 | where the probability space is over the coin flips of the mechanisms $M$. If $\delta = 0$, we say that $M$ is ε-differentially private.
 46 | 
 47 | \\
 48 | It should be noted that D.P. is rather a definition than a strict algorithm. While relying on the definition of D.P., we can create different algorithms, which will all ensure that the result will be deferentially private. This allows us to create different forms of D.P., that will be analyzed later on this thesis.
 49 | 
 50 | The whole point of Differential Privacy, is that the output of a D.P. mechanism, should by \emph{independent} of whether or not an individual is present in the domain $N$. The "ability" of the adversary to recognise the existence of a column in the dataset, is regularized by epsilon.
 51 | 
 52 | \section{The meaning of epsilon}
 53 | It is made clear from the above definition, that if we have a computational task, we might find different algorithms for applying D.P., but the result will always be of the same form: each user of the dataset, will get ε-D.P.. But what does the epsilon parameter actually mean?
 54 | 
 55 | By reading the mathematical equation, we observe that the higher the value of epsilon, the bigger the difference between the two probabilities (minimum and maximum). Thus, we extract the following statement about the value of epsilon during the application of Differential Privacy:
 56 | 
 57 | \begin{itemize}
 58 |     \item The \emph{lower the epsilon} value, the \emph{higher the privacy} guarantees for the users of the dataset.
 59 |     \item The \emph{higher the epsilon} value, the \emph{more accurate the results} produced.
 60 | \end{itemize}
 61 | 
 62 | In practice, epsilon values vary in the range $(0,5]$, as lower values are prohibited, and higher values are considered extreme cases. However, as mentioned in [1],  when epsilon is small, failing to be (ε,0)-differentially private is not necessarily alarming, if our algorithm is linearly increasing with ε (ex (2ε,0)-D.P). This happens because of the nature of the epsilon parameter, which guarantees very strict boundaries between databases. However, when ε increases by a lot, users' privacy suffers. 
 63 | 
 64 | In \textbf{Figure 2.1}, we can see in general terms, the function between the epsilon and the accuracy error, as well as the protection guaranteed. We will discuss in later sections the details on how these graphs are created, but now is a good time to get an overall picture of the accuracy error produced when applying D.P.
 65 | \bigskip
 66 | \bigskip\bigskip
 67 | 
 68 | \begin{figure}[!htb]\centering
 69 |       \includegraphics[width=0.6\textwidth]{images/epsilon_intro_graph.png}
 70 |   \caption{Accuracy Error as a function of epsilon}
 71 | \end{figure}
 72 | 
 73 | \section{Different forms of Differential Privacy}
 74 | 
 75 | As mentioned during the definition, due to the room that is left for its interpretation, there can be many forms of Differential Privacy. There are two major fields recognized, the \emph{Global D.P.} and the \emph{Local D.P.}.
 76 | 
 77 | Their major difference is the curator of the data. In the Global model, the curator must be trusted, as he collects the non-private data and has to pass them through a D.P. algorithm.
 78 | 
 79 | On the other hand, in the Local model, the curator may as well be untrusted, since the users perturb their data on their own, using a specific protocol. The key differences of the two forms are shown in the \emph{Figure 2.2} below.
 80 | 
 81 | An other difference between the two models, is the amount of noise added. With the absence of a trusted curator, the users themselves must add a significant amount of noise into their data, in order to preserve their privacy. This of course results into a need of many users (several thousands), in order for the L.D.P. protocols to function correctly and accurately.
 82 | 
 83 | 
 84 | \begin{figure}[!htb]\centering
 85 |       \includegraphics[width=0.3\textwidth]{images/local_vs_global.png}
 86 |   \caption{Differences between LDP and GDP}
 87 | \end{figure}
 88 | 
 89 | In this thesis, we are going to examine both models, by quoting their definitions, observing already-existing algorithms, and creating our own L.D.P. protocol.
 90 | 
 91 | \section{Existing Problems of D.P.}
 92 | 
 93 | As every new step in Computer Science, Differential Privacy has some issues that are yet to be solved, and some others not covered by its definition. 
 94 | 
 95 | One major problem is the behaviour of the protocols \emph{when the number of users is limited}. The definition of D.P. is based on the alteration of the data in order not to reveal sensitive information. Thus, if a small amount of users are involved in those protocols, the accuracy of the results might be way off the standards that we set, in order to satisfy the epsilon requirements of the user.
 96 | 
 97 | Another (unsolvable) issue, that mainly lies on the basis of surveys, is  \emph{the possibility that conclusions drawn from a survey may reflect statistical information about an individual}.
 98 | 
 99 | For example, if a survey about the correlation of smoking and dental problems is conducted, someone that has specific dental problems might be deemed as a smoker, despite keeping his privacy about the fact that he is smoking, during the survey. That is something that D.P. does not promise: unconditional freedom from distinguishing. This is not however a violation of the definition of D.P., as the survey teaches us that specific private attributes correlate with public observable attributes, since this correlation would be observed independent of the presence or absence of the individual in the survey.
100 | 
101 | There are several more issues as the ones covered above, however we are not going to focus on those, rather on the advantages of D.P.


--------------------------------------------------------------------------------
/thesis_paper/GDP/Intro.tex:
--------------------------------------------------------------------------------
 1 | \chapter{INTRODUCTION}
 2 | 
 3 | \section{Need for Privacy}
 4 | 
 5 | 
 6 | \par In our days, data is everywhere, including our smartphones, our computers our TVs, even our watches. Every device and nearly every website track down data, in order to provide more personalized services. This, of course, is desired by the users, as they are more likely to see relevant advertisements, and in general, have a more unique experience while they are using their devices.
 7 | 
 8 | \par At the same time, the services that track down the data are also benefited, because of the way that science evolves: Experiments need to be made, thus the more available data in order to conduct them, the better. As an example, we might think the medical community: when someone logs-in to the hospital, it is beneficiary for the doctors to gather his data, in order to study his decease, and his potential recovery, not only for the shake of the patient, but also for the further study of the decease. 
 9 | 
10 | \par While providing data may seem inevitable and yet beneficiary for all parties, there is always a risk that this data will be used in order to compromise the user's privacy. When the information lands in the wrong hands, it can expose some characteristics of the user that he does not want to be shared. In our medical example, let's now consider a patient with a rare decease, who logs-in to a local hospital. He might consent to share his personal data (name, age etc.), but only for the doctors to use it. What will happen when the doctors give the data of the whole hospital for analysis? This patient, considering he is one of the few that has this illness, may be stigmatized,  when the data analysts find out his condition. Wouldn't it be better for him, if, let's say, his name was not exposed? We will see later on, why this approach, is found to be successful, but not enough, for extreme cases.
11 | 
12 | \section{Definition of the problem of privacy}
13 | \par In general, when we consider the \emph{problem of privacy}, we refer to the protection of the disclosure of sensitive information of individuals, when a collection of data about these individuals (dataset) is made publicly available.
14 | 
15 | \subsection{Achieving Privacy via Anonymization}
16 | 
17 | \par One of the first, and rather successful attempts for preserving privacy, was anonymization, meaning removing all personal identifiers from the dataset. This technique is further developed, using famous algorithms like k-anonymity, l-diversity etc. However, there are several problems with this approach. Firstly, they are very computational heavy, as their complexity rises up to an exponential one, making the anonymization of a large dataset very slow. Also, the anonymization does not guarantee that the user will remain private, if other datasets are not anonymized. Let's once again consider our example. Suppose our patient goes to two separate hospitals for his treatment, and one of them uses the best anonymization techniques, while the other one provides the data without any form of privacy. Our patient is on both of the datasets, thus the techniques adopted by the first hospital are now useless. This expands to the real world, because, no matter how careful you (and the services that you use) are, a single data breach is enough for you to be compromised. 
18 | 
19 | \par So, right now, things seem a bit pessimistic, supposing that anonymization, no matter how well performed, can not fix our problem. Another successful technique, that is used on many fields, is the addition of noise. During a later section, we are going to examine in which ways it can benefit us while trying to solve our problem.
20 | 
21 | \subsection{Achieving Privacy via Randomization}
22 | 
23 | Randomization can be applied to the data of the users in two different forms:
24 | \begin{itemize}
25 |     \item Apply random noise \emph{directly to the data}. This will result to altered data, which will then be processed, so that the adversary will not be able to individualize the entries in the dataset.
26 |     \item Apply random noise to \emph{queries asked to the dataset}. In that case, the dataset is not directly available to the analysts. Instead, they are allowed to ask questions to the dataset, and the answers are then being randomized, and returned.
27 | \end{itemize}
28 | 
29 | Both the above approaches are utilized, but the second one is widely preferred. During our analysis of data privacy, we are going to dive in both of those techniques, as well as the libraries that they are used in.
30 | 
31 | \par As we can see, the randomization method looks good in theory, but we must answer to several questions before implementing it, such as:
32 | 
33 | \begin{itemize}
34 |     \item How can we define privacy for noisy queries?
35 |     \item What type of noise do we need?
36 |     \item What should we do in the case of extreme amount of noise added?
37 | \end{itemize}
38 | 
39 | We are going to answer those questions later on, during our next chapters. 
40 | 
41 | \section{Goal of this thesis}
42 | As discussed in the introduction, that the most effective up-to date method for applying privacy into a dataset, is via randomization. The method used, is called \emph{Differential Privacy}, and is based on injecting noise into the users' data. 
43 | 
44 | The theory behind this method includes many mathematical theorems, however, it can by easily explained. We will proceed by taking a look on those principles, and analysing the theory behind this form on data privacy. Then, we will proceed by examining some existing applications of D.P., especially some libraries that help us to apply this technique in a dataset. Finally, we are going to create our own library in order to apply Local D.P., a form of privacy that we will discuss in the next chapter. 
45 | 
46 | This library will allow a user to fully anonymize a dataset, and afterwards create histogram and counting queries for this dataset. During the implementation of this library, a new protocol will be introduced, which follows the rules of D.P., and produces better results than many already-existing protocols.
47 | 


--------------------------------------------------------------------------------
/thesis_paper/LDP/intro.tex:
--------------------------------------------------------------------------------
 1 | \chapter{A LIBRARY FOR LOCAL DIFFERENTIAL PRIVACY}
 2 | 
 3 | \section{Introduction in Local DP}
 4 | 
 5 | As we mentioned in previous chapters, there are two major forms of Differential Privacy. Having analyzed and tested the first one, \emph{Global D.P.}, it is now time to examine \emph{Local D.P.}, by explaining some possible protocols, as well as building our own.
 6 | 
 7 | 
 8 | In Local D.P., there is a significant difference compared to Global DP: there is \emph{no trusted curator} between the data and the users, as they just want to send their data, while already being anonymized. Thus, an algorithm must perturb the data before sending it to the untrusted curator, who will then transmit it to the analysts. 
 9 | 
10 | In order to achieve that goal, the user must randomize the value before making it public (i.e. sending it to the untrusted curator). Then, the curator which collects the data (we will reference to him as aggregator moving forward), collects the data and tries to retrieve their original values, with a goal of producing the most accurate results possible. 
11 | 
12 | Thus, each LDP algorithm has the following steps:
13 | 
14 | \begin{itemize}
15 |     \item Each user encodes, and then perturbs the private value that he wants to make public
16 |     \item Each user sends out the result of the perturbation process, with that being only the final value, as they keep the intermediate results for themselves
17 |     \item The untrusted data curator collects each user's value, and implements some kind of aggregation in order to retrieve the stats that he wants from the data given to him.
18 | \end{itemize}
19 | 
20 | In comparison with Global D.P., the Local model has advantages, as well as disadvantages. 
21 | Its main advantages are:
22 | \begin{itemize}
23 |     \item The user is not forced to trust the data curator, as only the perturbed value is reported
24 |     \item Simpler implementation of the algorithms, due to the district steps taken by both sides.
25 | \end{itemize}
26 | 
27 | while the main disadvantages are the following:
28 | 
29 | \begin{itemize}
30 |     \item The noise added should be larger than the Global model, in order to satisfy the definition, thus the number of people in the dataset should be significant for accurate results to be produced.
31 |     \item Because this is not always possible, many real-world applications use extremely high values of epsilon compared to what we got used to during our testing in the Global models.
32 | \end{itemize}
33 | 
34 | During this Thesis, concern was raised for the main disadvantage of L.D.P., and thus\emph{ we will present a new protocol aiming to reduce the need for many users, while still covering the definition.} However, the definition for L.D.P. is quite different than the Global model one's.
35 | 
36 | \section{Definition of Local DP}
37 | 
38 | Having a general idea in how Local D.P. functions, it is now time to give a strict definition that we are going to depend our work on moving forward.
39 | 
40 | We can say that an algorithm $A$ satisfies ε-Local Differential Privacy, if and only if for any input $v_1$, $v_2$, we have
41 | 
42 | $$ \forall y \in Range(A):\ Pr[A(v_1) = y] \leq e^{\epsilon} * Pr[A(v_2) = y] $$
43 | 
44 | where $Range(A)$ denotes the set of all possible outputs of the algorithm $A$.
45 | 
46 | As mentioned in Chapter 2, this definition can have many interpretations by different algorithms or protocols, but each one must produce a probabilistic space whose elements must satisfy the above equation.
47 | 
48 | 
49 | \section{Simple Application of LDP}
50 | 
51 | The most simple of L.D.P. protocols is already mentioned in this Thesis, and is no other than the \emph{Randomized Response} protocol. This algorithm implements the three steps mentioned in the introduction, as the user chooses a value (Yes or No), perturbs it (by the flipping of the coins), reports the perturbed value, with the sole job of the aggregator being to collect, normalize and report the values provided. It meets the definition of L.D.P., as the fraction of a pair of probabilities in the space of possible outputs (Yes, No) has always the ceiling of a real number. 
52 | 
53 | Our goal is to now find this ceiling, and thus denote the level of privacy that randomized response offers. In order to do this, we are going to select the possibility of the user having chose the answer "Yes". A simple case analysis shows that $Pr[Yes | Truth] = \frac{3}{4}$, and of course $Pr[Yes | False] = \frac{1}{4}$. Thus, by the definition of L.D.P., we have 
54 | 
55 | \begin{align*}
56 |     \frac{Pr[Yes | Truth]}{Pr[Yes | False]} = \frac{\frac{3}{4}}{\frac{1}{4}} = 3 = e^\epsilon \Longleftrightarrow \epsilon = ln(3)
57 | \end{align*}
58 | 
59 | Thus, R.R. offers  $ln(3)$-differential privacy to its users. This is quite a good setting, but the restriction is that the user can only report 2 values, something not suitable for modern problems and surveys. 
60 | 
61 | 
62 | 
63 | In R.R., we care about the total true answers of the users, and not the individual responses. Thus the metric we are going to use is the \emph{absolute difference of the sum of the 2 vectors: the one with the truthful answers, and the one with the reported answers}. We are going to divide this result with the number of the users, in order to get the scale of the error depending on the size of the vector that was reported. The metric is expressed from the following function:
64 | 
65 | \begin{align*}
66 |     \text{Error} = \frac{|\sum \text{true\_values} - \sum \text{reported\_values}| }{\text{number\ of \ users}}
67 | \end{align*}
68 | 
69 | As always, during the creation of probabilistic distributions, one run is not enough, because of the extreme amount of noise that can occur. Thus, for each number of users we are going to run the R.R. protocol 100 times, and the final accuracy error will be produced by the mean value of those runs.
70 | 
71 | Having implemented R.R. in Python, we can now display the accuracy error of R.R. as the number of users rises. The results of the testings are shown bellow in \textbf{Figure 4.1}.
72 | 
73 | \begin{figure}[!htb]\centering
74 |     \includegraphics[width=0.8\textwidth]{images/rr_results.png}
75 |     \caption{Accuracy Error in R.R for increasing values of epsilon}
76 | \end{figure}
77 | 
78 | 
79 | We observe that the plot behaves as expected: the protocol produces a logarithmic curve for the accuracy error, while for a large number of users (over 3000), the error stabilizes bellow $0.1$. 


--------------------------------------------------------------------------------
/thesis_paper/LDP/other_protocols.tex:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | \section{Existing Protocols for Local DP}
  4 | 
  5 | Apart from R.R., several L.D.P. protocols have been implemented during the years, with many of them being widely used by companies in order to protect users' data. One of the most famous protocols is \emph{RAPPOR}([15]), created by Google, and being currently used in the Chrome browser for the company to provide useful info to its users without compromising their privacy. Also, Apple has created ts own protocol of L.D.P., and utilize it in their products. 
  6 | 
  7 | However, we are not going to focus on those protocols moving forward, than the ones presented in [10], a paper which introduces many algorithms for L.D.P., each one with different perturbation techniques and suitable for different circumstances.
  8 | 
  9 | During this chapter we are going to give a definition of each algorithm, implement it using Python, and compare the accuracy results produced by those protocols, just like during our testings of the G.D.P. models. Each protocol has two parts: the \emph{users} and the \emph{aggregator}. For the users we must each time define the following functions:
 10 | 
 11 | \begin{itemize}
 12 |     \item $Encode()$: Encodes the true value that the user wants to report
 13 |     \item $Perturb()$: Perturbs the encoded value, in order to produce the random value that will be reported
 14 | \end{itemize}
 15 | 
 16 | For the aggregator we must each time define the  $Aggregate()$ function, that collects the reported random values of the users, and produces the results according to the model.
 17 | 
 18 | \subsection{Basic RAPPOR}
 19 | As mentioned earlier, RAPPOR is a protocol created by Google. Its simpler form, Basic RAPPOR is used in Chrome, where it collects answers to questions such as the user's home page. The protocol's functions are the following:
 20 | 
 21 | \textbf{Encoding:} $Encode(v) = A_0$, where $A_0$ is a d-bit vector, such that: $A_0[v] = 1$ and $A_0[i] = 0$ for every $i \neq v$. 
 22 | 
 23 | \textbf{Perturbation:} The perturbation consists of 2 steps: the permanent and the instantaneous. The permanent one is carried out only one time, and is the following: 
 24 | 
 25 | \begin{equation*}
 26 |     Pr[A_1[i] = 1] =
 27 | 	\begin{cases}
 28 | 		1 - \frac{1}{2}f & \mbox{if } A_0[i]=1 \\
 29 | 		\frac{1}{2}f & \mbox{ otherwise}
 30 | 			\end{cases}
 31 | \end{equation*}
 32 | 
 33 | The instantaneous step is carried out every time a user reports a value, and is defined as:
 34 | 
 35 | \begin{equation*}
 36 |     Pr[A_2[i] = 1] =
 37 | 	\begin{cases}
 38 | 		p & \mbox{if } A_1[i]=1 \\
 39 | 		q & \mbox{ otherwise}
 40 | 			\end{cases}
 41 | \end{equation*}
 42 | 
 43 | We observe from the above functions, that the user must define the $f, p $ and $q$ parameters. Google suggests that we set $f = \frac{1}{2}$ or $\frac{1}{4}$, and $p = 0.75$, thus $q = 0.25$. During our testings, those exact parameters were used. 
 44 | 
 45 | 
 46 | \subsection{Random Matrix Projection}
 47 | In [14], a protocol with a random matrix projection is proposed, introducing an additional setup step.
 48 | 
 49 | \textbf{Setup:} A random and uniform matrix is generated before any encoding, with it being public and drawn as: $\Phi \in \{-\frac{1}{m}, \frac{1}{m}\}^{m \times d}$, where $m$ and $d$ are user defined. In our testings, we opt to set $m = 5$ and $d = 10$.
 50 | 
 51 | \textbf{Encoding:} When it comes down to encoding, the function used is the following: $Encoding = (r,x)$, where $r$ is uniformly randomly selected from the range of m, and $x$ is the v-th element of the r-row of the random matrix.
 52 | 
 53 | \textbf{Perturbation}: The perturbation function is defined as following: 
 54 | \begin{align*}
 55 |  Perturb(r,x) = (r, b\cdot c \cdot m \cdot x) 
 56 | \end{align*}
 57 |  where
 58 | \begin{equation*}
 59 |     b =
 60 | 	\begin{cases}
 61 | 		1 & \mbox{with } p = \frac{e^\epsilon}{e^\epsilon + 1} \\
 62 | 		-1 & \mbox{with } q = \frac{1}{e^\epsilon + 1}			
 63 | 	\end{cases}
 64 | \end{equation*}
 65 | and $c = \frac{e^\epsilon +1}{e^\epsilon -1}$
 66 | 
 67 | \textbf{Aggregation:} Given all the tuples reported by $j$ users in the form $(r, y)$, the estimation for the i-th value of the dataset, is produced by 
 68 | \begin{align*}
 69 | \sum_{j} y^j \cdot \Phi[r^j,i]
 70 | \end{align*}
 71 | 
 72 | \subsection{Pure Protocols}
 73 | 
 74 | The following protocols are presented in [10], and are called "pure" protocols, because of the way they aggregate the data produced by the user. For each one of them, we should define a $Support()$ function, that indicates for each value of the possible outcomes, the reported values that are supported. Thus, with the notation $\sum_{j} Support(y^j)$, we mean the sum of all the supported values of the y-th element of the dataset.
 75 | 
 76 | Also, for a protocol to be pure, two probabilities must be defined, $p^*$ and $q^*$, where the first notes the probability that the true value is supported by an element $y$, and the second one the probability of another value is supported by the element $y$. The protocol is pure if and only if $p^* > q^*$.
 77 | 
 78 | If a protocol is pure, the estimation of the total reported values for an element of the dataset $i$, is the following:
 79 | 
 80 | \begin{align}
 81 |     \text{Estimation} = \frac{\sum_{j} 1_{support(y^j)}(i) - nq*}{p^* - q^*}
 82 | \end{align}
 83 | where $j$ denotes the j-th user reporting their value, and $n$ the total size of the vector of the reported values.
 84 | 
 85 | 
 86 | \subsubsection{Direct Encoding}
 87 | 
 88 | This protocol is the natural method of extending the Randomized Response, without the limitation of 2 possible answers. 
 89 | 
 90 | \textbf{Encoding:} The protocol does not feature an encoding procedure, thus 
 91 | 
 92 | \begin{align*}
 93 |     Encode(v) = v
 94 | \end{align*}
 95 | 
 96 | \textbf{Perturbation:} The perturbation is based on the epsilon setting given by the user, and its function is defined as following:
 97 | 
 98 | \begin{equation*}
 99 |     Pr[Perturb(x) = i] =
100 | 	\begin{cases}
101 |     	p = \frac{e^\epsilon}{e^\epsilon + d - 1} & \mbox{if } i = x \\
102 |     	q = \frac{1}{e^\epsilon + d - 1} & \mbox{if } i \neq x 
103 | 	\end{cases}
104 | \end{equation*}
105 | 
106 | where $d$ the size of the dataset of the possible answers, $x$ the true value and $i$ the value selected.
107 | 
108 | \textbf{Aggregation:} The protocol is pure with $p^* = p$, $q^* = q$ and $Support(i) = i$, thus the predicted results for each of the dataset's values can be calculated from the Equation 4.1.
109 | 
110 | We observe that this protocol strongly depends on the size of the dataset of the possible answers, thus when the dataset size increases, the protocol becomes less accurate, due to the decreased probability of selecting the truth. Moreover, for the D.E. protocol, all the false values have the same probability to get chosen, a rather disturbing detail for a query such as a person's age. We will return to these thoughts on later sections.
111 | 
112 | \subsubsection{Histogram Encoding}
113 | 
114 | An other protocol presented is Histogram Encoding, where an input when having $d$ options is encoded as a $d$-length vector.
115 | 
116 | \textbf{Encoding:} The encoding function is for the protocol is
117 | 
118 | \begin{align*}
119 |     Encoding(v) = [0, 0, \dots, 1, \dots, 0]
120 | \end{align*}
121 | 
122 | where only the v-th element of the vector is equal to 1.
123 | 
124 | \textbf{Perturbation:} The result of perturbing the encoded vector, is a new vector $B'$, s.t.: 
125 | 
126 | \begin{align*}
127 |     B'[i] = B[i] + Lap(\frac{2}{\epsilon})
128 | \end{align*}
129 | where $Lap()$ denotes the noise drawn from the Laplace distribution, where
130 | 
131 | 
132 | \begin{align*}
133 |     Pr[Lap(\beta) = x] = \frac{1}{2\beta}e^{\frac{-|x|}{\beta}}
134 | \end{align*}
135 | 
136 | \textbf{Aggregation:} Several methods are proposed for aggregating the results created by the H.E. protocol, but as mentioned by the authors, the best one is called \emph{Thresholding with H.E.}, where a threshold value is introduced in order to decide what to keep from the reported values. The support function is altered as following:
137 | 
138 | \begin{align*}
139 |     Support(B) = \{v | B[v] > \theta\}
140 | \end{align*}
141 | thus, if a noisy output is grater than theta, is set to support the corresponding value. According to the authors, the optimal value for θ is in the range of $(\frac{1}{2}, 1)$. During the testings that are going to be conducted, we are going to use a threshold of $\frac{2}{3}$.
142 | 
143 | 
144 | Comparing this protocol to D.E., we observe that is solves the problem of the dependence of the noise drawn by the number of options to choose from. In H.E., no matter how large the domain size is, the noise solely depends on the epsilon value chosen by the user. Thus, when having a large domain size, it is clear that we should prefer the H.E. protocol over D.E.
145 | 
146 | \subsection{Unary Encoding}
147 | 
148 | The last protocol that is going to take part in the accuracy testings, is the Unary Encoding method, a further exploration of the Basic RAPPOR. It is a unique protocol, as the user does not set the level of privacy using epsilon, but by giving two probabilities, $p$ and $q$ and the epsilon value is computed using those two parameters.
149 | 
150 | \textbf{Encoding:} Exactly like in the H.E. method:
151 | \begin{align*}
152 | Encoding(v) = [0, 0, \dots, 1, \dots, 0] 
153 | \end{align*}
154 | where only the v-th element of the vector is equal to 1.
155 | 
156 | \textbf{Perturbation:} This step is different than those that we already saw, and is carried out using the following function:
157 | 
158 | 
159 | \begin{equation*}
160 |     Pr[B'[i] = i] =
161 | 	\begin{cases}
162 |     	p  & \mbox{if } B[i] = 1 \\
163 |     	q & \mbox{if } B[i] = 0 
164 | 	\end{cases}
165 | \end{equation*}
166 | 
167 | The epsilon value is decided given $p$ and $q$, and is defined as following:
168 | 
169 | \begin{align*}
170 |     \epsilon = ln(\frac{p\cdot(1-q)}{(1-p)\cdot q})
171 | \end{align*}
172 | 
173 | \textbf{Aggregation:} The Support function is once again altered, as in the U.E. protocol is defined as following:
174 | 
175 | \begin{align*}
176 |     Support(B) = \{i | B[i] = 1\}
177 | \end{align*}
178 | 
179 | and of course, $p^* = p$ and $q^* = q$, in order to make the protocol pure. As for the choice of $p$ and $q$, we opt to choose $p = \frac{1}{2}$, and $q = \frac{1}{e^\epsilon + 1}$.
180 | 
181 | \section{Testings}
182 | 
183 | \subsection{Setup}
184 | 
185 | Now that all those protocols where introduced, we are going to compare them in order to decide which is better to use when wanting to apply L.D.P. in a dataset. We are going to use a dataset that was created using random values, but corresponds to the age of a group of people. The distribution of the values of the dataset is shown in the histogram in \textbf{Figure 4.2. }
186 | 
187 | \begin{figure}[!htb]\centering
188 |     \includegraphics[width=1\textwidth]{images/true_answers_ldp.png}
189 |     \caption{True Answers for the Dataset of LDP}
190 | \end{figure}
191 | 
192 | Each user will report one of these 50 values, and the aggregator of each protocol will gather the data given, and try to re-create this histogram in the best manner possible.
193 | 
194 | \subsection{Goal}
195 | 
196 | We want to decide which protocols behaves better, thus a number of different metrics will be used. The main focus of our testings will be the vectors that the aggregators provide, which we will compare with each other, as well as with the vector containing the true answers. In a similar way as our G.D.P. testings, we are going to run the protocols for different values of epsilon, and different number of users used. The second one is extremely important in L.D.P., as we mentioned earlier that many protocols struggle with a small number of input, as the noise drawn is significant. 
197 | 
198 | With respect to the choice of metrics, we are going to use the \emph{Manhattan Distance}, known as the $l1$-norm, as well as the \emph{Kantorovic Distance}, explained in 3.2.5.1.
199 | 
200 | \subsection{Epsilon Measurements}
201 | 
202 | The first comparison between the protocols will be with a changing epsilon value, in order to observe how they behave for lower and higher values of the privacy setting. During these testings, all of the users of the dataset were used (approximately 20 thousand), and each run of each protocol was carried out 10 times, just like in other testings, in order to eliminate the danger of drawing extreme values of noise.
203 | 
204 | First, we are going to run all the protocols and compare them using the Manhattan Distance. The results are shown in \textbf{Figure 4.3.}
205 | 
206 | 
207 | \begin{figure}[!htb]\centering
208 |     \includegraphics[width=1\textwidth]{images/epsilon_others_l1.png}
209 |     \caption{Epsilon Measurements compared by Manhattan Distance}
210 | \end{figure}
211 | 
212 | 
213 | We gather many useful observations from the graph:
214 | 
215 | \begin{itemize}
216 |     \item The Random Matrix protocol does not function as expected, as \emph{its accuracy does not follow the logarithmic curve we are used to when epsilon increases.} However, for small values of ε, its results are acceptable, and some times even better than the pure protocols.
217 |     \item \emph{The pure protocols behave in a similar way}, with the error stabilizing when epsilon gets higher than 2.5. 
218 |     \item \emph{The Direct Encoding protocol was the worst behaviour among the pure ones}, with its error being extremely high for epsilon values lower than 1.  This is mainly due to the fact that when ε gets too small, the probability of telling the truth gets significantly low, thus creating a big error in accuracy.
219 |     \item \emph{The optimized U.E. protocol has the best behaviour} in comparison to the other protocols tested.
220 | \end{itemize}
221 | 
222 | Next up, we are going to run the same testings, but this time using the Kantorovich metric. We expect the protocols to behave even worse, because of the identity of the metric: the Kant. metric pays attention to the distance of the reported answer from the true one. The current protocols do not take into account the distance of the two answers, thus the metric will probably report a higher error. The results of the runs are shown in \textbf{Figure 4.4.}
223 | 
224 | \begin{figure}[!htb]\centering
225 |     \includegraphics[width=1\textwidth]{images/epsilon_others_kant.png}
226 |     \caption{Epsilon Measurements compared by Kantorovic Distance}
227 | \end{figure}
228 | 
229 | 
230 | As we expected, the protocols produce a higher error, with the D.E. being the worst among them, especially for lower values of epsilon. This is a rather alarming notice, in which we will come back in Section 4.6.
231 | 
232 | 
233 | \subsection{Increasing number of users}
234 | 
235 | 
236 | The second experiment that we will conduct is the accuracy error depending on the number of users used during the survey covered by the protocol. In the definition of L.D.P. the observation of the need of lots of users was made, and it is now time that we examine it. We are going to use a \emph{fixed epsilon value}, one that our protocols behave similarly for (at least the pure ones, in which we will focus our research moving forward). Our epsilon value that we are going ot used will be fixed and equal to 1.5.
237 | 
238 | We are going to run the protocols and compare them using the Manhattan Distance. Additionally, we are each time going to divide the result of the metric with the number of users participated, as the simple error is going to increase when the users increase. Hence, this division is going to give us the error depending on the size of our domain. The results are shown in \textbf{Figure 4.5.}
239 | 
240 | \begin{figure}[!htb]\centering
241 |     \includegraphics[width=1\textwidth]{images/nusers_others_l1.png}
242 |     \caption{Increasing n. of users compared by Manhattan Distance}
243 | \end{figure}
244 | 
245 | The results confirm the allegations made after explaining the definition of L.D.P.: When the number of participants in a survey is low, the error produced is very high. Every protocol has similar behaviour, as we can see that for fewer than 1000 users the relative error is even 6 times larger than for more than 1000 users. Actually, as we can see from the graph, the turning point is around 2000 users: the relative error drops and stabilizes after this number of participants. 
246 | 
247 | An other observation made, is that the protocol with the best behaviour during our epsilon measurements (Unary Encoding), is the worst one in the low users testings. This of course gets better as the number of participants rises, and eventually beats the other protocols when it comes down to relative accuracy error. 
248 | 
249 | Next up, we are going to perform the same testings while comparing the protocols with the Kantorovich metric. The results are shown in \textbf{Figure 4.6}.
250 | 
251 | \begin{figure}[!htb]\centering
252 |     \includegraphics[width=1\textwidth]{images/nusers_others_kant.png}
253 |     \caption{Increasing n. of users compared by Kantorovich Distance}
254 | \end{figure}
255 | 
256 | The results for a low number of users differ a lot from the runs with the $l1$ distance as a metric. Here, the D.E. method has obviously the worst behaviour, and the U.E. method the best. We can even claim that U.E. seems to solve our problem of high accuracy error. However, no one of those protocols takes into account the distance of the reported answer from the true one, thus it makes sense that the Kantorovich metric produces a huge amount of accuracy error.
257 | 
258 | That fact, triggered our thoughts, on what could possibly be done in order to reduce that problem. The thoughts made on this subject are analyzed in the next chapter, by creating a new L.D.P. protocol, sensitive to the distance between the true and the reported values.


--------------------------------------------------------------------------------
/thesis_paper/LDP/our_protocol.tex:
--------------------------------------------------------------------------------
  1 | \section{A Distance Sensitive Encoding Protocol}
  2 | 
  3 | \subsection{Idea}
  4 | 
  5 | In the previous section we discussed various LDP Protocols that function extremely well for numerical values (histogram type), with accuracy that is completely acceptable. However, when the number of users is limited, (eg under 1000), we made the observation that the accuracy error is extremely large. This is mainly due to the fact that the probability of an item to be chosen is independent of the distance between the true and the selected value.
  6 | 
  7 | 
  8 | Thus, for the needs of this Thesis, a new L.D.P. protocol was constructed. The idea proposed is to \emph{have the probability of choosing an element of the domain to depend on the distance from the true value}. This could prove very helpful for histogram values, but does not make any sense for categorical values. From now on, we are going to focus on histogram values.
  9 | 
 10 | Based on our idea, the probabilities' distribution, in comparison with the distribution of the D.E. protocol, will look like the one in the following figures.
 11 | 
 12 | \begin{figure}[!htb]\centering
 13 |     \includegraphics[width=0.5\textwidth]{images/D.E. Idea.png}
 14 |     \caption{D.E. protocol's Probabilistic Distribution}
 15 | \end{figure}
 16 | 
 17 | \begin{figure}[!htb]\centering
 18 |     \includegraphics[width=0.5\textwidth]{images/Our Idea.png}
 19 |     \caption{Distance Sensitive protocol's Probabilistic Distribution}
 20 | \end{figure}
 21 | 
 22 | 
 23 | Thus, there is an area around the true value that has high probability to be selected. The width of this area(from now on $\theta$) is defined by the epsilon setting that the user wants to use. The idea for having a specific area and not decreasing our probabilities as low as it goes when we diverge from the true value, is based on our need to be able to serve low epsilon values, as the first would be a big no for when the users want to use a high privacy setting.
 24 | 
 25 | \subsection{Mathematical Background}
 26 | 
 27 | In order to achieve our goal, when in the selected area, we should include in the denominator the quantity 
 28 | $$
 29 | |x - i|
 30 | $$
 31 | where x is the true value, and the i the false one that we are looking at in order to report it, in order for the probability to depend on the distance of the reported values.
 32 | 
 33 | When we are out of this area, the denominator will have a constant value, proportional to the boundaries of the selected area. 
 34 | 
 35 | \emph{Example}: Let's suppose that we have a domain size of 100, and our $\theta$ value is 4. All the probabilities outside the area, should be in reverse proportion of $\theta$, thus 4.
 36 | 
 37 | Like all probabilistic algorithms though, the sum of the probabilities for all the items in the domain size, must be 1. Thus if we chose the probability to be in the shape of $\frac{a}{|x-i|}$, the partial sum of the series will not give as simple results.
 38 | 
 39 | It is known that the series in the form of $a_n = \frac{1}{n}$ does not converge, and its partial sums can only be computed using a complex approximation formula. These characteristics make this type of series very hard to use, and so we have to think of a more handy type.
 40 | 
 41 | 
 42 | A type of series that is known to have easy to compute partial sums, are the telescopic series, such as $b_n = \frac{1}{n (n+1)}$. It is known that 
 43 | $$
 44 | \sum_{n = 1}^{n = k} b_n = 1 - \frac{1}{k + 1}
 45 | $$
 46 | something that will prove extremely useful moving forward.
 47 | 
 48 | So, taking into consideration the quantity $|x - i|$ and the telescopic series $b_n$, we conclude that the probability of each non-true element in our selected area will be of the shape:
 49 | 
 50 | \begin{align}
 51 |     \mathbf{q = \frac{a}{|x-i|(|x-i| + 1)}}
 52 | \end{align}
 53 | and outside of that area:
 54 | 
 55 | \begin{align}
 56 |     \mathbf{s = \frac{a}{\theta \cdot (\theta + 1)}}
 57 | \end{align}
 58 | 
 59 | The probability $p$ of selecting the true value will have to meet specific criteria that we are going to define later on.
 60 | 
 61 | \subsection{Building the Protocol}
 62 | Ww are now going to find what the alpha parameter will be, as it is not constant, but clearly it depends on the domain size and the probability $p$. In order to find it, we must keep in mind that all the probabilities of selecting an item from our domain, must add up to $1$.
 63 | 
 64 | In order to find out the $\alpha$ value, we must solve the following equation:
 65 | 
 66 | \begin{align*}
 67 |     p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1
 68 | \end{align*}
 69 | 
 70 | At this point, we must note that $\alpha$ although not a constant, can be held out of the sums, because it is obviously independent from the $i$ variable, that is the variable parsing through the domain in order to retrieve the false elements' probabilities. Thus, we have:
 71 | 
 72 | \begin{align*}
 73 |         p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 \Longleftrightarrow \\
 74 |       \sum_{i = x - \theta}^{i = x + \theta} \frac{a}{|x-i|(|x-i| + 1)} + \sum_{i = 1}^{i = x - \theta -1} \frac{a}{\theta(\theta+1)} + \sum_{i = x + \theta + 1}^{i = d} \frac{a}{\theta(\theta+1)} = 1 - p \Longleftrightarrow \\ \dots \Longleftrightarrow \\ 
 75 |     a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1}
 76 | \end{align*}
 77 | 
 78 | The proof for the mathematical equations leading to the extraction of the alpha value, can be found in the First Appendix of the Thesis.
 79 | 
 80 | \subsection{Epsilon Requirements}
 81 | 
 82 | The epsilon value is the most essential in these protocols, as it determines the privacy level that the protocol yields.
 83 | 
 84 | Recalling the definition of LDP, we must follow the following rule:
 85 | 
 86 | \begin{center}
 87 | An algorithm $A$ satisfies \espilon-LDP \textit{iff} for any input $v_1$ and $v_2$, we have
 88 | \begin{align*}
 89 |     \forall y \in Range(A): \frac{Pr[A(v_1) = y]}{Pr[A(v_2) = y]} \leq e^{\epsilon}
 90 | \end{align*}
 91 |     
 92 | \end{center}
 93 | 
 94 | Thus, in order to determine the epsilon value for our algorithm, it must satisfy even the worst case of this equation. The fraction gets bigger, if we put on the biggest probability on the numerator, and the smallest probability of all in the denominator. 
 95 | 
 96 | The numerator must have the probability $p$, and the denominator $s$, the probability of all of the elements outside of the $\theta$ area. 
 97 | 
 98 | We want for $p$ to be the biggest probability among all, but not extremely high, in order to be able to restrict the growth of epsilon. Hence, we are going to set it as double of the probabilities of its exact neighbours. The 2 neighbours have $|i-x| = 1$, thus $q = \frac{a}{2}$, so we are going to set $\mathbf{p = 2 \cdot \frac{a}{2} = a}$, where $a$ is the quantity defined above, depending on the domain size, and the $\theta$ value.
 99 | 
100 | Now, if we set $p = a$, then the $a$ equation changes, and now our aplha parameter only depends on the domain size and the $\theta$ selected. So, we proceed as following:
101 | \begin{align*}
102 |     a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1} \Longleftrightarrow{(p = a)}\\
103 |     a = \frac{\theta(\theta + 1) (1 - a)}{2\theta^2 - 2\theta + d - 1} \Longleftrightarrow \\
104 |     a (2\theta^2 - 2\theta + d - 1) = \theta(\theta + 1) - (\theta^2 + \theta)a \Longleftrightarrow\\
105 |     a (3\theta^2 - \theta + d - 1) = \theta(\theta + 1) \Longleftrightarrow \\
106 |     \mathbf{a = \frac{\theta(\theta + 1)}{3\theta^2 - \theta + d - 1}}
107 | \end{align*}
108 | 
109 | Obviously, we observe the constraint that $\theta > 0$, and this is a special case of our protocol, that can be represented by the direct encoding protocol.
110 | 
111 | We now have:
112 | 
113 | \begin{align*}
114 |     \frac{Pr[A(v_1) = y]}{Pr[A(v_2) = y]} = e^{\espilon} \Longleftrightarrow 
115 |     q e^{\epsilon} = p \Longleftrightarrow \\
116 |     \frac{a}{\theta(\theta + 1)} \cdot e^{\epsilon} =  a \Longleftrightarrow \\
117 |     \theta(\theta + 1) = e^{\epsilon} \Longrightarrow \\
118 |     \theta^2 + \theta - e^\epsilon = 0
119 | \end{align*}
120 | 
121 | If we solve the quadratic equation, and reject its one (illegal) solution, we get that:
122 | 
123 | \begin{align}
124 |     \theta = \lfloor \frac{\sqrt{4e^{\epsilon} + 1} - 1}{2}\rfloor
125 | \end{align}
126 | 
127 | So, to conclude, in order for the protocol to function, the user must provide just the epsilon setting, from which the $\theta$ constant is computed, and so the probabilities for each item of the domain will be selected.  
128 | 
129 | 
130 | \subsection{Protocol Definition}
131 | 
132 | We are now ready to define our protocol, by determining the 3 basic operations for an LDP protocol, the \emph{encoding}, the \emph{perturbation} and the \emph{aggregation methods}.
133 | 
134 | 
135 | We are going to use the following symbols:
136 | \begin{itemize}
137 |     \item $D$: The protocol's domain. In this set, we have each $i$ for $1 \leq i \leq |D|$ as each item in the domain $D$
138 |     \item $a$: The quantity that we computed in the previous section
139 |     \item $\theta$: The constant used in the previous section, which denotes the area around the true value that the probabilities will be higher than others.
140 |     
141 | \end{itemize}
142 | 
143 | \textbf{Encoding:} The encoding procedure is trivial. Just like the Wang paper, we are just going to set:
144 | \begin{align*}
145 |     Encode(v) = v
146 | \end{align*}
147 | 
148 | for each value $v$ of the domain. The values are going to be randomized during the perturbation step.
149 | 
150 | \textbf{Perturbation:} Given the previous section, the randomization during the perturbation step is define as following:
151 | 
152 | \begin{equation*}
153 |     Pr[Perturb(x) = i] =
154 | 	\begin{cases}
155 | 		p = a & \mbox{if } i = x \\
156 | 		q = \frac{a}{|c|(|c| + 1)}  &  c = \min{(\theta, |i-x|)}  \mbox{, otherwise}	\end{cases}
157 | \end{equation*}
158 |  
159 | where $i$ is the value selected each time, and $x$ our initial selection. 
160 | 
161 | \textbf{Aggregation:} The aggregation step was the most tricky during the building of the protocol. A similar approach to the aggregation of pure protocols was chosen, but with a few changes. After several different tries, the optimal aggregation found, was the following: the protocol supports only the reported values corresponding to the true one, thus $Support(v) = v$. However, the $p^*$ quantity is the sum of all the probabilities inside the area: 
162 | 
163 | \begin{align*}
164 |     p^* = \sum_{x\in (-\theta, \theta)} p(x)
165 | \end{align*}
166 | 
167 | Finally, the $q*$ quantity is the probability of choosing an element from outside the θ area, thus equal to $s$. Hence, the estimation generated for a value $v$ of the possible answers in the domain is defined as following:
168 | 
169 | \begin{align*}
170 |     \text{Estimation} = \frac{\sum_{j} 1_{support(v^j)}(i) - nq^*}{p^* - q^*}
171 | \end{align*}
172 | 
173 | \subsection{Extreme Cases}
174 | 
175 | The downside of a complicated protocol, are of course some extreme cases for the $x, \theta \text{ and } i$ values, all of which we are going to examine in this chapter. The definition of the protocol is going to be altered, and the constraints increased, in order to support those extreme cases.
176 | 
177 | \textbf{Extreme theta cases:} Of course, we have the constraint that $0 < \theta \leq d$, but what happens when its value is equal to one of the bounds?
178 | \begin{itemize}
179 |     \item When $\theta \leq 0$, our protocol can not function, as this assignment will result in $a = 0$, something that is prohibited, because the probabilities will not sum to 1. In order to ensure that $\theta$ is at least 1, the user must provide at least an $\epsilon = ln(2)$.
180 |     
181 |     \item When $\theta = 1$, we can see that the third case in the perturbation step does not exist, thus we have only the first 2 cases. There, $p = a = \frac{2}{d+1}$, and for every other $i$, $q = \frac{a}{2}$, something that is similar with the Direct Encoding protocol.
182 |     
183 |     \item When $\theta = d$, which realistically can only happen when d is extremely small, then our protocol functions as designed, and has its best behavior. However, if the selection of epsilon results in such big a theta, then the user does not have extreme privacy demands.
184 | \end{itemize}
185 | 
186 | \textbf{Extreme x values:} Even when the epsilon value and the domain size are normal, in some cases we might face a certain difficulty: if $x - \theta < 0$ or $x + \theta > d$, some of the items in our area are actually outside of our domain boundaries. This results in the sum of the items in the probabilistic distribution to be below 1, something not acceptable. 
187 | 
188 | In order to fix it, we are going to "transfer" those probabilities inside the boundaries of our domain, while not messing with the highest probability, as this would result in problems with the definition of D.P., an thus the value of theta. 
189 | 
190 | The idea is to increase the other selections' probabilities by a bit, in order to fill the gap created, while leaving the maximum as initially created. We are going to boost all of the domain's items, by a portion of $\frac{m}{d - 1}$ (as we are altering $d-1$ elements), where $m$ is the sum of the probabilities of the items outside the bounds of our domain. 
191 | 
192 | However, we are not interested in transferring the whole $Pr[Perturb(x) = i]$, but only its difference from the item with the lowest probability, which is $s =  \frac{a}{\theta(\theta+1)}$. Thus, the $m$ values are defined as following:
193 | \begin{align*}
194 |     m = \sum_{i < 0 \bigcup |i-x|<\theta} Pr[Perturb(x) = i] - \frac{a}{\theta(\theta+1)}
195 | \end{align*}
196 | 
197 | for the case of $x - \theta < 0$, and as 
198 | 
199 | \begin{align*}
200 |     m = \sum_{i \geq d \bigcup |i-x|<\theta} Pr[Perturb(x) = i] - \frac{a}{\theta(\theta+1)}
201 | \end{align*}
202 | 
203 | for the second one.
204 | \\\bigskip
205 | Now, the probabilistic distribution can be altered as following: 
206 | 
207 | 
208 | \begin{equation*}
209 |     Pr[Perturb_{DS}(x) = i] =
210 | 	\begin{cases}
211 | 		p = a & \mbox{if } i = x \\
212 | 		q = \frac{a}{|c|(|c| + 1)} + \frac{m}{d - 1}  &  c = \min{(\theta, |i-x|)}  \mbox{, otherwise}	\end{cases}
213 | \end{equation*}
214 |  
215 | 
216 | The definition of D.P. is not altered, because again in the best case we have a probability of $p = a$, and in the worst case $s = \frac{a}{\theta(\theta+1)}$.
217 | 
218 | \subsection{Implementation}
219 | 
220 | The most difficult part of the implementation of our protocol consists of creating the probabilistic distribution for each element of the domain, depending on the true value. This can prove to be costly, if we have a large domain or if we are in the case of the extreme x values.
221 | 
222 | However, we do not need to compute every single probability, as it is clear from the definition that they are independent from the true value: they only depend on $a$ (and on the domain size in case of an extreme x value). The quantity $|i - x|$ can only take values in the range of $[1,\theta]$, thus constant for every possible true value. Moreover, for the domain values outside of the area, the probability is fixed and equal to $\frac{a}{\theta(\theta + 1)}$. Hence, the probabilities can be computed in advance, either by each user, or given to the protocol by the aggregator. 
223 | 
224 | The protocol has been implemented using Python, and can be found in the GitHub repository of this Thesis. Moving forward, we are going to use this implementation in order to conduct some testings to ensure the protocol's functionality.
225 | 
226 | \subsection{Experiments}
227 | First up, we are going to perform the epsilon measurements that we did for the other protocols, this time excluding the Random Matrix approach, and including the D.S. protocol. The results, when running with the Kantorovich metric, are the following:
228 | 
229 | \begin{figure}[!htb]\centering
230 |     \includegraphics[width=1\textwidth]{images/epsilon_our_kant.png}
231 |     \caption{Epsilon measurements for D.S. protocol compared by Kantorovich Distance}
232 | \end{figure}
233 | 
234 | The first observation is the \emph{strange form of the curve of our protocol}. This can be easily explained: the theta value used to determine the area around the true value, is depended on epsilon, but has a floor function applied to it. Thus, for a specific range of ε, the protocol produces the same results. 
235 | 
236 | An other observation is that\emph{ our protocol lacks efficiency for low values of epsilon}, that is natural, since small θ values do not help our idea at all. However, when epsilon gets higher than 1.5 (and thus theta rises above 2), the results are more than satisfying: \emph{our protocol has the best behaviour for epsilon in the range of $\mathbf{(2, 2.5)}$.}
237 | 
238 | The real test though, is how our protocol behaves for an increasing number of users: we must check if it produces better accuracy error than the competitors. This is our next testing, where we are going to set $\epsilon = \ln(20)$, in order for the conditions to be favorable for each one of the protocols. The results are shown in the \textbf{Figure 4.10}.
239 | 
240 | 
241 | \begin{figure}[!htb]\centering
242 |     \includegraphics[width=1\textwidth]{images/users_our_kant.png}
243 |     \caption{Increasing users measurements for D.S. protocol compared by Kantorovich Distance}
244 | \end{figure}
245 | 
246 | For the specific epsilon setting, \emph{our protocol produces extremely good accuracy error for a small number of users.}, beating by a lot the U.E. protocol. The comparisons have been made using the Kantorovich metric, the most characteristic of them all, as it takes into account the distance between the real answers and the projections, exactly what our protocol is designed to do. However, we are also going to perform the same testings using the Manhattan metric. The results are shown in the \textbf{Figure 4.11}.
247 | 
248 | \begin{figure}[!htb]\centering
249 |     \includegraphics[width=1\textwidth]{images/users_our_l1.png}
250 |     \caption{Increasing users measurements for D.S. protocol compared by Manhattan Distance}
251 | \end{figure}
252 | 
253 | The results of the Manhattan-driven tests are similar to the ones made with the Kant. metric. However, we observe that \emph{when the number of users rises, our protocol has worse behaviour in comparison to the other pure ones}. This happens mainly because of the other protocols, that by the law of big numbers, have good accuracy because of the higher probability of choosing the true answer. On the other hand, in our protocol, this probability is reduced and shared with the other elements in the area covered by theta. 
254 | 
255 | \subsection{Conclusions}
256 | In general, \emph{the D.S. protocol succeeds when the number of the participants in a survey is extremely low}, and functions similarly with the other protocols for an increasing number of users. The downside is that it does not always takes full advantage of the epsilon setting, as explained in a previous section. However, the results are more than satisfying. Hence, \emph{this is a fully functioning protocol that can be used for the application of L.D.P., especially in a situation when few people take part in the survey.} The protocol will be further tested in more extreme cases, but this is beyond the scope of this Thesis.
257 | 


--------------------------------------------------------------------------------
/thesis_paper/dependencies/arial/fonts/Arial Bold Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Bold Italic.ttf


--------------------------------------------------------------------------------
/thesis_paper/dependencies/arial/fonts/Arial Bold.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Bold.ttf


--------------------------------------------------------------------------------
/thesis_paper/dependencies/arial/fonts/Arial Italic.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial Italic.ttf


--------------------------------------------------------------------------------
/thesis_paper/dependencies/arial/fonts/Arial.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/dependencies/arial/fonts/Arial.ttf


--------------------------------------------------------------------------------
/thesis_paper/dithesis.cls:
--------------------------------------------------------------------------------
  1 | % dithesis.cls
  2 | %
  3 | % A LaTeX2e document class for writing MSc theses in English for the Department
  4 | % of Informatics and Telecommunications (DI&T) of the National and Kapodistrian
  5 | % University of Athens (NKUA).
  6 | %
  7 | % Copyright (c) 2014, 2015 Charalampos S. Nikolaou <charnik@di.uoa.gr>
  8 | %                     2017 Ergys Dona <errikosd@gmail.com>
  9 | %                     2020 Giorgos Katsogiannis <geo.katsogiannis@gmail.com>
 10 | %
 11 | % This work may be distributed and/or modified under the conditions of the
 12 | % LaTeX Project Public License. The latest version of this license is in
 13 | %   http://www.latex-project.org/lppl.txt.
 14 | %
 15 | % This work consists of the following files:
 16 | % dithesis.cls
 17 | %		This document class file
 18 | %
 19 | % sample.tex
 20 | %		A sample document demonstrating the use of this class file
 21 | %
 22 | % emblems/
 23 | %		A directory containing three colored versions of the official emblem of
 24 | %		the National and Kapodistrian University of Athens.
 25 | %
 26 | % fonts/
 27 | %		A directory containing the Arial family TrueType fonts. The directory
 28 | %		also includes a README file with instructions on installing the fonts
 29 | %		in your system (assuming Unix-based).
 30 | 
 31 | % Document Class Options
 32 | %
 33 | % inscr
 34 | % 		If present, then a page with the inscription provided via the command
 35 | % 		\inscription{} is printed.
 36 | %
 37 | % ack
 38 | %		If present, then a page with the acknowledgements provided via the
 39 | %		command \acksEn{} is printed.
 40 | %
 41 | % preface
 42 | %		If present, then a preface page is included just before the
 43 | %		introductory chapter. The content of this page is controled via the
 44 | %		command \preface{}.
 45 | %
 46 | % lop
 47 | %		If present, then a page with the list of puclications will be included.
 48 | %
 49 | 
 50 | % Document Versions
 51 | %
 52 | % Version 1.3, 2020/10/02
 53 | %   Added some corrections to match the department's official formatting requirements.
 54 | %   Changes were applied to the acronyms table, the bibliography size, the table of
 55 | %   contents, the captions of figures and tables and the abstract page's keywords.
 56 | %
 57 | % Version 1.2, 2017/02/26
 58 | %   Refactored the document to allow it to be used to write BSc Theses.
 59 | %
 60 | % Version 1.1, 2015/11/06
 61 | %		Updated document class so as to be compatible with the template file
 62 | %		regarding the appearance of headers/footers and appendix names. Require
 63 | %		also package `longtable' for being used for abbreviations/acronyms.
 64 | %
 65 | % Version 1.0, 2014/12/08
 66 | %		Initial attempt at creating the first class file for LaTeX Theses in
 67 | %		DI&T, NKUA.
 68 | 
 69 | % Known Issues
 70 | %
 71 | % * Chapter titles are not appearing in capital letters in the ToC, although
 72 | %   in other places this has been taken care of for good. Thus, one is enforced
 73 | %   to type chapter titles in capital letters, so as to be compatible with the
 74 | %   requirements of the department.
 75 | %
 76 | 
 77 | \NeedsTeXFormat{LaTeX2e}
 78 | \ProvidesClass{dithesis}[2017/02/26 v1.2 LaTeX class for BSc Theses
 79 | submitted to the Department of Informatics and Telecommunications of the
 80 | National and Kapodistrian University of Athens]
 81 | 
 82 | %
 83 | % Declare and initialize global ifs
 84 | % (set by passing options to this document class)
 85 | %
 86 | 
 87 | % controls the inclusion of the inscription page
 88 | \newif\ifinscriptionpage
 89 | \inscriptionpagefalse
 90 | 
 91 | % controls inclusion of acknowledgement page
 92 | \newif\ifackpage
 93 | \ackpagefalse
 94 | 
 95 | % controls inclusion of the list of publications page
 96 | \newif\ifloppage
 97 | \loppagefalse
 98 | 
 99 | % controls inclusion of the preface page
100 | \newif\ifprefacepage
101 | \prefacepagefalse
102 | 
103 | %
104 | % Declare and initialize internal ifs (not set by document class options)
105 | %
106 | \newif\ifenglishfrontpage
107 | \englishfrontpagefalse
108 | 
109 | \newif\ifenglishcommitteepage
110 | \englishcommitteepagefalse
111 | 
112 | % marks whether the appendices have started
113 | % (used later for setting an appropriate naming scheme for the appendix title)
114 | \newif\ifappendixstarted
115 | \appendixstartedfalse
116 | 
117 | % controls whether we want to print dual-page or single page
118 | % when enabled, the page numbering will occur in the centre of the page
119 | % otherwise, it will occur at the right
120 | \newif\ifdualpage
121 | \dualpagefalse
122 | 
123 | %
124 | % Check passed options
125 | %
126 | \DeclareOption{inscr}{
127 |   \inscriptionpagetrue
128 | }
129 | \DeclareOption{ack}{
130 |   \ackpagetrue
131 | }
132 | \DeclareOption{preface}{
133 |   \prefacepagetrue
134 | }
135 | \DeclareOption{lop}{
136 |   \loppagetrue
137 | }
138 | \DeclareOption{dualpage}{
139 |   \dualpagetrue
140 | }
141 | 
142 | 
143 | \DeclareOption*{\PassOptionsToClass{\CurrentOption}{book}}
144 | \ProcessOptions\relax
145 | 
146 | \LoadClass[12pt,oneside]{book}
147 | 
148 | %
149 | % Required packages and configuration
150 | %
151 | \RequirePackage[a4paper,
152 | top=2cm,bottom=2cm,bindingoffset=0.5cm,left=2cm,right=2cm,
153 | headsep=0.5cm,footskip=0.75cm,
154 | ]{geometry} % add option showframe=true for debugging
155 | \RequirePackage{fancyhdr}
156 | \RequirePackage[final]{graphicx}
157 | \RequirePackage{sectsty}   % needed for formatting chapter headings
158 | \RequirePackage{textcase}  % needed for forcing capital letters (in chapters)
159 | \RequirePackage[resetlabels]{multibib}
160 | %\usepackage[notlof,notlot,nottoc,notlop]{tocbibind}
161 | %\RequirePackage{tocloft}  % needed for making chapter titles upper case
162 | \RequirePackage{etoolbox}  % needed for patchcmd
163 | \RequirePackage{tabularx}  % needed for tabular* environment
164 | \RequirePackage{longtable} % needed for splitting big tables across pages
165 | \RequirePackage{xifthen}
166 | \RequirePackage[caption=false]{subfig}
167 | \RequirePackage[%
168 |   font={footnotesize, bf},
169 |   justification=centering,
170 |   labelsep=colon,
171 |   figureposition=bottom,
172 |   tableposition=top]{caption} % captions
173 | 
174 | %(e.g., abbreviations)
175 | %\RequirePackage[toc,page,titletoc]{appendix} % needed for configuring
176 | %%appendices
177 | 
178 | %
179 | % Add Greek support
180 | %
181 | \usepackage{fontspec}
182 | \usepackage{xunicode}
183 | \usepackage{xltxtra}
184 | \usepackage{polyglossia}
185 | \newfontfamily\greekfont[Script=Greek]{Arial}
186 | \newfontfamily\greekfontsf[Script=Greek]{Arial}
187 | \newfontfamily\greekfonttt[Script=Greek]{Arial}
188 | \setdefaultlanguage[variant=uk]{english}
189 | \setotherlanguage{greek}
190 | 
191 | %
192 | % Set font mamily Aria
193 | %
194 | \setmainfont[Ligatures={Common,TeX}]{Arial} % the passed option was needed for
195 |                                             % correctly rendering double quotes
196 | \defaultfontfeatures{Mapping=tex-text,Scale=MatchLowercase}
197 | \setsansfont[Mapping=tex-text,Scale=MatchLowercase]{Arial}
198 | \setmathsf{CMU Bright}
199 | \setmathrm{CMU Bright}
200 | 
201 | %\ifglossaryInPreamble
202 | %\RequirePackage[toc,section=chapter,numberedsection=false,nonumberlist]{glossaries}
203 | %\else
204 | % \RequirePackage[toc,section,numberedsection=nolabel,nonumberlist]{glossaries}
205 | %\fi
206 | %\makeglossaries
207 | 
208 | \providecommand{\grnumm}[1]{#1\textsuperscript{ο}}
209 | \providecommand{\grnumf}[1]{#1\textsuperscript{η}}
210 | 
211 | %
212 | % Commands for first, middle, and last name (greek and english versions)
213 | %
214 | \providecommand{\authorFirstGr}[1]{\gdef\@authorFirstGr{#1}}
215 | \providecommand{\authorFirstAbrGr}[1]{\gdef\@authorFirstAbrGr{#1}}
216 | \providecommand{\authorMiddleGr}[1]{\gdef\@authorMiddleGr{#1}}
217 | \providecommand{\authorLastGr}[1]{\gdef\@authorLastGr{#1}}
218 | \providecommand{\authorFirstEn}[1]{\gdef\@authorFirstEn{#1}}
219 | \providecommand{\authorFirstAbrEn}[1]{\gdef\@authorFirstAbrEn{#1}}
220 | \providecommand{\authorMiddleEn}[1]{\gdef\@authorMiddleEn{#1}}
221 | \providecommand{\authorLastEn}[1]{\gdef\@authorLastEn{#1}}
222 | \providecommand{\authorSn}[1]{\gdef\@authorSn{#1}}
223 | 
224 | %
225 | % Commands for the title of the thesis (greek and english versions)
226 | %
227 | \providecommand{\titleGr}[1]{\gdef\@titleGr{#1}}
228 | \providecommand{\titleEn}[1]{\gdef\@titleEn{#1}}
229 | 
230 | %
231 | % Commands for the date of the writing of the thesis (Month followed by Year)
232 | % [provide greek and english versions]
233 | %
234 | \providecommand{\dateGr}[1]{\gdef\@dateGr{#1}}
235 | \providecommand{\dateEn}[1]{\gdef\@dateEn{#1}}
236 | 
237 | 
238 | %
239 | % Commands for supervisor(s)
240 | % If more than one supervisor is declared, the class takes care to show
241 | % "Supervisors" instead of "Supervisor" and
242 | % "Επιβλέποντες" instead of "Επιβλέπων".
243 | %
244 | \gdef\@supervisorLabelSuffixGr{ων}
245 | \gdef\@supervisorLabelSuffixEn{}
246 | 
247 | \providecommand{\supervisorGr}[2]{%
248 |   \ifthenelse{\isundefined{\@thesupervisorslistGr}}{%
249 |     \def\@thesupervisorslistGr{\textbf{#1,} #2}
250 |   }{%
251 |     \g@addto@macro\@thesupervisorslistGr{\\&\textbf{#1,} #2}
252 |     \def\@supervisorLabelSuffixGr{οντες}
253 |   }
254 | }
255 | 
256 | \providecommand{\supervisorEn}[2]{%
257 |   \ifthenelse{\isundefined{\@thesupervisorslistEn}}{%
258 |     \def\@thesupervisorslistEn{\textbf{#1,} #2}
259 |   }{%
260 |     \g@addto@macro\@thesupervisorslistEn{\\&\textbf{#1,} #2}
261 |     \def\@supervisorLabelSuffixEn{s}
262 |   }
263 | }
264 | 
265 | %
266 | % Commands for setting up abstract (greek and english versions), ack (english
267 | % version), synopsis (greek version only), incsriptionEn (english version
268 | %only), and prefaceEn (english version only) pages
269 | %
270 | \providecommand{\abstractEn}[1]{\gdef\@abstractEn{#1}}
271 | \providecommand{\abstractGr}[1]{\gdef\@abstractGr{#1}}
272 | \providecommand{\synopsisGr}[1]{\gdef\@synopsisGr{#1}}
273 | \providecommand{\acksEn}[1]{\gdef\@acksEn{#1}}
274 | \providecommand{\inscriptionEn}[1]{\gdef\@inscriptionEn{%
275 | \vspace*{0.2\textheight}
276 | \begin{flushright}
277 | #1
278 | \end{flushright}
279 | }}
280 | \providecommand{\prefaceEn}[1]{\gdef\@prefaceEn{#1}}
281 | 
282 | %
283 | % Commands for Subject Area and Keywords fields
284 | % (greek and english versions)
285 | %
286 | \providecommand{\subjectAreaGr}[1]{\gdef\@subjectAreaGr{#1}}
287 | \providecommand{\subjectAreaEn}[1]{\gdef\@subjectAreaEn{#1}}
288 | \providecommand{\keywordsGr}[1]{\gdef\@keywordsGr{#1}}
289 | \providecommand{\keywordsEn}[1]{\gdef\@keywordsEn{#1}}
290 | 
291 | %
292 | % Command for specifiying the file containing the publications
293 | % in the context of the PhD - NOT USED FOR BSC THESES.
294 | %
295 | \providecommand{\lopfile}[1]{\newcites{lop}{List of
296 | publications}\gdef\@lopfileinternal{#1}}
297 | 
298 | %
299 | % Command for starting a new unumbered chapter (ToC'ed) for
300 | % holding the table of abbreviations and acronyms. The table
301 | % should be set by the user.
302 | %
303 | \providecommand{\abbreviations}{
304 |   \chapter*{ABBREVIATIONS - ACRONYMS}
305 |   \addcontentsline{toc}{chapter}{ABBREVIATIONS - ACRONYMS}
306 | }
307 | 
308 | %
309 | % Front page (greek and english versions)
310 | %
311 | \def\@frontpage{
312 | \begin{center}
313 | \includegraphics[scale=0.85]{emblems/athena-black}
314 | \end{center}
315 | \begin{minipage}[t]{\textwidth}
316 |     \begin{center}
317 |       {\large \bfseries
318 |       \ifenglishfrontpage
319 |       	NATIONAL AND KAPODISTRIAN UNIVERSITY OF ATHENS
320 |       \else
321 |       	ΕΘΝΙΚΟ ΚΑΙ ΚΑΠΟΔΙΣΤΡΙΑΚΟ ΠΑΝΕΠΙΣΤΗΜΙΟ ΑΘΗΝΩΝ
322 |       \fi
323 |       }
324 |       \linebreak
325 | 
326 |       {\bfseries
327 |       \ifenglishfrontpage
328 |       	SCHOOL OF SCIENCES \\ DEPARTMENT OF INFORMATICS AND TELECOMMUNICATIONS
329 |       \else
330 |       	ΣΧΟΛΗ ΘΕΤΙΚΩΝ ΕΠΙΣΤΗΜΩΝ \\ ΤΜΗΜΑ  ΠΛΗΡΟΦΟΡΙΚΗΣ ΚΑΙ ΤΗΛΕΠΙΚΟΙΝΩΝΙΩΝ
331 |       \fi
332 |       }
333 |       \linebreak\linebreak\linebreak\linebreak\linebreak
334 | 
335 |       {\bfseries
336 |       \ifenglishfrontpage
337 |       	BSc THESIS
338 |       \else
339 |       	ΠΤΥΧΙΑΚΗ ΕΡΓΑΣΙΑ
340 |       \fi}
341 |       \linebreak\linebreak
342 | 
343 |       {\Large \bfseries
344 |       \ifenglishfrontpage
345 |       	\@titleEn
346 |       \else
347 |       	\@titleGr
348 |       \fi}
349 |       \linebreak\linebreak\linebreak
350 | 
351 |       {\bfseries
352 |       \ifenglishfrontpage
353 | 	  	  \@authorFirstEn{} \@authorMiddleEn{} \@authorLastEn
354 | 	    \else
355 | 	  	  \@authorFirstGr{} \@authorMiddleGr{} \@authorLastGr
356 | 	    \fi}
357 |       \linebreak\linebreak\linebreak
358 |       \linebreak\linebreak\linebreak
359 |       \linebreak\linebreak
360 |     \end{center}
361 |     {
362 |     \ifenglishfrontpage
363 |       \begin{tabular}{l l}
364 |         \textbf{Supervisor\@supervisorLabelSuffixEn:} & \@thesupervisorslistEn \\
365 |       \end{tabular}
366 |     \else
367 |       \begin{tabular}{l l}
368 |         \textbf{Επιβλέπ\@supervisorLabelSuffixGr:} & \@thesupervisorslistGr \\
369 |       \end{tabular}
370 |     \fi
371 |     }
372 | \end{minipage}
373 |     \vfill
374 |     \begin{center}
375 |     {\bfseries
376 |     \ifenglishfrontpage
377 |       ATHENS
378 |     \else
379 |       ΑΘΗΝΑ
380 |     \fi}
381 |     \\\vspace*{4mm}
382 |     {\bfseries
383 |     \ifenglishfrontpage
384 |       \@dateEn
385 |     \else
386 |       \@dateGr
387 |     \fi}
388 |     \end{center}
389 |     \clearpage
390 | }
391 | 
392 | %
393 | % Committee page (greek and english versions)
394 | %
395 | \def\@committeepage{
396 |     \begin{center}
397 |     \vspace*{1.5cm}
398 |     {\bfseries
399 |     \ifenglishcommitteepage
400 |     	BSc THESIS
401 |     \else
402 |     	ΠΤΥΧΙΑΚΗ ΕΡΓΑΣΙΑ
403 |     \fi
404 |     }
405 |     \linebreak
406 | 
407 |     {
408 |     \ifenglishcommitteepage
409 |     	\@titleEn
410 |     \else
411 |     	\@titleGr
412 |     \fi
413 |     }
414 |     \linebreak\linebreak\linebreak
415 | 
416 |     {\bfseries
417 |     \ifenglishcommitteepage
418 |     	\@authorFirstEn{} \@authorMiddleEn{} \@authorLastEn
419 |     \else
420 |     	\@authorFirstGr{} \@authorMiddleGr{} \@authorLastGr
421 |     \fi
422 |     \vspace{2mm}
423 |     }
424 |     \linebreak
425 |     {
426 |     \ifenglishcommitteepage
427 |     	{\bfseries S.N.:} \@authorSn
428 |     \else
429 |     	{\bfseries Α.Μ.:} \@authorSn
430 |     \fi
431 |     }
432 |     \linebreak\linebreak\linebreak
433 |     \linebreak\linebreak\linebreak
434 |     \linebreak\linebreak\linebreak
435 |     \linebreak\linebreak\linebreak
436 |     \linebreak\linebreak
437 |     \end{center}
438 | 
439 |     {
440 |     \ifenglishcommitteepage
441 |       \begin{tabular}{l l}
442 |         \textbf{SUPERVISOR\MakeUppercase{\@supervisorLabelSuffixEn:}} & \@thesupervisorslistEn \\
443 |       \end{tabular}
444 |       % \@supervisorlabelEn
445 |     \else
446 |       \begin{tabular}{l l}
447 |         \textbf{ΕΠΙΒΛΕΠ\MakeUppercase{\@supervisorLabelSuffixGr:}} & \@thesupervisorslistGr \\
448 |       \end{tabular}
449 |       % \@supervisorlabelGr
450 |     \fi
451 |     }
452 |     \clearpage
453 | }
454 | 
455 | %
456 | % Abstract (english version)
457 | %
458 | \def\@absEn{
459 |   \chapter*{Abstract}
460 |   \thispagestyle{empty}
461 |   \@abstractEn{}
462 |   \vfill
463 | 
464 |   \begin{tabularx}{\textwidth}{l X}
465 |     {\bfseries SUBJECT AREA:}  & \@subjectAreaEn
466 |   \end{tabularx}
467 | 
468 |   \begin{tabularx}{\textwidth}{l X}
469 |     {\bfseries KEYWORDS:}  & \@keywordsEn
470 |   \end{tabularx}
471 | }
472 | 
473 | %
474 | % Abstract (greek version)
475 | %
476 | \def\@absGr{
477 |   \chapter*{ΠΕΡΙΛΗΨΗ}
478 |   \thispagestyle{empty}
479 |   \begin{greek}
480 |   \@abstractGr{}
481 |   \end{greek}
482 |   \vfill
483 | 
484 |   \begin{tabularx}{\textwidth}{l X}
485 |     {\bfseries ΘΕΜΑΤΙΚΗ ΠΕΡΙΟΧΗ:}  & \@subjectAreaGr
486 |   \end{tabularx}
487 | 
488 |   \begin{tabularx}{\textwidth}{l X}
489 |     {\bfseries ΛΕΞΕΙΣ ΚΛΕΙΔΙΑ:}  & \@keywordsGr
490 |   \end{tabularx}
491 | }
492 | 
493 | %
494 | % Greek synopsis of the thesis
495 | %
496 | \def\@synopsis{
497 |   \chapter*{ΣΥΝΟΠΤΙΚΗ ΠΑΡΟΥΣΙΑΣΗ ΤΗΣ ΔΙΔΑΚΤΟΡΙΚΗΣ ΔΙΑΤΡΙΒΗΣ}
498 |   \thispagestyle{empty}
499 |   \@synopsisGr{}
500 |   \clearpage
501 | }
502 | 
503 | %
504 | % Inscription page (optional)
505 | %
506 | % Enable it by passing option ``inscr'' to the document class.
507 | %
508 | \def\@inscr{
509 |   \cleardoublepage
510 |   \@inscriptionEn{}
511 |   \clearpage
512 | }
513 | 
514 | %
515 | % Acknowledgements (optional)
516 | %
517 | % Enable it by passing option ``ack'' to the document class.
518 | %
519 | \def\@acks{
520 |   \chapter*{Acknowledgements}
521 |   \thispagestyle{empty}
522 |   \@acksEn{}
523 |   \clearpage
524 | }
525 | 
526 | %
527 | % Preface page (optional)
528 | %
529 | % Enable it by passing option ``preface'' to the document class.
530 | %
531 | % Check the following page that gives a definition among the uses of Prologue,
532 | % Foreword, and Preface:
533 | %http://iankingsleyauthor.blogspot.de/2013/03/defined-prologue-epilogue-foreword.html.
534 | %
535 | \def\@preface{
536 |   \chapter*{Preface}
537 |   \thispagestyle{empty}
538 |   \@prefaceEn{}
539 |   \clearpage
540 | }
541 | 
542 | %
543 | % List of publications (optional)
544 | %
545 | % Enable it by passing option ``lop'' to the document class.
546 | %
547 | \def\@listofpubs{
548 |   \bibliographystylelop{unsrt}
549 |   \bibliographylop{\@lopfileinternal}
550 |   \thispagestyle{empty}
551 |   \clearpage
552 | }
553 | 
554 | 
555 | %%
556 | %% Format of Table of Contents, List of Figures and List of Tables.
557 | %% (Adapted from dithesis.cls made by Yannis Mantzouratos <mantzouratos@gmail.com>)
558 | %%
559 | \RequirePackage{titletoc}
560 | \RequirePackage[subfigure,titles]{tocloft}
561 | 
562 | % Table of Contents
563 | \let\oldtableofcontents\tableofcontents
564 | \DeclareRobustCommand{\tableofcontents}{
565 |   \newpage
566 |   \pagestyle{empty}
567 |   \oldtableofcontents
568 |   \clearpage
569 |   \pagestyle{fancy}
570 | }
571 | 
572 | % place dots between each section and the respective page number
573 | \renewcommand{\cftsecleader}{\bfseries\cftdotfill{\cftdotsep}}
574 | % place dots between each subsection and the respective page number
575 | \renewcommand{\cftsubsecleader}{\bfseries\cftdotfill{\cftdotsep}}
576 | 
577 | % place a dot after each chapter number
578 | \renewcommand{\cftchapaftersnum}{.}
579 | 
580 | % section entries should be 10 pt and bold
581 | \renewcommand{\cftsecfont}{\fontsize{10pt}{12pt}\selectfont\bfseries}
582 | % subsection entries should be 10 pt
583 | \renewcommand{\cftsubsecfont}{\fontsize{10pt}{12pt}\selectfont}
584 | % subsubsection entries should be 10 pt
585 | \renewcommand{\cftsubsubsecfont}{\fontsize{10pt}{12pt}\selectfont}
586 | 
587 | % sections should not be indented, whereas subsections and subsubsections should
588 | \setlength\cftsubsubsecindent\cftsubsecindent
589 | \setlength\cftsubsecindent\cftsecindent
590 | \setlength{\cftsecindent}{0pt}
591 | 
592 | % sections should have the same vertical space with chapters
593 | \setlength\cftbeforesecskip\cftbeforechapskip
594 | 
595 | % space between chapters and numbering in case of double digit numbers
596 | \newlength{\tocbinnumwidth}
597 | \settowidth{\tocbinnumwidth}{9}
598 | \addtolength{\cftchapnumwidth}{\tocbinnumwidth}
599 | 
600 | 
601 | %
602 | % Configure the frontmatter page
603 | %
604 | \renewcommand{\frontmatter}{
605 |     \pagestyle{empty}
606 |     % frontpage - english version
607 |     \englishfrontpagetrue
608 |     \@frontpage
609 | 
610 |     % frontpage - greek version
611 |     \englishfrontpagefalse
612 |     \@frontpage
613 | 
614 |     % examination committe page - english version
615 |     \englishcommitteepagetrue
616 |     \@committeepage
617 | 
618 |     % examination committe page - greek version
619 |     \englishcommitteepagefalse
620 |     \@committeepage
621 | 
622 |     % abstract (english and greek version)
623 |     \@absEn
624 |     \@absGr
625 | 
626 |     % inscription
627 |     \ifinscriptionpage
628 |       \@inscr
629 |     \fi
630 | 
631 |     % acknowledgements (english version only, OPTIONAL)
632 |     \ifackpage
633 |     	\@acks
634 |     \fi
635 | 
636 |     % table of contents
637 |     % add TOC as bookmark
638 |     \addtocontents{toc}{\protect{\pdfbookmark[0]{CONTENTS}{toc}}}
639 |     \tableofcontents
640 | 
641 |     % list of figures
642 |     %\listoffigures
643 |     %\thispagestyle{empty}
644 |     \cleardoublepage
645 |     \begingroup
646 |     \makeatletter
647 |     \let\ps@plain\ps@empty
648 |     \makeatother
649 | 
650 |     \pagestyle{empty}
651 |     \listoffigures
652 |     \cleardoublepage
653 |     \endgroup
654 | 
655 |     % list of tables
656 |     \listoftables
657 |     \thispagestyle{empty}
658 | 
659 |     % preface page (english version only, OPTIONAL)
660 |     \ifprefacepage
661 | 	    \@preface
662 |     \fi
663 | 
664 |     % Prepare things for the rest of the document
665 |     \clearpage
666 |     \thispagestyle{empty}
667 | }
668 | 
669 | %
670 | % Configure the mainmatter and backmatter pages
671 | %
672 | \renewcommand{\mainmatter}{
673 | 	% just set the style of the pages to be fancy
674 |     \pagestyle{fancy}
675 | }
676 | \renewcommand{\backmatter}{
677 | 	% just set the style of the pages to be fancy
678 |     \pagestyle{fancy}
679 | 
680 |     % and also, make the empty page style permanent, since
681 |     % when a \chapter command is invoked, the command
682 |     % \thispagestyle{plain} is invoked
683 |     % (thanks to http://tex.stackexchange.com/a/19741)
684 |     %\patchcmd{\chapter}{plain}{empty}{}{}
685 | }
686 | 
687 | %
688 | % Configure chapter printing and alignment (centered)
689 | %
690 | \def\@makechapterhead#1{%
691 |   %\vspace*{50\p@}%
692 |   {\parindent \z@ \raggedright \normalfont
693 |     \interlinepenalty\@M
694 |     \ifappendixstarted
695 |        \large \centering \bfseries APPENDIX \thechapter. \MakeTextUppercase{#1}
696 |     \else
697 | 	   \large \centering \bfseries \thechapter. \MakeTextUppercase{#1}
698 |     \fi
699 |     \par\nobreak
700 |     \vskip 20\p@
701 |   }}
702 | 
703 | \chaptertitlefont{\vspace*{-2.38cm} \large \centering \MakeTextUppercase}
704 | \sectionfont{\normalsize}
705 | \subsectionfont{\normalsize}
706 | \subsubsectionfont{\normalsize}
707 | \paragraphfont{\normalsize}
708 | \subparagraphfont{\normalsize}
709 | 
710 | % use capital letters for chapters in the ToC as well
711 | % TODO
712 | 
713 | %
714 | % Rename Bibliography to References
715 | %
716 | \renewcommand{\bibname}{REFERENCES}
717 | 
718 | %
719 | % Set ToC depth
720 | %
721 | \setcounter{tocdepth}{4}
722 | \setcounter{secnumdepth}{3}
723 | 
724 | %
725 | % No indentation for paragraphs
726 | %
727 | \setlength{\parindent}{0pt}
728 | 
729 | %
730 | % Paragraph spacing should be
731 | %
732 | \setlength{\parskip}{6pt}
733 | 
734 | %
735 | % Line spacing should be one line
736 | %
737 | \linespread{1}
738 | 
739 | %
740 | % Configure header and footer
741 | %
742 | %\pagestyle{fancy}
743 | \fancyhf{}
744 | \fancyhead[LEO]{{\scriptsize \@titleEn{}}}
745 | \ifdualpage
746 | \fancyfoot[C]{{\fontsize{10pt}{10pt}\selectfont \thepage}}
747 | \else
748 | \fancyfoot[R]{{\fontsize{10pt}{10pt}\selectfont \thepage}}
749 | \fi
750 | \fancyfoot[LE,LO]{{\scriptsize \@authorFirstAbrEn{} \@authorLastEn{}}}
751 | 
752 | % needed redefinition, because commands like \chapter call
753 | % \thispagestyle{plain} automatically
754 | \fancypagestyle{plain} {
755 |   \fancyhf{}
756 |   \fancyhead[LO]{{\scriptsize \@titleEn{}}}
757 |   \ifdualpage
758 |   \fancyfoot[C]{{\fontsize{10pt}{10pt}\selectfont \thepage}}
759 |   \else
760 |   \fancyfoot[R]{{\fontsize{10pt}{10pt}\selectfont \thepage}}
761 |   \fi
762 |   \fancyfoot[LE,LO]{{\scriptsize \@authorFirstAbrEn{} \@authorLastEn{}}}
763 | }
764 | \renewcommand{\headrulewidth}{0pt}
765 | 
766 | %
767 | % Configure the first page of ToC to have an empty pagestyle
768 | % (thanks to http://tex.stackexchange.com/a/5789)
769 | %
770 | \AtBeginDocument{\addtocontents{toc}{\protect\thispagestyle{empty}}}
771 | 


--------------------------------------------------------------------------------
/thesis_paper/emblems/athena-black.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-black.pdf


--------------------------------------------------------------------------------
/thesis_paper/emblems/athena-blue.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-blue.pdf


--------------------------------------------------------------------------------
/thesis_paper/emblems/athena-red.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena-red.pdf


--------------------------------------------------------------------------------
/thesis_paper/emblems/athena_black.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/emblems/athena_black.jpeg


--------------------------------------------------------------------------------
/thesis_paper/images/D.E. Idea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/D.E. Idea.png


--------------------------------------------------------------------------------
/thesis_paper/images/Figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/Figure_1.png


--------------------------------------------------------------------------------
/thesis_paper/images/Our Idea.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/Our Idea.png


--------------------------------------------------------------------------------
/thesis_paper/images/arx_accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/arx_accuracy.png


--------------------------------------------------------------------------------
/thesis_paper/images/arx_tool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/arx_tool.png


--------------------------------------------------------------------------------
/thesis_paper/images/emd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/emd.png


--------------------------------------------------------------------------------
/thesis_paper/images/epsilon_intro_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_intro_graph.png


--------------------------------------------------------------------------------
/thesis_paper/images/epsilon_measurements.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_measurements.png


--------------------------------------------------------------------------------
/thesis_paper/images/epsilon_others_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_others_kant.png


--------------------------------------------------------------------------------
/thesis_paper/images/epsilon_others_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_others_l1.png


--------------------------------------------------------------------------------
/thesis_paper/images/epsilon_our_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/epsilon_our_kant.png


--------------------------------------------------------------------------------
/thesis_paper/images/hierarchies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hierarchies.png


--------------------------------------------------------------------------------
/thesis_paper/images/hist_metrics_euclidean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hist_metrics_euclidean.png


--------------------------------------------------------------------------------
/thesis_paper/images/hist_metrics_kantorovich.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/hist_metrics_kantorovich.png


--------------------------------------------------------------------------------
/thesis_paper/images/increasing_ds_size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/increasing_ds_size.png


--------------------------------------------------------------------------------
/thesis_paper/images/local_vs_global.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/local_vs_global.png


--------------------------------------------------------------------------------
/thesis_paper/images/nusers_others_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/nusers_others_kant.png


--------------------------------------------------------------------------------
/thesis_paper/images/nusers_others_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/nusers_others_l1.png


--------------------------------------------------------------------------------
/thesis_paper/images/rr_results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/rr_results.png


--------------------------------------------------------------------------------
/thesis_paper/images/simple_hists.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/simple_hists.png


--------------------------------------------------------------------------------
/thesis_paper/images/true_answers_ldp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/true_answers_ldp.png


--------------------------------------------------------------------------------
/thesis_paper/images/users_our_kant.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/users_our_kant.png


--------------------------------------------------------------------------------
/thesis_paper/images/users_our_l1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/images/users_our_l1.png


--------------------------------------------------------------------------------
/thesis_paper/latexmkrc:
--------------------------------------------------------------------------------
1 | $out_dir = "./build";
2 | $pdflatex = "xelatex %O %S";
3 | $pdf_mode = 1; $postscript_mode = $dvi_mode = 0;
4 | 
5 | 


--------------------------------------------------------------------------------
/thesis_paper/outerjoin10.mf:
--------------------------------------------------------------------------------
 1 | % This file was largely modified from lasy.mf and lasy10.mf
 2 | % by Scott Pakin <pa...@uiuc.edu> and Anonymous.
 3 | 
 4 | if unknown cmbase: input cmbase fi
 5 | def generate suffix t= enddef;
 6 | input cmsy10;
 7 | 
 8 | font_identifier:="OUTERJOIN";
 9 | font_coding_scheme:="Outer-join symbols";
10 | 
11 | % Here we steal a bit from mathsy
12 | 
13 | mode_setup; font_setup;
14 | autorounding:=0;
15 | 
16 | font_slant slant; font_x_height x_height#;
17 | font_quad 18u# if not monospace:+4letter_fit# fi;
18 | slant:=mono_charic#:=0;  % the remaining characters will not be slanted
19 | currenttransform:=identity yscaled aspect_ratio scaled granularity;
20 | 
21 | cmchar "Left outer-join operator";
22 | beginchar(oct"061",18u#,v_center(7u#));
23 | italcorr math_axis#*slant;
24 | adjust_fit(0,0); pickup rule.nib;
25 | numeric a; a=round(1.1*math_axis);
26 | x1=x4=good.x 6u; x2=x3=x1+10u;
27 | x5=x6=x1-5u;
28 | y1=y2=y5=good.y(.5(cap_height-2a)); y3=y4=y6=y1+2a;
29 | draw z1--z3--z2--z4--cycle;
30 | draw z1--z5;
31 | draw z4--z6;
32 | labels(1,2,3,4,5,6); endchar;
33 | 
34 | cmchar "Right outer-join operator";
35 | beginchar(oct"062",18u#,v_center(7u#));
36 | italcorr math_axis#*slant;
37 | adjust_fit(0,0); pickup rule.nib;
38 | numeric a; a=round(1.1*math_axis);
39 | x1=x4=good.x 1.5u; x2=x3=x1+10u;
40 | x5=x6=x2+5u;
41 | y1=y2=y5=good.y(.5(cap_height-2a)); y3=y4=y6=y1+2a;
42 | draw z1--z3--z2--z4--cycle;
43 | draw z2--z5;
44 | draw z3--z6;
45 | labels(1,2,3,4,5,6); endchar;
46 | 
47 | cmchar "Full outer-join operator";
48 | beginchar(oct"063",23u#,v_center(7u#));
49 | italcorr math_axis#*slant;
50 | adjust_fit(0,0); pickup rule.nib;
51 | numeric a; a=round(1.1*math_axis);
52 | x1=x4=good.x 6u; x2=x3=x1+10u;
53 | x5=x6=x1-5u; x7=x8=x2+5u;
54 | y1=y2=y5=y7=good.y(.5(cap_height-2a)); y3=y4=y6=y8=y1+2a;
55 | draw z1--z3--z2--z4--cycle;
56 | draw z1--z5;
57 | draw z4--z6;
58 | draw z2--z7;
59 | draw z3--z8;
60 | labels(1,2,3,4,5,6,7,8); endchar; 
61 | 
62 | cmchar "CSQL intersection operator";
63 | beginchar(oct"064",23u#,v_center(7u#));
64 | italcorr math_axis#*slant;
65 | adjust_fit(0,0); pickup rule.nib;
66 | numeric a; a=round(1.1*math_axis);
67 | x1=x4=good.x 6u; x2=x3=x1+10u;
68 | x5=x6=x1-5u; x7=x8=x2+5u;
69 | y1=y2=y5=y7=good.y(.5(cap_height-2a)); y3=y4=y6=y8=y1+2a;
70 | draw z1--z3--z2--z4--cycle;
71 | draw z1--z5;
72 | draw z4--z6;
73 | draw z2--z7;
74 | draw z3--z8;
75 | labels(1,2,3,4,5,6,7,8); endchar; 
76 | 
77 | 
78 | bye. 


--------------------------------------------------------------------------------
/thesis_paper/outerjoin10.pk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/outerjoin10.pk


--------------------------------------------------------------------------------
/thesis_paper/outerjoin10.tfm:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nikosgalanis/local-dp-protocols/b5521e995f266ff1aeb9fecc220650483630dc04/thesis_paper/outerjoin10.tfm


--------------------------------------------------------------------------------
/thesis_paper/references.bib:
--------------------------------------------------------------------------------
 1 | @book{dwork,
 2 |   author    = {Dwork, C., & Roth, A.},
 3 |   title     = {The algorithmic foundations of differential privacy.},
 4 |   booktitle = {now Publishers Inc.},
 5 |   year      = {2014},
 6 |   url       = {https://www.tau.ac.il/~saharon/BigData2018/privacybook.pdf}
 7 | }
 8 | 
 9 | 
10 | 
11 | 
12 | % @inproceedings{dwork:glove,
13 | %   author = {Dwork, C., McSherry, F., Nissim, K., & Smith, A.},
14 | %   title = {Calibrating Noise to Sensitivity in Private Data Analysis.},
15 | %   booktitle = {Theory of Cryptography,},
16 | %   year = {2006},
17 | %   pages = {265–-284},
18 | %   url = {https://link.springer.com/content/pdf/10.1007/11681878_14.pdf},
19 | % }
20 | 
21 | 
22 | % @inproceedings{holohan:glove,
23 | %   author = {Holohan, N., Braghin, S., Mac Aonghusa, P., & Levacher, K.},
24 | %   title = {Diffprivlib: The IBM Differential Privacy Library.},
25 | %   year = {2019},
26 | %   url = {https://arxiv.org/pdf/1907.02444.pdf},
27 | % }
28 | 
29 | % @inproceedings{li:glove,
30 | %   author = {Li, N., Qardaji, W., & Su, D.},
31 | %   title = {On sampling, anonymization, and differential privacy or,k-anonymization meets differential privacy.},
32 | %   booktitle = {Proceedings of the 7th ACM Symposium on Information, Computer and Communications Security - ASIACCS '12},
33 | %   year = {2012},
34 | %   url = {https://arxiv.org/pdf/1101.2604.pdf},
35 | % }
36 | 


--------------------------------------------------------------------------------
/thesis_paper/refs.tex:
--------------------------------------------------------------------------------
1 | \chapter{BIBLIOGRAPHY}
2 | 
3 | asddfds


--------------------------------------------------------------------------------
/thesis_paper/thesis.tex:
--------------------------------------------------------------------------------
  1 | % demo.tex
  2 | %
  3 | % Enjoy, evolve, and share!
  4 | %
  5 | % Compile it as follows:
  6 | %   latexmk
  7 | %
  8 | % Check file `dithesis.cls' for other configuration options.
  9 | %
 10 | \documentclass[inscr]{dithesis}
 11 | 
 12 | %\usepackage{graphicx}
 13 | 
 14 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 15 | %%%%%%%%%%%%%%%%%%%% User-specific package inclusions %%%%%%%%%%%%%%%%%%%%%%%%%
 16 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 17 | \usepackage{booktabs}
 18 | \usepackage{hyperref}
 19 | \usepackage{lipsum}
 20 | \usepackage{enumerate}
 21 | \usepackage{amsmath}
 22 | \usepackage{amssymb}
 23 | \usepackage{listings}
 24 | 
 25 | \hypersetup{
 26 |     unicode=true,                     % non-Latin characters in bookmarks
 27 |     pdffitwindow=true,                % page fit to window when opened
 28 |     pdfnewwindow=true,                % links in new window
 29 |     pdfkeywords={},                   % list of keywords
 30 |     colorlinks=true,                  % false: boxed links; true: colored links
 31 |     linkcolor=black,                  % color of internal links
 32 |     citecolor=black,                  % color of links to bibliography
 33 |     filecolor=black,                  % color of file links
 34 |     urlcolor=black,                   % color of external links
 35 |     pdftitle={},                      % title
 36 |     pdfauthor={},                     % author
 37 |     pdfsubject={}                     % subject of the document
 38 | }
 39 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 40 | %%%%%%%%%%%%%%%%%%%% User-specific package inclusions %%%%%%%%%%%%%%%%%%%%%%%%%
 41 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 42 | 
 43 | 
 44 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 45 | %%%%%%%%%%%%%%%%%%%%%% User-specific configuration %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 46 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 47 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 48 | %%%%%%%%%%%%%%%%%%%%%% User-specific configuration %%%%%%%%%%%%%%%%%%%%%%%%%%%%
 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 50 | 
 51 | 
 52 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 53 | %%%%%%%%%%%%%%%%%%%%%%%%%%% Required Metadata %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 55 | %
 56 | % First name, last name
 57 | %
 58 | \authorFirstGr{Νικόλαος}
 59 | \authorFirstAbrGr{Ν.} % abbreviation of first name
 60 | \authorMiddleGr{Γ.}   % abbreviation of father's first name
 61 | \authorLastGr{Γαλάνης}
 62 | \authorFirstEn{Nikolaos}
 63 | \authorFirstAbrEn{N.}
 64 | \authorMiddleEn{G.}
 65 | \authorLastEn{Galanis}
 66 | \authorSn{1115201700019}
 67 | 
 68 | %
 69 | % The title of the thesis
 70 | %
 71 | \titleEn{Protection of Sensitive Data: Creating, Analyzing and Testing Protocols of Differential Privacy}
 72 | \titleGr{Προστασία Ευαίσθητων Δεδομένων: Δημιουργία, Ανάλυση και Δοκιμή Πρωτοκόλλων Διαφορικής Ιδιωτικότητας}
 73 | 
 74 | %
 75 | % Month followed by Year
 76 | %
 77 | \dateGr{ΙΟΥΛΙΟΣ 2021}
 78 | \dateEn{JULY 2021}
 79 | 
 80 | %
 81 | % Supervisor(s) info
 82 | %
 83 | \supervisorGr{Κωνσταντίνος Χατζηκοκολάκης}{Αναπληρωτής Καθηγητής}
 84 | \supervisorEn{Konstantinos Chatzikokolakis}{Associate Professor}
 85 | 
 86 | %
 87 | % Abstract, synopsis, inscription, ack, and preface pages.
 88 | %
 89 | % \setlength\parindent{24pt}
 90 | 
 91 | \abstractEn{
 92 | \par The problem of preserving privacy while extracting information during data analysis, has been an everlasting one. Specifically, during the big-data era, user details can be easily compromised by a malicious handler, something considered both as a security, and as a privacy issue.
 93 | \par With that being the case, there is a simple solution of denying the access to user data, thus making the mining of useful information about a plethora of subjects impossible. On the other hand, a successful mechanism would be for the data to be flowing without control, something that would be beneficiary for the advance of sciences (because of the huge amount of information that would be available), but a significant compromisation for the individuals' privacy. \par
 94 | However, none of these two solutions are applicable and helpful for solving our problem. The answer is finding a balance, that would benefit both parties: the users and their privacy, as well as the researchers. The optimal fix to the subject, is Differential Privacy, which is actually a promise, made by the data handler to the user, that they will not be affected, by allowing their data to be used in any analysis, no matter what other studies/databases/info resources are available. Meanwhile, the output data statistics should be accurate enough for any researcher to extract useful information from them.\par
 95 | This is a promise that in the first sight, seems rather hard to be achieved. Despite that, during this thesis, we will look closely into the theory which makes this form of privacy possible, by the addition of random noise to the user data. Differential Privacy is based on probabilistic theories, well known from the $20^{th}$ century, however, it is a rather new technique, which has yet to be fully implemented in a handy way for all data-miners to use.
 96 | \par The goal of this thesis, is to examine and compare previously created mechanisms for D.P., while also creating our own mechanism, that serves to the purpose of achieving Local D.P., a form of Differential Privacy that is nowadays widely used in machine learning algorithms, aiming to protect the individuals that send their personal data for analysis. We will do so, by creating a library that is easy to use, and applies to all the rules of data privacy, and then extract conclusions from its use. 
 97 | 
 98 | During this thesis, a lot of testings will be made, in order to convince for the usability and the efficiency of Differential Privacy.
 99 | }
100 | \abstractGr{
101 | Το πρόβλημα της διατήρησης της ιδιωτικότητας κατά την ανάλυση δεδομένων, υφίσταται για πολύ καιρό. Συγκεκριμένα, στην εποχή των big-data, λεπτομέρειες των χρηστών μπορούν εύκολα να παραβιαστούν από κακόβουλους χειριστές των δεδομένων, γεγονός που θεωρείται ζήτημα τόσο όσον αφορά την ασφάλεια, όσο και την προστασία της ιδιωτικότητας του ατόμου.\par
102 | Mε την υπάρχουσα κατάσταση, υπάρχει η απλή λύση της άρνησης της πρόσβασης σε δεδομένα χρηστών, στον βωμό της προστασίας τους, κάτι που καθιστά την εξαγωγή συμπερασμάτων για ποικίλα θέματα αδύνατη. Από την άλλη, ένας επιτυχημένος μηχανισμός θα ήταν η ελεύθερη διακίνηση των δεδομένων, χωρίς φιλτράρισμά τους, γεγονός που θα ήταν ωφέλιμο για την πρόοδο των επιστημών (λόγω του μεγάλου όγκου δεδομένων που θα ήταν διαθέσιμος), αλλά μία μεγάλη παραβίαση της ιδιωτικότητας των ατόμων. 
103 | \par 
104 | Ωστόσο, καμία από τις δύο αυτές λύσεις δεν μπορεί να εφαρμοστεί και να μας βοηθήσει στην επίλυση τους προβλήματός μας. Η απάντηση είναι η εύρεση μίας ισορροπίας, η οποία ευνοεί και τα δύο μέρη: τους χρήστες και την ιδιωτικότητά τους, όπως και τους ερευνητές. Η βέλτιστη επίλυση του θέματος, είναι η Διαφορική Ιδιωτικότητα, που στην πραγματικότητα πρόκειται για μία υπόσχεση από τον χειριστή των δεδομένων προς τον χρήστη, πως ο χρήστης δεν θα επηρεαστεί αν επιτρέψει τη χρήση των δεδομένων του σε κάποια ανάλυση, χωρίς περιορισμούς όπως η παράλληλη ύπαρξη άλλων μελετών/βάσεων δεδομένων/πληροφοριών που υπάρχουν για αυτόν. Παράλληλα, τα στατιστικά του αποτελέσματος της ανάλυσης, πρέπει να είναι αρκετά ακριβή, ώστε ο ερευνητής να μπορεί να εξάγει χρήσιμη πληροφορία από αυτά. \par 
105 | Η υπόσχεση αυτή, δείχνει δύσκολα υλοποιήσιμη με την πρώτη ματιά. Παρόλα αυτά, σε αυτήν την πτυχιακή εργασία, θα ερευνήσουμε με λεπτομέρεια τη θεωρία που καθιστά εφικτή αυτή τη μορφή ιδιωτικότητας, με την προσθήκη τυχαίου θορύβου στα δεδομένα. Η Διαφορική Ιδιωτικότητα βασίζεται σε πιθανοτικές κατανομές, γνωστές ήδη από τον $20^o$ αιώνα, όμως παραμένει μία νέα τεχνική, η οποία δεν έχει πλήρως υλοποιηθεί με τρόπο τέτοιον ώστε να μπορεί να χρησιμοποιηθεί από πολλούς ανθρώπους που είναι υπεύθυνοι για την εξαγωγή δεδομένων.
106 | \par Σκοπός αυτής της πτυχιακής εργασίας, είναι να μελετήσουμε και να συγκρίνουμε ήδη υλοποιημένους μηχανισμούς πανω στην Δ.Ι., ενώ παράλληλα θα δημιουργήσουμε τον δικό μας μηχανισμό, ο οποίος χρησιμοποιείται για τους σκοπούς της Τοπικής Διαφορικής Ιδιωτικότητας που συναντάται την σήμερον ημέραν σε αλγορίθμους μηχανικής μάθησης, με στόχο να προστατέψει τα δεδομένα που αποστέλλουν για εκμάθηση οι χρήστες. Θα το κατορθώσουμε αυτό δημιουργώντας μία προγραμματιστική βιβλιοθήκη η οποία είναι εύκολη στη χρήση, ικανοποιώνατας παράλληλα τους κανόνες της προστασίας δεδομένων, και τέλος θα εξάγουμε συμπεράσματα από τη χρήση της βιβλιοθήκης αυτής.
107 | 
108 | Κατά την διάρκεια αυτής της εργασίας, θα πραγματοποιηθούν πολλές μετρήσεις, με στόχο να γίνει πειστική η χρησιμότητα και η αποτελεσματικότητα της Διαφορικής Ιδιωτικότητας.
109 | 
110 | }
111 | 
112 | \inscriptionEn{\emph{}}
113 | 
114 | %
115 | % Subject area and keywords
116 | %
117 | \subjectAreaGr{Προστασία και Ιδιωτικότητα Δεδομένων}
118 | \subjectAreaEn{Data Privacy}
119 | \keywordsGr{Διαφορική Ιδιωτικότητα, Ασφάλεια, Δεδομένα Χρηστών, Προστασία Δεδομένων, Θόρυβος σε Δεδομένα, Συλλογή Δεδομένων}
120 | \keywordsEn{Differential Privacy, Security, User data, Data Privacy, Noisy Data, Aggregation of Data}
121 | 
122 | %
123 | % Set the .bib file containing your paper publications (leave the extension out)
124 | %
125 | % This is optional, but it should be specified when option 'lop' is passed to
126 | % the document class.
127 | %
128 | % Then, inside the document environment, you may use the command '\nocitelop' to
129 | % site your papers, as you would traditionally do with the commands '\cite' or
130 | % '\nocite'.
131 | %
132 | % The papers are printed in reverse chronological order.
133 | %
134 | %\lopfile{mypapers/pubs}
135 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
136 | %%%%%%%%%%%%%%%%%%%%%%%%%%% Required Metadata %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
137 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
138 | 
139 | \begin{document}
140 | 
141 | \frontmatter
142 | 
143 | \mainmatter
144 | 
145 | \include{GDP/Intro}
146 | 
147 | \include{GDP/DP_definition}
148 | 
149 | \include{GDP/IBM}
150 | 
151 | \include{GDP/ARX}
152 | 
153 | \include{LDP/intro}
154 | 
155 | \include{LDP/other_protocols}
156 | 
157 | \include{LDP/our_protocol}
158 | 
159 | \chapter{CONCLUSIONS AND FUTURE WORK}
160 | 
161 | The goal of this thesis was to analyze the importance of protecting sensitive data, and doing so in an efficient way. After its elaboration, it is clear that Differential Privacy is a secure and efficient way for data anonymization. Having two forms, the Global and the Local, it can cover many different scenarios, including Machine Learning applications.
162 | 
163 | Differential Privacy is the future of Data Protection and Anonymization, as its results can not be compromised, due to the random noise that the algorithms introduce. Unlike previous methods, such as k-anonymity, there is not yet an attack that can reduce the privacy created by D.P. algorithms, which makes this technique ideal.
164 | 
165 | Despite the use of random noise the data is still useful, as the mathematical ideas behind the aggregation were built with the mindset of eliminating this noise using data normalization.
166 | 
167 | Having explored many different applications, algorithms and protocols we can safely say that when it comes down to Global D.P., IBM's diffprivlib is a state of the art library that produces extremely good results. Its use is quite simple as a Python API is provided, thus can be safely added to any numerical dataset.
168 | 
169 | When someone wants to apply L.D.P. during a survey, the pure protocols analyzed and tested are suitable for high efficiency combined with good protections of the members. With simple algorithms, they do not require a trusted curator in order to perform, hence users can perturb their data, and then safely report it. However, when the number of users is small, \emph{the Distance Sensitive Protocol created for the needs of this Thesis is the best option}, as the other protocols produce extreme noise in order to maintain the privacy levels. On the contrary, the D.S. protocol takes into account the distance between the true value and the one being reported when creating its probabilistic space, thus lowering the error produced.
170 | 
171 | Our plans for future work are centered around the D.S. protocol. We would like to perfect its aggregation method, as it may produce satisfying results, but with a different approach it can maybe become even better. Moreover, we would like to perform more demanding experiments for extreme cases of dataset sizes, domain sizes and theta values.
172 | 
173 | Finally, similar testings like the ones introduced in this Thesis can be performed in other D.P. libraries, as the accuracy measurements is a good indicator if someone wants to rank those libraries.
174 | 
175 | 
176 | \backmatter
177 | 
178 | % abbreviations table
179 | \abbreviations
180 | \begin{center}
181 | 	\renewcommand{\arraystretch}{1.5}
182 | 	\begin{longtable}{| l | @{\qquad} l |}
183 | 	\hline
184 | 	EMD & Earth Mover's Distance \\
185 |   \hline
186 |     QIF & Quantitative Information Flow \\
187 |   \hline
188 | 	DP & Differential Privacy\\
189 |   \hline
190 | 	Kant. & Kantorovich \\
191 |   \hline
192 |   LDP & Local Differential Privacy\\
193 |   \hline
194 |   GDP & Global Differential Privacy\\
195 |   \hline
196 |   CSV & Comma Separated Values\\
197 |   \hline
198 |   GUI & Graphical User Interface\\
199 |   \hline
200 |   RR & Randomized Response\\
201 |   \hline
202 |   DE & Direct Encoding\\
203 |   \hline
204 |   HE & Histogram Encoding\\
205 |   \hline
206 |   UE & Unary Encoding\\
207 |   \hline
208 |   DS & Distance Sensitive\\
209 |   \hline
210 | 	\end{longtable}
211 | \end{center}
212 | 
213 | % appendix
214 | \begin{appendix}
215 | % mark the beginning of the appendix
216 | \appendixstartedtrue
217 | 
218 | % add appendix line to ToC
219 | \phantomsection
220 | \addcontentsline{toc}{chapter}{APPENDICES}
221 | 
222 | \chapter{MATHEMATICAL PROOF OF THE D.S. PROTOCOL}
223 | During this Appendix, a mathematical explanation for the $a$ variable in the D.S. protocol will be given.
224 | 
225 | In order to find out the $\alpha$ value, we must solve the following equation:
226 | 
227 | \begin{align*}
228 |     p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1
229 | \end{align*}
230 | 
231 | At this point, we must note that $\aplha$ although not a constant, can be held out of the sums, because it is obviously independent from the $i$ variable, that is the variable parsing through the domain in order to retrieve the false elements' probabilities. Thus, we have:
232 | 
233 | \begin{align*}
234 |         p + \sum_{i = x - \theta}^{i = x + \theta} q + \sum_{i = 1}^{i = x - \theta -1} s + \sum_{i = x + \theta + 1}^{i = d} s = 1 \Longleftrightarrow \\
235 |       \sum_{i = x - \theta}^{i = x + \theta} \frac{a}{|x-i|(|x-i| + 1)} + \sum_{i = 1}^{i = x - \theta -1} \frac{a}{\theta(\theta+1)} + \sum_{i = x + \theta + 1}^{i = d} \frac{a}{\theta(\theta+1)} = 1 - p \Longleftrightarrow \\
236 |     \sum_{i = x - \theta}^{i = x - 1} \frac{a}{(x - i)(x - i + 1)} + \sum_{i = x + 1}^{i = x + \theta} \frac{a}{(i - x)(i - x + 1)} +\\+ \frac{a}{\theta(\theta+1)} \cdot (x - \theta - 1 + d - x - \theta) = 1 - p \Longleftrightarrow \\
237 | \end{align*}
238 | 
239 | For the first sum, we set $u = x - i$ and for the second one $u = i - x$, and we have:
240 | 
241 | \begin{align*}
242 |     \sum_{u = 1}^{u = \theta} \frac{a}{u (u + 1)} + \sum_{u = 1}^{u = \theta} \frac{a}{u (u + 1)} + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\
243 |     2 \cdot a (1 - \frac{1}{\theta + 1}) + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\
244 |     2 \cdot a \frac{\theta}{\theta + 1} + \frac{a}{\theta(\theta+1)} \cdot (d - 2\theta - 1) = 1 - p \Longleftrightarrow\\
245 |     \frac{a}{\theta + 1}(2 \theta + \frac{d - 2\theta - 1}{\theta}) = 1 - p \Longleftrightarrow \\ 
246 |     \frac{a}{\theta + 1}(\frac{2\theta^2 - 2\theta + d - 1}{\theta}) = 1 - p \Longleftrightarrow \\ 
247 |     \mathbf{a = \frac{\theta(\theta + 1) (1 - p)}{2\theta^2 - 2\theta + d - 1}}
248 | \end{align*}
249 | 
250 | \chapter{REPOSITORY OF THE THESIS}
251 | The implementation of all the testings, the libraries and the protocols can be found in the GitHub repository of this thesis, in the link: \url{https://github.com/nikosgalanis/bsc-thesis}. 
252 | 
253 | In the directory \textit{ibm\_lib\_work}, all the notebooks with the measurements made for the IBM library are included.
254 | 
255 | In the directory \textit{ARX\_work}, the java code for the measurements in ARX is included, as well as the datasets and the hierarchies used in order to test the protocol.
256 | 
257 | In the directory \textit{LDP}, the LDP library is implemented, using the already-known protocols from the Wang et. al. paper. Additionally, our own protocol created for the needs of this Thesis is included, alongside with a Python file responsible to create all the testings that were carried out.
258 | 
259 | Finally, in the directory \textit{papers\_used}, all of the papers referenced in this Thesis can be found. 
260 | 
261 | 
262 | More information for the repository and its contents can be found in the README file included.
263 | 
264 | \end{appendix}
265 | 
266 | % % manually include the bibliography
267 | \bibliographystyle{plain}
268 | 
269 | 
270 | {\huge \bibliography{references}}
271 | 
272 | % % include it also in ToC (do sth on your own)
273 | \addcontentsline{toc}{chapter}{REFERENCES}
274 | 
275 | 
276 | \fontsize{10}{14}\selectfont
277 | \setmainfont{Arial}
278 | 
279 | [1]\hspace{1cm}Dwork, C., & Roth, A. (2014). The algorithmic foundations of differential privacy. now Publishers Inc. 
280 | 
281 | [2]\hspace{1cm}Dwork, C., McSherry, F., Nissim, K., & Smith, A. (2006). Calibrating Noise to Sensitivity in Private Data Analysis. Theory of Cryptography, 265–284. 
282 | 
283 | [3]\hspace{1cm}Holohan, N., Braghin, S., Mac Aonghusa, P., & Levacher, K. (2019, July 4). Diffprivlib: The IBM Differential Privacy Library. arXiv.org. 
284 | 
285 | [4]\hspace{1cm}Li, N., Qardaji, W., & Su, D. (2012). On sampling, anonymization, and differential privacy or,k-anonymization meets differential privacy. Proceedings of the 7th ACM Symposium on Information, Computer and Communications Security - ASIACCS '12. 
286 | 
287 | [5]\hspace{1cm}Bild, R., Kuhn, K. A., & Prasser, F. (2018). SafePub: A Truthful Data Anonymization Algorithm With Strong Privacy Guarantees. Proceedings on Privacy Enhancing Technologies, 2018(1), 67–87.
288 | 
289 | [6]\hspace{1cm}Christofides, T. C. (2003). A generalized randomized response technique. Metrika, 57(2), 195–200. 
290 | 
291 | [7]\hspace{1cm}Chatzikokolakis, K., Palamidessi, C., & Stronati, M. (2015). Location privacy via geo-indistinguishability. ACM SIGLOG News, 2(3), 46–69. 
292 | 
293 | [8]\hspace{1cm}Jain, P., Gyanchandani, M., & Khare, N. (2018). Differential privacy: its technological prescriptive using big data. Journal of Big Data, 5(1). 
294 | 
295 | [9]\hspace{1cm}Bebensee, B. (2019, July 27). Local Differential Privacy: a tutorial. arXiv.org. 
296 | 
297 | [10]\hspace{1cm}Tianhao Wang, Jeremiah Blocki, Ninghui Li, and Somesh Jha. 2017. Locally differentially private protocols for frequency estimation. In Proceedings of the 26th USENIX Conference on Security Symposium (SEC'17). USENIX Association, USA, 729–745.
298 | 
299 | [11]\hspace{1cm}Chatzikokolakis, K., Andrés, M. E., Bordenabe, N. E., & Palamidessi, C. (2013). Broadening the Scope of Differential Privacy Using Metrics. Privacy Enhancing Technologies, 82–102.
300 | 
301 | [12]\hspace{1cm}Chamikara, M.A.P. & Bertok, P. & Khalil, Ibrahim & Liu, D. & Camtepe, Seyit. (2019). Local Differential Privacy for Deep Learning. 
302 | 
303 | [13]\hspace{1cm}Chatzikokolakis, K., Fernandes, N., & Palamidessi, C. (2020). Refinement Orders for Quantitative Information Flow and Differential Privacy. Journal of Cybersecurity and Privacy, 1(1), 40–77.
304 | 
305 | [14]\hspace{1cm}Bassily, R., & Smith, A. (2015). Local, Private, Efficient Protocols for Succinct Histograms. Proceedings of the Forty-Seventh Annual ACM Symposium on Theory of Computing.
306 | 
307 | [15]\hspace{1cm}Erlingsson, Ú., Pihur, V., & Korolova, A. (2014). RAPPOR. Proceedings of the 2014 ACM SIGSAC Conference on Computer and Communications Security.
308 | 
309 | [16]\hspace{1cm} "Surgery Charges Across the U.S.", https://data.world/dmikebishop/surgery-charges-across-the-u-s.
310 | 
311 | [17]\hspace{1cm} "NBA Salaries", https://data.world/datadavis/nba-salaries
312 | 
313 | \end{document}
314 | 


--------------------------------------------------------------------------------