├── .gitignore ├── 01 - Pseudonymization.ipynb ├── 02 - Exploring Differential Privacy with Laplace.ipynb ├── 03 - Differential Privacy with Gaussian Noise.ipynb ├── 06 - Local Differential Privacy via Randomized Response.ipynb ├── 07 - Automating Privacy in Pipelines.ipynb ├── 08 - Differential Privacy in Pipelines.ipynb ├── 09 - Differentially Private Training with Opacus.ipynb ├── 10 - Federated Learning with flwr.ipynb ├── 11 - Secret Sharing.ipynb ├── 12 - Homomorphic Encryption with Paillier.ipynb ├── 13 - PSI and Moose - Encrypted Computation for Data Sharing.ipynb ├── README.md ├── Video Course - Differential Privacy with Tumult Analytics.ipynb ├── Video Course - Example Dataset Creation.ipynb ├── data ├── alice_id.csv ├── alice_keys.csv ├── alice_keys.csv_metrics ├── block-stats.csv ├── bob_id.csv ├── bob_keys.csv ├── bob_keys.csv_metrics ├── browsing.csv ├── database.csv ├── iot_example.csv ├── members.csv ├── orders.csv ├── user_id_available_a.npy ├── user_id_available_b.npy ├── x_a.npy └── x_b.npy ├── database-reconstruction-attack.ipynb ├── dpia-template.docx ├── flower ├── client.py ├── personalization_use_case.txt ├── server.py ├── video_course_client.py ├── video_course_run.sh └── video_course_server.py ├── in-progress ├── 05 - Differential Privacy Experiments (Work In Progress!).ipynb ├── Generating Example Data.ipynb └── Presidio in Pipelines.ipynb ├── order_summary_for_sharing_expecation_file.json ├── reader-contributions ├── README.md └── database_reconstruction_attack_ortools.ipynb ├── requirements.txt └── solutions ├── filter_bounds.py ├── laplace_dp.py ├── lockpick.py ├── masked_pseudonym.py ├── multiply.py ├── pad_text.py ├── proper_encoding.py └── subtraction.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | spark-warehouse/ 3 | .ipynb_checkpoints/ 4 | data/client* 5 | data/steam* 6 | data/*_for_sharing.csv 7 | data/__MACOSX/ 8 | data/food-101/ 9 | data/snli* 10 | data/*.json 11 | -------------------------------------------------------------------------------- /01 - Pseudonymization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 01. Pseudonymization\n", 8 | "\n", 9 | "In this notebook, we'll explore pseudonymization methods such as hashing, masking and format-preserving encryption.\n", 10 | "\n", 11 | "For more reading on the topic, please see: \n", 12 | "\n", 13 | "- [Medium (Alex Ewerlöf): Anonymization vs. Pseudonymization](https://medium.com/@alexewerlof/gdpr-pseudonymization-techniques-62f7b3b46a56)\n", 14 | "- [KIProtect: GDPR for Data Science](https://kiprotect.com/blog/gdpr_for_data_science.html)\n", 15 | "- [IAPP: Anonymization and Pseudonymization Compared in relation to GDPR compliance](https://iapp.org/media/pdf/resource_center/PA_WP2-Anonymous-pseudonymous-comparison.pdf)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import base64\n", 25 | "from hashlib import blake2b\n", 26 | "\n", 27 | "import pandas as pd\n", 28 | "import json\n", 29 | "import requests\n", 30 | "\n", 31 | "from faker import Faker\n", 32 | "from ff3 import FF3Cipher" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "#### Precheck: What is our data? 
\n", 40 | "- What information is contained in our data?\n", 41 | "- What privacy concerns are there?\n", 42 | "- How should we proceed?" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df = pd.read_csv('data/iot_example.csv')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "df.head()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "#### Section One: Hashing\n", 68 | "\n", 69 | "- Applying the blake2b hash\n", 70 | "- Allowing for de-pseudonymization\n", 71 | "- Creating a reusable method for hashing" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "username = df.iloc[0,1]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "username" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "hasher = blake2b()\n", 99 | "hasher.update(username)\n", 100 | "hasher.hexdigest()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "Oops. What went wrong? How can we fix?" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# %load solutions/proper_encoding.py\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "Great! Now we have a hash. Michael is safe! (or [is he?](https://nakedsecurity.sophos.com/2014/06/24/new-york-city-makes-a-hash-of-taxi-driver-data-disclosure/))\n", 131 | "\n", 132 | "But... what if we need to later determine that michaelsmith is a2a858011c091715...." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "hasher." 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "Okay, let's try something that we can reverse..." 
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": null,
154 | "metadata": {},
155 | "outputs": [],
156 | "source": [
157 | "# From https://stackoverflow.com/questions/2490334/simple-way-to-encode-a-string-according-to-a-password\n",
158 | "\n",
159 | "def encode(key, clear):\n",
160 | "    enc = []\n",
161 | "    for i in range(len(clear)):\n",
162 | "        key_c = key[i % len(key)]\n",
163 | "        #print(key_c)\n",
164 | "        enc_c = (ord(clear[i]) + ord(key_c)) % 256\n",
165 | "        #print(enc_c)\n",
166 | "        enc.append(enc_c)\n",
167 | "    return base64.urlsafe_b64encode(bytes(enc))\n",
168 | "\n",
169 | "def decode(key, enc):\n",
170 | "    dec = []\n",
171 | "    enc = base64.urlsafe_b64decode(enc)\n",
172 | "    for i in range(len(enc)):\n",
173 | "        key_c = key[i % len(key)]\n",
174 | "        dec_c = chr((256 + enc[i] - ord(key_c)) % 256)\n",
175 | "        dec.append(dec_c)\n",
176 | "    return \"\".join(dec)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "encode('supa_secret', username)"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "metadata": {},
192 | "outputs": [],
193 | "source": [
194 | "decode('supa_secret', b'4N7TycDY0dbfzujb')"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "#### Challenge\n",
202 | "\n",
203 | "- Can you come up with another string which will properly decode the secret and is *not* the same as the original key?\n",
204 | "- Hint: Take a look at the encode method and use the print statements for a clue."
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": []
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": null,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "# %load solutions/lockpick.py\n"
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "metadata": {},
226 | "source": [
227 | "Welp. That maybe is not so great... "
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "#### Section Two: Data Masking and Tokenization\n",
235 | "\n",
236 | "- What should we mask?\n",
237 | "- How?\n",
238 | "- What do we do if we need realistic values?"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": null,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "df.sample(2)"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {},
254 | "outputs": [],
255 | "source": [
256 | "super_masked = df.applymap(lambda x: 'NOPE')"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "super_masked.head()"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "😜\n",
273 | "\n",
274 | "Okay, no more jokes. But masking usually is just that. Replace your sensitive data with some sort of representation.\n",
275 | "\n",
276 | "But instead, we could also tokenize the data. This means replacing it with random fictitious data. How do we tokenize this?"
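,
"\n",
"(Before we move on to tokenization, here is a minimal masking sketch. Keeping the first character of `username` is an arbitrary choice for illustration -- adapt it to whichever columns are sensitive.)\n",
"\n",
"```python\n",
"# replace all but the first character with a fixed mask character\n",
"df['username_masked'] = df['username'].map(lambda u: u[0] + '*' * (len(u) - 1))\n",
"```"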
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "fakes = Faker()" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "fakes.name()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "fakes." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "fakes.user_name()" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "#### Challenge\n", 320 | "\n", 321 | "Make a new column `pseudonym` which masks the data using the faker `user_name` method." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "# %load solutions/masked_pseudonym.py\n", 338 | "\n" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "Whaaaa!?!? Pretty cool, eh? \n", 346 | "\n", 347 | "(In case you want to read up on [how it works](https://github.com/joke2k/faker/blob/06d323f6cff95103d4ccda03f5d4ab2c45334e46/faker/providers/internet/__init__.py#L162))\n", 348 | "\n", 349 | "But.. we can't reverse it. It is tuned per locale (usually using probabilities based on names in locale). That said, works fabulous for test data!" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": {}, 355 | "source": [ 356 | "#### Step Three: Format-Preserving Encryption" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "key = \"2DE79D232DF5585D68CE47882AE256D6\"\n", 366 | "tweak = \"CBD09280979564\"\n", 367 | "\n", 368 | "c6 = FF3Cipher.withCustomAlphabet(key, tweak, \"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_\")\n", 369 | "\n", 370 | "plaintext = \"michaelsmith\"\n", 371 | "ciphertext = c6.encrypt(plaintext)\n", 372 | "\n", 373 | "ciphertext" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "decrypted = c6.decrypt(ciphertext)\n", 383 | "decrypted" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "df['username'] = df['username'].map(c6.encrypt)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "Oh no! What does this message mean and how can we fix it?" 
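,
"\n",
"A hint at a likely cause (an assumption -- check your actual traceback): FF3 only accepts plaintexts within a minimum and maximum length for the chosen alphabet, and some usernames in this dataset fall outside that range. One possible padding sketch for the exercise cell below (the `_` filler character and the `min_len=4` default are illustrative choices, not the loaded solution):\n",
"\n",
"```python\n",
"def add_padding_and_encrypt(cipher, username, min_len=4):\n",
"    # pad short values with a filler character that is in the cipher's alphabet\n",
"    if len(username) < min_len:\n",
"        username = username.ljust(min_len, '_')\n",
"    return cipher.encrypt(username)\n",
"```"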
400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "# %load solutions/pad_text.py\n" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": null, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "def add_padding_and_encrypt(cipher, username):\n", 418 | " # add code here" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "df['username'] = df['username'].map(lambda x: add_padding_and_encrypt(c6, x))" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "df['username']" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "metadata": {}, 442 | "source": [ 443 | "### Questions\n", 444 | "\n", 445 | "- What would happen if someone found our key?\n", 446 | "- What happens if a username ends in X?\n", 447 | "- What properties do we need in our data in order to maintain encryption-level security?" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "#### Additional Challenge\n", 455 | "\n", 456 | "How would we build our own format-preserving encryption?" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "num_cipher = FF3Cipher.withCustomAlphabet(key, tweak, \"0123456789\")" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "example = \"2017-01-01T12:00:23\"" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "metadata": {}, 481 | "outputs": [], 482 | "source": [ 483 | "enc_date = num_cipher.encrypt(example.replace(\"T\",\"\").replace(\":\",\"\").replace(\"-\",\"\"))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": null, 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "enc_ts = f\"{enc_date[:4]}-{enc_date[4:6]}-{enc_date[6:8]}T{enc_date[8:10]}:{enc_date[10:12]}:{enc_date[12:14]}\"" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": null, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "enc_ts" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "#### Homework Challenge\n", 509 | "\n", 510 | "Create a function to format preserve another column in the data.\n", 511 | "\n", 512 | "Return a new dataframe of just the pseudonymized data." 
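,
"\n",
"Before starting, it may help to confirm that the timestamp example above round-trips (this just reverses the steps from the cells above):\n",
"\n",
"```python\n",
"# strip the separators, decrypt, then re-insert the separators\n",
"dec_date = num_cipher.decrypt(enc_ts.replace(\"T\", \"\").replace(\":\", \"\").replace(\"-\", \"\"))\n",
"dec_ts = f\"{dec_date[:4]}-{dec_date[4:6]}-{dec_date[6:8]}T{dec_date[8:10]}:{dec_date[10:12]}:{dec_date[12:14]}\"\n",
"assert dec_ts == example\n",
"```"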
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": null,
518 | "metadata": {},
519 | "outputs": [],
520 | "source": []
521 | },
522 | {
523 | "cell_type": "code",
524 | "execution_count": null,
525 | "metadata": {},
526 | "outputs": [],
527 | "source": []
528 | }
529 | ],
530 | "metadata": {
531 | "kernelspec": {
532 | "display_name": "Python 3 (ipykernel)",
533 | "language": "python",
534 | "name": "python3"
535 | },
536 | "language_info": {
537 | "codemirror_mode": {
538 | "name": "ipython",
539 | "version": 3
540 | },
541 | "file_extension": ".py",
542 | "mimetype": "text/x-python",
543 | "name": "python",
544 | "nbconvert_exporter": "python",
545 | "pygments_lexer": "ipython3",
546 | "version": "3.9.13"
547 | }
548 | },
549 | "nbformat": 4,
550 | "nbformat_minor": 2
551 | }
552 | 
--------------------------------------------------------------------------------
/06 - Local Differential Privacy via Randomized Response.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from collections import Counter\n",
10 | "from math import e\n",
11 | "from random import choice\n",
12 | "\n",
13 | "import numpy as np\n",
14 | "import matplotlib.pyplot as plt\n",
15 | "%matplotlib inline"
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {},
21 | "source": [
22 | "Imagine we are asked to participate in a survey that will use randomized response as a way to guarantee plausible deniability. We want to first determine what our personal threshold (loss function) is for participating. We can imagine that one or more of the questions we will be asked are sensitive. Here are a few examples:\n",
23 | "\n",
24 | "- Have you knowingly consumed illegal drugs in the past month? \n",
25 | "- Have you tested positive for Covid in the past 6 months?\n",
26 | "- Have you ever lied to your boss?\n",
27 | "\n",
28 | "Answering yes (or, arguably, no) to any of these questions would be considered quite sensitive in terms of personal privacy. We will take yes as the most sensitive response and say that if you toss a coin and it turns up heads (1), that you tell the truth in your response. If it turns up tails, you toss the coin again and you report No (0) if it comes up tails and Yes (1) if it comes up heads. \n",
29 | "\n",
30 | "Let's take a look at how this mechanism ensures differential privacy!"
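,
"\n",
"To make the mechanism concrete before simulating it, here is the closed-form check (fair coins assumed, as described above):\n",
"\n",
"```python\n",
"from fractions import Fraction\n",
"\n",
"# P(report Yes | truth is Yes): heads (1/2) -> tell the truth,\n",
"# tails (1/2) -> report the second flip, which is heads (Yes) half the time\n",
"p_report_yes = Fraction(1, 2) + Fraction(1, 2) * Fraction(1, 2)\n",
"p_report_yes  # 3/4\n",
"```"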
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "outcomes = [0, 1]\n",
40 | "our_true_response = 1"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "coin_flip = choice(outcomes)\n",
50 | "print(\"Initial coin flip is %s\" % coin_flip)\n",
51 | "if coin_flip == 0:\n",
52 | "    second_coin_flip = choice(outcomes)\n",
53 | "    print(\"Second coin flip is %s\" % second_coin_flip)"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": null,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "def flip_coin(outcomes, our_response=1):\n",
63 | "    coin_flip = choice(outcomes)\n",
64 | "    if coin_flip == 1:\n",
65 | "        return our_response\n",
66 | "    second_coin_flip = choice(outcomes)\n",
67 | "    return second_coin_flip"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def randomized_response(outcomes, num_trials=100):\n",
77 | "    return [flip_coin(outcomes) for x in range(num_trials)]"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": null,
83 | "metadata": {},
84 | "outputs": [],
85 | "source": [
86 | "results = randomized_response(outcomes)\n",
87 | "Counter(results)"
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": null,
93 | "metadata": {},
94 | "outputs": [],
95 | "source": [
96 | "plt.bar(Counter(results).keys(), Counter(results).values(), tick_label=['Yes', 'No'])"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "We can see here the benefit of plausible deniability. Even though 100 of us would have answered yes, we see that there is always some proportion of no responses due to the randomized response mechanism. This protects all of us by giving every respondent plausible deniability. "
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "So for us personally, this aligns well with looking at our own probability distribution w.r.t. the sensitive questions. 
If our true answer is yes we have:\n", 111 | "\n", 112 | "P[A(Yes)=Yes]=0.75, P[A(Yes)=No]=0.25\n", 113 | "\n", 114 | "75% chance of saying Yes, 25% chance of saying No.\n", 115 | "\n", 116 | "\n", 117 | "If our true answer is no we have:\n", 118 | "\n", 119 | "P[A(No)=Yes]=0.25, P[A(No)=No]=0.75\n", 120 | "\n", 121 | "This difference shows a sensitivity of 3x (.25 * 3 = .75), which means our probability should be bounded by 3 in differential privacy, which would mean\n", 122 | "\n", 123 | "e^epsilon = 3\n", 124 | "\n", 125 | "Making epsilon ≃ 1.1 if we are using (ɛ, 0)-differential privacy :) " 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "e**1.1" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "x = np.linspace(0,3,10000)\n", 144 | "y = np.power(e,x)\n", 145 | "\n", 146 | "fig,ax= plt.subplots()\n", 147 | "plt.style.use('ggplot')\n", 148 | "ax.plot(x,y);\n", 149 | "ax.set_title('e^epsilon')\n", 150 | "ax.set_xlabel('epsilon')\n", 151 | "ax.set_ylabel('probability bound');" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Of course, it also tells the researcher that there are quite a lot of true Yeses, due to the ratio of the outcomes above. Due to our epsilon bounds, we can say that the researcher can increase their probability of learning something about us by 3x with successive queries via this mechanism. Of course, we also have the power of large numbers in our favor here, meaning the researcher being able to link our exact response to us is quite rare and would require a targeted attack (or several). \n", 159 | "\n", 160 | "Now, how does the researcher learn from the survey? They would need to de-noise the data as best possible which means thinking about the biases they introduced with the differentially private mechanism.\n", 161 | "\n", 162 | "How the researcher might try to de-noise the data is as follows." 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "First, we will create some data where there is more initial variety in the responses! Feel free to modify below to try out different underlying distributions!" 
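,
"\n",
"For example (the 75/25 split here is just an illustration -- any probabilities that sum to 1 work):\n",
"\n",
"```python\n",
"# a population where 75% would truly answer yes\n",
"underlying_dist = np.random.choice([0, 1], size=100, p=[.25, .75])\n",
"```"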
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": null,
175 | "metadata": {},
176 | "outputs": [],
177 | "source": [
178 | "underlying_dist = np.random.choice([0, 1], size=100, p=[.2, .8])\n",
179 | "\n",
180 | "def randomized_response(outcomes):\n",
181 | "    return [flip_coin(outcomes, our_response=x) for x in underlying_dist]\n",
182 | "results = randomized_response(outcomes)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": null,
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "Counter(underlying_dist)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "Counter(results)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": null,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "def denoise_data(survey_results):\n",
210 | "    results = Counter(survey_results)\n",
211 | "    # We want to first figure out our guess of # of possible random responses per Yes, No answer\n",
212 | "    num_random_responses = len(survey_results)*0.5 / len(results)\n",
213 | "    denoised = {'Yes': 0, 'No':0, 'Unaccounted':0}\n",
214 | "    for outcome, num_responses in results.items():\n",
215 | "        if outcome == 0: \n",
216 | "            outcome = 'No'\n",
217 | "        else:\n",
218 | "            outcome = 'Yes'\n",
219 | "        # if the number of responses for this answer is more than we expect, we could try to subtract the random responses (naive and crude!)\n",
220 | "        if num_responses > num_random_responses:\n",
221 | "            denoised[outcome] = num_responses - num_random_responses\n",
222 | "        elif outcome == 'Yes': # oops, we have too few Yes answers because our distribution is skewed\n",
223 | "            denoised[outcome] = 0\n",
224 | "            denoised['Unaccounted'] = num_random_responses - num_responses\n",
225 | "        else: # or too few No!\n",
226 | "            denoised[outcome] = 0 \n",
227 | "            denoised['Unaccounted'] = num_random_responses - num_responses\n",
228 | "    return denoised"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": [
237 | "results = denoise_data(randomized_response(outcomes))"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "results"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": null,
252 | "metadata": {},
253 | "outputs": [],
254 | "source": [
255 | "plt.bar(Counter(results).keys(), Counter(results).values(), \n",
256 | "        tick_label=['Yes', 'No', 'Unaccounted'])"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "And one way to interpret this would be that we could guess that a large percent of the population had a true answer of Yes. This is of course quite inaccurate compared to our knowledge that 80% of the population would truly answer yes, but it is a step in the right direction! Thus, we see both the accuracy limits and results of our randomized response! :)\n",
264 | "\n",
265 | "\n",
266 | "### Challenge\n",
267 | "\n",
268 | "How can we modify the functions above to show how it might work on different underlying distributions? (e.g. if 75% of the population would answer yes, or if 50% would answer yes?). What changes with the results and the accuracy? 
\n", 269 | "\n", 270 | "\n", 271 | "### Additional Challenge\n", 272 | "\n", 273 | "If you were a researcher trained in Bayesian statistics, could you use a prior here that would allow you to have a more accurate takeaway from the response? (Or from future responses?). Asked in another way, what information have we learned from our initial survey that could inform future surveys by using an appropriate probability distribution function?" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "### Recommended Reading\n", 281 | "\n", 282 | "- Ted on Privacy's Blog Series on Differential Privacy! https://desfontain.es/privacy/friendly-intro-to-differential-privacy.html" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [] 298 | } 299 | ], 300 | "metadata": { 301 | "kernelspec": { 302 | "display_name": "Python 3 (ipykernel)", 303 | "language": "python", 304 | "name": "python3" 305 | }, 306 | "language_info": { 307 | "codemirror_mode": { 308 | "name": "ipython", 309 | "version": 3 310 | }, 311 | "file_extension": ".py", 312 | "mimetype": "text/x-python", 313 | "name": "python", 314 | "nbconvert_exporter": "python", 315 | "pygments_lexer": "ipython3", 316 | "version": "3.9.13" 317 | } 318 | }, 319 | "nbformat": 4, 320 | "nbformat_minor": 4 321 | } 322 | -------------------------------------------------------------------------------- /08 - Differential Privacy in Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "268e15b0", 6 | "metadata": {}, 7 | "source": [ 8 | "### Differential Privacy with Spark using Tumult Analytics\n", 9 | "\n", 10 | "For this notebook, you'll need to follow the [Tumult installation instructions](https://docs.tmlt.dev/analytics/latest/installation.html) in a separate virtual environment, as the library currently has clashing requirements compared to some of the other libraries in these notebooks.\n", 11 | "\n", 12 | "Once you have it installed, you're ready to try out your differential privacy knowledge with Spark, let's go!" 
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 4,
18 | "id": "03d739e2",
19 | "metadata": {},
20 | "outputs": [],
21 | "source": [
22 | "import matplotlib.pyplot as plt\n",
23 | "import pandas as pd\n",
24 | "import seaborn as sns\n",
25 | "\n",
26 | "from pyspark.sql import SparkSession\n",
27 | "from tmlt.analytics.keyset import KeySet\n",
28 | "from tmlt.analytics.privacy_budget import PureDPBudget\n",
29 | "from tmlt.analytics.protected_change import AddOneRow\n",
30 | "from tmlt.analytics.query_builder import QueryBuilder, ColumnType, BinningSpec\n",
31 | "from tmlt.analytics.session import Session\n",
32 | "\n",
33 | "\n",
34 | "spark = SparkSession.builder.getOrCreate()\n",
35 | "members_df = spark.read.csv(\"data/members.csv\", header=True, inferSchema=True)\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 5,
41 | "id": "07baa055",
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "session = Session.from_dataframe(\n",
46 | "    privacy_budget=PureDPBudget(epsilon=1.1),\n",
47 | "    source_id=\"members\",\n",
48 | "    dataframe=members_df,\n",
49 | "    protected_change=AddOneRow(),\n",
50 | ")"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 6,
56 | "id": "fdbefbb2",
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "['id',\n",
63 | " 'name',\n",
64 | " 'age',\n",
65 | " 'gender',\n",
66 | " 'education_level',\n",
67 | " 'zip_code',\n",
68 | " 'books_borrowed',\n",
69 | " 'favorite_genres',\n",
70 | " 'date_joined']"
71 | ]
72 | },
73 | "execution_count": 6,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "members_df.columns"
80 | ]
81 | },
82 | {
83 | "cell_type": "markdown",
84 | "id": "33907a81",
85 | "metadata": {},
86 | "source": [
87 | "I'm curious if there is a correlation between education_level and books_borrowed. Let's take a look!"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "id": "ed6c5e8d",
93 | "metadata": {},
94 | "source": [
95 | "I first need to build a KeySet with the values I'd like to use... Normally I would use Spark to do this, but I need to get the list of values without looking at the data itself, as this would count towards my budget. Thankfully, we have well documented data, so I was able to get the following list! :)"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 7,
101 | "id": "2bdbe680",
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "edu_levels = KeySet.from_dict({\n",
106 | "    \"education_level\": [\n",
107 | "        \"up-to-high-school\",\n",
108 | "        \"high-school-diploma\",\n",
109 | "        \"bachelors-associate\",\n",
110 | "        \"masters-degree\",\n",
111 | "        \"doctorate-professional\",\n",
112 | "    ]\n",
113 | "})"
114 | ]
115 | },
116 | {
117 | "cell_type": "markdown",
118 | "id": "d64d73d8",
119 | "metadata": {},
120 | "source": [
121 | "Now I can use the QueryBuilder to group by education and then take an average. Here I am clamping the number of books borrowed to between 0 and 100, which bounds each person's contribution to the average."
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 8,
127 | "id": "0131bb58",
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "name": "stderr",
132 | "output_type": "stream",
133 | "text": [
134 | "/opt/anaconda3/envs/tumult/lib/python3.8/site-packages/pyspark/sql/pandas/functions.py:394: UserWarning: In Python 3.6+ and Spark 3.0+, it is preferred to specify type hints for pandas UDF instead of specifying pandas UDF type which will be deprecated in the future releases. See SPARK-28264 for more details.\n",
135 | " warnings.warn(\n",
136 | " \r"
137 | ]
138 | },
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "+----------------------+----------------------+\n",
144 | "|education_level |books_borrowed_average|\n",
145 | "+----------------------+----------------------+\n",
146 | "|masters-degree |19.0265726681128 |\n",
147 | "|doctorate-professional|19.13195435092725 |\n",
148 | "|bachelors-associate |19.177823348469314 |\n",
149 | "|up-to-high-school |19.37279031819418 |\n",
150 | "|high-school-diploma |19.603978997061514 |\n",
151 | "+----------------------+----------------------+\n",
152 | "\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "edu_average_books_query = (\n",
158 | "    QueryBuilder(\"members\")\n",
159 | "    .groupby(edu_levels)\n",
160 | "    .average(\"books_borrowed\", low=0, high=100)\n",
161 | ")\n",
162 | "edu_average_books = session.evaluate(\n",
163 | "    edu_average_books_query,\n",
164 | "    privacy_budget=PureDPBudget(0.6),\n",
165 | ")\n",
166 | "edu_average_books.sort(\"books_borrowed_average\").show(truncate=False)\n"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "id": "7a596713",
172 | "metadata": {},
173 | "source": [
174 | "There doesn't seem to be any correlation to find here! I wonder if age might be a better indicator, maybe even connected with an education level?\n",
175 | "\n",
176 | "To take a look, I first want to create age groups by binning the age in ranges."
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 9,
182 | "id": "b9a24214",
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "age_binspec = BinningSpec([10*i for i in range(0, 11)])\n",
187 | "\n",
188 | "age_bin_keys = KeySet.from_dict({\n",
189 | "    \"age_binned\": age_binspec.bins()\n",
190 | "})"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "id": "8bff1b4d",
196 | "metadata": {},
197 | "source": [
198 | "Now I can filter and group by age... Here I am singling out those with masters or doctorates and I am using new bounds for my books borrowed as I think 100 was too high!"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 10,
204 | "id": "c4782ac7",
205 | "metadata": {},
206 | "outputs": [
207 | {
208 | "name": "stderr",
209 | "output_type": "stream",
210 | "text": [
211 | " \r"
212 | ]
213 | },
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "+----------+----------------------+\n",
219 | "|age_binned|books_borrowed_average|\n",
220 | "+----------+----------------------+\n",
221 | "|(10, 20] |-6.0 |\n",
222 | "|(20, 30] |10.846464646464646 |\n",
223 | "|(30, 40] |11.547257876312718 |\n",
224 | "|(40, 50] |11.070460704607045 |\n",
225 | "|(50, 60] |11.566094100074682 |\n",
226 | "|(60, 70] |11.075132275132274 |\n",
227 | "|(70, 80] |11.117088607594937 |\n",
228 | "|(80, 90] |10.222222222222221 |\n",
229 | "|(90, 100] |11.0 |\n",
230 | "|[0, 10] |10.0 |\n",
231 | "+----------+----------------------+\n",
232 | "\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "binned_age_with_filter_query = QueryBuilder(\"members\")\\\n",
238 | "    .filter(\"education_level='masters-degree' or education_level='doctorate-professional'\")\\\n",
239 | "    .bin_column(\"age\", age_binspec)\\\n",
240 | "    .groupby(age_bin_keys)\\\n",
241 | "    .average(\"books_borrowed\", low=0, high=22)\n",
242 | "\n",
243 | "session.evaluate(binned_age_with_filter_query, privacy_budget=PureDPBudget(0.4)).show(truncate=False)"
244 | ]
245 | },
246 | {
247 | "cell_type": "markdown",
248 | "id": "cacc5a8f",
249 | "metadata": {},
250 | "source": [
251 | "Oye! I can see that there is a lot of noise added to some of these rows. What did I do wrong? In this case, I filtered on education and did not take into account that some of the age groups would be underrepresented after the filter. The likelihood that an 8-year-old has a master's degree is quite small...\n",
252 | "\n",
253 | "In the future, I might run a query like the following first! Getting an idea for books borrowed by age before filtering... :)"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": 11,
259 | "id": "7e3aa556",
260 | "metadata": {},
261 | "outputs": [
262 | {
263 | "name": "stderr",
264 | "output_type": "stream",
265 | "text": [
266 | " \r"
267 | ]
268 | },
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "+----------+----------------------+\n",
274 | "|age_binned|books_borrowed_average|\n",
275 | "+----------+----------------------+\n",
276 | "|(10, 20] |11.576746088557112 |\n",
277 | "|(20, 30] |11.46492337972726 |\n",
278 | "|(30, 40] |11.550365211482928 |\n",
279 | "|(40, 50] |11.257424458565685 |\n",
280 | "|(50, 60] |11.23477687403825 |\n",
281 | "|(60, 70] |11.349001351554287 |\n",
282 | "|(70, 80] |11.620332883490779 |\n",
283 | "|(80, 90] |10.83838383838384 |\n",
284 | "|(90, 100] |243.0 |\n",
285 | "|[0, 10] |11.138160325083119 |\n",
286 | "+----------+----------------------+\n",
287 | "\n"
288 | ]
289 | }
290 | ],
291 | "source": [
292 | "binned_age_query = QueryBuilder(\"members\")\\\n",
293 | "    .bin_column(\"age\", age_binspec)\\\n",
294 | "    .groupby(age_bin_keys)\\\n",
295 | "    .average(\"books_borrowed\", low=0, high=22)\n",
296 | "\n",
297 | "session.evaluate(binned_age_query, privacy_budget=PureDPBudget(0.1)).show(truncate=False)"
298 | ]
299 | },
300 | {
301 | "cell_type": "markdown",
302 | "id": "8feb256e",
303 | "metadata": {},
304 | "source": [
305 | "Or even just looking at a count...."
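,
"\n",
"A sketch of that count query (`.count()` here follows the Tumult tutorial API; note that the session's total epsilon of 1.1 is already fully spent by the 0.6 + 0.4 + 0.1 above, so evaluating anything further raises an out-of-budget error -- hence the next cell):\n",
"\n",
"```python\n",
"binned_age_count_query = QueryBuilder(\"members\")\\\n",
"    .bin_column(\"age\", age_binspec)\\\n",
"    .groupby(age_bin_keys)\\\n",
"    .count()\n",
"\n",
"session.evaluate(binned_age_count_query, privacy_budget=PureDPBudget(0.1))\n",
"```"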
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "id": "6efbb58b",
311 | "metadata": {},
312 | "source": [
313 | "Oh no! I ran out of budget!\n",
314 | "\n",
315 | "Good news: [Tumult Labs](https://www.tmlt.dev/) has a bunch of notebooks to try out with this dataset and there is an option to set your budget to infinity as you play around and get to know the library. That said, when you are using Tumult or any differential privacy library in production, you'll need to first make real decisions on your queries and budget! \n",
316 | "\n",
317 | "Take a look at their tutorials and happy privacying!"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "id": "c51d1dd0",
323 | "metadata": {},
324 | "source": [
325 | "### Challenges\n",
326 | "\n",
327 | "- Fix the query so that you get a better result for the books borrowed average.\n",
328 | "- Use an unlimited privacy budget (`privacy_budget=PureDPBudget(epsilon=float('inf'))`), and investigate the correlations in the dataset further. If you find an interesting one, switch back to a budget and try to show it via matplotlib or seaborn.\n",
329 | "- Go through the [Tumult Analytics Tutorial](https://docs.tmlt.dev/analytics/latest/tutorials/) to try out more features."
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "id": "65344e49",
336 | "metadata": {},
337 | "outputs": [],
338 | "source": []
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "Python 3 (ipykernel)",
344 | "language": "python",
345 | "name": "python3"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 3
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython3",
357 | "version": "3.10.9"
358 | }
359 | },
360 | "nbformat": 4,
361 | "nbformat_minor": 5
362 | }
363 | 
--------------------------------------------------------------------------------
/10 - Federated Learning with flwr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from collections import OrderedDict\n",
10 | "\n",
11 | "import torch\n",
12 | "import torch.nn as nn\n",
13 | "import torch.nn.functional as F\n",
14 | "import torchvision.transforms as transforms\n",
15 | "from torch.utils.data import DataLoader\n",
16 | "from torchvision.datasets import VisionDataset, Food101\n",
17 | "from typing import List, Tuple\n",
18 | "from flwr.common import Metrics\n",
19 | "\n",
20 | "import flwr as fl\n",
21 | "import numpy as np"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "import os\n",
31 | "import multiprocessing\n",
32 | "\n",
33 | "data_path = os.path.join(os.getcwd(),'data', 'food-101')\n",
34 | "cpu_count = multiprocessing.cpu_count() - 1 # set as you like!\n",
35 | "#device = torch.device(\"mps\") #CHANGE THIS TO FIT YOUR DEVICE PLEASE :D (may underfit)\n",
36 | "device = torch.device(\"cuda:0\" if torch.cuda.is_available() else \"cpu\")"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 3,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "pool_size = 4 # number of dataset partitions (= number of total clients)\n",
46 | "\n",
47 | "client_resources = {\n",
48 | "    \"num_cpus\": cpu_count\n",
49 | "} # each client will get allocated this many CPUs\n",
50 | "\n",
51 | "transformations = transforms.Compose([\n",
52 | "    transforms.Resize((32,32)),\n",
53 | "    transforms.ToTensor(),\n",
54 | "])\n",
55 | "\n",
56 | "\n",
57 | "# Download Dataset\n",
58 | "try:\n",
59 | "    train_data = Food101(data_path, transform=transformations)\n",
60 | "except RuntimeError:\n",
61 | "    train_data = Food101(data_path, transform=transformations, download=True) \n",
62 | "test_data = Food101(data_path, split='test', transform=transformations)\n",
63 | "\n",
64 | "lengths = []\n",
65 | "while sum(lengths) != len(train_data):\n",
66 | "    lengths = [round(x) for x in np.random.dirichlet(\n",
67 | "        np.ones(pool_size),size=1)[0] * len(train_data)]\n",
68 | "    \n",
69 | "trainloader = DataLoader(train_data, batch_size=32, shuffle=True)\n",
70 | "testloader = DataLoader(test_data, batch_size=32)\n",
71 | "num_examples = {\"trainset\" : len(train_data), \"testset\" : len(test_data)}"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "# borrowed from Pytorch quickstart example\n",
81 | "def train(net, trainloader, epochs, device: str):\n",
82 | "    \"\"\"Train the network on the training set.\"\"\"\n",
83 | "    criterion = torch.nn.CrossEntropyLoss(ignore_index=1)\n",
84 | "    optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9)\n",
85 | "    net.train()\n",
86 | "    for _ in range(epochs):\n",
87 | "        for images, labels in trainloader:\n",
88 | "            images, labels = images.to(device), labels.to(device)\n",
89 | "            optimizer.zero_grad()\n",
90 | "            loss = criterion(net(images), labels)\n",
91 | "            loss.backward()\n",
92 | "            optimizer.step()"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 5,
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# borrowed from Pytorch quickstart example\n",
102 | "def test(net, testloader, device: str):\n",
103 | "    \"\"\"Validate the network on the entire test set.\"\"\"\n",
104 | "    criterion = torch.nn.CrossEntropyLoss()\n",
105 | "    correct, total, loss = 0, 0, 0.0\n",
106 | "    net.eval()\n",
107 | "    with torch.no_grad():\n",
108 | "        for batch in testloader:\n",
109 | "            images, labels = batch[0].to(device), batch[1].to(device)\n",
110 | "            outputs = net(images)\n",
111 | "            loss += criterion(outputs, labels).item()\n",
112 | "            _, predicted = torch.max(outputs.data, 1)\n",
113 | "            total += labels.size(0)\n",
114 | "            correct += (predicted == labels).sum().item()\n",
115 | "    accuracy = correct / total\n",
116 | "    print(\"Loss: %f, Accuracy: %f\" % (loss, accuracy))\n",
117 | "    return loss, accuracy"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "class Net(nn.Module):\n",
127 | "    def __init__(self) -> None:\n",
128 | "        super(Net, self).__init__()\n",
129 | "        self.conv1 = nn.Conv2d(3, 6, 5)\n",
130 | "        self.pool = nn.MaxPool2d(2, 2)\n",
131 | "        self.conv2 = nn.Conv2d(6, 16, 5)\n",
132 | "        self.fc1 = nn.Linear(16 * 5 * 5, 120)\n",
133 | "        self.fc2 = nn.Linear(120, 84)\n",
134 | "        self.fc3 = nn.Linear(84, 101)\n",
135 | "\n",
136 | "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
137 | "        x = self.pool(F.relu(self.conv1(x)))\n",
138 | "        x = self.pool(F.relu(self.conv2(x)))\n",
139 | "        x = x.view(-1, 16 * 5 * 5)\n",
140 | "        x = F.relu(self.fc1(x))\n",
141 | "        x = F.relu(self.fc2(x))\n",
142 | "        x = self.fc3(x)\n",
143 | "        return x\n",
144 | "\n",
"# Load model and data\n", 146 | "net = Net().to(device)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 7, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "class FlowerClient(fl.client.NumPyClient):\n", 156 | " \n", 157 | " def get_parameters(self, config):\n", 158 | " return [val.cpu().numpy() for _, val in net.state_dict().items()]\n", 159 | "\n", 160 | " def set_parameters(self, parameters):\n", 161 | " params_dict = zip(net.state_dict().keys(), parameters)\n", 162 | " state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict})\n", 163 | " net.load_state_dict(state_dict, strict=True)\n", 164 | "\n", 165 | " def fit(self, parameters, config):\n", 166 | " self.set_parameters(parameters)\n", 167 | " train(net, trainloader, 3, device)\n", 168 | " return self.get_parameters(config={}), num_examples[\"trainset\"], {}\n", 169 | "\n", 170 | " def evaluate(self, parameters, config):\n", 171 | " self.set_parameters(parameters)\n", 172 | " loss, accuracy = test(net, testloader, device)\n", 173 | " return float(loss), num_examples[\"testset\"], {\"accuracy\": float(accuracy)}\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "101" 185 | ] 186 | }, 187 | "execution_count": 8, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "len(test_data.classes)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Before you start training!\n", 201 | "\n", 202 | "- Make sure your device is properly set above to fit your compute.\n", 203 | "- If you have made any changes to this script, download it as a python file and replace the flower/client.py file.\n", 204 | "- Open a separate terminal and run `python flower/server.py`.\n", 205 | "- Open 1-3 more terminals and run `python flower/client.py`.\n", 206 | "- Then run the following cell to also run a client here and watch! :)\n", 207 | "\n", 208 | "If you want to change any of the model parameters, structure or even the splits on the data, you'll want to restart the server and clients. Have fun and experiment!" 
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 9,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stderr",
218 | "output_type": "stream",
219 | "text": [
220 | "INFO flwr 2023-02-28 17:38:50,146 | grpc.py:50 | Opened insecure gRPC connection (no certificates were passed)\n",
221 | "DEBUG flwr 2023-02-28 17:38:50,151 | connection.py:38 | ChannelConnectivity.IDLE\n",
222 | "DEBUG flwr 2023-02-28 17:38:50,206 | connection.py:38 | ChannelConnectivity.READY\n"
223 | ]
224 | },
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | "Loss: 3458.234933, Accuracy: 0.052713\n",
230 | "Loss: 3010.837580, Accuracy: 0.136515\n"
231 | ]
232 | },
233 | {
234 | "name": "stderr",
235 | "output_type": "stream",
236 | "text": [
237 | "DEBUG flwr 2023-02-28 18:44:55,237 | connection.py:109 | gRPC channel closed\n",
238 | "INFO flwr 2023-02-28 18:44:55,237 | app.py:153 | Disconnect and shut down\n"
239 | ]
240 | },
241 | {
242 | "name": "stdout",
243 | "output_type": "stream",
244 | "text": [
245 | "Loss: 2963.027775, Accuracy: 0.148515\n"
246 | ]
247 | }
248 | ],
249 | "source": [
250 | "fl.client.start_numpy_client(\n",
251 | "    server_address=\"127.0.0.1:8080\",\n",
252 | "    client=FlowerClient(),\n",
253 | ")"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "## Challenges\n",
261 | "\n",
262 | "- Adjust the server settings and see how the performance changes (see flower/server.py).\n",
263 | "- Split the data unevenly across the clients and see how the training goes.\n",
264 | "- Try out another [Flower tutorial](https://flower.dev/docs/quickstart-pytorch.html).\n",
265 | "- Get a group of several folks together to try running flower in a distributed setup. Document your learnings and share in the reader-contributions!"
266 | ]
267 | },
268 | {
269 | "cell_type": "code",
270 | "execution_count": null,
271 | "metadata": {},
272 | "outputs": [],
273 | "source": []
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "Python 3 (ipykernel)",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.10.9"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 4
297 | }
298 | 
--------------------------------------------------------------------------------
/11 - Secret Sharing.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Secret Sharing\n",
8 | "\n",
9 | "In this notebook, you'll learn some basic building blocks of Secure Multiparty Computation (MPC or SMPC). In doing so, you'll be able to more clearly reason about use cases where MPC is a good fit. 
Remember: the goal isn't to roll your own crypto, but instead to develop a basic mental model to understand why, where and how MPC can help.\n", 10 | "\n", 11 | "If this is interesting to you and you'd like to learn more, please have a read through [Morten Dahl's informative blog series on this topic and more in MPC!](https://mortendahl.github.io/2017/06/04/secret-sharing-part1/) " 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 3, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from random import randrange" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "x = 45" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "Let's try to \"hide\" our number in a few ways naively..." 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 5, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "keys = [100, 22, 43, 56]" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 6, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "enc_x = x - sum(keys)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 7, 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "data": { 64 | "text/plain": [ 65 | "-176" 66 | ] 67 | }, 68 | "execution_count": 7, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "enc_x" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 8, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "enc_x = x - sum(keys)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "This \"works\", but it actually leaks information! Let's take another x as an example." 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 9, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "x = -1000" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 10, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "enc_x = x - sum(keys)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 11, 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "-1221" 120 | ] 121 | }, 122 | "execution_count": 11, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "enc_x" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "Here, if someone was using the same keys, I could infer that one number is smaller than the other number or otherwise leak information about the original values.\n", 136 | "\n", 137 | "We could naively continue to find mechanisms that make this more secure, but there is actually a better way if we know some cryptography-math. Let's look at how we can hide information better in a field." 
138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 12, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "Q = 431\n", 147 | "x = 45" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 13, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "enc_x = (x - sum(keys)) % Q" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 14, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "255" 168 | ] 169 | }, 170 | "execution_count": 14, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "enc_x" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 15, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "keys = [33, 52, 167, 55, 77]" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 16, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "enc_x = (x - sum(keys)) % Q" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 17, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "92" 206 | ] 207 | }, 208 | "execution_count": 17, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "enc_x" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "Because a field helps the numbers \"wrap\", it hides the extra information that we leaked without it. \n", 222 | "\n", 223 | "![Cats and Clocks](https://c.tenor.com/yz_7VcX0WjYAAAAd/cat-changing-the-clock-changing-the-time.gif \"A black cat is standing up and playing with the minute arm of a clock, tapping it forward playfully.\")\n", 224 | "\n", 225 | "\n", 226 | "### But Math?\n", 227 | "Currently this \"encryption scheme\" is not very helpful, because I can't do math with it... if I just wanted to encrypt and decrypt, there are a bunch of schemes I could use. We are here to do math!\n", 228 | "\n", 229 | "To make it easier, I just wanted to support addition. I could combine the secret with my \"keys\" so that if you add them all together, they equal the secret." 
230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 18, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "Q = 431\n", 239 | "num_players = 5\n", 240 | "x = 45\n", 241 | "\n", 242 | "shares = [randrange(Q) for _ in range(num_players-1)]\n", 243 | "shares += [(x - sum(shares)) % Q]\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 19, 249 | "metadata": {}, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "[151, 262, 162, 200, 132]" 255 | ] 256 | }, 257 | "execution_count": 19, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "shares" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 44, 269 | "metadata": {}, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "807" 275 | ] 276 | }, 277 | "execution_count": 44, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "sum(shares)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 20, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "data": { 293 | "text/plain": [ 294 | "45" 295 | ] 296 | }, 297 | "execution_count": 20, 298 | "metadata": {}, 299 | "output_type": "execute_result" 300 | } 301 | ], 302 | "source": [ 303 | "sum(shares) % Q" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Now let's try a negative number!\n", 311 | "\n", 312 | "We can start the same way as our sum, but think about modular math. What happens when you \"subtract\" time (i.e. come 13 hours before noon.)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 29, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/plain": [ 323 | "23" 324 | ] 325 | }, 326 | "execution_count": 29, 327 | "metadata": {}, 328 | "output_type": "execute_result" 329 | } 330 | ], 331 | "source": [ 332 | "(12 - 13) + 24" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "To keep our \"negative\" in our positive field space, we add our Q back... now let's see how it works with our current scheme...." 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": 33, 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "[93, 159, 31, 340, 184]" 351 | ] 352 | }, 353 | "execution_count": 33, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "x = -55\n", 360 | "\n", 361 | "shares = [randrange(Q) for _ in range(num_players-1)]\n", 362 | "shares += [(x - sum(shares)) % Q]\n", 363 | "\n", 364 | "shares" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 34, 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "# now, try to \"decrypt\" your secret! Hint: take a look at the \"addition result\" and your field...." 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "# %load solutions/subtraction.py\n" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": {}, 388 | "source": [ 389 | "Let's make this easier by creating some helper functions to build our shares and combine them to decrypt." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 45, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "def create_additive_shares(secret, Q=431, num_players=5):\n", 399 | " shares = [randrange(Q) for _ in range(num_players-1)]\n", 400 | " shares += [(secret - sum(shares)) % Q]\n", 401 | " return shares\n", 402 | "\n", 403 | "def decrypt(result, Q=431):\n", 404 | " return result if result <= Q/2 else result - Q" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 46, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "data": { 414 | "text/plain": [ 415 | "-33" 416 | ] 417 | }, 418 | "execution_count": 46, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "shares = create_additive_shares(-33)\n", 425 | "decrypt(sum(shares) % Q)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Replicated Secret Sharing\n", 433 | "\n", 434 | "What if we wanted to multiply instead of add?" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 47, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "a = 44\n", 444 | "b = 55\n", 445 | "Q=2147487" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 48, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "a_shares = create_additive_shares(a, Q=2147487, num_players=3)\n", 455 | "b_shares = create_additive_shares(b, Q=2147487, num_players=3)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "These are then distributed to three players: each gets two shares of each secret.\n", 463 | "\n", 464 | "Player 1 (owns A): keeps a1, a2, receives b2, b3\n", 465 | "Player 2 (owns B): keeps b1, b2, receives a1, a3\n", 466 | "Player 3: receives a2, a3, b1, b3\n", 467 | "\n", 468 | "\n", 469 | "c1 = a1b2 + a1b3 + a2b2\n", 470 | "\n", 471 | "c2 = a1b1 + a3b1 + a3b2\n", 472 | "\n", 473 | "c3 = a2b1 + a3b3 + a2b3\n", 474 | "\n", 475 | "\n", 476 | "c1 + c2 + c3 = a1(b1 + b2 + b3) + a2(b1 + b2 + b3) + a3(b1 + b2 + b3)\n", 477 | "\n", 478 | " = (a1 + a2 + a3)(b1 + b2 + b3)\n", 479 | " \n", 480 | " = ab" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": 27, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "a1, a2, a3 = a_shares\n", 490 | "b1, b2, b3 = b_shares" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 1, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "# write a method to multiply the shares. It should take the first share and the second share. They need to be\n", 500 | "# multiplied together and stay in the field!"
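Before loading `solutions/multiply.py`, here is a minimal sketch of what such a method could look like, written to be consistent with how `multiply` is called in the cells below (two shares in, product reduced back into the field). It is an illustration, not necessarily identical to the solution file:

```python
# A sketch: multiply two shares and reduce modulo Q so that the
# product stays in the field. Q defaults to the modulus used above.
def multiply(share_one, share_two, Q=2147487):
    return (share_one * share_two) % Q
```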
501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# %load solutions/multiply.py\n" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 29, 515 | "metadata": { 516 | "scrolled": true 517 | }, 518 | "outputs": [ 519 | { 520 | "data": { 521 | "text/plain": [ 522 | "887134" 523 | ] 524 | }, 525 | "execution_count": 29, 526 | "metadata": {}, 527 | "output_type": "execute_result" 528 | } 529 | ], 530 | "source": [ 531 | "c1 = multiply(a1, b2) + multiply(a1, b3) + multiply(a2, b2)\n", 532 | "c1 % Q" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 30, 538 | "metadata": { 539 | "scrolled": true 540 | }, 541 | "outputs": [ 542 | { 543 | "data": { 544 | "text/plain": [ 545 | "296786" 546 | ] 547 | }, 548 | "execution_count": 30, 549 | "metadata": {}, 550 | "output_type": "execute_result" 551 | } 552 | ], 553 | "source": [ 554 | "c2= multiply(a1, b1) + multiply(a3, b1) + multiply(a3, b2)\n", 555 | "c2 % Q" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 31, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "data": { 565 | "text/plain": [ 566 | "965987" 567 | ] 568 | }, 569 | "execution_count": 31, 570 | "metadata": {}, 571 | "output_type": "execute_result" 572 | } 573 | ], 574 | "source": [ 575 | "c3= multiply(a2, b1) + multiply(a3, b3) + multiply(a2, b3)\n", 576 | "c3 % Q" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 32, 582 | "metadata": {}, 583 | "outputs": [ 584 | { 585 | "data": { 586 | "text/plain": [ 587 | "2420" 588 | ] 589 | }, 590 | "execution_count": 32, 591 | "metadata": {}, 592 | "output_type": "execute_result" 593 | } 594 | ], 595 | "source": [ 596 | "(c1+c2+c3) % Q" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 33, 602 | "metadata": {}, 603 | "outputs": [ 604 | { 605 | "data": { 606 | "text/plain": [ 607 | "2420" 608 | ] 609 | }, 610 | "execution_count": 33, 611 | "metadata": {}, 612 | "output_type": "execute_result" 613 | } 614 | ], 615 | "source": [ 616 | "a*b" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": {}, 622 | "source": [ 623 | "We can see here though that if two of the players collude, they have enough to recover extra information about the secret. They might also use the final step to learn more about the other shares. To avoid this, each player can \"blind\" their intermediary value before sharing it and then reveal the blind to another party later.\n", 624 | "\n", 625 | "Player 1 has c1, adds blind (r1), receives c2+r2, keeps r1\n", 626 | "\n", 627 | "Player 2 has c2, adds blind (r2), receives c3+r3, keeps r2\n", 628 | "\n", 629 | "Player 3 has c3, adds blind (r3), receives c1+r1, keeps r3\n", 630 | "\n", 631 | "Player 1 calculates: c2+r2-r1\n", 632 | "\n", 633 | "Player 2 calculates: c3+r3-r2\n", 634 | "\n", 635 | "Player 3 calculates: c1+r1-r3\n", 636 | "\n", 637 | "Combined, they get c1+c2+c3 without leaking the extra information." 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "metadata": {}, 643 | "source": [ 644 | "## Challenge\n", 645 | "\n", 646 | "- Can you write a way to securely generate the blinds and pass them among the participants?" 
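One possible approach to the challenge (a sketch under simplifying assumptions: all players are simulated in one process, and `randrange` stands in for a proper secure channel and randomness source). Player i blinds its intermediary value c_i with a random r_i, passes c_i + r_i along the ring, and keeps r_i to subtract from the blinded value it receives, exactly as described above:

```python
from random import randrange

Q = 2147487

def blinded_reveal(cs, Q=Q):
    """Combine intermediary values c1..cn without exposing any single one."""
    n = len(cs)
    rs = [randrange(Q) for _ in range(n)]             # each player's blind
    blinded = [(c + r) % Q for c, r in zip(cs, rs)]   # player i shares c_i + r_i
    # player i receives the next player's blinded value and removes its own blind
    partials = [(blinded[(i + 1) % n] - rs[i]) % Q for i in range(n)]
    return sum(partials) % Q                          # telescopes to (c1 + ... + cn) mod Q

# toy check using the intermediary values computed above
assert blinded_reveal([887134, 296786, 965987]) == 2420
```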
647 | ] 648 | }, 649 | { 650 | "cell_type": "code", 651 | "execution_count": null, 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [] 655 | } 656 | ], 657 | "metadata": { 658 | "kernelspec": { 659 | "display_name": "Python 3 (ipykernel)", 660 | "language": "python", 661 | "name": "python3" 662 | }, 663 | "language_info": { 664 | "codemirror_mode": { 665 | "name": "ipython", 666 | "version": 3 667 | }, 668 | "file_extension": ".py", 669 | "mimetype": "text/x-python", 670 | "name": "python", 671 | "nbconvert_exporter": "python", 672 | "pygments_lexer": "ipython3", 673 | "version": "3.10.9" 674 | } 675 | }, 676 | "nbformat": 4, 677 | "nbformat_minor": 4 678 | } 679 | -------------------------------------------------------------------------------- /12 - Homomorphic Encryption with Paillier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Homomorphic Encryption with Paillier\n", 8 | "\n", 9 | "This notebook provides an introduction to homomorphic encryption basics. The goal is to build a mental model that can help you notice use cases where homomorphic encryption would be a good fit. Again, please do not actually roll your own crypto! Instead, play around with the neat math on display here and build your understanding and intuition. :)\n", 10 | "\n", 11 | "Most of this notebook is code written by [Morten Dahl](https://github.com/mortendahl) in his [Private ML repo](https://github.com/mortendahl/privateml/) and [Paillier post](https://github.com/mortendahl/mortendahl.github.io/blob/master/_drafts/2019-04-15-paillier-encryption.md). Putting it together required several conversations where he helped me understand the properties. Thank you Morten for the support in making this notebook and book possible. :)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import math\n", 21 | "import random" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Required Primitives & Utility Functions\n", 29 | "\n", 30 | "In this section, I'll use Morten's work to define some building blocks we need for Paillier!\n", 31 | "\n", 32 | "\n", 33 | "A brief introduction to some of the methods:\n", 34 | "\n", 35 | "- [The Rabin-Miller primality test](https://en.wikipedia.org/wiki/Miller%E2%80%93Rabin_primality_test) is used to probabilistically determine if a number is a prime.\n", 36 | "- There is a method to sample randomness, so that we can actually safely encrypt.\n", 37 | "- [The Extended Euclidean algorithm (egcd)](https://en.wikipedia.org/wiki/Extended_Euclidean_algorithm), which is used to calculate the greatest common divisor, but also the additional parameters we will use to do encrypted math in a field (see the sketch after this list)."
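As a tiny illustration of that last point (the code cells that follow contain the real implementation): Bézout's identity says that when gcd(a, n) = 1, the extended Euclidean algorithm finds coefficients s and t with a·s + n·t = 1, and reducing that equation mod n shows a·s ≡ 1, i.e. s is a's multiplicative inverse. The specific numbers below are illustrative, chosen to match values that reappear later in the notebook:

```python
# Sketch: Bezout coefficients for a = 7, n = 137.
# 7 * (-39) + 137 * 2 == 1, so -39 is an inverse of 7 modulo 137.
from math import gcd

a, n = 7, 137
s, t = -39, 2  # illustrative coefficients
assert gcd(a, n) == 1
assert a * s + n * t == 1
assert (a * s) % n == 1
```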
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# see https://inventwithpython.com/rabinMiller.py\n", 47 | "\n", 48 | "n = 137\n", 49 | "SMALL_PRIMES = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, \n", 50 | " 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, \n", 51 | " 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, \n", 52 | " 223, 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, \n", 53 | " 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, 367, 373, 379, \n", 54 | " 383, 389, 397, 401, 409, 419, 421, 431, 433, 439, 443, 449, 457, 461, \n", 55 | " 463, 467, 479, 487, 491, 499, 503, 509, 521, 523, 541, 547, 557, 563, \n", 56 | " 569, 571, 577, 587, 593, 599, 601, 607, 613, 617, 619, 631, 641, 643, \n", 57 | " 647, 653, 659, 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, \n", 58 | " 743, 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, 829, \n", 59 | " 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, 919, 929, \n", 60 | " 937, 941, 947, 953, 967, 971, 977, 983, 991, 997]\n", 61 | "\n", 62 | "def rewrite(num):\n", 63 | " s = num - 1\n", 64 | " t = 0\n", 65 | " while s % 2 == 0:\n", 66 | " s = s // 2\n", 67 | " t += 1\n", 68 | " return s, t\n", 69 | "\n", 70 | "def rabin_miller(num, iterations=10):\n", 71 | " s, t = rewrite(num)\n", 72 | " for _ in range(iterations):\n", 73 | " a = random.randrange(2, num - 1)\n", 74 | " v = pow(a, s, num)\n", 75 | " if v != 1:\n", 76 | " i = 0\n", 77 | " while v != (num - 1):\n", 78 | " if i == t - 1:\n", 79 | " return False\n", 80 | " else:\n", 81 | " i = i + 1\n", 82 | " v = pow(v, 2, num)\n", 83 | " return True\n", 84 | "\n", 85 | "def is_prime(num):\n", 86 | " if (num < 2): return False\n", 87 | " for prime in SMALL_PRIMES:\n", 88 | " if num == prime: return True\n", 89 | " if num % prime == 0: return False\n", 90 | " return rabin_miller(num)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "def sample_randomness(ek):\n", 100 | " while True:\n", 101 | " r = random.randrange(ek.n)\n", 102 | " if math.gcd(r, ek.n) == 1:\n", 103 | " return r" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def sample_prime(bitsize):\n", 113 | " lower = 1 << (bitsize-1)\n", 114 | " upper = 1 << (bitsize)\n", 115 | " while True:\n", 116 | " candidate = random.randrange(lower, upper)\n", 117 | " if is_prime(candidate):\n", 118 | " return candidate" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# from http://www.ucl.ac.uk/~ucahcjm/combopt/ext_gcd_python_programs.pdf\n", 128 | "def egcd_binary(a, b):\n", 129 | " u, v, s, t, r = 1, 0, 0, 1, 0\n", 130 | " while (a % 2 == 0) and (b % 2 == 0):\n", 131 | " a, b, r = a//2, b//2, r+1\n", 132 | " alpha, beta = a, b\n", 133 | " while (a % 2 == 0):\n", 134 | " a = a//2\n", 135 | " if (u % 2 == 0) and (v % 2 == 0):\n", 136 | " u, v = u//2, v//2\n", 137 | " else:\n", 138 | " u, v = (u + beta)//2, (v - alpha)//2\n", 139 | " while a != b:\n", 140 | " if (b % 2 == 0):\n", 141 | " b = b//2\n", 142 | " if (s % 2 == 0) and (t % 2 == 0):\n", 143 | " s, t = s//2, t//2\n", 144 | " else:\n", 145 | " s, t = (s + beta)//2, (t - alpha)//2\n", 146 | " elif b < a:\n", 
147 | " a, b, u, v, s, t = b, a, s, t, u, v\n", 148 | " else:\n", 149 | " b, s, t = b - a, s - u, t - v\n", 150 | " return (2 ** r) * a, s, t\n", 151 | "\n", 152 | "\n", 153 | "def inverse(a, field):\n", 154 | " _, b, _ = egcd_binary(a, field)\n", 155 | " return b" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "### Looking at multiplicative field inverses\n", 163 | "\n", 164 | "Now that we have the helper functions, we can take a look at the field properties. As you learned in the book, the multiplicative inverse works in a field to support multiplication primatives. It is a multiplicative inverse if the it can be multiplied with the original number and then modulo the field to equal 1. Check out the list below to see this in action." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "n = 137\n", 177 | "\n", 178 | "x: 1\t inverse(x): 1 \t x * inverse(x) mod n: 1\n", 179 | "x: 2\t inverse(x): 69 \t x * inverse(x) mod n: 1\n", 180 | "x: 3\t inverse(x): 46 \t x * inverse(x) mod n: 1\n", 181 | "x: 4\t inverse(x): 103 \t x * inverse(x) mod n: 1\n", 182 | "x: 5\t inverse(x): 55 \t x * inverse(x) mod n: 1\n", 183 | "x: 6\t inverse(x): 23 \t x * inverse(x) mod n: 1\n", 184 | "x: 7\t inverse(x): -39 \t x * inverse(x) mod n: 1\n", 185 | "x: 8\t inverse(x): 120 \t x * inverse(x) mod n: 1\n", 186 | "x: 9\t inverse(x): 61 \t x * inverse(x) mod n: 1\n", 187 | "x: 10\t inverse(x): 96 \t x * inverse(x) mod n: 1\n", 188 | "x: 11\t inverse(x): 25 \t x * inverse(x) mod n: 1\n", 189 | "x: 12\t inverse(x): 80 \t x * inverse(x) mod n: 1\n", 190 | "x: 13\t inverse(x): 116 \t x * inverse(x) mod n: 1\n", 191 | "x: 14\t inverse(x): 49 \t x * inverse(x) mod n: 1\n", 192 | "x: 15\t inverse(x): 64 \t x * inverse(x) mod n: 1\n", 193 | "x: 16\t inverse(x): 60 \t x * inverse(x) mod n: 1\n", 194 | "x: 17\t inverse(x): 129 \t x * inverse(x) mod n: 1\n", 195 | "x: 18\t inverse(x): 99 \t x * inverse(x) mod n: 1\n", 196 | "x: 19\t inverse(x): 101 \t x * inverse(x) mod n: 1\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "print(\"n = {}\\n\".format(n))\n", 202 | "\n", 203 | "for x in range(1,20):\n", 204 | " print(\"x: {}\\t inverse(x): {} \\t x * inverse(x) mod n: {}\".format(\n", 205 | " x, inverse(x, n), x * inverse(x, n) % n))" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 8, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "1" 217 | ] 218 | }, 219 | "execution_count": 8, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "17 * 129 % n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Paillier Encryption & Decryption Keys\n", 233 | "\n", 234 | "These helper functions (from Morten!) define the encryption and decryption keys for the Paillier cryptosystem. You will notice some of the calculations you already learned in Chapter 7 in these definitions!" 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "class EncryptionKey:\n", 244 | "    def __init__(self, n):\n", 245 | "        self.n = n\n", 246 | "        self.nn = n * n\n", 247 | "        self.g = 1 + n\n", 248 | "        \n", 249 | "    def __repr__(self):\n", 250 | "        return \"Encryption Key (n={}, nn={}, g={})\".format(  # placeholders reconstructed; the bracketed values were lost in extraction\n", 251 | "            self.n, self.nn, self.g)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 10, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "class DecryptionKey:\n", 261 | "    def __init__(self, p, q):\n", 262 | "        n = p * q\n", 263 | "\n", 264 | "        self.n = p * q\n", 265 | "        self.nn = n * n\n", 266 | "        self.g = 1 + n\n", 267 | "\n", 268 | "        order_of_n = (p - 1) * (q - 1)\n", 269 | "        self.d1 = order_of_n\n", 270 | "        self.d2 = inverse(order_of_n, n)\n", 271 | "        self.e = inverse(n, order_of_n)\n", 272 | "        \n", 273 | "    def __repr__(self):\n", 274 | "        return \"Decryption-Key (n={}, d1={}, e={})\".format(  # placeholders reconstructed\n", 275 | "            self.n, self.d1, self.e) " 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 11, 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "def keygen(n_bitlength=512): # in real use this should be at least 2048\n", 285 | "    p = sample_prime(n_bitlength // 2)\n", 286 | "    q = sample_prime(n_bitlength // 2)\n", 287 | "    n = p * q\n", 288 | "\n", 289 | "    return EncryptionKey(n), DecryptionKey(p, q)" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Encrypting, Decrypting and Extracting the Randomness (r)\n", 297 | "\n", 298 | "These functions allow you to encrypt and decrypt using the above keys, and to extract the randomness when needed. Usually extraction is only done to prove the \"correctness\" of the computation, so it is not always required." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 12, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "def enc(ek: EncryptionKey, x, r):\n", 308 | "    gx = pow(ek.g, x, ek.nn)\n", 309 | "    rn = pow(r, ek.n, ek.nn)\n", 310 | "    c = (gx * rn) % ek.nn\n", 311 | "    return c" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "def dec(dk: DecryptionKey, c):\n", 321 | "    gxd = pow(c, dk.d1, dk.nn)\n", 322 | "    xd = dlog(gxd, dk.n)\n", 323 | "    x = (xd * dk.d2) % dk.n\n", 324 | "    return x if x < dk.n/2 else x - dk.n" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 14, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "def dlog(gy, n):\n", 334 | "    y = (gy - 1) // n\n", 335 | "    return y" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 15, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "def extract(dk: DecryptionKey, c):\n", 345 | "    x = dec(dk, c)\n", 346 | "    gx = pow(dk.g, x, dk.nn)  # was ek.nn -- avoid relying on the global ek\n", 347 | "    gx_inv = inverse(gx, dk.nn)\n", 348 | "    rn = (c * gx_inv) % dk.nn\n", 349 | "    r = pow(rn, dk.e, dk.n)\n", 350 | "    return r" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Encrypting and Decrypting with Paillier\n", 358 | "\n", 359 | "Now you're ready to actually encrypt and decrypt using the Paillier cryptosystem. You'll generate your encryption and decryption keys, sample randomness in order to encrypt your message into ciphertext and then decrypt the message using your decryption key."
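One more bit of intuition before running it: the `dlog` shortcut in `dec` works because g = 1 + n. By the binomial theorem, (1 + n)^x = 1 + x·n + (x choose 2)·n² + ... ≡ 1 + x·n (mod n²), so the "discrete log" is just ((g^x mod n²) - 1) / n. A tiny check (the modulus 35 below is an arbitrary toy value, not a real key):

```python
# Sketch: with g = 1 + n, pow(g, x, n**2) == 1 + x*n for x < n,
# so dlog recovers x by rearranging.
n = 35  # toy modulus, illustration only
g = 1 + n
for x in range(10):
    gx = pow(g, x, n * n)
    assert (gx - 1) // n == x
```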
360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 16, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "ek, dk = keygen()" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 17, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/plain": [ 379 | "Encryption Key " 382 | ] 383 | }, 384 | "execution_count": 17, 385 | "metadata": {}, 386 | "output_type": "execute_result" 387 | } 388 | ], 389 | "source": [ 390 | "ek" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 18, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "r = sample_randomness(ek)" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 19, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "msg = 4" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 20, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "ciphertext = enc(ek, msg, r)" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 21, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/plain": [ 428 | "13318562040696409702758528812924345335818299164953617929650042923992327244313640064803569416193632088674806904695878999880761923933432455325763026904104798673804522548252179782189132064552724542421613876038350839917013343021961311303770831146608043027815871740619537991742484293856289271519982275283282986213" 429 | ] 430 | }, 431 | "execution_count": 21, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "ciphertext" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 22, 443 | "metadata": {}, 444 | "outputs": [ 445 | { 446 | "data": { 447 | "text/plain": [ 448 | "True" 449 | ] 450 | }, 451 | "execution_count": 22, 452 | "metadata": {}, 453 | "output_type": "execute_result" 454 | } 455 | ], 456 | "source": [ 457 | "dec(dk, ciphertext) == msg" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "Note: you might want to extract the randomness provided as part of the encryption to prove the correctness of the decryption. For one implementation of how you might use this, take a look at [the tf-encrypted implementation of secure aggregation](https://medium.com/dropoutlabs/building-secure-aggregation-into-tensorflow-federated-4514fca40cc0) where it is used to prove correct decryption." 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": 23, 470 | "metadata": {}, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/plain": [ 475 | "True" 476 | ] 477 | }, 478 | "execution_count": 23, 479 | "metadata": {}, 480 | "output_type": "execute_result" 481 | } 482 | ], 483 | "source": [ 484 | "extract(dk, ciphertext) == r" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "## Homomorphic Properties\n", 492 | "\n", 493 | "Okay, so you can encrypt and decrypt, cool! But really, the important part of the Paillier cryptosystem is that you can use it for homomorphic encryption. Let's see how you can do this via a few helper functions." 
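The key identity behind the `add_cipher` helper you're about to define: multiplying two ciphertexts multiplies the g-powers and the randomness terms, so enc(x1, r1) · enc(x2, r2) = g^(x1) r1^n · g^(x2) r2^n = g^(x1 + x2) (r1·r2)^n (mod n²), which is exactly an encryption of x1 + x2 under randomness r1·r2. A self-contained toy check (the primes p = 5, q = 7 and the messages are illustrative assumptions):

```python
# Sketch of the additive homomorphism with toy parameters.
p, q = 5, 7
n = p * q
nn = n * n
g = 1 + n

def toy_enc(x, r):
    # same formula as enc() above, inlined so this example stands alone
    return (pow(g, x, nn) * pow(r, n, nn)) % nn

# multiplying ciphertexts adds the plaintexts (3 + 4 = 7)
assert (toy_enc(3, 2) * toy_enc(4, 11)) % nn == toy_enc(7, 22)
```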
494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 24, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "def add_cipher(ek, c1, c2):\n", 503 | " c = (c1 * c2) % ek.nn\n", 504 | " return c\n", 505 | "\n", 506 | "def add_plain(ek, c1, x2):\n", 507 | " c2 = pow(ek.g, x2, ek.nn)\n", 508 | " c = (c1 * c2) % ek.nn\n", 509 | " return c\n", 510 | "\n", 511 | "def neg(ek, c):\n", 512 | " return inverse(c, ek.nn) \n", 513 | "\n", 514 | "def sub_cipher(ek, c1, c2):\n", 515 | " c = add_cipher(ek, c1, neg(ek, c2))\n", 516 | " return c\n", 517 | "\n", 518 | "def sub_plain(ek, c1, x2):\n", 519 | " c = add_plain(ek, c1, ek.n - x2)\n", 520 | " return c\n", 521 | "\n", 522 | "def mul_plain(ek, c1, x2):\n", 523 | " c = pow(c1, x2, ek.nn)\n", 524 | " return c" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 25, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "msg_one, msg_two = 45, 234" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 26, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "r1 = sample_randomness(ek)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 27, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "r2 = sample_randomness(ek)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 28, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "c1 = enc(ek, msg_one, r1)\n", 561 | "c2 = enc(ek, msg_two, r2)" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 29, 567 | "metadata": {}, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "(8300495113123882981928995611828881848870500488811192567106333973552994675153967728301429493081586585645685831033361750025256343879840490091038837627647576831632164879332711852520618080888121301881134281374204062468850439810425971974733881003239259689285683978092152056294200674796153145116647799919448217892,\n", 573 | " 33452402939873505743893139364694959902023028400516601475505862327229878012365643816181605356897921228726624232821175484794497579216648870602743534131101510282069072588225503679084360452010980311459832832285567611991063143624052719024408987810400271746237133259246888193032702937161383783733229312870416499129)" 574 | ] 575 | }, 576 | "execution_count": 29, 577 | "metadata": {}, 578 | "output_type": "execute_result" 579 | } 580 | ], 581 | "source": [ 582 | "c1, c2" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": 30, 588 | "metadata": {}, 589 | "outputs": [], 590 | "source": [ 591 | "result_addition = add_cipher(ek, c1, c2)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 31, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "data": { 601 | "text/plain": [ 602 | "35095335123754385146727887296882861810552271776352568124652569998196032488955963715413728062632770206255525021056222528340129894596214981800335348265350405359318940795022893285765858690297764984882458146852205354288793002905878139434817941414723317902866109956826897234935159144629779820845063512909288772804" 603 | ] 604 | }, 605 | "execution_count": 31, 606 | "metadata": {}, 607 | "output_type": "execute_result" 608 | } 609 | ], 610 | "source": [ 611 | "result_addition" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 32, 617 | "metadata": {}, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "True" 623 | ] 624 | }, 625 | 
"execution_count": 32, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "dec(dk, result_addition) == msg_one + msg_two" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 33, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "result_subtraction = sub_cipher(ek, c1, c2)" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 34, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "True" 652 | ] 653 | }, 654 | "execution_count": 34, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "dec(dk, result_subtraction) == msg_one - msg_two" 661 | ] 662 | }, 663 | { 664 | "cell_type": "markdown", 665 | "metadata": {}, 666 | "source": [ 667 | "Wow! Pretty neat, eh? :D\n", 668 | "\n", 669 | "### Plaintext methods\n", 670 | "\n", 671 | "You can also perform plaintext operations, like multiplying, adding and subtracting publicly known values as part of the computation." 672 | ] 673 | }, 674 | { 675 | "cell_type": "code", 676 | "execution_count": 35, 677 | "metadata": {}, 678 | "outputs": [ 679 | { 680 | "data": { 681 | "text/plain": [ 682 | "True" 683 | ] 684 | }, 685 | "execution_count": 35, 686 | "metadata": {}, 687 | "output_type": "execute_result" 688 | } 689 | ], 690 | "source": [ 691 | "dec(dk, mul_plain(ek, c1, 5)) == 5 * msg_one" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": 36, 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "data": { 701 | "text/plain": [ 702 | "True" 703 | ] 704 | }, 705 | "execution_count": 36, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "dec(dk, sub_plain(ek, c2, 1000)) == msg_two - 1000" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 37, 717 | "metadata": {}, 718 | "outputs": [ 719 | { 720 | "data": { 721 | "text/plain": [ 722 | "True" 723 | ] 724 | }, 725 | "execution_count": 37, 726 | "metadata": {}, 727 | "output_type": "execute_result" 728 | } 729 | ], 730 | "source": [ 731 | "dec(dk, add_plain(ek, c1, 4893849327)) == msg_one + 4893849327" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": null, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": null, 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [] 747 | } 748 | ], 749 | "metadata": { 750 | "kernelspec": { 751 | "display_name": "Python 3 (ipykernel)", 752 | "language": "python", 753 | "name": "python3" 754 | }, 755 | "language_info": { 756 | "codemirror_mode": { 757 | "name": "ipython", 758 | "version": 3 759 | }, 760 | "file_extension": ".py", 761 | "mimetype": "text/x-python", 762 | "name": "python", 763 | "nbconvert_exporter": "python", 764 | "pygments_lexer": "ipython3", 765 | "version": "3.10.9" 766 | } 767 | }, 768 | "nbformat": 4, 769 | "nbformat_minor": 4 770 | } 771 | -------------------------------------------------------------------------------- /13 - PSI and Moose - Encrypted Computation for Data Sharing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Private Set Intersection and Moose\n", 8 | "\n", 9 | "This notebook takes a common use case: sharing data between several parties in order to compute something together.\n", 10 | 
"\n", 11 | "In this case, you will use [Facebook's PrivateID library](https://github.com/facebookresearch/Private-ID#private-id-1) to compute a join between the two datasets and then [Moose from the tf-encrypted team](https://github.com/tf-encrypted/moose) to compute the values using encrypted data.\n", 12 | "\n", 13 | "For this particular example, the players are two companies with some overlap in customers. They would like to find customers that are of interest for new product offerings based on the spending categories of their combined customer bases. They can then use this result to determine what product offerings would be most appealing to which customers and reach out for more customer research.\n", 14 | "\n", 15 | "This notebook was heavily based on [Yann Dupis' notebooks](https://github.com/yanndupis) with additional support and Moose fixes from [Jason Mancuso](https://github.com/jvmncs). I encourage you to follow their work on GitHub to keep up with the latest!\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "## Generate fake datasets\n", 23 | "\n", 24 | "In this example, you'll have Alice and Bob play the two data scientists at each company. They need to first have realistic data, so here you'll generate some example data. Feel free to play around with the generator to test out new ideas!" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pathlib\n", 34 | "import numpy as np\n", 35 | "import pandas as pd\n", 36 | "\n", 37 | "np.random.seed(1234)\n", 38 | "\n", 39 | "_DATA_DIR = pathlib.Path(\"./data\")\n", 40 | "\n", 41 | "def generate_mock_dataset(sample_size, n_features, sample_frac, \n", 42 | " scaler=100, positive=True, precision=4):\n", 43 | " x = np.random.randn(sample_size, n_features).round(precision) * scaler\n", 44 | " user_id = np.array(range(sample_size))\n", 45 | " df = pd.DataFrame(x, columns=[f\"x_{i}\" for i in range(n_features)])\n", 46 | " df.insert(loc=0, column='user_id', value=user_id)\n", 47 | " df = df.sample(frac=sample_frac)\n", 48 | " if positive:\n", 49 | " df = df.applymap(lambda x: x * -1 if x < 0 else x)\n", 50 | " return df\n", 51 | "\n", 52 | "alice_df = generate_mock_dataset(10, 20, 0.6)\n", 53 | "bob_df = generate_mock_dataset(10, 1, 0.7)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/html": [ 64 | "
\n", 65 | "\n", 78 | "\n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | "
user_idx_0x_1x_2x_3x_4x_5x_6x_7x_8...x_10x_11x_12x_13x_14x_15x_16x_17x_18x_19
1120.2665.6019.3455.34131.8246.9367.56181.7018.31...39.7833.74104.76104.5986.3712.2112.4732.2884.17239.10
0047.14119.10143.2731.2772.0688.7285.9663.651.57...115.0099.1995.33202.1333.410.2140.5528.91132.12154.69
8853.6874.3832.0291.6285.9722.6062.8818.6595.25...7.2655.0693.82123.9113.9722.30212.3712.23140.94142.30
227.6256.643.61207.5024.7889.7213.681.8375.54...84.10144.58140.2010.0954.8214.4635.403.5556.57154.57
5529.1256.6550.3628.5348.43136.3578.1146.80122.46...87.55171.0745.0874.9220.3918.2268.07181.854.7139.48
4446.44356.35132.1115.2616.4543.0176.7498.4927.08...7.9840.00102.7958.4781.668.1934.4852.83106.9051.19
\n", 252 | "

6 rows × 21 columns

\n", 253 | "
" 254 | ], 255 | "text/plain": [ 256 | " user_id x_0 x_1 x_2 x_3 x_4 x_5 x_6 x_7 \\\n", 257 | "1 1 20.26 65.60 19.34 55.34 131.82 46.93 67.56 181.70 \n", 258 | "0 0 47.14 119.10 143.27 31.27 72.06 88.72 85.96 63.65 \n", 259 | "8 8 53.68 74.38 32.02 91.62 85.97 22.60 62.88 18.65 \n", 260 | "2 2 7.62 56.64 3.61 207.50 24.78 89.72 13.68 1.83 \n", 261 | "5 5 29.12 56.65 50.36 28.53 48.43 136.35 78.11 46.80 \n", 262 | "4 4 46.44 356.35 132.11 15.26 16.45 43.01 76.74 98.49 \n", 263 | "\n", 264 | " x_8 ... x_10 x_11 x_12 x_13 x_14 x_15 x_16 x_17 \\\n", 265 | "1 18.31 ... 39.78 33.74 104.76 104.59 86.37 12.21 12.47 32.28 \n", 266 | "0 1.57 ... 115.00 99.19 95.33 202.13 33.41 0.21 40.55 28.91 \n", 267 | "8 95.25 ... 7.26 55.06 93.82 123.91 13.97 22.30 212.37 12.23 \n", 268 | "2 75.54 ... 84.10 144.58 140.20 10.09 54.82 14.46 35.40 3.55 \n", 269 | "5 122.46 ... 87.55 171.07 45.08 74.92 20.39 18.22 68.07 181.85 \n", 270 | "4 27.08 ... 7.98 40.00 102.79 58.47 81.66 8.19 34.48 52.83 \n", 271 | "\n", 272 | " x_18 x_19 \n", 273 | "1 84.17 239.10 \n", 274 | "0 132.12 154.69 \n", 275 | "8 140.94 142.30 \n", 276 | "2 56.57 154.57 \n", 277 | "5 4.71 39.48 \n", 278 | "4 106.90 51.19 \n", 279 | "\n", 280 | "[6 rows x 21 columns]" 281 | ] 282 | }, 283 | "execution_count": 2, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "alice_df" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "### Run Private-ID\n", 297 | "\n", 298 | "Now it's time to run [Private-ID](https://github.com/facebookresearch/Private-ID#private-id-1) on Alice and Bob's User IDs. You will want to install Private-ID in a parent folder near this repository and run these commands from the location where you installed Private ID. Make sure to update the input and output file paths should your location be different!\n", 299 | "\n", 300 | "```\n", 301 | "env RUST_LOG=info cargo run --bin private-id-server -- \\\n", 302 | " --host 0.0.0.0:10009 \\\n", 303 | " --input ../practical-data-privacy/data/alice_id.csv \\\n", 304 | " --output ../practical-data-privacy/data/alice_keys.csv \\\n", 305 | " --no-tls\n", 306 | " \n", 307 | "\n", 308 | "env RUST_LOG=info cargo run --bin private-id-client -- \\\n", 309 | " --company localhost:10009 \\\n", 310 | " --input ../practical-data-privacy/data/bob_id.csv \\\n", 311 | " --output ../practical-data-privacy/data/bob_keys.csv \\\n", 312 | " --no-tls\n", 313 | "```" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 3, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "alice_id = alice_df[\"user_id\"]\n", 323 | "alice_id.to_csv(_DATA_DIR / \"alice_id.csv\", index=False, header=False)\n", 324 | "\n", 325 | "bob_id = bob_df[\"user_id\"]\n", 326 | "bob_id.to_csv(_DATA_DIR / \"bob_id.csv\", index=False, header=False)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "Here is below the outputs from Private-ID. As you can see they both contain the same list of keys ordered the same way but for some of them they have a User Id mapped to them. " 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 4, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/html": [ 344 | "
\n", 345 | "\n", 358 | "\n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | "
keyuser_id
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.0
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.0
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.0
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...4.0
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...NaN
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.0
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...NaN
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.0
\n", 409 | "
" 410 | ], 411 | "text/plain": [ 412 | " key user_id\n", 413 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0\n", 414 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0\n", 415 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0\n", 416 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... 4.0\n", 417 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... NaN\n", 418 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0\n", 419 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... NaN\n", 420 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0" 421 | ] 422 | }, 423 | "execution_count": 4, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "alice_keys = pd.read_csv(_DATA_DIR / \"alice_keys.csv\", names=[\"key\", \"user_id\"])\n", 430 | "\n", 431 | "alice_keys" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 5, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "data": { 441 | "text/html": [ 442 | "
\n", 443 | "\n", 456 | "\n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
keyuser_id
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.0
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.0
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.0
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...NaN
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...7.0
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.0
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...9.0
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.0
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " key user_id\n", 511 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0\n", 512 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0\n", 513 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0\n", 514 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... NaN\n", 515 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... 7.0\n", 516 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0\n", 517 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... 9.0\n", 518 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0" 519 | ] 520 | }, 521 | "execution_count": 5, 522 | "metadata": {}, 523 | "output_type": "execute_result" 524 | } 525 | ], 526 | "source": [ 527 | "bob_keys = pd.read_csv(_DATA_DIR / \"bob_keys.csv\", names=[\"key\", \"user_id\"])\n", 528 | "\n", 529 | "bob_keys" 530 | ] 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "metadata": {}, 535 | "source": [ 536 | "We then merge the Private-ID output to Alice and Bob's datasets so in the datasets we have the all the keys from Private-ID with the corresponding User ID and features." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 6, 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "alice_df = pd.merge(alice_keys, alice_df, on='user_id', how='left')\n", 546 | "bob_df = pd.merge(bob_keys, bob_df, on='user_id', how='left')" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 7, 552 | "metadata": {}, 553 | "outputs": [ 554 | { 555 | "data": { 556 | "text/html": [ 557 | "
\n", 558 | "\n", 571 | "\n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | "
keyuser_idx_0x_1x_2x_3x_4x_5x_6x_7...x_10x_11x_12x_13x_14x_15x_16x_17x_18x_19
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.07.6256.643.61207.5024.7889.7213.681.83...84.10144.58140.2010.0954.8214.4635.403.5556.57154.57
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.053.6874.3832.0291.6285.9722.6062.8818.65...7.2655.0693.82123.9113.9722.30212.3712.23140.94142.30
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.020.2665.6019.3455.34131.8246.9367.56181.70...39.7833.74104.76104.5986.3712.2112.4732.2884.17239.10
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...4.046.44356.35132.1115.2616.4543.0176.7498.49...7.9840.00102.7958.4781.668.1934.4852.83106.9051.19
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.029.1256.6550.3628.5348.43136.3578.1146.80...87.55171.0745.0874.9220.3918.2268.07181.854.7139.48
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...NaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.047.14119.10143.2731.2772.0688.7285.9663.65...115.0099.1995.33202.1333.410.2140.5528.91132.12154.69
\n", 793 | "

8 rows × 22 columns

\n", 794 | "
" 795 | ], 796 | "text/plain": [ 797 | " key user_id x_0 x_1 \\\n", 798 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0 7.62 56.64 \n", 799 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0 53.68 74.38 \n", 800 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0 20.26 65.60 \n", 801 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... 4.0 46.44 356.35 \n", 802 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... NaN NaN NaN \n", 803 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0 29.12 56.65 \n", 804 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... NaN NaN NaN \n", 805 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0 47.14 119.10 \n", 806 | "\n", 807 | " x_2 x_3 x_4 x_5 x_6 x_7 ... x_10 x_11 x_12 \\\n", 808 | "0 3.61 207.50 24.78 89.72 13.68 1.83 ... 84.10 144.58 140.20 \n", 809 | "1 32.02 91.62 85.97 22.60 62.88 18.65 ... 7.26 55.06 93.82 \n", 810 | "2 19.34 55.34 131.82 46.93 67.56 181.70 ... 39.78 33.74 104.76 \n", 811 | "3 132.11 15.26 16.45 43.01 76.74 98.49 ... 7.98 40.00 102.79 \n", 812 | "4 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN \n", 813 | "5 50.36 28.53 48.43 136.35 78.11 46.80 ... 87.55 171.07 45.08 \n", 814 | "6 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN \n", 815 | "7 143.27 31.27 72.06 88.72 85.96 63.65 ... 115.00 99.19 95.33 \n", 816 | "\n", 817 | " x_13 x_14 x_15 x_16 x_17 x_18 x_19 \n", 818 | "0 10.09 54.82 14.46 35.40 3.55 56.57 154.57 \n", 819 | "1 123.91 13.97 22.30 212.37 12.23 140.94 142.30 \n", 820 | "2 104.59 86.37 12.21 12.47 32.28 84.17 239.10 \n", 821 | "3 58.47 81.66 8.19 34.48 52.83 106.90 51.19 \n", 822 | "4 NaN NaN NaN NaN NaN NaN NaN \n", 823 | "5 74.92 20.39 18.22 68.07 181.85 4.71 39.48 \n", 824 | "6 NaN NaN NaN NaN NaN NaN NaN \n", 825 | "7 202.13 33.41 0.21 40.55 28.91 132.12 154.69 \n", 826 | "\n", 827 | "[8 rows x 22 columns]" 828 | ] 829 | }, 830 | "execution_count": 7, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "alice_df" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": 8, 842 | "metadata": {}, 843 | "outputs": [ 844 | { 845 | "data": { 846 | "text/html": [ 847 | "
\n", 848 | "\n", 861 | "\n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | "
keyuser_idx_0
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.00.31
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.089.36
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.0142.85
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...NaNNaN
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...7.052.65
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.030.27
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...9.040.11
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.087.34
\n", 921 | "
" 922 | ], 923 | "text/plain": [ 924 | " key user_id x_0\n", 925 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0 0.31\n", 926 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0 89.36\n", 927 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0 142.85\n", 928 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... NaN NaN\n", 929 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... 7.0 52.65\n", 930 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0 30.27\n", 931 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... 9.0 40.11\n", 932 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0 87.34" 933 | ] 934 | }, 935 | "execution_count": 8, 936 | "metadata": {}, 937 | "output_type": "execute_result" 938 | } 939 | ], 940 | "source": [ 941 | "bob_df" 942 | ] 943 | }, 944 | { 945 | "cell_type": "markdown", 946 | "metadata": {}, 947 | "source": [ 948 | "In each dataset, we create a flag `user_id_available` by checking if the user id is missing or not. For the column feature x_0, we replace the missing values with 0. The value doesn't matter since these records will be filtered out when taking the intersection." 949 | ] 950 | }, 951 | { 952 | "cell_type": "code", 953 | "execution_count": 9, 954 | "metadata": {}, 955 | "outputs": [ 956 | { 957 | "data": { 958 | "text/html": [ 959 | "
\n", 960 | "\n", 973 | "\n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | "
keyuser_idx_0user_id_available
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.00.311
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.089.361
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.0142.851
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...0.00.000
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...7.052.651
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.030.271
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...9.040.111
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.087.341
\n", 1042 | "
" 1043 | ], 1044 | "text/plain": [ 1045 | " key user_id x_0 \\\n", 1046 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0 0.31 \n", 1047 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0 89.36 \n", 1048 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0 142.85 \n", 1049 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... 0.0 0.00 \n", 1050 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... 7.0 52.65 \n", 1051 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0 30.27 \n", 1052 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... 9.0 40.11 \n", 1053 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0 87.34 \n", 1054 | "\n", 1055 | " user_id_available \n", 1056 | "0 1 \n", 1057 | "1 1 \n", 1058 | "2 1 \n", 1059 | "3 0 \n", 1060 | "4 1 \n", 1061 | "5 1 \n", 1062 | "6 1 \n", 1063 | "7 1 " 1064 | ] 1065 | }, 1066 | "execution_count": 9, 1067 | "metadata": {}, 1068 | "output_type": "execute_result" 1069 | } 1070 | ], 1071 | "source": [ 1072 | "bob_df['user_id_available'] = np.where(bob_df['user_id'].isnull(), 0, 1)\n", 1073 | "bob_df = bob_df.fillna(0)\n", 1074 | "bob_df" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": 10, 1080 | "metadata": {}, 1081 | "outputs": [ 1082 | { 1083 | "data": { 1084 | "text/html": [ 1085 | "
\n", 1086 | "\n", 1099 | "\n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | " \n", 1209 | " \n", 1210 | " \n", 1211 | " \n", 1212 | " \n", 1213 | " \n", 1214 | " \n", 1215 | " \n", 1216 | " \n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | "
keyuser_idx_0x_1x_2x_3x_4x_5x_6x_7...x_11x_12x_13x_14x_15x_16x_17x_18x_19user_id_available
0144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E...2.07.6256.643.61207.5024.7889.7213.681.83...144.58140.2010.0954.8214.4635.403.5556.57154.571
15CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B...8.053.6874.3832.0291.6285.9722.6062.8818.65...55.0693.82123.9113.9722.30212.3712.23140.94142.301
272A1523C8A9575605E886810AAB545E9957EB22DE5CCBC...1.020.2665.6019.3455.34131.8246.9367.56181.70...33.74104.76104.5986.3712.2112.4732.2884.17239.101
372F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F...4.046.44356.35132.1115.2616.4543.0176.7498.49...40.00102.7958.4781.668.1934.4852.83106.9051.191
48A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA...0.00.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000
592E45747889DB6D24755537D369BF68D384DC9549F1D77...5.029.1256.6550.3628.5348.43136.3578.1146.80...171.0745.0874.9220.3918.2268.07181.854.7139.481
6F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E...0.00.000.000.000.000.000.000.000.00...0.000.000.000.000.000.000.000.000.000
7F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2...0.047.14119.10143.2731.2772.0688.7285.9663.65...99.1995.33202.1333.410.2140.5528.91132.12154.691
\n", 1321 | "

8 rows × 23 columns

\n", 1322 | "
" 1323 | ], 1324 | "text/plain": [ 1325 | " key user_id x_0 x_1 \\\n", 1326 | "0 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4E... 2.0 7.62 56.64 \n", 1327 | "1 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B... 8.0 53.68 74.38 \n", 1328 | "2 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC... 1.0 20.26 65.60 \n", 1329 | "3 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097F... 4.0 46.44 356.35 \n", 1330 | "4 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDA... 0.0 0.00 0.00 \n", 1331 | "5 92E45747889DB6D24755537D369BF68D384DC9549F1D77... 5.0 29.12 56.65 \n", 1332 | "6 F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363E... 0.0 0.00 0.00 \n", 1333 | "7 F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A2... 0.0 47.14 119.10 \n", 1334 | "\n", 1335 | " x_2 x_3 x_4 x_5 x_6 x_7 ... x_11 x_12 x_13 \\\n", 1336 | "0 3.61 207.50 24.78 89.72 13.68 1.83 ... 144.58 140.20 10.09 \n", 1337 | "1 32.02 91.62 85.97 22.60 62.88 18.65 ... 55.06 93.82 123.91 \n", 1338 | "2 19.34 55.34 131.82 46.93 67.56 181.70 ... 33.74 104.76 104.59 \n", 1339 | "3 132.11 15.26 16.45 43.01 76.74 98.49 ... 40.00 102.79 58.47 \n", 1340 | "4 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 \n", 1341 | "5 50.36 28.53 48.43 136.35 78.11 46.80 ... 171.07 45.08 74.92 \n", 1342 | "6 0.00 0.00 0.00 0.00 0.00 0.00 ... 0.00 0.00 0.00 \n", 1343 | "7 143.27 31.27 72.06 88.72 85.96 63.65 ... 99.19 95.33 202.13 \n", 1344 | "\n", 1345 | " x_14 x_15 x_16 x_17 x_18 x_19 user_id_available \n", 1346 | "0 54.82 14.46 35.40 3.55 56.57 154.57 1 \n", 1347 | "1 13.97 22.30 212.37 12.23 140.94 142.30 1 \n", 1348 | "2 86.37 12.21 12.47 32.28 84.17 239.10 1 \n", 1349 | "3 81.66 8.19 34.48 52.83 106.90 51.19 1 \n", 1350 | "4 0.00 0.00 0.00 0.00 0.00 0.00 0 \n", 1351 | "5 20.39 18.22 68.07 181.85 4.71 39.48 1 \n", 1352 | "6 0.00 0.00 0.00 0.00 0.00 0.00 0 \n", 1353 | "7 33.41 0.21 40.55 28.91 132.12 154.69 1 \n", 1354 | "\n", 1355 | "[8 rows x 23 columns]" 1356 | ] 1357 | }, 1358 | "execution_count": 10, 1359 | "metadata": {}, 1360 | "output_type": "execute_result" 1361 | } 1362 | ], 1363 | "source": [ 1364 | "alice_df['user_id_available'] = np.where(alice_df['user_id'].isnull(), 0, 1)\n", 1365 | "alice_df = alice_df.fillna(0)\n", 1366 | "alice_df" 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "markdown", 1371 | "metadata": {}, 1372 | "source": [ 1373 | "### Moose Computation\n", 1374 | "\n", 1375 | "To the Moose computation we will feed Alice's `user_id_available` flag and feature and Bob's `user_id_available` flag and feature.\n", 1376 | "\n", 1377 | "We do the intersection using a logical And operation between `user_id_available` flags from Alice and Bob, filter the datasets where the logical And returns 1, then compute a metric privately." 
1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": 12, 1383 | "metadata": {}, 1384 | "outputs": [], 1385 | "source": [ 1386 | "user_id_available_a = alice_df.user_id_available.values\n", 1387 | "user_id_available_b = bob_df.user_id_available.values\n", 1388 | "\n", 1389 | "alice_features_for_computation = alice_df.drop(['key', 'user_id', 'user_id_available'], axis=1)\n", 1390 | "bob_features_for_computation = bob_df.drop(['key', 'user_id', 'user_id_available'], axis=1)\n", 1391 | "\n", 1392 | "x_a = alice_features_for_computation.values\n", 1393 | "x_b = bob_features_for_computation.values" 1394 | ] 1395 | }, 1396 | { 1397 | "cell_type": "code", 1398 | "execution_count": 13, 1399 | "metadata": {}, 1400 | "outputs": [], 1401 | "source": [ 1402 | "np.save(_DATA_DIR / \"x_a\", x_a)\n", 1403 | "np.save(_DATA_DIR / \"x_b\", x_b)\n", 1404 | "np.save(_DATA_DIR / \"user_id_available_a\", user_id_available_a)\n", 1405 | "np.save(_DATA_DIR / \"user_id_available_b\", user_id_available_b)" 1406 | ] 1407 | }, 1408 | { 1409 | "cell_type": "code", 1410 | "execution_count": 14, 1411 | "metadata": {}, 1412 | "outputs": [], 1413 | "source": [ 1414 | "import pymoose as pm\n", 1415 | "\n", 1416 | "FIXED = pm.fixed(24, 40) # fixed-point encoding: 24 integer bits, 40 fractional bits\n", 1417 | "\n", 1418 | "alice = pm.host_placement(name=\"alice\")\n", 1419 | "bob = pm.host_placement(name=\"bob\")\n", 1420 | "carole = pm.host_placement(name=\"carole\")\n", 1421 | "rep = pm.replicated_placement(name=\"rep\", players=[alice, bob, carole])\n", 1422 | "mirrored = pm.mirrored_placement(name=\"mirrored\", players=[alice, bob, carole])\n", 1423 | "\n", 1424 | "@pm.computation\n", 1425 | "def psi_and_agg(): \n", 1426 | " with alice:\n", 1427 | " x_a = pm.load(\"x_a\", dtype=pm.float64)\n", 1428 | " user_id_available_a = pm.load(\"user_id_available_a\", dtype=pm.bool_)\n", 1429 | "\n", 1430 | " with bob:\n", 1431 | " x_b = pm.load(\"x_b\", dtype=pm.float64)\n", 1432 | " user_id_available_b = pm.load(\"user_id_available_b\", dtype=pm.bool_)\n", 1433 | "\n", 1434 | " # Compute the logical AND between user_id_available from Alice and Bob.\n", 1435 | " # If it returns 1, the user ID was present in both Alice's and Bob's datasets\n", 1436 | " exist_in_alice_and_bob_bool = pm.logical_and(\n", 1437 | " user_id_available_a, user_id_available_b\n", 1438 | " )\n", 1439 | "\n", 1440 | " # Filter Bob's features to keep only records where exist_in_alice_and_bob_bool returned 1\n", 1441 | " x_b_sub = pm.select(x_b, axis=0, index=exist_in_alice_and_bob_bool)\n", 1442 | " x_b_sub = pm.cast(x_b_sub, dtype=FIXED)\n", 1443 | "\n", 1444 | " with alice:\n", 1445 | " # Filter Alice's features to keep only records where exist_in_alice_and_bob_bool returned 1\n", 1446 | " x_a_sub = pm.select(x_a, axis=0, index=exist_in_alice_and_bob_bool)\n", 1447 | " x_a_sub = pm.cast(x_a_sub, dtype=FIXED)\n", 1448 | "\n", 1449 | " with mirrored:\n", 1450 | " ten_percent = pm.constant(0.1, dtype=FIXED)\n", 1451 | " \n", 1452 | " with rep:\n", 1453 | " # Aggregation: flag categories above 10% of each shared user's combined spend\n", 1454 | " spend_per_category = x_a_sub + x_b_sub \n", 1455 | " spend_per_user = pm.sum(spend_per_category, axis=1)\n", 1456 | " category_percent = spend_per_category / pm.expand_dims(spend_per_user, axis=1)\n", 1457 | " res = pm.greater(category_percent, ten_percent)\n", 1458 | "\n", 1459 | " with alice:\n", 1460 | " res = pm.cast(res, dtype=pm.float64)\n", 1461 | " res = pm.save(\"agg_result\", res)\n", 1462 | "\n", 1463 | " return res" 1464 | ] 1465 | }, 1466 | { 1467 |
"cell_type": "code", 1468 | "execution_count": 15, 1469 | "metadata": {}, 1470 | "outputs": [ 1471 | { 1472 | "name": "stdout", 1473 | "output_type": "stream", 1474 | "text": [ 1475 | "Aggregation result with Moose [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1.]\n", 1476 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 1477 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", 1478 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n", 1479 | " [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]\n" 1480 | ] 1481 | } 1482 | ], 1483 | "source": [ 1484 | "executors_storage = {\n", 1485 | " \"alice\": {\"x_a\": x_a, \"user_id_available_a\": user_id_available_a.astype(np.bool_)},\n", 1486 | " \"bob\": {\"x_b\": x_b, \"user_id_available_b\": user_id_available_b.astype(np.bool_)}\n", 1487 | "}\n", 1488 | "\n", 1489 | "runtime = pm.LocalMooseRuntime(\n", 1490 | " identities=[\"alice\", \"bob\", \"carole\"],\n", 1491 | " storage_mapping=executors_storage,\n", 1492 | ")\n", 1493 | "\n", 1494 | "runtime.set_default()\n", 1495 | "\n", 1496 | "_ = psi_and_agg()\n", 1497 | "\n", 1498 | "agg_result = runtime.read_value_from_storage(\"alice\", \"agg_result\")\n", 1499 | "print(\"Aggregation result with Moose\", agg_result)" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": 30, 1505 | "metadata": {}, 1506 | "outputs": [ 1507 | { 1508 | "data": { 1509 | "text/plain": [ 1510 | "array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1],\n", 1511 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 1512 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", 1513 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],\n", 1514 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])" 1515 | ] 1516 | }, 1517 | "execution_count": 30, 1518 | "metadata": {}, 1519 | "output_type": "execute_result" 1520 | } 1521 | ], 1522 | "source": [ 1523 | "# In plaintext\n", 1524 | "\n", 1525 | "\n", 1526 | "inner_bool = np.logical_and(user_id_available_a, user_id_available_b)\n", 1527 | "x_a_sub = x_a[np.where(inner_bool==1)]\n", 1528 | "x_b_sub = x_b[np.where(inner_bool==1)]\n", 1529 | "\n", 1530 | "spend_per_category = x_a_sub + x_b_sub\n", 1531 | "spend_per_user = spend_per_category.sum(axis=1)\n", 1532 | "category_percent = spend_per_category / np.expand_dims(spend_per_user, axis=1)\n", 1533 | "plaintext_result = np.greater(category_percent, 0.1).astype(int) \n", 1534 | "plaintext_result" 1535 | ] 1536 | }, 1537 | { 1538 | "cell_type": "code", 1539 | "execution_count": 31, 1540 | "metadata": {}, 1541 | "outputs": [ 1542 | { 1543 | "data": { 1544 | "text/plain": [ 1545 | "array([[ True, True, True, True, True, True, True, True, True,\n", 1546 | " True, True, True, True, True, True, True, True, True,\n", 1547 | " True, True],\n", 1548 | " [ True, True, True, True, True, True, True, True, True,\n", 1549 | " True, True, True, True, True, True, True, True, True,\n", 1550 | " True, True],\n", 1551 | " [ True, True, True, True, True, True, True, True, True,\n", 1552 | " True, True, True, True, True, True, True, True, True,\n", 1553 | " True, True],\n", 1554 | " [ True, True, True, True, True, True, True, True, True,\n", 1555 | " True, True, True, True, True, True, True, True, True,\n", 1556 | " True, True],\n", 1557 | " [ True, True, True, True, True, True, True, True, True,\n", 1558 | " True, True, True, True, True, True, True, True, True,\n", 1559 | " True, 
True]])" 1560 | ] 1561 | }, 1562 | "execution_count": 31, 1563 | "metadata": {}, 1564 | "output_type": "execute_result" 1565 | } 1566 | ], 1567 | "source": [ 1568 | "plaintext_result == agg_result" 1569 | ] 1570 | }, 1571 | { 1572 | "cell_type": "markdown", 1573 | "metadata": {}, 1574 | "source": [ 1575 | "## Challenges\n", 1576 | "\n", 1577 | "- Adapt the computation so that it also returns the result to Bob\n", 1578 | "- Try out a new computation -- what if they wanted to find the categories where their shared customers spend the least amount of money?\n", 1579 | "- Add more rows and see how size affects the computation.\n", 1580 | "- Try out another Private-ID or Moose computation by building your own example. Feel free to contribute examples to the reader-examples folder!" 1581 | ] 1582 | }, 1583 | { 1584 | "cell_type": "code", 1585 | "execution_count": null, 1586 | "metadata": {}, 1587 | "outputs": [], 1588 | "source": [] 1589 | } 1590 | ], 1591 | "metadata": { 1592 | "kernelspec": { 1593 | "display_name": "Python 3 (ipykernel)", 1594 | "language": "python", 1595 | "name": "python3" 1596 | }, 1597 | "language_info": { 1598 | "codemirror_mode": { 1599 | "name": "ipython", 1600 | "version": 3 1601 | }, 1602 | "file_extension": ".py", 1603 | "mimetype": "text/x-python", 1604 | "name": "python", 1605 | "nbconvert_exporter": "python", 1606 | "pygments_lexer": "ipython3", 1607 | "version": "3.9.13" 1608 | }, 1609 | "vscode": { 1610 | "interpreter": { 1611 | "hash": "fd7cee8bcd971dcf8831d6be0fcf4125937c319bc3875e6dd50d059cbfec2fbc" 1612 | } 1613 | } 1614 | }, 1615 | "nbformat": 4, 1616 | "nbformat_minor": 2 1617 | } 1618 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Practical Data Privacy 2 | 3 | Notebooks to accompany the book _Practical Data Privacy: Enhancing Privacy and Security in Data_, O'Reilly, Spring 2023. 4 | 5 | There are several notebooks associated with the upcoming O'Reilly video course as well. 6 | 7 | You can [read the book on Safari now](https://www.oreilly.com/library/view/practical-data-privacy/9781098129453/). Pre-order is also available. 8 | 9 | These notebooks can also be used separately from the book, as a workhop or self-study to learn about practical data privacy methods. The audience is assumed to be a data scientist or data folks with an understanding of probability, math and data. 10 | 11 | ### Motivation 12 | 13 | The goal of the notebooks and the book is to help data scientists and other technologists learn about practical data privacy. I hope you can use these notebooks and the book to not only learn about data privacy, but also to guide implementation of data privacy in your work! 14 | 15 | These notebooks are not meant to replace exploring software or building sustainable, production-ready code; but instead are meant to help guide your learning and thinking around the topics. Please always try to use and support open-source libraries based on the learnings you get from these notebooks / the book. 16 | 17 | ### Installation 18 | 19 | Please utilize the included `requirements.txt` to install your requirements using `pip` (you can also do so in `conda`. The notebooks have *only* been tested with Python 3. 🙌🏻 20 | 21 | Unfortunately, some of these libraries have conflicting requirements, so you may need to adapt your libraries and install to use later notebooks after you install the earlier tools. 
You will also need to install several Rust libraries with Python bindings; for these, please follow the installation instructions provided directly by those software packages. 22 | 23 | I recommend using [virtual environments](https://packaging.python.org/guides/installing-using-pip-and-virtualenv/) or [conda environments](https://conda.io/docs/user-guide/tasks/manage-environments.html). A minimal example setup is sketched at the end of this README. 24 | 25 | To run parts of these notebooks you will also need a running version of Apache Spark. Check [the latest documentation](https://spark.apache.org/downloads.html) to set up for your operating system. 26 | 27 | 28 | Notebooks 29 | -------- 30 | 31 | The notebooks follow the order that the ideas are introduced in the book. There are some additional notebooks added for those interested. Please file a pull request if you have an update to a notebook. I will also watch issues to ensure that the notebooks are usable and understandable. Feedback is very welcome! 32 | 33 | ### Recommended Reading and Challenges 34 | 35 | Several notebooks have a recommended reading and additional challenges section. I may update this README with additional reading of interest on this topic. I also recommend that you try out at least one or two challenges, to expand on what you've learned and practice applying it to new problems. 36 | 37 | ### Reader Contributions 38 | 39 | I'm hoping this book and repository have inspired you to try out new libraries and tools related to data privacy. To encourage you and others to share your work, I have a folder here `reader-contributions`. If you try something new out, please consider contributing your notebook! To make it easier for others, please ensure you: 40 | 41 | - Write a brief introduction to the concept or library shown in the notebook, including any links for folks to learn more. What will they learn? What does it show? 42 | - Include installation requirements 43 | - Add your name (if you'd like recognition) and any contact details should people want to reach out (optional!) 44 | - Guide other readers through the notebook with occasional titles and markdown cells that take someone through the notebook when you cannot be there. 45 | - Add a Recommended Reading or Challenges section 46 | 47 | Feel free to send over Pull Requests once you've checked the above! 48 | 49 | Thank you for your work and contribution, and for helping others learn more about privacy! 50 | 51 | ### Questions? 52 | 53 | Questions about getting set up or the content covered in the notebooks or book? Feel free to reach out via email at: katharine (at) kjamistan (dot) com 54 | 55 | ### Acknowledgements and Contributions 56 | 57 | These notebooks wouldn't have been possible without the following contributors and reviewers. Thank you! 58 | 59 | - [Damien Desfontaines](https://desfontain.es/serious.html) 60 | - [Morten Dahl](https://github.com/mortendahl) 61 | - [Jason Mancuso](https://github.com/jvmncs) 62 | - [Yann Dupis](https://github.com/yanndupis) 63 | - [Mitchell Lisle](https://github.com/mitchelllisle) 64 | 65 | Please note that the dpia-template.docx is downloaded from [the UK data protection authority (ICO)](https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/accountability-and-governance/guide-to-accountability-and-governance/accountability-and-governance/data-protection-impact-assessments/) and is meant to be used for educational purposes only. 66 | 67 | ### Update Log 68 | 69 | 23.08.2024: Main notebooks and examples for video course added. 70 | 20.02.2023: Main notebooks working and added reader-contributions folder.
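### Example Setup

A minimal sketch of the basic setup described in the Installation section above, assuming Python 3 with `venv` on a POSIX shell (adjust for `conda` or your platform):

```bash
python3 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt
jupyter notebook
```

Libraries with system-level dependencies (such as the Rust-based packages and Apache Spark mentioned above) still need to be installed following their own documentation.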
71 | -------------------------------------------------------------------------------- /data/alice_id.csv: -------------------------------------------------------------------------------- 1 | 1 2 | 0 3 | 8 4 | 2 5 | 5 6 | 4 7 | -------------------------------------------------------------------------------- /data/alice_keys.csv: -------------------------------------------------------------------------------- 1 | 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4ED9AF64772F84E446E,2 2 | 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B6EBF1D184A8D2A974,8 3 | 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC47D2A2377FE75430,1 4 | 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097FAD35C407B15E5D66C,4 5 | 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDAD3B7B6889E79FC29, 6 | 92E45747889DB6D24755537D369BF68D384DC9549F1D77B7E9A6039E2F,5 7 | F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363EE3A249A2378372519, 8 | F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A27C0AA4A8FA77A5C4D,0 9 | -------------------------------------------------------------------------------- /data/alice_keys.csv_metrics: -------------------------------------------------------------------------------- 1 | {"protocol_name":"private-id","partner_input_size":7,"publisher_input_size":6,"union_file_size":8} -------------------------------------------------------------------------------- /data/block-stats.csv: -------------------------------------------------------------------------------- 1 | statistic,name,count,median,mean 2 | A1,total-population,7,30,38 3 | A2,non-smoker,4,30,33 4 | B2,smoker,3,30,44 5 | C2,unemployed,4,51,48 6 | D2,employed,3,24,24 7 | A3,single-adults,,, 8 | B3,married-adults,4,51,54 9 | A4,unemployed-non-smoker,3,36,37 10 | -------------------------------------------------------------------------------- /data/bob_id.csv: -------------------------------------------------------------------------------- 1 | 2 2 | 9 3 | 1 4 | 7 5 | 0 6 | 5 7 | 8 8 | -------------------------------------------------------------------------------- /data/bob_keys.csv: -------------------------------------------------------------------------------- 1 | 144418D81867C352A318B3C8BE1FC84BE959FC51FADF4ED9AF64772F84E446E,2 2 | 5CDC4E5348AA5E4DECC5F0AAB4DFB73D09C2BD4E4ED28B6EBF1D184A8D2A974,8 3 | 72A1523C8A9575605E886810AAB545E9957EB22DE5CCBC47D2A2377FE75430,1 4 | 72F5125B88854DDAF88DA9D225A348AC8B4E3DCE6F097FAD35C407B15E5D66C, 5 | 8A623CCEE51534754EAE9D65F66ADE66BF51E241783FDAD3B7B6889E79FC29,7 6 | 92E45747889DB6D24755537D369BF68D384DC9549F1D77B7E9A6039E2F,5 7 | F225409B2CBC3F22CD8EBCACDE75447816E81BD9DB363EE3A249A2378372519,9 8 | F438B71D793E7CD9C2A6A0DF5B0AE501F84833525237A27C0AA4A8FA77A5C4D,0 9 | -------------------------------------------------------------------------------- /data/bob_keys.csv_metrics: -------------------------------------------------------------------------------- 1 | {"protocol_name":"private-id-multi-key","partner_input_size":7,"publisher_input_size":6,"union_file_size":8} -------------------------------------------------------------------------------- /data/database.csv: -------------------------------------------------------------------------------- 1 | name,age,married,smoker,employed 2 | Sara Gray,8,False,False,False 3 | Joseph Collins,18,False,True,True 4 | Vincent Porter,24,False,False,True 5 | Tiffany Brown,30,True,True,True 6 | Brenda Small,36,True,False,False 7 | Dr. 
Tina Ayala,66,True,False,False 8 | Rodney Gonzalez,84,True,True,False -------------------------------------------------------------------------------- /data/user_id_available_a.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/practical-data-privacy/cf2bf66ad615a60ee321e23e86e1f5d26261f175/data/user_id_available_a.npy -------------------------------------------------------------------------------- /data/user_id_available_b.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/practical-data-privacy/cf2bf66ad615a60ee321e23e86e1f5d26261f175/data/user_id_available_b.npy -------------------------------------------------------------------------------- /data/x_a.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/practical-data-privacy/cf2bf66ad615a60ee321e23e86e1f5d26261f175/data/x_a.npy -------------------------------------------------------------------------------- /data/x_b.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/practical-data-privacy/cf2bf66ad615a60ee321e23e86e1f5d26261f175/data/x_b.npy -------------------------------------------------------------------------------- /dpia-template.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/practical-data-privacy/cf2bf66ad615a60ee321e23e86e1f5d26261f175/dpia-template.docx -------------------------------------------------------------------------------- /flower/client.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | from collections import OrderedDict 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | import torchvision.transforms as transforms 13 | from torch.utils.data import DataLoader 14 | from torchvision.datasets import VisionDataset, Food101 15 | from typing import List, Tuple 16 | from flwr.common import Metrics 17 | 18 | import flwr as fl 19 | import numpy as np 20 | 21 | 22 | # In[2]: 23 | 24 | 25 | import os 26 | import multiprocessing 27 | 28 | data_path = os.path.join(os.getcwd(),'data', 'food-101') 29 | cpu_count = multiprocessing.cpu_count() - 1 # set as you like! 
30 | #device = torch.device("mps") #CHANGE THIS TO FIT YOUR DEVICE PLEASE :D (maybe under fits) 31 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 32 | 33 | 34 | # In[3]: 35 | 36 | 37 | pool_size = 4 # number of dataset partitions (= number of total clients) 38 | 39 | client_resources = { 40 | "num_cpus": cpu_count 41 | } # each client will get allocated this many CPUs 42 | 43 | transformations = transforms.Compose([ 44 | transforms.Resize((32,32)), 45 | transforms.ToTensor(), 46 | ]) 47 | 48 | 49 | # Download Dataset 50 | try: 51 | train_data = Food101(data_path, transform=transformations) 52 | except RuntimeError: # dataset not found locally, so download it 53 | train_data = Food101(data_path, transform=transformations, download=True) 54 | test_data = Food101(data_path, split='test', transform=transformations) 55 | # Dirichlet-distributed partition sizes (computed but not applied below: each client currently trains on the full dataset) 56 | lengths = [] 57 | while sum(lengths) != len(train_data): 58 | lengths = [round(x) for x in np.random.dirichlet( 59 | np.ones(pool_size),size=1)[0] * len(train_data)] 60 | 61 | trainloader = DataLoader(train_data, batch_size=32, shuffle=True) 62 | testloader = DataLoader(test_data, batch_size=32) 63 | num_examples = {"trainset" : len(train_data), "testset" : len(test_data)} 64 | 65 | 66 | # In[4]: 67 | 68 | 69 | # borrowed from Pytorch quickstart example 70 | def train(net, trainloader, epochs, device: str): 71 | """Train the network on the training set.""" 72 | criterion = torch.nn.CrossEntropyLoss(ignore_index=1) 73 | optimizer = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.9) 74 | net.train() 75 | for _ in range(epochs): 76 | for images, labels in trainloader: 77 | images, labels = images.to(device), labels.to(device) 78 | optimizer.zero_grad() 79 | loss = criterion(net(images), labels) 80 | loss.backward() 81 | optimizer.step() 82 | 83 | 84 | # In[5]: 85 | 86 | 87 | # borrowed from Pytorch quickstart example 88 | def test(net, testloader, device: str): 89 | """Validate the network on the entire test set.""" 90 | criterion = torch.nn.CrossEntropyLoss() 91 | correct, total, loss = 0, 0, 0.0 92 | net.eval() 93 | with torch.no_grad(): 94 | for batch in testloader: 95 | images, labels = batch[0].to(device), batch[1].to(device) 96 | outputs = net(images) 97 | loss += criterion(outputs, labels).item() 98 | _, predicted = torch.max(outputs.data, 1) 99 | total += labels.size(0) 100 | correct += (predicted == labels).sum().item() 101 | accuracy = correct / total 102 | print("Loss: %f, Accuracy: %f" % (loss, accuracy)) 103 | return loss, accuracy 104 | 105 | 106 | # In[6]: 107 | 108 | 109 | class Net(nn.Module): 110 | def __init__(self) -> None: 111 | super(Net, self).__init__() 112 | self.conv1 = nn.Conv2d(3, 6, 5) 113 | self.pool = nn.MaxPool2d(2, 2) 114 | self.conv2 = nn.Conv2d(6, 16, 5) 115 | self.fc1 = nn.Linear(16 * 5 * 5, 120) 116 | self.fc2 = nn.Linear(120, 84) 117 | self.fc3 = nn.Linear(84, 101) 118 | 119 | def forward(self, x: torch.Tensor) -> torch.Tensor: 120 | x = self.pool(F.relu(self.conv1(x))) 121 | x = self.pool(F.relu(self.conv2(x))) 122 | x = x.view(-1, 16 * 5 * 5) 123 | x = F.relu(self.fc1(x)) 124 | x = F.relu(self.fc2(x)) 125 | x = self.fc3(x) 126 | return x 127 | 128 | # Load model and data 129 | net = Net().to(device) 130 | 131 | 132 | # In[7]: 133 | 134 | 135 | class FlowerClient(fl.client.NumPyClient): 136 | 137 | def get_parameters(self, config): 138 | return [val.cpu().numpy() for _, val in net.state_dict().items()] 139 | 140 | def set_parameters(self, parameters): 141 | params_dict = zip(net.state_dict().keys(), parameters) 142 | state_dict = OrderedDict({k: torch.tensor(v) for k, v
in params_dict}) 143 | net.load_state_dict(state_dict, strict=True) 144 | 145 | def fit(self, parameters, config): 146 | self.set_parameters(parameters) 147 | train(net, trainloader, 3, device) 148 | return self.get_parameters(config={}), num_examples["trainset"], {} 149 | 150 | def evaluate(self, parameters, config): 151 | self.set_parameters(parameters) 152 | loss, accuracy = test(net, testloader, device) 153 | return float(loss), num_examples["testset"], {"accuracy": float(accuracy)} 154 | 155 | 156 | # In[8]: 157 | 158 | 159 | len(test_data.classes) 160 | 161 | 162 | # ### Before you start training! 163 | # 164 | # - Make sure your device is properly set above to fit your compute. 165 | # - If you have made any changes to this script, download it as a python file and replace the flower/client.py file. 166 | # - Open a separate terminal and run `python flower/server.py`. 167 | # - Open 1-3 more terminals and run `python flower/client.py`. 168 | # - Then run the following cell to also run a client here and watch! :) 169 | # 170 | # If you want to change any of the model parameters, structure or even the splits on the data, you'll want to restart the server and clients. Have fun and experiment! 171 | 172 | # In[ ]: 173 | 174 | 175 | fl.client.start_numpy_client( 176 | server_address="127.0.0.1:8080", 177 | client=FlowerClient(), 178 | ) 179 | 180 | 181 | # ## Challenges 182 | # 183 | # - Adjust the fit and evaluate settings and see how the performance changes. 184 | # - Try out another [Flower tutorial](https://flower.dev/docs/quickstart-pytorch.html). 185 | # - Get a group of several folks together to try running flower in a distributed setup. Document your learnings and share in the reader-contributions! 186 | 187 | # In[ ]: 188 | 189 | 190 | 191 | 192 | -------------------------------------------------------------------------------- /flower/personalization_use_case.txt: -------------------------------------------------------------------------------- 1 | 2 | What if... 3 | 4 | I have normal groceries that I use every week and I want to pre-order every week? 5 | I am interested in particular types of foods, and want to know when new or old favorites come in? 6 | I have a particular dinner I'd like to try and I want to quickly compare the ingredients with inventory at my local store? 7 | I want to connect a recipe bookmark system and get recommendations when items are in stock at my local store? 8 | 9 | 10 | 11 | 1. Decide what data you might need for such a product at a high level. Categorize what you think is more sensitive or less. 12 | 2. Write up some user research questions around what data users might want to keep locally versus store centrally. 13 | - How might you find out what they are comfortable sharing and how? 14 | - How could you present a local-first option that might increase their comfort and reduce harms? 15 | 3. If you have time, interview a few friends or colleagues. 16 | - What did you find? 17 | - Were you surprised by the response? 18 | - Bonus: map a mental model of users and their data! 
19 | -------------------------------------------------------------------------------- /flower/server.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import flwr as fl 4 | from flwr.common import Metrics 5 | 6 | 7 | # Define metric aggregation function 8 | def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics: 9 | # Multiply accuracy of each client by number of examples used 10 | accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics] 11 | examples = [num_examples for num_examples, _ in metrics] 12 | 13 | # Aggregate and return custom metric (weighted average) 14 | return {"accuracy": sum(accuracies) / sum(examples)} 15 | 16 | 17 | # Define strategy 18 | strategy = fl.server.strategy.FedAvg(evaluate_metrics_aggregation_fn=weighted_average) 19 | 20 | # Start Flower server 21 | fl.server.start_server( 22 | server_address="0.0.0.0:8080", 23 | config=fl.server.ServerConfig(num_rounds=3), 24 | strategy=strategy, 25 | ) 26 | -------------------------------------------------------------------------------- /flower/video_course_client.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import Dict, List, Tuple, Optional 4 | 5 | import flwr as fl 6 | 7 | from flwr.common import ( 8 | Scalar, 9 | parameters_to_ndarrays, 10 | ) 11 | 12 | import numpy as np 13 | import pandas as pd 14 | from flwr_datasets import FederatedDataset 15 | 16 | # Define Flower client 17 | class FlowerClient(fl.client.NumPyClient): 18 | def __init__(self, X: Dict): 19 | self.list = X['list_items'] 20 | self.stores = X['store_location_preferences'] 21 | 22 | def fit( 23 | self, parameters: List[np.ndarray], config: Dict[str, str] 24 | ) -> Tuple[List[np.ndarray], int, Dict]: 25 | truncated_list = [np.asarray( 26 | [x['item_id'], x['quantity']] + self.stores[:2]) for x in self.list] 27 | return ( 28 | truncated_list, 29 | len(truncated_list), 30 | {}, 31 | ) 32 | 33 | def evaluate( 34 | self, parameters: List[np.ndarray], config: Dict[str, str] 35 | ) -> Optional[Tuple[float, int, Dict]]: 36 | item_ids = [x['item_id'] for x in self.list] 37 | forecast_df = pd.DataFrame(parameters, 38 | columns=['store_id', 'item_id', 'forecast_quantity']) 39 | personal_forecast = forecast_df.loc[ 40 | (forecast_df['store_id'].isin(self.stores)) & 41 | (forecast_df['item_id'].isin(item_ids)) ] 42 | 43 | total_score_per_store = personal_forecast.groupby('store_id')['forecast_quantity'].sum() 44 | print('personal forecast score', total_score_per_store) 45 | 46 | # we don't actually need this return value for our first goal, but it's left in for experimentation 47 | # Challenge: Is there a creative use for this metric? 48 | # Maybe a customer satisfaction score?? :)
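        # One possibility (a sketch, not part of the course code): report how much of
        # this client's list shows up in the forecast for its preferred stores, e.g.
        #   matched = personal_forecast['item_id'].nunique()
        #   satisfaction = matched / max(len(item_ids), 1)
        # and return that in the metrics dict below instead of {"foo": 0.1}.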
49 | return 0.1, 1, {"foo": 0.1} 50 | 51 | 52 | 53 | 54 | if __name__ == "__main__": 55 | N_CLIENTS = 5 56 | 57 | parser = argparse.ArgumentParser(description="Flower") 58 | parser.add_argument( 59 | "--partition-id", 60 | type=int, 61 | choices=range(0, N_CLIENTS), 62 | required=True, 63 | help="Specifies the partition id of artificially partitioned datasets.", 64 | ) 65 | args = parser.parse_args() 66 | partition_id = args.partition_id 67 | 68 | # Load the shopping list data 69 | with open('data/lists.json') as f: 70 | data = json.load(f) 71 | shopping_list = data[partition_id] 72 | # Start Flower client 73 | fl.client.start_client( 74 | server_address="127.0.0.1:8080", 75 | client=FlowerClient(shopping_list).to_client(), 76 | ) 77 | -------------------------------------------------------------------------------- /flower/video_course_run.sh: -------------------------------------------------------------------------------- 1 | echo "Starting server" 2 | python flower/video_course_server.py & 3 | sleep 3 # Sleep for 3s to give the server enough time to start 4 | 5 | for i in `seq 0 4`; do 6 | echo "Starting client $i" 7 | python flower/video_course_client.py --partition-id ${i} & 8 | done 9 | 10 | # This will allow you to use CTRL+C to stop all background processes 11 | trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM 12 | # Wait for all background processes to complete 13 | wait 14 | -------------------------------------------------------------------------------- /flower/video_course_server.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple, Union 2 | 3 | import flwr as fl 4 | import numpy as np 5 | import pandas as pd 6 | import itertools 7 | from flwr.common import ( 8 | EvaluateIns, 9 | EvaluateRes, 10 | FitIns, 11 | FitRes, 12 | Parameters, 13 | Scalar, 14 | ndarrays_to_parameters, 15 | parameters_to_ndarrays, 16 | ) 17 | from flwr.server.client_manager import ClientManager 18 | from flwr.server.client_proxy import ClientProxy 19 | from flwr.server.strategy import Strategy 20 | 21 | 22 | class FedAnalytics(Strategy): 23 | def initialize_parameters( 24 | self, client_manager: Optional[ClientManager] = None 25 | ) -> Optional[Parameters]: 26 | return None 27 | 28 | def configure_fit( 29 | self, server_round: int, parameters: Parameters, client_manager: ClientManager 30 | ) -> List[Tuple[ClientProxy, FitIns]]: 31 | config = {} 32 | fit_ins = FitIns(parameters, config) 33 | clients = client_manager.sample(num_clients=5, min_num_clients=5) 34 | return [(client, fit_ins) for client in clients] 35 | 36 | def aggregate_fit( 37 | self, 38 | server_round: int, 39 | results: List[Tuple[ClientProxy, FitRes]], 40 | failures: List[Union[Tuple[ClientProxy, FitRes], BaseException]], 41 | ) -> Tuple[Optional[Parameters], Dict[str, Scalar]]: 42 | # Get results from fit 43 | # Convert results 44 | 45 | values_aggregated = [ 46 | parameters_to_ndarrays(fit_res.parameters) for _, fit_res in results 47 | ] 48 | 49 | lists_df = pd.DataFrame(itertools.chain.from_iterable(values_aggregated), 50 | columns=['item_id', 'quantity', 'store_id', 'extra_store']) 51 | 52 | # some reshaping to make the math easier 53 | extended_df = lists_df[['item_id', 'quantity', 'extra_store']].copy() 54 | extended_df['store_id'] = extended_df['extra_store'] 55 | extended_df = extended_df.drop(['extra_store'], axis=1) 56 | lists_df = lists_df.drop(['extra_store'], axis=1) 57 | lists_df = pd.concat([lists_df, extended_df]) 58 | 59 | # starting math bit
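        # Negating the list quantities below lets a single groupby-sum compute the
        # forecast: for each (store_id, item_id), forecast_quantity equals current
        # inventory minus the total quantity requested across client shopping lists.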
60 | lists_df['quantity'] = lists_df['quantity'] * -1 # lists remove quantities 61 | inventory_df = pd.read_json('data/inventory.json') 62 | combined_df = pd.concat([inventory_df.drop(['category', 'item'], axis=1), lists_df]) 63 | forecast_df = combined_df.groupby(['store_id', 'item_id'], as_index=False).sum() 64 | 65 | return ndarrays_to_parameters(forecast_df.to_numpy()), {} 66 | 67 | def evaluate( 68 | self, server_round: int, parameters: Parameters 69 | ) -> Optional[Tuple[float, Dict[str, Scalar]]]: 70 | agg_hist = [arr for arr in parameters_to_ndarrays(parameters)] 71 | return 0, {"Aggregated forecast": agg_hist} 72 | 73 | def configure_evaluate( 74 | self, server_round: int, parameters: Parameters, client_manager: ClientManager 75 | ) -> List[Tuple[ClientProxy, EvaluateIns]]: 76 | eval_ins = EvaluateIns(parameters, {}) 77 | 78 | # Challenge: what if you just sent back the parts of the inventory 79 | # with low/medium/high labels so that you aren't revealing proprietary data? 80 | 81 | # Extra challenge: what if you only sent back data to clients that matched 82 | # their store list? 83 | 84 | clients = client_manager.sample(num_clients=5, min_num_clients=5) 85 | return [(client, eval_ins) for client in clients] 86 | 87 | def aggregate_evaluate( 88 | self, 89 | server_round: int, 90 | results: List[Tuple[ClientProxy, EvaluateRes]], 91 | failures: List[Union[Tuple[ClientProxy, EvaluateRes], BaseException]], 92 | ) -> Tuple[Optional[float], Dict[str, Scalar]]: 93 | return 0.1, {'test': 1} 94 | 95 | 96 | # Start Flower server 97 | fl.server.start_server( 98 | server_address="0.0.0.0:8080", 99 | config=fl.server.ServerConfig(num_rounds=1), 100 | strategy=FedAnalytics(), 101 | ) 102 | -------------------------------------------------------------------------------- /in-progress/05 - Differential Privacy Experiments (Work In Progress!).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "280b47fe", 6 | "metadata": {}, 7 | "source": [ 8 | "### Experimenting with Differential Privacy and Bayesian Reasoning\n", 9 | "\n", 10 | "This notebook explores how we can develop Bayesian priors when working with differential privacy. It is a work in progress!"
11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "4d951b7a", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import numpy as np\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "from math import e, exp, log, sqrt\n", 23 | "from scipy.stats import norm\n", 24 | "plt.style.use('ggplot')\n", 25 | "\n", 26 | "np.random.seed(42)\n", 27 | "%matplotlib inline" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 4, 33 | "id": "583342cc", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Function definitions\n", 38 | "\n", 39 | "gaussianSigmaAccuracy = 1e-3\n", 40 | "\n", 41 | "def delta_for_gaussian(sigma, l0_sensitivity, linf_sensitivity, epsilon):\n", 42 | " l2_sensitivity = linf_sensitivity * sqrt(l0_sensitivity)\n", 43 | " a = l2_sensitivity / (2*sigma)\n", 44 | " b = epsilon * sigma / l2_sensitivity\n", 45 | " c = exp(epsilon)\n", 46 | " \n", 47 | " if np.isinf(c): \n", 48 | " return 0\n", 49 | " if np.isinf(b):\n", 50 | " return 0\n", 51 | " return norm.cdf(a - b) - norm.cdf(-a - b)\n", 52 | "\n", 53 | "def sigma_for_gaussian(l0_sensitivity, linf_sensitivity, epsilon, delta):\n", 54 | " if delta >= 1:\n", 55 | " return 0\n", 56 | " lower_bound = 0 # ensure lower_bound is defined even if the first loop never runs\n", 57 | " l2_sensitivity = linf_sensitivity * sqrt(l0_sensitivity)\n", 58 | " upper_bound = l2_sensitivity\n", 59 | " \n", 60 | " while delta_for_gaussian(upper_bound, l0_sensitivity, linf_sensitivity, epsilon) > delta:\n", 61 | " lower_bound = upper_bound\n", 62 | " upper_bound = upper_bound * 2\n", 63 | " \n", 64 | " while upper_bound - lower_bound > gaussianSigmaAccuracy * lower_bound:\n", 65 | " middle = lower_bound * 0.5 + upper_bound * 0.5\n", 66 | " if delta_for_gaussian(middle, l0_sensitivity, linf_sensitivity, epsilon) > delta:\n", 67 | " lower_bound = middle\n", 68 | " else:\n", 69 | " upper_bound = middle\n", 70 | " \n", 71 | " return upper_bound\n", 72 | "\n", 73 | "def gaussian_dp_mechanism(value, epsilon, delta, linf_sensitivity, l0_sensitivity=1):\n", 74 | " # Please do not use this function in real life - it is susceptible to well known attacks\n", 75 | " # instead, use a well-known and audited open-source DP library \n", 76 | " orig_value = value\n", 77 | " gauss_scale = sigma_for_gaussian(l0_sensitivity, linf_sensitivity, epsilon, delta)\n", 78 | " value = np.random.normal(value, gauss_scale)\n", 79 | " print(\"Noise: {}\".format(value - orig_value))\n", 80 | " return value\n", 81 | "\n", 82 | "def laplace_dp_mechanism(value, epsilon, linf_sensitivity):\n", 83 | " # Please do not use this function in real life - it is susceptible to well known attacks\n", 84 | " # instead, use a well-known and audited open-source DP library \n", 85 | " orig_value = value\n", 86 | " value = np.random.laplace(value, linf_sensitivity/epsilon)\n", 87 | " #print(\"Noise: {}\".format(value - orig_value))\n", 88 | " return value" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 6, 94 | "id": "50de83a3", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "mid_level_age = 45\n", 99 | "mid_level_salary = 50000\n", 100 | "\n", 101 | "age_scale = 10 #scale represents one standard deviation\n", 102 | "salary_scale = 10000\n", 103 | "\n", 104 | "salaries = [round(np.random.normal(mid_level_salary,salary_scale)) for _ in range(1000)]\n", 105 | "ages = [round(np.random.normal(mid_level_age,age_scale)) for _ in range(1000)]" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "id": "5641a70f", 112 | "metadata": {}, 113 |
"outputs": [], 114 | "source": [ 115 | "#TODO: run experiments!" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "id": "1f66d1b5", 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "#TODO: data chart on relative error" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "78f5f0ee", 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "#TODO: Bayesian prior development" 136 | ] 137 | } 138 | ], 139 | "metadata": { 140 | "kernelspec": { 141 | "display_name": "Python 3 (ipykernel)", 142 | "language": "python", 143 | "name": "python3" 144 | }, 145 | "language_info": { 146 | "codemirror_mode": { 147 | "name": "ipython", 148 | "version": 3 149 | }, 150 | "file_extension": ".py", 151 | "mimetype": "text/x-python", 152 | "name": "python", 153 | "nbconvert_exporter": "python", 154 | "pygments_lexer": "ipython3", 155 | "version": "3.8.9" 156 | } 157 | }, 158 | "nbformat": 4, 159 | "nbformat_minor": 5 160 | } 161 | -------------------------------------------------------------------------------- /in-progress/Generating Example Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from datetime import datetime, timedelta\n", 12 | "from datafuzz.generators import DatasetGenerator\n", 13 | "\n", 14 | "generator = DatasetGenerator({\n", 15 | " 'output': 'pandas',\n", 16 | " 'schema': {\n", 17 | " 'order_number': range(1000),\n", 18 | " 'user_id': 'faker.user_name',\n", 19 | " 'email': 'faker.email',\n", 20 | " 'street_address': 'faker.street_address',\n", 21 | " 'city': 'faker.city',\n", 22 | " 'state': 'faker.state',\n", 23 | " 'num_items': range(1,10),\n", 24 | " 'total_price': range(10, 200, 4)\n", 25 | " },\n", 26 | " 'num_rows': 10000,\n", 27 | " 'start_time': datetime.now() - timedelta(days=20),\n", 28 | " 'output': 'dataset',\n", 29 | " 'increments': 'minutes',\n", 30 | "})\n", 31 | "\n", 32 | "generator.generate()\n", 33 | "\n", 34 | "order_dataset = generator.to_output()\n", 35 | "order_dataset.records.to_csv('../data/orders.csv', index=False)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from datetime import datetime, timedelta\n", 45 | "from datafuzz.generators import DatasetGenerator\n", 46 | "\n", 47 | "generator = DatasetGenerator({\n", 48 | " 'output': 'pandas',\n", 49 | " 'schema': {\n", 50 | " 'order_number': range(1000),\n", 51 | " 'inbound_uri': 'faker.slug',\n", 52 | " 'browser_agent': 'faker.user_agent',\n", 53 | " 'ip': 'faker.ipv4',\n", 54 | " },\n", 55 | " 'num_rows': 10000,\n", 56 | " 'start_time': datetime.now() - timedelta(days=20),\n", 57 | " 'output': 'dataset',\n", 58 | " 'increments': 'minutes',\n", 59 | "})\n", 60 | "\n", 61 | "generator.generate()\n", 62 | "\n", 63 | "browser_dataset = generator.to_output()\n", 64 | "browser_dataset.records.to_csv('../data/browsing.csv', index=False)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [] 73 | } 74 | ], 75 | "metadata": { 76 | "kernelspec": { 77 | "display_name": "Python 3", 78 | "language": "python", 79 | "name": "python3" 80 | }, 81 | "language_info": { 82 | "codemirror_mode": { 83 | "name": "ipython", 84 | "version": 3 85 | }, 86 | 
"file_extension": ".py", 87 | "mimetype": "text/x-python", 88 | "name": "python", 89 | "nbconvert_exporter": "python", 90 | "pygments_lexer": "ipython3", 91 | "version": "3.8.10" 92 | } 93 | }, 94 | "nbformat": 4, 95 | "nbformat_minor": 5 96 | } 97 | -------------------------------------------------------------------------------- /in-progress/Presidio in Pipelines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "ada3ec3a", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from presidio_analyzer import AnalyzerEngine\n", 19 | "\n", 20 | "# Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers\n", 21 | "analyzer = AnalyzerEngine()\n", 22 | "\n", 23 | "# Call analyzer to get results\n", 24 | "results = analyzer.analyze(text=\"My phone number is 212-555-5555\",\n", 25 | " entities=[\"PHONE_NUMBER\"],\n", 26 | " language='en')\n", 27 | "print(results)" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "8b361b0a", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [] 37 | } 38 | ], 39 | "metadata": { 40 | "kernelspec": { 41 | "display_name": "Python 3 (ipykernel)", 42 | "language": "python", 43 | "name": "python3" 44 | }, 45 | "language_info": { 46 | "codemirror_mode": { 47 | "name": "ipython", 48 | "version": 3 49 | }, 50 | "file_extension": ".py", 51 | "mimetype": "text/x-python", 52 | "name": "python", 53 | "nbconvert_exporter": "python", 54 | "pygments_lexer": "ipython3", 55 | "version": "3.8.9" 56 | } 57 | }, 58 | "nbformat": 4, 59 | "nbformat_minor": 5 60 | } 61 | -------------------------------------------------------------------------------- /order_summary_for_sharing_expecation_file.json: -------------------------------------------------------------------------------- 1 | {"expectation_suite_name": "default", "meta": {"great_expectations_version": "0.15.49"}, "data_asset_type": "Dataset", "ge_cloud_id": null, "expectations": [{"meta": {}, "expectation_type": "expect_column_values_to_be_between", "kwargs": {"column": "total_price", "min_value": 1500, "max_value": 27000}}]} -------------------------------------------------------------------------------- /reader-contributions/README.md: -------------------------------------------------------------------------------- 1 | # Practical Data Privacy: Reader Contributions 2 | 3 | These notebooks can be used separately from the book, as a workhop or self-study to learn about practical data privacy methods. They are contributed by users to help teach others and experiment with privacy and privacy technologies! 4 | 5 | ### How to Contribute 6 | 7 | I'm hoping this book and repository has inspired you to try out new libraries and tools related to data privacy. If you try something new out, please consider contributing your notebook! To make it easier for others, please ensure you: 8 | 9 | - Write a brief introduction to the concept or library shown in the notebook, including any links for folks to learn more. What will they learn? What does it show? 10 | - Installation requirements 11 | - Your name (if you'd like recognition) and any details should people want to reach out (optional!) 
12 | - Guide other readers through the notebook with occasional titles and markdown cells that take someone through the notebook when you cannot be there. 13 | - Add a Recommended Reading or Challenges section 14 | 15 | Feel free to send over Pull Requests once you've checked the above! 16 | 17 | Thank you for your work and contribution, and for helping others learn more about privacy! 18 | 19 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jupyter 2 | numpy 3 | scipy 4 | seaborn 5 | pandas 6 | requests 7 | faker 8 | ff3 9 | matplotlib 10 | great_expectations 11 | transformers 12 | datasets 13 | opacus 14 | flwr 15 | ray 16 | torchvision 17 | -------------------------------------------------------------------------------- /solutions/filter_bounds.py: -------------------------------------------------------------------------------- 1 | def filter_bounds(value, lower_bound, upper_bound): 2 | if value < lower_bound: 3 | return lower_bound 4 | elif value > upper_bound: 5 | return upper_bound 6 | return value 7 | -------------------------------------------------------------------------------- /solutions/laplace_dp.py: -------------------------------------------------------------------------------- 1 | def laplace_dp_mechanism(value, epsilon, sensitivity=1): 2 | # Please do not use this function, ever :) 3 | orig_value = value 4 | value = np.random.laplace(value, sensitivity/epsilon) # now you see why the 1 was a poor choice! 5 | print("Noise: {}".format(value - orig_value)) 6 | return value 7 | -------------------------------------------------------------------------------- /solutions/lockpick.py: -------------------------------------------------------------------------------- 1 | decode('supa_secrets_for_you', b'4N7TycDY0dbfzujb') 2 | -------------------------------------------------------------------------------- /solutions/masked_pseudonym.py: -------------------------------------------------------------------------------- 1 | df['pseudonym'] = df['username'].map( 2 | lambda x: fakes.user_name()) 3 | df['pseudonym'].head() 4 | -------------------------------------------------------------------------------- /solutions/multiply.py: -------------------------------------------------------------------------------- 1 | def multiply(share1, share2): 2 | return (share1 * share2) % Q 3 | -------------------------------------------------------------------------------- /solutions/pad_text.py: -------------------------------------------------------------------------------- 1 | def add_padding_and_encrypt(cipher, username): 2 | if len(username) < 4: 3 | username += "X" * (4-len(username)) 4 | return cipher.encrypt(username) 5 | -------------------------------------------------------------------------------- /solutions/proper_encoding.py: -------------------------------------------------------------------------------- 1 | hasher = blake2b() 2 | hasher.update(username.encode('utf-8')) 3 | hasher.hexdigest() 4 | -------------------------------------------------------------------------------- /solutions/subtraction.py: -------------------------------------------------------------------------------- 1 | (sum(shares) % Q) - Q 2 | --------------------------------------------------------------------------------