├── code ├── requirements.txt └── inference.py ├── assets └── presidio.gif ├── README.md ├── .gitignore └── sagemaker-notebook.ipynb /code/requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | presidio-analyzer 3 | spacy 4 | transformers 5 | presidio-anonymizer 6 | -------------------------------------------------------------------------------- /assets/presidio.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/philschmid/advanced-pii-huggingface-sagemaker/HEAD/assets/presidio.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced PII detection and anonymization with Hugging Face Transformers and Amazon SageMaker 2 | 3 | 4 | Repository: [philschmid/advanced-pii-huggingface-sagemaker](https://github.com/philschmid/advanced-pii-huggingface-sagemaker) 5 | 6 | Personally identifiable information (PII) is any data that could potentially identify a specific individual, i.e. data that can be used to distinguish one person from another. Below are a few examples of PII: 7 | 8 | - Name 9 | - Address 10 | - Date of birth 11 | - Telephone number 12 | - Credit Card number 13 | 14 | Protecting PII is essential for personal privacy, data privacy, data protection, information privacy, and information security. With just a few bits of an individual's personal information, thieves can create false accounts in the person's name, incur debt, create a falsified passport or sell a person's identity to a criminal. 15 | 16 | Transformer models are changing the world of machine learning, starting with natural language processing (NLP), and now, with audio and computer vision. Hugging Face’s mission is to democratize good machine learning and give anyone the opportunity to use these new state-of-the-art machine learning models. 17 | 18 | Models like BERT, RoBERTa, T5, and GPT-2 have captured the NLP space and are achieving state-of-the-art results across almost all NLP tasks, including text-classification, question-answering, and token-classification. 19 | 20 | --- 21 | 22 | In this blog, you will learn how to use state-of-the-art Transformers models to recognize, detect and anonymize PII using Hugging Face Transformers, Presidio & Amazon SageMaker. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ -------------------------------------------------------------------------------- /code/inference.py: -------------------------------------------------------------------------------- 1 | 2 | from presidio_anonymizer import AnonymizerEngine 3 | from presidio_analyzer import AnalyzerEngine 4 | from typing import List 5 | 6 | from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult 7 | from presidio_analyzer.nlp_engine import NlpArtifacts 8 | from transformers import pipeline 9 | 10 | # load spacy model -> workaround 11 | import os 12 | os.system("spacy download en_core_web_lg") 13 | 14 | # list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities 15 | DEFAULT_ANOYNM_ENTITIES = [ 16 | "CREDIT_CARD", 17 | "CRYPTO", 18 | "DATE_TIME", 19 | "EMAIL_ADDRESS", 20 | "IBAN_CODE", 21 | "IP_ADDRESS", 22 | "NRP", 23 | "LOCATION", 24 | "PERSON", 25 | "PHONE_NUMBER", 26 | "MEDICAL_LICENSE", 27 | "URL", 28 | "ORGANIZATION" 29 | ] 30 | 31 | # init anonymize engine 32 | engine = AnonymizerEngine() 33 | 34 | class HFTransformersRecognizer(EntityRecognizer): 35 | def __init__( 36 | self, 37 | model_id_or_path=None, 38 | aggregation_strategy="simple", 39 | supported_language="en", 40 | ignore_labels=["O", "MISC"], 41 | ): 42 | # inits transformers pipeline for given model or path 43 | self.pipeline = pipeline( 44 | "token-classification", model=model_id_or_path, aggregation_strategy=aggregation_strategy, ignore_labels=ignore_labels 45 | ) 46 | # map labels to presidio labels 47 | self.label2presidio = { 48 | "PER": "PERSON", 49 | "LOC": "LOCATION", 50 | "ORG": "ORGANIZATION", 51 | } 52 | 53 | # passes entities from model into parent class 54 | super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language) 55 | 56 | def load(self) -> None: 57 | """No loading is required.""" 58 | pass 59 | 60 | def analyze( 61 | self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None 62 | ) -> List[RecognizerResult]: 63 | """ 64 | Extracts entities using Transformers pipeline 65 | """ 66 | results = [] 67 | 68 | # keep max sequence length in mind 69 | predicted_entities = self.pipeline(text) 70 | if len(predicted_entities) > 0: 71 | for e in predicted_entities: 72 | converted_entity = self.label2presidio[e["entity_group"]] 73 | if entities is None or converted_entity in entities: 74 | results.append( 75 | RecognizerResult( 76 | entity_type=converted_entity, start=e["start"], end=e["end"], score=e["score"] 77 | ) 78 | ) 79 | return results 80 | 81 | 82 | def model_fn(model_dir): 83 | transformers_recognizer = HFTransformersRecognizer(model_dir) 84 | # Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers 85 | analyzer = AnalyzerEngine() 86 | analyzer.registry.add_recognizer(transformers_recognizer) 87 | return analyzer 88 | 89 | 90 | def predict_fn(data, analyzer): 91 | sentences = data.pop("inputs", data) 92 | if "parameters" in data: 93 | anonymization_entities = data["parameters"].get("entities", DEFAULT_ANOYNM_ENTITIES) 94 | anonymize_text = data["parameters"].get("anonymize", False) 95 | else: 96 | anonymization_entities = DEFAULT_ANOYNM_ENTITIES 97 | anonymize_text = False 98 | 99 | # identify entities 100 | results = analyzer.analyze(text=sentences, entities=anonymization_entities, language="en") 101 | # anonymize text 102 | if anonymize_text: 103 | result = engine.anonymize(text=sentences, analyzer_results=results) 104 | return {"anonymized": result.text} 105 | 106 | return {"found": [entity.to_dict() for entity in results]} 107 | --------------------------------------------------------------------------------
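The `predict_fn` above expects a JSON payload with an `inputs` string and an optional `parameters` object (`entities` to restrict detection, `anonymize` to return redacted text instead of the raw findings). Below is a minimal local smoke test of this handler; it is not part of the repository and assumes the dependencies from `code/requirements.txt` are installed and that a token-classification model has been downloaded locally (the path `./roberta-large-ner-english` is only an illustrative placeholder).

```python
# Hypothetical local smoke test for code/inference.py (not part of the original repo).
# Assumes it is run from inside the code/ directory, with presidio-analyzer,
# presidio-anonymizer, spacy (en_core_web_lg) and transformers installed, and a NER
# model cloned to ./roberta-large-ner-english.
from inference import model_fn, predict_fn

# build the AnalyzerEngine with the HFTransformersRecognizer registered
analyzer = model_fn("./roberta-large-ner-english")  # illustrative local model path

# mirrors the JSON body the SageMaker endpoint receives after deserialization
payload = {
    "inputs": "My name is David Johnson and I live in Maine.",
    "parameters": {
        "entities": ["PERSON", "LOCATION"],  # optional: limit which entities are returned
        "anonymize": True,                   # optional: return anonymized text
    },
}

print(predict_fn(payload, analyzer))
# with Presidio's default "replace" operator this prints something like:
# {'anonymized': 'My name is <PERSON> and I live in <LOCATION>.'}
```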
/sagemaker-notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "788c45c8", 6 | "metadata": {}, 7 | "source": [ 8 | "# Advanced PII detection and anonymization with Hugging Face Transformers and Amazon SageMaker\n", 9 | "\n", 10 | "In this blog, you will learn how to use state-of-the-art Transformers models to recognize, detect and anonymize PII using Hugging Face Transformers, Presidio & Amazon SageMaker.\n", 11 | "\n", 12 | "### What is Presidio?\n", 13 | "\n", 14 | "_Presidio (Origin from Latin praesidium ‘protection, garrison’) helps to ensure sensitive data is properly managed and governed. It provides fast identification and anonymization modules for private entities in text and images such as credit card numbers, names, locations, social security numbers, bitcoin wallets, US phone numbers, financial data and more._ - [Documentation](https://microsoft.github.io/presidio/).\n", 15 | "\n", 16 | "\n", 17 | "![presidio-gif](assets/presidio.gif) \n", 18 | "_- From Presidio [Documentation](https://microsoft.github.io/presidio/)_\n", 19 | "\n", 20 | "By default, Presidio uses `spaCy` for PII identification and extraction. In this example, we are going to replace `spaCy` with a Hugging Face Transformer model to perform PII detection and anonymization. \n", 21 | "Out of the box, Presidio already supports [24 PII entities](https://microsoft.github.io/presidio/supported_entities/), including CREDIT_CARD, IBAN_CODE, EMAIL_ADDRESS, US_BANK_NUMBER, US_ITIN... \n", 22 | "We are going to extend these 24 available entities with transformers to include LOCATION, PERSON & ORGANIZATION. But it is possible to use any \"entity\" extracted by the transformers model. \n", 23 | "\n", 24 | "\n", 25 | "You will learn how to: \n", 26 | "\n", 27 | "1. Setup Environment and Permissions\n", 28 | "2. Create a new `transformers` based EntityRecognizer\n", 29 | "3. Create a custom `inference.py` including the `EntityRecognizer`\n", 30 | "4. Deploy the PII service to Amazon SageMaker\n", 31 | "5. Request and customization of requests\n", 32 | "\n", 33 | "Let's get started! 🚀\n", 34 | "\n", 35 | "---\n", 36 | "\n", 37 | "*If you are going to use SageMaker in a local environment (not SageMaker Studio or Notebook Instances), you need access to an IAM Role with the required permissions for SageMaker. You can find more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).*" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "5237d478", 43 | "metadata": {}, 44 | "source": [ 45 | "## 1. Setup Environment and Permissions\n", 46 | "\n", 47 | "_*Note:* we only install the required libraries from Hugging Face and AWS. 
You also need PyTorch or Tensorflow, if you haven´t it installed_" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "69c59d90", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "%pip install sagemaker --upgrade\n", 58 | "import sagemaker\n", 59 | "\n", 60 | "assert sagemaker.__version__ >= \"2.75.0\"" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "ce0ef431", 66 | "metadata": {}, 67 | "source": [ 68 | "Install `git` and `git-lfs`" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "96d8dfea", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# For notebook instances (Amazon Linux)\n", 79 | "!sudo yum update -y \n", 80 | "!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.rpm.sh | sudo bash\n", 81 | "!sudo yum install git-lfs git -y\n", 82 | "# For other environments (Ubuntu)\n", 83 | "!sudo apt-get update -y \n", 84 | "!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash\n", 85 | "!sudo apt-get install git-lfs git -y" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "9e4386d9", 91 | "metadata": {}, 92 | "source": [ 93 | "### Permissions\n", 94 | "\n", 95 | "_If you are going to use Sagemaker in a local environment (not SageMaker Studio or Notebook Instances). You need access to an IAM Role with the required permissions for Sagemaker. You can find [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html) more about it._" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 2, 101 | "id": "1c22e8d5", 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "name": "stderr", 106 | "output_type": "stream", 107 | "text": [ 108 | "Couldn't call 'get_role' to get Role ARN from role name philippschmid to get Role path.\n" 109 | ] 110 | }, 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "sagemaker role arn: arn:aws:iam::558105141721:role/sagemaker_execution_role\n", 116 | "sagemaker bucket: sagemaker-us-east-1-558105141721\n", 117 | "sagemaker session region: us-east-1\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "import sagemaker\n", 123 | "import boto3\n", 124 | "sess = sagemaker.Session()\n", 125 | "# sagemaker session bucket -> used for uploading data, models and logs\n", 126 | "# sagemaker will automatically create this bucket if it not exists\n", 127 | "sagemaker_session_bucket=None\n", 128 | "if sagemaker_session_bucket is None and sess is not None:\n", 129 | " # set to default bucket if a bucket name is not given\n", 130 | " sagemaker_session_bucket = sess.default_bucket()\n", 131 | "\n", 132 | "try:\n", 133 | " role = sagemaker.get_execution_role()\n", 134 | "except ValueError:\n", 135 | " iam = boto3.client('iam')\n", 136 | " role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']\n", 137 | "\n", 138 | "sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)\n", 139 | "\n", 140 | "print(f\"sagemaker role arn: {role}\")\n", 141 | "print(f\"sagemaker bucket: {sess.default_bucket()}\")\n", 142 | "print(f\"sagemaker session region: {sess.boto_region_name}\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "id": "39967284", 148 | "metadata": {}, 149 | "source": [ 150 | "## 2. Create a new `transformers` based EntityRecognizer\n", 151 | "\n", 152 | "Presidio can be extended to support the detection of new types of PII entities and to support additional languages. 
These PII recognizers could be added **via code** or **ad-hoc as part of the request**.\n", 153 | "\n", 154 | "- The `EntityRecognizer` is an abstract class for all recognizers.\n", 155 | "- The `RemoteRecognizer` is an abstract class for calling external PII detectors. See more info [here](https://microsoft.github.io/presidio/analyzer/adding_recognizers/#creating-a-remote-recognizer).\n", 156 | "- The abstract class `LocalRecognizer` is implemented by all recognizers running within the Presidio-analyzer process.\n", 157 | "- The `PatternRecognizer` is a class for supporting regex and deny-list-based recognition logic, including validation (e.g., with checksum) and context support. See an example [here](https://microsoft.github.io/presidio/analyzer/adding_recognizers/#simple-example).\n", 158 | "\n", 159 | "For simple recognizers based on regular expressions or deny-lists, we can leverage the provided `PatternRecognizer`:\n", 160 | "\n", 161 | "```python\n", 162 | "from presidio_analyzer import PatternRecognizer\n", 163 | "titles_recognizer = PatternRecognizer(supported_entity=\"TITLE\",\n", 164 | " deny_list=[\"Mr.\",\"Mrs.\",\"Miss\"])\n", 165 | "```\n", 166 | "\n", 167 | "To create a Hugging Face Transformer recognizer you have to create a new class deriving the `EntityRecognizer` and implementing a `load` and `analyze` method. \n", 168 | "\n", 169 | "For this example the `__init__` method will be used to \"load\" and our model using the `transformers.pipeline` for `token-classification`. \n", 170 | "If you want to learn more how you can customize/create recognizer you can check out the [documentation](https://microsoft.github.io/presidio/analyzer/adding_recognizers/#extending-the-analyzer-for-additional-pii-entities).\n", 171 | "\n", 172 | "\n", 173 | "```python\n", 174 | " class TransformersRecognizer(EntityRecognizer): \n", 175 | " def __init__(self,model_id_or_path=None,aggregation_strategy=\"average\",supported_language=\"en\",ignore_labels=[\"O\",\"MISC\"]):\n", 176 | " # inits transformers pipeline for given mode or path\n", 177 | " self.pipeline = pipeline(\"token-classification\",model=model_id_or_path,aggregation_strategy=\"average\",ignore_labels=ignore_labels)\n", 178 | " # map labels to presidio labels\n", 179 | " self.label2presidio={\n", 180 | " \"PER\": \"PERSON\",\n", 181 | " \"LOC\": \"LOCATION\",\n", 182 | " \"ORG\": \"ORGANIZATION\",\n", 183 | " }\n", 184 | "\n", 185 | " # passes entities from model into parent class\n", 186 | " super().__init__(supported_entities=list(self.label2presidio.values()),supported_language=supported_language)\n", 187 | "\n", 188 | " def load(self) -> None:\n", 189 | " \"\"\"No loading is required.\"\"\"\n", 190 | " pass\n", 191 | "\n", 192 | " def analyze(\n", 193 | " self, text: str, entities: List[str]=None, nlp_artifacts: NlpArtifacts=None\n", 194 | " ) -> List[RecognizerResult]:\n", 195 | " \"\"\"\n", 196 | " Extracts entities using Transformers pipeline\n", 197 | " \"\"\"\n", 198 | " results = []\n", 199 | " \n", 200 | " # keep max sequence length in mind\n", 201 | " predicted_entities = self.pipeline(text)\n", 202 | " if len(predicted_entities) >0:\n", 203 | " for e in predicted_entities:\n", 204 | " converted_entity = self.label2presidio[e[\"entity_group\"]]\n", 205 | " if converted_entity in entities or entities is None:\n", 206 | " results.append(\n", 207 | " RecognizerResult(\n", 208 | " entity_type=converted_entity,\n", 209 | " start=e[\"start\"],\n", 210 | " end=e[\"end\"],\n", 211 | " score=e[\"score\"]\n", 212 | " )\n", 213 | 
" )\n", 214 | " return results\n", 215 | "```" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "id": "755fd6da", 221 | "metadata": {}, 222 | "source": [ 223 | "## 3. Create a custom `inference.py` including the `EntityRecognizer`\n", 224 | "\n", 225 | "To use the custom inference script, you need to create an `inference.py` script. In this example, we are going to overwrite the `model_fn` to load our `HFTransformersRecognizer` correctly and the `predict_fn` to run the PII analysis.\n", 226 | "\n", 227 | "Additionally we need to provide a `requirements.txt` in the `code/` directory to install `presidio` and other required dependencies" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 3, 233 | "id": "cc01de44", 234 | "metadata": {}, 235 | "outputs": [ 236 | { 237 | "name": "stdout", 238 | "output_type": "stream", 239 | "text": [ 240 | "mkdir: code: File exists\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "!mkdir code" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 4, 251 | "id": "3ce41529", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "Overwriting code/inference.py\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "%%writefile code/inference.py\n", 264 | "\n", 265 | "from presidio_anonymizer import AnonymizerEngine\n", 266 | "from presidio_analyzer import AnalyzerEngine\n", 267 | "from typing import List\n", 268 | "\n", 269 | "from presidio_analyzer import AnalyzerEngine, EntityRecognizer, RecognizerResult\n", 270 | "from presidio_analyzer.nlp_engine import NlpArtifacts\n", 271 | "from transformers import pipeline\n", 272 | "\n", 273 | "# load spacy model -> workaround\n", 274 | "import os\n", 275 | "os.system(\"spacy download en_core_web_lg\")\n", 276 | "\n", 277 | "# list of entities: https://microsoft.github.io/presidio/supported_entities/#list-of-supported-entities\n", 278 | "DEFAULT_ANOYNM_ENTITIES = [\n", 279 | " \"CREDIT_CARD\",\n", 280 | " \"CRYPTO\",\n", 281 | " \"DATE_TIME\",\n", 282 | " \"EMAIL_ADDRESS\",\n", 283 | " \"IBAN_CODE\",\n", 284 | " \"IP_ADDRESS\",\n", 285 | " \"NRP\",\n", 286 | " \"LOCATION\",\n", 287 | " \"PERSON\",\n", 288 | " \"PHONE_NUMBER\",\n", 289 | " \"MEDICAL_LICENSE\",\n", 290 | " \"URL\",\n", 291 | " \"ORGANIZATION\"\n", 292 | "]\n", 293 | "\n", 294 | "# init anonymize engine\n", 295 | "engine = AnonymizerEngine()\n", 296 | "\n", 297 | "class HFTransformersRecognizer(EntityRecognizer):\n", 298 | " def __init__(\n", 299 | " self,\n", 300 | " model_id_or_path=None,\n", 301 | " aggregation_strategy=\"simple\",\n", 302 | " supported_language=\"en\",\n", 303 | " ignore_labels=[\"O\", \"MISC\"],\n", 304 | " ):\n", 305 | " # inits transformers pipeline for given mode or path\n", 306 | " self.pipeline = pipeline(\n", 307 | " \"token-classification\", model=model_id_or_path, aggregation_strategy=aggregation_strategy, ignore_labels=ignore_labels\n", 308 | " )\n", 309 | " # map labels to presidio labels\n", 310 | " self.label2presidio = {\n", 311 | " \"PER\": \"PERSON\",\n", 312 | " \"LOC\": \"LOCATION\",\n", 313 | " \"ORG\": \"ORGANIZATION\",\n", 314 | " }\n", 315 | "\n", 316 | " # passes entities from model into parent class\n", 317 | " super().__init__(supported_entities=list(self.label2presidio.values()), supported_language=supported_language)\n", 318 | "\n", 319 | " def load(self) -> None:\n", 320 | " \"\"\"No loading is required.\"\"\"\n", 321 | " pass\n", 322 | "\n", 323 | " def analyze(\n", 324 | " 
self, text: str, entities: List[str] = None, nlp_artifacts: NlpArtifacts = None\n", 325 | " ) -> List[RecognizerResult]:\n", 326 | " \"\"\"\n", 327 | " Extracts entities using Transformers pipeline\n", 328 | " \"\"\"\n", 329 | " results = []\n", 330 | "\n", 331 | " # keep max sequence length in mind\n", 332 | " predicted_entities = self.pipeline(text)\n", 333 | " if len(predicted_entities) > 0:\n", 334 | " for e in predicted_entities:\n", 335 | " converted_entity = self.label2presidio[e[\"entity_group\"]]\n", 336 | " if entities is None or converted_entity in entities:\n", 337 | " results.append(\n", 338 | " RecognizerResult(\n", 339 | " entity_type=converted_entity, start=e[\"start\"], end=e[\"end\"], score=e[\"score\"]\n", 340 | " )\n", 341 | " )\n", 342 | " return results\n", 343 | "\n", 344 | "\n", 345 | "def model_fn(model_dir):\n", 346 | " transformers_recognizer = HFTransformersRecognizer(model_dir)\n", 347 | " # Set up the engine, loads the NLP module (spaCy model by default) and other PII recognizers\n", 348 | " analyzer = AnalyzerEngine()\n", 349 | " analyzer.registry.add_recognizer(transformers_recognizer)\n", 350 | " return analyzer\n", 351 | "\n", 352 | "\n", 353 | "def predict_fn(data, analyzer):\n", 354 | " sentences = data.pop(\"inputs\", data)\n", 355 | " if \"parameters\" in data:\n", 356 | " anonymization_entities = data[\"parameters\"].get(\"entities\", DEFAULT_ANOYNM_ENTITIES)\n", 357 | " anonymize_text = data[\"parameters\"].get(\"anonymize\", False)\n", 358 | " else:\n", 359 | " anonymization_entities = DEFAULT_ANOYNM_ENTITIES\n", 360 | " anonymize_text = False\n", 361 | "\n", 362 | " # identify entities\n", 363 | " results = analyzer.analyze(text=sentences, entities=anonymization_entities, language=\"en\")\n", 364 | " # anonymize text\n", 365 | " if anonymize_text:\n", 366 | " result = engine.anonymize(text=sentences, analyzer_results=results)\n", 367 | " return {\"anonymized\": result.text}\n", 368 | "\n", 369 | " return {\"found\": [entity.to_dict() for entity in results]}" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 5, 375 | "id": "387a2fe9", 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "Overwriting code/requirements.txt\n" 383 | ] 384 | } 385 | ], 386 | "source": [ 387 | "%%writefile code/requirements.txt\n", 388 | "\n", 389 | "presidio-analyzer\n", 390 | "spacy\n", 391 | "transformers\n", 392 | "presidio-anonymizer" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "144d8ccb", 398 | "metadata": {}, 399 | "source": [ 400 | "## 4. Deploy the PII service to Amazon SageMaker\n", 401 | "\n", 402 | "Before you can deploy the PII service to Amazon SageMaker, you need to create a `model.tar.gz` with the inference script and model.\n", 403 | "You need to bundle the `inference.py` and all model artifacts, e.g. `pytorch_model.bin`, into a `model.tar.gz`. The `inference.py` script will be placed into a `code/` folder. 
We will use `git` and `git-lfs` to easily download our model from hf.co/models and upload it to Amazon S3 so we can use it when creating our SageMaker endpoint.\n", 404 | "\n", 405 | "As the base model for the recognizer the example will use [Jean-Baptiste/roberta-large-ner-english](https://huggingface.co/Jean-Baptiste/roberta-large-ner-english)" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 8, 411 | "id": "952983b5", 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "repository = \"Jean-Baptiste/roberta-large-ner-english\"\n", 416 | "model_id=repository.split(\"/\")[-1]\n", 417 | "s3_location=f\"s3://{sess.default_bucket()}/custom_inference/{model_id}/model.tar.gz\"" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "id": "374ff630", 423 | "metadata": {}, 424 | "source": [ 425 | "1. Download the model from hf.co/models with `git clone`." 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "id": "f8a134b7", 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "!git lfs install\n", 436 | "!git clone https://huggingface.co/$repository\n" 437 | ] 438 | }, 439 | { 440 | "cell_type": "markdown", 441 | "id": "09a6f330", 442 | "metadata": {}, 443 | "source": [ 444 | "2. copy `inference.py` into the `code/` directory of the model directory." 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 10, 450 | "id": "6146af09", 451 | "metadata": {}, 452 | "outputs": [ 453 | { 454 | "name": "stdout", 455 | "output_type": "stream", 456 | "text": [ 457 | "/Users/philipp/Projects/personal/blog/advanced-pii-huggingface-sagemaker\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "!cp -r code/ $model_id/code/" 463 | ] 464 | }, 465 | { 466 | "cell_type": "markdown", 467 | "id": "04e1395a", 468 | "metadata": {}, 469 | "source": [ 470 | "3. Create a `model.tar.gz` archive with all the model artifacts and the `inference.py` script.\n" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "id": "7dcfda24", 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "%cd $model_id\n", 481 | "!tar zcvf model.tar.gz *" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "id": "1c858560", 487 | "metadata": {}, 488 | "source": [ 489 | "4. Upload the `model.tar.gz` to Amazon S3:\n" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "a7dc7ec6", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "!aws s3 cp model.tar.gz $s3_location" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "id": "0a146346", 505 | "metadata": {}, 506 | "source": [ 507 | "After you uploaded the `model.tar.gz` archive to Amazon S3. You can create a custom `HuggingfaceModel` class. This class will be used to create and deploy our SageMaker endpoint." 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 13, 513 | "id": "a271a2e6", 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "----------!" 
521 | ] 522 | } 523 | ], 524 | "source": [ 525 | "from sagemaker.huggingface.model import HuggingFaceModel\n", 526 | "\n", 527 | "\n", 528 | "# create Hugging Face Model Class\n", 529 | "huggingface_model = HuggingFaceModel(\n", 530 | " model_data=s3_location, # path to your model and script\n", 531 | " role=role, # iam role with permissions to create an Endpoint\n", 532 | " transformers_version=\"4.17\", # transformers version used\n", 533 | " pytorch_version=\"1.10\", # pytorch version used\n", 534 | " py_version='py38', # python version used\n", 535 | ")\n", 536 | "\n", 537 | "# deploy the endpoint endpoint\n", 538 | "predictor = huggingface_model.deploy(\n", 539 | " initial_instance_count=1,\n", 540 | " instance_type=\"ml.g4dn.xlarge\"\n", 541 | " )" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "id": "b6b3812f", 547 | "metadata": {}, 548 | "source": [ 549 | "## 5. Request and customization of requests\n", 550 | "\n", 551 | "The `.deploy()` returns an `HuggingFacePredictor` object which can be used to request inference." 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": 20, 557 | "id": "45f06083", 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "payload=\"\"\"\n", 562 | "Hello, my name is David Johnson and I live in Maine.\n", 563 | "I work as a software engineer at Amazon. \n", 564 | "You can call me at (123) 456-7890.\n", 565 | "My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n", 566 | "\n", 567 | "On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.\n", 568 | "My passport: 191280342 and my phone number: (212) 555-1234.\n", 569 | "This is a valid International Bank Account Number: IL150120690000003111111. Can you please check the status on bank account 954567876544?\n", 570 | "Kate's social security number is 078-05-1126. Her driver license? 
it is 1234567A.\n", 571 | "\n", 572 | "\"\"\"" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "id": "f7a0baca", 578 | "metadata": {}, 579 | "source": [ 580 | "**Simple detection request**\n" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 14, 586 | "id": "dfdc9a79", 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "name": "stdout", 591 | "output_type": "stream", 592 | "text": [ 593 | "{'found': [{'entity_type': 'CREDIT_CARD', 'start': 120, 'end': 139, 'score': 1.0, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'CreditCardRecognizer'}}, {'entity_type': 'CRYPTO', 'start': 167, 'end': 201, 'score': 1.0, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'CryptoRecognizer'}}, {'entity_type': 'EMAIL_ADDRESS', 'start': 265, 'end': 283, 'score': 1.0, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'EmailRecognizer'}}, {'entity_type': 'IBAN_CODE', 'start': 421, 'end': 444, 'score': 1.0, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'IbanRecognizer'}}, {'entity_type': 'PERSON', 'start': 19, 'end': 32, 'score': 0.9997117519378662, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'LOCATION', 'start': 47, 'end': 52, 'score': 0.9993120431900024, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'PERSON', 'start': 508, 'end': 512, 'score': 0.9965325593948364, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 87, 'end': 93, 'score': 0.9888795614242554, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'IP_ADDRESS', 'start': 297, 'end': 308, 'score': 0.95, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'IpRecognizer'}}, {'entity_type': 'DATE_TIME', 'start': 207, 'end': 219, 'score': 0.85, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'SpacyRecognizer'}}, {'entity_type': 'PHONE_NUMBER', 'start': 354, 'end': 368, 'score': 0.75, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'PhoneRecognizer'}}, {'entity_type': 'PHONE_NUMBER', 'start': 541, 'end': 552, 'score': 0.75, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'PhoneRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 230, 'end': 239, 'score': 0.5814294815063477, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 274, 'end': 276, 'score': 0.5579692721366882, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'URL', 'start': 230, 'end': 243, 'score': 0.5, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'UrlRecognizer'}}, {'entity_type': 'URL', 'start': 270, 'end': 281, 'score': 0.5, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'UrlRecognizer'}}]}\n" 594 | ] 595 | } 596 | ], 597 | "source": [ 598 | "data = {\n", 599 | " \"inputs\": payload,\n", 600 | "}\n", 601 | "\n", 602 | "res = predictor.predict(data=data)\n", 603 | "print(res)\n" 604 | ] 605 | }, 606 | { 607 | "cell_type": "markdown", 608 | "id": "40dede63", 609 | "metadata": {}, 610 | "source": [ 611 | "**Detect only specific 
PII entities**" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 15, 617 | "id": "049a66cf", 618 | "metadata": {}, 619 | "outputs": [ 620 | { 621 | "name": "stdout", 622 | "output_type": "stream", 623 | "text": [ 624 | "{'found': [{'entity_type': 'PERSON', 'start': 19, 'end': 32, 'score': 0.9997117519378662, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'LOCATION', 'start': 47, 'end': 52, 'score': 0.9993120431900024, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'PERSON', 'start': 508, 'end': 512, 'score': 0.9965325593948364, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 87, 'end': 93, 'score': 0.9888795614242554, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 230, 'end': 239, 'score': 0.5814294815063477, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}, {'entity_type': 'ORGANIZATION', 'start': 274, 'end': 276, 'score': 0.5579692721366882, 'analysis_explanation': None, 'recognition_metadata': {'recognizer_name': 'HFTransformersRecognizer'}}]}\n" 625 | ] 626 | } 627 | ], 628 | "source": [ 629 | "data = {\n", 630 | " \"inputs\": payload,\n", 631 | " \"parameters\": {\n", 632 | " \"entities\":[\"PERSON\",\"LOCATION\",\"ORGANIZATION\"]\n", 633 | " }\n", 634 | "}\n", 635 | "\n", 636 | "res = predictor.predict(data=data)\n", 637 | "print(res)\n" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "id": "494962a2", 643 | "metadata": {}, 644 | "source": [ 645 | "**Anonzymizing PII entities**" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": 21, 651 | "id": "5b265224", 652 | "metadata": {}, 653 | "outputs": [ 654 | { 655 | "name": "stdout", 656 | "output_type": "stream", 657 | "text": [ 658 | "\n", 659 | "Hello, my name is and I live in .\n", 660 | "I work as a software engineer at . \n", 661 | "You can call me at .\n", 662 | "My credit card number is and my crypto wallet id is .\n", 663 | "\n", 664 | "On I visited and sent an email to , from the IP .\n", 665 | "My passport: 191280342 and my phone number: .\n", 666 | "This is a valid International Bank Account Number: . Can you please check the status on bank account 954567876544?\n", 667 | "'s social security number is . Her driver license? 
it is 1234567A.\n", 668 | "\n", 669 | "\n" 670 | ] 671 | } 672 | ], 673 | "source": [ 674 | "data = {\n", 675 | " \"inputs\": payload,\n", 676 | " \"parameters\": {\n", 677 | " \"anonymize\": True,\n", 678 | " }\n", 679 | "}\n", 680 | "\n", 681 | "res = predictor.predict(data=data)\n", 682 | "print(res[\"anonymized\"])\n" 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "id": "e5db30a1", 688 | "metadata": {}, 689 | "source": [ 690 | "**Anonzymizing only specific PII entities**" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 19, 696 | "id": "eae2fb23", 697 | "metadata": {}, 698 | "outputs": [ 699 | { 700 | "name": "stdout", 701 | "output_type": "stream", 702 | "text": [ 703 | "\n", 704 | "Hello, my name is and I live in .\n", 705 | "I work as a software engineer at Amazon.\n", 706 | "My credit card number is 4095-2609-9393-4932 and my crypto wallet id is 16Yeky6GMjeNkAiNcBY7ZhrLoMSgg1BoyZ.\n", 707 | "\n", 708 | "On September 18 I visited microsoft.com and sent an email to test@presidio.site, from the IP 192.168.0.1.\n", 709 | "My passport: 191280342 and my phone number: (212) 555-1234.\n", 710 | "This is a valid International Bank Account Number: IL150120690000003111111. Can you please check the status on bank account 954567876544?\n", 711 | "'s social security number is 078-05-1126. Her driver license? it is 1234567A.\n", 712 | "\n", 713 | "\n" 714 | ] 715 | } 716 | ], 717 | "source": [ 718 | "data = {\n", 719 | " \"inputs\": payload,\n", 720 | " \"parameters\": {\n", 721 | " \"anonymize\": True,\n", 722 | " \"entities\":[\"PERSON\",\"LOCATION\"]\n", 723 | " }\n", 724 | "}\n", 725 | "\n", 726 | "res = predictor.predict(data=data)\n", 727 | "print(res[\"anonymized\"])\n" 728 | ] 729 | }, 730 | { 731 | "cell_type": "markdown", 732 | "id": "cb10007d", 733 | "metadata": {}, 734 | "source": [ 735 | "### Delete model and endpoint\n", 736 | "\n", 737 | "To clean up, we can delete the model and endpoint." 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 29, 743 | "id": "1e6fb7b8", 744 | "metadata": {}, 745 | "outputs": [], 746 | "source": [ 747 | "predictor.delete_model()\n", 748 | "predictor.delete_endpoint()" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "id": "e2a90e25", 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [] 758 | } 759 | ], 760 | "metadata": { 761 | "interpreter": { 762 | "hash": "5fcf248a74081676ead7e77f54b2c239ba2921b952f7cbcdbbe5427323165924" 763 | }, 764 | "kernelspec": { 765 | "display_name": "Python 3.8.12 64-bit ('hf')", 766 | "language": "python", 767 | "name": "python3" 768 | }, 769 | "language_info": { 770 | "codemirror_mode": { 771 | "name": "ipython", 772 | "version": 3 773 | }, 774 | "file_extension": ".py", 775 | "mimetype": "text/x-python", 776 | "name": "python", 777 | "nbconvert_exporter": "python", 778 | "pygments_lexer": "ipython3", 779 | "version": "3.8.12" 780 | } 781 | }, 782 | "nbformat": 4, 783 | "nbformat_minor": 5 784 | } 785 | --------------------------------------------------------------------------------
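The notebook above sends requests through the `HuggingFacePredictor` returned by `.deploy()`. For clients that do not use the SageMaker Python SDK, the same payloads can be sent to the endpoint with plain `boto3`. The sketch below is not part of the repository; the endpoint name is a placeholder that you would replace with the name of your deployed endpoint (for example the value of `predictor.endpoint_name`).

```python
# Hypothetical boto3 client for the deployed PII endpoint (not part of the original repo).
# Assumes the endpoint created in the notebook is still running and that the caller's
# AWS credentials allow sagemaker:InvokeEndpoint.
import json

import boto3

runtime = boto3.client("sagemaker-runtime")

payload = {
    "inputs": "My name is David Johnson and I live in Maine.",
    "parameters": {"anonymize": True, "entities": ["PERSON", "LOCATION"]},
}

response = runtime.invoke_endpoint(
    EndpointName="<your-endpoint-name>",  # placeholder, e.g. predictor.endpoint_name
    ContentType="application/json",
    Body=json.dumps(payload),
)

print(json.loads(response["Body"].read().decode("utf-8")))
```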