├── NOTICE ├── requirements.txt ├── Makefile ├── setup.cfg ├── .gitignore ├── LICENSE ├── .travis.yml ├── setup.py ├── README.md └── high_entropy_string └── __init__.py /NOTICE: -------------------------------------------------------------------------------- 1 | high-entropy-string 2 | Copyright 2016-2017 Lyft Inc. 3 | 4 | This product includes software developed at Lyft Inc. 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # License: MIT 2 | zxcvbn>=1.0,<=2.0 3 | # The modular source code checker: pep8, pyflakes and co 4 | # License: MIT 5 | # Upstream url: http://bitbucket.org/tarek/flake8 6 | flake8==2.3.0 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # bash needed for pipefail 2 | SHELL := /bin/bash 3 | 4 | test: test_lint 5 | 6 | test_lint: 7 | mkdir -p build 8 | set -o pipefail; flake8 | sed "s#^\./##" > build/flake8.txt || (cat build/flake8.txt && exit 1) 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # The jenkins violations plugin can read the pylint format. 3 | format = pylint 4 | max-line-length = 120 5 | 6 | # .svn,CVS,.bzr,.hg,.git,__pycache__: 7 | # default excludes 8 | # venv/: 9 | # third party libraries are all stored in venv - so we don't want to 10 | # check them for style issues. 
11 | exclude = .git,__pycache__,venv,tests/,.ropeproject,examples 12 | 13 | [pep8] 14 | max-line-length = 120 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Packages 9 | *.egg 10 | *.egg-info 11 | dist 12 | build 13 | eggs 14 | parts 15 | bin 16 | var 17 | sdist 18 | develop-eggs 19 | .installed.cfg 20 | lib 21 | lib64 22 | __pycache__ 23 | 24 | # Installer logs 25 | pip-log.txt 26 | 27 | # Unit test / coverage reports 28 | .coverage 29 | .tox 30 | nosetests.xml 31 | 32 | # Translations 33 | *.mo 34 | 35 | # Mr Developer (mac, editor, IDEs, etc) 36 | .mr.developer.cfg 37 | .project 38 | .pydevproject 39 | .vagrant 40 | .ropeproject 41 | .tmp 42 | .sass-cache 43 | .DS_store 44 | .zedstate 45 | .idea 46 | 47 | # virtualenv excludes 48 | venv 49 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | high-entropy-string - A library for classifying strings as potential secrets. 2 | 3 | Copyright 2014-2015 Lyft Inc. 4 | 5 | Licensed under the Apache License, Version 2.0 (the "License"); 6 | you may not use this file except in compliance with the License. 7 | You may obtain a copy of the License at 8 | 9 | http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | Unless required by applicable law or agreed to in writing, software 12 | distributed under the License is distributed on an "AS IS" BASIS, 13 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | See the License for the specific language governing permissions and 15 | limitations under the License. 
16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '2.7' 4 | install: pip install -r requirements.txt 5 | script: make test 6 | deploy: 7 | provider: pypi 8 | user: lyftpypi 9 | password: 10 | secure: UAVA9gkzIBJGMGUNhI7FP1tjCJPJB1F8LOVjrtF5zvf0FXOwlkHN89zyX16fvawErOgsDQScV7foorrU+/j+j83tIT8xsLMq999g7WdBE/AfC2bNUNsdtR44nDhZWtb9+IMcwn8ad4wKEdwj5mEevbjzPTjDmA8/Cc4/AQ1wqhz6tMT5ZRuqGbbjY48be9jn0jwy0uarCmM95mn2dh0K9jKZCBhcNzuimiQxj5wvkz025tBB94VG/4ba2dScuRtvmyi+2LOLHSWDmlwiMXv35zotuq2dS7ifcp5OQ6U/LUlyxS0f2/BY18gWOHQ3cpEfSpvIXPSxzljaquL1jFhEH0vfrj/y6Elte89R0n6/yS9Q6LmeE5nq5uuHL/4LQuWrsTZFaSsMsk0nHlWl/6P/nlSOmw0lUajSZHcG5cZ0zQO79FEY1rPXqEzxkt+cHrT4L4hKAecVqBrirhf6zKT97vXWV2l8IH7xykvnVfeeAMjBQ3enpFm1DJStuHTD/+SFvy/baXlJ2du9gK8klXGIHfFXAPjy7KTI0NGMpnA6kgJMno5rxXVBMPCUYIBAtEVEnIH1quP5iGTgIvrnI44Qv8VD7S5WjNIeaZXlV/7A0iL02MWwlT6aE9gjVFNm+KmAHmWypOxVeTatI8dw9md6DHZIWuX4Fh6ilBEKDOkN94w= 11 | on: 12 | tags: true 13 | distributions: sdist bdist_wheel 14 | repo: lyft/high-entropy-string 15 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 10 | # implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 
13 | 14 | from setuptools import setup, find_packages 15 | 16 | 17 | requirements = [ 18 | # License: MIT 19 | # Upstream url: https://github.com/dropbox/zxcvbn 20 | # Use: For entropy checks 21 | 'zxcvbn>=1.0,<=1.999' 22 | ] 23 | 24 | setup( 25 | name="high-entropy-string", 26 | version="0.2.1", 27 | packages=find_packages(exclude=["test*"]), 28 | install_requires=requirements, 29 | author="Ryan Lane", 30 | author_email="rlane@lyft.com", 31 | description="A library for classifying strings as potential secrets.", 32 | license="apache2", 33 | url="https://github.com/lyft/high-entropy-string" 34 | ) 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ⚠️ This repository has been archived and is no longer accepting contributions ⚠️ 2 | 3 | # high-entropy-string 4 | 5 | A library for classifying strings as potential secrets. 6 | 7 | ## Installation 8 | 9 | ```bash 10 | virtualenv venv 11 | source venv/bin/activate 12 | pip install high-entropy-string 13 | ``` 14 | 15 | ## Usage 16 | 17 | ``` 18 | from high_entropy_string import PythonStringData 19 | 20 | data = PythonStringData( 21 | string='AKAI...', 22 | node_type='assignment', 23 | target='myvar', 24 | patterns_to_ignore=[r'example.com'], 25 | entropy_patterns_to_discount=[r'/BEGIN.*PUBLIC KEY/'] 26 | ) 27 | print(data.confidence) 28 | print(data.severity) 29 | ``` 30 | 31 | ## Contributing 32 | 33 | ### Code of conduct 34 | 35 | This project is governed by [Lyft's code of 36 | conduct](https://github.com/lyft/code-of-conduct). 37 | All contributors and participants agree to abide by its terms. 38 | 39 | ### Sign the Contributor License Agreement (CLA) 40 | 41 | We require a CLA for code contributions, so before we can accept a pull request 42 | we need to have a signed CLA. Please [visit our CLA 43 | service](https://oss.lyft.com/cla) 44 | follow the instructions to sign the CLA. 
45 | 46 | ### How it works and how to help 47 | 48 | The library classifies a string based on its likelihood of being a secret. 49 | We nudge the confidence and severity of the string based on criteria: 50 | 51 | 1. Flags (ENTROPY_PATTERNS_TO_FLAG). Any Candidate that matches any regex in this 52 | list is automatically flagged as confidence/severity 3/3. If there are secret 53 | patterns you know conclusively are secrets, add them here. 54 | 2. Discounts (ENTROPY_PATTERNS_TO_DISCOUNT). Any Candidate that matches a regex in 55 | this list is discounted. If the Candidate matches multiple regexes in this 56 | list, it may be discounted further. This discount is used in the confidence 57 | calculation. 58 | 3. Secret hints (LOW_SECRET_HINTS, HIGH_SECRET_HINTS). If any target or caller 59 | matches a regex in these lists then it will be used as a hint that a 60 | Candidate is a secret. This hint is used in the confidence and severity 61 | calculations. LOW_SECRET_HINTS leads to a lower confidence increase and 62 | HIGH_SECRET_HINTS leads to a higher confidence increase. 63 | 4. Safe functions (SAFE_FUNCTION_HINTS). Any Candidate that has a caller that 64 | matches any string in this list will be discounted. This is used in the 65 | confidence calculation. 66 | 5. Entropy. If a Candidate's confidence level can be more accurately gauged by 67 | a string's level of entropy, we calculate it and if the string has high 68 | entropy its confidence level is increased. This calculation is avoided if 69 | possible, as it's relatively expensive. 70 | 71 | The concept is to eliminate noise while more easily identifying Candidates that 72 | may be secrets. Some help we'd love to have: 73 | 74 | 1. Help with the discount regex list. The regexes in the list often match too 75 | much and there aren't enough that match common python strings. 76 | 2. Help with the safe functions list (and the way we match the safe functions). 
logger = logging.getLogger('high_entropy_string')

# TODO: change caller logic to more accurately identify callers (like strftime
# vs .strftime vs datetime.datetime.strftime)

# Any string matching one of these is treated as a secret outright.
ENTROPY_PATTERNS_TO_FLAG = [
    # AWS access keys (which often have secret keys listed with them)
    re.compile('AKIA'),
    # URLs with username/password combos
    re.compile('^[a-z]+://.*:.*@'),
    # PEM encoded PKCS8 private keys
    re.compile('BEGIN.*PRIVATE KEY'),
    # Slack webhook
    re.compile(r'https://hooks.slack.com/services/T\w{8}/B\w{8}/\w{24}')
]

mimetypes.init()
# Every file extension known to the mimetypes module, escaped for regex use.
EXTS = [re.escape(i) for i in mimetypes.types_map.keys()]
# "<path chars><ext>" alternatives, anchored at the end of the string.
FILE_EXTENSIONS_MATCH = r'([a-zA-Z0-9\-_/\.]+{0})$'.format(
    r'|[a-zA-Z0-9\-_/.]+'.join(EXTS)
)
# Escape each MIME type on its own.  Escaping the assembled pattern (as was
# done previously) also escapes the anchors, the group and the alternation,
# which leaves a regex that can only match its own source text.
MIMETYPES_MATCH = r'^({0})$'.format(
    '|'.join(
        re.escape(mime_type)
        for mime_type in sorted(set(mimetypes.types_map.values()))
    )
)

# Strings matching any of these are never reported (empty by default).
PATTERNS_TO_IGNORE = [
]

# Each matching pattern counts as one "discount" against a candidate string.
ENTROPY_PATTERNS_TO_DISCOUNT = [
    # secrets don't contain whitespace
    re.compile(r'\s+'),
    # secrets don't end with file extensions
    re.compile(FILE_EXTENSIONS_MATCH),
    # secrets don't look like mime types
    re.compile(MIMETYPES_MATCH),
    # secrets don't contain domain names
    # Example: example.org
    re.compile(r'^([a-z0-9\-]+\.)+(com|net|me|org|edu)$'),
    # secrets don't have host names
    # Example: my-cool-hostname
    re.compile(r'^[a-z]*(-[a-z]*)*$'),
    # secrets don't look like python imports
    # Example import a.b.Hello_World1
    re.compile(r'^[a-zA-Z0-9_]+(\.[a-zA-Z0-9_]+)+$'),
    # secrets don't look like python variable names
    # Example: my_fun_variable_name1
    re.compile(r'^_?_?[a-zA-Z0-9]+(_[a-zA-Z0-9]+)+$'),
    # secrets don't have absolute paths
    # Example /a/b/1-B_z.txt
    re.compile(r'^/[a-zA-Z0-9\-_/\.]+$'),
    # secrets don't have relative paths
    # NOTE(review): as written this only matches paths with a trailing
    # slash (a/b/); left unchanged because widening it would also discount
    # slash-containing keys -- confirm intent.
    re.compile(r'^[a-zA-Z0-9\-_/\.]+/$'),
    # secrets don't have flask routes
    # Example: /v1/<user_id>/ or /v1/user/<string:name>/group
    re.compile(r'<([a-z_]+:[a-z_]+|[a-z_]+)>/'),
    re.compile(r'/<([a-z_]+:[a-z_]+|[a-z_]+)>'),
    # secrets don't look like urls with args
    # Example: /a/b/c?hello=world&test=me
    # (path segments and the query string may be longer than one character)
    re.compile(r'/[a-zA-Z\-_\.]+(/[a-zA-Z\-_\.]+)*\?[a-zA-Z\-_\.=&]+$'),
    # secrets don't look like email addresses
    # Example: test+spam@example.com
    re.compile(r'[a-zA-Z0-9_\-\+]+@[a-zA-Z0-9\-]+\.(com|net|me|edu)'),
    # secrets don't look like constants
    # Example: EXAMPLE_CONSTANT
    re.compile(r'^[A-Z]*(_[A-Z]*)*$'),
    # secrets don't look like session dict keys
    # Example: XSRF-TOKEN
    re.compile(r'^[A-Z]*(-[A-Z]*)*$'),
    # secrets don't look like URIs
    # Example: https://example.org
    re.compile(r'^[a-z]+://'),
    # secrets don't look like format strings
    # Example: {} or {10}
    # TODO: consider false negatives
    re.compile(r'\{\d{0,2}\}'),
    # Example: {my_Var}
    # TODO: consider false negatives
    re.compile(r'\{_?_?[a-zA-Z]{1,10}[a-zA-Z0-9_]{0,10}\}'),
    # secrets don't look like headers,
    # Example: X-Forwarded-For
    re.compile(r'^[A-Z][a-z]*(-[A-Z][a-z]*)*$'),
    # secrets don't look like date formats
    # Example: %Y%m%dT%H%M%SZ
    re.compile(r'^(%[a-zA-Z\-]+)+$'),
    # Example: 2012-10-17T00:00:00Z
    re.compile(r'\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ'),
    # Example: 2021-08-22
    re.compile(r'\d\d\d\d-\d\d-\d\d'),
    # secrets don't look like phone numbers
    # Example: +15555555555
    re.compile(r'\d\d\d\d\d\d\d\d\d\d$'),
    # secrets don't look like cli arguments
    # Example: --test_me-please
    # (the option name may be longer than one character)
    re.compile(r'^--[a-zA-Z0-9\-_]+$'),
    # key-lookups
    # Example: my_var:b:c
    # NOTE(review): segments after ':' are limited to one character; left
    # unchanged so "user:password" style strings are not discounted.
    re.compile(r'^[a-zA-Z0-9_\-]+(:[a-zA-Z0-9_\-])+$')
]

# Target-name hints; each match adds 1 to the secret rating.
LOW_SECRET_HINTS = [
    re.compile(r'[a-z0-9_\.]+key[a-z0-9_\.]*', re.IGNORECASE),
    re.compile(r'pw', re.IGNORECASE),
    re.compile(r'tok', re.IGNORECASE),
    re.compile(r'tkn', re.IGNORECASE),
    re.compile(r'random', re.IGNORECASE),
    re.compile(r'auth', re.IGNORECASE)
]
# Target-name hints; each match adds 2 to the secret rating.
HIGH_SECRET_HINTS = [
    re.compile(r'secret', re.IGNORECASE),
    re.compile(r'pass', re.IGNORECASE),
    re.compile(r'passwd', re.IGNORECASE),
    re.compile(r'password', re.IGNORECASE),
    re.compile(r'login', re.IGNORECASE)
]
# Target-name hints; each match adds 1 to the safety rating.
SAFE_VAR_HINTS = [
    re.compile(r'format', re.IGNORECASE),
    re.compile(r'pattern', re.IGNORECASE),
    re.compile(r'id', re.IGNORECASE),
    re.compile(r'user-agent', re.IGNORECASE)
]
# Caller names (compared by exact string equality) that add 2 to the safety
# rating.
SAFE_FUNCTION_HINTS = [
    'os.environ.get',
    'os.path.join',
    're.sub',
    're.search',
    're.split',
    're.compile',
    're.find',
    'datetime.datetime.strptime',
    'datetime.datetime.strftime',
    'time.strftime',
    '.strftime',
    '.strptime',
    'dateutil.parser.parse',
    'pytz.timezone',
    'hasattr',
    'getattr',
    'delattr',
    'statsd.timer',
    'timezone',
    'open',
    '.split',
    'csv.reader',
    'flask.request.json.get',
    'flask.request.args.get',
    'flask.request.form.get',
    'string.replace'
]
'timezone', 147 | 'open', 148 | '.split', 149 | 'csv.reader', 150 | 'flask.request.json.get', 151 | 'flask.request.args.get', 152 | 'flask.request.form.get', 153 | 'string.replace' 154 | ] 155 | 156 | 157 | class PythonStringData(object): 158 | 159 | def __init__( 160 | self, 161 | string=None, 162 | assigned=False, 163 | node_type=None, 164 | target=None, 165 | caller=None, 166 | patterns_to_ignore=None, 167 | entropy_patterns_to_discount=None): 168 | self.string = string 169 | self.assigned = assigned 170 | self.node_type = node_type 171 | self.target = target 172 | self.caller = caller 173 | self.cache = {} 174 | if patterns_to_ignore: 175 | self.patterns_to_ignore = patterns_to_ignore 176 | else: 177 | self.patterns_to_ignore = [] 178 | if entropy_patterns_to_discount: 179 | self.entropy_patterns_to_discount = entropy_patterns_to_discount 180 | else: 181 | self.entropy_patterns_to_discount = [] 182 | 183 | @property 184 | def ignored(self): 185 | for pattern in PATTERNS_TO_IGNORE: 186 | if pattern.search(self.string): 187 | return True 188 | for _pattern in self.patterns_to_ignore: 189 | pattern = re.compile(_pattern) 190 | if pattern.search(self.string): 191 | return True 192 | 193 | @property 194 | def discounts_regex(self): 195 | if self.cache.get('discounts_regex') is not None: 196 | return self.cache['discounts_regex'] 197 | patterns = [] 198 | for pattern in ENTROPY_PATTERNS_TO_DISCOUNT: 199 | if pattern.search(self.string): 200 | patterns.append(pattern.pattern) 201 | for _pattern in self.entropy_patterns_to_discount: 202 | pattern = re.compile(_pattern) 203 | if pattern.search(self.string): 204 | patterns.append(pattern.pattern) 205 | self.cache['discounts_regex'] = patterns 206 | return self.cache['discounts_regex'] 207 | 208 | @property 209 | def discounts(self): 210 | return len(self.discounts_regex) 211 | 212 | @property 213 | def flags_regex(self): 214 | if self.cache.get('flags_regex') is not None: 215 | return self.cache['flags_regex'] 216 | 
patterns = [] 217 | for pattern in ENTROPY_PATTERNS_TO_FLAG: 218 | if pattern.search(self.string): 219 | patterns.append(pattern.pattern) 220 | self.cache['flags_regex'] = patterns 221 | return self.cache['flags_regex'] 222 | 223 | @property 224 | def flags(self): 225 | return len(self.flags_regex) 226 | 227 | @property 228 | def secret_rating(self): 229 | if self.cache.get('secret_rating') is not None: 230 | return self.cache['secret_rating'] 231 | secret = 0 232 | if self.target: 233 | for pattern in LOW_SECRET_HINTS: 234 | if pattern.search(self.target): 235 | secret += 1 236 | for pattern in HIGH_SECRET_HINTS: 237 | if pattern.search(self.target): 238 | secret += 2 239 | self.cache['secret_rating'] = secret 240 | return self.cache['secret_rating'] 241 | 242 | @property 243 | def safety_rating(self): 244 | if self.cache.get('safety_rating') is not None: 245 | return self.cache['safety_rating'] 246 | safety = 0 247 | if self.target: 248 | for pattern in SAFE_VAR_HINTS: 249 | if pattern.search(self.target): 250 | safety += 1 251 | if self.caller and self.caller in SAFE_FUNCTION_HINTS: 252 | safety += 2 253 | self.cache['safety_rating'] = safety 254 | return self.cache['safety_rating'] 255 | 256 | @property 257 | def entropy(self): 258 | if self.cache.get('entropy') is not None: 259 | return self.cache['entropy'] 260 | if not self.string: 261 | return 0 262 | if len(self.string) > 100: 263 | check_str = self.string[:100] 264 | else: 265 | check_str = self.string 266 | try: 267 | entropy = zxcvbn.password_strength(check_str)['entropy'] 268 | except UnicodeDecodeError: 269 | logger.warning( 270 | 'Failed to get entropy due to unicode decode error.' 271 | ) 272 | entropy = 0 273 | except OverflowError: 274 | logger.warning( 275 | 'Failed to get entropy due to overflow error.' 
276 | ) 277 | entropy = 0 278 | self.cache['entropy'] = entropy 279 | return self.cache['entropy'] 280 | 281 | @property 282 | def entropy_per_char(self): 283 | try: 284 | return self.entropy/float(len(self.string)) 285 | except ZeroDivisionError: 286 | return 0 287 | 288 | @property 289 | def confidence(self): 290 | if self.cache.get('confidence') is not None: 291 | return self.cache['confidence'] 292 | if self.flags > 0: 293 | return 3 294 | if len(self.string) == 0: 295 | return 0 296 | if self.ignored: 297 | return 0 298 | confidence = 0 299 | if self.secret_rating > 0: 300 | confidence += 1 301 | if self.secret_rating > 1: 302 | confidence += 2 303 | if len(self.string) < 5: 304 | confidence -= 1 305 | if self.discounts > 0: 306 | confidence -= 2 307 | if self.discounts > 2: 308 | confidence -= 1 309 | if self.safety_rating > 0: 310 | confidence -= 1 311 | if self.safety_rating > 1: 312 | confidence -= 1 313 | # Avoid entropy calculation if possible. 314 | if confidence > 2 or confidence < -1: 315 | return confidence 316 | if (self.entropy > 80 or 317 | (self.entropy > 40 and self.entropy_per_char > 3)): 318 | confidence += 1 319 | if self.entropy >= 120: 320 | confidence += 1 321 | self.cache['confidence'] = confidence 322 | return self.cache['confidence'] 323 | 324 | @property 325 | def severity(self): 326 | if self.flags: 327 | return 3 328 | severity = self.confidence 329 | if self.secret_rating > 0: 330 | severity += 1 331 | if self.secret_rating > 1: 332 | severity += 1 333 | return severity 334 | 335 | def __str__(self): 336 | discount_patterns = self.entropy_patterns_to_discount 337 | return json.dumps({ 338 | 'string_data.string': self.string, 339 | 'string_data.discounts': self.discounted, 340 | 'string_data.discounts_regex': self.discounts_regex, 341 | 'string_data.flags': self.discounts, 342 | 'string_data.flags_regex': self.flags_regex, 343 | 'string_data.entropy': self.entropy, 344 | 'string_data.entropy_per_char': self.entropy_per_char, 345 | 
'string_data.secret_rating': self.secret_rating, 346 | 'string_data.safety_rating': self.safety_rating, 347 | 'string_data.node_type': self.node_type, 348 | 'string_data.patterns_to_ignore': self.patterns_to_ignore, 349 | 'string_data.entropy_patterns_to_discount': discount_patterns 350 | }) 351 | --------------------------------------------------------------------------------