├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py └── tlp ├── __init__.py ├── lib ├── __init__.py ├── effective_tld_names.dat ├── filter_list.py └── regex_list.py ├── tlp.py └── tlp_filter.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | .pypirc 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | 57 | # PyBuilder 58 | target/ 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 ministry of promise 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # threat language parser 2 | 3 | tlp is a python library that parses a body of text for indicators of compromise (iocs), leveraging the amazing [textblob](http://textblob.readthedocs.org/en/dev/) and [nltk](http://www.nltk.org/) natural language processing modules to derive context and color around those iocs. The goal of tlp is to allow security analysts and researchers to extract and store meaningful data from the endless stream of information they encounter daily, without the tedium of endless ctrl+c, ctrl+v workflow. 4 | 5 | To solve this problem, tlp uses a combination of regular expression, part-of-speech tagging, list filters, and simple statistical analysis to extract the following data from narritive-style prose: 6 | 7 | - document summary 8 | - indicators of compromise, with associated stats 9 | - key words and phrases, with associated stats 10 | - parser debugging information 11 | 12 | ## Installation 13 | 14 | tlp can be found on PyPi, and installed with: 15 | 16 | pip install tlp 17 | 18 | you are also able to clone this repo, and run: 19 | 20 | python setup.py install 21 | 22 | ## Dependencies 23 | 24 | The following modules are required for tlp to function: 25 | 26 | ### TextBlob 27 | TextBlob will be installed by default. if you need to install manually, run: 28 | 29 | pip install -U textblob 30 | 31 | *PLEASE NOTE:* Regardless of whether TextBlob is automatically or manually installed, you'll need to download and install the rest of the nltk corpora by running the following commands: 32 | 33 | python -m textblob.download_corpora 34 | python -m nltk.downloader stopwords 35 | 36 | ### numpy 37 | Note that most numpy installs require compilation, so you will probably have to install this as a standalone by running: 38 | 39 | pip install -U numpy 40 | 41 | ### python-Levenshtein 42 | This dependency should be installed by setuptools automatically, but in the event that fails: 43 | 44 | pip install -U python-Levenshtein 45 | 46 | ## Usage 47 | 48 | >>> from tlp import TLP 49 | >>> ... 50 | >>> threat_text = get_threat_data_from_something() 51 | >>> tlp = TLP(threat_text) 52 | >>> 53 | >>> # get summary 54 | >>> tlp.summary 55 | u"This report outlines a terrible scourge upon our internets: miscreants. We have 56 | discovered that miscreants are systematically taking over the series of tubes, and 57 | and are attempting to leverage them to proliferate their love of 'My Little Pony.' 58 | Let's explore how we punched them repeatedly with our data." 59 | >>> 60 | >>> # get keywords, occurance counts 61 | >>> tlp.keywords 62 | [ 63 | (u'miscreant', 97), 64 | (u'punch', 39), 65 | (u'whiskey', 18) 66 | ] 67 | >>> 68 | >>> # get iocs, sorted by type 69 | >>> tlp.iocs 70 | { 71 | 'cve': set([u'cve-2011-0611', 72 | u'cve-2013-1347', 73 | u'cve-2013-2465']), 74 | 'domain': set([u'miscreantsmustsuffer.com', 75 | u'ministryofpromise.co.uk']), 76 | 'ip': set([u'8.8.4.4', 77 | u'127.0.0.1']), 78 | 'md5': set([u'6fc67ebcb6423efa06198cd123ffc3ee']), 79 | 'sha1': set([]), 80 | 'sha256': set([]) 81 | } 82 | >>> 83 | >>> # get tlp color (if present) 84 | >>> tlp.color 85 | set([u'white']) 86 | >>> 87 | >>> # get debug info, including total word count, and word frequency mean/stddev 88 | >>> tlp.debug 89 | { 90 | 'keywords': { 91 | 'std': 2.5559937998299809, 92 | 'total': 1012, 93 | 'mean': 2.0321285140562249 94 | }, 95 | 'iocs': { 96 | 'ip': 2, 97 | 'domain': 2, 98 | 'md5': 1 99 | 'sha1': 0, 100 | 'sha256': 0, 101 | 'cve': 3 102 | } 103 | } 104 | >>> 105 | >>> # get complete filtered text used for data distillation 106 | >>> tlp.text 107 | u"This report outlines a terrible scourge upon our internets: miscreants. We have 108 | discovered that miscreants are systematically taking over the series of tubes, and 109 | and are attempting to leverage them to proliferate their love of 'My Little Pony.' 110 | Let's explore how we punched them repeatedly with our data. 111 | ... 112 | 113 | "In conclusion -- bottom's up!" 114 | 115 | 116 | ## Todo 117 | 118 | - Improve keyword accuracy with a more robust statistical approach and better contextual language processing 119 | - Modify ioc filter engine to allow for more modular filter management 120 | - Allow for more flexibility in parsing and filtering at object creation 121 | - Grow/improve regex and filter sets 122 | - Include the use of "title" __init__ arg in keyword weighting 123 | - Add post-install scripting of corpora actions 124 | 125 | ## Contributing 126 | This is very much an alpha, so we expect some folks will quickly spot inefficiencies or better ways to solve the problem. All pull requests are welcome. :) 127 | 128 | If you are a threat intelligence publisher who would like to be added to the tlp whitelist, please [contact](mailto:github@ministryofpromise.co.uk) us. 129 | 130 | ## License 131 | The MIT License (MIT) 132 | 133 | Copyright (c) 2015 { ministry of promise } 134 | 135 | Permission is hereby granted, free of charge, to any person obtaining a copy 136 | of this software and associated documentation files (the "Software"), to deal 137 | in the Software without restriction, including without limitation the rights 138 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 139 | copies of the Software, and to permit persons to whom the Software is 140 | furnished to do so, subject to the following conditions: 141 | 142 | The above copyright notice and this permission notice shall be included in all 143 | copies or substantial portions of the Software. 144 | 145 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 146 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 147 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 148 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 149 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 150 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 151 | SOFTWARE. 152 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from setuptools import setup, find_packages 3 | from pkg_resources import resource_filename, Requirement 4 | 5 | # Utility function to read the README file. 6 | # Used for the long_description. It's nice, because now 1) we have a top level 7 | # README file and 2) it's easier to type in the README file than to put a raw 8 | # string in below ... 9 | def read(fname): 10 | with open(fname) as f: 11 | return f.read() 12 | 13 | setup( 14 | name = "tlp", 15 | version = "0.1.1", 16 | author = "{ ministry of promise }", 17 | author_email = "adam.j.nichols@gmail.com", 18 | description = ("tlp is a python library that parses a body of text for indicators of compromise (iocs) " 19 | "using natural language processing modules to derive their context."), 20 | license = "MIT", 21 | keywords = "tlp threat language parser ioc nlp textblob nltk", 22 | url = "http://github.com/ministryofpromise/tlp", 23 | packages=find_packages(), 24 | package_dir={'tlp': 'tlp'}, 25 | package_data={'tlp': ['lib/effective_tld_names.dat']}, 26 | long_description=read('README.md'), 27 | classifiers=[ 28 | "Development Status :: 3 - Alpha", 29 | "Topic :: Utilities", 30 | "License :: OSI Approved :: MIT License", 31 | ], 32 | install_requires=[ 33 | 'nltk', 34 | 'textblob', 35 | 'python-Levenshtein', 36 | 'numpy' 37 | ] 38 | ) 39 | -------------------------------------------------------------------------------- /tlp/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["tlp", "tlp_filter"] 2 | 3 | from .tlp import TLP 4 | from .tlp_filter import TLPFilter 5 | -------------------------------------------------------------------------------- /tlp/lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ministryofpromise/tlp/b95f1ceb1597365d5c8a169d5bfb1750cb3d4784/tlp/lib/__init__.py -------------------------------------------------------------------------------- /tlp/lib/filter_list.py: -------------------------------------------------------------------------------- 1 | __author__ = "{ ministry of promise }" 2 | __copyright__ = "Copyright 2015, { ministry of promise }" 3 | __license__ = "MIT" 4 | __version__ = "0.1.0" 5 | __maintainer__ = "Adam Nichols" 6 | __email__ = "adam.j.nichols@gmail.com" 7 | __status__ = "Development" 8 | 9 | keyword_filterlist = [ 10 | u'account', 11 | u'activity', 12 | u'actor', 13 | u'address', 14 | u'algorithm', 15 | u'analysis', 16 | u'appendix', 17 | u'application', 18 | u'april', 19 | u'attack', 20 | u'attacker', 21 | u'august', 22 | u'backdoor', 23 | u'based', 24 | u'between', 25 | u'botnet', 26 | u'c2', 27 | u'c server', 28 | u'c servers', 29 | u'campaign', 30 | u'checkwith', 31 | u'code', 32 | u'copy', 33 | u'command', 34 | u'communication', 35 | u'company', 36 | u'configuration', 37 | u'connect', 38 | u'contact', 39 | u'control', 40 | u'corporation', 41 | u'country', 42 | u'custom', 43 | u'datum', 44 | u'december', 45 | u'description', 46 | u'detection', 47 | u'detectionsystemrootkill', 48 | u'distribution', 49 | u'directory', 50 | u'dll', 51 | u'domain', 52 | u'dword', 53 | u'error', 54 | u'exploit', 55 | u'february', 56 | u'figure', 57 | u'file', 58 | u'from', 59 | u'fromversion', 60 | u'function', 61 | u'functionality', 62 | u'infection', 63 | u'information', 64 | u'informationfiguremalware', 65 | u'intelreport', 66 | u'inquirefile', 67 | u'inquire', 68 | u'inquiry', 69 | u'install', 70 | u'ip', 71 | u'into', 72 | u'january', 73 | u'july', 74 | u'june', 75 | u'kill', 76 | u'legitimate', 77 | u'list', 78 | u'loader', 79 | u'logo', 80 | u'main', 81 | u'malware', 82 | u'march', 83 | u'may', 84 | u'md5', 85 | u'micro', 86 | u'module', 87 | u'name', 88 | u'network', 89 | u'november', 90 | u'number', 91 | u'october', 92 | u'organization', 93 | u'page', 94 | u'payload', 95 | u'please', 96 | u'pleasecontact', 97 | u'port', 98 | u'process', 99 | u'product', 100 | u'report', 101 | u'sample', 102 | u'september', 103 | u'server', 104 | u'service', 105 | u'shellcode', 106 | u'site', 107 | u'software', 108 | u'stage', 109 | u'state', 110 | u'sysmain', 111 | u'system', 112 | u'telnet', 113 | u'temp', 114 | u'thread', 115 | u'threat', 116 | u'tlp', 117 | u'trademark', 118 | u'trojan', 119 | u'unique', 120 | u'updater', 121 | u'updatersoftware', 122 | u'user', 123 | u'username', 124 | u'value', 125 | u'version', 126 | u'versionpoint', 127 | u'view', 128 | u'victim', 129 | u'with', 130 | u'within' 131 | ] 132 | 133 | ioc_filterlist = { 134 | 'domain' : [ 135 | u'cisco.com', 136 | u'fireeye.com', 137 | u'ipviking.com', 138 | u'kaspersky.com', 139 | u'norse-corp.com', 140 | u'verisign.com' 141 | ], 142 | 'ip' : [ 143 | u'127.0.0.1' 144 | ] 145 | } 146 | 147 | # top 2k as of 4/2015 148 | alexa_filterlist = [ 149 | u'google.com', 150 | u'facebook.com', 151 | u'youtube.com', 152 | u'baidu.com', 153 | u'yahoo.com', 154 | u'wikipedia.org', 155 | u'amazon.com', 156 | u'twitter.com', 157 | u'taobao.com', 158 | u'qq.com', 159 | u'google.co.in', 160 | u'live.com', 161 | u'linkedin.com', 162 | u'sina.com.cn', 163 | u'weibo.com', 164 | u'yahoo.co.jp', 165 | u'tmall.com', 166 | u'google.co.jp', 167 | u'ebay.com', 168 | u't.co', 169 | u'blogspot.com', 170 | u'google.de', 171 | u'hao123.com', 172 | u'bing.com', 173 | u'yandex.ru', 174 | u'reddit.com', 175 | u'vk.com', 176 | u'google.co.uk', 177 | u'amazon.co.jp', 178 | u'msn.com', 179 | u'google.com.br', 180 | u'instagram.com', 181 | u'google.fr', 182 | u'tumblr.com', 183 | u'wordpress.com', 184 | u'sohu.com', 185 | u'imgur.com', 186 | u'pinterest.com', 187 | u'paypal.com', 188 | u'aliexpress.com', 189 | u'xvideos.com', 190 | u'apple.com', 191 | u'ask.com', 192 | u'microsoft.com', 193 | u'onclickads.net', 194 | u'google.it', 195 | u'gmail.com', 196 | u'imdb.com', 197 | u'mail.ru', 198 | u'fc2.com', 199 | u'alibaba.com', 200 | u'google.ru', 201 | u'google.es', 202 | u'adcash.com', 203 | u'amazon.de', 204 | u'stackoverflow.com', 205 | u'360.cn', 206 | u'netflix.com', 207 | u'googleadservices.com', 208 | u'diply.com', 209 | u'google.ca', 210 | u'craigslist.org', 211 | u'go.com', 212 | u'xhamster.com', 213 | u'google.com.hk', 214 | u'naver.com', 215 | u'tianya.cn', 216 | u'gmw.cn', 217 | u'163.com', 218 | u'bbc.co.uk', 219 | u'google.com.tr', 220 | u'amazon.co.uk', 221 | u'adobe.com', 222 | u'kickass.to', 223 | u'pornhub.com', 224 | u'rakuten.co.jp', 225 | u'ebay.de', 226 | u'dropbox.com', 227 | u'cnn.com', 228 | u'amazon.cn', 229 | u'nicovideo.jp', 230 | u'espn.go.com', 231 | u'google.com.mx', 232 | u'google.pl', 233 | u'ok.ru', 234 | u'soso.com', 235 | u'google.com.au', 236 | u'dailymotion.com', 237 | u'cntv.cn', 238 | u'googleusercontent.com', 239 | u'github.com', 240 | u'jd.com', 241 | u'directrev.com', 242 | u'youku.com', 243 | u'outbrain.com', 244 | u'alipay.com', 245 | u'people.com.cn', 246 | u'flipkart.com', 247 | u'pixnet.net', 248 | u'google.co.kr', 249 | u'google.co.id', 250 | u'chinadaily.com.cn', 251 | u'nytimes.com', 252 | u'blogger.com', 253 | u'buzzfeed.com', 254 | u'uol.com.br', 255 | u'huffingtonpost.com', 256 | u'ebay.co.uk', 257 | u'wikia.com', 258 | u'livedoor.com', 259 | u'indiatimes.com', 260 | u'booking.com', 261 | u'google.com.tw', 262 | u'amazon.in', 263 | u'china.com', 264 | u'sogou.com', 265 | u'tudou.com', 266 | u'bycontext.com', 267 | u'chase.com', 268 | u'blogspot.in', 269 | u'amazonaws.com', 270 | u'dailymail.co.uk', 271 | u'ettoday.net', 272 | u'xinhuanet.com', 273 | u'google.com.eg', 274 | u'flickr.com', 275 | u'xnxx.com', 276 | u'globo.com', 277 | u'wordpress.org', 278 | u'coccoc.com', 279 | u'douban.com', 280 | u'yelp.com', 281 | u'google.nl', 282 | u'bankofamerica.com', 283 | u'pconline.com.cn', 284 | u'salesforce.com', 285 | u'godaddy.com', 286 | u'themeforest.net', 287 | u'about.com', 288 | u'ameblo.jp', 289 | u'popads.net', 290 | u'dmm.co.jp', 291 | u'daum.net', 292 | u'cnet.com', 293 | u'slideshare.net', 294 | u'twitch.tv', 295 | u'google.com.pk', 296 | u'youradexchange.com', 297 | u'etsy.com', 298 | u'deviantart.com', 299 | u'bongacams.com', 300 | u'google.com.ar', 301 | u'redtube.com', 302 | u'amazon.fr', 303 | u'loading-delivery1.com', 304 | u'quora.com', 305 | u'bbc.com', 306 | u'theguardian.com', 307 | u'weather.com', 308 | u'youporn.com', 309 | u'adf.ly', 310 | u'naver.jp', 311 | u'warmportrait.com', 312 | u'ilividnewtab.com', 313 | u'stackexchange.com', 314 | u'life.com.tw', 315 | u'stamplive.com', 316 | u'vimeo.com', 317 | u'forbes.com', 318 | u'espncricinfo.com', 319 | u'indeed.com', 320 | u'soundcloud.com', 321 | u'snapdeal.com', 322 | u'walmart.com', 323 | u'aol.com', 324 | u'microsoftonline.com', 325 | u'google.com.sa', 326 | u'reference.com', 327 | u'bp.blogspot.com', 328 | u'wellsfargo.com', 329 | u'mozilla.org', 330 | u'google.co.za', 331 | u'w3schools.com', 332 | u'google.gr', 333 | u'zillow.com', 334 | u'feedly.com', 335 | u'amazon.it', 336 | u'leboncoin.fr', 337 | u'wikihow.com', 338 | u'theladbible.com', 339 | u'mailchimp.com', 340 | u'office365.com', 341 | u'google.co.th', 342 | u'mystart.com', 343 | u'china.com.cn', 344 | u'thepiratebay.se', 345 | u'tripadvisor.com', 346 | u'livejasmin.com', 347 | u'google.com.ua', 348 | u'businessinsider.com', 349 | u'allegro.pl', 350 | u'livejournal.com', 351 | u'vice.com', 352 | u'zol.com.cn', 353 | u'popcash.net', 354 | u'wikimedia.org', 355 | u'ikea.com', 356 | u'force.com', 357 | u'ifeng.com', 358 | u'washingtonpost.com', 359 | u'onet.pl', 360 | u'pixiv.net', 361 | u'kakaku.com', 362 | u'gamer.com.tw', 363 | u'files.wordpress.com', 364 | u'google.be', 365 | u'secureserver.net', 366 | u'mediafire.com', 367 | u'smzdm.com', 368 | u'9gag.com', 369 | u'nih.gov', 370 | u'sourceforge.net', 371 | u'google.com.my', 372 | u'taboola.com', 373 | u'google.com.co', 374 | u'foxnews.com', 375 | u'archive.org', 376 | u'web.de', 377 | u'xuite.net', 378 | u'tubecup.com', 379 | u'blogfa.com', 380 | u'bestbuy.com', 381 | u'google.com.sg', 382 | u'ups.com', 383 | u'trklnks.com', 384 | u'pclady.com.cn', 385 | u'bitauto.com', 386 | u'zhihu.com', 387 | u'comcast.net', 388 | u'google.cn', 389 | u'usps.com', 390 | u'google.ro', 391 | u'goo.ne.jp', 392 | u'americanexpress.com', 393 | u'abs-cbnnews.com', 394 | u'gmx.net', 395 | u'likes.com', 396 | u'doublepimp.com', 397 | u'wix.com', 398 | u'intuit.com', 399 | u'office.com', 400 | u'shutterstock.com', 401 | u'ppomppu.co.kr', 402 | u'google.com.ng', 403 | u'ndtv.com', 404 | u'akamaihd.net', 405 | u'doorblog.jp', 406 | u'google.at', 407 | u'stumbleupon.com', 408 | u'weebly.com', 409 | u'target.com', 410 | u'mashable.com', 411 | u'addthis.com', 412 | u'skype.com', 413 | u'steamcommunity.com', 414 | u'gfycat.com', 415 | u'avg.com', 416 | u'badoo.com', 417 | u'avito.ru', 418 | u'dmm.com', 419 | u'orange.fr', 420 | u'pandora.com', 421 | u'telegraph.co.uk', 422 | u'amazon.es', 423 | u'quikr.com', 424 | u'ign.com', 425 | u'haosou.com', 426 | u'hootsuite.com', 427 | u'mercadolivre.com.br', 428 | u'groupon.com', 429 | u'google.co.ve', 430 | u'bild.de', 431 | u'homedepot.com', 432 | u'softonic.com', 433 | u'google.pt', 434 | u'theadgateway.com', 435 | u'goodreads.com', 436 | u'github.io', 437 | u'google.se', 438 | u'youm7.com', 439 | u'hdfcbank.com', 440 | u'wordreference.com', 441 | u'bet365.com', 442 | u'icicibank.com', 443 | u'bomb01.com', 444 | u'wsj.com', 445 | u'spiegel.de', 446 | u'adplxmd.com', 447 | u'media.tumblr.com', 448 | u'gome.com.cn', 449 | u'pcbaby.com.cn', 450 | u'bilibili.com', 451 | u'jabong.com', 452 | u'caijing.com.cn', 453 | u'engadget.com', 454 | u'pchome.net', 455 | u'hulu.com', 456 | u'kaskus.co.id', 457 | u'steampowered.com', 458 | u't-online.de', 459 | u'trello.com', 460 | u'webmd.com', 461 | u'ask.fm', 462 | u'spotify.com', 463 | u'slickdeals.net', 464 | u'whatsapp.com', 465 | u'rediff.com', 466 | u'wow.com', 467 | u'hurriyet.com.tr', 468 | u'google.com.pe', 469 | u'udn.com', 470 | u'usatoday.com', 471 | u'google.ch', 472 | u'seznam.cz', 473 | u'zendesk.com', 474 | u'hp.com', 475 | u'fedex.com', 476 | u'lifehacker.com', 477 | u'webssearches.com', 478 | u'ijreview.com', 479 | u'tube8.com', 480 | u'bloomberg.com', 481 | u'fiverr.com', 482 | u'mobile01.com', 483 | u'wp.pl', 484 | u'chinatimes.com', 485 | u'bleacherreport.com', 486 | u'cbssports.com', 487 | u'uptodown.com', 488 | u'samsung.com', 489 | u'mama.cn', 490 | u'youth.cn', 491 | u'gameforge.com', 492 | u'hupu.com', 493 | u'rambler.ru', 494 | u'answers.com', 495 | u'torrentz.eu', 496 | u'google.ae', 497 | u'disqus.com', 498 | u'teepr.com', 499 | u'xcar.com.cn', 500 | u'google.com.ph', 501 | u'dell.com', 502 | u'speedtest.net', 503 | u'capitalone.com', 504 | u'google.no', 505 | u'rutracker.org', 506 | u'evernote.com', 507 | u'amazon.ca', 508 | u'accuweather.com', 509 | u'extratorrent.cc', 510 | u'ebay.in', 511 | u'moz.com', 512 | u'google.com.bd', 513 | u'39.net', 514 | u'techcrunch.com', 515 | u'iqiyi.com', 516 | u'gizmodo.com', 517 | u'newegg.com', 518 | u'photobucket.com', 519 | u'kompas.com', 520 | u'ltn.com.tw', 521 | u'gsmarena.com', 522 | u'cloudfront.net', 523 | u'meetup.com', 524 | u'truemediapipe.com', 525 | u'google.cz', 526 | u'kickstarter.com', 527 | u'varzesh3.com', 528 | u'vipcpms.com', 529 | u'2ch.net', 530 | u'detik.com', 531 | u'trackingclick.net', 532 | u'thefreedictionary.com', 533 | u'codecanyon.net', 534 | u'google.co.hu', 535 | u'libero.it', 536 | u'autohome.com.cn', 537 | u'milliyet.com.tr', 538 | u'ce.cn', 539 | u'infusionsoft.com', 540 | u'tistory.com', 541 | u'webtretho.com', 542 | u'reuters.com', 543 | u'bitly.com', 544 | u'justdial.com', 545 | u'googleapis.com', 546 | u'onlinesbi.com', 547 | u'google.co.il', 548 | u'babytree.com', 549 | u'twimg.com', 550 | u'sahibinden.com', 551 | u'ameba.jp', 552 | u'eksisozluk.com', 553 | u'google.ie', 554 | u'mydomainadvisor.com', 555 | u'reimageplus.com', 556 | u'hudong.com', 557 | u'oracle.com', 558 | u'verizonwireless.com', 559 | u'4shared.com', 560 | u'scribd.com', 561 | u'surveymonkey.com', 562 | u'xda-developers.com', 563 | u'time.com', 564 | u'liputan6.com', 565 | u'att.com', 566 | u'staticwebdom.com', 567 | u'battle.net', 568 | u'paytm.com', 569 | u'ptt01.cc', 570 | u'irs.gov', 571 | u'lady8844.com', 572 | u'eastmoney.com', 573 | u'ebay.com.au', 574 | u'expedia.com', 575 | u'macys.com', 576 | u'tlbb8.com', 577 | u'uploaded.net', 578 | u'privatehomeclips.com', 579 | u'ck101.com', 580 | u'ero-advertising.com', 581 | u'bhaskar.com', 582 | u'free.fr', 583 | u'yandex.ua', 584 | u'taleo.net', 585 | u'ebay.it', 586 | u'olx.in', 587 | u'list-manage.com', 588 | u'sabah.com.tr', 589 | u'blog.jp', 590 | u'theverge.com', 591 | u'lazada.co.id', 592 | u'liveinternet.ru', 593 | u'citibank.com', 594 | u'repubblica.it', 595 | u'nyaa.se', 596 | u'csdn.net', 597 | u'okcupid.com', 598 | u'warofclicks.com', 599 | u'google.com.vn', 600 | u'mlb.com', 601 | u'naukri.com', 602 | u'ganji.com', 603 | u'starsports.com', 604 | u'fbcdn.net', 605 | u'ink361.com', 606 | u'32d1d3b9c.se', 607 | u'nba.com', 608 | u'chaturbate.com', 609 | u'google.cl', 610 | u'exoclick.com', 611 | u'rt.com', 612 | u'ci123.com', 613 | u'retailmenot.com', 614 | u'livedoor.biz', 615 | u'gmarket.co.kr', 616 | u'b5m.com', 617 | u'trulia.com', 618 | u'gap.com', 619 | u'goal.com', 620 | u'mega.co.nz', 621 | u'elpais.com', 622 | u'xywy.com', 623 | u'icloud.com', 624 | u'kayak.com', 625 | u'npr.org', 626 | u'hostgator.com', 627 | u'mobile.de', 628 | u'xe.com', 629 | u'glassdoor.com', 630 | u'mi.com', 631 | u'flirchi.com', 632 | u'kouclo.com', 633 | u'asana.com', 634 | u'odesk.com', 635 | u'58.com', 636 | u'abril.com.br', 637 | u'blogimg.jp', 638 | u'6pm.com', 639 | u'php.net', 640 | u'woot.com', 641 | u'rbc.ru', 642 | u'hotels.com', 643 | u'marca.com', 644 | u'bestadbid.com', 645 | u'styletv.com.cn', 646 | u'eyny.com', 647 | u'chinaz.com', 648 | u'wonderlandads.com', 649 | u'constantcontact.com', 650 | u'google.fi', 651 | u'issuu.com', 652 | u'impress.co.jp', 653 | u'wetransfer.com', 654 | u'buzzfil.net', 655 | u'zippyshare.com', 656 | u'ad132m.com', 657 | u'timeanddate.com', 658 | u'enet.com.cn', 659 | u'gawker.com', 660 | u'doubleclick.com', 661 | u'sex.com', 662 | u'nordstrom.com', 663 | u'clien.net', 664 | u'cnzz.com', 665 | u'slack.com', 666 | u'urbandictionary.com', 667 | u'agoda.com', 668 | u'google.sk', 669 | u'cricbuzz.com', 670 | u'corriere.it', 671 | u'azlyrics.com', 672 | u'asos.com', 673 | u'google.dz', 674 | u'mystartsearch.com', 675 | u'kijiji.ca', 676 | u'hm.com', 677 | u'urdupoint.com', 678 | u'fanli.com', 679 | u'zoho.com', 680 | u'beeg.com', 681 | u'playstation.com', 682 | u'gamefaqs.com', 683 | u'elance.com', 684 | u'tmz.com', 685 | u'elmundo.es', 686 | u'shopclues.com', 687 | u'tabelog.com', 688 | u'airbnb.com', 689 | u'realtor.com', 690 | u'independent.co.uk', 691 | u'eastday.com', 692 | u'appledaily.com.tw', 693 | u'wunderground.com', 694 | u'lowes.com', 695 | u'latimes.com', 696 | u'nike.com', 697 | u'google.dk', 698 | u'eventbrite.com', 699 | u'statcounter.com', 700 | u'drseks.com', 701 | u'streamcloud.eu', 702 | u'ehow.com', 703 | u'onedio.com', 704 | u'it168.com', 705 | u'domaintools.com', 706 | u'irctc.co.in', 707 | u'shopify.com', 708 | u'aparat.com', 709 | u'houzz.com', 710 | u'adsrvmedia.net', 711 | u'savefrom.net', 712 | u'aweber.com', 713 | u'blog.me', 714 | u'goo.gl', 715 | u'vnexpress.net', 716 | u'ucoz.ru', 717 | u'ad120m.com', 718 | u'hatena.ne.jp', 719 | u'donga.com', 720 | u'stockstar.com', 721 | u'4dsply.com', 722 | u'olx.pl', 723 | u'squarespace.com', 724 | u'sberbank.ru', 725 | u'nbcnews.com', 726 | u'slate.com', 727 | u'blogspot.jp', 728 | u'statsmobi.com', 729 | u'11st.co.kr', 730 | u'duckduckgo.com', 731 | u'moneycontrol.com', 732 | u'mercadolibre.com.ar', 733 | u'intoday.in', 734 | u'subito.it', 735 | u'kohls.com', 736 | u'southwest.com', 737 | u'lenovo.com', 738 | u'youtube-mp3.org', 739 | u'ca.gov', 740 | u'box.com', 741 | u'jqw.com', 742 | u'youboy.com', 743 | u'mixi.jp', 744 | u'rottentomatoes.com', 745 | u'facenama.com', 746 | u'albawabhnews.com', 747 | u'hubspot.com', 748 | u'java.com', 749 | u'in.com', 750 | u'pinimg.com', 751 | u'myntra.com', 752 | u'hdzog.com', 753 | u'vcommission.com', 754 | u'nownews.com', 755 | u'udemy.com', 756 | u'ancestry.com', 757 | u'instructables.com', 758 | u'haber7.com', 759 | u'wired.com', 760 | u'taringa.net', 761 | u'hstpnetwork.com', 762 | u'pof.com', 763 | u'kinopoisk.ru', 764 | u'medium.com', 765 | u'shareba.com', 766 | u'etao.com', 767 | u'youdao.com', 768 | u'fidelity.com', 769 | u'ig.com.br', 770 | u'siteadvisor.com', 771 | u'gazeta.pl', 772 | u'zing.vn', 773 | u'neobux.com', 774 | u'subscene.com', 775 | u'sakura.ne.jp', 776 | u'airtel.in', 777 | u'jimdo.com', 778 | u'fh21.com.cn', 779 | u'wikiwiki.jp', 780 | u'wiktionary.org', 781 | u'priceline.com', 782 | u'lenta.ru', 783 | u'jrj.com.cn', 784 | u'verizon.com', 785 | u'nifty.com', 786 | u'bodybuilding.com', 787 | u'behance.net', 788 | u'sabq.org', 789 | u'xunlei.com', 790 | u'kotaku.com', 791 | u'entrepreneur.com', 792 | u'semrush.com', 793 | u'mackeeper.com', 794 | u'lemonde.fr', 795 | u'overstock.com', 796 | u'drudgereport.com', 797 | u'japanpost.jp', 798 | u'mirror.co.uk', 799 | u'digikala.com', 800 | u'allrecipes.com', 801 | u'basecamp.com', 802 | u'youjizz.com', 803 | u'goodgamestudios.com', 804 | u'nydailynews.com', 805 | u'soku.com', 806 | u'ebay.fr', 807 | u'interia.pl', 808 | u'academia.edu', 809 | u'17ok.com', 810 | u'onlylady.com', 811 | u'dianping.com', 812 | u'rednet.cn', 813 | u'elfagr.org', 814 | u'wwwpromoter.com', 815 | u'putlocker.is', 816 | u'vetogate.com', 817 | u'chip.de', 818 | u'ad6media.fr', 819 | u'abcnews.go.com', 820 | u'cracked.com', 821 | u'ero-video.net', 822 | u'junbi-tracker.com', 823 | u'chron.com', 824 | u'slimspots.com', 825 | u'asus.com', 826 | u'prntscr.com', 827 | u'livescore.com', 828 | u'discovercard.com', 829 | u'bluehost.com', 830 | u'torcache.net', 831 | u'ted.com', 832 | u'omiga-plus.com', 833 | u'pcgames.com.cn', 834 | u'change.org', 835 | u'rapidgator.net', 836 | u'ashleyrnadison.com', 837 | u'epweike.com', 838 | u'kooora.com', 839 | u'ticketmaster.com', 840 | u'auction.co.kr', 841 | u'makemytrip.com', 842 | u'souq.com', 843 | u'4chan.org', 844 | u'el-balad.com', 845 | u'freepik.com', 846 | u'twoo.com', 847 | u'voc.com.cn', 848 | u'saramin.co.kr', 849 | u'workercn.cn', 850 | u'google.co.nz', 851 | u'staples.com', 852 | u'cnblogs.com', 853 | u'blackboard.com', 854 | u'myfitnesspal.com', 855 | u'chinaso.com', 856 | u'eonline.com', 857 | u'so-net.ne.jp', 858 | u'yoka.com', 859 | u'seesaa.net', 860 | u'costco.com', 861 | u'hespress.com', 862 | u'renren.com', 863 | u'liveleak.com', 864 | u'shareasale.com', 865 | u'e-hentai.org', 866 | u'news.com.au', 867 | u'people.com', 868 | u'faithtap.com', 869 | u'ewt.cc', 870 | u'bhphotovideo.com', 871 | u'douyutv.com', 872 | u'm-w.com', 873 | u'battlefield.com', 874 | u'kinogo.net', 875 | u'vine.co', 876 | u'terra.com.br', 877 | u'lefigaro.fr', 878 | u'united.com', 879 | u'efix.com', 880 | u'android.com', 881 | u'patch.com', 882 | u'gutefrage.net', 883 | u'sears.com', 884 | u'custhelp.com', 885 | u'zulily.com', 886 | u'vi-view.com', 887 | u'folha.uol.com.br', 888 | u'disney.go.com', 889 | u'pcpop.com', 890 | u'westernjournalism.com', 891 | u'biglobe.ne.jp', 892 | u'adp.com', 893 | u'kdnet.net', 894 | u'zappos.com', 895 | u'milanuncios.com', 896 | u'digg.com', 897 | u'mint.com', 898 | u'atwiki.jp', 899 | u'focus.de', 900 | u'backpage.com', 901 | u'billdesk.com', 902 | u'315che.com', 903 | u'tinyurl.com', 904 | u'babycenter.com', 905 | u'22find.com', 906 | u'cnbc.com', 907 | u'primewire.ag', 908 | u'io9.com', 909 | u'cookpad.com', 910 | u'yhd.com', 911 | u'swagbucks.com', 912 | u'r10.net', 913 | u'coursera.org', 914 | u'bukalapak.com', 915 | u'leagueoflegends.com', 916 | u'delta.com', 917 | u'gyazo.com', 918 | u'ibm.com', 919 | u'momoshop.com.tw', 920 | u'comcast.com', 921 | u'webex.com', 922 | u'sfgate.com', 923 | u'atlassian.net', 924 | u'foodnetwork.com', 925 | u'sbnation.com', 926 | u'nate.com', 927 | u'nikkei.com', 928 | u'novinky.cz', 929 | u'pcmag.com', 930 | u'marketwatch.com', 931 | u'globososo.com', 932 | u'xbox.com', 933 | u'mihanblog.com', 934 | u'icc-cricket.com', 935 | u'scoop.it', 936 | u'ruten.com.tw', 937 | u'trovi.com', 938 | u'mit.edu', 939 | u'wav.tv', 940 | u'huaban.com', 941 | u'126.com', 942 | u'nhl.com', 943 | u'bookmyshow.com', 944 | u'commentcamarche.net', 945 | u'mynavi.jp', 946 | u'sh.st', 947 | u'sciencedirect.com', 948 | u'ytimg.com', 949 | u'rightmove.co.uk', 950 | u'olx.com.br', 951 | u'w3.org', 952 | u'weblio.jp', 953 | u'tomshardware.com', 954 | u'www.gov.uk', 955 | u'nhk.or.jp', 956 | u'aa.com', 957 | u'filehippo.com', 958 | u'mynet.com', 959 | u'web.tv', 960 | u'sulekha.com', 961 | u'exblog.jp', 962 | u'superuser.com', 963 | u'shaadi.com', 964 | u'histats.com', 965 | u'letv.com', 966 | u'williamhill.com', 967 | u'ncaa.com', 968 | u'mackolik.com', 969 | u'india.com', 970 | u'google.rs', 971 | u'indianexpress.com', 972 | u'stanford.edu', 973 | u'tagged.com', 974 | u'pchome.com.tw', 975 | u'usaa.com', 976 | u'gofundme.com', 977 | u'k618.cn', 978 | u'blogspot.de', 979 | u'myfreecams.com', 980 | u'reverso.net', 981 | u'usmagazine.com', 982 | u'porn.com', 983 | u'pch.com', 984 | u'farsnews.com', 985 | u'dreamstime.com', 986 | u'104.com.tw', 987 | u'torrentz.in', 988 | u'livestrong.com', 989 | u'tnaflix.com', 990 | u'instructure.com', 991 | u'gemius.pl', 992 | u'dangdang.com', 993 | u'sky.com', 994 | u'match.com', 995 | u'investopedia.com', 996 | u'thehindu.com', 997 | u'informer.com', 998 | u'zhaopin.com', 999 | u'altervista.org', 1000 | u'lequipe.fr', 1001 | u'cbslocal.com', 1002 | u'ixxx.com', 1003 | u'gamespot.com', 1004 | u'europa.eu', 1005 | u'sfr.fr', 1006 | u'movie4k.to', 1007 | u'cisco.com', 1008 | u'yodobashi.com', 1009 | u'hilton.com', 1010 | u'yandex.com.tr', 1011 | u'indiegogo.com', 1012 | u'mayoclinic.org', 1013 | u'bidvertiser.com', 1014 | u'acfun.tv', 1015 | u'scoopwhoop.com', 1016 | u'usbank.com', 1017 | u'sahadan.com', 1018 | u'tutorialspoint.com', 1019 | u'ad4game.com', 1020 | u'pagesjaunes.fr', 1021 | u'themidnightmatulas.com', 1022 | u'gamepedia.com', 1023 | u't-mobile.com', 1024 | u'safehomepage.com', 1025 | u'adme.ru', 1026 | u'vk.me', 1027 | u'ccb.com', 1028 | u'coupons.com', 1029 | u'cdiscount.com', 1030 | u'offpageads.com', 1031 | u'monster.com', 1032 | u'roblox.com', 1033 | u'xing.com', 1034 | u'cbsnews.com', 1035 | u'google.hr', 1036 | u'tsite.jp', 1037 | u'techradar.com', 1038 | u'ria.ru', 1039 | u'abplive.in', 1040 | u'forever21.com', 1041 | u'filmon-ads.com', 1042 | u'deadspin.com', 1043 | u'ampclicks.com', 1044 | u'google.bg', 1045 | u'gumtree.com', 1046 | u'lolwot.com', 1047 | u'dafont.com', 1048 | u'geocities.jp', 1049 | u'oyunskor.com', 1050 | u'inc.com', 1051 | u'padsdel.com', 1052 | u'drtuber.com', 1053 | u'motherless.com', 1054 | u'fitbit.com', 1055 | u'chexun.com', 1056 | u'tribunnews.com', 1057 | u'123rf.com', 1058 | u'sharepoint.com', 1059 | u'ew.com', 1060 | u'howtogeek.com', 1061 | u'friv.com', 1062 | u'zara.com', 1063 | u'4399.com', 1064 | u'thedailybeast.com', 1065 | u'nfl.com', 1066 | u'warriorforum.com', 1067 | u'ringring.vn', 1068 | u'suning.com', 1069 | u'asahi.com', 1070 | u'almasryalyoum.com', 1071 | u'himado.in', 1072 | u'redfin.com', 1073 | u'free-tv-video-online.info', 1074 | u'tabnak.ir', 1075 | u'hubpages.com', 1076 | u'sapo.pt', 1077 | u'indiamart.com', 1078 | u'12306.cn', 1079 | u'189.cn', 1080 | u'cbc.ca', 1081 | u'jmpdirect01.com', 1082 | u'olx.ua', 1083 | u'gismeteo.ru', 1084 | u'nypost.com', 1085 | u'istockphoto.com', 1086 | u'adultfriendfinder.com', 1087 | u'slrclub.com', 1088 | u'foursquare.com', 1089 | u'intel.com', 1090 | u'qianlong.com', 1091 | u'21cn.com', 1092 | u'2345.com', 1093 | u'immobilienscout24.de', 1094 | u'eenadu.net', 1095 | u'apache.org', 1096 | u'itmedia.co.jp', 1097 | u'searchengineland.com', 1098 | u'all2lnk.com', 1099 | u'harvard.edu', 1100 | u'as.com', 1101 | u'yellowpages.com', 1102 | u'pixabay.com', 1103 | u'researchgate.net', 1104 | u'popsugar.com', 1105 | u'imobile.com.cn', 1106 | u'androidcentral.com', 1107 | u'mercadolibre.com.mx', 1108 | u'quizlet.com', 1109 | u'allocine.fr', 1110 | u'pantip.com', 1111 | u'smh.com.au', 1112 | u'who.is', 1113 | u'linkwithin.com', 1114 | u'freelancer.com', 1115 | u'getbootstrap.com', 1116 | u'cpasbien.pw', 1117 | u'typepad.com', 1118 | u'prothom-alo.com', 1119 | u'clipconverter.cc', 1120 | u'yam.com', 1121 | u'sporx.com', 1122 | u'tutsplus.com', 1123 | u'nbcsports.com', 1124 | u'ebay.ca', 1125 | u'cbs.com', 1126 | u'caixa.gov.br', 1127 | u'distractify.com', 1128 | u'virgilio.it', 1129 | u'rakuten.ne.jp', 1130 | u'e97527f0.se', 1131 | u'lanacion.com.ar', 1132 | u'gigazine.net', 1133 | u'internethaber.com', 1134 | u'rakuten.com', 1135 | u'ultimate-guitar.com', 1136 | u'tripadvisor.co.uk', 1137 | u'rutor.org', 1138 | u'reduxmediia.com', 1139 | u'ashleymadison.com', 1140 | u'gittigidiyor.com', 1141 | u'itau.com.br', 1142 | u'nocookie.net', 1143 | u'gigacircle.com', 1144 | u'whitepages.com', 1145 | u'makeuseof.com', 1146 | u'wiley.com', 1147 | u'indianrail.gov.in', 1148 | u'theatlantic.com', 1149 | u'hh.ru', 1150 | u'food.com', 1151 | u'marktplaats.nl', 1152 | u'weather.gov', 1153 | u'marriott.com', 1154 | u'masrawy.com', 1155 | u'idnes.cz', 1156 | u'liveperson.net', 1157 | u'chiphell.com', 1158 | u'blogspot.mx', 1159 | u'51job.com', 1160 | u'deezer.com', 1161 | u'nationalgeographic.com', 1162 | u'mediaplex.com', 1163 | u'dict.cc', 1164 | u'24h.com.vn', 1165 | u'6park.com', 1166 | u'citrixonline.com', 1167 | u'ebates.com', 1168 | u'gazzetta.it', 1169 | u'iminent.com', 1170 | u'jumia.com.ng', 1171 | u'finstorieslive.com', 1172 | u'qingdaonews.com', 1173 | u'imagebam.com', 1174 | u'focus.cn', 1175 | u'jobrapido.com', 1176 | u'tahrirnews.com', 1177 | u'leadpages.net', 1178 | u'worldstarhiphop.com', 1179 | u'am15.net', 1180 | u'cnmo.com', 1181 | u'clarin.com', 1182 | u'getpocket.com', 1183 | u'what-character-are-you.com', 1184 | u'popupads.ir', 1185 | u'bt.com', 1186 | u'jquery.com', 1187 | u'delta-homes.com', 1188 | u'images-amazon.com', 1189 | u'google.com.ec', 1190 | u'today.com', 1191 | u'ccebba93.se', 1192 | u'cocolog-nifty.com', 1193 | u'emol.com', 1194 | u'salon.com', 1195 | u'ukr.net', 1196 | u'hypergames.net', 1197 | u'lightinthebox.com', 1198 | u'blackhatworld.com', 1199 | u'viralnova.com', 1200 | u'hardsextube.com', 1201 | u'blogspot.ru', 1202 | u'thenextweb.com', 1203 | u'state.gov', 1204 | u'adrotator.se', 1205 | u'thekitchn.com', 1206 | u'wayfair.com', 1207 | u'mercadolibre.com.ve', 1208 | u'wetter.com', 1209 | u'firstpost.com', 1210 | u'ensonhaber.com', 1211 | u'anitube.se', 1212 | u'sozcu.com.tr', 1213 | u'topix.com', 1214 | u'interpark.com', 1215 | u'mtv.com', 1216 | u'brainyquote.com', 1217 | u'americanas.com.br', 1218 | u'lapatilla.com', 1219 | u'barnesandnoble.com', 1220 | u'solarmovie.is', 1221 | u'macrumors.com', 1222 | u'audible.com', 1223 | u'fanfiction.net', 1224 | u'envato.com', 1225 | u'littlethings.com', 1226 | u'rarbg.com', 1227 | u'habrahabr.ru', 1228 | u'prezi.com', 1229 | u'bitbucket.org', 1230 | u'mediaset.it', 1231 | u'yts.to', 1232 | u'nikkeibp.co.jp', 1233 | u'webcrawler.com', 1234 | u'rtl.de', 1235 | u'yomiuri.co.jp', 1236 | u'dropboxusercontent.com', 1237 | u'ryanair.com', 1238 | u'usnews.com', 1239 | u'vg.no', 1240 | u'google.com.do', 1241 | u'hatenablog.com', 1242 | u'donanimhaber.com', 1243 | u'dribbble.com', 1244 | u'norton.com', 1245 | u'postimg.org', 1246 | u'investing.com', 1247 | u'hollywoodreporter.com', 1248 | u'elitedaily.com', 1249 | u'clicksvenue.com', 1250 | u'iconosquare.com', 1251 | u'jcpenney.com', 1252 | u'idealo.de', 1253 | u'xtube.com', 1254 | u'olx.co.id', 1255 | u'search-simple.com', 1256 | u'rollingstone.com', 1257 | u'videomega.tv', 1258 | u'nairaland.com', 1259 | u'mbc.net', 1260 | u'ctrip.com', 1261 | u'mapquest.com', 1262 | u'rei.com', 1263 | u'media-fire.org', 1264 | u'arstechnica.com', 1265 | u'zone-telechargement.com', 1266 | u'getresponse.com', 1267 | u'complex.com', 1268 | u'525j.com.cn', 1269 | u'nasa.gov', 1270 | u'zedo.com', 1271 | u'click4stat.com', 1272 | u'cnbeta.com', 1273 | u'zomato.com', 1274 | u'ly.net', 1275 | u'wwe.com', 1276 | u'beytoote.com', 1277 | u'jsfiddle.net', 1278 | u'discuss.com.hk', 1279 | u'persianblog.ir', 1280 | u'google.lk', 1281 | u'google.lt', 1282 | u'graphicriver.net', 1283 | u'microsoftstore.com', 1284 | u'114la.com', 1285 | u'uber.com', 1286 | u'netteller.com', 1287 | u'kinox.to', 1288 | u'jezebel.com', 1289 | u'9gag.tv', 1290 | u'linternaute.com', 1291 | u'upworthy.com', 1292 | u'ldblog.jp', 1293 | u'qualtrics.com', 1294 | u'biblegateway.com', 1295 | u'ocn.ne.jp', 1296 | u'alarabiya.net', 1297 | u'2chblog.jp', 1298 | u'narod.ru', 1299 | u'correios.com.br', 1300 | u'vox.com', 1301 | u'picmonkey.com', 1302 | u'elwatannews.com', 1303 | u'bahn.de', 1304 | u'heise.de', 1305 | u'feng.com', 1306 | u'bloglovin.com', 1307 | u'elegantthemes.com', 1308 | u'windowsphone.com', 1309 | u'lazada.vn', 1310 | u'abc.net.au', 1311 | u'elmogaz.com', 1312 | u'haberturk.com', 1313 | u'dhgate.com', 1314 | u'td.com', 1315 | u'istartsurf.com', 1316 | u'gazetaexpress.com', 1317 | u'ea.com', 1318 | u'walmart.com.br', 1319 | u'wp.com', 1320 | u'pbs.org', 1321 | u'theblaze.com', 1322 | u'vodlocker.com', 1323 | u'dubizzle.com', 1324 | u'nu.nl', 1325 | u'merdeka.com', 1326 | u'hindustantimes.com', 1327 | u'wattpad.com', 1328 | u'similarweb.com', 1329 | u'cc.com', 1330 | u'haiwainet.cn', 1331 | u'jagran.com', 1332 | u'tesco.com', 1333 | u'discogs.com', 1334 | u'spankwire.com', 1335 | u'ptt.cc', 1336 | u'360doc.com', 1337 | u'joomla.org', 1338 | u'mydrivers.com', 1339 | u'wiocha.pl', 1340 | u'juksy.com', 1341 | u'autoscout24.de', 1342 | u'citibank.co.in', 1343 | u'crunchbase.com', 1344 | u'axisbank.com', 1345 | u'todayhumor.co.kr', 1346 | u'iciba.com', 1347 | u'flightradar24.com', 1348 | u'infobae.com', 1349 | u'gaana.com', 1350 | u'clixsense.com', 1351 | u'couchtuner.eu', 1352 | u'cityadspix.com', 1353 | u'chefkoch.de', 1354 | u'video-one.com', 1355 | u'thechive.com', 1356 | u'walgreens.com', 1357 | u'popmog.com', 1358 | u'conduit.com', 1359 | u'bedbathandbeyond.com', 1360 | u'sitepoint.com', 1361 | u'list-manage1.com', 1362 | u'ebay.es', 1363 | u'katproxy.com', 1364 | u'boredpanda.com', 1365 | u'garanti.com.tr', 1366 | u'alfalfalfa.com', 1367 | u'iflscience.com', 1368 | u'brassring.com', 1369 | u'jalopnik.com', 1370 | u'biobiochile.cl', 1371 | u'amoory.com', 1372 | u'clickbank.com', 1373 | u'codeproject.com', 1374 | u'vente-privee.com', 1375 | u'3dmgame.com', 1376 | u'templatemonster.com', 1377 | u'skysports.com', 1378 | u'otto.de', 1379 | u'pole-emploi.fr', 1380 | u'askmebazaar.com', 1381 | u'lynda.com', 1382 | u'suara.com', 1383 | u'myway.com', 1384 | u'digitaltrends.com', 1385 | u'qidian.com', 1386 | u'bizjournals.com', 1387 | u'shaparak.ir', 1388 | u'ed.gov', 1389 | u'legacy.com', 1390 | u'piriform.com', 1391 | u'creditkarma.com', 1392 | u'autotrader.com', 1393 | u'finn.no', 1394 | u'vip.com', 1395 | u'blomaga.jp', 1396 | u'mbank.com.pl', 1397 | u'askubuntu.com', 1398 | u'gumtree.com.au', 1399 | u'telegraaf.nl', 1400 | u'bandcamp.com', 1401 | u'zeroredirect1.com', 1402 | u'ahrefs.com', 1403 | u'toysrus.com', 1404 | u'tripadvisor.in', 1405 | u'jalan.net', 1406 | u'seasonvar.ru', 1407 | u'super.cz', 1408 | u'websta.me', 1409 | u'oneindia.com', 1410 | u'mpnrs.com', 1411 | u'lotour.com', 1412 | u'yaplakal.com', 1413 | u'gamersky.com', 1414 | u'welt.de', 1415 | u'klikbca.com', 1416 | u'cloudflare.com', 1417 | u'babylon.com', 1418 | u'all-free-download.com', 1419 | u'o2.pl', 1420 | u'pcworld.com', 1421 | u'kuronekoyamato.co.jp', 1422 | u'mydala.com', 1423 | u'credit-agricole.fr', 1424 | u'drugs.com', 1425 | u'sephora.com', 1426 | u'blogspot.tw', 1427 | u'redirectvoluum.com', 1428 | u'weheartit.com', 1429 | u'xkcd.com', 1430 | u'doodle.com', 1431 | u'turbobit.net', 1432 | u'fang.com', 1433 | u'genius.com', 1434 | u'namecheap.com', 1435 | u'babal.net', 1436 | u'megaoferta.net', 1437 | u'friendlife.com', 1438 | u'springer.com', 1439 | u'zopim.com', 1440 | u'fool.com', 1441 | u'vesti.ru', 1442 | u'pixlr.com', 1443 | u'schwab.com', 1444 | u'buscape.com.br', 1445 | u'royalbank.com', 1446 | u'pnc.com', 1447 | u'nudevista.com', 1448 | u'dhl.de', 1449 | u'gazeta.ru', 1450 | u'capitalone360.com', 1451 | u'gilt.com', 1452 | u'staticflickr.com', 1453 | u'speedanalysis.net', 1454 | u'shutterfly.com', 1455 | u'hid.im', 1456 | u'auto.ru', 1457 | u'track300.com', 1458 | u'digitalocean.com', 1459 | u'aftonbladet.se', 1460 | u'gotowebinar.com', 1461 | u'howstuffworks.com', 1462 | u'mail.com', 1463 | u'kioskea.net', 1464 | u'niuche.com', 1465 | u'suik.info', 1466 | u'plarium.com', 1467 | u'google.com.kw', 1468 | u'crsdrz.com', 1469 | u'linkbucks.com', 1470 | u'takungpao.com', 1471 | u'moba-stream.com', 1472 | u'mgid.com', 1473 | u'canva.com', 1474 | u'tgbus.com', 1475 | u'kbb.com', 1476 | u'abc.es', 1477 | u'gamestop.com', 1478 | u'over-blog.com', 1479 | u'iherb.com', 1480 | u'codepen.io', 1481 | u'vsuch.com', 1482 | u'meituan.com', 1483 | u'lun.com', 1484 | u'jia.com', 1485 | u'yadi.sk', 1486 | u'dmv.org', 1487 | u'tomsguide.com', 1488 | u'leo.org', 1489 | u'chekb.com', 1490 | u's2d6.com', 1491 | u'homeway.com.cn', 1492 | u'city-data.com', 1493 | u'dpreview.com', 1494 | u'mangahere.co', 1495 | u'adweek.com', 1496 | u'labanquepostale.fr', 1497 | u'drom.ru', 1498 | u'gotomeeting.com', 1499 | u'fandango.com', 1500 | u'serverfault.com', 1501 | u'ibtimes.com', 1502 | u'jin115.com', 1503 | u'bankmellat.ir', 1504 | u'carview.co.jp', 1505 | u'netshoes.com.br', 1506 | u'excite.co.jp', 1507 | u'serving-sys.com', 1508 | u'jeuxvideo.com', 1509 | u'rozblog.com', 1510 | u'chicagotribune.com', 1511 | u'inquirer.net', 1512 | u'orbitz.com', 1513 | u'sprint.com', 1514 | u'echo.msk.ru', 1515 | u'n11.com', 1516 | u'line.me', 1517 | u'cars.com', 1518 | u'smallseotools.com', 1519 | u'r7.com', 1520 | u'herokuapp.com', 1521 | u'pingdom.com', 1522 | u'cheezburger.com', 1523 | u'y8.com', 1524 | u'lego.com', 1525 | u'keezmovies.com', 1526 | u'yourlust.com', 1527 | u'bankrate.com', 1528 | u'argos.co.uk', 1529 | u'20minutes.fr', 1530 | u'lockerdome.com', 1531 | u'ycombinator.com', 1532 | u'xiami.com', 1533 | u'qslpdk.com', 1534 | u'aljazeera.net', 1535 | u'lacaixa.es', 1536 | u'victoriassecret.com', 1537 | u'thisav.com', 1538 | u'flirt4free.com', 1539 | u'hellou.co.uk', 1540 | u'uploadable.ch', 1541 | u'mangafox.me', 1542 | u'ecnavi.jp', 1543 | u'shine.com', 1544 | u'zougla.gr', 1545 | u'watchseriestv.to', 1546 | u'refinery29.com', 1547 | u'sponichi.co.jp', 1548 | u'cvs.com', 1549 | u'livingsocial.com', 1550 | u'gstatic.com', 1551 | u'yandex.by', 1552 | u'berkeley.edu', 1553 | u'google.by', 1554 | u'careerbuilder.com', 1555 | u'dropbooks.tv', 1556 | u'hepsiburada.com', 1557 | u'lloydsbank.co.uk', 1558 | u'bufferapp.com', 1559 | u'metrolyrics.com', 1560 | u'kongregate.com', 1561 | u'sueddeutsche.de', 1562 | u'crunchyroll.com', 1563 | u'esuteru.com', 1564 | u'google.si', 1565 | u'uniqlo.com', 1566 | u'gtmetrix.com', 1567 | u'nuvid.com', 1568 | u'jcrew.com', 1569 | u'myanimelist.net', 1570 | u'ft.com', 1571 | u'gnavi.co.jp', 1572 | u'panasonic.jp', 1573 | u'radikal.com.tr', 1574 | u'orf.at', 1575 | u'ubuntu.com', 1576 | u'slashdot.org', 1577 | u'programme-tv.net', 1578 | u'docin.com', 1579 | u'viva.co.id', 1580 | u'anjuke.com', 1581 | u'seekingalpha.com', 1582 | u'billboard.com', 1583 | u'adxcore.com', 1584 | u'52pk.net', 1585 | u'mysql.com', 1586 | u'phonearena.com', 1587 | u'otomoto.pl', 1588 | u'alohatube.com', 1589 | u'smartshopping.com', 1590 | u'giphy.com', 1591 | u'tinypic.com', 1592 | u'metacritic.com', 1593 | u'garmin.com', 1594 | u'esporte.uol.com.br', 1595 | u'f54d6bf2b1.se', 1596 | u'ilmeteo.it', 1597 | u'hsbc.co.uk', 1598 | u'ccidnet.com', 1599 | u'newyorker.com', 1600 | u'unity3d.com', 1601 | u'tunein.com', 1602 | u'fortune.com', 1603 | u'freelotto.com', 1604 | u'tvguide.com', 1605 | u'yenisafak.com.tr', 1606 | u'cam4.com', 1607 | u'southcn.com', 1608 | u'qunar.com', 1609 | u'craigslist.ca', 1610 | u'axisbank.co.in', 1611 | u'imagefap.com', 1612 | u'commbank.com.au', 1613 | u'infoseek.co.jp', 1614 | u'nipic.com', 1615 | u'last.fm', 1616 | u'panet.co.il', 1617 | u'qvc.com', 1618 | u'ynet.co.il', 1619 | u'aliyun.com', 1620 | u'500px.com', 1621 | u'airasia.com', 1622 | u'aeriagames.com', 1623 | u'ning.com', 1624 | u'polyvore.com', 1625 | u'ngacn.cc', 1626 | u'kimiss.com', 1627 | u'discover.com', 1628 | u'hamariweb.com', 1629 | u'wpmudev.org', 1630 | u'angel.co', 1631 | u'1and1.com', 1632 | u'webs.com', 1633 | u'1und1.de', 1634 | u'ad123m.com', 1635 | u'sp.gov.br', 1636 | u'earthlink.net', 1637 | u'dlsite.com', 1638 | u'depositphotos.com', 1639 | u'danawa.com', 1640 | u'pogo.com', 1641 | u'vistaprint.com', 1642 | u'dagbladet.no', 1643 | u'zeit.de', 1644 | u'tobogo.net', 1645 | u'dx.com', 1646 | u'noticias.uol.com.br', 1647 | u'avira.com', 1648 | u'baiducontent.com', 1649 | u'mufg.jp', 1650 | u'abc.go.com', 1651 | u'grantland.com', 1652 | u'avclub.com', 1653 | u'banggood.com', 1654 | u'okwave.jp', 1655 | u'woothemes.com', 1656 | u'elcomercio.pe', 1657 | u'webmoney.ru', 1658 | u'opentable.com', 1659 | u'mangareader.net', 1660 | u'nosub.tv', 1661 | u'zozo.jp', 1662 | u'megafilmeshd.net', 1663 | u'yahoo-mbga.jp', 1664 | u'vrbo.com', 1665 | u'postbank.de', 1666 | u'chosun.com', 1667 | u'nouvelobs.com', 1668 | u'noaa.gov', 1669 | u'blogsky.com', 1670 | u'criteo.com', 1671 | u'4pda.ru', 1672 | u'kizi.com', 1673 | u'miniclip.com', 1674 | u'sakshi.com', 1675 | u'psu.edu', 1676 | u'economist.com', 1677 | u'edmunds.com', 1678 | u'kapanlagi.com', 1679 | u'pcanalysis.net', 1680 | u'mplife.com', 1681 | u'screencast.com', 1682 | u'icims.com', 1683 | u'haberler.com', 1684 | u'2ch-c.net', 1685 | u'sharelive.net', 1686 | u'onlinecreditcenter6.com', 1687 | u't411.io', 1688 | u'voyages-sncf.com', 1689 | u'zergnet.com', 1690 | u'p5w.net', 1691 | u'cloob.com', 1692 | u'elconfidencial.com', 1693 | u'bradesco.com.br', 1694 | u'7apps.me', 1695 | u'uptobox.com', 1696 | u'whois.com', 1697 | u'autotrader.co.uk', 1698 | u'wowhead.com', 1699 | u'venturebeat.com', 1700 | u'ceneo.pl', 1701 | u'fotostrana.ru', 1702 | u'bbb.org', 1703 | u'opensubtitles.org', 1704 | u'barclaycardus.com', 1705 | u'syosetu.com', 1706 | u'picofile.com', 1707 | u'geektoprofessional.com', 1708 | u'asriran.com', 1709 | u'pikabu.ru', 1710 | u'bioyun.com', 1711 | u'v9.com', 1712 | u'dostor.org', 1713 | u'hongkiat.com', 1714 | u'keepvid.com', 1715 | u'mtime.com', 1716 | u'skyscanner.net', 1717 | u'starbucks.com', 1718 | u'way2sms.com', 1719 | u'blogspot.com.ar', 1720 | u'qz.com', 1721 | u'zdnet.com', 1722 | u'oschina.net', 1723 | u'virginmedia.com', 1724 | u'internetdownloadmanager.com', 1725 | u'plurk.com', 1726 | u'cosmopolitan.com', 1727 | u'sports747.com', 1728 | u'gumtree.co.za', 1729 | u'mcafee.com', 1730 | u'sanook.com', 1731 | u'manta.com', 1732 | u'wpbeginner.com', 1733 | u'freegameszonetab.com', 1734 | u'bab.la', 1735 | u'sony.jp', 1736 | u'vgsgaming-ads.com', 1737 | u'dcinside.com', 1738 | u'tvn24.pl', 1739 | u'etorrent.co.kr', 1740 | u'cleartrip.com', 1741 | u'trgino.com', 1742 | u'teknoter.com', 1743 | u'dw.de', 1744 | u'foxsports.com', 1745 | u'xsrving.com', 1746 | u'cmbchina.com', 1747 | u'acesse.com', 1748 | u'sweet-page.com', 1749 | u'pastebin.com', 1750 | u'lg.com', 1751 | u'pcgamer.com', 1752 | u'chinabyte.com', 1753 | u'politico.com', 1754 | u'bookryanair.com', 1755 | u'wargaming.net', 1756 | u'duba.com', 1757 | u'porntube.com', 1758 | u'voyeurhit.com', 1759 | u'drive2.ru', 1760 | u'5278.cc', 1761 | u'dailykos.com', 1762 | u'metro.co.uk', 1763 | u'dantri.com.vn', 1764 | u'azet.sk', 1765 | u'jw.org', 1766 | u'premierleague.com', 1767 | u'xueqiu.com', 1768 | u'caixin.com', 1769 | u'zhibo8.cc', 1770 | u'rockstargames.com', 1771 | u'softpedia.com', 1772 | u'zazzle.com', 1773 | u'dhl.com', 1774 | u'ustream.tv', 1775 | u'sanjesh.org', 1776 | u'weather.com.cn', 1777 | u'breitbart.com', 1778 | u'fotolia.com', 1779 | u'sape.ru', 1780 | u'teamviewer.com', 1781 | u'ucoz.com', 1782 | u'affclicker.com', 1783 | u'iplt20.com', 1784 | u'uproxx.com', 1785 | u'gamme.com.tw', 1786 | u'tigerdirect.com', 1787 | u'netdna-cdn.com', 1788 | u'filmweb.pl', 1789 | u'easyjet.com', 1790 | u'humblebundle.com', 1791 | u'ecollege.com', 1792 | u'blogspot.kr', 1793 | u'ny.gov', 1794 | u'viadeo.com', 1795 | u'videodownloadconverter.com', 1796 | u'aastocks.com', 1797 | u'miniinthebox.com', 1798 | u'google.lv', 1799 | u'cj.com', 1800 | u'homeaway.com', 1801 | u'medicinenet.com', 1802 | u'peyvandha.ir', 1803 | u'myspace.com', 1804 | u'zimbio.com', 1805 | u'rackspace.com', 1806 | u'gumtree.pl', 1807 | u'nextmedia.com', 1808 | u'estadao.com.br', 1809 | u'popmyads.com', 1810 | u'theweathernetwork.com', 1811 | u'kinja.com', 1812 | u'blog.com', 1813 | u'umich.edu', 1814 | u'samsclub.com', 1815 | u'mysmartprice.com', 1816 | u'delicious.com', 1817 | u'greatandhra.com', 1818 | u'say-move.org', 1819 | u'fishki.net', 1820 | u'citi.com', 1821 | u'smi2.ru', 1822 | u'99acres.com', 1823 | u'submarino.com.br', 1824 | u'cliponyu.com', 1825 | u'junglee.com', 1826 | u'shahrekhabar.com', 1827 | u'bdnews24.com', 1828 | u'vporn.com', 1829 | u'wunderlist.com', 1830 | u'uludagsozluk.com', 1831 | u'segundamano.es', 1832 | u'euromillionairesystem.tv', 1833 | u'adultadworld.com', 1834 | u'privilegesbox.net', 1835 | u'trafficserving.com', 1836 | u'admaimai.com', 1837 | u'default-search.net', 1838 | u'sportskeeda.com', 1839 | u'trovigo.com', 1840 | u'ehowenespanol.com', 1841 | u'state.tx.us', 1842 | u'reclameaqui.com.br', 1843 | u'naij.com', 1844 | u'editor.wix.com', 1845 | u'grooveshark.com', 1846 | u'ilfattoquotidiano.it', 1847 | u'duolingo.com', 1848 | u'aljazeera.com', 1849 | u'douguo.com', 1850 | u'yiqifa.com', 1851 | u'fazenda.gov.br', 1852 | u'nature.com', 1853 | u'mic.com', 1854 | u'publishthis.com', 1855 | u'urbanoutfitters.com', 1856 | u'nymag.com', 1857 | u'v1.cn', 1858 | u'khanacademy.org', 1859 | u'greatergood.com', 1860 | u'drupal.org', 1861 | u'realestate.com.au', 1862 | u'mobtada.com', 1863 | u'delta-search.com', 1864 | u'traidnt.net', 1865 | u'nitroflare.com', 1866 | u'1337x.to', 1867 | u'brazzers.com', 1868 | u'mtsindia.in', 1869 | u'santander.co.uk', 1870 | u'huihui.cn', 1871 | u'infospace.com', 1872 | u'liveadoptimizer.com', 1873 | u'xiaomi.com', 1874 | u'ppstream.com', 1875 | u'115.com', 1876 | u'examiner.com', 1877 | u'couchtuner.eu.com', 1878 | u'duowan.com', 1879 | u'paidverts.com', 1880 | u'fanpage.gr', 1881 | u'storypick.com', 1882 | u'fatwallet.com', 1883 | u'kissanime.com', 1884 | u'linio.com.mx', 1885 | u'51cto.com', 1886 | u'zwaar.net', 1887 | u'ozon.ru', 1888 | u'traveltune.com', 1889 | u'inmotionhosting.com', 1890 | u'telekom.com', 1891 | u'poste.it', 1892 | u'wpengine.com', 1893 | u'magicbricks.com', 1894 | u'urbanspoon.com', 1895 | u'nbc.com', 1896 | u'konga.com', 1897 | u'telegraf.com.ua', 1898 | u'ulmart.ru', 1899 | u'ibtimes.co.uk', 1900 | u'n-tv.de', 1901 | u'aizhan.com', 1902 | u'familydoctor.com.cn', 1903 | u'nguoiduatin.vn', 1904 | u'tiscali.it', 1905 | u'yandex.kz', 1906 | u'jugem.jp', 1907 | u'national-lottery.co.uk', 1908 | u'patheos.com', 1909 | u'paytm.in', 1910 | u'purdue.edu', 1911 | u'cornell.edu', 1912 | u'quicksprout.com', 1913 | u'misr5.com', 1914 | u'filmibeat.com', 1915 | u'mashreghnews.ir', 1916 | u'windows.net', 1917 | u'newsmth.net', 1918 | u'fastcompany.com', 1919 | u'fanhuan.com', 1920 | u'bb.com.br', 1921 | u'dawn.com', 1922 | u'cambridge.org', 1923 | u'egrana.com.br', 1924 | u'888casino.com', 1925 | u'nrk.no', 1926 | u'francetvinfo.fr', 1927 | u'kompasiana.com', 1928 | u'kienthuc.net.vn', 1929 | u'ilbe.com', 1930 | u'archiveofourown.org', 1931 | u'profitboosterapp.com', 1932 | u'nguyentandung.org', 1933 | u'ad127m.com', 1934 | u'ranker.com', 1935 | u'vmware.com', 1936 | u'vanguard.com', 1937 | u'css-tricks.com', 1938 | u'soccerway.com', 1939 | u'shop-pro.jp', 1940 | u'4cdn.org', 1941 | u'id.net', 1942 | u'iconfinder.com', 1943 | u'tripadvisor.it', 1944 | u'tokopedia.com', 1945 | u'priceminister.com', 1946 | u'variety.com', 1947 | u'ascii.jp', 1948 | u'codecademy.com', 1949 | u'manoramaonline.com', 1950 | u'joins.com', 1951 | u'collegehumor.com', 1952 | u'autoblog.com', 1953 | u'actcorp.in', 1954 | u'eluniversal.com.mx', 1955 | u'toptenreviews.com', 1956 | u'forgeofempires.com', 1957 | u'index.hu', 1958 | u'freshbooks.com', 1959 | u'starwoodhotels.com', 1960 | u'tv.com', 1961 | u'ksl.com', 1962 | u'neogaf.com', 1963 | u'kicker.de', 1964 | u'clickadu.com', 1965 | u'korabia.com', 1966 | u'timewarnercable.com', 1967 | u'totaladperformance.com', 1968 | u'akhbarelyom.com', 1969 | u'csgolounge.com', 1970 | u'meishichina.com', 1971 | u'pornmd.com', 1972 | u'sunporno.com', 1973 | u'google.kz', 1974 | u'gozooms.com', 1975 | u'ihg.com', 1976 | u'hamusoku.com', 1977 | u'telecomitalia.it', 1978 | u'puu.sh', 1979 | u'hotpepper.jp', 1980 | u'computerbild.de', 1981 | u'cnki.net', 1982 | u'creativemarket.com', 1983 | u'khabaronline.ir', 1984 | u'officedepot.com', 1985 | u'commonfloor.com', 1986 | u'flipboard.com', 1987 | u'yjc.ir', 1988 | u'strava.com', 1989 | u'ucla.edu', 1990 | u'spanishdict.com', 1991 | u'shopstyle.com', 1992 | u'docusign.net', 1993 | u'17173.com', 1994 | u'ouedkniss.com', 1995 | u'rakuten-card.co.jp', 1996 | u'indeed.co.in', 1997 | u'lonelyplanet.com', 1998 | u'bmi.ir', 1999 | u'oneplus.net', 2000 | u'nowvideo.sx', 2001 | u'stubhub.com', 2002 | u'olx.ro', 2003 | u'addmefast.com', 2004 | u'thrillist.com', 2005 | u'flippa.com', 2006 | u'onliner.by', 2007 | u'perezhilton.com', 2008 | u'trafficfactory.biz', 2009 | u'techtudo.com.br', 2010 | u'redbox.com', 2011 | u'fnac.com', 2012 | u'about.me', 2013 | u'barclays.co.uk', 2014 | u'n-mobile.net', 2015 | u'searchenginejournal.com', 2016 | u'upsocl.com', 2017 | u'pravda.com.ua', 2018 | u'fastpic.ru', 2019 | u'fullonlinefilmizle.com', 2020 | u'businessweekly.com.tw', 2021 | u'doisongphapluat.com', 2022 | u'lindaikeji.blogspot.com', 2023 | u'rojadirecta.me', 2024 | u'yixun.com', 2025 | u'clip.vn', 2026 | u'meteofrance.com', 2027 | u'amarujala.com', 2028 | u'celebritytune.com', 2029 | u'kijiji.it', 2030 | u'boc.cn', 2031 | u'msnbc.com', 2032 | u'theglobeandmail.com', 2033 | u'reallifecam.com', 2034 | u'python.org', 2035 | u'17track.net', 2036 | u'nexusmods.com', 2037 | u'britishairways.com', 2038 | u'zoopla.co.uk', 2039 | u'travelocity.com', 2040 | u'so.com', 2041 | u'ilsole24ore.com', 2042 | u'akhbarak.net', 2043 | u'ad131m.com', 2044 | u'apartmenttherapy.com', 2045 | u'rikunabi.com', 2046 | u'wikispaces.com', 2047 | u'samanyoluhaber.com', 2048 | u'eztv.ch', 2049 | u'eroprofile.com', 2050 | u'unam.mx', 2051 | u'unicredit.it', 2052 | u'tagesschau.de', 2053 | u'nnm-club.me', 2054 | u'lumosity.com', 2055 | u'trademe.co.nz', 2056 | u'bigcartel.com', 2057 | u'yr.no', 2058 | u'columbia.edu', 2059 | u'theonion.com', 2060 | u'sankei.com', 2061 | u'face-masr.com', 2062 | u'washington.edu', 2063 | u'fatakat.com', 2064 | u'moviepilot.com', 2065 | u'ohmyzip.com', 2066 | u'83nsdjqqo1cau183xz.com', 2067 | u'miui.com', 2068 | u'enha.kr', 2069 | u'stern.de', 2070 | u'final.ir', 2071 | u'dsrlte.com', 2072 | u'ashemaletube.com', 2073 | u'uiuc.edu', 2074 | u'wemakeprice.com', 2075 | u'jiameng.com', 2076 | u'protothema.gr', 2077 | u'blocket.se', 2078 | u'rozetka.com.ua', 2079 | u'jvzoo.com', 2080 | u'mxttrf.com', 2081 | u'17k.com', 2082 | u'4tube.com', 2083 | u'dorkly.com', 2084 | u'lifehack.org', 2085 | u'wolframalpha.com', 2086 | u'gravatar.com', 2087 | u'buzzhand.com', 2088 | u'shopbop.com', 2089 | u'okezone.com', 2090 | u'hotwire.com', 2091 | u'multitran.ru', 2092 | u'dnaindia.com', 2093 | u'goibibo.com', 2094 | u'news24.com', 2095 | u'cinemablend.com', 2096 | u'alimama.com', 2097 | u'portaleducacao.com.br', 2098 | u'caisse-epargne.fr', 2099 | u'chess.com', 2100 | u'sierratradingpost.com', 2101 | u'techtarget.com', 2102 | u'kmart.com', 2103 | u'censor.net.ua', 2104 | u'hotukdeals.com', 2105 | u'ieee.org', 2106 | u'motthegioi.vn', 2107 | u'iqoption.com', 2108 | u'cdc.gov', 2109 | u'timeout.com', 2110 | u'tut.by', 2111 | u'ssisurveys.com', 2112 | u'guokr.com', 2113 | u'health.com', 2114 | u'blog.ir', 2115 | u'junkmail.co.za', 2116 | u'bgr.com', 2117 | u'diigo.com', 2118 | u'bayt.com', 2119 | u'mensxp.com', 2120 | u'2ch.sc', 2121 | u'indeed.co.uk', 2122 | u'google.com.pr', 2123 | u'xvideo-jp.com', 2124 | u'livetv.sx', 2125 | u'pitchfork.com', 2126 | u'camdolls.com', 2127 | u'dealmoon.com', 2128 | u'etrade.com', 2129 | u'inquisitr.com', 2130 | u'boston.com', 2131 | u'evite.com', 2132 | u'egou.com', 2133 | u'netsuite.com', 2134 | u'searchengines.guru', 2135 | u'privatbank.ua', 2136 | u'edx.org', 2137 | u'emirates.com', 2138 | u'1111.com.tw', 2139 | u'socialmediaexaminer.com', 2140 | u'asp.net', 2141 | u'allmyvideos.net', 2142 | u'wildberries.ru', 2143 | u'sfglobe.com', 2144 | u'wufoo.com', 2145 | u'fanpage.it', 2146 | u'divar.ir', 2147 | u'sony.com', 2148 | u'payoneer.com' 2149 | ] 2150 | -------------------------------------------------------------------------------- /tlp/lib/regex_list.py: -------------------------------------------------------------------------------- 1 | __author__ = "{ ministry of promise }" 2 | __copyright__ = "Copyright 2015, { ministry of promise }" 3 | __license__ = "MIT" 4 | __version__ = "0.1.0" 5 | __maintainer__ = "Adam Nichols" 6 | __email__ = "adam.j.nichols@gmail.com" 7 | __status__ = "Development" 8 | 9 | import re 10 | 11 | regexs = { 12 | 13 | 'ip': re.compile(r'^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$', re.I), 14 | 'domain': re.compile(r'([a-zA-Z0-9\-]{2,}\.)+[a-zA-Z]{2,}', re.I), 15 | 'md5': re.compile(r'^[a-fA-F0-9]{32}$', re.I), 16 | 'sha1': re.compile(r'^[a-fA-F0-9]{40}$', re.I), 17 | 'sha256': re.compile(r'^[a-fA-F0-9]{64}$', re.I), 18 | 'cve': re.compile(r'cve.+?\d{4}.+?\d{4}.*', re.I) 19 | 20 | } 21 | -------------------------------------------------------------------------------- /tlp/tlp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | tlp is a python library that parses a body of text for indicators of compromise (iocs), 5 | leveraging the amazing textblob and nltk natural language processing modules to derive 6 | context and color around those iocs. 7 | ''' 8 | 9 | import nltk,re,operator,math,pprint 10 | import numpy as np 11 | from tlp_filter import TLPFilter 12 | from nltk.corpus import stopwords 13 | from nltk.util import ngrams 14 | from collections import Counter 15 | from textblob import TextBlob 16 | from lib.regex_list import regexs 17 | 18 | __author__ = "{ ministry of promise }" 19 | __copyright__ = "Copyright 2015, { ministry of promise }" 20 | __license__ = "MIT" 21 | __version__ = "0.1.0" 22 | __maintainer__ = "Adam Nichols" 23 | __email__ = "adam.j.nichols@gmail.com" 24 | __status__ = "Development" 25 | 26 | class TLP: 27 | 28 | def __init__(self, raw_text=None, text_title=None): 29 | 30 | try: 31 | # props for internal use 32 | self._raw_text = raw_text 33 | self._text_title = text_title 34 | 35 | # props to store data 36 | self._summary = str() 37 | self._keywords = set() 38 | self._iocs = dict() 39 | self._tlp = None 40 | self._debug = dict({'iocs': dict(), 'keywords': dict()}) 41 | 42 | if self._raw_text != None: 43 | if not type(self._raw_text) is unicode: 44 | self._raw_text = self._raw_text.decode('utf8') 45 | self._tlpfilter = TLPFilter() 46 | self._clean_text = self._tlpfilter.text(self._raw_text) 47 | self._blob = TextBlob(self._raw_text) 48 | self._clean_blob = TextBlob(self._clean_text) 49 | 50 | except Exception as e: 51 | import traceback 52 | traceback.print_exc() 53 | 54 | 55 | @property 56 | def iocs(self): 57 | '''returns a filtered list of iocs''' 58 | 59 | try: 60 | if len(self._iocs) > 0: 61 | return self._iocs 62 | 63 | # prime the dict 64 | self._iocs = dict((k, set()) for k in regexs) 65 | 66 | # parse iocs 67 | data = self._tlpfilter.iocs(self._raw_text, mode='pre') 68 | for w in data: 69 | for name,pattern in regexs.iteritems(): 70 | if pattern.match(w): 71 | self._iocs[name].add(w) 72 | self._iocs = self._tlpfilter.iocs(self._iocs, mode='post') 73 | for key in self._iocs: 74 | self._debug['iocs'][key] = len(self._iocs[key]) 75 | return self._iocs 76 | 77 | except Exception as e: 78 | raise e 79 | 80 | 81 | @property 82 | def text(self): 83 | '''returns the complete filtered text''' 84 | 85 | try: 86 | return " ".join([s.raw for s in self._clean_blob.sentences]) 87 | 88 | except Exception as e: 89 | raise e 90 | 91 | 92 | @property 93 | def debug(self): 94 | '''returns debug info - must run 'keywords' or 'iocs' to populate''' 95 | return self._debug 96 | 97 | 98 | @property 99 | def summary(self): 100 | '''returns document summary''' 101 | 102 | try: 103 | if len(self._summary) > 0: 104 | return self._summary 105 | 106 | sentences = self._clean_blob.sentences 107 | slen = len(sentences) 108 | sixth_pctl = int(math.floor(slen * .06)) 109 | if sixth_pctl < 8: 110 | summ_len = sixth_pctl if sixth_pctl > 2 else 2 111 | else: 112 | summ_len = 8 113 | 114 | return " ".join([s.raw for s in sentences[:summ_len]]) 115 | 116 | except Exception as e: 117 | raise e 118 | 119 | 120 | @property 121 | def color(self): 122 | '''returns tlp color (if present)''' 123 | 124 | try: 125 | bigrams = ngrams(self._raw_text.split(), 2) 126 | colors = set() 127 | for b in bigrams: 128 | (one, two) = b 129 | if re.search('(?:tlp|TLP)', one): 130 | colors.add(two.lower()) 131 | 132 | return colors 133 | 134 | except Exception as e: 135 | raise e 136 | 137 | 138 | @property 139 | def keywords(self): 140 | '''returns document keywords and occurance counts''' 141 | 142 | try: 143 | if len(self._keywords) > 0: 144 | return self._keywords 145 | 146 | #blob = TextBlob(self.summary) 147 | blob = TextBlob(self._clean_text) 148 | keywords = self._blob.words 149 | keywords = self._tlpfilter.keywords(keywords) 150 | keywords_counted = dict(Counter(keywords)) 151 | total_count = 0 152 | keywords_dict = dict() 153 | for word, count in keywords_counted.iteritems(): 154 | 155 | if len(word) == 0: 156 | continue 157 | 158 | # you're certainly not popular if you only occur once 159 | # if you are popular, and you're longer than 3 chars, you win 160 | 161 | total_count += count if count > 1 else 0 162 | pos_array = nltk.pos_tag(nltk.word_tokenize(word)) 163 | w,pos = pos_array[0] 164 | if re.search('.*[NN|NP]$', pos): 165 | if len(w) > 3: 166 | keywords_dict[word] = count 167 | 168 | keyword_scores = [v for (k,v) in keywords_dict.iteritems()] 169 | keywords_count = np.count_nonzero(keyword_scores) 170 | keywords_mean = np.mean(keyword_scores) 171 | keywords_std = np.std(keyword_scores) 172 | 173 | self._debug['keywords']['total'] = sum(keyword_scores) 174 | self._debug['keywords']['mean'] = keywords_mean 175 | self._debug['keywords']['std'] = keywords_std 176 | 177 | new_dict = dict([(k,v) for (k,v) in keywords_dict.iteritems() if v > (keywords_mean + (keywords_std * 4))]) 178 | self._keywords = sorted(new_dict.items(), key=operator.itemgetter(1), reverse = True) 179 | 180 | return self._keywords 181 | 182 | except Exception as e: 183 | raise e 184 | -------------------------------------------------------------------------------- /tlp/tlp_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ''' 4 | tlp is a python library that parses a body of text for indicators of compromise (iocs), 5 | leveraging the amazing textblob and nltk natural language processing modules to derive 6 | context and color around those iocs. 7 | ''' 8 | 9 | from textblob import TextBlob 10 | from textblob import Sentence 11 | from textblob import WordList 12 | from textblob import Word 13 | from nltk.corpus import stopwords as sw 14 | from collections import Counter 15 | from Levenshtein import distance 16 | from lib.filter_list import * 17 | from pkg_resources import resource_filename, Requirement 18 | import numpy as np 19 | import types,re,operator,codecs 20 | 21 | __author__ = "{ ministry of promise }" 22 | __copyright__ = "Copyright 2015, { ministry of promise }" 23 | __license__ = "MIT" 24 | __version__ = "0.1.0" 25 | __maintainer__ = "Adam Nichols" 26 | __email__ = "adam.j.nichols@gmail.com" 27 | __status__ = "Development" 28 | 29 | class TLPFilter: 30 | 31 | def __init__(self, user_filterlist=None): 32 | 33 | try: 34 | # initialize some junk 35 | self.user_filterlist = None 36 | self.ubl_file_obj = None 37 | self.global_filterlist = list() 38 | 39 | # check for filterlist, handle accordingly 40 | 41 | if user_filterlist is not None: 42 | 43 | self.user_filterlist = user_filterlist 44 | 45 | if type(self.user_filterlist) is list: 46 | pass 47 | elif type(self.user_filterlist) is str: 48 | self.ubl_file_obj = open(self.user_filterlist, "r") 49 | for line in self.ubl_file_obj.readlines(): 50 | if re.match('^#', line): 51 | continue 52 | self.global_filterlist.append(unicode(line.strip().lower())) 53 | #self.global_filterlist + keyword_filterlist 54 | else: 55 | raise ValueError("supplied blacklist is not of type or ") 56 | 57 | except Exception as e: 58 | raise e 59 | 60 | 61 | ''' 62 | utility functions 63 | 64 | misc. stuff used elsewhere: 65 | - nonalpha_pct: function to determine the ratio of non-alpha tokens in a sentence 66 | - nonalpha_thresh: analyzes each sentence in a blob and derives the threshold 67 | ratio of junk tokens per sentence. 68 | - moz_tlds: return a list of tlds from the mozilla project tld list 69 | maintained at https://publicsuffix.org/list/effective_tld_names.dat 70 | ''' 71 | 72 | def nonalpha_pct(self, sentence): 73 | 74 | try: 75 | # TODO - NASTY HACK 76 | if len(sentence) == 0: 77 | return 1 78 | tokens = sentence.tokens 79 | nonalpha_count = 0 80 | for token in tokens: 81 | if not re.search('[a-zA-Z0-9]+', token): 82 | nonalpha_count += 1 83 | 84 | return (float(nonalpha_count)/float(len(tokens))) * 100 85 | 86 | except Exception as e: 87 | raise e 88 | 89 | 90 | def nonalpha_thresh(self, blob): 91 | 92 | try: 93 | sentences = blob.sentences 94 | nonalpha_ratios = [] 95 | for s in sentences: 96 | nonalpha_ratios.append(self.nonalpha_pct(s)) 97 | 98 | return(np.median(nonalpha_ratios) + (np.std(nonalpha_ratios) * 3)) 99 | 100 | except Exception as e: 101 | raise e 102 | 103 | 104 | def moz_tlds(self): 105 | 106 | try: 107 | effective_tld_names = resource_filename(Requirement.parse('tlp'), 'tlp/lib/effective_tld_names.dat') 108 | f = codecs.open(effective_tld_names, 'r', 'utf-8') 109 | moz_tlds = f.readlines() 110 | moz_tlds = [item.strip() for item in moz_tlds if not (re.match('^//', item) or re.match('^$', item))] 111 | 112 | return moz_tlds 113 | 114 | except Exception as e: 115 | raise e 116 | 117 | ''' 118 | filter functions 119 | 120 | take the TextBlob primative types (sentences, words, tokens) and clean them through the removal or normalization of: 121 | - items without punctuation 122 | - items with titlized capitalization 123 | - "sentences" that contain newlines (how to seperate junk from poorly formatted content?) 124 | - repeated sentences 125 | - headers/footers/content section labels 126 | - items with a high punctuation, or "junk" ratio (ToCs, page numbers, etc) 127 | 128 | each function in this section should produce output for itself, as well as adding to the global blacklist of items 129 | that should be removed from the statistical analysis of text to produce summary and keywords. 130 | ''' 131 | 132 | def text(self, text): 133 | 134 | try: 135 | # check for text, transform to unicode if necessary 136 | if text is not None: 137 | if not (type(text) is unicode or type(text) is str): 138 | raise TypeError("supplied text object of type that is not str or unicode") 139 | else: 140 | if not type(text) is unicode: 141 | text = text.decode('utf8') 142 | 143 | # unicode? unicode. 144 | blob = TextBlob(text) 145 | blob_nonalpha_thresh = self.nonalpha_thresh(blob) 146 | else: 147 | raise ValueError("no input text supplied") 148 | 149 | 150 | # replace all instances of sentences broken by newline 151 | # to ensure that we're dealing with contiguous text 152 | 153 | s1_text = re.sub(r'([a-z\,]+)[\n\r]+?([^A-Z0-9]+?)', r'\1 \2', text) 154 | s1_list = list() 155 | 156 | # try to remove header-type section labels through the use of some convoluted 157 | # rule bs. really not elegant, but it works. 158 | 159 | stopwords = sw.words("english") 160 | for sentence in s1_text.split('\n'): 161 | # line begins or ends with a number - maybe ToC or heading 162 | if re.match('(?:^[0-9]+?|[0-9]+?[\n\r]+?$)', sentence): 163 | continue 164 | words = sentence.split() 165 | if len(words) <= 3: 166 | continue 167 | for word in words: 168 | # boring word 169 | if word.lower() in stopwords: 170 | continue 171 | # not a word - unicode bullets or other nonsense 172 | if not re.match(r'\w+?', word): 173 | continue 174 | # links 175 | if re.match(r'^[a-zA-Z]+\:\/\/', word): 176 | continue 177 | # no title case headings 178 | if not re.match('^[0-9A-Z]{1}', word): 179 | s1_list.append(sentence) 180 | break 181 | 182 | # let's clear out anything with a nonalpha token ratio higher than the threshold 183 | 184 | s2_list = [s for s in s1_list if self.nonalpha_pct(Sentence(s)) < (len(s)/4)] 185 | 186 | # now that we've got a semi-clean set of data, we can do some statistical analysis 187 | # to determine if we've got a lot of repeat data like headers/footers/copyrights 188 | # that can skew our keyword stats 189 | 190 | sentence_counts = Counter(s2_list) 191 | sc_series = [v for (k,v) in sentence_counts.iteritems()] 192 | sc_std = np.std(sc_series) 193 | sc_median = np.median(sc_series) 194 | 195 | # if we have repeating text, rebuilt it minus that noise, or anything 196 | # specified in the global blacklist 197 | 198 | if sc_median >= 1: 199 | final_list = [] 200 | 201 | # some edge cases force us to break outlier "sentences" into smaller units 202 | # for comparison later 203 | # 204 | # once the list is built, we have to check a few different ways to ensure 205 | # we are removing all the noise we can 206 | 207 | sentence_outliers = [k.strip().lower() for (k,v) in sentence_counts.iteritems() if v >= (sc_median + (sc_std * 2)) > 1] 208 | self.global_filterlist += sentence_outliers 209 | for s in s2_list: 210 | if s.lower() in self.global_filterlist: 211 | continue 212 | for o in sentence_outliers: 213 | if distance(o, s.lower()) < float(len(s) * .35): 214 | break 215 | elif o in s.lower(): 216 | break 217 | else: 218 | final_list.append(s) 219 | 220 | # text had no repeats or noise to filter (rare) 221 | else: 222 | final_list = s2_list 223 | 224 | # we out 225 | return " ".join(final_list) 226 | 227 | except Exception as e: 228 | raise e 229 | 230 | 231 | def keywords(self, keywords): 232 | 233 | try: 234 | if keywords is not None: 235 | if not (isinstance(keywords, list) or isinstance(keywords, WordList)): 236 | raise TypeError('supplied keyword object of type that is not list or TextBlob.WordList') 237 | else: 238 | if isinstance(keywords, list): 239 | keywords = [Word(word.lower()) for word in keywords] 240 | else: 241 | raise ValueError('no input keywords supplied') 242 | 243 | # normalize case 244 | words = [word.lower() for word in keywords] 245 | 246 | # remove all stopwords 247 | stopwords = sw.words("english") 248 | words = [word for word in words if word not in stopwords] 249 | #words = [word for word in keywords] 250 | nwords = [] 251 | for word in words: 252 | if word in keyword_filterlist: 253 | #if word.string in keyword_filterlist: 254 | continue 255 | for term in self.global_filterlist: 256 | #if word.string in term: 257 | if word in term: 258 | pass 259 | #break 260 | else: 261 | nwords.append(word) 262 | 263 | # remove plural, reduce to stems 264 | # textblob breaks possessives and other contractions into 265 | # two distinct words, but sometimes leaves a trailing unicode 266 | # apostrophe - if so, strip it 267 | 268 | words = [word.strip(u'\u2019') for word in nwords] 269 | 270 | return words 271 | 272 | except Exception as e: 273 | raise e 274 | 275 | 276 | def iocs(self, data, mode): 277 | 278 | try: 279 | if not (mode == 'pre' or mode == 'post'): 280 | raise ValueError('invalid mode specified') 281 | 282 | # pre-filter to clean raw text for optimal ioc parsing 283 | if mode == 'pre': 284 | 285 | if not data or type(data) is not unicode: 286 | raise ValueError('invalid data supplied') 287 | 288 | # stupid "object replacement character" -- essentially a utf space 289 | data = [re.sub(ur'\uFFFC+', ' ', w) for w in data] 290 | 291 | # other whitespace 292 | data = [re.sub(ur'[\s\t\n\r]', ' ', w) for w in data] 293 | 294 | # remove the neuter braces, if present 295 | data = [re.sub(ur'[\[\]]+', '', w) for w in data] 296 | 297 | data = "".join(data).split(' ') 298 | 299 | return data 300 | 301 | # post-filter to remove good sites, and other blacklisted iocs 302 | if mode == 'post': 303 | 304 | if not type(data) is dict or data is None: 305 | raise ValueError('invalid data supplied') 306 | 307 | # filter domain 308 | moz_tlds = self.moz_tlds() 309 | for ioc in data['domain'].copy(): 310 | filterlist = alexa_filterlist + ioc_filterlist['domain'] 311 | for domain in filterlist: 312 | re_match = ur'.*' + re.escape(domain) + '$' 313 | if re.match(re_match, ioc): 314 | data['domain'].remove(ioc) 315 | break 316 | 317 | for tld in moz_tlds: 318 | re_match = ur'.*\.?' + re.escape(tld) + '$' 319 | if re.match(re_match, ioc): 320 | break 321 | else: 322 | data['domain'].remove(ioc) 323 | 324 | # filter ip 325 | for ioc in data['ip'].copy(): 326 | filterlist = ioc_filterlist['ip'] 327 | if ioc in filterlist: 328 | data['ip'].remove(ioc) 329 | 330 | return data 331 | 332 | except Exception as e: 333 | raise e 334 | --------------------------------------------------------------------------------