├── stringlifier
│   ├── __init__.py
│   ├── data
│   ├── modules
│   │   ├── __init__.py
│   │   ├── training.py
│   │   ├── stringc2.py
│   │   └── stringc.py
│   └── api.py
├── data
│   ├── enhanced-c.conf
│   ├── enhanced-c.last
│   ├── enhanced-c.bestType
│   └── enhanced-c.encodings
├── requirements.txt
├── MANIFEST.in
├── .github
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── ISSUE_TEMPLATE.md
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── COPYRIGHT
├── corpus
│   └── README.md
├── scripts
│   ├── 02-02-split-generated-enhanced-data.py
│   ├── 01-01-generate-synthetic-training-data.py
│   ├── 02-split-generated-data.py
│   └── 01-02-generate-enhanced-synthetic-training-data.py
├── CONTRIBUTING.md
├── setup.py
├── .gitignore
├── CODE_OF_CONDUCT.md
├── README.md
└── LICENSE

--------------------------------------------------------------------------------
/stringlifier/__init__.py:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/stringlifier/data:
--------------------------------------------------------------------------------
1 | ../data

--------------------------------------------------------------------------------
/stringlifier/modules/__init__.py:
--------------------------------------------------------------------------------
1 | 

--------------------------------------------------------------------------------
/data/enhanced-c.conf:
--------------------------------------------------------------------------------
1 | {"char_emb_size": 100, "rnn_layers": 2, "rnn_size": 100, "hidden": 500}

--------------------------------------------------------------------------------
/data/enhanced-c.last:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/stringlifier/HEAD/data/enhanced-c.last

--------------------------------------------------------------------------------
/data/enhanced-c.bestType:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adobe/stringlifier/HEAD/data/enhanced-c.bestType

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ipdb>=0.13.4
2 | nptyping>=2.5.0
3 | numpy>=2.2.6
4 | PyJWT>=2.10.1
5 | setuptools>=80.7.1
6 | torch>=2.7.0
7 | tqdm>=4.67.1
8 | 

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include data/string-c.bestType
2 | include data/string-c.conf
3 | include data/string-c.encodings
4 | include data/enhanced-c.bestType
5 | include data/enhanced-c.conf
6 | include data/enhanced-c.encodings
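`MANIFEST.in` above, together with the `package_data` entry in `setup.py` (further down), is what lets the pretrained model artifacts in `data/` ship inside the built package. At runtime they are resolved relative to the installed package; a minimal sketch of that lookup, mirroring what `stringlifier/api.py` (shown below) does by default:

```python
import pkg_resources

# Resolve the bundled model artifacts relative to the installed package.
# These are the same three files that stringlifier/api.py loads by default.
enc_file = pkg_resources.resource_filename('stringlifier', 'data/enhanced-c.encodings')
conf_file = pkg_resources.resource_filename('stringlifier', 'data/enhanced-c.conf')
model_file = pkg_resources.resource_filename('stringlifier', 'data/enhanced-c.bestType')
```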
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 | Brief description of what this PR does, and why it is needed.
3 | 
4 | ### Demo
5 | Optional. Screenshots, `curl` examples, etc.
6 | 
7 | ### Notes
8 | Optional. Ancillary topics, caveats, alternative strategies that didn't work out, anything else.
9 | 
10 | ## Testing Instructions
11 | * How to test this PR
12 | * Prefer bulleted description
13 | * Start after checking out this branch
14 | * Include any setup required, such as bundling scripts, restarting services, etc.
15 | * Include test cases and expected output
16 | 

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Summary of the issue.
2 | 
3 | ## Expected Result
4 | 
5 | What you expected.
6 | 
7 | ## Actual Result
8 | 
9 | What happened instead.
10 | 
11 | ## Reproduction Steps
12 | 
13 | Please provide the list of commands that will help others reproduce the issue you're experiencing.
14 | 
15 | ## System Information
16 | 
17 | Please provide some basic information about your system:
18 | * GPU support (enabled/disabled)
19 | * Hyperparameters used
20 | * Training set used (embeddings, language, ud-version)
21 | * PyTorch version
22 | * Python version
23 | * Operating system

--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | 
5 | ---
6 | 
7 | **Is your feature request related to a problem? Please describe.**
8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
9 | 
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 | 
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 | 
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 | 

--------------------------------------------------------------------------------
/data/enhanced-c.encodings:
--------------------------------------------------------------------------------
1 | {"char2int": {"<PAD>": 0, "<UNK>": 1, "{": 2, "+": 3, "c": 4, "r": 5, "e": 6, "a": 7, "m": 8, "i": 9, "l": 10, "y": 11, "}": 12, " ": 13, "$": 14, "5": 15, "f": 16, "9": 17, "1": 18, "3": 19, "8": 20, "2": 21, "-": 22, "7": 23, "0": 24, "4": 25, "d": 26, "6": 27, "b": 28, "x": 29, "t": 30, "w": 31, "u": 32, "v": 33, "n": 34, "h": 35, "o": 36, "%": 37, "q": 38, "<": 39, "s": 40, "g": 41, "/": 42, "p": 43, "#": 44, "j": 45, "k": 46, "z": 47, ".": 48, "_": 49, ":": 50, "*": 51, "=": 52, ",": 53, "&": 54, "'": 55, "?": 56, "\"": 57, ">": 58, "!": 59, "(": 60, ")": 61, "\\": 62, "[": 63, "]": 64, "|": 65, "`": 66, "~": 67, ";": 68, "@": 69}, "label2int": {"<PAD>": 0, "C": 1, "U": 2, "H": 3, "J": 4, "N": 5, "I": 6}}

--------------------------------------------------------------------------------
/COPYRIGHT:
--------------------------------------------------------------------------------
1 | The following copyright message should appear at the top of all
2 | source files. This file can be removed from your repository.
3 | 
4 | Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
5 | 
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 | 
10 | http://www.apache.org/licenses/LICENSE-2.0
11 | 
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | 
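Both `data/enhanced-c.conf` and `data/enhanced-c.encodings` shown above are plain JSON, so the model's hyperparameters and its per-character label inventory can be inspected without loading the network. A minimal sketch, assuming the repository root as working directory:

```python
import json

# Load the bundled configuration and encodings shipped in data/.
with open('data/enhanced-c.conf') as f:
    conf = json.load(f)
with open('data/enhanced-c.encodings') as f:
    enc = json.load(f)

print(conf)                      # {'char_emb_size': 100, 'rnn_layers': 2, 'rnn_size': 100, 'hidden': 500}
print(sorted(enc['label2int']))  # ['<PAD>', 'C', 'H', 'I', 'J', 'N', 'U']
print(len(enc['char2int']))      # 70 characters in the vocabulary
```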
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | 
5 | ---
6 | 
7 | **Describe the bug**
8 | A clear and concise description of what the bug is.
9 | 
10 | **To Reproduce**
11 | Steps to reproduce the behavior:
12 | 1. Go to '...'
13 | 2. Click on '....'
14 | 3. Scroll down to '....'
15 | 4. See error
16 | 
17 | **Expected behavior**
18 | A clear and concise description of what you expected to happen.
19 | 
20 | **Screenshots**
21 | If applicable, add screenshots to help explain your problem.
22 | 
23 | **Desktop (please complete the following information):**
24 | - OS: [e.g. iOS]
25 | - Browser [e.g. chrome, safari]
26 | - Version [e.g. 22]
27 | 
28 | **Additional context**
29 | Add any other context about the problem here.
30 | 

--------------------------------------------------------------------------------
/corpus/README.md:
--------------------------------------------------------------------------------
1 | # Standard training data
2 | 
3 | The training data was generated by running `scripts/01-01-generate-synthetic-training-data.py` and `scripts/02-split-generated-data.py` on a list of common English words, available [here](https://raw.githubusercontent.com/dwyl/english-words/master/words_alpha.txt).
4 | 
5 | # Generating your own training data
6 | 
7 | If you want to generate your own dataset, you simply need to create a training file and a validation file. Both follow a simple tab-separated format:
8 | 
9 | ```text
10 | <string><TAB><type><TAB><subtype>
11 | ```
12 | 
13 | **Example**
14 | 
15 | ```text
16 | nginx STRING PROGRAM
17 | Y29tbWl4dHVyZQ== HASH PASSWORD
18 | b3d2cf2ec3894374b37d1b79edd57ad4 HASH API_KEY
19 | 9c795829-75bc-4596-87d3-3508372bbf5f HASH API_KEY
20 | licenser STRING WORD
21 | ```
22 | 
23 | **NOTE:** There are no predefined values for `type` and `subtype`.

--------------------------------------------------------------------------------
/scripts/02-02-split-generated-enhanced-data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | lines = open('corpus/generated-enhanced').readlines()
18 | f_train = open('corpus/enhanced-train', 'w')
19 | f_dev = open('corpus/enhanced-dev', 'w')
20 | 
21 | for ii in range(len(lines) // 2):
22 |     word = lines[ii * 2]
23 |     mask = lines[ii * 2 + 1]
24 |     f = f_train
25 |     if ii % 10 == 5:
26 |         f = f_dev
27 |     f.write(word + mask)
28 | 
29 | f_train.close()
30 | f_dev.close()
31 | 
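The split script above reads and writes records as consecutive line pairs: the raw string on one line and its per-character mask on the next. For reference, a minimal sketch of reading such a file back into `(string, mask)` pairs — the same convention that `_load_dataset` in `stringlifier/modules/stringc2.py` (further down) relies on; the `load_pairs` helper name is hypothetical:

```python
def load_pairs(path):
    # Records are consecutive line pairs: the raw string, then its mask.
    lines = open(path, encoding='utf-8').readlines()
    pairs = []
    for ii in range(len(lines) // 2):
        string = lines[ii * 2].rstrip('\n')
        mask = lines[ii * 2 + 1].rstrip('\n')
        pairs.append((string, mask))
    return pairs

pairs = load_pairs('corpus/enhanced-train')
```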
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | Thanks for choosing to contribute!
4 | 
5 | The following are a set of guidelines to follow when contributing to this project.
6 | 
7 | ## Code Of Conduct
8 | 
9 | This project adheres to the Adobe [code of conduct](CODE_OF_CONDUCT.md). By participating, you are expected to uphold this code. Please report unacceptable behavior to Grp-opensourceoffice@adobe.com.
10 | 
11 | ## Contributor License Agreement
12 | 
13 | All third-party contributions to this project must be accompanied by a signed contributor license agreement. This gives Adobe permission to redistribute your contributions as part of the project. [Sign our CLA](http://opensource.adobe.com/cla.html). You only need to submit an Adobe CLA one time, so if you have submitted one previously, you are good to go!
14 | 
15 | ## Code Reviews
16 | 
17 | All submissions should come in the form of pull requests and need to be reviewed by project committers. Read [GitHub's pull request documentation](https://help.github.com/articles/about-pull-requests/) for more information on sending pull requests.
18 | 
19 | Lastly, please follow the [pull request template](.github/PULL_REQUEST_TEMPLATE.md) when submitting a pull request!
20 | 

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 | 
3 | 
4 | def parse_requirements(filename, session=None):
5 |     """ load requirements from a pip requirements file """
6 |     lineiter = (line.strip() for line in open(filename))
7 |     return [line for line in lineiter if line and not line.startswith("#")]
8 | 
9 | 
10 | with open("README.md", "r") as fh:
11 |     long_description = fh.read()
12 | 
13 | setuptools.setup(
14 |     name="stringlifier",
15 |     version="0.1.1.4",
16 |     author="Multiple authors",
17 |     author_email="tiberiu44@gmail.com",
18 |     description="Python module for detecting passwords, API keys, hashes and any other strings that resemble randomly generated character sequences.",
19 |     long_description=long_description,
20 |     long_description_content_type="text/markdown",
21 |     url="https://github.com/adobe/stringlifier",
22 |     packages=setuptools.find_packages(),
23 |     install_requires=parse_requirements('requirements.txt', session=False),
24 |     classifiers=(
25 |         "Programming Language :: Python :: 3.0",
26 |         "License :: OSI Approved :: Apache Software License",
27 |         "Operating System :: OS Independent",
28 |     ),
29 |     include_package_data=True,
30 |     package_data={
31 |         '': ['data/string-c.encodings', 'data/string-c.conf', 'data/string-c.bestType', 'data/enhanced-c.encodings',
32 |              'data/enhanced-c.conf', 'data/enhanced-c.bestType']
33 | 
34 |     },
35 |     # data_files=['data/string-c.encodings', 'data/string-c.conf', 'data/string-c.bestType'],
36 |     zip_safe=False
37 | )

--------------------------------------------------------------------------------
/scripts/01-01-generate-synthetic-training-data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | known_words = [] 18 | 19 | 20 | def generate_words(count, known_words): 21 | import uuid 22 | import datetime 23 | import base64 24 | generated = [] 25 | for ii in range(count): 26 | if ii % 4 == 0: 27 | generated.append(str(uuid.uuid4())) 28 | elif ii % 4 == 1: 29 | generated.append(str(uuid.uuid4().hex)) 30 | elif ii % 4 == 2: 31 | generated.append(str(datetime.datetime.now().timestamp())) 32 | elif ii % 4 == 3: 33 | message = known_words[ii] 34 | message_bytes = message.encode('ascii') 35 | base64_bytes = base64.b64encode(message_bytes) 36 | base64_message = base64_bytes.decode('ascii') 37 | generated.append(base64_message) 38 | return generated 39 | 40 | 41 | lines = open('corpus/words_alpha.txt').readlines() 42 | for line in lines: 43 | known_words.append(line.strip()) 44 | 45 | generated_words = generate_words(len(known_words), known_words) 46 | 47 | f = open('corpus/generated', 'w') 48 | for ii in range(len(known_words)): 49 | f.write(known_words[ii] + '\tSTRING\n') 50 | f.write(generated_words[ii] + '\tHASH\n') 51 | f.close() 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.conf 4 | *.py[cod] 5 | *.csv 6 | *$py.class 7 | *.idea 8 | # C extensions 9 | *.so 10 | *.json 11 | *.csv 12 | *.zip 13 | *.counts 14 | pan* 15 | corpus/ 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | .DS_Store 34 | data/ 35 | 36 | MANIFEST 37 | 38 | # PyInstaller 39 | # Usually these files are written by a python script from a template 40 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
41 | *.manifest 42 | *.spec 43 | 44 | # Installer logs 45 | pip-log.txt 46 | pip-delete-this-directory.txt 47 | 48 | # Unit test / coverage reports 49 | htmlcov/ 50 | .tox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # celery beat schedule file 89 | celerybeat-schedule 90 | 91 | # SageMath parsed files 92 | *.sage.py 93 | 94 | # Environments 95 | .env 96 | .venv 97 | env/ 98 | venv/ 99 | ENV/ 100 | env.bak/ 101 | venv.bak/ 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | components/nod-c/etc/* 116 | -------------------------------------------------------------------------------- /scripts/02-split-generated-data.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | 18 | string_list = [] 19 | hash_list = [] 20 | 21 | lines = open('corpus/generated').readlines() 22 | 23 | for line in lines: 24 | parts = line.strip().split('\t') 25 | if parts[1] == 'STRING': 26 | string_list.append(parts[0]) 27 | else: 28 | hash_list.append(parts[0]) 29 | 30 | train_data = [ 31 | ('usr', 'STRING'), 32 | ('var', 'STRING'), 33 | ('lib', 'STRING'), 34 | ('etc', 'STRING'), 35 | ('tmp', 'STRING'), 36 | ('dev', 'STRING'), 37 | ('libexec', 'STRING'), 38 | ('lib32', 'STRING'), 39 | ('lib64', 'STRING'), 40 | ('bin', 'STRING') 41 | ] 42 | dev_data = [] 43 | 44 | 45 | def add_data(train, dev, list, label): 46 | for ii in range(len(list)): 47 | if ii % 10 == 0: 48 | dev.append((list[ii], label)) 49 | else: 50 | train.append((list[ii], label)) 51 | 52 | 53 | add_data(train_data, dev_data, string_list, "STRING") 54 | add_data(train_data, dev_data, hash_list, "HASH") 55 | 56 | import random 57 | 58 | random.shuffle(train_data) 59 | random.shuffle(dev_data) 60 | 61 | f_train = open('corpus/string-train', 'w') 62 | f_dev = open('corpus/string-dev', 'w') 63 | 64 | for ii in range(len(train_data)): 65 | if train_data[ii][1] == 'HASH': 66 | stype = 'HASH' 67 | else: 68 | stype = 'WORD' 69 | f_train.write(train_data[ii][0] + '\t' + train_data[ii][1] + '\t' + stype + '\n') 70 | for ii in range(len(dev_data)): 71 | if dev_data[ii][1] == 'HASH': 72 | stype = 'HASH' 73 | else: 74 | stype = 'WORD' 75 | f_dev.write(dev_data[ii][0] + '\t' + dev_data[ii][1] + '\t' + stype + '\n') 76 | 77 | f_train.close() 78 | f_dev.close() 79 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Adobe Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at Grp-opensourceoffice@adobe.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at [http://contributor-covenant.org/version/1/4][version]
72 | 
73 | [homepage]: http://contributor-covenant.org
74 | [version]: http://contributor-covenant.org/version/1/4/
75 | 

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | [![Downloads](https://pepy.tech/badge/stringlifier)](https://pepy.tech/project/stringlifier) [![Downloads](https://pepy.tech/badge/stringlifier/month)](https://pepy.tech/project/stringlifier/month) ![Weekly](https://img.shields.io/pypi/dw/stringlifier.svg) ![daily](https://img.shields.io/pypi/dd/stringlifier.svg)
2 | ![Version](https://badge.fury.io/py/stringlifier.svg) [![Python 3](https://img.shields.io/badge/python-3-blue.svg)](https://www.python.org/downloads/release/python-360/) [![GitHub stars](https://img.shields.io/github/stars/adobe/stringlifier.svg?style=social&label=Star&maxAge=2592000)](https://github.com/adobe/stringlifier/stargazers/)
3 | 
4 | # stringlifier
5 | String-classifier is a Python module for detecting random strings and hashes in text and code.
6 | 
7 | Typical usage scenarios include:
8 | 
9 | * Sanitizing application or security logs
10 | * Detecting accidentally exposed credentials (complex passwords or API keys)
11 | 
12 | # Interactive notebook
13 | 
14 | You can see Stringlifier in action by checking out this [interactive notebook hosted on Colaboratory](https://colab.research.google.com/drive/1bgZQSKhVAYU4r46wqb0v8Sfvuo_yMOLA?usp=sharing).
15 | 
16 | # Quick start guide
17 | 
18 | You can quickly install stringlifier via pip:
19 | ```bash
20 | $ pip install stringlifier
21 | ```
22 | If you are using the pip3 installation that comes with Python 3, use pip3 instead of pip:
23 | ```bash
24 | $ pip3 install stringlifier
25 | ```
26 | 
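Stringlifier also accepts a Python list of strings in one call, and the `cutoff` keyword (default 5, see `stringlifier/api.py` further down) sets the minimum length a detected token must exceed before it is replaced. A minimal sketch, assuming the package is installed:

```python
from stringlifier.api import Stringlifier

stringlifier = Stringlifier()

# A batch of lines is processed in a single call; one sanitized string per input.
sanitized = stringlifier([
    'token=b3d2cf2ec3894374b37d1b79edd57ad4',
    'uuid=9c795829-75bc-4596-87d3-3508372bbf5f',
])

# Raising the cutoff leaves shorter random-looking tokens untouched.
sanitized = stringlifier(['token=b3d2cf2ec3894374b37d1b79edd57ad4'], cutoff=8)
```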
27 | API example:
28 | ```python
29 | from stringlifier.api import Stringlifier
30 | 
31 | stringlifier=Stringlifier()
32 | 
33 | s = stringlifier("com.docker.hyperkit -A -u -F vms/0/hyperkit.pid -c 8 -m 8192M -b 127.0.0.1 --pass=\"NlcXVpYWRvcg\" -s 0:0,hostbridge -s 31,lpc -s 1:0,virtio-vpnkit,path=vpnkit.eth.sock,uuid=45172425-08d1-41ec-9d13-437481803412 -U c6fb5010-a83e-4f74-9a5a-50d9086b9")
34 | ```
35 | 
36 | After this, `s` should be:
37 | 
38 | ```'com.docker.hyperkit -A -u -F vms/0/hyperkit.pid -c 8 -m 8192M -b <IP_ADDR> --pass="<RANDOM_STRING>" -s 0:0,hostbridge -s 31,lpc -s 1:0,virtio-vpnkit,path=vpnkit.eth.sock,uuid=<UUID> -U <UUID>'```
39 | 
40 | You can also choose to see the full tokenization and classification output:
41 | 
42 | ```python
43 | s, tokens = stringlifier("com.docker.hyperkit -A -u -F vms/0/hyperkit.pid -c 8 -m 8192M -b 127.0.0.1 --pass=\"NlcXVpYWRvcg\" -s 0:0,hostbridge -s 31,lpc -s 1:0,virtio-vpnkit,path=vpnkit.eth.sock,uuid=45172425-08d1-41ec-9d13-437481803412 -U c6fb5010-a83e-4f74-9a5a-50d9086b9", return_tokens=True)
44 | ```
45 | 
46 | `s` will be the same as before and `tokens` will contain the following data:
47 | ```python
48 | [[('0', 33, 34, '<NUMERIC>'),
49 |   ('8', 51, 52, '<NUMERIC>'),
50 |   ('8192', 56, 60, '<NUMERIC>'),
51 |   ('127.0.0.1', 65, 74, '<IP_ADDR>'),
52 |   ('NlcXVpYWRvcg', 83, 95, '<RANDOM_STRING>'),
53 |   ('0', 100, 101, '<NUMERIC>'),
54 |   ('0', 102, 103, '<NUMERIC>'),
55 |   ('31', 118, 120, '<NUMERIC>'),
56 |   ('1', 128, 129, '<NUMERIC>'),
57 |   ('0', 130, 131, '<NUMERIC>'),
58 |   ('45172425-08d1-41ec-9d13-437481803412', 172, 208, '<UUID>'),
59 |   ('c6fb5010-a83e-4f74-9a5a-50d9086b9', 212, 244, '<UUID>')]]
60 | ```
61 | 
62 | 
63 | 
64 | # Building your own classifier
65 | 
66 | You can also train your own model if you want to detect different types of strings. For this you can use the command-line interface for the string classifier:
67 | 
68 | ```bash
69 | $ python3 stringlifier/modules/stringc.py --help
70 | 
71 | Usage: stringc.py [options]
72 | 
73 | Options:
74 |   -h, --help            show this help message and exit
75 |   --interactive
76 |   --train
77 |   --resume
78 |   --train-file=TRAIN_FILE
79 |   --dev-file=DEV_FILE
80 |   --store=OUTPUT_BASE
81 |   --patience=PATIENCE   (default=20)
82 |   --batch-size=BATCH_SIZE
83 |                         (default=32)
84 |   --device=DEVICE
85 | ```
86 | 
87 | For instructions on how to generate your training data, use [this link](corpus/README.md).
88 | 
89 | **Important note:** This model may not scale to cases where detecting the type of a string depends on the surrounding tokens. If that is what you need, look at a more advanced sequence-processing tool such as [NLP-Cube](https://github.com/adobe/NLP-Cube).
90 | 
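Because `return_tokens=True` yields `(text, start, end, type)` tuples with character offsets into the original string, the default placeholder substitution can be swapped for custom redaction. A minimal sketch (the `redact` helper is hypothetical, not part of the package):

```python
from stringlifier.api import Stringlifier

def redact(line, tokens):
    # Walk the tokens right-to-left so earlier offsets stay valid while slicing.
    for text, start, end, ttype in sorted(tokens, key=lambda t: t[1], reverse=True):
        line = line[:start] + '*' * (end - start) + line[end:]
    return line

stringlifier = Stringlifier()
cmd = 'login --pass="NlcXVpYWRvcg" --user=admin'
_, token_lists = stringlifier([cmd], return_tokens=True)
print(redact(cmd, token_lists[0]))  # e.g. login --pass="************" --user=admin
```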
--------------------------------------------------------------------------------
/scripts/01-02-generate-enhanced-synthetic-training-data.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | known_words = []
18 | 
19 | 
20 | def generate_words(count, known_words):
21 |     import uuid
22 |     import datetime
23 |     import base64
24 |     generated = []
25 |     for ii in range(count):
26 |         if ii % 4 == 0:
27 |             generated.append(str(uuid.uuid4()))
28 |         elif ii % 4 == 1:
29 |             generated.append(str(uuid.uuid4().hex))
30 |         elif ii % 4 == 2:
31 |             generated.append(str(datetime.datetime.now().timestamp()))
32 |         elif ii % 4 == 3:
33 |             message = known_words[ii]
34 |             message_bytes = message.encode('ascii')
35 |             base64_bytes = base64.b64encode(message_bytes)
36 |             base64_message = base64_bytes.decode('ascii')
37 |             generated.append(base64_message)
38 |     return generated
39 | 
40 | 
41 | lines = open('corpus/words_alpha.txt').readlines()
42 | for line in lines:
43 |     known_words.append(line.strip())
44 | 
45 | generated_words = generate_words(len(known_words), known_words)
46 | 
47 | f = open('corpus/generated-enhanced', 'w')
48 | 
49 | total_clis = (len(generated_words) + len(known_words))
50 | 
51 | known_index = 0
52 | gen_index = 0
53 | 
54 | import random
55 | 
56 | 
57 | def _get_next_known():
58 |     global known_index
59 |     s = known_words[known_index]
60 |     known_index += 1
61 |     if known_index == len(known_words):
62 |         known_index = 0
63 |     return s
64 | 
65 | 
66 | def _get_next_gen():
67 |     global gen_index
68 |     s = generated_words[gen_index]
69 |     gen_index += 1
70 |     if gen_index == len(generated_words):
71 |         gen_index = 0
72 |     return s
73 | 
74 | 
75 | import random
76 | 
77 | 
78 | def _generate_next_cmd():
79 |     delimiters = ' /.,?!~|<>-=_~:;\\+-&*%$#@!'
80 |     enclosers = '[]{}``""\'\'()'
81 |     mask = ''
82 |     cmd = ''
83 |     num_words = random.randint(3, 15)
84 |     use_space = False
85 |     use_delimiter = False
86 |     use_encloser = False
87 |     append_number = False
88 |     for ii in range(num_words):
89 | 
90 |         use_delimiter = random.random() > 0.5
91 |         use_encloser = random.random() > 0.8
92 |         use_gen_word = random.random() > 0.7
93 |         case_style = random.randint(0, 2)
94 | 
95 | 
96 |         del_index = random.randint(0, len(delimiters) - 1)
97 |         enc_index = random.randint(0, len(enclosers) // 2 - 1) * 2
98 |         if use_space:
99 |             mask += 'C'
100 |             cmd += ' '
101 |         if use_gen_word:
102 |             wrd = _get_next_gen()
103 |             if case_style == 1:
104 |                 wrd = wrd[0].upper() + wrd[1:]
105 |             elif case_style == 2:
106 |                 wrd = wrd.upper()
107 |             msk = ''
108 |             for _ in range(len(wrd)):
109 |                 msk += 'H'
110 |         else:
111 |             wrd = _get_next_known()
112 |             append_number = random.random() > 0.97
113 |             if append_number:
114 |                 wrd = wrd + str(random.randint(0, 9999))
115 |             if case_style == 1:
116 |                 wrd = wrd[0].upper() + wrd[1:]
117 |             elif case_style == 2:
118 |                 wrd = wrd.upper()
119 |             msk = ''
120 |             for _ in range(len(wrd)):
121 |                 msk += 'C'
122 | 
123 |         if use_delimiter:
124 |             wrd = delimiters[del_index] + wrd
125 |             msk = 'C' + msk
126 |         if use_encloser:
127 |             wrd = enclosers[enc_index] + wrd + enclosers[enc_index + 1]
128 |             msk = 'C' + msk + 'C'
129 | 
130 |         cmd += wrd
131 |         mask += msk
132 |         use_space = random.random() > 0.7
133 | 
134 |     return cmd, mask
135 | 
136 | 
137 | for ii in range(total_clis):
138 |     command, mask = _generate_next_cmd()
139 |     f.write(command + '\n' + mask + '\n')
140 | 
141 | f.close()
142 | 
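To make the mask convention above concrete: each record written to `corpus/generated-enhanced` is a synthetic command plus a mask of exactly the same length, where `C` marks common text (dictionary words, delimiters, spaces) and `H` marks characters of a generated token. A small hand-worked illustration (hypothetical values, since the generator is random):

```python
import base64

word = 'make'                                    # dictionary word -> labelled 'C'
gen = base64.b64encode(b'foo').decode('ascii')   # 'Zm9v', generated token -> labelled 'H'

cmd = word + ' ' + gen                           # 'make Zm9v'
mask = 'C' * len(word) + 'C' + 'H' * len(gen)    # 'CCCCCHHHH' (the space is also 'C')
assert len(cmd) == len(mask)
```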
--------------------------------------------------------------------------------
/stringlifier/modules/training.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | known_words = []
18 | 
19 | 
20 | def _generate_word(known_words):
21 |     import uuid
22 |     import datetime
23 |     import base64
24 |     generated = None
25 |     ii = random.randint(0, 5)
26 |     mask = 'H'
27 |     if ii == 0:
28 |         generated = str(uuid.uuid4())
29 |         mask = 'U'
30 |     elif ii == 1:
31 |         generated = str(uuid.uuid4().hex)
32 |         mask = 'H'
33 |     elif ii == 2:
34 |         c = random.randint(0, 3)
35 |         if c == 0:
36 |             generated = str(datetime.datetime.now().timestamp())
37 |         elif c == 1:
38 |             generated = str(random.randint(0, 100000000000))
39 |         elif c == 2:
40 |             generated = str(random.randint(0, 999)) + '.' + str(random.randint(0, 999))
41 |         else:
42 |             generated = str(random.randint(0, 999)) + '.' + str(random.randint(0, 9999)) + '.' + str(
43 |                 random.randint(0, 9999))
44 |         mask = 'N'
45 |     elif ii == 3:
46 |         import string
47 |         N = random.randint(5, 20)
48 |         message = [random.choice(string.ascii_uppercase +
49 |                                  string.digits +
50 |                                  string.ascii_lowercase) for _ in range(N)]
51 |         message = ''.join(message)
52 |         i = random.randint(0, 2)
53 |         if i == 0:
54 |             message = message.lower()
55 |         elif i == 1:
56 |             message = message.upper()
57 |         generated = message
58 |     elif ii == 4:
59 |         toks = []
60 |         for _ in range(4):
61 |             toks.append(str(random.randint(0, 255)))
62 |         generated = '.'.join(toks)
63 |         mask = 'I'
64 |     elif ii == 5:
65 |         generated = _generate_JWT_token(known_words)
66 |         mask = 'J'
67 |     return str(generated), mask[0]
68 | 
69 | 
70 | lines = open('corpus/words_alpha.txt').readlines()
71 | for line in lines:
72 |     known_words.append(line.strip())
73 | 
74 | 
75 | def _generate_JWT_token(known_words):
76 |     import jwt
77 | 
78 |     payload = {"id": str(random.random()), "client_id": str(random.random()), "user_id": str(random.random()),
79 |                "type": "access_token",
80 |                "expires_in": str(random.randint(10, 3600000)), "scope": "read, write",
81 |                "created_at": str(random.randint(1900000, 9000000))}
82 |     encoded_jwt = jwt.encode(payload, 'secret', algorithm='HS256')
83 |     # PyJWT >= 2.0 returns a str; 1.x returned bytes, whose repr needed the b'...' wrapper stripped
84 |     return encoded_jwt if isinstance(encoded_jwt, str) else encoded_jwt.decode('ascii')
85 | 
86 | 
87 | # generated_words = generate_words(len(known_words), known_words)
88 | 
89 | known_index = 0
90 | 
91 | import random
92 | 
93 | random.shuffle(known_words)
94 | 
95 | 
96 | def _get_next_known():
97 |     global known_index
98 |     s = known_words[known_index]
99 |     known_index += 1
100 |     if known_index == len(known_words):
101 |         known_index = 0
102 |         random.shuffle(known_words)
103 |     return s
104 | 
105 | 
106 | def _get_next_gen():
107 |     global known_words
108 |     s, m = _generate_word(known_words)
109 |     return s, m
110 | 
111 | 
112 | import random
113 | 
114 | 
115 | def generate_next_cmd():
116 |     delimiters = ' /.,?!~|<>-=_~:;\\+-&*%$#@!'
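    # Each loop iteration below appends one word to `cmd` and a same-length run
    # of labels to `mask`: 'C' for dictionary words, delimiters and enclosers,
    # or the label returned by _get_next_gen() ('H', 'U', 'N', 'I' or 'J') for
    # generated tokens, so len(cmd) == len(mask) holds for every record.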
117 | enclosers = '[]{}``""\'\'()' 118 | mask = '' 119 | cmd = '' 120 | num_words = random.randint(3, 15) 121 | use_space = False 122 | use_delimiter = False 123 | use_encloser = False 124 | append_number = False 125 | for ii in range(num_words): 126 | 127 | use_delimiter = random.random() > 0.5 128 | use_encloser = random.random() > 0.8 129 | case_style = random.randint(0, 2) 130 | use_gen_word = random.random() > 0.7 131 | 132 | del_index = random.randint(0, len(delimiters) - 1) 133 | enc_index = random.randint(0, len(enclosers) // 2 - 1) * 2 134 | if use_space: 135 | mask += 'C' 136 | cmd += ' ' 137 | if use_gen_word: 138 | wrd, label = _get_next_gen() 139 | if case_style == 1: 140 | wrd = wrd[0].upper() + wrd[1:] 141 | elif case_style == 2: 142 | wrd = wrd.upper() 143 | msk = '' 144 | for _ in range(len(wrd)): 145 | msk += label 146 | else: 147 | wrd = _get_next_known() 148 | append_number = random.random() > 0.97 149 | if append_number: 150 | wrd = wrd + str(random.randint(0, 99)) 151 | if case_style == 1: 152 | wrd = wrd[0].upper() + wrd[1:] 153 | elif case_style == 2: 154 | wrd = wrd.upper() 155 | msk = '' 156 | for _ in range(len(wrd)): 157 | msk += 'C' 158 | 159 | if use_delimiter: 160 | wrd = delimiters[del_index] + wrd 161 | msk = 'C' + msk 162 | if use_encloser: 163 | wrd = enclosers[enc_index] + wrd + enclosers[enc_index + 1] 164 | msk = 'C' + msk + 'C' 165 | 166 | cmd += wrd 167 | mask += msk 168 | use_space = random.random() > 0.7 169 | 170 | return cmd, mask 171 | -------------------------------------------------------------------------------- /stringlifier/api.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # 16 | 17 | from nptyping import NDArray, Int64 18 | from stringlifier.modules.stringc import AwDoC, AwDoCConfig, Encodings 19 | from stringlifier.modules.stringc2 import CTagger, CTaggerConfig 20 | from stringlifier.modules.stringc2 import Encodings as CEncodings 21 | import torch 22 | from typing import List, Optional, Tuple, Union 23 | import pkg_resources 24 | 25 | 26 | class Stringlifier: 27 | def __init__(self, model_base: Optional[str] = None): 28 | encodings = CEncodings() 29 | if model_base is None: 30 | enc_file = pkg_resources.resource_filename(__name__, 'data/enhanced-c.encodings') 31 | conf_file = pkg_resources.resource_filename(__name__, 'data/enhanced-c.conf') 32 | model_file = pkg_resources.resource_filename(__name__, 'data/enhanced-c.bestType') 33 | else: 34 | enc_file = '{0}.encodings'.format(model_base) 35 | conf_file = '{0}.conf'.format(model_base) 36 | model_file = '{0}.bestType'.format(model_base) 37 | encodings.load(enc_file) 38 | config = CTaggerConfig() 39 | config.load(conf_file) 40 | self.classifier = CTagger(config, encodings) 41 | self.classifier.load(model_file) 42 | self.classifier.eval() 43 | self.encodings = encodings 44 | self._c_index: int = encodings._label2int['C'] 45 | 46 | def __call__(self, string_or_list: Union[str, List[str]], return_tokens: bool = False, cutoff: int = 5) -> Union[ 47 | Tuple[List[str], List[List[Tuple[str, int, int, str]]]], List[str]]: 48 | if isinstance(string_or_list, str): 49 | tokens = [string_or_list] 50 | else: 51 | tokens = string_or_list 52 | 53 | max_len = max([len(s) for s in tokens]) 54 | if max_len == 0: 55 | if return_tokens: 56 | return [''], [] 57 | else: 58 | return [''] 59 | 60 | with torch.no_grad(): 61 | p_ts = self.classifier(tokens) 62 | 63 | p_ts = torch.argmax(p_ts, dim=-1).detach().cpu().numpy() 64 | ext_tokens: List[List[Tuple[str, int, int, str]]] = [] 65 | new_strings: List[str] = [] 66 | 67 | for iBatch in range(p_ts.shape[0]): 68 | new_str, toks = self._extract_tokens(tokens[iBatch], p_ts[iBatch], cutoff=cutoff) 69 | new_strings.append(new_str) 70 | ext_tokens.append(toks) 71 | 72 | if return_tokens: 73 | return new_strings, ext_tokens 74 | else: 75 | return new_strings 76 | 77 | def _extract_tokens_2class(self, string: str, pred: NDArray[Int64]) -> Tuple[str, List[Tuple[str, int, int]]]: 78 | CUTOFF = 5 79 | mask = '' 80 | for p in pred: 81 | mask += self.encodings._label_list[p] 82 | start = 0 83 | tokens: List[Tuple[str, int, int]] = [] 84 | c_tok = '' 85 | for ii in range(len(string)): 86 | if mask[ii] == 'C': 87 | # check if we have a token 88 | 89 | if c_tok != '': 90 | stop = ii 91 | tokens.append((c_tok, start, stop)) 92 | c_tok = '' 93 | else: 94 | if c_tok == '': 95 | start = ii 96 | c_tok += string[ii] 97 | if c_tok != '': 98 | stop = len(string) 99 | tokens.append((c_tok, start, stop)) 100 | 101 | # filter small tokens 102 | final_toks: List[Tuple[str, int, int]] = [] 103 | for token in tokens: 104 | if token[2] - token[1] > CUTOFF: 105 | final_toks.append(token) 106 | # compose new string 107 | new_str: str = '' 108 | last_pos = 0 109 | for token in final_toks: 110 | if token[1] > last_pos: 111 | new_str += string[last_pos:token[1]] 112 | new_str += token[0] 113 | last_pos = token[2] + 1 114 | if last_pos < len(string): 115 | new_str += string[last_pos:] 116 | return new_str, final_toks 117 | 118 | def _extract_tokens(self, string: str, pred: NDArray[Int64], cutoff: int = 5) -> Tuple[ 119 | str, List[Tuple[str, int, int, str]]]: 120 | mask = '' 121 | numbers = {str(ii): 1 for ii in 
range(10)}
122 | 
123 |         for ii in range(len(pred)):
124 |             p = pred[ii]
125 |             cls = self.encodings._label_list[p]
126 |             if ii < len(string) and cls == 'C' and string[ii] in numbers:
127 |                 mask += 'N'
128 |             else:
129 |                 mask += cls
130 |         start = 0
131 |         tokens = []
132 |         c_tok = ''
133 |         last_label = mask[0]
134 |         type_: Optional[str] = None
135 |         for ii in range(len(string)):
136 |             # check if the label-type has changed
137 |             if last_label != mask[ii]:
138 |                 if c_tok != '':
139 |                     if last_label == 'C':
140 |                         pass
141 |                     elif last_label == 'H':
142 |                         type_ = '<RANDOM_STRING>'
143 |                     elif last_label == 'N':
144 |                         type_ = '<NUMERIC>'
145 |                     elif last_label == 'I':
146 |                         type_ = '<IP_ADDR>'
147 |                     elif last_label == 'U':
148 |                         type_ = '<UUID>'
149 |                     elif last_label == 'J':
150 |                         type_ = '<JWT>'
151 | 
152 |                     if last_label != 'C' and type_ is not None:
153 |                         tokens.append((c_tok, start, ii, type_))
154 |                     c_tok = ''
155 |                     start = ii
156 | 
157 |             last_label = mask[ii]
158 |             c_tok += string[ii]
159 | 
160 |         if c_tok != '':
161 |             if last_label == 'C':
162 |                 pass
163 |             elif last_label == 'H':
164 |                 type_ = '<RANDOM_STRING>'
165 |             elif last_label == 'N':
166 |                 type_ = '<NUMERIC>'
167 |             elif last_label == 'I':
168 |                 type_ = '<IP_ADDR>'
169 |             elif last_label == 'U':
170 |                 type_ = '<UUID>'
171 |             elif last_label == 'J':
172 |                 type_ = '<JWT>'
173 |             if last_label != 'C' and type_ is not None:
174 |                 tokens.append((c_tok, start, ii, type_))
175 | 
176 |         # filter small tokens
177 |         final_toks: List[Tuple[str, int, int, str]] = []
178 |         for token in tokens:
179 |             if token[2] - token[1] > cutoff:
180 |                 final_toks.append(token)
181 |         # compose new string
182 |         new_str: str = ''
183 |         last_pos = 0
184 | 
185 |         # from ipdb import set_trace
186 |         # set_trace()
187 |         for token in final_toks:
188 |             if token[1] > last_pos:
189 |                 new_str += string[last_pos:token[1]]
190 |             new_str += token[3]
191 |             last_pos = token[2]
192 |         if last_pos < len(string) - 1:
193 |             new_str += string[last_pos:]
194 |         return new_str, final_toks
195 | 

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 

--------------------------------------------------------------------------------
/stringlifier/modules/stringc2.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import torch
18 | import torch.nn as nn
19 | import optparse
20 | import sys
21 | import json
22 | import numpy as np
23 | import random
24 | import tqdm
25 | 
26 | 
27 | class Encodings:
28 |     def __init__(self, filename=None):
29 |         self._char2int = {'<PAD>': 0, '<UNK>': 1}
30 |         self._label2int = {'<PAD>': 0}
31 |         self._label_list = ['<PAD>']
32 |         if filename is not None:
33 |             self.load(filename)
34 | 
35 |     def save(self, filename):
36 |         json.dump({'char2int': self._char2int, 'label2int': self._label2int},
37 |                   open(filename, 'w'))
38 | 
39 |     def load(self, file):
40 |         if isinstance(file, str):
41 |             stream = open(file, 'r')
42 |         else:
43 |             stream = file
44 |         obj = json.load(stream)
45 |         self._char2int = obj['char2int']
46 |         self._label2int = obj['label2int']
47 |         self._label_list = [None for _ in range(len(self._label2int))]
48 |         for t in self._label2int:
49 |             self._label_list[self._label2int[t]] = t
50 | 
51 |     def update_encodings(self, dataset, cutoff=2):
52 |         char2count = {}
53 |         for entry in tqdm.tqdm(dataset):
54 |             text = entry[0]
55 |             label = entry[1]
56 |             for char in text:
57 |                 char = char.lower()
58 |                 if char in char2count:
59 |                     char2count[char] += 1
60 |                 else:
61 |                     char2count[char] = 1
62 |             for ttype in label:
63 |                 if ttype not in self._label2int:
64 |                     self._label2int[ttype] = len(self._label2int)
65 |                     self._label_list.append(ttype)
66 | 
67 |         for char in char2count:
68 |             if char not in self._char2int and char2count[char] > cutoff:
69 |                 self._char2int[char] = len(self._char2int)
70 | 
71 | 
72 | class CTaggerConfig:
73 |     def __init__(self):
74 |         self.char_emb_size = 100
75 |         self.rnn_layers = 2
76 |         self.rnn_size = 100
77 |         self.hidden = 500
78 | 
79 |     def save(self, filename):
80 |         json.dump({'char_emb_size': self.char_emb_size, 'rnn_layers': self.rnn_layers, 'rnn_size': self.rnn_size,
81 |                    'hidden': self.hidden},
82 |                   open(filename, 'w'))
83 | 
84 |     def load(self, file):
85 |         if isinstance(file, str):
86 |             stream = open(file, 'r')
87 |         else:
88 |             stream = file
89 |         obj = json.load(stream)
90 |         self.char_emb_size = obj['char_emb_size']
91 |         self.rnn_size = obj['rnn_size']
92 |         self.rnn_layers = obj['rnn_layers']
93 |         self.hidden = obj['hidden']
94 | 
95 | 
96 | class CTagger(nn.Module):
97 |     def __init__(self, config, encodings):
98 |         super(CTagger, self).__init__()
99 |         self._config = config
100 |         self._encodings = encodings
101 |         self._char_emb = nn.Embedding(len(encodings._char2int), config.char_emb_size, padding_idx=0)
102 |         self._case_emb = nn.Embedding(4, 16, padding_idx=0)
103 | 
104 |         self._rnn = nn.LSTM(config.char_emb_size + 16, config.rnn_size, config.rnn_layers, batch_first=True,
105 |                             bidirectional=True)
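        # A per-character tagger: character embeddings are concatenated with a
        # 4-way case embedding (padding/symbol/uppercase/lowercase, see
        # _make_input below), run through a bidirectional LSTM, and each output
        # position is projected onto the label set via one tanh hidden layer.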
self._hidden = nn.Sequential(nn.Linear(config.rnn_size * 2, config.hidden), nn.Tanh(), nn.Dropout(0.5)) 107 | self._softmax_type = nn.Linear(config.hidden, len(encodings._label2int)) 108 | 109 | def _make_input(self, word_list): 110 | # we pad domain names and feed them in reversed character order to the LSTM 111 | max_seq_len = max([len(word) for word in word_list]) 112 | 113 | x_char = np.zeros((len(word_list), max_seq_len)) 114 | x_case = np.zeros((len(word_list), max_seq_len)) 115 | for iBatch in range(x_char.shape[0]): 116 | word = word_list[iBatch] 117 | for index in range(len(word)): 118 | char = word[index] 119 | case_idx = 0 120 | if char.lower() == char.upper(): 121 | case_idx = 1 # symbol 122 | elif char.lower() != char: 123 | case_idx = 2 # uppercase 124 | else: 125 | case_idx = 3 # lowercase 126 | char = char.lower() 127 | if char in self._encodings._char2int: 128 | char_idx = self._encodings._char2int[char] 129 | else: 130 | char_idx = 1 # UNK 131 | x_char[iBatch, index] = char_idx 132 | x_case[iBatch, index] = case_idx 133 | 134 | return x_char, x_case 135 | 136 | def forward(self, string_list): 137 | x_char, x_case = self._make_input(string_list) 138 | x_char = torch.tensor(x_char, dtype=torch.long, device=self._get_device()) 139 | x_case = torch.tensor(x_case, dtype=torch.long, device=self._get_device()) 140 | hidden = torch.cat([self._char_emb(x_char), self._case_emb(x_case)], dim=-1) 141 | hidden = torch.dropout(hidden, 0.5, self.training) 142 | output, _ = self._rnn(hidden) 143 | 144 | hidden = self._hidden(output) 145 | 146 | return self._softmax_type(hidden) 147 | 148 | def save(self, path): 149 | torch.save(self.state_dict(), path) 150 | 151 | def load(self, path): 152 | self.load_state_dict(torch.load(path, map_location='cpu')) 153 | 154 | def _get_device(self): 155 | if self._char_emb.weight.device.type == 'cpu': 156 | return 'cpu' 157 | return '{0}:{1}'.format(self._char_emb.weight.device.type, str(self._char_emb.weight.device.index)) 158 | 159 | 160 | def _load_dataset(filename): 161 | lines = open(filename, encoding='utf-8').readlines() 162 | dataset = [] 163 | for ii in range(len(lines) // 2): 164 | string = lines[ii * 2][:-1] 165 | mask = lines[ii * 2 + 1][:-1] 166 | dataset.append((string, mask)) 167 | return dataset 168 | 169 | 170 | def _eval(model, dataset, encodings): 171 | model.eval() 172 | test_x, test_y = _make_batches(dataset, batch_size=128) 173 | total_t = 0 174 | ok_t = 0 175 | with torch.no_grad(): 176 | pgb = tqdm.tqdm(zip(test_x, test_y), total=len(test_x), ncols=80, desc='\t\t\t\t') 177 | for x, y in pgb: 178 | y_pred_t = model(x) 179 | y_tar_t = _get_targets(y, encodings).reshape(-1) 180 | y_pred_t = torch.argmax(y_pred_t, dim=-1).detach().cpu().numpy().reshape(-1) 181 | for y_t_t, y_p_t in zip(y_tar_t, y_pred_t): 182 | if y_t_t != 0: 183 | total_t += 1 184 | 185 | if y_t_t == y_p_t: 186 | ok_t += 1 187 | 188 | return ok_t / total_t 189 | 190 | 191 | def _make_batches(dataset, batch_size=32): 192 | batches_x = [] 193 | batches_y = [] 194 | 195 | batch_x = [] 196 | batch_y = [] 197 | 198 | for entry in dataset: 199 | domain = entry[0] 200 | t = entry[1] 201 | batch_x.append(domain) 202 | batch_y.append(t) 203 | if len(batch_x) == batch_size: 204 | batches_x.append(batch_x) 205 | batches_y.append(batch_y) 206 | batch_x = [] 207 | batch_y = [] 208 | 209 | if len(batch_x) != 0: 210 | batches_x.append(batch_x) 211 | batches_y.append(batch_y) 212 | 213 | return batches_x, batches_y 214 | 215 | 216 | def _get_targets(y, encodings): 217 | max_len = 
max([len(yy) for yy in y]) 218 | y_t = np.zeros((len(y), max_len), dtype=np.long) 219 | for i in range(len(y)): 220 | for j in range(max_len): 221 | if j < len(y[i]): 222 | y_t[i, j] = encodings._label2int[y[i][j]] 223 | 224 | return y_t 225 | 226 | 227 | def _generate_dataset(count): 228 | from training import generate_next_cmd 229 | dataset = [] 230 | for ii in range(count): 231 | cmd, mask = generate_next_cmd() 232 | dataset.append((cmd, mask)) 233 | return dataset 234 | 235 | 236 | def _start_train(params): 237 | eval_at = 5000 238 | 239 | if params.resume: 240 | encodings = Encodings('{0}.encodings'.format(params.output_base)) 241 | else: 242 | sys.stdout.write('Generating new random data...') 243 | sys.stdout.flush() 244 | trainset = _generate_dataset(int(eval_at * 4 * params.batch_size)) 245 | sys.stdout.write('done\n') 246 | encodings = Encodings() 247 | encodings.update_encodings(trainset) 248 | 249 | print('chars={0}, types={1}'.format(len(encodings._char2int), len(encodings._label2int))) 250 | print(encodings._label2int) 251 | 252 | config = CTaggerConfig() 253 | if params.resume: 254 | config.load('{0}.conf'.format(params.output_base)) 255 | model = CTagger(config, encodings) 256 | model.to(params.device) 257 | if params.resume: 258 | model.load('{0}.last'.format(params.output_base)) 259 | optimizer = torch.optim.Adam(model.parameters()) 260 | criterion_t = torch.nn.CrossEntropyLoss(ignore_index=0) 261 | 262 | patience_left = params.patience 263 | best_type = 0 # _eval(model, devset, encodings) 264 | encodings.save('{0}.encodings'.format(params.output_base)) 265 | config.save('{0}.conf'.format(params.output_base)) 266 | model.save('{0}.last'.format(params.output_base)) 267 | print("Deveset evaluation acc={0}".format(best_type)) 268 | epoch = 0 269 | eval_at = 5000 270 | 271 | while patience_left > 0: 272 | sys.stdout.write('Generating new random data...') 273 | sys.stdout.flush() 274 | trainset = _generate_dataset(int(eval_at * params.batch_size)) 275 | devset = _generate_dataset(int(eval_at / 10 * params.batch_size)) 276 | sys.stdout.write('done\n') 277 | sys.stdout.flush() 278 | sys.stderr.flush() 279 | epoch += 1 280 | random.shuffle(trainset) 281 | train_x, train_y = _make_batches(trainset, batch_size=params.batch_size) 282 | sys.stdout.write('Starting epoch {0}\n'.format(epoch)) 283 | 284 | pgb = tqdm.tqdm(zip(train_x, train_y), total=len(train_x), ncols=80, desc='\tloss=N/A') 285 | model.train() 286 | total_loss = 0 287 | cnt = 0 288 | for x, y in pgb: 289 | cnt += 1 290 | if cnt % eval_at == 0: 291 | 292 | patience_left -= 1 293 | sys.stderr.flush() 294 | sys.stderr.flush() 295 | sys.stderr.write('\n\tEvaluating...') 296 | sys.stderr.flush() 297 | acc_t = _eval(model, devset, encodings) 298 | sys.stderr.write(' acc={0}\n'.format(acc_t)) 299 | sys.stderr.flush() 300 | filename = '{0}.last'.format(params.output_base) 301 | sys.stderr.write('\t\tStoring {0}\n'.format(filename)) 302 | sys.stderr.flush() 303 | model.save(filename) 304 | if acc_t > best_type: 305 | patience_left = params.patience 306 | best_type = acc_t 307 | filename = '{0}.bestType'.format(params.output_base) 308 | sys.stderr.write('\t\tStoring {0}\n'.format(filename)) 309 | sys.stderr.flush() 310 | model.save(filename) 311 | 312 | sys.stderr.write('\n') 313 | sys.stderr.flush() 314 | model.train() 315 | 316 | if patience_left <= 0: 317 | print("Stopping with maximum patience reached") 318 | sys.exit(0) 319 | 320 | y_pred_t = model(x) 321 | 322 | y_tar_t = _get_targets(y, encodings) 323 | y_tar_t = 
torch.tensor(y_tar_t, dtype=torch.long, device=params.device) 324 | y_pred = y_pred_t.view(-1, y_pred_t.shape[-1]) 325 | y_target = y_tar_t.view(-1) 326 | if y_pred.shape[0] != y_target.shape[0]: 327 | from ipdb import set_trace 328 | set_trace() 329 | loss = criterion_t(y_pred, y_target) 330 | 331 | optimizer.zero_grad() 332 | total_loss += loss.item() 333 | pgb.set_description('\tloss={0:.4f}'.format(total_loss / cnt)) 334 | loss.backward() 335 | optimizer.step() 336 | 337 | sys.stdout.write('AVG train loss={0} \n'.format(total_loss / len(train_x))) 338 | 339 | 340 | def _start_interactive(params): 341 | encodings = Encodings('{0}.encodings'.format(params.output_base)) 342 | config = CTaggerConfig() 343 | config.load('{0}.conf'.format(params.output_base)) 344 | model = CTagger(config, encodings) 345 | model.load('{0}.bestType'.format(params.output_base)) 346 | model.to(params.device) 347 | model.eval() 348 | sys.stdout.write('>>> ') 349 | sys.stdout.flush() 350 | string = input() 351 | while string != '/exit': 352 | p_t = model([string]) 353 | p_d_t = torch.argmax(p_t, dim=-1).detach().cpu().numpy() 354 | print("Results for \n{0}".format(string)) 355 | for ii in range(p_d_t.shape[-1]): 356 | sys.stdout.write(encodings._label_list[p_d_t[0, ii]]) 357 | sys.stdout.write('\n') 358 | 359 | print("") 360 | sys.stdout.write('>>> ') 361 | sys.stdout.flush() 362 | string = input() 363 | 364 | 365 | if __name__ == '__main__': 366 | parser = optparse.OptionParser() 367 | parser.add_option('--interactive', action='store_true', dest='interactive') 368 | parser.add_option('--train', action='store_true', dest='train') 369 | parser.add_option('--resume', action='store_true', dest='resume') 370 | 371 | parser.add_option('--store', action='store', dest='output_base') 372 | parser.add_option('--patience', action='store', dest='patience', type='int', default=20, help='(default=20)') 373 | parser.add_option('--batch-size', action='store', dest='batch_size', default=32, type='int', help='(default=32)') 374 | parser.add_option('--device', action='store', dest='device', default='cpu') 375 | 376 | (params, _) = parser.parse_args(sys.argv) 377 | 378 | if params.train: 379 | _start_train(params) 380 | elif params.interactive: 381 | _start_interactive(params) 382 | else: 383 | parser.print_help() 384 | -------------------------------------------------------------------------------- /stringlifier/modules/stringc.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
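Note on the module above: `CTagger` assigns one label from `label2int` to every character of the input, so extracting a masked region reduces to an argmax over the per-character logits, exactly as `_start_interactive` does. The sketch below drives the same classes programmatically. It assumes the package is installed so that `stringlifier.modules.stringc2` is importable, and it assumes the shipped `data/enhanced-c.*` files were produced by this module; their JSON keys match its `Encodings` and `CTaggerConfig` formats, but that correspondence is an inference, not something stated in this file.

# Hedged inference sketch for CTagger. Paths and the input string are
# illustrative; the enhanced-c checkpoint is assumed to match this model.
import torch
from stringlifier.modules.stringc2 import CTagger, CTaggerConfig, Encodings

encodings = Encodings('data/enhanced-c.encodings')
config = CTaggerConfig()
config.load('data/enhanced-c.conf')
model = CTagger(config, encodings)
model.load('data/enhanced-c.bestType')
model.eval()

with torch.no_grad():
    text = 'mysql -u admin -p S3cr3tP4ss'  # hypothetical input
    logits = model([text])                 # shape: [1, len(text), n_labels]
    tags = torch.argmax(logits, dim=-1)[0].tolist()
    # one tag character per input character, mirroring _start_interactive
    print(''.join(encodings._label_list[t] for t in tags))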
--------------------------------------------------------------------------------
/stringlifier/modules/stringc.py:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) 2020 Adobe Systems Incorporated. All rights reserved.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | #
16 | 
17 | import torch
18 | import torch.nn as nn
19 | import optparse
20 | import sys
21 | import json
22 | import numpy as np
23 | import random
24 | import tqdm
25 | 
26 | sys.path.append('')  # allow bare imports to resolve against the current working directory
27 | 
28 | 
29 | class Encodings:
30 |     def __init__(self, filename=None):
31 |         self._char2int = {'<PAD>': 0, '<UNK>': 1}
32 |         self._type2int = {}
33 |         self._subtype2int = {'<UNK>': 0}  # this will not get backpropagated
34 |         self._type_list = []
35 |         self._subtype_list = []
36 |         if filename is not None:
37 |             self.load(filename)
38 | 
39 |     def save(self, filename):
40 |         json.dump({'char2int': self._char2int, 'type2int': self._type2int, 'subtype2int': self._subtype2int},
41 |                   open(filename, 'w'))
42 | 
43 |     def load(self, file):
44 |         if isinstance(file, str):
45 |             stream = open(file, 'r')
46 |         else:
47 |             stream = file
48 |         obj = json.load(stream)
49 |         self._char2int = obj['char2int']
50 |         self._type2int = obj['type2int']
51 |         self._subtype2int = obj['subtype2int']
52 |         self._type_list = [None for _ in range(len(self._type2int))]
53 |         self._subtype_list = [None for _ in range(len(self._subtype2int))]
54 |         for t in self._type2int:
55 |             self._type_list[self._type2int[t]] = t
56 | 
57 |         for t in self._subtype2int:
58 |             self._subtype_list[self._subtype2int[t]] = t
59 | 
60 |     def update_encodings(self, dataset, cutoff=2):
61 |         char2count = {}
62 |         for entry in dataset:
63 |             domain = entry[0]
64 |             ttype = entry[1]
65 |             tsubtype = entry[2]
66 |             for char in domain:
67 |                 char = char.lower()
68 |                 if char in char2count:
69 |                     char2count[char] += 1
70 |                 else:
71 |                     char2count[char] = 1
72 |             if ttype not in self._type2int:
73 |                 self._type2int[ttype] = len(self._type2int)
74 |                 self._type_list.append(ttype)
75 |             if tsubtype not in self._subtype2int:
76 |                 self._subtype2int[tsubtype] = len(self._subtype2int)
77 |                 self._subtype_list.append(tsubtype)
78 | 
79 |         for char in char2count:
80 |             if char not in self._char2int:
81 |                 self._char2int[char] = len(self._char2int)
82 | 
83 | 
84 | class AwDoCConfig:
85 |     def __init__(self):
86 |         self.char_emb_size = 100
87 |         self.rnn_layers = 2
88 |         self.rnn_size = 100
89 |         self.hidden = 500
90 | 
91 |     def save(self, filename):
92 |         json.dump({'char_emb_size': self.char_emb_size, 'rnn_layers': self.rnn_layers, 'rnn_size': self.rnn_size,
93 |                    'hidden': self.hidden},
94 |                   open(filename, 'w'))
95 | 
96 |     def load(self, file):
97 |         if isinstance(file, str):
98 |             stream = open(file, 'r')
99 |         else:
100 |             stream = file
101 |         obj = json.load(stream)
102 |         self.char_emb_size = obj['char_emb_size']
103 |         self.rnn_size = obj['rnn_size']
104 |         self.rnn_layers = obj['rnn_layers']
105 |         self.hidden = obj['hidden']
106 | 
107 | 
108 | class AwDoC(nn.Module):
109 |     def __init__(self, config, encodings):
110 |         super(AwDoC, self).__init__()
111 |         self._config = config
112 |         self._encodings = encodings
113 |         self._char_emb = nn.Embedding(len(encodings._char2int), config.char_emb_size)
114 | 
115 |         self._rnn = nn.LSTM(config.char_emb_size, config.rnn_size, config.rnn_layers, batch_first=True)
116 |         self._hidden = nn.Sequential(nn.Linear(config.rnn_size, config.hidden), nn.Tanh(), nn.Dropout(0.5))
117 |         self._softmax_type = nn.Linear(config.hidden, len(encodings._type2int))
118 |         self._softmax_subtype = nn.Linear(config.hidden, len(encodings._subtype2int))
119 | 
120 |     def _make_input(self, domain_list):
121 |         # we pad domain names and feed them in reversed character order to the LSTM
122 |         max_seq_len = max([len(domain) for domain in domain_list])
123 | 
124 |         x = np.zeros((len(domain_list), max_seq_len))
125 |         for iBatch in range(x.shape[0]):
126 |             domain = domain_list[iBatch]
127 |             n = len(domain)
128 |             ofs_x = max_seq_len - n
129 |             for iSeq in range(x.shape[1]):
130 |                 if iSeq < n:
131 |                     char = domain[-iSeq - 1].lower()
132 |                     if char in self._encodings._char2int:
133 |                         iChar = self._encodings._char2int[char]
134 |                     else:
135 |                         iChar = self._encodings._char2int['<UNK>']
136 |                     x[iBatch, iSeq + ofs_x] = iChar
137 |         return x
138 | 
139 |     def forward(self, domain_list):
140 | 
141 |         x = torch.tensor(self._make_input(domain_list), dtype=torch.long, device=self._get_device())
142 |         hidden = self._char_emb(x)
143 |         hidden = torch.dropout(hidden, 0.5, self.training)
144 |         output, _ = self._rnn(hidden)
145 |         output = output[:, -1, :]
146 | 
147 |         hidden = self._hidden(output)
148 | 
149 |         return self._softmax_type(hidden), self._softmax_subtype(hidden)
150 | 
151 |     def save(self, path):
152 |         torch.save(self.state_dict(), path)
153 | 
154 |     def load(self, path):
155 |         self.load_state_dict(torch.load(path, map_location='cpu'))
156 | 
157 |     def _get_device(self):
158 |         if self._char_emb.weight.device.type == 'cpu':
159 |             return 'cpu'
160 |         return '{0}:{1}'.format(self._char_emb.weight.device.type, str(self._char_emb.weight.device.index))
161 | 
162 | 
163 | def _load_dataset(filename):
164 |     lines = open(filename, encoding='utf-8').readlines()
165 |     dataset = []
166 |     for line in lines:
167 |         line = line.strip()
168 |         if line != '':
169 |             parts = line.split('\t')
170 |             if len(parts) == 3:
171 |                 dataset.append(parts)
172 |     return dataset
173 | 
174 | 
175 | def _eval(model, dataset, encodings):
176 |     model.eval()
177 |     test_x, test_y = _make_batches(dataset, batch_size=128)
178 |     total_t = 0
179 |     total_st = 0
180 |     ok_t = 0
181 |     ok_st = 0
182 |     with torch.no_grad():
183 |         pgb = tqdm.tqdm(zip(test_x, test_y), total=len(test_x), ncols=80, desc='\t\t\t\t')
184 |         for x, y in pgb:
185 |             y_pred_t, y_pred_st = model(x)
186 |             y_tar_t, y_tar_st = _get_targets(y, encodings)
187 |             y_pred_t = torch.argmax(y_pred_t, dim=1).detach().cpu().numpy()
188 |             y_pred_st = torch.argmax(y_pred_st, dim=1).detach().cpu().numpy()
189 |             for y_t_t, y_t_st, y_p_t, y_p_st in zip(y_tar_t, y_tar_st, y_pred_t, y_pred_st):
190 |                 total_t += 1
191 |                 if y_t_st != 0:
192 |                     total_st += 1
193 |                     if y_t_st == y_p_st:
194 |                         ok_st += 1
195 |                 if y_t_t == y_p_t:
196 |                     ok_t += 1
197 | 
198 |     return ok_t / total_t, ok_st / total_st
199 | 
200 | 
201 | def _make_batches(dataset, batch_size=32):
202 |     batches_x = []
203 |     batches_y = []
204 | 
205 |     batch_x = []
206 |     batch_y = []
207 | 
208 |     for entry in dataset:
209 |         domain = entry[0]
210 |         t = entry[1]
211 |         st = entry[2]
212 |         batch_x.append(domain)
213 |         batch_y.append((t, st))
214 |         if len(batch_x) == batch_size:
215 |             batches_x.append(batch_x)
216 |             batches_y.append(batch_y)
217 |             batch_x = []
218 |             batch_y = []
219 | 
220 |     if len(batch_x) != 0:
221 |         batches_x.append(batch_x)
222 |         batches_y.append(batch_y)
223 | 
224 |     return batches_x, batches_y
225 | 
226 | 
227 | def _get_targets(y, encodings):
228 |     y_t = np.zeros((len(y)))
229 |     y_st = np.zeros((len(y)))
230 |     for i in range(len(y)):
231 |         y_t[i] = encodings._type2int[y[i][0]]
232 |         y_st[i] = encodings._subtype2int[y[i][1]]
233 | 
234 |     return y_t, y_st
235 | 
236 | 
237 | def _drop_tld(domain_list, p):
238 |     new_list = []
239 |     for domain in domain_list:
240 |         parts = domain.split('.')
241 |         dp = random.random()
242 |         if dp < p:
243 |             if dp < p / 2:
244 |                 parts[-1] = ' '
245 |             else:
246 |                 parts[-1] = ' '
247 |         dom = '.'.join(parts)
248 |         new_list.append(dom)
249 |     return new_list
250 | 
251 | 
252 | def _start_train(params):
253 |     trainset = _load_dataset(params.train_file)
254 |     devset = _load_dataset(params.dev_file)
255 |     if params.resume:
256 |         encodings = Encodings('{0}.encodings'.format(params.output_base))
257 |     else:
258 |         encodings = Encodings()
259 |         encodings.update_encodings(trainset)
260 |     print('chars={0}, types={1}, subtypes={2}'.format(len(encodings._char2int), len(encodings._type2int),
261 |                                                       len(encodings._subtype2int)))
262 | 
263 |     config = AwDoCConfig()
264 |     if params.resume:
265 |         config.load('{0}.conf'.format(params.output_base))
266 |     model = AwDoC(config, encodings)
267 |     model.to(params.device)
268 |     if params.resume:
269 |         model.load('{0}.last'.format(params.output_base))
270 |     optimizer = torch.optim.Adam(model.parameters())
271 |     criterion_t = torch.nn.CrossEntropyLoss()
272 |     criterion_st = torch.nn.CrossEntropyLoss(ignore_index=0)  # we ignore unknown types
273 | 
274 |     patience_left = params.patience
275 |     best_type, best_subtype = _eval(model, devset, encodings)
276 |     encodings.save('{0}.encodings'.format(params.output_base))
277 |     config.save('{0}.conf'.format(params.output_base))
278 |     model.save('{0}.last'.format(params.output_base))
279 |     print("Devset evaluation type_acc={0} subtype_acc={1}".format(best_type, best_subtype))
280 |     epoch = 0
281 |     eval_at = 5000
282 |     while patience_left > 0:
283 |         epoch += 1
284 |         random.shuffle(trainset)
285 |         train_x, train_y = _make_batches(trainset, batch_size=params.batch_size)
286 |         sys.stdout.write('Starting epoch {0}\n'.format(epoch))
287 | 
288 |         pgb = tqdm.tqdm(zip(train_x, train_y), total=len(train_x), ncols=80, desc='\tloss=N/A')
289 |         model.train()
290 |         total_loss = 0
291 |         cnt = 0
292 |         for x, y in pgb:
293 |             cnt += 1
294 |             if cnt % eval_at == 0:
295 |                 patience_left -= 1
296 |                 sys.stderr.flush()
297 |                 sys.stderr.flush()
298 |                 sys.stderr.write('\n\tEvaluating...')
299 |                 sys.stderr.flush()
300 |                 acc_t, acc_st = _eval(model, devset, encodings)
301 |                 sys.stderr.write(' type_acc={0}, subtype_acc={1}\n'.format(acc_t, acc_st))
302 |                 sys.stderr.flush()
303 |                 filename = '{0}.last'.format(params.output_base)
304 |                 sys.stderr.write('\t\tStoring {0}\n'.format(filename))
305 |                 sys.stderr.flush()
306 |                 model.save(filename)
307 |                 if acc_t > best_type:
308 |                     patience_left = params.patience
309 |                     best_type = acc_t
310 |                     filename = '{0}.bestType'.format(params.output_base)
311 |                     sys.stderr.write('\t\tStoring {0}\n'.format(filename))
312 |                     sys.stderr.flush()
313 |                     model.save(filename)
314 |                 if acc_st > best_subtype:
315 |                     patience_left = params.patience
316 |                     best_subtype = acc_st
317 |                     filename = '{0}.bestSubtype'.format(params.output_base)
318 |                     sys.stderr.write('\t\tStoring {0}\n'.format(filename))
319 |                     sys.stderr.flush()
320 |                     model.save(filename)
321 |                 sys.stderr.write('\n')
322 |                 sys.stderr.flush()
323 |                 model.train()
324 |                 if patience_left <= 0:
325 |                     print("Stopping with maximum patience reached")
326 |                     sys.exit(0)
327 | 
328 |             x = _drop_tld(x, 0.5)
329 |             y_pred_t, y_pred_st = model(x)
330 | 
331 |             y_tar_t, y_tar_st = _get_targets(y, encodings)
332 |             y_tar_t = torch.tensor(y_tar_t, dtype=torch.long, device=params.device)
333 |             y_tar_st = torch.tensor(y_tar_st, dtype=torch.long, device=params.device)
334 | 
335 |             loss = criterion_t(y_pred_t, y_tar_t) + \
336 |                    criterion_st(y_pred_st, y_tar_st)
337 | 
338 |             optimizer.zero_grad()
339 |             total_loss += loss.item()
340 |             pgb.set_description('\tloss={0:.4f}'.format(total_loss / cnt))
341 |             loss.backward()
342 |             optimizer.step()
343 | 
344 |         sys.stdout.write('AVG train loss={0}\n'.format(total_loss / len(train_x)))
345 | 
346 | 
347 | def _start_interactive(params):
348 |     encodings = Encodings('{0}.encodings'.format(params.output_base))
349 |     config = AwDoCConfig()
350 |     config.load('{0}.conf'.format(params.output_base))
351 |     model = AwDoC(config, encodings)
352 |     model.load('{0}.bestType'.format(params.output_base))
353 |     model.to(params.device)
354 |     model.eval()
355 |     sys.stdout.write('>>> ')
356 |     sys.stdout.flush()
357 |     domain = input()
358 |     while domain != '/exit':
359 |         p_t, p_st = model([domain])
360 |         print(p_t)
361 |         print(p_st)
362 |         p_d_t = torch.argmax(p_t, dim=1).detach().cpu().item()
363 |         p_d_st = torch.argmax(p_st, dim=1).detach().cpu().item()
364 |         print("Results for '{0}'".format(domain))
365 |         print(encodings._type_list[p_d_t])
366 | 
367 |         print(encodings._subtype_list[p_d_st])
368 | 
369 |         print("")
370 |         sys.stdout.write('>>> ')
371 |         sys.stdout.flush()
372 |         domain = input()
373 | 
374 | 
375 | if __name__ == '__main__':
376 |     parser = optparse.OptionParser()
377 |     parser.add_option('--interactive', action='store_true', dest='interactive')
378 |     parser.add_option('--train', action='store_true', dest='train')
379 |     parser.add_option('--resume', action='store_true', dest='resume')
380 |     parser.add_option('--train-file', action='store', dest='train_file')
381 |     parser.add_option('--dev-file', action='store', dest='dev_file')
382 |     parser.add_option('--store', action='store', dest='output_base')
383 |     parser.add_option('--patience', action='store', dest='patience', type='int', default=20, help='(default=20)')
384 |     parser.add_option('--batch-size', action='store', dest='batch_size', default=32, type='int', help='(default=32)')
385 |     parser.add_option('--device', action='store', dest='device', default='cpu')
386 | 
387 |     (params, _) = parser.parse_args(sys.argv)
388 | 
389 |     if params.train:
390 |         _start_train(params)
391 |     elif params.interactive:
392 |         _start_interactive(params)
393 |     else:
394 |         parser.print_help()
395 | 
--------------------------------------------------------------------------------
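Both modules double as command-line tools through the optparse blocks above. The `--store` argument is a path prefix: training writes `<prefix>.encodings`, `<prefix>.conf`, `<prefix>.last` and `<prefix>.bestType` (plus `<prefix>.bestSubtype` for stringc.py), and `--interactive` reads the same files back. `stringc2.py` synthesizes its own training data via `training.generate_next_cmd` (resolved from the script's own directory), while `stringc.py` expects `--train-file`/`--dev-file` with tab-separated lines of string, type and subtype. Illustrative invocations follow; the store prefix and corpus paths are hypothetical:

# train the per-character tagger; checkpoints land next to the my-model prefix
python stringlifier/modules/stringc2.py --train --store my-model --batch-size 32 --device cuda:0

# probe the stored tagger interactively (type /exit to quit)
python stringlifier/modules/stringc2.py --interactive --store my-model --device cpu

# train the whole-string type/subtype classifier from TSV corpora (hypothetical file names)
python stringlifier/modules/stringc.py --train --train-file corpus/train.tsv --dev-file corpus/dev.tsv --store my-awdoc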